| author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-21 11:54:28 +0000 |
|---|---|---|
| committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-21 11:54:28 +0000 |
| commit | e6918187568dbd01842d8d1d2c808ce16a894239 (patch) | |
| tree | 64f88b554b444a49f656b6c656111a145cbbaa28 | /src/crimson |
| parent | Initial commit. (diff) | |
| download | ceph-e6918187568dbd01842d8d1d2c808ce16a894239.tar.xz, ceph-e6918187568dbd01842d8d1d2c808ce16a894239.zip | |
Adding upstream version 18.2.2. (upstream/18.2.2)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/crimson')
334 files changed, 105416 insertions, 0 deletions
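The admin_socket.cc added below frames its unix-domain-socket protocol as: one JSON command terminated by a '\0' byte on the way in, and a 4-byte network-order length followed by the reply payload on the way out (see line_consumer and AdminSocket::finalize_response). The following stand-alone client is a minimal sketch of that framing; the socket path, the chosen command, and the program itself are illustrative assumptions, not part of this commit.

```cpp
// Minimal asok client sketch: send '\0'-terminated JSON, read a
// 4-byte big-endian length prefix, then read that many reply bytes.
#include <arpa/inet.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <unistd.h>
#include <cstdint>
#include <cstring>
#include <iostream>
#include <string>
#include <vector>

int main() {
  // hypothetical socket path; the daemon listens wherever its config points
  const char* path = "/var/run/ceph/ceph-osd.0.asok";
  int fd = ::socket(AF_UNIX, SOCK_STREAM, 0);
  sockaddr_un addr{};
  addr.sun_family = AF_UNIX;
  std::strncpy(addr.sun_path, path, sizeof(addr.sun_path) - 1);
  if (::connect(fd, reinterpret_cast<sockaddr*>(&addr), sizeof(addr)) < 0) {
    std::cerr << "connect failed\n";
    return 1;
  }
  // one JSON command; the trailing '\0' from c_str() marks end-of-request
  std::string cmd = R"({"prefix": "version", "format": "json-pretty"})";
  ::write(fd, cmd.c_str(), cmd.size() + 1);

  auto read_exact = [fd](void* buf, size_t n) {
    auto* p = static_cast<char*>(buf);
    while (n > 0) {
      ssize_t got = ::read(fd, p, n);
      if (got <= 0) return false;
      p += got;
      n -= static_cast<size_t>(got);
    }
    return true;
  };

  uint32_t len_be = 0;                       // length prefix, network order
  if (!read_exact(&len_be, sizeof(len_be))) return 1;
  std::vector<char> reply(ntohl(len_be));    // reply payload
  if (!read_exact(reply.data(), reply.size())) return 1;
  std::cout.write(reply.data(), reply.size());
  std::cout << '\n';
  ::close(fd);
  return 0;
}
```

The `version` command used here is one of the hooks registered by register_admin_commands() below; any other registered prefix would be driven the same way.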
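admin_socket.h below defines the AdminSocketHook contract: a prefix that identifies the command, a parameter description used for validation, a help string (an empty help hides the command from 'help' output), and a call() returning a seastar::future<tell_result_t>. A hypothetical hook mirroring the in-tree hooks added by this commit; the class name and command are made up for illustration.

```cpp
#include <seastar/core/future.hh>
#include "crimson/admin/admin_socket.h"

using namespace std::literals;

namespace crimson::admin {

// "ping" replies with "pong"; empty desc means no parameters are expected.
class PingHook final : public AdminSocketHook {
public:
  PingHook() : AdminSocketHook{"ping", "", "reply with pong"} {}

  seastar::future<tell_result_t> call(const cmdmap_t&,
                                      std::string_view,
                                      ceph::bufferlist&&) const final
  {
    ceph::bufferlist out;
    out.append("pong"sv);
    // ret=0, no error string, payload in the output bufferlist
    return seastar::make_ready_future<tell_result_t>(
      tell_result_t{0, std::string{}, std::move(out)});
  }
};

} // namespace crimson::admin

// Registration would follow the pattern of register_admin_commands():
//   admin_socket.register_command(std::make_unique<crimson::admin::PingHook>());
```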
diff --git a/src/crimson/CMakeLists.txt b/src/crimson/CMakeLists.txt new file mode 100644 index 000000000..e2b37fac9 --- /dev/null +++ b/src/crimson/CMakeLists.txt @@ -0,0 +1,203 @@ +add_library(crimson::cflags INTERFACE IMPORTED) +set(crimson_cflag_definitions "WITH_SEASTAR=1") +# disable concepts to address https://github.com/boostorg/asio/issues/312 +if((CMAKE_CXX_COMPILER_ID STREQUAL GNU AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 10) OR + (CMAKE_CXX_COMPILER_ID STREQUAL Clang)) + list(APPEND crimson_cflag_definitions + "BOOST_ASIO_DISABLE_CONCEPTS") +endif() +set_target_properties(crimson::cflags PROPERTIES + INTERFACE_COMPILE_DEFINITIONS "${crimson_cflag_definitions}" + INTERFACE_COMPILE_OPTIONS $<$<COMPILE_LANGUAGE:CXX>:-Wno-non-virtual-dtor> + INTERFACE_LINK_LIBRARIES Seastar::seastar) + +set(crimson_common_srcs + common/assert.cc + common/buffer_io.cc + common/config_proxy.cc + common/fatal_signal.cc + common/formatter.cc + common/perf_counters_collection.cc + common/log.cc + common/logclient.cc + common/operation.cc + common/throttle.cc + common/tmap_helpers.cc + common/tri_mutex.cc + crush/CrushLocation.cc) + +# the specialized version of ceph-common, where +# - the logging is sent to Seastar backend +# - and the template parameter of lock_policy is SINGLE +add_library(crimson-common STATIC + ${PROJECT_SOURCE_DIR}/src/auth/Crypto.cc + ${PROJECT_SOURCE_DIR}/src/common/admin_socket_client.cc + ${PROJECT_SOURCE_DIR}/src/common/bit_str.cc + ${PROJECT_SOURCE_DIR}/src/common/bloom_filter.cc + ${PROJECT_SOURCE_DIR}/src/common/buffer.cc + ${PROJECT_SOURCE_DIR}/src/common/buffer_seastar.cc + ${PROJECT_SOURCE_DIR}/src/common/ceph_argparse.cc + ${PROJECT_SOURCE_DIR}/src/common/ceph_context.cc + ${PROJECT_SOURCE_DIR}/src/common/ceph_crypto.cc + ${PROJECT_SOURCE_DIR}/src/common/ceph_hash.cc + ${PROJECT_SOURCE_DIR}/src/common/ceph_time.cc + ${PROJECT_SOURCE_DIR}/src/common/ceph_strings.cc + ${PROJECT_SOURCE_DIR}/src/common/ceph_releases.cc + ${PROJECT_SOURCE_DIR}/src/common/cmdparse.cc + ${PROJECT_SOURCE_DIR}/src/common/common_init.cc + ${PROJECT_SOURCE_DIR}/src/common/compat.cc + ${PROJECT_SOURCE_DIR}/src/common/code_environment.cc + ${PROJECT_SOURCE_DIR}/src/common/config.cc + ${PROJECT_SOURCE_DIR}/src/common/config_values.cc + ${PROJECT_SOURCE_DIR}/src/common/dout.cc + ${PROJECT_SOURCE_DIR}/src/common/entity_name.cc + ${PROJECT_SOURCE_DIR}/src/common/environment.cc + ${PROJECT_SOURCE_DIR}/src/common/errno.cc + ${PROJECT_SOURCE_DIR}/src/common/escape.cc + ${PROJECT_SOURCE_DIR}/src/common/hex.cc + ${PROJECT_SOURCE_DIR}/src/common/fs_types.cc + ${PROJECT_SOURCE_DIR}/src/common/ceph_json.cc + ${PROJECT_SOURCE_DIR}/src/common/histogram.cc + ${PROJECT_SOURCE_DIR}/src/common/hobject.cc + ${PROJECT_SOURCE_DIR}/src/common/hostname.cc + ${PROJECT_SOURCE_DIR}/src/common/ipaddr.cc + ${PROJECT_SOURCE_DIR}/src/common/mempool.cc + ${PROJECT_SOURCE_DIR}/src/common/options.cc + ${PROJECT_SOURCE_DIR}/src/common/perf_counters.cc + ${PROJECT_SOURCE_DIR}/src/common/perf_counters_key.cc + ${PROJECT_SOURCE_DIR}/src/common/perf_histogram.cc + ${PROJECT_SOURCE_DIR}/src/common/page.cc + ${PROJECT_SOURCE_DIR}/src/common/pick_address.cc + ${PROJECT_SOURCE_DIR}/src/common/snap_types.cc + ${PROJECT_SOURCE_DIR}/src/common/signal.cc + ${PROJECT_SOURCE_DIR}/src/common/str_list.cc + ${PROJECT_SOURCE_DIR}/src/common/str_map.cc + ${PROJECT_SOURCE_DIR}/src/common/strtol.cc + ${PROJECT_SOURCE_DIR}/src/common/reverse.c + ${PROJECT_SOURCE_DIR}/src/common/types.cc + ${PROJECT_SOURCE_DIR}/src/common/utf8.c + 
${PROJECT_SOURCE_DIR}/src/common/version.cc + ${PROJECT_SOURCE_DIR}/src/common/BackTrace.cc + ${PROJECT_SOURCE_DIR}/src/common/ConfUtils.cc + ${PROJECT_SOURCE_DIR}/src/common/DecayCounter.cc + ${PROJECT_SOURCE_DIR}/src/common/HTMLFormatter.cc + ${PROJECT_SOURCE_DIR}/src/common/Formatter.cc + ${PROJECT_SOURCE_DIR}/src/common/Graylog.cc + ${PROJECT_SOURCE_DIR}/src/common/Journald.cc + ${PROJECT_SOURCE_DIR}/src/common/ostream_temp.cc + ${PROJECT_SOURCE_DIR}/src/common/LogEntry.cc + ${PROJECT_SOURCE_DIR}/src/common/TextTable.cc + ${PROJECT_SOURCE_DIR}/src/common/Thread.cc + ${PROJECT_SOURCE_DIR}/src/common/PluginRegistry.cc + ${PROJECT_SOURCE_DIR}/src/common/RefCountedObj.cc + ${PROJECT_SOURCE_DIR}/src/common/util.cc + ${PROJECT_SOURCE_DIR}/src/compressor/Compressor.cc + ${PROJECT_SOURCE_DIR}/src/crush/builder.c + ${PROJECT_SOURCE_DIR}/src/crush/mapper.c + ${PROJECT_SOURCE_DIR}/src/crush/crush.c + ${PROJECT_SOURCE_DIR}/src/crush/hash.c + ${PROJECT_SOURCE_DIR}/src/crush/CrushWrapper.cc + ${PROJECT_SOURCE_DIR}/src/crush/CrushCompiler.cc + ${PROJECT_SOURCE_DIR}/src/crush/CrushTester.cc + ${PROJECT_SOURCE_DIR}/src/global/global_context.cc + ${PROJECT_SOURCE_DIR}/src/global/pidfile.cc + ${PROJECT_SOURCE_DIR}/src/librbd/Features.cc + ${PROJECT_SOURCE_DIR}/src/librbd/io/IoOperations.cc + ${PROJECT_SOURCE_DIR}/src/mgr/ServiceMap.cc + ${PROJECT_SOURCE_DIR}/src/mds/inode_backtrace.cc + ${PROJECT_SOURCE_DIR}/src/mds/mdstypes.cc + ${PROJECT_SOURCE_DIR}/src/mds/cephfs_features.cc + ${PROJECT_SOURCE_DIR}/src/mds/FSMap.cc + ${PROJECT_SOURCE_DIR}/src/mds/FSMapUser.cc + ${PROJECT_SOURCE_DIR}/src/mds/MDSMap.cc + ${PROJECT_SOURCE_DIR}/src/msg/msg_types.cc + ${PROJECT_SOURCE_DIR}/src/msg/Message.cc + ${PROJECT_SOURCE_DIR}/src/mon/PGMap.cc + ${PROJECT_SOURCE_DIR}/src/mon/MonCap.cc + ${PROJECT_SOURCE_DIR}/src/mon/MonMap.cc + ${PROJECT_SOURCE_DIR}/src/osd/osd_types.cc + ${PROJECT_SOURCE_DIR}/src/osd/ECMsgTypes.cc + ${PROJECT_SOURCE_DIR}/src/osd/HitSet.cc + ${PROJECT_SOURCE_DIR}/src/osd/OSDMap.cc + ${PROJECT_SOURCE_DIR}/src/osd/PGPeeringEvent.cc + ${PROJECT_SOURCE_DIR}/src/xxHash/xxhash.c + ${crimson_common_srcs} + $<TARGET_OBJECTS:common_mountcephfs_objs> + $<TARGET_OBJECTS:common-options-objs>) + +target_compile_definitions(crimson-common PRIVATE + "CMAKE_INSTALL_LIBDIR=\"${CMAKE_INSTALL_LIBDIR}\"" + "CEPH_INSTALL_FULL_PKGLIBDIR=\"${CEPH_INSTALL_FULL_PKGLIBDIR}\"" + "CEPH_INSTALL_DATADIR=\"${CEPH_INSTALL_DATADIR}\"") + +set(crimson_common_deps + Boost::iostreams + Boost::random + json_spirit) + +set(crimson_common_public_deps crimson::cflags) +if(WITH_JAEGER) + list(APPEND crimson_common_public_deps jaeger_base) +endif() + +if(NOT WITH_SYSTEM_BOOST) + list(APPEND crimson_common_deps ${ZLIB_LIBRARIES}) +endif() + +target_link_libraries(crimson-common + PUBLIC + ${crimson_common_public_deps} + PRIVATE + crc32 + ${crimson_common_deps} + OpenSSL::Crypto) + +set(crimson_auth_srcs + auth/KeyRing.cc + ${PROJECT_SOURCE_DIR}/src/auth/AuthClientHandler.cc + ${PROJECT_SOURCE_DIR}/src/auth/AuthMethodList.cc + ${PROJECT_SOURCE_DIR}/src/auth/AuthRegistry.cc + ${PROJECT_SOURCE_DIR}/src/auth/AuthSessionHandler.cc + ${PROJECT_SOURCE_DIR}/src/auth/Crypto.cc + ${PROJECT_SOURCE_DIR}/src/auth/KeyRing.cc + ${PROJECT_SOURCE_DIR}/src/auth/RotatingKeyRing.cc + ${PROJECT_SOURCE_DIR}/src/auth/cephx/CephxAuthorizeHandler.cc + ${PROJECT_SOURCE_DIR}/src/auth/cephx/CephxClientHandler.cc + ${PROJECT_SOURCE_DIR}/src/auth/cephx/CephxProtocol.cc + ${PROJECT_SOURCE_DIR}/src/auth/cephx/CephxSessionHandler.cc + 
${PROJECT_SOURCE_DIR}/src/auth/none/AuthNoneAuthorizeHandler.cc) +set(crimson_mgr_srcs + mgr/client.cc) +set(crimson_mon_srcs + mon/MonClient.cc + ${PROJECT_SOURCE_DIR}/src/mon/MonSub.cc) +set(crimson_net_srcs + ${PROJECT_SOURCE_DIR}/src/msg/async/crypto_onwire.cc + ${PROJECT_SOURCE_DIR}/src/msg/async/compression_onwire.cc + ${PROJECT_SOURCE_DIR}/src/msg/async/frames_v2.cc + net/Errors.cc + net/FrameAssemblerV2.cc + net/io_handler.cc + net/Messenger.cc + net/SocketConnection.cc + net/SocketMessenger.cc + net/Socket.cc + net/ProtocolV2.cc + net/chained_dispatchers.cc) +add_library(crimson STATIC + ${crimson_auth_srcs} + ${crimson_mgr_srcs} + ${crimson_mon_srcs} + ${crimson_net_srcs}) +target_compile_options(crimson PUBLIC + "-ftemplate-backtrace-limit=0") +set_target_properties(crimson PROPERTIES + JOB_POOL_COMPILE heavy_compile_job_pool) +target_link_libraries(crimson + PUBLIC + crimson-common + crimson::cflags) +add_subdirectory(admin) +add_subdirectory(os) +add_subdirectory(osd) +add_subdirectory(tools) diff --git a/src/crimson/admin/CMakeLists.txt b/src/crimson/admin/CMakeLists.txt new file mode 100644 index 000000000..36a5ae2a9 --- /dev/null +++ b/src/crimson/admin/CMakeLists.txt @@ -0,0 +1,9 @@ +add_library(crimson-admin STATIC + admin_socket.cc + osd_admin.cc + pg_commands.cc) +target_link_libraries(crimson-admin + crimson::cflags + Boost::MPL) +add_dependencies(crimson-admin + legacy-option-headers) diff --git a/src/crimson/admin/admin_socket.cc b/src/crimson/admin/admin_socket.cc new file mode 100644 index 000000000..9db91369a --- /dev/null +++ b/src/crimson/admin/admin_socket.cc @@ -0,0 +1,550 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "crimson/admin/admin_socket.h" + +#include <boost/algorithm/string/join.hpp> +#include <fmt/format.h> +#include <fmt/ranges.h> +#include <seastar/net/api.hh> +#include <seastar/net/inet_address.hh> +#include <seastar/core/future-util.hh> +#include <seastar/core/reactor.hh> +#include <seastar/core/sleep.hh> +#include <seastar/core/thread.hh> +#include <seastar/util/std-compat.hh> + +#include "common/options.h" +#include "common/version.h" +#include "messages/MCommand.h" +#include "messages/MCommandReply.h" +#include "crimson/common/log.h" +#include "crimson/net/Socket.h" + +using namespace crimson::common; +using namespace std::literals; +using ceph::common::cmdmap_from_json; +using ceph::common::cmd_getval; +using ceph::common::bad_cmd_get; +using ceph::common::validate_cmd; +using ceph::common::dump_cmd_and_help_to_json; + +namespace { +seastar::logger& logger() +{ + return crimson::get_logger(ceph_subsys_osd); +} +} // namespace + +using std::string; +using std::string_view; +using std::stringstream; +using std::unique_ptr; + +namespace crimson::admin { + +tell_result_t::tell_result_t(int ret, std::string&& err) + : ret{ret}, err(std::move(err)) +{} + +tell_result_t::tell_result_t(int ret, std::string&& err, ceph::bufferlist&& out) + : ret{ret}, err(std::move(err)), out(std::move(out)) +{} + +tell_result_t::tell_result_t(std::unique_ptr<Formatter> formatter) +{ + formatter->flush(out); +} + +void AdminSocket::register_command(std::unique_ptr<AdminSocketHook>&& hook) +{ + auto prefix = hook->prefix; + auto [it, added] = hooks.emplace(prefix, std::move(hook)); + assert(added); + logger().info("register_command(): {})", it->first); +} + +auto AdminSocket::parse_cmd(const std::vector<std::string>& cmd) + -> std::variant<parsed_command_t, tell_result_t> +{ + // preliminaries: + // 
- create the formatter specified by the cmd parameters + // - locate the "op-code" string (the 'prefix' segment) + // - prepare for command parameters extraction via cmdmap_t + cmdmap_t cmdmap; + ceph::bufferlist out; + + try { + stringstream errss; + // note that cmdmap_from_json() may throw on syntax issues + if (!cmdmap_from_json(cmd, &cmdmap, errss)) { + logger().error("{}: incoming command error: {}", __func__, errss.str()); + out.append("error:"s); + out.append(errss.str()); + return tell_result_t{-EINVAL, "invalid json", std::move(out)}; + } + } catch (const std::runtime_error& e) { + logger().error("{}: incoming command syntax: {}", __func__, cmd); + out.append(string{e.what()}); + return tell_result_t{-EINVAL, "invalid json", std::move(out)}; + } + + string format; + string prefix; + try { + cmd_getval(cmdmap, "format", format); + cmd_getval(cmdmap, "prefix", prefix); + // tolerate old-style pg <pgid> command <args> style formatting + if (prefix == "pg") { + cmd_getval(cmdmap, "cmd", prefix); + } + } catch (const bad_cmd_get& e) { + logger().error("{}: invalid syntax: {}", __func__, cmd); + out.append(string{e.what()}); + return tell_result_t{-EINVAL, "invalid json", std::move(out)}; + } + + // match the incoming op-code to one of the registered APIs + if (auto found = hooks.find(prefix); found != hooks.end()) { + return parsed_command_t{ cmdmap, format, *found->second }; + } else { + return tell_result_t{-EINVAL, + fmt::format("unknown command '{}'", prefix), + std::move(out)}; + } +} + +seastar::future<> AdminSocket::finalize_response( + seastar::output_stream<char>& out, ceph::bufferlist&& msgs) +{ + string outbuf_cont = msgs.to_str(); + if (outbuf_cont.empty()) { + outbuf_cont = " {} "; + } + uint32_t response_length = htonl(outbuf_cont.length()); + logger().info("asok response length: {}", outbuf_cont.length()); + + return out.write(reinterpret_cast<char*>(&response_length), + sizeof(response_length)) + .then([&out, outbuf_cont] { return out.write(outbuf_cont.c_str()); }); +} + + +seastar::future<> AdminSocket::handle_command(crimson::net::ConnectionRef conn, + boost::intrusive_ptr<MCommand> m) +{ + return execute_command(m->cmd, std::move(m->get_data())).then( + [conn, tid=m->get_tid()](auto result) { + auto [ret, err, out] = std::move(result); + auto reply = crimson::make_message<MCommandReply>(ret, err); + reply->set_tid(tid); + reply->set_data(out); + return conn->send(std::move(reply)); + }); +} + +seastar::future<> AdminSocket::execute_line(std::string cmdline, + seastar::output_stream<char>& out) +{ + return execute_command({std::move(cmdline)}, {}).then([&out, this](auto result) { + auto [ret, stderr, stdout] = std::move(result); + if (ret < 0) { + stdout.append(fmt::format("ERROR: {}\n", cpp_strerror(ret))); + stdout.append(stderr); + } + return finalize_response(out, std::move(stdout)); + }); +} + +auto AdminSocket::execute_command(const std::vector<std::string>& cmd, + ceph::bufferlist&& buf) + -> seastar::future<tell_result_t> +{ + auto maybe_parsed = parse_cmd(cmd); + if (auto* parsed = std::get_if<parsed_command_t>(&maybe_parsed); parsed) { + stringstream os; + string desc{parsed->hook.desc}; + if (!validate_cmd(desc, parsed->params, os)) { + logger().error("AdminSocket::execute_command: " + "failed to validate '{}': {}", cmd, os.str()); + ceph::bufferlist out; + out.append(os); + return seastar::make_ready_future<tell_result_t>( + tell_result_t{-EINVAL, "invalid command json", std::move(out)}); + } + return parsed->hook.call(parsed->params, parsed->format, 
std::move(buf)); + } else { + auto& result = std::get<tell_result_t>(maybe_parsed); + return seastar::make_ready_future<tell_result_t>(std::move(result)); + } +} + +// an input_stream consumer that reads buffer into a std::string up to the first +// '\0' which indicates the end of command +struct line_consumer { + using tmp_buf = seastar::temporary_buffer<char>; + using consumption_result_type = + typename seastar::input_stream<char>::consumption_result_type; + + seastar::future<consumption_result_type> operator()(tmp_buf&& buf) { + size_t consumed = 0; + for (auto c : buf) { + consumed++; + if (c == '\0') { + buf.trim_front(consumed); + return seastar::make_ready_future<consumption_result_type>( + consumption_result_type::stop_consuming_type(std::move(buf))); + } else { + line.push_back(c); + } + } + return seastar::make_ready_future<consumption_result_type>( + seastar::continue_consuming{}); + } + std::string line; +}; + +seastar::future<> AdminSocket::handle_client(seastar::input_stream<char>& in, + seastar::output_stream<char>& out) +{ + auto consumer = seastar::make_shared<line_consumer>(); + return in.consume(*consumer).then([consumer, &out, this] { + logger().debug("AdminSocket::handle_client: incoming asok string: {}", + consumer->line); + return execute_line(consumer->line, out); + }).then([&out] { + return out.flush(); + }).finally([&out] { + return out.close(); + }).then([&in] { + return in.close(); + }).handle_exception([](auto ep) { + logger().debug("exception on {}: {}", __func__, ep); + }); +} + +seastar::future<> AdminSocket::start(const std::string& path) +{ + if (path.empty()) { + logger().error( + "{}: Admin Socket socket path missing from the configuration", __func__); + return seastar::now(); + } + + logger().debug("{}: asok socket path={}", __func__, path); + auto sock_path = seastar::socket_address{ seastar::unix_domain_addr{ path } }; + try { + server_sock = seastar::engine().listen(sock_path); + } catch (const std::system_error& e) { + if (e.code() == std::errc::address_in_use) { + logger().debug("{}: Admin Socket socket path={} already exists, retrying", + __func__, path); + return seastar::remove_file(path).then([this, path] { + server_sock.reset(); + return start(path); + }); + } + logger().error("{}: unable to listen({}): {}", __func__, path, e.what()); + server_sock.reset(); + return seastar::make_ready_future<>(); + } + // listen in background + task = seastar::keep_doing([this] { + return seastar::try_with_gate(stop_gate, [this] { + assert(!connected_sock.has_value()); + return server_sock->accept().then([this](seastar::accept_result acc) { + connected_sock = std::move(acc.connection); + return seastar::do_with(connected_sock->input(), + connected_sock->output(), + [this](auto& input, auto& output) mutable { + return handle_client(input, output); + }).finally([this] { + assert(connected_sock.has_value()); + connected_sock.reset(); + }); + }).handle_exception([this](auto ep) { + if (!stop_gate.is_closed()) { + logger().error("AdminSocket: terminated: {}", ep); + } + }); + }); + }).handle_exception_type([](const seastar::gate_closed_exception&) { + }).finally([path] { + return seastar::remove_file(path); + }); + return seastar::make_ready_future<>(); +} + +seastar::future<> AdminSocket::stop() +{ + if (!server_sock) { + return seastar::now(); + } + server_sock->abort_accept(); + if (connected_sock) { + connected_sock->shutdown_input(); + connected_sock->shutdown_output(); + } + return stop_gate.close().then([this] { + assert(task.has_value()); + return 
task->then([] { + logger().info("AdminSocket: stopped"); + return seastar::now(); + }); + }); +} + +///////////////////////////////////////// +// the internal hooks +///////////////////////////////////////// + +class VersionHook final : public AdminSocketHook { + public: + VersionHook() + : AdminSocketHook{"version", "", "get ceph version"} + {} + seastar::future<tell_result_t> call(const cmdmap_t&, + std::string_view format, + ceph::bufferlist&&) const final + { + unique_ptr<Formatter> f{Formatter::create(format, "json-pretty", "json-pretty")}; + f->open_object_section("version"); + f->dump_string("version", ceph_version_to_str()); + f->dump_string("release", ceph_release_to_str()); + f->dump_string("release_type", ceph_release_type()); + f->close_section(); + return seastar::make_ready_future<tell_result_t>(std::move(f)); + } +}; + +/** + Note that the git_version command is expected to return a 'version' JSON + segment. +*/ +class GitVersionHook final : public AdminSocketHook { + public: + GitVersionHook() + : AdminSocketHook{"git_version", "", "get git sha1"} + {} + seastar::future<tell_result_t> call(const cmdmap_t&, + std::string_view format, + ceph::bufferlist&&) const final + { + unique_ptr<Formatter> f{Formatter::create(format, "json-pretty", "json-pretty")}; + f->open_object_section("version"); + f->dump_string("git_version", git_version_to_str()); + f->close_section(); + return seastar::make_ready_future<tell_result_t>(std::move(f)); + } +}; + +class HelpHook final : public AdminSocketHook { + const AdminSocket& m_as; + + public: + explicit HelpHook(const AdminSocket& as) : + AdminSocketHook{"help", "", "list available commands"}, + m_as{as} + {} + + seastar::future<tell_result_t> call(const cmdmap_t&, + std::string_view format, + ceph::bufferlist&&) const final + { + unique_ptr<Formatter> f{Formatter::create(format, + "json-pretty", "json-pretty")}; + f->open_object_section("help"); + for (const auto& [prefix, hook] : m_as) { + if (!hook->help.empty()) { + f->dump_string(prefix.data(), hook->help); + } + } + f->close_section(); + return seastar::make_ready_future<tell_result_t>(std::move(f)); + } +}; + +class GetdescsHook final : public AdminSocketHook { + const AdminSocket& m_as; + + public: + explicit GetdescsHook(const AdminSocket& as) : + AdminSocketHook{"get_command_descriptions", + "", + "list available commands"}, + m_as{ as } {} + + seastar::future<tell_result_t> call(const cmdmap_t& cmdmap, + std::string_view format, + ceph::bufferlist&&) const final + { + unique_ptr<Formatter> f{Formatter::create(format, "json-pretty", "json-pretty")}; + int cmdnum = 0; + f->open_object_section("command_descriptions"); + for (const auto& [prefix, hook] : m_as) { + auto secname = fmt::format("cmd {:>03}", cmdnum); + auto cmd = fmt::format("{} {}", hook->prefix, hook->desc); + dump_cmd_and_help_to_json(f.get(), CEPH_FEATURES_ALL, secname, + cmd, std::string{hook->help}); + cmdnum++; + } + f->close_section(); + return seastar::make_ready_future<tell_result_t>(std::move(f)); + } +}; + +class InjectArgsHook final : public AdminSocketHook { +public: + InjectArgsHook() + : AdminSocketHook{"injectargs", + "name=injected_args,type=CephString,n=N", + "inject configuration arguments into running daemon"} + {} + seastar::future<tell_result_t> call(const cmdmap_t& cmdmap, + std::string_view format, + ceph::bufferlist&&) const final + { + std::vector<std::string> argv; + if (!cmd_getval(cmdmap, "injected_args", argv)) { + return seastar::make_ready_future<tell_result_t>(); + } + const std::string 
args = boost::algorithm::join(argv, " "); + return local_conf().inject_args(args).then([] { + return seastar::make_ready_future<tell_result_t>(); + }).handle_exception_type([] (const std::invalid_argument& e) { + return seastar::make_ready_future<tell_result_t>( + tell_result_t{-EINVAL, e.what()}); + }); + } +}; + +/** + * listing the configuration values + */ +class ConfigShowHook : public AdminSocketHook { +public: + ConfigShowHook() : + AdminSocketHook{"config show", + "", + "dump current config settings"} + {} + seastar::future<tell_result_t> call(const cmdmap_t&, + std::string_view format, + ceph::bufferlist&& input) const final + { + unique_ptr<Formatter> f{Formatter::create(format, "json-pretty", "json-pretty")}; + f->open_object_section("config_show"); + local_conf().show_config(f.get()); + f->close_section(); + return seastar::make_ready_future<tell_result_t>(std::move(f)); + } +}; + +/** + * fetching the value of a specific configuration item + */ +class ConfigGetHook : public AdminSocketHook { +public: + ConfigGetHook() : + AdminSocketHook("config get", + "name=var,type=CephString", + "config get <field>: get the config value") + {} + seastar::future<tell_result_t> call(const cmdmap_t& cmdmap, + std::string_view format, + ceph::bufferlist&& input) const final + { + std::string var; + [[maybe_unused]] bool found = cmd_getval(cmdmap, "var", var); + assert(found); + std::string conf_val; + if (int r = local_conf().get_val(var, &conf_val); r < 0) { + return seastar::make_ready_future<tell_result_t>( + tell_result_t{r, fmt::format("error getting {}: {}", + var, cpp_strerror(r))}); + } + unique_ptr<Formatter> f{Formatter::create(format, + "json-pretty", + "json-pretty")}; + f->open_object_section("config_get"); + f->dump_string(var, conf_val); + f->close_section(); + return seastar::make_ready_future<tell_result_t>(std::move(f)); + } +}; + +/** + * setting the value of a specific configuration item (an example: + * {"prefix": "config set", "var":"debug_osd", "val": ["30/20"]} ) + */ +class ConfigSetHook : public AdminSocketHook { +public: + ConfigSetHook() + : AdminSocketHook("config set", + "name=var,type=CephString " + "name=val,type=CephString,n=N", + "config set <field> <val> [<val> ...]: set a config variable") + {} + seastar::future<tell_result_t> call(const cmdmap_t& cmdmap, + std::string_view format, + ceph::bufferlist&&) const final + { + std::string var; + std::vector<std::string> new_val; + cmd_getval(cmdmap, "var", var); + cmd_getval(cmdmap, "val", new_val); + // val may be multiple words + const std::string joined_values = boost::algorithm::join(new_val, " "); + return local_conf().set_val(var, joined_values).then([format] { + unique_ptr<Formatter> f{Formatter::create(format, "json-pretty", "json-pretty")}; + f->open_object_section("config_set"); + f->dump_string("success", ""); + f->close_section(); + return seastar::make_ready_future<tell_result_t>(std::move(f)); + }).handle_exception_type([](std::invalid_argument& e) { + return seastar::make_ready_future<tell_result_t>( + tell_result_t{-EINVAL, e.what()}); + }); + } +}; + +/** + * listing the configuration values + */ +class ConfigHelpHook : public AdminSocketHook { +public: + ConfigHelpHook() : + AdminSocketHook{"config help", + "", + "get config setting schema and descriptions"} + {} + seastar::future<tell_result_t> call(const cmdmap_t&, + std::string_view format, + ceph::bufferlist&& input) const final + { + unique_ptr<Formatter> f{Formatter::create(format, "json-pretty", "json-pretty")}; + // Output all + 
f->open_array_section("options"); + for (const auto &option : ceph_options) { + f->dump_object("option", option); + } + f->close_section(); + return seastar::make_ready_future<tell_result_t>(std::move(f)); + } +}; + +/// the hooks that are served directly by the admin_socket server +void AdminSocket::register_admin_commands() +{ + register_command(std::make_unique<VersionHook>()); + register_command(std::make_unique<GitVersionHook>()); + register_command(std::make_unique<HelpHook>(*this)); + register_command(std::make_unique<GetdescsHook>(*this)); + register_command(std::make_unique<ConfigGetHook>()); + register_command(std::make_unique<ConfigSetHook>()); + register_command(std::make_unique<ConfigShowHook>()); + register_command(std::make_unique<ConfigHelpHook>()); + register_command(std::make_unique<InjectArgsHook>()); +} + +} // namespace crimson::admin diff --git a/src/crimson/admin/admin_socket.h b/src/crimson/admin/admin_socket.h new file mode 100644 index 000000000..8bf9fd4d3 --- /dev/null +++ b/src/crimson/admin/admin_socket.h @@ -0,0 +1,187 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#pragma once + +/** + A Crimson-wise version of the src/common/admin_socket.h + + Note: assumed to be running on a single core. +*/ +#include <map> +#include <string> +#include <string_view> + +#include <seastar/core/future.hh> +#include <seastar/core/gate.hh> +#include <seastar/core/iostream.hh> +#include <seastar/core/shared_mutex.hh> +#include <seastar/core/shared_ptr.hh> +#include <seastar/net/api.hh> + +#include "common/cmdparse.h" +#include "include/buffer.h" +#include "crimson/net/Fwd.h" + +class MCommand; + +namespace crimson::admin { + +class AdminSocket; + +struct tell_result_t { + int ret = 0; + std::string err; + ceph::bufferlist out; + tell_result_t() = default; + tell_result_t(int ret, std::string&& err); + tell_result_t(int ret, std::string&& err, ceph::bufferlist&& out); + /** + * create a \c tell_result_t indicating the successful completion + * of command + * + * \param formatter the content of formatter will be flushed to the + * output buffer + */ + tell_result_t(std::unique_ptr<Formatter> formatter); +}; + +/** + * An abstract class to be inherited by implementations of asock hooks + */ +class AdminSocketHook { + public: + AdminSocketHook(std::string_view prefix, + std::string_view desc, + std::string_view help) : + prefix{prefix}, desc{desc}, help{help} + {} + /** + * handle command defined by cmdmap + * + * \param cmdmap dictionary holding the named parameters + * \param format the expected format of the output + * \param input the binary input of the command + * \pre \c cmdmap should be validated with \c desc + * \retval an instance of \c tell_result_t + * \note a negative \c ret should be set to indicate that the hook fails to + * fulfill the command either because of an invalid input or other + * failures. 
in that case, a brief reason of the failure should + * noted in \c err in the returned value + */ + virtual seastar::future<tell_result_t> call(const cmdmap_t& cmdmap, + std::string_view format, + ceph::bufferlist&& input) const = 0; + virtual ~AdminSocketHook() {} + const std::string_view prefix; + const std::string_view desc; + const std::string_view help; +}; + +class AdminSocket : public seastar::enable_lw_shared_from_this<AdminSocket> { + public: + AdminSocket() = default; + ~AdminSocket() = default; + + AdminSocket(const AdminSocket&) = delete; + AdminSocket& operator=(const AdminSocket&) = delete; + AdminSocket(AdminSocket&&) = delete; + AdminSocket& operator=(AdminSocket&&) = delete; + + /** + * create the async Seastar thread that handles asok commands arriving + * over the socket. + */ + seastar::future<> start(const std::string& path); + + seastar::future<> stop(); + + /** + * register an admin socket hook + * + * Commands (APIs) are registered under a command string. Incoming + * commands are split by spaces and matched against the longest + * registered command. For example, if 'foo' and 'foo bar' are + * registered, and an incoming command is 'foo bar baz', it is + * matched with 'foo bar', while 'foo fud' will match 'foo'. + * + * \param hook a hook which includes its identifying command string, the + * expected call syntax, and some help text. + * + * A note regarding the help text: if empty, command will not be + * included in 'help' output. + */ + void register_command(std::unique_ptr<AdminSocketHook>&& hook); + + /** + * Registering the APIs that are served directly by the admin_socket server. + */ + void register_admin_commands(); + /** + * handle a command message by replying an MCommandReply with the same tid + * + * \param conn connection over which the incoming command message is received + * \param m message carrying the command vector and optional input buffer + */ + seastar::future<> handle_command(crimson::net::ConnectionRef conn, + boost::intrusive_ptr<MCommand> m); + +private: + /** + * the result of analyzing an incoming command, and locating it in + * the registered APIs collection. 
+ */ + struct parsed_command_t { + cmdmap_t params; + std::string format; + const AdminSocketHook& hook; + }; + // and the shorthand: + seastar::future<> handle_client(seastar::input_stream<char>& inp, + seastar::output_stream<char>& out); + + seastar::future<> execute_line(std::string cmdline, + seastar::output_stream<char>& out); + + seastar::future<> finalize_response(seastar::output_stream<char>& out, + ceph::bufferlist&& msgs); + + seastar::future<tell_result_t> execute_command(const std::vector<std::string>& cmd, + ceph::bufferlist&& buf); + + std::optional<seastar::future<>> task; + std::optional<seastar::server_socket> server_sock; + std::optional<seastar::connected_socket> connected_sock; + + /** + * stopping incoming ASOK requests at shutdown + */ + seastar::gate stop_gate; + + /** + * parse the incoming command vector, find a registered hook by looking up by + * its prefix, perform sanity checks on the parsed parameters with the hook's + * command description + * + * \param cmd a vector of string which presents a command + * \retval on success, a \c parsed_command_t is returned, tell_result_t with + * detailed error messages is returned otherwise + */ + std::variant<parsed_command_t, tell_result_t> + parse_cmd(const std::vector<std::string>& cmd); + + using hooks_t = std::map<std::string_view, std::unique_ptr<AdminSocketHook>>; + hooks_t hooks; + +public: + /** + * iterator support + */ + hooks_t::const_iterator begin() const { + return hooks.cbegin(); + } + hooks_t::const_iterator end() const { + return hooks.cend(); + } +}; + +} // namespace crimson::admin diff --git a/src/crimson/admin/osd_admin.cc b/src/crimson/admin/osd_admin.cc new file mode 100644 index 000000000..0436e5184 --- /dev/null +++ b/src/crimson/admin/osd_admin.cc @@ -0,0 +1,574 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "crimson/admin/osd_admin.h" +#include <string> +#include <string_view> + +#include <fmt/format.h> +#include <seastar/core/do_with.hh> +#include <seastar/core/future.hh> +#include <seastar/core/thread.hh> +#include <seastar/core/scollectd_api.hh> + +#include "common/config.h" +#include "crimson/admin/admin_socket.h" +#include "crimson/common/log.h" +#include "crimson/osd/exceptions.h" +#include "crimson/osd/osd.h" +#include "crimson/osd/pg.h" +#include "crimson/osd/shard_services.h" + +namespace { +seastar::logger& logger() +{ + return crimson::get_logger(ceph_subsys_osd); +} +} // namespace + +using namespace std::literals; +using std::string_view; +using std::unique_ptr; +using crimson::osd::OSD; +using crimson::common::local_conf; +using namespace crimson::common; +using ceph::common::cmd_getval; +using ceph::common::cmd_getval_or; + +namespace crimson::admin { + +template <class Hook, class... Args> +std::unique_ptr<AdminSocketHook> make_asok_hook(Args&&... 
args) +{ + return std::make_unique<Hook>(std::forward<Args>(args)...); +} + +/** + * An OSD admin hook: OSD status + */ +class OsdStatusHook : public AdminSocketHook { +public: + explicit OsdStatusHook(const crimson::osd::OSD& osd) : + AdminSocketHook{"status", "", "OSD status"}, + osd(osd) + {} + seastar::future<tell_result_t> call(const cmdmap_t&, + std::string_view format, + ceph::bufferlist&& input) const final + { + unique_ptr<Formatter> f{Formatter::create(format, "json-pretty", "json-pretty")}; + f->open_object_section("status"); + osd.dump_status(f.get()); + f->close_section(); + return seastar::make_ready_future<tell_result_t>(std::move(f)); + } +private: + const crimson::osd::OSD& osd; +}; +template std::unique_ptr<AdminSocketHook> +make_asok_hook<OsdStatusHook>(const crimson::osd::OSD& osd); + +/** + * An OSD admin hook: send beacon + */ +class SendBeaconHook : public AdminSocketHook { +public: + explicit SendBeaconHook(crimson::osd::OSD& osd) : + AdminSocketHook{"send_beacon", + "", + "send OSD beacon to mon immediately"}, + osd(osd) + {} + seastar::future<tell_result_t> call(const cmdmap_t&, + std::string_view format, + ceph::bufferlist&& input) const final + { + return osd.send_beacon().then([] { + return seastar::make_ready_future<tell_result_t>(); + }); + } +private: + crimson::osd::OSD& osd; +}; +template std::unique_ptr<AdminSocketHook> +make_asok_hook<SendBeaconHook>(crimson::osd::OSD& osd); + +/** + * send the latest pg stats to mgr + */ +class FlushPgStatsHook : public AdminSocketHook { +public: + explicit FlushPgStatsHook(crimson::osd::OSD& osd) : + AdminSocketHook("flush_pg_stats", + "", + "flush pg stats"), + osd{osd} + {} + seastar::future<tell_result_t> call(const cmdmap_t&, + std::string_view format, + ceph::bufferlist&& input) const final + { + uint64_t seq = osd.send_pg_stats(); + unique_ptr<Formatter> f{Formatter::create(format, "json-pretty", "json-pretty")}; + f->dump_unsigned("stat_seq", seq); + return seastar::make_ready_future<tell_result_t>(std::move(f)); + } + +private: + crimson::osd::OSD& osd; +}; +template std::unique_ptr<AdminSocketHook> make_asok_hook<FlushPgStatsHook>(crimson::osd::OSD& osd); + +/// dump the history of PGs' peering state +class DumpPGStateHistory final: public AdminSocketHook { +public: + explicit DumpPGStateHistory(const crimson::osd::PGShardManager &pg_shard_manager) : + AdminSocketHook{"dump_pgstate_history", + "", + "dump history of PGs' peering state"}, + pg_shard_manager{pg_shard_manager} + {} + seastar::future<tell_result_t> call(const cmdmap_t&, + std::string_view format, + ceph::bufferlist&& input) const final + { + std::unique_ptr<Formatter> fref{ + Formatter::create(format, "json-pretty", "json-pretty")}; + Formatter *f = fref.get(); + f->open_object_section("pgstate_history"); + f->open_array_section("pgs"); + return pg_shard_manager.for_each_pg([f](auto &pgid, auto &pg) { + f->open_object_section("pg"); + f->dump_stream("pg") << pgid; + const auto& peering_state = pg->get_peering_state(); + f->dump_string("currently", peering_state.get_current_state()); + peering_state.dump_history(f); + f->close_section(); + }).then([fref=std::move(fref)]() mutable { + fref->close_section(); + fref->close_section(); + return seastar::make_ready_future<tell_result_t>(std::move(fref)); + }); + } + +private: + const crimson::osd::PGShardManager &pg_shard_manager; +}; +template std::unique_ptr<AdminSocketHook> make_asok_hook<DumpPGStateHistory>( + const crimson::osd::PGShardManager &); + +//dump the contents of perfcounters in osd and 
store +class DumpPerfCountersHook final: public AdminSocketHook { +public: + explicit DumpPerfCountersHook() : + AdminSocketHook{"perfcounters_dump", + "name=logger,type=CephString,req=false " + "name=counter,type=CephString,req=false", + "dump perfcounters in osd and store"} + {} + seastar::future<tell_result_t> call(const cmdmap_t& cmdmap, + std::string_view format, + ceph::bufferlist&& input) const final + { + std::unique_ptr<Formatter> f{Formatter::create(format, + "json-pretty", + "json-pretty")}; + std::string logger; + std::string counter; + cmd_getval(cmdmap, "logger", logger); + cmd_getval(cmdmap, "counter", counter); + + crimson::common::local_perf_coll().dump_formatted(f.get(), false, false, logger, counter); + return seastar::make_ready_future<tell_result_t>(std::move(f)); + } +}; +template std::unique_ptr<AdminSocketHook> make_asok_hook<DumpPerfCountersHook>(); + + + +/** + * A CephContext admin hook: calling assert (if allowed by + * 'debug_asok_assert_abort') + */ +class AssertAlwaysHook : public AdminSocketHook { +public: + AssertAlwaysHook() : + AdminSocketHook{"assert", + "", + "asserts"} + {} + seastar::future<tell_result_t> call(const cmdmap_t&, + std::string_view format, + ceph::bufferlist&& input) const final + { + if (local_conf().get_val<bool>("debug_asok_assert_abort")) { + ceph_assert_always(0); + return seastar::make_ready_future<tell_result_t>(); + } else { + return seastar::make_ready_future<tell_result_t>( + tell_result_t{-EPERM, "configuration set to disallow asok assert"}); + } + } +}; +template std::unique_ptr<AdminSocketHook> make_asok_hook<AssertAlwaysHook>(); + +/** + * A Seastar admin hook: fetching the values of configured metrics + */ +class DumpMetricsHook : public AdminSocketHook { +public: + DumpMetricsHook() : + AdminSocketHook("dump_metrics", + "name=group,type=CephString,req=false", + "dump current configured seastar metrics and their values") + {} + seastar::future<tell_result_t> call(const cmdmap_t& cmdmap, + std::string_view format, + ceph::bufferlist&& input) const final + { + std::unique_ptr<Formatter> fref{Formatter::create(format, "json-pretty", "json-pretty")}; + auto *f = fref.get(); + std::string prefix; + cmd_getval(cmdmap, "group", prefix); + f->open_object_section("metrics"); + f->open_array_section("metrics"); + return seastar::do_with(std::move(prefix), [f](auto &prefix) { + return crimson::reactor_map_seq([f, &prefix] { + for (const auto& [full_name, metric_family]: seastar::scollectd::get_value_map()) { + if (!prefix.empty() && full_name.compare(0, prefix.size(), prefix) != 0) { + continue; + } + for (const auto& [labels, metric] : metric_family) { + if (metric && metric->is_enabled()) { + f->open_object_section(""); // enclosed by array + DumpMetricsHook::dump_metric_value(f, full_name, *metric, labels); + f->close_section(); + } + } + } + }); + }).then([fref = std::move(fref)]() mutable { + fref->close_section(); + fref->close_section(); + return seastar::make_ready_future<tell_result_t>(std::move(fref)); + }); + } +private: + using registered_metric = seastar::metrics::impl::registered_metric; + using data_type = seastar::metrics::impl::data_type; + + static void dump_metric_value(Formatter* f, + string_view full_name, + const registered_metric& metric, + const seastar::metrics::impl::labels_type& labels) + { + f->open_object_section(full_name); + for (const auto& [key, value] : labels) { + f->dump_string(key, value); + } + auto value_name = "value"; + switch (auto v = metric(); v.type()) { + case data_type::GAUGE: + 
f->dump_float(value_name, v.d()); + break; + case data_type::REAL_COUNTER: + f->dump_float(value_name, v.d()); + break; + case data_type::COUNTER: + double val; + try { + val = v.ui(); + } catch (std::range_error&) { + // seastar's cpu steal time may be negative + val = 0; + } + f->dump_unsigned(value_name, val); + break; + case data_type::HISTOGRAM: { + f->open_object_section(value_name); + auto&& h = v.get_histogram(); + f->dump_float("sum", h.sample_sum); + f->dump_unsigned("count", h.sample_count); + f->open_array_section("buckets"); + for (auto i : h.buckets) { + f->open_object_section("bucket"); + f->dump_float("le", i.upper_bound); + f->dump_unsigned("count", i.count); + f->close_section(); // "bucket" + } + { + f->open_object_section("bucket"); + f->dump_string("le", "+Inf"); + f->dump_unsigned("count", h.sample_count); + f->close_section(); + } + f->close_section(); // "buckets" + f->close_section(); // value_name + } + break; + default: + std::abort(); + break; + } + f->close_section(); // full_name + } +}; +template std::unique_ptr<AdminSocketHook> make_asok_hook<DumpMetricsHook>(); + + +static ghobject_t test_ops_get_object_name( + const OSDMap& osdmap, + const cmdmap_t& cmdmap) +{ + auto pool = [&] { + auto pool_arg = cmd_getval<std::string>(cmdmap, "pool"); + if (!pool_arg) { + throw std::invalid_argument{"No 'pool' specified"}; + } + int64_t pool = osdmap.lookup_pg_pool_name(*pool_arg); + if (pool < 0 && std::isdigit((*pool_arg)[0])) { + pool = std::atoll(pool_arg->c_str()); + } + if (pool < 0) { + // the return type of `fmt::format` is `std::string` + throw std::invalid_argument{ + fmt::format("Invalid pool '{}'", *pool_arg) + }; + } + return pool; + }(); + + auto [ objname, nspace, raw_pg ] = [&] { + auto obj_arg = cmd_getval<std::string>(cmdmap, "objname"); + if (!obj_arg) { + throw std::invalid_argument{"No 'objname' specified"}; + } + std::string objname, nspace; + if (std::size_t sep_pos = obj_arg->find_first_of('/'); + sep_pos != obj_arg->npos) { + nspace = obj_arg->substr(0, sep_pos); + objname = obj_arg->substr(sep_pos+1); + } else { + objname = *obj_arg; + } + pg_t raw_pg; + if (object_locator_t oloc(pool, nspace); + osdmap.object_locator_to_pg(object_t(objname), oloc, raw_pg) < 0) { + throw std::invalid_argument{"Invalid namespace/objname"}; + } + return std::make_tuple(std::move(objname), + std::move(nspace), + std::move(raw_pg)); + }(); + + auto shard_id = cmd_getval_or<int64_t>(cmdmap, + "shardid", + shard_id_t::NO_SHARD); + + return ghobject_t{ + hobject_t{ + object_t{objname}, std::string{}, CEPH_NOSNAP, raw_pg.ps(), pool, nspace + }, + ghobject_t::NO_GEN, + shard_id_t{static_cast<int8_t>(shard_id)} + }; +} + +// Usage: +// injectdataerr <pool> [namespace/]<obj-name> [shardid] +class InjectDataErrorHook : public AdminSocketHook { +public: + InjectDataErrorHook(crimson::osd::ShardServices& shard_services) : + AdminSocketHook("injectdataerr", + "name=pool,type=CephString " \ + "name=objname,type=CephObjectname " \ + "name=shardid,type=CephInt,req=false,range=0|255", + "inject data error to an object"), + shard_services(shard_services) { + } + + seastar::future<tell_result_t> call(const cmdmap_t& cmdmap, + std::string_view format, + ceph::bufferlist&& input) const final + { + ghobject_t obj; + try { + obj = test_ops_get_object_name(*shard_services.get_map(), cmdmap); + } catch (const std::invalid_argument& e) { + logger().info("error during data error injection: {}", e.what()); + return seastar::make_ready_future<tell_result_t>(-EINVAL, + e.what()); + } + return 
shard_services.get_store().inject_data_error(obj).then([=] { + logger().info("successfully injected data error for obj={}", obj); + ceph::bufferlist bl; + bl.append("ok"sv); + return seastar::make_ready_future<tell_result_t>(0, + std::string{}, // no err + std::move(bl)); + }); + } + +private: + crimson::osd::ShardServices& shard_services; +}; +template std::unique_ptr<AdminSocketHook> make_asok_hook<InjectDataErrorHook>( + crimson::osd::ShardServices&); + + +// Usage: +// injectmdataerr <pool> [namespace/]<obj-name> [shardid] +class InjectMDataErrorHook : public AdminSocketHook { +public: + InjectMDataErrorHook(crimson::osd::ShardServices& shard_services) : + AdminSocketHook("injectmdataerr", + "name=pool,type=CephString " \ + "name=objname,type=CephObjectname " \ + "name=shardid,type=CephInt,req=false,range=0|255", + "inject data error to an object"), + shard_services(shard_services) { + } + + seastar::future<tell_result_t> call(const cmdmap_t& cmdmap, + std::string_view format, + ceph::bufferlist&& input) const final + { + ghobject_t obj; + try { + obj = test_ops_get_object_name(*shard_services.get_map(), cmdmap); + } catch (const std::invalid_argument& e) { + logger().info("error during metadata error injection: {}", e.what()); + return seastar::make_ready_future<tell_result_t>(-EINVAL, + e.what()); + } + return shard_services.get_store().inject_mdata_error(obj).then([=] { + logger().info("successfully injected metadata error for obj={}", obj); + ceph::bufferlist bl; + bl.append("ok"sv); + return seastar::make_ready_future<tell_result_t>(0, + std::string{}, // no err + std::move(bl)); + }); + } + +private: + crimson::osd::ShardServices& shard_services; +}; +template std::unique_ptr<AdminSocketHook> make_asok_hook<InjectMDataErrorHook>( + crimson::osd::ShardServices&); + + +/** + * An InFlightOps admin hook: dump current in-flight operations + */ +class DumpInFlightOpsHook : public AdminSocketHook { +public: + explicit DumpInFlightOpsHook(const crimson::osd::PGShardManager &pg_shard_manager) : + AdminSocketHook{"dump_ops_in_flight", "", "show the ops currently in flight"}, + pg_shard_manager(pg_shard_manager) + {} + seastar::future<tell_result_t> call(const cmdmap_t&, + std::string_view format, + ceph::bufferlist&& input) const final + { + unique_ptr<Formatter> fref{ + Formatter::create(format, "json-pretty", "json-pretty")}; + auto *f = fref.get(); + f->open_object_section("ops_in_flight"); + f->open_array_section("ops_in_flight"); + return pg_shard_manager.invoke_on_each_shard_seq([f](const auto &shard_services) { + return shard_services.dump_ops_in_flight(f); + }).then([fref=std::move(fref)]() mutable { + fref->close_section(); + fref->close_section(); + return seastar::make_ready_future<tell_result_t>(std::move(fref)); + }); + } +private: + const crimson::osd::PGShardManager &pg_shard_manager; +}; +template std::unique_ptr<AdminSocketHook> +make_asok_hook<DumpInFlightOpsHook>(const crimson::osd::PGShardManager &); + + +class DumpHistoricOpsHook : public AdminSocketHook { +public: + explicit DumpHistoricOpsHook(const crimson::osd::OSDOperationRegistry& op_registry) : + AdminSocketHook{"dump_historic_ops", "", "show recent ops"}, + op_registry(op_registry) + {} + seastar::future<tell_result_t> call(const cmdmap_t&, + std::string_view format, + ceph::bufferlist&& input) const final + { + unique_ptr<Formatter> f{Formatter::create(format, "json-pretty", "json-pretty")}; + f->open_object_section("historic_ops"); + op_registry.dump_historic_client_requests(f.get()); + f->close_section(); + 
f->dump_int("num_ops", 0); + return seastar::make_ready_future<tell_result_t>(std::move(f)); + } +private: + const crimson::osd::OSDOperationRegistry& op_registry; +}; +template std::unique_ptr<AdminSocketHook> +make_asok_hook<DumpHistoricOpsHook>(const crimson::osd::OSDOperationRegistry& op_registry); + + +class DumpSlowestHistoricOpsHook : public AdminSocketHook { +public: + explicit DumpSlowestHistoricOpsHook(const crimson::osd::OSDOperationRegistry& op_registry) : + AdminSocketHook{"dump_historic_slow_ops", "", "show slowest recent ops"}, + op_registry(op_registry) + {} + seastar::future<tell_result_t> call(const cmdmap_t&, + std::string_view format, + ceph::bufferlist&& input) const final + { + logger().warn("{}", __func__); + unique_ptr<Formatter> f{Formatter::create(format, "json-pretty", "json-pretty")}; + f->open_object_section("historic_slow_ops"); + op_registry.dump_slowest_historic_client_requests(f.get()); + f->close_section(); + f->dump_int("num_ops", 0); + return seastar::make_ready_future<tell_result_t>(std::move(f)); + } +private: + const crimson::osd::OSDOperationRegistry& op_registry; +}; +template std::unique_ptr<AdminSocketHook> +make_asok_hook<DumpSlowestHistoricOpsHook>(const crimson::osd::OSDOperationRegistry& op_registry); + +class DumpRecoveryReservationsHook : public AdminSocketHook { +public: + explicit DumpRecoveryReservationsHook(crimson::osd::ShardServices& shard_services) : + AdminSocketHook{"dump_recovery_reservations", "", "show recovery reservations"}, + shard_services(shard_services) + {} + seastar::future<tell_result_t> call(const cmdmap_t&, + std::string_view format, + ceph::bufferlist&& input) const final + { + logger().debug("{}", __func__); + unique_ptr<Formatter> f{Formatter::create(format, "json-pretty", "json-pretty")}; + return seastar::do_with(std::move(f), [this](auto&& f) { + f->open_object_section("reservations"); + f->open_object_section("local_reservations"); + return shard_services.local_dump_reservations(f.get()).then([&f, this] { + f->close_section(); + f->open_object_section("remote_reservations"); + return shard_services.remote_dump_reservations(f.get()).then([&f] { + f->close_section(); + f->close_section(); + return seastar::make_ready_future<tell_result_t>(std::move(f)); + }); + }); + }); + } +private: + crimson::osd::ShardServices& shard_services; +}; +template std::unique_ptr<AdminSocketHook> +make_asok_hook<DumpRecoveryReservationsHook>(crimson::osd::ShardServices& shard_services); + +} // namespace crimson::admin diff --git a/src/crimson/admin/osd_admin.h b/src/crimson/admin/osd_admin.h new file mode 100644 index 000000000..a3ddd66b9 --- /dev/null +++ b/src/crimson/admin/osd_admin.h @@ -0,0 +1,28 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#pragma once + +#include <memory> + +namespace crimson::admin { + +class AdminSocketHook; + +class AssertAlwaysHook; +class DumpMetricsHook; +class DumpPGStateHistory; +class DumpPerfCountersHook; +class FlushPgStatsHook; +class InjectDataErrorHook; +class InjectMDataErrorHook; +class OsdStatusHook; +class SendBeaconHook; +class DumpInFlightOpsHook; +class DumpHistoricOpsHook; +class DumpSlowestHistoricOpsHook; +class DumpRecoveryReservationsHook; + +template<class Hook, class... Args> +std::unique_ptr<AdminSocketHook> make_asok_hook(Args&&... 
args); + +} // namespace crimson::admin diff --git a/src/crimson/admin/pg_commands.cc b/src/crimson/admin/pg_commands.cc new file mode 100644 index 000000000..f2c84b254 --- /dev/null +++ b/src/crimson/admin/pg_commands.cc @@ -0,0 +1,167 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "crimson/admin/pg_commands.h" + +#include <memory> +#include <string> +#include <string_view> + +#include <fmt/format.h> +#include <seastar/core/future.hh> + +#include "crimson/admin/admin_socket.h" +#include "crimson/osd/osd.h" +#include "crimson/osd/pg.h" + + +using crimson::osd::OSD; +using crimson::osd::PG; +using namespace crimson::common; +using ceph::common::cmd_getval; + + +namespace crimson::admin::pg { + +class PGCommand : public AdminSocketHook { +public: + // TODO: const correctness of osd + PGCommand(crimson::osd::OSD& osd, + std::string_view prefix, + std::string_view desc, + std::string_view help) + : AdminSocketHook{prefix, desc, help}, osd {osd} + {} + seastar::future<tell_result_t> call(const cmdmap_t& cmdmap, + std::string_view format, + ceph::bufferlist&& input) const final + { + // we have "ceph tell <pgid> <cmd>". and it is the ceph cli's responsibility + // to add "pgid" to the cmd dict. as rados_pg_command() does not set it for + // us. moreover, and "pgid" is not listed in the command description, as user + // command format does not follow the convention of "<prefix> [<args>,...]" + // so we have to verify it on the server side. + std::string pgid_str; + pg_t pgid; + if (!cmd_getval(cmdmap, "pgid", pgid_str)) { + return seastar::make_ready_future<tell_result_t>( + tell_result_t{-EINVAL, "no pgid specified"}); + } else if (!pgid.parse(pgid_str.c_str())) { + return seastar::make_ready_future<tell_result_t>( + tell_result_t{-EINVAL, fmt::format("couldn't parse pgid '{}'", pgid_str)}); + } + // am i the primary for this pg? 
+ const auto osdmap = osd.get_shard_services().get_map(); + spg_t spg_id; + if (!osdmap->get_primary_shard(pgid, &spg_id)) { + return seastar::make_ready_future<tell_result_t>(tell_result_t{ + -ENOENT, fmt::format("pgid '{}' does not exist", pgid_str)}); + } + return osd.get_pg_shard_manager().with_pg( + spg_id, + [this, spg_id, + cmdmap=std::move(cmdmap), + format=std::move(format), + input=std::move(input) + ](auto &&pg) mutable { + if (!pg) { + return seastar::make_ready_future<tell_result_t>(tell_result_t{ + -ENOENT, fmt::format("i don't have pgid '{}'", spg_id)}); + } + if (!pg->is_primary()) { + return seastar::make_ready_future<tell_result_t>(tell_result_t{ + -EAGAIN, fmt::format("not primary for pgid '{}'", spg_id)}); + } + return this->do_command(pg, cmdmap, format, std::move(input)); + }); + } + +private: + virtual seastar::future<tell_result_t> + do_command(Ref<PG> pg, + const cmdmap_t& cmdmap, + std::string_view format, + ceph::bufferlist&& input) const = 0; + + OSD& osd; +}; + +class QueryCommand final : public PGCommand { +public: + // TODO: const correctness of osd + explicit QueryCommand(crimson::osd::OSD& osd) : + PGCommand{osd, + "query", + "", + "show details of a specific pg"} + {} +private: + seastar::future<tell_result_t> + do_command(Ref<PG> pg, + const cmdmap_t&, + std::string_view format, + ceph::bufferlist&& input) const final + { + std::unique_ptr<Formatter> f{Formatter::create(format, + "json-pretty", + "json-pretty")}; + f->open_object_section("pg"); + pg->dump_primary(f.get()); + f->close_section(); + return seastar::make_ready_future<tell_result_t>(std::move(f)); + } +}; + +class MarkUnfoundLostCommand final : public PGCommand { +public: + explicit MarkUnfoundLostCommand(crimson::osd::OSD& osd) : + PGCommand{osd, + "mark_unfound_lost", + "name=pgid,type=CephPgid,req=false" + " name=mulcmd,type=CephChoices,strings=revert|delete", + "mark all unfound objects in this pg as lost, either" + " removing or reverting to a prior version if one is" + " available"} + {} + seastar::future<tell_result_t> + do_command(Ref<PG> pg, + const cmdmap_t& cmdmap, + std::string_view, + ceph::bufferlist&&) const final + { + // what to do with the unfound object specifically. + std::string cmd; + int op = -1; + cmd_getval(cmdmap, "mulcmd", cmd); + if (cmd == "revert") { + op = pg_log_entry_t::LOST_REVERT; + } else if (cmd == "delete") { + op = pg_log_entry_t::LOST_DELETE; + } else { + return seastar::make_ready_future<tell_result_t>(tell_result_t{ + -EINVAL, "mode must be 'revert' or 'delete'; mark not yet implemented"}); + } + return pg->mark_unfound_lost(op).then([] { + // TODO + return seastar::make_ready_future<tell_result_t>(); + }); + } +}; + +} // namespace crimson::admin::pg + +namespace crimson::admin { + +template <class Hook, class... Args> +std::unique_ptr<AdminSocketHook> make_asok_hook(Args&&... 
args) +{ + return std::make_unique<Hook>(std::forward<Args>(args)...); +} + +template std::unique_ptr<AdminSocketHook> +make_asok_hook<crimson::admin::pg::QueryCommand>(crimson::osd::OSD& osd); + +template std::unique_ptr<AdminSocketHook> +make_asok_hook<crimson::admin::pg::MarkUnfoundLostCommand>(crimson::osd::OSD& osd); + +} // namespace crimson::admin diff --git a/src/crimson/admin/pg_commands.h b/src/crimson/admin/pg_commands.h new file mode 100644 index 000000000..873b3c923 --- /dev/null +++ b/src/crimson/admin/pg_commands.h @@ -0,0 +1,10 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#pragma once + +namespace crimson::admin::pg { + +class QueryCommand; +class MarkUnfoundLostCommand; + +} // namespace crimson::admin::pg diff --git a/src/crimson/auth/AuthClient.h b/src/crimson/auth/AuthClient.h new file mode 100644 index 000000000..2d970c88c --- /dev/null +++ b/src/crimson/auth/AuthClient.h @@ -0,0 +1,71 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <cstdint> +#include <string> +#include <tuple> +#include <vector> +#include "include/buffer_fwd.h" +#include "crimson/net/Fwd.h" + +class CryptoKey; + +namespace crimson::auth { + +class error : public std::logic_error { +public: + using std::logic_error::logic_error; +}; + +using method_t = uint32_t; + +// TODO: revisit interfaces for non-dummy implementations +class AuthClient { +public: + virtual ~AuthClient() {} + + struct auth_request_t { + method_t auth_method; + std::vector<uint32_t> preferred_modes; + ceph::bufferlist auth_bl; + }; + /// Build an authentication request to begin the handshake + /// + /// @throw auth::error if unable to build the request + virtual auth_request_t get_auth_request(crimson::net::Connection &conn, + AuthConnectionMeta &auth_meta) = 0; + + /// Handle server's request to continue the handshake + /// + /// @throw auth::error if unable to build the request + virtual ceph::bufferlist handle_auth_reply_more( + crimson::net::Connection &conn, + AuthConnectionMeta &auth_meta, + const ceph::bufferlist& bl) = 0; + + /// Handle server's indication that authentication succeeded + /// + /// @return 0 if authenticated, a negative number otherwise + virtual int handle_auth_done( + crimson::net::Connection &conn, + AuthConnectionMeta &auth_meta, + uint64_t global_id, + uint32_t con_mode, + const bufferlist& bl) = 0; + + /// Handle server's indication that the previous auth attempt failed + /// + /// @return 0 if will try next auth method, a negative number if we have no + /// more options + virtual int handle_auth_bad_method( + crimson::net::Connection &conn, + AuthConnectionMeta &auth_meta, + uint32_t old_auth_method, + int result, + const std::vector<uint32_t>& allowed_methods, + const std::vector<uint32_t>& allowed_modes) = 0; +}; + +} // namespace crimson::auth diff --git a/src/crimson/auth/AuthServer.h b/src/crimson/auth/AuthServer.h new file mode 100644 index 000000000..a808410d2 --- /dev/null +++ b/src/crimson/auth/AuthServer.h @@ -0,0 +1,42 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <cstdint> +#include <utility> +#include <vector> +#include "crimson/net/Fwd.h" + +struct AuthAuthorizeHandler; + +namespace crimson::auth { + +class AuthServer { +public: + virtual ~AuthServer() {} + + // Get authentication methods and connection modes for the given peer type + virtual 
std::pair<std::vector<uint32_t>, std::vector<uint32_t>> + get_supported_auth_methods(int peer_type) = 0; + // Get support connection modes for the given peer type and auth method + virtual uint32_t pick_con_mode( + int peer_type, + uint32_t auth_method, + const std::vector<uint32_t>& preferred_modes) = 0; + // return an AuthAuthorizeHandler for the given peer type and auth method + virtual AuthAuthorizeHandler* get_auth_authorize_handler( + int peer_type, + int auth_method) = 0; + // Handle an authentication request on an incoming connection + virtual int handle_auth_request( + crimson::net::Connection &conn, + AuthConnectionMeta &auth_meta, + bool more, //< true if this is not the first part of the handshake + uint32_t auth_method, + const bufferlist& bl, + uint64_t *p_peer_global_id, + bufferlist *reply) = 0; +}; + +} // namespace crimson::auth diff --git a/src/crimson/auth/DummyAuth.h b/src/crimson/auth/DummyAuth.h new file mode 100644 index 000000000..7a3dd7ec4 --- /dev/null +++ b/src/crimson/auth/DummyAuth.h @@ -0,0 +1,79 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "AuthClient.h" +#include "AuthServer.h" + +namespace crimson::auth { + +class DummyAuthClientServer : public AuthClient, + public AuthServer { +public: + DummyAuthClientServer() {} + + // client + std::pair<std::vector<uint32_t>, std::vector<uint32_t>> + get_supported_auth_methods(int peer_type) final { + return {{CEPH_AUTH_NONE}, {CEPH_AUTH_NONE}}; + } + + uint32_t pick_con_mode(int peer_type, + uint32_t auth_method, + const std::vector<uint32_t>& preferred_modes) final { + ceph_assert(auth_method == CEPH_AUTH_NONE); + ceph_assert(preferred_modes.size() && + preferred_modes[0] == CEPH_CON_MODE_CRC); + return CEPH_CON_MODE_CRC; + } + + AuthAuthorizeHandler* get_auth_authorize_handler(int peer_type, + int auth_method) final { + return nullptr; + } + + AuthClient::auth_request_t get_auth_request( + crimson::net::Connection &conn, + AuthConnectionMeta &auth_meta) override { + return {CEPH_AUTH_NONE, {CEPH_CON_MODE_CRC}, {}}; + } + + ceph::bufferlist handle_auth_reply_more( + crimson::net::Connection &conn, + AuthConnectionMeta &auth_meta, + const bufferlist& bl) override { + ceph_abort(); + } + + int handle_auth_done( + crimson::net::Connection &conn, + AuthConnectionMeta &auth_meta, + uint64_t global_id, + uint32_t con_mode, + const bufferlist& bl) override { + return 0; + } + + int handle_auth_bad_method( + crimson::net::Connection &conn, + AuthConnectionMeta &auth_meta, + uint32_t old_auth_method, + int result, + const std::vector<uint32_t>& allowed_methods, + const std::vector<uint32_t>& allowed_modes) override { + ceph_abort(); + } + + // server + int handle_auth_request( + crimson::net::Connection &conn, + AuthConnectionMeta &auth_meta, + bool more, + uint32_t auth_method, + const bufferlist& bl, + uint64_t *p_peer_global_id, + bufferlist *reply) override { + return 1; + } +}; + +} // namespace crimson::auth diff --git a/src/crimson/auth/KeyRing.cc b/src/crimson/auth/KeyRing.cc new file mode 100644 index 000000000..436e29c1b --- /dev/null +++ b/src/crimson/auth/KeyRing.cc @@ -0,0 +1,79 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "KeyRing.h" + +#include <boost/algorithm/string.hpp> + +#include <seastar/core/do_with.hh> +#include <seastar/core/fstream.hh> +#include <seastar/core/future-util.hh> +#include <seastar/core/reactor.hh> + +#include "common/buffer_seastar.h" +#include 
"auth/KeyRing.h" +#include "include/denc.h" +#include "crimson/common/buffer_io.h" +#include "crimson/common/config_proxy.h" + +namespace crimson::auth { + +seastar::future<KeyRing*> load_from_keyring(KeyRing* keyring) +{ + std::vector<std::string> paths; + boost::split(paths, crimson::common::local_conf()->keyring, + boost::is_any_of(",;")); + std::pair<bool, std::string> found; + return seastar::map_reduce(paths, [](auto path) { + return seastar::engine().file_exists(path).then([path](bool file_exists) { + return std::make_pair(file_exists, path); + }); + }, std::move(found), [](auto found, auto file_exists_and_path) { + if (!found.first && file_exists_and_path.first) { + found = std::move(file_exists_and_path); + } + return found; + }).then([keyring] (auto file_exists_and_path) { + const auto& [exists, path] = file_exists_and_path; + if (exists) { + return read_file(path).then([keyring](auto buf) { + bufferlist bl; + bl.append(buffer::create(std::move(buf))); + auto i = bl.cbegin(); + keyring->decode(i); + return seastar::make_ready_future<KeyRing*>(keyring); + }); + } else { + return seastar::make_ready_future<KeyRing*>(keyring); + } + }); +} + +seastar::future<KeyRing*> load_from_keyfile(KeyRing* keyring) +{ + auto& path = crimson::common::local_conf()->keyfile; + if (!path.empty()) { + return read_file(path).then([keyring](auto buf) { + EntityAuth ea; + ea.key.decode_base64(std::string(buf.begin(), + buf.end())); + keyring->add(crimson::common::local_conf()->name, ea); + return seastar::make_ready_future<KeyRing*>(keyring); + }); + } else { + return seastar::make_ready_future<KeyRing*>(keyring); + } +} + +seastar::future<KeyRing*> load_from_key(KeyRing* keyring) +{ + auto& key = crimson::common::local_conf()->key; + if (!key.empty()) { + EntityAuth ea; + ea.key.decode_base64(key); + keyring->add(crimson::common::local_conf()->name, ea); + } + return seastar::make_ready_future<KeyRing*>(keyring); +} + +} // namespace crimson::auth diff --git a/src/crimson/auth/KeyRing.h b/src/crimson/auth/KeyRing.h new file mode 100644 index 000000000..850f1bb79 --- /dev/null +++ b/src/crimson/auth/KeyRing.h @@ -0,0 +1,15 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <seastar/core/future.hh> + +class KeyRing; + +namespace crimson::auth { + // see KeyRing::from_ceph_context + seastar::future<KeyRing*> load_from_keyring(KeyRing* keyring); + seastar::future<KeyRing*> load_from_keyfile(KeyRing* keyring); + seastar::future<KeyRing*> load_from_key(KeyRing* keyring); +} diff --git a/src/crimson/common/assert.cc b/src/crimson/common/assert.cc new file mode 100644 index 000000000..07610c33f --- /dev/null +++ b/src/crimson/common/assert.cc @@ -0,0 +1,81 @@ +#include <cstdarg> +#include <iostream> + +#include <seastar/util/backtrace.hh> +#include <seastar/core/reactor.hh> + +#include "include/ceph_assert.h" + +#include "crimson/common/log.h" + +namespace ceph { + [[gnu::cold]] void __ceph_assert_fail(const ceph::assert_data &ctx) + { + __ceph_assert_fail(ctx.assertion, ctx.file, ctx.line, ctx.function); + } + + [[gnu::cold]] void __ceph_assert_fail(const char* assertion, + const char* file, int line, + const char* func) + { + seastar::logger& logger = crimson::get_logger(0); + logger.error("{}:{} : In function '{}', ceph_assert(%s)\n" + "{}", + file, line, func, assertion, + seastar::current_backtrace()); + std::cout << std::flush; + abort(); + } + [[gnu::cold]] void __ceph_assertf_fail(const char *assertion, + const char *file, 
int line, + const char *func, const char* msg, + ...) + { + char buf[8096]; + va_list args; + va_start(args, msg); + std::vsnprintf(buf, sizeof(buf), msg, args); + va_end(args); + + seastar::logger& logger = crimson::get_logger(0); + logger.error("{}:{} : In function '{}', ceph_assert(%s)\n" + "{}\n{}\n", + file, line, func, assertion, + buf, + seastar::current_backtrace()); + std::cout << std::flush; + abort(); + } + + [[gnu::cold]] void __ceph_abort(const char* file, int line, + const char* func, const std::string& msg) + { + seastar::logger& logger = crimson::get_logger(0); + logger.error("{}:{} : In function '{}', abort(%s)\n" + "{}", + file, line, func, msg, + seastar::current_backtrace()); + std::cout << std::flush; + abort(); + } + + [[gnu::cold]] void __ceph_abortf(const char* file, int line, + const char* func, const char* fmt, + ...) + { + char buf[8096]; + va_list args; + va_start(args, fmt); + std::vsnprintf(buf, sizeof(buf), fmt, args); + va_end(args); + + seastar::logger& logger = crimson::get_logger(0); + logger.error("{}:{} : In function '{}', abort()\n" + "{}\n{}\n", + file, line, func, + buf, + seastar::current_backtrace()); + std::cout << std::flush; + abort(); + } +} diff --git a/src/crimson/common/auth_handler.h b/src/crimson/common/auth_handler.h new file mode 100644 index 000000000..d4140b6a2 --- /dev/null +++ b/src/crimson/common/auth_handler.h @@ -0,0 +1,17 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +class EntityName; +class AuthCapsInfo; + +namespace crimson::common { +class AuthHandler { +public: + // the peer just got authorized + virtual void handle_authentication(const EntityName& name, + const AuthCapsInfo& caps) = 0; + virtual ~AuthHandler() = default; +}; +} diff --git a/src/crimson/common/buffer_io.cc b/src/crimson/common/buffer_io.cc new file mode 100644 index 000000000..86edf7a6f --- /dev/null +++ b/src/crimson/common/buffer_io.cc @@ -0,0 +1,57 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "buffer_io.h" + +#include <seastar/core/reactor.hh> +#include <seastar/core/fstream.hh> +#include <seastar/core/do_with.hh> + +#include "include/buffer.h" + +namespace crimson { + +seastar::future<> write_file(ceph::buffer::list&& bl, + seastar::sstring fn, + seastar::file_permissions permissions) +{ + const auto flags = (seastar::open_flags::wo | + seastar::open_flags::create | + seastar::open_flags::truncate); + seastar::file_open_options foo; + foo.create_permissions = permissions; + return seastar::open_file_dma(fn, flags, foo).then( + [bl=std::move(bl)](seastar::file f) { + return seastar::make_file_output_stream(f).then( + [bl=std::move(bl), f=std::move(f)](seastar::output_stream<char> out) { + return seastar::do_with(std::move(out), + std::move(f), + std::move(bl), + [](seastar::output_stream<char>& out, + seastar::file& f, + ceph::buffer::list& bl) { + return seastar::do_for_each(bl.buffers(), [&out](auto& buf) { + return out.write(buf.c_str(), buf.length()); + }).then([&out] { + return out.close(); + }); + }); + }); + }); +} + +seastar::future<seastar::temporary_buffer<char>> +read_file(const seastar::sstring fn) +{ + return seastar::open_file_dma(fn, seastar::open_flags::ro).then( + [] (seastar::file f) { + return f.size().then([f = std::move(f)](size_t s) { + return seastar::do_with(seastar::make_file_input_stream(f), + [s](seastar::input_stream<char>& in) { + return in.read_exactly(s); + }); + }); + }); +} + +} 
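
Editor's note: the two helpers added above in buffer_io.cc (crimson::write_file and crimson::read_file) wrap Seastar's DMA file API so callers deal only in ceph::buffer::list and seastar::temporary_buffer. Below is a minimal round-trip sketch, assuming it runs inside a started Seastar reactor; the function name demo_buffer_io() and the path "/tmp/crimson_demo" are illustrative only and are not part of the upstream sources.

#include <seastar/core/future.hh>
#include "include/buffer.h"
#include "crimson/common/buffer_io.h"

// Hypothetical example: write a small buffer to disk, then read it back.
seastar::future<> demo_buffer_io()
{
  ceph::buffer::list bl;
  bl.append("hello crimson", 13);   // (data, length) overload of bufferlist::append
  // write_file picks up the 0644 default permissions declared in buffer_io.h
  return crimson::write_file(std::move(bl), "/tmp/crimson_demo").then([] {
    return crimson::read_file("/tmp/crimson_demo");
  }).then([](seastar::temporary_buffer<char> buf) {
    // buf now holds the bytes written above, read back via DMA I/O
    return seastar::make_ready_future<>();
  });
}
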
diff --git a/src/crimson/common/buffer_io.h b/src/crimson/common/buffer_io.h new file mode 100644 index 000000000..c5ece4a6f --- /dev/null +++ b/src/crimson/common/buffer_io.h @@ -0,0 +1,21 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <seastar/core/future.hh> +#include <seastar/core/file-types.hh> + +#include "include/buffer_fwd.h" + +namespace crimson { + seastar::future<> write_file(ceph::buffer::list&& bl, + seastar::sstring fn, + seastar::file_permissions= // 0644 + (seastar::file_permissions::user_read | + seastar::file_permissions::user_write | + seastar::file_permissions::group_read | + seastar::file_permissions::others_read)); + seastar::future<seastar::temporary_buffer<char>> + read_file(const seastar::sstring fn); +} diff --git a/src/crimson/common/condition_variable.h b/src/crimson/common/condition_variable.h new file mode 100644 index 000000000..19267f38a --- /dev/null +++ b/src/crimson/common/condition_variable.h @@ -0,0 +1,43 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <seastar/core/future.hh> +#include <seastar/core/condition-variable.hh> +#include <seastar/core/loop.hh> + +#include "crimson/common/interruptible_future.h" + +namespace crimson { + +class condition_variable : public seastar::condition_variable { +public: + template <typename Pred, typename Func> + auto wait( + Pred&& pred, + Func&& action) noexcept { + using func_result_t = std::invoke_result_t<Func>; + using intr_errorator_t = typename func_result_t::interrupt_errorator_type; + using intr_cond_t = typename func_result_t::interrupt_cond_type; + using interruptor = crimson::interruptible::interruptor<intr_cond_t>; + return interruptor::repeat( + [this, pred=std::forward<Pred>(pred), + action=std::forward<Func>(action)]() + -> typename intr_errorator_t::template future<seastar::stop_iteration> { + if (!pred()) { + return seastar::condition_variable::wait().then([] { + return seastar::make_ready_future< + seastar::stop_iteration>(seastar::stop_iteration::no); + }); + } else { + return action().si_then([] { + return seastar::make_ready_future< + seastar::stop_iteration>(seastar::stop_iteration::yes); + }); + } + }); + } +}; + +} // namespace crimson diff --git a/src/crimson/common/config_proxy.cc b/src/crimson/common/config_proxy.cc new file mode 100644 index 000000000..88d4679d5 --- /dev/null +++ b/src/crimson/common/config_proxy.cc @@ -0,0 +1,93 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "config_proxy.h" + +#include <filesystem> + +#include "crimson/common/buffer_io.h" + +namespace crimson::common { + +ConfigProxy::ConfigProxy(const EntityName& name, std::string_view cluster) +{ + if (seastar::this_shard_id() != 0) { + return; + } + // set the initial value on CPU#0 + values.reset(seastar::make_lw_shared<ConfigValues>()); + values.get()->name = name; + values.get()->cluster = cluster; + // and the only copy of md_config_impl<> is allocated on CPU#0 + local_config.reset(new md_config_t{*values, obs_mgr, true}); + if (name.is_mds()) { + local_config->set_val_default(*values, obs_mgr, + "keyring", "$mds_data/keyring"); + } else if (name.is_osd()) { + local_config->set_val_default(*values, obs_mgr, + "keyring", "$osd_data/keyring"); + } +} + +seastar::future<> ConfigProxy::start() +{ + // populate values and config to all other shards + if (!values) { + return 
seastar::make_ready_future<>(); + } + return container().invoke_on_others([this](auto& proxy) { + return values.copy().then([config=local_config.get(), + &proxy](auto foreign_values) { + proxy.values.reset(); + proxy.values = std::move(foreign_values); + proxy.remote_config = config; + return seastar::make_ready_future<>(); + }); + }); +} + +void ConfigProxy::show_config(ceph::Formatter* f) const { + get_config().show_config(*values, f); +} + +seastar::future<> ConfigProxy::parse_config_files(const std::string& conf_files) +{ + auto conffile_paths = + get_config().get_conffile_paths(*values, + conf_files.empty() ? nullptr : conf_files.c_str(), + &std::cerr, + CODE_ENVIRONMENT_DAEMON); + return seastar::do_with(std::move(conffile_paths), [this] (auto& paths) { + return seastar::repeat([path=paths.begin(), e=paths.end(), this]() mutable { + if (path == e) { + // tried all conffile, none of them works + return seastar::make_ready_future<seastar::stop_iteration>( + seastar::stop_iteration::yes); + } + return crimson::read_file(*path++).then([this](auto&& buf) { + return do_change([buf=std::move(buf), this](ConfigValues& values) { + if (get_config().parse_buffer(values, obs_mgr, + buf.get(), buf.size(), + &std::cerr) == 0) { + get_config().update_legacy_vals(values); + } else { + throw std::invalid_argument("parse error"); + } + }).then([] { + // this one works! + return seastar::make_ready_future<seastar::stop_iteration>( + seastar::stop_iteration::yes); + }); + }).handle_exception_type([] (const std::filesystem::filesystem_error&) { + return seastar::make_ready_future<seastar::stop_iteration>( + seastar::stop_iteration::no); + }).handle_exception_type([] (const std::invalid_argument&) { + return seastar::make_ready_future<seastar::stop_iteration>( + seastar::stop_iteration::no); + }); + }); + }); +} + +ConfigProxy::ShardedConfig ConfigProxy::sharded_conf; +} diff --git a/src/crimson/common/config_proxy.h b/src/crimson/common/config_proxy.h new file mode 100644 index 000000000..4c0e65507 --- /dev/null +++ b/src/crimson/common/config_proxy.h @@ -0,0 +1,222 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <seastar/core/reactor.hh> +#include <seastar/core/sharded.hh> +#include "common/config.h" +#include "common/config_obs.h" +#include "common/config_obs_mgr.h" +#include "common/errno.h" + +namespace ceph { +class Formatter; +} + +namespace crimson::common { + +// a facade for managing config. each shard has its own copy of ConfigProxy. +// +// In seastar-osd, there could be multiple instances of @c ConfigValues in a +// single process, as we are using a variant of read-copy-update mechinary to +// update the settings at runtime. +class ConfigProxy : public seastar::peering_sharded_service<ConfigProxy> +{ + using LocalConfigValues = seastar::lw_shared_ptr<ConfigValues>; + seastar::foreign_ptr<LocalConfigValues> values; + + md_config_t* remote_config = nullptr; + std::unique_ptr<md_config_t> local_config; + + using ConfigObserver = ceph::md_config_obs_impl<ConfigProxy>; + ObserverMgr<ConfigObserver> obs_mgr; + + const md_config_t& get_config() const { + return remote_config ? *remote_config : * local_config; + } + md_config_t& get_config() { + return remote_config ? 
*remote_config : * local_config; + } + + // apply changes to all shards + // @param func a functor which accepts @c "ConfigValues&" + template<typename Func> + seastar::future<> do_change(Func&& func) { + return container().invoke_on(values.get_owner_shard(), + [func = std::move(func)](ConfigProxy& owner) { + // apply the changes to a copy of the values + auto new_values = seastar::make_lw_shared(*owner.values); + new_values->changed.clear(); + func(*new_values); + + // always apply the new settings synchronously on the owner shard, to + // avoid racings with other do_change() calls in parallel. + ObserverMgr<ConfigObserver>::rev_obs_map rev_obs; + owner.values.reset(new_values); + owner.obs_mgr.for_each_change(owner.values->changed, owner, + [&rev_obs](ConfigObserver *obs, + const std::string &key) { + rev_obs[obs].insert(key); + }, nullptr); + for (auto& [obs, keys] : rev_obs) { + obs->handle_conf_change(owner, keys); + } + + return seastar::parallel_for_each(boost::irange(1u, seastar::smp::count), + [&owner, new_values] (auto cpu) { + return owner.container().invoke_on(cpu, + [foreign_values = seastar::make_foreign(new_values)](ConfigProxy& proxy) mutable { + proxy.values.reset(); + proxy.values = std::move(foreign_values); + + ObserverMgr<ConfigObserver>::rev_obs_map rev_obs; + proxy.obs_mgr.for_each_change(proxy.values->changed, proxy, + [&rev_obs](ConfigObserver *obs, const std::string& key) { + rev_obs[obs].insert(key); + }, nullptr); + for (auto& obs_keys : rev_obs) { + obs_keys.first->handle_conf_change(proxy, obs_keys.second); + } + }); + }).finally([new_values] { + new_values->changed.clear(); + }); + }); + } +public: + ConfigProxy(const EntityName& name, std::string_view cluster); + const ConfigValues* operator->() const noexcept { + return values.get(); + } + const ConfigValues get_config_values() { + return *values.get(); + } + ConfigValues* operator->() noexcept { + return values.get(); + } + + void get_config_bl(uint64_t have_version, + ceph::buffer::list *bl, + uint64_t *got_version) { + get_config().get_config_bl(get_config_values(), have_version, + bl, got_version); + } + void get_defaults_bl(ceph::buffer::list *bl) { + get_config().get_defaults_bl(get_config_values(), bl); + } + seastar::future<> start(); + // required by sharded<> + seastar::future<> stop() { + return seastar::make_ready_future<>(); + } + void add_observer(ConfigObserver* obs) { + obs_mgr.add_observer(obs); + } + void remove_observer(ConfigObserver* obs) { + obs_mgr.remove_observer(obs); + } + seastar::future<> rm_val(const std::string& key) { + return do_change([key, this](ConfigValues& values) { + auto ret = get_config().rm_val(values, key); + if (ret < 0) { + throw std::invalid_argument(cpp_strerror(ret)); + } + }); + } + seastar::future<> set_val(const std::string& key, + const std::string& val) { + return do_change([key, val, this](ConfigValues& values) { + std::stringstream err; + auto ret = get_config().set_val(values, obs_mgr, key, val, &err); + if (ret < 0) { + throw std::invalid_argument(err.str()); + } + }); + } + int get_val(std::string_view key, std::string *val) const { + return get_config().get_val(*values, key, val); + } + template<typename T> + const T get_val(std::string_view key) const { + return get_config().template get_val<T>(*values, key); + } + + int get_all_sections(std::vector<std::string>& sections) const { + return get_config().get_all_sections(sections); + } + + int get_val_from_conf_file(const std::vector<std::string>& sections, + const std::string& key, std::string& out, 
+ bool expand_meta) const { + return get_config().get_val_from_conf_file(*values, sections, key, + out, expand_meta); + } + + unsigned get_osd_pool_default_min_size(uint8_t size) const { + return get_config().get_osd_pool_default_min_size(*values, size); + } + + seastar::future<> + set_mon_vals(const std::map<std::string,std::string,std::less<>>& kv) { + return do_change([kv, this](ConfigValues& values) { + get_config().set_mon_vals(nullptr, values, obs_mgr, kv, nullptr); + }); + } + + seastar::future<> inject_args(const std::string& s) { + return do_change([s, this](ConfigValues& values) { + std::stringstream err; + if (get_config().injectargs(values, obs_mgr, s, &err)) { + throw std::invalid_argument(err.str()); + } + }); + } + void show_config(ceph::Formatter* f) const; + + seastar::future<> parse_argv(std::vector<const char*>& argv) { + // we could pass whatever is unparsed to seastar, but seastar::app_template + // is used for driving the seastar application, and + // crimson::common::ConfigProxy is not available until seastar engine is up + // and running, so we have to feed the command line args to app_template + // first, then pass them to ConfigProxy. + return do_change([&argv, this](ConfigValues& values) { + get_config().parse_argv(values, + obs_mgr, + argv, + CONF_CMDLINE); + }); + } + + seastar::future<> parse_env() { + return do_change([this](ConfigValues& values) { + get_config().parse_env(CEPH_ENTITY_TYPE_OSD, + values, + obs_mgr); + }); + } + + seastar::future<> parse_config_files(const std::string& conf_files); + + using ShardedConfig = seastar::sharded<ConfigProxy>; + +private: + static ShardedConfig sharded_conf; + friend ConfigProxy& local_conf(); + friend ShardedConfig& sharded_conf(); +}; + +inline ConfigProxy& local_conf() { + return ConfigProxy::sharded_conf.local(); +} + +inline ConfigProxy::ShardedConfig& sharded_conf() { + return ConfigProxy::sharded_conf; +} + +template<typename T> +const T get_conf(const std::string& key) { + return local_conf().template get_val<T>(key); +} + +} diff --git a/src/crimson/common/errorator-loop.h b/src/crimson/common/errorator-loop.h new file mode 100644 index 000000000..bb3b7fb15 --- /dev/null +++ b/src/crimson/common/errorator-loop.h @@ -0,0 +1,91 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab expandtab + +#pragma once + +#include <seastar/core/future.hh> + +#include "crimson/common/errorator.h" + + +namespace crimson { +template <class... 
AllowedErrors> +class parallel_for_each_state final : private seastar::continuation_base<> { + using future_t = typename errorator<AllowedErrors...>::template future<>; + std::vector<future_t> _incomplete; + seastar::promise<> _result; + std::exception_ptr _ex; +private: + void wait_for_one() noexcept { + while (!_incomplete.empty() && _incomplete.back().available()) { + if (_incomplete.back().failed()) { + _ex = _incomplete.back().get_exception(); + } + _incomplete.pop_back(); + } + if (!_incomplete.empty()) { + seastar::internal::set_callback(std::move(_incomplete.back()), + static_cast<continuation_base<>*>(this)); + _incomplete.pop_back(); + return; + } + if (__builtin_expect(bool(_ex), false)) { + _result.set_exception(std::move(_ex)); + } else { + _result.set_value(); + } + delete this; + } + virtual void run_and_dispose() noexcept override { + if (_state.failed()) { + _ex = std::move(_state).get_exception(); + } + _state = {}; + wait_for_one(); + } + task* waiting_task() noexcept override { return _result.waiting_task(); } +public: + parallel_for_each_state(size_t n) { + _incomplete.reserve(n); + } + void add_future(future_t&& f) { + _incomplete.push_back(std::move(f)); + } + future_t get_future() { + auto ret = _result.get_future(); + wait_for_one(); + return ret; + } +}; + +template <typename Iterator, typename Func, typename... AllowedErrors> +static inline typename errorator<AllowedErrors...>::template future<> +parallel_for_each(Iterator first, Iterator last, Func&& func) noexcept { + parallel_for_each_state<AllowedErrors...>* s = nullptr; + // Process all elements, giving each future the following treatment: + // - available, not failed: do nothing + // - available, failed: collect exception in ex + // - not available: collect in s (allocating it if needed) + for (;first != last; ++first) { + auto f = seastar::futurize_invoke(std::forward<Func>(func), *first); + if (!f.available() || f.failed()) { + if (!s) { + using itraits = std::iterator_traits<Iterator>; + auto n = (seastar::internal::iterator_range_estimate_vector_capacity( + first, last, typename itraits::iterator_category()) + 1); + s = new parallel_for_each_state<AllowedErrors...>(n); + } + s->add_future(std::move(f)); + } + } + // If any futures were not available, hand off to parallel_for_each_state::start(). + // Otherwise we can return a result immediately. 
+ if (s) { + // s->get_future() takes ownership of s (and chains it to one of the futures it contains) + // so this isn't a leak + return s->get_future(); + } + return seastar::make_ready_future<>(); +} + +} // namespace crimson diff --git a/src/crimson/common/errorator.h b/src/crimson/common/errorator.h new file mode 100644 index 000000000..c5d63d5b9 --- /dev/null +++ b/src/crimson/common/errorator.h @@ -0,0 +1,1358 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab expandtab + +#pragma once + +#include <exception> +#include <system_error> + +#include <seastar/core/future-util.hh> + +#include "crimson/common/utility.h" +#include "include/ceph_assert.h" + +namespace crimson::interruptible { + +template <typename, typename> +class parallel_for_each_state; + +template <typename, typename> +class interruptible_future_detail; + +} + +namespace crimson { + +// crimson::do_for_each_state is the mirror of seastar::do_for_each_state with FutureT +template <typename Iterator, typename AsyncAction, typename FutureT> +class do_for_each_state final : public seastar::continuation_base<> { + Iterator _begin; + Iterator _end; + AsyncAction _action; + seastar::promise<> _pr; + +public: + do_for_each_state(Iterator begin, Iterator end, AsyncAction action, + FutureT&& first_unavailable) + : _begin(std::move(begin)), _end(std::move(end)), _action(std::move(action)) { + seastar::internal::set_callback(std::move(first_unavailable), this); + } + virtual void run_and_dispose() noexcept override { + std::unique_ptr<do_for_each_state> zis(this); + if (_state.failed()) { + _pr.set_urgent_state(std::move(_state)); + return; + } + while (_begin != _end) { + auto f = seastar::futurize_invoke(_action, *_begin); + ++_begin; + if (f.failed()) { + f._forward_to(std::move(_pr)); + return; + } + if (!f.available() || seastar::need_preempt()) { + _state = {}; + seastar::internal::set_callback(std::move(f), this); + zis.release(); + return; + } + } + _pr.set_value(); + } + task* waiting_task() noexcept override { + return _pr.waiting_task(); + } + FutureT get_future() { + return _pr.get_future(); + } +}; + +template<typename Iterator, typename AsyncAction, + typename FutureT = std::invoke_result_t<AsyncAction, typename Iterator::reference>> +inline FutureT do_for_each_impl(Iterator begin, Iterator end, AsyncAction action) { + while (begin != end) { + auto f = seastar::futurize_invoke(action, *begin); + ++begin; + if (f.failed()) { + return f; + } + if (!f.available() || seastar::need_preempt()) { + // s will be freed by run_and_dispose() + auto* s = new crimson::do_for_each_state<Iterator, AsyncAction, FutureT>{ + std::move(begin), std::move(end), std::move(action), std::move(f)}; + return s->get_future(); + } + } + return seastar::make_ready_future<>(); +} + +template<typename Iterator, typename AsyncAction> +inline auto do_for_each(Iterator begin, Iterator end, AsyncAction action) { + return ::crimson::do_for_each_impl(begin, end, std::move(action)); +} + +template<typename Container, typename AsyncAction> +inline auto do_for_each(Container& c, AsyncAction action) { + return ::crimson::do_for_each(std::begin(c), std::end(c), std::move(action)); +} + +template<typename AsyncAction> +inline auto repeat(AsyncAction action) { + using errorator_t = + typename ::seastar::futurize_t<std::invoke_result_t<AsyncAction>>::errorator_type; + + while (true) { + auto f = ::seastar::futurize_invoke(action); + if (f.failed()) { + return errorator_t::template make_exception_future2<>( + 
f.get_exception() + ); + } else if (f.available()) { + if (auto done = f.get0()) { + return errorator_t::template make_ready_future<>(); + } + } else { + return std::move(f)._then( + [action = std::move(action)] (auto stop) mutable { + if (stop == seastar::stop_iteration::yes) { + return errorator_t::template make_ready_future<>(); + } + return ::crimson::repeat( + std::move(action)); + }); + } + } +} + +// define the interface between error types and errorator +template <class ConcreteErrorT> +class error_t { + static constexpr const std::type_info& get_exception_ptr_type_info() { + return ConcreteErrorT::exception_ptr_type_info(); + } + + decltype(auto) static from_exception_ptr(std::exception_ptr ep) { + return ConcreteErrorT::from_exception_ptr(std::move(ep)); + } + + template <class... AllowedErrorsT> + friend struct errorator; + + template <class ErrorVisitorT, class FuturatorT> + friend class maybe_handle_error_t; + +protected: + std::exception_ptr to_exception_ptr() const { + const auto* concrete_error = static_cast<const ConcreteErrorT*>(this); + return concrete_error->to_exception_ptr(); + } + +public: + template <class Func> + static decltype(auto) handle(Func&& func) { + return ConcreteErrorT::handle(std::forward<Func>(func)); + } +}; + +// unthrowable_wrapper ensures compilation failure when somebody +// would like to `throw make_error<...>)()` instead of returning. +// returning allows for the compile-time verification of future's +// AllowedErrorsV and also avoid the burden of throwing. +template <class ErrorT, ErrorT ErrorV> +struct unthrowable_wrapper : error_t<unthrowable_wrapper<ErrorT, ErrorV>> { + unthrowable_wrapper(const unthrowable_wrapper&) = delete; + [[nodiscard]] static const auto& make() { + static constexpr unthrowable_wrapper instance{}; + return instance; + } + + static auto exception_ptr() { + return make().to_exception_ptr(); + } + + template<class Func> + static auto handle(Func&& func) { + return [ + func = std::forward<Func>(func) + ] (const unthrowable_wrapper& raw_error) mutable -> decltype(auto) { + if constexpr (std::is_invocable_v<Func, ErrorT, decltype(raw_error)>) { + // check whether the handler wants to take the raw error object which + // would be the case if it wants conditionally handle-or-pass-further. + return std::invoke(std::forward<Func>(func), + ErrorV, + std::move(raw_error)); + } else if constexpr (std::is_invocable_v<Func, ErrorT>) { + return std::invoke(std::forward<Func>(func), ErrorV); + } else { + return std::invoke(std::forward<Func>(func)); + } + }; + } + + struct pass_further { + decltype(auto) operator()(const unthrowable_wrapper& e) { + return e; + } + }; + + struct discard { + decltype(auto) operator()(const unthrowable_wrapper&) { + } + }; + + +private: + // can be used only to initialize the `instance` member + explicit unthrowable_wrapper() = default; + + // implement the errorable interface + struct throwable_carrier{}; + static std::exception_ptr carrier_instance; + + static constexpr const std::type_info& exception_ptr_type_info() { + return typeid(throwable_carrier); + } + auto to_exception_ptr() const { + // error codes don't need to instantiate `std::exception_ptr` each + // time as the code is actually a part of the type itself. + // `std::make_exception_ptr()` on modern enough GCCs is quite cheap + // (see the Gleb Natapov's patch eradicating throw/catch there), + // but using one instance per type boils down the overhead to just + // ref-counting. 
+ return carrier_instance; + } + static const auto& from_exception_ptr(std::exception_ptr) { + return make(); + } + + friend class error_t<unthrowable_wrapper<ErrorT, ErrorV>>; +}; + +template <class ErrorT, ErrorT ErrorV> +std::exception_ptr unthrowable_wrapper<ErrorT, ErrorV>::carrier_instance = \ + std::make_exception_ptr< + unthrowable_wrapper<ErrorT, ErrorV>::throwable_carrier>({}); + + +template <class ErrorT> +struct stateful_error_t : error_t<stateful_error_t<ErrorT>> { + template <class... Args> + explicit stateful_error_t(Args&&... args) + : ep(std::make_exception_ptr<ErrorT>(std::forward<Args>(args)...)) { + } + + template<class Func> + static auto handle(Func&& func) { + return [ + func = std::forward<Func>(func) + ] (stateful_error_t<ErrorT>&& e) mutable -> decltype(auto) { + if constexpr (std::is_invocable_v<Func>) { + return std::invoke(std::forward<Func>(func)); + } + try { + std::rethrow_exception(e.ep); + } catch (const ErrorT& obj) { + if constexpr (std::is_invocable_v<Func, decltype(obj), decltype(e)>) { + return std::invoke(std::forward<Func>(func), obj, e); + } else if constexpr (std::is_invocable_v<Func, decltype(obj)>) { + return std::invoke(std::forward<Func>(func), obj); + } + } + ceph_abort_msg("exception type mismatch -- impossible!"); + }; + } + +private: + std::exception_ptr ep; + + explicit stateful_error_t(std::exception_ptr ep) : ep(std::move(ep)) {} + + static constexpr const std::type_info& exception_ptr_type_info() { + return typeid(ErrorT); + } + auto to_exception_ptr() const { + return ep; + } + static stateful_error_t<ErrorT> from_exception_ptr(std::exception_ptr ep) { + return stateful_error_t<ErrorT>(std::move(ep)); + } + + friend class error_t<stateful_error_t<ErrorT>>; +}; + +namespace _impl { + template <class T> struct always_false : std::false_type {}; +}; + +template <class ErrorVisitorT, class FuturatorT> +class maybe_handle_error_t { + const std::type_info& type_info; + typename FuturatorT::type result; + ErrorVisitorT errfunc; + +public: + maybe_handle_error_t(ErrorVisitorT&& errfunc, std::exception_ptr ep) + : type_info(*ep.__cxa_exception_type()), + result(FuturatorT::make_exception_future(std::move(ep))), + errfunc(std::forward<ErrorVisitorT>(errfunc)) { + } + + template <class ErrorT> + void handle() { + static_assert(std::is_invocable<ErrorVisitorT, ErrorT>::value, + "provided Error Visitor is not exhaustive"); + // In C++ throwing an exception isn't the sole way to signal + // error with it. This approach nicely fits cold, infrequent cases + // but when applied to a hot one, it will likely hurt performance. + // + // Alternative approach is to create `std::exception_ptr` on our + // own and place it in the future via `make_exception_future()`. + // When it comes to handling, the pointer can be interrogated for + // pointee's type with `__cxa_exception_type()` instead of costly + // re-throwing (via `std::rethrow_exception()`) and matching with + // `catch`. The limitation here is lack of support for hierarchies + // of exceptions. The code below checks for exact match only while + // `catch` would allow to match against a base class as well. + // However, this shouldn't be a big issue for `errorator` as Error + // Visitors are already checked for exhaustiveness at compile-time. + // + // NOTE: `__cxa_exception_type()` is an extension of the language. + // It should be available both in GCC and Clang but a fallback + // (based on `std::rethrow_exception()` and `catch`) can be made + // to handle other platforms if necessary. 
+ if (type_info == ErrorT::error_t::get_exception_ptr_type_info()) { + // set `state::invalid` in internals of `seastar::future` to not + // call `report_failed_future()` during `operator=()`. + [[maybe_unused]] auto&& ep = std::move(result).get_exception(); + + using return_t = std::invoke_result_t<ErrorVisitorT, ErrorT>; + if constexpr (std::is_assignable_v<decltype(result), return_t>) { + result = std::invoke(std::forward<ErrorVisitorT>(errfunc), + ErrorT::error_t::from_exception_ptr(std::move(ep))); + } else if constexpr (std::is_same_v<return_t, void>) { + // void denotes explicit discarding + // execute for the sake a side effects. Typically this boils down + // to throwing an exception by the handler. + std::invoke(std::forward<ErrorVisitorT>(errfunc), + ErrorT::error_t::from_exception_ptr(std::move(ep))); + } else if constexpr (seastar::Future<decltype(result)>) { + // result is seastar::future but return_t is e.g. int. If so, + // the else clause cannot be used as seastar::future lacks + // errorator_type member. + result = seastar::make_ready_future<return_t>( + std::invoke(std::forward<ErrorVisitorT>(errfunc), + ErrorT::error_t::from_exception_ptr(std::move(ep)))); + } else { + result = FuturatorT::type::errorator_type::template make_ready_future<return_t>( + std::invoke(std::forward<ErrorVisitorT>(errfunc), + ErrorT::error_t::from_exception_ptr(std::move(ep)))); + } + } + } + + auto get_result() && { + return std::move(result); + } +}; + +template <class FuncHead, class... FuncTail> +static constexpr auto composer(FuncHead&& head, FuncTail&&... tail) { + return [ + head = std::forward<FuncHead>(head), + // perfect forwarding in lambda's closure isn't available in C++17 + // using tuple as workaround; see: https://stackoverflow.com/a/49902823 + tail = std::make_tuple(std::forward<FuncTail>(tail)...) + ] (auto&&... args) mutable -> decltype(auto) { + if constexpr (std::is_invocable_v<FuncHead, decltype(args)...>) { + return std::invoke(std::forward<FuncHead>(head), + std::forward<decltype(args)>(args)...); + } else if constexpr (sizeof...(FuncTail) > 0) { + using next_composer_t = decltype(composer<FuncTail...>); + auto&& next = std::apply<next_composer_t>(composer<FuncTail...>, + std::move(tail)); + return std::invoke(std::move(next), + std::forward<decltype(args)>(args)...); + } else { + static_assert( + std::is_invocable_v<FuncHead, decltype(args)...> || + (sizeof...(FuncTail) > 0), + "composition is not exhaustive"); + } + }; +} + +template <class ValueT> +struct errorated_future_marker{}; + +template <class... AllowedErrors> +class parallel_for_each_state; + +template <class T> +static inline constexpr bool is_error_v = std::is_base_of_v<error_t<T>, T>; + +template <typename... AllowedErrors> +struct errorator; + +template <typename Iterator, typename Func, typename... AllowedErrors> +static inline typename errorator<AllowedErrors...>::template future<> +parallel_for_each(Iterator first, Iterator last, Func&& func) noexcept; + +template <class... AllowedErrors> +struct errorator { + + static_assert((... && is_error_v<AllowedErrors>), + "errorator expects presence of ::is_error in all error types"); + + template <class ErrorT> + struct contains_once { + static constexpr bool value = + (0 + ... + std::is_same_v<ErrorT, AllowedErrors>) == 1; + }; + template <class... Errors> + struct contains_once<errorator<Errors...>> { + static constexpr bool value = (... 
&& contains_once<Errors>::value); + }; + template <class T> + static constexpr bool contains_once_v = contains_once<T>::value; + + static_assert((... && contains_once_v<AllowedErrors>), + "no error type in errorator can be duplicated"); + + struct ready_future_marker{}; + struct exception_future_marker{}; + +private: + // see the comment for `using future = _future` below. + template <class> + class [[nodiscard]] _future {}; + template <class ValueT> + class [[nodiscard]] _future<::crimson::errorated_future_marker<ValueT>> + : private seastar::future<ValueT> { + using base_t = seastar::future<ValueT>; + // we need the friendship for the sake of `get_exception() &&` when + // `safe_then()` is going to return an errorated future as a result of + // chaining. In contrast to `seastar::future`, errorator<T...>::future` + // has this member private. + template <class ErrorVisitor, class Futurator> + friend class maybe_handle_error_t; + + // any `seastar::futurize` specialization must be able to access the base. + // see : `satisfy_with_result_of()` far below. + template <typename> + friend struct seastar::futurize; + + template <typename T1, typename T2, typename... More> + friend auto seastar::internal::do_with_impl(T1&& rv1, T2&& rv2, More&&... more); + + template <class, class = std::void_t<>> + struct get_errorator { + // generic template for non-errorated things (plain types and + // vanilla seastar::future as well). + using type = errorator<>; + }; + template <class FutureT> + struct get_errorator<FutureT, + std::void_t<typename FutureT::errorator_type>> { + using type = typename FutureT::errorator_type; + }; + template <class T> + using get_errorator_t = typename get_errorator<T>::type; + + template <class ValueFuncErroratorT, class... ErrorVisitorRetsT> + struct make_errorator { + // NOP. The generic template. + }; + template <class... ValueFuncAllowedErrors, + class ErrorVisitorRetsHeadT, + class... ErrorVisitorRetsTailT> + struct make_errorator<errorator<ValueFuncAllowedErrors...>, + ErrorVisitorRetsHeadT, + ErrorVisitorRetsTailT...> { + private: + using step_errorator = errorator<ValueFuncAllowedErrors...>; + // add ErrorVisitorRetsHeadT only if 1) it's an error type and + // 2) isn't already included in the errorator's error set. + // It's enough to negate contains_once_v as any errorator<...> + // type is already guaranteed to be free of duplications. + using _next_errorator = std::conditional_t< + is_error_v<ErrorVisitorRetsHeadT> && + !step_errorator::template contains_once_v<ErrorVisitorRetsHeadT>, + typename step_errorator::template extend<ErrorVisitorRetsHeadT>, + step_errorator>; + using maybe_head_ertr = get_errorator_t<ErrorVisitorRetsHeadT>; + using next_errorator = + typename _next_errorator::template extend_ertr<maybe_head_ertr>; + + public: + using type = typename make_errorator<next_errorator, + ErrorVisitorRetsTailT...>::type; + }; + // finish the recursion + template <class... ValueFuncAllowedErrors> + struct make_errorator<errorator<ValueFuncAllowedErrors...>> { + using type = ::crimson::errorator<ValueFuncAllowedErrors...>; + }; + template <class... 
Args> + using make_errorator_t = typename make_errorator<Args...>::type; + + using base_t::base_t; + + template <class Futurator, class Future, class ErrorVisitor> + [[gnu::noinline]] + static auto _safe_then_handle_errors(Future&& future, + ErrorVisitor&& errfunc) { + maybe_handle_error_t<ErrorVisitor, Futurator> maybe_handle_error( + std::forward<ErrorVisitor>(errfunc), + std::move(future).get_exception() + ); + (maybe_handle_error.template handle<AllowedErrors>() , ...); + return std::move(maybe_handle_error).get_result(); + } + + protected: + using base_t::get_exception; + public: + using errorator_type = ::crimson::errorator<AllowedErrors...>; + using promise_type = seastar::promise<ValueT>; + + using base_t::available; + using base_t::failed; + // need this because of the legacy in PG::do_osd_ops(). + using base_t::handle_exception_type; + + [[gnu::always_inline]] + _future(base_t&& base) + : base_t(std::move(base)) { + } + + base_t to_base() && { + return std::move(*this); + } + + template <class... A> + [[gnu::always_inline]] + _future(ready_future_marker, A&&... a) + : base_t(::seastar::make_ready_future<ValueT>(std::forward<A>(a)...)) { + } + [[gnu::always_inline]] + _future(exception_future_marker, ::seastar::future_state_base&& state) noexcept + : base_t(::seastar::futurize<base_t>::make_exception_future(std::move(state))) { + } + [[gnu::always_inline]] + _future(exception_future_marker, std::exception_ptr&& ep) noexcept + : base_t(::seastar::futurize<base_t>::make_exception_future(std::move(ep))) { + } + + template <template <class...> class ErroratedFuture, + class = std::void_t< + typename ErroratedFuture< + ::crimson::errorated_future_marker<ValueT>>::errorator_type>> + operator ErroratedFuture<errorated_future_marker<ValueT>> () && { + using dest_errorator_t = \ + typename ErroratedFuture< + ::crimson::errorated_future_marker<ValueT>>::errorator_type; + static_assert(dest_errorator_t::template contains_once_v<errorator_type>, + "conversion is possible to more-or-eq errorated future!"); + return static_cast<base_t&&>(*this); + } + + // initialize future as failed without throwing. `make_exception_future()` + // internally uses `std::make_exception_ptr()`. cppreference.com shouldn't + // be misinterpreted when it says: + // + // "This is done as if executing the following code: + // try { + // throw e; + // } catch(...) { + // return std::current_exception(); + // }", + // + // the "as if" is absolutely crucial because modern GCCs employ optimized + // path for it. See: + // * https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=cce8e59224e18858749a2324bce583bcfd160d6c, + // * https://gcc.gnu.org/ml/gcc-patches/2016-08/msg00373.html. + // + // This behavior, combined with `__cxa_exception_type()` for inspecting + // exception's type, allows for throw/catch-free handling of stateless + // exceptions (which is fine for error codes). Stateful jumbos would be + // actually a bit harder as `_M_get()` is private, and thus rethrowing is + // necessary to get to the state inside. However, it's not unthinkable to + // see another extension bringing operator*() to the exception pointer... + // + // TODO: we don't really need to `make_exception_ptr` each time. It still + // allocates memory underneath while can be replaced with single instance + // per type created on start-up. 
+ template <class ErrorT, + class DecayedT = std::decay_t<ErrorT>, + bool IsError = is_error_v<DecayedT>, + class = std::enable_if_t<IsError>> + _future(ErrorT&& e) + : base_t( + seastar::make_exception_future<ValueT>( + errorator_type::make_exception_ptr(e))) { + static_assert(errorator_type::contains_once_v<DecayedT>, + "ErrorT is not enlisted in errorator"); + } + + template <class ValueFuncT, class ErrorVisitorT> + auto safe_then(ValueFuncT&& valfunc, ErrorVisitorT&& errfunc) { + static_assert((... && std::is_invocable_v<ErrorVisitorT, + AllowedErrors>), + "provided Error Visitor is not exhaustive"); + static_assert(std::is_void_v<ValueT> ? std::is_invocable_v<ValueFuncT> + : std::is_invocable_v<ValueFuncT, ValueT>, + "Value Func is not invocable with future's value"); + using value_func_result_t = + typename std::conditional_t<std::is_void_v<ValueT>, + std::invoke_result<ValueFuncT>, + std::invoke_result<ValueFuncT, ValueT>>::type; + // recognize whether there can be any error coming from the Value + // Function. + using value_func_errorator_t = get_errorator_t<value_func_result_t>; + // mutate the Value Function's errorator to harvest errors coming + // from the Error Visitor. Yes, it's perfectly fine to fail error + // handling at one step and delegate even broader set of issues + // to next continuation. + using return_errorator_t = make_errorator_t< + value_func_errorator_t, + std::decay_t<std::invoke_result_t<ErrorVisitorT, AllowedErrors>>...>; + // OK, now we know about all errors next continuation must take + // care about. If Visitor handled everything and the Value Func + // doesn't return any, we'll finish with errorator<>::future + // which is just vanilla seastar::future – that's it, next cont + // finally could use `.then()`! + using futurator_t = \ + typename return_errorator_t::template futurize<value_func_result_t>; + // `seastar::futurize`, used internally by `then_wrapped()`, would + // wrap any non-`seastar::future` type coming from Value Func into + // `seastar::future`. As we really don't want to end with things + // like `seastar::future<errorator::future<...>>`, we need either: + // * convert the errorated future into plain in the lambda below + // and back here or + // * specialize the `seastar::futurize<T>` to get proper kind of + // future directly from `::then_wrapped()`. + // As C++17 doesn't guarantee copy elision when non-same types are + // involved while examination of assemblies from GCC 8.1 confirmed + // extra copying, switch to the second approach has been made. + return this->then_wrapped( + [ valfunc = std::forward<ValueFuncT>(valfunc), + errfunc = std::forward<ErrorVisitorT>(errfunc) + ] (auto&& future) mutable noexcept { + if (__builtin_expect(future.failed(), false)) { + return _safe_then_handle_errors<futurator_t>( + std::move(future), std::forward<ErrorVisitorT>(errfunc)); + } else { + // NOTE: using `seastar::future::get()` here is a bit bloaty + // as the method rechecks availability of future's value and, + // if it's unavailable, does the `::do_wait()` path (yes, it + // targets `seastar::thread`). Actually this is dead code as + // `then_wrapped()` executes the lambda only when the future + // is available (which means: failed or ready). However, GCC + // hasn't optimized it out: + // + // if (__builtin_expect(future.failed(), false)) { + // ea25: 48 83 bd c8 fe ff ff cmpq $0x2,-0x138(%rbp) + // ea2c: 02 + // ea2d: 0f 87 f0 05 00 00 ja f023 <ceph::osd:: + // ... 
+ // /// If get() is called in a \ref seastar::thread context, + // /// then it need not be available; instead, the thread will + // /// be paused until the future becomes available. + // [[gnu::always_inline]] + // std::tuple<T...> get() { + // if (!_state.available()) { + // ea3a: 0f 85 1b 05 00 00 jne ef5b <ceph::osd:: + // } + // ... + // + // I don't perceive this as huge issue. Though, it cannot be + // claimed errorator has 0 overhead on hot path. The perfect + // solution here would be mark the `::get_available_state()` + // as `protected` and use dedicated `get_value()` exactly as + // `::then()` already does. + return futurator_t::invoke(std::forward<ValueFuncT>(valfunc), + std::move(future).get()); + } + }); + } + + /** + * unsafe_thread_get + * + * Only valid within a seastar_thread. Ignores errorator protections + * and throws any contained exceptions. + * + * Should really only be used within test code + * (see test/crimson/gtest_seastar.h). + */ + auto &&unsafe_get() { + return seastar::future<ValueT>::get(); + } + auto unsafe_get0() { + return seastar::future<ValueT>::get0(); + } + + template <class FuncT> + _future finally(FuncT &&func) { + return this->then_wrapped( + [func = std::forward<FuncT>(func)](auto &&result) mutable noexcept { + if constexpr (seastar::InvokeReturnsAnyFuture<FuncT>) { + return ::seastar::futurize_invoke(std::forward<FuncT>(func)).then_wrapped( + [result = std::move(result)](auto&& f_res) mutable { + // TODO: f_res.failed() + (void)f_res.discard_result(); + return std::move(result); + }); + } else { + try { + func(); + } catch (...) { + // TODO: rethrow + } + return std::move(result); + } + }); + } + + _future<::crimson::errorated_future_marker<void>> + discard_result() noexcept { + return safe_then([](auto&&) {}); + } + + // taking ErrorFuncOne and ErrorFuncTwo separately from ErrorFuncTail + // to avoid SFINAE + template <class ValueFunc, + class ErrorFuncHead, + class... ErrorFuncTail> + auto safe_then(ValueFunc&& value_func, + ErrorFuncHead&& error_func_head, + ErrorFuncTail&&... error_func_tail) { + static_assert(sizeof...(ErrorFuncTail) > 0); + return safe_then( + std::forward<ValueFunc>(value_func), + composer(std::forward<ErrorFuncHead>(error_func_head), + std::forward<ErrorFuncTail>(error_func_tail)...)); + } + + template <class ValueFunc> + auto safe_then(ValueFunc&& value_func) { + return safe_then(std::forward<ValueFunc>(value_func), + errorator_type::pass_further{}); + } + + template <class ValueFunc, + class... ErrorFuncs> + auto safe_then_unpack(ValueFunc&& value_func, + ErrorFuncs&&... error_funcs) { + return safe_then( + [value_func=std::move(value_func)] (ValueT&& tuple) mutable { + assert_moveable(value_func); + return std::apply(std::move(value_func), std::move(tuple)); + }, + std::forward<ErrorFuncs>(error_funcs)... + ); + } + + template <class Func> + void then(Func&&) = delete; + + template <class ErrorVisitorT> + auto handle_error(ErrorVisitorT&& errfunc) { + static_assert((... 
&& std::is_invocable_v<ErrorVisitorT, + AllowedErrors>), + "provided Error Visitor is not exhaustive"); + using return_errorator_t = make_errorator_t< + errorator<>, + std::decay_t<std::invoke_result_t<ErrorVisitorT, AllowedErrors>>...>; + using futurator_t = \ + typename return_errorator_t::template futurize<::seastar::future<ValueT>>; + return this->then_wrapped( + [ errfunc = std::forward<ErrorVisitorT>(errfunc) + ] (auto&& future) mutable noexcept { + if (__builtin_expect(future.failed(), false)) { + return _safe_then_handle_errors<futurator_t>( + std::move(future), std::forward<ErrorVisitorT>(errfunc)); + } else { + return typename futurator_t::type{ std::move(future) }; + } + }); + } + + template <class ErrorFuncHead, + class... ErrorFuncTail> + auto handle_error(ErrorFuncHead&& error_func_head, + ErrorFuncTail&&... error_func_tail) { + static_assert(sizeof...(ErrorFuncTail) > 0); + return this->handle_error( + composer(std::forward<ErrorFuncHead>(error_func_head), + std::forward<ErrorFuncTail>(error_func_tail)...)); + } + + private: + // for ::crimson::do_for_each + template <class Func> + auto _then(Func&& func) { + return base_t::then(std::forward<Func>(func)); + } + template <class T> + auto _forward_to(T&& pr) { + return base_t::forward_to(std::forward<T>(pr)); + } + template<typename Iterator, typename AsyncAction> + friend inline auto ::crimson::do_for_each(Iterator begin, + Iterator end, + AsyncAction action); + + template <typename Iterator, typename AsyncAction, typename FutureT> + friend class ::crimson::do_for_each_state; + + template<typename AsyncAction> + friend inline auto ::crimson::repeat(AsyncAction action); + + template <typename Result> + friend class ::seastar::future; + + // let seastar::do_with_impl to up-cast us to seastar::future. + template<typename T, typename F> + friend inline auto ::seastar::internal::do_with_impl(T&& rvalue, F&& f); + template<typename T1, typename T2, typename T3_or_F, typename... More> + friend inline auto ::seastar::internal::do_with_impl(T1&& rv1, T2&& rv2, T3_or_F&& rv3, More&&... more); + template<typename, typename> + friend class ::crimson::interruptible::interruptible_future_detail; + friend class ::crimson::parallel_for_each_state<AllowedErrors...>; + template <typename IC, typename FT> + friend class ::crimson::interruptible::parallel_for_each_state; + }; + + class Enabler {}; + + template <typename T> + using EnableIf = typename std::enable_if<contains_once_v<std::decay_t<T>>, Enabler>::type; + + template <typename ErrorFunc> + struct all_same_way_t { + ErrorFunc func; + all_same_way_t(ErrorFunc &&error_func) + : func(std::forward<ErrorFunc>(error_func)) {} + + template <typename ErrorT, EnableIf<ErrorT>...> + decltype(auto) operator()(ErrorT&& e) { + using decayed_t = std::decay_t<decltype(e)>; + auto&& handler = + decayed_t::error_t::handle(std::forward<ErrorFunc>(func)); + static_assert(std::is_invocable_v<decltype(handler), ErrorT>); + return std::invoke(std::move(handler), std::forward<ErrorT>(e)); + } + }; + +public: + // HACK: `errorated_future_marker` and `_future` is just a hack to + // specialize `seastar::futurize` for category of class templates: + // `future<...>` from distinct errorators. Such tricks are usually + // performed basing on SFINAE and `std::void_t` to check existence + // of a trait/member (`future<...>::errorator_type` in our case). + // Unfortunately, this technique can't be applied as the `futurize` + // lacks the optional parameter. 
The problem looks awfully similar + // to following SO item: https://stackoverflow.com/a/38860413. + template <class ValueT=void> + using future = _future<::crimson::errorated_future_marker<ValueT>>; + + // the visitor that forwards handling of all errors to next continuation + struct pass_further { + template <class ErrorT, EnableIf<ErrorT>...> + decltype(auto) operator()(ErrorT&& e) { + static_assert(contains_once_v<std::decay_t<ErrorT>>, + "passing further disallowed ErrorT"); + return std::forward<ErrorT>(e); + } + }; + + struct discard_all { + template <class ErrorT, EnableIf<ErrorT>...> + void operator()(ErrorT&&) { + static_assert(contains_once_v<std::decay_t<ErrorT>>, + "discarding disallowed ErrorT"); + } + }; + + template <typename T> + static future<T> make_errorator_future(seastar::future<T>&& fut) { + return std::move(fut); + } + + // assert_all{ "TODO" }; + class assert_all { + const char* const msg = nullptr; + public: + template <std::size_t N> + assert_all(const char (&msg)[N]) + : msg(msg) { + } + assert_all() = default; + + template <class ErrorT, EnableIf<ErrorT>...> + void operator()(ErrorT&&) { + static_assert(contains_once_v<std::decay_t<ErrorT>>, + "discarding disallowed ErrorT"); + if (msg) { + ceph_abort_msg(msg); + } else { + ceph_abort(); + } + } + }; + + template <class ErrorFunc> + static decltype(auto) all_same_way(ErrorFunc&& error_func) { + return all_same_way_t<ErrorFunc>{std::forward<ErrorFunc>(error_func)}; + }; + + // get a new errorator by extending current one with new errors + template <class... NewAllowedErrorsT> + using extend = errorator<AllowedErrors..., NewAllowedErrorsT...>; + + // get a new errorator by summing and deduplicating error set of + // the errorator `unify<>` is applied on with another errorator + // provided as template parameter. + template <class OtherErroratorT> + struct unify { + // 1st: generic NOP template + }; + template <class OtherAllowedErrorsHead, + class... OtherAllowedErrorsTail> + struct unify<errorator<OtherAllowedErrorsHead, + OtherAllowedErrorsTail...>> { + private: + // 2nd: specialization for errorators with non-empty error set. + // + // split error set of other errorator, passed as template param, + // into head and tail. Mix error set of this errorator with head + // of the other one only if it isn't already present in the set. + using step_errorator = std::conditional_t< + contains_once_v<OtherAllowedErrorsHead> == false, + errorator<AllowedErrors..., OtherAllowedErrorsHead>, + errorator<AllowedErrors...>>; + using rest_errorator = errorator<OtherAllowedErrorsTail...>; + + public: + using type = typename step_errorator::template unify<rest_errorator>::type; + }; + template <class... EmptyPack> + struct unify<errorator<EmptyPack...>> { + // 3rd: recursion finisher + static_assert(sizeof...(EmptyPack) == 0); + using type = errorator<AllowedErrors...>; + }; + + // get a new errorator by extending current one with another errorator + template <class E> + using extend_ertr = typename unify<E>::type; + + template <typename T=void, typename... A> + static future<T> make_ready_future(A&&... 
value) { + return future<T>(ready_future_marker(), std::forward<A>(value)...); + } + + template <typename T=void> + static + future<T> make_exception_future2(std::exception_ptr&& ex) noexcept { + return future<T>(exception_future_marker(), std::move(ex)); + } + template <typename T=void> + static + future<T> make_exception_future2(seastar::future_state_base&& state) noexcept { + return future<T>(exception_future_marker(), std::move(state)); + } + template <typename T=void, typename Exception> + static + future<T> make_exception_future2(Exception&& ex) noexcept { + return make_exception_future2<T>(std::make_exception_ptr(std::forward<Exception>(ex))); + } + + static auto now() { + return make_ready_future<>(); + } + + template <typename Container, typename Func> + static inline auto parallel_for_each(Container&& container, Func&& func) noexcept { + return crimson::parallel_for_each<decltype(std::begin(container)), Func, AllowedErrors...>( + std::begin(container), + std::end(container), + std::forward<Func>(func)); + } + + template <typename Iterator, typename Func> + static inline errorator<AllowedErrors...>::future<> + parallel_for_each(Iterator first, Iterator last, Func&& func) noexcept { + return crimson::parallel_for_each<Iterator, Func, AllowedErrors...>( + first, + last, + std::forward<Func>(func)); + } +private: + template <class T> + class futurize { + using vanilla_futurize = seastar::futurize<T>; + + // explicit specializations for nested type is not allowed unless both + // the member template and the enclosing template are specialized. see + // section temp.expl.spec, N4659 + template <class Stored, int Dummy = 0> + struct stored_to_future { + using type = future<Stored>; + }; + template <int Dummy> + struct stored_to_future <seastar::internal::monostate, Dummy> { + using type = future<>; + }; + + public: + using type = + typename stored_to_future<typename vanilla_futurize::value_type>::type; + + template <class Func, class... Args> + static type invoke(Func&& func, Args&&... args) { + try { + return vanilla_futurize::invoke(std::forward<Func>(func), + std::forward<Args>(args)...); + } catch (...) { + return make_exception_future(std::current_exception()); + } + } + + template <class Func> + static type invoke(Func&& func, seastar::internal::monostate) { + try { + return vanilla_futurize::invoke(std::forward<Func>(func)); + } catch (...) { + return make_exception_future(std::current_exception()); + } + } + + template <typename Arg> + static type make_exception_future(Arg&& arg) { + return vanilla_futurize::make_exception_future(std::forward<Arg>(arg)); + } + }; + template <template <class...> class ErroratedFutureT, + class ValueT> + class futurize<ErroratedFutureT<::crimson::errorated_future_marker<ValueT>>> { + public: + using type = ::crimson::errorator<AllowedErrors...>::future<ValueT>; + + template <class Func, class... Args> + static type invoke(Func&& func, Args&&... args) { + try { + return ::seastar::futurize_invoke(std::forward<Func>(func), + std::forward<Args>(args)...); + } catch (...) { + return make_exception_future(std::current_exception()); + } + } + + template <class Func> + static type invoke(Func&& func, seastar::internal::monostate) { + try { + return ::seastar::futurize_invoke(std::forward<Func>(func)); + } catch (...) 
{ + return make_exception_future(std::current_exception()); + } + } + + template <typename Arg> + static type make_exception_future(Arg&& arg) { + return ::crimson::errorator<AllowedErrors...>::make_exception_future2<ValueT>(std::forward<Arg>(arg)); + } + }; + + template <typename InterruptCond, typename FutureType> + class futurize< + ::crimson::interruptible::interruptible_future_detail< + InterruptCond, FutureType>> { + public: + using type = ::crimson::interruptible::interruptible_future_detail< + InterruptCond, typename futurize<FutureType>::type>; + + template <typename Func, typename... Args> + static type invoke(Func&& func, Args&&... args) { + try { + return ::seastar::futurize_invoke(std::forward<Func>(func), + std::forward<Args>(args)...); + } catch(...) { + return seastar::futurize< + ::crimson::interruptible::interruptible_future_detail< + InterruptCond, FutureType>>::make_exception_future( + std::current_exception()); + } + } + template <typename Func> + static type invoke(Func&& func, seastar::internal::monostate) { + try { + return ::seastar::futurize_invoke(std::forward<Func>(func)); + } catch(...) { + return seastar::futurize< + ::crimson::interruptible::interruptible_future_detail< + InterruptCond, FutureType>>::make_exception_future( + std::current_exception()); + } + } + template <typename Arg> + static type make_exception_future(Arg&& arg) { + return ::seastar::futurize< + ::crimson::interruptible::interruptible_future_detail< + InterruptCond, FutureType>>::make_exception_future( + std::forward<Arg>(arg)); + } + }; + + template <class ErrorT> + static std::exception_ptr make_exception_ptr(ErrorT&& e) { + // calling via interface class due to encapsulation and friend relations. + return e.error_t<std::decay_t<ErrorT>>::to_exception_ptr(); + } + + // needed because of: + // * return_errorator_t::template futurize<...> in `safe_then()`, + // * conversion to `std::exception_ptr` in `future::future(ErrorT&&)`. + // the friendship with all errorators is an idea from Kefu to fix build + // issues on GCC 9. This version likely fixes some access violation bug + // we were exploiting before. + template <class...> + friend class errorator; + template<typename, typename> + friend class ::crimson::interruptible::interruptible_future_detail; +}; // class errorator, generic template + +// no errors? errorator<>::future is plain seastar::future then! +template <> +class errorator<> { +public: + template <class ValueT=void> + using future = ::seastar::futurize_t<ValueT>; + + template <class T> + using futurize = ::seastar::futurize<T>; + + // get a new errorator by extending current one with errors + template <class... NewAllowedErrors> + using extend = errorator<NewAllowedErrors...>; + + // get a new errorator by extending current one with another errorator + template <class E> + using extend_ertr = E; + + // errorator with empty error set never contains any error + template <class T> + static constexpr bool contains_once_v = false; +}; // class errorator, <> specialization + + +template <class ErroratorOne, + class ErroratorTwo, + class... FurtherErrators> +struct compound_errorator { +private: + // generic template. Empty `FurtherErrators` are handled by + // the specialization below. 
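+  // As a rough usage sketch (using the ct_error aliases defined further
+  // down in this header), unifying two overlapping error sets
+  // deduplicates them:
+  //
+  //   using a_ertr = errorator<ct_error::enoent, ct_error::invarg>;
+  //   using b_ertr = errorator<ct_error::invarg, ct_error::eagain>;
+  //   // ab_ertr should be errorator<ct_error::enoent, ct_error::invarg,
+  //   //                             ct_error::eagain>
+  //   using ab_ertr = compound_errorator_t<a_ertr, b_ertr>;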
+ static_assert(sizeof...(FurtherErrators) > 0); + using step = + typename compound_errorator<ErroratorOne, ErroratorTwo>::type; + +public: + using type = + typename compound_errorator<step, FurtherErrators...>::type; +}; +template <class ErroratorOne, + class ErroratorTwo> +struct compound_errorator<ErroratorOne, ErroratorTwo> { + // specialization for empty `FurtherErrators` arg pack + using type = + typename ErroratorOne::template unify<ErroratorTwo>::type; +}; +template <class... Args> +using compound_errorator_t = typename compound_errorator<Args...>::type; + +// this is conjunction of two nasty features: C++14's variable template +// and inline global variable of C++17. The latter is crucial to ensure +// the variable will get the same address across all translation units. +template <int ErrorV> +inline std::error_code ec = std::error_code(ErrorV, std::generic_category()); + +template <int ErrorV> +using ct_error_code = unthrowable_wrapper<const std::error_code&, ec<ErrorV>>; + +namespace ct_error { + using enoent = ct_error_code<static_cast<int>(std::errc::no_such_file_or_directory)>; + using enodata = ct_error_code<static_cast<int>(std::errc::no_message_available)>; + using invarg = ct_error_code<static_cast<int>(std::errc::invalid_argument)>; + using input_output_error = ct_error_code<static_cast<int>(std::errc::io_error)>; + using object_corrupted = ct_error_code<static_cast<int>(std::errc::illegal_byte_sequence)>; + using permission_denied = ct_error_code<static_cast<int>(std::errc::permission_denied)>; + using operation_not_supported = + ct_error_code<static_cast<int>(std::errc::operation_not_supported)>; + using not_connected = ct_error_code<static_cast<int>(std::errc::not_connected)>; + using timed_out = ct_error_code<static_cast<int>(std::errc::timed_out)>; + using erange = + ct_error_code<static_cast<int>(std::errc::result_out_of_range)>; + using ebadf = + ct_error_code<static_cast<int>(std::errc::bad_file_descriptor)>; + using enospc = + ct_error_code<static_cast<int>(std::errc::no_space_on_device)>; + using value_too_large = ct_error_code<static_cast<int>(std::errc::value_too_large)>; + using eagain = + ct_error_code<static_cast<int>(std::errc::resource_unavailable_try_again)>; + using file_too_large = + ct_error_code<static_cast<int>(std::errc::file_too_large)>; + using address_in_use = ct_error_code<static_cast<int>(std::errc::address_in_use)>; + using address_not_available = ct_error_code<static_cast<int>(std::errc::address_not_available)>; + using ecanceled = ct_error_code<static_cast<int>(std::errc::operation_canceled)>; + using einprogress = ct_error_code<static_cast<int>(std::errc::operation_in_progress)>; + using enametoolong = ct_error_code<static_cast<int>(std::errc::filename_too_long)>; + using eexist = ct_error_code<static_cast<int>(std::errc::file_exists)>; + using edquot = ct_error_code<int(122)>; + constexpr int cmp_fail_error_value = 4095; + using cmp_fail = ct_error_code<int(cmp_fail_error_value)>; + + struct pass_further_all { + template <class ErrorT> + decltype(auto) operator()(ErrorT&& e) { + return std::forward<ErrorT>(e); + } + }; + + struct discard_all { + template <class ErrorT> + void operator()(ErrorT&&) { + } + }; + + class assert_all { + const char* const msg = nullptr; + public: + template <std::size_t N> + assert_all(const char (&msg)[N]) + : msg(msg) { + } + assert_all() = default; + + template <class ErrorT> + void operator()(ErrorT&&) { + if (msg) { + ceph_abort(msg); + } else { + ceph_abort(); + } + } + }; + + template <class ErrorFunc> 
+ static decltype(auto) all_same_way(ErrorFunc&& error_func) { + return [ + error_func = std::forward<ErrorFunc>(error_func) + ] (auto&& e) mutable -> decltype(auto) { + using decayed_t = std::decay_t<decltype(e)>; + auto&& handler = + decayed_t::error_t::handle(std::forward<ErrorFunc>(error_func)); + return std::invoke(std::move(handler), std::forward<decltype(e)>(e)); + }; + }; +} + +using stateful_errc = stateful_error_t<std::errc>; +using stateful_errint = stateful_error_t<int>; +using stateful_ec = stateful_error_t<std::error_code>; + +template <typename F> +struct is_errorated_future { + static constexpr bool value = false; +}; +template <template <class...> class ErroratedFutureT, + class ValueT> +struct is_errorated_future< + ErroratedFutureT<::crimson::errorated_future_marker<ValueT>> + > { + static constexpr bool value = true; +}; +template <typename T> +constexpr bool is_errorated_future_v = is_errorated_future<T>::value; + +} // namespace crimson + + +// open the `seastar` namespace to specialize `futurize`. This is not +// pretty for sure. I just hope it's not worse than e.g. specializing +// `hash` in the `std` namespace. The justification is copy avoidance +// in `future<...>::safe_then()`. See the comments there for details. +namespace seastar { + +// Container is a placeholder for errorator::_future<> template +template <template <class> class Container, + class Value> +struct futurize<Container<::crimson::errorated_future_marker<Value>>> { + using errorator_type = typename Container< + ::crimson::errorated_future_marker<Value>>::errorator_type; + + using type = typename errorator_type::template future<Value>; + using value_type = seastar::internal::future_stored_type_t<Value>; + + template<typename Func, typename... FuncArgs> + [[gnu::always_inline]] + static type apply(Func&& func, std::tuple<FuncArgs...>&& args) noexcept { + try { + return std::apply( + std::forward<Func>(func), + std::forward<std::tuple<FuncArgs...>>(args)); + } catch (...) { + return make_exception_future(std::current_exception()); + } + } + + template<typename Func, typename... FuncArgs> + [[gnu::always_inline]] + static inline type invoke(Func&& func, FuncArgs&&... args) noexcept { + try { + return func(std::forward<FuncArgs>(args)...); + } catch (...) { + return make_exception_future(std::current_exception()); + } + } + + template <class Func> + [[gnu::always_inline]] + static type invoke(Func&& func, seastar::internal::monostate) noexcept { + try { + return func(); + } catch (...) { + return make_exception_future(std::current_exception()); + } + } + + template <typename Arg> + [[gnu::always_inline]] + static type make_exception_future(Arg&& arg) { + return errorator_type::template make_exception_future2<Value>(std::forward<Arg>(arg)); + } + +private: + template<typename PromiseT, typename Func> + static void satisfy_with_result_of(PromiseT&& pr, Func&& func) { + // this may use the protected variant of `seastar::future::forward_to()` + // because: + // 1. `seastar::future` established a friendship with with all + // specializations of `seastar::futurize`, including this + // one (we're in the `seastar` namespace!) WHILE + // 2. any errorated future declares now the friendship with any + // `seastar::futurize<...>`. 
+ func().forward_to(std::move(pr)); + } + template <typename U> + friend class future; +}; + +template <template <class> class Container, + class Value> +struct continuation_base_from_future<Container<::crimson::errorated_future_marker<Value>>> { + using type = continuation_base<Value>; +}; + +} // namespace seastar diff --git a/src/crimson/common/exception.h b/src/crimson/common/exception.h new file mode 100644 index 000000000..682fef69b --- /dev/null +++ b/src/crimson/common/exception.h @@ -0,0 +1,54 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <exception> +#include <seastar/core/future.hh> +#include <seastar/core/future-util.hh> + +#include "crimson/common/log.h" +#include "crimson/common/interruptible_future.h" + +namespace crimson::common { + +class interruption : public std::exception +{}; + +class system_shutdown_exception final : public interruption{ +public: + const char* what() const noexcept final { + return "system shutting down"; + } +}; + +class actingset_changed final : public interruption { +public: + actingset_changed(bool sp) : still_primary(sp) {} + const char* what() const noexcept final { + return "acting set changed"; + } + bool is_primary() const { + return still_primary; + } +private: + const bool still_primary; +}; + +template<typename Func, typename... Args> +inline seastar::future<> handle_system_shutdown(Func&& func, Args&&... args) +{ + return seastar::futurize_invoke(std::forward<Func>(func), + std::forward<Args>(args)...) + .handle_exception([](std::exception_ptr eptr) { + if (*eptr.__cxa_exception_type() == + typeid(crimson::common::system_shutdown_exception)) { + crimson::get_logger(ceph_subsys_osd).debug( + "operation skipped, system shutdown"); + return seastar::now(); + } + std::rethrow_exception(eptr); + }); +} + +} diff --git a/src/crimson/common/fatal_signal.cc b/src/crimson/common/fatal_signal.cc new file mode 100644 index 000000000..f2983769d --- /dev/null +++ b/src/crimson/common/fatal_signal.cc @@ -0,0 +1,172 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#include "fatal_signal.h" + +#include <csignal> +#include <iostream> +#include <string_view> + +#define BOOST_STACKTRACE_USE_ADDR2LINE +#include <boost/stacktrace.hpp> +#include <seastar/core/reactor.hh> + +#include "common/safe_io.h" +#include "include/scope_guard.h" + +FatalSignal::FatalSignal() +{ + install_oneshot_signals_handler<SIGSEGV, + SIGABRT, + SIGBUS, + SIGILL, + SIGFPE, + SIGXCPU, + SIGXFSZ, + SIGSYS>(); +} + +template <int... SigNums> +void FatalSignal::install_oneshot_signals_handler() +{ + (install_oneshot_signal_handler<SigNums>() , ...); +} + +static void reraise_fatal(const int signum) +{ + // use default handler to dump core + ::signal(signum, SIG_DFL); + + // normally, we won't get here. if we do, something is very weird. + if (::raise(signum)) { + std::cerr << "reraise_fatal: failed to re-raise signal " << signum + << std::endl; + } else { + std::cerr << "reraise_fatal: default handler for signal " << signum + << " didn't terminate the process?" 
<< std::endl; + } + std::cerr << std::flush; + ::_exit(1); +} + +[[gnu::noinline]] void FatalSignal::signal_entry( + const int signum, + siginfo_t* const info, + void*) +{ + if (static std::atomic_bool handled{false}; handled.exchange(true)) { + return; + } + assert(info); + FatalSignal::signaled(signum, *info); + reraise_fatal(signum); +} + +template <int SigNum> +void FatalSignal::install_oneshot_signal_handler() +{ + struct sigaction sa; + // it's a bad idea to use a lambda here. On GCC there are `operator()` + // and `_FUN()`. Controlling their inlineability is hard (impossible?). + sa.sa_sigaction = signal_entry; + sigemptyset(&sa.sa_mask); + sa.sa_flags = SA_SIGINFO | SA_RESTART | SA_NODEFER; + if constexpr (SigNum == SIGSEGV) { + sa.sa_flags |= SA_ONSTACK; + } + [[maybe_unused]] auto r = ::sigaction(SigNum, &sa, nullptr); + assert(r == 0); +} + + +[[gnu::noinline]] static void print_backtrace(std::string_view cause) { + std::cerr << cause; + if (seastar::engine_is_ready()) { + std::cerr << " on shard " << seastar::this_shard_id(); + } + // nobody wants to see things like `FatalSignal::signaled()` or + // `print_backtrace()` in our backtraces. `+ 1` is for the extra + // frame created by kernel (signal trampoline, it will take care + // about e.g. sigreturn(2) calling; see the man page). + constexpr std::size_t FRAMES_TO_SKIP = 3 + 1; + std::cerr << ".\nBacktrace:\n"; + std::cerr << boost::stacktrace::stacktrace( + FRAMES_TO_SKIP, + static_cast<std::size_t>(-1)/* max depth same as the default one */); + std::cerr << std::flush; + // TODO: dump crash related meta data to $crash_dir + // see handle_fatal_signal() +} + +static void print_segv_info(const siginfo_t& siginfo) +{ + std::cerr \ + << "Dump of siginfo:" << std::endl + << " si_signo: " << siginfo.si_signo << std::endl + << " si_errno: " << siginfo.si_errno << std::endl + << " si_code: " << siginfo.si_code << std::endl + << " si_pid: " << siginfo.si_pid << std::endl + << " si_uid: " << siginfo.si_uid << std::endl + << " si_status: " << siginfo.si_status << std::endl + << " si_utime: " << siginfo.si_utime << std::endl + << " si_stime: " << siginfo.si_stime << std::endl + << " si_int: " << siginfo.si_int << std::endl + << " si_ptr: " << siginfo.si_ptr << std::endl + << " si_overrun: " << siginfo.si_overrun << std::endl + << " si_timerid: " << siginfo.si_timerid << std::endl + << " si_addr: " << siginfo.si_addr << std::endl + << " si_band: " << siginfo.si_band << std::endl + << " si_fd: " << siginfo.si_fd << std::endl + << " si_addr_lsb: " << siginfo.si_addr_lsb << std::endl + << " si_lower: " << siginfo.si_lower << std::endl + << " si_upper: " << siginfo.si_upper << std::endl + << " si_pkey: " << siginfo.si_pkey << std::endl + << " si_call_addr: " << siginfo.si_call_addr << std::endl + << " si_syscall: " << siginfo.si_syscall << std::endl + << " si_arch: " << siginfo.si_arch << std::endl; + std::cerr << std::flush; +} + +static void print_proc_maps() +{ + const int fd = ::open("/proc/self/maps", O_RDONLY); + if (fd < 0) { + std::cerr << "can't open /proc/self/maps. procfs not mounted?" 
<< std::endl; + return; + } + const auto fd_guard = make_scope_guard([fd] { + ::close(fd); + }); + std::cerr << "Content of /proc/self/maps:" << std::endl; + while (true) { + char chunk[4096] = {0, }; + const ssize_t r = safe_read(fd, chunk, sizeof(chunk) - 1); + if (r < 0) { + std::cerr << "error while reading /proc/self/maps: " << r << std::endl; + return; + } else { + std::cerr << chunk << std::flush; + if (r < static_cast<ssize_t>(sizeof(chunk) - 1)) { + return; // eof + } + } + } +} + +[[gnu::noinline]] void FatalSignal::signaled(const int signum, + const siginfo_t& siginfo) +{ + switch (signum) { + case SIGSEGV: + print_backtrace("Segmentation fault"); + print_segv_info(siginfo); + break; + case SIGABRT: + print_backtrace("Aborting"); + break; + default: + print_backtrace(fmt::format("Signal {}", signum)); + break; + } + print_proc_maps(); +} diff --git a/src/crimson/common/fatal_signal.h b/src/crimson/common/fatal_signal.h new file mode 100644 index 000000000..626017c93 --- /dev/null +++ b/src/crimson/common/fatal_signal.h @@ -0,0 +1,21 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <csignal> + +class FatalSignal { +public: + FatalSignal(); + +private: + static void signal_entry(int signum, siginfo_t* siginfo, void* p); + static void signaled(int signum, const siginfo_t& siginfo); + + template <int... SigNums> + void install_oneshot_signals_handler(); + + template <int SigNum> + void install_oneshot_signal_handler(); +}; diff --git a/src/crimson/common/fixed_kv_node_layout.h b/src/crimson/common/fixed_kv_node_layout.h new file mode 100644 index 000000000..676563594 --- /dev/null +++ b/src/crimson/common/fixed_kv_node_layout.h @@ -0,0 +1,730 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <algorithm> +#include <iostream> + +#include <boost/iterator/counting_iterator.hpp> + +#include "include/byteorder.h" + +#include "crimson/common/layout.h" + +namespace crimson::common { + +template <typename T, bool is_const> +struct maybe_const_t { +}; +template<typename T> +struct maybe_const_t<T, true> { + using type = const T*; +}; +template<typename T> +struct maybe_const_t<T, false> { + using type = T*; +}; + + +/** + * FixedKVNodeLayout + * + * Reusable implementation of a fixed size block mapping + * K -> V with internal representations KINT and VINT. + * + * Uses absl::container_internal::Layout for the actual memory layout. + * + * The primary interface exposed is centered on the iterator + * and related methods. + * + * Also included are helpers for doing splits and merges as for a btree. 
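+ *
+ * A minimal usage sketch (meta_t/meta_le_t are hypothetical stand-ins,
+ * not types provided by this header, and buf/key are assumed to be a
+ * properly formatted node buffer and a lookup key):
+ *
+ *   using node_t = FixedKVNodeLayout<
+ *     256, meta_t, meta_le_t, uint64_t, ceph_le64, uint64_t, ceph_le64>;
+ *
+ *   node_t node{buf};
+ *   auto iter = node.lower_bound(key);
+ *   if (iter != node.end() && iter->get_key() == key) {
+ *     auto val = iter->get_val();
+ *   }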
+ */ +template < + size_t CAPACITY, + typename Meta, + typename MetaInt, + typename K, + typename KINT, + typename V, + typename VINT, + bool VALIDATE_INVARIANTS=true> +class FixedKVNodeLayout { + char *buf = nullptr; + + using L = absl::container_internal::Layout<ceph_le32, MetaInt, KINT, VINT>; + static constexpr L layout{1, 1, CAPACITY, CAPACITY}; + +public: + template <bool is_const> + struct iter_t { + friend class FixedKVNodeLayout; + using parent_t = typename maybe_const_t<FixedKVNodeLayout, is_const>::type; + + parent_t node; + uint16_t offset = 0; + + iter_t() = default; + iter_t( + parent_t parent, + uint16_t offset) : node(parent), offset(offset) {} + + iter_t(const iter_t &) noexcept = default; + iter_t(iter_t &&) noexcept = default; + template<bool is_const_ = is_const> + iter_t(const iter_t<false>& it, std::enable_if_t<is_const_, int> = 0) + : iter_t{it.node, it.offset} + {} + iter_t &operator=(const iter_t &) = default; + iter_t &operator=(iter_t &&) = default; + + // Work nicely with for loops without requiring a nested type. + using reference = iter_t&; + iter_t &operator*() { return *this; } + iter_t *operator->() { return this; } + + iter_t operator++(int) { + auto ret = *this; + ++offset; + return ret; + } + + iter_t &operator++() { + ++offset; + return *this; + } + + iter_t operator--(int) { + assert(offset > 0); + auto ret = *this; + --offset; + return ret; + } + + iter_t &operator--() { + assert(offset > 0); + --offset; + return *this; + } + + uint16_t operator-(const iter_t &rhs) const { + assert(rhs.node == node); + return offset - rhs.offset; + } + + iter_t operator+(uint16_t off) const { + return iter_t( + node, + offset + off); + } + iter_t operator-(uint16_t off) const { + return iter_t( + node, + offset - off); + } + + friend bool operator==(const iter_t &lhs, const iter_t &rhs) { + assert(lhs.node == rhs.node); + return lhs.offset == rhs.offset; + } + + friend bool operator!=(const iter_t &lhs, const iter_t &rhs) { + return !(lhs == rhs); + } + + friend bool operator==(const iter_t<is_const> &lhs, const iter_t<!is_const> &rhs) { + assert(lhs.node == rhs.node); + return lhs.offset == rhs.offset; + } + friend bool operator!=(const iter_t<is_const> &lhs, const iter_t<!is_const> &rhs) { + return !(lhs == rhs); + } + K get_key() const { + return K(node->get_key_ptr()[offset]); + } + + K get_next_key_or_max() const { + auto next = *this + 1; + if (next == node->end()) + return std::numeric_limits<K>::max(); + else + return next->get_key(); + } + + void set_val(V val) const { + static_assert(!is_const); + node->get_val_ptr()[offset] = VINT(val); + } + + V get_val() const { + return V(node->get_val_ptr()[offset]); + }; + + bool contains(K addr) const { + return (get_key() <= addr) && (get_next_key_or_max() > addr); + } + + uint16_t get_offset() const { + return offset; + } + + private: + void set_key(K _lb) const { + static_assert(!is_const); + KINT lb; + lb = _lb; + node->get_key_ptr()[offset] = lb; + } + + typename maybe_const_t<char, is_const>::type get_key_ptr() const { + return reinterpret_cast< + typename maybe_const_t<char, is_const>::type>( + node->get_key_ptr() + offset); + } + + typename maybe_const_t<char, is_const>::type get_val_ptr() const { + return reinterpret_cast< + typename maybe_const_t<char, is_const>::type>( + node->get_val_ptr() + offset); + } + }; + using const_iterator = iter_t<true>; + using iterator = iter_t<false>; + + struct delta_t { + enum class op_t : uint8_t { + INSERT, + REMOVE, + UPDATE, + } op; + KINT key; + VINT val; + + void 
replay(FixedKVNodeLayout &l) { + switch (op) { + case op_t::INSERT: { + l.insert(l.lower_bound(key), key, val); + break; + } + case op_t::REMOVE: { + auto iter = l.find(key); + assert(iter != l.end()); + l.remove(iter); + break; + } + case op_t::UPDATE: { + auto iter = l.find(key); + assert(iter != l.end()); + l.update(iter, val); + break; + } + default: + assert(0 == "Impossible"); + } + } + + bool operator==(const delta_t &rhs) const { + return op == rhs.op && + key == rhs.key && + val == rhs.val; + } + }; + +public: + class delta_buffer_t { + std::vector<delta_t> buffer; + public: + bool empty() const { + return buffer.empty(); + } + void insert( + const K &key, + const V &val) { + KINT k; + k = key; + buffer.push_back( + delta_t{ + delta_t::op_t::INSERT, + k, + VINT(val) + }); + } + void update( + const K &key, + const V &val) { + KINT k; + k = key; + buffer.push_back( + delta_t{ + delta_t::op_t::UPDATE, + k, + VINT(val) + }); + } + void remove(const K &key) { + KINT k; + k = key; + buffer.push_back( + delta_t{ + delta_t::op_t::REMOVE, + k, + VINT() + }); + } + void replay(FixedKVNodeLayout &node) { + for (auto &i: buffer) { + i.replay(node); + } + } + size_t get_bytes() const { + return buffer.size() * sizeof(delta_t); + } + void copy_out(char *out, size_t len) { + assert(len == get_bytes()); + ::memcpy(out, reinterpret_cast<const void *>(buffer.data()), get_bytes()); + buffer.clear(); + } + void copy_in(const char *out, size_t len) { + assert(empty()); + assert(len % sizeof(delta_t) == 0); + buffer = std::vector( + reinterpret_cast<const delta_t*>(out), + reinterpret_cast<const delta_t*>(out + len)); + } + bool operator==(const delta_buffer_t &rhs) const { + return buffer == rhs.buffer; + } + }; + + void journal_insert( + const_iterator _iter, + const K &key, + const V &val, + delta_buffer_t *recorder) { + auto iter = iterator(this, _iter.offset); + if (recorder) { + recorder->insert( + key, + val); + } + insert(iter, key, val); + } + + void journal_update( + const_iterator _iter, + const V &val, + delta_buffer_t *recorder) { + auto iter = iterator(this, _iter.offset); + if (recorder) { + recorder->update(iter->get_key(), val); + } + update(iter, val); + } + + void journal_replace( + const_iterator _iter, + const K &key, + const V &val, + delta_buffer_t *recorder) { + auto iter = iterator(this, _iter.offset); + if (recorder) { + recorder->remove(iter->get_key()); + recorder->insert(key, val); + } + replace(iter, key, val); + } + + + void journal_remove( + const_iterator _iter, + delta_buffer_t *recorder) { + auto iter = iterator(this, _iter.offset); + if (recorder) { + recorder->remove(iter->get_key()); + } + remove(iter); + } + + + FixedKVNodeLayout(char *buf) : + buf(buf) {} + + virtual ~FixedKVNodeLayout() = default; + + const_iterator begin() const { + return const_iterator( + this, + 0); + } + + const_iterator end() const { + return const_iterator( + this, + get_size()); + } + + iterator begin() { + return iterator( + this, + 0); + } + + iterator end() { + return iterator( + this, + get_size()); + } + + const_iterator iter_idx(uint16_t off) const { + return const_iterator( + this, + off); + } + + const_iterator find(K l) const { + auto ret = begin(); + for (; ret != end(); ++ret) { + if (ret->get_key() == l) + break; + } + return ret; + } + iterator find(K l) { + const auto &tref = *this; + return iterator(this, tref.find(l).offset); + } + + const_iterator lower_bound(K l) const { + auto it = std::lower_bound(boost::make_counting_iterator<uint16_t>(0), + 
boost::make_counting_iterator<uint16_t>(get_size()), + l, + [this](uint16_t i, K key) { + const_iterator iter(this, i); + return iter->get_key() < key; + }); + return const_iterator(this, *it); + } + + iterator lower_bound(K l) { + const auto &tref = *this; + return iterator(this, tref.lower_bound(l).offset); + } + + const_iterator upper_bound(K l) const { + auto it = std::upper_bound(boost::make_counting_iterator<uint16_t>(0), + boost::make_counting_iterator<uint16_t>(get_size()), + l, + [this](K key, uint16_t i) { + const_iterator iter(this, i); + return key < iter->get_key(); + }); + return const_iterator(this, *it); + } + + iterator upper_bound(K l) { + const auto &tref = *this; + return iterator(this, tref.upper_bound(l).offset); + } + + const_iterator get_split_pivot() const { + return iter_idx(get_size() / 2); + } + + uint16_t get_size() const { + return *layout.template Pointer<0>(buf); + } + + /** + * set_size + * + * Set size representation to match size + */ + void set_size(uint16_t size) { + *layout.template Pointer<0>(buf) = size; + } + + /** + * get_meta/set_meta + * + * Enables stashing a templated type within the layout. + * Cannot be modified after initial write as it is not represented + * in delta_t + */ + Meta get_meta() const { + MetaInt &metaint = *layout.template Pointer<1>(buf); + return Meta(metaint); + } + void set_meta(const Meta &meta) { + *layout.template Pointer<1>(buf) = MetaInt(meta); + } + + constexpr static size_t get_capacity() { + return CAPACITY; + } + + bool operator==(const FixedKVNodeLayout &rhs) const { + if (get_size() != rhs.get_size()) { + return false; + } + + auto iter = begin(); + auto iter2 = rhs.begin(); + while (iter != end()) { + if (iter->get_key() != iter2->get_key() || + iter->get_val() != iter2->get_val()) { + return false; + } + iter++; + iter2++; + } + return true; + } + + /** + * split_into + * + * Takes *this and splits its contents into left and right. + */ + K split_into( + FixedKVNodeLayout &left, + FixedKVNodeLayout &right) const { + auto piviter = get_split_pivot(); + + left.copy_from_foreign(left.begin(), begin(), piviter); + left.set_size(piviter - begin()); + + right.copy_from_foreign(right.begin(), piviter, end()); + right.set_size(end() - piviter); + + auto [lmeta, rmeta] = get_meta().split_into(piviter->get_key()); + left.set_meta(lmeta); + right.set_meta(rmeta); + + return piviter->get_key(); + } + + /** + * merge_from + * + * Takes two nodes and copies their contents into *this. + * + * precondition: left.size() + right.size() < CAPACITY + */ + void merge_from( + const FixedKVNodeLayout &left, + const FixedKVNodeLayout &right) + { + copy_from_foreign( + end(), + left.begin(), + left.end()); + set_size(left.get_size()); + copy_from_foreign( + end(), + right.begin(), + right.end()); + set_size(left.get_size() + right.get_size()); + set_meta(Meta::merge_from(left.get_meta(), right.get_meta())); + } + + /** + * balance_into_new_nodes + * + * Takes the contents of left and right and copies them into + * replacement_left and replacement_right such that in the + * event that the number of elements is odd the extra goes to + * the left side iff prefer_left. 
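+   *
+   * For example, with left holding 3 entries and right holding 5,
+   * total = 8 and pivot_idx = 4: replacement_left receives left's 3
+   * entries plus the first entry of right, and replacement_right
+   * receives right's remaining 4 entries.  With a total of 7,
+   * pivot_idx is 3, or 4 when prefer_left is set, which is what steers
+   * the extra element to the left side.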
+ */ + static K balance_into_new_nodes( + const FixedKVNodeLayout &left, + const FixedKVNodeLayout &right, + bool prefer_left, + FixedKVNodeLayout &replacement_left, + FixedKVNodeLayout &replacement_right) + { + auto total = left.get_size() + right.get_size(); + auto pivot_idx = (left.get_size() + right.get_size()) / 2; + if (total % 2 && prefer_left) { + pivot_idx++; + } + auto replacement_pivot = pivot_idx >= left.get_size() ? + right.iter_idx(pivot_idx - left.get_size())->get_key() : + left.iter_idx(pivot_idx)->get_key(); + + if (pivot_idx < left.get_size()) { + replacement_left.copy_from_foreign( + replacement_left.end(), + left.begin(), + left.iter_idx(pivot_idx)); + replacement_left.set_size(pivot_idx); + + replacement_right.copy_from_foreign( + replacement_right.end(), + left.iter_idx(pivot_idx), + left.end()); + + replacement_right.set_size(left.get_size() - pivot_idx); + replacement_right.copy_from_foreign( + replacement_right.end(), + right.begin(), + right.end()); + replacement_right.set_size(total - pivot_idx); + } else { + replacement_left.copy_from_foreign( + replacement_left.end(), + left.begin(), + left.end()); + replacement_left.set_size(left.get_size()); + + replacement_left.copy_from_foreign( + replacement_left.end(), + right.begin(), + right.iter_idx(pivot_idx - left.get_size())); + replacement_left.set_size(pivot_idx); + + replacement_right.copy_from_foreign( + replacement_right.end(), + right.iter_idx(pivot_idx - left.get_size()), + right.end()); + replacement_right.set_size(total - pivot_idx); + } + + auto [lmeta, rmeta] = Meta::rebalance( + left.get_meta(), right.get_meta(), replacement_pivot); + replacement_left.set_meta(lmeta); + replacement_right.set_meta(rmeta); + return replacement_pivot; + } + +private: + void insert( + iterator iter, + const K &key, + const V &val) { + if (VALIDATE_INVARIANTS) { + if (iter != begin()) { + assert((iter - 1)->get_key() < key); + } + if (iter != end()) { + assert(iter->get_key() > key); + } + assert(get_size() < CAPACITY); + } + copy_from_local(iter + 1, iter, end()); + iter->set_key(key); + iter->set_val(val); + set_size(get_size() + 1); + } + + void update( + iterator iter, + V val) { + assert(iter != end()); + iter->set_val(val); + } + + void replace( + iterator iter, + const K &key, + const V &val) { + assert(iter != end()); + if (VALIDATE_INVARIANTS) { + if (iter != begin()) { + assert((iter - 1)->get_key() < key); + } + if ((iter + 1) != end()) { + assert((iter + 1)->get_key() > key); + } + } + iter->set_key(key); + iter->set_val(val); + } + + void remove(iterator iter) { + assert(iter != end()); + copy_from_local(iter, iter + 1, end()); + set_size(get_size() - 1); + } + + /** + * get_key_ptr + * + * Get pointer to start of key array + */ + KINT *get_key_ptr() { + return layout.template Pointer<2>(buf); + } + const KINT *get_key_ptr() const { + return layout.template Pointer<2>(buf); + } + + /** + * get_val_ptr + * + * Get pointer to start of val array + */ + VINT *get_val_ptr() { + return layout.template Pointer<3>(buf); + } + const VINT *get_val_ptr() const { + return layout.template Pointer<3>(buf); + } + + /** + * node_resolve/unresolve_vals + * + * If the representation for values depends in some way on the + * node in which they are located, users may implement + * resolve/unresolve to enable copy_from_foreign to handle that + * transition. 
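+   *
+   * For instance, if VINT encoded values relative to some per-node base
+   * (a hypothetical delta encoding, not something this header mandates),
+   * the source node's node_resolve_vals could rewrite the copied range
+   * into absolute form and the target node's node_unresolve_vals could
+   * then re-apply its own base.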
+ */ + virtual void node_resolve_vals(iterator from, iterator to) const {} + virtual void node_unresolve_vals(iterator from, iterator to) const {} + + /** + * copy_from_foreign + * + * Copies entries from [from_src, to_src) to tgt. + * + * tgt and from_src must be from different nodes. + * from_src and to_src must be from the same node. + */ + static void copy_from_foreign( + iterator tgt, + const_iterator from_src, + const_iterator to_src) { + assert(tgt->node != from_src->node); + assert(to_src->node == from_src->node); + memcpy( + tgt->get_val_ptr(), from_src->get_val_ptr(), + to_src->get_val_ptr() - from_src->get_val_ptr()); + memcpy( + tgt->get_key_ptr(), from_src->get_key_ptr(), + to_src->get_key_ptr() - from_src->get_key_ptr()); + from_src->node->node_resolve_vals(tgt, tgt + (to_src - from_src)); + tgt->node->node_unresolve_vals(tgt, tgt + (to_src - from_src)); + } + + /** + * copy_from_local + * + * Copies entries from [from_src, to_src) to tgt. + * + * tgt, from_src, and to_src must be from the same node. + */ + static void copy_from_local( + iterator tgt, + iterator from_src, + iterator to_src) { + assert(tgt->node == from_src->node); + assert(to_src->node == from_src->node); + memmove( + tgt->get_val_ptr(), from_src->get_val_ptr(), + to_src->get_val_ptr() - from_src->get_val_ptr()); + memmove( + tgt->get_key_ptr(), from_src->get_key_ptr(), + to_src->get_key_ptr() - from_src->get_key_ptr()); + } +}; + +} diff --git a/src/crimson/common/formatter.cc b/src/crimson/common/formatter.cc new file mode 100644 index 000000000..ab371ddbf --- /dev/null +++ b/src/crimson/common/formatter.cc @@ -0,0 +1,40 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "formatter.h" + +#include <fmt/format.h> +#if FMT_VERSION >= 60000 +#include <fmt/chrono.h> +#else +#include <fmt/time.h> +#endif + + +template <> +struct fmt::formatter<seastar::lowres_system_clock::time_point> { + // ignore the format string + template <typename ParseContext> + constexpr auto parse(ParseContext &ctx) { return ctx.begin(); } + + template <typename FormatContext> + auto format(const seastar::lowres_system_clock::time_point& t, + FormatContext& ctx) { + std::time_t tt = std::chrono::duration_cast<std::chrono::seconds>( + t.time_since_epoch()).count(); + auto milliseconds = (t.time_since_epoch() % + std::chrono::seconds(1)).count(); + return fmt::format_to(ctx.out(), "{:%Y-%m-%d %H:%M:%S} {:03d}", + fmt::localtime(tt), milliseconds); + } +}; + +namespace std { + +ostream& operator<<(ostream& out, + const seastar::lowres_system_clock::time_point& t) +{ + return out << fmt::format("{}", t); +} + +} diff --git a/src/crimson/common/formatter.h b/src/crimson/common/formatter.h new file mode 100644 index 000000000..9b7be428a --- /dev/null +++ b/src/crimson/common/formatter.h @@ -0,0 +1,13 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <seastar/core/lowres_clock.hh> + +#include "common/ceph_time.h" + +namespace std { + +ostream& operator<<(ostream& out, + const seastar::lowres_system_clock::time_point& t); + +} diff --git a/src/crimson/common/gated.h b/src/crimson/common/gated.h new file mode 100644 index 000000000..559a889a3 --- /dev/null +++ b/src/crimson/common/gated.h @@ -0,0 +1,55 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <seastar/core/gate.hh> +#include <seastar/core/future.hh> +#include 
<seastar/core/future-util.hh> + +#include "crimson/common/exception.h" +#include "crimson/common/log.h" +#include "include/ceph_assert.h" + +namespace crimson::common { + +class Gated { + public: + static seastar::logger& gated_logger() { + return crimson::get_logger(ceph_subsys_osd); + } + template <typename Func, typename T> + inline void dispatch_in_background(const char* what, T& who, Func&& func) { + (void) dispatch(what, who, func); + } + template <typename Func, typename T> + inline seastar::future<> dispatch(const char* what, T& who, Func&& func) { + return seastar::with_gate(pending_dispatch, std::forward<Func>(func) + ).handle_exception([what, &who] (std::exception_ptr eptr) { + if (*eptr.__cxa_exception_type() == typeid(system_shutdown_exception)) { + gated_logger().debug( + "{}, {} skipped, system shutdown", who, what); + return; + } + try { + std::rethrow_exception(eptr); + } catch (std::exception& e) { + gated_logger().error( + "{} dispatch() {} caught exception: {}", who, what, e.what()); + } + assert(*eptr.__cxa_exception_type() + == typeid(seastar::gate_closed_exception)); + }); + } + + seastar::future<> close() { + return pending_dispatch.close(); + } + bool is_closed() const { + return pending_dispatch.is_closed(); + } + private: + seastar::gate pending_dispatch; +}; + +}// namespace crimson::common diff --git a/src/crimson/common/interruptible_future.h b/src/crimson/common/interruptible_future.h new file mode 100644 index 000000000..c0e2c346c --- /dev/null +++ b/src/crimson/common/interruptible_future.h @@ -0,0 +1,1600 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <seastar/core/future-util.hh> +#include <seastar/core/do_with.hh> +#include <seastar/core/when_all.hh> +#include <seastar/core/thread.hh> + +#include "crimson/common/log.h" +#include "crimson/common/errorator.h" +#ifndef NDEBUG +#define INTR_FUT_DEBUG(FMT_MSG, ...) crimson::get_logger(ceph_subsys_).trace(FMT_MSG, ##__VA_ARGS__) +#else +#define INTR_FUT_DEBUG(FMT_MSG, ...) +#endif + +// The interrupt condition generally works this way: +// +// 1. It is created by call_with_interruption_impl method, and is recorded in the thread +// local global variable "::crimson::interruptible::interrupt_cond". +// 2. Any continuation that's created within the execution of the continuation +// that calls the call_with_interruption_impl method will capture the "interrupt_cond"; +// and when they starts to run, they will put that capture interruption condition +// into "::crimson::interruptible::interrupt_cond" so that further continuations +// created can also capture the interruption condition; +// 3. At the end of the continuation run, the global "interrupt_cond" will be cleared +// to prevent other continuations that are not supposed to be interrupted wrongly +// capture an interruption condition. +// With this approach, continuations capture the interrupt condition at their creation, +// restore the interrupt conditions at the beginning of their execution and clear those +// interrupt conditions at the end of their execution. So the global "interrupt_cond" +// only hold valid interrupt conditions when the corresponding continuations are actually +// running after which it gets cleared. Since continuations can't be executed simultaneously, +// different continuation chains won't be able to interfere with each other. 
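+//
+// Roughly, call_with_interruption_impl() (defined further down) carries out
+// the lifecycle above like this (condensed sketch, not literal code):
+//
+//   if (auto fut = interrupt_condition->may_interrupt<...>()) {
+//     return std::move(*fut);  // already interrupted, short-circuit
+//   }
+//   interrupt_cond<InterruptCond>.set(interrupt_condition);  // restore
+//   auto fut = seastar::futurize_invoke(func, args...);  // continuations
+//                                                        // created here
+//                                                        // capture the cond
+//   interrupt_cond<InterruptCond>.reset();  // clear before returning
+//   return fut;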
+// +// The global "interrupt_cond" can work as a signal about whether the continuation +// is supposed to be interrupted, the reason that the global "interrupt_cond" +// exists is that there may be this scenario: +// +// Say there's some method PG::func1(), in which the continuations created may +// or may not be supposed to be interrupted in different situations. If we don't +// have a global signal, we have to add an extra parameter to every method like +// PG::func1() to indicate whether the current run should create to-be-interrupted +// continuations or not. +// +// interruptor::with_interruption() and helpers can be used by users to wrap a future in +// the interruption machinery. + +namespace crimson::os::seastore { + class TransactionConflictCondition; +} + +// GCC tries to instantiate +// seastar::lw_shared_ptr<crimson::os::seastore::TransactionConflictCondition>. +// but we *may* not have the definition of TransactionConflictCondition at this moment, +// a full specialization for lw_shared_ptr_accessors helps to bypass the default +// lw_shared_ptr_accessors implementation, where std::is_base_of<.., T> is used. +namespace seastar::internal { + template<> + struct lw_shared_ptr_accessors<::crimson::os::seastore::TransactionConflictCondition, void> + : lw_shared_ptr_accessors_no_esft<::crimson::os::seastore::TransactionConflictCondition> + {}; +} + +SEASTAR_CONCEPT( +namespace crimson::interruptible { + template<typename InterruptCond, typename FutureType> + class interruptible_future_detail; +} +namespace seastar::impl { + template <typename InterruptCond, typename FutureType, typename... Rest> + struct is_tuple_of_futures<std::tuple<crimson::interruptible::interruptible_future_detail<InterruptCond, FutureType>, Rest...>> + : is_tuple_of_futures<std::tuple<Rest...>> {}; +} +) + +namespace crimson::interruptible { + +struct ready_future_marker {}; +struct exception_future_marker {}; + +template <typename InterruptCond> +class interruptible_future_builder; + +template <typename InterruptCond> +struct interruptor; + +template <typename InterruptCond> +using InterruptCondRef = seastar::lw_shared_ptr<InterruptCond>; + +template <typename InterruptCond> +struct interrupt_cond_t { + InterruptCondRef<InterruptCond> interrupt_cond; + uint64_t ref_count = 0; + void set( + InterruptCondRef<InterruptCond>& ic) { + INTR_FUT_DEBUG( + "{}: going to set interrupt_cond: {}, ic: {}", + __func__, + (void*)interrupt_cond.get(), + (void*)ic.get()); + if (!interrupt_cond) { + interrupt_cond = ic; + } + assert(interrupt_cond.get() == ic.get()); + ref_count++; + INTR_FUT_DEBUG( + "{}: interrupt_cond: {}, ref_count: {}", + __func__, + (void*)interrupt_cond.get(), + ref_count); + } + void reset() { + if (--ref_count == 0) { + INTR_FUT_DEBUG( + "{}: clearing interrupt_cond: {},{}", + __func__, + (void*)interrupt_cond.get(), + typeid(InterruptCond).name()); + interrupt_cond.release(); + } else { + INTR_FUT_DEBUG( + "{}: end without clearing interrupt_cond: {},{}, ref_count: {}", + __func__, + (void*)interrupt_cond.get(), + typeid(InterruptCond).name(), + ref_count); + } + } +}; + +template <typename InterruptCond> +thread_local interrupt_cond_t<InterruptCond> interrupt_cond; + +extern template thread_local interrupt_cond_t<crimson::os::seastore::TransactionConflictCondition> +interrupt_cond<crimson::os::seastore::TransactionConflictCondition>; + +template <typename InterruptCond, typename FutureType> +class [[nodiscard]] interruptible_future_detail {}; + +template <typename FutureType> +struct 
is_interruptible_future : public std::false_type {}; + +template <typename InterruptCond, typename FutureType> +struct is_interruptible_future< + interruptible_future_detail< + InterruptCond, + FutureType>> + : public std::true_type {}; +template <typename FutureType> +concept IsInterruptibleFuture = is_interruptible_future<FutureType>::value; +template <typename Func, typename... Args> +concept InvokeReturnsInterruptibleFuture = + IsInterruptibleFuture<std::invoke_result_t<Func, Args...>>; + +namespace internal { + +template <typename InterruptCond, typename Func, typename... Args> +auto call_with_interruption_impl( + InterruptCondRef<InterruptCond> interrupt_condition, + Func&& func, Args&&... args) +{ + using futurator_t = seastar::futurize<std::invoke_result_t<Func, Args...>>; + // there might be a case like this: + // with_interruption([] { + // interruptor::do_for_each([] { + // ... + // return interruptible_errorated_future(); + // }).safe_then_interruptible([] { + // ... + // }); + // }) + // In this case, as crimson::do_for_each would directly do futurize_invoke + // for "call_with_interruption", we have to make sure this invocation would + // not errorly release ::crimson::interruptible::interrupt_cond<InterruptCond> + + // If there exists an interrupt condition, which means "Func" may not be + // permitted to run as a result of the interruption, test it. If it does + // need to be interrupted, return an interruption; otherwise, restore the + // global "interrupt_cond" with the interruption condition, and go ahead + // executing the Func. + assert(interrupt_condition); + auto fut = interrupt_condition->template may_interrupt< + typename futurator_t::type>(); + INTR_FUT_DEBUG( + "call_with_interruption_impl: may_interrupt: {}, " + "local interrupt_condition: {}, " + "global interrupt_cond: {},{}", + (bool)fut, + (void*)interrupt_condition.get(), + (void*)interrupt_cond<InterruptCond>.interrupt_cond.get(), + typeid(InterruptCond).name()); + if (fut) { + return std::move(*fut); + } + interrupt_cond<InterruptCond>.set(interrupt_condition); + + auto fut2 = seastar::futurize_invoke( + std::forward<Func>(func), + std::forward<Args>(args)...); + // Clear the global "interrupt_cond" to prevent it from interfering other + // continuation chains. + interrupt_cond<InterruptCond>.reset(); + return fut2; +} + +} + +template <typename InterruptCond, typename Func, seastar::Future Ret> +requires (!InterruptCond::template is_interruption_v<Ret>) +auto call_with_interruption( + InterruptCondRef<InterruptCond> interrupt_condition, + Func&& func, Ret&& fut) +{ + using Result = std::invoke_result_t<Func, Ret>; + // if "T" is already an interrupt exception, return it directly; + // otherwise, upper layer application may encounter errors executing + // the "Func" body. 
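+  // Here "already an interrupt exception" means the passed-in future has
+  // failed with an exception that interrupt_condition recognizes as an
+  // interruption: such a failure is propagated as-is instead of being
+  // handed to func under the restored interrupt condition.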
+ if (fut.failed()) { + std::exception_ptr eptr = fut.get_exception(); + if (interrupt_condition->is_interruption(eptr)) { + return seastar::futurize<Result>::make_exception_future(std::move(eptr)); + } + return internal::call_with_interruption_impl( + interrupt_condition, + std::forward<Func>(func), + seastar::futurize<Ret>::make_exception_future( + std::move(eptr))); + } + return internal::call_with_interruption_impl( + interrupt_condition, + std::forward<Func>(func), + std::move(fut)); +} + +template <typename InterruptCond, typename Func, typename T> +requires (InterruptCond::template is_interruption_v<T>) +auto call_with_interruption( + InterruptCondRef<InterruptCond> interrupt_condition, + Func&& func, T&& arg) +{ + using Result = std::invoke_result_t<Func, T>; + // if "T" is already an interrupt exception, return it directly; + // otherwise, upper layer application may encounter errors executing + // the "Func" body. + return seastar::futurize<Result>::make_exception_future( + std::get<0>(std::tuple(std::forward<T>(arg)))); +} + +template <typename InterruptCond, typename Func, typename T> +requires (!InterruptCond::template is_interruption_v<T>) && (!seastar::Future<T>) +auto call_with_interruption( + InterruptCondRef<InterruptCond> interrupt_condition, + Func&& func, T&& arg) +{ + return internal::call_with_interruption_impl( + interrupt_condition, + std::forward<Func>(func), + std::forward<T>(arg)); +} + +template <typename InterruptCond, typename Func, + typename Result = std::invoke_result_t<Func>> +auto call_with_interruption( + InterruptCondRef<InterruptCond> interrupt_condition, + Func&& func) +{ + return internal::call_with_interruption_impl( + interrupt_condition, + std::forward<Func>(func)); +} + +template <typename InterruptCond, typename Func, typename... T, + typename Result = std::invoke_result_t<Func, T...>> +Result non_futurized_call_with_interruption( + InterruptCondRef<InterruptCond> interrupt_condition, + Func&& func, T&&... args) +{ + assert(interrupt_condition); + auto fut = interrupt_condition->template may_interrupt<seastar::future<>>(); + INTR_FUT_DEBUG( + "non_futurized_call_with_interruption may_interrupt: {}, " + "interrupt_condition: {}, interrupt_cond: {},{}", + (bool)fut, + (void*)interrupt_condition.get(), + (void*)interrupt_cond<InterruptCond>.interrupt_cond.get(), + typeid(InterruptCond).name()); + if (fut) { + std::rethrow_exception(fut->get_exception()); + } + interrupt_cond<InterruptCond>.set(interrupt_condition); + try { + if constexpr (std::is_void_v<Result>) { + std::invoke(std::forward<Func>(func), std::forward<T>(args)...); + + // Clear the global "interrupt_cond" to prevent it from interfering other + // continuation chains. + interrupt_cond<InterruptCond>.reset(); + return; + } else { + auto&& err = std::invoke(std::forward<Func>(func), std::forward<T>(args)...); + interrupt_cond<InterruptCond>.reset(); + return std::forward<Result>(err); + } + } catch (std::exception& e) { + // Clear the global "interrupt_cond" to prevent it from interfering other + // continuation chains. 
+ interrupt_cond<InterruptCond>.reset(); + throw e; + } +} + +template <typename InterruptCond, typename Errorator> +struct interruptible_errorator; + +template <typename T> +struct parallel_for_each_ret { + static_assert(seastar::Future<T>); + using type = seastar::future<>; +}; + +template <template <typename...> typename ErroratedFuture, typename T> +struct parallel_for_each_ret< + ErroratedFuture< + ::crimson::errorated_future_marker<T>>> { + using type = ErroratedFuture<::crimson::errorated_future_marker<void>>; +}; + +template <typename InterruptCond, typename FutureType> +class parallel_for_each_state final : private seastar::continuation_base<> { + using elem_ret_t = std::conditional_t< + IsInterruptibleFuture<FutureType>, + typename FutureType::core_type, + FutureType>; + using future_t = interruptible_future_detail< + InterruptCond, + typename parallel_for_each_ret<elem_ret_t>::type>; + std::vector<future_t> _incomplete; + seastar::promise<> _result; + std::exception_ptr _ex; +private: + void wait_for_one() noexcept { + while (!_incomplete.empty() && _incomplete.back().available()) { + if (_incomplete.back().failed()) { + _ex = _incomplete.back().get_exception(); + } + _incomplete.pop_back(); + } + if (!_incomplete.empty()) { + seastar::internal::set_callback(std::move(_incomplete.back()), + static_cast<continuation_base<>*>(this)); + _incomplete.pop_back(); + return; + } + if (__builtin_expect(bool(_ex), false)) { + _result.set_exception(std::move(_ex)); + } else { + _result.set_value(); + } + delete this; + } + virtual void run_and_dispose() noexcept override { + if (_state.failed()) { + _ex = std::move(_state).get_exception(); + } + _state = {}; + wait_for_one(); + } + task* waiting_task() noexcept override { return _result.waiting_task(); } +public: + parallel_for_each_state(size_t n) { + _incomplete.reserve(n); + } + void add_future(future_t&& f) { + _incomplete.push_back(std::move(f)); + } + future_t get_future() { + auto ret = _result.get_future(); + wait_for_one(); + return ret; + } + static future_t now() { + return seastar::now(); + } +}; + +template <typename InterruptCond, typename T> +class [[nodiscard]] interruptible_future_detail<InterruptCond, seastar::future<T>> + : private seastar::future<T> { +public: + using core_type = seastar::future<T>; + template <typename U> + using interrupt_futurize_t = + typename interruptor<InterruptCond>::template futurize_t<U>; + using core_type::get0; + using core_type::core_type; + using core_type::get_exception; + using core_type::ignore_ready_future; + + [[gnu::always_inline]] + interruptible_future_detail(seastar::future<T>&& base) + : core_type(std::move(base)) + {} + + using value_type = typename seastar::future<T>::value_type; + using tuple_type = typename seastar::future<T>::tuple_type; + + [[gnu::always_inline]] + value_type&& get() { + if (core_type::available()) { + return core_type::get(); + } else { + // destined to wait! 
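+      // While this call blocks waiting for the value, other, unrelated
+      // continuations may get to run on this reactor.  Stash the
+      // thread-local interrupt_cond and clear it for the duration of the
+      // wait so those continuations don't pick up this chain's
+      // interruption condition, then restore it once the value is ready.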
+ auto interruption_condition = interrupt_cond<InterruptCond>.interrupt_cond; + INTR_FUT_DEBUG( + "interruptible_future_detail::get() waiting, interrupt_cond: {},{}", + (void*)interrupt_cond<InterruptCond>.interrupt_cond.get(), + typeid(InterruptCond).name()); + interrupt_cond<InterruptCond>.reset(); + auto&& value = core_type::get(); + interrupt_cond<InterruptCond>.set(interruption_condition); + INTR_FUT_DEBUG( + "interruptible_future_detail::get() got, interrupt_cond: {},{}", + (void*)interrupt_cond<InterruptCond>.interrupt_cond.get(), + typeid(InterruptCond).name()); + return std::move(value); + } + } + + using core_type::available; + using core_type::failed; + + template <typename Func, + typename Result = interrupt_futurize_t< + std::invoke_result_t<Func, seastar::future<T>>>> + [[gnu::always_inline]] + Result then_wrapped_interruptible(Func&& func) { + ceph_assert(interrupt_cond<InterruptCond>.interrupt_cond); + return core_type::then_wrapped( + [func=std::move(func), + interrupt_condition=interrupt_cond<InterruptCond>.interrupt_cond] + (auto&& fut) mutable { + return call_with_interruption( + std::move(interrupt_condition), + std::forward<Func>(func), + std::move(fut)); + }); + } + + template <typename Func> + [[gnu::always_inline]] + auto then_interruptible(Func&& func) { + ceph_assert(interrupt_cond<InterruptCond>.interrupt_cond); + if constexpr (std::is_void_v<T>) { + auto fut = core_type::then( + [func=std::move(func), + interrupt_condition=interrupt_cond<InterruptCond>.interrupt_cond] + () mutable { + return call_with_interruption( + interrupt_condition, + std::move(func)); + }); + return (interrupt_futurize_t<decltype(fut)>)(std::move(fut)); + } else { + auto fut = core_type::then( + [func=std::move(func), + interrupt_condition=interrupt_cond<InterruptCond>.interrupt_cond] + (T&& arg) mutable { + return call_with_interruption( + interrupt_condition, + std::move(func), + std::forward<T>(arg)); + }); + return (interrupt_futurize_t<decltype(fut)>)(std::move(fut)); + } + } + + template <typename Func> + [[gnu::always_inline]] + auto then_unpack_interruptible(Func&& func) { + return then_interruptible([func=std::forward<Func>(func)](T&& tuple) mutable { + return std::apply(std::forward<Func>(func), std::move(tuple)); + }); + } + + template <typename Func, + typename Result =interrupt_futurize_t< + std::result_of_t<Func(std::exception_ptr)>>> + [[gnu::always_inline]] + Result handle_exception_interruptible(Func&& func) { + ceph_assert(interrupt_cond<InterruptCond>.interrupt_cond); + return core_type::then_wrapped( + [func=std::forward<Func>(func), + interrupt_condition=interrupt_cond<InterruptCond>.interrupt_cond] + (auto&& fut) mutable { + if (!fut.failed()) { + return seastar::make_ready_future<T>(fut.get()); + } else { + return call_with_interruption( + interrupt_condition, + std::move(func), + fut.get_exception()); + } + }); + } + + template <bool may_interrupt = true, typename Func, + typename Result = interrupt_futurize_t< + std::result_of_t<Func()>>> + [[gnu::always_inline]] + Result finally_interruptible(Func&& func) { + if constexpr (may_interrupt) { + ceph_assert(interrupt_cond<InterruptCond>.interrupt_cond); + return core_type::then_wrapped( + [func=std::forward<Func>(func), + interrupt_condition=interrupt_cond<InterruptCond>.interrupt_cond] + (auto&& fut) mutable { + return call_with_interruption( + interrupt_condition, + std::move(func)); + }); + } else { + return core_type::finally(std::forward<Func>(func)); + } + } + + template <typename Func, + typename Result = 
interrupt_futurize_t< + std::result_of_t<Func( + typename seastar::function_traits<Func>::template arg<0>::type)>>> + [[gnu::always_inline]] + Result handle_exception_type_interruptible(Func&& func) { + ceph_assert(interrupt_cond<InterruptCond>.interrupt_cond); + using trait = seastar::function_traits<Func>; + static_assert(trait::arity == 1, "func can take only one parameter"); + using ex_type = typename trait::template arg<0>::type; + return core_type::then_wrapped( + [func=std::forward<Func>(func), + interrupt_condition=interrupt_cond<InterruptCond>.interrupt_cond] + (auto&& fut) mutable -> Result { + if (!fut.failed()) { + return seastar::make_ready_future<T>(fut.get()); + } else { + try { + std::rethrow_exception(fut.get_exception()); + } catch (ex_type& ex) { + return call_with_interruption( + interrupt_condition, + std::move(func), ex); + } + } + }); + } + + + using my_type = interruptible_future_detail<InterruptCond, seastar::future<T>>; + + template <typename Func> + [[gnu::always_inline]] + my_type finally(Func&& func) { + return core_type::finally(std::forward<Func>(func)); + } +private: + template <typename Func> + [[gnu::always_inline]] + auto handle_interruption(Func&& func) { + return core_type::then_wrapped( + [func=std::move(func)](auto&& fut) mutable { + if (fut.failed()) { + std::exception_ptr ex = fut.get_exception(); + if (InterruptCond::is_interruption(ex)) { + return seastar::futurize_invoke(std::move(func), std::move(ex)); + } else { + return seastar::make_exception_future<T>(std::move(ex)); + } + } else { + return seastar::make_ready_future<T>(fut.get()); + } + }); + } + + seastar::future<T> to_future() { + return static_cast<core_type&&>(std::move(*this)); + } + // this is only supposed to be invoked by seastar functions + template <typename Func, + typename Result = interrupt_futurize_t< + std::result_of_t<Func(seastar::future<T>)>>> + [[gnu::always_inline]] + Result then_wrapped(Func&& func) { + return core_type::then_wrapped( + [func=std::move(func), + interrupt_condition=interrupt_cond<InterruptCond>.interrupt_cond] + (auto&& fut) mutable { + return call_with_interruption( + interrupt_condition, + std::forward<Func>(func), + std::move(fut)); + }); + } + friend interruptor<InterruptCond>; + friend class interruptible_future_builder<InterruptCond>; + template <typename U> + friend struct ::seastar::futurize; + template <typename> + friend class ::seastar::future; + template <typename HeldState, typename Future> + friend class seastar::internal::do_with_state; + template<typename TX, typename F> + friend inline auto ::seastar::internal::do_with_impl(TX&& rvalue, F&& f); + template<typename T1, typename T2, typename T3_or_F, typename... More> + friend inline auto ::seastar::internal::do_with_impl(T1&& rv1, T2&& rv2, T3_or_F&& rv3, More&&... more); + template <typename T1, typename T2, typename... More> + friend auto seastar::internal::do_with_impl(T1&& rv1, T2&& rv2, More&&... 
more); + template <typename, typename> + friend class ::crimson::maybe_handle_error_t; + template <typename> + friend class ::seastar::internal::extract_values_from_futures_vector; + template <typename, typename> + friend class interruptible_future_detail; + template <typename ResolvedVectorTransform, typename Future> + friend inline typename ResolvedVectorTransform::future_type + seastar::internal::complete_when_all( + std::vector<Future>&& futures, + typename std::vector<Future>::iterator pos) noexcept; + template <typename> + friend class ::seastar::internal::when_all_state_component; + template <typename Lock, typename Func> + friend inline auto seastar::with_lock(Lock& lock, Func&& f); + template <typename IC, typename FT> + friend class parallel_for_each_state; +}; + +template <typename InterruptCond, typename Errorator> +struct interruptible_errorator { + using base_ertr = Errorator; + using intr_cond_t = InterruptCond; + + template <typename ValueT = void> + using future = interruptible_future_detail<InterruptCond, + typename Errorator::template future<ValueT>>; + + template <class... NewAllowedErrorsT> + using extend = interruptible_errorator< + InterruptCond, + typename Errorator::template extend<NewAllowedErrorsT...>>; + + template <class Ertr> + using extend_ertr = interruptible_errorator< + InterruptCond, + typename Errorator::template extend_ertr<Ertr>>; + + template <typename ValueT = void, typename... A> + static interruptible_future_detail< + InterruptCond, + typename Errorator::template future<ValueT>> + make_ready_future(A&&... value) { + return interruptible_future_detail< + InterruptCond, typename Errorator::template future<ValueT>>( + Errorator::template make_ready_future<ValueT>( + std::forward<A>(value)...)); + } + static interruptible_future_detail< + InterruptCond, + typename Errorator::template future<>> now() { + return interruptible_future_detail< + InterruptCond, typename Errorator::template future<>>( + Errorator::now()); + } + + using pass_further = typename Errorator::pass_further; +}; + +template <typename InterruptCond, + template <typename...> typename ErroratedFuture, + typename T> +class [[nodiscard]] interruptible_future_detail< + InterruptCond, + ErroratedFuture<::crimson::errorated_future_marker<T>>> + : private ErroratedFuture<::crimson::errorated_future_marker<T>> +{ +public: + using core_type = ErroratedFuture<crimson::errorated_future_marker<T>>; + using errorator_type = typename core_type::errorator_type; + using interrupt_errorator_type = + interruptible_errorator<InterruptCond, errorator_type>; + using interrupt_cond_type = InterruptCond; + + template <typename U> + using interrupt_futurize_t = + typename interruptor<InterruptCond>::template futurize_t<U>; + + using core_type::available; + using core_type::failed; + using core_type::core_type; + using core_type::get_exception; + + using value_type = typename core_type::value_type; + + interruptible_future_detail(seastar::future<T>&& fut) + : core_type(std::move(fut)) + {} + + template <template <typename...> typename ErroratedFuture2, + typename... U> + [[gnu::always_inline]] + interruptible_future_detail( + ErroratedFuture2<::crimson::errorated_future_marker<U...>>&& fut) + : core_type(std::move(fut)) {} + + template <template <typename...> typename ErroratedFuture2, + typename... 
U> + [[gnu::always_inline]] + interruptible_future_detail( + interruptible_future_detail<InterruptCond, + ErroratedFuture2<::crimson::errorated_future_marker<U...>>>&& fut) + : core_type(static_cast<typename std::decay_t<decltype(fut)>::core_type&&>(fut)) { + using src_errorator_t = \ + typename ErroratedFuture2< + ::crimson::errorated_future_marker<U...>>::errorator_type; + static_assert(core_type::errorator_type::template contains_once_v< + src_errorator_t>, + "conversion is only possible from less-or-eq errorated future!"); + } + + [[gnu::always_inline]] + interruptible_future_detail( + interruptible_future_detail<InterruptCond, seastar::future<T>>&& fut) + : core_type(static_cast<seastar::future<T>&&>(fut)) {} + + template <class... A> + [[gnu::always_inline]] + interruptible_future_detail(ready_future_marker, A&&... a) + : core_type(::seastar::make_ready_future<typename core_type::value_type>( + std::forward<A>(a)...)) { + } + [[gnu::always_inline]] + interruptible_future_detail(exception_future_marker, ::seastar::future_state_base&& state) noexcept + : core_type(::seastar::futurize<core_type>::make_exception_future(std::move(state))) { + } + [[gnu::always_inline]] + interruptible_future_detail(exception_future_marker, std::exception_ptr&& ep) noexcept + : core_type(::seastar::futurize<core_type>::make_exception_future(std::move(ep))) { + } + + template<bool interruptible = true, typename ValueInterruptCondT, typename ErrorVisitorT, + std::enable_if_t<!interruptible, int> = 0> + [[gnu::always_inline]] + auto safe_then_interruptible(ValueInterruptCondT&& valfunc, ErrorVisitorT&& errfunc) { + auto fut = core_type::safe_then( + std::forward<ValueInterruptCondT>(valfunc), + std::forward<ErrorVisitorT>(errfunc)); + return (interrupt_futurize_t<decltype(fut)>)(std::move(fut)); + } + + template <typename... Args> + auto si_then(Args&&... 
args) { + return safe_then_interruptible(std::forward<Args>(args)...); + } + + + template<bool interruptible = true, typename ValueInterruptCondT, typename ErrorVisitorT, + typename U = T, std::enable_if_t<!std::is_void_v<U> && interruptible, int> = 0> + [[gnu::always_inline]] + auto safe_then_interruptible(ValueInterruptCondT&& valfunc, ErrorVisitorT&& errfunc) { + ceph_assert(interrupt_cond<InterruptCond>.interrupt_cond); + auto fut = core_type::safe_then( + [func=std::move(valfunc), + interrupt_condition=interrupt_cond<InterruptCond>.interrupt_cond] + (T&& args) mutable { + return call_with_interruption( + interrupt_condition, + std::move(func), + std::forward<T>(args)); + }, [func=std::move(errfunc), + interrupt_condition=interrupt_cond<InterruptCond>.interrupt_cond] + (auto&& err) mutable -> decltype(auto) { + constexpr bool return_void = std::is_void_v< + std::invoke_result_t<ErrorVisitorT, + std::decay_t<decltype(err)>>>; + constexpr bool return_err = ::crimson::is_error_v< + std::decay_t<std::invoke_result_t<ErrorVisitorT, + std::decay_t<decltype(err)>>>>; + if constexpr (return_err || return_void) { + return non_futurized_call_with_interruption( + interrupt_condition, + std::move(func), + std::move(err)); + } else { + return call_with_interruption( + interrupt_condition, + std::move(func), + std::move(err)); + } + }); + return (interrupt_futurize_t<decltype(fut)>)(std::move(fut)); + } + + template<bool interruptible = true, typename ValueInterruptCondT, typename ErrorVisitorT, + typename U = T, std::enable_if_t<std::is_void_v<U> && interruptible, int> = 0> + [[gnu::always_inline]] + auto safe_then_interruptible(ValueInterruptCondT&& valfunc, ErrorVisitorT&& errfunc) { + ceph_assert(interrupt_cond<InterruptCond>.interrupt_cond); + auto fut = core_type::safe_then( + [func=std::move(valfunc), + interrupt_condition=interrupt_cond<InterruptCond>.interrupt_cond] + () mutable { + return call_with_interruption( + interrupt_condition, + std::move(func)); + }, [func=std::move(errfunc), + interrupt_condition=interrupt_cond<InterruptCond>.interrupt_cond] + (auto&& err) mutable -> decltype(auto) { + constexpr bool return_void = std::is_void_v< + std::invoke_result_t<ErrorVisitorT, + std::decay_t<decltype(err)>>>; + constexpr bool return_err = ::crimson::is_error_v< + std::decay_t<std::invoke_result_t<ErrorVisitorT, + std::decay_t<decltype(err)>>>>; + if constexpr (return_err || return_void) { + return non_futurized_call_with_interruption( + interrupt_condition, + std::move(func), + std::move(err)); + } else { + return call_with_interruption( + interrupt_condition, + std::move(func), + std::move(err)); + } + }); + return (interrupt_futurize_t<decltype(fut)>)(std::move(fut)); + } + + template <bool interruptible = true, typename ValueInterruptCondT, + typename U = T, std::enable_if_t<std::is_void_v<T> && interruptible, int> = 0> + [[gnu::always_inline]] + auto safe_then_interruptible(ValueInterruptCondT&& valfunc) { + ceph_assert(interrupt_cond<InterruptCond>.interrupt_cond); + auto fut = core_type::safe_then( + [func=std::move(valfunc), + interrupt_condition=interrupt_cond<InterruptCond>.interrupt_cond] + () mutable { + return call_with_interruption( + interrupt_condition, + std::move(func)); + }); + return (interrupt_futurize_t<decltype(fut)>)(std::move(fut)); + } + + template <typename ValFuncT, typename ErrorFuncT> + [[gnu::always_inline]] + auto safe_then_unpack_interruptible(ValFuncT&& func, ErrorFuncT&& errfunc) { + return safe_then_interruptible([func=std::forward<ValFuncT>(func)](T&& 
tuple) mutable { + return std::apply(std::forward<ValFuncT>(func), std::move(tuple)); + }, std::forward<ErrorFuncT>(errfunc)); + } + + template <typename ValFuncT> + [[gnu::always_inline]] + auto safe_then_unpack_interruptible(ValFuncT&& func) { + return safe_then_interruptible([func=std::forward<ValFuncT>(func)](T&& tuple) mutable { + return std::apply(std::forward<ValFuncT>(func), std::move(tuple)); + }); + } + + template <bool interruptible = true, typename ValueInterruptCondT, + typename U = T, std::enable_if_t<!std::is_void_v<T> && interruptible, int> = 0> + [[gnu::always_inline]] + auto safe_then_interruptible(ValueInterruptCondT&& valfunc) { + ceph_assert(interrupt_cond<InterruptCond>.interrupt_cond); + auto fut = core_type::safe_then( + [func=std::move(valfunc), + interrupt_condition=interrupt_cond<InterruptCond>.interrupt_cond] + (T&& arg) mutable { + return call_with_interruption( + interrupt_condition, + std::move(func), + std::forward<T>(arg)); + }); + return (interrupt_futurize_t<decltype(fut)>)(std::move(fut)); + } + + template <bool interruptible = true, typename ValueInterruptCondT, + std::enable_if_t<!interruptible, int> = 0> + [[gnu::always_inline]] + auto safe_then_interruptible(ValueInterruptCondT&& valfunc) { + auto fut = core_type::safe_then(std::forward<ValueInterruptCondT>(valfunc)); + return (interrupt_futurize_t<decltype(fut)>)(std::move(fut)); + } + + template <typename ValueInterruptCondT, + typename ErrorVisitorHeadT, + typename... ErrorVisitorTailT> + [[gnu::always_inline]] + auto safe_then_interruptible(ValueInterruptCondT&& valfunc, + ErrorVisitorHeadT&& err_func_head, + ErrorVisitorTailT&&... err_func_tail) { + return safe_then_interruptible( + std::forward<ValueInterruptCondT>(valfunc), + ::crimson::composer(std::forward<ErrorVisitorHeadT>(err_func_head), + std::forward<ErrorVisitorTailT>(err_func_tail)...)); + } + + template <typename ValueInterruptCondT, + typename ErrorVisitorHeadT, + typename... ErrorVisitorTailT> + [[gnu::always_inline]] + auto safe_then_interruptible_tuple(ValueInterruptCondT&& valfunc, + ErrorVisitorHeadT&& err_func_head, + ErrorVisitorTailT&&... err_func_tail) { + return safe_then_interruptible( + std::forward<ValueInterruptCondT>(valfunc), + ::crimson::composer(std::forward<ErrorVisitorHeadT>(err_func_head), + std::forward<ErrorVisitorTailT>(err_func_tail)...)); + } + + template <typename ValFuncT, + typename ErrorVisitorHeadT, + typename... ErrorVisitorTailT> + [[gnu::always_inline]] + auto safe_then_unpack_interruptible_tuple( + ValFuncT&& valfunc, + ErrorVisitorHeadT&& err_func_head, + ErrorVisitorTailT&&... 
err_func_tail) { + return safe_then_interruptible_tuple( + [valfunc=std::forward<ValFuncT>(valfunc)](T&& tuple) mutable { + return std::apply(std::forward<ValFuncT>(valfunc), std::move(tuple)); + }, + ::crimson::composer(std::forward<ErrorVisitorHeadT>(err_func_head), + std::forward<ErrorVisitorTailT>(err_func_tail)...)); + } + + template <bool interruptible = true, typename ErrorFunc> + auto handle_error_interruptible(ErrorFunc&& errfunc) { + if constexpr (interruptible) { + ceph_assert(interrupt_cond<InterruptCond>.interrupt_cond); + auto fut = core_type::handle_error( + [errfunc=std::move(errfunc), + interrupt_condition=interrupt_cond<InterruptCond>.interrupt_cond] + (auto&& err) mutable -> decltype(auto) { + constexpr bool return_void = std::is_void_v< + std::invoke_result_t<ErrorFunc, + std::decay_t<decltype(err)>>>; + constexpr bool return_err = ::crimson::is_error_v< + std::decay_t<std::invoke_result_t<ErrorFunc, + std::decay_t<decltype(err)>>>>; + if constexpr (return_err || return_void) { + return non_futurized_call_with_interruption( + interrupt_condition, + std::move(errfunc), + std::move(err)); + } else { + return call_with_interruption( + interrupt_condition, + std::move(errfunc), + std::move(err)); + } + }); + return (interrupt_futurize_t<decltype(fut)>)(std::move(fut)); + } else { + return core_type::handle_error(std::forward<ErrorFunc>(errfunc)); + } + } + + template <typename ErrorFuncHead, + typename... ErrorFuncTail> + auto handle_error_interruptible(ErrorFuncHead&& error_func_head, + ErrorFuncTail&&... error_func_tail) { + ceph_assert(interrupt_cond<InterruptCond>.interrupt_cond); + static_assert(sizeof...(ErrorFuncTail) > 0); + return this->handle_error_interruptible( + ::crimson::composer( + std::forward<ErrorFuncHead>(error_func_head), + std::forward<ErrorFuncTail>(error_func_tail)...)); + } + + template <typename Func> + [[gnu::always_inline]] + auto finally(Func&& func) { + auto fut = core_type::finally(std::forward<Func>(func)); + return (interrupt_futurize_t<decltype(fut)>)(std::move(fut)); + } + +private: + using core_type::_then; + template <typename Func> + [[gnu::always_inline]] + auto handle_interruption(Func&& func) { + // see errorator.h safe_then definition + using func_result_t = + typename std::invoke_result<Func, std::exception_ptr>::type; + using func_ertr_t = + typename core_type::template get_errorator_t<func_result_t>; + using this_ertr_t = typename core_type::errorator_type; + using ret_ertr_t = typename this_ertr_t::template extend_ertr<func_ertr_t>; + using futurator_t = typename ret_ertr_t::template futurize<func_result_t>; + return core_type::then_wrapped( + [func=std::move(func), + interrupt_condition=interrupt_cond<InterruptCond>.interrupt_cond] + (auto&& fut) mutable + -> typename futurator_t::type { + if (fut.failed()) { + std::exception_ptr ex = fut.get_exception(); + if (InterruptCond::is_interruption(ex)) { + return futurator_t::invoke(std::move(func), std::move(ex)); + } else { + return futurator_t::make_exception_future(std::move(ex)); + } + } else { + return std::move(fut); + } + }); + } + + ErroratedFuture<::crimson::errorated_future_marker<T>> + to_future() { + return static_cast<core_type&&>(std::move(*this)); + } + + friend class interruptor<InterruptCond>; + friend class interruptible_future_builder<InterruptCond>; + template <typename U> + friend struct ::seastar::futurize; + template <typename> + friend class ::seastar::future; + template<typename TX, typename F> + friend inline auto ::seastar::internal::do_with_impl(TX&& 
rvalue, F&& f); + template<typename T1, typename T2, typename T3_or_F, typename... More> + friend inline auto ::seastar::internal::do_with_impl(T1&& rv1, T2&& rv2, T3_or_F&& rv3, More&&... more); + template <typename T1, typename T2, typename... More> + friend auto seastar::internal::do_with_impl(T1&& rv1, T2&& rv2, More&&... more); + template <typename HeldState, typename Future> + friend class seastar::internal::do_with_state; + template <typename, typename> + friend class ::crimson::maybe_handle_error_t; + template <typename, typename> + friend class interruptible_future_detail; + template <typename Lock, typename Func> + friend inline auto seastar::with_lock(Lock& lock, Func&& f); + template <typename IC, typename FT> + friend class parallel_for_each_state; +}; + +template <typename InterruptCond, typename T = void> +using interruptible_future = + interruptible_future_detail<InterruptCond, seastar::future<T>>; + +template <typename InterruptCond, typename Errorator, typename T = void> +using interruptible_errorated_future = + interruptible_future_detail< + InterruptCond, + typename Errorator::template future<T>>; + +template <typename InterruptCond> +struct interruptor +{ +public: + using condition = InterruptCond; + + template <typename FutureType> + [[gnu::always_inline]] + static interruptible_future_detail<InterruptCond, FutureType> + make_interruptible(FutureType&& fut) { + return interruptible_future_detail<InterruptCond, FutureType>(std::move(fut)); + } + + [[gnu::always_inline]] + static interruptible_future_detail<InterruptCond, seastar::future<>> now() { + return interruptible_future_detail< + InterruptCond, + seastar::future<>>(seastar::now()); + } + + template <typename ValueT = void, typename... A> + [[gnu::always_inline]] + static interruptible_future_detail<InterruptCond, seastar::future<ValueT>> + make_ready_future(A&&... value) { + return interruptible_future_detail<InterruptCond, seastar::future<ValueT>>( + seastar::make_ready_future<ValueT>(std::forward<A>(value)...)); + } + + template <typename T> + struct futurize { + using type = interruptible_future_detail< + InterruptCond, typename seastar::futurize<T>::type>; + }; + + template <typename FutureType> + struct futurize<interruptible_future_detail<InterruptCond, FutureType>> { + using type = interruptible_future_detail<InterruptCond, FutureType>; + }; + + template <typename T> + using futurize_t = typename futurize<T>::type; + + template <typename Container, typename AsyncAction> + [[gnu::always_inline]] + static auto do_for_each(Container& c, AsyncAction&& action) { + return do_for_each(std::begin(c), std::end(c), + std::forward<AsyncAction>(action)); + } + + template <typename OpFunc, typename OnInterrupt, + typename... Params> + static inline auto with_interruption_cond( + OpFunc&& opfunc, OnInterrupt&& efunc, InterruptCond &&cond, Params&&... params) { + INTR_FUT_DEBUG( + "with_interruption_cond: interrupt_cond: {}", + (void*)interrupt_cond<InterruptCond>.interrupt_cond.get()); + return internal::call_with_interruption_impl( + seastar::make_lw_shared<InterruptCond>(std::move(cond)), + std::forward<OpFunc>(opfunc), + std::forward<Params>(params)... + ).template handle_interruption(std::move(efunc)); + } + + template <typename OpFunc, typename OnInterrupt, + typename... InterruptCondParams> + static inline auto with_interruption( + OpFunc&& opfunc, OnInterrupt&& efunc, InterruptCondParams&&... 
params) { + return with_interruption_cond( + std::forward<OpFunc>(opfunc), + std::forward<OnInterrupt>(efunc), + InterruptCond(std::forward<InterruptCondParams>(params)...)); + } + + template <typename Error, + typename Func, + typename... Params> + static inline auto with_interruption_to_error( + Func &&f, InterruptCond &&cond, Params&&... params) { + using func_result_t = std::invoke_result_t<Func, Params...>; + using func_ertr_t = + typename seastar::template futurize< + func_result_t>::core_type::errorator_type; + using with_trans_ertr = + typename func_ertr_t::template extend_ertr<errorator<Error>>; + + using value_type = typename func_result_t::value_type; + using ftype = typename std::conditional_t< + std::is_same_v<value_type, seastar::internal::monostate>, + typename with_trans_ertr::template future<>, + typename with_trans_ertr::template future<value_type>>; + + return with_interruption_cond( + std::forward<Func>(f), + [](auto e) -> ftype { + return Error::make(); + }, + std::forward<InterruptCond>(cond), + std::forward<Params>(params)...); + } + + template <typename Func> + [[gnu::always_inline]] + static auto wrap_function(Func&& func) { + return [func=std::forward<Func>(func), + interrupt_condition=interrupt_cond<InterruptCond>.interrupt_cond]() mutable { + return call_with_interruption( + interrupt_condition, + std::forward<Func>(func)); + }; + } + + template <typename Iterator, + InvokeReturnsInterruptibleFuture<typename Iterator::reference> AsyncAction> + [[gnu::always_inline]] + static auto do_for_each(Iterator begin, Iterator end, AsyncAction&& action) { + using Result = std::invoke_result_t<AsyncAction, typename Iterator::reference>; + if constexpr (seastar::Future<typename Result::core_type>) { + return make_interruptible( + ::seastar::do_for_each(begin, end, + [action=std::move(action), + interrupt_condition=interrupt_cond<InterruptCond>.interrupt_cond] + (typename Iterator::reference x) mutable { + return call_with_interruption( + interrupt_condition, + std::move(action), + std::forward<decltype(*begin)>(x)).to_future(); + }) + ); + } else { + return make_interruptible( + ::crimson::do_for_each(begin, end, + [action=std::move(action), + interrupt_condition=interrupt_cond<InterruptCond>.interrupt_cond] + (typename Iterator::reference x) mutable { + return call_with_interruption( + interrupt_condition, + std::move(action), + std::forward<decltype(*begin)>(x)).to_future(); + }) + ); + } + } + + template <typename Iterator, typename AsyncAction> + requires (!InvokeReturnsInterruptibleFuture<AsyncAction, typename Iterator::reference>) + [[gnu::always_inline]] + static auto do_for_each(Iterator begin, Iterator end, AsyncAction&& action) { + if constexpr (seastar::InvokeReturnsAnyFuture<AsyncAction, typename Iterator::reference>) { + return make_interruptible( + ::seastar::do_for_each(begin, end, + [action=std::move(action), + interrupt_condition=interrupt_cond<InterruptCond>.interrupt_cond] + (typename Iterator::reference x) mutable { + return call_with_interruption( + interrupt_condition, + std::move(action), + std::forward<decltype(*begin)>(x)); + }) + ); + } else { + return make_interruptible( + ::crimson::do_for_each(begin, end, + [action=std::move(action), + interrupt_condition=interrupt_cond<InterruptCond>.interrupt_cond] + (typename Iterator::reference x) mutable { + return call_with_interruption( + interrupt_condition, + std::move(action), + std::forward<decltype(*begin)>(x)); + }) + ); + } + } + + template <InvokeReturnsInterruptibleFuture AsyncAction> + 
[[gnu::always_inline]] + static auto repeat(AsyncAction&& action) { + using Result = std::invoke_result_t<AsyncAction>; + if constexpr (seastar::Future<typename Result::core_type>) { + return make_interruptible( + ::seastar::repeat( + [action=std::move(action), + interrupt_condition=interrupt_cond<InterruptCond>.interrupt_cond] { + return call_with_interruption( + interrupt_condition, + std::move(action)).to_future(); + }) + ); + } else { + return make_interruptible( + ::crimson::repeat( + [action=std::move(action), + interrupt_condition=interrupt_cond<InterruptCond>.interrupt_cond]() mutable { + return call_with_interruption( + interrupt_condition, + std::move(action)).to_future(); + }) + ); + } + } + template <typename AsyncAction> + requires (!InvokeReturnsInterruptibleFuture<AsyncAction>) + [[gnu::always_inline]] + static auto repeat(AsyncAction&& action) { + if constexpr (seastar::InvokeReturnsAnyFuture<AsyncAction>) { + return make_interruptible( + ::seastar::repeat( + [action=std::move(action), + interrupt_condition=interrupt_cond<InterruptCond>.interrupt_cond] { + return call_with_interruption( + interrupt_condition, + std::move(action)); + }) + ); + } else { + return make_interruptible( + ::crimson::repeat( + [action=std::move(action), + interrupt_condition=interrupt_cond<InterruptCond>.interrupt_cond] { + return call_with_interruption( + interrupt_condition, + std::move(action)); + }) + ); + } + } + + template <typename Iterator, typename Func> + static inline auto parallel_for_each( + Iterator begin, + Iterator end, + Func&& func + ) noexcept { + using ResultType = std::invoke_result_t<Func, typename Iterator::reference>; + parallel_for_each_state<InterruptCond, ResultType>* s = nullptr; + auto decorated_func = + [func=std::forward<Func>(func), + interrupt_condition=interrupt_cond<InterruptCond>.interrupt_cond] + (decltype(*Iterator())&& x) mutable { + return call_with_interruption( + interrupt_condition, + std::forward<Func>(func), + std::forward<decltype(*begin)>(x)); + }; + // Process all elements, giving each future the following treatment: + // - available, not failed: do nothing + // - available, failed: collect exception in ex + // - not available: collect in s (allocating it if needed) + while (begin != end) { + auto f = seastar::futurize_invoke(decorated_func, *begin++); + if (!f.available() || f.failed()) { + if (!s) { + using itraits = std::iterator_traits<Iterator>; + auto n = (seastar::internal::iterator_range_estimate_vector_capacity( + begin, end, typename itraits::iterator_category()) + 1); + s = new parallel_for_each_state<InterruptCond, ResultType>(n); + } + s->add_future(std::move(f)); + } + } + // If any futures were not available, hand off to parallel_for_each_state::start(). + // Otherwise we can return a result immediately. 
+ if (s) { + // s->get_future() takes ownership of s (and chains it to one of the futures it contains) + // so this isn't a leak + return s->get_future(); + } + return parallel_for_each_state<InterruptCond, ResultType>::now(); + } + + template <typename Container, typename Func> + static inline auto parallel_for_each(Container& container, Func&& func) noexcept { + return parallel_for_each( + std::begin(container), + std::end(container), + std::forward<Func>(func)); + } + + template <typename Iterator, typename Mapper, typename Initial, typename Reduce> + static inline interruptible_future<InterruptCond, Initial> map_reduce( + Iterator begin, Iterator end, Mapper&& mapper, Initial initial, Reduce&& reduce) { + struct state { + Initial result; + Reduce reduce; + }; + auto s = seastar::make_lw_shared(state{std::move(initial), std::move(reduce)}); + interruptible_future<InterruptCond> ret = seastar::make_ready_future<>(); + while (begin != end) { + ret = seastar::futurize_invoke(mapper, *begin++).then_wrapped_interruptible( + [s = s.get(), ret = std::move(ret)] (auto f) mutable { + try { + s->result = s->reduce(std::move(s->result), std::move(f.get0())); + return std::move(ret); + } catch (...) { + return std::move(ret).then_wrapped_interruptible([ex = std::current_exception()] (auto f) { + f.ignore_ready_future(); + return seastar::make_exception_future<>(ex); + }); + } + }); + } + return ret.then_interruptible([s] { + return seastar::make_ready_future<Initial>(std::move(s->result)); + }); + } + template <typename Range, typename Mapper, typename Initial, typename Reduce> + static inline interruptible_future<InterruptCond, Initial> map_reduce( + Range&& range, Mapper&& mapper, Initial initial, Reduce&& reduce) { + return map_reduce(std::begin(range), std::end(range), std::forward<Mapper>(mapper), + std::move(initial), std::move(reduce)); + } + + template<typename Fut> + requires seastar::Future<Fut> || IsInterruptibleFuture<Fut> + static auto futurize_invoke_if_func(Fut&& fut) noexcept { + return std::forward<Fut>(fut); + } + + template<typename Func> + requires (!seastar::Future<Func>) && (!IsInterruptibleFuture<Func>) + static auto futurize_invoke_if_func(Func&& func) noexcept { + return seastar::futurize_invoke(std::forward<Func>(func)); + } + + template <typename... FutOrFuncs> + static inline auto when_all(FutOrFuncs&&... fut_or_funcs) noexcept { + return ::seastar::internal::when_all_impl( + futurize_invoke_if_func(std::forward<FutOrFuncs>(fut_or_funcs))...); + } + + template <typename... FutOrFuncs> + static inline auto when_all_succeed(FutOrFuncs&&... 
fut_or_funcs) noexcept { + return ::seastar::internal::when_all_succeed_impl( + futurize_invoke_if_func(std::forward<FutOrFuncs>(fut_or_funcs))...); + } + + template <typename Func, + typename Result = futurize_t<std::invoke_result_t<Func>>> + static inline Result async(Func&& func) { + auto interruption_condition = interrupt_cond<InterruptCond>.interrupt_cond; + INTR_FUT_DEBUG( + "interruptible_future_detail::async() yielding out, " + "interrupt_cond {},{} cleared", + (void*)interruption_condition.get(), + typeid(InterruptCond).name()); + interrupt_cond<InterruptCond>.reset(); + auto ret = seastar::async([func=std::forward<Func>(func), + interruption_condition] () mutable { + return non_futurized_call_with_interruption( + interruption_condition, std::forward<Func>(func)); + }); + interrupt_cond<InterruptCond>.set(interruption_condition); + INTR_FUT_DEBUG( + "interruptible_future_detail::async() yield back, interrupt_cond: {},{}", + (void*)interrupt_cond<InterruptCond>.interrupt_cond.get(), + typeid(InterruptCond).name()); + return ret; + } + + template <class FutureT> + static decltype(auto) green_get(FutureT&& fut) { + if (fut.available()) { + return fut.get(); + } else { + // destined to wait! + auto interruption_condition = interrupt_cond<InterruptCond>.interrupt_cond; + INTR_FUT_DEBUG( + "green_get() waiting, interrupt_cond: {},{}", + (void*)interrupt_cond<InterruptCond>.interrupt_cond.get(), + typeid(InterruptCond).name()); + interrupt_cond<InterruptCond>.reset(); + auto&& value = fut.get(); + interrupt_cond<InterruptCond>.set(interruption_condition); + INTR_FUT_DEBUG( + "green_get() got, interrupt_cond: {},{}", + (void*)interrupt_cond<InterruptCond>.interrupt_cond.get(), + typeid(InterruptCond).name()); + return std::move(value); + } + } + + static void yield() { + ceph_assert(interrupt_cond<InterruptCond>.interrupt_cond); + auto interruption_condition = interrupt_cond<InterruptCond>.interrupt_cond; + INTR_FUT_DEBUG( + "interruptible_future_detail::yield() yielding out, " + "interrupt_cond {},{} cleared", + (void*)interruption_condition.get(), + typeid(InterruptCond).name()); + interrupt_cond<InterruptCond>.reset(); + seastar::thread::yield(); + interrupt_cond<InterruptCond>.set(interruption_condition); + INTR_FUT_DEBUG( + "interruptible_future_detail::yield() yield back, interrupt_cond: {},{}", + (void*)interrupt_cond<InterruptCond>.interrupt_cond.get(), + typeid(InterruptCond).name()); + } + + static void maybe_yield() { + ceph_assert(interrupt_cond<InterruptCond>.interrupt_cond); + if (seastar::thread::should_yield()) { + auto interruption_condition = interrupt_cond<InterruptCond>.interrupt_cond; + INTR_FUT_DEBUG( + "interruptible_future_detail::may_yield() yielding out, " + "interrupt_cond {},{} cleared", + (void*)interruption_condition.get(), + typeid(InterruptCond).name()); + interrupt_cond<InterruptCond>.reset(); + seastar::thread::yield(); + interrupt_cond<InterruptCond>.set(interruption_condition); + INTR_FUT_DEBUG( + "interruptible_future_detail::may_yield() yield back, interrupt_cond: {},{}", + (void*)interrupt_cond<InterruptCond>.interrupt_cond.get(), + typeid(InterruptCond).name()); + } + } +}; + +} // namespace crimson::interruptible + +namespace seastar { + +template <typename InterruptCond, typename... 
T> +struct futurize<::crimson::interruptible::interruptible_future_detail< + InterruptCond, seastar::future<T...>>> { + using type = ::crimson::interruptible::interruptible_future_detail< + InterruptCond, seastar::future<T...>>; + + using value_type = typename type::value_type; + using tuple_type = typename type::tuple_type; + + static type from_tuple(tuple_type&& value) { + return type(ready_future_marker(), std::move(value)); + } + static type from_tuple(const tuple_type& value) { + return type(ready_future_marker(), value); + } + static type from_tuple(value_type&& value) { + return type(ready_future_marker(), std::move(value)); + } + static type from_tuple(const value_type& value) { + return type(ready_future_marker(), value); + } + + template <typename Func, typename... FuncArgs> + [[gnu::always_inline]] + static inline type invoke(Func&& func, FuncArgs&&... args) noexcept { + try { + return func(std::forward<FuncArgs>(args)...); + } catch (...) { + return make_exception_future(std::current_exception()); + } + } + + template <typename Func> + [[gnu::always_inline]] + static type invoke(Func&& func, seastar::internal::monostate) noexcept { + try { + return ::seastar::futurize_invoke(std::forward<Func>(func)); + } catch (...) { + return make_exception_future(std::current_exception()); + } + } + + template <typename Arg> + static inline type make_exception_future(Arg&& arg) noexcept { + return seastar::make_exception_future<T...>(std::forward<Arg>(arg)); + } + + static inline type make_exception_future(future_state_base&& state) noexcept { + return seastar::internal::make_exception_future<T...>(std::move(state)); + } + + template<typename PromiseT, typename Func> + static void satisfy_with_result_of(PromiseT&& pr, Func&& func) { + func().forward_to(std::move(pr)); + } +}; + +template <typename InterruptCond, + template <typename...> typename ErroratedFuture, + typename... T> +struct futurize< + ::crimson::interruptible::interruptible_future_detail< + InterruptCond, + ErroratedFuture<::crimson::errorated_future_marker<T...>> + > +> { + using type = ::crimson::interruptible::interruptible_future_detail< + InterruptCond, + ErroratedFuture<::crimson::errorated_future_marker<T...>>>; + using core_type = ErroratedFuture< + ::crimson::errorated_future_marker<T...>>; + using errorator_type = + ::crimson::interruptible::interruptible_errorator< + InterruptCond, + typename ErroratedFuture< + ::crimson::errorated_future_marker<T...>>::errorator_type>; + + template<typename Func, typename... FuncArgs> + static inline type invoke(Func&& func, FuncArgs&&... args) noexcept { + try { + return func(std::forward<FuncArgs>(args)...); + } catch (...) { + return make_exception_future(std::current_exception()); + } + } + + template <typename Func> + [[gnu::always_inline]] + static type invoke(Func&& func, seastar::internal::monostate) noexcept { + try { + return ::seastar::futurize_invoke(std::forward<Func>(func)); + } catch (...) 
{ + return make_exception_future(std::current_exception()); + } + } + + template <typename Arg> + static inline type make_exception_future(Arg&& arg) noexcept { + return core_type::errorator_type::template make_exception_future2<T...>( + std::forward<Arg>(arg)); + } + + template<typename PromiseT, typename Func> + static void satisfy_with_result_of(PromiseT&& pr, Func&& func) { + func().forward_to(std::move(pr)); + } + +}; + +template <typename InterruptCond, typename FutureType> +struct continuation_base_from_future< + ::crimson::interruptible::interruptible_future_detail<InterruptCond, FutureType>> { + using type = typename seastar::continuation_base_from_future<FutureType>::type; +}; + +template <typename InterruptCond, typename FutureType> +struct is_future< + ::crimson::interruptible::interruptible_future_detail< + InterruptCond, + FutureType>> + : std::true_type {}; +} // namespace seastar diff --git a/src/crimson/common/layout.h b/src/crimson/common/layout.h new file mode 100644 index 000000000..9d54ecd1d --- /dev/null +++ b/src/crimson/common/layout.h @@ -0,0 +1,737 @@ +// Copyright 2018 The Abseil Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// MOTIVATION AND TUTORIAL +// +// If you want to put in a single heap allocation N doubles followed by M ints, +// it's easy if N and M are known at compile time. +// +// struct S { +// double a[N]; +// int b[M]; +// }; +// +// S* p = new S; +// +// But what if N and M are known only in run time? Class template Layout to the +// rescue! It's a portable generalization of the technique known as struct hack. +// +// // This object will tell us everything we need to know about the memory +// // layout of double[N] followed by int[M]. It's structurally identical to +// // size_t[2] that stores N and M. It's very cheap to create. +// const Layout<double, int> layout(N, M); +// +// // Allocate enough memory for both arrays. `AllocSize()` tells us how much +// // memory is needed. We are free to use any allocation function we want as +// // long as it returns aligned memory. +// std::unique_ptr<unsigned char[]> p(new unsigned char[layout.AllocSize()]); +// +// // Obtain the pointer to the array of doubles. +// // Equivalent to `reinterpret_cast<double*>(p.get())`. +// // +// // We could have written layout.Pointer<0>(p) instead. If all the types are +// // unique you can use either form, but if some types are repeated you must +// // use the index form. +// double* a = layout.Pointer<double>(p.get()); +// +// // Obtain the pointer to the array of ints. +// // Equivalent to `reinterpret_cast<int*>(p.get() + N * 8)`. +// int* b = layout.Pointer<int>(p); +// +// If we are unable to specify sizes of all fields, we can pass as many sizes as +// we can to `Partial()`. In return, it'll allow us to access the fields whose +// locations and sizes can be computed from the provided information. +// `Partial()` comes in handy when the array sizes are embedded into the +// allocation. 
+// +// // size_t[1] containing N, size_t[1] containing M, double[N], int[M]. +// using L = Layout<size_t, size_t, double, int>; +// +// unsigned char* Allocate(size_t n, size_t m) { +// const L layout(1, 1, n, m); +// unsigned char* p = new unsigned char[layout.AllocSize()]; +// *layout.Pointer<0>(p) = n; +// *layout.Pointer<1>(p) = m; +// return p; +// } +// +// void Use(unsigned char* p) { +// // First, extract N and M. +// // Specify that the first array has only one element. Using `prefix` we +// // can access the first two arrays but not more. +// constexpr auto prefix = L::Partial(1); +// size_t n = *prefix.Pointer<0>(p); +// size_t m = *prefix.Pointer<1>(p); +// +// // Now we can get pointers to the payload. +// const L layout(1, 1, n, m); +// double* a = layout.Pointer<double>(p); +// int* b = layout.Pointer<int>(p); +// } +// +// The layout we used above combines fixed-size with dynamically-sized fields. +// This is quite common. Layout is optimized for this use case and generates +// optimal code. All computations that can be performed at compile time are +// indeed performed at compile time. +// +// Efficiency tip: The order of fields matters. In `Layout<T1, ..., TN>` try to +// ensure that `alignof(T1) >= ... >= alignof(TN)`. This way you'll have no +// padding in between arrays. +// +// You can manually override the alignment of an array by wrapping the type in +// `Aligned<T, N>`. `Layout<..., Aligned<T, N>, ...>` has exactly the same API +// and behavior as `Layout<..., T, ...>` except that the first element of the +// array of `T` is aligned to `N` (the rest of the elements follow without +// padding). `N` cannot be less than `alignof(T)`. +// +// `AllocSize()` and `Pointer()` are the most basic methods for dealing with +// memory layouts. Check out the reference or code below to discover more. +// +// EXAMPLE +// +// // Immutable move-only string with sizeof equal to sizeof(void*). The +// // string size and the characters are kept in the same heap allocation. +// class CompactString { +// public: +// CompactString(const char* s = "") { +// const size_t size = strlen(s); +// // size_t[1] followed by char[size + 1]. +// const L layout(1, size + 1); +// p_.reset(new unsigned char[layout.AllocSize()]); +// // If running under ASAN, mark the padding bytes, if any, to catch +// // memory errors. +// layout.PoisonPadding(p_.get()); +// // Store the size in the allocation. +// *layout.Pointer<size_t>(p_.get()) = size; +// // Store the characters in the allocation. +// memcpy(layout.Pointer<char>(p_.get()), s, size + 1); +// } +// +// size_t size() const { +// // Equivalent to reinterpret_cast<size_t&>(*p). +// return *L::Partial().Pointer<size_t>(p_.get()); +// } +// +// const char* c_str() const { +// // Equivalent to reinterpret_cast<char*>(p.get() + sizeof(size_t)). +// // The argument in Partial(1) specifies that we have size_t[1] in front +// // of the characters. +// return L::Partial(1).Pointer<char>(p_.get()); +// } +// +// private: +// // Our heap allocation contains a size_t followed by an array of chars. +// using L = Layout<size_t, char>; +// std::unique_ptr<unsigned char[]> p_; +// }; +// +// int main() { +// CompactString s = "hello"; +// assert(s.size() == 5); +// assert(strcmp(s.c_str(), "hello") == 0); +// } +// +// DOCUMENTATION +// +// The interface exported by this file consists of: +// - class `Layout<>` and its public members. +// - The public members of class `internal_layout::LayoutImpl<>`. 
That class +// isn't intended to be used directly, and its name and template parameter +// list are internal implementation details, but the class itself provides +// most of the functionality in this file. See comments on its members for +// detailed documentation. +// +// `Layout<T1,... Tn>::Partial(count1,..., countm)` (where `m` <= `n`) returns a +// `LayoutImpl<>` object. `Layout<T1,..., Tn> layout(count1,..., countn)` +// creates a `Layout` object, which exposes the same functionality by inheriting +// from `LayoutImpl<>`. + +#ifndef ABSL_CONTAINER_INTERNAL_LAYOUT_H_ +#define ABSL_CONTAINER_INTERNAL_LAYOUT_H_ + +#include <assert.h> +#include <stddef.h> +#include <stdint.h> +#include <ostream> +#include <string> +#include <tuple> +#include <type_traits> +#include <typeinfo> +#include <utility> + +#ifdef ADDRESS_SANITIZER +#include <sanitizer/asan_interface.h> +#endif + +// for C++20 std::span +#include <boost/beast/core/span.hpp> +#include <fmt/format.h> + +#if defined(__GXX_RTTI) +#define ABSL_INTERNAL_HAS_CXA_DEMANGLE +#endif + +#ifdef ABSL_INTERNAL_HAS_CXA_DEMANGLE +#include <cxxabi.h> +#endif + +namespace absl { +namespace container_internal { + +// A type wrapper that instructs `Layout` to use the specific alignment for the +// array. `Layout<..., Aligned<T, N>, ...>` has exactly the same API +// and behavior as `Layout<..., T, ...>` except that the first element of the +// array of `T` is aligned to `N` (the rest of the elements follow without +// padding). +// +// Requires: `N >= alignof(T)` and `N` is a power of 2. +template <class T, size_t N> +struct Aligned; + +namespace internal_layout { + +template <class T> +struct NotAligned {}; + +template <class T, size_t N> +struct NotAligned<const Aligned<T, N>> { + static_assert(sizeof(T) == 0, "Aligned<T, N> cannot be const-qualified"); +}; + +template <size_t> +using IntToSize = size_t; + +template <class> +using TypeToSize = size_t; + +template <class T> +struct Type : NotAligned<T> { + using type = T; +}; + +template <class T, size_t N> +struct Type<Aligned<T, N>> { + using type = T; +}; + +template <class T> +struct SizeOf : NotAligned<T>, std::integral_constant<size_t, sizeof(T)> {}; + +template <class T, size_t N> +struct SizeOf<Aligned<T, N>> : std::integral_constant<size_t, sizeof(T)> {}; + +// Note: workaround for https://gcc.gnu.org/PR88115 +template <class T> +struct AlignOf : NotAligned<T> { + static constexpr size_t value = alignof(T); +}; + +template <class T, size_t N> +struct AlignOf<Aligned<T, N>> { + static_assert(N % alignof(T) == 0, + "Custom alignment can't be lower than the type's alignment"); + static constexpr size_t value = N; +}; + +// Does `Ts...` contain `T`? +template <class T, class... Ts> +using Contains = std::disjunction<std::is_same<T, Ts>...>; + +template <class From, class To> +using CopyConst = + typename std::conditional_t<std::is_const_v<From>, const To, To>; + +// Note: We're not qualifying this with absl:: because it doesn't compile under +// MSVC. +template <class T> +using SliceType = boost::beast::span<T>; + +// This namespace contains no types. It prevents functions defined in it from +// being found by ADL. +namespace adl_barrier { + +template <class Needle, class... Ts> +constexpr size_t Find(Needle, Needle, Ts...) { + static_assert(!Contains<Needle, Ts...>(), "Duplicate element type"); + return 0; +} + +template <class Needle, class T, class... Ts> +constexpr size_t Find(Needle, T, Ts...) { + return adl_barrier::Find(Needle(), Ts()...) 
+ 1; +} + +constexpr bool IsPow2(size_t n) { return !(n & (n - 1)); } + +// Returns `q * m` for the smallest `q` such that `q * m >= n`. +// Requires: `m` is a power of two. It's enforced by IsLegalElementType below. +constexpr size_t Align(size_t n, size_t m) { return (n + m - 1) & ~(m - 1); } + +constexpr size_t Min(size_t a, size_t b) { return b < a ? b : a; } + +constexpr size_t Max(size_t a) { return a; } + +template <class... Ts> +constexpr size_t Max(size_t a, size_t b, Ts... rest) { + return adl_barrier::Max(b < a ? a : b, rest...); +} + +template <class T> +std::string TypeName() { + std::string out; + int status = 0; + char* demangled = nullptr; +#ifdef ABSL_INTERNAL_HAS_CXA_DEMANGLE + demangled = abi::__cxa_demangle(typeid(T).name(), nullptr, nullptr, &status); +#endif + if (status == 0 && demangled != nullptr) { // Demangling succeeded. + out = fmt::format("<{}>", demangled); + free(demangled); + } else { +#if defined(__GXX_RTTI) || defined(_CPPRTTI) + out = fmt::format("<{}>", typeid(T).name()); +#endif + } + return out; +} + +} // namespace adl_barrier + +template <bool C> +using EnableIf = typename std::enable_if_t<C, int>; + +// Can `T` be a template argument of `Layout`? +template <class T> +using IsLegalElementType = std::integral_constant< + bool, !std::is_reference_v<T> && !std::is_volatile_v<T> && + !std::is_reference_v<typename Type<T>::type> && + !std::is_volatile_v<typename Type<T>::type> && + adl_barrier::IsPow2(AlignOf<T>::value)>; + +template <class Elements, class SizeSeq, class OffsetSeq> +class LayoutImpl; + +// Public base class of `Layout` and the result type of `Layout::Partial()`. +// +// `Elements...` contains all template arguments of `Layout` that created this +// instance. +// +// `SizeSeq...` is `[0, NumSizes)` where `NumSizes` is the number of arguments +// passed to `Layout::Partial()` or `Layout::Layout()`. +// +// `OffsetSeq...` is `[0, NumOffsets)` where `NumOffsets` is +// `Min(sizeof...(Elements), NumSizes + 1)` (the number of arrays for which we +// can compute offsets). +template <class... Elements, size_t... SizeSeq, size_t... OffsetSeq> +class LayoutImpl<std::tuple<Elements...>, std::index_sequence<SizeSeq...>, + std::index_sequence<OffsetSeq...>> { + private: + static_assert(sizeof...(Elements) > 0, "At least one field is required"); + static_assert(std::conjunction_v<IsLegalElementType<Elements>...>, + "Invalid element type (see IsLegalElementType)"); + + enum { + NumTypes = sizeof...(Elements), + NumSizes = sizeof...(SizeSeq), + NumOffsets = sizeof...(OffsetSeq), + }; + + // These are guaranteed by `Layout`. + static_assert(NumOffsets == adl_barrier::Min(NumTypes, NumSizes + 1), + "Internal error"); + static_assert(NumTypes > 0, "Internal error"); + + // Returns the index of `T` in `Elements...`. Results in a compilation error + // if `Elements...` doesn't contain exactly one instance of `T`. + template <class T> + static constexpr size_t ElementIndex() { + static_assert(Contains<Type<T>, Type<typename Type<Elements>::type>...>(), + "Type not found"); + return adl_barrier::Find(Type<T>(), + Type<typename Type<Elements>::type>()...); + } + + template <size_t N> + using ElementAlignment = + AlignOf<typename std::tuple_element<N, std::tuple<Elements...>>::type>; + + public: + // Element types of all arrays packed in a tuple. + using ElementTypes = std::tuple<typename Type<Elements>::type...>; + + // Element type of the Nth array. 
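+  // e.g. `Layout<int, double>::ElementType<0>` is `int` and
+  // `ElementType<1>` is `double`.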
+ template <size_t N> + using ElementType = typename std::tuple_element<N, ElementTypes>::type; + + constexpr explicit LayoutImpl(IntToSize<SizeSeq>... sizes) + : size_{sizes...} {} + + // Alignment of the layout, equal to the strictest alignment of all elements. + // All pointers passed to the methods of layout must be aligned to this value. + static constexpr size_t Alignment() { + return adl_barrier::Max(AlignOf<Elements>::value...); + } + + // Offset in bytes of the Nth array. + // + // // int[3], 4 bytes of padding, double[4]. + // Layout<int, double> x(3, 4); + // assert(x.Offset<0>() == 0); // The ints starts from 0. + // assert(x.Offset<1>() == 16); // The doubles starts from 16. + // + // Requires: `N <= NumSizes && N < sizeof...(Ts)`. + template <size_t N, EnableIf<N == 0> = 0> + constexpr size_t Offset() const { + return 0; + } + + template <size_t N, EnableIf<N != 0> = 0> + constexpr size_t Offset() const { + static_assert(N < NumOffsets, "Index out of bounds"); + return adl_barrier::Align( + Offset<N - 1>() + SizeOf<ElementType<N - 1>>() * size_[N - 1], + ElementAlignment<N>::value); + } + + // Offset in bytes of the array with the specified element type. There must + // be exactly one such array and its zero-based index must be at most + // `NumSizes`. + // + // // int[3], 4 bytes of padding, double[4]. + // Layout<int, double> x(3, 4); + // assert(x.Offset<int>() == 0); // The ints starts from 0. + // assert(x.Offset<double>() == 16); // The doubles starts from 16. + template <class T> + constexpr size_t Offset() const { + return Offset<ElementIndex<T>()>(); + } + + // Offsets in bytes of all arrays for which the offsets are known. + constexpr std::array<size_t, NumOffsets> Offsets() const { + return {{Offset<OffsetSeq>()...}}; + } + + // The number of elements in the Nth array. This is the Nth argument of + // `Layout::Partial()` or `Layout::Layout()` (zero-based). + // + // // int[3], 4 bytes of padding, double[4]. + // Layout<int, double> x(3, 4); + // assert(x.Size<0>() == 3); + // assert(x.Size<1>() == 4); + // + // Requires: `N < NumSizes`. + template <size_t N> + constexpr size_t Size() const { + static_assert(N < NumSizes, "Index out of bounds"); + return size_[N]; + } + + // The number of elements in the array with the specified element type. + // There must be exactly one such array and its zero-based index must be + // at most `NumSizes`. + // + // // int[3], 4 bytes of padding, double[4]. + // Layout<int, double> x(3, 4); + // assert(x.Size<int>() == 3); + // assert(x.Size<double>() == 4); + template <class T> + constexpr size_t Size() const { + return Size<ElementIndex<T>()>(); + } + + // The number of elements of all arrays for which they are known. + constexpr std::array<size_t, NumSizes> Sizes() const { + return {{Size<SizeSeq>()...}}; + } + + // Pointer to the beginning of the Nth array. + // + // `Char` must be `[const] [signed|unsigned] char`. + // + // // int[3], 4 bytes of padding, double[4]. + // Layout<int, double> x(3, 4); + // unsigned char* p = new unsigned char[x.AllocSize()]; + // int* ints = x.Pointer<0>(p); + // double* doubles = x.Pointer<1>(p); + // + // Requires: `N <= NumSizes && N < sizeof...(Ts)`. + // Requires: `p` is aligned to `Alignment()`. 
+ template <size_t N, class Char> + CopyConst<Char, ElementType<N>>* Pointer(Char* p) const { + using C = typename std::remove_const<Char>::type; + static_assert( + std::is_same<C, char>() || std::is_same<C, unsigned char>() || + std::is_same<C, signed char>(), + "The argument must be a pointer to [const] [signed|unsigned] char"); + constexpr size_t alignment = Alignment(); + (void)alignment; + assert(reinterpret_cast<uintptr_t>(p) % alignment == 0); + return reinterpret_cast<CopyConst<Char, ElementType<N>>*>(p + Offset<N>()); + } + + // Pointer to the beginning of the array with the specified element type. + // There must be exactly one such array and its zero-based index must be at + // most `NumSizes`. + // + // `Char` must be `[const] [signed|unsigned] char`. + // + // // int[3], 4 bytes of padding, double[4]. + // Layout<int, double> x(3, 4); + // unsigned char* p = new unsigned char[x.AllocSize()]; + // int* ints = x.Pointer<int>(p); + // double* doubles = x.Pointer<double>(p); + // + // Requires: `p` is aligned to `Alignment()`. + template <class T, class Char> + CopyConst<Char, T>* Pointer(Char* p) const { + return Pointer<ElementIndex<T>()>(p); + } + + // Pointers to all arrays for which pointers are known. + // + // `Char` must be `[const] [signed|unsigned] char`. + // + // // int[3], 4 bytes of padding, double[4]. + // Layout<int, double> x(3, 4); + // unsigned char* p = new unsigned char[x.AllocSize()]; + // + // int* ints; + // double* doubles; + // std::tie(ints, doubles) = x.Pointers(p); + // + // Requires: `p` is aligned to `Alignment()`. + // + // Note: We're not using ElementType alias here because it does not compile + // under MSVC. + template <class Char> + std::tuple<CopyConst< + Char, typename std::tuple_element<OffsetSeq, ElementTypes>::type>*...> + Pointers(Char* p) const { + return std::tuple<CopyConst<Char, ElementType<OffsetSeq>>*...>( + Pointer<OffsetSeq>(p)...); + } + + // The Nth array. + // + // `Char` must be `[const] [signed|unsigned] char`. + // + // // int[3], 4 bytes of padding, double[4]. + // Layout<int, double> x(3, 4); + // unsigned char* p = new unsigned char[x.AllocSize()]; + // Span<int> ints = x.Slice<0>(p); + // Span<double> doubles = x.Slice<1>(p); + // + // Requires: `N < NumSizes`. + // Requires: `p` is aligned to `Alignment()`. + template <size_t N, class Char> + SliceType<CopyConst<Char, ElementType<N>>> Slice(Char* p) const { + return SliceType<CopyConst<Char, ElementType<N>>>(Pointer<N>(p), Size<N>()); + } + + // The array with the specified element type. There must be exactly one + // such array and its zero-based index must be less than `NumSizes`. + // + // `Char` must be `[const] [signed|unsigned] char`. + // + // // int[3], 4 bytes of padding, double[4]. + // Layout<int, double> x(3, 4); + // unsigned char* p = new unsigned char[x.AllocSize()]; + // Span<int> ints = x.Slice<int>(p); + // Span<double> doubles = x.Slice<double>(p); + // + // Requires: `p` is aligned to `Alignment()`. + template <class T, class Char> + SliceType<CopyConst<Char, T>> Slice(Char* p) const { + return Slice<ElementIndex<T>()>(p); + } + + // All arrays with known sizes. + // + // `Char` must be `[const] [signed|unsigned] char`. + // + // // int[3], 4 bytes of padding, double[4]. + // Layout<int, double> x(3, 4); + // unsigned char* p = new unsigned char[x.AllocSize()]; + // + // Span<int> ints; + // Span<double> doubles; + // std::tie(ints, doubles) = x.Slices(p); + // + // Requires: `p` is aligned to `Alignment()`. 
+  //
+  // Note: We're not using ElementType alias here because it does not compile
+  // under MSVC.
+  template <class Char>
+  std::tuple<SliceType<CopyConst<
+      Char, typename std::tuple_element<SizeSeq, ElementTypes>::type>>...>
+  Slices(Char* p) const {
+    // Workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63875 (fixed
+    // in 6.1).
+    (void)p;
+    return std::tuple<SliceType<CopyConst<Char, ElementType<SizeSeq>>>...>(
+        Slice<SizeSeq>(p)...);
+  }
+
+  // The size of the allocation that fits all arrays.
+  //
+  //   // int[3], 4 bytes of padding, double[4].
+  //   Layout<int, double> x(3, 4);
+  //   unsigned char* p = new unsigned char[x.AllocSize()];  // 48 bytes
+  //
+  // Requires: `NumSizes == sizeof...(Ts)`.
+  constexpr size_t AllocSize() const {
+    static_assert(NumTypes == NumSizes, "You must specify sizes of all fields");
+    return Offset<NumTypes - 1>() +
+           SizeOf<ElementType<NumTypes - 1>>() * size_[NumTypes - 1];
+  }
+
+  // If built with --config=asan, poisons padding bytes (if any) in the
+  // allocation. The pointer must point to a memory block at least
+  // `AllocSize()` bytes in length.
+  //
+  // `Char` must be `[const] [signed|unsigned] char`.
+  //
+  // Requires: `p` is aligned to `Alignment()`.
+  template <class Char, size_t N = NumOffsets - 1, EnableIf<N == 0> = 0>
+  void PoisonPadding(const Char* p) const {
+    Pointer<0>(p);  // verify the requirements on `Char` and `p`
+  }
+
+  template <class Char, size_t N = NumOffsets - 1, EnableIf<N != 0> = 0>
+  void PoisonPadding(const Char* p) const {
+    static_assert(N < NumOffsets, "Index out of bounds");
+    (void)p;
+#ifdef ADDRESS_SANITIZER
+    PoisonPadding<Char, N - 1>(p);
+    // The `if` is an optimization. It doesn't affect the observable behaviour.
+    if (ElementAlignment<N - 1>::value % ElementAlignment<N>::value) {
+      size_t start =
+          Offset<N - 1>() + SizeOf<ElementType<N - 1>>() * size_[N - 1];
+      ASAN_POISON_MEMORY_REGION(p + start, Offset<N>() - start);
+    }
+#endif
+  }
+
+  // Human-readable description of the memory layout. Useful for debugging.
+  // Slow.
+  //
+  //   // char[5], 3 bytes of padding, int[3], 4 bytes of padding, followed
+  //   // by an unknown number of doubles.
+  //   auto x = Layout<char, int, double>::Partial(5, 3);
+  //   assert(x.DebugString() ==
+  //          "@0<char>(1)[5]; @8<int>(4)[3]; @24<double>(8)");
+  //
+  // Each field is in the following format: @offset<type>(sizeof)[size] (<type>
+  // may be missing depending on the target platform). For example,
+  // @8<int>(4)[3] means that at offset 8 we have an array of ints, where each
+  // int is 4 bytes, and we have 3 of those ints. The size of the last field may
+  // be missing (as in the example above). Only fields with known offsets are
+  // described. Type names may differ across platforms: one compiler might
+  // produce "unsigned*" where another produces "unsigned int *".
+  std::string DebugString() const {
+    const auto offsets = Offsets();
+    const size_t sizes[] = {SizeOf<ElementType<OffsetSeq>>()...};
+    const std::string types[] = {
+        adl_barrier::TypeName<ElementType<OffsetSeq>>()...};
+    std::string res = fmt::format("@0{}({})", types[0], sizes[0]);
+    for (size_t i = 0; i != NumOffsets - 1; ++i) {
+      res += fmt::format("[{}]; @{}{}({})", size_[i], offsets[i + 1],
+                         types[i + 1], sizes[i + 1]);
+    }
+    // NumSizes is a constant that may be zero. Some compilers cannot see that
+    // inside the if statement "size_[NumSizes - 1]" must be valid.
+ int last = static_cast<int>(NumSizes) - 1; + if (NumTypes == NumSizes && last >= 0) { + res += fmt::format("[{}]", size_[last]); + } + return res; + } + + private: + // Arguments of `Layout::Partial()` or `Layout::Layout()`. + size_t size_[NumSizes > 0 ? NumSizes : 1]; +}; + +template <size_t NumSizes, class... Ts> +using LayoutType = LayoutImpl< + std::tuple<Ts...>, std::make_index_sequence<NumSizes>, + std::make_index_sequence<adl_barrier::Min(sizeof...(Ts), NumSizes + 1)>>; + +} // namespace internal_layout + +// Descriptor of arrays of various types and sizes laid out in memory one after +// another. See the top of the file for documentation. +// +// Check out the public API of internal_layout::LayoutImpl above. The type is +// internal to the library but its methods are public, and they are inherited +// by `Layout`. +template <class... Ts> +class Layout : public internal_layout::LayoutType<sizeof...(Ts), Ts...> { + public: + static_assert(sizeof...(Ts) > 0, "At least one field is required"); + static_assert( + std::conjunction_v<internal_layout::IsLegalElementType<Ts>...>, + "Invalid element type (see IsLegalElementType)"); + + // The result type of `Partial()` with `NumSizes` arguments. + template <size_t NumSizes> + using PartialType = internal_layout::LayoutType<NumSizes, Ts...>; + + // `Layout` knows the element types of the arrays we want to lay out in + // memory but not the number of elements in each array. + // `Partial(size1, ..., sizeN)` allows us to specify the latter. The + // resulting immutable object can be used to obtain pointers to the + // individual arrays. + // + // It's allowed to pass fewer array sizes than the number of arrays. E.g., + // if all you need is to the offset of the second array, you only need to + // pass one argument -- the number of elements in the first array. + // + // // int[3] followed by 4 bytes of padding and an unknown number of + // // doubles. + // auto x = Layout<int, double>::Partial(3); + // // doubles start at byte 16. + // assert(x.Offset<1>() == 16); + // + // If you know the number of elements in all arrays, you can still call + // `Partial()` but it's more convenient to use the constructor of `Layout`. + // + // Layout<int, double> x(3, 5); + // + // Note: The sizes of the arrays must be specified in number of elements, + // not in bytes. + // + // Requires: `sizeof...(Sizes) <= sizeof...(Ts)`. + // Requires: all arguments are convertible to `size_t`. + template <class... Sizes> + static constexpr PartialType<sizeof...(Sizes)> Partial(Sizes&&... sizes) { + static_assert(sizeof...(Sizes) <= sizeof...(Ts)); + return PartialType<sizeof...(Sizes)>(std::forward<Sizes>(sizes)...); + } + + // Creates a layout with the sizes of all arrays specified. If you know + // only the sizes of the first N arrays (where N can be zero), you can use + // `Partial()` defined above. The constructor is essentially equivalent to + // calling `Partial()` and passing in all array sizes; the constructor is + // provided as a convenient abbreviation. + // + // Note: The sizes of the arrays must be specified in number of elements, + // not in bytes. + constexpr explicit Layout(internal_layout::TypeToSize<Ts>... sizes) + : internal_layout::LayoutType<sizeof...(Ts), Ts...>(sizes...) 
{} +}; + +} // namespace container_internal +} // namespace absl + +#endif // ABSL_CONTAINER_INTERNAL_LAYOUT_H_ diff --git a/src/crimson/common/local_shared_foreign_ptr.h b/src/crimson/common/local_shared_foreign_ptr.h new file mode 100644 index 000000000..c4bd1099a --- /dev/null +++ b/src/crimson/common/local_shared_foreign_ptr.h @@ -0,0 +1,245 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <boost/intrusive_ptr.hpp> +#include <boost/smart_ptr/intrusive_ref_counter.hpp> + +#include <seastar/core/smp.hh> +#include <seastar/core/future.hh> +#include <seastar/core/sharded.hh> + +namespace crimson { + +/** + * local_shared_foreign_ptr + * + * See seastar/include/seastar/core/sharded.hh:foreign_ptr + * + * seastar::foreign_ptr wraps a smart ptr by proxying the copy() and destructor + * operations back to the original core. This works well except that copy() + * requires a cross-core call. We need a smart_ptr which allows cross-core + * caching of (for example) OSDMaps, but we want to avoid the overhead inherent + * in incrementing the source smart_ptr on every copy. Thus, + * local_shared_foreign_ptr maintains a core-local foreign_ptr back to the + * original core instance with core-local ref counting. + */ +template <typename PtrType> +class local_shared_foreign_ptr { + using element_type = typename std::pointer_traits<PtrType>::element_type; + using pointer = element_type*; + + seastar::lw_shared_ptr<seastar::foreign_ptr<PtrType>> ptr; + + /// Wraps a pointer object and remembers the current core. + local_shared_foreign_ptr(seastar::foreign_ptr<PtrType> &&fptr) + : ptr(fptr ? seastar::make_lw_shared(std::move(fptr)) : nullptr) { + assert(!ptr || (ptr && *ptr)); + } + + template <typename T> + friend local_shared_foreign_ptr<T> make_local_shared_foreign( + seastar::foreign_ptr<T> &&); + +public: + /// Constructs a null local_shared_foreign_ptr<>. + local_shared_foreign_ptr() = default; + + /// Constructs a null local_shared_foreign_ptr<>. + local_shared_foreign_ptr(std::nullptr_t) : local_shared_foreign_ptr() {} + + /// Moves a local_shared_foreign_ptr<> to another object. + local_shared_foreign_ptr(local_shared_foreign_ptr&& other) = default; + + /// Copies a local_shared_foreign_ptr<> + local_shared_foreign_ptr(const local_shared_foreign_ptr &other) = default; + + /// Releases reference to ptr eventually releasing the contained foreign_ptr + ~local_shared_foreign_ptr() = default; + + /// Creates a copy of this foreign ptr. Only works if the stored ptr is copyable. + seastar::future<seastar::foreign_ptr<PtrType>> get_foreign() const noexcept { + assert(!ptr || (ptr && *ptr)); + return ptr ? ptr->copy() : + seastar::make_ready_future<seastar::foreign_ptr<PtrType>>(nullptr); + } + + /// Accesses the wrapped object. + element_type& operator*() const noexcept { + assert(ptr && *ptr); + return **ptr; + } + /// Accesses the wrapped object. + element_type* operator->() const noexcept { + assert(ptr && *ptr); + return &**ptr; + } + + /// Access the raw pointer to the wrapped object. + pointer get() const noexcept { + assert(!ptr || (ptr && *ptr)); + return ptr ? ptr->get() : nullptr; + } + + /// Return the owner-shard of the contained foreign_ptr. + unsigned get_owner_shard() const noexcept { + assert(!ptr || (ptr && *ptr)); + return ptr ? ptr->get_owner_shard() : seastar::this_shard_id(); + } + + /// Checks whether the wrapped pointer is non-null. 
+ operator bool() const noexcept { + assert(!ptr || (ptr && *ptr)); + return static_cast<bool>(ptr); + } + + /// Move-assigns a \c local_shared_foreign_ptr<>. + local_shared_foreign_ptr& operator=(local_shared_foreign_ptr&& other) noexcept { + ptr = std::move(other.ptr); + return *this; + } + + /// Copy-assigns a \c local_shared_foreign_ptr<>. + local_shared_foreign_ptr& operator=(const local_shared_foreign_ptr& other) noexcept { + ptr = other.ptr; + return *this; + } + + /// Reset the containing ptr + void reset() noexcept { + assert(!ptr || (ptr && *ptr)); + ptr = nullptr; + } +}; + +/// Wraps a smart_ptr T in a local_shared_foreign_ptr<>. +template <typename T> +local_shared_foreign_ptr<T> make_local_shared_foreign( + seastar::foreign_ptr<T> &&ptr) { + return local_shared_foreign_ptr<T>(std::move(ptr)); +} + +/// Wraps ptr in a local_shared_foreign_ptr<>. +template <typename T> +local_shared_foreign_ptr<T> make_local_shared_foreign(T &&ptr) { + return make_local_shared_foreign<T>( + ptr ? seastar::make_foreign(std::forward<T>(ptr)) : nullptr); +} + +template <typename T, typename U> +inline bool operator==(const local_shared_foreign_ptr<T> &x, + const local_shared_foreign_ptr<U> &y) { + return x.get() == y.get(); +} + +template <typename T> +inline bool operator==(const local_shared_foreign_ptr<T> &x, std::nullptr_t) { + return x.get() == nullptr; +} + +template <typename T> +inline bool operator==(std::nullptr_t, const local_shared_foreign_ptr<T>& y) { + return nullptr == y.get(); +} + +template <typename T, typename U> +inline bool operator!=(const local_shared_foreign_ptr<T> &x, + const local_shared_foreign_ptr<U> &y) { + return x.get() != y.get(); +} + +template <typename T> +inline bool operator!=(const local_shared_foreign_ptr<T> &x, std::nullptr_t) { + return x.get() != nullptr; +} + +template <typename T> +inline bool operator!=(std::nullptr_t, const local_shared_foreign_ptr<T>& y) { + return nullptr != y.get(); +} + +template <typename T, typename U> +inline bool operator<(const local_shared_foreign_ptr<T> &x, + const local_shared_foreign_ptr<U> &y) { + return x.get() < y.get(); +} + +template <typename T> +inline bool operator<(const local_shared_foreign_ptr<T> &x, std::nullptr_t) { + return x.get() < nullptr; +} + +template <typename T> +inline bool operator<(std::nullptr_t, const local_shared_foreign_ptr<T>& y) { + return nullptr < y.get(); +} + +template <typename T, typename U> +inline bool operator<=(const local_shared_foreign_ptr<T> &x, + const local_shared_foreign_ptr<U> &y) { + return x.get() <= y.get(); +} + +template <typename T> +inline bool operator<=(const local_shared_foreign_ptr<T> &x, std::nullptr_t) { + return x.get() <= nullptr; +} + +template <typename T> +inline bool operator<=(std::nullptr_t, const local_shared_foreign_ptr<T>& y) { + return nullptr <= y.get(); +} + +template <typename T, typename U> +inline bool operator>(const local_shared_foreign_ptr<T> &x, + const local_shared_foreign_ptr<U> &y) { + return x.get() > y.get(); +} + +template <typename T> +inline bool operator>(const local_shared_foreign_ptr<T> &x, std::nullptr_t) { + return x.get() > nullptr; +} + +template <typename T> +inline bool operator>(std::nullptr_t, const local_shared_foreign_ptr<T>& y) { + return nullptr > y.get(); +} + +template <typename T, typename U> +inline bool operator>=(const local_shared_foreign_ptr<T> &x, + const local_shared_foreign_ptr<U> &y) { + return x.get() >= y.get(); +} + +template <typename T> +inline bool operator>=(const local_shared_foreign_ptr<T> &x, 
std::nullptr_t) { + return x.get() >= nullptr; +} + +template <typename T> +inline bool operator>=(std::nullptr_t, const local_shared_foreign_ptr<T>& y) { + return nullptr >= y.get(); +} + +} + +namespace std { + +template <typename T> +struct hash<crimson::local_shared_foreign_ptr<T>> + : private hash<typename std::pointer_traits<T>::element_type *> { + size_t operator()(const crimson::local_shared_foreign_ptr<T>& p) const { + return hash<typename std::pointer_traits<T>::element_type *>::operator()(p.get()); + } +}; + +} + +namespace seastar { + +template<typename T> +struct is_smart_ptr<crimson::local_shared_foreign_ptr<T>> : std::true_type {}; + +} diff --git a/src/crimson/common/log.cc b/src/crimson/common/log.cc new file mode 100644 index 000000000..cae9f6a7b --- /dev/null +++ b/src/crimson/common/log.cc @@ -0,0 +1,21 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "log.h" + +static std::array<seastar::logger, ceph_subsys_get_num()> loggers{ +#define SUBSYS(name, log_level, gather_level) \ + seastar::logger(#name), +#define DEFAULT_SUBSYS(log_level, gather_level) \ + seastar::logger("none"), + #include "common/subsys.h" +#undef SUBSYS +#undef DEFAULT_SUBSYS +}; + +namespace crimson { +seastar::logger& get_logger(int subsys) { + assert(subsys < ceph_subsys_max); + return loggers[subsys]; +} +} diff --git a/src/crimson/common/log.h b/src/crimson/common/log.h new file mode 100644 index 000000000..27ff550d8 --- /dev/null +++ b/src/crimson/common/log.h @@ -0,0 +1,88 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <fmt/format.h> +#include <seastar/util/log.hh> + +#include "common/subsys_types.h" + +namespace crimson { +seastar::logger& get_logger(int subsys); +static inline seastar::log_level to_log_level(int level) { + if (level < 0) { + return seastar::log_level::error; + } else if (level < 1) { + return seastar::log_level::warn; + } else if (level <= 5) { + return seastar::log_level::info; + } else if (level <= 20) { + return seastar::log_level::debug; + } else { + return seastar::log_level::trace; + } +} +} + +/* Logging convenience macros + * + * The intention here is to standardize prefixing log lines with the function name + * and a context prefix (like the operator<< for the PG). Place + * + * SET_SUBSYS(osd); + * + * at the top of the file to declare the log lines within the file as being (in this case) + * in the osd subsys. At the beginning of each method/function, add + * + * LOG_PREFIX(Class::method_name) + * + * to set the FNAME symbol to Class::method_name. In order to use the log macros + * within lambdas, capture FNAME by value. + * + * Log lines can then be declared using the appropriate macro below. + */ + +#define SET_SUBSYS(subname_) static constexpr auto SOURCE_SUBSYS = ceph_subsys_##subname_ +#define LOCAL_LOGGER crimson::get_logger(SOURCE_SUBSYS) +#define LOGGER(subname_) crimson::get_logger(ceph_subsys_##subname_) +#define LOG_PREFIX(x) constexpr auto FNAME = #x + +#define LOG(level_, MSG, ...) \ + LOCAL_LOGGER.log(level_, "{}: " MSG, FNAME , ##__VA_ARGS__) +#define SUBLOG(subname_, level_, MSG, ...) \ + LOGGER(subname_).log(level_, "{}: " MSG, FNAME , ##__VA_ARGS__) + +#define TRACE(...) LOG(seastar::log_level::trace, __VA_ARGS__) +#define SUBTRACE(subname_, ...) SUBLOG(subname_, seastar::log_level::trace, __VA_ARGS__) + +#define DEBUG(...) LOG(seastar::log_level::debug, __VA_ARGS__) +#define SUBDEBUG(subname_, ...) 
SUBLOG(subname_, seastar::log_level::debug, __VA_ARGS__) + +#define INFO(...) LOG(seastar::log_level::info, __VA_ARGS__) +#define SUBINFO(subname_, ...) SUBLOG(subname_, seastar::log_level::info, __VA_ARGS__) + +#define WARN(...) LOG(seastar::log_level::warn, __VA_ARGS__) +#define SUBWARN(subname_, ...) SUBLOG(subname_, seastar::log_level::warn, __VA_ARGS__) + +#define ERROR(...) LOG(seastar::log_level::error, __VA_ARGS__) +#define SUBERROR(subname_, ...) SUBLOG(subname_, seastar::log_level::error, __VA_ARGS__) + +// *DPP macros are intended to take DoutPrefixProvider implementations, but anything with +// an operator<< will work as a prefix + +#define SUBLOGDPP(subname_, level_, MSG, dpp, ...) \ + LOGGER(subname_).log(level_, "{} {}: " MSG, dpp, FNAME , ##__VA_ARGS__) +#define SUBTRACEDPP(subname_, ...) SUBLOGDPP(subname_, seastar::log_level::trace, __VA_ARGS__) +#define SUBDEBUGDPP(subname_, ...) SUBLOGDPP(subname_, seastar::log_level::debug, __VA_ARGS__) +#define SUBINFODPP(subname_, ...) SUBLOGDPP(subname_, seastar::log_level::info, __VA_ARGS__) +#define SUBWARNDPP(subname_, ...) SUBLOGDPP(subname_, seastar::log_level::warn, __VA_ARGS__) +#define SUBERRORDPP(subname_, ...) SUBLOGDPP(subname_, seastar::log_level::error, __VA_ARGS__) + +#define LOGDPP(level_, MSG, dpp, ...) \ + LOCAL_LOGGER.log(level_, "{} {}: " MSG, dpp, FNAME , ##__VA_ARGS__) +#define TRACEDPP(...) LOGDPP(seastar::log_level::trace, __VA_ARGS__) +#define DEBUGDPP(...) LOGDPP(seastar::log_level::debug, __VA_ARGS__) +#define INFODPP(...) LOGDPP(seastar::log_level::info, __VA_ARGS__) +#define WARNDPP(...) LOGDPP(seastar::log_level::warn, __VA_ARGS__) +#define ERRORDPP(...) LOGDPP(seastar::log_level::error, __VA_ARGS__) diff --git a/src/crimson/common/logclient.cc b/src/crimson/common/logclient.cc new file mode 100644 index 000000000..d402ecd19 --- /dev/null +++ b/src/crimson/common/logclient.cc @@ -0,0 +1,364 @@ +#include "crimson/common/logclient.h" +#include <fmt/ranges.h> +#include "include/str_map.h" +#include "messages/MLog.h" +#include "messages/MLogAck.h" +#include "messages/MMonGetVersion.h" +#include "crimson/net/Messenger.h" +#include "crimson/mon/MonClient.h" +#include "mon/MonMap.h" +#include "common/Graylog.h" + +using std::map; +using std::ostream; +using std::ostringstream; +using std::string; +using crimson::common::local_conf; + +namespace { + seastar::logger& logger() + { + return crimson::get_logger(ceph_subsys_monc); + } +} + +//TODO: in order to avoid unnecessary maps declarations and moving around, +// create a named structure containing the maps and return optional +// fit to it. 
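+//
+// A minimal sketch of what that structure might look like; the name
+// LogClientOptions and the std::optional return are hypothetical, not part
+// of the current code:
+//
+//   struct LogClientOptions {
+//     std::map<std::string, std::string> log_to_monitors;
+//     std::map<std::string, std::string> log_to_syslog;
+//     std::map<std::string, std::string> log_channels;
+//     std::map<std::string, std::string> log_prios;
+//     std::map<std::string, std::string> log_to_graylog;
+//     std::map<std::string, std::string> log_to_graylog_host;
+//     std::map<std::string, std::string> log_to_graylog_port;
+//     uuid_d fsid;
+//     std::string host;
+//   };
+//
+//   std::optional<LogClientOptions> parse_log_client_options(CephContext *cct);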
+int parse_log_client_options(CephContext *cct, + map<string,string> &log_to_monitors, + map<string,string> &log_to_syslog, + map<string,string> &log_channels, + map<string,string> &log_prios, + map<string,string> &log_to_graylog, + map<string,string> &log_to_graylog_host, + map<string,string> &log_to_graylog_port, + uuid_d &fsid, + string &host) +{ + ostringstream oss; + + int r = get_conf_str_map_helper( + cct->_conf.get_val<string>("clog_to_monitors"), oss, + &log_to_monitors, CLOG_CONFIG_DEFAULT_KEY); + if (r < 0) { + logger().error("{} error parsing 'clog_to_monitors'", __func__); + return r; + } + + r = get_conf_str_map_helper( + cct->_conf.get_val<string>("clog_to_syslog"), oss, + &log_to_syslog, CLOG_CONFIG_DEFAULT_KEY); + if (r < 0) { + logger().error("{} error parsing 'clog_to_syslog'", __func__); + return r; + } + + r = get_conf_str_map_helper( + cct->_conf.get_val<string>("clog_to_syslog_facility"), oss, + &log_channels, CLOG_CONFIG_DEFAULT_KEY); + if (r < 0) { + logger().error("{} error parsing 'clog_to_syslog_facility'", __func__); + return r; + } + + r = get_conf_str_map_helper( + cct->_conf.get_val<string>("clog_to_syslog_level"), oss, + &log_prios, CLOG_CONFIG_DEFAULT_KEY); + if (r < 0) { + logger().error("{} error parsing 'clog_to_syslog_level'", __func__); + return r; + } + + r = get_conf_str_map_helper( + cct->_conf.get_val<string>("clog_to_graylog"), oss, + &log_to_graylog, CLOG_CONFIG_DEFAULT_KEY); + if (r < 0) { + logger().error("{} error parsing 'clog_to_graylog'", __func__); + return r; + } + + r = get_conf_str_map_helper( + cct->_conf.get_val<string>("clog_to_graylog_host"), oss, + &log_to_graylog_host, CLOG_CONFIG_DEFAULT_KEY); + if (r < 0) { + logger().error("{} error parsing 'clog_to_graylog_host'", __func__); + return r; + } + + r = get_conf_str_map_helper( + cct->_conf.get_val<string>("clog_to_graylog_port"), oss, + &log_to_graylog_port, CLOG_CONFIG_DEFAULT_KEY); + if (r < 0) { + logger().error("{} error parsing 'clog_to_graylog_port'", __func__); + return r; + } + + fsid = cct->_conf.get_val<uuid_d>("fsid"); + host = cct->_conf->host; + return 0; +} + +LogChannel::LogChannel(LogClient *lc, const string &channel) + : parent(lc), log_channel(channel), log_to_syslog(false), + log_to_monitors(false) +{ +} + +LogChannel::LogChannel(LogClient *lc, const string &channel, + const string &facility, const string &prio) + : parent(lc), log_channel(channel), log_prio(prio), + syslog_facility(facility), log_to_syslog(false), + log_to_monitors(false) +{ +} + +LogClient::LogClient(crimson::net::Messenger *m, + logclient_flag_t flags) + : messenger(m), is_mon(flags & FLAG_MON), + last_log_sent(0), last_log(0) +{ +} + +void LogChannel::set_log_to_monitors(bool v) +{ + if (log_to_monitors != v) { + parent->reset(); + log_to_monitors = v; + } +} + +void LogChannel::update_config(map<string,string> &log_to_monitors, + map<string,string> &log_to_syslog, + map<string,string> &log_channels, + map<string,string> &log_prios, + map<string,string> &log_to_graylog, + map<string,string> &log_to_graylog_host, + map<string,string> &log_to_graylog_port, + uuid_d &fsid, + string &host) +{ + logger().debug( + "{} log_to_monitors {} log_to_syslog {} log_channels {} log_prios {}", + __func__, log_to_monitors, log_to_syslog, log_channels, log_prios); + bool to_monitors = (get_str_map_key(log_to_monitors, log_channel, + &CLOG_CONFIG_DEFAULT_KEY) == "true"); + bool to_syslog = (get_str_map_key(log_to_syslog, log_channel, + &CLOG_CONFIG_DEFAULT_KEY) == "true"); + string syslog_facility = 
get_str_map_key(log_channels, log_channel, + &CLOG_CONFIG_DEFAULT_KEY); + string prio = get_str_map_key(log_prios, log_channel, + &CLOG_CONFIG_DEFAULT_KEY); + bool to_graylog = (get_str_map_key(log_to_graylog, log_channel, + &CLOG_CONFIG_DEFAULT_KEY) == "true"); + string graylog_host = get_str_map_key(log_to_graylog_host, log_channel, + &CLOG_CONFIG_DEFAULT_KEY); + string graylog_port_str = get_str_map_key(log_to_graylog_port, log_channel, + &CLOG_CONFIG_DEFAULT_KEY); + int graylog_port = atoi(graylog_port_str.c_str()); + + set_log_to_monitors(to_monitors); + set_log_to_syslog(to_syslog); + set_syslog_facility(syslog_facility); + set_log_prio(prio); + + if (to_graylog && !graylog) { /* should but isn't */ + graylog = seastar::make_shared<ceph::logging::Graylog>("clog"); + } else if (!to_graylog && graylog) { /* shouldn't but is */ + graylog = nullptr; + } + + if (to_graylog && graylog) { + graylog->set_fsid(fsid); + graylog->set_hostname(host); + } + + if (graylog && (!graylog_host.empty()) && (graylog_port != 0)) { + graylog->set_destination(graylog_host, graylog_port); + } + + logger().debug("{} to_monitors: {} to_syslog: {}" + "syslog_facility: {} prio: {} to_graylog: {} graylog_host: {}" + "graylog_port: {}", __func__, (to_monitors ? "true" : "false"), + (to_syslog ? "true" : "false"), syslog_facility, prio, + (to_graylog ? "true" : "false"), graylog_host, graylog_port); +} + +void LogChannel::do_log(clog_type prio, std::stringstream& ss) +{ + while (!ss.eof()) { + string s; + getline(ss, s); + if (!s.empty()) { + do_log(prio, s); + } + } +} + +void LogChannel::do_log(clog_type prio, const std::string& s) +{ + if (CLOG_ERROR == prio) { + logger().error("log {} : {}", prio, s); + } else { + logger().warn("log {} : {}", prio, s); + } + LogEntry e; + e.stamp = ceph_clock_now(); + e.addrs = parent->get_myaddrs(); + e.name = parent->get_myname(); + e.rank = parent->get_myrank(); + e.prio = prio; + e.msg = s; + e.channel = get_log_channel(); + + // seq and who should be set for syslog/graylog/log_to_mon + // log to monitor? + if (log_to_monitors) { + e.seq = parent->queue(e); + } else { + e.seq = parent->get_next_seq(); + } + + // log to syslog? + if (do_log_to_syslog()) { + logger().warn("{} log to syslog", __func__); + e.log_to_syslog(get_log_prio(), get_syslog_facility()); + } + + // log to graylog? + if (do_log_to_graylog()) { + logger().warn("{} log to graylog", __func__); + graylog->log_log_entry(&e); + } +} + +MessageURef LogClient::get_mon_log_message(log_flushing_t flush_flag) +{ + if (flush_flag == log_flushing_t::FLUSH) { + if (log_queue.empty()) { + return {}; + } + // reset session + last_log_sent = log_queue.front().seq; + } + return _get_mon_log_message(); +} + +bool LogClient::are_pending() const +{ + return last_log > last_log_sent; +} + +MessageURef LogClient::_get_mon_log_message() +{ + if (log_queue.empty()) { + return {}; + } + + // only send entries that haven't been sent yet during this mon + // session! monclient needs to call reset_session() on mon session + // reset for this to work right. 
+ + if (last_log_sent == last_log) { + return {}; + } + + // limit entries per message + const int64_t num_unsent = last_log - last_log_sent; + int64_t num_to_send; + if (local_conf()->mon_client_max_log_entries_per_message > 0) { + num_to_send = std::min(num_unsent, + local_conf()->mon_client_max_log_entries_per_message); + } else { + num_to_send = num_unsent; + } + + logger().debug("log_queue is {} last_log {} sent {} num {} unsent {}" + " sending {}", log_queue.size(), last_log, + last_log_sent, log_queue.size(), num_unsent, num_to_send); + ceph_assert((unsigned)num_unsent <= log_queue.size()); + auto log_iter = log_queue.begin(); + std::deque<LogEntry> out_log_queue; /* will send the logs contained here */ + while (log_iter->seq <= last_log_sent) { + ++log_iter; + ceph_assert(log_iter != log_queue.end()); + } + while (num_to_send--) { + ceph_assert(log_iter != log_queue.end()); + out_log_queue.push_back(*log_iter); + last_log_sent = log_iter->seq; + logger().debug(" will send {}", *log_iter); + ++log_iter; + } + + return crimson::make_message<MLog>(m_fsid, + std::move(out_log_queue)); +} + +version_t LogClient::queue(LogEntry &entry) +{ + entry.seq = ++last_log; + log_queue.push_back(entry); + + return entry.seq; +} + +void LogClient::reset() +{ + if (log_queue.size()) { + log_queue.clear(); + } + last_log_sent = last_log; +} + +uint64_t LogClient::get_next_seq() +{ + return ++last_log; +} + +entity_addrvec_t LogClient::get_myaddrs() const +{ + return messenger->get_myaddrs(); +} + +entity_name_t LogClient::get_myrank() +{ + return messenger->get_myname(); +} + +const EntityName& LogClient::get_myname() const +{ + return local_conf()->name; +} + +seastar::future<> LogClient::handle_log_ack(Ref<MLogAck> m) +{ + logger().debug("handle_log_ack {}", *m); + + version_t last = m->last; + + auto q = log_queue.begin(); + while (q != log_queue.end()) { + const LogEntry &entry(*q); + if (entry.seq > last) + break; + logger().debug(" logged {}", entry); + q = log_queue.erase(q); + } + return seastar::now(); +} + +LogChannelRef LogClient::create_channel(const std::string& name) { + auto it = channels.find(name); + if (it == channels.end()) { + it = channels.insert(it, + {name, seastar::make_lw_shared<LogChannel>(this, name)}); + } + return it->second; +} + +seastar::future<> LogClient::set_fsid(const uuid_d& fsid) { + m_fsid = fsid; + return seastar::now(); +} + diff --git a/src/crimson/common/logclient.h b/src/crimson/common/logclient.h new file mode 100644 index 000000000..ab9b25091 --- /dev/null +++ b/src/crimson/common/logclient.h @@ -0,0 +1,232 @@ +#ifndef CEPH_LOGCLIENT_H +#define CEPH_LOGCLIENT_H + +#include "common/LogEntry.h" +#include "common/ostream_temp.h" +#include "common/ref.h" +#include "include/health.h" +#include "crimson/net/Fwd.h" + +#include <seastar/core/future.hh> +#include <seastar/core/gate.hh> +#include <seastar/core/lowres_clock.hh> +#include <seastar/core/shared_ptr.hh> +#include <seastar/core/timer.hh> + +class LogClient; +class MLog; +class MLogAck; +class Message; +struct uuid_d; +struct Connection; + +class LogChannel; + +namespace ceph { +namespace logging { + class Graylog; +} +} + +template<typename Message> using Ref = boost::intrusive_ptr<Message>; +namespace crimson::net { + class Messenger; +} + +enum class log_flushing_t { + NO_FLUSH, + FLUSH +}; + +int parse_log_client_options(CephContext *cct, + std::map<std::string,std::string> &log_to_monitors, + std::map<std::string,std::string> &log_to_syslog, + std::map<std::string,std::string> &log_channels, + 
std::map<std::string,std::string> &log_prios, + std::map<std::string,std::string> &log_to_graylog, + std::map<std::string,std::string> &log_to_graylog_host, + std::map<std::string,std::string> &log_to_graylog_port, + uuid_d &fsid, + std::string &host); + +/** Manage where we output to and at which priority + * + * Not to be confused with the LogClient, which is the almighty coordinator + * of channels. We just deal with the boring part of the logging: send to + * syslog, send to file, generate LogEntry and queue it for the LogClient. + * + * Past queueing the LogEntry, the LogChannel is done with the whole thing. + * LogClient will deal with sending and handling of LogEntries. + */ +class LogChannel : public LoggerSinkSet +{ +public: + LogChannel(LogClient *lc, const std::string &channel); + LogChannel(LogClient *lc, const std::string &channel, + const std::string &facility, const std::string &prio); + + OstreamTemp debug() { + return OstreamTemp(CLOG_DEBUG, this); + } + void debug(std::stringstream &s) final { + do_log(CLOG_DEBUG, s); + } + /** + * Convenience function mapping health status to + * the appropriate cluster log severity. + */ + OstreamTemp health(health_status_t health) { + switch(health) { + case HEALTH_OK: + return info(); + case HEALTH_WARN: + return warn(); + case HEALTH_ERR: + return error(); + default: + // Invalid health_status_t value + ceph_abort(); + } + } + OstreamTemp info() final { + return OstreamTemp(CLOG_INFO, this); + } + void info(std::stringstream &s) final { + do_log(CLOG_INFO, s); + } + OstreamTemp warn() final { + return OstreamTemp(CLOG_WARN, this); + } + void warn(std::stringstream &s) final { + do_log(CLOG_WARN, s); + } + OstreamTemp error() final { + return OstreamTemp(CLOG_ERROR, this); + } + void error(std::stringstream &s) final { + do_log(CLOG_ERROR, s); + } + OstreamTemp sec() final { + return OstreamTemp(CLOG_SEC, this); + } + void sec(std::stringstream &s) final { + do_log(CLOG_SEC, s); + } + + void set_log_to_monitors(bool v); + void set_log_to_syslog(bool v) { + log_to_syslog = v; + } + void set_log_channel(const std::string& v) { + log_channel = v; + } + void set_log_prio(const std::string& v) { + log_prio = v; + } + void set_syslog_facility(const std::string& v) { + syslog_facility = v; + } + const std::string& get_log_prio() const { return log_prio; } + const std::string& get_log_channel() const { return log_channel; } + const std::string& get_syslog_facility() const { return syslog_facility; } + bool must_log_to_syslog() const { return log_to_syslog; } + /** + * Do we want to log to syslog? + * + * @return true if log_to_syslog is true and both channel and prio + * are not empty; false otherwise. + */ + bool do_log_to_syslog() { + return must_log_to_syslog() && + !log_prio.empty() && !log_channel.empty(); + } + bool must_log_to_monitors() { return log_to_monitors; } + + bool do_log_to_graylog() { + return (graylog != nullptr); + } + + using Ref = seastar::lw_shared_ptr<LogChannel>; + + /** + * update config values from parsed k/v std::map for each config option + * + * Pick out the relevant value based on our channel. 
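+   *
+   * For example, with a hypothetical configuration of
+   * clog_to_monitors="default=true" and a channel named "cluster", the
+   * lookup below falls back to the "default" entry and enables logging to
+   * the monitors for this channel.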
+ */ + void update_config(std::map<std::string,std::string> &log_to_monitors, + std::map<std::string,std::string> &log_to_syslog, + std::map<std::string,std::string> &log_channels, + std::map<std::string,std::string> &log_prios, + std::map<std::string,std::string> &log_to_graylog, + std::map<std::string,std::string> &log_to_graylog_host, + std::map<std::string,std::string> &log_to_graylog_port, + uuid_d &fsid, + std::string &host); + + void do_log(clog_type prio, std::stringstream& ss) final; + void do_log(clog_type prio, const std::string& s) final; + +private: + LogClient *parent; + std::string log_channel; + std::string log_prio; + std::string syslog_facility; + bool log_to_syslog; + bool log_to_monitors; + seastar::shared_ptr<ceph::logging::Graylog> graylog; +}; + +using LogChannelRef = LogChannel::Ref; + +class LogClient +{ +public: + enum logclient_flag_t { + NO_FLAGS = 0, + FLAG_MON = 0x1, + }; + + LogClient(crimson::net::Messenger *m, logclient_flag_t flags); + + virtual ~LogClient() = default; + + seastar::future<> handle_log_ack(Ref<MLogAck> m); + MessageURef get_mon_log_message(log_flushing_t flush_flag); + bool are_pending() const; + + LogChannelRef create_channel() { + return create_channel(CLOG_CHANNEL_DEFAULT); + } + + LogChannelRef create_channel(const std::string& name); + + void destroy_channel(const std::string& name) { + channels.erase(name); + } + + void shutdown() { + channels.clear(); + } + + uint64_t get_next_seq(); + entity_addrvec_t get_myaddrs() const; + const EntityName& get_myname() const; + entity_name_t get_myrank(); + version_t queue(LogEntry &entry); + void reset(); + seastar::future<> set_fsid(const uuid_d& fsid); + +private: + MessageURef _get_mon_log_message(); + + crimson::net::Messenger *messenger; + bool is_mon; + version_t last_log_sent; + version_t last_log; + std::deque<LogEntry> log_queue; + + std::map<std::string, LogChannelRef> channels; + uuid_d m_fsid; +}; +#endif + diff --git a/src/crimson/common/operation.cc b/src/crimson/common/operation.cc new file mode 100644 index 000000000..53399fb9b --- /dev/null +++ b/src/crimson/common/operation.cc @@ -0,0 +1,75 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "operation.h" + +namespace crimson { + +void Operation::dump(ceph::Formatter* f) const +{ + f->open_object_section("operation"); + f->dump_string("type", get_type_name()); + f->dump_unsigned("id", id); + { + f->open_object_section("detail"); + dump_detail(f); + f->close_section(); + } + f->close_section(); +} + +void Operation::dump_brief(ceph::Formatter* f) const +{ + f->open_object_section("operation"); + f->dump_string("type", get_type_name()); + f->dump_unsigned("id", id); + f->close_section(); +} + +std::ostream &operator<<(std::ostream &lhs, const Operation &rhs) { + lhs << rhs.get_type_name() << "(id=" << rhs.get_id() << ", detail="; + rhs.print(lhs); + lhs << ")"; + return lhs; +} + +void Blocker::dump(ceph::Formatter* f) const +{ + f->open_object_section("blocker"); + f->dump_string("op_type", get_type_name()); + { + f->open_object_section("detail"); + dump_detail(f); + f->close_section(); + } + f->close_section(); +} + +namespace detail { +void dump_time_event(const char* name, + const utime_t& timestamp, + ceph::Formatter* f) +{ + assert(f); + f->open_object_section("time_event"); + f->dump_string("name", name); + f->dump_stream("initiated_at") << timestamp; + f->close_section(); +} + +void dump_blocking_event(const char* name, + const utime_t& timestamp, + const 
Blocker* const blocker, + ceph::Formatter* f) +{ + assert(f); + f->open_object_section("blocking_event"); + f->dump_string("name", name); + f->dump_stream("initiated_at") << timestamp; + if (blocker) { + blocker->dump(f); + } + f->close_section(); +} +} // namespace detail +} // namespace crimson diff --git a/src/crimson/common/operation.h b/src/crimson/common/operation.h new file mode 100644 index 000000000..6df2c99fd --- /dev/null +++ b/src/crimson/common/operation.h @@ -0,0 +1,776 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab expandtab + +#pragma once + +#include <algorithm> +#include <array> +#include <set> +#include <vector> +#include <boost/core/demangle.hpp> +#include <boost/intrusive/list.hpp> +#include <boost/intrusive_ptr.hpp> +#include <boost/smart_ptr/intrusive_ref_counter.hpp> +#include <seastar/core/shared_mutex.hh> +#include <seastar/core/future.hh> +#include <seastar/core/timer.hh> +#include <seastar/core/lowres_clock.hh> +#include <seastar/core/future-util.hh> + +#include "include/ceph_assert.h" +#include "include/utime.h" +#include "common/Clock.h" +#include "common/Formatter.h" +#include "crimson/common/interruptible_future.h" +#include "crimson/common/smp_helpers.h" +#include "crimson/common/log.h" + +namespace ceph { + class Formatter; +} + +namespace crimson { + +using registry_hook_t = boost::intrusive::list_member_hook< + boost::intrusive::link_mode<boost::intrusive::auto_unlink>>; + +class Operation; +class Blocker; + + +namespace detail { +void dump_time_event(const char* name, + const utime_t& timestamp, + ceph::Formatter* f); +void dump_blocking_event(const char* name, + const utime_t& timestamp, + const Blocker* blocker, + ceph::Formatter* f); +} // namespace detail + +/** + * Provides an interface for dumping diagnostic information about + * why a particular op is not making progress. + */ +class Blocker { +public: + void dump(ceph::Formatter *f) const; + virtual ~Blocker() = default; + +private: + virtual void dump_detail(ceph::Formatter *f) const = 0; + virtual const char *get_type_name() const = 0; +}; + +// the main template. by default an operation has no extenral +// event handler (the empty tuple). specializing the template +// allows to define backends on per-operation-type manner. +// NOTE: basically this could be a function but C++ disallows +// differentiating return type among specializations. +template <class T> +struct EventBackendRegistry { + template <typename...> static constexpr bool always_false = false; + + static std::tuple<> get_backends() { + static_assert(always_false<T>, "Registry specialization not found"); + return {}; + } +}; + +template <class T> +struct Event { + T* that() { + return static_cast<T*>(this); + } + const T* that() const { + return static_cast<const T*>(this); + } + + template <class OpT, class... Args> + void trigger(OpT&& op, Args&&... args) { + that()->internal_backend.handle(*that(), + std::forward<OpT>(op), + std::forward<Args>(args)...); + // let's call `handle()` for concrete event type from each single + // of our backends. the order in the registry matters. + std::apply([&, //args=std::forward_as_tuple(std::forward<Args>(args)...), + this] (auto... backend) { + (..., backend.handle(*that(), + std::forward<OpT>(op), + std::forward<Args>(args)...)); + }, EventBackendRegistry<std::decay_t<OpT>>::get_backends()); + } +}; + + +// simplest event type for recording things like beginning or end +// of TrackableOperation's life. 
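+//
+// A rough usage sketch (MyOp and StartEvent are illustrative names, not part
+// of this header): a concrete operation type provides a registry
+// specialization and declares its events as TimeEvent subclasses,
+//
+//   template <> struct EventBackendRegistry<MyOp> {
+//     static std::tuple<> get_backends() { return {}; }
+//   };
+//   struct StartEvent : TimeEvent<StartEvent> {};
+//
+// and calls start_event.trigger(my_op) when the tracked point is reached;
+// dump() later reports the recorded timestamp.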
+template <class T> +struct TimeEvent : Event<T> { + struct Backend { + // `T` is passed solely to let implementations to discriminate + // basing on the type-of-event. + virtual void handle(T&, const Operation&) = 0; + }; + + // for the sake of dumping ops-in-flight. + struct InternalBackend final : Backend { + void handle(T&, const Operation&) override { + timestamp = ceph_clock_now(); + } + utime_t timestamp; + } internal_backend; + + void dump(ceph::Formatter *f) const { + auto demangled_name = boost::core::demangle(typeid(T).name()); + detail::dump_time_event( + demangled_name.c_str(), + internal_backend.timestamp, f); + } + + auto get_timestamp() const { + return internal_backend.timestamp; + } +}; + + +template <typename T> +class BlockerT : public Blocker { +public: + struct BlockingEvent : Event<typename T::BlockingEvent> { + using Blocker = std::decay_t<T>; + + struct Backend { + // `T` is based solely to let implementations to discriminate + // basing on the type-of-event. + virtual void handle(typename T::BlockingEvent&, const Operation&, const T&) = 0; + }; + + struct InternalBackend : Backend { + void handle(typename T::BlockingEvent&, + const Operation&, + const T& blocker) override { + this->timestamp = ceph_clock_now(); + this->blocker = &blocker; + } + + utime_t timestamp; + const T* blocker; + } internal_backend; + + // we don't want to make any BlockerT to be aware and coupled with + // an operation. to not templatize an entire path from an op to + // a blocker, type erasuring is used. + struct TriggerI { + TriggerI(BlockingEvent& event) : event(event) {} + + template <class FutureT> + auto maybe_record_blocking(FutureT&& fut, const T& blocker) { + if (!fut.available()) { + // a full blown call via vtable. that's the cost for templatization + // avoidance. anyway, most of the things actually have the type + // knowledge. + record_blocking(blocker); + return std::forward<FutureT>(fut).finally( + [&event=this->event, &blocker] () mutable { + // beware trigger instance may be already dead when this + // is executed! + record_unblocking(event, blocker); + }); + } + return std::forward<FutureT>(fut); + } + virtual ~TriggerI() = default; + protected: + // it's for the sake of erasing the OpT type + virtual void record_blocking(const T& blocker) = 0; + + static void record_unblocking(BlockingEvent& event, const T& blocker) { + assert(event.internal_backend.blocker == &blocker); + event.internal_backend.blocker = nullptr; + } + + BlockingEvent& event; + }; + + template <class OpT> + struct Trigger : TriggerI { + Trigger(BlockingEvent& event, const OpT& op) : TriggerI(event), op(op) {} + + template <class FutureT> + auto maybe_record_blocking(FutureT&& fut, const T& blocker) { + if (!fut.available()) { + // no need for the dynamic dispatch! if we're lucky, a compiler + // should collapse all these abstractions into a bunch of movs. 
+ this->Trigger::record_blocking(blocker); + return std::forward<FutureT>(fut).finally( + [&event=this->event, &blocker] () mutable { + Trigger::record_unblocking(event, blocker); + }); + } + return std::forward<FutureT>(fut); + } + + const OpT &get_op() { return op; } + + protected: + void record_blocking(const T& blocker) override { + this->event.trigger(op, blocker); + } + + const OpT& op; + }; + + void dump(ceph::Formatter *f) const { + auto demangled_name = boost::core::demangle(typeid(T).name()); + detail::dump_blocking_event( + demangled_name.c_str(), + internal_backend.timestamp, + internal_backend.blocker, + f); + } + }; + + virtual ~BlockerT() = default; + template <class TriggerT, class... Args> + decltype(auto) track_blocking(TriggerT&& trigger, Args&&... args) { + return std::forward<TriggerT>(trigger).maybe_record_blocking( + std::forward<Args>(args)..., static_cast<const T&>(*this)); + } + +private: + const char *get_type_name() const final { + return static_cast<const T*>(this)->type_name; + } +}; + +template <class T> +struct AggregateBlockingEvent { + struct TriggerI { + protected: + struct TriggerContainerI { + virtual typename T::TriggerI& get_trigger() = 0; + virtual ~TriggerContainerI() = default; + }; + using TriggerContainerIRef = std::unique_ptr<TriggerContainerI>; + virtual TriggerContainerIRef create_part_trigger() = 0; + + public: + template <class FutureT> + auto maybe_record_blocking(FutureT&& fut, + const typename T::Blocker& blocker) { + // AggregateBlockingEvent is supposed to be used on relatively cold + // paths (recovery), so we don't need to worry about the dynamic + // polymothps / dynamic memory's overhead. + auto tcont = create_part_trigger(); + return tcont->get_trigger().maybe_record_blocking( + std::move(fut), blocker + ).finally([tcont=std::move(tcont)] {}); + } + + virtual ~TriggerI() = default; + }; + + template <class OpT> + struct Trigger final : TriggerI { + Trigger(AggregateBlockingEvent& event, const OpT& op) + : event(event), op(op) {} + + class TriggerContainer final : public TriggerI::TriggerContainerI { + AggregateBlockingEvent& event; + typename decltype(event.events)::iterator iter; + typename T::template Trigger<OpT> trigger; + + typename T::TriggerI &get_trigger() final { + return trigger; + } + + public: + TriggerContainer(AggregateBlockingEvent& _event, const OpT& op) : + event(_event), + iter(event.events.emplace(event.events.end())), + trigger(*iter, op) {} + + ~TriggerContainer() final { + event.events.erase(iter); + } + }; + + protected: + typename TriggerI::TriggerContainerIRef create_part_trigger() final { + return std::make_unique<TriggerContainer>(event, op); + } + + private: + AggregateBlockingEvent& event; + const OpT& op; + }; + +private: + std::list<T> events; + template <class OpT> + friend class Trigger; +}; + +/** + * Common base for all crimson-osd operations. Mainly provides + * an interface for registering ops in flight and dumping + * diagnostic information. 
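+ *
+ * A minimal sketch of a concrete operation (all names are illustrative):
+ *
+ *   struct MyOp final : Operation {
+ *     unsigned get_type() const final { return 0; }
+ *     const char *get_type_name() const final { return "MyOp"; }
+ *     void print(std::ostream &os) const final { os << "MyOp"; }
+ *     void dump_detail(ceph::Formatter *) const final {}
+ *   };
+ *
+ * Instances are normally obtained through an operation registry's
+ * create_operation<MyOp>(...), which registers the op and assigns its id.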
+ */ +class Operation : public boost::intrusive_ref_counter< + Operation, boost::thread_unsafe_counter> { + public: + using id_t = uint64_t; + static constexpr id_t NULL_ID = std::numeric_limits<uint64_t>::max(); + id_t get_id() const { + return id; + } + + static constexpr bool is_trackable = false; + + virtual unsigned get_type() const = 0; + virtual const char *get_type_name() const = 0; + virtual void print(std::ostream &) const = 0; + + void dump(ceph::Formatter *f) const; + void dump_brief(ceph::Formatter *f) const; + virtual ~Operation() = default; + + private: + virtual void dump_detail(ceph::Formatter *f) const = 0; + + registry_hook_t registry_hook; + + id_t id = 0; + void set_id(id_t in_id) { + id = in_id; + } + + friend class OperationRegistryI; + template <size_t> + friend class OperationRegistryT; +}; +using OperationRef = boost::intrusive_ptr<Operation>; + +std::ostream &operator<<(std::ostream &, const Operation &op); + +/** + * Maintains a set of lists of all active ops. + */ +class OperationRegistryI { + using op_list_member_option = boost::intrusive::member_hook< + Operation, + registry_hook_t, + &Operation::registry_hook + >; + + friend class Operation; + seastar::timer<seastar::lowres_clock> shutdown_timer; + seastar::promise<> shutdown; + +protected: + virtual void do_register(Operation *op) = 0; + virtual bool registries_empty() const = 0; + virtual void do_stop() = 0; + +public: + using op_list = boost::intrusive::list< + Operation, + op_list_member_option, + boost::intrusive::constant_time_size<false>>; + + template <typename T, typename... Args> + auto create_operation(Args&&... args) { + boost::intrusive_ptr<T> op = new T(std::forward<Args>(args)...); + do_register(&*op); + return op; + } + + seastar::future<> stop() { + crimson::get_logger(ceph_subsys_osd).info("OperationRegistryI::{}", __func__); + do_stop(); + shutdown_timer.set_callback([this] { + if (registries_empty()) { + shutdown.set_value(); + shutdown_timer.cancel(); + } + }); + shutdown_timer.arm_periodic( + std::chrono::milliseconds(100/*TODO: use option instead*/)); + return shutdown.get_future(); + } +}; + + +template <size_t NUM_REGISTRIES> +class OperationRegistryT : public OperationRegistryI { + Operation::id_t next_id = 0; + std::array< + op_list, + NUM_REGISTRIES + > registries; + +protected: + void do_register(Operation *op) final { + const auto op_type = op->get_type(); + registries[op_type].push_back(*op); + op->set_id(++next_id); + } + + bool registries_empty() const final { + return std::all_of(registries.begin(), + registries.end(), + [](auto& opl) { + return opl.empty(); + }); + } + +protected: + OperationRegistryT(core_id_t core) + // Use core to initialize upper 8 bits of counters to ensure that + // ids generated by different cores are disjoint + : next_id(static_cast<id_t>(core) << + (std::numeric_limits<id_t>::digits - 8)) + {} + + template <size_t REGISTRY_INDEX> + const op_list& get_registry() const { + static_assert( + REGISTRY_INDEX < std::tuple_size<decltype(registries)>::value); + return registries[REGISTRY_INDEX]; + } + + template <size_t REGISTRY_INDEX> + op_list& get_registry() { + static_assert( + REGISTRY_INDEX < std::tuple_size<decltype(registries)>::value); + return registries[REGISTRY_INDEX]; + } + +public: + /// Iterate over live ops + template <typename F> + void for_each_op(F &&f) const { + for (const auto ®istry: registries) { + for (const auto &op: registry) { + std::invoke(f, op); + } + } + } + + /// Removes op from registry + void remove_from_registry(Operation 
&op) { + const auto op_type = op.get_type(); + registries[op_type].erase(op_list::s_iterator_to(op)); + } + + /// Adds op to registry + void add_to_registry(Operation &op) { + const auto op_type = op.get_type(); + registries[op_type].push_back(op); + } +}; + +class PipelineExitBarrierI { +public: + using Ref = std::unique_ptr<PipelineExitBarrierI>; + + /// Waits for exit barrier + virtual std::optional<seastar::future<>> wait() = 0; + + /// Releases pipeline stage, can only be called after wait + virtual void exit() = 0; + + /// Releases pipeline resources without waiting on barrier + virtual void cancel() = 0; + + /// Must ensure that resources are released, likely by calling cancel() + virtual ~PipelineExitBarrierI() {} +}; + +template <class T> +class PipelineStageIT : public BlockerT<T> { + const core_id_t core = seastar::this_shard_id(); +public: + core_id_t get_core() const { return core; } + + template <class... Args> + decltype(auto) enter(Args&&... args) { + return static_cast<T*>(this)->enter(std::forward<Args>(args)...); + } +}; + +class PipelineHandle { + PipelineExitBarrierI::Ref barrier; + + std::optional<seastar::future<>> wait_barrier() { + return barrier ? barrier->wait() : std::nullopt; + } + +public: + PipelineHandle() = default; + + PipelineHandle(const PipelineHandle&) = delete; + PipelineHandle(PipelineHandle&&) = default; + PipelineHandle &operator=(const PipelineHandle&) = delete; + PipelineHandle &operator=(PipelineHandle&&) = default; + + /** + * Returns a future which unblocks when the handle has entered the passed + * OrderedPipelinePhase. If already in a phase, enter will also release + * that phase after placing itself in the queue for the next one to preserve + * ordering. + */ + template <typename OpT, typename T> + seastar::future<> + enter(T &stage, typename T::BlockingEvent::template Trigger<OpT>&& t) { + ceph_assert(stage.get_core() == seastar::this_shard_id()); + auto wait_fut = wait_barrier(); + if (wait_fut.has_value()) { + return wait_fut.value().then([this, &stage, t=std::move(t)] () mutable { + auto fut = t.maybe_record_blocking(stage.enter(t), stage); + exit(); + return std::move(fut).then( + [this, t=std::move(t)](auto &&barrier_ref) mutable { + barrier = std::move(barrier_ref); + return seastar::now(); + }); + }); + } else { + auto fut = t.maybe_record_blocking(stage.enter(t), stage); + exit(); + return std::move(fut).then( + [this, t=std::move(t)](auto &&barrier_ref) mutable { + barrier = std::move(barrier_ref); + return seastar::now(); + }); + } + } + + /** + * Completes pending exit barrier without entering a new one. + */ + seastar::future<> complete() { + auto ret = wait_barrier(); + barrier.reset(); + return ret ? std::move(ret.value()) : seastar::now(); + } + + /** + * Exits current phase, skips exit barrier, should only be used for op + * failure. Permitting the handle to be destructed as the same effect. + */ + void exit() { + barrier.reset(); + } + +}; + +/** + * Ensures that at most one op may consider itself in the phase at a time. + * Ops will see enter() unblock in the order in which they tried to enter + * the phase. entering (though not necessarily waiting for the future to + * resolve) a new phase prior to exiting the previous one will ensure that + * the op ordering is preserved. 
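+ *
+ * A rough usage sketch (the names are illustrative): a pipeline declares a
+ * phase such as
+ *
+ *   struct WaitForLock : OrderedExclusivePhaseT<WaitForLock> {
+ *     static constexpr auto type_name = "MyPipeline::wait_for_lock";
+ *   } wait_for_lock;
+ *
+ * and an operation holding a PipelineHandle moves into it with
+ * handle.enter<MyOp>(wait_for_lock, std::move(trigger)), unblocking only
+ * after every earlier op has released the phase.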
+ */ +template <class T> +class OrderedExclusivePhaseT : public PipelineStageIT<T> { + void dump_detail(ceph::Formatter *f) const final { + f->dump_unsigned("waiting", waiting); + if (held_by != Operation::NULL_ID) { + f->dump_unsigned("held_by_operation_id", held_by); + } + } + + class ExitBarrier final : public PipelineExitBarrierI { + OrderedExclusivePhaseT *phase; + Operation::id_t op_id; + public: + ExitBarrier(OrderedExclusivePhaseT *phase, Operation::id_t id) + : phase(phase), op_id(id) {} + + std::optional<seastar::future<>> wait() final { + return std::nullopt; + } + + void exit() final { + if (phase) { + auto *p = phase; + auto id = op_id; + phase = nullptr; + std::ignore = seastar::smp::submit_to( + p->get_core(), + [p, id] { + p->exit(id); + }); + } + } + + void cancel() final { + exit(); + } + + ~ExitBarrier() final { + cancel(); + } + }; + + void exit(Operation::id_t op_id) { + clear_held_by(op_id); + mutex.unlock(); + } + +public: + template <class TriggerT> + seastar::future<PipelineExitBarrierI::Ref> enter(TriggerT& t) { + waiting++; + return mutex.lock().then([this, op_id=t.get_op().get_id()] { + ceph_assert_always(waiting > 0); + --waiting; + set_held_by(op_id); + return PipelineExitBarrierI::Ref(new ExitBarrier{this, op_id}); + }); + } + +private: + void set_held_by(Operation::id_t id) { + ceph_assert_always(held_by == Operation::NULL_ID); + held_by = id; + } + + void clear_held_by(Operation::id_t id) { + ceph_assert_always(held_by == id); + held_by = Operation::NULL_ID; + } + + unsigned waiting = 0; + seastar::shared_mutex mutex; + Operation::id_t held_by = Operation::NULL_ID; +}; + +/** + * Permits multiple ops to inhabit the stage concurrently, but ensures that + * they will proceed to the next stage in the order in which they called + * enter. 
+ */ +template <class T> +class OrderedConcurrentPhaseT : public PipelineStageIT<T> { + using base_t = PipelineStageIT<T>; +public: + struct BlockingEvent : base_t::BlockingEvent { + using base_t::BlockingEvent::BlockingEvent; + + struct ExitBarrierEvent : TimeEvent<ExitBarrierEvent> {}; + + template <class OpT> + struct Trigger : base_t::BlockingEvent::template Trigger<OpT> { + using base_t::BlockingEvent::template Trigger<OpT>::Trigger; + + template <class FutureT> + decltype(auto) maybe_record_exit_barrier(FutureT&& fut) { + if (!fut.available()) { + exit_barrier_event.trigger(this->op); + } + return std::forward<FutureT>(fut); + } + + ExitBarrierEvent exit_barrier_event; + }; + }; + +private: + void dump_detail(ceph::Formatter *f) const final {} + + template <class TriggerT> + class ExitBarrier final : public PipelineExitBarrierI { + OrderedConcurrentPhaseT *phase; + std::optional<seastar::future<>> barrier; + TriggerT trigger; + public: + ExitBarrier( + OrderedConcurrentPhaseT *phase, + seastar::future<> &&barrier, + TriggerT& trigger) : phase(phase), barrier(std::move(barrier)), trigger(trigger) {} + + std::optional<seastar::future<>> wait() final { + assert(phase); + assert(barrier); + auto ret = std::move(*barrier); + barrier = std::nullopt; + return trigger.maybe_record_exit_barrier(std::move(ret)); + } + + void exit() final { + if (barrier) { + static_cast<void>( + std::move(*barrier).then([phase=this->phase] { phase->mutex.unlock(); })); + barrier = std::nullopt; + phase = nullptr; + } + if (phase) { + std::ignore = seastar::smp::submit_to( + phase->get_core(), + [this] { + phase->mutex.unlock(); + phase = nullptr; + }); + } + } + + void cancel() final { + exit(); + } + + ~ExitBarrier() final { + cancel(); + } + }; + +public: + template <class TriggerT> + seastar::future<PipelineExitBarrierI::Ref> enter(TriggerT& t) { + return seastar::make_ready_future<PipelineExitBarrierI::Ref>( + new ExitBarrier<TriggerT>{this, mutex.lock(), t}); + } + +private: + seastar::shared_mutex mutex; +}; + +/** + * Imposes no ordering or exclusivity at all. Ops enter without constraint and + * may exit in any order. Useful mainly for informational purposes between + * stages with constraints. + */ +template <class T> +class UnorderedStageT : public PipelineStageIT<T> { + void dump_detail(ceph::Formatter *f) const final {} + + class ExitBarrier final : public PipelineExitBarrierI { + public: + ExitBarrier() = default; + + std::optional<seastar::future<>> wait() final { + return std::nullopt; + } + + void exit() final {} + + void cancel() final {} + + ~ExitBarrier() final {} + }; + +public: + template <class... IgnoreArgs> + seastar::future<PipelineExitBarrierI::Ref> enter(IgnoreArgs&&...) 
{ + return seastar::make_ready_future<PipelineExitBarrierI::Ref>( + new ExitBarrier); + } +}; + +} + +#if FMT_VERSION >= 90000 +template <> struct fmt::formatter<crimson::Operation> : fmt::ostream_formatter {}; +#endif diff --git a/src/crimson/common/perf_counters_collection.cc b/src/crimson/common/perf_counters_collection.cc new file mode 100644 index 000000000..254d85278 --- /dev/null +++ b/src/crimson/common/perf_counters_collection.cc @@ -0,0 +1,41 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "common/ceph_context.h" +#include "perf_counters_collection.h" + +namespace crimson::common { +PerfCountersCollection::PerfCountersCollection() +{ + perf_collection = std::make_unique<PerfCountersCollectionImpl>(); +} +PerfCountersCollection::~PerfCountersCollection() +{ + perf_collection->clear(); +} + +PerfCountersCollectionImpl* PerfCountersCollection:: get_perf_collection() +{ + return perf_collection.get(); +} + +void PerfCountersCollection::dump_formatted(ceph::Formatter *f, bool schema, + bool dump_labeled, + const std::string &logger, + const std::string &counter) +{ + perf_collection->dump_formatted(f, schema, dump_labeled, logger, counter); +} + +PerfCountersCollection::ShardedPerfCountersCollection PerfCountersCollection::sharded_perf_coll; + +void PerfCountersDeleter::operator()(PerfCounters* p) noexcept +{ + if (cct) { + cct->get_perfcounters_collection()->remove(p); + } + delete p; +} + +} + diff --git a/src/crimson/common/perf_counters_collection.h b/src/crimson/common/perf_counters_collection.h new file mode 100644 index 000000000..ae0c8670c --- /dev/null +++ b/src/crimson/common/perf_counters_collection.h @@ -0,0 +1,49 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "common/perf_counters.h" +#include "include/common_fwd.h" +#include <seastar/core/sharded.hh> + +using crimson::common::PerfCountersCollectionImpl; +namespace crimson::common { +class PerfCountersCollection: public seastar::sharded<PerfCountersCollection> +{ + using ShardedPerfCountersCollection = seastar::sharded<PerfCountersCollection>; + +private: + std::unique_ptr<PerfCountersCollectionImpl> perf_collection; + static ShardedPerfCountersCollection sharded_perf_coll; + friend PerfCountersCollection& local_perf_coll(); + friend ShardedPerfCountersCollection& sharded_perf_coll(); + +public: + PerfCountersCollection(); + ~PerfCountersCollection(); + PerfCountersCollectionImpl* get_perf_collection(); + void dump_formatted(ceph::Formatter *f, bool schema, bool dump_labeled, + const std::string &logger = "", + const std::string &counter = ""); +}; + +inline PerfCountersCollection::ShardedPerfCountersCollection& sharded_perf_coll(){ + return PerfCountersCollection::sharded_perf_coll; +} + +inline PerfCountersCollection& local_perf_coll() { + return PerfCountersCollection::sharded_perf_coll.local(); +} + +class PerfCountersDeleter { + CephContext* cct; + +public: + PerfCountersDeleter() noexcept : cct(nullptr) {} + PerfCountersDeleter(CephContext* cct) noexcept : cct(cct) {} + void operator()(PerfCounters* p) noexcept; +}; +} +using PerfCountersRef = std::unique_ptr<crimson::common::PerfCounters, crimson::common::PerfCountersDeleter>; + diff --git a/src/crimson/common/shared_lru.h b/src/crimson/common/shared_lru.h new file mode 100644 index 000000000..186f02a61 --- /dev/null +++ b/src/crimson/common/shared_lru.h @@ -0,0 +1,180 @@ +// -*- mode:C++; tab-width:8; 
c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <memory> +#include <optional> +#include <boost/smart_ptr/local_shared_ptr.hpp> +#include <boost/smart_ptr/weak_ptr.hpp> +#include "simple_lru.h" + +/// SharedLRU does its best to cache objects. It not only tracks the objects +/// in its LRU cache with strong references, it also tracks objects with +/// weak_ptr even if the cache does not hold any strong references to them. so +/// that it can return the objects after they are evicted, as long as they've +/// ever been cached and have not been destroyed yet. +template<class K, class V> +class SharedLRU { + using shared_ptr_t = boost::local_shared_ptr<V>; + using weak_ptr_t = boost::weak_ptr<V>; + using value_type = std::pair<K, shared_ptr_t>; + + // weak_refs is already ordered, and we don't use accessors like + // LRUCache::lower_bound(), so unordered LRUCache would suffice. + SimpleLRU<K, shared_ptr_t, false> cache; + std::map<K, std::pair<weak_ptr_t, V*>> weak_refs; + + struct Deleter { + SharedLRU<K,V>* cache; + const K key; + void operator()(V* ptr) { + cache->_erase_weak(key); + delete ptr; + } + }; + void _erase_weak(const K& key) { + weak_refs.erase(key); + } +public: + SharedLRU(size_t max_size = 20) + : cache{max_size} + {} + ~SharedLRU() { + cache.clear(); + // initially, we were assuming that no pointer obtained from SharedLRU + // can outlive the lru itself. However, since going with the interruption + // concept for handling shutdowns, this is no longer valid. + weak_refs.clear(); + } + /** + * Returns a reference to the given key, and perform an insertion if such + * key does not already exist + */ + shared_ptr_t operator[](const K& key); + /** + * Returns true iff there are no live references left to anything that has been + * in the cache. + */ + bool empty() const { + return weak_refs.empty(); + } + size_t size() const { + return cache.size(); + } + size_t capacity() const { + return cache.capacity(); + } + /*** + * Inserts a key if not present, or bumps it to the front of the LRU if + * it is, and then gives you a reference to the value. If the key already + * existed, you are responsible for deleting the new value you tried to + * insert. + * + * @param key The key to insert + * @param value The value that goes with the key + * @param existed Set to true if the value was already in the + * map, false otherwise + * @return A reference to the map's value for the given key + */ + shared_ptr_t insert(const K& key, std::unique_ptr<V> value); + // clear all strong reference from the lru. 
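  // (Editor's note, illustrative only: clear() drops the strong references
  //  held by the inner SimpleLRU, but an entry stays reachable through the
  //  weak_refs side table for as long as some caller still holds its pointer.
  //  A hypothetical usage sketch:
  //
  //    SharedLRU<int, std::string> lru{4};
  //    auto v = lru.insert(1, std::make_unique<std::string>("one"));
  //    lru.clear();                 // the cache's strong refs are gone
  //    assert(lru.find(1) == v);    // still found via weak_refs, re-cached
  //  )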
+ void clear() { + cache.clear(); + } + shared_ptr_t find(const K& key); + // return the last element that is not greater than key + shared_ptr_t lower_bound(const K& key); + // return the first element that is greater than key + std::optional<value_type> upper_bound(const K& key); + + void erase(const K& key) { + cache.erase(key); + _erase_weak(key); + } +}; + +template<class K, class V> +typename SharedLRU<K,V>::shared_ptr_t +SharedLRU<K,V>::insert(const K& key, std::unique_ptr<V> value) +{ + shared_ptr_t val; + if (auto found = weak_refs.find(key); found != weak_refs.end()) { + val = found->second.first.lock(); + } + if (!val) { + val.reset(value.release(), Deleter{this, key}); + weak_refs.emplace(key, std::make_pair(val, val.get())); + } + cache.insert(key, val); + return val; +} + +template<class K, class V> +typename SharedLRU<K,V>::shared_ptr_t +SharedLRU<K,V>::operator[](const K& key) +{ + if (auto found = cache.find(key); found) { + return *found; + } + shared_ptr_t val; + if (auto found = weak_refs.find(key); found != weak_refs.end()) { + val = found->second.first.lock(); + } + if (!val) { + val.reset(new V{}, Deleter{this, key}); + weak_refs.emplace(key, std::make_pair(val, val.get())); + } + cache.insert(key, val); + return val; +} + +template<class K, class V> +typename SharedLRU<K,V>::shared_ptr_t +SharedLRU<K,V>::find(const K& key) +{ + if (auto found = cache.find(key); found) { + return *found; + } + shared_ptr_t val; + if (auto found = weak_refs.find(key); found != weak_refs.end()) { + val = found->second.first.lock(); + } + if (val) { + cache.insert(key, val); + } + return val; +} + +template<class K, class V> +typename SharedLRU<K,V>::shared_ptr_t +SharedLRU<K,V>::lower_bound(const K& key) +{ + if (weak_refs.empty()) { + return {}; + } + auto found = weak_refs.lower_bound(key); + if (found == weak_refs.end()) { + --found; + } + if (auto val = found->second.first.lock(); val) { + cache.insert(key, val); + return val; + } else { + return {}; + } +} + +template<class K, class V> +std::optional<typename SharedLRU<K,V>::value_type> +SharedLRU<K,V>::upper_bound(const K& key) +{ + for (auto found = weak_refs.upper_bound(key); + found != weak_refs.end(); + ++found) { + if (auto val = found->second.first.lock(); val) { + return std::make_pair(found->first, val); + } + } + return std::nullopt; +} diff --git a/src/crimson/common/simple_lru.h b/src/crimson/common/simple_lru.h new file mode 100644 index 000000000..1419c4885 --- /dev/null +++ b/src/crimson/common/simple_lru.h @@ -0,0 +1,141 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <list> +#include <map> +#include <optional> +#include <type_traits> +#include <unordered_map> + +template <class Key, class Value, bool Ordered> +class SimpleLRU { + static_assert(std::is_default_constructible_v<Value>); + using list_type = std::list<Key>; + template<class K, class V> + using map_t = std::conditional_t<Ordered, + std::map<K, V>, + std::unordered_map<K, V>>; + using map_type = map_t<Key, std::pair<Value, typename list_type::iterator>>; + list_type lru; + map_type cache; + const size_t max_size; + +public: + SimpleLRU(size_t size = 20) + : cache(size), + max_size(size) + {} + size_t size() const { + return cache.size(); + } + size_t capacity() const { + return max_size; + } + using insert_return_type = std::pair<Value, bool>; + insert_return_type insert(const Key& key, Value value); + std::optional<Value> find(const Key& key); + 
std::optional<std::enable_if<Ordered, Value>> lower_bound(const Key& key); + void erase(const Key& key); + void clear(); +private: + // bump the item to the front of the lru list + Value _lru_add(typename map_type::iterator found); + // evict the last element of most recently used list + void _evict(); +}; + +template <class Key, class Value, bool Ordered> +typename SimpleLRU<Key,Value,Ordered>::insert_return_type +SimpleLRU<Key,Value,Ordered>::insert(const Key& key, Value value) +{ + if constexpr(Ordered) { + auto found = cache.lower_bound(key); + if (found != cache.end() && found->first == key) { + // already exists + return {found->second.first, true}; + } else { + if (size() >= capacity()) { + _evict(); + } + lru.push_front(key); + // use lower_bound as hint to save the lookup + cache.emplace_hint(found, key, std::make_pair(value, lru.begin())); + return {std::move(value), false}; + } + } else { + // cache is not ordered + auto found = cache.find(key); + if (found != cache.end()) { + // already exists + return {found->second.first, true}; + } else { + if (size() >= capacity()) { + _evict(); + } + lru.push_front(key); + cache.emplace(key, std::make_pair(value, lru.begin())); + return {std::move(value), false}; + } + } +} + +template <class Key, class Value, bool Ordered> +std::optional<Value> SimpleLRU<Key,Value,Ordered>::find(const Key& key) +{ + if (auto found = cache.find(key); found != cache.end()){ + return _lru_add(found); + } else { + return {}; + } +} + +template <class Key, class Value, bool Ordered> +std::optional<std::enable_if<Ordered, Value>> +SimpleLRU<Key,Value,Ordered>::lower_bound(const Key& key) +{ + if (auto found = cache.lower_bound(key); found != cache.end()) { + return _lru_add(found); + } else { + return {}; + } +} + +template <class Key, class Value, bool Ordered> +void SimpleLRU<Key,Value,Ordered>::clear() +{ + lru.clear(); + cache.clear(); +} + +template <class Key, class Value, bool Ordered> +void SimpleLRU<Key,Value,Ordered>::erase(const Key& key) +{ + if (auto found = cache.find(key); found != cache.end()) { + lru.erase(found->second.second); + cache.erase(found); + } +} + +template <class Key, class Value, bool Ordered> +Value SimpleLRU<Key,Value,Ordered>::_lru_add( + typename SimpleLRU<Key,Value,Ordered>::map_type::iterator found) +{ + auto& [value, in_lru] = found->second; + if (in_lru != lru.begin()){ + // move item to the front + lru.splice(lru.begin(), lru, in_lru); + } + // the item is already at the front + return value; +} + +template <class Key, class Value, bool Ordered> +void SimpleLRU<Key,Value,Ordered>::_evict() +{ + // evict the last element of most recently used list + auto last = --lru.end(); + cache.erase(*last); + lru.erase(last); +} diff --git a/src/crimson/common/smp_helpers.h b/src/crimson/common/smp_helpers.h new file mode 100644 index 000000000..c2b7bd964 --- /dev/null +++ b/src/crimson/common/smp_helpers.h @@ -0,0 +1,92 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <limits> + +#include <seastar/core/smp.hh> + +#include "crimson/common/errorator.h" +#include "crimson/common/utility.h" + +namespace crimson { + +using core_id_t = seastar::shard_id; +static constexpr core_id_t NULL_CORE = std::numeric_limits<core_id_t>::max(); + +auto submit_to(core_id_t core, auto &&f) { + using ret_type = decltype(f()); + if constexpr (is_errorated_future_v<ret_type>) { + auto ret = seastar::smp::submit_to( + core, + [f=std::move(f)]() mutable { + return f().to_base(); 
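        // (Editor's note: an errorated future is lowered to its base
        //  seastar::future here so it can be returned across smp::submit_to,
        //  and is re-wrapped into ret_type on the calling side just below.)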
+ }); + return ret_type(std::move(ret)); + } else { + return seastar::smp::submit_to(core, std::move(f)); + } +} + +template <typename Obj, typename Method, typename... Args> +auto proxy_method_on_core( + core_id_t core, Obj &obj, Method method, Args&&... args) { + return crimson::submit_to( + core, + [&obj, method, + arg_tuple=std::make_tuple(std::forward<Args>(args)...)]() mutable { + return apply_method_to_tuple(obj, method, std::move(arg_tuple)); + }); +} + +/** + * reactor_map_seq + * + * Invokes f on each reactor sequentially, Caller may assume that + * f will not be invoked concurrently on multiple cores. + */ +template <typename F> +auto reactor_map_seq(F &&f) { + using ret_type = decltype(f()); + if constexpr (is_errorated_future_v<ret_type>) { + auto ret = crimson::do_for_each( + seastar::smp::all_cpus().begin(), + seastar::smp::all_cpus().end(), + [f=std::move(f)](auto core) mutable { + return seastar::smp::submit_to( + core, + [&f] { + return std::invoke(f); + }); + }); + return ret_type(ret); + } else { + return seastar::do_for_each( + seastar::smp::all_cpus().begin(), + seastar::smp::all_cpus().end(), + [f=std::move(f)](auto core) mutable { + return seastar::smp::submit_to( + core, + [&f] { + return std::invoke(f); + }); + }); + } +} + +/** + * sharded_map_seq + * + * Invokes f on each shard of t sequentially. Caller may assume that + * f will not be invoked concurrently on multiple cores. + */ +template <typename T, typename F> +auto sharded_map_seq(T &t, F &&f) { + return reactor_map_seq( + [&t, f=std::forward<F>(f)]() mutable { + return std::invoke(f, t.local()); + }); +} + +} diff --git a/src/crimson/common/throttle.cc b/src/crimson/common/throttle.cc new file mode 100644 index 000000000..88d1859f3 --- /dev/null +++ b/src/crimson/common/throttle.cc @@ -0,0 +1,64 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#include "throttle.h" + +namespace crimson::common { + +int64_t Throttle::take(int64_t c) +{ + if (max == 0u) { + return 0; + } + count += c; + return count; +} + +int64_t Throttle::put(int64_t c) +{ + if (max == 0u) { + return 0; + } + if (!c) { + return count; + } + on_free_slots.signal(); + count -= c; + return count; +} + +seastar::future<> Throttle::get(size_t c) +{ + if (max == 0u) { + return seastar::make_ready_future<>(); + } + pending++; + return on_free_slots.wait([this, c] { + return !_should_wait(c); + }).then([this, c] { + pending--; + count += c; + return seastar::make_ready_future<>(); + }); +} + +void Throttle::reset_max(size_t m) { + if (max == m) { + return; + } + + if (m > max) { + on_free_slots.signal(); + } + max = m; +} + +bool Throttle::_should_wait(size_t c) const { + if (!max) { + return false; + } + return ((c <= max && count + c > max) || // normally stay under max + (c >= max && count > max)); // except for large c +} + +} // namespace crimson::common diff --git a/src/crimson/common/throttle.h b/src/crimson/common/throttle.h new file mode 100644 index 000000000..2998cb5f8 --- /dev/null +++ b/src/crimson/common/throttle.h @@ -0,0 +1,43 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <seastar/core/condition-variable.hh> +// pull seastar::timer<...>::timer definitions. FIX SEASTAR or reactor.hh +// is obligatory and should be included everywhere? 
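// (The include just below pulls in reactor.hh as noted above. Editor's note on
//  the Throttle declared further down, an illustrative sketch only; do_io() is
//  a hypothetical name:
//
//    crimson::common::Throttle throttle{64};
//    return throttle.get(1).then([&throttle] {   // waits while count+1 > max
//      return do_io().finally([&throttle] {
//        throttle.put(1);                        // frees the slot, wakes waiters
//      });
//    });
//  )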
+#include <seastar/core/reactor.hh> + +#include "common/ThrottleInterface.h" + +namespace crimson::common { + +class Throttle final : public ThrottleInterface { + size_t max = 0; + size_t count = 0; + size_t pending = 0; + // we cannot change the "count" of seastar::semaphore after it is created, + // so use condition_variable instead. + seastar::condition_variable on_free_slots; +public: + explicit Throttle(size_t m) + : max(m) + {} + int64_t take(int64_t c = 1) override; + int64_t put(int64_t c = 1) override; + seastar::future<> get(size_t c); + size_t get_current() const { + return count; + } + size_t get_max() const { + return max; + } + size_t get_pending() const { + return pending; + } + void reset_max(size_t m); +private: + bool _should_wait(size_t c) const; +}; + +} // namespace crimson::common diff --git a/src/crimson/common/tmap_helpers.cc b/src/crimson/common/tmap_helpers.cc new file mode 100644 index 000000000..9c14ebc45 --- /dev/null +++ b/src/crimson/common/tmap_helpers.cc @@ -0,0 +1,131 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "crimson/common/tmap_helpers.h" + +#include "include/buffer.h" +#include "include/encoding.h" +#include "include/rados.h" + +namespace detail { + +#define decode_or_return(v, bp) \ + try { \ + ::decode(v, bp); \ + } catch (...) { \ + return -EINVAL; \ + } + +class TMapContents { + std::map<std::string, bufferlist> keys; + bufferlist header; +public: + TMapContents() = default; + + int decode(bufferlist::const_iterator &bliter) { + keys.clear(); + header.clear(); + if (bliter.end()) { + return 0; + } + decode_or_return(header, bliter); + __u32 num_keys; + decode_or_return(num_keys, bliter); + for (; num_keys > 0; --num_keys) { + std::string key; + decode_or_return(key, bliter); + decode_or_return(keys[key], bliter); + } + return 0; + } + + bufferlist encode() { + bufferlist bl; + ::encode(header, bl); + ::encode(static_cast<__u32>(keys.size()), bl); + for (auto &[k, v]: keys) { + ::encode(k, bl); + ::encode(v, bl); + } + return bl; + } + + int update(bufferlist::const_iterator in) { + while (!in.end()) { + __u8 op; + decode_or_return(op, in); + + if (op == CEPH_OSD_TMAP_HDR) { + decode_or_return(header, in); + continue; + } + + std::string key; + decode_or_return(key, in); + + switch (op) { + case CEPH_OSD_TMAP_SET: { + decode_or_return(keys[key], in); + break; + } + case CEPH_OSD_TMAP_CREATE: { + if (keys.contains(key)) { + return -EEXIST; + } + decode_or_return(keys[key], in); + break; + } + case CEPH_OSD_TMAP_RM: { + auto kiter = keys.find(key); + if (kiter == keys.end()) { + return -ENOENT; + } + keys.erase(kiter); + break; + } + case CEPH_OSD_TMAP_RMSLOPPY: { + keys.erase(key); + break; + } + } + } + return 0; + } + + int put(bufferlist::const_iterator in) { + return 0; + } +}; + +} + +namespace crimson::common { + +using do_tmap_up_ret = tl::expected<bufferlist, int>; +do_tmap_up_ret do_tmap_up(bufferlist::const_iterator in, bufferlist contents) +{ + detail::TMapContents tmap; + auto bliter = contents.cbegin(); + int r = tmap.decode(bliter); + if (r < 0) { + return tl::unexpected(r); + } + r = tmap.update(in); + if (r < 0) { + return tl::unexpected(r); + } + return tmap.encode(); +} + +using do_tmap_up_ret = tl::expected<bufferlist, int>; +do_tmap_up_ret do_tmap_put(bufferlist::const_iterator in) +{ + detail::TMapContents tmap; + int r = tmap.decode(in); + if (r < 0) { + return tl::unexpected(r); + } + return tmap.encode(); +} + +} diff --git 
a/src/crimson/common/tmap_helpers.h b/src/crimson/common/tmap_helpers.h new file mode 100644 index 000000000..446dbea2a --- /dev/null +++ b/src/crimson/common/tmap_helpers.h @@ -0,0 +1,40 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "include/expected.hpp" + +#include "include/buffer.h" +#include "include/encoding.h" + +namespace crimson::common { + +/** + * do_tmap_up + * + * Performs tmap update instructions encoded in buffer referenced by in. + * + * @param [in] in iterator to buffer containing encoded tmap update operations + * @param [in] contents current contents of object + * @return buffer containing new object contents, + * -EINVAL for decoding errors, + * -EEXIST for CEPH_OSD_TMAP_CREATE on a key that exists + * -ENOENT for CEPH_OSD_TMAP_RM on a key that does not exist + */ +using do_tmap_up_ret = tl::expected<bufferlist, int>; +do_tmap_up_ret do_tmap_up(bufferlist::const_iterator in, bufferlist contents); + +/** + * do_tmap_put + * + * Validates passed buffer pointed to by in and returns resulting object buffer. + * + * @param [in] in iterator to buffer containing tmap encoding + * @return buffer containing validated tmap encoded by in + * -EINVAL for decoding errors, + */ +using do_tmap_up_ret = tl::expected<bufferlist, int>; +do_tmap_up_ret do_tmap_put(bufferlist::const_iterator in); + +} diff --git a/src/crimson/common/tri_mutex.cc b/src/crimson/common/tri_mutex.cc new file mode 100644 index 000000000..e4b181280 --- /dev/null +++ b/src/crimson/common/tri_mutex.cc @@ -0,0 +1,225 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#include "tri_mutex.h" + +seastar::future<> read_lock::lock() +{ + return static_cast<tri_mutex*>(this)->lock_for_read(); +} + +void read_lock::unlock() +{ + static_cast<tri_mutex*>(this)->unlock_for_read(); +} + +seastar::future<> write_lock::lock() +{ + return static_cast<tri_mutex*>(this)->lock_for_write(false); +} + +void write_lock::unlock() +{ + static_cast<tri_mutex*>(this)->unlock_for_write(); +} + +seastar::future<> excl_lock::lock() +{ + return static_cast<tri_mutex*>(this)->lock_for_excl(); +} + +void excl_lock::unlock() +{ + static_cast<tri_mutex*>(this)->unlock_for_excl(); +} + +seastar::future<> excl_lock_from_read::lock() +{ + static_cast<tri_mutex*>(this)->promote_from_read(); + return seastar::make_ready_future<>(); +} + +void excl_lock_from_read::unlock() +{ + static_cast<tri_mutex*>(this)->demote_to_read(); +} + +seastar::future<> excl_lock_from_write::lock() +{ + static_cast<tri_mutex*>(this)->promote_from_write(); + return seastar::make_ready_future<>(); +} + +void excl_lock_from_write::unlock() +{ + static_cast<tri_mutex*>(this)->demote_to_write(); +} + +seastar::future<> excl_lock_from_excl::lock() +{ + return seastar::make_ready_future<>(); +} + +void excl_lock_from_excl::unlock() +{ +} + +tri_mutex::~tri_mutex() +{ + assert(!is_acquired()); +} + +seastar::future<> tri_mutex::lock_for_read() +{ + if (try_lock_for_read()) { + return seastar::make_ready_future<>(); + } + waiters.emplace_back(seastar::promise<>(), type_t::read); + return waiters.back().pr.get_future(); +} + +bool tri_mutex::try_lock_for_read() noexcept +{ + if (!writers && !exclusively_used && waiters.empty()) { + ++readers; + return true; + } else { + return false; + } +} + +void tri_mutex::unlock_for_read() +{ + assert(readers > 0); + if (--readers == 0) { + wake(); + } +} + +void tri_mutex::promote_from_read() +{ + 
assert(readers == 1); + --readers; + exclusively_used = true; +} + +void tri_mutex::demote_to_read() +{ + assert(exclusively_used); + exclusively_used = false; + ++readers; +} + +seastar::future<> tri_mutex::lock_for_write(bool greedy) +{ + if (try_lock_for_write(greedy)) { + return seastar::make_ready_future<>(); + } + waiters.emplace_back(seastar::promise<>(), type_t::write); + return waiters.back().pr.get_future(); +} + +bool tri_mutex::try_lock_for_write(bool greedy) noexcept +{ + if (!readers && !exclusively_used) { + if (greedy || waiters.empty()) { + ++writers; + return true; + } + } + return false; +} + +void tri_mutex::unlock_for_write() +{ + assert(writers > 0); + if (--writers == 0) { + wake(); + } +} + +void tri_mutex::promote_from_write() +{ + assert(writers == 1); + --writers; + exclusively_used = true; +} + +void tri_mutex::demote_to_write() +{ + assert(exclusively_used); + exclusively_used = false; + ++writers; +} + +// for exclusive users +seastar::future<> tri_mutex::lock_for_excl() +{ + if (try_lock_for_excl()) { + return seastar::make_ready_future<>(); + } + waiters.emplace_back(seastar::promise<>(), type_t::exclusive); + return waiters.back().pr.get_future(); +} + +bool tri_mutex::try_lock_for_excl() noexcept +{ + if (readers == 0u && writers == 0u && !exclusively_used) { + exclusively_used = true; + return true; + } else { + return false; + } +} + +void tri_mutex::unlock_for_excl() +{ + assert(exclusively_used); + exclusively_used = false; + wake(); +} + +bool tri_mutex::is_acquired() const +{ + if (readers != 0u) { + return true; + } else if (writers != 0u) { + return true; + } else if (exclusively_used) { + return true; + } else { + return false; + } +} + +void tri_mutex::wake() +{ + assert(!readers && !writers && !exclusively_used); + type_t type = type_t::none; + while (!waiters.empty()) { + auto& waiter = waiters.front(); + if (type == type_t::exclusive) { + break; + } if (type == type_t::none) { + type = waiter.type; + } else if (type != waiter.type) { + // to be woken in the next batch + break; + } + switch (type) { + case type_t::read: + ++readers; + break; + case type_t::write: + ++writers; + break; + case type_t::exclusive: + exclusively_used = true; + break; + default: + assert(0); + } + waiter.pr.set_value(); + waiters.pop_front(); + } +} diff --git a/src/crimson/common/tri_mutex.h b/src/crimson/common/tri_mutex.h new file mode 100644 index 000000000..0533f3539 --- /dev/null +++ b/src/crimson/common/tri_mutex.h @@ -0,0 +1,156 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <seastar/core/future.hh> +#include <seastar/core/circular_buffer.hh> + +class read_lock { +public: + seastar::future<> lock(); + void unlock(); +}; + +class write_lock { +public: + seastar::future<> lock(); + void unlock(); +}; + +class excl_lock { +public: + seastar::future<> lock(); + void unlock(); +}; + +// promote from read to excl +class excl_lock_from_read { +public: + seastar::future<> lock(); + void unlock(); +}; + +// promote from write to excl +class excl_lock_from_write { +public: + seastar::future<> lock(); + void unlock(); +}; + +// promote from excl to excl +class excl_lock_from_excl { +public: + seastar::future<> lock(); + void unlock(); +}; + +/// shared/exclusive mutual exclusion +/// +/// this lock design uses reader and writer is entirely and completely +/// independent of the conventional reader/writer lock usage. 
Here, what we +/// mean is that we can pipeline reads, and we can pipeline writes, but we +/// cannot allow a read while writes are in progress or a write while reads are +/// in progress. Any rmw operation is therefore exclusive. +/// +/// tri_mutex is based on seastar::shared_mutex, but instead of two kinds of +/// waiters, tri_mutex keeps track of three kinds of lock users: +/// - readers +/// - writers +/// - exclusive users +class tri_mutex : private read_lock, + write_lock, + excl_lock, + excl_lock_from_read, + excl_lock_from_write, + excl_lock_from_excl +{ +public: + tri_mutex() = default; + ~tri_mutex(); + + read_lock& for_read() { + return *this; + } + write_lock& for_write() { + return *this; + } + excl_lock& for_excl() { + return *this; + } + excl_lock_from_read& excl_from_read() { + return *this; + } + excl_lock_from_write& excl_from_write() { + return *this; + } + excl_lock_from_excl& excl_from_excl() { + return *this; + } + + // for shared readers + seastar::future<> lock_for_read(); + bool try_lock_for_read() noexcept; + void unlock_for_read(); + void promote_from_read(); + void demote_to_read(); + unsigned get_readers() const { + return readers; + } + + // for shared writers + seastar::future<> lock_for_write(bool greedy); + bool try_lock_for_write(bool greedy) noexcept; + void unlock_for_write(); + void promote_from_write(); + void demote_to_write(); + unsigned get_writers() const { + return writers; + } + + // for exclusive users + seastar::future<> lock_for_excl(); + bool try_lock_for_excl() noexcept; + void unlock_for_excl(); + bool is_excl_acquired() const { + return exclusively_used; + } + + bool is_acquired() const; + + /// pass the provided exception to any waiting waiters + template<typename Exception> + void abort(Exception ex) { + while (!waiters.empty()) { + auto& waiter = waiters.front(); + waiter.pr.set_exception(std::make_exception_ptr(ex)); + waiters.pop_front(); + } + } + +private: + void wake(); + unsigned readers = 0; + unsigned writers = 0; + bool exclusively_used = false; + enum class type_t : uint8_t { + read, + write, + exclusive, + none, + }; + struct waiter_t { + waiter_t(seastar::promise<>&& pr, type_t type) + : pr(std::move(pr)), type(type) + {} + seastar::promise<> pr; + type_t type; + }; + seastar::circular_buffer<waiter_t> waiters; + friend class read_lock; + friend class write_lock; + friend class excl_lock; + friend class excl_lock_from_read; + friend class excl_lock_from_write; + friend class excl_lock_from_excl; +}; diff --git a/src/crimson/common/type_helpers.h b/src/crimson/common/type_helpers.h new file mode 100644 index 000000000..4c606581f --- /dev/null +++ b/src/crimson/common/type_helpers.h @@ -0,0 +1,8 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "boost/intrusive_ptr.hpp" + +template<typename T> using Ref = boost::intrusive_ptr<T>; diff --git a/src/crimson/common/utility.h b/src/crimson/common/utility.h new file mode 100644 index 000000000..86b308155 --- /dev/null +++ b/src/crimson/common/utility.h @@ -0,0 +1,38 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab expandtab + +#pragma once + +#include <type_traits> + +namespace _impl { + template <class T> struct always_false : std::false_type {}; +}; + +template <class T> +void assert_moveable(T& t) { + // It's fine +} +template <class T> +void assert_moveable(const T& t) { + static_assert(_impl::always_false<T>::value, "unable to move-out from 
T"); +} + +namespace internal { + +template <typename Obj, typename Method, typename ArgTuple, size_t... I> +static auto _apply_method_to_tuple( + Obj &obj, Method method, ArgTuple &&tuple, + std::index_sequence<I...>) { + return (obj.*method)(std::get<I>(std::forward<ArgTuple>(tuple))...); +} + +} + +template <typename Obj, typename Method, typename ArgTuple> +auto apply_method_to_tuple(Obj &obj, Method method, ArgTuple &&tuple) { + constexpr auto tuple_size = std::tuple_size_v<ArgTuple>; + return internal::_apply_method_to_tuple( + obj, method, std::forward<ArgTuple>(tuple), + std::make_index_sequence<tuple_size>()); +} diff --git a/src/crimson/crush/CrushLocation.cc b/src/crimson/crush/CrushLocation.cc new file mode 100644 index 000000000..d45264000 --- /dev/null +++ b/src/crimson/crush/CrushLocation.cc @@ -0,0 +1,186 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#include "CrushLocation.h" + +#include <vector> +#include <boost/algorithm/string/trim.hpp> +#include <seastar/util/process.hh> +#include <seastar/util/later.hh> + +#include "crush/CrushWrapper.h" +#include "crimson/common/log.h" +#include "crimson/common/config_proxy.h" + +static seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_crush); +} + +using namespace crimson::common; + +namespace crimson::crush { + +seastar::future<> CrushLocation::update_from_conf() +{ + auto crush_location = local_conf().get_val<std::string>("crush_location"); + if (crush_location.length()) { + _parse(crush_location); + } + + return seastar::now(); +} + +void CrushLocation::_parse(const std::string& s) +{ + std::multimap<std::string, std::string> new_crush_location; + std::vector<std::string> lvec; + get_str_vec(s, ";, \t", lvec); + int r = CrushWrapper::parse_loc_multimap(lvec, &new_crush_location); + if (r < 0) { + logger().error("CrushWrapper::parse_loc_multimap error, keeping original\ + crush_location {}", *this); + return; + } + + loc.swap(new_crush_location); + logger().info("{}: crush_location is {}", __func__, *this); + return; +} + +seastar::future<> CrushLocation::update_from_hook() +{ + auto crush_location_hook = local_conf().get_val<std::string>("crush_location_hook"); + if (crush_location_hook.length() == 0) + return seastar::now(); + + return seastar::file_exists( + crush_location_hook + ).then([this] (bool result) { + if (!result) { + std::stringstream errmsg; + errmsg << "the user define crush location hook: " + << local_conf().get_val<std::string>("crush_location_hook") + << " is not exists."; + logger().error("{}", errmsg.str()); + throw std::runtime_error(errmsg.str()); + } + + return seastar::file_accessible( + local_conf().get_val<std::string>("crush_location_hook"), + seastar::access_flags::execute + ).then([this] (bool result) { + if (!result) { + std::stringstream errmsg; + errmsg << "the user define crush location hook: " + << local_conf().get_val<std::string>("crush_location_hook") + << " is not executable."; + logger().error("{}", errmsg.str()); + throw std::runtime_error(errmsg.str()); + } + + seastar::experimental::spawn_parameters params = { + .argv = { + local_conf().get_val<std::string>("crush_location_hook"), + "--cluster", + local_conf()->cluster, + "--id", + local_conf()->name.get_id(), + "--type", + local_conf()->name.get_type_str() + } + }; + return seastar::experimental::spawn_process( + local_conf().get_val<std::string>("crush_location_hook"), + std::move(params) + ).then([this] (auto process) { + auto stdout = process.stdout(); + 
return do_with( + std::move(process), + std::move(stdout), + [this](auto& process, auto& stdout) + { + return stdout.read().then([] (seastar::temporary_buffer<char> buf) { + auto out = std::string(buf.get(), buf.size()); + boost::algorithm::trim_if(out, boost::algorithm::is_any_of(" \n\r\t")); + return seastar::make_ready_future<std::string>(std::move(out)); + }).then([&process, this] (auto out) { + return process.wait( + ).then([out = std::move(out), this] (auto wstatus) { + auto* exit_signal = std::get_if<seastar::experimental::process::wait_signaled>(&wstatus); + if (exit_signal != nullptr) { + std::stringstream errmsg; + errmsg << "the user define crush location hook: " + << local_conf().get_val<std::string>("crush_location_hook") + << " terminated, terminated signal is " + << exit_signal->terminating_signal; + logger().error("{}", errmsg.str()); + throw std::runtime_error(errmsg.str()); + } + + auto* exit_status = std::get_if<seastar::experimental::process::wait_exited>(&wstatus); + if (exit_status->exit_code != 0) { + std::stringstream errmsg; + errmsg << "the user define crush location hook: " + << local_conf().get_val<std::string>("crush_location_hook") + << " execute failed, exit_code is " << exit_status->exit_code; + logger().error("{}", errmsg.str()); + throw std::runtime_error(errmsg.str()); + } else { + _parse(out); + } + return seastar::now(); + }); + }); + }); + }); + }); + }); +} + +seastar::future<> CrushLocation::init_on_startup() +{ + if (local_conf().get_val<std::string>("crush_location").length()) { + return update_from_conf(); + } + if (local_conf().get_val<std::string>("crush_location_hook").length()) { + return update_from_hook(); + } + + // start with a sane default + char hostname[HOST_NAME_MAX + 1]; + int r = gethostname(hostname, sizeof(hostname)); + if (r < 0) + strcpy(hostname, "unknown_host"); + // use short hostname + for (unsigned i=0; hostname[i]; ++i) { + if (hostname[i] == '.') { + hostname[i] = '\0'; + break; + } + } + + loc.clear(); + loc.insert(std::make_pair<std::string, std::string>("host", hostname)); + loc.insert(std::make_pair<std::string, std::string>("root", "default")); + return seastar::now(); +} + +std::multimap<std::string,std::string> CrushLocation::get_location() const +{ + return loc; +} + +std::ostream& operator<<(std::ostream& os, const CrushLocation& loc) +{ + bool first = true; + for (auto& [type, pos] : loc.get_location()) { + if (first) { + first = false; + } else { + os << ", "; + } + os << '"' << type << '=' << pos << '"'; + } + return os; +} + +} diff --git a/src/crimson/crush/CrushLocation.h b/src/crimson/crush/CrushLocation.h new file mode 100644 index 000000000..9dc954672 --- /dev/null +++ b/src/crimson/crush/CrushLocation.h @@ -0,0 +1,37 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#pragma once + +#include <iosfwd> +#include <map> +#include <string> +#if FMT_VERSION >= 90000 +#include <fmt/ostream.h> +#endif + +#include <seastar/core/seastar.hh> + +namespace crimson::crush { + +class CrushLocation { +public: + explicit CrushLocation() { + } + + seastar::future<> update_from_conf(); ///< refresh from config + seastar::future<> init_on_startup(); + seastar::future<> update_from_hook(); ///< call hook, if present + + std::multimap<std::string, std::string> get_location() const; + +private: + void _parse(const std::string& s); + std::multimap<std::string, std::string> loc; +}; + +std::ostream& operator<<(std::ostream& os, const CrushLocation& loc); +} + +#if FMT_VERSION 
>= 90000 +template <> struct fmt::formatter<crimson::crush::CrushLocation> : fmt::ostream_formatter {}; +#endif diff --git a/src/crimson/mgr/client.cc b/src/crimson/mgr/client.cc new file mode 100644 index 000000000..169915c9e --- /dev/null +++ b/src/crimson/mgr/client.cc @@ -0,0 +1,176 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "client.h" + +#include <seastar/core/sleep.hh> + +#include "crimson/common/log.h" +#include "crimson/net/Connection.h" +#include "crimson/net/Messenger.h" +#include "messages/MMgrConfigure.h" +#include "messages/MMgrMap.h" +#include "messages/MMgrOpen.h" + +namespace { + seastar::logger& logger() + { + return crimson::get_logger(ceph_subsys_mgrc); + } +} + +using crimson::common::local_conf; + +namespace crimson::mgr +{ + +Client::Client(crimson::net::Messenger& msgr, + WithStats& with_stats) + : msgr{msgr}, + with_stats{with_stats}, + report_timer{[this] {report();}} +{} + +seastar::future<> Client::start() +{ + return seastar::now(); +} + +seastar::future<> Client::stop() +{ + logger().info("{}", __func__); + report_timer.cancel(); + auto fut = gate.close(); + if (conn) { + conn->mark_down(); + } + return fut; +} + +std::optional<seastar::future<>> +Client::ms_dispatch(crimson::net::ConnectionRef conn, MessageRef m) +{ + bool dispatched = true; + gate.dispatch_in_background(__func__, *this, [this, conn, &m, &dispatched] { + switch(m->get_type()) { + case MSG_MGR_MAP: + return handle_mgr_map(conn, boost::static_pointer_cast<MMgrMap>(m)); + case MSG_MGR_CONFIGURE: + return handle_mgr_conf(conn, boost::static_pointer_cast<MMgrConfigure>(m)); + default: + dispatched = false; + return seastar::now(); + } + }); + return (dispatched ? std::make_optional(seastar::now()) : std::nullopt); +} + +void Client::ms_handle_connect( + crimson::net::ConnectionRef c, + seastar::shard_id prv_shard) +{ + ceph_assert_always(prv_shard == seastar::this_shard_id()); + gate.dispatch_in_background(__func__, *this, [this, c] { + if (conn == c) { + // ask for the mgrconfigure message + auto m = crimson::make_message<MMgrOpen>(); + m->daemon_name = local_conf()->name.get_id(); + local_conf().get_config_bl(0, &m->config_bl, &last_config_bl_version); + local_conf().get_defaults_bl(&m->config_defaults_bl); + return conn->send(std::move(m)); + } else { + return seastar::now(); + } + }); +} + +void Client::ms_handle_reset(crimson::net::ConnectionRef c, bool /* is_replace */) +{ + gate.dispatch_in_background(__func__, *this, [this, c] { + if (conn == c) { + report_timer.cancel(); + return reconnect(); + } else { + return seastar::now(); + } + }); +} + +seastar::future<> Client::reconnect() +{ + if (conn) { + conn->mark_down(); + conn = {}; + } + if (!mgrmap.get_available()) { + logger().warn("No active mgr available yet"); + return seastar::now(); + } + auto retry_interval = std::chrono::duration<double>( + local_conf().get_val<double>("mgr_connect_retry_interval")); + auto a_while = std::chrono::duration_cast<seastar::steady_clock_type::duration>( + retry_interval); + return seastar::sleep(a_while).then([this] { + auto peer = mgrmap.get_active_addrs().pick_addr(msgr.get_myaddr().get_type()); + if (peer == entity_addr_t{}) { + // crimson msgr only uses the first bound addr + logger().error("mgr.{} does not have an addr compatible with me", + mgrmap.get_active_name()); + return; + } + conn = msgr.connect(peer, CEPH_ENTITY_TYPE_MGR); + }); +} + +seastar::future<> Client::handle_mgr_map(crimson::net::ConnectionRef, + Ref<MMgrMap> 
m) +{ + mgrmap = m->get_map(); + if (!conn) { + return reconnect(); + } else if (conn->get_peer_addr() != + mgrmap.get_active_addrs().legacy_addr()) { + return reconnect(); + } else { + return seastar::now(); + } +} + +seastar::future<> Client::handle_mgr_conf(crimson::net::ConnectionRef, + Ref<MMgrConfigure> m) +{ + logger().info("{} {}", __func__, *m); + + auto report_period = std::chrono::seconds{m->stats_period}; + if (report_period.count()) { + if (report_timer.armed()) { + report_timer.rearm(report_timer.get_timeout(), report_period); + } else { + report_timer.arm_periodic(report_period); + } + } else { + report_timer.cancel(); + } + return seastar::now(); +} + +void Client::report() +{ + gate.dispatch_in_background(__func__, *this, [this] { + if (!conn) { + logger().warn("report: no conn available; raport skipped"); + return seastar::now(); + } + return with_stats.get_stats( + ).then([this](auto &&pg_stats) { + return conn->send(std::move(pg_stats)); + }); + }); +} + +void Client::print(std::ostream& out) const +{ + out << "mgrc "; +} + +} diff --git a/src/crimson/mgr/client.h b/src/crimson/mgr/client.h new file mode 100644 index 000000000..501949768 --- /dev/null +++ b/src/crimson/mgr/client.h @@ -0,0 +1,71 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <seastar/core/timer.hh> + +#include "crimson/common/gated.h" +#include "crimson/net/Dispatcher.h" +#include "crimson/net/Fwd.h" +#include "mon/MgrMap.h" + +template<typename Message> using Ref = boost::intrusive_ptr<Message>; +namespace crimson::net { + class Messenger; +} + +class MMgrMap; +class MMgrConfigure; + +namespace crimson::mgr +{ + +// implement WithStats if you want to report stats to mgr periodically +class WithStats { +public: + virtual seastar::future<MessageURef> get_stats() const = 0; + virtual ~WithStats() {} +}; + +class Client : public crimson::net::Dispatcher { +public: + Client(crimson::net::Messenger& msgr, + WithStats& with_stats); + seastar::future<> start(); + seastar::future<> stop(); + void report(); + +private: + std::optional<seastar::future<>> ms_dispatch( + crimson::net::ConnectionRef conn, Ref<Message> m) override; + void ms_handle_reset(crimson::net::ConnectionRef conn, bool is_replace) final; + void ms_handle_connect(crimson::net::ConnectionRef conn, seastar::shard_id) final; + seastar::future<> handle_mgr_map(crimson::net::ConnectionRef conn, + Ref<MMgrMap> m); + seastar::future<> handle_mgr_conf(crimson::net::ConnectionRef conn, + Ref<MMgrConfigure> m); + seastar::future<> reconnect(); + + void print(std::ostream&) const; + friend std::ostream& operator<<(std::ostream& out, const Client& client); +private: + MgrMap mgrmap; + crimson::net::Messenger& msgr; + WithStats& with_stats; + crimson::net::ConnectionRef conn; + seastar::timer<seastar::lowres_clock> report_timer; + crimson::common::Gated gate; + uint64_t last_config_bl_version = 0; +}; + +inline std::ostream& operator<<(std::ostream& out, const Client& client) { + client.print(out); + return out; +} + +} + +#if FMT_VERSION >= 90000 +template <> struct fmt::formatter<crimson::mgr::Client> : fmt::ostream_formatter {}; +#endif diff --git a/src/crimson/mon/MonClient.cc b/src/crimson/mon/MonClient.cc new file mode 100644 index 000000000..7be09915a --- /dev/null +++ b/src/crimson/mon/MonClient.cc @@ -0,0 +1,1162 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "MonClient.h" + +#include <random> 
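// (Editor's note on the mgr client header listed above, illustrative only: a
//  stats reporter implements crimson::mgr::WithStats and is handed to the mgr
//  Client; MPGStats is shown merely as a plausible message type:
//
//    struct StatsProvider : crimson::mgr::WithStats {
//      seastar::future<MessageURef> get_stats() const override {
//        auto m = crimson::make_message<MPGStats>();
//        // ... fill in per-PG and per-OSD statistics ...
//        return seastar::make_ready_future<MessageURef>(std::move(m));
//      }
//    };
//  )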
+#include <fmt/ranges.h> +#include <seastar/core/future-util.hh> +#include <seastar/core/lowres_clock.hh> +#include <seastar/core/shared_future.hh> +#include <seastar/util/log.hh> + +#include "auth/AuthClientHandler.h" +#include "auth/RotatingKeyRing.h" + +#include "common/hostname.h" + +#include "crimson/auth/KeyRing.h" +#include "crimson/common/config_proxy.h" +#include "crimson/common/log.h" +#include "crimson/common/logclient.h" +#include "crimson/net/Connection.h" +#include "crimson/net/Errors.h" +#include "crimson/net/Messenger.h" + +#include "messages/MAuth.h" +#include "messages/MAuthReply.h" +#include "messages/MConfig.h" +#include "messages/MLogAck.h" +#include "messages/MMonCommand.h" +#include "messages/MMonCommandAck.h" +#include "messages/MMonGetMap.h" +#include "messages/MMonGetVersion.h" +#include "messages/MMonGetVersionReply.h" +#include "messages/MMonMap.h" +#include "messages/MMonSubscribe.h" +#include "messages/MMonSubscribeAck.h" + +using std::string; +using std::tuple; +using std::vector; + +namespace { + seastar::logger& logger() + { + return crimson::get_logger(ceph_subsys_monc); + } +} + +namespace crimson::mon { + +using crimson::common::local_conf; + +class Connection : public seastar::enable_shared_from_this<Connection> { +public: + Connection(const AuthRegistry& auth_registry, + crimson::net::ConnectionRef conn, + KeyRing* keyring); + enum class auth_result_t { + success = 0, + failure, + canceled + }; + seastar::future<> handle_auth_reply(Ref<MAuthReply> m); + // v2 + seastar::future<auth_result_t> authenticate_v2(); + auth::AuthClient::auth_request_t + get_auth_request(const EntityName& name, + uint32_t want_keys); + using secret_t = string; + tuple<CryptoKey, secret_t, bufferlist> + handle_auth_reply_more(const ceph::buffer::list& bl); + int handle_auth_bad_method(uint32_t old_auth_method, + int result, + const std::vector<uint32_t>& allowed_methods, + const std::vector<uint32_t>& allowed_modes); + tuple<CryptoKey, secret_t, int> + handle_auth_done(uint64_t new_global_id, + const ceph::buffer::list& bl); + void close(); + bool is_my_peer(const entity_addr_t& addr) const; + AuthAuthorizer* get_authorizer(entity_type_t peer) const; + KeyStore& get_keys(); + seastar::future<> renew_tickets(); + seastar::future<> renew_rotating_keyring(); + + crimson::net::ConnectionRef get_conn(); + +private: + std::unique_ptr<AuthClientHandler> create_auth(crimson::auth::method_t, + uint64_t global_id, + const EntityName& name, + uint32_t want_keys); + enum class request_t { + rotating, + general, + }; + seastar::future<std::optional<auth_result_t>> do_auth_single(request_t); + seastar::future<auth_result_t> do_auth(request_t); + +private: + bool closed = false; + seastar::shared_promise<Ref<MAuthReply>> auth_reply; + // v2 + using clock_t = seastar::lowres_system_clock; + clock_t::time_point auth_start; + crimson::auth::method_t auth_method = 0; + std::optional<seastar::promise<auth_result_t>> auth_done; + const AuthRegistry& auth_registry; + crimson::net::ConnectionRef conn; + std::unique_ptr<AuthClientHandler> auth; + std::unique_ptr<RotatingKeyRing> rotating_keyring; + uint64_t global_id = 0; + clock_t::time_point last_rotating_renew_sent; +}; + +Connection::Connection(const AuthRegistry& auth_registry, + crimson::net::ConnectionRef conn, + KeyRing* keyring) + : auth_registry{auth_registry}, + conn{conn}, + rotating_keyring{ + std::make_unique<RotatingKeyRing>(nullptr, + CEPH_ENTITY_TYPE_OSD, + keyring)} +{} + +seastar::future<> 
Connection::handle_auth_reply(Ref<MAuthReply> m) +{ + logger().info("{}", __func__); + ceph_assert(m); + auth_reply.set_value(m); + auth_reply = {}; + return seastar::now(); +} + +seastar::future<> Connection::renew_tickets() +{ + if (auth->need_tickets()) { + logger().info("{}: retrieving new tickets", __func__); + return do_auth(request_t::general).then([](const auth_result_t r) { + if (r == auth_result_t::failure) { + logger().info("renew_tickets: ignoring failed auth reply"); + } + }); + } else { + logger().debug("{}: don't need new tickets", __func__); + return seastar::now(); + } +} + +seastar::future<> Connection::renew_rotating_keyring() +{ + auto now = clock_t::now(); + auto ttl = std::chrono::seconds{ + static_cast<long>(crimson::common::local_conf()->auth_service_ticket_ttl)}; + auto cutoff = utime_t{now - std::min(std::chrono::seconds{30}, ttl / 4)}; + if (!rotating_keyring->need_new_secrets(cutoff)) { + logger().debug("renew_rotating_keyring secrets are up-to-date " + "(they expire after {})", cutoff); + return seastar::now(); + } else { + logger().info("renew_rotating_keyring renewing rotating keys " + " (they expired before {})", cutoff); + } + if ((now > last_rotating_renew_sent) && + (now - last_rotating_renew_sent < std::chrono::seconds{1})) { + logger().info("renew_rotating_keyring called too often (last: {})", + utime_t{last_rotating_renew_sent}); + return seastar::now(); + } + last_rotating_renew_sent = now; + return do_auth(request_t::rotating).then([](const auth_result_t r) { + if (r == auth_result_t::failure) { + logger().info("renew_rotating_keyring: ignoring failed auth reply"); + } + }); +} + +AuthAuthorizer* Connection::get_authorizer(entity_type_t peer) const +{ + if (auth) { + return auth->build_authorizer(peer); + } else { + return nullptr; + } +} + +KeyStore& Connection::get_keys() { + return *rotating_keyring; +} + +std::unique_ptr<AuthClientHandler> +Connection::create_auth(crimson::auth::method_t protocol, + uint64_t global_id, + const EntityName& name, + uint32_t want_keys) +{ + static crimson::common::CephContext cct; + std::unique_ptr<AuthClientHandler> auth; + auth.reset(AuthClientHandler::create(&cct, + protocol, + rotating_keyring.get())); + if (!auth) { + logger().error("no handler for protocol {}", protocol); + throw std::system_error(make_error_code( + crimson::net::error::negotiation_failure)); + } + auth->init(name); + auth->set_want_keys(want_keys); + auth->set_global_id(global_id); + return auth; +} + +seastar::future<std::optional<Connection::auth_result_t>> +Connection::do_auth_single(Connection::request_t what) +{ + auto m = crimson::make_message<MAuth>(); + m->protocol = auth->get_protocol(); + auth->prepare_build_request(); + switch (what) { + case request_t::rotating: + auth->build_rotating_request(m->auth_payload); + break; + case request_t::general: + if (int ret = auth->build_request(m->auth_payload); ret) { + logger().error("missing/bad key for '{}'", local_conf()->name); + throw std::system_error(make_error_code( + crimson::net::error::negotiation_failure)); + } + break; + default: + assert(0); + } + logger().info("sending {}", *m); + return conn->send(std::move(m)).then([this] { + logger().info("waiting"); + return auth_reply.get_shared_future(); + }).then([this, life_extender=shared_from_this()] (Ref<MAuthReply> m) { + if (!m) { + ceph_assert(closed); + logger().info("do_auth_single: connection closed"); + return std::make_optional(auth_result_t::canceled); + } + logger().info("do_auth_single: {} returns {}: {}", + *conn, *m, 
m->result); + auto p = m->result_bl.cbegin(); + auto ret = auth->handle_response(m->result, p, + nullptr, nullptr); + std::optional<Connection::auth_result_t> auth_result; + switch (ret) { + case -EAGAIN: + auth_result = std::nullopt; + break; + case 0: + auth_result = auth_result_t::success; + break; + default: + auth_result = auth_result_t::failure; + logger().error( + "do_auth_single: got error {} on mon {}", + ret, conn->get_peer_addr()); + break; + } + return auth_result; + }); +} + +seastar::future<Connection::auth_result_t> +Connection::do_auth(Connection::request_t what) { + return seastar::repeat_until_value( + [this, life_extender=shared_from_this(), what]() { + return do_auth_single(what); + }); +} + +seastar::future<Connection::auth_result_t> Connection::authenticate_v2() +{ + auth_start = seastar::lowres_system_clock::now(); + return conn->send(crimson::make_message<MMonGetMap>()).then([this] { + auth_done.emplace(); + return auth_done->get_future(); + }); +} + +auth::AuthClient::auth_request_t +Connection::get_auth_request(const EntityName& entity_name, + uint32_t want_keys) +{ + // choose method + auth_method = [&] { + std::vector<crimson::auth::method_t> methods; + auth_registry.get_supported_methods(conn->get_peer_type(), &methods); + if (methods.empty()) { + logger().info("get_auth_request no methods is supported"); + throw crimson::auth::error("no methods is supported"); + } + return methods.front(); + }(); + + std::vector<uint32_t> modes; + auth_registry.get_supported_modes(conn->get_peer_type(), auth_method, + &modes); + logger().info("method {} preferred_modes {}", auth_method, modes); + if (modes.empty()) { + throw crimson::auth::error("no modes is supported"); + } + auth = create_auth(auth_method, global_id, entity_name, want_keys); + + using ceph::encode; + bufferlist bl; + // initial request includes some boilerplate... 
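  // (Editor's note, describing the three encodes below: the payload begins
  //  with a one-byte framing mode (AUTH_MODE_MON), then the EntityName we
  //  claim to be, then our global_id, which is still 0 on a first connection
  //  and is assigned by the monitor in handle_auth_done().)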
+ encode((char)AUTH_MODE_MON, bl); + encode(entity_name, bl); + encode(global_id, bl); + // and (maybe) some method-specific initial payload + auth->build_initial_request(&bl); + return {auth_method, modes, bl}; +} + +tuple<CryptoKey, Connection::secret_t, bufferlist> +Connection::handle_auth_reply_more(const ceph::buffer::list& payload) +{ + CryptoKey session_key; + secret_t connection_secret; + bufferlist reply; + auto p = payload.cbegin(); + int r = auth->handle_response(0, p, &session_key, &connection_secret); + if (r == -EAGAIN) { + auth->prepare_build_request(); + auth->build_request(reply); + logger().info(" responding with {} bytes", reply.length()); + return {session_key, connection_secret, reply}; + } else if (r < 0) { + logger().error(" handle_response returned {}", r); + throw crimson::auth::error("unable to build auth"); + } else { + logger().info("authenticated!"); + std::terminate(); + } +} + +tuple<CryptoKey, Connection::secret_t, int> +Connection::handle_auth_done(uint64_t new_global_id, + const ceph::buffer::list& payload) +{ + global_id = new_global_id; + auth->set_global_id(global_id); + auto p = payload.begin(); + CryptoKey session_key; + secret_t connection_secret; + int r = auth->handle_response(0, p, &session_key, &connection_secret); + conn->set_last_keepalive_ack(auth_start); + if (auth_done) { + auth_done->set_value(auth_result_t::success); + auth_done.reset(); + } + return {session_key, connection_secret, r}; +} + +int Connection::handle_auth_bad_method(uint32_t old_auth_method, + int result, + const std::vector<uint32_t>& allowed_methods, + const std::vector<uint32_t>& allowed_modes) +{ + logger().info("old_auth_method {} result {} allowed_methods {}", + old_auth_method, cpp_strerror(result), allowed_methods); + std::vector<uint32_t> auth_supported; + auth_registry.get_supported_methods(conn->get_peer_type(), &auth_supported); + auto p = std::find(auth_supported.begin(), auth_supported.end(), + old_auth_method); + assert(p != auth_supported.end()); + p = std::find_first_of(std::next(p), auth_supported.end(), + allowed_methods.begin(), allowed_methods.end()); + if (p == auth_supported.end()) { + logger().error("server allowed_methods {} but i only support {}", + allowed_methods, auth_supported); + assert(auth_done); + auth_done->set_exception(std::system_error(make_error_code( + crimson::net::error::negotiation_failure))); + return -EACCES; + } + auth_method = *p; + logger().info("will try {} next", auth_method); + return 0; +} + +void Connection::close() +{ + logger().info("{}", __func__); + auth_reply.set_value(Ref<MAuthReply>(nullptr)); + auth_reply = {}; + if (auth_done) { + auth_done->set_value(auth_result_t::canceled); + auth_done.reset(); + } + if (conn && !std::exchange(closed, true)) { + conn->mark_down(); + } +} + +bool Connection::is_my_peer(const entity_addr_t& addr) const +{ + ceph_assert(conn); + return conn->get_peer_addr() == addr; +} + +crimson::net::ConnectionRef Connection::get_conn() { + return conn; +} + +Client::mon_command_t::mon_command_t(MURef<MMonCommand> req) + : req(std::move(req)) +{} + +Client::Client(crimson::net::Messenger& messenger, + crimson::common::AuthHandler& auth_handler) + // currently, crimson is OSD-only + : want_keys{CEPH_ENTITY_TYPE_MON | + CEPH_ENTITY_TYPE_OSD | + CEPH_ENTITY_TYPE_MGR}, + timer{[this] { tick(); }}, + msgr{messenger}, + log_client{nullptr}, + auth_registry{&cct}, + auth_handler{auth_handler} +{} + +Client::Client(Client&&) = default; +Client::~Client() = default; + +seastar::future<> Client::start() { 
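  // (Editor's note, summarizing the chain below: load the keyring, build an
  //  initial monmap from configuration, authenticate with a monitor, then arm
  //  the periodic timer that drives pings, ticket renewal and log flushing.)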
+ entity_name = crimson::common::local_conf()->name; + auth_registry.refresh_config(); + return load_keyring().then([this] { + return monmap.build_initial(crimson::common::local_conf(), false); + }).then([this] { + return authenticate(); + }).then([this] { + auto interval = + std::chrono::duration_cast<seastar::lowres_clock::duration>( + std::chrono::duration<double>( + local_conf().get_val<double>("mon_client_ping_interval"))); + timer.arm_periodic(interval); + }); +} + +seastar::future<> Client::load_keyring() +{ + if (!auth_registry.is_supported_method(msgr.get_mytype(), CEPH_AUTH_CEPHX)) { + return seastar::now(); + } else { + return crimson::auth::load_from_keyring(&keyring).then([](KeyRing* keyring) { + return crimson::auth::load_from_keyfile(keyring); + }).then([](KeyRing* keyring) { + return crimson::auth::load_from_key(keyring); + }).then([](KeyRing*) { + return seastar::now(); + }); + } +} + +void Client::tick() +{ + gate.dispatch_in_background(__func__, *this, [this] { + if (active_con) { + return seastar::when_all_succeed(wait_for_send_log(), + active_con->get_conn()->send_keepalive(), + active_con->renew_tickets(), + active_con->renew_rotating_keyring()).discard_result(); + } else { + assert(is_hunting()); + logger().info("{} continuing the hunt", __func__); + return authenticate(); + } + }); +} + +seastar::future<> Client::wait_for_send_log() { + utime_t now = ceph_clock_now(); + if (now > last_send_log + cct._conf->mon_client_log_interval) { + last_send_log = now; + return send_log(log_flushing_t::NO_FLUSH); + } + return seastar::now(); +} + +seastar::future<> Client::send_log(log_flushing_t flush_flag) { + if (log_client) { + if (auto lm = log_client->get_mon_log_message(flush_flag); lm) { + return send_message(std::move(lm)); + } + more_log_pending = log_client->are_pending(); + } + return seastar::now(); +} + +bool Client::is_hunting() const { + return !active_con; +} + +std::optional<seastar::future<>> +Client::ms_dispatch(crimson::net::ConnectionRef conn, MessageRef m) +{ + bool dispatched = true; + gate.dispatch_in_background(__func__, *this, [this, conn, &m, &dispatched] { + // we only care about these message types + switch (m->get_type()) { + case CEPH_MSG_MON_MAP: + return handle_monmap(*conn, boost::static_pointer_cast<MMonMap>(m)); + case CEPH_MSG_AUTH_REPLY: + return handle_auth_reply( + *conn, boost::static_pointer_cast<MAuthReply>(m)); + case CEPH_MSG_MON_SUBSCRIBE_ACK: + return handle_subscribe_ack( + boost::static_pointer_cast<MMonSubscribeAck>(m)); + case CEPH_MSG_MON_GET_VERSION_REPLY: + return handle_get_version_reply( + boost::static_pointer_cast<MMonGetVersionReply>(m)); + case MSG_MON_COMMAND_ACK: + return handle_mon_command_ack( + boost::static_pointer_cast<MMonCommandAck>(m)); + case MSG_LOGACK: + return handle_log_ack( + boost::static_pointer_cast<MLogAck>(m)); + case MSG_CONFIG: + return handle_config( + boost::static_pointer_cast<MConfig>(m)); + default: + dispatched = false; + return seastar::now(); + } + }); + return (dispatched ? 
std::make_optional(seastar::now()) : std::nullopt); +} + +void Client::ms_handle_reset(crimson::net::ConnectionRef conn, bool /* is_replace */) +{ + gate.dispatch_in_background(__func__, *this, [this, conn] { + auto found = std::find_if(pending_conns.begin(), pending_conns.end(), + [peer_addr = conn->get_peer_addr()](auto& mc) { + return mc->is_my_peer(peer_addr); + }); + if (found != pending_conns.end()) { + logger().warn("pending conn reset by {}", conn->get_peer_addr()); + (*found)->close(); + pending_conns.erase(found); + return seastar::now(); + } else if (active_con && active_con->is_my_peer(conn->get_peer_addr())) { + logger().warn("active conn reset {}", conn->get_peer_addr()); + return reopen_session(-1).then([this](bool opened) { + if (opened) { + return on_session_opened(); + } else { + return seastar::now(); + } + }); + } else { + return seastar::now(); + } + }); +} + +std::pair<std::vector<uint32_t>, std::vector<uint32_t>> +Client::get_supported_auth_methods(int peer_type) +{ + std::vector<uint32_t> methods; + std::vector<uint32_t> modes; + auth_registry.get_supported_methods(peer_type, &methods, &modes); + return {methods, modes}; +} + +uint32_t Client::pick_con_mode(int peer_type, + uint32_t auth_method, + const std::vector<uint32_t>& preferred_modes) +{ + return auth_registry.pick_mode(peer_type, auth_method, preferred_modes); +} + +AuthAuthorizeHandler* Client::get_auth_authorize_handler(int peer_type, + int auth_method) +{ + return auth_registry.get_handler(peer_type, auth_method); +} + + +int Client::handle_auth_request(crimson::net::Connection &conn, + AuthConnectionMeta &auth_meta, + bool more, + uint32_t auth_method, + const ceph::bufferlist& payload, + uint64_t *p_peer_global_id, + ceph::bufferlist *reply) +{ + if (payload.length() == 0) { + return -EACCES; + } + auth_meta.auth_mode = payload[0]; + if (auth_meta.auth_mode < AUTH_MODE_AUTHORIZER || + auth_meta.auth_mode > AUTH_MODE_AUTHORIZER_MAX) { + return -EACCES; + } + AuthAuthorizeHandler* ah = get_auth_authorize_handler(conn.get_peer_type(), + auth_method); + if (!ah) { + logger().error("no AuthAuthorizeHandler found for auth method: {}", + auth_method); + return -EOPNOTSUPP; + } + auto authorizer_challenge = &auth_meta.authorizer_challenge; + if (auth_meta.skip_authorizer_challenge) { + logger().info("skipping challenge on {}", conn); + authorizer_challenge = nullptr; + } + if (!active_con) { + logger().info("auth request during inactivity period"); + // let's instruct the client to come back later + return -EBUSY; + } + bool was_challenge = (bool)auth_meta.authorizer_challenge; + EntityName name; + AuthCapsInfo caps_info; + bool is_valid = ah->verify_authorizer( + &cct, + active_con->get_keys(), + payload, + auth_meta.get_connection_secret_length(), + reply, + &name, + p_peer_global_id, + &caps_info, + &auth_meta.session_key, + &auth_meta.connection_secret, + authorizer_challenge); + if (is_valid) { + auth_handler.handle_authentication(name, caps_info); + return 1; + } + if (!more && !was_challenge && auth_meta.authorizer_challenge) { + logger().info("added challenge on {}", conn); + return 0; + } else { + logger().info("bad authorizer on {}", conn); + return -EACCES; + } +} + +auth::AuthClient::auth_request_t +Client::get_auth_request(crimson::net::Connection &conn, + AuthConnectionMeta &auth_meta) +{ + logger().info("get_auth_request(conn={}, auth_method={})", + conn, auth_meta.auth_method); + // connection to mon? 
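+  // For monitor peers the request is built by the matching pending
+  // Connection; for other peer types an authorizer is built from the
+  // established monitor session (active_con) instead.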
+ if (conn.get_peer_type() == CEPH_ENTITY_TYPE_MON) { + auto found = std::find_if(pending_conns.begin(), pending_conns.end(), + [peer_addr = conn.get_peer_addr()](auto& mc) { + return mc->is_my_peer(peer_addr); + }); + if (found == pending_conns.end()) { + throw crimson::auth::error{"unknown connection"}; + } + return (*found)->get_auth_request(entity_name, want_keys); + } else { + // generate authorizer + if (!active_con) { + logger().error(" but no auth handler is set up"); + throw crimson::auth::error("no auth available"); + } + auto authorizer = active_con->get_authorizer(conn.get_peer_type()); + if (!authorizer) { + logger().error("failed to build_authorizer for type {}", + ceph_entity_type_name(conn.get_peer_type())); + throw crimson::auth::error("unable to build auth"); + } + auth_meta.authorizer.reset(authorizer); + auth_meta.auth_method = authorizer->protocol; + vector<uint32_t> modes; + auth_registry.get_supported_modes(conn.get_peer_type(), + auth_meta.auth_method, + &modes); + return {authorizer->protocol, modes, authorizer->bl}; + } +} + +ceph::bufferlist Client::handle_auth_reply_more(crimson::net::Connection &conn, + AuthConnectionMeta &auth_meta, + const bufferlist& bl) +{ + if (conn.get_peer_type() == CEPH_ENTITY_TYPE_MON) { + auto found = std::find_if(pending_conns.begin(), pending_conns.end(), + [peer_addr = conn.get_peer_addr()](auto& mc) { + return mc->is_my_peer(peer_addr); + }); + if (found == pending_conns.end()) { + throw crimson::auth::error{"unknown connection"}; + } + bufferlist reply; + tie(auth_meta.session_key, auth_meta.connection_secret, reply) = + (*found)->handle_auth_reply_more(bl); + return reply; + } else { + // authorizer challenges + if (!active_con || !auth_meta.authorizer) { + logger().error("no authorizer?"); + throw crimson::auth::error("no auth available"); + } + auth_meta.authorizer->add_challenge(&cct, bl); + return auth_meta.authorizer->bl; + } +} + +int Client::handle_auth_done(crimson::net::Connection &conn, + AuthConnectionMeta &auth_meta, + uint64_t global_id, + uint32_t /*con_mode*/, + const bufferlist& bl) +{ + if (conn.get_peer_type() == CEPH_ENTITY_TYPE_MON) { + auto found = std::find_if(pending_conns.begin(), pending_conns.end(), + [peer_addr = conn.get_peer_addr()](auto& mc) { + return mc->is_my_peer(peer_addr); + }); + if (found == pending_conns.end()) { + return -ENOENT; + } + int r = 0; + tie(auth_meta.session_key, auth_meta.connection_secret, r) = + (*found)->handle_auth_done(global_id, bl); + return r; + } else { + // verify authorizer reply + auto p = bl.begin(); + if (!auth_meta.authorizer->verify_reply(p, &auth_meta.connection_secret)) { + logger().error("failed verifying authorizer reply"); + return -EACCES; + } + auth_meta.session_key = auth_meta.authorizer->session_key; + return 0; + } +} + + // Handle server's indication that the previous auth attempt failed +int Client::handle_auth_bad_method(crimson::net::Connection &conn, + AuthConnectionMeta &auth_meta, + uint32_t old_auth_method, + int result, + const std::vector<uint32_t>& allowed_methods, + const std::vector<uint32_t>& allowed_modes) +{ + if (conn.get_peer_type() == CEPH_ENTITY_TYPE_MON) { + auto found = std::find_if(pending_conns.begin(), pending_conns.end(), + [peer_addr = conn.get_peer_addr()](auto& mc) { + return mc->is_my_peer(peer_addr); + }); + if (found != pending_conns.end()) { + return (*found)->handle_auth_bad_method( + old_auth_method, result, + allowed_methods, allowed_modes); + } else { + return -ENOENT; + } + } else { + // huh... 
+ logger().info("hmm, they didn't like {} result {}", + old_auth_method, cpp_strerror(result)); + return -EACCES; + } +} + +seastar::future<> Client::handle_monmap(crimson::net::Connection &conn, + Ref<MMonMap> m) +{ + monmap.decode(m->monmapbl); + const auto peer_addr = conn.get_peer_addr(); + auto cur_mon = monmap.get_name(peer_addr); + logger().info("got monmap {}, mon.{}, is now rank {}", + monmap.epoch, cur_mon, monmap.get_rank(cur_mon)); + sub.got("monmap", monmap.get_epoch()); + + if (monmap.get_addr_name(peer_addr, cur_mon)) { + if (active_con) { + logger().info("handle_monmap: renewing tickets"); + return seastar::when_all_succeed( + active_con->renew_tickets(), + active_con->renew_rotating_keyring()).then_unpack([] { + logger().info("handle_mon_map: renewed tickets"); + }); + } else { + return seastar::now(); + } + } else { + logger().warn("mon.{} went away", cur_mon); + return reopen_session(-1).then([this](bool opened) { + if (opened) { + return on_session_opened(); + } else { + return seastar::now(); + } + }); + } +} + +seastar::future<> Client::handle_auth_reply(crimson::net::Connection &conn, + Ref<MAuthReply> m) +{ + logger().info("handle_auth_reply {} returns {}: {}", + conn, *m, m->result); + auto found = std::find_if(pending_conns.begin(), pending_conns.end(), + [peer_addr = conn.get_peer_addr()](auto& mc) { + return mc->is_my_peer(peer_addr); + }); + if (found != pending_conns.end()) { + return (*found)->handle_auth_reply(m); + } else if (active_con) { + return active_con->handle_auth_reply(m).then([this] { + return seastar::when_all_succeed( + active_con->renew_rotating_keyring(), + active_con->renew_tickets()).discard_result(); + }); + } else { + logger().error("unknown auth reply from {}", conn.get_peer_addr()); + return seastar::now(); + } +} + +seastar::future<> Client::handle_subscribe_ack(Ref<MMonSubscribeAck> m) +{ + sub.acked(m->interval); + return seastar::now(); +} + +Client::get_version_t Client::get_version(const std::string& map) +{ + auto m = crimson::make_message<MMonGetVersion>(); + auto tid = ++last_version_req_id; + m->handle = tid; + m->what = map; + auto& req = version_reqs[tid]; + return send_message(std::move(m)).then([&req] { + return req.get_future(); + }); +} + +seastar::future<> +Client::handle_get_version_reply(Ref<MMonGetVersionReply> m) +{ + if (auto found = version_reqs.find(m->handle); + found != version_reqs.end()) { + auto& result = found->second; + logger().trace("{}: {} returns {}", + __func__, m->handle, m->version); + result.set_value(std::make_tuple(m->version, m->oldest_version)); + version_reqs.erase(found); + } else { + logger().warn("{}: version request with handle {} not found", + __func__, m->handle); + } + return seastar::now(); +} + +seastar::future<> Client::handle_mon_command_ack(Ref<MMonCommandAck> m) +{ + const auto tid = m->get_tid(); + if (auto found = std::find_if(mon_commands.begin(), + mon_commands.end(), + [tid](auto& cmd) { + return cmd.req->get_tid() == tid; + }); + found != mon_commands.end()) { + auto& command = *found; + logger().trace("{} {}", __func__, tid); + command.result.set_value(std::make_tuple(m->r, m->rs, std::move(m->get_data()))); + mon_commands.erase(found); + } else { + logger().warn("{} {} not found", __func__, tid); + } + return seastar::now(); +} + +seastar::future<> Client::handle_log_ack(Ref<MLogAck> m) +{ + if (log_client) { + return log_client->handle_log_ack(m).then([this] { + if (more_log_pending) { + return send_log(log_flushing_t::NO_FLUSH); + } else { + return seastar::now(); + } + 
}); + } + return seastar::now(); +} + +seastar::future<> Client::handle_config(Ref<MConfig> m) +{ + return crimson::common::local_conf().set_mon_vals(m->config).then([this] { + if (config_updated) { + config_updated->set_value(); + } + }); +} + +std::vector<unsigned> Client::get_random_mons(unsigned n) const +{ + uint16_t min_priority = std::numeric_limits<uint16_t>::max(); + for (const auto& m : monmap.mon_info) { + if (m.second.priority < min_priority) { + min_priority = m.second.priority; + } + } + vector<unsigned> ranks; + for (auto [name, info] : monmap.mon_info) { + if (info.priority == min_priority) { + ranks.push_back(monmap.get_rank(name)); + } + } + std::random_device rd; + std::default_random_engine rng{rd()}; + std::shuffle(ranks.begin(), ranks.end(), rng); + if (n == 0 || n > ranks.size()) { + return ranks; + } else { + return {ranks.begin(), ranks.begin() + n}; + } +} + +seastar::future<> Client::authenticate() +{ + return reopen_session(-1).then([this](bool opened) { + if (opened) { + return on_session_opened(); + } else { + return seastar::now(); + } + }); +} + +seastar::future<> Client::stop() +{ + logger().info("{}", __func__); + auto fut = gate.close(); + timer.cancel(); + ready_to_send = false; + for (auto& pending_con : pending_conns) { + pending_con->close(); + } + if (active_con) { + active_con->close(); + } + return fut; +} + +static entity_addr_t choose_client_addr( + const entity_addrvec_t& my_addrs, + const entity_addrvec_t& client_addrs) +{ + // here is where we decide which of the addrs to connect to. always prefer + // the first one, if we support it. + for (const auto& a : client_addrs.v) { + if (a.is_msgr2()) { + // FIXME: for ipv4 vs ipv6, check whether local host can handle ipv6 before + // trying it? for now, just pick whichever is listed first. 
+ return a; + } + } + return entity_addr_t{}; +} + +seastar::future<bool> Client::reopen_session(int rank) +{ + logger().info("{} to mon.{}", __func__, rank); + ready_to_send = false; + if (active_con) { + active_con->close(); + active_con = nullptr; + ceph_assert(pending_conns.empty()); + } else { + for (auto& pending_con : pending_conns) { + pending_con->close(); + } + pending_conns.clear(); + } + vector<unsigned> mons; + if (rank >= 0) { + mons.push_back(rank); + } else { + const auto parallel = + crimson::common::local_conf().get_val<uint64_t>("mon_client_hunt_parallel"); + mons = get_random_mons(parallel); + } + pending_conns.reserve(mons.size()); + return seastar::parallel_for_each(mons, [this](auto rank) { + auto peer = choose_client_addr(msgr.get_myaddrs(), + monmap.get_addrs(rank)); + if (peer == entity_addr_t{}) { + // crimson msgr only uses the first bound addr + logger().warn("mon.{} does not have an addr compatible with me", rank); + return seastar::now(); + } + logger().info("connecting to mon.{}", rank); + auto conn = msgr.connect(peer, CEPH_ENTITY_TYPE_MON); + auto& mc = pending_conns.emplace_back( + seastar::make_shared<Connection>(auth_registry, conn, &keyring)); + assert(conn->get_peer_addr().is_msgr2()); + return mc->authenticate_v2().then([peer, this](auto result) { + if (result == Connection::auth_result_t::success) { + _finish_auth(peer); + } + logger().debug("reopen_session mon connection attempts complete"); + }).handle_exception([](auto ep) { + logger().error("mon connections failed with ep {}", ep); + return seastar::make_exception_future(ep); + }); + }).then([this] { + if (active_con) { + return true; + } else { + logger().warn("cannot establish the active_con with any mon"); + return false; + } + }); +} + +void Client::_finish_auth(const entity_addr_t& peer) +{ + if (!is_hunting()) { + return; + } + logger().info("found mon.{}", monmap.get_name(peer)); + + auto found = std::find_if( + pending_conns.begin(), pending_conns.end(), + [peer](auto& conn) { + return conn->is_my_peer(peer); + }); + if (found == pending_conns.end()) { + // Happens if another connection has won the race + ceph_assert(active_con && pending_conns.empty()); + logger().info("no pending connection for mon.{}, peer {}", + monmap.get_name(peer), peer); + return; + } + + ceph_assert(!active_con && !pending_conns.empty()); + // It's too early to toggle the `ready_to_send` flag. It will + // be set atfer finishing the MAuth exchange and draining out + // the `pending_messages` queue. 
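+  // Promote the winning connection and tear down the remaining in-flight
+  // hunting attempts.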
+ active_con = std::move(*found); + *found = nullptr; + for (auto& conn : pending_conns) { + if (conn) { + conn->close(); + } + } + pending_conns.clear(); +} + +Client::command_result_t +Client::run_command(std::string&& cmd, + bufferlist&& bl) +{ + auto m = crimson::make_message<MMonCommand>(monmap.fsid); + auto tid = ++last_mon_command_id; + m->set_tid(tid); + m->cmd = {std::move(cmd)}; + m->set_data(std::move(bl)); + auto& command = mon_commands.emplace_back(crimson::make_message<MMonCommand>(*m)); + return send_message(std::move(m)).then([&result=command.result] { + return result.get_future(); + }); +} + +seastar::future<> Client::send_message(MessageURef m) +{ + if (active_con && ready_to_send) { + assert(pending_messages.empty()); + return active_con->get_conn()->send(std::move(m)); + } else { + auto& delayed = pending_messages.emplace_back(std::move(m)); + return delayed.pr.get_future(); + } +} + +seastar::future<> Client::on_session_opened() +{ + return active_con->renew_rotating_keyring().then([this] { + if (!active_con) { + // the connection can be closed even in the middle of the opening sequence + logger().info("on_session_opened {}: connection closed", __LINE__); + return seastar::now(); + } + for (auto& m : pending_messages) { + (void) active_con->get_conn()->send(std::move(m.msg)); + m.pr.set_value(); + } + pending_messages.clear(); + ready_to_send = true; + return sub.reload() ? renew_subs() : seastar::now(); + }).then([this] { + if (!active_con) { + logger().info("on_session_opened {}: connection closed", __LINE__); + return seastar::now(); + } + return seastar::parallel_for_each(mon_commands, + [this](auto &command) { + return send_message(crimson::make_message<MMonCommand>(*command.req)); + }); + }); +} + +bool Client::sub_want(const std::string& what, version_t start, unsigned flags) +{ + return sub.want(what, start, flags); +} + +void Client::sub_got(const std::string& what, version_t have) +{ + sub.got(what, have); +} + +void Client::sub_unwant(const std::string& what) +{ + sub.unwant(what); +} + +bool Client::sub_want_increment(const std::string& what, + version_t start, + unsigned flags) +{ + return sub.inc_want(what, start, flags); +} + +seastar::future<> Client::renew_subs() +{ + if (!sub.have_new()) { + logger().warn("{} - empty", __func__); + return seastar::now(); + } + logger().trace("{}", __func__); + + auto m = crimson::make_message<MMonSubscribe>(); + m->what = sub.get_subs(); + m->hostname = ceph_get_short_hostname(); + return send_message(std::move(m)).then([this] { + sub.renewed(); + }); +} + +seastar::future<> Client::wait_for_config() +{ + assert(!config_updated); + config_updated = seastar::promise<>(); + return config_updated->get_future(); +} + +void Client::print(std::ostream& out) const +{ + out << "mon." 
<< entity_name; +} + +} // namespace crimson::mon diff --git a/src/crimson/mon/MonClient.h b/src/crimson/mon/MonClient.h new file mode 100644 index 000000000..1228ecd0b --- /dev/null +++ b/src/crimson/mon/MonClient.h @@ -0,0 +1,218 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <memory> +#include <vector> + +#include <seastar/core/future.hh> +#include <seastar/core/gate.hh> +#include <seastar/core/lowres_clock.hh> +#include <seastar/core/shared_ptr.hh> +#include <seastar/core/timer.hh> + +#include "auth/AuthRegistry.h" +#include "auth/KeyRing.h" +#include "common/ceph_context.h" + +#include "crimson/auth/AuthClient.h" +#include "crimson/auth/AuthServer.h" +#include "crimson/common/auth_handler.h" +#include "crimson/common/gated.h" +#include "crimson/net/Dispatcher.h" +#include "crimson/net/Fwd.h" + +#include "mon/MonMap.h" + +#include "mon/MonSub.h" + +template<typename Message> using Ref = boost::intrusive_ptr<Message>; +namespace crimson::net { + class Messenger; +} + +class LogClient; + +struct AuthAuthorizeHandler; +class MAuthReply; +struct MMonMap; +struct MMonSubscribeAck; +struct MMonGetVersionReply; +struct MMonCommand; +struct MMonCommandAck; +struct MLogAck; +struct MConfig; + +enum class log_flushing_t; + +namespace crimson::mon { + +class Connection; + +class Client : public crimson::net::Dispatcher, + public crimson::auth::AuthClient, + public crimson::auth::AuthServer +{ + EntityName entity_name; + KeyRing keyring; + const uint32_t want_keys; + + MonMap monmap; + bool ready_to_send = false; + seastar::shared_ptr<Connection> active_con; + std::vector<seastar::shared_ptr<Connection>> pending_conns; + seastar::timer<seastar::lowres_clock> timer; + + crimson::net::Messenger& msgr; + + LogClient *log_client; + bool more_log_pending = false; + utime_t last_send_log; + + seastar::future<> send_log(log_flushing_t flush_flag); + seastar::future<> wait_for_send_log(); + + // commands + using get_version_t = seastar::future<std::tuple<version_t, version_t>>; + + ceph_tid_t last_version_req_id = 0; + std::map<ceph_tid_t, typename get_version_t::promise_type> version_reqs; + + ceph_tid_t last_mon_command_id = 0; + using command_result_t = + seastar::future<std::tuple<std::int32_t, std::string, ceph::bufferlist>>; + struct mon_command_t { + MURef<MMonCommand> req; + typename command_result_t::promise_type result; + mon_command_t(MURef<MMonCommand> req); + }; + std::vector<mon_command_t> mon_commands; + + MonSub sub; + +public: + Client(crimson::net::Messenger&, crimson::common::AuthHandler&); + Client(Client&&); + ~Client(); + seastar::future<> start(); + seastar::future<> stop(); + + void set_log_client(LogClient *clog) { + log_client = clog; + } + + const uuid_d& get_fsid() const { + return monmap.fsid; + } + get_version_t get_version(const std::string& map); + command_result_t run_command(std::string&& cmd, + bufferlist&& bl); + seastar::future<> send_message(MessageURef); + bool sub_want(const std::string& what, version_t start, unsigned flags); + void sub_got(const std::string& what, version_t have); + void sub_unwant(const std::string& what); + bool sub_want_increment(const std::string& what, version_t start, unsigned flags); + seastar::future<> renew_subs(); + seastar::future<> wait_for_config(); + + void print(std::ostream&) const; +private: + // AuthServer methods + std::pair<std::vector<uint32_t>, std::vector<uint32_t>> + get_supported_auth_methods(int peer_type) final; + uint32_t 
pick_con_mode(int peer_type, + uint32_t auth_method, + const std::vector<uint32_t>& preferred_modes) final; + AuthAuthorizeHandler* get_auth_authorize_handler(int peer_type, + int auth_method) final; + int handle_auth_request(crimson::net::Connection &conn, + AuthConnectionMeta &auth_meta, + bool more, + uint32_t auth_method, + const ceph::bufferlist& payload, + uint64_t *p_peer_global_id, + ceph::bufferlist *reply) final; + + crimson::common::CephContext cct; // for auth_registry + AuthRegistry auth_registry; + crimson::common::AuthHandler& auth_handler; + + // AuthClient methods + crimson::auth::AuthClient::auth_request_t + get_auth_request(crimson::net::Connection &conn, + AuthConnectionMeta &auth_meta) final; + + // Handle server's request to continue the handshake + ceph::bufferlist handle_auth_reply_more(crimson::net::Connection &conn, + AuthConnectionMeta &auth_meta, + const bufferlist& bl) final; + + // Handle server's indication that authentication succeeded + int handle_auth_done(crimson::net::Connection &conn, + AuthConnectionMeta &auth_meta, + uint64_t global_id, + uint32_t con_mode, + const bufferlist& bl) final; + + // Handle server's indication that the previous auth attempt failed + int handle_auth_bad_method(crimson::net::Connection &conn, + AuthConnectionMeta &auth_meta, + uint32_t old_auth_method, + int result, + const std::vector<uint32_t>& allowed_methods, + const std::vector<uint32_t>& allowed_modes) final; + +private: + void tick(); + + std::optional<seastar::future<>> ms_dispatch(crimson::net::ConnectionRef conn, + MessageRef m) override; + void ms_handle_reset(crimson::net::ConnectionRef conn, bool is_replace) override; + + seastar::future<> handle_monmap(crimson::net::Connection &conn, + Ref<MMonMap> m); + seastar::future<> handle_auth_reply(crimson::net::Connection &conn, + Ref<MAuthReply> m); + seastar::future<> handle_subscribe_ack(Ref<MMonSubscribeAck> m); + seastar::future<> handle_get_version_reply(Ref<MMonGetVersionReply> m); + seastar::future<> handle_mon_command_ack(Ref<MMonCommandAck> m); + seastar::future<> handle_log_ack(Ref<MLogAck> m); + seastar::future<> handle_config(Ref<MConfig> m); + + seastar::future<> on_session_opened(); +private: + seastar::future<> load_keyring(); + seastar::future<> authenticate(); + + bool is_hunting() const; + // @param rank, rank of the monitor to be connected, if it is less than 0, + // try to connect to all monitors in monmap, until one of them + // is connected. 
+ // @return true if a connection to monitor is established + seastar::future<bool> reopen_session(int rank); + std::vector<unsigned> get_random_mons(unsigned n) const; + seastar::future<> _add_conn(unsigned rank, uint64_t global_id); + void _finish_auth(const entity_addr_t& peer); + crimson::common::Gated gate; + + // messages that are waiting for the active_con to be available + struct pending_msg_t { + pending_msg_t(MessageURef m) : msg(std::move(m)) {} + MessageURef msg; + seastar::promise<> pr; + }; + std::deque<pending_msg_t> pending_messages; + std::optional<seastar::promise<>> config_updated; +}; + +inline std::ostream& operator<<(std::ostream& out, const Client& client) { + client.print(out); + return out; +} + +} // namespace crimson::mon + +#if FMT_VERSION >= 90000 +template <> struct fmt::formatter<crimson::mon::Client> : fmt::ostream_formatter {}; +#endif diff --git a/src/crimson/net/Connection.h b/src/crimson/net/Connection.h new file mode 100644 index 000000000..7141e20f4 --- /dev/null +++ b/src/crimson/net/Connection.h @@ -0,0 +1,147 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2017 Red Hat, Inc + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#pragma once + +#include <queue> +#include <seastar/core/future.hh> +#include <seastar/core/shared_ptr.hh> + +#include "Fwd.h" + +namespace crimson::net { + +using seq_num_t = uint64_t; + +/** + * Connection + * + * Abstraction for messenger connections. + * + * Except when otherwise specified, methods must be invoked from the core on which + * the connection originates. + */ +class Connection : public seastar::enable_shared_from_this<Connection> { + public: + using clock_t = seastar::lowres_system_clock; + + Connection() {} + + virtual ~Connection() {} + + /** + * get_shard_id + * + * The shard id where the Connection is dispatching events and handling I/O. + * + * May be changed with the accept/connect events. + */ + virtual const seastar::shard_id get_shard_id() const = 0; + + virtual const entity_name_t &get_peer_name() const = 0; + + entity_type_t get_peer_type() const { return get_peer_name().type(); } + int64_t get_peer_id() const { return get_peer_name().num(); } + bool peer_is_mon() const { return get_peer_name().is_mon(); } + bool peer_is_mgr() const { return get_peer_name().is_mgr(); } + bool peer_is_mds() const { return get_peer_name().is_mds(); } + bool peer_is_osd() const { return get_peer_name().is_osd(); } + bool peer_is_client() const { return get_peer_name().is_client(); } + + virtual const entity_addr_t &get_peer_addr() const = 0; + + const entity_addrvec_t get_peer_addrs() const { + return entity_addrvec_t(get_peer_addr()); + } + + virtual const entity_addr_t &get_peer_socket_addr() const = 0; + + virtual uint64_t get_features() const = 0; + + bool has_feature(uint64_t f) const { + return get_features() & f; + } + + /// true if the handshake has completed and no errors have been encountered + virtual bool is_connected() const = 0; + + /** + * send + * + * Send a message over a connection that has completed its handshake. + * + * May be invoked from any core, but that requires to chain the returned + * future to preserve ordering. 
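+   *
+   * e.g. conn->send(std::move(m1)).then([conn] { return conn->send(std::move(m2)); })
+   * keeps m1 ordered ahead of m2 when invoked from another core.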
+ */ + virtual seastar::future<> send(MessageURef msg) = 0; + + /** + * send_keepalive + * + * Send a keepalive message over a connection that has completed its + * handshake. + * + * May be invoked from any core, but that requires to chain the returned + * future to preserve ordering. + */ + virtual seastar::future<> send_keepalive() = 0; + + virtual clock_t::time_point get_last_keepalive() const = 0; + + virtual clock_t::time_point get_last_keepalive_ack() const = 0; + + // workaround for the monitor client + virtual void set_last_keepalive_ack(clock_t::time_point when) = 0; + + // close the connection and cancel any any pending futures from read/send, + // without dispatching any reset event + virtual void mark_down() = 0; + + struct user_private_t { + virtual ~user_private_t() = default; + }; + + virtual bool has_user_private() const = 0; + + virtual user_private_t &get_user_private() = 0; + + virtual void set_user_private(std::unique_ptr<user_private_t>) = 0; + + virtual void print(std::ostream& out) const = 0; + +#ifdef UNIT_TESTS_BUILT + virtual bool is_protocol_ready() const = 0; + + virtual bool is_protocol_standby() const = 0; + + virtual bool is_protocol_closed() const = 0; + + virtual bool is_protocol_closed_clean() const = 0; + + virtual bool peer_wins() const = 0; +#endif +}; + +inline std::ostream& operator<<(std::ostream& out, const Connection& conn) { + out << "["; + conn.print(out); + out << "]"; + return out; +} + +} // namespace crimson::net + +#if FMT_VERSION >= 90000 +template <> struct fmt::formatter<crimson::net::Connection> : fmt::ostream_formatter {}; +#endif diff --git a/src/crimson/net/Dispatcher.h b/src/crimson/net/Dispatcher.h new file mode 100644 index 000000000..9eea0a858 --- /dev/null +++ b/src/crimson/net/Dispatcher.h @@ -0,0 +1,62 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2017 Red Hat, Inc + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#pragma once + +#include "Fwd.h" + +class AuthAuthorizer; + +namespace crimson::net { + +class Dispatcher { + public: + virtual ~Dispatcher() {} + + // Dispatchers are put into a chain as described by chain-of-responsibility + // pattern. If any of the dispatchers claims this message, it returns a valid + // future to prevent other dispatchers from processing it, and this is also + // used to throttle the connection if it's too busy. + virtual std::optional<seastar::future<>> ms_dispatch(ConnectionRef, MessageRef) = 0; + + // The connection is moving to the new_shard under accept/connect. + // User should not operate conn in this shard thereafter. + virtual void ms_handle_shard_change( + ConnectionRef conn, + seastar::shard_id new_shard, + bool is_accept_or_connect) {} + + // The connection is accepted or recoverred(lossless), all the followup + // events and messages will be dispatched to this shard. + // + // is_replace=true means the accepted connection has replaced + // another connecting connection with the same peer_addr, which currently only + // happens under lossy policy when both sides wish to connect to each other. 
+ virtual void ms_handle_accept(ConnectionRef conn, seastar::shard_id prv_shard, bool is_replace) {} + + // The connection is (re)connected, all the followup events and messages will + // be dispatched to this shard. + virtual void ms_handle_connect(ConnectionRef conn, seastar::shard_id prv_shard) {} + + // a reset event is dispatched when the connection is closed unexpectedly. + // + // is_replace=true means the reset connection is going to be replaced by + // another accepting connection with the same peer_addr, which currently only + // happens under lossy policy when both sides wish to connect to each other. + virtual void ms_handle_reset(ConnectionRef conn, bool is_replace) {} + + virtual void ms_handle_remote_reset(ConnectionRef conn) {} +}; + +} // namespace crimson::net diff --git a/src/crimson/net/Errors.cc b/src/crimson/net/Errors.cc new file mode 100644 index 000000000..d07c090db --- /dev/null +++ b/src/crimson/net/Errors.cc @@ -0,0 +1,51 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2017 Red Hat, Inc + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "Errors.h" + +namespace crimson::net { + +const std::error_category& net_category() +{ + struct category : public std::error_category { + const char* name() const noexcept override { + return "crimson::net"; + } + + std::string message(int ev) const override { + switch (static_cast<error>(ev)) { + case error::success: + return "success"; + case error::bad_connect_banner: + return "bad connect banner"; + case error::bad_peer_address: + return "bad peer address"; + case error::negotiation_failure: + return "negotiation failure"; + case error::read_eof: + return "read eof"; + case error::corrupted_message: + return "corrupted message"; + case error::protocol_aborted: + return "protocol aborted"; + default: + return "unknown"; + } + } + }; + static category instance; + return instance; +} + +} // namespace crimson::net diff --git a/src/crimson/net/Errors.h b/src/crimson/net/Errors.h new file mode 100644 index 000000000..3a17a103a --- /dev/null +++ b/src/crimson/net/Errors.h @@ -0,0 +1,53 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2017 Red Hat, Inc + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#pragma once + +#include <system_error> + +namespace crimson::net { + +/// net error codes +enum class error { + success = 0, + bad_connect_banner, + bad_peer_address, + negotiation_failure, + read_eof, + corrupted_message, + protocol_aborted, +}; + +/// net error category +const std::error_category& net_category(); + +inline std::error_code make_error_code(error e) +{ + return {static_cast<int>(e), net_category()}; +} + +inline std::error_condition make_error_condition(error e) +{ + return {static_cast<int>(e), net_category()}; +} + +} // namespace crimson::net + +namespace std { + +/// enables implicit conversion to std::error_condition +template <> +struct is_error_condition_enum<crimson::net::error> : public true_type {}; + +} // namespace std diff --git a/src/crimson/net/FrameAssemblerV2.cc b/src/crimson/net/FrameAssemblerV2.cc new file mode 100644 index 000000000..273a6350d --- /dev/null +++ b/src/crimson/net/FrameAssemblerV2.cc @@ -0,0 +1,461 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#include "FrameAssemblerV2.h" + +#include "Errors.h" +#include "SocketConnection.h" + +using ceph::msgr::v2::FrameAssembler; +using ceph::msgr::v2::FrameError; +using ceph::msgr::v2::preamble_block_t; +using ceph::msgr::v2::segment_t; +using ceph::msgr::v2::Tag; + +namespace { + +seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_ms); +} + +} // namespace anonymous + +namespace crimson::net { + +FrameAssemblerV2::FrameAssemblerV2(SocketConnection &_conn) + : conn{_conn}, sid{seastar::this_shard_id()} +{ + assert(seastar::this_shard_id() == conn.get_messenger_shard_id()); +} + +FrameAssemblerV2::~FrameAssemblerV2() +{ + assert(seastar::this_shard_id() == conn.get_messenger_shard_id()); + assert(seastar::this_shard_id() == sid); + if (has_socket()) { + std::ignore = move_socket(); + } +} + +#ifdef UNIT_TESTS_BUILT +// should be consistent to intercept() in ProtocolV2.cc +seastar::future<> FrameAssemblerV2::intercept_frames( + std::vector<Breakpoint> bps, + bp_type_t type) +{ + assert(seastar::this_shard_id() == sid); + assert(has_socket()); + if (!conn.interceptor) { + return seastar::now(); + } + return conn.interceptor->intercept(conn, bps + ).then([this, type](bp_action_t action) { + return seastar::smp::submit_to( + socket->get_shard_id(), + [this, type, action] { + socket->set_trap(type, action, &conn.interceptor->blocker); + }); + }); +} +#endif + +void FrameAssemblerV2::set_is_rev1(bool _is_rev1) +{ + assert(seastar::this_shard_id() == sid); + is_rev1 = _is_rev1; + tx_frame_asm.set_is_rev1(_is_rev1); + rx_frame_asm.set_is_rev1(_is_rev1); +} + +void FrameAssemblerV2::create_session_stream_handlers( + const AuthConnectionMeta &auth_meta, + bool crossed) +{ + assert(seastar::this_shard_id() == sid); + session_stream_handlers = ceph::crypto::onwire::rxtx_t::create_handler_pair( + nullptr, auth_meta, is_rev1, crossed); +} + +void FrameAssemblerV2::reset_handlers() +{ + assert(seastar::this_shard_id() == sid); + session_stream_handlers = { nullptr, nullptr }; + session_comp_handlers = { nullptr, nullptr }; +} + +FrameAssemblerV2::mover_t +FrameAssemblerV2::to_replace() +{ + assert(seastar::this_shard_id() == sid); + assert(is_socket_valid()); + + clear(); + + return mover_t{ + move_socket(), + std::move(session_stream_handlers), + std::move(session_comp_handlers)}; +} + +seastar::future<> FrameAssemblerV2::replace_by(FrameAssemblerV2::mover_t &&mover) +{ + assert(seastar::this_shard_id() == sid); + + clear(); 
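+  // Adopt the crypto/compression handlers and the socket from the replaced
+  // protocol instance; an existing (shutdown) socket is swapped out and
+  // closed on its original shard.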
+ + session_stream_handlers = std::move(mover.session_stream_handlers); + session_comp_handlers = std::move(mover.session_comp_handlers); + if (has_socket()) { + return replace_shutdown_socket(std::move(mover.socket)); + } else { + set_socket(std::move(mover.socket)); + return seastar::now(); + } +} + +void FrameAssemblerV2::start_recording() +{ + assert(seastar::this_shard_id() == sid); + record_io = true; + rxbuf.clear(); + txbuf.clear(); +} + +FrameAssemblerV2::record_bufs_t +FrameAssemblerV2::stop_recording() +{ + assert(seastar::this_shard_id() == sid); + ceph_assert_always(record_io == true); + record_io = false; + return record_bufs_t{std::move(rxbuf), std::move(txbuf)}; +} + +bool FrameAssemblerV2::has_socket() const +{ + assert((socket && conn.socket) || (!socket && !conn.socket)); + return bool(socket); +} + +bool FrameAssemblerV2::is_socket_valid() const +{ + assert(seastar::this_shard_id() == sid); +#ifndef NDEBUG + if (has_socket() && socket->get_shard_id() == sid) { + assert(socket->is_shutdown() == is_socket_shutdown); + } +#endif + return has_socket() && !is_socket_shutdown; +} + +seastar::shard_id +FrameAssemblerV2::get_socket_shard_id() const +{ + assert(seastar::this_shard_id() == sid); + assert(is_socket_valid()); + return socket->get_shard_id(); +} + +SocketFRef FrameAssemblerV2::move_socket() +{ + assert(has_socket()); + conn.set_socket(nullptr); + return std::move(socket); +} + +void FrameAssemblerV2::set_socket(SocketFRef &&new_socket) +{ + assert(seastar::this_shard_id() == sid); + assert(!has_socket()); + assert(new_socket); + socket = std::move(new_socket); + conn.set_socket(socket.get()); + is_socket_shutdown = false; + assert(is_socket_valid()); +} + +void FrameAssemblerV2::learn_socket_ephemeral_port_as_connector(uint16_t port) +{ + assert(seastar::this_shard_id() == sid); + assert(has_socket()); + // Note: may not invoke on the socket core + socket->learn_ephemeral_port_as_connector(port); +} + +template <bool may_cross_core> +void FrameAssemblerV2::shutdown_socket(crimson::common::Gated *gate) +{ + assert(seastar::this_shard_id() == sid); + assert(is_socket_valid()); + is_socket_shutdown = true; + if constexpr (may_cross_core) { + assert(conn.get_messenger_shard_id() == sid); + assert(gate); + gate->dispatch_in_background("shutdown_socket", conn, [this] { + return seastar::smp::submit_to( + socket->get_shard_id(), [this] { + socket->shutdown(); + }); + }); + } else { + assert(socket->get_shard_id() == sid); + assert(!gate); + socket->shutdown(); + } +} +template void FrameAssemblerV2::shutdown_socket<true>(crimson::common::Gated *); +template void FrameAssemblerV2::shutdown_socket<false>(crimson::common::Gated *); + +seastar::future<> FrameAssemblerV2::replace_shutdown_socket(SocketFRef &&new_socket) +{ + assert(seastar::this_shard_id() == sid); + assert(has_socket()); + assert(!is_socket_valid()); + auto old_socket = move_socket(); + auto old_socket_shard_id = old_socket->get_shard_id(); + set_socket(std::move(new_socket)); + return seastar::smp::submit_to( + old_socket_shard_id, + [old_socket = std::move(old_socket)]() mutable { + return old_socket->close( + ).then([sock = std::move(old_socket)] {}); + }); +} + +seastar::future<> FrameAssemblerV2::close_shutdown_socket() +{ + assert(seastar::this_shard_id() == sid); + assert(has_socket()); + assert(!is_socket_valid()); + return seastar::smp::submit_to( + socket->get_shard_id(), [this] { + return socket->close(); + }); +} + +template <bool may_cross_core> +seastar::future<ceph::bufferptr> 
+FrameAssemblerV2::read_exactly(std::size_t bytes) +{ + assert(seastar::this_shard_id() == sid); + assert(has_socket()); + if constexpr (may_cross_core) { + assert(conn.get_messenger_shard_id() == sid); + return seastar::smp::submit_to( + socket->get_shard_id(), [this, bytes] { + return socket->read_exactly(bytes); + }).then([this](auto bptr) { + if (record_io) { + rxbuf.append(bptr); + } + return bptr; + }); + } else { + assert(socket->get_shard_id() == sid); + return socket->read_exactly(bytes); + } +} +template seastar::future<ceph::bufferptr> FrameAssemblerV2::read_exactly<true>(std::size_t); +template seastar::future<ceph::bufferptr> FrameAssemblerV2::read_exactly<false>(std::size_t); + +template <bool may_cross_core> +seastar::future<ceph::bufferlist> +FrameAssemblerV2::read(std::size_t bytes) +{ + assert(seastar::this_shard_id() == sid); + assert(has_socket()); + if constexpr (may_cross_core) { + assert(conn.get_messenger_shard_id() == sid); + return seastar::smp::submit_to( + socket->get_shard_id(), [this, bytes] { + return socket->read(bytes); + }).then([this](auto buf) { + if (record_io) { + rxbuf.append(buf); + } + return buf; + }); + } else { + assert(socket->get_shard_id() == sid); + return socket->read(bytes); + } +} +template seastar::future<ceph::bufferlist> FrameAssemblerV2::read<true>(std::size_t); +template seastar::future<ceph::bufferlist> FrameAssemblerV2::read<false>(std::size_t); + +template <bool may_cross_core> +seastar::future<> +FrameAssemblerV2::write(ceph::bufferlist buf) +{ + assert(seastar::this_shard_id() == sid); + assert(has_socket()); + if constexpr (may_cross_core) { + assert(conn.get_messenger_shard_id() == sid); + if (record_io) { + txbuf.append(buf); + } + return seastar::smp::submit_to( + socket->get_shard_id(), [this, buf = std::move(buf)]() mutable { + return socket->write(std::move(buf)); + }); + } else { + assert(socket->get_shard_id() == sid); + return socket->write(std::move(buf)); + } +} +template seastar::future<> FrameAssemblerV2::write<true>(ceph::bufferlist); +template seastar::future<> FrameAssemblerV2::write<false>(ceph::bufferlist); + +template <bool may_cross_core> +seastar::future<> +FrameAssemblerV2::flush() +{ + assert(seastar::this_shard_id() == sid); + assert(has_socket()); + if constexpr (may_cross_core) { + assert(conn.get_messenger_shard_id() == sid); + return seastar::smp::submit_to( + socket->get_shard_id(), [this] { + return socket->flush(); + }); + } else { + assert(socket->get_shard_id() == sid); + return socket->flush(); + } +} +template seastar::future<> FrameAssemblerV2::flush<true>(); +template seastar::future<> FrameAssemblerV2::flush<false>(); + +template <bool may_cross_core> +seastar::future<> +FrameAssemblerV2::write_flush(ceph::bufferlist buf) +{ + assert(seastar::this_shard_id() == sid); + assert(has_socket()); + if constexpr (may_cross_core) { + assert(conn.get_messenger_shard_id() == sid); + if (unlikely(record_io)) { + txbuf.append(buf); + } + return seastar::smp::submit_to( + socket->get_shard_id(), [this, buf = std::move(buf)]() mutable { + return socket->write_flush(std::move(buf)); + }); + } else { + assert(socket->get_shard_id() == sid); + return socket->write_flush(std::move(buf)); + } +} +template seastar::future<> FrameAssemblerV2::write_flush<true>(ceph::bufferlist); +template seastar::future<> FrameAssemblerV2::write_flush<false>(ceph::bufferlist); + +template <bool may_cross_core> +seastar::future<FrameAssemblerV2::read_main_t> +FrameAssemblerV2::read_main_preamble() +{ + 
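+  // Read the fixed-length frame preamble and decode its tag; a malformed
+  // preamble surfaces as a negotiation_failure error.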
assert(seastar::this_shard_id() == sid); + rx_preamble.clear(); + return read_exactly<may_cross_core>( + rx_frame_asm.get_preamble_onwire_len() + ).then([this](auto bptr) { + rx_preamble.append(std::move(bptr)); + Tag tag; + try { + tag = rx_frame_asm.disassemble_preamble(rx_preamble); + } catch (FrameError& e) { + logger().warn("{} read_main_preamble: {}", conn, e.what()); + throw std::system_error(make_error_code(crimson::net::error::negotiation_failure)); + } +#ifdef UNIT_TESTS_BUILT + return intercept_frame(tag, false + ).then([this, tag] { + return read_main_t{tag, &rx_frame_asm}; + }); +#else + return read_main_t{tag, &rx_frame_asm}; +#endif + }); +} +template seastar::future<FrameAssemblerV2::read_main_t> FrameAssemblerV2::read_main_preamble<true>(); +template seastar::future<FrameAssemblerV2::read_main_t> FrameAssemblerV2::read_main_preamble<false>(); + +template <bool may_cross_core> +seastar::future<FrameAssemblerV2::read_payload_t*> +FrameAssemblerV2::read_frame_payload() +{ + assert(seastar::this_shard_id() == sid); + rx_segments_data.clear(); + return seastar::do_until( + [this] { + return rx_frame_asm.get_num_segments() == rx_segments_data.size(); + }, + [this] { + // TODO: create aligned and contiguous buffer from socket + const size_t seg_idx = rx_segments_data.size(); + if (uint16_t alignment = rx_frame_asm.get_segment_align(seg_idx); + alignment != segment_t::DEFAULT_ALIGNMENT) { + logger().trace("{} cannot allocate {} aligned buffer at segment desc index {}", + conn, alignment, rx_segments_data.size()); + } + uint32_t onwire_len = rx_frame_asm.get_segment_onwire_len(seg_idx); + // TODO: create aligned and contiguous buffer from socket + return read_exactly<may_cross_core>(onwire_len + ).then([this](auto bptr) { + logger().trace("{} RECV({}) frame segment[{}]", + conn, bptr.length(), rx_segments_data.size()); + bufferlist segment; + segment.append(std::move(bptr)); + rx_segments_data.emplace_back(std::move(segment)); + }); + } + ).then([this] { + return read_exactly<may_cross_core>(rx_frame_asm.get_epilogue_onwire_len()); + }).then([this](auto bptr) { + logger().trace("{} RECV({}) frame epilogue", conn, bptr.length()); + bool ok = false; + try { + bufferlist rx_epilogue; + rx_epilogue.append(std::move(bptr)); + ok = rx_frame_asm.disassemble_segments(rx_preamble, rx_segments_data.data(), rx_epilogue); + } catch (FrameError& e) { + logger().error("read_frame_payload: {} {}", conn, e.what()); + throw std::system_error(make_error_code(crimson::net::error::negotiation_failure)); + } catch (ceph::crypto::onwire::MsgAuthError&) { + logger().error("read_frame_payload: {} bad auth tag", conn); + throw std::system_error(make_error_code(crimson::net::error::negotiation_failure)); + } + // we do have a mechanism that allows transmitter to start sending message + // and abort after putting entire data field on wire. This will be used by + // the kernel client to avoid unnecessary buffering. 
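+    // Handling such an abort is not implemented here yet, so a failed
+    // disassembly is treated as fatal for now.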
+ if (!ok) { + ceph_abort("TODO"); + } + return &rx_segments_data; + }); +} +template seastar::future<FrameAssemblerV2::read_payload_t*> FrameAssemblerV2::read_frame_payload<true>(); +template seastar::future<FrameAssemblerV2::read_payload_t*> FrameAssemblerV2::read_frame_payload<false>(); + +void FrameAssemblerV2::log_main_preamble(const ceph::bufferlist &bl) +{ + const auto main_preamble = + reinterpret_cast<const preamble_block_t*>(bl.front().c_str()); + logger().trace("{} SEND({}) frame: tag={}, num_segments={}, crc={}", + conn, bl.length(), (int)main_preamble->tag, + (int)main_preamble->num_segments, main_preamble->crc); +} + +FrameAssemblerV2Ref FrameAssemblerV2::create(SocketConnection &conn) +{ + return std::make_unique<FrameAssemblerV2>(conn); +} + +void FrameAssemblerV2::clear() +{ + record_io = false; + rxbuf.clear(); + txbuf.clear(); + rx_preamble.clear(); + rx_segments_data.clear(); +} + +} // namespace crimson::net diff --git a/src/crimson/net/FrameAssemblerV2.h b/src/crimson/net/FrameAssemblerV2.h new file mode 100644 index 000000000..9c89c144e --- /dev/null +++ b/src/crimson/net/FrameAssemblerV2.h @@ -0,0 +1,257 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "msg/async/frames_v2.h" +#include "msg/async/crypto_onwire.h" +#include "msg/async/compression_onwire.h" + +#include "crimson/common/gated.h" +#include "crimson/net/Socket.h" + +#ifdef UNIT_TESTS_BUILT +#include "Interceptor.h" +#endif + +namespace crimson::net { + +class SocketConnection; +class FrameAssemblerV2; +using FrameAssemblerV2Ref = std::unique_ptr<FrameAssemblerV2>; + +class FrameAssemblerV2 { +public: + FrameAssemblerV2(SocketConnection &conn); + + ~FrameAssemblerV2(); + + FrameAssemblerV2(const FrameAssemblerV2 &) = delete; + + FrameAssemblerV2(FrameAssemblerV2 &&) = delete; + + void set_shard_id(seastar::shard_id _sid) { + assert(seastar::this_shard_id() == sid); + clear(); + sid = _sid; + } + + seastar::shard_id get_shard_id() const { + return sid; + } + + void set_is_rev1(bool is_rev1); + + void create_session_stream_handlers( + const AuthConnectionMeta &auth_meta, + bool crossed); + + void reset_handlers(); + + /* + * replacing + */ + + struct mover_t { + SocketFRef socket; + ceph::crypto::onwire::rxtx_t session_stream_handlers; + ceph::compression::onwire::rxtx_t session_comp_handlers; + }; + + mover_t to_replace(); + + seastar::future<> replace_by(mover_t &&); + + /* + * auth signature interfaces + */ + + void start_recording(); + + struct record_bufs_t { + ceph::bufferlist rxbuf; + ceph::bufferlist txbuf; + }; + record_bufs_t stop_recording(); + + /* + * socket maintainence interfaces + */ + + // the socket exists and not shutdown + bool is_socket_valid() const; + + seastar::shard_id get_socket_shard_id() const; + + void set_socket(SocketFRef &&); + + void learn_socket_ephemeral_port_as_connector(uint16_t port); + + // if may_cross_core == true, gate is required for cross-core shutdown + template <bool may_cross_core> + void shutdown_socket(crimson::common::Gated *gate); + + seastar::future<> replace_shutdown_socket(SocketFRef &&); + + seastar::future<> close_shutdown_socket(); + + /* + * socket read and write interfaces + */ + + template <bool may_cross_core = true> + seastar::future<ceph::bufferptr> read_exactly(std::size_t bytes); + + template <bool may_cross_core = true> + seastar::future<ceph::bufferlist> read(std::size_t bytes); + + template <bool may_cross_core = true> + seastar::future<> 
write(ceph::bufferlist); + + template <bool may_cross_core = true> + seastar::future<> flush(); + + template <bool may_cross_core = true> + seastar::future<> write_flush(ceph::bufferlist); + + /* + * frame read and write interfaces + */ + + /// may throw negotiation_failure as fault + struct read_main_t { + ceph::msgr::v2::Tag tag; + const ceph::msgr::v2::FrameAssembler *rx_frame_asm; + }; + template <bool may_cross_core = true> + seastar::future<read_main_t> read_main_preamble(); + + /// may throw negotiation_failure as fault + using read_payload_t = ceph::msgr::v2::segment_bls_t; + // FIXME: read_payload_t cannot be no-throw move constructible + template <bool may_cross_core = true> + seastar::future<read_payload_t*> read_frame_payload(); + + template <class F> + ceph::bufferlist get_buffer(F &tx_frame) { + assert(seastar::this_shard_id() == sid); + auto bl = tx_frame.get_buffer(tx_frame_asm); + log_main_preamble(bl); + return bl; + } + + template <class F, bool may_cross_core = true> + seastar::future<> write_flush_frame(F &tx_frame) { + assert(seastar::this_shard_id() == sid); + auto bl = get_buffer(tx_frame); +#ifdef UNIT_TESTS_BUILT + return intercept_frame(F::tag, true + ).then([this, bl=std::move(bl)]() mutable { + return write_flush<may_cross_core>(std::move(bl)); + }); +#else + return write_flush<may_cross_core>(std::move(bl)); +#endif + } + + static FrameAssemblerV2Ref create(SocketConnection &conn); + +#ifdef UNIT_TESTS_BUILT + seastar::future<> intercept_frames( + std::vector<ceph::msgr::v2::Tag> tags, + bool is_write) { + auto type = is_write ? bp_type_t::WRITE : bp_type_t::READ; + std::vector<Breakpoint> bps; + for (auto &tag : tags) { + bps.emplace_back(Breakpoint{tag, type}); + } + return intercept_frames(bps, type); + } + + seastar::future<> intercept_frame( + ceph::msgr::v2::Tag tag, + bool is_write) { + auto type = is_write ? bp_type_t::WRITE : bp_type_t::READ; + std::vector<Breakpoint> bps; + bps.emplace_back(Breakpoint{tag, type}); + return intercept_frames(bps, type); + } + + seastar::future<> intercept_frame( + custom_bp_t bp, + bool is_write) { + auto type = is_write ? bp_type_t::WRITE : bp_type_t::READ; + std::vector<Breakpoint> bps; + bps.emplace_back(Breakpoint{bp}); + return intercept_frames(bps, type); + } +#endif + +private: +#ifdef UNIT_TESTS_BUILT + seastar::future<> intercept_frames( + std::vector<Breakpoint> bps, + bp_type_t type); +#endif + + bool has_socket() const; + + SocketFRef move_socket(); + + void clear(); + + void log_main_preamble(const ceph::bufferlist &bl); + + SocketConnection &conn; + + SocketFRef socket; + + // checking Socket::is_shutdown() synchronously is impossible when sid is + // different from the socket sid. + bool is_socket_shutdown = false; + + // the current working shard, can be messenger or socket shard. + // if is messenger shard, should call interfaces with may_cross_core = true. 
+ seastar::shard_id sid; + + /* + * auth signature + * + * only in the messenger core + */ + + bool record_io = false; + + ceph::bufferlist rxbuf; + + ceph::bufferlist txbuf; + + /* + * frame data and handlers + */ + + ceph::crypto::onwire::rxtx_t session_stream_handlers = { nullptr, nullptr }; + + // TODO + ceph::compression::onwire::rxtx_t session_comp_handlers = { nullptr, nullptr }; + + bool is_rev1 = false; + + ceph::msgr::v2::FrameAssembler tx_frame_asm{ + &session_stream_handlers, is_rev1, common::local_conf()->ms_crc_data, + &session_comp_handlers}; + + ceph::msgr::v2::FrameAssembler rx_frame_asm{ + &session_stream_handlers, is_rev1, common::local_conf()->ms_crc_data, + &session_comp_handlers}; + + // in the messenger core during handshake, + // and in the socket core during open, + // must be cleaned before switching cores. + + ceph::bufferlist rx_preamble; + + read_payload_t rx_segments_data; +}; + +} // namespace crimson::net diff --git a/src/crimson/net/Fwd.h b/src/crimson/net/Fwd.h new file mode 100644 index 000000000..2b1595141 --- /dev/null +++ b/src/crimson/net/Fwd.h @@ -0,0 +1,52 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2017 Red Hat, Inc + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#pragma once + +#include <boost/container/small_vector.hpp> +#include <seastar/core/future.hh> +#include <seastar/core/future-util.hh> +#include <seastar/core/shared_ptr.hh> +#include <seastar/core/sharded.hh> + +#include "msg/Connection.h" +#include "msg/MessageRef.h" +#include "msg/msg_types.h" + +#include "crimson/common/errorator.h" +#include "crimson/common/local_shared_foreign_ptr.h" + +class AuthConnectionMeta; + +namespace crimson::net { + +using msgr_tag_t = uint8_t; +using stop_t = seastar::stop_iteration; + +class Connection; +using ConnectionLRef = seastar::shared_ptr<Connection>; +using ConnectionFRef = seastar::foreign_ptr<ConnectionLRef>; +using ConnectionRef = ::crimson::local_shared_foreign_ptr<ConnectionLRef>; + +class Dispatcher; +class ChainedDispatchers; +constexpr std::size_t NUM_DISPATCHERS = 4u; +using dispatchers_t = boost::container::small_vector<Dispatcher*, NUM_DISPATCHERS>; + +class Messenger; +using MessengerRef = seastar::shared_ptr<Messenger>; + +using MessageFRef = seastar::foreign_ptr<MessageURef>; + +} // namespace crimson::net diff --git a/src/crimson/net/Interceptor.h b/src/crimson/net/Interceptor.h new file mode 100644 index 000000000..35b74e243 --- /dev/null +++ b/src/crimson/net/Interceptor.h @@ -0,0 +1,186 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <variant> +#include <seastar/core/sharded.hh> +#include <seastar/core/sleep.hh> + +#include "Fwd.h" +#include "msg/async/frames_v2.h" + +namespace crimson::net { + +enum class custom_bp_t : uint8_t { + BANNER_WRITE = 0, + BANNER_READ, + BANNER_PAYLOAD_READ, + SOCKET_CONNECTING, + SOCKET_ACCEPTED +}; +inline const char* get_bp_name(custom_bp_t bp) { + uint8_t index = static_cast<uint8_t>(bp); + static const char *const bp_names[] = {"BANNER_WRITE", + "BANNER_READ", + "BANNER_PAYLOAD_READ", + "SOCKET_CONNECTING", + "SOCKET_ACCEPTED"}; + assert(index < std::size(bp_names)); + return bp_names[index]; +} + +enum class 
bp_type_t { + READ = 0, + WRITE +}; + +enum class bp_action_t { + CONTINUE = 0, + FAULT, + BLOCK, + STALL +}; + +class socket_blocker { + std::optional<seastar::abort_source> p_blocked; + std::optional<seastar::abort_source> p_unblocked; + const seastar::shard_id primary_sid; + + public: + socket_blocker() : primary_sid{seastar::this_shard_id()} {} + + seastar::future<> wait_blocked() { + ceph_assert(seastar::this_shard_id() == primary_sid); + ceph_assert(!p_blocked); + if (p_unblocked) { + return seastar::make_ready_future<>(); + } else { + p_blocked = seastar::abort_source(); + return seastar::sleep_abortable( + std::chrono::seconds(10), *p_blocked + ).then([] { + throw std::runtime_error( + "Timeout (10s) in socket_blocker::wait_blocked()"); + }).handle_exception_type([] (const seastar::sleep_aborted& e) { + // wait done! + }); + } + } + + seastar::future<> block() { + return seastar::smp::submit_to(primary_sid, [this] { + if (p_blocked) { + p_blocked->request_abort(); + p_blocked = std::nullopt; + } + ceph_assert(!p_unblocked); + p_unblocked = seastar::abort_source(); + return seastar::sleep_abortable( + std::chrono::seconds(10), *p_unblocked + ).then([] { + ceph_abort("Timeout (10s) in socket_blocker::block()"); + }).handle_exception_type([] (const seastar::sleep_aborted& e) { + // wait done! + }); + }); + } + + void unblock() { + ceph_assert(seastar::this_shard_id() == primary_sid); + ceph_assert(!p_blocked); + ceph_assert(p_unblocked); + p_unblocked->request_abort(); + p_unblocked = std::nullopt; + } +}; + +struct tag_bp_t { + ceph::msgr::v2::Tag tag; + bp_type_t type; + bool operator==(const tag_bp_t& x) const { + return tag == x.tag && type == x.type; + } + bool operator!=(const tag_bp_t& x) const { return !operator==(x); } + bool operator<(const tag_bp_t& x) const { + return std::tie(tag, type) < std::tie(x.tag, x.type); + } +}; + +struct Breakpoint { + using var_t = std::variant<custom_bp_t, tag_bp_t>; + var_t bp; + Breakpoint(custom_bp_t bp) : bp(bp) { } + Breakpoint(ceph::msgr::v2::Tag tag, bp_type_t type) + : bp(tag_bp_t{tag, type}) { } + bool operator==(const Breakpoint& x) const { return bp == x.bp; } + bool operator!=(const Breakpoint& x) const { return !operator==(x); } + bool operator==(const custom_bp_t& x) const { return bp == var_t(x); } + bool operator!=(const custom_bp_t& x) const { return !operator==(x); } + bool operator==(const tag_bp_t& x) const { return bp == var_t(x); } + bool operator!=(const tag_bp_t& x) const { return !operator==(x); } + bool operator<(const Breakpoint& x) const { return bp < x.bp; } +}; + +struct Interceptor { + socket_blocker blocker; + virtual ~Interceptor() {} + virtual void register_conn(ConnectionRef) = 0; + virtual void register_conn_ready(ConnectionRef) = 0; + virtual void register_conn_closed(ConnectionRef) = 0; + virtual void register_conn_replaced(ConnectionRef) = 0; + + virtual seastar::future<bp_action_t> + intercept(Connection&, std::vector<Breakpoint> bp) = 0; +}; + +} // namespace crimson::net + +template<> +struct fmt::formatter<crimson::net::bp_action_t> : fmt::formatter<std::string_view> { + template <typename FormatContext> + auto format(const crimson::net::bp_action_t& action, FormatContext& ctx) const { + static const char *const action_names[] = {"CONTINUE", + "FAULT", + "BLOCK", + "STALL"}; + assert(static_cast<size_t>(action) < std::size(action_names)); + return formatter<std::string_view>::format(action_names[static_cast<size_t>(action)], ctx); + } +}; + +template<> +struct fmt::formatter<crimson::net::Breakpoint> 
: fmt::formatter<std::string_view> { + template <typename FormatContext> + auto format(const crimson::net::Breakpoint& bp, FormatContext& ctx) const { + if (auto custom_bp = std::get_if<crimson::net::custom_bp_t>(&bp.bp)) { + return formatter<std::string_view>::format(crimson::net::get_bp_name(*custom_bp), ctx); + } + auto tag_bp = std::get<crimson::net::tag_bp_t>(bp.bp); + static const char *const tag_names[] = {"NONE", + "HELLO", + "AUTH_REQUEST", + "AUTH_BAD_METHOD", + "AUTH_REPLY_MORE", + "AUTH_REQUEST_MORE", + "AUTH_DONE", + "AUTH_SIGNATURE", + "CLIENT_IDENT", + "SERVER_IDENT", + "IDENT_MISSING_FEATURES", + "SESSION_RECONNECT", + "SESSION_RESET", + "SESSION_RETRY", + "SESSION_RETRY_GLOBAL", + "SESSION_RECONNECT_OK", + "WAIT", + "MESSAGE", + "KEEPALIVE2", + "KEEPALIVE2_ACK", + "ACK"}; + assert(static_cast<size_t>(tag_bp.tag) < std::size(tag_names)); + return fmt::format_to(ctx.out(), "{}_{}", + tag_names[static_cast<size_t>(tag_bp.tag)], + tag_bp.type == crimson::net::bp_type_t::WRITE ? "WRITE" : "READ"); + } +}; diff --git a/src/crimson/net/Messenger.cc b/src/crimson/net/Messenger.cc new file mode 100644 index 000000000..1af198589 --- /dev/null +++ b/src/crimson/net/Messenger.cc @@ -0,0 +1,19 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "Messenger.h" +#include "SocketMessenger.h" + +namespace crimson::net { + +MessengerRef +Messenger::create(const entity_name_t& name, + const std::string& lname, + uint64_t nonce, + bool dispatch_only_on_this_shard) +{ + return seastar::make_shared<SocketMessenger>( + name, lname, nonce, dispatch_only_on_this_shard); +} + +} // namespace crimson::net diff --git a/src/crimson/net/Messenger.h b/src/crimson/net/Messenger.h new file mode 100644 index 000000000..74df062d8 --- /dev/null +++ b/src/crimson/net/Messenger.h @@ -0,0 +1,130 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2017 Red Hat, Inc + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#pragma once + +#include "Fwd.h" +#include "crimson/common/throttle.h" +#include "msg/Message.h" +#include "msg/Policy.h" + +class AuthAuthorizer; + +namespace crimson::auth { +class AuthClient; +class AuthServer; +} + +namespace crimson::net { + +#ifdef UNIT_TESTS_BUILT +class Interceptor; +#endif + +using Throttle = crimson::common::Throttle; +using SocketPolicy = ceph::net::Policy<Throttle>; + +class Messenger { +public: + Messenger() {} + + virtual ~Messenger() {} + + virtual const entity_name_t& get_myname() const = 0; + + entity_type_t get_mytype() const { return get_myname().type(); } + + virtual const entity_addrvec_t &get_myaddrs() const = 0; + + entity_addr_t get_myaddr() const { return get_myaddrs().front(); } + + virtual void set_myaddrs(const entity_addrvec_t& addrs) = 0; + + virtual bool set_addr_unknowns(const entity_addrvec_t &addrs) = 0; + + virtual void set_auth_client(crimson::auth::AuthClient *) = 0; + + virtual void set_auth_server(crimson::auth::AuthServer *) = 0; + + using bind_ertr = crimson::errorator< + crimson::ct_error::address_in_use, // The address (range) is already bound + crimson::ct_error::address_not_available + >; + /// bind to the given address + virtual bind_ertr::future<> bind(const entity_addrvec_t& addr) = 0; + + /// start the messenger + virtual seastar::future<> start(const dispatchers_t&) = 0; + + /// either return an existing connection to the peer, + /// or a new pending connection + virtual ConnectionRef + connect(const entity_addr_t& peer_addr, + const entity_name_t& peer_name) = 0; + + ConnectionRef + connect(const entity_addr_t& peer_addr, + const entity_type_t& peer_type) { + return connect(peer_addr, entity_name_t(peer_type, -1)); + } + + virtual bool owns_connection(Connection &) const = 0; + + // wait for messenger shutdown + virtual seastar::future<> wait() = 0; + + // stop dispatching events and messages + virtual void stop() = 0; + + virtual bool is_started() const = 0; + + // free internal resources before destruction, must be called after stopped, + // and must be called if is bound. 
+ virtual seastar::future<> shutdown() = 0; + + virtual void print(std::ostream& out) const = 0; + + virtual SocketPolicy get_policy(entity_type_t peer_type) const = 0; + + virtual SocketPolicy get_default_policy() const = 0; + + virtual void set_default_policy(const SocketPolicy& p) = 0; + + virtual void set_policy(entity_type_t peer_type, const SocketPolicy& p) = 0; + + virtual void set_policy_throttler(entity_type_t peer_type, Throttle* throttle) = 0; + + static MessengerRef + create(const entity_name_t& name, + const std::string& lname, + uint64_t nonce, + bool dispatch_only_on_this_shard); + +#ifdef UNIT_TESTS_BUILT + virtual void set_interceptor(Interceptor *) = 0; +#endif +}; + +inline std::ostream& operator<<(std::ostream& out, const Messenger& msgr) { + out << "["; + msgr.print(out); + out << "]"; + return out; +} + +} // namespace crimson::net + +#if FMT_VERSION >= 90000 +template <> struct fmt::formatter<crimson::net::Messenger> : fmt::ostream_formatter {}; +#endif diff --git a/src/crimson/net/ProtocolV2.cc b/src/crimson/net/ProtocolV2.cc new file mode 100644 index 000000000..55b669384 --- /dev/null +++ b/src/crimson/net/ProtocolV2.cc @@ -0,0 +1,2348 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "ProtocolV2.h" + +#include <fmt/format.h> +#include <fmt/ranges.h> +#include "include/msgr.h" +#include "include/random.h" +#include "msg/msg_fmt.h" + +#include "crimson/auth/AuthClient.h" +#include "crimson/auth/AuthServer.h" +#include "crimson/common/formatter.h" +#include "crimson/common/log.h" + +#include "Errors.h" +#include "SocketMessenger.h" + +using namespace ceph::msgr::v2; +using crimson::common::local_conf; + +namespace { + +// TODO: CEPH_MSGR2_FEATURE_COMPRESSION +const uint64_t CRIMSON_MSGR2_SUPPORTED_FEATURES = + (CEPH_MSGR2_FEATURE_REVISION_1 | + // CEPH_MSGR2_FEATURE_COMPRESSION | + UINT64_C(0)); + +// Log levels in V2 Protocol: +// * error level, something error that cause connection to terminate: +// - fatal errors; +// - bugs; +// * warn level: something unusual that identifies connection fault or replacement: +// - unstable network; +// - incompatible peer; +// - auth failure; +// - connection race; +// - connection reset; +// * info level, something very important to show connection lifecycle, +// which doesn't happen very frequently; +// * debug level, important logs for debugging, including: +// - all the messages sent/received (-->/<==); +// - all the frames exchanged (WRITE/GOT); +// - important fields updated (UPDATE); +// - connection state transitions (TRIGGER); +// * trace level, trivial logs showing: +// - the exact bytes being sent/received (SEND/RECV(bytes)); +// - detailed information of sub-frames; +// - integrity checks; +// - etc. 
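+//
+// For example (illustrative only, following the conventions above): a frame
+// that has just been written out is logged at debug level, e.g.
+//   logger().debug("{} WRITE HelloFrame: ...", conn, ...);
+// while an incompatible peer is reported at warn level, e.g.
+//   logger().warn("{} peer is using V1 protocol", conn);
+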
+seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_ms); +} + +[[noreturn]] void abort_in_fault() { + throw std::system_error(make_error_code(crimson::net::error::negotiation_failure)); +} + +[[noreturn]] void abort_protocol() { + throw std::system_error(make_error_code(crimson::net::error::protocol_aborted)); +} + +#define ABORT_IN_CLOSE(is_dispatch_reset) { \ + do_close(is_dispatch_reset); \ + abort_protocol(); \ +} + +inline void expect_tag(const Tag& expected, + const Tag& actual, + crimson::net::SocketConnection& conn, + const char *where) { + if (actual != expected) { + logger().warn("{} {} received wrong tag: {}, expected {}", + conn, where, + static_cast<uint32_t>(actual), + static_cast<uint32_t>(expected)); + abort_in_fault(); + } +} + +inline void unexpected_tag(const Tag& unexpected, + crimson::net::SocketConnection& conn, + const char *where) { + logger().warn("{} {} received unexpected tag: {}", + conn, where, static_cast<uint32_t>(unexpected)); + abort_in_fault(); +} + +inline uint64_t generate_client_cookie() { + return ceph::util::generate_random_number<uint64_t>( + 1, std::numeric_limits<uint64_t>::max()); +} + +} // namespace anonymous + +namespace crimson::net { + +seastar::future<> ProtocolV2::Timer::backoff(double seconds) +{ + logger().warn("{} waiting {} seconds ...", conn, seconds); + cancel(); + last_dur_ = seconds; + as = seastar::abort_source(); + auto dur = std::chrono::duration_cast<seastar::lowres_clock::duration>( + std::chrono::duration<double>(seconds)); + return seastar::sleep_abortable(dur, *as + ).handle_exception_type([this] (const seastar::sleep_aborted& e) { + logger().debug("{} wait aborted", conn); + abort_protocol(); + }); +} + +ProtocolV2::ProtocolV2(SocketConnection& conn, + IOHandler &io_handler) + : conn{conn}, + messenger{conn.messenger}, + io_handler{io_handler}, + frame_assembler{FrameAssemblerV2::create(conn)}, + auth_meta{seastar::make_lw_shared<AuthConnectionMeta>()}, + protocol_timer{conn} +{ + io_states = io_handler.get_states(); +} + +ProtocolV2::~ProtocolV2() {} + +void ProtocolV2::start_connect(const entity_addr_t& _peer_addr, + const entity_name_t& _peer_name) +{ + assert(seastar::this_shard_id() == conn.get_messenger_shard_id()); + ceph_assert(state == state_t::NONE); + ceph_assert(!gate.is_closed()); + conn.peer_addr = _peer_addr; + conn.target_addr = _peer_addr; + conn.set_peer_name(_peer_name); + conn.policy = messenger.get_policy(_peer_name.type()); + client_cookie = generate_client_cookie(); + logger().info("{} ProtocolV2::start_connect(): peer_addr={}, peer_name={}, cc={}" + " policy(lossy={}, server={}, standby={}, resetcheck={})", + conn, _peer_addr, _peer_name, client_cookie, + conn.policy.lossy, conn.policy.server, + conn.policy.standby, conn.policy.resetcheck); + messenger.register_conn( + seastar::static_pointer_cast<SocketConnection>(conn.shared_from_this())); + execute_connecting(); +} + +void ProtocolV2::start_accept(SocketFRef&& new_socket, + const entity_addr_t& _peer_addr) +{ + assert(seastar::this_shard_id() == conn.get_messenger_shard_id()); + ceph_assert(state == state_t::NONE); + // until we know better + conn.target_addr = _peer_addr; + frame_assembler->set_socket(std::move(new_socket)); + has_socket = true; + is_socket_valid = true; + logger().info("{} ProtocolV2::start_accept(): target_addr={}", conn, _peer_addr); + messenger.accept_conn( + seastar::static_pointer_cast<SocketConnection>(conn.shared_from_this())); + + auto cc_seq = crosscore.prepare_submit(); + 
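+  // hand the shard that owns the accepted socket over to the IOHandler
+  // (which may run on a different shard); cc_seq keeps this submission
+  // ordered among the connection's other cross-core IOHandler calls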
gate.dispatch_in_background("set_accepted_sid", conn, [this, cc_seq] { + return io_handler.set_accepted_sid( + cc_seq, + frame_assembler->get_socket_shard_id(), + seastar::make_foreign(conn.shared_from_this())); + }); + + execute_accepting(); +} + +void ProtocolV2::trigger_state_phase1(state_t new_state) +{ + ceph_assert_always(!gate.is_closed()); + if (new_state == state) { + logger().error("{} is not allowed to re-trigger state {}", + conn, get_state_name(state)); + ceph_abort(); + } + if (state == state_t::CLOSING) { + logger().error("{} CLOSING is not allowed to trigger state {}", + conn, get_state_name(new_state)); + ceph_abort(); + } + logger().debug("{} TRIGGER {}, was {}", + conn, get_state_name(new_state), get_state_name(state)); + + if (state == state_t::READY) { + // from READY + ceph_assert_always(!need_exit_io); + ceph_assert_always(!pr_exit_io.has_value()); + need_exit_io = true; + pr_exit_io = seastar::shared_promise<>(); + } + + if (new_state == state_t::STANDBY && !conn.policy.server) { + need_notify_out = true; + } else { + need_notify_out = false; + } + + state = new_state; +} + +void ProtocolV2::trigger_state_phase2( + state_t new_state, io_state_t new_io_state) +{ + ceph_assert_always(new_state == state); + ceph_assert_always(!gate.is_closed()); + ceph_assert_always(!pr_switch_io_shard.has_value()); + + FrameAssemblerV2Ref fa; + if (new_state == state_t::READY) { + assert(new_io_state == io_state_t::open); + assert(io_handler.get_shard_id() == + frame_assembler->get_socket_shard_id()); + frame_assembler->set_shard_id(io_handler.get_shard_id()); + fa = std::move(frame_assembler); + } else { + assert(new_io_state != io_state_t::open); + } + + auto cc_seq = crosscore.prepare_submit(); + logger().debug("{} send {} IOHandler::set_io_state(): new_state={}, new_io_state={}, " + "fa={}, set_notify_out={}", + conn, cc_seq, get_state_name(new_state), new_io_state, + fa ? 
fmt::format("(sid={})", fa->get_shard_id()) : "N/A", + need_notify_out); + gate.dispatch_in_background( + "set_io_state", conn, + [this, cc_seq, new_io_state, fa=std::move(fa)]() mutable { + return seastar::smp::submit_to( + io_handler.get_shard_id(), + [this, cc_seq, new_io_state, + fa=std::move(fa), set_notify_out=need_notify_out]() mutable { + return io_handler.set_io_state( + cc_seq, new_io_state, std::move(fa), set_notify_out); + }); + }); + + if (need_exit_io) { + // from READY + auto cc_seq = crosscore.prepare_submit(); + logger().debug("{} send {} IOHandler::wait_io_exit_dispatching() ...", + conn, cc_seq); + assert(pr_exit_io.has_value()); + assert(new_io_state != io_state_t::open); + need_exit_io = false; + gate.dispatch_in_background("exit_io", conn, [this, cc_seq] { + return seastar::smp::submit_to( + io_handler.get_shard_id(), [this, cc_seq] { + return io_handler.wait_io_exit_dispatching(cc_seq); + }).then([this, cc_seq](auto ret) { + logger().debug("{} finish {} IOHandler::wait_io_exit_dispatching(), {}", + conn, cc_seq, ret.io_states); + frame_assembler = std::move(ret.frame_assembler); + assert(seastar::this_shard_id() == conn.get_messenger_shard_id()); + ceph_assert_always( + seastar::this_shard_id() == frame_assembler->get_shard_id()); + ceph_assert_always(!frame_assembler->is_socket_valid()); + assert(!need_exit_io); + io_states = ret.io_states; + pr_exit_io->set_value(); + pr_exit_io = std::nullopt; + }); + }); + } +} + +void ProtocolV2::fault( + state_t expected_state, + const char *where, + std::exception_ptr eptr) +{ + assert(expected_state == state_t::CONNECTING || + expected_state == state_t::ESTABLISHING || + expected_state == state_t::REPLACING || + expected_state == state_t::READY); + const char *e_what; + try { + std::rethrow_exception(eptr); + } catch (std::exception &e) { + e_what = e.what(); + } + + if (state != expected_state) { + logger().info("{} protocol {} {} is aborted at inconsistent {} -- {}", + conn, + get_state_name(expected_state), + where, + get_state_name(state), + e_what); +#ifndef NDEBUG + if (expected_state == state_t::REPLACING) { + assert(state == state_t::CLOSING); + } else if (expected_state == state_t::READY) { + assert(state == state_t::CLOSING || + state == state_t::REPLACING || + state == state_t::CONNECTING || + state == state_t::STANDBY); + } else { + assert(state == state_t::CLOSING || + state == state_t::REPLACING); + } +#endif + return; + } + assert(state == expected_state); + + if (state != state_t::CONNECTING && conn.policy.lossy) { + // socket will be shutdown in do_close() + logger().info("{} protocol {} {} fault on lossy channel, going to CLOSING -- {}", + conn, get_state_name(state), where, e_what); + do_close(true); + return; + } + + if (likely(has_socket)) { + if (likely(is_socket_valid)) { + ceph_assert_always(state != state_t::READY); + frame_assembler->shutdown_socket<true>(&gate); + is_socket_valid = false; + } else { + ceph_assert_always(state != state_t::ESTABLISHING); + } + } else { // !has_socket + ceph_assert_always(state == state_t::CONNECTING); + assert(!is_socket_valid); + } + + if (conn.policy.server || + (conn.policy.standby && !io_states.is_out_queued_or_sent())) { + if (conn.policy.server) { + logger().info("{} protocol {} {} fault as server, going to STANDBY {} -- {}", + conn, + get_state_name(state), + where, + io_states, + e_what); + } else { + logger().info("{} protocol {} {} fault with nothing to send, going to STANDBY {} -- {}", + conn, + get_state_name(state), + where, + io_states, + e_what); + } + 
execute_standby(); + } else if (state == state_t::CONNECTING || + state == state_t::REPLACING) { + logger().info("{} protocol {} {} fault, going to WAIT {} -- {}", + conn, + get_state_name(state), + where, + io_states, + e_what); + execute_wait(false); + } else { + assert(state == state_t::READY || + state == state_t::ESTABLISHING); + logger().info("{} protocol {} {} fault, going to CONNECTING {} -- {}", + conn, + get_state_name(state), + where, + io_states, + e_what); + execute_connecting(); + } +} + +void ProtocolV2::reset_session(bool full) +{ + server_cookie = 0; + connect_seq = 0; + if (full) { + client_cookie = generate_client_cookie(); + peer_global_seq = 0; + } + + auto cc_seq = crosscore.prepare_submit(); + logger().debug("{} send {} IOHandler::reset_session({})", + conn, cc_seq, full); + io_states.reset_session(full); + gate.dispatch_in_background( + "reset_session", conn, [this, cc_seq, full] { + return seastar::smp::submit_to( + io_handler.get_shard_id(), [this, cc_seq, full] { + return io_handler.reset_session(cc_seq, full); + }); + }); + // user can make changes +} + +seastar::future<std::tuple<entity_type_t, entity_addr_t>> +ProtocolV2::banner_exchange(bool is_connect) +{ + // 1. prepare and send banner + bufferlist banner_payload; + encode((uint64_t)CRIMSON_MSGR2_SUPPORTED_FEATURES, banner_payload, 0); + encode((uint64_t)CEPH_MSGR2_REQUIRED_FEATURES, banner_payload, 0); + + bufferlist bl; + bl.append(CEPH_BANNER_V2_PREFIX, strlen(CEPH_BANNER_V2_PREFIX)); + auto len_payload = static_cast<uint16_t>(banner_payload.length()); + encode(len_payload, bl, 0); + bl.claim_append(banner_payload); + logger().debug("{} SEND({}) banner: len_payload={}, supported={}, " + "required={}, banner=\"{}\"", + conn, bl.length(), len_payload, + CRIMSON_MSGR2_SUPPORTED_FEATURES, + CEPH_MSGR2_REQUIRED_FEATURES, + CEPH_BANNER_V2_PREFIX); +#ifdef UNIT_TESTS_BUILT + return frame_assembler->intercept_frame(custom_bp_t::BANNER_WRITE, true + ).then([this, bl=std::move(bl)]() mutable { + return frame_assembler->write_flush(std::move(bl)); + } +#else + return frame_assembler->write_flush(std::move(bl) +#endif + ).then([this] { + // 2. read peer banner + unsigned banner_len = strlen(CEPH_BANNER_V2_PREFIX) + sizeof(ceph_le16); +#ifdef UNIT_TESTS_BUILT + return frame_assembler->intercept_frame(custom_bp_t::BANNER_READ, false + ).then([this, banner_len] { + return frame_assembler->read_exactly(banner_len); + }); +#else + return frame_assembler->read_exactly(banner_len); +#endif + }).then([this](auto bptr) { + // 3. 
process peer banner and read banner_payload + unsigned banner_prefix_len = strlen(CEPH_BANNER_V2_PREFIX); + logger().debug("{} RECV({}) banner: \"{}\"", + conn, bptr.length(), + std::string(bptr.c_str(), banner_prefix_len)); + + if (memcmp(bptr.c_str(), CEPH_BANNER_V2_PREFIX, banner_prefix_len) != 0) { + if (memcmp(bptr.c_str(), CEPH_BANNER, strlen(CEPH_BANNER)) == 0) { + logger().warn("{} peer is using V1 protocol", conn); + } else { + logger().warn("{} peer sent bad banner", conn); + } + abort_in_fault(); + } + + bptr.set_offset(bptr.offset() + banner_prefix_len); + bptr.set_length(bptr.length() - banner_prefix_len); + assert(bptr.length() == sizeof(ceph_le16)); + + uint16_t payload_len; + bufferlist buf; + buf.append(std::move(bptr)); + auto ti = buf.cbegin(); + try { + decode(payload_len, ti); + } catch (const buffer::error &e) { + logger().warn("{} decode banner payload len failed", conn); + abort_in_fault(); + } + logger().debug("{} GOT banner: payload_len={}", conn, payload_len); +#ifdef UNIT_TESTS_BUILT + return frame_assembler->intercept_frame( + custom_bp_t::BANNER_PAYLOAD_READ, false + ).then([this, payload_len] { + return frame_assembler->read(payload_len); + }); +#else + return frame_assembler->read(payload_len); +#endif + }).then([this, is_connect] (bufferlist bl) { + // 4. process peer banner_payload and send HelloFrame + auto p = bl.cbegin(); + uint64_t _peer_supported_features; + uint64_t _peer_required_features; + try { + decode(_peer_supported_features, p); + decode(_peer_required_features, p); + } catch (const buffer::error &e) { + logger().warn("{} decode banner payload failed", conn); + abort_in_fault(); + } + logger().debug("{} RECV({}) banner features: supported={} required={}", + conn, bl.length(), + _peer_supported_features, _peer_required_features); + + // Check feature bit compatibility + uint64_t supported_features = CRIMSON_MSGR2_SUPPORTED_FEATURES; + uint64_t required_features = CEPH_MSGR2_REQUIRED_FEATURES; + if ((required_features & _peer_supported_features) != required_features) { + logger().error("{} peer does not support all required features" + " required={} peer_supported={}", + conn, required_features, _peer_supported_features); + ABORT_IN_CLOSE(is_connect); + } + if ((supported_features & _peer_required_features) != _peer_required_features) { + logger().error("{} we do not support all peer required features" + " peer_required={} supported={}", + conn, _peer_required_features, supported_features); + ABORT_IN_CLOSE(is_connect); + } + peer_supported_features = _peer_supported_features; + bool is_rev1 = HAVE_MSGR2_FEATURE(peer_supported_features, REVISION_1); + frame_assembler->set_is_rev1(is_rev1); + + auto hello = HelloFrame::Encode(messenger.get_mytype(), + conn.target_addr); + logger().debug("{} WRITE HelloFrame: my_type={}, peer_addr={}", + conn, ceph_entity_type_name(messenger.get_mytype()), + conn.target_addr); + return frame_assembler->write_flush_frame(hello); + }).then([this] { + //5. read peer HelloFrame + return frame_assembler->read_main_preamble(); + }).then([this](auto ret) { + expect_tag(Tag::HELLO, ret.tag, conn, "read_hello_frame"); + return frame_assembler->read_frame_payload(); + }).then([this](auto payload) { + // 6. 
process peer HelloFrame + auto hello = HelloFrame::Decode(payload->back()); + logger().debug("{} GOT HelloFrame: my_type={} peer_addr={}", + conn, ceph_entity_type_name(hello.entity_type()), + hello.peer_addr()); + return seastar::make_ready_future<std::tuple<entity_type_t, entity_addr_t>>( + std::make_tuple(hello.entity_type(), hello.peer_addr())); + }); +} + +// CONNECTING state + +seastar::future<> ProtocolV2::handle_auth_reply() +{ + return frame_assembler->read_main_preamble( + ).then([this](auto ret) { + switch (ret.tag) { + case Tag::AUTH_BAD_METHOD: + return frame_assembler->read_frame_payload( + ).then([this](auto payload) { + // handle_auth_bad_method() logic + auto bad_method = AuthBadMethodFrame::Decode(payload->back()); + logger().warn("{} GOT AuthBadMethodFrame: method={} result={}, " + "allowed_methods={}, allowed_modes={}", + conn, bad_method.method(), cpp_strerror(bad_method.result()), + bad_method.allowed_methods(), bad_method.allowed_modes()); + ceph_assert(messenger.get_auth_client()); + int r = messenger.get_auth_client()->handle_auth_bad_method( + conn, *auth_meta, + bad_method.method(), bad_method.result(), + bad_method.allowed_methods(), bad_method.allowed_modes()); + if (r < 0) { + logger().warn("{} auth_client handle_auth_bad_method returned {}", + conn, r); + abort_in_fault(); + } + return client_auth(bad_method.allowed_methods()); + }); + case Tag::AUTH_REPLY_MORE: + return frame_assembler->read_frame_payload( + ).then([this](auto payload) { + // handle_auth_reply_more() logic + auto auth_more = AuthReplyMoreFrame::Decode(payload->back()); + logger().debug("{} GOT AuthReplyMoreFrame: payload_len={}", + conn, auth_more.auth_payload().length()); + ceph_assert(messenger.get_auth_client()); + // let execute_connecting() take care of the thrown exception + auto reply = messenger.get_auth_client()->handle_auth_reply_more( + conn, *auth_meta, auth_more.auth_payload()); + auto more_reply = AuthRequestMoreFrame::Encode(reply); + logger().debug("{} WRITE AuthRequestMoreFrame: payload_len={}", + conn, reply.length()); + return frame_assembler->write_flush_frame(more_reply); + }).then([this] { + return handle_auth_reply(); + }); + case Tag::AUTH_DONE: + return frame_assembler->read_frame_payload( + ).then([this](auto payload) { + // handle_auth_done() logic + auto auth_done = AuthDoneFrame::Decode(payload->back()); + logger().debug("{} GOT AuthDoneFrame: gid={}, con_mode={}, payload_len={}", + conn, auth_done.global_id(), + ceph_con_mode_name(auth_done.con_mode()), + auth_done.auth_payload().length()); + ceph_assert(messenger.get_auth_client()); + int r = messenger.get_auth_client()->handle_auth_done( + conn, + *auth_meta, + auth_done.global_id(), + auth_done.con_mode(), + auth_done.auth_payload()); + if (r < 0) { + logger().warn("{} auth_client handle_auth_done returned {}", conn, r); + abort_in_fault(); + } + auth_meta->con_mode = auth_done.con_mode(); + frame_assembler->create_session_stream_handlers(*auth_meta, false); + return finish_auth(); + }); + default: { + unexpected_tag(ret.tag, conn, "handle_auth_reply"); + return seastar::now(); + } + } + }); +} + +seastar::future<> ProtocolV2::client_auth(std::vector<uint32_t> &allowed_methods) +{ + // send_auth_request() logic + ceph_assert(messenger.get_auth_client()); + + try { + auto [auth_method, preferred_modes, bl] = + messenger.get_auth_client()->get_auth_request(conn, *auth_meta); + auth_meta->auth_method = auth_method; + auto frame = AuthRequestFrame::Encode(auth_method, preferred_modes, bl); + logger().debug("{} 
WRITE AuthRequestFrame: method={}," + " preferred_modes={}, payload_len={}", + conn, auth_method, preferred_modes, bl.length()); + return frame_assembler->write_flush_frame(frame + ).then([this] { + return handle_auth_reply(); + }); + } catch (const crimson::auth::error& e) { + logger().error("{} get_initial_auth_request returned {}", conn, e.what()); + ABORT_IN_CLOSE(true); + return seastar::now(); + } +} + +seastar::future<ProtocolV2::next_step_t> +ProtocolV2::process_wait() +{ + return frame_assembler->read_frame_payload( + ).then([this](auto payload) { + // handle_wait() logic + logger().debug("{} GOT WaitFrame", conn); + WaitFrame::Decode(payload->back()); + return next_step_t::wait; + }); +} + +seastar::future<ProtocolV2::next_step_t> +ProtocolV2::client_connect() +{ + // send_client_ident() logic + uint64_t flags = 0; + if (conn.policy.lossy) { + flags |= CEPH_MSG_CONNECT_LOSSY; + } + + auto client_ident = ClientIdentFrame::Encode( + messenger.get_myaddrs(), + conn.target_addr, + messenger.get_myname().num(), + global_seq, + conn.policy.features_supported, + conn.policy.features_required | msgr2_required, flags, + client_cookie); + + logger().debug("{} WRITE ClientIdentFrame: addrs={}, target={}, gid={}," + " gs={}, features_supported={}, features_required={}," + " flags={}, cookie={}", + conn, messenger.get_myaddrs(), conn.target_addr, + messenger.get_myname().num(), global_seq, + conn.policy.features_supported, + conn.policy.features_required | msgr2_required, + flags, client_cookie); + return frame_assembler->write_flush_frame(client_ident + ).then([this] { + return frame_assembler->read_main_preamble(); + }).then([this](auto ret) { + switch (ret.tag) { + case Tag::IDENT_MISSING_FEATURES: + return frame_assembler->read_frame_payload( + ).then([this](auto payload) { + // handle_ident_missing_features() logic + auto ident_missing = IdentMissingFeaturesFrame::Decode(payload->back()); + logger().warn("{} GOT IdentMissingFeaturesFrame: features={}" + " (client does not support all server features)", + conn, ident_missing.features()); + abort_in_fault(); + return next_step_t::none; + }); + case Tag::WAIT: + return process_wait(); + case Tag::SERVER_IDENT: + return frame_assembler->read_frame_payload( + ).then([this](auto payload) { + if (unlikely(state != state_t::CONNECTING)) { + logger().debug("{} triggered {} at receiving SERVER_IDENT", + conn, get_state_name(state)); + abort_protocol(); + } + + // handle_server_ident() logic + auto cc_seq = crosscore.prepare_submit(); + logger().debug("{} send {} IOHandler::requeue_out_sent()", + conn, cc_seq); + io_states.requeue_out_sent(); + gate.dispatch_in_background( + "requeue_out_sent", conn, [this, cc_seq] { + return seastar::smp::submit_to( + io_handler.get_shard_id(), [this, cc_seq] { + return io_handler.requeue_out_sent(cc_seq); + }); + }); + + auto server_ident = ServerIdentFrame::Decode(payload->back()); + logger().debug("{} GOT ServerIdentFrame:" + " addrs={}, gid={}, gs={}," + " features_supported={}, features_required={}," + " flags={}, cookie={}", + conn, + server_ident.addrs(), server_ident.gid(), + server_ident.global_seq(), + server_ident.supported_features(), + server_ident.required_features(), + server_ident.flags(), server_ident.cookie()); + + // is this who we intended to talk to? + // be a bit forgiving here, since we may be connecting based on addresses parsed out + // of mon_host or something. 
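+        // so only require that the advertised address vector contains the
+        // address we dialed, rather than demanding an exact match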
+ if (!server_ident.addrs().contains(conn.target_addr)) { + logger().warn("{} peer identifies as {}, does not include {}", + conn, server_ident.addrs(), conn.target_addr); + throw std::system_error( + make_error_code(crimson::net::error::bad_peer_address)); + } + + server_cookie = server_ident.cookie(); + + // TODO: change peer_addr to entity_addrvec_t + if (server_ident.addrs().front() != conn.peer_addr) { + logger().warn("{} peer advertises as {}, does not match {}", + conn, server_ident.addrs(), conn.peer_addr); + throw std::system_error( + make_error_code(crimson::net::error::bad_peer_address)); + } + if (conn.get_peer_id() != entity_name_t::NEW && + conn.get_peer_id() != server_ident.gid()) { + logger().error("{} connection peer id ({}) does not match " + "what it should be ({}) during connecting, close", + conn, server_ident.gid(), conn.get_peer_id()); + ABORT_IN_CLOSE(true); + } + conn.set_peer_id(server_ident.gid()); + conn.set_features(server_ident.supported_features() & + conn.policy.features_supported); + logger().debug("{} UPDATE: features={}", conn, conn.get_features()); + peer_global_seq = server_ident.global_seq(); + + bool lossy = server_ident.flags() & CEPH_MSG_CONNECT_LOSSY; + if (lossy != conn.policy.lossy) { + logger().warn("{} UPDATE Policy(lossy={}) from server flags", conn, lossy); + conn.policy.lossy = lossy; + } + if (lossy && (connect_seq != 0 || server_cookie != 0)) { + logger().warn("{} UPDATE cs=0({}) sc=0({}) for lossy policy", + conn, connect_seq, server_cookie); + connect_seq = 0; + server_cookie = 0; + } + + return seastar::make_ready_future<next_step_t>(next_step_t::ready); + }); + default: { + unexpected_tag(ret.tag, conn, "post_client_connect"); + return seastar::make_ready_future<next_step_t>(next_step_t::none); + } + } + }); +} + +seastar::future<ProtocolV2::next_step_t> +ProtocolV2::client_reconnect() +{ + // send_reconnect() logic + auto reconnect = ReconnectFrame::Encode(messenger.get_myaddrs(), + client_cookie, + server_cookie, + global_seq, + connect_seq, + io_states.in_seq); + logger().debug("{} WRITE ReconnectFrame: addrs={}, client_cookie={}," + " server_cookie={}, gs={}, cs={}, in_seq={}", + conn, messenger.get_myaddrs(), + client_cookie, server_cookie, + global_seq, connect_seq, io_states.in_seq); + return frame_assembler->write_flush_frame(reconnect).then([this] { + return frame_assembler->read_main_preamble(); + }).then([this](auto ret) { + switch (ret.tag) { + case Tag::SESSION_RETRY_GLOBAL: + return frame_assembler->read_frame_payload( + ).then([this](auto payload) { + // handle_session_retry_global() logic + auto retry = RetryGlobalFrame::Decode(payload->back()); + logger().warn("{} GOT RetryGlobalFrame: gs={}", + conn, retry.global_seq()); + global_seq = messenger.get_global_seq(retry.global_seq()); + logger().warn("{} UPDATE: gs={} for retry global", conn, global_seq); + return client_reconnect(); + }); + case Tag::SESSION_RETRY: + return frame_assembler->read_frame_payload( + ).then([this](auto payload) { + // handle_session_retry() logic + auto retry = RetryFrame::Decode(payload->back()); + logger().warn("{} GOT RetryFrame: cs={}", + conn, retry.connect_seq()); + connect_seq = retry.connect_seq() + 1; + logger().warn("{} UPDATE: cs={}", conn, connect_seq); + return client_reconnect(); + }); + case Tag::SESSION_RESET: + return frame_assembler->read_frame_payload( + ).then([this](auto payload) { + if (unlikely(state != state_t::CONNECTING)) { + logger().debug("{} triggered {} before reset_session()", + conn, get_state_name(state)); + 
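+          // the connection left CONNECTING while the frame was being read;
+          // abandon this handshake attempt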
abort_protocol(); + } + // handle_session_reset() logic + auto reset = ResetFrame::Decode(payload->back()); + logger().warn("{} GOT ResetFrame: full={}", conn, reset.full()); + + reset_session(reset.full()); + // user can make changes + + return client_connect(); + }); + case Tag::WAIT: + return process_wait(); + case Tag::SESSION_RECONNECT_OK: + return frame_assembler->read_frame_payload( + ).then([this](auto payload) { + if (unlikely(state != state_t::CONNECTING)) { + logger().debug("{} triggered {} at receiving RECONNECT_OK", + conn, get_state_name(state)); + abort_protocol(); + } + + // handle_reconnect_ok() logic + auto reconnect_ok = ReconnectOkFrame::Decode(payload->back()); + auto cc_seq = crosscore.prepare_submit(); + logger().debug("{} GOT ReconnectOkFrame: msg_seq={}, " + "send {} IOHandler::requeue_out_sent_up_to()", + conn, reconnect_ok.msg_seq(), cc_seq); + + io_states.requeue_out_sent_up_to(); + auto msg_seq = reconnect_ok.msg_seq(); + gate.dispatch_in_background( + "requeue_out_reconnecting", conn, [this, cc_seq, msg_seq] { + return seastar::smp::submit_to( + io_handler.get_shard_id(), [this, cc_seq, msg_seq] { + return io_handler.requeue_out_sent_up_to(cc_seq, msg_seq); + }); + }); + + return seastar::make_ready_future<next_step_t>(next_step_t::ready); + }); + default: { + unexpected_tag(ret.tag, conn, "post_client_reconnect"); + return seastar::make_ready_future<next_step_t>(next_step_t::none); + } + } + }); +} + +void ProtocolV2::execute_connecting() +{ + ceph_assert_always(!is_socket_valid); + trigger_state(state_t::CONNECTING, io_state_t::delay); + gated_execute("execute_connecting", conn, [this] { + global_seq = messenger.get_global_seq(); + assert(client_cookie != 0); + if (!conn.policy.lossy && server_cookie != 0) { + ++connect_seq; + logger().debug("{} UPDATE: gs={}, cs={} for reconnect", + conn, global_seq, connect_seq); + } else { // conn.policy.lossy || server_cookie == 0 + assert(connect_seq == 0); + assert(server_cookie == 0); + logger().debug("{} UPDATE: gs={} for connect", conn, global_seq); + } + return wait_exit_io().then([this] { +#ifdef UNIT_TESTS_BUILT + // process custom_bp_t::SOCKET_CONNECTING + // supports CONTINUE/FAULT/BLOCK + if (!conn.interceptor) { + return seastar::now(); + } + return conn.interceptor->intercept( + conn, {Breakpoint{custom_bp_t::SOCKET_CONNECTING}} + ).then([this](bp_action_t action) { + switch (action) { + case bp_action_t::CONTINUE: + return seastar::now(); + case bp_action_t::FAULT: + logger().info("[Test] got FAULT"); + abort_in_fault(); + case bp_action_t::BLOCK: + logger().info("[Test] got BLOCK"); + return conn.interceptor->blocker.block(); + default: + ceph_abort("unexpected action from trap"); + return seastar::now(); + } + });; + }).then([this] { +#endif + ceph_assert_always(frame_assembler); + if (unlikely(state != state_t::CONNECTING)) { + logger().debug("{} triggered {} before Socket::connect()", + conn, get_state_name(state)); + abort_protocol(); + } + return Socket::connect(conn.peer_addr); + }).then([this](SocketRef _new_socket) { + logger().debug("{} socket connected", conn); + if (unlikely(state != state_t::CONNECTING)) { + logger().debug("{} triggered {} during Socket::connect()", + conn, get_state_name(state)); + return _new_socket->close().then([sock=std::move(_new_socket)] { + abort_protocol(); + }); + } + SocketFRef new_socket = seastar::make_foreign(std::move(_new_socket)); + if (!has_socket) { + frame_assembler->set_socket(std::move(new_socket)); + has_socket = true; + } else { + 
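+        // reconnect case: a socket from the previous attempt exists (already
+        // shut down by the fault path), so swap in the new one in the background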
gate.dispatch_in_background( + "replace_socket_connecting", + conn, + [this, new_socket=std::move(new_socket)]() mutable { + return frame_assembler->replace_shutdown_socket(std::move(new_socket)); + } + ); + } + is_socket_valid = true; + return seastar::now(); + }).then([this] { + auth_meta = seastar::make_lw_shared<AuthConnectionMeta>(); + frame_assembler->reset_handlers(); + frame_assembler->start_recording(); + return banner_exchange(true); + }).then([this] (auto&& ret) { + auto [_peer_type, _my_addr_from_peer] = std::move(ret); + if (conn.get_peer_type() != _peer_type) { + logger().warn("{} connection peer type does not match what peer advertises {} != {}", + conn, ceph_entity_type_name(conn.get_peer_type()), + ceph_entity_type_name(_peer_type)); + ABORT_IN_CLOSE(true); + } + if (unlikely(state != state_t::CONNECTING)) { + logger().debug("{} triggered {} during banner_exchange(), abort", + conn, get_state_name(state)); + abort_protocol(); + } + frame_assembler->learn_socket_ephemeral_port_as_connector( + _my_addr_from_peer.get_port()); + if (unlikely(_my_addr_from_peer.is_legacy())) { + logger().warn("{} peer sent a legacy address for me: {}", + conn, _my_addr_from_peer); + throw std::system_error( + make_error_code(crimson::net::error::bad_peer_address)); + } + _my_addr_from_peer.set_type(entity_addr_t::TYPE_MSGR2); + messenger.learned_addr(_my_addr_from_peer, conn); + return client_auth(); + }).then([this] { + if (server_cookie == 0) { + ceph_assert(connect_seq == 0); + return client_connect(); + } else { + ceph_assert(connect_seq > 0); + return client_reconnect(); + } + }).then([this] (next_step_t next) { + if (unlikely(state != state_t::CONNECTING)) { + logger().debug("{} triggered {} at the end of execute_connecting()", + conn, get_state_name(state)); + abort_protocol(); + } + switch (next) { + case next_step_t::ready: { + if (unlikely(state != state_t::CONNECTING)) { + logger().debug("{} triggered {} before dispatch_connect(), abort", + conn, get_state_name(state)); + abort_protocol(); + } + + auto cc_seq = crosscore.prepare_submit(); + // there are 2 hops with dispatch_connect() + crosscore.prepare_submit(); + logger().info("{} connected: gs={}, pgs={}, cs={}, " + "client_cookie={}, server_cookie={}, {}, new_sid={}, " + "send {} IOHandler::dispatch_connect()", + conn, global_seq, peer_global_seq, connect_seq, + client_cookie, server_cookie, io_states, + frame_assembler->get_socket_shard_id(), cc_seq); + + // set io_handler to a new shard + auto new_io_shard = frame_assembler->get_socket_shard_id(); + ConnectionFRef conn_fref = seastar::make_foreign( + conn.shared_from_this()); + ceph_assert_always(!pr_switch_io_shard.has_value()); + pr_switch_io_shard = seastar::shared_promise<>(); + return seastar::smp::submit_to( + io_handler.get_shard_id(), + [this, cc_seq, new_io_shard, + conn_fref=std::move(conn_fref)]() mutable { + return io_handler.dispatch_connect( + cc_seq, new_io_shard, std::move(conn_fref)); + }).then([this, new_io_shard] { + ceph_assert_always(io_handler.get_shard_id() == new_io_shard); + pr_switch_io_shard->set_value(); + pr_switch_io_shard = std::nullopt; + // user can make changes + + if (unlikely(state != state_t::CONNECTING)) { + logger().debug("{} triggered {} after dispatch_connect(), abort", + conn, get_state_name(state)); + abort_protocol(); + } + execute_ready(); + }); + } + case next_step_t::wait: { + logger().info("{} execute_connecting(): going to WAIT(max-backoff)", conn); + ceph_assert_always(is_socket_valid); + 
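+      // shut the socket down before entering WAIT with the maximum backoff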
frame_assembler->shutdown_socket<true>(&gate); + is_socket_valid = false; + execute_wait(true); + return seastar::now(); + } + default: { + ceph_abort("impossible next step"); + } + } + }).handle_exception([this](std::exception_ptr eptr) { + fault(state_t::CONNECTING, "execute_connecting", eptr); + }); + }); +} + +// ACCEPTING state + +seastar::future<> ProtocolV2::_auth_bad_method(int r) +{ + // _auth_bad_method() logic + ceph_assert(r < 0); + auto [allowed_methods, allowed_modes] = + messenger.get_auth_server()->get_supported_auth_methods(conn.get_peer_type()); + auto bad_method = AuthBadMethodFrame::Encode( + auth_meta->auth_method, r, allowed_methods, allowed_modes); + logger().warn("{} WRITE AuthBadMethodFrame: method={}, result={}, " + "allowed_methods={}, allowed_modes={})", + conn, auth_meta->auth_method, cpp_strerror(r), + allowed_methods, allowed_modes); + return frame_assembler->write_flush_frame(bad_method + ).then([this] { + return server_auth(); + }); +} + +seastar::future<> ProtocolV2::_handle_auth_request(bufferlist& auth_payload, bool more) +{ + // _handle_auth_request() logic + ceph_assert(messenger.get_auth_server()); + bufferlist reply; + int r = messenger.get_auth_server()->handle_auth_request( + conn, + *auth_meta, + more, + auth_meta->auth_method, + auth_payload, + &conn.peer_global_id, + &reply); + switch (r) { + // successful + case 1: { + auto auth_done = AuthDoneFrame::Encode( + conn.peer_global_id, auth_meta->con_mode, reply); + logger().debug("{} WRITE AuthDoneFrame: gid={}, con_mode={}, payload_len={}", + conn, conn.peer_global_id, + ceph_con_mode_name(auth_meta->con_mode), reply.length()); + return frame_assembler->write_flush_frame(auth_done + ).then([this] { + ceph_assert(auth_meta); + frame_assembler->create_session_stream_handlers(*auth_meta, true); + return finish_auth(); + }); + } + // auth more + case 0: { + auto more = AuthReplyMoreFrame::Encode(reply); + logger().debug("{} WRITE AuthReplyMoreFrame: payload_len={}", + conn, reply.length()); + return frame_assembler->write_flush_frame(more + ).then([this] { + return frame_assembler->read_main_preamble(); + }).then([this](auto ret) { + expect_tag(Tag::AUTH_REQUEST_MORE, ret.tag, conn, "read_auth_request_more"); + return frame_assembler->read_frame_payload(); + }).then([this](auto payload) { + auto auth_more = AuthRequestMoreFrame::Decode(payload->back()); + logger().debug("{} GOT AuthRequestMoreFrame: payload_len={}", + conn, auth_more.auth_payload().length()); + return _handle_auth_request(auth_more.auth_payload(), true); + }); + } + case -EBUSY: { + logger().warn("{} auth_server handle_auth_request returned -EBUSY", conn); + abort_in_fault(); + return seastar::now(); + } + default: { + logger().warn("{} auth_server handle_auth_request returned {}", conn, r); + return _auth_bad_method(r); + } + } +} + +seastar::future<> ProtocolV2::server_auth() +{ + return frame_assembler->read_main_preamble( + ).then([this](auto ret) { + expect_tag(Tag::AUTH_REQUEST, ret.tag, conn, "read_auth_request"); + return frame_assembler->read_frame_payload(); + }).then([this](auto payload) { + // handle_auth_request() logic + auto request = AuthRequestFrame::Decode(payload->back()); + logger().debug("{} GOT AuthRequestFrame: method={}, preferred_modes={}," + " payload_len={}", + conn, request.method(), request.preferred_modes(), + request.auth_payload().length()); + auth_meta->auth_method = request.method(); + auth_meta->con_mode = messenger.get_auth_server()->pick_con_mode( + conn.get_peer_type(), auth_meta->auth_method, + 
request.preferred_modes()); + if (auth_meta->con_mode == CEPH_CON_MODE_UNKNOWN) { + logger().warn("{} auth_server pick_con_mode returned mode CEPH_CON_MODE_UNKNOWN", conn); + return _auth_bad_method(-EOPNOTSUPP); + } + return _handle_auth_request(request.auth_payload(), false); + }); +} + +bool ProtocolV2::validate_peer_name(const entity_name_t& peer_name) const +{ + auto my_peer_name = conn.get_peer_name(); + if (my_peer_name.type() != peer_name.type()) { + return false; + } + if (my_peer_name.num() != entity_name_t::NEW && + peer_name.num() != entity_name_t::NEW && + my_peer_name.num() != peer_name.num()) { + return false; + } + return true; +} + +seastar::future<ProtocolV2::next_step_t> +ProtocolV2::send_wait() +{ + auto wait = WaitFrame::Encode(); + logger().debug("{} WRITE WaitFrame", conn); + return frame_assembler->write_flush_frame(wait + ).then([] { + return next_step_t::wait; + }); +} + +seastar::future<ProtocolV2::next_step_t> +ProtocolV2::reuse_connection( + ProtocolV2* existing_proto, bool do_reset, + bool reconnect, uint64_t conn_seq, uint64_t msg_seq) +{ + if (unlikely(state != state_t::ACCEPTING)) { + logger().debug("{} triggered {} before trigger_replacing()", + conn, get_state_name(state)); + abort_protocol(); + } + + existing_proto->trigger_replacing(reconnect, + do_reset, + frame_assembler->to_replace(), + std::move(auth_meta), + peer_global_seq, + client_cookie, + conn.get_peer_name(), + conn.get_features(), + peer_supported_features, + conn_seq, + msg_seq); + ceph_assert_always(has_socket && is_socket_valid); + is_socket_valid = false; + has_socket = false; +#ifdef UNIT_TESTS_BUILT + if (conn.interceptor) { + conn.interceptor->register_conn_replaced( + conn.get_local_shared_foreign_from_this()); + } +#endif + // close this connection because all the necessary information is delivered + // to the exisiting connection, and jump to error handling code to abort the + // current state. 
+ ABORT_IN_CLOSE(false); + return seastar::make_ready_future<next_step_t>(next_step_t::none); +} + +seastar::future<ProtocolV2::next_step_t> +ProtocolV2::handle_existing_connection(SocketConnectionRef existing_conn) +{ + // handle_existing_connection() logic + ProtocolV2 *existing_proto = dynamic_cast<ProtocolV2*>( + existing_conn->protocol.get()); + ceph_assert(existing_proto); + logger().debug("{}(gs={}, pgs={}, cs={}, cc={}, sc={}) connecting," + " found existing {}(state={}, gs={}, pgs={}, cs={}, cc={}, sc={})", + conn, global_seq, peer_global_seq, connect_seq, + client_cookie, server_cookie, + fmt::ptr(existing_conn.get()), get_state_name(existing_proto->state), + existing_proto->global_seq, + existing_proto->peer_global_seq, + existing_proto->connect_seq, + existing_proto->client_cookie, + existing_proto->server_cookie); + + if (!validate_peer_name(existing_conn->get_peer_name())) { + logger().error("{} server_connect: my peer_name doesn't match" + " the existing connection {}, abort", conn, fmt::ptr(existing_conn.get())); + abort_in_fault(); + } + + if (existing_proto->state == state_t::REPLACING) { + logger().warn("{} server_connect: racing replace happened while" + " replacing existing connection {}, send wait.", + conn, *existing_conn); + return send_wait(); + } + + if (existing_proto->peer_global_seq > peer_global_seq) { + logger().warn("{} server_connect:" + " this is a stale connection, because peer_global_seq({})" + " < existing->peer_global_seq({}), close this connection" + " in favor of existing connection {}", + conn, peer_global_seq, + existing_proto->peer_global_seq, *existing_conn); + abort_in_fault(); + } + + if (existing_conn->policy.lossy) { + // existing connection can be thrown out in favor of this one + logger().warn("{} server_connect:" + " existing connection {} is a lossy channel. Close existing in favor of" + " this connection", conn, *existing_conn); + if (unlikely(state != state_t::ACCEPTING)) { + logger().debug("{} triggered {} before execute_establishing()", + conn, get_state_name(state)); + abort_protocol(); + } + execute_establishing(existing_conn); + return seastar::make_ready_future<next_step_t>(next_step_t::ready); + } + + if (existing_proto->server_cookie != 0) { + if (existing_proto->client_cookie != client_cookie) { + // Found previous session + // peer has reset and we're going to reuse the existing connection + // by replacing the socket + logger().warn("{} server_connect:" + " found new session (cs={})" + " when existing {} {} is with stale session (cs={}, ss={})," + " peer must have reset", + conn, + client_cookie, + get_state_name(existing_proto->state), + *existing_conn, + existing_proto->client_cookie, + existing_proto->server_cookie); + return reuse_connection(existing_proto, conn.policy.resetcheck); + } else { + // session establishment interrupted between client_ident and server_ident, + // continuing... + logger().warn("{} server_connect: found client session with existing {} {}" + " matched (cs={}, ss={}), continuing session establishment", + conn, + get_state_name(existing_proto->state), + *existing_conn, + client_cookie, + existing_proto->server_cookie); + return reuse_connection(existing_proto); + } + } else { + // Looks like a connection race: server and client are both connecting to + // each other at the same time. 
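+  // If the client cookies differ, peer_wins() breaks the tie: the acceptor
+  // either reuses the existing connection or asks the connecting client to wait.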
+ if (existing_proto->client_cookie != client_cookie) { + if (existing_conn->peer_wins()) { + // acceptor (this connection, the peer) wins + logger().warn("{} server_connect: connection race detected (cs={}, e_cs={}, ss=0)" + " and win, reusing existing {} {}", + conn, + client_cookie, + existing_proto->client_cookie, + get_state_name(existing_proto->state), + *existing_conn); + return reuse_connection(existing_proto); + } else { + // acceptor (this connection, the peer) loses + logger().warn("{} server_connect: connection race detected (cs={}, e_cs={}, ss=0)" + " and lose to existing {}, ask client to wait", + conn, client_cookie, existing_proto->client_cookie, *existing_conn); + return existing_conn->send_keepalive().then([this] { + return send_wait(); + }); + } + } else { + logger().warn("{} server_connect: found client session with existing {} {}" + " matched (cs={}, ss={}), continuing session establishment", + conn, + get_state_name(existing_proto->state), + *existing_conn, + client_cookie, + existing_proto->server_cookie); + return reuse_connection(existing_proto); + } + } +} + +seastar::future<ProtocolV2::next_step_t> +ProtocolV2::server_connect() +{ + return frame_assembler->read_frame_payload( + ).then([this](auto payload) { + // handle_client_ident() logic + auto client_ident = ClientIdentFrame::Decode(payload->back()); + logger().debug("{} GOT ClientIdentFrame: addrs={}, target={}," + " gid={}, gs={}, features_supported={}," + " features_required={}, flags={}, cookie={}", + conn, client_ident.addrs(), client_ident.target_addr(), + client_ident.gid(), client_ident.global_seq(), + client_ident.supported_features(), + client_ident.required_features(), + client_ident.flags(), client_ident.cookie()); + + if (client_ident.addrs().empty() || + client_ident.addrs().front() == entity_addr_t()) { + logger().warn("{} oops, client_ident.addrs() is empty", conn); + throw std::system_error( + make_error_code(crimson::net::error::bad_peer_address)); + } + if (!messenger.get_myaddrs().contains(client_ident.target_addr())) { + logger().warn("{} peer is trying to reach {} which is not us ({})", + conn, client_ident.target_addr(), messenger.get_myaddrs()); + throw std::system_error( + make_error_code(crimson::net::error::bad_peer_address)); + } + conn.peer_addr = client_ident.addrs().front(); + logger().debug("{} UPDATE: peer_addr={}", conn, conn.peer_addr); + conn.target_addr = conn.peer_addr; + if (!conn.policy.lossy && !conn.policy.server && conn.target_addr.get_port() <= 0) { + logger().warn("{} we don't know how to reconnect to peer {}", + conn, conn.target_addr); + throw std::system_error( + make_error_code(crimson::net::error::bad_peer_address)); + } + + if (conn.get_peer_id() != entity_name_t::NEW && + conn.get_peer_id() != client_ident.gid()) { + logger().error("{} client_ident peer_id ({}) does not match" + " what it should be ({}) during accepting, abort", + conn, client_ident.gid(), conn.get_peer_id()); + abort_in_fault(); + } + conn.set_peer_id(client_ident.gid()); + client_cookie = client_ident.cookie(); + + uint64_t feat_missing = + (conn.policy.features_required | msgr2_required) & + ~(uint64_t)client_ident.supported_features(); + if (feat_missing) { + auto ident_missing_features = IdentMissingFeaturesFrame::Encode(feat_missing); + logger().warn("{} WRITE IdentMissingFeaturesFrame: features={} (peer missing)", + conn, feat_missing); + return frame_assembler->write_flush_frame(ident_missing_features + ).then([] { + return next_step_t::wait; + }); + } + 
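+    // all required features are present; record the negotiated feature set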
conn.set_features(client_ident.supported_features() & + conn.policy.features_supported); + logger().debug("{} UPDATE: features={}", conn, conn.get_features()); + + peer_global_seq = client_ident.global_seq(); + + bool lossy = client_ident.flags() & CEPH_MSG_CONNECT_LOSSY; + if (lossy != conn.policy.lossy) { + logger().warn("{} my lossy policy {} doesn't match client {}, ignore", + conn, conn.policy.lossy, lossy); + } + + // Looks good so far, let's check if there is already an existing connection + // to this peer. + + SocketConnectionRef existing_conn = messenger.lookup_conn(conn.peer_addr); + + if (existing_conn) { + return handle_existing_connection(existing_conn); + } else { + if (unlikely(state != state_t::ACCEPTING)) { + logger().debug("{} triggered {} before execute_establishing()", + conn, get_state_name(state)); + abort_protocol(); + } + execute_establishing(nullptr); + return seastar::make_ready_future<next_step_t>(next_step_t::ready); + } + }); +} + +seastar::future<ProtocolV2::next_step_t> +ProtocolV2::read_reconnect() +{ + return frame_assembler->read_main_preamble( + ).then([this](auto ret) { + expect_tag(Tag::SESSION_RECONNECT, ret.tag, conn, "read_session_reconnect"); + return server_reconnect(); + }); +} + +seastar::future<ProtocolV2::next_step_t> +ProtocolV2::send_retry(uint64_t connect_seq) +{ + auto retry = RetryFrame::Encode(connect_seq); + logger().warn("{} WRITE RetryFrame: cs={}", conn, connect_seq); + return frame_assembler->write_flush_frame(retry + ).then([this] { + return read_reconnect(); + }); +} + +seastar::future<ProtocolV2::next_step_t> +ProtocolV2::send_retry_global(uint64_t global_seq) +{ + auto retry = RetryGlobalFrame::Encode(global_seq); + logger().warn("{} WRITE RetryGlobalFrame: gs={}", conn, global_seq); + return frame_assembler->write_flush_frame(retry + ).then([this] { + return read_reconnect(); + }); +} + +seastar::future<ProtocolV2::next_step_t> +ProtocolV2::send_reset(bool full) +{ + auto reset = ResetFrame::Encode(full); + logger().warn("{} WRITE ResetFrame: full={}", conn, full); + return frame_assembler->write_flush_frame(reset + ).then([this] { + return frame_assembler->read_main_preamble(); + }).then([this](auto ret) { + expect_tag(Tag::CLIENT_IDENT, ret.tag, conn, "post_send_reset"); + return server_connect(); + }); +} + +seastar::future<ProtocolV2::next_step_t> +ProtocolV2::server_reconnect() +{ + return frame_assembler->read_frame_payload( + ).then([this](auto payload) { + // handle_reconnect() logic + auto reconnect = ReconnectFrame::Decode(payload->back()); + + logger().debug("{} GOT ReconnectFrame: addrs={}, client_cookie={}," + " server_cookie={}, gs={}, cs={}, msg_seq={}", + conn, reconnect.addrs(), + reconnect.client_cookie(), reconnect.server_cookie(), + reconnect.global_seq(), reconnect.connect_seq(), + reconnect.msg_seq()); + + // can peer_addrs be changed on-the-fly? 
+ // TODO: change peer_addr to entity_addrvec_t + entity_addr_t paddr = reconnect.addrs().front(); + if (paddr.is_msgr2() || paddr.is_any()) { + // good + } else { + logger().warn("{} peer's address {} is not v2", conn, paddr); + throw std::system_error( + make_error_code(crimson::net::error::bad_peer_address)); + } + if (conn.peer_addr == entity_addr_t()) { + conn.peer_addr = paddr; + } else if (conn.peer_addr != paddr) { + logger().error("{} peer identifies as {}, while conn.peer_addr={}," + " reconnect failed", + conn, paddr, conn.peer_addr); + throw std::system_error( + make_error_code(crimson::net::error::bad_peer_address)); + } + peer_global_seq = reconnect.global_seq(); + + SocketConnectionRef existing_conn = messenger.lookup_conn(conn.peer_addr); + + if (!existing_conn) { + // there is no existing connection therefore cannot reconnect to previous + // session + logger().warn("{} server_reconnect: no existing connection from address {}," + " reseting client", conn, conn.peer_addr); + return send_reset(true); + } + + ProtocolV2 *existing_proto = dynamic_cast<ProtocolV2*>( + existing_conn->protocol.get()); + ceph_assert(existing_proto); + logger().debug("{}(gs={}, pgs={}, cs={}, cc={}, sc={}) re-connecting," + " found existing {}(state={}, gs={}, pgs={}, cs={}, cc={}, sc={})", + conn, global_seq, peer_global_seq, reconnect.connect_seq(), + reconnect.client_cookie(), reconnect.server_cookie(), + fmt::ptr(existing_conn.get()), + get_state_name(existing_proto->state), + existing_proto->global_seq, + existing_proto->peer_global_seq, + existing_proto->connect_seq, + existing_proto->client_cookie, + existing_proto->server_cookie); + + if (!validate_peer_name(existing_conn->get_peer_name())) { + logger().error("{} server_reconnect: my peer_name doesn't match" + " the existing connection {}, abort", conn, fmt::ptr(existing_conn.get())); + abort_in_fault(); + } + + if (existing_proto->state == state_t::REPLACING) { + logger().warn("{} server_reconnect: racing replace happened while " + " replacing existing connection {}, retry global.", + conn, *existing_conn); + return send_retry_global(existing_proto->peer_global_seq); + } + + if (existing_proto->client_cookie != reconnect.client_cookie()) { + logger().warn("{} server_reconnect:" + " client_cookie mismatch with existing connection {}," + " cc={} rcc={}. I must have reset, reseting client.", + conn, *existing_conn, + existing_proto->client_cookie, reconnect.client_cookie()); + return send_reset(conn.policy.resetcheck); + } else if (existing_proto->server_cookie == 0) { + // this happens when: + // - a connects to b + // - a sends client_ident + // - b gets client_ident, sends server_ident and sets cookie X + // - connection fault + // - b reconnects to a with cookie X, connect_seq=1 + // - a has cookie==0 + logger().warn("{} server_reconnect: I was a client (cc={}) and didn't received the" + " server_ident with existing connection {}." 
+ " Asking peer to resume session establishment", + conn, existing_proto->client_cookie, *existing_conn); + return send_reset(false); + } + + if (existing_proto->peer_global_seq > reconnect.global_seq()) { + logger().warn("{} server_reconnect: stale global_seq: exist_pgs({}) > peer_gs({})," + " with existing connection {}," + " ask client to retry global", + conn, existing_proto->peer_global_seq, + reconnect.global_seq(), *existing_conn); + return send_retry_global(existing_proto->peer_global_seq); + } + + if (existing_proto->connect_seq > reconnect.connect_seq()) { + logger().warn("{} server_reconnect: stale peer connect_seq peer_cs({}) < exist_cs({})," + " with existing connection {}, ask client to retry", + conn, reconnect.connect_seq(), + existing_proto->connect_seq, *existing_conn); + return send_retry(existing_proto->connect_seq); + } else if (existing_proto->connect_seq == reconnect.connect_seq()) { + // reconnect race: both peers are sending reconnect messages + if (existing_conn->peer_wins()) { + // acceptor (this connection, the peer) wins + logger().warn("{} server_reconnect: reconnect race detected (cs={})" + " and win, reusing existing {} {}", + conn, + reconnect.connect_seq(), + get_state_name(existing_proto->state), + *existing_conn); + return reuse_connection( + existing_proto, false, + true, reconnect.connect_seq(), reconnect.msg_seq()); + } else { + // acceptor (this connection, the peer) loses + logger().warn("{} server_reconnect: reconnect race detected (cs={})" + " and lose to existing {}, ask client to wait", + conn, reconnect.connect_seq(), *existing_conn); + return send_wait(); + } + } else { // existing_proto->connect_seq < reconnect.connect_seq() + logger().warn("{} server_reconnect: stale exsiting connect_seq exist_cs({}) < peer_cs({})," + " reusing existing {} {}", + conn, + existing_proto->connect_seq, + reconnect.connect_seq(), + get_state_name(existing_proto->state), + *existing_conn); + return reuse_connection( + existing_proto, false, + true, reconnect.connect_seq(), reconnect.msg_seq()); + } + }); +} + +void ProtocolV2::execute_accepting() +{ + assert(is_socket_valid); + trigger_state(state_t::ACCEPTING, io_state_t::none); + gate.dispatch_in_background("execute_accepting", conn, [this] { + return seastar::futurize_invoke([this] { +#ifdef UNIT_TESTS_BUILT + if (conn.interceptor) { + // only notify socket accepted + gate.dispatch_in_background( + "test_intercept_socket_accepted", conn, [this] { + return conn.interceptor->intercept( + conn, {Breakpoint{custom_bp_t::SOCKET_ACCEPTED}} + ).then([](bp_action_t action) { + ceph_assert(action == bp_action_t::CONTINUE); + }); + }); + } +#endif + auth_meta = seastar::make_lw_shared<AuthConnectionMeta>(); + frame_assembler->reset_handlers(); + frame_assembler->start_recording(); + return banner_exchange(false); + }).then([this] (auto&& ret) { + auto [_peer_type, _my_addr_from_peer] = std::move(ret); + ceph_assert(conn.get_peer_type() == 0); + conn.set_peer_type(_peer_type); + + conn.policy = messenger.get_policy(_peer_type); + logger().info("{} UPDATE: peer_type={}," + " policy(lossy={} server={} standby={} resetcheck={})", + conn, ceph_entity_type_name(_peer_type), + conn.policy.lossy, conn.policy.server, + conn.policy.standby, conn.policy.resetcheck); + if (!messenger.get_myaddr().is_blank_ip() && + (messenger.get_myaddr().get_port() != _my_addr_from_peer.get_port() || + messenger.get_myaddr().get_nonce() != _my_addr_from_peer.get_nonce())) { + logger().warn("{} my_addr_from_peer {} port/nonce doesn't match myaddr 
{}", + conn, _my_addr_from_peer, messenger.get_myaddr()); + throw std::system_error( + make_error_code(crimson::net::error::bad_peer_address)); + } + messenger.learned_addr(_my_addr_from_peer, conn); + return server_auth(); + }).then([this] { + return frame_assembler->read_main_preamble(); + }).then([this](auto ret) { + switch (ret.tag) { + case Tag::CLIENT_IDENT: + return server_connect(); + case Tag::SESSION_RECONNECT: + return server_reconnect(); + default: { + unexpected_tag(ret.tag, conn, "post_server_auth"); + return seastar::make_ready_future<next_step_t>(next_step_t::none); + } + } + }).then([this] (next_step_t next) { + switch (next) { + case next_step_t::ready: + assert(state != state_t::ACCEPTING); + break; + case next_step_t::wait: + if (unlikely(state != state_t::ACCEPTING)) { + logger().debug("{} triggered {} at the end of execute_accepting()", + conn, get_state_name(state)); + abort_protocol(); + } + logger().info("{} execute_accepting(): going to SERVER_WAIT", conn); + execute_server_wait(); + break; + default: + ceph_abort("impossible next step"); + } + }).handle_exception([this](std::exception_ptr eptr) { + const char *e_what; + try { + std::rethrow_exception(eptr); + } catch (std::exception &e) { + e_what = e.what(); + } + logger().info("{} execute_accepting(): fault at {}, going to CLOSING -- {}", + conn, get_state_name(state), e_what); + do_close(false); + }); + }); +} + +// CONNECTING or ACCEPTING state + +seastar::future<> ProtocolV2::finish_auth() +{ + ceph_assert(auth_meta); + + auto records = frame_assembler->stop_recording(); + const auto sig = auth_meta->session_key.empty() ? sha256_digest_t() : + auth_meta->session_key.hmac_sha256(nullptr, records.rxbuf); + auto sig_frame = AuthSignatureFrame::Encode(sig); + logger().debug("{} WRITE AuthSignatureFrame: signature={}", conn, sig); + return frame_assembler->write_flush_frame(sig_frame + ).then([this] { + return frame_assembler->read_main_preamble(); + }).then([this](auto ret) { + expect_tag(Tag::AUTH_SIGNATURE, ret.tag, conn, "post_finish_auth"); + return frame_assembler->read_frame_payload(); + }).then([this, txbuf=std::move(records.txbuf)](auto payload) { + // handle_auth_signature() logic + auto sig_frame = AuthSignatureFrame::Decode(payload->back()); + logger().debug("{} GOT AuthSignatureFrame: signature={}", conn, sig_frame.signature()); + + const auto actual_tx_sig = auth_meta->session_key.empty() ? 
+ sha256_digest_t() : auth_meta->session_key.hmac_sha256(nullptr, txbuf); + if (sig_frame.signature() != actual_tx_sig) { + logger().warn("{} pre-auth signature mismatch actual_tx_sig={}" + " sig_frame.signature()={}", + conn, actual_tx_sig, sig_frame.signature()); + abort_in_fault(); + } + }); +} + +// ESTABLISHING + +void ProtocolV2::execute_establishing(SocketConnectionRef existing_conn) { + auto accept_me = [this] { + messenger.register_conn( + seastar::static_pointer_cast<SocketConnection>( + conn.shared_from_this())); + messenger.unaccept_conn( + seastar::static_pointer_cast<SocketConnection>( + conn.shared_from_this())); + }; + + ceph_assert_always(is_socket_valid); + trigger_state(state_t::ESTABLISHING, io_state_t::delay); + bool is_replace; + if (existing_conn) { + logger().info("{} start establishing: gs={}, pgs={}, cs={}, " + "client_cookie={}, server_cookie={}, {}, new_sid={}, " + "close existing {}", + conn, global_seq, peer_global_seq, connect_seq, + client_cookie, server_cookie, + io_states, frame_assembler->get_socket_shard_id(), + *existing_conn); + is_replace = true; + ProtocolV2 *existing_proto = dynamic_cast<ProtocolV2*>( + existing_conn->protocol.get()); + existing_proto->do_close( + true, // is_dispatch_reset + std::move(accept_me)); + if (unlikely(state != state_t::ESTABLISHING)) { + logger().warn("{} triggered {} during execute_establishing(), " + "the accept event will not be delivered!", + conn, get_state_name(state)); + abort_protocol(); + } + } else { + logger().info("{} start establishing: gs={}, pgs={}, cs={}, " + "client_cookie={}, server_cookie={}, {}, new_sid={}, " + "no existing", + conn, global_seq, peer_global_seq, connect_seq, + client_cookie, server_cookie, io_states, + frame_assembler->get_socket_shard_id()); + is_replace = false; + accept_me(); + } + + gated_execute("execute_establishing", conn, [this, is_replace] { + ceph_assert_always(state == state_t::ESTABLISHING); + + // set io_handler to a new shard + auto cc_seq = crosscore.prepare_submit(); + // there are 2 hops with dispatch_accept() + crosscore.prepare_submit(); + auto new_io_shard = frame_assembler->get_socket_shard_id(); + logger().debug("{} send {} IOHandler::dispatch_accept({})", + conn, cc_seq, new_io_shard); + ConnectionFRef conn_fref = seastar::make_foreign( + conn.shared_from_this()); + ceph_assert_always(!pr_switch_io_shard.has_value()); + pr_switch_io_shard = seastar::shared_promise<>(); + return seastar::smp::submit_to( + io_handler.get_shard_id(), + [this, cc_seq, new_io_shard, is_replace, + conn_fref=std::move(conn_fref)]() mutable { + return io_handler.dispatch_accept( + cc_seq, new_io_shard, std::move(conn_fref), is_replace); + }).then([this, new_io_shard] { + ceph_assert_always(io_handler.get_shard_id() == new_io_shard); + pr_switch_io_shard->set_value(); + pr_switch_io_shard = std::nullopt; + // user can make changes + + if (unlikely(state != state_t::ESTABLISHING)) { + logger().debug("{} triggered {} after dispatch_accept() during execute_establishing()", + conn, get_state_name(state)); + abort_protocol(); + } + + return send_server_ident(); + }).then([this] { + if (unlikely(state != state_t::ESTABLISHING)) { + logger().debug("{} triggered {} at the end of execute_establishing()", + conn, get_state_name(state)); + abort_protocol(); + } + logger().info("{} established, going to ready", conn); + execute_ready(); + }).handle_exception([this](std::exception_ptr eptr) { + fault(state_t::ESTABLISHING, "execute_establishing", eptr); + }); + }); +} + +// ESTABLISHING or REPLACING 
state + +seastar::future<> +ProtocolV2::send_server_ident() +{ + ceph_assert_always(state == state_t::ESTABLISHING || + state == state_t::REPLACING); + // send_server_ident() logic + + // refered to async-conn v2: not assign gs to global_seq + global_seq = messenger.get_global_seq(); + auto cc_seq = crosscore.prepare_submit(); + logger().debug("{} UPDATE: gs={} for server ident, " + "send {} IOHandler::reset_peer_state()", + conn, global_seq, cc_seq); + + // this is required for the case when this connection is being replaced + io_states.reset_peer_state(); + gate.dispatch_in_background( + "reset_peer_state", conn, [this, cc_seq] { + return seastar::smp::submit_to( + io_handler.get_shard_id(), [this, cc_seq] { + return io_handler.reset_peer_state(cc_seq); + }); + }); + + if (!conn.policy.lossy) { + server_cookie = ceph::util::generate_random_number<uint64_t>(1, -1ll); + } + + uint64_t flags = 0; + if (conn.policy.lossy) { + flags = flags | CEPH_MSG_CONNECT_LOSSY; + } + + auto server_ident = ServerIdentFrame::Encode( + messenger.get_myaddrs(), + messenger.get_myname().num(), + global_seq, + conn.policy.features_supported, + conn.policy.features_required | msgr2_required, + flags, + server_cookie); + + logger().debug("{} WRITE ServerIdentFrame: addrs={}, gid={}," + " gs={}, features_supported={}, features_required={}," + " flags={}, cookie={}", + conn, messenger.get_myaddrs(), messenger.get_myname().num(), + global_seq, conn.policy.features_supported, + conn.policy.features_required | msgr2_required, + flags, server_cookie); + + return frame_assembler->write_flush_frame(server_ident); +} + +// REPLACING state + +void ProtocolV2::trigger_replacing(bool reconnect, + bool do_reset, + FrameAssemblerV2::mover_t &&mover, + AuthConnectionMetaRef&& new_auth_meta, + uint64_t new_peer_global_seq, + uint64_t new_client_cookie, + entity_name_t new_peer_name, + uint64_t new_conn_features, + uint64_t new_peer_supported_features, + uint64_t new_connect_seq, + uint64_t new_msg_seq) +{ + ceph_assert_always(state >= state_t::ESTABLISHING); + ceph_assert_always(state <= state_t::WAIT); + ceph_assert_always(has_socket || state == state_t::CONNECTING); + // mover.socket shouldn't be shutdown + + logger().info("{} start replacing ({}): pgs was {}, cs was {}, " + "client_cookie was {}, {}, new_sid={}", + conn, reconnect ? "reconnected" : "connected", + peer_global_seq, connect_seq, client_cookie, + io_states, mover.socket->get_shard_id()); + if (is_socket_valid) { + frame_assembler->shutdown_socket<true>(&gate); + is_socket_valid = false; + } + trigger_state_phase1(state_t::REPLACING); + gate.dispatch_in_background( + "trigger_replacing", + conn, + [this, + reconnect, + do_reset, + mover = std::move(mover), + new_auth_meta = std::move(new_auth_meta), + new_client_cookie, new_peer_name, + new_conn_features, new_peer_supported_features, + new_peer_global_seq, + new_connect_seq, new_msg_seq] () mutable { + ceph_assert_always(state == state_t::REPLACING); + auto new_io_shard = mover.socket->get_shard_id(); + // state may become CLOSING below, but we cannot abort the chain until + // mover.socket is correctly handled (closed or replaced). 
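The invariant stated in the comment above, that the moved-in socket must end up either adopted by this connection or explicitly closed, is a transfer-or-dispose ownership rule. A standalone illustration with a hypothetical socket type (editorial sketch, not from the Ceph tree):

#include <memory>
#include <utility>

namespace sketch {

struct fake_socket {
  void close() { /* release the underlying fd */ }
};

struct connection {
  std::unique_ptr<fake_socket> sock;

  // Adopt the replacement socket, or dispose of it if we are already closing;
  // either way the moved-in socket is never leaked.
  void replace_or_close(std::unique_ptr<fake_socket> incoming, bool closing) {
    if (closing) {
      incoming->close();
      return;
    }
    sock = std::move(incoming);  // the previously owned socket (if any) is destroyed
  }
};

} // namespace sketch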
+ + // this is preemptive + return wait_switch_io_shard( + ).then([this] { + if (unlikely(state != state_t::REPLACING)) { + ceph_assert_always(state == state_t::CLOSING); + return seastar::now(); + } + + trigger_state_phase2(state_t::REPLACING, io_state_t::delay); + return wait_exit_io(); + }).then([this] { + if (unlikely(state != state_t::REPLACING)) { + ceph_assert_always(state == state_t::CLOSING); + return seastar::now(); + } + + ceph_assert_always(frame_assembler); + protocol_timer.cancel(); + auto done = std::move(execution_done); + execution_done = seastar::now(); + return done; + }).then([this, new_io_shard] { + if (unlikely(state != state_t::REPLACING)) { + ceph_assert_always(state == state_t::CLOSING); + return seastar::now(); + } + + // set io_handler to a new shard + // we should prevent parallel switching core attemps + auto cc_seq = crosscore.prepare_submit(); + // there are 2 hops with dispatch_accept() + crosscore.prepare_submit(); + logger().debug("{} send {} IOHandler::dispatch_accept({})", + conn, cc_seq, new_io_shard); + ConnectionFRef conn_fref = seastar::make_foreign( + conn.shared_from_this()); + ceph_assert_always(!pr_switch_io_shard.has_value()); + pr_switch_io_shard = seastar::shared_promise<>(); + return seastar::smp::submit_to( + io_handler.get_shard_id(), + [this, cc_seq, new_io_shard, + conn_fref=std::move(conn_fref)]() mutable { + return io_handler.dispatch_accept( + cc_seq, new_io_shard, std::move(conn_fref), false); + }).then([this, new_io_shard] { + ceph_assert_always(io_handler.get_shard_id() == new_io_shard); + pr_switch_io_shard->set_value(); + pr_switch_io_shard = std::nullopt; + // user can make changes + }); + }).then([this, + reconnect, + do_reset, + mover = std::move(mover), + new_auth_meta = std::move(new_auth_meta), + new_client_cookie, new_peer_name, + new_conn_features, new_peer_supported_features, + new_peer_global_seq, + new_connect_seq, new_msg_seq] () mutable { + if (state == state_t::REPLACING && do_reset) { + reset_session(true); + // user can make changes + } + + if (unlikely(state != state_t::REPLACING)) { + logger().debug("{} triggered {} in the middle of trigger_replacing(), abort", + conn, get_state_name(state)); + ceph_assert_always(state == state_t::CLOSING); + return mover.socket->close( + ).then([sock = std::move(mover.socket)] { + abort_protocol(); + }); + } + + auth_meta = std::move(new_auth_meta); + peer_global_seq = new_peer_global_seq; + gate.dispatch_in_background( + "replace_frame_assembler", + conn, + [this, mover=std::move(mover)]() mutable { + return frame_assembler->replace_by(std::move(mover)); + } + ); + is_socket_valid = true; + has_socket = true; + + if (reconnect) { + connect_seq = new_connect_seq; + // send_reconnect_ok() logic + + auto cc_seq = crosscore.prepare_submit(); + logger().debug("{} send {} IOHandler::requeue_out_sent_up_to({})", + conn, cc_seq, new_msg_seq); + io_states.requeue_out_sent_up_to(); + gate.dispatch_in_background( + "requeue_out_replacing", conn, [this, cc_seq, new_msg_seq] { + return seastar::smp::submit_to( + io_handler.get_shard_id(), [this, cc_seq, new_msg_seq] { + return io_handler.requeue_out_sent_up_to(cc_seq, new_msg_seq); + }); + }); + + auto reconnect_ok = ReconnectOkFrame::Encode(io_states.in_seq); + logger().debug("{} WRITE ReconnectOkFrame: msg_seq={}", conn, io_states.in_seq); + return frame_assembler->write_flush_frame(reconnect_ok); + } else { + client_cookie = new_client_cookie; + assert(conn.get_peer_type() == new_peer_name.type()); + if (conn.get_peer_id() == 
entity_name_t::NEW) { + conn.set_peer_id(new_peer_name.num()); + } + conn.set_features(new_conn_features); + peer_supported_features = new_peer_supported_features; + bool is_rev1 = HAVE_MSGR2_FEATURE(peer_supported_features, REVISION_1); + frame_assembler->set_is_rev1(is_rev1); + return send_server_ident(); + } + }).then([this, reconnect] { + if (unlikely(state != state_t::REPLACING)) { + logger().debug("{} triggered {} at the end of trigger_replacing(), abort", + conn, get_state_name(state)); + ceph_assert_always(state == state_t::CLOSING); + abort_protocol(); + } + logger().info("{} replaced ({}), going to ready: " + "gs={}, pgs={}, cs={}, " + "client_cookie={}, server_cookie={}, {}", + conn, reconnect ? "reconnected" : "connected", + global_seq, peer_global_seq, connect_seq, + client_cookie, server_cookie, io_states); + execute_ready(); + }).handle_exception([this](std::exception_ptr eptr) { + fault(state_t::REPLACING, "trigger_replacing", eptr); + }); + }); +} + +// READY state + +seastar::future<> ProtocolV2::notify_out_fault( + crosscore_t::seq_t cc_seq, + const char *where, + std::exception_ptr eptr, + io_handler_state _io_states) +{ + assert(seastar::this_shard_id() == conn.get_messenger_shard_id()); + if (!crosscore.proceed_or_wait(cc_seq)) { + logger().debug("{} got {} notify_out_fault(), wait at {}", + conn, cc_seq, crosscore.get_in_seq()); + return crosscore.wait(cc_seq + ).then([this, cc_seq, where, eptr, _io_states] { + return notify_out_fault(cc_seq, where, eptr, _io_states); + }); + } + + io_states = _io_states; + logger().debug("{} got {} notify_out_fault(): io_states={}", + conn, cc_seq, io_states); + fault(state_t::READY, where, eptr); + return seastar::now(); +} + +void ProtocolV2::execute_ready() +{ + assert(conn.policy.lossy || (client_cookie != 0 && server_cookie != 0)); + protocol_timer.cancel(); + ceph_assert_always(is_socket_valid); + // I'm not responsible to shutdown the socket at READY + is_socket_valid = false; + trigger_state(state_t::READY, io_state_t::open); +#ifdef UNIT_TESTS_BUILT + if (conn.interceptor) { + // FIXME: doesn't support cross-core + conn.interceptor->register_conn_ready( + conn.get_local_shared_foreign_from_this()); + } +#endif +} + +// STANDBY state + +void ProtocolV2::execute_standby() +{ + ceph_assert_always(!is_socket_valid); + trigger_state(state_t::STANDBY, io_state_t::delay); +} + +seastar::future<> ProtocolV2::notify_out( + crosscore_t::seq_t cc_seq) +{ + assert(seastar::this_shard_id() == conn.get_messenger_shard_id()); + if (!crosscore.proceed_or_wait(cc_seq)) { + logger().debug("{} got {} notify_out(), wait at {}", + conn, cc_seq, crosscore.get_in_seq()); + return crosscore.wait(cc_seq + ).then([this, cc_seq] { + return notify_out(cc_seq); + }); + } + + logger().debug("{} got {} notify_out(): at {}", + conn, cc_seq, get_state_name(state)); + io_states.is_out_queued = true; + if (unlikely(state == state_t::STANDBY && !conn.policy.server)) { + logger().info("{} notify_out(): at {}, going to CONNECTING", + conn, get_state_name(state)); + execute_connecting(); + } + return seastar::now(); +} + +// WAIT state + +void ProtocolV2::execute_wait(bool max_backoff) +{ + ceph_assert_always(!is_socket_valid); + trigger_state(state_t::WAIT, io_state_t::delay); + gated_execute("execute_wait", conn, [this, max_backoff] { + double backoff = protocol_timer.last_dur(); + if (max_backoff) { + backoff = local_conf().get_val<double>("ms_max_backoff"); + } else if (backoff > 0) { + backoff = std::min(local_conf().get_val<double>("ms_max_backoff"), 2 * 
backoff); + } else { + backoff = local_conf().get_val<double>("ms_initial_backoff"); + } + return protocol_timer.backoff(backoff).then([this] { + if (unlikely(state != state_t::WAIT)) { + logger().debug("{} triggered {} at the end of execute_wait()", + conn, get_state_name(state)); + abort_protocol(); + } + logger().info("{} execute_wait(): going to CONNECTING", conn); + execute_connecting(); + }).handle_exception([this](std::exception_ptr eptr) { + const char *e_what; + try { + std::rethrow_exception(eptr); + } catch (std::exception &e) { + e_what = e.what(); + } + logger().info("{} execute_wait(): protocol aborted at {} -- {}", + conn, get_state_name(state), e_what); + assert(state == state_t::REPLACING || + state == state_t::CLOSING); + }); + }); +} + +// SERVER_WAIT state + +void ProtocolV2::execute_server_wait() +{ + ceph_assert_always(is_socket_valid); + trigger_state(state_t::SERVER_WAIT, io_state_t::none); + gated_execute("execute_server_wait", conn, [this] { + return frame_assembler->read_exactly(1 + ).then([this](auto bptr) { + logger().warn("{} SERVER_WAIT got read, abort", conn); + abort_in_fault(); + }).handle_exception([this](std::exception_ptr eptr) { + const char *e_what; + try { + std::rethrow_exception(eptr); + } catch (std::exception &e) { + e_what = e.what(); + } + logger().info("{} execute_server_wait(): fault at {}, going to CLOSING -- {}", + conn, get_state_name(state), e_what); + do_close(false); + }); + }); +} + +// CLOSING state + +seastar::future<> ProtocolV2::notify_mark_down( + crosscore_t::seq_t cc_seq) +{ + assert(seastar::this_shard_id() == conn.get_messenger_shard_id()); + if (!crosscore.proceed_or_wait(cc_seq)) { + logger().debug("{} got {} notify_mark_down(), wait at {}", + conn, cc_seq, crosscore.get_in_seq()); + return crosscore.wait(cc_seq + ).then([this, cc_seq] { + return notify_mark_down(cc_seq); + }); + } + + logger().debug("{} got {} notify_mark_down()", + conn, cc_seq); + do_close(false); + return seastar::now(); +} + +seastar::future<> ProtocolV2::close_clean_yielded() +{ + // yield() so that do_close() can be called *after* close_clean_yielded() is + // applied to all connections in a container using + // seastar::parallel_for_each(). otherwise, we could erase a connection in + // the container when seastar::parallel_for_each() is still iterating in it. + // that'd lead to a segfault. + return seastar::yield( + ).then([this] { + do_close(false); + return pr_closed_clean.get_shared_future(); + + // connection may be unreferenced from the messenger, + // so need to hold the additional reference. + }).finally([conn_ref = conn.shared_from_this()] {});; +} + +void ProtocolV2::do_close( + bool is_dispatch_reset, + std::optional<std::function<void()>> f_accept_new) +{ + if (state == state_t::CLOSING) { + // already closing + return; + } + + bool is_replace = f_accept_new ? true : false; + logger().info("{} closing: reset {}, replace {}", conn, + is_dispatch_reset ? "yes" : "no", + is_replace ? 
"yes" : "no"); + + /* + * atomic operations + */ + + ceph_assert_always(!gate.is_closed()); + + // messenger registrations, must before user events + messenger.closing_conn( + seastar::static_pointer_cast<SocketConnection>( + conn.shared_from_this())); + if (state == state_t::ACCEPTING || state == state_t::SERVER_WAIT) { + messenger.unaccept_conn( + seastar::static_pointer_cast<SocketConnection>( + conn.shared_from_this())); + } else if (state >= state_t::ESTABLISHING && state < state_t::CLOSING) { + messenger.unregister_conn( + seastar::static_pointer_cast<SocketConnection>( + conn.shared_from_this())); + } else { + // cannot happen + ceph_assert(false); + } + if (f_accept_new) { + // the replacing connection must be registerred after the replaced + // connection is unreigsterred. + (*f_accept_new)(); + } + + protocol_timer.cancel(); + if (is_socket_valid) { + frame_assembler->shutdown_socket<true>(&gate); + is_socket_valid = false; + } + + trigger_state_phase1(state_t::CLOSING); + gate.dispatch_in_background( + "close_io", conn, [this, is_dispatch_reset, is_replace] { + // this is preemptive + return wait_switch_io_shard( + ).then([this, is_dispatch_reset, is_replace] { + trigger_state_phase2(state_t::CLOSING, io_state_t::drop); + auto cc_seq = crosscore.prepare_submit(); + logger().debug("{} send {} IOHandler::close_io(reset={}, replace={})", + conn, cc_seq, is_dispatch_reset, is_replace); + + std::ignore = gate.close( + ).then([this] { + ceph_assert_always(!need_exit_io); + ceph_assert_always(!pr_exit_io.has_value()); + if (has_socket) { + ceph_assert_always(frame_assembler); + return frame_assembler->close_shutdown_socket(); + } else { + return seastar::now(); + } + }).then([this] { + logger().debug("{} closed!", conn); + messenger.closed_conn( + seastar::static_pointer_cast<SocketConnection>( + conn.shared_from_this())); + pr_closed_clean.set_value(); +#ifdef UNIT_TESTS_BUILT + closed_clean = true; + if (conn.interceptor) { + conn.interceptor->register_conn_closed( + conn.get_local_shared_foreign_from_this()); + } +#endif + // connection is unreferenced from the messenger, + // so need to hold the additional reference. 
+ }).handle_exception([conn_ref = conn.shared_from_this(), this] (auto eptr) { + logger().error("{} closing got unexpected exception {}", + conn, eptr); + ceph_abort(); + }); + + return seastar::smp::submit_to( + io_handler.get_shard_id(), + [this, cc_seq, is_dispatch_reset, is_replace] { + return io_handler.close_io(cc_seq, is_dispatch_reset, is_replace); + }); + // user can make changes + }); + }); +} + +} // namespace crimson::net diff --git a/src/crimson/net/ProtocolV2.h b/src/crimson/net/ProtocolV2.h new file mode 100644 index 000000000..dd7a1e703 --- /dev/null +++ b/src/crimson/net/ProtocolV2.h @@ -0,0 +1,328 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <seastar/core/shared_future.hh> +#include <seastar/core/sleep.hh> + +#include "io_handler.h" + +namespace crimson::net { + +class ProtocolV2 final : public HandshakeListener { + using AuthConnectionMetaRef = seastar::lw_shared_ptr<AuthConnectionMeta>; + +public: + ProtocolV2(SocketConnection &, + IOHandler &); + + ~ProtocolV2() final; + + ProtocolV2(const ProtocolV2 &) = delete; + ProtocolV2(ProtocolV2 &&) = delete; + ProtocolV2 &operator=(const ProtocolV2 &) = delete; + ProtocolV2 &operator=(ProtocolV2 &&) = delete; + +/** + * as HandshakeListener + */ +private: + seastar::future<> notify_out( + crosscore_t::seq_t cc_seq) final; + + seastar::future<> notify_out_fault( + crosscore_t::seq_t cc_seq, + const char *where, + std::exception_ptr, + io_handler_state) final; + + seastar::future<> notify_mark_down( + crosscore_t::seq_t cc_seq) final; + +/* +* as ProtocolV2 to be called by SocketConnection +*/ +public: + void start_connect(const entity_addr_t& peer_addr, + const entity_name_t& peer_name); + + void start_accept(SocketFRef&& socket, + const entity_addr_t& peer_addr); + + seastar::future<> close_clean_yielded(); + +#ifdef UNIT_TESTS_BUILT + bool is_ready() const { + return state == state_t::READY; + } + + bool is_standby() const { + return state == state_t::STANDBY; + } + + bool is_closed_clean() const { + return closed_clean; + } + + bool is_closed() const { + return state == state_t::CLOSING; + } + +#endif +private: + using io_state_t = IOHandler::io_state_t; + + seastar::future<> wait_switch_io_shard() { + if (pr_switch_io_shard.has_value()) { + return pr_switch_io_shard->get_shared_future(); + } else { + return seastar::now(); + } + } + + seastar::future<> wait_exit_io() { + if (pr_exit_io.has_value()) { + return pr_exit_io->get_shared_future(); + } else { + assert(!need_exit_io); + return seastar::now(); + } + } + + enum class state_t { + NONE = 0, + ACCEPTING, + SERVER_WAIT, + ESTABLISHING, + CONNECTING, + READY, + STANDBY, + WAIT, + REPLACING, + CLOSING + }; + + static const char *get_state_name(state_t state) { + const char *const statenames[] = {"NONE", + "ACCEPTING", + "SERVER_WAIT", + "ESTABLISHING", + "CONNECTING", + "READY", + "STANDBY", + "WAIT", + "REPLACING", + "CLOSING"}; + return statenames[static_cast<int>(state)]; + } + + void trigger_state_phase1(state_t new_state); + + void trigger_state_phase2(state_t new_state, io_state_t new_io_state); + + void trigger_state(state_t new_state, io_state_t new_io_state) { + ceph_assert_always(!pr_switch_io_shard.has_value()); + trigger_state_phase1(new_state); + trigger_state_phase2(new_state, new_io_state); + } + + template <typename Func, typename T> + void gated_execute(const char *what, T &who, Func &&func) { + gate.dispatch_in_background(what, who, [this, &who, &func] { + if 
(!execution_done.available()) { + // discard the unready future + gate.dispatch_in_background( + "gated_execute_abandon", + who, + [fut=std::move(execution_done)]() mutable { + return std::move(fut); + } + ); + } + seastar::promise<> pr; + execution_done = pr.get_future(); + return seastar::futurize_invoke(std::forward<Func>(func) + ).finally([pr=std::move(pr)]() mutable { + pr.set_value(); + }); + }); + } + + void fault(state_t expected_state, + const char *where, + std::exception_ptr eptr); + + void reset_session(bool is_full); + seastar::future<std::tuple<entity_type_t, entity_addr_t>> + banner_exchange(bool is_connect); + + enum class next_step_t { + ready, + wait, + none, // protocol should have been aborted or failed + }; + + // CONNECTING (client) + seastar::future<> handle_auth_reply(); + inline seastar::future<> client_auth() { + std::vector<uint32_t> empty; + return client_auth(empty); + } + seastar::future<> client_auth(std::vector<uint32_t> &allowed_methods); + + seastar::future<next_step_t> process_wait(); + seastar::future<next_step_t> client_connect(); + seastar::future<next_step_t> client_reconnect(); + void execute_connecting(); + + // ACCEPTING (server) + seastar::future<> _auth_bad_method(int r); + seastar::future<> _handle_auth_request(bufferlist& auth_payload, bool more); + seastar::future<> server_auth(); + + bool validate_peer_name(const entity_name_t& peer_name) const; + seastar::future<next_step_t> send_wait(); + seastar::future<next_step_t> reuse_connection(ProtocolV2* existing_proto, + bool do_reset=false, + bool reconnect=false, + uint64_t conn_seq=0, + uint64_t msg_seq=0); + + seastar::future<next_step_t> handle_existing_connection(SocketConnectionRef existing_conn); + seastar::future<next_step_t> server_connect(); + + seastar::future<next_step_t> read_reconnect(); + seastar::future<next_step_t> send_retry(uint64_t connect_seq); + seastar::future<next_step_t> send_retry_global(uint64_t global_seq); + seastar::future<next_step_t> send_reset(bool full); + seastar::future<next_step_t> server_reconnect(); + + void execute_accepting(); + + // CONNECTING/ACCEPTING + seastar::future<> finish_auth(); + + // ESTABLISHING + void execute_establishing(SocketConnectionRef existing_conn); + + // ESTABLISHING/REPLACING (server) + seastar::future<> send_server_ident(); + + // REPLACING (server) + void trigger_replacing(bool reconnect, + bool do_reset, + FrameAssemblerV2::mover_t &&mover, + AuthConnectionMetaRef&& new_auth_meta, + uint64_t new_peer_global_seq, + // !reconnect + uint64_t new_client_cookie, + entity_name_t new_peer_name, + uint64_t new_conn_features, + uint64_t new_peer_supported_features, + // reconnect + uint64_t new_connect_seq, + uint64_t new_msg_seq); + + // READY + void execute_ready(); + + // STANDBY + void execute_standby(); + + // WAIT + void execute_wait(bool max_backoff); + + // SERVER_WAIT + void execute_server_wait(); + + // CLOSING + // reentrant + void do_close(bool is_dispatch_reset, + std::optional<std::function<void()>> f_accept_new=std::nullopt); + +private: + SocketConnection &conn; + + SocketMessenger &messenger; + + IOHandler &io_handler; + + // asynchronously populated from io_handler + io_handler_state io_states; + + crosscore_t crosscore; + + bool has_socket = false; + + // the socket exists and it is not shutdown + bool is_socket_valid = false; + + FrameAssemblerV2Ref frame_assembler; + + bool need_notify_out = false; + + std::optional<seastar::shared_promise<>> pr_switch_io_shard; + + bool need_exit_io = false; + + 
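pr_switch_io_shard above and pr_exit_io below follow the same small pattern: an optional shared_promise acts as a one-shot barrier, handing waiters a ready future when nothing is in flight and a shared future otherwise (see wait_switch_io_shard() and wait_exit_io() earlier in this header). A condensed version of that pattern (editorial sketch; assumes Seastar, names are hypothetical):

#include <optional>

#include <seastar/core/future.hh>
#include <seastar/core/shared_future.hh>

namespace sketch {

class one_shot_barrier {
  std::optional<seastar::shared_promise<>> pr;

public:
  // Join an in-flight transition, or return a ready future if there is none.
  seastar::future<> wait() {
    if (pr.has_value()) {
      return pr->get_shared_future();
    }
    return seastar::make_ready_future<>();
  }

  void begin() { pr.emplace(); }  // a transition starts: new waiters block

  void finish() {                 // transition done: release all waiters
    pr->set_value();
    pr = std::nullopt;
  }
};

} // namespace sketch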
std::optional<seastar::shared_promise<>> pr_exit_io; + + AuthConnectionMetaRef auth_meta; + + crimson::common::Gated gate; + + seastar::shared_promise<> pr_closed_clean; + +#ifdef UNIT_TESTS_BUILT + bool closed_clean = false; + +#endif + state_t state = state_t::NONE; + + uint64_t peer_supported_features = 0; + + uint64_t client_cookie = 0; + uint64_t server_cookie = 0; + uint64_t global_seq = 0; + uint64_t peer_global_seq = 0; + uint64_t connect_seq = 0; + + seastar::future<> execution_done = seastar::now(); + + class Timer { + double last_dur_ = 0.0; + const SocketConnection& conn; + std::optional<seastar::abort_source> as; + public: + Timer(SocketConnection& conn) : conn(conn) {} + double last_dur() const { return last_dur_; } + seastar::future<> backoff(double seconds); + void cancel() { + last_dur_ = 0.0; + if (as) { + as->request_abort(); + as = std::nullopt; + } + } + }; + Timer protocol_timer; +}; + +struct create_handlers_ret { + std::unique_ptr<ConnectionHandler> io_handler; + std::unique_ptr<ProtocolV2> protocol; +}; +inline create_handlers_ret create_handlers(ChainedDispatchers &dispatchers, SocketConnection &conn) { + std::unique_ptr<ConnectionHandler> io_handler = std::make_unique<IOHandler>(dispatchers, conn); + IOHandler &io_handler_concrete = static_cast<IOHandler&>(*io_handler); + auto protocol = std::make_unique<ProtocolV2>(conn, io_handler_concrete); + io_handler_concrete.set_handshake_listener(*protocol); + return {std::move(io_handler), std::move(protocol)}; +} + +} // namespace crimson::net + +#if FMT_VERSION >= 90000 +template <> struct fmt::formatter<crimson::net::ProtocolV2> : fmt::ostream_formatter {}; +#endif diff --git a/src/crimson/net/Socket.cc b/src/crimson/net/Socket.cc new file mode 100644 index 000000000..95b1e2250 --- /dev/null +++ b/src/crimson/net/Socket.cc @@ -0,0 +1,523 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "Socket.h" + +#include <seastar/core/sleep.hh> +#include <seastar/core/when_all.hh> +#include <seastar/net/packet.hh> + +#include "crimson/common/log.h" +#include "Errors.h" + +using crimson::common::local_conf; + +namespace crimson::net { + +namespace { + +seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_ms); +} + +using tmp_buf = seastar::temporary_buffer<char>; +using packet = seastar::net::packet; + +// an input_stream consumer that reads buffer segments into a bufferlist up to +// the given number of remaining bytes +struct bufferlist_consumer { + bufferlist& bl; + size_t& remaining; + + bufferlist_consumer(bufferlist& bl, size_t& remaining) + : bl(bl), remaining(remaining) {} + + using consumption_result_type = typename seastar::input_stream<char>::consumption_result_type; + + // consume some or all of a buffer segment + seastar::future<consumption_result_type> operator()(tmp_buf&& data) { + if (remaining >= data.size()) { + // consume the whole buffer + remaining -= data.size(); + bl.append(buffer::create(std::move(data))); + if (remaining > 0) { + // return none to request more segments + return seastar::make_ready_future<consumption_result_type>( + seastar::continue_consuming{}); + } else { + // return an empty buffer to singal that we're done + return seastar::make_ready_future<consumption_result_type>( + consumption_result_type::stop_consuming_type({})); + } + } + if (remaining > 0) { + // consume the front + bl.append(buffer::create(data.share(0, remaining))); + data.trim_front(remaining); + remaining = 0; + } + // give the rest back to 
signal that we're done + return seastar::make_ready_future<consumption_result_type>( + consumption_result_type::stop_consuming_type{std::move(data)}); + }; +}; + +seastar::future<> inject_delay() +{ + if (float delay_period = local_conf()->ms_inject_internal_delays; + delay_period) { + logger().debug("Socket::inject_delay: sleep for {}", delay_period); + return seastar::sleep( + std::chrono::milliseconds((int)(delay_period * 1000.0))); + } + return seastar::now(); +} + +void inject_failure() +{ + if (local_conf()->ms_inject_socket_failures) { + uint64_t rand = + ceph::util::generate_random_number<uint64_t>(1, RAND_MAX); + if (rand % local_conf()->ms_inject_socket_failures == 0) { + logger().warn("Socket::inject_failure: injecting socket failure"); + throw std::system_error(make_error_code( + error::negotiation_failure)); + } + } +} + +} // anonymous namespace + +Socket::Socket( + seastar::connected_socket &&_socket, + side_t _side, + uint16_t e_port, + construct_tag) + : sid{seastar::this_shard_id()}, + socket(std::move(_socket)), + in(socket.input()), + // the default buffer size 8192 is too small that may impact our write + // performance. see seastar::net::connected_socket::output() + out(socket.output(65536)), + socket_is_shutdown(false), + side(_side), + ephemeral_port(e_port) +{ + if (local_conf()->ms_tcp_nodelay) { + socket.set_nodelay(true); + } +} + +Socket::~Socket() +{ + assert(seastar::this_shard_id() == sid); +#ifndef NDEBUG + assert(closed); +#endif +} + +seastar::future<bufferlist> +Socket::read(size_t bytes) +{ + assert(seastar::this_shard_id() == sid); +#ifdef UNIT_TESTS_BUILT + return try_trap_pre(next_trap_read).then([bytes, this] { +#endif + if (bytes == 0) { + return seastar::make_ready_future<bufferlist>(); + } + r.buffer.clear(); + r.remaining = bytes; + return in.consume(bufferlist_consumer{r.buffer, r.remaining}).then([this] { + if (r.remaining) { // throw on short reads + throw std::system_error(make_error_code(error::read_eof)); + } + inject_failure(); + return inject_delay().then([this] { + return seastar::make_ready_future<bufferlist>(std::move(r.buffer)); + }); + }); +#ifdef UNIT_TESTS_BUILT + }).then([this](auto buf) { + return try_trap_post(next_trap_read + ).then([buf = std::move(buf)]() mutable { + return std::move(buf); + }); + }); +#endif +} + +seastar::future<bufferptr> +Socket::read_exactly(size_t bytes) { + assert(seastar::this_shard_id() == sid); +#ifdef UNIT_TESTS_BUILT + return try_trap_pre(next_trap_read).then([bytes, this] { +#endif + if (bytes == 0) { + return seastar::make_ready_future<bufferptr>(); + } + return in.read_exactly(bytes).then([bytes](auto buf) { + bufferptr ptr(buffer::create(buf.share())); + if (ptr.length() < bytes) { + throw std::system_error(make_error_code(error::read_eof)); + } + inject_failure(); + return inject_delay( + ).then([ptr = std::move(ptr)]() mutable { + return seastar::make_ready_future<bufferptr>(std::move(ptr)); + }); + }); +#ifdef UNIT_TESTS_BUILT + }).then([this](auto ptr) { + return try_trap_post(next_trap_read + ).then([ptr = std::move(ptr)]() mutable { + return std::move(ptr); + }); + }); +#endif +} + +seastar::future<> +Socket::write(bufferlist buf) +{ + assert(seastar::this_shard_id() == sid); +#ifdef UNIT_TESTS_BUILT + return try_trap_pre(next_trap_write + ).then([buf = std::move(buf), this]() mutable { +#endif + inject_failure(); + return inject_delay( + ).then([buf = std::move(buf), this]() mutable { + packet p(std::move(buf)); + return out.write(std::move(p)); + }); +#ifdef UNIT_TESTS_BUILT + 
}).then([this] { + return try_trap_post(next_trap_write); + }); +#endif +} + +seastar::future<> +Socket::flush() +{ + assert(seastar::this_shard_id() == sid); + inject_failure(); + return inject_delay().then([this] { + return out.flush(); + }); +} + +seastar::future<> +Socket::write_flush(bufferlist buf) +{ + assert(seastar::this_shard_id() == sid); +#ifdef UNIT_TESTS_BUILT + return try_trap_pre(next_trap_write + ).then([buf = std::move(buf), this]() mutable { +#endif + inject_failure(); + return inject_delay( + ).then([buf = std::move(buf), this]() mutable { + packet p(std::move(buf)); + return out.write(std::move(p) + ).then([this] { + return out.flush(); + }); + }); +#ifdef UNIT_TESTS_BUILT + }).then([this] { + return try_trap_post(next_trap_write); + }); +#endif +} + +void Socket::shutdown() +{ + assert(seastar::this_shard_id() == sid); + socket_is_shutdown = true; + socket.shutdown_input(); + socket.shutdown_output(); +} + +static inline seastar::future<> +close_and_handle_errors(seastar::output_stream<char>& out) +{ + return out.close().handle_exception_type([](const std::system_error& e) { + if (e.code() != std::errc::broken_pipe && + e.code() != std::errc::connection_reset) { + logger().error("Socket::close(): unexpected error {}", e.what()); + ceph_abort(); + } + // can happen when out is already shutdown, ignore + }); +} + +seastar::future<> +Socket::close() +{ + assert(seastar::this_shard_id() == sid); +#ifndef NDEBUG + ceph_assert_always(!closed); + closed = true; +#endif + return seastar::when_all_succeed( + inject_delay(), + in.close(), + close_and_handle_errors(out) + ).then_unpack([] { + return seastar::make_ready_future<>(); + }).handle_exception([](auto eptr) { + const char *e_what; + try { + std::rethrow_exception(eptr); + } catch (std::exception &e) { + e_what = e.what(); + } + logger().error("Socket::close(): unexpected exception {}", e_what); + ceph_abort(); + }); +} + +seastar::future<SocketRef> +Socket::connect(const entity_addr_t &peer_addr) +{ + inject_failure(); + return inject_delay( + ).then([peer_addr] { + return seastar::connect(peer_addr.in4_addr()); + }).then([peer_addr](seastar::connected_socket socket) { + auto ret = std::make_unique<Socket>( + std::move(socket), side_t::connector, 0, construct_tag{}); + logger().debug("Socket::connect(): connected to {}, socket {}", + peer_addr, fmt::ptr(ret)); + return ret; + }); +} + +#ifdef UNIT_TESTS_BUILT +void Socket::set_trap(bp_type_t type, bp_action_t action, socket_blocker* blocker_) { + assert(seastar::this_shard_id() == sid); + blocker = blocker_; + if (type == bp_type_t::READ) { + ceph_assert_always(next_trap_read == bp_action_t::CONTINUE); + next_trap_read = action; + } else { // type == bp_type_t::WRITE + if (next_trap_write == bp_action_t::CONTINUE) { + next_trap_write = action; + } else if (next_trap_write == bp_action_t::FAULT) { + // do_sweep_messages() may combine multiple write events into one socket write + ceph_assert_always(action == bp_action_t::FAULT || action == bp_action_t::CONTINUE); + } else { + ceph_abort(); + } + } +} + +seastar::future<> +Socket::try_trap_pre(bp_action_t& trap) { + auto action = trap; + trap = bp_action_t::CONTINUE; + switch (action) { + case bp_action_t::CONTINUE: + break; + case bp_action_t::FAULT: + logger().info("[Test] got FAULT"); + throw std::system_error(make_error_code(error::negotiation_failure)); + case bp_action_t::BLOCK: + logger().info("[Test] got BLOCK"); + return blocker->block(); + case bp_action_t::STALL: + trap = action; + break; + default: + 
ceph_abort("unexpected action from trap"); + } + return seastar::make_ready_future<>(); +} + +seastar::future<> +Socket::try_trap_post(bp_action_t& trap) { + auto action = trap; + trap = bp_action_t::CONTINUE; + switch (action) { + case bp_action_t::CONTINUE: + break; + case bp_action_t::STALL: + logger().info("[Test] got STALL and block"); + force_shutdown(); + return blocker->block(); + default: + ceph_abort("unexpected action from trap"); + } + return seastar::make_ready_future<>(); +} +#endif + +ShardedServerSocket::ShardedServerSocket( + seastar::shard_id sid, + bool dispatch_only_on_primary_sid, + construct_tag) + : primary_sid{sid}, dispatch_only_on_primary_sid{dispatch_only_on_primary_sid} +{ +} + +ShardedServerSocket::~ShardedServerSocket() +{ + assert(!listener); + // detect whether user have called destroy() properly + ceph_assert_always(!service); +} + +listen_ertr::future<> +ShardedServerSocket::listen(entity_addr_t addr) +{ + ceph_assert_always(seastar::this_shard_id() == primary_sid); + logger().debug("ShardedServerSocket({})::listen()...", addr); + return this->container().invoke_on_all([addr](auto& ss) { + ss.listen_addr = addr; + seastar::socket_address s_addr(addr.in4_addr()); + seastar::listen_options lo; + lo.reuse_address = true; + if (ss.dispatch_only_on_primary_sid) { + lo.set_fixed_cpu(ss.primary_sid); + } + ss.listener = seastar::listen(s_addr, lo); + }).then([] { + return listen_ertr::now(); + }).handle_exception_type( + [addr](const std::system_error& e) -> listen_ertr::future<> { + if (e.code() == std::errc::address_in_use) { + logger().debug("ShardedServerSocket({})::listen(): address in use", addr); + return crimson::ct_error::address_in_use::make(); + } else if (e.code() == std::errc::address_not_available) { + logger().debug("ShardedServerSocket({})::listen(): address not available", + addr); + return crimson::ct_error::address_not_available::make(); + } + logger().error("ShardedServerSocket({})::listen(): " + "got unexpeted error {}", addr, e.what()); + ceph_abort(); + }); +} + +seastar::future<> +ShardedServerSocket::accept(accept_func_t &&_fn_accept) +{ + ceph_assert_always(seastar::this_shard_id() == primary_sid); + logger().debug("ShardedServerSocket({})::accept()...", listen_addr); + return this->container().invoke_on_all([_fn_accept](auto &ss) { + assert(ss.listener); + ss.fn_accept = _fn_accept; + // gate accepting + // ShardedServerSocket::shutdown() will drain the continuations in the gate + // so ignore the returned future + std::ignore = seastar::with_gate(ss.shutdown_gate, [&ss] { + return seastar::keep_doing([&ss] { + return ss.listener->accept( + ).then([&ss](seastar::accept_result accept_result) { +#ifndef NDEBUG + if (ss.dispatch_only_on_primary_sid) { + // see seastar::listen_options::set_fixed_cpu() + ceph_assert_always(seastar::this_shard_id() == ss.primary_sid); + } +#endif + auto [socket, paddr] = std::move(accept_result); + entity_addr_t peer_addr; + peer_addr.set_sockaddr(&paddr.as_posix_sockaddr()); + peer_addr.set_type(ss.listen_addr.get_type()); + SocketRef _socket = std::make_unique<Socket>( + std::move(socket), Socket::side_t::acceptor, + peer_addr.get_port(), Socket::construct_tag{}); + logger().debug("ShardedServerSocket({})::accept(): accepted peer {}, " + "socket {}, dispatch_only_on_primary_sid = {}", + ss.listen_addr, peer_addr, fmt::ptr(_socket), + ss.dispatch_only_on_primary_sid); + std::ignore = seastar::with_gate( + ss.shutdown_gate, + [socket=std::move(_socket), peer_addr, &ss]() mutable { + return 
ss.fn_accept(std::move(socket), peer_addr + ).handle_exception([&ss, peer_addr](auto eptr) { + const char *e_what; + try { + std::rethrow_exception(eptr); + } catch (std::exception &e) { + e_what = e.what(); + } + logger().error("ShardedServerSocket({})::accept(): " + "fn_accept(s, {}) got unexpected exception {}", + ss.listen_addr, peer_addr, e_what); + ceph_abort(); + }); + }); + }); + }).handle_exception_type([&ss](const std::system_error& e) { + if (e.code() == std::errc::connection_aborted || + e.code() == std::errc::invalid_argument) { + logger().debug("ShardedServerSocket({})::accept(): stopped ({})", + ss.listen_addr, e.what()); + } else { + throw; + } + }).handle_exception([&ss](auto eptr) { + const char *e_what; + try { + std::rethrow_exception(eptr); + } catch (std::exception &e) { + e_what = e.what(); + } + logger().error("ShardedServerSocket({})::accept(): " + "got unexpected exception {}", ss.listen_addr, e_what); + ceph_abort(); + }); + }); + }); +} + +seastar::future<> +ShardedServerSocket::shutdown_destroy() +{ + assert(seastar::this_shard_id() == primary_sid); + logger().debug("ShardedServerSocket({})::shutdown_destroy()...", listen_addr); + // shutdown shards + return this->container().invoke_on_all([](auto& ss) { + if (ss.listener) { + ss.listener->abort_accept(); + } + return ss.shutdown_gate.close(); + }).then([this] { + // destroy shards + return this->container().invoke_on_all([](auto& ss) { + assert(ss.shutdown_gate.is_closed()); + ss.listen_addr = entity_addr_t(); + ss.listener.reset(); + }); + }).then([this] { + // stop the sharded service: we should only construct/stop shards on #0 + return this->container().invoke_on(0, [](auto& ss) { + assert(ss.service); + return ss.service->stop().finally([cleanup = std::move(ss.service)] {}); + }); + }); +} + +seastar::future<ShardedServerSocket*> +ShardedServerSocket::create(bool dispatch_only_on_this_shard) +{ + auto primary_sid = seastar::this_shard_id(); + // start the sharded service: we should only construct/stop shards on #0 + return seastar::smp::submit_to(0, [primary_sid, dispatch_only_on_this_shard] { + auto service = std::make_unique<sharded_service_t>(); + return service->start( + primary_sid, dispatch_only_on_this_shard, construct_tag{} + ).then([service = std::move(service)]() mutable { + auto p_shard = service.get(); + p_shard->local().service = std::move(service); + return p_shard; + }); + }).then([](auto p_shard) { + return &p_shard->local(); + }); +} + +} // namespace crimson::net diff --git a/src/crimson/net/Socket.h b/src/crimson/net/Socket.h new file mode 100644 index 000000000..478f2d630 --- /dev/null +++ b/src/crimson/net/Socket.h @@ -0,0 +1,201 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <seastar/core/gate.hh> +#include <seastar/core/reactor.hh> +#include <seastar/core/sharded.hh> + +#include "include/buffer.h" + +#include "crimson/common/log.h" +#include "Errors.h" +#include "Fwd.h" + +#ifdef UNIT_TESTS_BUILT +#include "Interceptor.h" +#endif + +namespace crimson::net { + +class Socket; +using SocketRef = std::unique_ptr<Socket>; +using SocketFRef = seastar::foreign_ptr<SocketRef>; + +class Socket { + struct construct_tag {}; + +public: + // if acceptor side, peer is using a different port (ephemeral_port) + // if connector side, I'm using a different port (ephemeral_port) + enum class side_t { + acceptor, + connector + }; + Socket(seastar::connected_socket &&, side_t, uint16_t e_port, construct_tag); + + 
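The construct_tag argument in the Socket constructor above is the passkey idiom: the constructor stays public so factory code can hand it to std::make_unique, but only code that can name the private tag type can actually invoke it, which funnels construction through Socket::connect() and the server-side accept path. A minimal standalone example (editorial sketch, not from the Ceph tree):

#include <memory>

namespace sketch {

class widget {
  struct construct_tag {};  // only widget itself can name this type

public:
  explicit widget(construct_tag) {}

  static std::unique_ptr<widget> create() {
    // The factory is the only code path that can produce a construct_tag.
    return std::make_unique<widget>(construct_tag{});
  }
};

} // namespace sketch

// usage: auto w = sketch::widget::create();
// (constructing a sketch::widget directly does not compile: the tag is private)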
~Socket(); + + Socket(Socket&& o) = delete; + + seastar::shard_id get_shard_id() const { + return sid; + } + + side_t get_side() const { + return side; + } + + uint16_t get_ephemeral_port() const { + return ephemeral_port; + } + + seastar::socket_address get_local_address() const { + return socket.local_address(); + } + + bool is_shutdown() const { + assert(seastar::this_shard_id() == sid); + return socket_is_shutdown; + } + + // learn my ephemeral_port as connector. + // unfortunately, there's no way to identify which port I'm using as + // connector with current seastar interface. + void learn_ephemeral_port_as_connector(uint16_t port) { + assert(side == side_t::connector && + (ephemeral_port == 0 || ephemeral_port == port)); + ephemeral_port = port; + } + + /// read the requested number of bytes into a bufferlist + seastar::future<bufferlist> read(size_t bytes); + + seastar::future<bufferptr> read_exactly(size_t bytes); + + seastar::future<> write(bufferlist); + + seastar::future<> flush(); + + seastar::future<> write_flush(bufferlist); + + // preemptively disable further reads or writes, can only be shutdown once. + void shutdown(); + + /// Socket can only be closed once. + seastar::future<> close(); + + static seastar::future<SocketRef> + connect(const entity_addr_t& peer_addr); + + /* + * test interfaces + */ + + // shutdown for tests + void force_shutdown() { + assert(seastar::this_shard_id() == sid); + socket.shutdown_input(); + socket.shutdown_output(); + } + + // shutdown input_stream only, for tests + void force_shutdown_in() { + assert(seastar::this_shard_id() == sid); + socket.shutdown_input(); + } + + // shutdown output_stream only, for tests + void force_shutdown_out() { + assert(seastar::this_shard_id() == sid); + socket.shutdown_output(); + } + +private: + const seastar::shard_id sid; + seastar::connected_socket socket; + seastar::input_stream<char> in; + seastar::output_stream<char> out; + bool socket_is_shutdown; + side_t side; + uint16_t ephemeral_port; + +#ifndef NDEBUG + bool closed = false; +#endif + + /// buffer state for read() + struct { + bufferlist buffer; + size_t remaining; + } r; + +#ifdef UNIT_TESTS_BUILT +public: + void set_trap(bp_type_t type, bp_action_t action, socket_blocker* blocker_); + +private: + seastar::future<> try_trap_pre(bp_action_t& trap); + + seastar::future<> try_trap_post(bp_action_t& trap); + + bp_action_t next_trap_read = bp_action_t::CONTINUE; + bp_action_t next_trap_write = bp_action_t::CONTINUE; + socket_blocker* blocker = nullptr; + +#endif + friend class ShardedServerSocket; +}; + +using listen_ertr = crimson::errorator< + crimson::ct_error::address_in_use, // The address is already bound + crimson::ct_error::address_not_available // https://techoverflow.net/2021/08/06/how-i-fixed-python-oserror-errno-99-cannot-assign-requested-address/ + >; + +class ShardedServerSocket + : public seastar::peering_sharded_service<ShardedServerSocket> { + struct construct_tag {}; + +public: + ShardedServerSocket( + seastar::shard_id sid, + bool dispatch_only_on_primary_sid, + construct_tag); + + ~ShardedServerSocket(); + + ShardedServerSocket(ShardedServerSocket&&) = delete; + ShardedServerSocket(const ShardedServerSocket&) = delete; + ShardedServerSocket& operator=(ShardedServerSocket&&) = delete; + ShardedServerSocket& operator=(const ShardedServerSocket&) = delete; + + bool is_fixed_shard_dispatching() const { + return dispatch_only_on_primary_sid; + } + + listen_ertr::future<> listen(entity_addr_t addr); + + using accept_func_t = + 
std::function<seastar::future<>(SocketRef, entity_addr_t)>; + seastar::future<> accept(accept_func_t &&_fn_accept); + + seastar::future<> shutdown_destroy(); + + static seastar::future<ShardedServerSocket*> create( + bool dispatch_only_on_this_shard); + +private: + const seastar::shard_id primary_sid; + /// XXX: Remove once all infrastructure uses multi-core messenger + const bool dispatch_only_on_primary_sid; + entity_addr_t listen_addr; + std::optional<seastar::server_socket> listener; + seastar::gate shutdown_gate; + accept_func_t fn_accept; + + using sharded_service_t = seastar::sharded<ShardedServerSocket>; + std::unique_ptr<sharded_service_t> service; +}; + +} // namespace crimson::net diff --git a/src/crimson/net/SocketConnection.cc b/src/crimson/net/SocketConnection.cc new file mode 100644 index 000000000..57e5c12c1 --- /dev/null +++ b/src/crimson/net/SocketConnection.cc @@ -0,0 +1,220 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2017 Red Hat, Inc + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "SocketConnection.h" + +#include "ProtocolV2.h" +#include "SocketMessenger.h" + +#ifdef UNIT_TESTS_BUILT +#include "Interceptor.h" +#endif + +using std::ostream; +using crimson::common::local_conf; + +namespace crimson::net { + +SocketConnection::SocketConnection(SocketMessenger& messenger, + ChainedDispatchers& dispatchers) + : msgr_sid{messenger.get_shard_id()}, messenger(messenger) +{ + auto ret = create_handlers(dispatchers, *this); + io_handler = std::move(ret.io_handler); + protocol = std::move(ret.protocol); +#ifdef UNIT_TESTS_BUILT + if (messenger.interceptor) { + interceptor = messenger.interceptor; + interceptor->register_conn(this->get_local_shared_foreign_from_this()); + } +#endif +} + +SocketConnection::~SocketConnection() {} + +bool SocketConnection::is_connected() const +{ + return io_handler->is_connected(); +} + +#ifdef UNIT_TESTS_BUILT +bool SocketConnection::is_protocol_ready() const +{ + assert(seastar::this_shard_id() == msgr_sid); + return protocol->is_ready(); +} + +bool SocketConnection::is_protocol_standby() const { + assert(seastar::this_shard_id() == msgr_sid); + return protocol->is_standby(); +} + +bool SocketConnection::is_protocol_closed() const +{ + assert(seastar::this_shard_id() == msgr_sid); + return protocol->is_closed(); +} + +bool SocketConnection::is_protocol_closed_clean() const +{ + assert(seastar::this_shard_id() == msgr_sid); + return protocol->is_closed_clean(); +} + +#endif +bool SocketConnection::peer_wins() const +{ + assert(seastar::this_shard_id() == msgr_sid); + return (messenger.get_myaddr() > peer_addr || policy.server); +} + +seastar::future<> SocketConnection::send(MessageURef _msg) +{ + // may be invoked from any core + MessageFRef msg = seastar::make_foreign(std::move(_msg)); + return io_handler->send(std::move(msg)); +} + +seastar::future<> SocketConnection::send_keepalive() +{ + // may be invoked from any core + return io_handler->send_keepalive(); +} + +SocketConnection::clock_t::time_point +SocketConnection::get_last_keepalive() const +{ + return io_handler->get_last_keepalive(); +} + +SocketConnection::clock_t::time_point +SocketConnection::get_last_keepalive_ack() const +{ + return io_handler->get_last_keepalive_ack(); 
+} + +void SocketConnection::set_last_keepalive_ack(clock_t::time_point when) +{ + io_handler->set_last_keepalive_ack(when); +} + +void SocketConnection::mark_down() +{ + io_handler->mark_down(); +} + +void +SocketConnection::start_connect(const entity_addr_t& _peer_addr, + const entity_name_t& _peer_name) +{ + assert(seastar::this_shard_id() == msgr_sid); + protocol->start_connect(_peer_addr, _peer_name); +} + +void +SocketConnection::start_accept(SocketFRef&& sock, + const entity_addr_t& _peer_addr) +{ + assert(seastar::this_shard_id() == msgr_sid); + protocol->start_accept(std::move(sock), _peer_addr); +} + +seastar::future<> +SocketConnection::close_clean_yielded() +{ + assert(seastar::this_shard_id() == msgr_sid); + return protocol->close_clean_yielded(); +} + +seastar::socket_address SocketConnection::get_local_address() const { + assert(seastar::this_shard_id() == msgr_sid); + return socket->get_local_address(); +} + +ConnectionRef +SocketConnection::get_local_shared_foreign_from_this() +{ + assert(seastar::this_shard_id() == msgr_sid); + return make_local_shared_foreign( + seastar::make_foreign(shared_from_this())); +} + +SocketMessenger & +SocketConnection::get_messenger() const +{ + assert(seastar::this_shard_id() == msgr_sid); + return messenger; +} + +seastar::shard_id +SocketConnection::get_messenger_shard_id() const +{ + return msgr_sid; +} + +void SocketConnection::set_peer_type(entity_type_t peer_type) { + assert(seastar::this_shard_id() == msgr_sid); + // it is not allowed to assign an unknown value when the current + // value is known + assert(!(peer_type == 0 && + peer_name.type() != 0)); + // it is not allowed to assign a different known value when the + // current value is also known. + assert(!(peer_type != 0 && + peer_name.type() != 0 && + peer_type != peer_name.type())); + peer_name._type = peer_type; +} + +void SocketConnection::set_peer_id(int64_t peer_id) { + assert(seastar::this_shard_id() == msgr_sid); + // it is not allowed to assign an unknown value when the current + // value is known + assert(!(peer_id == entity_name_t::NEW && + peer_name.num() != entity_name_t::NEW)); + // it is not allowed to assign a different known value when the + // current value is also known. 
+ assert(!(peer_id != entity_name_t::NEW && + peer_name.num() != entity_name_t::NEW && + peer_id != peer_name.num())); + peer_name._num = peer_id; +} + +void SocketConnection::set_features(uint64_t f) { + assert(seastar::this_shard_id() == msgr_sid); + features = f; +} + +void SocketConnection::set_socket(Socket *s) { + assert(seastar::this_shard_id() == msgr_sid); + socket = s; +} + +void SocketConnection::print(ostream& out) const { + out << (void*)this << " "; + messenger.print(out); + if (seastar::this_shard_id() != msgr_sid) { + out << " >> " << get_peer_name() << " " << peer_addr; + } else if (!socket) { + out << " >> " << get_peer_name() << " " << peer_addr; + } else if (socket->get_side() == Socket::side_t::acceptor) { + out << " >> " << get_peer_name() << " " << peer_addr + << "@" << socket->get_ephemeral_port(); + } else { // socket->get_side() == Socket::side_t::connector + out << "@" << socket->get_ephemeral_port() + << " >> " << get_peer_name() << " " << peer_addr; + } +} + +} // namespace crimson::net diff --git a/src/crimson/net/SocketConnection.h b/src/crimson/net/SocketConnection.h new file mode 100644 index 000000000..823d6c574 --- /dev/null +++ b/src/crimson/net/SocketConnection.h @@ -0,0 +1,236 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2017 Red Hat, Inc + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#pragma once + +#include <seastar/core/sharded.hh> + +#include "msg/Policy.h" +#include "crimson/common/throttle.h" +#include "crimson/net/Connection.h" +#include "crimson/net/Socket.h" + +namespace crimson::net { + +class ProtocolV2; +class SocketMessenger; +class SocketConnection; +using SocketConnectionRef = seastar::shared_ptr<SocketConnection>; + +#ifdef UNIT_TESTS_BUILT +class Interceptor; +#endif + +/** + * ConnectionHandler + * + * The interface class to implement Connection, called by SocketConnection. + * + * The operations must be done in get_shard_id(). 
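+ *
+ * Note: the ConnectionHandler implementation in this tree is IOHandler
+ * (io_handler.h/.cc). It may run on a different core than the owning
+ * SocketMessenger; send()/send_keepalive() may be invoked from any core
+ * and are redirected to get_shard_id() via seastar::smp::submit_to().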
+ */ +class ConnectionHandler { +public: + using clock_t = seastar::lowres_system_clock; + + virtual ~ConnectionHandler() = default; + + ConnectionHandler(const ConnectionHandler &) = delete; + ConnectionHandler(ConnectionHandler &&) = delete; + ConnectionHandler &operator=(const ConnectionHandler &) = delete; + ConnectionHandler &operator=(ConnectionHandler &&) = delete; + + virtual seastar::shard_id get_shard_id() const = 0; + + virtual bool is_connected() const = 0; + + virtual seastar::future<> send(MessageFRef) = 0; + + virtual seastar::future<> send_keepalive() = 0; + + virtual clock_t::time_point get_last_keepalive() const = 0; + + virtual clock_t::time_point get_last_keepalive_ack() const = 0; + + virtual void set_last_keepalive_ack(clock_t::time_point) = 0; + + virtual void mark_down() = 0; + +protected: + ConnectionHandler() = default; +}; + +class SocketConnection : public Connection { + /* + * Connection interfaces, public to users + * Working in ConnectionHandler::get_shard_id() + */ + public: + SocketConnection(SocketMessenger& messenger, + ChainedDispatchers& dispatchers); + + ~SocketConnection() override; + + const seastar::shard_id get_shard_id() const override { + return io_handler->get_shard_id(); + } + + const entity_name_t &get_peer_name() const override { + return peer_name; + } + + const entity_addr_t &get_peer_addr() const override { + return peer_addr; + } + + const entity_addr_t &get_peer_socket_addr() const override { + return target_addr; + } + + uint64_t get_features() const override { + return features; + } + + bool is_connected() const override; + + seastar::future<> send(MessageURef msg) override; + + seastar::future<> send_keepalive() override; + + clock_t::time_point get_last_keepalive() const override; + + clock_t::time_point get_last_keepalive_ack() const override; + + void set_last_keepalive_ack(clock_t::time_point when) override; + + void mark_down() override; + + bool has_user_private() const override { + return user_private != nullptr; + } + + user_private_t &get_user_private() override { + assert(has_user_private()); + return *user_private; + } + + void set_user_private(std::unique_ptr<user_private_t> new_user_private) override { + assert(!has_user_private()); + user_private = std::move(new_user_private); + } + + void print(std::ostream& out) const override; + + /* + * Public to SocketMessenger + * Working in SocketMessenger::get_shard_id(); + */ + public: + /// start a handshake from the client's perspective, + /// only call when SocketConnection first construct + void start_connect(const entity_addr_t& peer_addr, + const entity_name_t& peer_name); + + /// start a handshake from the server's perspective, + /// only call when SocketConnection first construct + void start_accept(SocketFRef&& socket, + const entity_addr_t& peer_addr); + + seastar::future<> close_clean_yielded(); + + seastar::socket_address get_local_address() const; + + seastar::shard_id get_messenger_shard_id() const; + + SocketMessenger &get_messenger() const; + + ConnectionRef get_local_shared_foreign_from_this(); + +private: + void set_peer_type(entity_type_t peer_type); + + void set_peer_id(int64_t peer_id); + + void set_peer_name(entity_name_t name) { + set_peer_type(name.type()); + set_peer_id(name.num()); + } + + void set_features(uint64_t f); + + void set_socket(Socket *s); + +#ifdef UNIT_TESTS_BUILT + bool is_protocol_ready() const override; + + bool is_protocol_standby() const override; + + bool is_protocol_closed_clean() const override; + + bool is_protocol_closed() const 
override; + + // peer wins if myaddr > peeraddr + bool peer_wins() const override; + + Interceptor *interceptor = nullptr; +#else + // peer wins if myaddr > peeraddr + bool peer_wins() const; +#endif + +private: + const seastar::shard_id msgr_sid; + + /* + * Core owner is messenger core, may allow to access from the I/O core. + */ + SocketMessenger& messenger; + + std::unique_ptr<ProtocolV2> protocol; + + Socket *socket = nullptr; + + entity_name_t peer_name = {0, entity_name_t::NEW}; + + entity_addr_t peer_addr; + + // which of the peer_addrs we're connecting to (as client) + // or should reconnect to (as peer) + entity_addr_t target_addr; + + uint64_t features = 0; + + ceph::net::Policy<crimson::common::Throttle> policy; + + uint64_t peer_global_id = 0; + + /* + * Core owner is I/O core (mutable). + */ + std::unique_ptr<ConnectionHandler> io_handler; + + /* + * Core owner is up to the connection user. + */ + std::unique_ptr<user_private_t> user_private; + + friend class IOHandler; + friend class ProtocolV2; + friend class FrameAssemblerV2; +}; + +} // namespace crimson::net + +#if FMT_VERSION >= 90000 +template <> struct fmt::formatter<crimson::net::SocketConnection> : fmt::ostream_formatter {}; +#endif diff --git a/src/crimson/net/SocketMessenger.cc b/src/crimson/net/SocketMessenger.cc new file mode 100644 index 000000000..382d08f98 --- /dev/null +++ b/src/crimson/net/SocketMessenger.cc @@ -0,0 +1,485 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2017 Red Hat, Inc + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#include "SocketMessenger.h" + +#include <seastar/core/sleep.hh> + +#include <tuple> +#include <boost/functional/hash.hpp> +#include <fmt/os.h> + +#include "auth/Auth.h" +#include "Errors.h" +#include "Socket.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_ms); + } +} + +namespace crimson::net { + +SocketMessenger::SocketMessenger(const entity_name_t& myname, + const std::string& logic_name, + uint32_t nonce, + bool dispatch_only_on_this_shard) + : sid{seastar::this_shard_id()}, + logic_name{logic_name}, + nonce{nonce}, + dispatch_only_on_sid{dispatch_only_on_this_shard}, + my_name{myname} +{} + +SocketMessenger::~SocketMessenger() +{ + logger().debug("~SocketMessenger: {}", logic_name); + ceph_assert_always(seastar::this_shard_id() == sid); + ceph_assert(!listener); +} + +bool SocketMessenger::set_addr_unknowns(const entity_addrvec_t &addrs) +{ + assert(seastar::this_shard_id() == sid); + bool ret = false; + + entity_addrvec_t newaddrs = my_addrs; + for (auto& a : newaddrs.v) { + if (a.is_blank_ip()) { + int type = a.get_type(); + int port = a.get_port(); + uint32_t nonce = a.get_nonce(); + for (auto& b : addrs.v) { + if (a.get_family() == b.get_family()) { + logger().debug(" assuming my addr {} matches provided addr {}", a, b); + a = b; + a.set_nonce(nonce); + a.set_type(type); + a.set_port(port); + ret = true; + break; + } + } + } + } + my_addrs = newaddrs; + return ret; +} + +void SocketMessenger::set_myaddrs(const entity_addrvec_t& addrs) +{ + assert(seastar::this_shard_id() == sid); + my_addrs = addrs; + for (auto& addr : my_addrs.v) { + addr.nonce = nonce; + } +} + +crimson::net::listen_ertr::future<> +SocketMessenger::do_listen(const entity_addrvec_t& addrs) +{ + ceph_assert(addrs.front().get_family() == AF_INET); + set_myaddrs(addrs); + return seastar::futurize_invoke([this] { + if (!listener) { + return ShardedServerSocket::create(dispatch_only_on_sid + ).then([this] (auto _listener) { + listener = _listener; + }); + } else { + return seastar::now(); + } + }).then([this] () -> listen_ertr::future<> { + const entity_addr_t listen_addr = get_myaddr(); + logger().debug("{} do_listen: try listen {}...", *this, listen_addr); + if (!listener) { + logger().warn("{} do_listen: listener doesn't exist", *this); + return listen_ertr::now(); + } + return listener->listen(listen_addr); + }); +} + +SocketMessenger::bind_ertr::future<> +SocketMessenger::try_bind(const entity_addrvec_t& addrs, + uint32_t min_port, uint32_t max_port) +{ + // the classical OSD iterates over the addrvec and tries to listen on each + // addr. crimson doesn't need to follow as there is a consensus we need to + // worry only about proto v2. + assert(addrs.size() == 1); + auto addr = addrs.msgr2_addr(); + if (addr.get_port() != 0) { + return do_listen(addrs).safe_then([this] { + logger().info("{} try_bind: done", *this); + }); + } + ceph_assert(min_port <= max_port); + return seastar::do_with(uint32_t(min_port), + [this, max_port, addr] (auto& port) { + return seastar::repeat_until_value([this, max_port, addr, &port] { + auto to_bind = addr; + to_bind.set_port(port); + return do_listen(entity_addrvec_t{to_bind} + ).safe_then([this] () -> seastar::future<std::optional<std::error_code>> { + logger().info("{} try_bind: done", *this); + return seastar::make_ready_future<std::optional<std::error_code>>( + std::make_optional<std::error_code>(std::error_code{/* success! 
*/})); + }, listen_ertr::all_same_way([this, max_port, &port] + (const std::error_code& e) mutable + -> seastar::future<std::optional<std::error_code>> { + logger().trace("{} try_bind: {} got error {}", *this, port, e); + if (port == max_port) { + return seastar::make_ready_future<std::optional<std::error_code>>( + std::make_optional<std::error_code>(e)); + } + ++port; + return seastar::make_ready_future<std::optional<std::error_code>>( + std::optional<std::error_code>{std::nullopt}); + })); + }).then([] (const std::error_code e) -> bind_ertr::future<> { + if (!e) { + return bind_ertr::now(); // success! + } else if (e == std::errc::address_in_use) { + return crimson::ct_error::address_in_use::make(); + } else if (e == std::errc::address_not_available) { + return crimson::ct_error::address_not_available::make(); + } + ceph_abort(); + }); + }); +} + +SocketMessenger::bind_ertr::future<> +SocketMessenger::bind(const entity_addrvec_t& addrs) +{ + assert(seastar::this_shard_id() == sid); + using crimson::common::local_conf; + return seastar::do_with(int64_t{local_conf()->ms_bind_retry_count}, + [this, addrs] (auto& count) { + return seastar::repeat_until_value([this, addrs, &count] { + assert(count >= 0); + return try_bind(addrs, + local_conf()->ms_bind_port_min, + local_conf()->ms_bind_port_max) + .safe_then([this] { + logger().info("{} try_bind: done", *this); + return seastar::make_ready_future<std::optional<std::error_code>>( + std::make_optional<std::error_code>(std::error_code{/* success! */})); + }, bind_ertr::all_same_way([this, &count] (const std::error_code error) { + if (count-- > 0) { + logger().info("{} was unable to bind. Trying again in {} seconds", + *this, local_conf()->ms_bind_retry_delay); + return seastar::sleep( + std::chrono::seconds(local_conf()->ms_bind_retry_delay) + ).then([] { + // one more time, please + return seastar::make_ready_future<std::optional<std::error_code>>( + std::optional<std::error_code>{std::nullopt}); + }); + } else { + logger().info("{} was unable to bind after {} attempts: {}", + *this, local_conf()->ms_bind_retry_count, error); + return seastar::make_ready_future<std::optional<std::error_code>>( + std::make_optional<std::error_code>(error)); + } + })); + }).then([] (const std::error_code error) -> bind_ertr::future<> { + if (!error) { + return bind_ertr::now(); // success! 
+ } else if (error == std::errc::address_in_use) { + return crimson::ct_error::address_in_use::make(); + } else if (error == std::errc::address_not_available) { + return crimson::ct_error::address_not_available::make(); + } + ceph_abort(); + }); + }); +} + +seastar::future<> SocketMessenger::accept( + SocketFRef &&socket, const entity_addr_t &peer_addr) +{ + assert(seastar::this_shard_id() == sid); + SocketConnectionRef conn = + seastar::make_shared<SocketConnection>(*this, dispatchers); + conn->start_accept(std::move(socket), peer_addr); + return seastar::now(); +} + +seastar::future<> SocketMessenger::start( + const dispatchers_t& _dispatchers) { + assert(seastar::this_shard_id() == sid); + + dispatchers.assign(_dispatchers); + if (listener) { + // make sure we have already bound to a valid address + ceph_assert(get_myaddr().is_msgr2()); + ceph_assert(get_myaddr().get_port() > 0); + + return listener->accept([this](SocketRef _socket, entity_addr_t peer_addr) { + assert(get_myaddr().is_msgr2()); + SocketFRef socket = seastar::make_foreign(std::move(_socket)); + if (listener->is_fixed_shard_dispatching()) { + return accept(std::move(socket), peer_addr); + } else { + return seastar::smp::submit_to(sid, + [this, peer_addr, socket = std::move(socket)]() mutable { + return accept(std::move(socket), peer_addr); + }); + } + }); + } + return seastar::now(); +} + +crimson::net::ConnectionRef +SocketMessenger::connect(const entity_addr_t& peer_addr, const entity_name_t& peer_name) +{ + assert(seastar::this_shard_id() == sid); + + // make sure we connect to a valid peer_addr + if (!peer_addr.is_msgr2()) { + ceph_abort_msg("ProtocolV1 is no longer supported"); + } + ceph_assert(peer_addr.get_port() > 0); + + if (auto found = lookup_conn(peer_addr); found) { + logger().debug("{} connect to existing", *found); + return found->get_local_shared_foreign_from_this(); + } + SocketConnectionRef conn = + seastar::make_shared<SocketConnection>(*this, dispatchers); + conn->start_connect(peer_addr, peer_name); + return conn->get_local_shared_foreign_from_this(); +} + +seastar::future<> SocketMessenger::shutdown() +{ + assert(seastar::this_shard_id() == sid); + return seastar::futurize_invoke([this] { + assert(dispatchers.empty()); + if (listener) { + auto d_listener = listener; + listener = nullptr; + return d_listener->shutdown_destroy(); + } else { + return seastar::now(); + } + // close all connections + }).then([this] { + return seastar::parallel_for_each(accepting_conns, [] (auto conn) { + return conn->close_clean_yielded(); + }); + }).then([this] { + ceph_assert(accepting_conns.empty()); + return seastar::parallel_for_each(connections, [] (auto conn) { + return conn.second->close_clean_yielded(); + }); + }).then([this] { + return seastar::parallel_for_each(closing_conns, [] (auto conn) { + return conn->close_clean_yielded(); + }); + }).then([this] { + ceph_assert(connections.empty()); + shutdown_promise.set_value(); + }); +} + +static entity_addr_t choose_addr( + const entity_addr_t &peer_addr_for_me, + const SocketConnection& conn) +{ + using crimson::common::local_conf; + // XXX: a syscall is here + if (const auto local_addr = conn.get_local_address(); + local_conf()->ms_learn_addr_from_peer) { + logger().info("{} peer {} says I am {} (socket says {})", + conn, conn.get_peer_socket_addr(), peer_addr_for_me, + local_addr); + return peer_addr_for_me; + } else { + const auto local_addr_for_me = conn.get_local_address(); + logger().info("{} socket to {} says I am {} (peer says {})", + conn, 
conn.get_peer_socket_addr(), + local_addr, peer_addr_for_me); + entity_addr_t addr; + addr.set_sockaddr(&local_addr_for_me.as_posix_sockaddr()); + return addr; + } +} + +void SocketMessenger::learned_addr( + const entity_addr_t &peer_addr_for_me, + const SocketConnection& conn) +{ + assert(seastar::this_shard_id() == sid); + if (!need_addr) { + if ((!get_myaddr().is_any() && + get_myaddr().get_type() != peer_addr_for_me.get_type()) || + get_myaddr().get_family() != peer_addr_for_me.get_family() || + !get_myaddr().is_same_host(peer_addr_for_me)) { + logger().warn("{} peer_addr_for_me {} type/family/IP doesn't match myaddr {}", + conn, peer_addr_for_me, get_myaddr()); + throw std::system_error( + make_error_code(crimson::net::error::bad_peer_address)); + } + return; + } + + if (get_myaddr().get_type() == entity_addr_t::TYPE_NONE) { + // Not bound + auto addr = choose_addr(peer_addr_for_me, conn); + addr.set_type(entity_addr_t::TYPE_ANY); + addr.set_port(0); + need_addr = false; + set_myaddrs(entity_addrvec_t{addr}); + logger().info("{} learned myaddr={} (unbound)", conn, get_myaddr()); + } else { + // Already bound + if (!get_myaddr().is_any() && + get_myaddr().get_type() != peer_addr_for_me.get_type()) { + logger().warn("{} peer_addr_for_me {} type doesn't match myaddr {}", + conn, peer_addr_for_me, get_myaddr()); + throw std::system_error( + make_error_code(crimson::net::error::bad_peer_address)); + } + if (get_myaddr().get_family() != peer_addr_for_me.get_family()) { + logger().warn("{} peer_addr_for_me {} family doesn't match myaddr {}", + conn, peer_addr_for_me, get_myaddr()); + throw std::system_error( + make_error_code(crimson::net::error::bad_peer_address)); + } + if (get_myaddr().is_blank_ip()) { + auto addr = choose_addr(peer_addr_for_me, conn); + addr.set_type(get_myaddr().get_type()); + addr.set_port(get_myaddr().get_port()); + need_addr = false; + set_myaddrs(entity_addrvec_t{addr}); + logger().info("{} learned myaddr={} (blank IP)", conn, get_myaddr()); + } else if (!get_myaddr().is_same_host(peer_addr_for_me)) { + logger().warn("{} peer_addr_for_me {} IP doesn't match myaddr {}", + conn, peer_addr_for_me, get_myaddr()); + throw std::system_error( + make_error_code(crimson::net::error::bad_peer_address)); + } else { + need_addr = false; + } + } +} + +SocketPolicy SocketMessenger::get_policy(entity_type_t peer_type) const +{ + assert(seastar::this_shard_id() == sid); + return policy_set.get(peer_type); +} + +SocketPolicy SocketMessenger::get_default_policy() const +{ + assert(seastar::this_shard_id() == sid); + return policy_set.get_default(); +} + +void SocketMessenger::set_default_policy(const SocketPolicy& p) +{ + assert(seastar::this_shard_id() == sid); + policy_set.set_default(p); +} + +void SocketMessenger::set_policy(entity_type_t peer_type, + const SocketPolicy& p) +{ + assert(seastar::this_shard_id() == sid); + policy_set.set(peer_type, p); +} + +void SocketMessenger::set_policy_throttler(entity_type_t peer_type, + Throttle* throttle) +{ + assert(seastar::this_shard_id() == sid); + // only byte throttler is used in OSD + policy_set.set_throttlers(peer_type, throttle, nullptr); +} + +crimson::net::SocketConnectionRef SocketMessenger::lookup_conn(const entity_addr_t& addr) +{ + assert(seastar::this_shard_id() == sid); + if (auto found = connections.find(addr); + found != connections.end()) { + return found->second; + } else { + return nullptr; + } +} + +void SocketMessenger::accept_conn(SocketConnectionRef conn) +{ + assert(seastar::this_shard_id() == sid); + 
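+ // connections still being accepted are tracked here so that shutdown()
+ // can close them alongside the registered `connections` map and the
+ // `closing_conns` list.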
accepting_conns.insert(conn); +} + +void SocketMessenger::unaccept_conn(SocketConnectionRef conn) +{ + assert(seastar::this_shard_id() == sid); + accepting_conns.erase(conn); +} + +void SocketMessenger::register_conn(SocketConnectionRef conn) +{ + assert(seastar::this_shard_id() == sid); + auto [i, added] = connections.emplace(conn->get_peer_addr(), conn); + std::ignore = i; + ceph_assert(added); +} + +void SocketMessenger::unregister_conn(SocketConnectionRef conn) +{ + assert(seastar::this_shard_id() == sid); + ceph_assert(conn); + auto found = connections.find(conn->get_peer_addr()); + ceph_assert(found != connections.end()); + ceph_assert(found->second == conn); + connections.erase(found); +} + +void SocketMessenger::closing_conn(SocketConnectionRef conn) +{ + assert(seastar::this_shard_id() == sid); + closing_conns.push_back(conn); +} + +void SocketMessenger::closed_conn(SocketConnectionRef conn) +{ + assert(seastar::this_shard_id() == sid); + for (auto it = closing_conns.begin(); + it != closing_conns.end();) { + if (*it == conn) { + it = closing_conns.erase(it); + } else { + it++; + } + } +} + +uint32_t SocketMessenger::get_global_seq(uint32_t old) +{ + assert(seastar::this_shard_id() == sid); + if (old > global_seq) { + global_seq = old; + } + return ++global_seq; +} + +} // namespace crimson::net diff --git a/src/crimson/net/SocketMessenger.h b/src/crimson/net/SocketMessenger.h new file mode 100644 index 000000000..e4ac63184 --- /dev/null +++ b/src/crimson/net/SocketMessenger.h @@ -0,0 +1,192 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2017 Red Hat, Inc + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#pragma once + +#include <map> +#include <set> +#include <vector> +#include <seastar/core/gate.hh> +#include <seastar/core/reactor.hh> +#include <seastar/core/sharded.hh> +#include <seastar/core/shared_future.hh> + +#include "crimson/net/chained_dispatchers.h" +#include "Messenger.h" +#include "Socket.h" +#include "SocketConnection.h" + +namespace crimson::net { + +class ShardedServerSocket; + +class SocketMessenger final : public Messenger { +// Messenger public interfaces +public: + SocketMessenger(const entity_name_t& myname, + const std::string& logic_name, + uint32_t nonce, + bool dispatch_only_on_this_shard); + + ~SocketMessenger() override; + + const entity_name_t &get_myname() const override { + return my_name; + } + + const entity_addrvec_t &get_myaddrs() const override { + return my_addrs; + } + + void set_myaddrs(const entity_addrvec_t& addr) override; + + bool set_addr_unknowns(const entity_addrvec_t &addr) override; + + void set_auth_client(crimson::auth::AuthClient *ac) override { + assert(seastar::this_shard_id() == sid); + auth_client = ac; + } + + void set_auth_server(crimson::auth::AuthServer *as) override { + assert(seastar::this_shard_id() == sid); + auth_server = as; + } + + bind_ertr::future<> bind(const entity_addrvec_t& addr) override; + + seastar::future<> start(const dispatchers_t& dispatchers) override; + + ConnectionRef connect(const entity_addr_t& peer_addr, + const entity_name_t& peer_name) override; + + bool owns_connection(Connection &conn) const override { + assert(seastar::this_shard_id() == sid); + return this == &static_cast<SocketConnection&>(conn).get_messenger(); + } + + // can only wait once + seastar::future<> wait() override { + assert(seastar::this_shard_id() == sid); + return shutdown_promise.get_future(); + } + + void stop() override { + assert(seastar::this_shard_id() == sid); + dispatchers.clear(); + } + + bool is_started() const override { + assert(seastar::this_shard_id() == sid); + return !dispatchers.empty(); + } + + seastar::future<> shutdown() override; + + void print(std::ostream& out) const override { + out << get_myname() + << "(" << logic_name + << ") " << get_myaddr(); + } + + SocketPolicy get_policy(entity_type_t peer_type) const override; + + SocketPolicy get_default_policy() const override; + + void set_default_policy(const SocketPolicy& p) override; + + void set_policy(entity_type_t peer_type, const SocketPolicy& p) override; + + void set_policy_throttler(entity_type_t peer_type, Throttle* throttle) override; + +// SocketMessenger public interfaces +public: + crimson::auth::AuthClient* get_auth_client() const { + assert(seastar::this_shard_id() == sid); + return auth_client; + } + + crimson::auth::AuthServer* get_auth_server() const { + assert(seastar::this_shard_id() == sid); + return auth_server; + } + + uint32_t get_global_seq(uint32_t old=0); + + void learned_addr(const entity_addr_t &peer_addr_for_me, + const SocketConnection& conn); + + SocketConnectionRef lookup_conn(const entity_addr_t& addr); + + void accept_conn(SocketConnectionRef); + + void unaccept_conn(SocketConnectionRef); + + void register_conn(SocketConnectionRef); + + void unregister_conn(SocketConnectionRef); + + void closing_conn(SocketConnectionRef); + + void closed_conn(SocketConnectionRef); + + seastar::shard_id get_shard_id() const { + return sid; + } + +#ifdef UNIT_TESTS_BUILT + void set_interceptor(Interceptor *i) override { + interceptor = i; + } + + Interceptor *interceptor = nullptr; +#endif + +private: + seastar::future<> 
accept(SocketFRef &&, const entity_addr_t &); + + listen_ertr::future<> do_listen(const entity_addrvec_t& addr); + + /// try to bind to the first unused port of given address + bind_ertr::future<> try_bind(const entity_addrvec_t& addr, + uint32_t min_port, uint32_t max_port); + + const seastar::shard_id sid; + // Distinguish messengers with meaningful names for debugging + const std::string logic_name; + const uint32_t nonce; + const bool dispatch_only_on_sid; + + entity_name_t my_name; + entity_addrvec_t my_addrs; + crimson::auth::AuthClient* auth_client = nullptr; + crimson::auth::AuthServer* auth_server = nullptr; + + ShardedServerSocket *listener = nullptr; + ChainedDispatchers dispatchers; + std::map<entity_addr_t, SocketConnectionRef> connections; + std::set<SocketConnectionRef> accepting_conns; + std::vector<SocketConnectionRef> closing_conns; + ceph::net::PolicySet<Throttle> policy_set; + // specifying we haven't learned our addr; set false when we find it. + bool need_addr = true; + uint32_t global_seq = 0; + bool started = false; + seastar::promise<> shutdown_promise; +}; + +} // namespace crimson::net + +#if FMT_VERSION >= 90000 +template <> struct fmt::formatter<crimson::net::SocketMessenger> : fmt::ostream_formatter {}; +#endif diff --git a/src/crimson/net/chained_dispatchers.cc b/src/crimson/net/chained_dispatchers.cc new file mode 100644 index 000000000..1e4af3baa --- /dev/null +++ b/src/crimson/net/chained_dispatchers.cc @@ -0,0 +1,114 @@ +#include "crimson/common/log.h" +#include "crimson/net/chained_dispatchers.h" +#include "crimson/net/Connection.h" +#include "crimson/net/Dispatcher.h" +#include "msg/Message.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_ms); + } +} + +namespace crimson::net { + +seastar::future<> +ChainedDispatchers::ms_dispatch(ConnectionRef conn, + MessageRef m) { + try { + for (auto& dispatcher : dispatchers) { + auto dispatched = dispatcher->ms_dispatch(conn, m); + if (dispatched.has_value()) { + return std::move(*dispatched + ).handle_exception([conn] (std::exception_ptr eptr) { + logger().error("{} got unexpected exception in ms_dispatch() throttling {}", + *conn, eptr); + ceph_abort(); + }); + } + } + } catch (...) { + logger().error("{} got unexpected exception in ms_dispatch() {}", + *conn, std::current_exception()); + ceph_abort(); + } + if (!dispatchers.empty()) { + logger().error("ms_dispatch unhandled message {}", *m); + } + return seastar::now(); +} + +void +ChainedDispatchers::ms_handle_shard_change( + ConnectionRef conn, + seastar::shard_id new_shard, + bool ac) { + try { + for (auto& dispatcher : dispatchers) { + dispatcher->ms_handle_shard_change(conn, new_shard, ac); + } + } catch (...) { + logger().error("{} got unexpected exception in ms_handle_shard_change() {}", + *conn, std::current_exception()); + ceph_abort(); + } +} + +void +ChainedDispatchers::ms_handle_accept( + ConnectionRef conn, + seastar::shard_id prv_shard, + bool is_replace) { + try { + for (auto& dispatcher : dispatchers) { + dispatcher->ms_handle_accept(conn, prv_shard, is_replace); + } + } catch (...) { + logger().error("{} got unexpected exception in ms_handle_accept() {}", + *conn, std::current_exception()); + ceph_abort(); + } +} + +void +ChainedDispatchers::ms_handle_connect( + ConnectionRef conn, + seastar::shard_id prv_shard) { + try { + for(auto& dispatcher : dispatchers) { + dispatcher->ms_handle_connect(conn, prv_shard); + } + } catch (...) 
{ + logger().error("{} got unexpected exception in ms_handle_connect() {}", + *conn, std::current_exception()); + ceph_abort(); + } +} + +void +ChainedDispatchers::ms_handle_reset(ConnectionRef conn, bool is_replace) { + try { + for (auto& dispatcher : dispatchers) { + dispatcher->ms_handle_reset(conn, is_replace); + } + } catch (...) { + logger().error("{} got unexpected exception in ms_handle_reset() {}", + *conn, std::current_exception()); + ceph_abort(); + } +} + +void +ChainedDispatchers::ms_handle_remote_reset(ConnectionRef conn) { + try { + for (auto& dispatcher : dispatchers) { + dispatcher->ms_handle_remote_reset(conn); + } + } catch (...) { + logger().error("{} got unexpected exception in ms_handle_remote_reset() {}", + *conn, std::current_exception()); + ceph_abort(); + } +} + +} diff --git a/src/crimson/net/chained_dispatchers.h b/src/crimson/net/chained_dispatchers.h new file mode 100644 index 000000000..ec085864f --- /dev/null +++ b/src/crimson/net/chained_dispatchers.h @@ -0,0 +1,39 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <seastar/core/smp.hh> + +#include "Fwd.h" +#include "crimson/common/log.h" + +namespace crimson::net { + +class Dispatcher; + +class ChainedDispatchers { +public: + void assign(const dispatchers_t& _dispatchers) { + assert(empty()); + assert(!_dispatchers.empty()); + dispatchers = _dispatchers; + } + void clear() { + dispatchers.clear(); + } + bool empty() const { + return dispatchers.empty(); + } + seastar::future<> ms_dispatch(ConnectionRef, MessageRef); + void ms_handle_shard_change(ConnectionRef, seastar::shard_id, bool); + void ms_handle_accept(ConnectionRef conn, seastar::shard_id, bool is_replace); + void ms_handle_connect(ConnectionRef conn, seastar::shard_id); + void ms_handle_reset(ConnectionRef conn, bool is_replace); + void ms_handle_remote_reset(ConnectionRef conn); + + private: + dispatchers_t dispatchers; +}; + +} diff --git a/src/crimson/net/io_handler.cc b/src/crimson/net/io_handler.cc new file mode 100644 index 000000000..c414c48e1 --- /dev/null +++ b/src/crimson/net/io_handler.cc @@ -0,0 +1,1287 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "io_handler.h" + +#include "auth/Auth.h" + +#include "crimson/common/formatter.h" +#include "crimson/common/log.h" +#include "crimson/net/Errors.h" +#include "crimson/net/chained_dispatchers.h" +#include "crimson/net/SocketMessenger.h" +#include "msg/Message.h" +#include "msg/msg_fmt.h" + +using namespace ceph::msgr::v2; +using crimson::common::local_conf; + +namespace { + +seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_ms); +} + +[[noreturn]] void abort_in_fault() { + throw std::system_error(make_error_code(crimson::net::error::negotiation_failure)); +} + +[[noreturn]] void abort_protocol() { + throw std::system_error(make_error_code(crimson::net::error::protocol_aborted)); +} + +std::size_t get_msg_size(const FrameAssembler &rx_frame_asm) +{ + ceph_assert(rx_frame_asm.get_num_segments() > 0); + size_t sum = 0; + // we don't include SegmentIndex::Msg::HEADER. 
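+ // SegmentIndex::Msg::HEADER (index 0) carries the ceph_msg_header2;
+ // the remaining segments hold the front/middle/data payloads, so the
+ // sum below is the decoded message's payload size.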
+ for (size_t idx = 1; idx < rx_frame_asm.get_num_segments(); idx++) { + sum += rx_frame_asm.get_segment_logical_len(idx); + } + return sum; +} + +} // namespace anonymous + +namespace crimson::net { + +IOHandler::IOHandler(ChainedDispatchers &dispatchers, + SocketConnection &conn) + : shard_states(shard_states_t::create( + seastar::this_shard_id(), io_state_t::none)), + dispatchers(dispatchers), + conn(conn), + conn_ref(conn.get_local_shared_foreign_from_this()) +{} + +IOHandler::~IOHandler() +{ + // close_io() must be finished + ceph_assert_always(maybe_prv_shard_states == nullptr); + // should be true in the according shard + // ceph_assert_always(shard_states->assert_closed_and_exit()); + assert(!conn_ref); +} + +#ifdef UNIT_TESTS_BUILT +IOHandler::sweep_ret +#else +ceph::bufferlist +#endif +IOHandler::sweep_out_pending_msgs_to_sent( + bool require_keepalive, + std::optional<utime_t> maybe_keepalive_ack, + bool require_ack) +{ + std::size_t num_msgs = out_pending_msgs.size(); + ceph::bufferlist bl; + +#ifdef UNIT_TESTS_BUILT + std::vector<Tag> tags; +#endif + + if (unlikely(require_keepalive)) { + auto keepalive_frame = KeepAliveFrame::Encode(); + bl.append(frame_assembler->get_buffer(keepalive_frame)); +#ifdef UNIT_TESTS_BUILT + auto tag = KeepAliveFrame::tag; + tags.push_back(tag); +#endif + } + + if (unlikely(maybe_keepalive_ack.has_value())) { + auto keepalive_ack_frame = KeepAliveFrameAck::Encode(*maybe_keepalive_ack); + bl.append(frame_assembler->get_buffer(keepalive_ack_frame)); +#ifdef UNIT_TESTS_BUILT + auto tag = KeepAliveFrameAck::tag; + tags.push_back(tag); +#endif + } + + if (require_ack && num_msgs == 0u) { + auto ack_frame = AckFrame::Encode(in_seq); + bl.append(frame_assembler->get_buffer(ack_frame)); +#ifdef UNIT_TESTS_BUILT + auto tag = AckFrame::tag; + tags.push_back(tag); +#endif + } + + std::for_each( + out_pending_msgs.begin(), + out_pending_msgs.begin()+num_msgs, + [this, &bl +#ifdef UNIT_TESTS_BUILT + , &tags +#endif + ](const MessageFRef& msg) { + // set priority + msg->get_header().src = conn.messenger.get_myname(); + + msg->encode(conn.features, 0); + + ceph_assert(!msg->get_seq() && "message already has seq"); + msg->set_seq(++out_seq); + + ceph_msg_header &header = msg->get_header(); + ceph_msg_footer &footer = msg->get_footer(); + + ceph_msg_header2 header2{header.seq, header.tid, + header.type, header.priority, + header.version, + ceph_le32(0), header.data_off, + ceph_le64(in_seq), + footer.flags, header.compat_version, + header.reserved}; + + auto message = MessageFrame::Encode(header2, + msg->get_payload(), msg->get_middle(), msg->get_data()); + logger().debug("{} --> #{} === {} ({})", + conn, msg->get_seq(), *msg, msg->get_type()); + bl.append(frame_assembler->get_buffer(message)); +#ifdef UNIT_TESTS_BUILT + auto tag = MessageFrame::tag; + tags.push_back(tag); +#endif + }); + + if (!conn.policy.lossy) { + out_sent_msgs.insert( + out_sent_msgs.end(), + std::make_move_iterator(out_pending_msgs.begin()), + std::make_move_iterator(out_pending_msgs.end())); + } + out_pending_msgs.clear(); + +#ifdef UNIT_TESTS_BUILT + return sweep_ret{std::move(bl), tags}; +#else + return bl; +#endif +} + +seastar::future<> IOHandler::send(MessageFRef msg) +{ + // sid may be changed on-the-fly during the submission + if (seastar::this_shard_id() == get_shard_id()) { + return do_send(std::move(msg)); + } else { + logger().trace("{} send() is directed to {} -- {}", + conn, get_shard_id(), *msg); + return seastar::smp::submit_to( + get_shard_id(), [this, msg=std::move(msg)]() 
mutable { + return send_redirected(std::move(msg)); + }); + } +} + +seastar::future<> IOHandler::send_redirected(MessageFRef msg) +{ + // sid may be changed on-the-fly during the submission + if (seastar::this_shard_id() == get_shard_id()) { + return do_send(std::move(msg)); + } else { + logger().debug("{} send() is redirected to {} -- {}", + conn, get_shard_id(), *msg); + return seastar::smp::submit_to( + get_shard_id(), [this, msg=std::move(msg)]() mutable { + return send_redirected(std::move(msg)); + }); + } +} + +seastar::future<> IOHandler::do_send(MessageFRef msg) +{ + assert(seastar::this_shard_id() == get_shard_id()); + logger().trace("{} do_send() got message -- {}", conn, *msg); + if (get_io_state() != io_state_t::drop) { + out_pending_msgs.push_back(std::move(msg)); + notify_out_dispatch(); + } + return seastar::now(); +} + +seastar::future<> IOHandler::send_keepalive() +{ + // sid may be changed on-the-fly during the submission + if (seastar::this_shard_id() == get_shard_id()) { + return do_send_keepalive(); + } else { + logger().trace("{} send_keepalive() is directed to {}", conn, get_shard_id()); + return seastar::smp::submit_to( + get_shard_id(), [this] { + return send_keepalive_redirected(); + }); + } +} + +seastar::future<> IOHandler::send_keepalive_redirected() +{ + // sid may be changed on-the-fly during the submission + if (seastar::this_shard_id() == get_shard_id()) { + return do_send_keepalive(); + } else { + logger().debug("{} send_keepalive() is redirected to {}", conn, get_shard_id()); + return seastar::smp::submit_to( + get_shard_id(), [this] { + return send_keepalive_redirected(); + }); + } +} + +seastar::future<> IOHandler::do_send_keepalive() +{ + assert(seastar::this_shard_id() == get_shard_id()); + logger().trace("{} do_send_keeplive(): need_keepalive={}", conn, need_keepalive); + if (!need_keepalive) { + need_keepalive = true; + notify_out_dispatch(); + } + return seastar::now(); +} + +void IOHandler::mark_down() +{ + ceph_assert_always(seastar::this_shard_id() == get_shard_id()); + ceph_assert_always(get_io_state() != io_state_t::none); + need_dispatch_reset = false; + if (get_io_state() == io_state_t::drop) { + return; + } + + auto cc_seq = crosscore.prepare_submit(); + logger().info("{} mark_down() at {}, send {} notify_mark_down()", + conn, io_stat_printer{*this}, cc_seq); + do_set_io_state(io_state_t::drop); + shard_states->dispatch_in_background( + "notify_mark_down", conn, [this, cc_seq] { + return seastar::smp::submit_to( + conn.get_messenger_shard_id(), [this, cc_seq] { + return handshake_listener->notify_mark_down(cc_seq); + }); + }); +} + +void IOHandler::print_io_stat(std::ostream &out) const +{ + assert(seastar::this_shard_id() == get_shard_id()); + out << "io_stat(" + << "io_state=" << fmt::format("{}", get_io_state()) + << ", in_seq=" << in_seq + << ", out_seq=" << out_seq + << ", out_pending_msgs_size=" << out_pending_msgs.size() + << ", out_sent_msgs_size=" << out_sent_msgs.size() + << ", need_ack=" << (ack_left > 0) + << ", need_keepalive=" << need_keepalive + << ", need_keepalive_ack=" << bool(next_keepalive_ack) + << ")"; +} + +void IOHandler::assign_frame_assembler(FrameAssemblerV2Ref fa) +{ + assert(fa != nullptr); + ceph_assert_always(frame_assembler == nullptr); + frame_assembler = std::move(fa); + ceph_assert_always( + frame_assembler->get_shard_id() == get_shard_id()); + // should have been set through dispatch_accept/connect() + ceph_assert_always( + frame_assembler->get_socket_shard_id() == get_shard_id()); + 
ceph_assert_always(frame_assembler->is_socket_valid()); +} + +void IOHandler::do_set_io_state( + io_state_t new_state, + std::optional<crosscore_t::seq_t> cc_seq, + FrameAssemblerV2Ref fa, + bool set_notify_out) +{ + ceph_assert_always(seastar::this_shard_id() == get_shard_id()); + auto prv_state = get_io_state(); + logger().debug("{} got {}do_set_io_state(): prv_state={}, new_state={}, " + "fa={}, set_notify_out={}, at {}", + conn, + cc_seq.has_value() ? fmt::format("{} ", *cc_seq) : "", + prv_state, new_state, + fa ? "present" : "N/A", set_notify_out, + io_stat_printer{*this}); + ceph_assert_always(!( + (new_state == io_state_t::none && prv_state != io_state_t::none) || + (new_state == io_state_t::open && prv_state == io_state_t::open) + )); + + if (prv_state == io_state_t::drop) { + // only possible due to a racing mark_down() from user + if (new_state == io_state_t::open) { + assign_frame_assembler(std::move(fa)); + frame_assembler->shutdown_socket<false>(nullptr); + } else { + assert(fa == nullptr); + } + return; + } + + bool dispatch_in = false; + if (new_state == io_state_t::open) { + // to open + ceph_assert_always(protocol_is_connected == true); + assign_frame_assembler(std::move(fa)); + dispatch_in = true; + } else if (prv_state == io_state_t::open) { + // from open + ceph_assert_always(protocol_is_connected == true); + protocol_is_connected = false; + assert(fa == nullptr); + ceph_assert_always(frame_assembler->is_socket_valid()); + frame_assembler->shutdown_socket<false>(nullptr); + } else { + assert(fa == nullptr); + } + + if (new_state == io_state_t::delay) { + need_notify_out = set_notify_out; + if (need_notify_out) { + maybe_notify_out_dispatch(); + } + } else { + assert(set_notify_out == false); + need_notify_out = false; + } + + // FIXME: simplify and drop the prv_state == new_state case + if (prv_state != new_state) { + shard_states->set_io_state(new_state); + } + + /* + * not atomic below + */ + + if (dispatch_in) { + do_in_dispatch(); + } +} + +seastar::future<> IOHandler::set_io_state( + crosscore_t::seq_t cc_seq, + io_state_t new_state, + FrameAssemblerV2Ref fa, + bool set_notify_out) +{ + assert(seastar::this_shard_id() == get_shard_id()); + if (!crosscore.proceed_or_wait(cc_seq)) { + logger().debug("{} got {} set_io_state(), wait at {}", + conn, cc_seq, crosscore.get_in_seq()); + return crosscore.wait(cc_seq + ).then([this, cc_seq, new_state, + fa=std::move(fa), set_notify_out]() mutable { + return set_io_state(cc_seq, new_state, std::move(fa), set_notify_out); + }); + } + + do_set_io_state(new_state, cc_seq, std::move(fa), set_notify_out); + return seastar::now(); +} + +seastar::future<IOHandler::exit_dispatching_ret> +IOHandler::wait_io_exit_dispatching( + crosscore_t::seq_t cc_seq) +{ + assert(seastar::this_shard_id() == get_shard_id()); + if (!crosscore.proceed_or_wait(cc_seq)) { + logger().debug("{} got {} wait_io_exit_dispatching(), wait at {}", + conn, cc_seq, crosscore.get_in_seq()); + return crosscore.wait(cc_seq + ).then([this, cc_seq] { + return wait_io_exit_dispatching(cc_seq); + }); + } + + logger().debug("{} got {} wait_io_exit_dispatching()", + conn, cc_seq); + ceph_assert_always(get_io_state() != io_state_t::open); + ceph_assert_always(frame_assembler != nullptr); + ceph_assert_always(!frame_assembler->is_socket_valid()); + return seastar::futurize_invoke([this] { + // cannot be running in parallel with to_new_sid() + if (maybe_dropped_sid.has_value()) { + ceph_assert_always(get_io_state() == io_state_t::drop); + 
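+ // the connection was dropped while being switched to this core; the
+ // previous core's shard_states still owns any in-flight io dispatching,
+ // so wait for it to exit over there.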
assert(shard_states->assert_closed_and_exit()); + auto prv_sid = *maybe_dropped_sid; + return seastar::smp::submit_to(prv_sid, [this] { + logger().debug("{} got wait_io_exit_dispatching from prv_sid", conn); + assert(maybe_prv_shard_states != nullptr); + return maybe_prv_shard_states->wait_io_exit_dispatching(); + }); + } else { + return shard_states->wait_io_exit_dispatching(); + } + }).then([this] { + logger().debug("{} finish wait_io_exit_dispatching at {}", + conn, io_stat_printer{*this}); + ceph_assert_always(frame_assembler != nullptr); + ceph_assert_always(!frame_assembler->is_socket_valid()); + frame_assembler->set_shard_id(conn.get_messenger_shard_id()); + return exit_dispatching_ret{ + std::move(frame_assembler), + get_states()}; + }); +} + +seastar::future<> IOHandler::reset_session( + crosscore_t::seq_t cc_seq, + bool full) +{ + assert(seastar::this_shard_id() == get_shard_id()); + if (!crosscore.proceed_or_wait(cc_seq)) { + logger().debug("{} got {} reset_session(), wait at {}", + conn, cc_seq, crosscore.get_in_seq()); + return crosscore.wait(cc_seq + ).then([this, cc_seq, full] { + return reset_session(cc_seq, full); + }); + } + + logger().debug("{} got {} reset_session({})", + conn, cc_seq, full); + assert(get_io_state() != io_state_t::open); + reset_in(); + if (full) { + reset_out(); + dispatch_remote_reset(); + } + return seastar::now(); +} + +seastar::future<> IOHandler::reset_peer_state( + crosscore_t::seq_t cc_seq) +{ + assert(seastar::this_shard_id() == get_shard_id()); + if (!crosscore.proceed_or_wait(cc_seq)) { + logger().debug("{} got {} reset_peer_state(), wait at {}", + conn, cc_seq, crosscore.get_in_seq()); + return crosscore.wait(cc_seq + ).then([this, cc_seq] { + return reset_peer_state(cc_seq); + }); + } + + logger().debug("{} got {} reset_peer_state()", + conn, cc_seq); + assert(get_io_state() != io_state_t::open); + reset_in(); + do_requeue_out_sent_up_to(0); + discard_out_sent(); + return seastar::now(); +} + +seastar::future<> IOHandler::requeue_out_sent( + crosscore_t::seq_t cc_seq) +{ + assert(seastar::this_shard_id() == get_shard_id()); + if (!crosscore.proceed_or_wait(cc_seq)) { + logger().debug("{} got {} requeue_out_sent(), wait at {}", + conn, cc_seq, crosscore.get_in_seq()); + return crosscore.wait(cc_seq + ).then([this, cc_seq] { + return requeue_out_sent(cc_seq); + }); + } + + logger().debug("{} got {} requeue_out_sent()", + conn, cc_seq); + do_requeue_out_sent(); + return seastar::now(); +} + +void IOHandler::do_requeue_out_sent() +{ + assert(get_io_state() != io_state_t::open); + if (out_sent_msgs.empty()) { + return; + } + + out_seq -= out_sent_msgs.size(); + logger().debug("{} requeue {} items, revert out_seq to {}", + conn, out_sent_msgs.size(), out_seq); + for (MessageFRef& msg : out_sent_msgs) { + msg->clear_payload(); + msg->set_seq(0); + } + out_pending_msgs.insert( + out_pending_msgs.begin(), + std::make_move_iterator(out_sent_msgs.begin()), + std::make_move_iterator(out_sent_msgs.end())); + out_sent_msgs.clear(); + maybe_notify_out_dispatch(); +} + +seastar::future<> IOHandler::requeue_out_sent_up_to( + crosscore_t::seq_t cc_seq, + seq_num_t msg_seq) +{ + assert(seastar::this_shard_id() == get_shard_id()); + if (!crosscore.proceed_or_wait(cc_seq)) { + logger().debug("{} got {} requeue_out_sent_up_to(), wait at {}", + conn, cc_seq, crosscore.get_in_seq()); + return crosscore.wait(cc_seq + ).then([this, cc_seq, msg_seq] { + return requeue_out_sent_up_to(cc_seq, msg_seq); + }); + } + + logger().debug("{} got {} 
requeue_out_sent_up_to({})", + conn, cc_seq, msg_seq); + do_requeue_out_sent_up_to(msg_seq); + return seastar::now(); +} + +void IOHandler::do_requeue_out_sent_up_to(seq_num_t seq) +{ + assert(get_io_state() != io_state_t::open); + if (out_sent_msgs.empty() && out_pending_msgs.empty()) { + logger().debug("{} nothing to requeue, reset out_seq from {} to seq {}", + conn, out_seq, seq); + out_seq = seq; + return; + } + logger().debug("{} discarding sent msgs by seq {} (sent_len={}, out_seq={})", + conn, seq, out_sent_msgs.size(), out_seq); + while (!out_sent_msgs.empty()) { + auto cur_seq = out_sent_msgs.front()->get_seq(); + if (cur_seq == 0 || cur_seq > seq) { + break; + } else { + out_sent_msgs.pop_front(); + } + } + do_requeue_out_sent(); +} + +void IOHandler::reset_in() +{ + assert(get_io_state() != io_state_t::open); + in_seq = 0; +} + +void IOHandler::reset_out() +{ + assert(get_io_state() != io_state_t::open); + discard_out_sent(); + out_pending_msgs.clear(); + need_keepalive = false; + next_keepalive_ack = std::nullopt; + ack_left = 0; +} + +void IOHandler::discard_out_sent() +{ + assert(get_io_state() != io_state_t::open); + out_seq = 0; + out_sent_msgs.clear(); +} + +seastar::future<> +IOHandler::dispatch_accept( + crosscore_t::seq_t cc_seq, + seastar::shard_id new_sid, + ConnectionFRef conn_fref, + bool is_replace) +{ + return to_new_sid(cc_seq, new_sid, std::move(conn_fref), is_replace); +} + +seastar::future<> +IOHandler::dispatch_connect( + crosscore_t::seq_t cc_seq, + seastar::shard_id new_sid, + ConnectionFRef conn_fref) +{ + return to_new_sid(cc_seq, new_sid, std::move(conn_fref), std::nullopt); +} + +seastar::future<> +IOHandler::cleanup_prv_shard(seastar::shard_id prv_sid) +{ + assert(seastar::this_shard_id() == get_shard_id()); + return seastar::smp::submit_to(prv_sid, [this] { + logger().debug("{} got cleanup_prv_shard()", conn); + assert(maybe_prv_shard_states != nullptr); + auto ref_prv_states = std::move(maybe_prv_shard_states); + auto &prv_states = *ref_prv_states; + return prv_states.close( + ).then([ref_prv_states=std::move(ref_prv_states)] { + ceph_assert_always(ref_prv_states->assert_closed_and_exit()); + }); + }).then([this] { + ceph_assert_always(maybe_prv_shard_states == nullptr); + }); +} + +seastar::future<> +IOHandler::to_new_sid( + crosscore_t::seq_t cc_seq, + seastar::shard_id new_sid, + ConnectionFRef conn_fref, + std::optional<bool> is_replace) +{ + ceph_assert_always(seastar::this_shard_id() == get_shard_id()); + if (!crosscore.proceed_or_wait(cc_seq)) { + logger().debug("{} got {} to_new_sid(), wait at {}", + conn, cc_seq, crosscore.get_in_seq()); + return crosscore.wait(cc_seq + ).then([this, cc_seq, new_sid, is_replace, + conn_fref=std::move(conn_fref)]() mutable { + return to_new_sid(cc_seq, new_sid, std::move(conn_fref), is_replace); + }); + } + + bool is_accept_or_connect = is_replace.has_value(); + logger().debug("{} got {} to_new_sid_1(new_sid={}, {}) at {}", + conn, cc_seq, new_sid, + fmt::format("{}", + is_accept_or_connect ? + (*is_replace ? "accept(replace)" : "accept(!replace)") : + "connect"), + io_stat_printer{*this}); + auto next_cc_seq = ++cc_seq; + + if (get_io_state() != io_state_t::drop) { + ceph_assert_always(conn_ref); + if (new_sid != seastar::this_shard_id()) { + dispatchers.ms_handle_shard_change(conn_ref, new_sid, is_accept_or_connect); + // user can make changes + } + } else { + // it is possible that both io_handler and protocolv2 are + // trying to close each other from different cores simultaneously. 
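+ // in that case the connection is already dropping, so skip the
+ // ms_handle_shard_change() notification and only carry out the switch.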
+ assert(!protocol_is_connected); + } + + if (get_io_state() != io_state_t::drop) { + if (is_accept_or_connect) { + // protocol_is_connected can be from true to true here if the replacing is + // happening to a connected connection. + } else { + ceph_assert_always(protocol_is_connected == false); + } + protocol_is_connected = true; + } else { + assert(!protocol_is_connected); + } + + bool is_dropped = false; + if (get_io_state() == io_state_t::drop) { + is_dropped = true; + } + ceph_assert_always(get_io_state() != io_state_t::open); + + // apply the switching atomically + ceph_assert_always(conn_ref); + conn_ref.reset(); + auto prv_sid = get_shard_id(); + ceph_assert_always(maybe_prv_shard_states == nullptr); + maybe_prv_shard_states = std::move(shard_states); + shard_states = shard_states_t::create_from_previous( + *maybe_prv_shard_states, new_sid); + assert(new_sid == get_shard_id()); + + return seastar::smp::submit_to(new_sid, + [this, next_cc_seq, is_dropped, prv_sid, is_replace, conn_fref=std::move(conn_fref)]() mutable { + logger().debug("{} got {} to_new_sid_2(prv_sid={}, is_dropped={}, {}) at {}", + conn, next_cc_seq, prv_sid, is_dropped, + fmt::format("{}", + is_replace.has_value() ? + (*is_replace ? "accept(replace)" : "accept(!replace)") : + "connect"), + io_stat_printer{*this}); + + ceph_assert_always(seastar::this_shard_id() == get_shard_id()); + ceph_assert_always(get_io_state() != io_state_t::open); + ceph_assert_always(!maybe_dropped_sid.has_value()); + ceph_assert_always(crosscore.proceed_or_wait(next_cc_seq)); + + if (is_dropped) { + ceph_assert_always(get_io_state() == io_state_t::drop); + ceph_assert_always(shard_states->assert_closed_and_exit()); + maybe_dropped_sid = prv_sid; + // cleanup_prv_shard() will be done in a follow-up close_io() + } else { + // possible at io_state_t::drop + + // previous shard is not cleaned, + // but close_io() is responsible to clean up the current shard, + // so cleanup the previous shard here. 
+ shard_states->dispatch_in_background( + "cleanup_prv_sid", conn, [this, prv_sid] { + return cleanup_prv_shard(prv_sid); + }); + maybe_notify_out_dispatch(); + } + + ceph_assert_always(!conn_ref); + // assign even if already dropping + conn_ref = make_local_shared_foreign(std::move(conn_fref)); + + if (get_io_state() != io_state_t::drop) { + if (is_replace.has_value()) { + dispatchers.ms_handle_accept(conn_ref, prv_sid, *is_replace); + } else { + dispatchers.ms_handle_connect(conn_ref, prv_sid); + } + // user can make changes + } + }); +} + +seastar::future<> IOHandler::set_accepted_sid( + crosscore_t::seq_t cc_seq, + seastar::shard_id sid, + ConnectionFRef conn_fref) +{ + assert(seastar::this_shard_id() == get_shard_id()); + assert(get_io_state() == io_state_t::none); + ceph_assert_always(conn_ref); + conn_ref.reset(); + assert(maybe_prv_shard_states == nullptr); + shard_states.reset(); + shard_states = shard_states_t::create(sid, io_state_t::none); + return seastar::smp::submit_to(sid, + [this, cc_seq, conn_fref=std::move(conn_fref)]() mutable { + // must be the first to proceed + ceph_assert_always(crosscore.proceed_or_wait(cc_seq)); + + logger().debug("{} set accepted sid", conn); + ceph_assert_always(seastar::this_shard_id() == get_shard_id()); + ceph_assert_always(get_io_state() == io_state_t::none); + assert(maybe_prv_shard_states == nullptr); + ceph_assert_always(!conn_ref); + conn_ref = make_local_shared_foreign(std::move(conn_fref)); + }); +} + +void IOHandler::dispatch_reset(bool is_replace) +{ + ceph_assert_always(get_io_state() == io_state_t::drop); + if (!need_dispatch_reset) { + return; + } + need_dispatch_reset = false; + ceph_assert_always(conn_ref); + + dispatchers.ms_handle_reset(conn_ref, is_replace); + // user can make changes +} + +void IOHandler::dispatch_remote_reset() +{ + if (get_io_state() == io_state_t::drop) { + return; + } + ceph_assert_always(conn_ref); + + dispatchers.ms_handle_remote_reset(conn_ref); + // user can make changes +} + +void IOHandler::ack_out_sent(seq_num_t seq) +{ + if (conn.policy.lossy) { // lossy connections don't keep sent messages + return; + } + while (!out_sent_msgs.empty() && + out_sent_msgs.front()->get_seq() <= seq) { + logger().trace("{} got ack seq {} >= {}, pop {}", + conn, seq, out_sent_msgs.front()->get_seq(), + *out_sent_msgs.front()); + out_sent_msgs.pop_front(); + } +} + +seastar::future<> +IOHandler::do_out_dispatch(shard_states_t &ctx) +{ + return seastar::repeat([this, &ctx] { + switch (ctx.get_io_state()) { + case io_state_t::open: { + if (unlikely(!is_out_queued())) { + // try exit open dispatching + return frame_assembler->flush<false>( + ).then([this, &ctx] { + if (ctx.get_io_state() != io_state_t::open || is_out_queued()) { + return seastar::make_ready_future<stop_t>(stop_t::no); + } + // still nothing pending to send after flush, + // open dispatching can ONLY stop now + ctx.exit_out_dispatching("exit-open", conn); + return seastar::make_ready_future<stop_t>(stop_t::yes); + }); + } + + auto require_keepalive = need_keepalive; + need_keepalive = false; + auto maybe_keepalive_ack = next_keepalive_ack; + next_keepalive_ack = std::nullopt; + auto to_ack = ack_left; + assert(to_ack == 0 || in_seq > 0); + ack_left = 0; +#ifdef UNIT_TESTS_BUILT + auto ret = sweep_out_pending_msgs_to_sent( + require_keepalive, maybe_keepalive_ack, to_ack > 0); + return frame_assembler->intercept_frames(ret.tags, true + ).then([this, bl=std::move(ret.bl)]() mutable { + return frame_assembler->write<false>(std::move(bl)); + } +#else + auto 
bl = sweep_out_pending_msgs_to_sent( + require_keepalive, maybe_keepalive_ack, to_ack > 0); + return frame_assembler->write<false>(std::move(bl) +#endif + ).then([this, &ctx] { + if (ctx.get_io_state() != io_state_t::open) { + return frame_assembler->flush<false>( + ).then([] { + return seastar::make_ready_future<stop_t>(stop_t::no); + }); + } + + // FIXME: may leak a flush if state is changed after return and before + // the next repeat body. + return seastar::make_ready_future<stop_t>(stop_t::no); + }); + } + case io_state_t::delay: + // delay out dispatching until open + ctx.notify_out_dispatching_stopped("delay...", conn); + return ctx.wait_state_change( + ).then([] { return stop_t::no; }); + case io_state_t::drop: + ctx.exit_out_dispatching("dropped", conn); + return seastar::make_ready_future<stop_t>(stop_t::yes); + case io_state_t::switched: + ctx.exit_out_dispatching("switched", conn); + return seastar::make_ready_future<stop_t>(stop_t::yes); + default: + ceph_abort("impossible"); + } + }).handle_exception_type([this, &ctx](const std::system_error& e) { + auto io_state = ctx.get_io_state(); + if (e.code() != std::errc::broken_pipe && + e.code() != std::errc::connection_reset && + e.code() != error::negotiation_failure) { + logger().error("{} do_out_dispatch(): unexpected error at {} -- {}", + conn, io_state, e.what()); + ceph_abort(); + } + + if (io_state == io_state_t::open) { + auto cc_seq = crosscore.prepare_submit(); + logger().info("{} do_out_dispatch(): fault at {}, {}, going to delay -- {}, " + "send {} notify_out_fault()", + conn, io_state, io_stat_printer{*this}, e.what(), cc_seq); + std::exception_ptr eptr; + try { + throw e; + } catch(...) { + eptr = std::current_exception(); + } + do_set_io_state(io_state_t::delay); + shard_states->dispatch_in_background( + "notify_out_fault(out)", conn, [this, cc_seq, eptr] { + auto states = get_states(); + return seastar::smp::submit_to( + conn.get_messenger_shard_id(), [this, cc_seq, eptr, states] { + return handshake_listener->notify_out_fault( + cc_seq, "do_out_dispatch", eptr, states); + }); + }); + } else { + if (io_state != io_state_t::switched) { + logger().info("{} do_out_dispatch(): fault at {}, {} -- {}", + conn, io_state, io_stat_printer{*this}, e.what()); + } else { + logger().info("{} do_out_dispatch(): fault at {} -- {}", + conn, io_state, e.what()); + } + } + + return do_out_dispatch(ctx); + }); +} + +void IOHandler::maybe_notify_out_dispatch() +{ + ceph_assert_always(seastar::this_shard_id() == get_shard_id()); + if (is_out_queued()) { + notify_out_dispatch(); + } +} + +void IOHandler::notify_out_dispatch() +{ + ceph_assert_always(seastar::this_shard_id() == get_shard_id()); + assert(is_out_queued()); + if (need_notify_out) { + auto cc_seq = crosscore.prepare_submit(); + logger().debug("{} send {} notify_out()", + conn, cc_seq); + shard_states->dispatch_in_background( + "notify_out", conn, [this, cc_seq] { + return seastar::smp::submit_to( + conn.get_messenger_shard_id(), [this, cc_seq] { + return handshake_listener->notify_out(cc_seq); + }); + }); + } + if (shard_states->try_enter_out_dispatching()) { + shard_states->dispatch_in_background( + "do_out_dispatch", conn, [this] { + return do_out_dispatch(*shard_states); + }); + } +} + +seastar::future<> +IOHandler::read_message( + shard_states_t &ctx, + utime_t throttle_stamp, + std::size_t msg_size) +{ + return frame_assembler->read_frame_payload<false>( + ).then([this, throttle_stamp, msg_size, &ctx](auto payload) { + if (unlikely(ctx.get_io_state() != 
io_state_t::open)) { + logger().debug("{} triggered {} during read_message()", + conn, ctx.get_io_state()); + abort_protocol(); + } + + utime_t recv_stamp{seastar::lowres_system_clock::now()}; + + // we need to get the size before std::moving segments data + auto msg_frame = MessageFrame::Decode(*payload); + // XXX: paranoid copy just to avoid oops + ceph_msg_header2 current_header = msg_frame.header(); + + logger().trace("{} got {} + {} + {} byte message," + " envelope type={} src={} off={} seq={}", + conn, + msg_frame.front_len(), + msg_frame.middle_len(), + msg_frame.data_len(), + current_header.type, + conn.get_peer_name(), + current_header.data_off, + current_header.seq); + + ceph_msg_header header{current_header.seq, + current_header.tid, + current_header.type, + current_header.priority, + current_header.version, + ceph_le32(msg_frame.front_len()), + ceph_le32(msg_frame.middle_len()), + ceph_le32(msg_frame.data_len()), + current_header.data_off, + conn.get_peer_name(), + current_header.compat_version, + current_header.reserved, + ceph_le32(0)}; + ceph_msg_footer footer{ceph_le32(0), ceph_le32(0), + ceph_le32(0), ceph_le64(0), current_header.flags}; + + Message *message = decode_message(nullptr, 0, header, footer, + msg_frame.front(), msg_frame.middle(), msg_frame.data(), nullptr); + if (!message) { + logger().warn("{} decode message failed", conn); + abort_in_fault(); + } + + // store reservation size in message, so we don't get confused + // by messages entering the dispatch queue through other paths. + message->set_dispatch_throttle_size(msg_size); + + message->set_throttle_stamp(throttle_stamp); + message->set_recv_stamp(recv_stamp); + message->set_recv_complete_stamp(utime_t{seastar::lowres_system_clock::now()}); + + // check received seq#. if it is old, drop the message. + // note that incoming messages may skip ahead. this is convenient for the + // client side queueing because messages can't be renumbered, but the (kernel) + // client will occasionally pull a message out of the sent queue to send + // elsewhere. in that case it doesn't matter if we "got" it or not. + uint64_t cur_seq = in_seq; + if (message->get_seq() <= cur_seq) { + logger().error("{} got old message {} <= {} {}, discarding", + conn, message->get_seq(), cur_seq, *message); + if (HAVE_FEATURE(conn.features, RECONNECT_SEQ) && + local_conf()->ms_die_on_old_message) { + ceph_assert(0 == "old msgs despite reconnect_seq feature"); + } + return seastar::now(); + } else if (message->get_seq() > cur_seq + 1) { + logger().error("{} missed message? skipped from seq {} to {}", + conn, cur_seq, message->get_seq()); + if (local_conf()->ms_die_on_skipped_message) { + ceph_assert(0 == "skipped incoming seq"); + } + } + + // note last received message. 
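+    // (advancing in_seq is what eventually gets acknowledged back to the
+    //  peer: the "notify ack" block below bumps ack_left, which
+    //  do_out_dispatch() later flushes out to the peer)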
+ in_seq = message->get_seq(); + if (conn.policy.lossy) { + logger().debug("{} <== #{} === {} ({})", + conn, + message->get_seq(), + *message, + message->get_type()); + } else { + logger().debug("{} <== #{},{} === {} ({})", + conn, + message->get_seq(), + current_header.ack_seq, + *message, + message->get_type()); + } + + // notify ack + if (!conn.policy.lossy) { + ++ack_left; + notify_out_dispatch(); + } + + ack_out_sent(current_header.ack_seq); + + // TODO: change MessageRef with seastar::shared_ptr + auto msg_ref = MessageRef{message, false}; + assert(ctx.get_io_state() == io_state_t::open); + assert(get_io_state() == io_state_t::open); + ceph_assert_always(conn_ref); + + // throttle the reading process by the returned future + return dispatchers.ms_dispatch(conn_ref, std::move(msg_ref)); + // user can make changes + }); +} + +void IOHandler::do_in_dispatch() +{ + shard_states->enter_in_dispatching(); + shard_states->dispatch_in_background( + "do_in_dispatch", conn, [this, &ctx=*shard_states] { + return seastar::keep_doing([this, &ctx] { + return frame_assembler->read_main_preamble<false>( + ).then([this, &ctx](auto ret) { + switch (ret.tag) { + case Tag::MESSAGE: { + size_t msg_size = get_msg_size(*ret.rx_frame_asm); + return seastar::futurize_invoke([this] { + // throttle_message() logic + if (!conn.policy.throttler_messages) { + return seastar::now(); + } + // TODO: message throttler + ceph_abort("TODO"); + return seastar::now(); + }).then([this, msg_size] { + // throttle_bytes() logic + if (!conn.policy.throttler_bytes) { + return seastar::now(); + } + if (!msg_size) { + return seastar::now(); + } + logger().trace("{} wants {} bytes from policy throttler {}/{}", + conn, msg_size, + conn.policy.throttler_bytes->get_current(), + conn.policy.throttler_bytes->get_max()); + return conn.policy.throttler_bytes->get(msg_size); + }).then([this, msg_size, &ctx] { + // TODO: throttle_dispatch_queue() logic + utime_t throttle_stamp{seastar::lowres_system_clock::now()}; + return read_message(ctx, throttle_stamp, msg_size); + }); + } + case Tag::ACK: + return frame_assembler->read_frame_payload<false>( + ).then([this](auto payload) { + // handle_message_ack() logic + auto ack = AckFrame::Decode(payload->back()); + logger().debug("{} GOT AckFrame: seq={}", conn, ack.seq()); + ack_out_sent(ack.seq()); + }); + case Tag::KEEPALIVE2: + return frame_assembler->read_frame_payload<false>( + ).then([this](auto payload) { + // handle_keepalive2() logic + auto keepalive_frame = KeepAliveFrame::Decode(payload->back()); + logger().debug("{} GOT KeepAliveFrame: timestamp={}", + conn, keepalive_frame.timestamp()); + // notify keepalive ack + next_keepalive_ack = keepalive_frame.timestamp(); + if (seastar::this_shard_id() == get_shard_id()) { + notify_out_dispatch(); + } + + last_keepalive = seastar::lowres_system_clock::now(); + }); + case Tag::KEEPALIVE2_ACK: + return frame_assembler->read_frame_payload<false>( + ).then([this](auto payload) { + // handle_keepalive2_ack() logic + auto keepalive_ack_frame = KeepAliveFrameAck::Decode(payload->back()); + auto _last_keepalive_ack = + seastar::lowres_system_clock::time_point{keepalive_ack_frame.timestamp()}; + set_last_keepalive_ack(_last_keepalive_ack); + logger().debug("{} GOT KeepAliveFrameAck: timestamp={}", + conn, _last_keepalive_ack); + }); + default: { + logger().warn("{} do_in_dispatch() received unexpected tag: {}", + conn, static_cast<uint32_t>(ret.tag)); + abort_in_fault(); + } + } + }); + }).handle_exception([this, &ctx](std::exception_ptr eptr) { + 
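+        // in-dispatch fault path: extract a printable reason first; the
+        // protocol is notified via notify_out_fault() only if the connection
+        // was still open, otherwise the fault is just logged.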
const char *e_what; + try { + std::rethrow_exception(eptr); + } catch (std::exception &e) { + e_what = e.what(); + } + + auto io_state = ctx.get_io_state(); + if (io_state == io_state_t::open) { + auto cc_seq = crosscore.prepare_submit(); + logger().info("{} do_in_dispatch(): fault at {}, {}, going to delay -- {}, " + "send {} notify_out_fault()", + conn, io_state, io_stat_printer{*this}, e_what, cc_seq); + do_set_io_state(io_state_t::delay); + shard_states->dispatch_in_background( + "notify_out_fault(in)", conn, [this, cc_seq, eptr] { + auto states = get_states(); + return seastar::smp::submit_to( + conn.get_messenger_shard_id(), [this, cc_seq, eptr, states] { + return handshake_listener->notify_out_fault( + cc_seq, "do_in_dispatch", eptr, states); + }); + }); + } else { + if (io_state != io_state_t::switched) { + logger().info("{} do_in_dispatch(): fault at {}, {} -- {}", + conn, io_state, io_stat_printer{*this}, e_what); + } else { + logger().info("{} do_in_dispatch(): fault at {} -- {}", + conn, io_state, e_what); + } + } + }).finally([&ctx] { + ctx.exit_in_dispatching(); + }); + }); +} + +seastar::future<> +IOHandler::close_io( + crosscore_t::seq_t cc_seq, + bool is_dispatch_reset, + bool is_replace) +{ + ceph_assert_always(seastar::this_shard_id() == get_shard_id()); + if (!crosscore.proceed_or_wait(cc_seq)) { + logger().debug("{} got {} close_io(), wait at {}", + conn, cc_seq, crosscore.get_in_seq()); + return crosscore.wait(cc_seq + ).then([this, cc_seq, is_dispatch_reset, is_replace] { + return close_io(cc_seq, is_dispatch_reset, is_replace); + }); + } + + logger().debug("{} got {} close_io(reset={}, replace={})", + conn, cc_seq, is_dispatch_reset, is_replace); + ceph_assert_always(get_io_state() == io_state_t::drop); + + if (is_dispatch_reset) { + dispatch_reset(is_replace); + } + + ceph_assert_always(conn_ref); + conn_ref.reset(); + + // cannot be running in parallel with to_new_sid() + if (maybe_dropped_sid.has_value()) { + assert(shard_states->assert_closed_and_exit()); + auto prv_sid = *maybe_dropped_sid; + return cleanup_prv_shard(prv_sid); + } else { + return shard_states->close( + ).then([this] { + assert(shard_states->assert_closed_and_exit()); + }); + } +} + +/* + * IOHandler::shard_states_t + */ + +void +IOHandler::shard_states_t::notify_out_dispatching_stopped( + const char *what, SocketConnection &conn) +{ + assert(seastar::this_shard_id() == sid); + if (unlikely(out_exit_dispatching.has_value())) { + out_exit_dispatching->set_value(); + out_exit_dispatching = std::nullopt; + logger().info("{} do_out_dispatch: stop({}) at {}, set out_exit_dispatching", + conn, what, io_state); + } else { + if (unlikely(io_state != io_state_t::open)) { + logger().info("{} do_out_dispatch: stop({}) at {}, no out_exit_dispatching", + conn, what, io_state); + } + } +} + +seastar::future<> +IOHandler::shard_states_t::wait_io_exit_dispatching() +{ + assert(seastar::this_shard_id() == sid); + assert(io_state != io_state_t::open); + assert(!gate.is_closed()); + return seastar::when_all( + [this] { + if (out_exit_dispatching) { + return out_exit_dispatching->get_future(); + } else { + return seastar::now(); + } + }(), + [this] { + if (in_exit_dispatching) { + return in_exit_dispatching->get_future(); + } else { + return seastar::now(); + } + }() + ).discard_result(); +} + +IOHandler::shard_states_ref_t +IOHandler::shard_states_t::create_from_previous( + shard_states_t &prv_states, + seastar::shard_id new_sid) +{ + auto io_state = prv_states.io_state; + assert(io_state != io_state_t::open); + 
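+  // the previous shard has already left the open state (asserted above), so
+  // its io_state can simply be carried over to the new shard; if it is drop,
+  // the new gate is closed immediately below so it is never used.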
auto ret = shard_states_t::create(new_sid, io_state); + if (io_state == io_state_t::drop) { + // the new gate should not never be used + auto fut = ret->gate.close(); + ceph_assert_always(fut.available()); + } + prv_states.set_io_state(io_state_t::switched); + return ret; +} + +} // namespace crimson::net diff --git a/src/crimson/net/io_handler.h b/src/crimson/net/io_handler.h new file mode 100644 index 000000000..f53c2ba64 --- /dev/null +++ b/src/crimson/net/io_handler.h @@ -0,0 +1,623 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <vector> + +#include <seastar/core/shared_future.hh> +#include <seastar/util/later.hh> + +#include "crimson/common/gated.h" +#include "Fwd.h" +#include "SocketConnection.h" +#include "FrameAssemblerV2.h" + +namespace crimson::net { + +/** + * crosscore_t + * + * To preserve the event order across cores. + */ +class crosscore_t { +public: + using seq_t = uint64_t; + + crosscore_t() = default; + ~crosscore_t() = default; + + seq_t get_in_seq() const { + return in_seq; + } + + seq_t prepare_submit() { + ++out_seq; + return out_seq; + } + + bool proceed_or_wait(seq_t seq) { + if (seq == in_seq + 1) { + ++in_seq; + if (unlikely(in_pr_wait.has_value())) { + in_pr_wait->set_value(); + in_pr_wait = std::nullopt; + } + return true; + } else { + return false; + } + } + + seastar::future<> wait(seq_t seq) { + assert(seq != in_seq + 1); + if (!in_pr_wait.has_value()) { + in_pr_wait = seastar::shared_promise<>(); + } + return in_pr_wait->get_shared_future(); + } + +private: + seq_t out_seq = 0; + seq_t in_seq = 0; + std::optional<seastar::shared_promise<>> in_pr_wait; +}; + +/** + * io_handler_state + * + * It is required to populate the states from IOHandler to ProtocolV2 + * asynchronously. + */ +struct io_handler_state { + seq_num_t in_seq; + bool is_out_queued; + bool has_out_sent; + + bool is_out_queued_or_sent() const { + return is_out_queued || has_out_sent; + } + + /* + * should be consistent with the accroding interfaces in IOHandler + */ + + void reset_session(bool full) { + in_seq = 0; + if (full) { + is_out_queued = false; + has_out_sent = false; + } + } + + void reset_peer_state() { + in_seq = 0; + is_out_queued = is_out_queued_or_sent(); + has_out_sent = false; + } + + void requeue_out_sent_up_to() { + // noop since the information is insufficient + } + + void requeue_out_sent() { + if (has_out_sent) { + has_out_sent = false; + is_out_queued = true; + } + } +}; + +/** + * HandshakeListener + * + * The interface class for IOHandler to notify the ProtocolV2. + * + * The notifications may be cross-core and must be sent to + * SocketConnection::get_messenger_shard_id() + */ +class HandshakeListener { +public: + virtual ~HandshakeListener() = default; + + HandshakeListener(const HandshakeListener&) = delete; + HandshakeListener(HandshakeListener &&) = delete; + HandshakeListener &operator=(const HandshakeListener &) = delete; + HandshakeListener &operator=(HandshakeListener &&) = delete; + + virtual seastar::future<> notify_out( + crosscore_t::seq_t cc_seq) = 0; + + virtual seastar::future<> notify_out_fault( + crosscore_t::seq_t cc_seq, + const char *where, + std::exception_ptr, + io_handler_state) = 0; + + virtual seastar::future<> notify_mark_down( + crosscore_t::seq_t cc_seq) = 0; + +protected: + HandshakeListener() = default; +}; + +/** + * IOHandler + * + * Implements the message read and write paths after the handshake, and also be + * responsible to dispatch events. 
It is supposed to be working on the same + * core with the underlying socket and the FrameAssemblerV2 class. + */ +class IOHandler final : public ConnectionHandler { +public: + IOHandler(ChainedDispatchers &, + SocketConnection &); + + ~IOHandler() final; + + IOHandler(const IOHandler &) = delete; + IOHandler(IOHandler &&) = delete; + IOHandler &operator=(const IOHandler &) = delete; + IOHandler &operator=(IOHandler &&) = delete; + +/* + * as ConnectionHandler + */ +public: + seastar::shard_id get_shard_id() const final { + return shard_states->get_shard_id(); + } + + bool is_connected() const final { + ceph_assert_always(seastar::this_shard_id() == get_shard_id()); + return protocol_is_connected; + } + + seastar::future<> send(MessageFRef msg) final; + + seastar::future<> send_keepalive() final; + + clock_t::time_point get_last_keepalive() const final { + ceph_assert_always(seastar::this_shard_id() == get_shard_id()); + return last_keepalive; + } + + clock_t::time_point get_last_keepalive_ack() const final { + ceph_assert_always(seastar::this_shard_id() == get_shard_id()); + return last_keepalive_ack; + } + + void set_last_keepalive_ack(clock_t::time_point when) final { + ceph_assert_always(seastar::this_shard_id() == get_shard_id()); + last_keepalive_ack = when; + } + + void mark_down() final; + +/* + * as IOHandler to be called by ProtocolV2 handshake + * + * The calls may be cross-core and asynchronous + */ +public: + /* + * should not be called cross-core + */ + + void set_handshake_listener(HandshakeListener &hl) { + assert(seastar::this_shard_id() == get_shard_id()); + ceph_assert_always(handshake_listener == nullptr); + handshake_listener = &hl; + } + + io_handler_state get_states() const { + // might be called from prv_sid during wait_io_exit_dispatching() + return {in_seq, is_out_queued(), has_out_sent()}; + } + + struct io_stat_printer { + const IOHandler &io_handler; + }; + void print_io_stat(std::ostream &out) const; + + seastar::future<> set_accepted_sid( + crosscore_t::seq_t cc_seq, + seastar::shard_id sid, + ConnectionFRef conn_fref); + + /* + * may be called cross-core + */ + + seastar::future<> close_io( + crosscore_t::seq_t cc_seq, + bool is_dispatch_reset, + bool is_replace); + + /** + * io_state_t + * + * The io_state is changed with the protocol state, to control the + * io behavior accordingly. + */ + enum class io_state_t : uint8_t { + none, // no IO is possible as the connection is not available to the user yet. + delay, // IO is delayed until open. + open, // Dispatch In and Out concurrently. + drop, // Drop IO as the connection is closed. 
+ switched // IO is switched to a different core + // (is moved to maybe_prv_shard_states) + }; + friend class fmt::formatter<io_state_t>; + + seastar::future<> set_io_state( + crosscore_t::seq_t cc_seq, + io_state_t new_state, + FrameAssemblerV2Ref fa, + bool set_notify_out); + + struct exit_dispatching_ret { + FrameAssemblerV2Ref frame_assembler; + io_handler_state io_states; + }; + seastar::future<exit_dispatching_ret> + wait_io_exit_dispatching( + crosscore_t::seq_t cc_seq); + + seastar::future<> reset_session( + crosscore_t::seq_t cc_seq, + bool full); + + seastar::future<> reset_peer_state( + crosscore_t::seq_t cc_seq); + + seastar::future<> requeue_out_sent_up_to( + crosscore_t::seq_t cc_seq, + seq_num_t msg_seq); + + seastar::future<> requeue_out_sent( + crosscore_t::seq_t cc_seq); + + seastar::future<> dispatch_accept( + crosscore_t::seq_t cc_seq, + seastar::shard_id new_sid, + ConnectionFRef, + bool is_replace); + + seastar::future<> dispatch_connect( + crosscore_t::seq_t cc_seq, + seastar::shard_id new_sid, + ConnectionFRef); + + private: + class shard_states_t; + using shard_states_ref_t = std::unique_ptr<shard_states_t>; + + class shard_states_t { + public: + shard_states_t(seastar::shard_id _sid, io_state_t state) + : sid{_sid}, io_state{state} {} + + seastar::shard_id get_shard_id() const { + return sid; + } + + io_state_t get_io_state() const { + assert(seastar::this_shard_id() == sid); + return io_state; + } + + void set_io_state(io_state_t new_state) { + assert(seastar::this_shard_id() == sid); + assert(io_state != new_state); + pr_io_state_changed.set_value(); + pr_io_state_changed = seastar::promise<>(); + if (io_state == io_state_t::open) { + // from open + if (out_dispatching) { + ceph_assert_always(!out_exit_dispatching.has_value()); + out_exit_dispatching = seastar::promise<>(); + } + } + io_state = new_state; + } + + seastar::future<> wait_state_change() { + assert(seastar::this_shard_id() == sid); + return pr_io_state_changed.get_future(); + } + + template <typename Func> + void dispatch_in_background( + const char *what, SocketConnection &who, Func &&func) { + assert(seastar::this_shard_id() == sid); + ceph_assert_always(!gate.is_closed()); + gate.dispatch_in_background(what, who, std::move(func)); + } + + void enter_in_dispatching() { + assert(seastar::this_shard_id() == sid); + assert(io_state == io_state_t::open); + ceph_assert_always(!in_exit_dispatching.has_value()); + in_exit_dispatching = seastar::promise<>(); + } + + void exit_in_dispatching() { + assert(seastar::this_shard_id() == sid); + assert(io_state != io_state_t::open); + ceph_assert_always(in_exit_dispatching.has_value()); + in_exit_dispatching->set_value(); + in_exit_dispatching = std::nullopt; + } + + bool try_enter_out_dispatching() { + assert(seastar::this_shard_id() == sid); + if (out_dispatching) { + // already dispatching out + return false; + } + switch (io_state) { + case io_state_t::open: + [[fallthrough]]; + case io_state_t::delay: + out_dispatching = true; + return true; + case io_state_t::drop: + [[fallthrough]]; + case io_state_t::switched: + // do not dispatch out + return false; + default: + ceph_abort("impossible"); + } + } + + void notify_out_dispatching_stopped( + const char *what, SocketConnection &conn); + + void exit_out_dispatching( + const char *what, SocketConnection &conn) { + assert(seastar::this_shard_id() == sid); + ceph_assert_always(out_dispatching); + out_dispatching = false; + notify_out_dispatching_stopped(what, conn); + } + + seastar::future<> 
wait_io_exit_dispatching(); + + seastar::future<> close() { + assert(seastar::this_shard_id() == sid); + assert(!gate.is_closed()); + return gate.close(); + } + + bool assert_closed_and_exit() const { + assert(seastar::this_shard_id() == sid); + if (gate.is_closed()) { + ceph_assert_always(io_state == io_state_t::drop || + io_state == io_state_t::switched); + ceph_assert_always(!out_dispatching); + ceph_assert_always(!out_exit_dispatching); + ceph_assert_always(!in_exit_dispatching); + return true; + } else { + return false; + } + } + + static shard_states_ref_t create( + seastar::shard_id sid, io_state_t state) { + return std::make_unique<shard_states_t>(sid, state); + } + + static shard_states_ref_t create_from_previous( + shard_states_t &prv_states, seastar::shard_id new_sid); + + private: + const seastar::shard_id sid; + io_state_t io_state; + + crimson::common::Gated gate; + seastar::promise<> pr_io_state_changed; + bool out_dispatching = false; + std::optional<seastar::promise<>> out_exit_dispatching; + std::optional<seastar::promise<>> in_exit_dispatching; + }; + + void do_set_io_state( + io_state_t new_state, + std::optional<crosscore_t::seq_t> cc_seq = std::nullopt, + FrameAssemblerV2Ref fa = nullptr, + bool set_notify_out = false); + + io_state_t get_io_state() const { + return shard_states->get_io_state(); + } + + void do_requeue_out_sent(); + + void do_requeue_out_sent_up_to(seq_num_t seq); + + void assign_frame_assembler(FrameAssemblerV2Ref); + + seastar::future<> send_redirected(MessageFRef msg); + + seastar::future<> do_send(MessageFRef msg); + + seastar::future<> send_keepalive_redirected(); + + seastar::future<> do_send_keepalive(); + + seastar::future<> to_new_sid( + crosscore_t::seq_t cc_seq, + seastar::shard_id new_sid, + ConnectionFRef, + std::optional<bool> is_replace); + + void dispatch_reset(bool is_replace); + + void dispatch_remote_reset(); + + bool is_out_queued() const { + return (!out_pending_msgs.empty() || + ack_left > 0 || + need_keepalive || + next_keepalive_ack.has_value()); + } + + bool has_out_sent() const { + return !out_sent_msgs.empty(); + } + + void reset_in(); + + void reset_out(); + + void discard_out_sent(); + + seastar::future<> do_out_dispatch(shard_states_t &ctx); + +#ifdef UNIT_TESTS_BUILT + struct sweep_ret { + ceph::bufferlist bl; + std::vector<ceph::msgr::v2::Tag> tags; + }; + sweep_ret +#else + ceph::bufferlist +#endif + sweep_out_pending_msgs_to_sent( + bool require_keepalive, + std::optional<utime_t> maybe_keepalive_ack, + bool require_ack); + + void maybe_notify_out_dispatch(); + + void notify_out_dispatch(); + + void ack_out_sent(seq_num_t seq); + + seastar::future<> read_message( + shard_states_t &ctx, + utime_t throttle_stamp, + std::size_t msg_size); + + void do_in_dispatch(); + + seastar::future<> cleanup_prv_shard(seastar::shard_id prv_sid); + +private: + shard_states_ref_t shard_states; + + crosscore_t crosscore; + + // drop was happening in the previous sid + std::optional<seastar::shard_id> maybe_dropped_sid; + + // the remaining states in the previous sid for cleanup, see to_new_sid() + shard_states_ref_t maybe_prv_shard_states; + + ChainedDispatchers &dispatchers; + + SocketConnection &conn; + + // core local reference for dispatching, valid until reset/close + ConnectionRef conn_ref; + + HandshakeListener *handshake_listener = nullptr; + + FrameAssemblerV2Ref frame_assembler; + + bool protocol_is_connected = false; + + bool need_dispatch_reset = true; + + /* + * out states for writing + */ + + /// the seq num of the last 
transmitted message + seq_num_t out_seq = 0; + + // messages to be resent after connection gets reset + std::deque<MessageFRef> out_pending_msgs; + + // messages sent, but not yet acked by peer + std::deque<MessageFRef> out_sent_msgs; + + bool need_keepalive = false; + + std::optional<utime_t> next_keepalive_ack = std::nullopt; + + uint64_t ack_left = 0; + + bool need_notify_out = false; + + /* + * in states for reading + */ + + /// the seq num of the last received message + seq_num_t in_seq = 0; + + clock_t::time_point last_keepalive; + + clock_t::time_point last_keepalive_ack; +}; + +inline std::ostream& operator<<( + std::ostream& out, IOHandler::io_stat_printer stat) { + stat.io_handler.print_io_stat(out); + return out; +} + +} // namespace crimson::net + +template <> +struct fmt::formatter<crimson::net::io_handler_state> { + constexpr auto parse(format_parse_context& ctx) { + return ctx.begin(); + } + + template <typename FormatContext> + auto format(crimson::net::io_handler_state state, FormatContext& ctx) { + return fmt::format_to( + ctx.out(), + "io(in_seq={}, is_out_queued={}, has_out_sent={})", + state.in_seq, + state.is_out_queued, + state.has_out_sent); + } +}; + +template <> +struct fmt::formatter<crimson::net::IOHandler::io_state_t> + : fmt::formatter<std::string_view> { + template <typename FormatContext> + auto format(crimson::net::IOHandler::io_state_t state, FormatContext& ctx) { + using enum crimson::net::IOHandler::io_state_t; + std::string_view name; + switch (state) { + case none: + name = "none"; + break; + case delay: + name = "delay"; + break; + case open: + name = "open"; + break; + case drop: + name = "drop"; + break; + case switched: + name = "switched"; + break; + } + return formatter<string_view>::format(name, ctx); + } +}; + +#if FMT_VERSION >= 90000 +template <> struct fmt::formatter<crimson::net::IOHandler::io_stat_printer> : fmt::ostream_formatter {}; +#endif diff --git a/src/crimson/os/CMakeLists.txt b/src/crimson/os/CMakeLists.txt new file mode 100644 index 000000000..5054cabf4 --- /dev/null +++ b/src/crimson/os/CMakeLists.txt @@ -0,0 +1,16 @@ +add_library(crimson-os STATIC + futurized_store.cc + ${PROJECT_SOURCE_DIR}/src/os/Transaction.cc) +add_subdirectory(cyanstore) + +if(WITH_BLUESTORE) + add_subdirectory(alienstore) + set(alienstore_lib crimson-alienstore) +endif() + +add_subdirectory(seastore) +target_link_libraries(crimson-os + crimson-cyanstore + ${alienstore_lib} + crimson-seastore + crimson) diff --git a/src/crimson/os/alienstore/CMakeLists.txt b/src/crimson/os/alienstore/CMakeLists.txt new file mode 100644 index 000000000..c881f4fbc --- /dev/null +++ b/src/crimson/os/alienstore/CMakeLists.txt @@ -0,0 +1,86 @@ +include_directories(SYSTEM "${CMAKE_SOURCE_DIR}/src/rocksdb/include") + +add_library(alien::cflags INTERFACE IMPORTED) +set_target_properties(alien::cflags PROPERTIES + INTERFACE_COMPILE_DEFINITIONS "WITH_SEASTAR;WITH_ALIEN" + INTERFACE_INCLUDE_DIRECTORIES $<TARGET_PROPERTY:Seastar::seastar,INTERFACE_INCLUDE_DIRECTORIES>) + +set(crimson_alien_common_srcs + ${PROJECT_SOURCE_DIR}/src/common/admin_socket.cc + ${PROJECT_SOURCE_DIR}/src/common/url_escape.cc + ${PROJECT_SOURCE_DIR}/src/common/blkdev.cc + ${PROJECT_SOURCE_DIR}/src/common/ceph_context.cc + ${PROJECT_SOURCE_DIR}/src/common/ceph_crypto.cc + ${PROJECT_SOURCE_DIR}/src/common/Finisher.cc + ${PROJECT_SOURCE_DIR}/src/common/HeartbeatMap.cc + ${PROJECT_SOURCE_DIR}/src/common/PluginRegistry.cc + ${PROJECT_SOURCE_DIR}/src/common/perf_counters.cc + 
${PROJECT_SOURCE_DIR}/src/common/perf_counters_collection.cc + ${PROJECT_SOURCE_DIR}/src/common/RefCountedObj.cc + ${PROJECT_SOURCE_DIR}/src/common/SubProcess.cc + ${PROJECT_SOURCE_DIR}/src/common/Throttle.cc + ${PROJECT_SOURCE_DIR}/src/common/Timer.cc + ${PROJECT_SOURCE_DIR}/src/common/TrackedOp.cc + ${PROJECT_SOURCE_DIR}/src/common/WorkQueue.cc + ${PROJECT_SOURCE_DIR}/src/common/util.cc + ${PROJECT_SOURCE_DIR}/src/crush/CrushLocation.cc + ${PROJECT_SOURCE_DIR}/src/global/global_context.cc + ${PROJECT_SOURCE_DIR}/src/log/Log.cc + $<TARGET_OBJECTS:compressor_objs> + $<TARGET_OBJECTS:common_prioritycache_obj>) +if(WITH_CEPH_DEBUG_MUTEX) + list(APPEND crimson_alien_common_srcs + ${PROJECT_SOURCE_DIR}/src/common/lockdep.cc + ${PROJECT_SOURCE_DIR}/src/common/mutex_debug.cc + ${PROJECT_SOURCE_DIR}/src/common/condition_variable_debug.cc + ${PROJECT_SOURCE_DIR}/src/common/shared_mutex_debug.cc) +endif() +add_library(crimson-alien-common STATIC + ${crimson_alien_common_srcs}) + +target_link_libraries(crimson-alien-common + crimson-common + alien::cflags) + +set(alien_store_srcs + alien_store.cc + thread_pool.cc + alien_log.cc + ${PROJECT_SOURCE_DIR}/src/os/ObjectStore.cc + ${PROJECT_SOURCE_DIR}/src/os/bluestore/Allocator.cc + ${PROJECT_SOURCE_DIR}/src/os/bluestore/AvlAllocator.cc + ${PROJECT_SOURCE_DIR}/src/os/bluestore/BtreeAllocator.cc + ${PROJECT_SOURCE_DIR}/src/os/bluestore/BitmapFreelistManager.cc + ${PROJECT_SOURCE_DIR}/src/os/bluestore/BlueFS.cc + ${PROJECT_SOURCE_DIR}/src/os/bluestore/bluefs_types.cc + ${PROJECT_SOURCE_DIR}/src/os/bluestore/BlueRocksEnv.cc + ${PROJECT_SOURCE_DIR}/src/os/bluestore/BlueStore.cc + ${PROJECT_SOURCE_DIR}/src/os/bluestore/simple_bitmap.cc + ${PROJECT_SOURCE_DIR}/src/os/bluestore/bluestore_types.cc + ${PROJECT_SOURCE_DIR}/src/os/bluestore/fastbmap_allocator_impl.cc + ${PROJECT_SOURCE_DIR}/src/os/bluestore/FreelistManager.cc + ${PROJECT_SOURCE_DIR}/src/os/bluestore/HybridAllocator.cc + ${PROJECT_SOURCE_DIR}/src/os/bluestore/StupidAllocator.cc + ${PROJECT_SOURCE_DIR}/src/os/bluestore/BitmapAllocator.cc + ${PROJECT_SOURCE_DIR}/src/os/memstore/MemStore.cc) +if(WITH_ZBD) + list(APPEND alien_store_srcs + ${PROJECT_SOURCE_DIR}/src/os/bluestore/ZonedFreelistManager.cc + ${PROJECT_SOURCE_DIR}/src/os/bluestore/ZonedAllocator.cc) +endif() +add_library(crimson-alienstore STATIC + ${alien_store_srcs}) +if(WITH_LTTNG) + add_dependencies(crimson-alienstore bluestore-tp) +endif() +target_link_libraries(crimson-alienstore + PRIVATE + alien::cflags + fmt::fmt + kv + heap_profiler + crimson-alien-common + ${BLKID_LIBRARIES} + ${UDEV_LIBRARIES} + crimson + blk) diff --git a/src/crimson/os/alienstore/alien_collection.h b/src/crimson/os/alienstore/alien_collection.h new file mode 100644 index 000000000..17a930e77 --- /dev/null +++ b/src/crimson/os/alienstore/alien_collection.h @@ -0,0 +1,39 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "os/ObjectStore.h" + +#include "crimson/os/futurized_collection.h" +#include "crimson/os/futurized_store.h" +#include "alien_store.h" + +namespace crimson::os { + +class AlienCollection final : public FuturizedCollection { +public: + AlienCollection(ObjectStore::CollectionHandle ch) + : FuturizedCollection(ch->cid), + collection(ch) {} + + ~AlienCollection() {} + + template <typename Func, typename Result = std::invoke_result_t<Func>> + seastar::futurize_t<Result> with_lock(Func&& func) { + // newer versions of Seastar provide two variants of `with_lock` + // - generic, 
friendly towards throwing move constructors of Func, + // - specialized for `noexcept`. + // unfortunately, the former has a limitation: the return value + // of `Func` must be compatible with `current_exception_as_future()` + // which boils down to returning `seastar::future<void>`. + static_assert(std::is_nothrow_move_constructible_v<Func>); + return seastar::with_lock(mutex, std::forward<Func>(func)); + } + +private: + ObjectStore::CollectionHandle collection; + seastar::shared_mutex mutex; + friend AlienStore; +}; +} diff --git a/src/crimson/os/alienstore/alien_log.cc b/src/crimson/os/alienstore/alien_log.cc new file mode 100644 index 000000000..b371af897 --- /dev/null +++ b/src/crimson/os/alienstore/alien_log.cc @@ -0,0 +1,33 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "alien_log.h" +#include "log/SubsystemMap.h" +#include <seastar/core/alien.hh> +#include "crimson/common/log.h" + +namespace ceph::logging { +CnLog::CnLog(const SubsystemMap *s, seastar::alien::instance& inst, unsigned shard) + :Log(s) + ,inst(inst) + ,shard(shard) { +} + +CnLog::~CnLog() { +} + +void CnLog::_flush(EntryVector& q, bool crash) { + seastar::alien::submit_to(inst, shard, [&q] { + for (auto& it : q) { + crimson::get_logger(it.m_subsys).log( + crimson::to_log_level(it.m_prio), + "{}", + it.strv()); + } + return seastar::make_ready_future<>(); + }).wait(); + q.clear(); + return; +} + +} //namespace ceph::logging diff --git a/src/crimson/os/alienstore/alien_log.h b/src/crimson/os/alienstore/alien_log.h new file mode 100644 index 000000000..0f966d9ab --- /dev/null +++ b/src/crimson/os/alienstore/alien_log.h @@ -0,0 +1,31 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef ALIEN_LOG_H +#define ALIEN_LOG_H + +#include "log/Log.h" + +namespace ceph { +namespace logging { +class SubsystemMap; +} +} + +namespace seastar::alien { + class instance; +} +namespace ceph::logging +{ +class CnLog : public ceph::logging::Log +{ + seastar::alien::instance& inst; + unsigned shard; + void _flush(EntryVector& q, bool crash) override; +public: + CnLog(const SubsystemMap *s, seastar::alien::instance& inst, unsigned shard); + ~CnLog() override; +}; +} + +#endif diff --git a/src/crimson/os/alienstore/alien_store.cc b/src/crimson/os/alienstore/alien_store.cc new file mode 100644 index 000000000..61f23de97 --- /dev/null +++ b/src/crimson/os/alienstore/alien_store.cc @@ -0,0 +1,620 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "alien_collection.h" +#include "alien_store.h" +#include "alien_log.h" + +#include <algorithm> +#include <iterator> +#include <map> +#include <string_view> +#include <boost/algorithm/string/trim.hpp> +#include <boost/iterator/counting_iterator.hpp> +#include <fmt/format.h> +#include <fmt/ostream.h> + +#include <seastar/core/alien.hh> +#include <seastar/core/future-util.hh> +#include <seastar/core/reactor.hh> +#include <seastar/core/resource.hh> + +#include "common/ceph_context.h" +#include "global/global_context.h" +#include "include/Context.h" +#include "os/ObjectStore.h" +#include "os/Transaction.h" + +#include "crimson/common/config_proxy.h" +#include "crimson/common/log.h" +#include "crimson/os/futurized_store.h" + +using std::map; +using std::set; +using std::string; + +namespace { + +seastar::logger& logger() +{ + return crimson::get_logger(ceph_subsys_alienstore); +} + +class OnCommit final: public Context 
+{ + const int cpuid; + seastar::alien::instance &alien; + seastar::promise<> &alien_done; +public: + OnCommit( + int id, + seastar::promise<> &done, + seastar::alien::instance &alien, + ceph::os::Transaction& txn) + : cpuid(id), + alien(alien), + alien_done(done) { + } + + void finish(int) final { + return seastar::alien::submit_to(alien, cpuid, [this] { + alien_done.set_value(); + return seastar::make_ready_future<>(); + }).wait(); + } +}; +} + +namespace crimson::os { + +using crimson::common::get_conf; + +AlienStore::AlienStore(const std::string& type, + const std::string& path, + const ConfigValues& values) + : type(type), + path{path}, + values(values) +{ +} + +AlienStore::~AlienStore() +{ +} + +seastar::future<> AlienStore::start() +{ + cct = std::make_unique<CephContext>( + CEPH_ENTITY_TYPE_OSD, + CephContext::create_options { CODE_ENVIRONMENT_UTILITY, 0, + [](const ceph::logging::SubsystemMap* subsys_map) { + return new ceph::logging::CnLog(subsys_map, seastar::engine().alien(), seastar::this_shard_id()); + } + } + ); + g_ceph_context = cct.get(); + cct->_conf.set_config_values(values); + cct->_log->start(); + + store = ObjectStore::create(cct.get(), type, path); + if (!store) { + ceph_abort_msgf("unsupported objectstore type: %s", type.c_str()); + } + auto cpu_cores = seastar::resource::parse_cpuset( + get_conf<std::string>("crimson_alien_thread_cpu_cores")); + // cores except the first "N_CORES_FOR_SEASTAR" ones will + // be used for alien threads scheduling: + // [0, N_CORES_FOR_SEASTAR) are reserved for seastar reactors + // [N_CORES_FOR_SEASTAR, ..] are assigned to alien threads. + if (!cpu_cores.has_value()) { + seastar::resource::cpuset cpuset; + std::copy(boost::counting_iterator<unsigned>(N_CORES_FOR_SEASTAR), + boost::counting_iterator<unsigned>(sysconf(_SC_NPROCESSORS_ONLN)), + std::inserter(cpuset, cpuset.end())); + if (cpuset.empty()) { + logger().error("{}: unable to get nproc: {}", __func__, errno); + } else { + cpu_cores = cpuset; + } + } + const auto num_threads = + get_conf<uint64_t>("crimson_alien_op_num_threads"); + tp = std::make_unique<crimson::os::ThreadPool>(num_threads, 128, cpu_cores); + return tp->start(); +} + +seastar::future<> AlienStore::stop() +{ + if (!tp) { + // not really started yet + return seastar::now(); + } + return tp->submit([this] { + for (auto [cid, ch]: coll_map) { + static_cast<AlienCollection*>(ch.get())->collection.reset(); + } + store.reset(); + cct.reset(); + g_ceph_context = nullptr; + + }).then([this] { + return tp->stop(); + }); +} + +AlienStore::mount_ertr::future<> AlienStore::mount() +{ + logger().debug("{}", __func__); + assert(tp); + return tp->submit([this] { + return store->mount(); + }).then([] (const int r) -> mount_ertr::future<> { + if (r != 0) { + return crimson::stateful_ec{ + std::error_code(-r, std::generic_category()) }; + } else { + return mount_ertr::now(); + } + }); +} + +seastar::future<> AlienStore::umount() +{ + logger().info("{}", __func__); + if (!tp) { + // not really started yet + return seastar::now(); + } + return op_gate.close().then([this] { + return tp->submit([this] { + return store->umount(); + }); + }).then([] (int r) { + assert(r == 0); + return seastar::now(); + }); +} + +AlienStore::mkfs_ertr::future<> AlienStore::mkfs(uuid_d osd_fsid) +{ + logger().debug("{}", __func__); + store->set_fsid(osd_fsid); + assert(tp); + return tp->submit([this] { + return store->mkfs(); + }).then([] (int r) -> mkfs_ertr::future<> { + if (r != 0) { + return crimson::stateful_ec{ + std::error_code(-r, 
std::generic_category()) }; + } else { + return mkfs_ertr::now(); + } + }); +} + +seastar::future<std::tuple<std::vector<ghobject_t>, ghobject_t>> +AlienStore::list_objects(CollectionRef ch, + const ghobject_t& start, + const ghobject_t& end, + uint64_t limit) const +{ + logger().debug("{}", __func__); + assert(tp); + return do_with_op_gate(std::vector<ghobject_t>(), ghobject_t(), + [=, this] (auto &objects, auto &next) { + objects.reserve(limit); + return tp->submit(ch->get_cid().hash_to_shard(tp->size()), + [=, this, &objects, &next] { + auto c = static_cast<AlienCollection*>(ch.get()); + return store->collection_list(c->collection, start, end, + store->get_ideal_list_max(), + &objects, &next); + }).then([&objects, &next] (int r) { + assert(r == 0); + return seastar::make_ready_future< + std::tuple<std::vector<ghobject_t>, ghobject_t>>( + std::move(objects), std::move(next)); + }); + }); +} + +seastar::future<CollectionRef> AlienStore::create_new_collection(const coll_t& cid) +{ + logger().debug("{}", __func__); + assert(tp); + return tp->submit([this, cid] { + return store->create_new_collection(cid); + }).then([this, cid] (ObjectStore::CollectionHandle c) { + CollectionRef ch; + auto cp = coll_map.find(c->cid); + if (cp == coll_map.end()) { + ch = new AlienCollection(c); + coll_map[c->cid] = ch; + } else { + ch = cp->second; + auto ach = static_cast<AlienCollection*>(ch.get()); + if (ach->collection != c) { + ach->collection = c; + } + } + return seastar::make_ready_future<CollectionRef>(ch); + }); + +} + +seastar::future<CollectionRef> AlienStore::open_collection(const coll_t& cid) +{ + logger().debug("{}", __func__); + assert(tp); + return tp->submit([this, cid] { + return store->open_collection(cid); + }).then([this] (ObjectStore::CollectionHandle c) { + if (!c) { + return seastar::make_ready_future<CollectionRef>(); + } + CollectionRef ch; + auto cp = coll_map.find(c->cid); + if (cp == coll_map.end()){ + ch = new AlienCollection(c); + coll_map[c->cid] = ch; + } else { + ch = cp->second; + auto ach = static_cast<AlienCollection*>(ch.get()); + if (ach->collection != c){ + ach->collection = c; + } + } + return seastar::make_ready_future<CollectionRef>(ch); + }); +} + +seastar::future<std::vector<coll_core_t>> AlienStore::list_collections() +{ + logger().debug("{}", __func__); + assert(tp); + + return do_with_op_gate(std::vector<coll_t>{}, [this] (auto &ls) { + return tp->submit([this, &ls] { + return store->list_collections(ls); + }).then([&ls] (int r) -> seastar::future<std::vector<coll_core_t>> { + assert(r == 0); + std::vector<coll_core_t> ret; + ret.resize(ls.size()); + std::transform( + ls.begin(), ls.end(), ret.begin(), + [](auto p) { return std::make_pair(p, NULL_CORE); }); + return seastar::make_ready_future<std::vector<coll_core_t>>(std::move(ret)); + }); + }); +} + +AlienStore::read_errorator::future<ceph::bufferlist> +AlienStore::read(CollectionRef ch, + const ghobject_t& oid, + uint64_t offset, + size_t len, + uint32_t op_flags) +{ + logger().debug("{}", __func__); + assert(tp); + return do_with_op_gate(ceph::bufferlist{}, [=, this] (auto &bl) { + return tp->submit(ch->get_cid().hash_to_shard(tp->size()), [=, this, &bl] { + auto c = static_cast<AlienCollection*>(ch.get()); + return store->read(c->collection, oid, offset, len, bl, op_flags); + }).then([&bl] (int r) -> read_errorator::future<ceph::bufferlist> { + if (r == -ENOENT) { + return crimson::ct_error::enoent::make(); + } else if (r == -EIO) { + return crimson::ct_error::input_output_error::make(); + } else { + 
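+        // any other result is treated as success and the filled bufferlist
+        // is handed back on the seastar side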
return read_errorator::make_ready_future<ceph::bufferlist>( + std::move(bl)); + } + }); + }); +} + +AlienStore::read_errorator::future<ceph::bufferlist> +AlienStore::readv(CollectionRef ch, + const ghobject_t& oid, + interval_set<uint64_t>& m, + uint32_t op_flags) +{ + logger().debug("{}", __func__); + assert(tp); + return do_with_op_gate(ceph::bufferlist{}, + [this, ch, oid, &m, op_flags](auto& bl) { + return tp->submit(ch->get_cid().hash_to_shard(tp->size()), + [this, ch, oid, &m, op_flags, &bl] { + auto c = static_cast<AlienCollection*>(ch.get()); + return store->readv(c->collection, oid, m, bl, op_flags); + }).then([&bl](int r) -> read_errorator::future<ceph::bufferlist> { + if (r == -ENOENT) { + return crimson::ct_error::enoent::make(); + } else if (r == -EIO) { + return crimson::ct_error::input_output_error::make(); + } else { + return read_errorator::make_ready_future<ceph::bufferlist>( + std::move(bl)); + } + }); + }); +} + +AlienStore::get_attr_errorator::future<ceph::bufferlist> +AlienStore::get_attr(CollectionRef ch, + const ghobject_t& oid, + std::string_view name) const +{ + logger().debug("{}", __func__); + assert(tp); + return do_with_op_gate(ceph::bufferlist{}, std::string{name}, + [=, this] (auto &value, const auto& name) { + return tp->submit(ch->get_cid().hash_to_shard(tp->size()), [=, this, &value, &name] { + // XXX: `name` isn't a `std::string_view` anymore! it had to be converted + // to `std::string` for the sake of extending life-time not only of + // a _ptr-to-data_ but _data_ as well. Otherwise we would run into a use- + // after-free issue. + auto c = static_cast<AlienCollection*>(ch.get()); + return store->getattr(c->collection, oid, name.c_str(), value); + }).then([oid, &value](int r) -> get_attr_errorator::future<ceph::bufferlist> { + if (r == -ENOENT) { + return crimson::ct_error::enoent::make(); + } else if (r == -ENODATA) { + return crimson::ct_error::enodata::make(); + } else { + return get_attr_errorator::make_ready_future<ceph::bufferlist>( + std::move(value)); + } + }); + }); +} + +AlienStore::get_attrs_ertr::future<AlienStore::attrs_t> +AlienStore::get_attrs(CollectionRef ch, + const ghobject_t& oid) +{ + logger().debug("{}", __func__); + assert(tp); + return do_with_op_gate(attrs_t{}, [=, this] (auto &aset) { + return tp->submit(ch->get_cid().hash_to_shard(tp->size()), [=, this, &aset] { + auto c = static_cast<AlienCollection*>(ch.get()); + const auto r = store->getattrs(c->collection, oid, aset); + return r; + }).then([&aset] (int r) -> get_attrs_ertr::future<attrs_t> { + if (r == -ENOENT) { + return crimson::ct_error::enoent::make(); + } else { + return get_attrs_ertr::make_ready_future<attrs_t>(std::move(aset)); + } + }); + }); +} + +auto AlienStore::omap_get_values(CollectionRef ch, + const ghobject_t& oid, + const set<string>& keys) + -> read_errorator::future<omap_values_t> +{ + logger().debug("{}", __func__); + assert(tp); + return do_with_op_gate(omap_values_t{}, [=, this] (auto &values) { + return tp->submit(ch->get_cid().hash_to_shard(tp->size()), [=, this, &values] { + auto c = static_cast<AlienCollection*>(ch.get()); + return store->omap_get_values(c->collection, oid, keys, + reinterpret_cast<map<string, bufferlist>*>(&values)); + }).then([&values] (int r) -> read_errorator::future<omap_values_t> { + if (r == -ENOENT) { + return crimson::ct_error::enoent::make(); + } else { + assert(r == 0); + return read_errorator::make_ready_future<omap_values_t>( + std::move(values)); + } + }); + }); +} + +auto AlienStore::omap_get_values(CollectionRef 
ch, + const ghobject_t &oid, + const std::optional<string> &start) + -> read_errorator::future<std::tuple<bool, omap_values_t>> +{ + logger().debug("{} with_start", __func__); + assert(tp); + return do_with_op_gate(omap_values_t{}, [=, this] (auto &values) { + return tp->submit(ch->get_cid().hash_to_shard(tp->size()), [=, this, &values] { + auto c = static_cast<AlienCollection*>(ch.get()); + return store->omap_get_values(c->collection, oid, start, + reinterpret_cast<map<string, bufferlist>*>(&values)); + }).then([&values] (int r) + -> read_errorator::future<std::tuple<bool, omap_values_t>> { + if (r == -ENOENT) { + return crimson::ct_error::enoent::make(); + } else if (r < 0){ + logger().error("omap_get_values(start): {}", r); + return crimson::ct_error::input_output_error::make(); + } else { + return read_errorator::make_ready_future<std::tuple<bool, omap_values_t>>( + true, std::move(values)); + } + }); + }); +} + +seastar::future<> AlienStore::do_transaction_no_callbacks( + CollectionRef ch, + ceph::os::Transaction&& txn) +{ + logger().debug("{}", __func__); + auto id = seastar::this_shard_id(); + auto done = seastar::promise<>(); + return do_with_op_gate( + std::move(txn), + std::move(done), + [this, ch, id] (auto &txn, auto &done) { + AlienCollection* alien_coll = static_cast<AlienCollection*>(ch.get()); + // moving the `ch` is crucial for buildability on newer S* versions. + return alien_coll->with_lock([this, ch=std::move(ch), id, &txn, &done] { + assert(tp); + return tp->submit(ch->get_cid().hash_to_shard(tp->size()), + [this, ch, id, &txn, &done, &alien=seastar::engine().alien()] { + txn.register_on_commit(new OnCommit(id, done, alien, txn)); + auto c = static_cast<AlienCollection*>(ch.get()); + return store->queue_transaction(c->collection, std::move(txn)); + }); + }).then([&done] (int r) { + assert(r == 0); + return done.get_future(); + }); + }); +} + +seastar::future<> AlienStore::inject_data_error(const ghobject_t& o) +{ + logger().debug("{}", __func__); + assert(tp); + return seastar::with_gate(op_gate, [=, this] { + return tp->submit([o, this] { + return store->inject_data_error(o); + }); + }); +} + +seastar::future<> AlienStore::inject_mdata_error(const ghobject_t& o) +{ + logger().debug("{}", __func__); + assert(tp); + return seastar::with_gate(op_gate, [=, this] { + return tp->submit([=, this] { + return store->inject_mdata_error(o); + }); + }); +} + +seastar::future<> AlienStore::write_meta(const std::string& key, + const std::string& value) +{ + logger().debug("{}", __func__); + assert(tp); + return seastar::with_gate(op_gate, [=, this] { + return tp->submit([=, this] { + return store->write_meta(key, value); + }).then([] (int r) { + assert(r == 0); + return seastar::make_ready_future<>(); + }); + }); +} + +seastar::future<std::tuple<int, std::string>> +AlienStore::read_meta(const std::string& key) +{ + logger().debug("{}", __func__); + assert(tp); + return seastar::with_gate(op_gate, [this, key] { + return tp->submit([this, key] { + std::string value; + int r = store->read_meta(key, &value); + if (r > 0) { + value.resize(r); + boost::algorithm::trim_right_if(value, + [] (unsigned char c) {return isspace(c);}); + } else { + value.clear(); + } + return std::make_pair(r, value); + }).then([] (auto entry) { + return seastar::make_ready_future<std::tuple<int, std::string>>( + std::move(entry)); + }); + }); +} + +uuid_d AlienStore::get_fsid() const +{ + logger().debug("{}", __func__); + return store->get_fsid(); +} + +seastar::future<store_statfs_t> AlienStore::stat() const 
+{ + logger().info("{}", __func__); + assert(tp); + return do_with_op_gate(store_statfs_t{}, [this] (store_statfs_t &st) { + return tp->submit([this, &st] { + return store->statfs(&st, nullptr); + }).then([&st] (int r) { + assert(r == 0); + return seastar::make_ready_future<store_statfs_t>(std::move(st)); + }); + }); +} + +unsigned AlienStore::get_max_attr_name_length() const +{ + logger().info("{}", __func__); + return 256; +} + +seastar::future<struct stat> AlienStore::stat( + CollectionRef ch, + const ghobject_t& oid) +{ + assert(tp); + return do_with_op_gate((struct stat){}, [this, ch, oid](auto& st) { + return tp->submit(ch->get_cid().hash_to_shard(tp->size()), [this, ch, oid, &st] { + auto c = static_cast<AlienCollection*>(ch.get()); + store->stat(c->collection, oid, &st); + return st; + }); + }); +} + +auto AlienStore::omap_get_header(CollectionRef ch, + const ghobject_t& oid) + -> get_attr_errorator::future<ceph::bufferlist> +{ + assert(tp); + return do_with_op_gate(ceph::bufferlist(), [=, this](auto& bl) { + return tp->submit(ch->get_cid().hash_to_shard(tp->size()), [=, this, &bl] { + auto c = static_cast<AlienCollection*>(ch.get()); + return store->omap_get_header(c->collection, oid, &bl); + }).then([&bl](int r) -> get_attr_errorator::future<ceph::bufferlist> { + if (r == -ENOENT) { + return crimson::ct_error::enoent::make(); + } else if (r < 0) { + logger().error("omap_get_header: {}", r); + ceph_assert(0 == "impossible"); + } else { + return get_attr_errorator::make_ready_future<ceph::bufferlist>( + std::move(bl)); + } + }); + }); +} + +AlienStore::read_errorator::future<std::map<uint64_t, uint64_t>> AlienStore::fiemap( + CollectionRef ch, + const ghobject_t& oid, + uint64_t off, + uint64_t len) +{ + assert(tp); + return do_with_op_gate(std::map<uint64_t, uint64_t>(), [=, this](auto& destmap) { + return tp->submit(ch->get_cid().hash_to_shard(tp->size()), [=, this, &destmap] { + auto c = static_cast<AlienCollection*>(ch.get()); + return store->fiemap(c->collection, oid, off, len, destmap); + }).then([&destmap](int r) + -> read_errorator::future<std::map<uint64_t, uint64_t>> { + if (r == -ENOENT) { + return crimson::ct_error::enoent::make(); + } else { + return read_errorator::make_ready_future<std::map<uint64_t, uint64_t>>( + std::move(destmap)); + } + }); + }); +} + +} diff --git a/src/crimson/os/alienstore/alien_store.h b/src/crimson/os/alienstore/alien_store.h new file mode 100644 index 000000000..79c19b298 --- /dev/null +++ b/src/crimson/os/alienstore/alien_store.h @@ -0,0 +1,133 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab expandtab + +#pragma once + +#include <seastar/core/future.hh> +#include <seastar/core/shared_mutex.hh> + +#include "common/ceph_context.h" +#include "os/ObjectStore.h" +#include "osd/osd_types.h" + +#include "crimson/os/alienstore/thread_pool.h" +#include "crimson/os/futurized_collection.h" +#include "crimson/os/futurized_store.h" + +namespace ceph::os { +class Transaction; +} + +namespace crimson::os { +using coll_core_t = FuturizedStore::coll_core_t; +class AlienStore final : public FuturizedStore, + public FuturizedStore::Shard { +public: + AlienStore(const std::string& type, + const std::string& path, + const ConfigValues& values); + ~AlienStore() final; + + seastar::future<> start() final; + seastar::future<> stop() final; + mount_ertr::future<> mount() final; + seastar::future<> umount() final; + + mkfs_ertr::future<> mkfs(uuid_d new_osd_fsid) final; + 
read_errorator::future<ceph::bufferlist> read(CollectionRef c, + const ghobject_t& oid, + uint64_t offset, + size_t len, + uint32_t op_flags = 0) final; + read_errorator::future<ceph::bufferlist> readv(CollectionRef c, + const ghobject_t& oid, + interval_set<uint64_t>& m, + uint32_t op_flags = 0) final; + + + get_attr_errorator::future<ceph::bufferlist> get_attr(CollectionRef c, + const ghobject_t& oid, + std::string_view name) const final; + get_attrs_ertr::future<attrs_t> get_attrs(CollectionRef c, + const ghobject_t& oid) final; + + read_errorator::future<omap_values_t> omap_get_values( + CollectionRef c, + const ghobject_t& oid, + const omap_keys_t& keys) final; + + /// Retrieves paged set of values > start (if present) + read_errorator::future<std::tuple<bool, omap_values_t>> omap_get_values( + CollectionRef c, ///< [in] collection + const ghobject_t &oid, ///< [in] oid + const std::optional<std::string> &start ///< [in] start, empty for begin + ) final; ///< @return <done, values> values.empty() iff done + + seastar::future<std::tuple<std::vector<ghobject_t>, ghobject_t>> list_objects( + CollectionRef c, + const ghobject_t& start, + const ghobject_t& end, + uint64_t limit) const final; + + seastar::future<CollectionRef> create_new_collection(const coll_t& cid) final; + seastar::future<CollectionRef> open_collection(const coll_t& cid) final; + seastar::future<std::vector<coll_core_t>> list_collections() final; + + seastar::future<> do_transaction_no_callbacks( + CollectionRef c, + ceph::os::Transaction&& txn) final; + + // error injection + seastar::future<> inject_data_error(const ghobject_t& o) final; + seastar::future<> inject_mdata_error(const ghobject_t& o) final; + + seastar::future<> write_meta(const std::string& key, + const std::string& value) final; + seastar::future<std::tuple<int, std::string>> read_meta( + const std::string& key) final; + uuid_d get_fsid() const final; + seastar::future<store_statfs_t> stat() const final; + unsigned get_max_attr_name_length() const final; + seastar::future<struct stat> stat( + CollectionRef, + const ghobject_t&) final; + get_attr_errorator::future<ceph::bufferlist> omap_get_header( + CollectionRef, + const ghobject_t&) final; + read_errorator::future<std::map<uint64_t, uint64_t>> fiemap( + CollectionRef, + const ghobject_t&, + uint64_t off, + uint64_t len) final; + + FuturizedStore::Shard& get_sharded_store() final { + return *this; + } + +private: + template <class... Args> + auto do_with_op_gate(Args&&... args) const { + return seastar::with_gate(op_gate, + // perfect forwarding in lambda's closure isn't available in C++17 + // using tuple as workaround; see: https://stackoverflow.com/a/49902823 + [args = std::make_tuple(std::forward<Args>(args)...)] () mutable { + return std::apply([] (auto&&... args) { + return seastar::do_with(std::forward<decltype(args)>(args)...); + }, std::move(args)); + }); + } + + // number of cores that are PREVENTED from being scheduled + // to run alien store threads. 
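+  // e.g. with 16 online CPUs and crimson_alien_thread_cpu_cores unset,
+  // AlienStore::start() hands cores [3, 16) to the alien thread pool.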
+ static constexpr int N_CORES_FOR_SEASTAR = 3; + mutable std::unique_ptr<crimson::os::ThreadPool> tp; + const std::string type; + const std::string path; + const ConfigValues values; + uint64_t used_bytes = 0; + std::unique_ptr<ObjectStore> store; + std::unique_ptr<CephContext> cct; + mutable seastar::gate op_gate; + std::unordered_map<coll_t, CollectionRef> coll_map; +}; +} diff --git a/src/crimson/os/alienstore/semaphore.h b/src/crimson/os/alienstore/semaphore.h new file mode 100644 index 000000000..8cba02ab1 --- /dev/null +++ b/src/crimson/os/alienstore/semaphore.h @@ -0,0 +1,90 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab expandtab +#pragma once + +#include <semaphore.h> +#include <ctime> +#include <cerrno> +#include <exception> +#include <chrono> + +namespace crimson { + +// an implementation of std::counting_semaphore<> in C++17 using the POSIX +// semaphore. +// +// LeastMaxValue is ignored, as we don't have different backends optimized +// for different LeastMaxValues +template<unsigned LeastMaxValue = 64> +class counting_semaphore { + using clock_t = std::chrono::system_clock; +public: + explicit counting_semaphore(unsigned count) noexcept { + sem_init(&sem, 0, count); + } + + counting_semaphore(const counting_semaphore&) = delete; + counting_semaphore& operator=(const counting_semaphore&) = delete; + + ~counting_semaphore() { + sem_destroy(&sem); + } + + void acquire() noexcept { + for (;;) { + int err = sem_wait(&sem); + if (err != 0) { + if (errno == EINTR) { + continue; + } else { + std::terminate(); + } + } else { + break; + } + } + } + + void release(unsigned update = 1) { + for (; update != 0; --update) { + int err = sem_post(&sem); + if (err != 0) { + std::terminate(); + } + } + } + + template<typename Clock, typename Duration> + bool try_acquire_until(const std::chrono::time_point<Clock, Duration>& abs_time) noexcept { + auto s = std::chrono::time_point_cast<std::chrono::seconds>(abs_time); + auto ns = std::chrono::duration_cast<std::chrono::nanoseconds>(abs_time - s); + struct timespec ts = { + static_cast<std::time_t>(s.time_since_epoch().count()), + static_cast<long>(ns.count()) + }; + for (;;) { + if (int err = sem_timedwait(&sem, &ts); err) { + if (errno == EINTR) { + continue; + } else if (errno == ETIMEDOUT || errno == EINVAL) { + return false; + } else { + std::terminate(); + } + } else { + break; + } + } + return true; + } + + template<typename Rep, typename Period> + bool try_acquire_for(const std::chrono::duration<Rep, Period>& rel_time) { + return try_acquire_until(clock_t::now() + rel_time); + } + +private: + sem_t sem; +}; + +} diff --git a/src/crimson/os/alienstore/thread_pool.cc b/src/crimson/os/alienstore/thread_pool.cc new file mode 100644 index 000000000..5cf9590e6 --- /dev/null +++ b/src/crimson/os/alienstore/thread_pool.cc @@ -0,0 +1,98 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab expandtab + +#include "thread_pool.h" + +#include <chrono> +#include <pthread.h> + +#include "include/ceph_assert.h" +#include "crimson/common/config_proxy.h" + +using crimson::common::local_conf; + +namespace crimson::os { + +ThreadPool::ThreadPool(size_t n_threads, + size_t queue_sz, + const std::optional<seastar::resource::cpuset>& cpus) + : n_threads(n_threads), + queue_size{round_up_to(queue_sz, seastar::smp::count)}, + pending_queues(n_threads) +{ + auto queue_max_wait = std::chrono::seconds(local_conf()->threadpool_empty_queue_max_wait); + for 
(size_t i = 0; i < n_threads; i++) { + threads.emplace_back([this, cpus, queue_max_wait, i] { + if (cpus.has_value()) { + pin(*cpus); + } + block_sighup(); + (void) pthread_setname_np(pthread_self(), "alien-store-tp"); + loop(queue_max_wait, i); + }); + } +} + +ThreadPool::~ThreadPool() +{ + for (auto& thread : threads) { + thread.join(); + } +} + +void ThreadPool::pin(const seastar::resource::cpuset& cpus) +{ + cpu_set_t cs; + CPU_ZERO(&cs); + for (auto cpu : cpus) { + CPU_SET(cpu, &cs); + } + [[maybe_unused]] auto r = pthread_setaffinity_np(pthread_self(), + sizeof(cs), &cs); + ceph_assert(r == 0); +} + +void ThreadPool::block_sighup() +{ + sigset_t sigs; + sigemptyset(&sigs); + // alien threads must ignore SIGHUP. It's necessary as in + // `crimson/osd/main.cc` we set a handler using Seastar's + // signal handling infrastructure which assumes the `_backend` + // of `seastar::engine()` is not null. Grep `reactor.cc` for + // `sigaction` or just visit `reactor::signals::handle_signal()`. + sigaddset(&sigs, SIGHUP); + pthread_sigmask(SIG_BLOCK, &sigs, nullptr); +} + +void ThreadPool::loop(std::chrono::milliseconds queue_max_wait, size_t shard) +{ + auto& pending = pending_queues[shard]; + for (;;) { + WorkItem* work_item = nullptr; + work_item = pending.pop_front(queue_max_wait); + if (work_item) { + work_item->process(); + } else if (is_stopping()) { + break; + } + } +} + +seastar::future<> ThreadPool::start() +{ + auto slots_per_shard = queue_size / seastar::smp::count; + return submit_queue.start(slots_per_shard); +} + +seastar::future<> ThreadPool::stop() +{ + return submit_queue.stop().then([this] { + stopping = true; + for (auto& q : pending_queues) { + q.stop(); + } + }); +} + +} // namespace crimson::os diff --git a/src/crimson/os/alienstore/thread_pool.h b/src/crimson/os/alienstore/thread_pool.h new file mode 100644 index 000000000..78e18219a --- /dev/null +++ b/src/crimson/os/alienstore/thread_pool.h @@ -0,0 +1,184 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab expandtab +#pragma once + +#include <atomic> +#include <condition_variable> +#include <tuple> +#include <type_traits> +#include <boost/lockfree/queue.hpp> +#include <boost/optional.hpp> +#include <seastar/core/future.hh> +#include <seastar/core/gate.hh> +#include <seastar/core/reactor.hh> +#include <seastar/core/resource.hh> +#include <seastar/core/semaphore.hh> +#include <seastar/core/sharded.hh> + +#if __cplusplus > 201703L +#include <semaphore> +namespace crimson { + using std::counting_semaphore; +} +#else +#include "semaphore.h" +#endif + +namespace crimson::os { + +struct WorkItem { + virtual ~WorkItem() {} + virtual void process() = 0; +}; + +template<typename Func> +struct Task final : WorkItem { + using T = std::invoke_result_t<Func>; + using future_stored_type_t = + std::conditional_t<std::is_void_v<T>, + seastar::internal::future_stored_type_t<>, + seastar::internal::future_stored_type_t<T>>; + using futurator_t = seastar::futurize<T>; +public: + explicit Task(Func&& f) + : func(std::move(f)) + {} + void process() override { + try { + if constexpr (std::is_void_v<T>) { + func(); + state.set(); + } else { + state.set(func()); + } + } catch (...) 
{ + state.set_exception(std::current_exception()); + } + on_done.write_side().signal(1); + } + typename futurator_t::type get_future() { + return on_done.wait().then([this](size_t) { + if (state.failed()) { + return futurator_t::make_exception_future(state.get_exception()); + } else { + return futurator_t::from_tuple(state.get_value()); + } + }); + } +private: + Func func; + seastar::future_state<future_stored_type_t> state; + seastar::readable_eventfd on_done; +}; + +struct SubmitQueue { + seastar::semaphore free_slots; + seastar::gate pending_tasks; + explicit SubmitQueue(size_t num_free_slots) + : free_slots(num_free_slots) + {} + seastar::future<> stop() { + return pending_tasks.close(); + } +}; + +struct ShardedWorkQueue { +public: + WorkItem* pop_front(std::chrono::milliseconds& queue_max_wait) { + if (sem.try_acquire_for(queue_max_wait)) { + if (!is_stopping()) { + WorkItem* work_item = nullptr; + [[maybe_unused]] bool popped = pending.pop(work_item); + assert(popped); + return work_item; + } + } + return nullptr; + } + void stop() { + stopping = true; + sem.release(); + } + void push_back(WorkItem* work_item) { + [[maybe_unused]] bool pushed = pending.push(work_item); + assert(pushed); + sem.release(); + } +private: + bool is_stopping() const { + return stopping; + } + std::atomic<bool> stopping = false; + static constexpr unsigned QUEUE_SIZE = 128; + crimson::counting_semaphore<QUEUE_SIZE> sem{0}; + boost::lockfree::queue<WorkItem*> pending{QUEUE_SIZE}; +}; + +/// an engine for scheduling non-seastar tasks from seastar fibers +class ThreadPool { +public: + /** + * @param n_threads the number of worker threads in this thread pool. + * @param queue_sz the depth of the pending queue. Before a task is + * scheduled, it waits in this queue; the depth is rounded up + * to a multiple of the number of reactor cores. + * @param cpus the set of CPU cores to which the worker threads are pinned, + * if specified. + * @note each @c Task owns a seastar::readable_eventfd, which holds an open + * fd, so the queue depth should be kept under a reasonable limit. + */ + ThreadPool(size_t n_threads, size_t queue_sz, const std::optional<seastar::resource::cpuset>& cpus); + ~ThreadPool(); + seastar::future<> start(); + seastar::future<> stop(); + size_t size() { + return n_threads; + } + template<typename Func, typename...Args> + auto submit(int shard, Func&& func, Args&&... 
args) { + auto packaged = [func=std::move(func), + args=std::forward_as_tuple(args...)] { + return std::apply(std::move(func), std::move(args)); + }; + return seastar::with_gate(submit_queue.local().pending_tasks, + [packaged=std::move(packaged), shard, this] { + return local_free_slots().wait() + .then([packaged=std::move(packaged), shard, this] { + auto task = new Task{std::move(packaged)}; + auto fut = task->get_future(); + pending_queues[shard].push_back(task); + return fut.finally([task, this] { + local_free_slots().signal(); + delete task; + }); + }); + }); + } + + template<typename Func> + auto submit(Func&& func) { + return submit(::rand() % n_threads, std::forward<Func>(func)); + } + +private: + void loop(std::chrono::milliseconds queue_max_wait, size_t shard); + bool is_stopping() const { + return stopping.load(std::memory_order_relaxed); + } + static void pin(const seastar::resource::cpuset& cpus); + static void block_sighup(); + seastar::semaphore& local_free_slots() { + return submit_queue.local().free_slots; + } + ThreadPool(const ThreadPool&) = delete; + ThreadPool& operator=(const ThreadPool&) = delete; + +private: + size_t n_threads; + std::atomic<bool> stopping = false; + std::vector<std::thread> threads; + seastar::sharded<SubmitQueue> submit_queue; + const size_t queue_size; + std::vector<ShardedWorkQueue> pending_queues; +}; + +} // namespace crimson::os diff --git a/src/crimson/os/cyanstore/CMakeLists.txt b/src/crimson/os/cyanstore/CMakeLists.txt new file mode 100644 index 000000000..65f2b5498 --- /dev/null +++ b/src/crimson/os/cyanstore/CMakeLists.txt @@ -0,0 +1,7 @@ +add_library(crimson-cyanstore STATIC + cyan_store.cc + cyan_collection.cc + cyan_object.cc) +target_link_libraries(crimson-cyanstore + crimson + crimson-os) diff --git a/src/crimson/os/cyanstore/cyan_collection.cc b/src/crimson/os/cyanstore/cyan_collection.cc new file mode 100644 index 000000000..9a814f978 --- /dev/null +++ b/src/crimson/os/cyanstore/cyan_collection.cc @@ -0,0 +1,78 @@ +#include "cyan_collection.h" + +#include "cyan_object.h" + +using std::make_pair; + +namespace crimson::os +{ + +Collection::Collection(const coll_t& c) + : FuturizedCollection{c} +{} + +Collection::~Collection() = default; + +Collection::ObjectRef Collection::create_object() const +{ + return new crimson::os::Object; +} + +Collection::ObjectRef Collection::get_object(ghobject_t oid) +{ + auto o = object_hash.find(oid); + if (o == object_hash.end()) + return ObjectRef(); + return o->second; +} + +Collection::ObjectRef Collection::get_or_create_object(ghobject_t oid) +{ + auto result = object_hash.emplace(oid, ObjectRef{}); + if (result.second) + object_map[oid] = result.first->second = create_object(); + return result.first->second; +} + +uint64_t Collection::used_bytes() const +{ + uint64_t result = 0; + for (auto& obj : object_map) { + result += obj.second->get_size(); + } + return result; +} + +void Collection::encode(bufferlist& bl) const +{ + ENCODE_START(1, 1, bl); + encode(xattr, bl); + encode(use_page_set, bl); + uint32_t s = object_map.size(); + encode(s, bl); + for (auto& [oid, obj] : object_map) { + encode(oid, bl); + obj->encode(bl); + } + ENCODE_FINISH(bl); +} + +void Collection::decode(bufferlist::const_iterator& p) +{ + DECODE_START(1, p); + decode(xattr, p); + decode(use_page_set, p); + uint32_t s; + decode(s, p); + while (s--) { + ghobject_t k; + decode(k, p); + auto o = create_object(); + o->decode(p); + object_map.insert(make_pair(k, o)); + object_hash.insert(make_pair(k, o)); + } + DECODE_FINISH(p); 
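// ---------------------------------------------------------------------------
// Illustrative sketch, not part of this file: encode()/decode() above are the
// persistence hooks used by CyanStore::Shard::umount()/mount() further below;
// the whole collection, objects included, is round-tripped through a
// bufferlist. Assuming an existing coll_t `cid` and a populated collection
// `c`:
//
//   ceph::bufferlist bl;
//   c->encode(bl);                  // serialize on umount
//   // ... persist bl, e.g. with crimson::write_file(std::move(bl), fn) ...
//
//   boost::intrusive_ptr<Collection> restored{new Collection{cid}};
//   auto p = bl.cbegin();
//   restored->decode(p);            // rebuild object_map/object_hash on mount
// ---------------------------------------------------------------------------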
+} + +} diff --git a/src/crimson/os/cyanstore/cyan_collection.h b/src/crimson/os/cyanstore/cyan_collection.h new file mode 100644 index 000000000..068e427d8 --- /dev/null +++ b/src/crimson/os/cyanstore/cyan_collection.h @@ -0,0 +1,51 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <string> +#include <unordered_map> +#include <boost/intrusive_ptr.hpp> +#include <boost/smart_ptr/intrusive_ref_counter.hpp> + +#include "include/buffer.h" +#include "osd/osd_types.h" + +#include "crimson/os/futurized_collection.h" + +namespace crimson::os { + +class Object; +/** + * a collection also orders transactions + * + * Any transactions queued under a given collection will be applied in + * sequence. Transactions queued under different collections may run + * in parallel. + * + * ObjectStore users may get collection handles with open_collection() (or, + * for bootstrapping a new collection, create_new_collection()). + */ +struct Collection final : public FuturizedCollection { + using ObjectRef = boost::intrusive_ptr<Object>; + int bits = 0; + // always use bufferlist object for testing + bool use_page_set = false; + std::unordered_map<ghobject_t, ObjectRef> object_hash; ///< for lookup + std::map<ghobject_t, ObjectRef> object_map; ///< for iteration + std::map<std::string,bufferptr> xattr; + bool exists = true; + + Collection(const coll_t& c); + ~Collection() final; + + ObjectRef create_object() const; + ObjectRef get_object(ghobject_t oid); + ObjectRef get_or_create_object(ghobject_t oid); + uint64_t used_bytes() const; + + void encode(bufferlist& bl) const; + void decode(bufferlist::const_iterator& p); +}; + +} diff --git a/src/crimson/os/cyanstore/cyan_object.cc b/src/crimson/os/cyanstore/cyan_object.cc new file mode 100644 index 000000000..34bc13b7f --- /dev/null +++ b/src/crimson/os/cyanstore/cyan_object.cc @@ -0,0 +1,89 @@ +#include "cyan_object.h" +#include "include/encoding.h" + +namespace crimson::os { + +size_t Object::get_size() const { + return data.length(); +} + +ceph::bufferlist Object::read(uint64_t offset, uint64_t len) +{ + bufferlist ret; + ret.substr_of(data, offset, len); + return ret; +} + +int Object::write(uint64_t offset, const bufferlist &src) +{ + unsigned len = src.length(); + // before + bufferlist newdata; + if (get_size() >= offset) { + newdata.substr_of(data, 0, offset); + } else { + if (get_size()) { + newdata.substr_of(data, 0, get_size()); + } + newdata.append_zero(offset - get_size()); + } + + newdata.append(src); + + // after + if (get_size() > offset + len) { + bufferlist tail; + tail.substr_of(data, offset + len, get_size() - (offset + len)); + newdata.append(tail); + } + + data = std::move(newdata); + return 0; +} + +int Object::clone(Object *src, uint64_t srcoff, uint64_t len, + uint64_t dstoff) +{ + bufferlist bl; + if (srcoff == dstoff && len == src->get_size()) { + data = src->data; + return 0; + } + bl.substr_of(src->data, srcoff, len); + return write(dstoff, bl); + +} + +int Object::truncate(uint64_t size) +{ + if (get_size() > size) { + bufferlist bl; + bl.substr_of(data, 0, size); + data = std::move(bl); + } else if (get_size() == size) { + // do nothing + } else { + data.append_zero(size - get_size()); + } + return 0; +} + +void Object::encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(data, bl); + encode(xattr, bl); + encode(omap_header, bl); + encode(omap, bl); + ENCODE_FINISH(bl); +} + +void Object::decode(bufferlist::const_iterator& p) { + DECODE_START(1, 
p); + decode(data, p); + decode(xattr, p); + decode(omap_header, p); + decode(omap, p); + DECODE_FINISH(p); +} + +} diff --git a/src/crimson/os/cyanstore/cyan_object.h b/src/crimson/os/cyanstore/cyan_object.h new file mode 100644 index 000000000..624f9513a --- /dev/null +++ b/src/crimson/os/cyanstore/cyan_object.h @@ -0,0 +1,45 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#pragma once + +#include <cstddef> +#include <map> +#include <string> +#include <boost/intrusive_ptr.hpp> +#include <boost/smart_ptr/intrusive_ref_counter.hpp> + +#include "include/buffer.h" + +namespace crimson::os { + +struct Object : public boost::intrusive_ref_counter< + Object, + boost::thread_unsafe_counter> +{ + using bufferlist = ceph::bufferlist; + + bufferlist data; + // use transparent comparator for better performance, see + // https://en.cppreference.com/w/cpp/utility/functional/less_void + std::map<std::string,bufferlist,std::less<>> xattr; + bufferlist omap_header; + std::map<std::string,bufferlist> omap; + + typedef boost::intrusive_ptr<Object> Ref; + + Object() = default; + + // interface for object data + size_t get_size() const; + ceph::bufferlist read(uint64_t offset, uint64_t len); + int write(uint64_t offset, const bufferlist &bl); + int clone(Object *src, uint64_t srcoff, uint64_t len, + uint64_t dstoff); + int truncate(uint64_t offset); + + void encode(bufferlist& bl) const; + void decode(bufferlist::const_iterator& p); +}; +using ObjectRef = boost::intrusive_ptr<Object>; + +} diff --git a/src/crimson/os/cyanstore/cyan_store.cc b/src/crimson/os/cyanstore/cyan_store.cc new file mode 100644 index 000000000..f2a6018e3 --- /dev/null +++ b/src/crimson/os/cyanstore/cyan_store.cc @@ -0,0 +1,952 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "cyan_store.h" + +#include <boost/algorithm/string/trim.hpp> +#include <fmt/format.h> +#include <fmt/ostream.h> + +#include "common/safe_io.h" +#include "os/Transaction.h" + +#include "crimson/common/buffer_io.h" +#include "crimson/common/config_proxy.h" +#include "cyan_collection.h" +#include "cyan_object.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_cyanstore); + } +} + +using std::string; +using crimson::common::local_conf; + +namespace crimson::os { + +using ObjectRef = boost::intrusive_ptr<Object>; + +CyanStore::CyanStore(const std::string& path) + : path{path} +{} + +CyanStore::~CyanStore() = default; + +template <const char* MsgV> +struct singleton_ec : std::error_code { + singleton_ec() + : error_code(42, this_error_category{}) { + }; +private: + struct this_error_category : std::error_category { + const char* name() const noexcept final { + // XXX: we could concatenate with MsgV at compile-time but the burden + // isn't worth the benefit. 
+ return "singleton_ec"; + } + std::string message([[maybe_unused]] const int ev) const final { + assert(ev == 42); + return MsgV; + } + }; +}; + +seastar::future<store_statfs_t> CyanStore::stat() const +{ + ceph_assert(seastar::this_shard_id() == primary_core); + logger().debug("{}", __func__); + return shard_stores.map_reduce0( + [](const CyanStore::Shard &local_store) { + return local_store.get_used_bytes(); + }, + (uint64_t)0, + std::plus<uint64_t>() + ).then([](uint64_t used_bytes) { + store_statfs_t st; + st.total = crimson::common::local_conf().get_val<Option::size_t>("memstore_device_bytes"); + st.available = st.total - used_bytes; + return seastar::make_ready_future<store_statfs_t>(std::move(st)); + }); +} + + +CyanStore::mkfs_ertr::future<> CyanStore::mkfs(uuid_d new_osd_fsid) +{ + ceph_assert(seastar::this_shard_id() == primary_core); + static const char read_meta_errmsg[]{"read_meta"}; + static const char parse_fsid_errmsg[]{"failed to parse fsid"}; + static const char match_ofsid_errmsg[]{"unmatched osd_fsid"}; + return read_meta("fsid").then([=, this](auto&& ret) -> mkfs_ertr::future<> { + auto& [r, fsid_str] = ret; + if (r == -ENOENT) { + if (new_osd_fsid.is_zero()) { + osd_fsid.generate_random(); + } else { + osd_fsid = new_osd_fsid; + } + return write_meta("fsid", fmt::format("{}", osd_fsid)); + } else if (r < 0) { + return crimson::stateful_ec{ singleton_ec<read_meta_errmsg>() }; + } else { + logger().info("mkfs already has fsid {}", fsid_str); + if (!osd_fsid.parse(fsid_str.c_str())) { + return crimson::stateful_ec{ singleton_ec<parse_fsid_errmsg>() }; + } else if (osd_fsid != new_osd_fsid) { + logger().error("on-disk fsid {} != provided {}", osd_fsid, new_osd_fsid); + return crimson::stateful_ec{ singleton_ec<match_ofsid_errmsg>() }; + } else { + return mkfs_ertr::now(); + } + } + }).safe_then([this]{ + return write_meta("type", "memstore"); + }).safe_then([this] { + return shard_stores.invoke_on_all( + [](auto &local_store) { + return local_store.mkfs(); + }); + }); +} + +seastar::future<> CyanStore::Shard::mkfs() +{ + std::string fn = + path + "/collections" + std::to_string(seastar::this_shard_id()); + ceph::bufferlist bl; + std::set<coll_t> collections; + ceph::encode(collections, bl); + return crimson::write_file(std::move(bl), fn); +} + +using coll_core_t = FuturizedStore::coll_core_t; +seastar::future<std::vector<coll_core_t>> +CyanStore::list_collections() +{ + ceph_assert(seastar::this_shard_id() == primary_core); + return seastar::do_with(std::vector<coll_core_t>{}, [this](auto &collections) { + return shard_stores.map([](auto &local_store) { + return local_store.list_collections(); + }).then([&collections](std::vector<std::vector<coll_core_t>> results) { + for (auto& colls : results) { + collections.insert(collections.end(), colls.begin(), colls.end()); + } + return seastar::make_ready_future<std::vector<coll_core_t>>( + std::move(collections)); + }); + }); +} + +CyanStore::mount_ertr::future<> CyanStore::Shard::mount() +{ + static const char read_file_errmsg[]{"read_file"}; + ceph::bufferlist bl; + std::string fn = + path + "/collections" + std::to_string(seastar::this_shard_id()); + std::string err; + if (int r = bl.read_file(fn.c_str(), &err); r < 0) { + return crimson::stateful_ec{ singleton_ec<read_file_errmsg>() }; + } + + std::set<coll_t> collections; + auto p = bl.cbegin(); + ceph::decode(collections, p); + + for (auto& coll : collections) { + std::string fn = fmt::format("{}/{}{}", path, coll, + std::to_string(seastar::this_shard_id())); + 
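// ---------------------------------------------------------------------------
// Illustrative sketch, not part of this file: CyanStore::stat() above
// aggregates per-shard usage with seastar::sharded<T>::map_reduce0(), which
// invokes the mapper on every shard's local instance and folds the results on
// the calling shard. The pattern in isolation (ShardStats is an assumed
// example service that has already been start()ed):
//
//   struct ShardStats { uint64_t used = 0; };
//   seastar::sharded<ShardStats> stats;
//
//   seastar::future<uint64_t> total_used() {
//     return stats.map_reduce0(
//       [](const ShardStats& s) { return s.used; },  // run on each shard
//       uint64_t(0),                                 // initial accumulator
//       std::plus<uint64_t>());                      // reducer
//   }
// ---------------------------------------------------------------------------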
ceph::bufferlist cbl; + if (int r = cbl.read_file(fn.c_str(), &err); r < 0) { + return crimson::stateful_ec{ singleton_ec<read_file_errmsg>() }; + } + boost::intrusive_ptr<Collection> c{new Collection{coll}}; + auto p = cbl.cbegin(); + c->decode(p); + coll_map[coll] = c; + used_bytes += c->used_bytes(); + } + return mount_ertr::now(); +} + +seastar::future<> CyanStore::Shard::umount() +{ + return seastar::do_with(std::set<coll_t>{}, [this](auto& collections) { + return seastar::do_for_each(coll_map, [&collections, this](auto& coll) { + auto& [col, ch] = coll; + collections.insert(col); + ceph::bufferlist bl; + ceph_assert(ch); + ch->encode(bl); + std::string fn = fmt::format("{}/{}{}", path, col, + std::to_string(seastar::this_shard_id())); + return crimson::write_file(std::move(bl), fn); + }).then([&collections, this] { + ceph::bufferlist bl; + ceph::encode(collections, bl); + std::string fn = fmt::format("{}/collections{}", + path, std::to_string(seastar::this_shard_id())); + return crimson::write_file(std::move(bl), fn); + }); + }); +} + +seastar::future<std::tuple<std::vector<ghobject_t>, ghobject_t>> +CyanStore::Shard::list_objects( + CollectionRef ch, + const ghobject_t& start, + const ghobject_t& end, + uint64_t limit) const +{ + auto c = static_cast<Collection*>(ch.get()); + logger().debug("{} {} {} {} {}", + __func__, c->get_cid(), start, end, limit); + std::vector<ghobject_t> objects; + objects.reserve(limit); + ghobject_t next = ghobject_t::get_max(); + for (const auto& [oid, obj] : + boost::make_iterator_range(c->object_map.lower_bound(start), + c->object_map.end())) { + std::ignore = obj; + if (oid >= end || objects.size() >= limit) { + next = oid; + break; + } + objects.push_back(oid); + } + return seastar::make_ready_future<std::tuple<std::vector<ghobject_t>, ghobject_t>>( + std::make_tuple(std::move(objects), next)); +} + +seastar::future<CollectionRef> +CyanStore::Shard::create_new_collection(const coll_t& cid) +{ + auto c = new Collection{cid}; + new_coll_map[cid] = c; + return seastar::make_ready_future<CollectionRef>(c); +} + +seastar::future<CollectionRef> +CyanStore::Shard::open_collection(const coll_t& cid) +{ + return seastar::make_ready_future<CollectionRef>(_get_collection(cid)); +} + +seastar::future<std::vector<coll_core_t>> +CyanStore::Shard::list_collections() +{ + std::vector<coll_core_t> collections; + for (auto& coll : coll_map) { + collections.push_back(std::make_pair(coll.first, seastar::this_shard_id())); + } + return seastar::make_ready_future<std::vector<coll_core_t>>(std::move(collections)); +} + +CyanStore::Shard::read_errorator::future<ceph::bufferlist> +CyanStore::Shard::read( + CollectionRef ch, + const ghobject_t& oid, + uint64_t offset, + size_t len, + uint32_t op_flags) +{ + auto c = static_cast<Collection*>(ch.get()); + logger().debug("{} {} {} {}~{}", + __func__, c->get_cid(), oid, offset, len); + if (!c->exists) { + return crimson::ct_error::enoent::make(); + } + ObjectRef o = c->get_object(oid); + if (!o) { + return crimson::ct_error::enoent::make(); + } + if (offset >= o->get_size()) + return read_errorator::make_ready_future<ceph::bufferlist>(); + size_t l = len; + if (l == 0 && offset == 0) // note: len == 0 means read the entire object + l = o->get_size(); + else if (offset + l > o->get_size()) + l = o->get_size() - offset; + return read_errorator::make_ready_future<ceph::bufferlist>(o->read(offset, l)); +} + +CyanStore::Shard::read_errorator::future<ceph::bufferlist> +CyanStore::Shard::readv( + CollectionRef ch, + const ghobject_t& 
oid, + interval_set<uint64_t>& m, + uint32_t op_flags) +{ + return seastar::do_with(ceph::bufferlist{}, + [this, ch, oid, &m, op_flags](auto& bl) { + return crimson::do_for_each(m, + [this, ch, oid, op_flags, &bl](auto& p) { + return read(ch, oid, p.first, p.second, op_flags) + .safe_then([&bl](auto ret) { + bl.claim_append(ret); + }); + }).safe_then([&bl] { + return read_errorator::make_ready_future<ceph::bufferlist>(std::move(bl)); + }); + }); +} + +CyanStore::Shard::get_attr_errorator::future<ceph::bufferlist> +CyanStore::Shard::get_attr( + CollectionRef ch, + const ghobject_t& oid, + std::string_view name) const +{ + auto c = static_cast<Collection*>(ch.get()); + logger().debug("{} {} {}", + __func__, c->get_cid(), oid); + auto o = c->get_object(oid); + if (!o) { + return crimson::ct_error::enoent::make(); + } + if (auto found = o->xattr.find(name); found != o->xattr.end()) { + return get_attr_errorator::make_ready_future<ceph::bufferlist>(found->second); + } else { + return crimson::ct_error::enodata::make(); + } +} + +CyanStore::Shard::get_attrs_ertr::future<CyanStore::Shard::attrs_t> +CyanStore::Shard::get_attrs( + CollectionRef ch, + const ghobject_t& oid) +{ + auto c = static_cast<Collection*>(ch.get()); + logger().debug("{} {} {}", + __func__, c->get_cid(), oid); + auto o = c->get_object(oid); + if (!o) { + return crimson::ct_error::enoent::make(); + } + return get_attrs_ertr::make_ready_future<attrs_t>(o->xattr); +} + +auto CyanStore::Shard::omap_get_values( + CollectionRef ch, + const ghobject_t& oid, + const omap_keys_t& keys) + -> read_errorator::future<omap_values_t> +{ + auto c = static_cast<Collection*>(ch.get()); + logger().debug("{} {} {}", __func__, c->get_cid(), oid); + auto o = c->get_object(oid); + if (!o) { + return crimson::ct_error::enoent::make(); + } + omap_values_t values; + for (auto& key : keys) { + if (auto found = o->omap.find(key); found != o->omap.end()) { + values.insert(*found); + } + } + return seastar::make_ready_future<omap_values_t>(std::move(values)); +} + +auto CyanStore::Shard::omap_get_values( + CollectionRef ch, + const ghobject_t &oid, + const std::optional<string> &start) + -> CyanStore::Shard::read_errorator::future<std::tuple<bool, omap_values_t>> +{ + auto c = static_cast<Collection*>(ch.get()); + logger().debug("{} {} {}", __func__, c->get_cid(), oid); + auto o = c->get_object(oid); + if (!o) { + return crimson::ct_error::enoent::make(); + } + omap_values_t values; + for (auto i = start ? 
o->omap.upper_bound(*start) : o->omap.begin(); + i != o->omap.end(); + ++i) { + values.insert(*i); + } + return seastar::make_ready_future<std::tuple<bool, omap_values_t>>( + std::make_tuple(true, std::move(values))); +} + +auto CyanStore::Shard::omap_get_header( + CollectionRef ch, + const ghobject_t& oid) + -> CyanStore::Shard::get_attr_errorator::future<ceph::bufferlist> +{ + auto c = static_cast<Collection*>(ch.get()); + auto o = c->get_object(oid); + if (!o) { + return crimson::ct_error::enoent::make(); + } + + return get_attr_errorator::make_ready_future<ceph::bufferlist>( + o->omap_header); +} + +seastar::future<> CyanStore::Shard::do_transaction_no_callbacks( + CollectionRef ch, + ceph::os::Transaction&& t) +{ + using ceph::os::Transaction; + int r = 0; + try { + auto i = t.begin(); + while (i.have_op()) { + r = 0; + switch (auto op = i.decode_op(); op->op) { + case Transaction::OP_NOP: + break; + case Transaction::OP_REMOVE: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + r = _remove(cid, oid); + if (r == -ENOENT) { + r = 0; + } + } + break; + case Transaction::OP_TOUCH: + case Transaction::OP_CREATE: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + r = _touch(cid, oid); + } + break; + case Transaction::OP_WRITE: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + uint64_t off = op->off; + uint64_t len = op->len; + uint32_t fadvise_flags = i.get_fadvise_flags(); + ceph::bufferlist bl; + i.decode_bl(bl); + r = _write(cid, oid, off, len, bl, fadvise_flags); + } + break; + case Transaction::OP_ZERO: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + uint64_t off = op->off; + uint64_t len = op->len; + r = _zero(cid, oid, off, len); + } + break; + case Transaction::OP_TRUNCATE: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + uint64_t off = op->off; + r = _truncate(cid, oid, off); + } + break; + case Transaction::OP_CLONE: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + ghobject_t noid = i.get_oid(op->dest_oid); + r = _clone(cid, oid, noid); + } + break; + case Transaction::OP_SETATTR: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + std::string name = i.decode_string(); + ceph::bufferlist bl; + i.decode_bl(bl); + std::map<std::string, bufferlist> to_set; + to_set.emplace(name, std::move(bl)); + r = _setattrs(cid, oid, std::move(to_set)); + } + break; + case Transaction::OP_SETATTRS: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + std::map<std::string, bufferlist> aset; + i.decode_attrset(aset); + r = _setattrs(cid, oid, std::move(aset)); + } + break; + case Transaction::OP_RMATTR: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + std::string name = i.decode_string(); + r = _rm_attr(cid, oid, name); + } + break; + case Transaction::OP_RMATTRS: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + r = _rm_attrs(cid, oid); + } + break; + case Transaction::OP_MKCOLL: + { + coll_t cid = i.get_cid(op->cid); + r = _create_collection(cid, op->split_bits); + } + break; + case Transaction::OP_SETALLOCHINT: + { + r = 0; + } + break; + case Transaction::OP_OMAP_CLEAR: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + r = _omap_clear(cid, oid); + } + break; + case Transaction::OP_OMAP_SETKEYS: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + 
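// ---------------------------------------------------------------------------
// Illustrative sketch, not part of this file: the switch above is the
// interpreter side of ceph::os::Transaction. A caller records the same ops
// into a transaction object and hands the whole batch to
// FuturizedStore::Shard::do_transaction(). Assuming an existing shard, an
// open CollectionRef `ch` and a ghobject_t `oid` (error handling omitted):
//
//   ceph::os::Transaction t;
//   ceph::bufferlist data;
//   data.append("hello");
//   t.create(ch->get_cid(), oid);                           // -> OP_CREATE
//   t.write(ch->get_cid(), oid, 0, data.length(), data);    // -> OP_WRITE
//   std::map<std::string, ceph::bufferlist> kvs;
//   kvs["key"] = data;
//   t.omap_setkeys(ch->get_cid(), oid, kvs);                // -> OP_OMAP_SETKEYS
//   seastar::future<> fut = shard.do_transaction(ch, std::move(t));  // commit
// ---------------------------------------------------------------------------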
std::map<std::string, ceph::bufferlist> aset; + i.decode_attrset(aset); + r = _omap_set_values(cid, oid, std::move(aset)); + } + break; + case Transaction::OP_OMAP_SETHEADER: + { + const coll_t &cid = i.get_cid(op->cid); + const ghobject_t &oid = i.get_oid(op->oid); + ceph::bufferlist bl; + i.decode_bl(bl); + r = _omap_set_header(cid, oid, bl); + } + break; + case Transaction::OP_OMAP_RMKEYS: + { + const coll_t &cid = i.get_cid(op->cid); + const ghobject_t &oid = i.get_oid(op->oid); + omap_keys_t keys; + i.decode_keyset(keys); + r = _omap_rmkeys(cid, oid, keys); + } + break; + case Transaction::OP_OMAP_RMKEYRANGE: + { + const coll_t &cid = i.get_cid(op->cid); + const ghobject_t &oid = i.get_oid(op->oid); + string first, last; + first = i.decode_string(); + last = i.decode_string(); + r = _omap_rmkeyrange(cid, oid, first, last); + } + break; + case Transaction::OP_COLL_HINT: + { + ceph::bufferlist hint; + i.decode_bl(hint); + // ignored + break; + } + default: + logger().error("bad op {}", static_cast<unsigned>(op->op)); + abort(); + } + if (r < 0) { + break; + } + } + } catch (std::exception &e) { + logger().error("{} got exception {}", __func__, e); + r = -EINVAL; + } + if (r < 0) { + logger().error(" transaction dump:\n"); + JSONFormatter f(true); + f.open_object_section("transaction"); + t.dump(&f); + f.close_section(); + std::stringstream str; + f.flush(str); + logger().error("{}", str.str()); + ceph_assert(r == 0); + } + return seastar::now(); +} + +int CyanStore::Shard::_remove(const coll_t& cid, const ghobject_t& oid) +{ + logger().debug("{} cid={} oid={}", + __func__, cid, oid); + auto c = _get_collection(cid); + if (!c) + return -ENOENT; + + auto i = c->object_hash.find(oid); + if (i == c->object_hash.end()) + return -ENOENT; + used_bytes -= i->second->get_size(); + c->object_hash.erase(i); + c->object_map.erase(oid); + return 0; +} + +int CyanStore::Shard::_touch(const coll_t& cid, const ghobject_t& oid) +{ + logger().debug("{} cid={} oid={}", + __func__, cid, oid); + auto c = _get_collection(cid); + if (!c) + return -ENOENT; + + c->get_or_create_object(oid); + return 0; +} + +int CyanStore::Shard::_write( + const coll_t& cid, + const ghobject_t& oid, + uint64_t offset, + size_t len, + const ceph::bufferlist& bl, + uint32_t fadvise_flags) +{ + logger().debug("{} {} {} {} ~ {}", + __func__, cid, oid, offset, len); + assert(len == bl.length()); + + auto c = _get_collection(cid); + if (!c) + return -ENOENT; + + ObjectRef o = c->get_or_create_object(oid); + if (len > 0 && !local_conf()->memstore_debug_omit_block_device_write) { + const ssize_t old_size = o->get_size(); + o->write(offset, bl); + used_bytes += (o->get_size() - old_size); + } + + return 0; +} + +int CyanStore::Shard::_zero( + const coll_t& cid, + const ghobject_t& oid, + uint64_t offset, + size_t len) +{ + logger().debug("{} {} {} {} ~ {}", + __func__, cid, oid, offset, len); + + ceph::buffer::list bl; + bl.append_zero(len); + return _write(cid, oid, offset, len, bl, 0); +} + +int CyanStore::Shard::_omap_clear( + const coll_t& cid, + const ghobject_t& oid) +{ + logger().debug("{} {} {}", __func__, cid, oid); + + auto c = _get_collection(cid); + if (!c) { + return -ENOENT; + } + ObjectRef o = c->get_object(oid); + if (!o) { + return -ENOENT; + } + o->omap.clear(); + o->omap_header.clear(); + return 0; +} + +int CyanStore::Shard::_omap_set_values( + const coll_t& cid, + const ghobject_t& oid, + std::map<std::string, ceph::bufferlist> &&aset) +{ + logger().debug( + "{} {} {} {} keys", + __func__, cid, oid, aset.size()); + 
+ auto c = _get_collection(cid); + if (!c) + return -ENOENT; + + ObjectRef o = c->get_or_create_object(oid); + for (auto&& [key, val]: aset) { + o->omap.insert_or_assign(std::move(key), std::move(val)); + } + return 0; +} + +int CyanStore::Shard::_omap_set_header( + const coll_t& cid, + const ghobject_t& oid, + const ceph::bufferlist &header) +{ + logger().debug( + "{} {} {} {} bytes", + __func__, cid, oid, header.length()); + + auto c = _get_collection(cid); + if (!c) + return -ENOENT; + + ObjectRef o = c->get_or_create_object(oid); + o->omap_header = header; + return 0; +} + +int CyanStore::Shard::_omap_rmkeys( + const coll_t& cid, + const ghobject_t& oid, + const omap_keys_t& aset) +{ + logger().debug( + "{} {} {} {} keys", + __func__, cid, oid, aset.size()); + + auto c = _get_collection(cid); + if (!c) + return -ENOENT; + + ObjectRef o = c->get_or_create_object(oid); + for (auto &i: aset) { + o->omap.erase(i); + } + return 0; +} + +int CyanStore::Shard::_omap_rmkeyrange( + const coll_t& cid, + const ghobject_t& oid, + const std::string &first, + const std::string &last) +{ + logger().debug( + "{} {} {} first={} last={}", + __func__, cid, oid, first, last); + + auto c = _get_collection(cid); + if (!c) + return -ENOENT; + + ObjectRef o = c->get_or_create_object(oid); + for (auto i = o->omap.lower_bound(first); + i != o->omap.end() && i->first < last; + o->omap.erase(i++)); + return 0; +} + +int CyanStore::Shard::_truncate( + const coll_t& cid, + const ghobject_t& oid, + uint64_t size) +{ + logger().debug("{} cid={} oid={} size={}", + __func__, cid, oid, size); + auto c = _get_collection(cid); + if (!c) + return -ENOENT; + + ObjectRef o = c->get_object(oid); + if (!o) + return -ENOENT; + if (local_conf()->memstore_debug_omit_block_device_write) + return 0; + const ssize_t old_size = o->get_size(); + int r = o->truncate(size); + used_bytes += (o->get_size() - old_size); + return r; +} + +int CyanStore::Shard::_clone( + const coll_t& cid, + const ghobject_t& oid, + const ghobject_t& noid) +{ + logger().debug("{} cid={} oid={} noid={}", + __func__, cid, oid, noid); + auto c = _get_collection(cid); + if (!c) + return -ENOENT; + + ObjectRef oo = c->get_object(oid); + if (!oo) + return -ENOENT; + if (local_conf()->memstore_debug_omit_block_device_write) + return 0; + ObjectRef no = c->get_or_create_object(noid); + used_bytes += ((ssize_t)oo->get_size() - (ssize_t)no->get_size()); + no->clone(oo.get(), 0, oo->get_size(), 0); + + no->omap_header = oo->omap_header; + no->omap = oo->omap; + no->xattr = oo->xattr; + return 0; +} + +int CyanStore::Shard::_setattrs( + const coll_t& cid, + const ghobject_t& oid, + std::map<std::string,bufferlist>&& aset) +{ + logger().debug("{} cid={} oid={}", + __func__, cid, oid); + auto c = _get_collection(cid); + if (!c) + return -ENOENT; + + ObjectRef o = c->get_object(oid); + if (!o) + return -ENOENT; + for (auto&& [key, val]: aset) { + o->xattr.insert_or_assign(std::move(key), std::move(val)); + } + return 0; +} + +int CyanStore::Shard::_rm_attr( + const coll_t& cid, + const ghobject_t& oid, + std::string_view name) +{ + logger().debug("{} cid={} oid={} name={}", __func__, cid, oid, name); + auto c = _get_collection(cid); + if (!c) { + return -ENOENT; + } + ObjectRef o = c->get_object(oid); + if (!o) { + return -ENOENT; + } + auto i = o->xattr.find(name); + if (i == o->xattr.end()) { + return -ENODATA; + } + o->xattr.erase(i); + return 0; +} + +int CyanStore::Shard::_rm_attrs( + const coll_t& cid, + const ghobject_t& oid) +{ + logger().debug("{} cid={} oid={}", 
__func__, cid, oid); + auto c = _get_collection(cid); + if (!c) { + return -ENOENT; + } + ObjectRef o = c->get_object(oid); + if (!o) { + return -ENOENT; + } + o->xattr.clear(); + return 0; +} + +int CyanStore::Shard::_create_collection(const coll_t& cid, int bits) +{ + auto result = coll_map.try_emplace(cid); + if (!result.second) + return -EEXIST; + auto p = new_coll_map.find(cid); + assert(p != new_coll_map.end()); + result.first->second = p->second; + result.first->second->bits = bits; + new_coll_map.erase(p); + return 0; +} + +boost::intrusive_ptr<Collection> +CyanStore::Shard::_get_collection(const coll_t& cid) +{ + auto cp = coll_map.find(cid); + if (cp == coll_map.end()) + return {}; + return cp->second; +} + +seastar::future<> CyanStore::write_meta( + const std::string& key, + const std::string& value) +{ + ceph_assert(seastar::this_shard_id() == primary_core); + std::string v = value; + v += "\n"; + if (int r = safe_write_file(path.c_str(), key.c_str(), + v.c_str(), v.length(), 0600); + r < 0) { + throw std::runtime_error{fmt::format("unable to write_meta({})", key)}; + } + return seastar::make_ready_future<>(); +} + +seastar::future<std::tuple<int, std::string>> +CyanStore::read_meta(const std::string& key) +{ + ceph_assert(seastar::this_shard_id() == primary_core); + std::string fsid(4096, '\0'); + int r = safe_read_file(path.c_str(), key.c_str(), fsid.data(), fsid.size()); + if (r > 0) { + fsid.resize(r); + // drop trailing newlines + boost::algorithm::trim_right_if(fsid, + [](unsigned char c) {return isspace(c);}); + } else { + fsid.clear(); + } + return seastar::make_ready_future<std::tuple<int, std::string>>( + std::make_tuple(r, fsid)); +} + +uuid_d CyanStore::get_fsid() const +{ + ceph_assert(seastar::this_shard_id() == primary_core); + return osd_fsid; +} + +unsigned CyanStore::Shard::get_max_attr_name_length() const +{ + // arbitrary limitation exactly like in the case of MemStore. 
+ return 256; +} + +CyanStore::Shard::read_errorator::future<std::map<uint64_t, uint64_t>> +CyanStore::Shard::fiemap( + CollectionRef ch, + const ghobject_t& oid, + uint64_t off, + uint64_t len) +{ + auto c = static_cast<Collection*>(ch.get()); + + ObjectRef o = c->get_object(oid); + if (!o) { + throw std::runtime_error(fmt::format("object does not exist: {}", oid)); + } + std::map<uint64_t, uint64_t> m{{0, o->get_size()}}; + return seastar::make_ready_future<std::map<uint64_t, uint64_t>>(std::move(m)); +} + +seastar::future<struct stat> +CyanStore::Shard::stat( + CollectionRef ch, + const ghobject_t& oid) +{ + auto c = static_cast<Collection*>(ch.get()); + auto o = c->get_object(oid); + if (!o) { + throw std::runtime_error(fmt::format("object does not exist: {}", oid)); + } + struct stat st; + st.st_size = o->get_size(); + return seastar::make_ready_future<struct stat>(std::move(st)); +} + +} diff --git a/src/crimson/os/cyanstore/cyan_store.h b/src/crimson/os/cyanstore/cyan_store.h new file mode 100644 index 000000000..307f9ec32 --- /dev/null +++ b/src/crimson/os/cyanstore/cyan_store.h @@ -0,0 +1,219 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <string> +#include <unordered_map> +#include <map> +#include <typeinfo> +#include <vector> + +#include <optional> +#include <seastar/core/future.hh> +#include <seastar/core/future-util.hh> + +#include "osd/osd_types.h" +#include "include/uuid.h" + +#include "crimson/os/cyanstore/cyan_object.h" +#include "crimson/os/cyanstore/cyan_collection.h" +#include "crimson/os/futurized_store.h" + +namespace ceph::os { +class Transaction; +} + +namespace crimson::os { +class CyanStore final : public FuturizedStore { + class Shard : public FuturizedStore::Shard { + public: + Shard(std::string path) + :path(path){} + + seastar::future<struct stat> stat( + CollectionRef c, + const ghobject_t& oid) final; + + read_errorator::future<ceph::bufferlist> read( + CollectionRef c, + const ghobject_t& oid, + uint64_t offset, + size_t len, + uint32_t op_flags = 0) final; + + read_errorator::future<ceph::bufferlist> readv( + CollectionRef c, + const ghobject_t& oid, + interval_set<uint64_t>& m, + uint32_t op_flags = 0) final; + + get_attr_errorator::future<ceph::bufferlist> get_attr( + CollectionRef c, + const ghobject_t& oid, + std::string_view name) const final; + + get_attrs_ertr::future<attrs_t> get_attrs( + CollectionRef c, + const ghobject_t& oid) final; + + read_errorator::future<omap_values_t> omap_get_values( + CollectionRef c, + const ghobject_t& oid, + const omap_keys_t& keys) final; + + read_errorator::future<std::tuple<bool, omap_values_t>> omap_get_values( + CollectionRef c, ///< [in] collection + const ghobject_t &oid, ///< [in] oid + const std::optional<std::string> &start ///< [in] start, empty for begin + ) final; + + get_attr_errorator::future<ceph::bufferlist> omap_get_header( + CollectionRef c, + const ghobject_t& oid) final; + + seastar::future<std::tuple<std::vector<ghobject_t>, ghobject_t>> + list_objects( + CollectionRef c, + const ghobject_t& start, + const ghobject_t& end, + uint64_t limit) const final; + + seastar::future<CollectionRef> create_new_collection(const coll_t& cid) final; + + seastar::future<CollectionRef> open_collection(const coll_t& cid) final; + + seastar::future<> do_transaction_no_callbacks( + CollectionRef ch, + ceph::os::Transaction&& txn) final; + + read_errorator::future<std::map<uint64_t, uint64_t>> + fiemap( + CollectionRef c, + const 
ghobject_t& oid, + uint64_t off, + uint64_t len) final; + + unsigned get_max_attr_name_length() const final; + + public: + // only exposed to CyanStore + mount_ertr::future<> mount(); + + seastar::future<> umount(); + + seastar::future<> mkfs(); + + mkfs_ertr::future<> mkcoll(uuid_d new_osd_fsid); + + using coll_core_t = FuturizedStore::coll_core_t; + seastar::future<std::vector<coll_core_t>> list_collections(); + + uint64_t get_used_bytes() const { return used_bytes; } + + private: + int _remove(const coll_t& cid, const ghobject_t& oid); + int _touch(const coll_t& cid, const ghobject_t& oid); + int _write(const coll_t& cid, const ghobject_t& oid, + uint64_t offset, size_t len, const ceph::bufferlist& bl, + uint32_t fadvise_flags); + int _zero(const coll_t& cid, const ghobject_t& oid, + uint64_t offset, size_t len); + int _omap_clear( + const coll_t& cid, + const ghobject_t& oid); + int _omap_set_values( + const coll_t& cid, + const ghobject_t& oid, + std::map<std::string, ceph::bufferlist> &&aset); + int _omap_set_header( + const coll_t& cid, + const ghobject_t& oid, + const ceph::bufferlist &header); + int _omap_rmkeys( + const coll_t& cid, + const ghobject_t& oid, + const omap_keys_t& aset); + int _omap_rmkeyrange( + const coll_t& cid, + const ghobject_t& oid, + const std::string &first, + const std::string &last); + int _truncate(const coll_t& cid, const ghobject_t& oid, uint64_t size); + int _clone(const coll_t& cid, const ghobject_t& oid, + const ghobject_t& noid); + int _setattrs(const coll_t& cid, const ghobject_t& oid, + std::map<std::string,bufferlist>&& aset); + int _rm_attr(const coll_t& cid, const ghobject_t& oid, + std::string_view name); + int _rm_attrs(const coll_t& cid, const ghobject_t& oid); + int _create_collection(const coll_t& cid, int bits); + boost::intrusive_ptr<Collection> _get_collection(const coll_t& cid); + + private: + uint64_t used_bytes = 0; + const std::string path; + std::unordered_map<coll_t, boost::intrusive_ptr<Collection>> coll_map; + std::map<coll_t, boost::intrusive_ptr<Collection>> new_coll_map; + }; + +public: + CyanStore(const std::string& path); + ~CyanStore() final; + + seastar::future<> start() final { + ceph_assert(seastar::this_shard_id() == primary_core); + return shard_stores.start(path); + } + + seastar::future<> stop() final { + ceph_assert(seastar::this_shard_id() == primary_core); + return shard_stores.stop(); + } + + mount_ertr::future<> mount() final { + ceph_assert(seastar::this_shard_id() == primary_core); + return shard_stores.invoke_on_all( + [](auto &local_store) { + return local_store.mount().handle_error( + crimson::stateful_ec::handle([](const auto& ec) { + crimson::get_logger(ceph_subsys_cyanstore).error( + "error mounting cyanstore: ({}) {}", + ec.value(), ec.message()); + std::exit(EXIT_FAILURE); + })); + }); + } + + seastar::future<> umount() final { + ceph_assert(seastar::this_shard_id() == primary_core); + return shard_stores.invoke_on_all( + [](auto &local_store) { + return local_store.umount(); + }); + } + + mkfs_ertr::future<> mkfs(uuid_d new_osd_fsid) final; + + seastar::future<store_statfs_t> stat() const final; + + uuid_d get_fsid() const final; + + seastar::future<> write_meta(const std::string& key, + const std::string& value) final; + + FuturizedStore::Shard& get_sharded_store() final{ + return shard_stores.local(); + } + + seastar::future<std::tuple<int, std::string>> + read_meta(const std::string& key) final; + + seastar::future<std::vector<coll_core_t>> list_collections() final; + +private: + 
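// ---------------------------------------------------------------------------
// Illustrative sketch, not part of this header: how a store of this kind is
// driven end-to-end from a Seastar reactor thread. C++20 coroutines are
// assumed for brevity, and the errorator plumbing around mkfs()/mount() (see
// the handle_error() call in mount() above) is glossed over:
//
//   seastar::future<> demo(const std::string& path, const ConfigValues& values) {
//     auto store = FuturizedStore::create("cyanstore", path, values);
//     co_await store->start();                    // one Shard per reactor core
//     co_await store->mkfs(uuid_d{});             // zero fsid => generate one
//     co_await store->mount();
//     auto& shard = store->get_sharded_store();   // per-core entry point
//     auto ch = co_await shard.create_new_collection(coll_t::meta());
//     // ... submit reads and transactions against `ch` ...
//     co_await store->umount();
//     co_await store->stop();
//   }
// ---------------------------------------------------------------------------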
seastar::sharded<CyanStore::Shard> shard_stores; + const std::string path; + uuid_d osd_fsid; +}; +} diff --git a/src/crimson/os/futurized_collection.h b/src/crimson/os/futurized_collection.h new file mode 100644 index 000000000..7b460dffe --- /dev/null +++ b/src/crimson/os/futurized_collection.h @@ -0,0 +1,37 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <boost/intrusive_ptr.hpp> +#include <boost/smart_ptr/intrusive_ref_counter.hpp> +#include <seastar/core/future.hh> + +#include "osd/osd_types.h" + +namespace crimson::os { +class FuturizedStore; + +class FuturizedCollection + : public boost::intrusive_ref_counter<FuturizedCollection, + boost::thread_safe_counter> +{ +public: + FuturizedCollection(const coll_t& cid) + : cid{cid} {} + virtual ~FuturizedCollection() {} + virtual seastar::future<> flush() { + return seastar::make_ready_future<>(); + } + virtual seastar::future<bool> flush_commit() { + return seastar::make_ready_future<bool>(true); + } + const coll_t& get_cid() const { + return cid; + } +private: + const coll_t cid; +}; + +using CollectionRef = boost::intrusive_ptr<FuturizedCollection>; +} diff --git a/src/crimson/os/futurized_store.cc b/src/crimson/os/futurized_store.cc new file mode 100644 index 000000000..bc47ec78f --- /dev/null +++ b/src/crimson/os/futurized_store.cc @@ -0,0 +1,36 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "futurized_store.h" +#include "cyanstore/cyan_store.h" +#ifdef WITH_BLUESTORE +#include "alienstore/alien_store.h" +#endif +#include "seastore/seastore.h" + +namespace crimson::os { + +std::unique_ptr<FuturizedStore> +FuturizedStore::create(const std::string& type, + const std::string& data, + const ConfigValues& values) +{ + if (type == "cyanstore") { + using crimson::os::CyanStore; + return std::make_unique<CyanStore>(data); + } else if (type == "seastore") { + return crimson::os::seastore::make_seastore( + data); + } else { + using crimson::os::AlienStore; +#ifdef WITH_BLUESTORE + // use AlienStore as a fallback. It adapts e.g. BlueStore. 
+ return std::make_unique<AlienStore>(type, data, values); +#else + ceph_abort_msgf("unsupported objectstore type: %s", type.c_str()); + return {}; +#endif + } +} + +} diff --git a/src/crimson/os/futurized_store.h b/src/crimson/os/futurized_store.h new file mode 100644 index 000000000..783cd7485 --- /dev/null +++ b/src/crimson/os/futurized_store.h @@ -0,0 +1,195 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <string> +#include <map> +#include <optional> +#include <vector> + +#include <seastar/core/future.hh> + +#include "os/Transaction.h" +#include "crimson/common/smp_helpers.h" +#include "crimson/common/smp_helpers.h" +#include "crimson/osd/exceptions.h" +#include "include/buffer_fwd.h" +#include "include/uuid.h" +#include "osd/osd_types.h" + +namespace ceph::os { +class Transaction; +} + +namespace crimson::os { +class FuturizedCollection; + +class FuturizedStore { +public: + class Shard { + public: + Shard() = default; + virtual ~Shard() = default; + // no copying + explicit Shard(const Shard& o) = delete; + const Shard& operator=(const Shard& o) = delete; + + using CollectionRef = boost::intrusive_ptr<FuturizedCollection>; + using read_errorator = crimson::errorator<crimson::ct_error::enoent, + crimson::ct_error::input_output_error>; + virtual read_errorator::future<ceph::bufferlist> read( + CollectionRef c, + const ghobject_t& oid, + uint64_t offset, + size_t len, + uint32_t op_flags = 0) = 0; + + virtual read_errorator::future<ceph::bufferlist> readv( + CollectionRef c, + const ghobject_t& oid, + interval_set<uint64_t>& m, + uint32_t op_flags = 0) = 0; + + using get_attr_errorator = crimson::errorator< + crimson::ct_error::enoent, + crimson::ct_error::enodata>; + virtual get_attr_errorator::future<ceph::bufferlist> get_attr( + CollectionRef c, + const ghobject_t& oid, + std::string_view name) const = 0; + + using get_attrs_ertr = crimson::errorator< + crimson::ct_error::enoent>; + using attrs_t = std::map<std::string, ceph::bufferlist, std::less<>>; + virtual get_attrs_ertr::future<attrs_t> get_attrs( + CollectionRef c, + const ghobject_t& oid) = 0; + + virtual seastar::future<struct stat> stat( + CollectionRef c, + const ghobject_t& oid) = 0; + + using omap_values_t = std::map<std::string, ceph::bufferlist, std::less<>>; + using omap_keys_t = std::set<std::string>; + virtual read_errorator::future<omap_values_t> omap_get_values( + CollectionRef c, + const ghobject_t& oid, + const omap_keys_t& keys) = 0; + + virtual read_errorator::future<std::tuple<bool, omap_values_t>> omap_get_values( + CollectionRef c, ///< [in] collection + const ghobject_t &oid, ///< [in] oid + const std::optional<std::string> &start ///< [in] start, empty for begin + ) = 0; ///< @return <done, values> values.empty() only if done + + virtual get_attr_errorator::future<bufferlist> omap_get_header( + CollectionRef c, + const ghobject_t& oid) = 0; + + virtual seastar::future<std::tuple<std::vector<ghobject_t>, ghobject_t>> list_objects( + CollectionRef c, + const ghobject_t& start, + const ghobject_t& end, + uint64_t limit) const = 0; + + virtual seastar::future<CollectionRef> create_new_collection(const coll_t& cid) = 0; + + virtual seastar::future<CollectionRef> open_collection(const coll_t& cid) = 0; + + protected: + virtual seastar::future<> do_transaction_no_callbacks( + CollectionRef ch, + ceph::os::Transaction&& txn) = 0; + + public: + seastar::future<> do_transaction( + CollectionRef ch, + ceph::os::Transaction&& txn) { + 
std::unique_ptr<Context> on_commit( + ceph::os::Transaction::collect_all_contexts(txn)); + return do_transaction_no_callbacks( + std::move(ch), std::move(txn) + ).then([on_commit=std::move(on_commit)]() mutable { + auto c = on_commit.release(); + if (c) c->complete(0); + return seastar::now(); + }); + } + + + /** + * flush + * + * Flushes outstanding transactions on ch, returned future resolves + * after any previously submitted transactions on ch have committed. + * + * @param ch [in] collection on which to flush + */ + virtual seastar::future<> flush(CollectionRef ch) { + return do_transaction(ch, ceph::os::Transaction{}); + } + + // error injection + virtual seastar::future<> inject_data_error(const ghobject_t& o) { + return seastar::now(); + } + + virtual seastar::future<> inject_mdata_error(const ghobject_t& o) { + return seastar::now(); + } + + virtual read_errorator::future<std::map<uint64_t, uint64_t>> fiemap( + CollectionRef ch, + const ghobject_t& oid, + uint64_t off, + uint64_t len) = 0; + + virtual unsigned get_max_attr_name_length() const = 0; + }; + +public: + static std::unique_ptr<FuturizedStore> create(const std::string& type, + const std::string& data, + const ConfigValues& values); + FuturizedStore() + : primary_core(seastar::this_shard_id()) + {} + + virtual ~FuturizedStore() = default; + + // no copying + explicit FuturizedStore(const FuturizedStore& o) = delete; + const FuturizedStore& operator=(const FuturizedStore& o) = delete; + + virtual seastar::future<> start() = 0; + + virtual seastar::future<> stop() = 0; + + using mount_ertr = crimson::errorator<crimson::stateful_ec>; + virtual mount_ertr::future<> mount() = 0; + + virtual seastar::future<> umount() = 0; + + using mkfs_ertr = crimson::errorator<crimson::stateful_ec>; + virtual mkfs_ertr::future<> mkfs(uuid_d new_osd_fsid) = 0; + + virtual seastar::future<store_statfs_t> stat() const = 0; + + virtual uuid_d get_fsid() const = 0; + + virtual seastar::future<> write_meta(const std::string& key, + const std::string& value) = 0; + // called on the shard and get this FuturizedStore::shard; + virtual Shard& get_sharded_store() = 0; + + virtual seastar::future<std::tuple<int, std::string>> read_meta( + const std::string& key) = 0; + + using coll_core_t = std::pair<coll_t, core_id_t>; + virtual seastar::future<std::vector<coll_core_t>> list_collections() = 0; + +protected: + const core_id_t primary_core; +}; +} diff --git a/src/crimson/os/seastore/CMakeLists.txt b/src/crimson/os/seastore/CMakeLists.txt new file mode 100644 index 000000000..4bdbab8c4 --- /dev/null +++ b/src/crimson/os/seastore/CMakeLists.txt @@ -0,0 +1,79 @@ +set(crimson_seastore_srcs + cached_extent.cc + seastore_types.cc + segment_manager.cc + segment_manager/ephemeral.cc + segment_manager/block.cc + transaction_manager.cc + transaction.cc + cache.cc + root_block.cc + lba_manager.cc + async_cleaner.cc + backref_manager.cc + backref/backref_tree_node.cc + backref/btree_backref_manager.cc + lba_manager/btree/btree_lba_manager.cc + lba_manager/btree/lba_btree_node.cc + omap_manager.cc + omap_manager/btree/btree_omap_manager.cc + omap_manager/btree/omap_btree_node_impl.cc + btree/btree_range_pin.cc + btree/fixed_kv_node.cc + onode.cc + onode_manager/staged-fltree/node.cc + onode_manager/staged-fltree/node_extent_manager.cc + onode_manager/staged-fltree/node_extent_manager/seastore.cc + onode_manager/staged-fltree/node_impl.cc + onode_manager/staged-fltree/stages/item_iterator_stage.cc + onode_manager/staged-fltree/stages/key_layout.cc + 
onode_manager/staged-fltree/stages/node_stage_layout.cc + onode_manager/staged-fltree/stages/node_stage.cc + onode_manager/staged-fltree/stages/sub_items_stage.cc + onode_manager/staged-fltree/super.cc + onode_manager/staged-fltree/value.cc + onode_manager/staged-fltree/fltree_onode_manager.cc + collection_manager.cc + collection_manager/flat_collection_manager.cc + collection_manager/collection_flat_node.cc + extent_placement_manager.cc + object_data_handler.cc + seastore.cc + random_block_manager.cc + random_block_manager/block_rb_manager.cc + random_block_manager/rbm_device.cc + random_block_manager/nvme_block_device.cc + random_block_manager/avlallocator.cc + journal/segmented_journal.cc + journal/segment_allocator.cc + journal/record_submitter.cc + journal/circular_journal_space.cc + journal.cc + device.cc + segment_manager_group.cc + record_scanner.cc + journal/circular_bounded_journal.cc + ../../../test/crimson/seastore/test_block.cc + ${PROJECT_SOURCE_DIR}/src/os/Transaction.cc + ) + +CMAKE_DEPENDENT_OPTION(WITH_ZNS "enable Linux ZNS support" OFF + "CMAKE_SYSTEM_NAME STREQUAL Linux" OFF) +if(WITH_ZNS) + find_package(LinuxZNS REQUIRED) + list(APPEND crimson_seastore_srcs + segment_manager/zbd.cc) +endif() + +add_library(crimson-seastore STATIC + ${crimson_seastore_srcs}) + +target_link_libraries(crimson-seastore + crimson) +if(WITH_ZNS) + target_link_libraries(crimson-seastore + Linux::ZNS) +endif() + +set_target_properties(crimson-seastore PROPERTIES + JOB_POOL_COMPILE heavy_compile_job_pool) diff --git a/src/crimson/os/seastore/async_cleaner.cc b/src/crimson/os/seastore/async_cleaner.cc new file mode 100644 index 000000000..d7e398f5f --- /dev/null +++ b/src/crimson/os/seastore/async_cleaner.cc @@ -0,0 +1,1817 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <fmt/chrono.h> +#include <seastar/core/metrics.hh> + +#include "crimson/os/seastore/logging.h" + +#include "crimson/os/seastore/async_cleaner.h" +#include "crimson/os/seastore/backref_manager.h" +#include "crimson/os/seastore/transaction_manager.h" + +SET_SUBSYS(seastore_cleaner); + +namespace { + +enum class gc_formula_t { + GREEDY, + BENEFIT, + COST_BENEFIT, +}; +constexpr auto gc_formula = gc_formula_t::COST_BENEFIT; + +} + +namespace crimson::os::seastore { + +void segment_info_t::set_open( + segment_seq_t _seq, segment_type_t _type, + data_category_t _category, rewrite_gen_t _generation) +{ + ceph_assert(_seq != NULL_SEG_SEQ); + ceph_assert(_type != segment_type_t::NULL_SEG); + ceph_assert(_category != data_category_t::NUM); + ceph_assert(is_rewrite_generation(_generation)); + state = Segment::segment_state_t::OPEN; + seq = _seq; + type = _type; + category = _category; + generation = _generation; + written_to = 0; +} + +void segment_info_t::set_empty() +{ + state = Segment::segment_state_t::EMPTY; + seq = NULL_SEG_SEQ; + type = segment_type_t::NULL_SEG; + category = data_category_t::NUM; + generation = NULL_GENERATION; + modify_time = NULL_TIME; + num_extents = 0; + written_to = 0; +} + +void segment_info_t::set_closed() +{ + state = Segment::segment_state_t::CLOSED; + // the rest of information is unchanged +} + +void segment_info_t::init_closed( + segment_seq_t _seq, segment_type_t _type, + data_category_t _category, rewrite_gen_t _generation, + segment_off_t seg_size) +{ + ceph_assert(_seq != NULL_SEG_SEQ); + ceph_assert(_type != segment_type_t::NULL_SEG); + ceph_assert(_category != data_category_t::NUM); + ceph_assert(is_rewrite_generation(_generation)); 
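segment_info_t encodes a small per-segment state machine that the helpers above drive; init_closed(), whose body continues below, is the mount-time shortcut. A sketch of the intended transitions, purely as an aside:

```cpp
// Illustrative life cycle of segment_info_t (not part of the patch), as driven
// by segments_info_t further down in this file:
//
//        mark_open()/set_open()          mark_closed()/set_closed()
//  EMPTY -----------------------> OPEN -----------------------------> CLOSED
//    ^                                                                   |
//    |                    mark_empty()/set_empty()                       |
//    +-------------------------------------------------------------------+
//
// init_closed() is the replay shortcut used while mounting: it takes a segment
// straight from EMPTY to CLOSED using the metadata recovered from the segment
// header/tail, without ever opening it for new writes.
```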
+ state = Segment::segment_state_t::CLOSED; + seq = _seq; + type = _type; + category = _category; + generation = _generation; + written_to = seg_size; +} + +std::ostream& operator<<(std::ostream &out, const segment_info_t &info) +{ + out << "seg_info_t(" + << "state=" << info.state + << ", " << info.id; + if (info.is_empty()) { + // pass + } else { // open or closed + out << " " << info.type + << " " << segment_seq_printer_t{info.seq} + << " " << info.category + << " " << rewrite_gen_printer_t{info.generation} + << ", modify_time=" << sea_time_point_printer_t{info.modify_time} + << ", num_extents=" << info.num_extents + << ", written_to=" << info.written_to; + } + return out << ")"; +} + +void segments_info_t::reset() +{ + segments.clear(); + + segment_size = 0; + + journal_segment_id = NULL_SEG_ID; + num_in_journal_open = 0; + num_type_journal = 0; + num_type_ool = 0; + + num_open = 0; + num_empty = 0; + num_closed = 0; + + count_open_journal = 0; + count_open_ool = 0; + count_release_journal = 0; + count_release_ool = 0; + count_close_journal = 0; + count_close_ool = 0; + + total_bytes = 0; + avail_bytes_in_open = 0; + + modify_times.clear(); +} + +void segments_info_t::add_segment_manager( + SegmentManager &segment_manager) +{ + LOG_PREFIX(segments_info_t::add_segment_manager); + device_id_t d_id = segment_manager.get_device_id(); + auto ssize = segment_manager.get_segment_size(); + auto nsegments = segment_manager.get_num_segments(); + auto sm_size = segment_manager.get_available_size(); + INFO("adding segment manager {}, size={}, ssize={}, segments={}", + device_id_printer_t{d_id}, sm_size, ssize, nsegments); + ceph_assert(ssize > 0); + ceph_assert(nsegments > 0); + ceph_assert(sm_size > 0); + + // also validate if the device is duplicated + segments.add_device(d_id, nsegments, segment_info_t{}); + + // assume all the segment managers share the same settings as follows. 
+ if (segment_size == 0) { + ceph_assert(ssize > 0); + segment_size = ssize; + } else { + ceph_assert(segment_size == ssize); + } + + // NOTE: by default the segments are empty + num_empty += nsegments; + + total_bytes += sm_size; +} + +void segments_info_t::init_closed( + segment_id_t segment, segment_seq_t seq, segment_type_t type, + data_category_t category, rewrite_gen_t generation) +{ + LOG_PREFIX(segments_info_t::init_closed); + auto& segment_info = segments[segment]; + DEBUG("initiating {} {} {} {} {}, {}, " + "num_segments(empty={}, opened={}, closed={})", + segment, type, segment_seq_printer_t{seq}, + category, rewrite_gen_printer_t{generation}, + segment_info, num_empty, num_open, num_closed); + ceph_assert(segment_info.is_empty()); + ceph_assert(num_empty > 0); + --num_empty; + ++num_closed; + if (type == segment_type_t::JOURNAL) { + // init_closed won't initialize journal_segment_id + ceph_assert(get_submitted_journal_head() == JOURNAL_SEQ_NULL); + ++num_type_journal; + } else { + ++num_type_ool; + } + // do not increment count_close_*; + + if (segment_info.modify_time != NULL_TIME) { + modify_times.insert(segment_info.modify_time); + } else { + ceph_assert(segment_info.num_extents == 0); + } + + segment_info.init_closed( + seq, type, category, generation, get_segment_size()); +} + +void segments_info_t::mark_open( + segment_id_t segment, segment_seq_t seq, segment_type_t type, + data_category_t category, rewrite_gen_t generation) +{ + LOG_PREFIX(segments_info_t::mark_open); + auto& segment_info = segments[segment]; + INFO("opening {} {} {} {} {}, {}, " + "num_segments(empty={}, opened={}, closed={})", + segment, type, segment_seq_printer_t{seq}, + category, rewrite_gen_printer_t{generation}, + segment_info, num_empty, num_open, num_closed); + ceph_assert(segment_info.is_empty()); + ceph_assert(num_empty > 0); + --num_empty; + ++num_open; + if (type == segment_type_t::JOURNAL) { + if (journal_segment_id != NULL_SEG_ID) { + auto& last_journal_segment = segments[journal_segment_id]; + ceph_assert(last_journal_segment.is_closed()); + ceph_assert(last_journal_segment.type == segment_type_t::JOURNAL); + ceph_assert(last_journal_segment.seq + 1 == seq); + } + journal_segment_id = segment; + + ++num_in_journal_open; + ++num_type_journal; + ++count_open_journal; + } else { + ++num_type_ool; + ++count_open_ool; + } + avail_bytes_in_open += get_segment_size(); + + segment_info.set_open(seq, type, category, generation); +} + +void segments_info_t::mark_empty( + segment_id_t segment) +{ + LOG_PREFIX(segments_info_t::mark_empty); + auto& segment_info = segments[segment]; + INFO("releasing {}, {}, num_segments(empty={}, opened={}, closed={})", + segment, segment_info, + num_empty, num_open, num_closed); + ceph_assert(segment_info.is_closed()); + auto type = segment_info.type; + assert(type != segment_type_t::NULL_SEG); + ceph_assert(num_closed > 0); + --num_closed; + ++num_empty; + if (type == segment_type_t::JOURNAL) { + ceph_assert(num_type_journal > 0); + --num_type_journal; + ++count_release_journal; + } else { + ceph_assert(num_type_ool > 0); + --num_type_ool; + ++count_release_ool; + } + + if (segment_info.modify_time != NULL_TIME) { + auto to_erase = modify_times.find(segment_info.modify_time); + ceph_assert(to_erase != modify_times.end()); + modify_times.erase(to_erase); + } else { + ceph_assert(segment_info.num_extents == 0); + } + + segment_info.set_empty(); +} + +void segments_info_t::mark_closed( + segment_id_t segment) +{ + LOG_PREFIX(segments_info_t::mark_closed); + auto& 
segment_info = segments[segment]; + INFO("closing {}, {}, num_segments(empty={}, opened={}, closed={})", + segment, segment_info, + num_empty, num_open, num_closed); + ceph_assert(segment_info.is_open()); + ceph_assert(num_open > 0); + --num_open; + ++num_closed; + if (segment_info.type == segment_type_t::JOURNAL) { + ceph_assert(num_in_journal_open > 0); + --num_in_journal_open; + ++count_close_journal; + } else { + ++count_close_ool; + } + ceph_assert(get_segment_size() >= segment_info.written_to); + auto seg_avail_bytes = get_segment_size() - segment_info.written_to; + ceph_assert(avail_bytes_in_open >= (std::size_t)seg_avail_bytes); + avail_bytes_in_open -= seg_avail_bytes; + + if (segment_info.modify_time != NULL_TIME) { + modify_times.insert(segment_info.modify_time); + } else { + ceph_assert(segment_info.num_extents == 0); + } + + segment_info.set_closed(); +} + +void segments_info_t::update_written_to( + segment_type_t type, + paddr_t offset) +{ + LOG_PREFIX(segments_info_t::update_written_to); + auto& saddr = offset.as_seg_paddr(); + auto& segment_info = segments[saddr.get_segment_id()]; + if (!segment_info.is_open()) { + ERROR("segment is not open, not updating, type={}, offset={}, {}", + type, offset, segment_info); + ceph_abort(); + } + + auto new_written_to = saddr.get_segment_off(); + ceph_assert(new_written_to <= get_segment_size()); + if (segment_info.written_to > new_written_to) { + ERROR("written_to should not decrease! type={}, offset={}, {}", + type, offset, segment_info); + ceph_abort(); + } + + DEBUG("type={}, offset={}, {}", type, offset, segment_info); + ceph_assert(type == segment_info.type); + auto avail_deduction = new_written_to - segment_info.written_to; + ceph_assert(avail_bytes_in_open >= (std::size_t)avail_deduction); + avail_bytes_in_open -= avail_deduction; + segment_info.written_to = new_written_to; +} + +std::ostream &operator<<(std::ostream &os, const segments_info_t &infos) +{ + return os << "segments(" + << "empty=" << infos.get_num_empty() + << ", open=" << infos.get_num_open() + << ", closed=" << infos.get_num_closed() + << ", type_journal=" << infos.get_num_type_journal() + << ", type_ool=" << infos.get_num_type_ool() + << ", total=" << infos.get_total_bytes() << "B" + << ", available=" << infos.get_available_bytes() << "B" + << ", unavailable=" << infos.get_unavailable_bytes() << "B" + << ", available_ratio=" << infos.get_available_ratio() + << ", submitted_head=" << infos.get_submitted_journal_head() + << ", time_bound=" << sea_time_point_printer_t{infos.get_time_bound()} + << ")"; +} + +void JournalTrimmerImpl::config_t::validate() const +{ + ceph_assert(max_journal_bytes <= DEVICE_OFF_MAX); + ceph_assert(max_journal_bytes > target_journal_dirty_bytes); + ceph_assert(max_journal_bytes > target_journal_alloc_bytes); + ceph_assert(rewrite_dirty_bytes_per_cycle > 0); + ceph_assert(rewrite_backref_bytes_per_cycle > 0); +} + +JournalTrimmerImpl::config_t +JournalTrimmerImpl::config_t::get_default( + std::size_t roll_size, journal_type_t type) +{ + assert(roll_size); + std::size_t target_dirty_bytes = 0; + std::size_t target_alloc_bytes = 0; + std::size_t max_journal_bytes = 0; + if (type == journal_type_t::SEGMENTED) { + target_dirty_bytes = 12 * roll_size; + target_alloc_bytes = 2 * roll_size; + max_journal_bytes = 16 * roll_size; + } else { + assert(type == journal_type_t::RANDOM_BLOCK); + target_dirty_bytes = roll_size / 4; + target_alloc_bytes = roll_size / 4; + max_journal_bytes = roll_size / 2; + } + return config_t{ + target_dirty_bytes, + 
target_alloc_bytes, + max_journal_bytes, + 1<<17,// rewrite_dirty_bytes_per_cycle + 1<<24 // rewrite_backref_bytes_per_cycle + }; +} + +JournalTrimmerImpl::config_t +JournalTrimmerImpl::config_t::get_test( + std::size_t roll_size, journal_type_t type) +{ + assert(roll_size); + std::size_t target_dirty_bytes = 0; + std::size_t target_alloc_bytes = 0; + std::size_t max_journal_bytes = 0; + if (type == journal_type_t::SEGMENTED) { + target_dirty_bytes = 2 * roll_size; + target_alloc_bytes = 2 * roll_size; + max_journal_bytes = 4 * roll_size; + } else { + assert(type == journal_type_t::RANDOM_BLOCK); + target_dirty_bytes = roll_size / 4; + target_alloc_bytes = roll_size / 4; + max_journal_bytes = roll_size / 2; + } + return config_t{ + target_dirty_bytes, + target_alloc_bytes, + max_journal_bytes, + 1<<17,// rewrite_dirty_bytes_per_cycle + 1<<24 // rewrite_backref_bytes_per_cycle + }; +} + +JournalTrimmerImpl::JournalTrimmerImpl( + BackrefManager &backref_manager, + config_t config, + journal_type_t type, + device_off_t roll_start, + device_off_t roll_size) + : backref_manager(backref_manager), + config(config), + journal_type(type), + roll_start(roll_start), + roll_size(roll_size), + reserved_usage(0) +{ + config.validate(); + ceph_assert(roll_start >= 0); + ceph_assert(roll_size > 0); + register_metrics(); +} + +void JournalTrimmerImpl::set_journal_head(journal_seq_t head) +{ + LOG_PREFIX(JournalTrimmerImpl::set_journal_head); + + ceph_assert(head != JOURNAL_SEQ_NULL); + ceph_assert(journal_head == JOURNAL_SEQ_NULL || + head >= journal_head); + ceph_assert(journal_alloc_tail == JOURNAL_SEQ_NULL || + head >= journal_alloc_tail); + ceph_assert(journal_dirty_tail == JOURNAL_SEQ_NULL || + head >= journal_dirty_tail); + + std::swap(journal_head, head); + if (journal_head.segment_seq == head.segment_seq) { + DEBUG("journal_head {} => {}, {}", + head, journal_head, stat_printer_t{*this, false}); + } else { + INFO("journal_head {} => {}, {}", + head, journal_head, stat_printer_t{*this, false}); + } + background_callback->maybe_wake_background(); +} + +void JournalTrimmerImpl::update_journal_tails( + journal_seq_t dirty_tail, + journal_seq_t alloc_tail) +{ + LOG_PREFIX(JournalTrimmerImpl::update_journal_tails); + + if (dirty_tail != JOURNAL_SEQ_NULL) { + ceph_assert(journal_head == JOURNAL_SEQ_NULL || + journal_head >= dirty_tail); + if (journal_dirty_tail != JOURNAL_SEQ_NULL && + journal_dirty_tail > dirty_tail) { + ERROR("journal_dirty_tail {} => {} is backwards!", + journal_dirty_tail, dirty_tail); + ceph_abort(); + } + std::swap(journal_dirty_tail, dirty_tail); + if (journal_dirty_tail.segment_seq == dirty_tail.segment_seq) { + DEBUG("journal_dirty_tail {} => {}, {}", + dirty_tail, journal_dirty_tail, stat_printer_t{*this, false}); + } else { + INFO("journal_dirty_tail {} => {}, {}", + dirty_tail, journal_dirty_tail, stat_printer_t{*this, false}); + } + } + + if (alloc_tail != JOURNAL_SEQ_NULL) { + ceph_assert(journal_head == JOURNAL_SEQ_NULL || + journal_head >= alloc_tail); + if (journal_alloc_tail != JOURNAL_SEQ_NULL && + journal_alloc_tail > alloc_tail) { + ERROR("journal_alloc_tail {} => {} is backwards!", + journal_alloc_tail, alloc_tail); + ceph_abort(); + } + std::swap(journal_alloc_tail, alloc_tail); + if (journal_alloc_tail.segment_seq == alloc_tail.segment_seq) { + DEBUG("journal_alloc_tail {} => {}, {}", + alloc_tail, journal_alloc_tail, stat_printer_t{*this, false}); + } else { + INFO("journal_alloc_tail {} => {}, {}", + alloc_tail, journal_alloc_tail, stat_printer_t{*this, false}); 
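config_t::get_default() above derives the trimming thresholds from the journal roll size (one segment for the segmented journal). Roughly, the dirty tail marks the oldest record whose dirty metadata has not yet been rewritten and the alloc tail the oldest record whose allocation info has not yet been merged into the backref tree; the trimmer tries to keep both within their targets of the head, and IO is expected to be throttled once max_journal_bytes would be exceeded (see should_block_io_on_trim later in the file). A worked example with an assumed 64 MiB roll size, not a value taken from this patch:

```cpp
// Worked example (illustrative) of config_t::get_default() for a SEGMENTED
// journal, assuming roll_size is one 64 MiB segment.
constexpr std::size_t roll_size          = 64 << 20;        // assumed
constexpr std::size_t target_dirty_bytes = 12 * roll_size;  // 768 MiB
constexpr std::size_t target_alloc_bytes = 2 * roll_size;   // 128 MiB
constexpr std::size_t max_journal_bytes  = 16 * roll_size;  //   1 GiB
// consistent with config_t::validate(): the hard cap exceeds both targets
static_assert(max_journal_bytes > target_dirty_bytes &&
              max_journal_bytes > target_alloc_bytes);
```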
+ } + } + + background_callback->maybe_wake_background(); + background_callback->maybe_wake_blocked_io(); +} + +journal_seq_t JournalTrimmerImpl::get_tail_limit() const +{ + assert(background_callback->is_ready()); + auto ret = journal_head.add_offset( + journal_type, + -static_cast<device_off_t>(config.max_journal_bytes), + roll_start, + roll_size); + return ret; +} + +journal_seq_t JournalTrimmerImpl::get_dirty_tail_target() const +{ + assert(background_callback->is_ready()); + auto ret = journal_head.add_offset( + journal_type, + -static_cast<device_off_t>(config.target_journal_dirty_bytes), + roll_start, + roll_size); + return ret; +} + +journal_seq_t JournalTrimmerImpl::get_alloc_tail_target() const +{ + assert(background_callback->is_ready()); + auto ret = journal_head.add_offset( + journal_type, + -static_cast<device_off_t>(config.target_journal_alloc_bytes), + roll_start, + roll_size); + return ret; +} + +std::size_t JournalTrimmerImpl::get_dirty_journal_size() const +{ + if (!background_callback->is_ready()) { + return 0; + } + auto ret = journal_head.relative_to( + journal_type, + journal_dirty_tail, + roll_start, + roll_size); + ceph_assert(ret >= 0); + return static_cast<std::size_t>(ret); +} + +std::size_t JournalTrimmerImpl::get_alloc_journal_size() const +{ + if (!background_callback->is_ready()) { + return 0; + } + auto ret = journal_head.relative_to( + journal_type, + journal_alloc_tail, + roll_start, + roll_size); + ceph_assert(ret >= 0); + return static_cast<std::size_t>(ret); +} + +seastar::future<> JournalTrimmerImpl::trim() { + return seastar::when_all( + [this] { + if (should_trim_alloc()) { + return trim_alloc( + ).handle_error( + crimson::ct_error::assert_all{ + "encountered invalid error in trim_alloc" + } + ); + } else { + return seastar::now(); + } + }, + [this] { + if (should_trim_dirty()) { + return trim_dirty( + ).handle_error( + crimson::ct_error::assert_all{ + "encountered invalid error in trim_dirty" + } + ); + } else { + return seastar::now(); + } + } + ).discard_result(); +} + +JournalTrimmerImpl::trim_ertr::future<> +JournalTrimmerImpl::trim_alloc() +{ + LOG_PREFIX(JournalTrimmerImpl::trim_alloc); + assert(background_callback->is_ready()); + return repeat_eagain([this, FNAME] { + return extent_callback->with_transaction_intr( + Transaction::src_t::TRIM_ALLOC, + "trim_alloc", + [this, FNAME](auto &t) + { + auto target = get_alloc_tail_target(); + DEBUGT("start, alloc_tail={}, target={}", + t, journal_alloc_tail, target); + return backref_manager.merge_cached_backrefs( + t, + target, + config.rewrite_backref_bytes_per_cycle + ).si_then([this, FNAME, &t](auto trim_alloc_to) + -> ExtentCallbackInterface::submit_transaction_direct_iertr::future<> + { + DEBUGT("trim_alloc_to={}", t, trim_alloc_to); + if (trim_alloc_to != JOURNAL_SEQ_NULL) { + return extent_callback->submit_transaction_direct( + t, std::make_optional<journal_seq_t>(trim_alloc_to)); + } + return seastar::now(); + }); + }); + }).safe_then([this, FNAME] { + DEBUG("finish, alloc_tail={}", journal_alloc_tail); + }); +} + +JournalTrimmerImpl::trim_ertr::future<> +JournalTrimmerImpl::trim_dirty() +{ + LOG_PREFIX(JournalTrimmerImpl::trim_dirty); + assert(background_callback->is_ready()); + return repeat_eagain([this, FNAME] { + return extent_callback->with_transaction_intr( + Transaction::src_t::TRIM_DIRTY, + "trim_dirty", + [this, FNAME](auto &t) + { + auto target = get_dirty_tail_target(); + DEBUGT("start, dirty_tail={}, target={}", + t, journal_dirty_tail, target); + return 
extent_callback->get_next_dirty_extents( + t, + target, + config.rewrite_dirty_bytes_per_cycle + ).si_then([this, FNAME, &t](auto dirty_list) { + DEBUGT("rewrite {} dirty extents", t, dirty_list.size()); + return seastar::do_with( + std::move(dirty_list), + [this, &t](auto &dirty_list) + { + return trans_intr::do_for_each( + dirty_list, + [this, &t](auto &e) { + return extent_callback->rewrite_extent( + t, e, INIT_GENERATION, NULL_TIME); + }); + }); + }).si_then([this, &t] { + return extent_callback->submit_transaction_direct(t); + }); + }); + }).safe_then([this, FNAME] { + DEBUG("finish, dirty_tail={}", journal_dirty_tail); + }); +} + +void JournalTrimmerImpl::register_metrics() +{ + namespace sm = seastar::metrics; + metrics.add_group("journal_trimmer", { + sm::make_counter("dirty_journal_bytes", + [this] { return get_dirty_journal_size(); }, + sm::description("the size of the journal for dirty extents")), + sm::make_counter("alloc_journal_bytes", + [this] { return get_alloc_journal_size(); }, + sm::description("the size of the journal for alloc info")) + }); +} + +std::ostream &operator<<( + std::ostream &os, const JournalTrimmerImpl::stat_printer_t &stats) +{ + os << "JournalTrimmer("; + if (stats.trimmer.background_callback->is_ready()) { + os << "should_block_io_on_trim=" << stats.trimmer.should_block_io_on_trim() + << ", should_(trim_dirty=" << stats.trimmer.should_trim_dirty() + << ", trim_alloc=" << stats.trimmer.should_trim_alloc() << ")"; + } else { + os << "not-ready"; + } + if (stats.detailed) { + os << ", journal_head=" << stats.trimmer.get_journal_head() + << ", alloc_tail=" << stats.trimmer.get_alloc_tail() + << ", dirty_tail=" << stats.trimmer.get_dirty_tail(); + if (stats.trimmer.background_callback->is_ready()) { + os << ", alloc_tail_target=" << stats.trimmer.get_alloc_tail_target() + << ", dirty_tail_target=" << stats.trimmer.get_dirty_tail_target() + << ", tail_limit=" << stats.trimmer.get_tail_limit(); + } + } + os << ")"; + return os; +} + +bool SpaceTrackerSimple::equals(const SpaceTrackerI &_other) const +{ + LOG_PREFIX(SpaceTrackerSimple::equals); + const auto &other = static_cast<const SpaceTrackerSimple&>(_other); + + if (other.live_bytes_by_segment.size() != live_bytes_by_segment.size()) { + ERROR("different segment counts, bug in test"); + assert(0 == "segment counts should match"); + return false; + } + + bool all_match = true; + for (auto i = live_bytes_by_segment.begin(), j = other.live_bytes_by_segment.begin(); + i != live_bytes_by_segment.end(); ++i, ++j) { + if (i->second.live_bytes != j->second.live_bytes) { + all_match = false; + DEBUG("segment_id {} live bytes mismatch *this: {}, other: {}", + i->first, i->second.live_bytes, j->second.live_bytes); + } + } + return all_match; +} + +int64_t SpaceTrackerDetailed::SegmentMap::allocate( + device_segment_id_t segment, + segment_off_t offset, + extent_len_t len, + const extent_len_t block_size) +{ + LOG_PREFIX(SegmentMap::allocate); + assert(offset % block_size == 0); + assert(len % block_size == 0); + + const auto b = (offset / block_size); + const auto e = (offset + len) / block_size; + + bool error = false; + for (auto i = b; i < e; ++i) { + if (bitmap[i]) { + if (!error) { + ERROR("found allocated in {}, {} ~ {}", segment, offset, len); + error = true; + } + DEBUG("block {} allocated", i * block_size); + } + bitmap[i] = true; + } + return update_usage(len); +} + +int64_t SpaceTrackerDetailed::SegmentMap::release( + device_segment_id_t segment, + segment_off_t offset, + extent_len_t len, + const 
extent_len_t block_size) +{ + LOG_PREFIX(SegmentMap::release); + assert(offset % block_size == 0); + assert(len % block_size == 0); + + const auto b = (offset / block_size); + const auto e = (offset + len) / block_size; + + bool error = false; + for (auto i = b; i < e; ++i) { + if (!bitmap[i]) { + if (!error) { + ERROR("found unallocated in {}, {} ~ {}", segment, offset, len); + error = true; + } + DEBUG("block {} unallocated", i * block_size); + } + bitmap[i] = false; + } + return update_usage(-(int64_t)len); +} + +bool SpaceTrackerDetailed::equals(const SpaceTrackerI &_other) const +{ + LOG_PREFIX(SpaceTrackerDetailed::equals); + const auto &other = static_cast<const SpaceTrackerDetailed&>(_other); + + if (other.segment_usage.size() != segment_usage.size()) { + ERROR("different segment counts, bug in test"); + assert(0 == "segment counts should match"); + return false; + } + + bool all_match = true; + for (auto i = segment_usage.begin(), j = other.segment_usage.begin(); + i != segment_usage.end(); ++i, ++j) { + if (i->second.get_usage() != j->second.get_usage()) { + all_match = false; + ERROR("segment_id {} live bytes mismatch *this: {}, other: {}", + i->first, i->second.get_usage(), j->second.get_usage()); + } + } + return all_match; +} + +void SpaceTrackerDetailed::SegmentMap::dump_usage(extent_len_t block_size) const +{ + LOG_PREFIX(SegmentMap::dump_usage); + INFO("dump start"); + for (unsigned i = 0; i < bitmap.size(); ++i) { + if (bitmap[i]) { + LOCAL_LOGGER.info(" {} still live", i * block_size); + } + } +} + +void SpaceTrackerDetailed::dump_usage(segment_id_t id) const +{ + LOG_PREFIX(SpaceTrackerDetailed::dump_usage); + INFO("{}", id); + segment_usage[id].dump_usage( + block_size_by_segment_manager[id.device_id()]); +} + +void SpaceTrackerSimple::dump_usage(segment_id_t id) const +{ + LOG_PREFIX(SpaceTrackerSimple::dump_usage); + INFO("id: {}, live_bytes: {}", + id, live_bytes_by_segment[id].live_bytes); +} + +std::ostream &operator<<( + std::ostream &os, const AsyncCleaner::stat_printer_t &stats) +{ + stats.cleaner.print(os, stats.detailed); + return os; +} + +SegmentCleaner::SegmentCleaner( + config_t config, + SegmentManagerGroupRef&& sm_group, + BackrefManager &backref_manager, + SegmentSeqAllocator &segment_seq_allocator, + bool detailed, + bool is_cold) + : detailed(detailed), + is_cold(is_cold), + config(config), + sm_group(std::move(sm_group)), + backref_manager(backref_manager), + ool_segment_seq_allocator(segment_seq_allocator) +{ + config.validate(); +} + +void SegmentCleaner::register_metrics() +{ + namespace sm = seastar::metrics; + stats.segment_util.buckets.resize(UTIL_BUCKETS); + std::size_t i; + for (i = 0; i < UTIL_BUCKETS; ++i) { + stats.segment_util.buckets[i].upper_bound = ((double)(i + 1)) / 10; + stats.segment_util.buckets[i].count = 0; + } + // NOTE: by default the segments are empty + i = get_bucket_index(UTIL_STATE_EMPTY); + stats.segment_util.buckets[i].count = segments.get_num_segments(); + + std::string prefix; + if (is_cold) { + prefix.append("cold_"); + } + prefix.append("segment_cleaner"); + + metrics.add_group(prefix, { + sm::make_counter("segments_number", + [this] { return segments.get_num_segments(); }, + sm::description("the number of segments")), + sm::make_counter("segment_size", + [this] { return segments.get_segment_size(); }, + sm::description("the bytes of a segment")), + sm::make_counter("segments_in_journal", + [this] { return get_segments_in_journal(); }, + sm::description("the number of segments in journal")), + 
sm::make_counter("segments_type_journal", + [this] { return segments.get_num_type_journal(); }, + sm::description("the number of segments typed journal")), + sm::make_counter("segments_type_ool", + [this] { return segments.get_num_type_ool(); }, + sm::description("the number of segments typed out-of-line")), + sm::make_counter("segments_open", + [this] { return segments.get_num_open(); }, + sm::description("the number of open segments")), + sm::make_counter("segments_empty", + [this] { return segments.get_num_empty(); }, + sm::description("the number of empty segments")), + sm::make_counter("segments_closed", + [this] { return segments.get_num_closed(); }, + sm::description("the number of closed segments")), + + sm::make_counter("segments_count_open_journal", + [this] { return segments.get_count_open_journal(); }, + sm::description("the count of open journal segment operations")), + sm::make_counter("segments_count_open_ool", + [this] { return segments.get_count_open_ool(); }, + sm::description("the count of open ool segment operations")), + sm::make_counter("segments_count_release_journal", + [this] { return segments.get_count_release_journal(); }, + sm::description("the count of release journal segment operations")), + sm::make_counter("segments_count_release_ool", + [this] { return segments.get_count_release_ool(); }, + sm::description("the count of release ool segment operations")), + sm::make_counter("segments_count_close_journal", + [this] { return segments.get_count_close_journal(); }, + sm::description("the count of close journal segment operations")), + sm::make_counter("segments_count_close_ool", + [this] { return segments.get_count_close_ool(); }, + sm::description("the count of close ool segment operations")), + + sm::make_counter("total_bytes", + [this] { return segments.get_total_bytes(); }, + sm::description("the size of the space")), + sm::make_counter("available_bytes", + [this] { return segments.get_available_bytes(); }, + sm::description("the size of the space is available")), + sm::make_counter("unavailable_unreclaimable_bytes", + [this] { return get_unavailable_unreclaimable_bytes(); }, + sm::description("the size of the space is unavailable and unreclaimable")), + sm::make_counter("unavailable_reclaimable_bytes", + [this] { return get_unavailable_reclaimable_bytes(); }, + sm::description("the size of the space is unavailable and reclaimable")), + sm::make_counter("used_bytes", stats.used_bytes, + sm::description("the size of the space occupied by live extents")), + sm::make_counter("unavailable_unused_bytes", + [this] { return get_unavailable_unused_bytes(); }, + sm::description("the size of the space is unavailable and not alive")), + + sm::make_counter("projected_count", stats.projected_count, + sm::description("the number of projected usage reservations")), + sm::make_counter("projected_used_bytes_sum", stats.projected_used_bytes_sum, + sm::description("the sum of the projected usage in bytes")), + + sm::make_counter("reclaimed_bytes", stats.reclaimed_bytes, + sm::description("rewritten bytes due to reclaim")), + sm::make_counter("reclaimed_segment_bytes", stats.reclaimed_segment_bytes, + sm::description("rewritten bytes due to reclaim")), + sm::make_counter("closed_journal_used_bytes", stats.closed_journal_used_bytes, + sm::description("used bytes when close a journal segment")), + sm::make_counter("closed_journal_total_bytes", stats.closed_journal_total_bytes, + sm::description("total bytes of closed journal segments")), + 
sm::make_counter("closed_ool_used_bytes", stats.closed_ool_used_bytes, + sm::description("used bytes when close a ool segment")), + sm::make_counter("closed_ool_total_bytes", stats.closed_ool_total_bytes, + sm::description("total bytes of closed ool segments")), + + sm::make_gauge("available_ratio", + [this] { return segments.get_available_ratio(); }, + sm::description("ratio of available space to total space")), + sm::make_gauge("reclaim_ratio", + [this] { return get_reclaim_ratio(); }, + sm::description("ratio of reclaimable space to unavailable space")), + + sm::make_histogram("segment_utilization_distribution", + [this]() -> seastar::metrics::histogram& { + return stats.segment_util; + }, + sm::description("utilization distribution of all segments")) + }); +} + +segment_id_t SegmentCleaner::allocate_segment( + segment_seq_t seq, + segment_type_t type, + data_category_t category, + rewrite_gen_t generation) +{ + LOG_PREFIX(SegmentCleaner::allocate_segment); + assert(seq != NULL_SEG_SEQ); + ceph_assert(type == segment_type_t::OOL || + trimmer != nullptr); // segment_type_t::JOURNAL + for (auto it = segments.begin(); + it != segments.end(); + ++it) { + auto seg_id = it->first; + auto& segment_info = it->second; + if (segment_info.is_empty()) { + auto old_usage = calc_utilization(seg_id); + segments.mark_open(seg_id, seq, type, category, generation); + background_callback->maybe_wake_background(); + auto new_usage = calc_utilization(seg_id); + adjust_segment_util(old_usage, new_usage); + INFO("opened, {}", stat_printer_t{*this, false}); + return seg_id; + } + } + ERROR("out of space with {} {} {} {}", + type, segment_seq_printer_t{seq}, category, + rewrite_gen_printer_t{generation}); + ceph_abort("seastore device size setting is too small"); + return NULL_SEG_ID; +} + +void SegmentCleaner::close_segment(segment_id_t segment) +{ + LOG_PREFIX(SegmentCleaner::close_segment); + auto old_usage = calc_utilization(segment); + segments.mark_closed(segment); + auto &seg_info = segments[segment]; + if (seg_info.type == segment_type_t::JOURNAL) { + stats.closed_journal_used_bytes += space_tracker->get_usage(segment); + stats.closed_journal_total_bytes += segments.get_segment_size(); + } else { + stats.closed_ool_used_bytes += space_tracker->get_usage(segment); + stats.closed_ool_total_bytes += segments.get_segment_size(); + } + auto new_usage = calc_utilization(segment); + adjust_segment_util(old_usage, new_usage); + INFO("closed, {} -- {}", stat_printer_t{*this, false}, seg_info); +} + +double SegmentCleaner::calc_gc_benefit_cost( + segment_id_t id, + const sea_time_point &now_time, + const sea_time_point &bound_time) const +{ + double util = calc_utilization(id); + ceph_assert(util >= 0 && util < 1); + if constexpr (gc_formula == gc_formula_t::GREEDY) { + return 1 - util; + } + + if constexpr (gc_formula == gc_formula_t::COST_BENEFIT) { + if (util == 0) { + return std::numeric_limits<double>::max(); + } + auto modify_time = segments[id].modify_time; + double age_segment = modify_time.time_since_epoch().count(); + double age_now = now_time.time_since_epoch().count(); + if (likely(age_now > age_segment)) { + return (1 - util) * (age_now - age_segment) / (2 * util); + } else { + // time is wrong + return (1 - util) / (2 * util); + } + } + + assert(gc_formula == gc_formula_t::BENEFIT); + auto modify_time = segments[id].modify_time; + double age_factor = 0.5; // middle value if age is invalid + if (likely(bound_time != NULL_TIME && + modify_time != NULL_TIME && + now_time > modify_time)) { + 
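calc_gc_benefit_cost() above scores reclaim candidates; the BENEFIT branch completes just below. A numeric comparison with assumed utilizations and ages (arbitrary units, not values from the patch) shows why COST_BENEFIT tends to pick old, partly-used segments over young, emptier ones:

```cpp
// Illustrative comparison only -- every number here is assumed.
constexpr double util_a = 0.20, age_a = 1000.0;  // old segment, 20% live
constexpr double util_b = 0.10, age_b = 50.0;    // young segment, 10% live
// GREEDY: score = 1 - util            -> A: 0.80, B: 0.90   => picks B
// COST_BENEFIT: (1 - util) * age / (2 * util)
constexpr double cb_a = (1 - util_a) * age_a / (2 * util_a);  // 2000
constexpr double cb_b = (1 - util_b) * age_b / (2 * util_b);  //  225 => picks A
// B would free a little more space right now, but A's data is much colder, so
// rewriting A's 20% of live extents is less likely to be wasted work.
```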
assert(modify_time >= bound_time); + double age_bound = bound_time.time_since_epoch().count(); + double age_now = now_time.time_since_epoch().count(); + double age_segment = modify_time.time_since_epoch().count(); + age_factor = (age_now - age_segment) / (age_now - age_bound); + } + return ((1 - 2 * age_factor) * util * util + + (2 * age_factor - 2) * util + 1); +} + +SegmentCleaner::do_reclaim_space_ret +SegmentCleaner::do_reclaim_space( + const std::vector<CachedExtentRef> &backref_extents, + const backref_pin_list_t &pin_list, + std::size_t &reclaimed, + std::size_t &runs) +{ + return repeat_eagain([this, &backref_extents, + &pin_list, &reclaimed, &runs] { + reclaimed = 0; + runs++; + auto src = Transaction::src_t::CLEANER_MAIN; + if (is_cold) { + src = Transaction::src_t::CLEANER_COLD; + } + return extent_callback->with_transaction_intr( + src, + "clean_reclaim_space", + [this, &backref_extents, &pin_list, &reclaimed](auto &t) + { + return seastar::do_with( + std::vector<CachedExtentRef>(backref_extents), + [this, &t, &reclaimed, &pin_list](auto &extents) + { + LOG_PREFIX(SegmentCleaner::do_reclaim_space); + // calculate live extents + auto cached_backref_entries = + backref_manager.get_cached_backref_entries_in_range( + reclaim_state->start_pos, reclaim_state->end_pos); + backref_entry_query_set_t backref_entries; + for (auto &pin : pin_list) { + backref_entries.emplace( + pin->get_key(), + pin->get_val(), + pin->get_length(), + pin->get_type(), + JOURNAL_SEQ_NULL); + } + for (auto &cached_backref : cached_backref_entries) { + if (cached_backref.laddr == L_ADDR_NULL) { + auto it = backref_entries.find(cached_backref.paddr); + assert(it->len == cached_backref.len); + backref_entries.erase(it); + } else { + backref_entries.emplace(cached_backref); + } + } + // retrieve live extents + DEBUGT("start, backref_entries={}, backref_extents={}", + t, backref_entries.size(), extents.size()); + return seastar::do_with( + std::move(backref_entries), + [this, &extents, &t](auto &backref_entries) { + return trans_intr::parallel_for_each( + backref_entries, + [this, &extents, &t](auto &ent) + { + LOG_PREFIX(SegmentCleaner::do_reclaim_space); + TRACET("getting extent of type {} at {}~{}", + t, + ent.type, + ent.paddr, + ent.len); + return extent_callback->get_extents_if_live( + t, ent.type, ent.paddr, ent.laddr, ent.len + ).si_then([FNAME, &extents, &ent, &t](auto list) { + if (list.empty()) { + TRACET("addr {} dead, skipping", t, ent.paddr); + } else { + for (auto &e : list) { + extents.emplace_back(std::move(e)); + } + } + }); + }); + }).si_then([FNAME, &extents, this, &reclaimed, &t] { + DEBUGT("reclaim {} extents", t, extents.size()); + // rewrite live extents + auto modify_time = segments[reclaim_state->get_segment_id()].modify_time; + return trans_intr::do_for_each( + extents, + [this, modify_time, &t, &reclaimed](auto ext) + { + reclaimed += ext->get_length(); + return extent_callback->rewrite_extent( + t, ext, reclaim_state->target_generation, modify_time); + }); + }); + }).si_then([this, &t] { + return extent_callback->submit_transaction_direct(t); + }); + }); + }); +} + +SegmentCleaner::clean_space_ret SegmentCleaner::clean_space() +{ + LOG_PREFIX(SegmentCleaner::clean_space); + assert(background_callback->is_ready()); + ceph_assert(can_clean_space()); + if (!reclaim_state) { + segment_id_t seg_id = get_next_reclaim_segment(); + auto &segment_info = segments[seg_id]; + INFO("reclaim {} {} start, usage={}, time_bound={}", + seg_id, segment_info, + space_tracker->calc_utilization(seg_id), + 
sea_time_point_printer_t{segments.get_time_bound()}); + ceph_assert(segment_info.is_closed()); + reclaim_state = reclaim_state_t::create( + seg_id, segment_info.generation, segments.get_segment_size()); + } + reclaim_state->advance(config.reclaim_bytes_per_cycle); + + DEBUG("reclaiming {} {}~{}", + rewrite_gen_printer_t{reclaim_state->generation}, + reclaim_state->start_pos, + reclaim_state->end_pos); + double pavail_ratio = get_projected_available_ratio(); + sea_time_point start = seastar::lowres_system_clock::now(); + + // Backref-tree doesn't support tree-read during tree-updates with parallel + // transactions. So, concurrent transactions between trim and reclaim are + // not allowed right now. + return seastar::do_with( + std::pair<std::vector<CachedExtentRef>, backref_pin_list_t>(), + [this](auto &weak_read_ret) { + return repeat_eagain([this, &weak_read_ret] { + return extent_callback->with_transaction_intr( + Transaction::src_t::READ, + "retrieve_from_backref_tree", + [this, &weak_read_ret](auto &t) { + return backref_manager.get_mappings( + t, + reclaim_state->start_pos, + reclaim_state->end_pos + ).si_then([this, &t, &weak_read_ret](auto pin_list) { + if (!pin_list.empty()) { + auto it = pin_list.begin(); + auto &first_pin = *it; + if (first_pin->get_key() < reclaim_state->start_pos) { + // BackrefManager::get_mappings may include a entry before + // reclaim_state->start_pos, which is semantically inconsistent + // with the requirements of the cleaner + pin_list.erase(it); + } + } + return backref_manager.retrieve_backref_extents_in_range( + t, + reclaim_state->start_pos, + reclaim_state->end_pos + ).si_then([pin_list=std::move(pin_list), + &weak_read_ret](auto extents) mutable { + weak_read_ret = std::make_pair(std::move(extents), std::move(pin_list)); + }); + }); + }); + }).safe_then([&weak_read_ret] { + return std::move(weak_read_ret); + }); + }).safe_then([this, FNAME, pavail_ratio, start](auto weak_read_ret) { + return seastar::do_with( + std::move(weak_read_ret.first), + std::move(weak_read_ret.second), + (size_t)0, + (size_t)0, + [this, FNAME, pavail_ratio, start]( + auto &backref_extents, auto &pin_list, auto &reclaimed, auto &runs) + { + return do_reclaim_space( + backref_extents, + pin_list, + reclaimed, + runs + ).safe_then([this, FNAME, pavail_ratio, start, &reclaimed, &runs] { + stats.reclaiming_bytes += reclaimed; + auto d = seastar::lowres_system_clock::now() - start; + DEBUG("duration: {}, pavail_ratio before: {}, repeats: {}", + d, pavail_ratio, runs); + if (reclaim_state->is_complete()) { + auto segment_to_release = reclaim_state->get_segment_id(); + INFO("reclaim {} finish, reclaimed alive/total={}", + segment_to_release, + stats.reclaiming_bytes/(double)segments.get_segment_size()); + stats.reclaimed_bytes += stats.reclaiming_bytes; + stats.reclaimed_segment_bytes += segments.get_segment_size(); + stats.reclaiming_bytes = 0; + reclaim_state.reset(); + return sm_group->release_segment(segment_to_release + ).handle_error( + clean_space_ertr::pass_further{}, + crimson::ct_error::assert_all{ + "SegmentCleaner::clean_space encountered invalid error in release_segment" + } + ).safe_then([this, FNAME, segment_to_release] { + auto old_usage = calc_utilization(segment_to_release); + if(unlikely(old_usage != 0)) { + space_tracker->dump_usage(segment_to_release); + ERROR("segment {} old_usage {} != 0", + segment_to_release, old_usage); + ceph_abort(); + } + segments.mark_empty(segment_to_release); + auto new_usage = calc_utilization(segment_to_release); + 
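clean_space() above processes the victim segment in fixed-size windows: each pass advances reclaim_state by reclaim_bytes_per_cycle, rewrites the live extents found in [start_pos, end_pos), and the segment is only released once is_complete() reports that the whole range has been covered (the release path continues below). Rough arithmetic under assumed sizes:

```cpp
// Illustrative arithmetic only -- both values are assumptions, not defaults
// taken from this patch.
constexpr std::size_t segment_size            = 64 << 20;  // 64 MiB
constexpr std::size_t reclaim_bytes_per_cycle = 4 << 20;   //  4 MiB per window
constexpr std::size_t cycles_per_segment =
    segment_size / reclaim_bytes_per_cycle;                // 16 clean_space() passes
```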
adjust_segment_util(old_usage, new_usage); + INFO("released {}, {}", + segment_to_release, stat_printer_t{*this, false}); + background_callback->maybe_wake_blocked_io(); + }); + } else { + return clean_space_ertr::now(); + } + }); + }); + }); +} + +SegmentCleaner::mount_ret SegmentCleaner::mount() +{ + LOG_PREFIX(SegmentCleaner::mount); + const auto& sms = sm_group->get_segment_managers(); + INFO("{} segment managers", sms.size()); + + assert(background_callback->get_state() == state_t::MOUNT); + + space_tracker.reset( + detailed ? + (SpaceTrackerI*)new SpaceTrackerDetailed( + sms) : + (SpaceTrackerI*)new SpaceTrackerSimple( + sms)); + + segments.reset(); + for (auto sm : sms) { + segments.add_segment_manager(*sm); + } + segments.assign_ids(); + + stats = {}; + metrics.clear(); + register_metrics(); + + INFO("{} segments", segments.get_num_segments()); + return crimson::do_for_each( + segments.begin(), + segments.end(), + [this, FNAME](auto& it) + { + auto segment_id = it.first; + return sm_group->read_segment_header( + segment_id + ).safe_then([segment_id, this, FNAME](auto header) { + DEBUG("segment_id={} -- {}", segment_id, header); + auto s_type = header.get_type(); + if (s_type == segment_type_t::NULL_SEG) { + ERROR("got null segment, segment_id={} -- {}", segment_id, header); + ceph_abort(); + } + return sm_group->read_segment_tail( + segment_id + ).safe_then([this, FNAME, segment_id, header](auto tail) + -> scan_extents_ertr::future<> { + if (tail.segment_nonce != header.segment_nonce) { + return scan_no_tail_segment(header, segment_id); + } + ceph_assert(header.get_type() == tail.get_type()); + + sea_time_point modify_time = mod_to_timepoint(tail.modify_time); + std::size_t num_extents = tail.num_extents; + if ((modify_time == NULL_TIME && num_extents == 0) || + (modify_time != NULL_TIME && num_extents != 0)) { + segments.update_modify_time(segment_id, modify_time, num_extents); + } else { + ERROR("illegal modify time {}", tail); + return crimson::ct_error::input_output_error::make(); + } + + init_mark_segment_closed( + segment_id, + header.segment_seq, + header.type, + header.category, + header.generation); + return seastar::now(); + }).handle_error( + crimson::ct_error::enodata::handle( + [this, header, segment_id](auto) { + return scan_no_tail_segment(header, segment_id); + }), + crimson::ct_error::pass_further_all{} + ); + }).handle_error( + crimson::ct_error::enoent::handle([](auto) { + return mount_ertr::now(); + }), + crimson::ct_error::enodata::handle([](auto) { + return mount_ertr::now(); + }), + crimson::ct_error::input_output_error::pass_further{}, + crimson::ct_error::assert_all{"unexpected error"} + ); + }).safe_then([this, FNAME] { + INFO("done, {}", segments); + }); +} + +SegmentCleaner::scan_extents_ret SegmentCleaner::scan_no_tail_segment( + const segment_header_t &segment_header, + segment_id_t segment_id) +{ + LOG_PREFIX(SegmentCleaner::scan_no_tail_segment); + INFO("scan {} {}", segment_id, segment_header); + return seastar::do_with( + scan_valid_records_cursor({ + segments[segment_id].seq, + paddr_t::make_seg_paddr(segment_id, 0) + }), + SegmentManagerGroup::found_record_handler_t( + [this, segment_id, segment_header, FNAME]( + record_locator_t locator, + const record_group_header_t &record_group_header, + const bufferlist& mdbuf + ) mutable -> SegmentManagerGroup::scan_valid_records_ertr::future<> + { + DEBUG("{} {}, decoding {} records", + segment_id, segment_header.get_type(), record_group_header.records); + + auto maybe_headers = try_decode_record_headers( 
+ record_group_header, mdbuf); + if (!maybe_headers) { + // This should be impossible, we did check the crc on the mdbuf + ERROR("unable to decode record headers for record group {}", + locator.record_block_base); + return crimson::ct_error::input_output_error::make(); + } + + for (auto &record_header : *maybe_headers) { + auto modify_time = mod_to_timepoint(record_header.modify_time); + if (record_header.extents == 0 || modify_time != NULL_TIME) { + segments.update_modify_time( + segment_id, modify_time, record_header.extents); + } else { + ERROR("illegal modify time {}", record_header); + return crimson::ct_error::input_output_error::make(); + } + } + return seastar::now(); + }), + [this, segment_header](auto &cursor, auto &handler) + { + return sm_group->scan_valid_records( + cursor, + segment_header.segment_nonce, + segments.get_segment_size(), + handler).discard_result(); + }).safe_then([this, segment_id, segment_header] { + init_mark_segment_closed( + segment_id, + segment_header.segment_seq, + segment_header.type, + segment_header.category, + segment_header.generation); + }); +} + +bool SegmentCleaner::check_usage() +{ + SpaceTrackerIRef tracker(space_tracker->make_empty()); + extent_callback->with_transaction_weak( + "check_usage", + [this, &tracker](auto &t) { + return backref_manager.scan_mapped_space( + t, + [&tracker]( + paddr_t paddr, + paddr_t backref_key, + extent_len_t len, + extent_types_t type, + laddr_t laddr) + { + if (paddr.get_addr_type() == paddr_types_t::SEGMENT) { + if (is_backref_node(type)) { + assert(laddr == L_ADDR_NULL); + assert(backref_key != P_ADDR_NULL); + tracker->allocate( + paddr.as_seg_paddr().get_segment_id(), + paddr.as_seg_paddr().get_segment_off(), + len); + } else if (laddr == L_ADDR_NULL) { + assert(backref_key == P_ADDR_NULL); + tracker->release( + paddr.as_seg_paddr().get_segment_id(), + paddr.as_seg_paddr().get_segment_off(), + len); + } else { + assert(backref_key == P_ADDR_NULL); + tracker->allocate( + paddr.as_seg_paddr().get_segment_id(), + paddr.as_seg_paddr().get_segment_off(), + len); + } + } + }); + }).unsafe_get0(); + return space_tracker->equals(*tracker); +} + +void SegmentCleaner::mark_space_used( + paddr_t addr, + extent_len_t len) +{ + LOG_PREFIX(SegmentCleaner::mark_space_used); + assert(background_callback->get_state() >= state_t::SCAN_SPACE); + assert(len); + // TODO: drop + if (addr.get_addr_type() != paddr_types_t::SEGMENT) { + return; + } + + auto& seg_addr = addr.as_seg_paddr(); + stats.used_bytes += len; + auto old_usage = calc_utilization(seg_addr.get_segment_id()); + [[maybe_unused]] auto ret = space_tracker->allocate( + seg_addr.get_segment_id(), + seg_addr.get_segment_off(), + len); + auto new_usage = calc_utilization(seg_addr.get_segment_id()); + adjust_segment_util(old_usage, new_usage); + + background_callback->maybe_wake_background(); + assert(ret > 0); + DEBUG("segment {} new len: {}~{}, live_bytes: {}", + seg_addr.get_segment_id(), + addr, + len, + space_tracker->get_usage(seg_addr.get_segment_id())); +} + +void SegmentCleaner::mark_space_free( + paddr_t addr, + extent_len_t len) +{ + LOG_PREFIX(SegmentCleaner::mark_space_free); + assert(background_callback->get_state() >= state_t::SCAN_SPACE); + assert(len); + // TODO: drop + if (addr.get_addr_type() != paddr_types_t::SEGMENT) { + return; + } + + ceph_assert(stats.used_bytes >= len); + stats.used_bytes -= len; + auto& seg_addr = addr.as_seg_paddr(); + + DEBUG("segment {} free len: {}~{}", + seg_addr.get_segment_id(), addr, len); + auto old_usage = 
calc_utilization(seg_addr.get_segment_id()); + [[maybe_unused]] auto ret = space_tracker->release( + seg_addr.get_segment_id(), + seg_addr.get_segment_off(), + len); + auto new_usage = calc_utilization(seg_addr.get_segment_id()); + adjust_segment_util(old_usage, new_usage); + background_callback->maybe_wake_blocked_io(); + assert(ret >= 0); + DEBUG("segment {} free len: {}~{}, live_bytes: {}", + seg_addr.get_segment_id(), + addr, + len, + space_tracker->get_usage(seg_addr.get_segment_id())); +} + +segment_id_t SegmentCleaner::get_next_reclaim_segment() const +{ + LOG_PREFIX(SegmentCleaner::get_next_reclaim_segment); + segment_id_t id = NULL_SEG_ID; + double max_benefit_cost = 0; + sea_time_point now_time; + if constexpr (gc_formula != gc_formula_t::GREEDY) { + now_time = seastar::lowres_system_clock::now(); + } else { + now_time = NULL_TIME; + } + sea_time_point bound_time; + if constexpr (gc_formula == gc_formula_t::BENEFIT) { + bound_time = segments.get_time_bound(); + if (bound_time == NULL_TIME) { + WARN("BENEFIT -- bound_time is NULL_TIME"); + } + } else { + bound_time = NULL_TIME; + } + for (auto& [_id, segment_info] : segments) { + if (segment_info.is_closed() && + (trimmer == nullptr || + !segment_info.is_in_journal(trimmer->get_journal_tail()))) { + double benefit_cost = calc_gc_benefit_cost(_id, now_time, bound_time); + if (benefit_cost > max_benefit_cost) { + id = _id; + max_benefit_cost = benefit_cost; + } + } + } + if (id != NULL_SEG_ID) { + DEBUG("segment {}, benefit_cost {}", + id, max_benefit_cost); + return id; + } else { + ceph_assert(get_segments_reclaimable() == 0); + // see should_clean_space() + ceph_abort("impossible!"); + return NULL_SEG_ID; + } +} + +bool SegmentCleaner::try_reserve_projected_usage(std::size_t projected_usage) +{ + assert(background_callback->is_ready()); + stats.projected_used_bytes += projected_usage; + if (should_block_io_on_clean()) { + stats.projected_used_bytes -= projected_usage; + return false; + } else { + ++stats.projected_count; + stats.projected_used_bytes_sum += stats.projected_used_bytes; + return true; + } +} + +void SegmentCleaner::release_projected_usage(std::size_t projected_usage) +{ + assert(background_callback->is_ready()); + ceph_assert(stats.projected_used_bytes >= projected_usage); + stats.projected_used_bytes -= projected_usage; + background_callback->maybe_wake_blocked_io(); +} + +void SegmentCleaner::print(std::ostream &os, bool is_detailed) const +{ + os << "SegmentCleaner("; + if (background_callback->is_ready()) { + os << "should_block_io_on_clean=" << should_block_io_on_clean() + << ", should_clean=" << should_clean_space(); + } else { + os << "not-ready"; + } + os << ", projected_avail_ratio=" << get_projected_available_ratio() + << ", reclaim_ratio=" << get_reclaim_ratio() + << ", alive_ratio=" << get_alive_ratio(); + if (is_detailed) { + os << ", unavailable_unreclaimable=" + << get_unavailable_unreclaimable_bytes() << "B" + << ", unavailable_reclaimble=" + << get_unavailable_reclaimable_bytes() << "B" + << ", alive=" << stats.used_bytes << "B" + << ", " << segments; + } + os << ")"; +} + +RBMCleaner::RBMCleaner( + RBMDeviceGroupRef&& rb_group, + BackrefManager &backref_manager, + bool detailed) + : detailed(detailed), + rb_group(std::move(rb_group)), + backref_manager(backref_manager) +{} + +void RBMCleaner::print(std::ostream &os, bool is_detailed) const +{ + // TODO + return; +} + +void RBMCleaner::mark_space_used( + paddr_t addr, + extent_len_t len) +{ + LOG_PREFIX(RBMCleaner::mark_space_used); + 
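SegmentCleaner::try_reserve_projected_usage() and release_projected_usage() above give writers back-pressure: a reservation is refused while the cleaner considers the store too close to its clean-space limit (RBMCleaner's variant below currently always accepts). A sketch of the expected pairing on the caller side; the function name and the spinning retry are assumptions, real callers wait on the background callback instead:

```cpp
// Illustrative pairing only -- not how the actual call sites are written.
void reserve_write_release(SegmentCleaner &cleaner, std::size_t bytes)
{
  while (!cleaner.try_reserve_projected_usage(bytes)) {
    // refused: the cleaner would otherwise have to block IO; a real caller
    // yields here until maybe_wake_blocked_io() fires rather than busy-looping
  }
  // ... submit the transaction that will consume `bytes` ...
  cleaner.release_projected_usage(bytes);
}
```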
assert(addr.get_addr_type() == paddr_types_t::RANDOM_BLOCK); + auto rbms = rb_group->get_rb_managers(); + for (auto rbm : rbms) { + if (addr.get_device_id() == rbm->get_device_id()) { + if (rbm->get_start() <= addr) { + INFO("allocate addr: {} len: {}", addr, len); + stats.used_bytes += len; + rbm->mark_space_used(addr, len); + } + return; + } + } +} + +void RBMCleaner::mark_space_free( + paddr_t addr, + extent_len_t len) +{ + LOG_PREFIX(RBMCleaner::mark_space_free); + assert(addr.get_addr_type() == paddr_types_t::RANDOM_BLOCK); + auto rbms = rb_group->get_rb_managers(); + for (auto rbm : rbms) { + if (addr.get_device_id() == rbm->get_device_id()) { + if (rbm->get_start() <= addr) { + INFO("free addr: {} len: {}", addr, len); + ceph_assert(stats.used_bytes >= len); + stats.used_bytes -= len; + rbm->mark_space_free(addr, len); + } + return; + } + } +} + +void RBMCleaner::commit_space_used(paddr_t addr, extent_len_t len) +{ + auto rbms = rb_group->get_rb_managers(); + for (auto rbm : rbms) { + if (addr.get_device_id() == rbm->get_device_id()) { + if (rbm->get_start() <= addr) { + rbm->complete_allocation(addr, len); + } + return; + } + } +} + +bool RBMCleaner::try_reserve_projected_usage(std::size_t projected_usage) +{ + assert(background_callback->is_ready()); + stats.projected_used_bytes += projected_usage; + return true; +} + +void RBMCleaner::release_projected_usage(std::size_t projected_usage) +{ + assert(background_callback->is_ready()); + ceph_assert(stats.projected_used_bytes >= projected_usage); + stats.projected_used_bytes -= projected_usage; + background_callback->maybe_wake_blocked_io(); +} + +RBMCleaner::clean_space_ret RBMCleaner::clean_space() +{ + // TODO + return clean_space_ertr::now(); +} + +RBMCleaner::mount_ret RBMCleaner::mount() +{ + stats = {}; + register_metrics(); + return seastar::do_with( + rb_group->get_rb_managers(), + [](auto &rbs) { + return crimson::do_for_each( + rbs.begin(), + rbs.end(), + [](auto& it) { + return it->open( + ).handle_error( + crimson::ct_error::input_output_error::pass_further(), + crimson::ct_error::assert_all{ + "Invalid error when opening RBM"} + ); + }); + }); +} + +bool RBMCleaner::check_usage() +{ + assert(detailed); + const auto& rbms = rb_group->get_rb_managers(); + RBMSpaceTracker tracker(rbms); + extent_callback->with_transaction_weak( + "check_usage", + [this, &tracker, &rbms](auto &t) { + return backref_manager.scan_mapped_space( + t, + [&tracker, &rbms]( + paddr_t paddr, + paddr_t backref_key, + extent_len_t len, + extent_types_t type, + laddr_t laddr) + { + for (auto rbm : rbms) { + if (rbm->get_device_id() == paddr.get_device_id()) { + if (is_backref_node(type)) { + assert(laddr == L_ADDR_NULL); + assert(backref_key != P_ADDR_NULL); + tracker.allocate( + paddr, + len); + } else if (laddr == L_ADDR_NULL) { + assert(backref_key == P_ADDR_NULL); + tracker.release( + paddr, + len); + } else { + assert(backref_key == P_ADDR_NULL); + tracker.allocate( + paddr, + len); + } + } + } + }); + }).unsafe_get0(); + return equals(tracker); +} + +bool RBMCleaner::equals(const RBMSpaceTracker &_other) const +{ + LOG_PREFIX(RBMSpaceTracker::equals); + const auto &other = static_cast<const RBMSpaceTracker&>(_other); + auto rbs = rb_group->get_rb_managers(); + //TODO: multiple rbm allocator + auto rbm = rbs[0]; + assert(rbm); + + if (rbm->get_device()->get_available_size() / rbm->get_block_size() + != other.block_usage.size()) { + assert(0 == "block counts should match"); + return false; + } + bool all_match = true; + for (auto i = 
other.block_usage.begin(); + i != other.block_usage.end(); ++i) { + if (i->first < rbm->get_start().as_blk_paddr().get_device_off()) { + continue; + } + auto addr = i->first; + auto state = rbm->get_extent_state( + convert_abs_addr_to_paddr(addr, rbm->get_device_id()), + rbm->get_block_size()); + if ((i->second.used && state == rbm_extent_state_t::ALLOCATED) || + (!i->second.used && (state == rbm_extent_state_t::FREE || + state == rbm_extent_state_t::RESERVED))) { + // pass + } else { + all_match = false; + ERROR("block addr {} mismatch other used: {}", + addr, i->second.used); + } + } + return all_match; +} + +void RBMCleaner::register_metrics() +{ + namespace sm = seastar::metrics; + + metrics.add_group("rbm_cleaner", { + sm::make_counter("total_bytes", + [this] { return get_total_bytes(); }, + sm::description("the size of the space")), + sm::make_counter("available_bytes", + [this] { return get_total_bytes() - get_journal_bytes() - stats.used_bytes; }, + sm::description("the size of the space is available")), + sm::make_counter("used_bytes", stats.used_bytes, + sm::description("the size of the space occupied by live extents")), + }); +} + +} diff --git a/src/crimson/os/seastore/async_cleaner.h b/src/crimson/os/seastore/async_cleaner.h new file mode 100644 index 000000000..fb8e03bb4 --- /dev/null +++ b/src/crimson/os/seastore/async_cleaner.h @@ -0,0 +1,1761 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <boost/intrusive/set.hpp> +#include <seastar/core/metrics_types.hh> + +#include "common/ceph_time.h" + +#include "osd/osd_types.h" + +#include "crimson/os/seastore/cached_extent.h" +#include "crimson/os/seastore/seastore_types.h" +#include "crimson/os/seastore/segment_manager.h" +#include "crimson/os/seastore/segment_manager_group.h" +#include "crimson/os/seastore/randomblock_manager_group.h" +#include "crimson/os/seastore/transaction.h" +#include "crimson/os/seastore/segment_seq_allocator.h" + +namespace crimson::os::seastore { + +/* + * segment_info_t + * + * Maintains the tracked information for a segment. + * It is read-only outside segments_info_t. 
+ */ +struct segment_info_t { + segment_id_t id = NULL_SEG_ID; + + // segment_info_t is initiated as set_empty() + Segment::segment_state_t state = Segment::segment_state_t::EMPTY; + + // Will be non-null for any segments in the current journal + segment_seq_t seq = NULL_SEG_SEQ; + + segment_type_t type = segment_type_t::NULL_SEG; + + data_category_t category = data_category_t::NUM; + + rewrite_gen_t generation = NULL_GENERATION; + + sea_time_point modify_time = NULL_TIME; + + std::size_t num_extents = 0; + + segment_off_t written_to = 0; + + bool is_in_journal(journal_seq_t tail_committed) const { + return type == segment_type_t::JOURNAL && + tail_committed.segment_seq <= seq; + } + + bool is_empty() const { + return state == Segment::segment_state_t::EMPTY; + } + + bool is_closed() const { + return state == Segment::segment_state_t::CLOSED; + } + + bool is_open() const { + return state == Segment::segment_state_t::OPEN; + } + + void init_closed(segment_seq_t, segment_type_t, + data_category_t, rewrite_gen_t, + segment_off_t); + + void set_open(segment_seq_t, segment_type_t, + data_category_t, rewrite_gen_t); + + void set_empty(); + + void set_closed(); + + void update_modify_time(sea_time_point _modify_time, std::size_t _num_extents) { + ceph_assert(!is_closed()); + assert(_modify_time != NULL_TIME); + assert(_num_extents != 0); + if (modify_time == NULL_TIME) { + modify_time = _modify_time; + num_extents = _num_extents; + } else { + modify_time = get_average_time( + modify_time, num_extents, _modify_time, _num_extents); + num_extents += _num_extents; + } + } +}; + +std::ostream& operator<<(std::ostream&, const segment_info_t&); + +/* + * segments_info_t + * + * Keep track of all segments and related information. + */ +class segments_info_t { +public: + segments_info_t() { + reset(); + } + + const segment_info_t& operator[](segment_id_t id) const { + return segments[id]; + } + + auto begin() const { + return segments.begin(); + } + + auto end() const { + return segments.end(); + } + + std::size_t get_num_segments() const { + assert(segments.size() > 0); + return segments.size(); + } + segment_off_t get_segment_size() const { + assert(segment_size > 0); + return segment_size; + } + std::size_t get_num_in_journal_open() const { + return num_in_journal_open; + } + std::size_t get_num_type_journal() const { + return num_type_journal; + } + std::size_t get_num_type_ool() const { + return num_type_ool; + } + std::size_t get_num_open() const { + return num_open; + } + std::size_t get_num_empty() const { + return num_empty; + } + std::size_t get_num_closed() const { + return num_closed; + } + std::size_t get_count_open_journal() const { + return count_open_journal; + } + std::size_t get_count_open_ool() const { + return count_open_ool; + } + std::size_t get_count_release_journal() const { + return count_release_journal; + } + std::size_t get_count_release_ool() const { + return count_release_ool; + } + std::size_t get_count_close_journal() const { + return count_close_journal; + } + std::size_t get_count_close_ool() const { + return count_close_ool; + } + + std::size_t get_total_bytes() const { + return total_bytes; + } + /// the available space that is writable, including in open segments + std::size_t get_available_bytes() const { + return num_empty * get_segment_size() + avail_bytes_in_open; + } + /// the unavailable space that is not writable + std::size_t get_unavailable_bytes() const { + assert(total_bytes >= get_available_bytes()); + return total_bytes - get_available_bytes(); + } + 
std::size_t get_available_bytes_in_open() const { + return avail_bytes_in_open; + } + double get_available_ratio() const { + return (double)get_available_bytes() / (double)total_bytes; + } + + journal_seq_t get_submitted_journal_head() const { + if (unlikely(journal_segment_id == NULL_SEG_ID)) { + return JOURNAL_SEQ_NULL; + } + auto &segment_info = segments[journal_segment_id]; + assert(!segment_info.is_empty()); + assert(segment_info.type == segment_type_t::JOURNAL); + assert(segment_info.seq != NULL_SEG_SEQ); + return journal_seq_t{ + segment_info.seq, + paddr_t::make_seg_paddr( + journal_segment_id, + segment_info.written_to) + }; + } + + sea_time_point get_time_bound() const { + if (!modify_times.empty()) { + return *modify_times.begin(); + } else { + return NULL_TIME; + } + } + + void reset(); + + void add_segment_manager(SegmentManager &segment_manager); + + void assign_ids() { + for (auto &item : segments) { + item.second.id = item.first; + } + } + + // initiate non-empty segments, the others are by default empty + void init_closed(segment_id_t, segment_seq_t, segment_type_t, + data_category_t, rewrite_gen_t); + + void mark_open(segment_id_t, segment_seq_t, segment_type_t, + data_category_t, rewrite_gen_t); + + void mark_empty(segment_id_t); + + void mark_closed(segment_id_t); + + void update_written_to(segment_type_t, paddr_t); + + void update_modify_time( + segment_id_t id, sea_time_point tp, std::size_t num) { + if (num == 0) { + return; + } + + assert(tp != NULL_TIME); + segments[id].update_modify_time(tp, num); + } + +private: + // See reset() for member initialization + segment_map_t<segment_info_t> segments; + + segment_off_t segment_size; + + segment_id_t journal_segment_id; + std::size_t num_in_journal_open; + std::size_t num_type_journal; + std::size_t num_type_ool; + + std::size_t num_open; + std::size_t num_empty; + std::size_t num_closed; + + std::size_t count_open_journal; + std::size_t count_open_ool; + std::size_t count_release_journal; + std::size_t count_release_ool; + std::size_t count_close_journal; + std::size_t count_close_ool; + + std::size_t total_bytes; + std::size_t avail_bytes_in_open; + + std::multiset<sea_time_point> modify_times; +}; + +std::ostream &operator<<(std::ostream &, const segments_info_t &); + +/** + * Callback interface for querying extents and operating on transactions. 
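+ *
+ * Roughly, the members below fall into: transaction creation
+ * (create_transaction(), with_transaction_intr(), with_transaction_weak()),
+ * extent queries (get_next_dirty_extents(), get_extents_if_live()),
+ * extent relocation (rewrite_extent()), and unthrottled submission
+ * (submit_transaction_direct()); the journal trimmer and the cleaners
+ * drive their background work through this interface.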
+ */ +class ExtentCallbackInterface { +public: + using base_ertr = crimson::errorator< + crimson::ct_error::input_output_error>; + using base_iertr = trans_iertr<base_ertr>; + + virtual ~ExtentCallbackInterface() = default; + + /// Creates empty transaction + /// weak transaction should be type READ + virtual TransactionRef create_transaction( + Transaction::src_t, const char *name, bool is_weak=false) = 0; + + /// Creates empty transaction with interruptible context + template <typename Func> + auto with_transaction_intr( + Transaction::src_t src, + const char* name, + Func &&f) { + return do_with_transaction_intr<Func, false>( + src, name, std::forward<Func>(f)); + } + + template <typename Func> + auto with_transaction_weak( + const char* name, + Func &&f) { + return do_with_transaction_intr<Func, true>( + Transaction::src_t::READ, name, std::forward<Func>(f) + ).handle_error( + crimson::ct_error::eagain::handle([] { + ceph_assert(0 == "eagain impossible"); + }), + crimson::ct_error::pass_further_all{} + ); + } + + /// See Cache::get_next_dirty_extents + using get_next_dirty_extents_iertr = base_iertr; + using get_next_dirty_extents_ret = get_next_dirty_extents_iertr::future< + std::vector<CachedExtentRef>>; + virtual get_next_dirty_extents_ret get_next_dirty_extents( + Transaction &t, ///< [in] current transaction + journal_seq_t bound,///< [in] return extents with dirty_from < bound + size_t max_bytes ///< [in] return up to max_bytes of extents + ) = 0; + + /** + * rewrite_extent + * + * Updates t with operations moving the passed extents to a new + * segment. extent may be invalid, implementation must correctly + * handle finding the current instance if it is still alive and + * otherwise ignore it. + */ + using rewrite_extent_iertr = base_iertr; + using rewrite_extent_ret = rewrite_extent_iertr::future<>; + virtual rewrite_extent_ret rewrite_extent( + Transaction &t, + CachedExtentRef extent, + rewrite_gen_t target_generation, + sea_time_point modify_time) = 0; + + /** + * get_extent_if_live + * + * Returns extent at specified location if still referenced by + * lba_manager and not removed by t. + * + * See TransactionManager::get_extent_if_live and + * LBAManager::get_physical_extent_if_live. + */ + using get_extents_if_live_iertr = base_iertr; + using get_extents_if_live_ret = get_extents_if_live_iertr::future< + std::list<CachedExtentRef>>; + virtual get_extents_if_live_ret get_extents_if_live( + Transaction &t, + extent_types_t type, + paddr_t addr, + laddr_t laddr, + extent_len_t len) = 0; + + /** + * submit_transaction_direct + * + * Submits transaction without any space throttling. 
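+ *
+ * Intended for background trim/clean transactions, which should not be
+ * blocked by the space throttling applied to user transactions.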
+ */ + using submit_transaction_direct_iertr = base_iertr; + using submit_transaction_direct_ret = + submit_transaction_direct_iertr::future<>; + virtual submit_transaction_direct_ret submit_transaction_direct( + Transaction &t, + std::optional<journal_seq_t> seq_to_trim = std::nullopt) = 0; + +private: + template <typename Func, bool IsWeak> + auto do_with_transaction_intr( + Transaction::src_t src, + const char* name, + Func &&f) { + return seastar::do_with( + create_transaction(src, name, IsWeak), + [f=std::forward<Func>(f)](auto &ref_t) mutable { + return with_trans_intr( + *ref_t, + [f=std::forward<Func>(f)](auto& t) mutable { + return f(t); + } + ); + } + ); + } +}; + +/** + * Callback interface to wake up background works + */ +struct BackgroundListener { + enum class state_t { + STOP, + MOUNT, + SCAN_SPACE, + RUNNING, + HALT, + }; + + virtual ~BackgroundListener() = default; + virtual void maybe_wake_background() = 0; + virtual void maybe_wake_blocked_io() = 0; + virtual state_t get_state() const = 0; + + bool is_ready() const { + return get_state() >= state_t::RUNNING; + } +}; + +/** + * Callback interface for Journal + */ +class JournalTrimmer { +public: + // get the committed journal head + virtual journal_seq_t get_journal_head() const = 0; + + // set the committed journal head + virtual void set_journal_head(journal_seq_t) = 0; + + // get the committed journal dirty tail + virtual journal_seq_t get_dirty_tail() const = 0; + + // get the committed journal alloc tail + virtual journal_seq_t get_alloc_tail() const = 0; + + // set the committed journal tails + virtual void update_journal_tails( + journal_seq_t dirty_tail, journal_seq_t alloc_tail) = 0; + + // try reserve the projected usage in journal + // returns if the reservation is successful + // if the reservation is successful, user should call + // release_inline_usage to restore. + virtual bool try_reserve_inline_usage(std::size_t usage) = 0; + + // release the projected usage in journal + virtual void release_inline_usage(std::size_t usage) = 0; + + virtual ~JournalTrimmer() {} + + journal_seq_t get_journal_tail() const { + return std::min(get_alloc_tail(), get_dirty_tail()); + } + + virtual std::size_t get_trim_size_per_cycle() const = 0; + + bool check_is_ready() const { + return (get_journal_head() != JOURNAL_SEQ_NULL && + get_dirty_tail() != JOURNAL_SEQ_NULL && + get_alloc_tail() != JOURNAL_SEQ_NULL); + } + + std::size_t get_num_rolls() const { + if (!check_is_ready()) { + return 0; + } + assert(get_journal_head().segment_seq >= + get_journal_tail().segment_seq); + return get_journal_head().segment_seq + 1 - + get_journal_tail().segment_seq; + } +}; + +class BackrefManager; +class JournalTrimmerImpl; +using JournalTrimmerImplRef = std::unique_ptr<JournalTrimmerImpl>; + +/** + * Journal trimming implementation + */ +class JournalTrimmerImpl : public JournalTrimmer { +public: + struct config_t { + /// Number of minimum bytes to stop trimming dirty. + std::size_t target_journal_dirty_bytes = 0; + /// Number of minimum bytes to stop trimming allocation + /// (having the corresponding backrefs unmerged) + std::size_t target_journal_alloc_bytes = 0; + /// Number of maximum bytes to block user transactions. 
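+ /// (once the journal footprint would exceed this, user transactions are
+ /// blocked until trimming frees space; cf. should_block_io_on_trim() and
+ /// try_reserve_inline_usage() below)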
+ std::size_t max_journal_bytes = 0; + /// Number of bytes to rewrite dirty per cycle + std::size_t rewrite_dirty_bytes_per_cycle = 0; + /// Number of bytes to rewrite backref per cycle + std::size_t rewrite_backref_bytes_per_cycle = 0; + + void validate() const; + + static config_t get_default( + std::size_t roll_size, journal_type_t type); + + static config_t get_test( + std::size_t roll_size, journal_type_t type); + }; + + JournalTrimmerImpl( + BackrefManager &backref_manager, + config_t config, + journal_type_t type, + device_off_t roll_start, + device_off_t roll_size); + + ~JournalTrimmerImpl() = default; + + /* + * JournalTrimmer interfaces + */ + + journal_seq_t get_journal_head() const final { + return journal_head; + } + + void set_journal_head(journal_seq_t) final; + + journal_seq_t get_dirty_tail() const final { + return journal_dirty_tail; + } + + journal_seq_t get_alloc_tail() const final { + return journal_alloc_tail; + } + + void update_journal_tails( + journal_seq_t dirty_tail, journal_seq_t alloc_tail) final; + + std::size_t get_trim_size_per_cycle() const final { + return config.rewrite_backref_bytes_per_cycle + + config.rewrite_dirty_bytes_per_cycle; + } + + journal_type_t get_journal_type() const { + return journal_type; + } + + void set_extent_callback(ExtentCallbackInterface *cb) { + extent_callback = cb; + } + + void set_background_callback(BackgroundListener *cb) { + background_callback = cb; + } + + void reset() { + journal_head = JOURNAL_SEQ_NULL; + journal_dirty_tail = JOURNAL_SEQ_NULL; + journal_alloc_tail = JOURNAL_SEQ_NULL; + } + + bool should_trim() const { + return should_trim_alloc() || should_trim_dirty(); + } + + bool should_block_io_on_trim() const { + return get_tail_limit() > + get_journal_tail().add_offset( + journal_type, reserved_usage, roll_start, roll_size); + } + + bool try_reserve_inline_usage(std::size_t usage) final { + reserved_usage += usage; + if (should_block_io_on_trim()) { + reserved_usage -= usage; + return false; + } else { + return true; + } + } + + void release_inline_usage(std::size_t usage) final { + ceph_assert(reserved_usage >= usage); + reserved_usage -= usage; + } + + seastar::future<> trim(); + + static JournalTrimmerImplRef create( + BackrefManager &backref_manager, + config_t config, + journal_type_t type, + device_off_t roll_start, + device_off_t roll_size) { + return std::make_unique<JournalTrimmerImpl>( + backref_manager, config, type, roll_start, roll_size); + } + + struct stat_printer_t { + const JournalTrimmerImpl &trimmer; + bool detailed = false; + }; + friend std::ostream &operator<<(std::ostream &, const stat_printer_t &); + +private: + bool should_trim_dirty() const { + return get_dirty_tail_target() > journal_dirty_tail; + } + + bool should_trim_alloc() const { + return get_alloc_tail_target() > journal_alloc_tail; + } + + using trim_ertr = crimson::errorator< + crimson::ct_error::input_output_error>; + trim_ertr::future<> trim_dirty(); + + trim_ertr::future<> trim_alloc(); + + journal_seq_t get_tail_limit() const; + journal_seq_t get_dirty_tail_target() const; + journal_seq_t get_alloc_tail_target() const; + std::size_t get_dirty_journal_size() const; + std::size_t get_alloc_journal_size() const; + void register_metrics(); + + ExtentCallbackInterface *extent_callback = nullptr; + BackgroundListener *background_callback = nullptr; + BackrefManager &backref_manager; + + config_t config; + journal_type_t journal_type; + device_off_t roll_start; + device_off_t roll_size; + + journal_seq_t journal_head; + 
journal_seq_t journal_dirty_tail; + journal_seq_t journal_alloc_tail; + + std::size_t reserved_usage; + + seastar::metrics::metric_group metrics; +}; + +std::ostream &operator<<( + std::ostream &, const JournalTrimmerImpl::stat_printer_t &); + +/** + * Callback interface for managing available segments + */ +class SegmentProvider { +public: + virtual const segment_info_t& get_seg_info(segment_id_t id) const = 0; + + virtual segment_id_t allocate_segment( + segment_seq_t, segment_type_t, data_category_t, rewrite_gen_t) = 0; + + virtual void close_segment(segment_id_t) = 0; + + // set the submitted segment writes in order + virtual void update_segment_avail_bytes(segment_type_t, paddr_t) = 0; + + virtual void update_modify_time( + segment_id_t, sea_time_point, std::size_t) = 0; + + virtual SegmentManagerGroup* get_segment_manager_group() = 0; + + virtual ~SegmentProvider() {} +}; + +class SpaceTrackerI { +public: + virtual int64_t allocate( + segment_id_t segment, + segment_off_t offset, + extent_len_t len) = 0; + + virtual int64_t release( + segment_id_t segment, + segment_off_t offset, + extent_len_t len) = 0; + + virtual int64_t get_usage( + segment_id_t segment) const = 0; + + virtual bool equals(const SpaceTrackerI &other) const = 0; + + virtual std::unique_ptr<SpaceTrackerI> make_empty() const = 0; + + virtual void dump_usage(segment_id_t) const = 0; + + virtual double calc_utilization(segment_id_t segment) const = 0; + + virtual void reset() = 0; + + virtual ~SpaceTrackerI() = default; +}; +using SpaceTrackerIRef = std::unique_ptr<SpaceTrackerI>; + +class SpaceTrackerSimple : public SpaceTrackerI { + struct segment_bytes_t { + int64_t live_bytes = 0; + segment_off_t total_bytes = 0; + }; + // Tracks live space for each segment + segment_map_t<segment_bytes_t> live_bytes_by_segment; + + int64_t update_usage(segment_id_t segment, int64_t delta) { + live_bytes_by_segment[segment].live_bytes += delta; + assert(live_bytes_by_segment[segment].live_bytes >= 0); + return live_bytes_by_segment[segment].live_bytes; + } +public: + SpaceTrackerSimple(const SpaceTrackerSimple &) = default; + SpaceTrackerSimple(const std::vector<SegmentManager*> &sms) { + for (auto sm : sms) { + live_bytes_by_segment.add_device( + sm->get_device_id(), + sm->get_num_segments(), + {0, sm->get_segment_size()}); + } + } + + int64_t allocate( + segment_id_t segment, + segment_off_t offset, + extent_len_t len) final { + return update_usage(segment, len); + } + + int64_t release( + segment_id_t segment, + segment_off_t offset, + extent_len_t len) final { + return update_usage(segment, -(int64_t)len); + } + + int64_t get_usage(segment_id_t segment) const final { + return live_bytes_by_segment[segment].live_bytes; + } + + double calc_utilization(segment_id_t segment) const final { + auto& seg_bytes = live_bytes_by_segment[segment]; + return (double)seg_bytes.live_bytes / (double)seg_bytes.total_bytes; + } + + void dump_usage(segment_id_t) const final; + + void reset() final { + for (auto &i : live_bytes_by_segment) { + i.second = {0, 0}; + } + } + + SpaceTrackerIRef make_empty() const final { + auto ret = SpaceTrackerIRef(new SpaceTrackerSimple(*this)); + ret->reset(); + return ret; + } + + bool equals(const SpaceTrackerI &other) const; +}; + +class SpaceTrackerDetailed : public SpaceTrackerI { + class SegmentMap { + int64_t used = 0; + segment_off_t total_bytes = 0; + std::vector<bool> bitmap; + + public: + SegmentMap( + size_t blocks, + segment_off_t total_bytes) + : total_bytes(total_bytes), + bitmap(blocks, false) {} + 
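+ // Adjusts the live-byte counter for this segment by delta and returns the
+ // new value.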
+ int64_t update_usage(int64_t delta) { + used += delta; + return used; + } + + int64_t allocate( + device_segment_id_t segment, + segment_off_t offset, + extent_len_t len, + const extent_len_t block_size); + + int64_t release( + device_segment_id_t segment, + segment_off_t offset, + extent_len_t len, + const extent_len_t block_size); + + int64_t get_usage() const { + return used; + } + + void dump_usage(extent_len_t block_size) const; + + double calc_utilization() const { + return (double)used / (double)total_bytes; + } + + void reset() { + used = 0; + for (auto &&i: bitmap) { + i = false; + } + } + }; + + // Tracks live space for each segment + segment_map_t<SegmentMap> segment_usage; + std::vector<size_t> block_size_by_segment_manager; + +public: + SpaceTrackerDetailed(const SpaceTrackerDetailed &) = default; + SpaceTrackerDetailed(const std::vector<SegmentManager*> &sms) + { + block_size_by_segment_manager.resize(DEVICE_ID_MAX, 0); + for (auto sm : sms) { + segment_usage.add_device( + sm->get_device_id(), + sm->get_num_segments(), + SegmentMap( + sm->get_segment_size() / sm->get_block_size(), + sm->get_segment_size())); + block_size_by_segment_manager[sm->get_device_id()] = sm->get_block_size(); + } + } + + int64_t allocate( + segment_id_t segment, + segment_off_t offset, + extent_len_t len) final { + return segment_usage[segment].allocate( + segment.device_segment_id(), + offset, + len, + block_size_by_segment_manager[segment.device_id()]); + } + + int64_t release( + segment_id_t segment, + segment_off_t offset, + extent_len_t len) final { + return segment_usage[segment].release( + segment.device_segment_id(), + offset, + len, + block_size_by_segment_manager[segment.device_id()]); + } + + int64_t get_usage(segment_id_t segment) const final { + return segment_usage[segment].get_usage(); + } + + double calc_utilization(segment_id_t segment) const final { + return segment_usage[segment].calc_utilization(); + } + + void dump_usage(segment_id_t seg) const final; + + void reset() final { + for (auto &i: segment_usage) { + i.second.reset(); + } + } + + SpaceTrackerIRef make_empty() const final { + auto ret = SpaceTrackerIRef(new SpaceTrackerDetailed(*this)); + ret->reset(); + return ret; + } + + bool equals(const SpaceTrackerI &other) const; +}; + +template <typename T> +class block_map_t { +public: + block_map_t() { + device_to_blocks.resize(DEVICE_ID_MAX_VALID); + device_block_size.resize(DEVICE_ID_MAX_VALID); + } + void add_device(device_id_t device, std::size_t blocks, const T& init, + size_t block_size) { + ceph_assert(device <= DEVICE_ID_MAX_VALID); + ceph_assert(device_to_blocks[device].size() == 0); + ceph_assert(blocks > 0); + device_to_blocks[device].resize(blocks, init); + total_blocks += blocks; + device_block_size[device] = block_size; + } + void clear() { + device_to_blocks.clear(); + device_to_blocks.resize(DEVICE_ID_MAX_VALID); + total_blocks = 0; + } + + T& operator[](paddr_t block) { + ceph_assert(device_to_blocks[block.get_device_id()].size() != 0); + auto &blk = block.as_blk_paddr(); + auto block_id = get_block_id(block.get_device_id(), blk.get_device_off()); + return device_to_blocks[block.get_device_id()][block_id]; + } + const T& operator[](paddr_t block) const { + ceph_assert(device_to_blocks[block.get_device_id()].size() != 0); + auto &blk = block.as_blk_paddr(); + auto block_id = get_block_id(block.get_device_id(), blk.get_device_off()); + return device_to_blocks[block.get_device_id()][block_id]; + } + + auto begin() { + return iterator<false>::lower_bound(*this, 0, 
0); + } + auto begin() const { + return iterator<true>::lower_bound(*this, 0, 0); + } + + auto end() { + return iterator<false>::end_iterator(*this); + } + auto end() const { + return iterator<true>::end_iterator(*this); + } + + size_t size() const { + return total_blocks; + } + + uint64_t get_block_size(device_id_t device_id) { + return device_block_size[device_id]; + } + + uint32_t get_block_id(device_id_t device_id, device_off_t blk_off) const { + auto block_size = device_block_size[device_id]; + return blk_off == 0 ? 0 : blk_off/block_size; + } + + template <bool is_const = false> + class iterator { + /// points at set being iterated over + std::conditional_t< + is_const, + const block_map_t &, + block_map_t &> parent; + + /// points at current device, or DEVICE_ID_MAX_VALID if is_end() + device_id_t device_id; + + /// segment at which we are pointing, 0 if is_end() + device_off_t blk_off; + + /// holds referent for operator* and operator-> when !is_end() + std::optional< + std::pair< + const device_off_t, + std::conditional_t<is_const, const T&, T&> + >> current; + + bool is_end() const { + return device_id == DEVICE_ID_MAX_VALID; + } + + uint32_t get_block_id() { + return parent.get_block_id(device_id, blk_off); + } + + void find_valid() { + assert(!is_end()); + auto &device_vec = parent.device_to_blocks[device_id]; + if (device_vec.size() == 0 || + get_block_id() == device_vec.size()) { + while (++device_id < DEVICE_ID_MAX_VALID&& + parent.device_to_blocks[device_id].size() == 0); + blk_off = 0; + } + if (is_end()) { + current = std::nullopt; + } else { + current.emplace( + blk_off, + parent.device_to_blocks[device_id][get_block_id()] + ); + } + } + + iterator( + decltype(parent) &parent, + device_id_t device_id, + device_off_t device_block_off) + : parent(parent), device_id(device_id), + blk_off(device_block_off) {} + + public: + static iterator lower_bound( + decltype(parent) &parent, + device_id_t device_id, + device_off_t block_off) { + if (device_id == DEVICE_ID_MAX_VALID) { + return end_iterator(parent); + } else { + auto ret = iterator{parent, device_id, block_off}; + ret.find_valid(); + return ret; + } + } + + static iterator end_iterator( + decltype(parent) &parent) { + return iterator{parent, DEVICE_ID_MAX_VALID, 0}; + } + + iterator<is_const>& operator++() { + assert(!is_end()); + auto block_size = parent.device_block_size[device_id]; + blk_off += block_size; + find_valid(); + return *this; + } + + bool operator==(iterator<is_const> rit) { + return (device_id == rit.device_id && + blk_off == rit.blk_off); + } + + bool operator!=(iterator<is_const> rit) { + return !(*this == rit); + } + template <bool c = is_const, std::enable_if_t<c, int> = 0> + const std::pair<const device_off_t, const T&> *operator->() { + assert(!is_end()); + return &*current; + } + template <bool c = is_const, std::enable_if_t<!c, int> = 0> + std::pair<const device_off_t, T&> *operator->() { + assert(!is_end()); + return &*current; + } + template <bool c = is_const, std::enable_if_t<c, int> = 0> + const std::pair<const device_off_t, const T&> &operator*() { + assert(!is_end()); + return *current; + } + template <bool c = is_const, std::enable_if_t<!c, int> = 0> + std::pair<const device_off_t, T&> &operator*() { + assert(!is_end()); + return *current; + } + }; + std::vector<std::vector<T>> device_to_blocks; + std::vector<size_t> device_block_size; + size_t total_blocks = 0; +}; + +class RBMSpaceTracker { + struct random_block_t { + bool used = false; + void allocate() { + used = true; + } + void 
release() { + used = false; + } + }; + block_map_t<random_block_t> block_usage; + +public: + RBMSpaceTracker(const RBMSpaceTracker &) = default; + RBMSpaceTracker(const std::vector<RandomBlockManager*> &rbms) { + for (auto rbm : rbms) { + block_usage.add_device( + rbm->get_device_id(), + rbm->get_device()->get_available_size() / rbm->get_block_size(), + {false}, + rbm->get_block_size()); + } + } + + void allocate( + paddr_t addr, + extent_len_t len) { + paddr_t cursor = addr; + paddr_t end = addr.add_offset(len); + do { + block_usage[cursor].allocate(); + cursor = cursor.add_offset( + block_usage.get_block_size(addr.get_device_id())); + } while (cursor < end); + } + + void release( + paddr_t addr, + extent_len_t len) { + paddr_t cursor = addr; + paddr_t end = addr.add_offset(len); + do { + block_usage[cursor].release(); + cursor = cursor.add_offset( + block_usage.get_block_size(addr.get_device_id())); + } while (cursor < end); + } + + void reset() { + for (auto &i : block_usage) { + i.second = {false}; + } + } + + std::unique_ptr<RBMSpaceTracker> make_empty() const { + auto ret = std::make_unique<RBMSpaceTracker>(*this); + ret->reset(); + return ret; + } + friend class RBMCleaner; +}; +using RBMSpaceTrackerRef = std::unique_ptr<RBMSpaceTracker>; + +/* + * AsyncCleaner + * + * Interface for ExtentPlacementManager::BackgroundProcess + * to do background cleaning. + */ +class AsyncCleaner { +public: + using state_t = BackgroundListener::state_t; + using base_ertr = crimson::errorator< + crimson::ct_error::input_output_error>; + + virtual void set_background_callback(BackgroundListener *) = 0; + + virtual void set_extent_callback(ExtentCallbackInterface *) = 0; + + virtual store_statfs_t get_stat() const = 0; + + virtual void print(std::ostream &, bool is_detailed) const = 0; + + virtual bool check_usage_is_empty() const = 0; + + using mount_ertr = base_ertr; + using mount_ret = mount_ertr::future<>; + virtual mount_ret mount() = 0; + + virtual void mark_space_used(paddr_t, extent_len_t) = 0; + + virtual void mark_space_free(paddr_t, extent_len_t) = 0; + + virtual void commit_space_used(paddr_t, extent_len_t) = 0; + + // try reserve the projected usage in cleaner + // returns if the reservation is successful + // if the reservation is successful, user should call + // release_projected_usage to restore. + virtual bool try_reserve_projected_usage(std::size_t) = 0; + + virtual void release_projected_usage(std::size_t) = 0; + + virtual bool should_block_io_on_clean() const = 0; + + virtual bool can_clean_space() const = 0; + + virtual bool should_clean_space() const = 0; + + using clean_space_ertr = base_ertr; + using clean_space_ret = clean_space_ertr::future<>; + virtual clean_space_ret clean_space() = 0; + + virtual const std::set<device_id_t>& get_device_ids() const = 0; + + virtual std::size_t get_reclaim_size_per_cycle() const = 0; + + // test only + virtual bool check_usage() = 0; + + struct stat_printer_t { + const AsyncCleaner &cleaner; + bool detailed = false; + }; + + virtual ~AsyncCleaner() {} +}; + +using AsyncCleanerRef = std::unique_ptr<AsyncCleaner>; + +std::ostream &operator<<( + std::ostream &, const AsyncCleaner::stat_printer_t &); + +class SegmentCleaner; +using SegmentCleanerRef = std::unique_ptr<SegmentCleaner>; + +class SegmentCleaner : public SegmentProvider, public AsyncCleaner { +public: + /// Config + struct config_t { + /// Ratio of maximum available space to disable reclaiming. 
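+ /// (space reclamation is only considered while the available ratio is
+ /// below this; cf. should_clean_space())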
+ double available_ratio_gc_max = 0; + /// Ratio of minimum available space to force reclaiming. + double available_ratio_hard_limit = 0; + /// Ratio of minimum reclaimable space to stop reclaiming. + double reclaim_ratio_gc_threshold = 0; + /// Number of bytes to reclaim per cycle + std::size_t reclaim_bytes_per_cycle = 0; + + void validate() const { + ceph_assert(available_ratio_gc_max > available_ratio_hard_limit); + ceph_assert(reclaim_bytes_per_cycle > 0); + } + + static config_t get_default() { + return config_t{ + .15, // available_ratio_gc_max + .1, // available_ratio_hard_limit + .1, // reclaim_ratio_gc_threshold + 1<<20 // reclaim_bytes_per_cycle + }; + } + + static config_t get_test() { + return config_t{ + .99, // available_ratio_gc_max + .2, // available_ratio_hard_limit + .6, // reclaim_ratio_gc_threshold + 1<<20 // reclaim_bytes_per_cycle + }; + } + }; + + SegmentCleaner( + config_t config, + SegmentManagerGroupRef&& sm_group, + BackrefManager &backref_manager, + SegmentSeqAllocator &segment_seq_allocator, + bool detailed, + bool is_cold); + + void set_journal_trimmer(JournalTrimmer &_trimmer) { + trimmer = &_trimmer; + } + + static SegmentCleanerRef create( + config_t config, + SegmentManagerGroupRef&& sm_group, + BackrefManager &backref_manager, + SegmentSeqAllocator &ool_seq_allocator, + bool detailed, + bool is_cold = false) { + return std::make_unique<SegmentCleaner>( + config, std::move(sm_group), backref_manager, + ool_seq_allocator, detailed, is_cold); + } + + /* + * SegmentProvider interfaces + */ + + const segment_info_t& get_seg_info(segment_id_t id) const final { + return segments[id]; + } + + segment_id_t allocate_segment( + segment_seq_t, segment_type_t, data_category_t, rewrite_gen_t) final; + + void close_segment(segment_id_t segment) final; + + void update_segment_avail_bytes(segment_type_t type, paddr_t offset) final { + assert(type == segment_type_t::OOL || + trimmer != nullptr); // segment_type_t::JOURNAL + segments.update_written_to(type, offset); + background_callback->maybe_wake_background(); + } + + void update_modify_time( + segment_id_t id, sea_time_point tp, std::size_t num_extents) final { + ceph_assert(num_extents == 0 || tp != NULL_TIME); + segments.update_modify_time(id, tp, num_extents); + } + + SegmentManagerGroup* get_segment_manager_group() final { + return sm_group.get(); + } + + /* + * AsyncCleaner interfaces + */ + + void set_background_callback(BackgroundListener *cb) final { + background_callback = cb; + } + + void set_extent_callback(ExtentCallbackInterface *cb) final { + extent_callback = cb; + } + + store_statfs_t get_stat() const final { + store_statfs_t st; + st.total = segments.get_total_bytes(); + st.available = segments.get_total_bytes() - stats.used_bytes; + st.allocated = stats.used_bytes; + st.data_stored = stats.used_bytes; + + // TODO add per extent type counters for omap_allocated and + // internal metadata + return st; + } + + void print(std::ostream &, bool is_detailed) const final; + + bool check_usage_is_empty() const final { + return space_tracker->equals(*space_tracker->make_empty()); + } + + mount_ret mount() final; + + void mark_space_used(paddr_t, extent_len_t) final; + + void mark_space_free(paddr_t, extent_len_t) final; + + void commit_space_used(paddr_t addr, extent_len_t len) final { + mark_space_used(addr, len); + } + + bool try_reserve_projected_usage(std::size_t) final; + + void release_projected_usage(size_t) final; + + bool should_block_io_on_clean() const final { + 
assert(background_callback->is_ready()); + if (get_segments_reclaimable() == 0) { + return false; + } + auto aratio = get_projected_available_ratio(); + return aratio < config.available_ratio_hard_limit; + } + + bool can_clean_space() const final { + assert(background_callback->is_ready()); + return get_segments_reclaimable() > 0; + } + + bool should_clean_space() const final { + assert(background_callback->is_ready()); + if (get_segments_reclaimable() == 0) { + return false; + } + auto aratio = segments.get_available_ratio(); + auto rratio = get_reclaim_ratio(); + return ( + (aratio < config.available_ratio_hard_limit) || + ((aratio < config.available_ratio_gc_max) && + (rratio > config.reclaim_ratio_gc_threshold)) + ); + } + + clean_space_ret clean_space() final; + + const std::set<device_id_t>& get_device_ids() const final { + return sm_group->get_device_ids(); + } + + std::size_t get_reclaim_size_per_cycle() const final { + return config.reclaim_bytes_per_cycle; + } + + // Testing interfaces + + bool check_usage() final; + +private: + /* + * 10 buckets for the number of closed segments by usage + * 2 extra buckets for the number of open and empty segments + */ + static constexpr double UTIL_STATE_OPEN = 1.05; + static constexpr double UTIL_STATE_EMPTY = 1.15; + static constexpr std::size_t UTIL_BUCKETS = 12; + static std::size_t get_bucket_index(double util) { + auto index = std::floor(util * 10); + assert(index < UTIL_BUCKETS); + return index; + } + double calc_utilization(segment_id_t id) const { + auto& info = segments[id]; + if (info.is_open()) { + return UTIL_STATE_OPEN; + } else if (info.is_empty()) { + return UTIL_STATE_EMPTY; + } else { + auto ret = space_tracker->calc_utilization(id); + assert(ret >= 0 && ret < 1); + return ret; + } + } + + // journal status helpers + + double calc_gc_benefit_cost( + segment_id_t id, + const sea_time_point &now_time, + const sea_time_point &bound_time) const; + + segment_id_t get_next_reclaim_segment() const; + + struct reclaim_state_t { + rewrite_gen_t generation; + rewrite_gen_t target_generation; + segment_off_t segment_size; + paddr_t start_pos; + paddr_t end_pos; + + static reclaim_state_t create( + segment_id_t segment_id, + rewrite_gen_t generation, + segment_off_t segment_size) { + ceph_assert(is_rewrite_generation(generation)); + + rewrite_gen_t target_gen; + if (generation < MIN_REWRITE_GENERATION) { + target_gen = MIN_REWRITE_GENERATION; + } else { + // tolerate the target_gen to exceed MAX_REWRITE_GENERATION to make EPM + // aware of its original generation for the decisions. 
+ target_gen = generation + 1; + } + + assert(is_target_rewrite_generation(target_gen)); + return {generation, + target_gen, + segment_size, + P_ADDR_NULL, + paddr_t::make_seg_paddr(segment_id, 0)}; + } + + segment_id_t get_segment_id() const { + return end_pos.as_seg_paddr().get_segment_id(); + } + + bool is_complete() const { + return end_pos.as_seg_paddr().get_segment_off() >= segment_size; + } + + void advance(std::size_t bytes) { + assert(!is_complete()); + start_pos = end_pos; + auto &end_seg_paddr = end_pos.as_seg_paddr(); + auto next_off = end_seg_paddr.get_segment_off() + bytes; + if (next_off > (std::size_t)segment_size) { + end_seg_paddr.set_segment_off(segment_size); + } else { + end_seg_paddr.set_segment_off(next_off); + } + } + }; + std::optional<reclaim_state_t> reclaim_state; + + using do_reclaim_space_ertr = base_ertr; + using do_reclaim_space_ret = do_reclaim_space_ertr::future<>; + do_reclaim_space_ret do_reclaim_space( + const std::vector<CachedExtentRef> &backref_extents, + const backref_pin_list_t &pin_list, + std::size_t &reclaimed, + std::size_t &runs); + + /* + * Segments calculations + */ + std::size_t get_segments_in_journal() const { + if (trimmer != nullptr) { + return trimmer->get_num_rolls(); + } else { + return 0; + } + } + std::size_t get_segments_in_journal_closed() const { + auto in_journal = get_segments_in_journal(); + auto in_journal_open = segments.get_num_in_journal_open(); + if (in_journal >= in_journal_open) { + return in_journal - in_journal_open; + } else { + return 0; + } + } + std::size_t get_segments_reclaimable() const { + assert(segments.get_num_closed() >= get_segments_in_journal_closed()); + return segments.get_num_closed() - get_segments_in_journal_closed(); + } + + /* + * Space calculations + */ + /// the unavailable space that is not reclaimable yet + std::size_t get_unavailable_unreclaimable_bytes() const { + auto ret = (segments.get_num_open() + get_segments_in_journal_closed()) * + segments.get_segment_size(); + assert(ret >= segments.get_available_bytes_in_open()); + return ret - segments.get_available_bytes_in_open(); + } + /// the unavailable space that can be reclaimed + std::size_t get_unavailable_reclaimable_bytes() const { + auto ret = get_segments_reclaimable() * segments.get_segment_size(); + ceph_assert(ret + get_unavailable_unreclaimable_bytes() == segments.get_unavailable_bytes()); + return ret; + } + /// the unavailable space that is not alive + std::size_t get_unavailable_unused_bytes() const { + assert(segments.get_unavailable_bytes() > stats.used_bytes); + return segments.get_unavailable_bytes() - stats.used_bytes; + } + double get_reclaim_ratio() const { + if (segments.get_unavailable_bytes() == 0) return 0; + return (double)get_unavailable_unused_bytes() / (double)segments.get_unavailable_bytes(); + } + double get_alive_ratio() const { + return stats.used_bytes / (double)segments.get_total_bytes(); + } + + /* + * Space calculations (projected) + */ + std::size_t get_projected_available_bytes() const { + return (segments.get_available_bytes() > stats.projected_used_bytes) ? 
+ segments.get_available_bytes() - stats.projected_used_bytes: + 0; + } + double get_projected_available_ratio() const { + return (double)get_projected_available_bytes() / + (double)segments.get_total_bytes(); + } + + using scan_extents_ertr = SegmentManagerGroup::scan_valid_records_ertr; + using scan_extents_ret = scan_extents_ertr::future<>; + scan_extents_ret scan_no_tail_segment( + const segment_header_t& header, + segment_id_t segment_id); + + void adjust_segment_util(double old_usage, double new_usage) { + auto old_index = get_bucket_index(old_usage); + auto new_index = get_bucket_index(new_usage); + assert(stats.segment_util.buckets[old_index].count > 0); + stats.segment_util.buckets[old_index].count--; + stats.segment_util.buckets[new_index].count++; + } + + void init_mark_segment_closed( + segment_id_t segment, + segment_seq_t seq, + segment_type_t s_type, + data_category_t category, + rewrite_gen_t generation) { + assert(background_callback->get_state() == state_t::MOUNT); + ceph_assert(s_type == segment_type_t::OOL || + trimmer != nullptr); // segment_type_t::JOURNAL + auto old_usage = calc_utilization(segment); + segments.init_closed(segment, seq, s_type, category, generation); + auto new_usage = calc_utilization(segment); + adjust_segment_util(old_usage, new_usage); + if (s_type == segment_type_t::OOL) { + ool_segment_seq_allocator.set_next_segment_seq(seq); + } + } + + const bool detailed; + const bool is_cold; + const config_t config; + + SegmentManagerGroupRef sm_group; + BackrefManager &backref_manager; + + SpaceTrackerIRef space_tracker; + segments_info_t segments; + + struct { + /** + * used_bytes + * + * Bytes occupied by live extents + */ + uint64_t used_bytes = 0; + + /** + * projected_used_bytes + * + * Sum of projected bytes used by each transaction between throttle + * acquisition and commit completion. 
See try_reserve_projected_usage() + */ + uint64_t projected_used_bytes = 0; + uint64_t projected_count = 0; + uint64_t projected_used_bytes_sum = 0; + + uint64_t closed_journal_used_bytes = 0; + uint64_t closed_journal_total_bytes = 0; + uint64_t closed_ool_used_bytes = 0; + uint64_t closed_ool_total_bytes = 0; + + uint64_t reclaiming_bytes = 0; + uint64_t reclaimed_bytes = 0; + uint64_t reclaimed_segment_bytes = 0; + + seastar::metrics::histogram segment_util; + } stats; + seastar::metrics::metric_group metrics; + void register_metrics(); + + // optional, set if this cleaner is assigned to SegmentedJournal + JournalTrimmer *trimmer = nullptr; + + ExtentCallbackInterface *extent_callback = nullptr; + + BackgroundListener *background_callback = nullptr; + + // TODO: drop once paddr->journal_seq_t is introduced + SegmentSeqAllocator &ool_segment_seq_allocator; +}; + +class RBMCleaner; +using RBMCleanerRef = std::unique_ptr<RBMCleaner>; + +class RBMCleaner : public AsyncCleaner { +public: + RBMCleaner( + RBMDeviceGroupRef&& rb_group, + BackrefManager &backref_manager, + bool detailed); + + static RBMCleanerRef create( + RBMDeviceGroupRef&& rb_group, + BackrefManager &backref_manager, + bool detailed) { + return std::make_unique<RBMCleaner>( + std::move(rb_group), backref_manager, detailed); + } + + RBMDeviceGroup* get_rb_group() { + return rb_group.get(); + } + + /* + * AsyncCleaner interfaces + */ + + void set_background_callback(BackgroundListener *cb) final { + background_callback = cb; + } + + void set_extent_callback(ExtentCallbackInterface *cb) final { + extent_callback = cb; + } + + store_statfs_t get_stat() const final { + store_statfs_t st; + st.total = get_total_bytes(); + st.available = get_total_bytes() - get_journal_bytes() - stats.used_bytes; + st.allocated = get_journal_bytes() + stats.used_bytes; + st.data_stored = get_journal_bytes() + stats.used_bytes; + return st; + } + + void print(std::ostream &, bool is_detailed) const final; + + mount_ret mount() final; + + void mark_space_used(paddr_t, extent_len_t) final; + + void mark_space_free(paddr_t, extent_len_t) final; + + void commit_space_used(paddr_t, extent_len_t) final; + + bool try_reserve_projected_usage(std::size_t) final; + + void release_projected_usage(size_t) final; + + bool should_block_io_on_clean() const final { + return false; + } + + bool can_clean_space() const final { + return false; + } + + bool should_clean_space() const final { + return false; + } + + clean_space_ret clean_space() final; + + const std::set<device_id_t>& get_device_ids() const final { + return rb_group->get_device_ids(); + } + + std::size_t get_reclaim_size_per_cycle() const final { + return 0; + } + + RandomBlockManager* get_rbm(paddr_t paddr) { + auto rbs = rb_group->get_rb_managers(); + for (auto p : rbs) { + if (p->get_device_id() == paddr.get_device_id()) { + return p; + } + } + return nullptr; + } + + paddr_t alloc_paddr(extent_len_t length) { + // TODO: implement allocation strategy (dirty metadata and multiple devices) + auto rbs = rb_group->get_rb_managers(); + auto paddr = rbs[0]->alloc_extent(length); + stats.used_bytes += length; + return paddr; + } + + size_t get_total_bytes() const { + auto rbs = rb_group->get_rb_managers(); + size_t total = 0; + for (auto p : rbs) { + total += p->get_device()->get_available_size(); + } + return total; + } + + size_t get_journal_bytes() const { + auto rbs = rb_group->get_rb_managers(); + size_t total = 0; + for (auto p : rbs) { + total += p->get_journal_size(); + } + return total; + } + + // 
Testing interfaces + + bool check_usage() final; + + bool check_usage_is_empty() const final { + // TODO + return true; + } + +private: + bool equals(const RBMSpaceTracker &other) const; + + const bool detailed; + RBMDeviceGroupRef rb_group; + BackrefManager &backref_manager; + + struct { + /** + * used_bytes + * + * Bytes occupied by live extents + */ + uint64_t used_bytes = 0; + + /** + * projected_used_bytes + * + * Sum of projected bytes used by each transaction between throttle + * acquisition and commit completion. See reserve_projected_usage() + */ + uint64_t projected_used_bytes = 0; + } stats; + seastar::metrics::metric_group metrics; + void register_metrics(); + + ExtentCallbackInterface *extent_callback = nullptr; + BackgroundListener *background_callback = nullptr; +}; +} + +#if FMT_VERSION >= 90000 +template <> struct fmt::formatter<crimson::os::seastore::segment_info_t> : fmt::ostream_formatter {}; +template <> struct fmt::formatter<crimson::os::seastore::segments_info_t> : fmt::ostream_formatter {}; +template <> struct fmt::formatter<crimson::os::seastore::AsyncCleaner::stat_printer_t> : fmt::ostream_formatter {}; +template <> struct fmt::formatter<crimson::os::seastore::JournalTrimmerImpl::stat_printer_t> : fmt::ostream_formatter {}; +#endif diff --git a/src/crimson/os/seastore/backref/backref_tree_node.cc b/src/crimson/os/seastore/backref/backref_tree_node.cc new file mode 100644 index 000000000..513c29994 --- /dev/null +++ b/src/crimson/os/seastore/backref/backref_tree_node.cc @@ -0,0 +1,14 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "crimson/os/seastore/backref/backref_tree_node.h" + +namespace crimson::os::seastore::backref { + +std::ostream& operator<<(std::ostream &out, const backref_map_val_t& val) { + return out << "backref_map_val_t(" + << val.laddr + << "~" << val.len << ")"; +} + +} // namespace crimson::os::seastore::backref diff --git a/src/crimson/os/seastore/backref/backref_tree_node.h b/src/crimson/os/seastore/backref/backref_tree_node.h new file mode 100644 index 000000000..c3ff52520 --- /dev/null +++ b/src/crimson/os/seastore/backref/backref_tree_node.h @@ -0,0 +1,137 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "crimson/os/seastore/btree/fixed_kv_node.h" + +namespace crimson::os::seastore::backref { + +using backref_node_meta_t = fixed_kv_node_meta_t<paddr_t>; +using backref_node_meta_le_t = fixed_kv_node_meta_le_t<paddr_t>; + +constexpr size_t INTERNAL_NODE_CAPACITY = 254; +constexpr size_t LEAF_NODE_CAPACITY = 169; + +using BackrefNode = FixedKVNode<paddr_t>; + +struct backref_map_val_t { + extent_len_t len = 0; ///< length of extents + laddr_t laddr = 0; ///< logical address of extents + extent_types_t type = extent_types_t::ROOT; + + backref_map_val_t() = default; + backref_map_val_t( + extent_len_t len, + laddr_t laddr, + extent_types_t type) + : len(len), laddr(laddr), type(type) {} + + bool operator==(const backref_map_val_t& rhs) const noexcept { + return len == rhs.len && laddr == rhs.laddr; + } +}; + +std::ostream& operator<<(std::ostream &out, const backref_map_val_t& val); + +struct backref_map_val_le_t { + extent_len_le_t len = init_extent_len_le(0); + laddr_le_t laddr = laddr_le_t(0); + extent_types_le_t type = 0; + + backref_map_val_le_t() = default; + backref_map_val_le_t(const backref_map_val_le_t &) = default; + explicit backref_map_val_le_t(const backref_map_val_t &val) + : 
len(init_extent_len_le(val.len)), + laddr(val.laddr), + type(extent_types_le_t(val.type)) {} + + operator backref_map_val_t() const { + return backref_map_val_t{len, laddr, (extent_types_t)type}; + } +}; + +class BackrefInternalNode + : public FixedKVInternalNode< + INTERNAL_NODE_CAPACITY, + paddr_t, paddr_le_t, + BACKREF_NODE_SIZE, + BackrefInternalNode> { +public: + template <typename... T> + BackrefInternalNode(T&&... t) : + FixedKVInternalNode(std::forward<T>(t)...) {} + + static constexpr extent_types_t TYPE = extent_types_t::BACKREF_INTERNAL; + + extent_types_t get_type() const final { + return TYPE; + } +}; +using BackrefInternalNodeRef = BackrefInternalNode::Ref; + +class BackrefLeafNode + : public FixedKVLeafNode< + LEAF_NODE_CAPACITY, + paddr_t, paddr_le_t, + backref_map_val_t, backref_map_val_le_t, + BACKREF_NODE_SIZE, + BackrefLeafNode, + false> { +public: + template <typename... T> + BackrefLeafNode(T&&... t) : + FixedKVLeafNode(std::forward<T>(t)...) {} + + static constexpr extent_types_t TYPE = extent_types_t::BACKREF_LEAF; + + extent_types_t get_type() const final { + return TYPE; + } + + const_iterator insert( + const_iterator iter, + paddr_t key, + backref_map_val_t val, + LogicalCachedExtent*) final { + journal_insert( + iter, + key, + val, + maybe_get_delta_buffer()); + return iter; + } + + void update( + const_iterator iter, + backref_map_val_t val, + LogicalCachedExtent*) final { + return journal_update( + iter, + val, + maybe_get_delta_buffer()); + } + + void remove(const_iterator iter) final { + return journal_remove( + iter, + maybe_get_delta_buffer()); + } + + // backref leaf nodes don't have to resolve relative addresses + void resolve_relative_addrs(paddr_t base) final {} + + void node_resolve_vals(iterator from, iterator to) const final {} + + void node_unresolve_vals(iterator from, iterator to) const final {} +}; +using BackrefLeafNodeRef = BackrefLeafNode::Ref; + +} // namespace crimson::os::seastore::backref + +#if FMT_VERSION >= 90000 +template <> struct fmt::formatter<crimson::os::seastore::backref::backref_map_val_t> : fmt::ostream_formatter {}; +template <> struct fmt::formatter<crimson::os::seastore::backref::BackrefInternalNode> : fmt::ostream_formatter {}; +template <> struct fmt::formatter<crimson::os::seastore::backref::BackrefLeafNode> : fmt::ostream_formatter {}; +template <> struct fmt::formatter<crimson::os::seastore::backref::backref_node_meta_t> : fmt::ostream_formatter {}; +#endif diff --git a/src/crimson/os/seastore/backref/btree_backref_manager.cc b/src/crimson/os/seastore/backref/btree_backref_manager.cc new file mode 100644 index 000000000..30ff45540 --- /dev/null +++ b/src/crimson/os/seastore/backref/btree_backref_manager.cc @@ -0,0 +1,609 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "crimson/os/seastore/backref/btree_backref_manager.h" + +SET_SUBSYS(seastore_backref); + +namespace crimson::os::seastore { + +template<> +Transaction::tree_stats_t& get_tree_stats< + crimson::os::seastore::backref::BackrefBtree>(Transaction &t) { + return t.get_backref_tree_stats(); +} + +template<> +phy_tree_root_t& get_phy_tree_root< + crimson::os::seastore::backref::BackrefBtree>(root_t &r) { + return r.backref_root; +} + +template<> +const get_phy_tree_root_node_ret get_phy_tree_root_node< + crimson::os::seastore::backref::BackrefBtree>( + const RootBlockRef &root_block, op_context_t<paddr_t> c) { + auto backref_root = root_block->backref_root_node; + if (backref_root) { + 
ceph_assert(backref_root->is_initial_pending() + == root_block->is_pending()); + return {true, + trans_intr::make_interruptible( + c.cache.get_extent_viewable_by_trans(c.trans, backref_root))}; + } else if (root_block->is_pending()) { + auto &prior = static_cast<RootBlock&>(*root_block->get_prior_instance()); + backref_root = prior.backref_root_node; + if (backref_root) { + return {true, + trans_intr::make_interruptible( + c.cache.get_extent_viewable_by_trans(c.trans, backref_root))}; + } else { + return {false, + trans_intr::make_interruptible( + Cache::get_extent_ertr::make_ready_future< + CachedExtentRef>())}; + } + } else { + return {false, + trans_intr::make_interruptible( + Cache::get_extent_ertr::make_ready_future< + CachedExtentRef>())}; + } +} + +template <typename ROOT> +void link_phy_tree_root_node(RootBlockRef &root_block, ROOT* backref_root) { + root_block->backref_root_node = backref_root; + ceph_assert(backref_root != nullptr); + backref_root->root_block = root_block; +} + +template void link_phy_tree_root_node( + RootBlockRef &root_block, backref::BackrefInternalNode* backref_root); +template void link_phy_tree_root_node( + RootBlockRef &root_block, backref::BackrefLeafNode* backref_root); +template void link_phy_tree_root_node( + RootBlockRef &root_block, backref::BackrefNode* backref_root); + +template <> +void unlink_phy_tree_root_node<paddr_t>(RootBlockRef &root_block) { + root_block->backref_root_node = nullptr; +} + +} + +namespace crimson::os::seastore::backref { + +BtreeBackrefManager::mkfs_ret +BtreeBackrefManager::mkfs( + Transaction &t) +{ + LOG_PREFIX(BtreeBackrefManager::mkfs); + INFOT("start", t); + return cache.get_root(t).si_then([this, &t](auto croot) { + assert(croot->is_mutation_pending()); + croot->get_root().backref_root = BackrefBtree::mkfs(croot, get_context(t)); + return mkfs_iertr::now(); + }).handle_error_interruptible( + mkfs_iertr::pass_further{}, + crimson::ct_error::assert_all{ + "Invalid error in BtreeBackrefManager::mkfs" + } + ); +} + +BtreeBackrefManager::get_mapping_ret +BtreeBackrefManager::get_mapping( + Transaction &t, + paddr_t offset) +{ + LOG_PREFIX(BtreeBackrefManager::get_mapping); + TRACET("{}", t, offset); + auto c = get_context(t); + return with_btree_ret<BackrefBtree, BackrefMappingRef>( + cache, + c, + [c, offset](auto &btree) { + return btree.lower_bound( + c, offset + ).si_then([offset, c](auto iter) -> get_mapping_ret { + LOG_PREFIX(BtreeBackrefManager::get_mapping); + if (iter.is_end() || iter.get_key() != offset) { + ERRORT("{} doesn't exist", c.trans, offset); + return crimson::ct_error::enoent::make(); + } else { + TRACET("{} got {}, {}", + c.trans, offset, iter.get_key(), iter.get_val()); + return get_mapping_ret( + interruptible::ready_future_marker{}, + iter.get_pin(c)); + } + }); + }); +} + +BtreeBackrefManager::get_mappings_ret +BtreeBackrefManager::get_mappings( + Transaction &t, + paddr_t offset, + paddr_t end) +{ + LOG_PREFIX(BtreeBackrefManager::get_mappings); + TRACET("{}~{}", t, offset, end); + auto c = get_context(t); + return with_btree_state<BackrefBtree, backref_pin_list_t>( + cache, + c, + [c, offset, end](auto &btree, auto &ret) { + return BackrefBtree::iterate_repeat( + c, + btree.upper_bound_right(c, offset), + [&ret, offset, end, c](auto &pos) { + LOG_PREFIX(BtreeBackrefManager::get_mappings); + if (pos.is_end() || pos.get_key() >= end) { + TRACET("{}~{} done with {} results", + c.trans, offset, end, ret.size()); + return BackrefBtree::iterate_repeat_ret_inner( + interruptible::ready_future_marker{}, 
+ seastar::stop_iteration::yes); + } + TRACET("{}~{} got {}, {}, repeat ...", + c.trans, offset, end, pos.get_key(), pos.get_val()); + ceph_assert((pos.get_key().add_offset(pos.get_val().len)) > offset); + ret.emplace_back(pos.get_pin(c)); + return BackrefBtree::iterate_repeat_ret_inner( + interruptible::ready_future_marker{}, + seastar::stop_iteration::no); + }); + }); +} + +BtreeBackrefManager::new_mapping_ret +BtreeBackrefManager::new_mapping( + Transaction &t, + paddr_t key, + extent_len_t len, + laddr_t addr, + extent_types_t type) +{ + ceph_assert( + is_aligned( + key.get_addr_type() == paddr_types_t::SEGMENT ? + key.as_seg_paddr().get_segment_off() : + key.as_blk_paddr().get_device_off(), + cache.get_block_size())); + struct state_t { + paddr_t last_end; + + std::optional<BackrefBtree::iterator> insert_iter; + std::optional<BackrefBtree::iterator> ret; + + state_t(paddr_t hint) : last_end(hint) {} + }; + + LOG_PREFIX(BtreeBackrefManager::new_mapping); + DEBUGT("{}~{}, paddr={}", t, addr, len, key); + backref_map_val_t val{len, addr, type}; + auto c = get_context(t); + //++stats.num_alloc_extents; + //auto lookup_attempts = stats.num_alloc_extents_iter_nexts; + return crimson::os::seastore::with_btree_state<BackrefBtree, state_t>( + cache, + c, + key, + [val, c, key, len, addr, /*lookup_attempts,*/ &t] + (auto &btree, auto &state) { + return BackrefBtree::iterate_repeat( + c, + btree.upper_bound_right(c, key), + [&state, len, addr, &t, key/*, lookup_attempts*/](auto &pos) { + LOG_PREFIX(BtreeBackrefManager::new_mapping); + //++stats.num_alloc_extents_iter_nexts; + if (pos.is_end()) { + DEBUGT("{}~{}, paddr={}, state: end, insert at {}", + t, addr, len, key, + //stats.num_alloc_extents_iter_nexts - lookup_attempts, + state.last_end); + state.insert_iter = pos; + return BackrefBtree::iterate_repeat_ret_inner( + interruptible::ready_future_marker{}, + seastar::stop_iteration::yes); + } else if (pos.get_key() >= (state.last_end.add_offset(len))) { + DEBUGT("{}~{}, paddr={}, state: {}~{}, " + "insert at {} -- {}", + t, addr, len, key, + pos.get_key(), pos.get_val().len, + //stats.num_alloc_extents_iter_nexts - lookup_attempts, + state.last_end, + pos.get_val()); + state.insert_iter = pos; + return BackrefBtree::iterate_repeat_ret_inner( + interruptible::ready_future_marker{}, + seastar::stop_iteration::yes); + } else { + ERRORT("{}~{}, paddr={}, state: {}~{}, repeat ... 
-- {}", + t, addr, len, key, + pos.get_key(), pos.get_val().len, + pos.get_val()); + ceph_abort("not possible for the backref tree"); + return BackrefBtree::iterate_repeat_ret_inner( + interruptible::ready_future_marker{}, + seastar::stop_iteration::no); + } + }).si_then([c, addr, len, key, &btree, &state, val] { + return btree.insert( + c, + *state.insert_iter, + state.last_end, + val, + nullptr + ).si_then([&state, c, addr, len, key](auto &&p) { + LOG_PREFIX(BtreeBackrefManager::new_mapping); + auto [iter, inserted] = std::move(p); + TRACET("{}~{}, paddr={}, inserted at {}, leaf {}", + c.trans, addr, len, key, state.last_end, *iter.get_leaf_node()); + ceph_assert(inserted); + state.ret = iter; + }); + }); + }).si_then([c](auto &&state) { + return new_mapping_iertr::make_ready_future<BackrefMappingRef>( + state.ret->get_pin(c)); + }); +} + +BtreeBackrefManager::merge_cached_backrefs_ret +BtreeBackrefManager::merge_cached_backrefs( + Transaction &t, + const journal_seq_t &limit, + const uint64_t max) +{ + LOG_PREFIX(BtreeBackrefManager::merge_cached_backrefs); + DEBUGT("insert up to {}", t, limit); + return seastar::do_with( + limit, + JOURNAL_SEQ_NULL, + [this, &t, max](auto &limit, auto &inserted_to) { + auto &backref_entryrefs_by_seq = cache.get_backref_entryrefs_by_seq(); + return seastar::do_with( + backref_entryrefs_by_seq.begin(), + JOURNAL_SEQ_NULL, + [this, &t, &limit, &backref_entryrefs_by_seq, max](auto &iter, auto &inserted_to) { + return trans_intr::repeat( + [&iter, this, &t, &limit, &backref_entryrefs_by_seq, max, &inserted_to]() + -> merge_cached_backrefs_iertr::future<seastar::stop_iteration> { + if (iter == backref_entryrefs_by_seq.end()) { + return seastar::make_ready_future<seastar::stop_iteration>( + seastar::stop_iteration::yes); + } + auto &seq = iter->first; + auto &backref_entry_refs = iter->second; + LOG_PREFIX(BtreeBackrefManager::merge_cached_backrefs); + DEBUGT("seq {}, limit {}, num_fresh_backref {}" + , t, seq, limit, t.get_num_fresh_backref()); + if (seq <= limit && t.get_num_fresh_backref() * BACKREF_NODE_SIZE < max) { + inserted_to = seq; + return trans_intr::do_for_each( + backref_entry_refs, + [this, &t](auto &backref_entry_ref) { + LOG_PREFIX(BtreeBackrefManager::merge_cached_backrefs); + auto &backref_entry = *backref_entry_ref; + if (backref_entry.laddr != L_ADDR_NULL) { + DEBUGT("new mapping: {}~{} -> {}", + t, + backref_entry.paddr, + backref_entry.len, + backref_entry.laddr); + return new_mapping( + t, + backref_entry.paddr, + backref_entry.len, + backref_entry.laddr, + backref_entry.type).si_then([](auto &&pin) { + return seastar::now(); + }); + } else { + DEBUGT("remove mapping: {}", t, backref_entry.paddr); + return remove_mapping( + t, + backref_entry.paddr + ).si_then([](auto&&) { + return seastar::now(); + }).handle_error_interruptible( + crimson::ct_error::input_output_error::pass_further(), + crimson::ct_error::assert_all("no enoent possible") + ); + } + }).si_then([&iter] { + iter++; + return seastar::make_ready_future<seastar::stop_iteration>( + seastar::stop_iteration::no); + }); + } + return seastar::make_ready_future<seastar::stop_iteration>( + seastar::stop_iteration::yes); + }).si_then([&inserted_to] { + return seastar::make_ready_future<journal_seq_t>( + std::move(inserted_to)); + }); + }); + return merge_cached_backrefs_iertr::make_ready_future<journal_seq_t>( + std::move(inserted_to)); + }); +} + +BtreeBackrefManager::scan_mapped_space_ret +BtreeBackrefManager::scan_mapped_space( + Transaction &t, + 
BtreeBackrefManager::scan_mapped_space_func_t &&f) +{ + LOG_PREFIX(BtreeBackrefManager::scan_mapped_space); + DEBUGT("scan backref tree", t); + auto c = get_context(t); + return seastar::do_with( + std::move(f), + [this, c, FNAME](auto &scan_visitor) + { + auto block_size = cache.get_block_size(); + // traverse leaf-node entries + return with_btree<BackrefBtree>( + cache, c, + [c, &scan_visitor, block_size, FNAME](auto &btree) + { + return BackrefBtree::iterate_repeat( + c, + btree.lower_bound( + c, + P_ADDR_MIN), + [c, &scan_visitor, block_size, FNAME](auto &pos) { + if (pos.is_end()) { + return BackrefBtree::iterate_repeat_ret_inner( + interruptible::ready_future_marker{}, + seastar::stop_iteration::yes); + } + TRACET("tree value {}~{} {}~{} {} used", + c.trans, + pos.get_key(), + pos.get_val().len, + pos.get_val().laddr, + pos.get_val().len, + pos.get_val().type); + ceph_assert(pos.get_key().is_absolute()); + ceph_assert(pos.get_val().len > 0 && + pos.get_val().len % block_size == 0); + ceph_assert(!is_backref_node(pos.get_val().type)); + ceph_assert(pos.get_val().laddr != L_ADDR_NULL); + scan_visitor( + pos.get_key(), + P_ADDR_NULL, + pos.get_val().len, + pos.get_val().type, + pos.get_val().laddr); + return BackrefBtree::iterate_repeat_ret_inner( + interruptible::ready_future_marker{}, + seastar::stop_iteration::no); + } + ); + }).si_then([this, &scan_visitor, c, FNAME, block_size] { + // traverse alloc-deltas in order + auto &backref_entryrefs = cache.get_backref_entryrefs_by_seq(); + for (auto &[seq, refs] : backref_entryrefs) { + boost::ignore_unused(seq); + DEBUGT("scan {} backref entries", c.trans, refs.size()); + for (auto &backref_entry : refs) { + if (backref_entry->laddr == L_ADDR_NULL) { + TRACET("backref entry {}~{} {} free", + c.trans, + backref_entry->paddr, + backref_entry->len, + backref_entry->type); + } else { + TRACET("backref entry {}~{} {}~{} {} used", + c.trans, + backref_entry->paddr, + backref_entry->len, + backref_entry->laddr, + backref_entry->len, + backref_entry->type); + } + ceph_assert(backref_entry->paddr.is_absolute()); + ceph_assert(backref_entry->len > 0 && + backref_entry->len % block_size == 0); + ceph_assert(!is_backref_node(backref_entry->type)); + scan_visitor( + backref_entry->paddr, + P_ADDR_NULL, + backref_entry->len, + backref_entry->type, + backref_entry->laddr); + } + } + }).si_then([this, &scan_visitor, block_size, c, FNAME] { + BackrefBtree::mapped_space_visitor_t f = + [&scan_visitor, block_size, FNAME, c]( + paddr_t paddr, paddr_t key, extent_len_t len, + depth_t depth, extent_types_t type, BackrefBtree::iterator&) { + TRACET("tree node {}~{} {}, depth={} used", + c.trans, paddr, len, type, depth); + ceph_assert(paddr.is_absolute()); + ceph_assert(len > 0 && len % block_size == 0); + ceph_assert(depth >= 1); + ceph_assert(is_backref_node(type)); + return scan_visitor(paddr, key, len, type, L_ADDR_NULL); + }; + return seastar::do_with( + std::move(f), + [this, c](auto &tree_visitor) + { + // traverse internal-node entries + return with_btree<BackrefBtree>( + cache, c, + [c, &tree_visitor](auto &btree) + { + return BackrefBtree::iterate_repeat( + c, + btree.lower_bound( + c, + P_ADDR_MIN, + &tree_visitor), + [](auto &pos) { + if (pos.is_end()) { + return BackrefBtree::iterate_repeat_ret_inner( + interruptible::ready_future_marker{}, + seastar::stop_iteration::yes); + } + return BackrefBtree::iterate_repeat_ret_inner( + interruptible::ready_future_marker{}, + seastar::stop_iteration::no); + }, + &tree_visitor + ); + }); + }); + }); + }); +} 
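Illustrative sketch (not taken from the diff above): one way a mount/recovery path might drive scan_mapped_space() to rebuild space accounting. SpaceTracker and its allocate()/release() methods are hypothetical stand-ins for whatever structure the caller actually maintains.

// Hypothetical accounting structure, defined only to keep the sketch self-contained.
struct SpaceTracker {
  void allocate(paddr_t addr, extent_len_t len) { /* mark [addr, addr + len) as used */ }
  void release(paddr_t addr, extent_len_t len) { /* mark [addr, addr + len) as free */ }
};

// Feeds every record reported by scan_mapped_space() -- tree leaves, backref
// tree nodes and the not-yet-merged in-cache deltas -- into the tracker.
BackrefManager::scan_mapped_space_ret rebuild_usage(
  BackrefManager &backrefs,
  Transaction &t,
  SpaceTracker &tracker)
{
  return backrefs.scan_mapped_space(
    t,
    [&tracker](paddr_t paddr, paddr_t backref_key, extent_len_t len,
               extent_types_t /* type */, laddr_t laddr) {
      if (laddr == L_ADDR_NULL && backref_key == P_ADDR_NULL) {
        // an in-cache delta recording a free
        tracker.release(paddr, len);
      } else {
        // a data extent (laddr != L_ADDR_NULL) or a backref
        // tree node (backref_key != P_ADDR_NULL)
        tracker.allocate(paddr, len);
      }
    });
}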
+ +BtreeBackrefManager::base_iertr::future<> _init_cached_extent( + op_context_t<paddr_t> c, + const CachedExtentRef &e, + BackrefBtree &btree, + bool &ret) +{ + return btree.init_cached_extent(c, e + ).si_then([&ret](bool is_alive) { + ret = is_alive; + }); +} + +BtreeBackrefManager::init_cached_extent_ret BtreeBackrefManager::init_cached_extent( + Transaction &t, + CachedExtentRef e) +{ + LOG_PREFIX(BtreeBackrefManager::init_cached_extent); + TRACET("{}", t, *e); + return seastar::do_with(bool(), [this, e, &t](bool &ret) { + auto c = get_context(t); + return with_btree<BackrefBtree>(cache, c, [c, e, &ret](auto &btree) + -> base_iertr::future<> { + LOG_PREFIX(BtreeBackrefManager::init_cached_extent); + DEBUGT("extent {}", c.trans, *e); + return _init_cached_extent(c, e, btree, ret); + }).si_then([&ret] { return ret; }); + }); +} + +BtreeBackrefManager::rewrite_extent_ret +BtreeBackrefManager::rewrite_extent( + Transaction &t, + CachedExtentRef extent) +{ + auto c = get_context(t); + return with_btree<BackrefBtree>( + cache, + c, + [c, extent](auto &btree) mutable { + return btree.rewrite_extent(c, extent); + }); +} + +BtreeBackrefManager::remove_mapping_ret +BtreeBackrefManager::remove_mapping( + Transaction &t, + paddr_t addr) +{ + auto c = get_context(t); + return with_btree_ret<BackrefBtree, remove_mapping_result_t>( + cache, + c, + [c, addr](auto &btree) mutable { + return btree.lower_bound( + c, addr + ).si_then([&btree, c, addr](auto iter) + -> remove_mapping_ret { + if (iter.is_end() || iter.get_key() != addr) { + LOG_PREFIX(BtreeBackrefManager::remove_mapping); + WARNT("paddr={} doesn't exist, state: {}, leaf {}", + c.trans, addr, iter.get_key(), *iter.get_leaf_node()); + return remove_mapping_iertr::make_ready_future< + remove_mapping_result_t>(remove_mapping_result_t()); + } + + auto ret = remove_mapping_result_t{ + iter.get_key(), + iter.get_val().len, + iter.get_val().laddr}; + return btree.remove( + c, + iter + ).si_then([ret] { + return ret; + }); + }); + }); +} + +Cache::backref_entry_query_mset_t +BtreeBackrefManager::get_cached_backref_entries_in_range( + paddr_t start, + paddr_t end) +{ + return cache.get_backref_entries_in_range(start, end); +} + +void BtreeBackrefManager::cache_new_backref_extent( + paddr_t paddr, + paddr_t key, + extent_types_t type) +{ + return cache.add_backref_extent(paddr, key, type); +} + +BtreeBackrefManager::retrieve_backref_extents_in_range_ret +BtreeBackrefManager::retrieve_backref_extents_in_range( + Transaction &t, + paddr_t start, + paddr_t end) +{ + auto backref_extents = cache.get_backref_extents_in_range(start, end); + return seastar::do_with( + std::vector<CachedExtentRef>(), + std::move(backref_extents), + [this, &t](auto &extents, auto &backref_extents) { + return trans_intr::parallel_for_each( + backref_extents, + [this, &extents, &t](auto &ent) { + // only the gc fiber which is single can rewrite backref extents, + // so it must be alive + assert(is_backref_node(ent.type)); + LOG_PREFIX(BtreeBackrefManager::retrieve_backref_extents_in_range); + DEBUGT("getting backref extent of type {} at {}, key {}", + t, + ent.type, + ent.paddr, + ent.key); + + auto c = get_context(t); + return with_btree_ret<BackrefBtree, CachedExtentRef>( + cache, + c, + [c, &ent](auto &btree) { + if (ent.type == extent_types_t::BACKREF_INTERNAL) { + return btree.get_internal_if_live( + c, ent.paddr, ent.key, BACKREF_NODE_SIZE); + } else { + assert(ent.type == extent_types_t::BACKREF_LEAF); + return btree.get_leaf_if_live( + c, ent.paddr, ent.key, 
BACKREF_NODE_SIZE); + } + }).si_then([&extents](auto ext) { + ceph_assert(ext); + extents.emplace_back(std::move(ext)); + }); + }).si_then([&extents] { + return std::move(extents); + }); + }); +} + +} // namespace crimson::os::seastore::backref diff --git a/src/crimson/os/seastore/backref/btree_backref_manager.h b/src/crimson/os/seastore/backref/btree_backref_manager.h new file mode 100644 index 000000000..952e78b65 --- /dev/null +++ b/src/crimson/os/seastore/backref/btree_backref_manager.h @@ -0,0 +1,121 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "crimson/os/seastore/backref_manager.h" +#include "crimson/os/seastore/backref/backref_tree_node.h" +#include "crimson/os/seastore/btree/fixed_kv_btree.h" + +namespace crimson::os::seastore::backref { + +constexpr size_t BACKREF_BLOCK_SIZE = 4096; + +class BtreeBackrefMapping : public BtreeNodeMapping<paddr_t, laddr_t> { + extent_types_t type; +public: + BtreeBackrefMapping(op_context_t<paddr_t> ctx) + : BtreeNodeMapping(ctx) {} + BtreeBackrefMapping( + op_context_t<paddr_t> ctx, + CachedExtentRef parent, + uint16_t pos, + backref_map_val_t &val, + backref_node_meta_t &&meta) + : BtreeNodeMapping( + ctx, + parent, + pos, + val.laddr, + val.len, + std::forward<backref_node_meta_t>(meta)), + type(val.type) + {} + extent_types_t get_type() const final { + return type; + } + +protected: + std::unique_ptr<BtreeNodeMapping<paddr_t, laddr_t>> _duplicate( + op_context_t<paddr_t> ctx) const final { + return std::unique_ptr<BtreeNodeMapping<paddr_t, laddr_t>>( + new BtreeBackrefMapping(ctx)); + } +}; + +using BackrefBtree = FixedKVBtree< + paddr_t, backref_map_val_t, BackrefInternalNode, + BackrefLeafNode, BtreeBackrefMapping, BACKREF_BLOCK_SIZE, false>; + +class BtreeBackrefManager : public BackrefManager { +public: + + BtreeBackrefManager(Cache &cache) + : cache(cache) + {} + + mkfs_ret mkfs( + Transaction &t) final; + + get_mapping_ret get_mapping( + Transaction &t, + paddr_t offset) final; + + get_mappings_ret get_mappings( + Transaction &t, + paddr_t offset, + paddr_t end) final; + + new_mapping_ret new_mapping( + Transaction &t, + paddr_t key, + extent_len_t len, + laddr_t val, + extent_types_t type) final; + + merge_cached_backrefs_ret merge_cached_backrefs( + Transaction &t, + const journal_seq_t &limit, + const uint64_t max) final; + + remove_mapping_ret remove_mapping( + Transaction &t, + paddr_t offset) final; + + scan_mapped_space_ret scan_mapped_space( + Transaction &t, + scan_mapped_space_func_t &&f) final; + + init_cached_extent_ret init_cached_extent( + Transaction &t, + CachedExtentRef e) final; + + rewrite_extent_ret rewrite_extent( + Transaction &t, + CachedExtentRef extent) final; + + Cache::backref_entry_query_mset_t + get_cached_backref_entries_in_range( + paddr_t start, + paddr_t end) final; + + retrieve_backref_extents_in_range_ret + retrieve_backref_extents_in_range( + Transaction &t, + paddr_t start, + paddr_t end) final; + + void cache_new_backref_extent( + paddr_t paddr, + paddr_t key, + extent_types_t type) final; + +private: + Cache &cache; + + op_context_t<paddr_t> get_context(Transaction &t) { + return op_context_t<paddr_t>{cache, t}; + } +}; + +} // namespace crimson::os::seastore::backref diff --git a/src/crimson/os/seastore/backref_manager.cc b/src/crimson/os/seastore/backref_manager.cc new file mode 100644 index 000000000..c596ee41d --- /dev/null +++ b/src/crimson/os/seastore/backref_manager.cc @@ -0,0 +1,18 @@ +// -*- mode:C++; 
tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "crimson/os/seastore/cache.h" +#include "crimson/os/seastore/backref_manager.h" +#include "crimson/os/seastore/backref/btree_backref_manager.h" + +namespace crimson::os::seastore { + +BackrefManagerRef create_backref_manager( + Cache &cache) +{ + return BackrefManagerRef( + new backref::BtreeBackrefManager(cache)); +} + +} // namespace crimson::os::seastore::backref + diff --git a/src/crimson/os/seastore/backref_manager.h b/src/crimson/os/seastore/backref_manager.h new file mode 100644 index 000000000..3feedb997 --- /dev/null +++ b/src/crimson/os/seastore/backref_manager.h @@ -0,0 +1,152 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "crimson/os/seastore/cache.h" +#include "crimson/os/seastore/cached_extent.h" +#include "crimson/os/seastore/transaction.h" + +namespace crimson::os::seastore { + +/** + * Abstract interface for managing back references that map paddr_t to laddr_t + */ +class BackrefManager { +public: + using base_ertr = crimson::errorator< + crimson::ct_error::input_output_error>; + using base_iertr = trans_iertr<base_ertr>; + + using mkfs_iertr = base_iertr; + using mkfs_ret = mkfs_iertr::future<>; + virtual mkfs_ret mkfs( + Transaction &t) = 0; + + /** + * Fetches mappings for paddr_t in range [offset, offset + len) + * + * Future will not resolve until all pins have resolved + */ + using get_mappings_iertr = base_iertr; + using get_mappings_ret = get_mappings_iertr::future<backref_pin_list_t>; + virtual get_mappings_ret get_mappings( + Transaction &t, + paddr_t offset, + paddr_t end) = 0; + + /** + * Fetches the mapping for paddr_t + * + * Future will not resolve until the pin has resolved + */ + using get_mapping_iertr = base_iertr::extend< + crimson::ct_error::enoent>; + using get_mapping_ret = get_mapping_iertr::future<BackrefMappingRef>; + virtual get_mapping_ret get_mapping( + Transaction &t, + paddr_t offset) = 0; + + /** + * rewrite_extent + * + * rewrite extent into passed transaction + */ + using rewrite_extent_iertr = base_iertr; + using rewrite_extent_ret = rewrite_extent_iertr::future<>; + virtual rewrite_extent_ret rewrite_extent( + Transaction &t, + CachedExtentRef extent) = 0; + + /** + * Insert new paddr_t -> laddr_t mapping + */ + using new_mapping_iertr = base_iertr; + using new_mapping_ret = new_mapping_iertr::future<BackrefMappingRef>; + virtual new_mapping_ret new_mapping( + Transaction &t, + paddr_t key, + extent_len_t len, + laddr_t val, + extent_types_t type) = 0; + + /** + * Check if a CachedExtent is alive, should be called + * after replay on each cached extent. 
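 * (Illustrative usage, an assumption about the caller rather than something
 *  stated here: after journal replay one would walk every extent already
 *  resident in the cache and retire those for which this returns false.)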
+ * + * @return returns whether the extent is alive + */ + using init_cached_extent_iertr = base_iertr; + using init_cached_extent_ret = init_cached_extent_iertr::future<bool>; + virtual init_cached_extent_ret init_cached_extent( + Transaction &t, + CachedExtentRef e) = 0; + + virtual Cache::backref_entry_query_mset_t + get_cached_backref_entries_in_range( + paddr_t start, + paddr_t end) = 0; + + using retrieve_backref_extents_in_range_iertr = base_iertr; + using retrieve_backref_extents_in_range_ret = + retrieve_backref_extents_in_range_iertr::future<std::vector<CachedExtentRef>>; + virtual retrieve_backref_extents_in_range_ret + retrieve_backref_extents_in_range( + Transaction &t, + paddr_t start, + paddr_t end) = 0; + + virtual void cache_new_backref_extent( + paddr_t paddr, + paddr_t key, + extent_types_t type) = 0; + + /** + * merge in-cache paddr_t -> laddr_t mappings to the on-disk backref tree + */ + using merge_cached_backrefs_iertr = base_iertr; + using merge_cached_backrefs_ret = merge_cached_backrefs_iertr::future<journal_seq_t>; + virtual merge_cached_backrefs_ret merge_cached_backrefs( + Transaction &t, + const journal_seq_t &limit, + const uint64_t max) = 0; + + struct remove_mapping_result_t { + paddr_t offset = P_ADDR_NULL; + extent_len_t len = 0; + laddr_t laddr = L_ADDR_NULL; + }; + + /** + * delete the mapping for paddr_t offset + */ + using remove_mapping_iertr = base_iertr::extend< + crimson::ct_error::enoent>; + using remove_mapping_ret = remove_mapping_iertr::future<remove_mapping_result_t>; + virtual remove_mapping_ret remove_mapping( + Transaction &t, + paddr_t offset) = 0; + + /** + * scan all extents in both tree and cache, + * including backref extents, logical extents and lba extents, + * visit them with scan_mapped_space_func_t + */ + using scan_mapped_space_iertr = base_iertr; + using scan_mapped_space_ret = scan_mapped_space_iertr::future<>; + using scan_mapped_space_func_t = std::function< + void(paddr_t, paddr_t, extent_len_t, extent_types_t, laddr_t)>; + virtual scan_mapped_space_ret scan_mapped_space( + Transaction &t, + scan_mapped_space_func_t &&f) = 0; + + virtual ~BackrefManager() {} +}; + +using BackrefManagerRef = + std::unique_ptr<BackrefManager>; + +BackrefManagerRef create_backref_manager( + Cache &cache); + +} // namespace crimson::os::seastore::backref diff --git a/src/crimson/os/seastore/btree/btree_range_pin.cc b/src/crimson/os/seastore/btree/btree_range_pin.cc new file mode 100644 index 000000000..2f801dcf1 --- /dev/null +++ b/src/crimson/os/seastore/btree/btree_range_pin.cc @@ -0,0 +1,27 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "crimson/os/seastore/btree/btree_range_pin.h" +#include "crimson/os/seastore/btree/fixed_kv_node.h" + +namespace crimson::os::seastore { + +template <typename key_t, typename val_t> +get_child_ret_t<LogicalCachedExtent> +BtreeNodeMapping<key_t, val_t>::get_logical_extent( + Transaction &t) +{ + assert(parent); + assert(parent->is_valid()); + assert(pos != std::numeric_limits<uint16_t>::max()); + auto &p = (FixedKVNode<key_t>&)*parent; + auto v = p.get_logical_child(ctx, pos); + if (!v.has_child()) { + this->child_pos = v.get_child_pos(); + } + return v; +} + +template class BtreeNodeMapping<laddr_t, paddr_t>; +template class BtreeNodeMapping<paddr_t, laddr_t>; +} // namespace crimson::os::seastore diff --git a/src/crimson/os/seastore/btree/btree_range_pin.h b/src/crimson/os/seastore/btree/btree_range_pin.h new file mode 100644 index 
000000000..68188e9ff --- /dev/null +++ b/src/crimson/os/seastore/btree/btree_range_pin.h @@ -0,0 +1,204 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <boost/intrusive/set.hpp> + +#include "crimson/common/log.h" + +#include "crimson/os/seastore/cache.h" +#include "crimson/os/seastore/cached_extent.h" +#include "crimson/os/seastore/seastore_types.h" + +namespace crimson::os::seastore { + +template <typename node_key_t> +struct op_context_t { + Cache &cache; + Transaction &trans; +}; + +constexpr uint16_t MAX_FIXEDKVBTREE_DEPTH = 8; + +template <typename bound_t> +struct fixed_kv_node_meta_t { + bound_t begin = min_max_t<bound_t>::min; + bound_t end = min_max_t<bound_t>::min; + depth_t depth = 0; + + bool is_parent_of(const fixed_kv_node_meta_t &other) const { + return (depth == other.depth + 1) && + (begin <= other.begin) && + (end > other.begin); + } + + bool is_in_range(const bound_t key) const { + return begin <= key && end > key; + } + + std::pair<fixed_kv_node_meta_t, fixed_kv_node_meta_t> split_into(bound_t pivot) const { + return std::make_pair( + fixed_kv_node_meta_t{begin, pivot, depth}, + fixed_kv_node_meta_t{pivot, end, depth}); + } + + static fixed_kv_node_meta_t merge_from( + const fixed_kv_node_meta_t &lhs, const fixed_kv_node_meta_t &rhs) { + ceph_assert(lhs.depth == rhs.depth); + return fixed_kv_node_meta_t{lhs.begin, rhs.end, lhs.depth}; + } + + static std::pair<fixed_kv_node_meta_t, fixed_kv_node_meta_t> + rebalance(const fixed_kv_node_meta_t &lhs, const fixed_kv_node_meta_t &rhs, bound_t pivot) { + ceph_assert(lhs.depth == rhs.depth); + return std::make_pair( + fixed_kv_node_meta_t{lhs.begin, pivot, lhs.depth}, + fixed_kv_node_meta_t{pivot, rhs.end, lhs.depth}); + } + + bool is_root() const { + return begin == min_max_t<bound_t>::min && end == min_max_t<bound_t>::max; + } +}; + +template <typename bound_t> +inline std::ostream &operator<<( + std::ostream &lhs, + const fixed_kv_node_meta_t<bound_t> &rhs) +{ + return lhs << "btree_node_meta_t(" + << "begin=" << rhs.begin + << ", end=" << rhs.end + << ", depth=" << rhs.depth + << ")"; +} + +/** + * fixed_kv_node_meta_le_t + * + * On disk layout for fixed_kv_node_meta_t + */ +template <typename bound_le_t> +struct fixed_kv_node_meta_le_t { + bound_le_t begin = bound_le_t(0); + bound_le_t end = bound_le_t(0); + depth_le_t depth = init_depth_le(0); + + fixed_kv_node_meta_le_t() = default; + fixed_kv_node_meta_le_t( + const fixed_kv_node_meta_le_t<bound_le_t> &) = default; + explicit fixed_kv_node_meta_le_t( + const fixed_kv_node_meta_t<typename bound_le_t::orig_type> &val) + : begin(val.begin), + end(val.end), + depth(init_depth_le(val.depth)) {} + + operator fixed_kv_node_meta_t<typename bound_le_t::orig_type>() const { + return fixed_kv_node_meta_t<typename bound_le_t::orig_type>{ + begin, end, depth }; + } +}; + +template <typename key_t, typename val_t> +class BtreeNodeMapping : public PhysicalNodeMapping<key_t, val_t> { +protected: + op_context_t<key_t> ctx; + /** + * parent + * + * populated until link_extent is called to ensure cache residence + * until add_pin is called. 
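 * (In other words, this member holds a strong CachedExtentRef to the owning
 *  btree node, so that node stays cache-resident for as long as the
 *  reference is held.)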
+ */ + CachedExtentRef parent; + + pladdr_t value; + extent_len_t len = 0; + fixed_kv_node_meta_t<key_t> range; + uint16_t pos = std::numeric_limits<uint16_t>::max(); + + virtual std::unique_ptr<BtreeNodeMapping> _duplicate(op_context_t<key_t>) const = 0; + fixed_kv_node_meta_t<key_t> _get_pin_range() const { + return range; + } + +public: + using val_type = val_t; + BtreeNodeMapping(op_context_t<key_t> ctx) : ctx(ctx) {} + + BtreeNodeMapping( + op_context_t<key_t> ctx, + CachedExtentRef parent, + uint16_t pos, + pladdr_t value, + extent_len_t len, + fixed_kv_node_meta_t<key_t> meta) + : ctx(ctx), + parent(parent), + value(value), + len(len), + range(meta), + pos(pos) + { + if (!parent->is_pending()) { + this->child_pos = {parent, pos}; + } + } + + CachedExtentRef get_parent() const final { + return parent; + } + + CachedExtentRef get_parent() { + return parent; + } + + void set_parent(CachedExtentRef ext) { + parent = ext; + } + + uint16_t get_pos() const final { + return pos; + } + + extent_len_t get_length() const final { + ceph_assert(range.end > range.begin); + return len; + } + + extent_types_t get_type() const override { + ceph_abort("should never happen"); + return extent_types_t::ROOT; + } + + val_t get_val() const final { + if constexpr (std::is_same_v<val_t, paddr_t>) { + return value.get_paddr(); + } else { + static_assert(std::is_same_v<val_t, laddr_t>); + return value.get_laddr(); + } + } + + key_t get_key() const override { + return range.begin; + } + + PhysicalNodeMappingRef<key_t, val_t> duplicate() const final { + auto ret = _duplicate(ctx); + ret->range = range; + ret->value = value; + ret->parent = parent; + ret->len = len; + ret->pos = pos; + return ret; + } + + bool has_been_invalidated() const final { + return parent->has_been_invalidated(); + } + + get_child_ret_t<LogicalCachedExtent> get_logical_extent(Transaction&) final; +}; + +} diff --git a/src/crimson/os/seastore/btree/fixed_kv_btree.h b/src/crimson/os/seastore/btree/fixed_kv_btree.h new file mode 100644 index 000000000..2970d0440 --- /dev/null +++ b/src/crimson/os/seastore/btree/fixed_kv_btree.h @@ -0,0 +1,2251 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab expandtab + +#pragma once + +#include <boost/container/static_vector.hpp> +#include <sys/mman.h> +#include <memory> +#include <string.h> + +#include "crimson/os/seastore/logging.h" + +#include "crimson/os/seastore/cache.h" +#include "crimson/os/seastore/seastore_types.h" +#include "crimson/os/seastore/btree/btree_range_pin.h" +#include "crimson/os/seastore/root_block.h" + +#define RESERVATION_PTR reinterpret_cast<ChildableCachedExtent*>(0x1) + +namespace crimson::os::seastore::lba_manager::btree { +struct lba_map_val_t; +} + +namespace crimson::os::seastore { + +bool is_valid_child_ptr(ChildableCachedExtent* child); + +template <typename T> +phy_tree_root_t& get_phy_tree_root(root_t& r); + +using get_child_iertr = + ::crimson::interruptible::interruptible_errorator< + typename trans_intr::condition, + get_child_ertr>; +using get_phy_tree_root_node_ret = + std::pair<bool, get_child_iertr::future<CachedExtentRef>>; + +template <typename T, typename key_t> +const get_phy_tree_root_node_ret get_phy_tree_root_node( + const RootBlockRef &root_block, + op_context_t<key_t> c); + +template <typename ROOT_T> +void link_phy_tree_root_node(RootBlockRef &root_block, ROOT_T* root_node); + +template <typename T> +void unlink_phy_tree_root_node(RootBlockRef &root_block); + +template <typename T> 
+Transaction::tree_stats_t& get_tree_stats(Transaction &t); + +template < + typename node_key_t, + typename node_val_t, + typename internal_node_t, + typename leaf_node_t, + typename pin_t, + size_t node_size, + bool leaf_has_children> +class FixedKVBtree { + static constexpr size_t MAX_DEPTH = 16; + using self_type = FixedKVBtree< + node_key_t, + node_val_t, + internal_node_t, + leaf_node_t, + pin_t, + node_size, + leaf_has_children>; +public: + using InternalNodeRef = TCachedExtentRef<internal_node_t>; + using LeafNodeRef = TCachedExtentRef<leaf_node_t>; + + using base_ertr = crimson::errorator< + crimson::ct_error::input_output_error>; + using base_iertr = trans_iertr<base_ertr>; + + class iterator; + using iterator_fut = base_iertr::future<iterator>; + + using mapped_space_visitor_t = std::function< + void(paddr_t, node_key_t, extent_len_t, depth_t, extent_types_t, iterator&)>; + + class iterator { + public: + iterator(const iterator &rhs) noexcept : + internal(rhs.internal), leaf(rhs.leaf) {} + iterator(iterator &&rhs) noexcept : + internal(std::move(rhs.internal)), leaf(std::move(rhs.leaf)) {} + + iterator &operator=(const iterator &) = default; + iterator &operator=(iterator &&) = default; + + iterator_fut next( + op_context_t<node_key_t> c, + mapped_space_visitor_t *visitor=nullptr) const + { + assert_valid(); + assert(!is_end()); + + auto ret = *this; + ret.leaf.pos++; + if (ret.at_boundary()) { + return seastar::do_with( + ret, + [c, visitor](auto &ret) mutable { + return ret.handle_boundary( + c, visitor + ).si_then([&ret] { + return std::move(ret); + }); + }); + } else { + return iterator_fut( + interruptible::ready_future_marker{}, + ret); + } + + } + + iterator_fut prev(op_context_t<node_key_t> c) const + { + assert_valid(); + assert(!is_begin()); + + auto ret = *this; + + if (ret.leaf.pos > 0) { + ret.leaf.pos--; + return iterator_fut( + interruptible::ready_future_marker{}, + ret); + } + + depth_t depth_with_space = 2; + for (; depth_with_space <= get_depth(); ++depth_with_space) { + if (ret.get_internal(depth_with_space).pos > 0) { + break; + } + } + + assert(depth_with_space <= ret.get_depth()); // must not be begin() + return seastar::do_with( + std::move(ret), + [](const internal_node_t &internal) { return --internal.end(); }, + [](const leaf_node_t &leaf) { return --leaf.end(); }, + [c, depth_with_space](auto &ret, auto &li, auto &ll) { + for (depth_t depth = 2; depth < depth_with_space; ++depth) { + ret.get_internal(depth).reset(); + } + ret.leaf.reset(); + ret.get_internal(depth_with_space).pos--; + // note, cannot result in at_boundary() by construction + return lookup_depth_range( + c, ret, depth_with_space - 1, 0, li, ll, nullptr + ).si_then([&ret] { + assert(!ret.at_boundary()); + return std::move(ret); + }); + }); + } + + void assert_valid() const { + assert(leaf.node); + assert(leaf.pos <= leaf.node->get_size()); + + for (auto &i: internal) { + (void)i; + assert(i.node); + assert(i.pos < i.node->get_size()); + } + } + + depth_t get_depth() const { + return internal.size() + 1; + } + + auto &get_internal(depth_t depth) { + assert(depth > 1); + assert((depth - 2) < internal.size()); + return internal[depth - 2]; + } + + const auto &get_internal(depth_t depth) const { + assert(depth > 1); + assert((depth - 2) < internal.size()); + return internal[depth - 2]; + } + + node_key_t get_key() const { + assert(!is_end()); + return leaf.node->iter_idx(leaf.pos).get_key(); + } + node_val_t get_val() const { + assert(!is_end()); + auto ret = 
leaf.node->iter_idx(leaf.pos).get_val(); + if constexpr ( + std::is_same_v<crimson::os::seastore::lba_manager::btree::lba_map_val_t, + node_val_t>) { + if (ret.pladdr.is_paddr()) { + ret.pladdr = ret.pladdr.get_paddr().maybe_relative_to( + leaf.node->get_paddr()); + } + } + return ret; + } + + bool is_end() const { + // external methods may only resolve at a boundary if at end + return at_boundary(); + } + + bool is_begin() const { + for (auto &i: internal) { + if (i.pos != 0) + return false; + } + return leaf.pos == 0; + } + + std::unique_ptr<pin_t> get_pin(op_context_t<node_key_t> ctx) const { + assert(!is_end()); + auto val = get_val(); + auto key = get_key(); + return std::make_unique<pin_t>( + ctx, + leaf.node, + leaf.pos, + val, + fixed_kv_node_meta_t<node_key_t>{ key, key + val.len, 0 }); + } + + typename leaf_node_t::Ref get_leaf_node() { + return leaf.node; + } + + uint16_t get_leaf_pos() { + return leaf.pos; + } + private: + iterator() noexcept {} + iterator(depth_t depth) noexcept : internal(depth - 1) {} + + friend class FixedKVBtree; + static constexpr uint16_t INVALID = std::numeric_limits<uint16_t>::max(); + template <typename NodeType> + struct node_position_t { + typename NodeType::Ref node; + uint16_t pos = INVALID; + + node_position_t() = default; + node_position_t( + typename NodeType::Ref node, + uint16_t pos) + : node(node), pos(pos) {} + + void reset() { + *this = node_position_t{}; + } + + auto get_iter() { + assert(pos != INVALID); + assert(pos < node->get_size()); + return node->iter_idx(pos); + } + }; + boost::container::static_vector< + node_position_t<internal_node_t>, MAX_DEPTH> internal; + node_position_t<leaf_node_t> leaf; + + bool at_boundary() const { + assert(leaf.pos <= leaf.node->get_size()); + return leaf.pos == leaf.node->get_size(); + } + + using handle_boundary_ertr = base_iertr; + using handle_boundary_ret = handle_boundary_ertr::future<>; + handle_boundary_ret handle_boundary( + op_context_t<node_key_t> c, + mapped_space_visitor_t *visitor) + { + assert(at_boundary()); + depth_t depth_with_space = 2; + for (; depth_with_space <= get_depth(); ++depth_with_space) { + if ((get_internal(depth_with_space).pos + 1) < + get_internal(depth_with_space).node->get_size()) { + break; + } + } + + if (depth_with_space <= get_depth()) { + return seastar::do_with( + [](const internal_node_t &internal) { return internal.begin(); }, + [](const leaf_node_t &leaf) { return leaf.begin(); }, + [this, c, depth_with_space, visitor](auto &li, auto &ll) { + for (depth_t depth = 2; depth < depth_with_space; ++depth) { + get_internal(depth).reset(); + } + leaf.reset(); + get_internal(depth_with_space).pos++; + // note, cannot result in at_boundary() by construction + return lookup_depth_range( + c, *this, depth_with_space - 1, 0, li, ll, visitor + ); + }); + } else { + // end + return seastar::now(); + } + } + + depth_t check_split() const { + if (!leaf.node->at_max_capacity()) { + return 0; + } + for (depth_t split_from = 1; split_from < get_depth(); ++split_from) { + if (!get_internal(split_from + 1).node->at_max_capacity()) + return split_from; + } + return get_depth(); + } + + depth_t check_merge() const { + if (!leaf.node->below_min_capacity()) { + return 0; + } + for (depth_t merge_from = 1; merge_from < get_depth(); ++merge_from) { + if (!get_internal(merge_from + 1).node->below_min_capacity()) + return merge_from; + } + return get_depth(); + } + }; + + FixedKVBtree(RootBlockRef &root_block) : root_block(root_block) {} + + auto& get_root() { + return 
get_phy_tree_root<self_type>(root_block->get_root()); + } + + auto& get_root() const { + return get_phy_tree_root<self_type>(root_block->get_root()); + } + + template <typename T> + void set_root_node(const TCachedExtentRef<T> &root_node) { + static_assert(std::is_base_of_v<typename internal_node_t::base_t, T>); + link_phy_tree_root_node(root_block, root_node.get()); + } + + auto get_root_node(op_context_t<node_key_t> c) const { + return get_phy_tree_root_node<self_type>(root_block, c); + } + + /// mkfs + using mkfs_ret = phy_tree_root_t; + static mkfs_ret mkfs(RootBlockRef &root_block, op_context_t<node_key_t> c) { + assert(root_block->is_mutation_pending()); + auto root_leaf = c.cache.template alloc_new_extent<leaf_node_t>( + c.trans, + node_size, + placement_hint_t::HOT, + INIT_GENERATION); + root_leaf->set_size(0); + fixed_kv_node_meta_t<node_key_t> meta{min_max_t<node_key_t>::min, min_max_t<node_key_t>::max, 1}; + root_leaf->set_meta(meta); + root_leaf->range = meta; + get_tree_stats<self_type>(c.trans).depth = 1u; + get_tree_stats<self_type>(c.trans).extents_num_delta++; + link_phy_tree_root_node(root_block, root_leaf.get()); + return phy_tree_root_t{root_leaf->get_paddr(), 1u}; + } + + /** + * lower_bound + * + * @param c [in] context + * @param addr [in] ddr + * @return least iterator >= key + */ + iterator_fut lower_bound( + op_context_t<node_key_t> c, + node_key_t addr, + mapped_space_visitor_t *visitor=nullptr, + depth_t min_depth = 1) const + { + LOG_PREFIX(FixedKVBtree::lower_bound); + return lookup( + c, + [addr](const internal_node_t &internal) { + assert(internal.get_size() > 0); + auto iter = internal.upper_bound(addr); + assert(iter != internal.begin()); + --iter; + return iter; + }, + [FNAME, c, addr](const leaf_node_t &leaf) { + auto ret = leaf.lower_bound(addr); + SUBTRACET( + seastore_fixedkv_tree, + "leaf addr {}, got ret offset {}, size {}, end {}", + c.trans, + addr, + ret.get_offset(), + leaf.get_size(), + ret == leaf.end()); + return ret; + }, + min_depth, + visitor + ).si_then([FNAME, c, min_depth](auto &&ret) { + SUBTRACET( + seastore_fixedkv_tree, + "ret.leaf.pos {}", + c.trans, + ret.leaf.pos); + if (min_depth == 1) { + ret.assert_valid(); + } + return std::move(ret); + }); + } + + + /** + * upper_bound + * + * @param c [in] context + * @param addr [in] ddr + * @return least iterator > key + */ + iterator_fut upper_bound( + op_context_t<node_key_t> c, + node_key_t addr + ) const { + return lower_bound( + c, addr + ).si_then([c, addr](auto iter) { + if (!iter.is_end() && iter.get_key() == addr) { + return iter.next(c); + } else { + return iterator_fut( + interruptible::ready_future_marker{}, + iter); + } + }); + } + + /** + * upper_bound_right + * + * @param c [in] context + * @param addr [in] addr + * @return least iterator i s.t. 
i.get_key() + i.get_val().len > key + */ + iterator_fut upper_bound_right( + op_context_t<node_key_t> c, + node_key_t addr) const + { + return lower_bound( + c, addr + ).si_then([c, addr](auto iter) { + if (iter.is_begin()) { + return iterator_fut( + interruptible::ready_future_marker{}, + iter); + } else { + return iter.prev( + c + ).si_then([iter, addr](auto prev) { + if ((prev.get_key() + prev.get_val().len) > addr) { + return iterator_fut( + interruptible::ready_future_marker{}, + prev); + } else { + return iterator_fut( + interruptible::ready_future_marker{}, + iter); + } + }); + } + }); + } + + iterator_fut begin(op_context_t<node_key_t> c) const { + return lower_bound(c, 0); + } + iterator_fut end(op_context_t<node_key_t> c) const { + return upper_bound(c, min_max_t<node_key_t>::max); + } + + template <typename child_node_t, typename node_t, bool lhc = leaf_has_children, + typename std::enable_if<lhc, int>::type = 0> + void check_node( + op_context_t<node_key_t> c, + TCachedExtentRef<node_t> node) + { + assert(leaf_has_children); + for (auto i : *node) { + CachedExtentRef child_node; + Transaction::get_extent_ret ret; + + if constexpr (std::is_base_of_v<typename internal_node_t::base_t, child_node_t>) { + ret = c.trans.get_extent( + i->get_val().maybe_relative_to(node->get_paddr()), + &child_node); + } else { + assert(i->get_val().pladdr.is_paddr()); + ret = c.trans.get_extent( + i->get_val().pladdr.get_paddr().maybe_relative_to(node->get_paddr()), + &child_node); + } + if (ret == Transaction::get_extent_ret::PRESENT) { + if (child_node->is_stable()) { + assert(child_node->is_valid()); + auto cnode = child_node->template cast<child_node_t>(); + assert(cnode->has_parent_tracker()); + if (node->is_pending()) { + auto &n = node->get_stable_for_key(i->get_key()); + assert(cnode->get_parent_node().get() == &n); + auto pos = n.lower_bound_offset(i->get_key()); + assert(pos < n.get_node_size()); + assert(n.children[pos] == cnode.get()); + } else { + assert(cnode->get_parent_node().get() == node.get()); + assert(node->children[i->get_offset()] == cnode.get()); + } + } else if (child_node->is_pending()) { + if (child_node->is_mutation_pending()) { + auto &prior = (child_node_t &)*child_node->prior_instance; + assert(prior.is_valid()); + assert(prior.is_parent_valid()); + if (node->is_mutation_pending()) { + auto &n = node->get_stable_for_key(i->get_key()); + assert(prior.get_parent_node().get() == &n); + auto pos = n.lower_bound_offset(i->get_key()); + assert(pos < n.get_node_size()); + assert(n.children[pos] == &prior); + } else { + assert(prior.get_parent_node().get() == node.get()); + assert(node->children[i->get_offset()] == &prior); + } + } else { + auto cnode = child_node->template cast<child_node_t>(); + auto pos = node->find(i->get_key()).get_offset(); + auto child = node->children[pos]; + assert(child); + assert(child == cnode.get()); + assert(cnode->is_parent_valid()); + } + } else { + ceph_assert(!child_node->is_valid()); + ceph_abort("impossible"); + } + } else if (ret == Transaction::get_extent_ret::ABSENT) { + ChildableCachedExtent* child = nullptr; + if (node->is_pending()) { + auto &n = node->get_stable_for_key(i->get_key()); + auto pos = n.lower_bound_offset(i->get_key()); + assert(pos < n.get_node_size()); + child = n.children[pos]; + if (is_valid_child_ptr(child)) { + auto c = (child_node_t*)child; + assert(c->has_parent_tracker()); + assert(c->get_parent_node().get() == &n); + } + } else { + child = node->children[i->get_offset()]; + if (is_valid_child_ptr(child)) { + 
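          // the transaction has not loaded this child, but the stable node
          // may still hold a live in-memory pointer to it; when it does, the
          // child's parent tracker must point back at this node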
auto c = (child_node_t*)child; + assert(c->has_parent_tracker()); + assert(c->get_parent_node().get() == node.get()); + } + } + + if (!is_valid_child_ptr(child)) { + if constexpr ( + std::is_base_of_v<typename internal_node_t::base_t, child_node_t>) + { + assert(!c.cache.query_cache(i->get_val(), nullptr)); + } else { + if constexpr (leaf_has_children) { + assert(i->get_val().pladdr.is_paddr() + ? (bool)!c.cache.query_cache( + i->get_val().pladdr.get_paddr(), nullptr) + : true); + } + } + } + } else { + ceph_abort("impossible"); + } + } + } + + using check_child_trackers_ret = base_iertr::future<>; + template <bool lhc = leaf_has_children, + typename std::enable_if<lhc, int>::type = 0> + check_child_trackers_ret check_child_trackers( + op_context_t<node_key_t> c) { + mapped_space_visitor_t checker = [c, this]( + paddr_t, + node_key_t, + extent_len_t, + depth_t depth, + extent_types_t, + iterator& iter) { + if constexpr (!leaf_has_children) { + if (depth == 1) { + return seastar::now(); + } + } + if (depth > 1) { + auto &node = iter.get_internal(depth).node; + assert(node->is_valid()); + check_node<typename internal_node_t::base_t>(c, node); + } else { + assert(depth == 1); + auto &node = iter.leaf.node; + assert(node->is_valid()); + check_node<LogicalCachedExtent>(c, node); + } + return seastar::now(); + }; + + return seastar::do_with( + std::move(checker), + [this, c](auto &checker) { + return iterate_repeat( + c, + lower_bound( + c, + min_max_t<node_key_t>::min, + &checker), + [](auto &pos) { + if (pos.is_end()) { + return base_iertr::make_ready_future< + seastar::stop_iteration>( + seastar::stop_iteration::yes); + } + return base_iertr::make_ready_future< + seastar::stop_iteration>( + seastar::stop_iteration::no); + }, + &checker); + }); + } + + using iterate_repeat_ret_inner = base_iertr::future< + seastar::stop_iteration>; + template <typename F> + static base_iertr::future<> iterate_repeat( + op_context_t<node_key_t> c, + iterator_fut &&iter_fut, + F &&f, + mapped_space_visitor_t *visitor=nullptr) { + return std::move( + iter_fut + ).si_then([c, visitor, f=std::forward<F>(f)](auto iter) { + return seastar::do_with( + iter, + std::move(f), + [c, visitor](auto &pos, auto &f) { + return trans_intr::repeat( + [c, visitor, &f, &pos] { + return f( + pos + ).si_then([c, visitor, &pos](auto done) { + if (done == seastar::stop_iteration::yes) { + return iterate_repeat_ret_inner( + interruptible::ready_future_marker{}, + seastar::stop_iteration::yes); + } else { + ceph_assert(!pos.is_end()); + return pos.next( + c, visitor + ).si_then([&pos](auto next) { + pos = next; + return iterate_repeat_ret_inner( + interruptible::ready_future_marker{}, + seastar::stop_iteration::no); + }); + } + }); + }); + }); + }); + } + + /** + * insert + * + * Inserts val at laddr with iter as a hint. If element at laddr already + * exists returns iterator to that element unchanged and returns false. + * + * Invalidates all outstanding iterators for this tree on this transaction. + * + * @param c [in] op context + * @param iter [in] hint, insertion constant if immediately prior to iter + * @param laddr [in] addr at which to insert + * @param val [in] val to insert + * @return pair<iter, bool> where iter points to element at addr, bool true + * iff element at laddr did not exist. 
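 * (Illustrative call pattern, mirroring the convenience overload below:
 *    lower_bound(c, laddr).si_then([=, this](auto iter) {
 *      return insert(c, iter, laddr, val, extent);
 *    })
 *  where a result of {it, false} means laddr was already mapped and the
 *  tree was left unchanged.)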
+ */ + using insert_iertr = base_iertr; + using insert_ret = insert_iertr::future<std::pair<iterator, bool>>; + insert_ret insert( + op_context_t<node_key_t> c, + iterator iter, + node_key_t laddr, + node_val_t val, + LogicalCachedExtent* nextent + ) { + LOG_PREFIX(FixedKVBtree::insert); + SUBTRACET( + seastore_fixedkv_tree, + "inserting laddr {} at iter {}", + c.trans, + laddr, + iter.is_end() ? min_max_t<node_key_t>::max : iter.get_key()); + return seastar::do_with( + iter, + [this, c, laddr, val, nextent](auto &ret) { + return find_insertion( + c, laddr, ret + ).si_then([this, c, laddr, val, &ret, nextent] { + if (!ret.at_boundary() && ret.get_key() == laddr) { + return insert_ret( + interruptible::ready_future_marker{}, + std::make_pair(ret, false)); + } else { + ++(get_tree_stats<self_type>(c.trans).num_inserts); + return handle_split( + c, ret + ).si_then([c, laddr, val, &ret, nextent] { + if (!ret.leaf.node->is_mutable()) { + CachedExtentRef mut = c.cache.duplicate_for_write( + c.trans, ret.leaf.node + ); + ret.leaf.node = mut->cast<leaf_node_t>(); + } + auto iter = typename leaf_node_t::const_iterator( + ret.leaf.node.get(), ret.leaf.pos); + assert(iter == ret.leaf.node->lower_bound(laddr)); + assert(iter == ret.leaf.node->end() || iter->get_key() > laddr); + assert(laddr >= ret.leaf.node->get_meta().begin && + laddr < ret.leaf.node->get_meta().end); + ret.leaf.node->insert(iter, laddr, val, nextent); + return insert_ret( + interruptible::ready_future_marker{}, + std::make_pair(ret, true)); + }); + } + }); + }); + } + + insert_ret insert( + op_context_t<node_key_t> c, + node_key_t laddr, + node_val_t val, + LogicalCachedExtent* nextent) { + return lower_bound( + c, laddr + ).si_then([this, c, laddr, val, nextent](auto iter) { + return this->insert(c, iter, laddr, val, nextent); + }); + } + + /** + * update + * + * Invalidates all outstanding iterators for this tree on this transaction. + * + * @param c [in] op context + * @param iter [in] iterator to element to update, must not be end + * @param val [in] val with which to update + * @return iterator to newly updated element + */ + using update_iertr = base_iertr; + using update_ret = update_iertr::future<iterator>; + update_ret update( + op_context_t<node_key_t> c, + iterator iter, + node_val_t val, + LogicalCachedExtent* nextent) + { + LOG_PREFIX(FixedKVBtree::update); + SUBTRACET( + seastore_fixedkv_tree, + "update element at {}", + c.trans, + iter.is_end() ? min_max_t<node_key_t>::max : iter.get_key()); + if (!iter.leaf.node->is_mutable()) { + CachedExtentRef mut = c.cache.duplicate_for_write( + c.trans, iter.leaf.node + ); + iter.leaf.node = mut->cast<leaf_node_t>(); + } + ++(get_tree_stats<self_type>(c.trans).num_updates); + iter.leaf.node->update( + iter.leaf.node->iter_idx(iter.leaf.pos), + val, + nextent); + return update_ret( + interruptible::ready_future_marker{}, + iter); + } + + + /** + * remove + * + * Invalidates all outstanding iterators for this tree on this transaction. + * + * @param c [in] op context + * @param iter [in] iterator to element to remove, must not be end + */ + using remove_iertr = base_iertr; + using remove_ret = remove_iertr::future<>; + remove_ret remove( + op_context_t<node_key_t> c, + iterator iter) + { + LOG_PREFIX(FixedKVBtree::remove); + SUBTRACET( + seastore_fixedkv_tree, + "remove element at {}", + c.trans, + iter.is_end() ? 
min_max_t<node_key_t>::max : iter.get_key()); + assert(!iter.is_end()); + ++(get_tree_stats<self_type>(c.trans).num_erases); + return seastar::do_with( + iter, + [this, c](auto &ret) { + if (!ret.leaf.node->is_mutable()) { + CachedExtentRef mut = c.cache.duplicate_for_write( + c.trans, ret.leaf.node + ); + ret.leaf.node = mut->cast<leaf_node_t>(); + } + ret.leaf.node->remove( + ret.leaf.node->iter_idx(ret.leaf.pos)); + + return handle_merge( + c, ret + ); + }); + } + + /** + * init_cached_extent + * + * Checks whether e is live (reachable from fixed kv tree) and drops or initializes + * accordingly. + * + * Returns if e is live. + */ + using init_cached_extent_iertr = base_iertr; + using init_cached_extent_ret = init_cached_extent_iertr::future<bool>; + init_cached_extent_ret init_cached_extent( + op_context_t<node_key_t> c, + CachedExtentRef e) + { + assert(!e->is_logical()); + LOG_PREFIX(FixedKVTree::init_cached_extent); + SUBTRACET(seastore_fixedkv_tree, "extent {}", c.trans, *e); + if (e->get_type() == internal_node_t::TYPE) { + auto eint = e->cast<internal_node_t>(); + return lower_bound( + c, eint->get_node_meta().begin + ).si_then([e, c, eint](auto iter) { + // Note, this check is valid even if iter.is_end() + LOG_PREFIX(FixedKVTree::init_cached_extent); + depth_t cand_depth = eint->get_node_meta().depth; + if (cand_depth <= iter.get_depth() && + &*iter.get_internal(cand_depth).node == &*eint) { + SUBTRACET( + seastore_fixedkv_tree, + "extent {} is live", + c.trans, + *eint); + return true; + } else { + SUBTRACET( + seastore_fixedkv_tree, + "extent {} is not live", + c.trans, + *eint); + return false; + } + }); + } else if (e->get_type() == leaf_node_t::TYPE) { + auto eleaf = e->cast<leaf_node_t>(); + return lower_bound( + c, eleaf->get_node_meta().begin + ).si_then([c, e, eleaf](auto iter) { + // Note, this check is valid even if iter.is_end() + LOG_PREFIX(FixedKVTree::init_cached_extent); + if (iter.leaf.node == &*eleaf) { + SUBTRACET( + seastore_fixedkv_tree, + "extent {} is live", + c.trans, + *eleaf); + return true; + } else { + SUBTRACET( + seastore_fixedkv_tree, + "extent {} is not live", + c.trans, + *eleaf); + return false; + } + }); + } else { + SUBTRACET( + seastore_fixedkv_tree, + "found other extent {} type {}", + c.trans, + *e, + e->get_type()); + return init_cached_extent_ret( + interruptible::ready_future_marker{}, + true); + } + } + + /// get_leaf_if_live: get leaf node at laddr/addr if still live + using get_leaf_if_live_iertr = base_iertr; + using get_leaf_if_live_ret = get_leaf_if_live_iertr::future<CachedExtentRef>; + get_leaf_if_live_ret get_leaf_if_live( + op_context_t<node_key_t> c, + paddr_t addr, + node_key_t laddr, + extent_len_t len) + { + LOG_PREFIX(FixedKVBtree::get_leaf_if_live); + return lower_bound( + c, laddr + ).si_then([FNAME, c, addr, laddr, len](auto iter) { + if (iter.leaf.node->get_paddr() == addr) { + SUBTRACET( + seastore_fixedkv_tree, + "extent laddr {} addr {}~{} found: {}", + c.trans, + laddr, + addr, + len, + *iter.leaf.node); + return CachedExtentRef(iter.leaf.node); + } else { + SUBTRACET( + seastore_fixedkv_tree, + "extent laddr {} addr {}~{} is not live, does not match node {}", + c.trans, + laddr, + addr, + len, + *iter.leaf.node); + return CachedExtentRef(); + } + }); + } + + + /// get_internal_if_live: get internal node at laddr/addr if still live + using get_internal_if_live_iertr = base_iertr; + using get_internal_if_live_ret = get_internal_if_live_iertr::future<CachedExtentRef>; + get_internal_if_live_ret get_internal_if_live( 
+ op_context_t<node_key_t> c, + paddr_t addr, + node_key_t laddr, + extent_len_t len) + { + LOG_PREFIX(FixedKVBtree::get_internal_if_live); + return lower_bound( + c, laddr + ).si_then([FNAME, c, addr, laddr, len](auto iter) { + for (depth_t d = 2; d <= iter.get_depth(); ++d) { + CachedExtent &node = *iter.get_internal(d).node; + auto internal_node = node.cast<internal_node_t>(); + if (internal_node->get_paddr() == addr) { + SUBTRACET( + seastore_fixedkv_tree, + "extent laddr {} addr {}~{} found: {}", + c.trans, + laddr, + addr, + len, + *internal_node); + assert(internal_node->get_node_meta().begin == laddr); + return CachedExtentRef(internal_node); + } + } + SUBTRACET( + seastore_fixedkv_tree, + "extent laddr {} addr {}~{} is not live, no matching internal node", + c.trans, + laddr, + addr, + len); + return CachedExtentRef(); + }); + } + + + /** + * rewrite_extent + * + * Rewrites a fresh copy of extent into transaction and updates internal + * references. + */ + using rewrite_extent_iertr = base_iertr; + using rewrite_extent_ret = rewrite_extent_iertr::future<>; + rewrite_extent_ret rewrite_extent( + op_context_t<node_key_t> c, + CachedExtentRef e) { + LOG_PREFIX(FixedKVBtree::rewrite_extent); + assert(is_lba_backref_node(e->get_type())); + + auto do_rewrite = [&](auto &fixed_kv_extent) { + auto n_fixed_kv_extent = c.cache.template alloc_new_extent< + std::remove_reference_t<decltype(fixed_kv_extent)> + >( + c.trans, + fixed_kv_extent.get_length(), + fixed_kv_extent.get_user_hint(), + // get target rewrite generation + fixed_kv_extent.get_rewrite_generation()); + fixed_kv_extent.get_bptr().copy_out( + 0, + fixed_kv_extent.get_length(), + n_fixed_kv_extent->get_bptr().c_str()); + n_fixed_kv_extent->set_modify_time(fixed_kv_extent.get_modify_time()); + n_fixed_kv_extent->range = n_fixed_kv_extent->get_node_meta(); + + if (fixed_kv_extent.get_type() == internal_node_t::TYPE || + leaf_node_t::do_has_children) { + if (!fixed_kv_extent.is_pending()) { + n_fixed_kv_extent->copy_sources.emplace(&fixed_kv_extent); + n_fixed_kv_extent->prior_instance = &fixed_kv_extent; + } else { + ceph_assert(fixed_kv_extent.is_mutation_pending()); + n_fixed_kv_extent->copy_sources.emplace( + (typename internal_node_t::base_t* + )fixed_kv_extent.get_prior_instance().get()); + n_fixed_kv_extent->children = std::move(fixed_kv_extent.children); + n_fixed_kv_extent->prior_instance = fixed_kv_extent.get_prior_instance(); + n_fixed_kv_extent->adjust_ptracker_for_children(); + } + } + + /* This is a bit underhanded. Any relative addrs here must necessarily + * be record relative as we are rewriting a dirty extent. Thus, we + * are using resolve_relative_addrs with a (likely negative) block + * relative offset to correct them to block-relative offsets adjusted + * for our new transaction location. + * + * Upon commit, these now block relative addresses will be interpretted + * against the real final address. 
+ */ + if (!n_fixed_kv_extent->get_paddr().is_absolute()) { + // backend_type_t::SEGMENTED + assert(n_fixed_kv_extent->get_paddr().is_record_relative()); + n_fixed_kv_extent->resolve_relative_addrs( + make_record_relative_paddr(0).block_relative_to( + n_fixed_kv_extent->get_paddr())); + } // else: backend_type_t::RANDOM_BLOCK + + SUBTRACET( + seastore_fixedkv_tree, + "rewriting {} into {}", + c.trans, + fixed_kv_extent, + *n_fixed_kv_extent); + + return update_internal_mapping( + c, + n_fixed_kv_extent->get_node_meta().depth, + n_fixed_kv_extent->get_node_meta().begin, + e->get_paddr(), + n_fixed_kv_extent->get_paddr(), + n_fixed_kv_extent + ).si_then([c, e] { + c.cache.retire_extent(c.trans, e); + }); + }; + + CachedExtentRef n_fixed_kv_extent; + if (e->get_type() == internal_node_t::TYPE) { + auto lint = e->cast<internal_node_t>(); + return do_rewrite(*lint); + } else { + assert(e->get_type() == leaf_node_t::TYPE); + auto lleaf = e->cast<leaf_node_t>(); + return do_rewrite(*lleaf); + } + } + + using update_internal_mapping_iertr = base_iertr; + using update_internal_mapping_ret = update_internal_mapping_iertr::future<>; + update_internal_mapping_ret update_internal_mapping( + op_context_t<node_key_t> c, + depth_t depth, + node_key_t laddr, + paddr_t old_addr, + paddr_t new_addr, + typename internal_node_t::base_ref nextent) + { + LOG_PREFIX(FixedKVBtree::update_internal_mapping); + SUBTRACET( + seastore_fixedkv_tree, + "updating laddr {} at depth {} from {} to {}, nextent {}", + c.trans, + laddr, + depth, + old_addr, + new_addr, + *nextent); + + return lower_bound( + c, laddr, nullptr, depth + 1 + ).si_then([=, this](auto iter) { + assert(iter.get_depth() >= depth); + if (depth == iter.get_depth()) { + SUBTRACET(seastore_fixedkv_tree, "update at root", c.trans); + + if (laddr != min_max_t<node_key_t>::min) { + SUBERRORT( + seastore_fixedkv_tree, + "updating root laddr {} at depth {} from {} to {}," + "laddr is not 0", + c.trans, + laddr, + depth, + old_addr, + new_addr, + get_root().get_location()); + ceph_assert(0 == "impossible"); + } + + if (get_root().get_location() != old_addr) { + SUBERRORT( + seastore_fixedkv_tree, + "updating root laddr {} at depth {} from {} to {}," + "root addr {} does not match", + c.trans, + laddr, + depth, + old_addr, + new_addr, + get_root().get_location()); + ceph_assert(0 == "impossible"); + } + + root_block = c.cache.duplicate_for_write( + c.trans, root_block)->template cast<RootBlock>(); + get_root().set_location(new_addr); + set_root_node(nextent); + } else { + auto &parent = iter.get_internal(depth + 1); + assert(parent.node); + assert(parent.pos < parent.node->get_size()); + auto piter = parent.node->iter_idx(parent.pos); + + if (piter->get_key() != laddr) { + SUBERRORT( + seastore_fixedkv_tree, + "updating laddr {} at depth {} from {} to {}," + "node {} pos {} val pivot addr {} does not match", + c.trans, + laddr, + depth, + old_addr, + new_addr, + *(parent.node), + parent.pos, + piter->get_key()); + ceph_assert(0 == "impossible"); + } + + + if (piter->get_val() != old_addr) { + SUBERRORT( + seastore_fixedkv_tree, + "updating laddr {} at depth {} from {} to {}," + "node {} pos {} val addr {} does not match", + c.trans, + laddr, + depth, + old_addr, + new_addr, + *(parent.node), + parent.pos, + piter->get_val()); + ceph_assert(0 == "impossible"); + } + + CachedExtentRef mut = c.cache.duplicate_for_write( + c.trans, + parent.node + ); + typename internal_node_t::Ref mparent = mut->cast<internal_node_t>(); + mparent->update(piter, new_addr, 
nextent.get()); + + /* Note, iter is now invalid as we didn't udpate either the parent + * node reference to the new mutable instance nor did we update the + * child pointer to the new node. Not a problem as we'll now just + * destruct it. + */ + } + return seastar::now(); + }); + } + + +private: + RootBlockRef root_block; + + template <typename T> + using node_position_t = typename iterator::template node_position_t<T>; + + using get_internal_node_iertr = base_iertr; + using get_internal_node_ret = get_internal_node_iertr::future<InternalNodeRef>; + static get_internal_node_ret get_internal_node( + op_context_t<node_key_t> c, + depth_t depth, + paddr_t offset, + node_key_t begin, + node_key_t end, + typename std::optional<node_position_t<internal_node_t>> parent_pos) + { + LOG_PREFIX(FixedKVBtree::get_internal_node); + SUBTRACET( + seastore_fixedkv_tree, + "reading internal at offset {}, depth {}, begin {}, end {}", + c.trans, + offset, + depth, + begin, + end); + assert(depth > 1); + auto init_internal = [c, depth, begin, end, + parent_pos=std::move(parent_pos)] + (internal_node_t &node) { + assert(!node.is_pending()); + assert(!node.is_linked()); + node.range = fixed_kv_node_meta_t<node_key_t>{begin, end, depth}; + if (parent_pos) { + auto &parent = parent_pos->node; + parent->link_child(&node, parent_pos->pos); + } else { + assert(node.range.is_root()); + auto root_block = c.cache.get_root_fast(c.trans); + if (root_block->is_mutation_pending()) { + auto &stable_root = (RootBlockRef&)*root_block->get_prior_instance(); + link_phy_tree_root_node(stable_root, &node); + } else { + assert(!root_block->is_pending()); + link_phy_tree_root_node(root_block, &node); + } + } + }; + return c.cache.template get_absent_extent<internal_node_t>( + c.trans, + offset, + node_size, + init_internal + ).si_then([FNAME, c, offset, init_internal, depth, begin, end]( + typename internal_node_t::Ref ret) { + SUBTRACET( + seastore_fixedkv_tree, + "read internal at offset {} {}", + c.trans, + offset, + *ret); + // This can only happen during init_cached_extent + // or when backref extent being rewritten by gc space reclaiming + if (!ret->is_pending() && !ret->is_linked()) { + assert(ret->is_dirty() + || (is_backref_node(ret->get_type()) + && ret->is_clean())); + init_internal(*ret); + } + auto meta = ret->get_meta(); + if (ret->get_size()) { + ceph_assert(meta.begin <= ret->begin()->get_key()); + ceph_assert(meta.end > (ret->end() - 1)->get_key()); + } + ceph_assert(depth == meta.depth); + ceph_assert(begin == meta.begin); + ceph_assert(end == meta.end); + return get_internal_node_ret( + interruptible::ready_future_marker{}, + ret); + }); + } + + + using get_leaf_node_iertr = base_iertr; + using get_leaf_node_ret = get_leaf_node_iertr::future<LeafNodeRef>; + static get_leaf_node_ret get_leaf_node( + op_context_t<node_key_t> c, + paddr_t offset, + node_key_t begin, + node_key_t end, + typename std::optional<node_position_t<leaf_node_t>> parent_pos) + { + LOG_PREFIX(FixedKVBtree::get_leaf_node); + SUBTRACET( + seastore_fixedkv_tree, + "reading leaf at offset {}, begin {}, end {}", + c.trans, + offset, + begin, + end); + auto init_leaf = [c, begin, end, + parent_pos=std::move(parent_pos)] + (leaf_node_t &node) { + assert(!node.is_pending()); + assert(!node.is_linked()); + node.range = fixed_kv_node_meta_t<node_key_t>{begin, end, 1}; + if (parent_pos) { + auto &parent = parent_pos->node; + parent->link_child(&node, parent_pos->pos); + } else { + assert(node.range.is_root()); + auto root_block = 
c.cache.get_root_fast(c.trans); + if (root_block->is_mutation_pending()) { + auto &stable_root = (RootBlockRef&)*root_block->get_prior_instance(); + link_phy_tree_root_node(stable_root, &node); + } else { + assert(!root_block->is_pending()); + link_phy_tree_root_node(root_block, &node); + } + } + }; + return c.cache.template get_absent_extent<leaf_node_t>( + c.trans, + offset, + node_size, + init_leaf + ).si_then([FNAME, c, offset, init_leaf, begin, end] + (typename leaf_node_t::Ref ret) { + SUBTRACET( + seastore_fixedkv_tree, + "read leaf at offset {} {}", + c.trans, + offset, + *ret); + // This can only happen during init_cached_extent + // or when backref extent being rewritten by gc space reclaiming + if (!ret->is_pending() && !ret->is_linked()) { + assert(ret->is_dirty() + || (is_backref_node(ret->get_type()) + && ret->is_clean())); + init_leaf(*ret); + } + auto meta = ret->get_meta(); + if (ret->get_size()) { + ceph_assert(meta.begin <= ret->begin()->get_key()); + ceph_assert(meta.end > (ret->end() - 1)->get_key()); + } + ceph_assert(1 == meta.depth); + ceph_assert(begin == meta.begin); + ceph_assert(end == meta.end); + return get_leaf_node_ret( + interruptible::ready_future_marker{}, + ret); + }); + } + + using lookup_root_iertr = base_iertr; + using lookup_root_ret = lookup_root_iertr::future<>; + lookup_root_ret lookup_root( + op_context_t<node_key_t> c, + iterator &iter, + mapped_space_visitor_t *visitor) const { + LOG_PREFIX(FixedKVBtree::lookup_root); + SUBTRACET(seastore_fixedkv_tree, + "looking up root on {}", + c.trans, + *root_block); + auto [found, fut] = get_root_node(c); + + auto on_found_internal = + [this, visitor, &iter](InternalNodeRef &root_node) { + iter.get_internal(get_root().get_depth()).node = root_node; + if (visitor) (*visitor)( + root_node->get_paddr(), + root_node->get_node_meta().begin, + root_node->get_length(), + get_root().get_depth(), + internal_node_t::TYPE, + iter); + return lookup_root_iertr::now(); + }; + auto on_found_leaf = + [visitor, &iter, this](LeafNodeRef root_node) { + iter.leaf.node = root_node; + if (visitor) (*visitor)( + root_node->get_paddr(), + root_node->get_node_meta().begin, + root_node->get_length(), + get_root().get_depth(), + leaf_node_t::TYPE, + iter); + return lookup_root_iertr::now(); + }; + + if (found) { + return fut.si_then( + [this, c, on_found_internal=std::move(on_found_internal), + on_found_leaf=std::move(on_found_leaf)](auto root) { + LOG_PREFIX(FixedKVBtree::lookup_root); + ceph_assert(root); + SUBTRACET(seastore_fixedkv_tree, + "got root node on {}, res: {}", + c.trans, + *root_block, + *root); + + if (get_root().get_depth() > 1) { + auto root_node = root->template cast<internal_node_t>(); + return on_found_internal(root_node); + } else { + auto root_node = root->template cast<leaf_node_t>(); + return on_found_leaf(root_node); + } + }); + } else { + if (get_root().get_depth() > 1) { + return get_internal_node( + c, + get_root().get_depth(), + get_root().get_location(), + min_max_t<node_key_t>::min, + min_max_t<node_key_t>::max, + std::nullopt + ).si_then([on_found=std::move(on_found_internal)](InternalNodeRef root_node) { + return on_found(root_node); + }); + } else { + return get_leaf_node( + c, + get_root().get_location(), + min_max_t<node_key_t>::min, + min_max_t<node_key_t>::max, + std::nullopt + ).si_then([on_found=std::move(on_found_leaf)](LeafNodeRef root_node) { + return on_found(root_node); + }); + } + } + } + + using lookup_internal_level_iertr = base_iertr; + using lookup_internal_level_ret = 
lookup_internal_level_iertr::future<>; + template <typename F> + static lookup_internal_level_ret lookup_internal_level( + op_context_t<node_key_t> c, + depth_t depth, + iterator &iter, + F &f, + mapped_space_visitor_t *visitor + ) { + assert(depth > 1); + auto &parent_entry = iter.get_internal(depth + 1); + auto parent = parent_entry.node; + auto node_iter = parent->iter_idx(parent_entry.pos); + + auto on_found = [depth, visitor, &iter, &f](InternalNodeRef node) { + auto &entry = iter.get_internal(depth); + entry.node = node; + auto node_iter = f(*node); + assert(node_iter != node->end()); + entry.pos = node_iter->get_offset(); + if (visitor) + (*visitor)( + node->get_paddr(), + node->get_node_meta().begin, + node->get_length(), + depth, + node->get_type(), + iter); + return seastar::now(); + }; + + auto v = parent->template get_child<internal_node_t>(c, node_iter); + if (v.has_child()) { + return v.get_child_fut().safe_then( + [on_found=std::move(on_found), node_iter, c, + parent_entry](auto child) mutable { + LOG_PREFIX(FixedKVBtree::lookup_internal_level); + SUBTRACET(seastore_fixedkv_tree, + "got child on {}, pos: {}, res: {}", + c.trans, + *parent_entry.node, + parent_entry.pos, + *child); + auto &cnode = (typename internal_node_t::base_t &)*child; + assert(cnode.get_node_meta().begin == node_iter.get_key()); + assert(cnode.get_node_meta().end > node_iter.get_key()); + return on_found(child->template cast<internal_node_t>()); + }); + } + + auto child_pos = v.get_child_pos(); + auto next_iter = node_iter + 1; + auto begin = node_iter->get_key(); + auto end = next_iter == parent->end() + ? parent->get_node_meta().end + : next_iter->get_key(); + return get_internal_node( + c, + depth, + node_iter->get_val().maybe_relative_to(parent->get_paddr()), + begin, + end, + std::make_optional<node_position_t<internal_node_t>>( + child_pos.template get_parent<internal_node_t>(), + child_pos.get_pos()) + ).si_then([on_found=std::move(on_found)](InternalNodeRef node) { + return on_found(node); + }); + } + + using lookup_leaf_iertr = base_iertr; + using lookup_leaf_ret = lookup_leaf_iertr::future<>; + template <typename F> + static lookup_internal_level_ret lookup_leaf( + op_context_t<node_key_t> c, + iterator &iter, + F &f, + mapped_space_visitor_t *visitor + ) { + auto &parent_entry = iter.get_internal(2); + auto parent = parent_entry.node; + assert(parent); + auto node_iter = parent->iter_idx(parent_entry.pos); + + auto on_found = [visitor, &iter, &f](LeafNodeRef node) { + iter.leaf.node = node; + auto node_iter = f(*node); + iter.leaf.pos = node_iter->get_offset(); + if (visitor) + (*visitor)( + node->get_paddr(), + node->get_node_meta().begin, + node->get_length(), + 1, + node->get_type(), + iter); + return seastar::now(); + }; + + auto v = parent->template get_child<leaf_node_t>(c, node_iter); + if (v.has_child()) { + return v.get_child_fut().safe_then( + [on_found=std::move(on_found), node_iter, c, + parent_entry](auto child) mutable { + LOG_PREFIX(FixedKVBtree::lookup_leaf); + SUBTRACET(seastore_fixedkv_tree, + "got child on {}, pos: {}, res: {}", + c.trans, + *parent_entry.node, + parent_entry.pos, + *child); + auto &cnode = (typename internal_node_t::base_t &)*child; + assert(cnode.get_node_meta().begin == node_iter.get_key()); + assert(cnode.get_node_meta().end > node_iter.get_key()); + return on_found(child->template cast<leaf_node_t>()); + }); + } + + auto child_pos = v.get_child_pos(); + auto next_iter = node_iter + 1; + auto begin = node_iter->get_key(); + auto end = next_iter == 
parent->end() + ? parent->get_node_meta().end + : next_iter->get_key(); + + return get_leaf_node( + c, + node_iter->get_val().maybe_relative_to(parent->get_paddr()), + begin, + end, + std::make_optional<node_position_t<leaf_node_t>>( + child_pos.template get_parent<leaf_node_t>(), + child_pos.get_pos()) + ).si_then([on_found=std::move(on_found)](LeafNodeRef node) { + return on_found(node); + }); + } + + /** + * lookup_depth_range + * + * Performs node lookups on depths [from, to) using li and ll to + * specific target at each level. Note, may leave the iterator + * at_boundary(), call handle_boundary() prior to returning out + * lf FixedKVBtree. + */ + using lookup_depth_range_iertr = base_iertr; + using lookup_depth_range_ret = lookup_depth_range_iertr::future<>; + template <typename LI, typename LL> + static lookup_depth_range_ret lookup_depth_range( + op_context_t<node_key_t> c, ///< [in] context + iterator &iter, ///< [in,out] iterator to populate + depth_t from, ///< [in] from inclusive + depth_t to, ///< [in] to exclusive, (to <= from, to == from is a noop) + LI &li, ///< [in] internal->iterator + LL &ll, ///< [in] leaf->iterator + mapped_space_visitor_t *visitor ///< [in] mapped space visitor + ) { + LOG_PREFIX(FixedKVBtree::lookup_depth_range); + SUBTRACET(seastore_fixedkv_tree, "{} -> {}", c.trans, from, to); + return seastar::do_with( + from, + [c, to, visitor, &iter, &li, &ll](auto &d) { + return trans_intr::repeat( + [c, to, visitor, &iter, &li, &ll, &d] { + if (d > to) { + return [&] { + if (d > 1) { + return lookup_internal_level( + c, + d, + iter, + li, + visitor); + } else { + assert(d == 1); + return lookup_leaf( + c, + iter, + ll, + visitor); + } + }().si_then([&d] { + --d; + return lookup_depth_range_iertr::make_ready_future< + seastar::stop_iteration + >(seastar::stop_iteration::no); + }); + } else { + return lookup_depth_range_iertr::make_ready_future< + seastar::stop_iteration + >(seastar::stop_iteration::yes); + } + }); + }); + } + + using lookup_iertr = base_iertr; + using lookup_ret = lookup_iertr::future<iterator>; + template <typename LI, typename LL> + lookup_ret lookup( + op_context_t<node_key_t> c, + LI &&lookup_internal, + LL &&lookup_leaf, + depth_t min_depth, + mapped_space_visitor_t *visitor + ) const { + LOG_PREFIX(FixedKVBtree::lookup); + assert(min_depth > 0); + return seastar::do_with( + iterator{get_root().get_depth()}, + std::forward<LI>(lookup_internal), + std::forward<LL>(lookup_leaf), + [FNAME, this, visitor, c, min_depth](auto &iter, auto &li, auto &ll) { + return lookup_root( + c, iter, visitor + ).si_then([FNAME, this, visitor, c, &iter, &li, &ll, min_depth] { + if (iter.get_depth() > 1) { + auto &root_entry = *(iter.internal.rbegin()); + root_entry.pos = li(*(root_entry.node)).get_offset(); + } else { + auto &root_entry = iter.leaf; + auto riter = ll(*(root_entry.node)); + root_entry.pos = riter->get_offset(); + } + SUBTRACET(seastore_fixedkv_tree, "got root, depth {}", + c.trans, get_root().get_depth()); + return lookup_depth_range( + c, + iter, + get_root().get_depth() - 1, + min_depth - 1, + li, + ll, + visitor + ).si_then([c, visitor, &iter, min_depth] { + // It's only when the lookup is triggered by + // update_internal_mapping() that min_depth is + // NOT 1 + if (min_depth == 1 && iter.at_boundary()) { + return iter.handle_boundary(c, visitor); + } else { + return lookup_iertr::now(); + } + }); + }).si_then([&iter] { + return std::move(iter); + }); + }); + } + + /** + * find_insertion + * + * Prepare iter for insertion. 
iter should begin pointing at + * the valid insertion point (lower_bound(laddr)). + * + * Upon completion, iter will point at the + * position at which laddr should be inserted. iter may, upon completion, + * point at the end of a leaf other than the end leaf if that's the correct + * insertion point. + */ + using find_insertion_iertr = base_iertr; + using find_insertion_ret = find_insertion_iertr::future<>; + static find_insertion_ret find_insertion( + op_context_t<node_key_t> c, + node_key_t laddr, + iterator &iter) + { + assert(iter.is_end() || iter.get_key() >= laddr); + if (!iter.is_end() && iter.get_key() == laddr) { + return seastar::now(); + } else if (iter.leaf.node->get_node_meta().begin <= laddr) { +#ifndef NDEBUG + auto p = iter; + if (p.leaf.pos > 0) { + --p.leaf.pos; + assert(p.get_key() < laddr); + } +#endif + return seastar::now(); + } else { + assert(iter.leaf.pos == 0); + return iter.prev( + c + ).si_then([laddr, &iter](auto p) { + boost::ignore_unused(laddr); // avoid clang warning; + assert(p.leaf.node->get_node_meta().begin <= laddr); + assert(p.get_key() < laddr); + // Note, this is specifically allowed to violate the iterator + // invariant that pos is a valid index for the node in the event + // that the insertion point is at the end of a node. + p.leaf.pos++; + assert(p.at_boundary()); + iter = p; + return seastar::now(); + }); + } + } + + /** + * handle_split + * + * Split nodes in iter as needed for insertion. First, scan iter from leaf + * to find first non-full level. Then, split from there towards leaf. + * + * Upon completion, iter will point at the newly split insertion point. As + * with find_insertion, iter's leaf pointer may be end without iter being + * end. + */ + using handle_split_iertr = base_iertr; + using handle_split_ret = handle_split_iertr::future<>; + handle_split_ret handle_split( + op_context_t<node_key_t> c, + iterator &iter) + { + LOG_PREFIX(FixedKVBtree::handle_split); + + depth_t split_from = iter.check_split(); + + SUBTRACET(seastore_fixedkv_tree, "split_from {}, depth {}", c.trans, split_from, iter.get_depth()); + + if (split_from == iter.get_depth()) { + auto nroot = c.cache.template alloc_new_extent<internal_node_t>( + c.trans, node_size, placement_hint_t::HOT, INIT_GENERATION); + fixed_kv_node_meta_t<node_key_t> meta{ + min_max_t<node_key_t>::min, min_max_t<node_key_t>::max, iter.get_depth() + 1}; + nroot->set_meta(meta); + nroot->range = meta; + nroot->journal_insert( + nroot->begin(), + min_max_t<node_key_t>::min, + get_root().get_location(), + nullptr); + iter.internal.push_back({nroot, 0}); + + get_tree_stats<self_type>(c.trans).depth = iter.get_depth(); + get_tree_stats<self_type>(c.trans).extents_num_delta++; + + root_block = c.cache.duplicate_for_write( + c.trans, root_block)->template cast<RootBlock>(); + get_root().set_location(nroot->get_paddr()); + get_root().set_depth(iter.get_depth()); + ceph_assert(get_root().get_depth() <= MAX_FIXEDKVBTREE_DEPTH); + set_root_node(nroot); + } + + /* pos may be either node_position_t<leaf_node_t> or + * node_position_t<internal_node_t> */ + auto split_level = [&](auto &parent_pos, auto &pos) { + LOG_PREFIX(FixedKVBtree::handle_split); + auto [left, right, pivot] = pos.node->make_split_children(c); + + auto parent_node = parent_pos.node; + auto parent_iter = parent_pos.get_iter(); + + parent_node->update( + parent_iter, + left->get_paddr(), + left.get()); + parent_node->insert( + parent_iter + 1, + pivot, + right->get_paddr(), + right.get()); + + SUBTRACET( + seastore_fixedkv_tree, + 
"splitted {} into left: {}, right: {}", + c.trans, + *pos.node, + *left, + *right); + c.cache.retire_extent(c.trans, pos.node); + + get_tree_stats<self_type>(c.trans).extents_num_delta++; + return std::make_pair(left, right); + }; + + for (; split_from > 0; --split_from) { + auto &parent_pos = iter.get_internal(split_from + 1); + if (!parent_pos.node->is_mutable()) { + parent_pos.node = c.cache.duplicate_for_write( + c.trans, parent_pos.node + )->template cast<internal_node_t>(); + } + + if (split_from > 1) { + auto &pos = iter.get_internal(split_from); + SUBTRACET( + seastore_fixedkv_tree, + "splitting internal {} at depth {}, parent: {} at pos: {}", + c.trans, + *pos.node, + split_from, + *parent_pos.node, + parent_pos.pos); + auto [left, right] = split_level(parent_pos, pos); + + if (pos.pos < left->get_size()) { + pos.node = left; + } else { + pos.node = right; + pos.pos -= left->get_size(); + + parent_pos.pos += 1; + } + } else { + auto &pos = iter.leaf; + SUBTRACET( + seastore_fixedkv_tree, + "splitting leaf {}, parent: {} at pos: {}", + c.trans, + *pos.node, + *parent_pos.node, + parent_pos.pos); + auto [left, right] = split_level(parent_pos, pos); + + /* right->get_node_meta().begin == pivot == right->begin()->get_key() + * Thus, if pos.pos == left->get_size(), we want iter to point to + * left with pos.pos at the end rather than right with pos.pos = 0 + * since the insertion would be to the left of the first element + * of right and thus necessarily less than right->get_node_meta().begin. + */ + if (pos.pos <= left->get_size()) { + pos.node = left; + } else { + pos.node = right; + pos.pos -= left->get_size(); + + parent_pos.pos += 1; + } + } + } + + return seastar::now(); + } + + + using handle_merge_iertr = base_iertr; + using handle_merge_ret = handle_merge_iertr::future<>; + handle_merge_ret handle_merge( + op_context_t<node_key_t> c, + iterator &iter) + { + LOG_PREFIX(FixedKVBtree::handle_merge); + if (iter.get_depth() == 1 || + !iter.leaf.node->below_min_capacity()) { + SUBTRACET( + seastore_fixedkv_tree, + "no need to merge leaf, leaf size {}, depth {}", + c.trans, + iter.leaf.node->get_size(), + iter.get_depth()); + return seastar::now(); + } + + return seastar::do_with( + depth_t{1}, + [FNAME, this, c, &iter](auto &to_merge) { + return trans_intr::repeat( + [FNAME, this, c, &iter, &to_merge] { + SUBTRACET( + seastore_fixedkv_tree, + "merging depth {}", + c.trans, + to_merge); + auto &parent_pos = iter.get_internal(to_merge + 1); + auto merge_fut = handle_merge_iertr::now(); + if (to_merge > 1) { + auto &pos = iter.get_internal(to_merge); + merge_fut = merge_level(c, to_merge, parent_pos, pos); + } else { + auto &pos = iter.leaf; + merge_fut = merge_level(c, to_merge, parent_pos, pos); + } + + return merge_fut.si_then([FNAME, this, c, &iter, &to_merge] { + ++to_merge; + auto &pos = iter.get_internal(to_merge); + if (to_merge == iter.get_depth()) { + if (pos.node->get_size() == 1) { + SUBTRACET(seastore_fixedkv_tree, "collapsing root", c.trans); + c.cache.retire_extent(c.trans, pos.node); + assert(pos.pos == 0); + auto node_iter = pos.get_iter(); + iter.internal.pop_back(); + get_tree_stats<self_type>(c.trans).depth = iter.get_depth(); + get_tree_stats<self_type>(c.trans).extents_num_delta--; + + root_block = c.cache.duplicate_for_write( + c.trans, root_block + )->template cast<RootBlock>(); + get_root().set_location( + node_iter->get_val().maybe_relative_to(pos.node->get_paddr())); + get_root().set_depth(iter.get_depth()); + if (iter.get_depth() > 1) { + auto root_node = 
iter.get_internal(iter.get_depth()).node; + set_root_node(root_node); + } else { + set_root_node(iter.leaf.node); + } + } else { + SUBTRACET(seastore_fixedkv_tree, "no need to collapse root", c.trans); + } + return seastar::stop_iteration::yes; + } else if (pos.node->below_min_capacity()) { + SUBTRACET( + seastore_fixedkv_tree, + "continuing, next node {} depth {} at min", + c.trans, + *pos.node, + to_merge); + return seastar::stop_iteration::no; + } else { + SUBTRACET( + seastore_fixedkv_tree, + "complete, next node {} depth {} not min", + c.trans, + *pos.node, + to_merge); + return seastar::stop_iteration::yes; + } + }); + }); + }); + } + + template <typename NodeType, + std::enable_if_t<std::is_same_v<NodeType, leaf_node_t>, int> = 0> + base_iertr::future<typename NodeType::Ref> get_node( + op_context_t<node_key_t> c, + depth_t depth, + paddr_t addr, + node_key_t begin, + node_key_t end, + typename std::optional<node_position_t<leaf_node_t>> parent_pos) { + assert(depth == 1); + return get_leaf_node(c, addr, begin, end, std::move(parent_pos)); + } + + template <typename NodeType, + std::enable_if_t<std::is_same_v<NodeType, internal_node_t>, int> = 0> + base_iertr::future<typename NodeType::Ref> get_node( + op_context_t<node_key_t> c, + depth_t depth, + paddr_t addr, + node_key_t begin, + node_key_t end, + typename std::optional<node_position_t<internal_node_t>> parent_pos) { + return get_internal_node(c, depth, addr, begin, end, std::move(parent_pos)); + } + + template <typename NodeType> + handle_merge_ret merge_level( + op_context_t<node_key_t> c, + depth_t depth, + node_position_t<internal_node_t> &parent_pos, + node_position_t<NodeType> &pos) + { + LOG_PREFIX(FixedKVBtree::merge_level); + if (!parent_pos.node->is_mutable()) { + parent_pos.node = c.cache.duplicate_for_write( + c.trans, parent_pos.node + )->template cast<internal_node_t>(); + } + + auto iter = parent_pos.get_iter(); + assert(iter.get_offset() < parent_pos.node->get_size()); + bool donor_is_left = ((iter.get_offset() + 1) == parent_pos.node->get_size()); + auto donor_iter = donor_is_left ? (iter - 1) : (iter + 1); + auto next_iter = donor_iter + 1; + auto begin = donor_iter->get_key(); + auto end = next_iter == parent_pos.node->end() + ? parent_pos.node->get_node_meta().end + : next_iter->get_key(); + + SUBTRACET(seastore_fixedkv_tree, "parent: {}, node: {}", c.trans, *parent_pos.node, *pos.node); + auto do_merge = [c, iter, donor_iter, donor_is_left, &parent_pos, &pos]( + typename NodeType::Ref donor) { + LOG_PREFIX(FixedKVBtree::merge_level); + auto [l, r] = donor_is_left ? + std::make_pair(donor, pos.node) : std::make_pair(pos.node, donor); + + auto [liter, riter] = donor_is_left ? 
+ std::make_pair(donor_iter, iter) : std::make_pair(iter, donor_iter); + + if (donor->at_min_capacity()) { + auto replacement = l->make_full_merge(c, r); + + parent_pos.node->update( + liter, + replacement->get_paddr(), + replacement.get()); + parent_pos.node->remove(riter); + + pos.node = replacement; + if (donor_is_left) { + pos.pos += l->get_size(); + parent_pos.pos--; + } + + SUBTRACET(seastore_fixedkv_tree, "l: {}, r: {}, replacement: {}", c.trans, *l, *r, *replacement); + c.cache.retire_extent(c.trans, l); + c.cache.retire_extent(c.trans, r); + get_tree_stats<self_type>(c.trans).extents_num_delta--; + } else { + LOG_PREFIX(FixedKVBtree::merge_level); + auto [replacement_l, replacement_r, pivot] = + l->make_balanced( + c, + r, + !donor_is_left); + + parent_pos.node->update( + liter, + replacement_l->get_paddr(), + replacement_l.get()); + parent_pos.node->replace( + riter, + pivot, + replacement_r->get_paddr(), + replacement_r.get()); + + if (donor_is_left) { + assert(parent_pos.pos > 0); + parent_pos.pos--; + } + + auto orig_position = donor_is_left ? + l->get_size() + pos.pos : + pos.pos; + if (orig_position < replacement_l->get_size()) { + pos.node = replacement_l; + pos.pos = orig_position; + } else { + parent_pos.pos++; + pos.node = replacement_r; + pos.pos = orig_position - replacement_l->get_size(); + } + + SUBTRACET( + seastore_fixedkv_tree, + "l: {}, r: {}, replacement_l: {}, replacement_r: {}", + c.trans, *l, *r, *replacement_l, *replacement_r); + c.cache.retire_extent(c.trans, l); + c.cache.retire_extent(c.trans, r); + } + + return seastar::now(); + }; + + auto v = parent_pos.node->template get_child<NodeType>(c, donor_iter); + if (v.has_child()) { + return v.get_child_fut().safe_then( + [do_merge=std::move(do_merge), &pos, + donor_iter, donor_is_left, c, parent_pos](auto child) mutable { + LOG_PREFIX(FixedKVBtree::merge_level); + SUBTRACET(seastore_fixedkv_tree, + "got child on {}, pos: {}, res: {}", + c.trans, + *parent_pos.node, + donor_iter.get_offset(), + *child); + auto &node = (typename internal_node_t::base_t&)*child; + assert(donor_is_left ? 
+ node.get_node_meta().end == pos.node->get_node_meta().begin : + node.get_node_meta().begin == pos.node->get_node_meta().end); + assert(node.get_node_meta().begin == donor_iter.get_key()); + assert(node.get_node_meta().end > donor_iter.get_key()); + return do_merge(child->template cast<NodeType>()); + }); + } + + auto child_pos = v.get_child_pos(); + return get_node<NodeType>( + c, + depth, + donor_iter.get_val().maybe_relative_to(parent_pos.node->get_paddr()), + begin, + end, + std::make_optional<node_position_t<NodeType>>( + child_pos.template get_parent<NodeType>(), + child_pos.get_pos()) + ).si_then([do_merge=std::move(do_merge)](typename NodeType::Ref donor) { + return do_merge(donor); + }); + } +}; + +template <typename T> +struct is_fixed_kv_tree : std::false_type {}; + +template < + typename node_key_t, + typename node_val_t, + typename internal_node_t, + typename leaf_node_t, + typename pin_t, + size_t node_size, + bool leaf_has_children> +struct is_fixed_kv_tree< + FixedKVBtree< + node_key_t, + node_val_t, + internal_node_t, + leaf_node_t, + pin_t, + node_size, + leaf_has_children>> : std::true_type {}; + +template < + typename tree_type_t, + typename node_key_t, + typename F, + std::enable_if_t<is_fixed_kv_tree<tree_type_t>::value, int> = 0> +auto with_btree( + Cache &cache, + op_context_t<node_key_t> c, + F &&f) { + return cache.get_root( + c.trans + ).si_then([f=std::forward<F>(f)](RootBlockRef croot) mutable { + return seastar::do_with( + tree_type_t(croot), + [f=std::move(f)](auto &btree) mutable { + return f(btree); + }); + }); +} + +template < + typename tree_type_t, + typename State, + typename node_key_t, + typename F, + std::enable_if_t<is_fixed_kv_tree<tree_type_t>::value, int> = 0> +auto with_btree_state( + Cache &cache, + op_context_t<node_key_t> c, + State &&init, + F &&f) { + return seastar::do_with( + std::forward<State>(init), + [&cache, c, f=std::forward<F>(f)](auto &state) mutable { + return with_btree<tree_type_t>( + cache, + c, + [&state, f=std::move(f)](auto &btree) mutable { + return f(btree, state); + }).si_then([&state] { + return seastar::make_ready_future<State>(std::move(state)); + }); + }); +} + +template < + typename tree_type_t, + typename State, + typename node_key_t, + typename F, + std::enable_if_t<is_fixed_kv_tree<tree_type_t>::value, int> = 0> +auto with_btree_state( + Cache &cache, + op_context_t<node_key_t> c, + F &&f) { + return crimson::os::seastore::with_btree_state<tree_type_t, State>( + cache, c, State{}, std::forward<F>(f)); +} + +template < + typename tree_type_t, + typename Ret, + typename node_key_t, + typename F> +auto with_btree_ret( + Cache &cache, + op_context_t<node_key_t> c, + F &&f) { + return with_btree_state<tree_type_t, Ret>( + cache, + c, + [f=std::forward<F>(f)](auto &btree, auto &ret) mutable { + return f( + btree + ).si_then([&ret](auto &&_ret) { + ret = std::move(_ret); + }); + }); +} + +} + diff --git a/src/crimson/os/seastore/btree/fixed_kv_node.cc b/src/crimson/os/seastore/btree/fixed_kv_node.cc new file mode 100644 index 000000000..00aceab92 --- /dev/null +++ b/src/crimson/os/seastore/btree/fixed_kv_node.cc @@ -0,0 +1,12 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "crimson/os/seastore/btree/fixed_kv_node.h" + +namespace crimson::os::seastore { + +bool is_valid_child_ptr(ChildableCachedExtent* child) { + return child != nullptr && child != RESERVATION_PTR; +} + +} // namespace crimson::os::seastore diff --git 
a/src/crimson/os/seastore/btree/fixed_kv_node.h b/src/crimson/os/seastore/btree/fixed_kv_node.h new file mode 100644 index 000000000..956a1824e --- /dev/null +++ b/src/crimson/os/seastore/btree/fixed_kv_node.h @@ -0,0 +1,1220 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <sys/mman.h> +#include <memory> +#include <string.h> + + +#include "include/buffer.h" + +#include "crimson/common/fixed_kv_node_layout.h" +#include "crimson/common/errorator.h" +#include "crimson/os/seastore/seastore_types.h" +#include "crimson/os/seastore/cache.h" +#include "crimson/os/seastore/cached_extent.h" + +#include "crimson/os/seastore/btree/btree_range_pin.h" +#include "crimson/os/seastore/btree/fixed_kv_btree.h" +#include "crimson/os/seastore/root_block.h" + +namespace crimson::os::seastore { + +/** + * FixedKVNode + * + * Base class enabling recursive lookup between internal and leaf nodes. + */ +template <typename node_key_t> +struct FixedKVNode : ChildableCachedExtent { + using FixedKVNodeRef = TCachedExtentRef<FixedKVNode>; + fixed_kv_node_meta_t<node_key_t> range; + + struct copy_source_cmp_t { + using is_transparent = node_key_t; + bool operator()(const FixedKVNodeRef &l, const FixedKVNodeRef &r) const { + assert(l->range.end <= r->range.begin + || r->range.end <= l->range.begin + || (l->range.begin == r->range.begin + && l->range.end == r->range.end)); + return l->range.begin < r->range.begin; + } + bool operator()(const node_key_t &l, const FixedKVNodeRef &r) const { + return l < r->range.begin; + } + bool operator()(const FixedKVNodeRef &l, const node_key_t &r) const { + return l->range.begin < r; + } + }; + + /* + * + * Nodes of fixed-kv-btree connect to their child nodes by pointers following + * invariants below: + * + * 1. if nodes are stable: + * a. parent points at the node's stable parent + * b. prior_instance is empty + * c. child pointers point at stable children. Child resolution is done + * directly via this array. + * c. copy_sources is empty + * 2. if nodes are mutation_pending: + * a. parent is empty and needs to be fixed upon commit + * b. prior_instance points to its stable version + * c. child pointers are null except for initial_pending() children of + * this transaction. Child resolution is done by first checking this + * array, and then recursively resolving via the parent. We copy child + * pointers from parent on commit. + * c. copy_sources is empty + * 3. if nodes are initial_pending + * a. parent points at its pending parent on this transaction (must exist) + * b. prior_instance is empty or, if it's the result of rewrite, points to + * its stable predecessor + * c. child pointers are null except for initial_pending() children of + * this transaction (live due to 3a below). Child resolution is done + * by first checking this array, and then recursively resolving via + * the correct copy_sources entry. We copy child pointers from copy_sources + * on commit. + * d. copy_sources contains the set of stable nodes at the same tree-level(only + * its "prior_instance" if the node is the result of a rewrite), with which + * the lba range of this node overlaps. 
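+ *
+ * Example (illustrative): resolving child i of a mutation_pending node first
+ * consults its own children[i]; if that slot is null the lookup falls back to
+ * the stable prior_instance (2.b) and reads the pointer there, since only
+ * children made initial_pending in this transaction are linked directly into
+ * the pending copy (2.c).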
+ */ + std::vector<ChildableCachedExtent*> children; + std::set<FixedKVNodeRef, copy_source_cmp_t> copy_sources; + uint16_t capacity = 0; + parent_tracker_t* my_tracker = nullptr; + RootBlockRef root_block; + + bool is_linked() { + assert(!has_parent_tracker() || !(bool)root_block); + return (bool)has_parent_tracker() || (bool)root_block; + } + + FixedKVNode(uint16_t capacity, ceph::bufferptr &&ptr) + : ChildableCachedExtent(std::move(ptr)), + children(capacity, nullptr), + capacity(capacity) {} + FixedKVNode(const FixedKVNode &rhs) + : ChildableCachedExtent(rhs), + range(rhs.range), + children(rhs.capacity, nullptr), + capacity(rhs.capacity) {} + + virtual fixed_kv_node_meta_t<node_key_t> get_node_meta() const = 0; + virtual uint16_t get_node_size() const = 0; + + virtual ~FixedKVNode() = default; + virtual node_key_t get_key_from_idx(uint16_t idx) const = 0; + + template<typename iter_t> + void update_child_ptr(iter_t iter, ChildableCachedExtent* child) { + children[iter.get_offset()] = child; + set_child_ptracker(child); + } + + virtual bool is_leaf_and_has_children() const = 0; + + template<typename iter_t> + void insert_child_ptr(iter_t iter, ChildableCachedExtent* child) { + auto raw_children = children.data(); + auto offset = iter.get_offset(); + std::memmove( + &raw_children[offset + 1], + &raw_children[offset], + (get_node_size() - offset) * sizeof(ChildableCachedExtent*)); + if (child) { + children[offset] = child; + set_child_ptracker(child); + } else { + // this can only happen when reserving lba spaces + ceph_assert(is_leaf_and_has_children()); + // this is to avoid mistakenly copying pointers from + // copy sources when committing this lba node, because + // we rely on pointers' "nullness" to avoid copying + // pointers for updated values + children[offset] = RESERVATION_PTR; + } + } + + template<typename iter_t> + void remove_child_ptr(iter_t iter) { + LOG_PREFIX(FixedKVNode::remove_child_ptr); + auto raw_children = children.data(); + auto offset = iter.get_offset(); + SUBTRACE(seastore_fixedkv_tree, "trans.{}, pos {}, total size {}, extent {}", + this->pending_for_transaction, + offset, + get_node_size(), + (void*)raw_children[offset]); + // parent tracker of the child being removed will be + // reset when the child is invalidated, so no need to + // reset it here + std::memmove( + &raw_children[offset], + &raw_children[offset + 1], + (get_node_size() - offset - 1) * sizeof(ChildableCachedExtent*)); + } + + FixedKVNode& get_stable_for_key(node_key_t key) { + ceph_assert(is_pending()); + if (is_mutation_pending()) { + return (FixedKVNode&)*get_prior_instance(); + } else { + ceph_assert(!copy_sources.empty()); + auto it = copy_sources.upper_bound(key); + it--; + auto ©_source = *it; + ceph_assert(copy_source->get_node_meta().is_in_range(key)); + return *copy_source; + } + } + + static void push_copy_sources( + FixedKVNode &dest, + FixedKVNode &src) + { + ceph_assert(dest.is_initial_pending()); + if (!src.is_pending()) { + dest.copy_sources.emplace(&src); + } else if (src.is_mutation_pending()) { + dest.copy_sources.emplace( + src.get_prior_instance()->template cast<FixedKVNode>()); + } else { + ceph_assert(src.is_initial_pending()); + dest.copy_sources.insert( + src.copy_sources.begin(), + src.copy_sources.end()); + } + } + + virtual uint16_t get_node_split_pivot() = 0; + + static void move_child_ptrs( + FixedKVNode &dest, + FixedKVNode &src, + size_t dest_start, + size_t src_start, + size_t src_end) + { + std::memmove( + dest.children.data() + dest_start, + 
src.children.data() + src_start, + (src_end - src_start) * sizeof(ChildableCachedExtent*)); + + ceph_assert(src_start < src_end); + ceph_assert(src.children.size() >= src_end); + for (auto it = src.children.begin() + src_start; + it != src.children.begin() + src_end; + it++) + { + auto child = *it; + if (is_valid_child_ptr(child)) { + dest.set_child_ptracker(child); + } + } + } + + void link_child(ChildableCachedExtent* child, uint16_t pos) { + assert(pos < get_node_size()); + assert(child); + ceph_assert(!is_pending()); + ceph_assert(child->is_valid() && !child->is_pending()); + assert(!children[pos]); + children[pos] = child; + set_child_ptracker(child); + } + + virtual get_child_ret_t<LogicalCachedExtent> + get_logical_child(op_context_t<node_key_t> c, uint16_t pos) = 0; + + template <typename T, typename iter_t> + get_child_ret_t<T> get_child(op_context_t<node_key_t> c, iter_t iter) { + auto pos = iter.get_offset(); + assert(children.capacity()); + auto child = children[pos]; + if (is_valid_child_ptr(child)) { + ceph_assert(child->get_type() == T::TYPE); + return c.cache.template get_extent_viewable_by_trans<T>(c.trans, (T*)child); + } else if (is_pending()) { + auto key = iter.get_key(); + auto &sparent = get_stable_for_key(key); + auto spos = sparent.child_pos_for_key(key); + auto child = sparent.children[spos]; + if (is_valid_child_ptr(child)) { + ceph_assert(child->get_type() == T::TYPE); + return c.cache.template get_extent_viewable_by_trans<T>(c.trans, (T*)child); + } else { + return child_pos_t(&sparent, spos); + } + } else { + return child_pos_t(this, pos); + } + } + + void split_child_ptrs( + FixedKVNode &left, + FixedKVNode &right) + { + assert(!left.my_tracker); + assert(!right.my_tracker); + push_copy_sources(left, *this); + push_copy_sources(right, *this); + if (is_pending()) { + uint16_t pivot = get_node_split_pivot(); + move_child_ptrs(left, *this, 0, 0, pivot); + move_child_ptrs(right, *this, 0, pivot, get_node_size()); + my_tracker = nullptr; + } + } + + void merge_child_ptrs( + FixedKVNode &left, + FixedKVNode &right) + { + ceph_assert(!my_tracker); + push_copy_sources(*this, left); + push_copy_sources(*this, right); + + if (left.is_pending()) { + move_child_ptrs(*this, left, 0, 0, left.get_node_size()); + left.my_tracker = nullptr; + } + + if (right.is_pending()) { + move_child_ptrs(*this, right, left.get_node_size(), 0, right.get_node_size()); + right.my_tracker = nullptr; + } + } + + static void balance_child_ptrs( + FixedKVNode &left, + FixedKVNode &right, + bool prefer_left, + FixedKVNode &replacement_left, + FixedKVNode &replacement_right) + { + size_t l_size = left.get_node_size(); + size_t r_size = right.get_node_size(); + size_t total = l_size + r_size; + size_t pivot_idx = (l_size + r_size) / 2; + if (total % 2 && prefer_left) { + pivot_idx++; + } + + assert(!replacement_left.my_tracker); + assert(!replacement_right.my_tracker); + if (pivot_idx < l_size) { + // deal with left + push_copy_sources(replacement_left, left); + push_copy_sources(replacement_right, left); + if (left.is_pending()) { + move_child_ptrs(replacement_left, left, 0, 0, pivot_idx); + move_child_ptrs(replacement_right, left, 0, pivot_idx, l_size); + left.my_tracker = nullptr; + } + + // deal with right + push_copy_sources(replacement_right, right); + if (right.is_pending()) { + move_child_ptrs(replacement_right, right, l_size - pivot_idx, 0, r_size); + right.my_tracker= nullptr; + } + } else { + // deal with left + push_copy_sources(replacement_left, left); + if (left.is_pending()) { + 
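+      // pivot_idx >= l_size here, so all of left's child pointers stay in
+      // replacement_left; right's pointers are split at pivot_idx - l_size below.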
move_child_ptrs(replacement_left, left, 0, 0, l_size); + left.my_tracker = nullptr; + } + + // deal with right + push_copy_sources(replacement_left, right); + push_copy_sources(replacement_right, right); + if (right.is_pending()) { + move_child_ptrs(replacement_left, right, l_size, 0, pivot_idx - l_size); + move_child_ptrs(replacement_right, right, 0, pivot_idx - l_size, r_size); + right.my_tracker= nullptr; + } + } + } + + void set_parent_tracker_from_prior_instance() { + assert(is_mutation_pending()); + auto &prior = (FixedKVNode&)(*get_prior_instance()); + if (range.is_root()) { + ceph_assert(prior.root_block); + ceph_assert(pending_for_transaction); + root_block = prior.root_block; + link_phy_tree_root_node(root_block, this); + return; + } + ceph_assert(!root_block); + take_prior_parent_tracker(); + assert(is_parent_valid()); + auto parent = get_parent_node<FixedKVNode>(); + //TODO: can this search be avoided? + auto off = parent->lower_bound_offset(get_node_meta().begin); + assert(parent->get_key_from_idx(off) == get_node_meta().begin); + parent->children[off] = this; + } + + bool is_children_empty() const { + for (auto it = children.begin(); + it != children.begin() + get_node_size(); + it++) { + if (is_valid_child_ptr(*it) + && (*it)->is_valid()) { + return false; + } + } + return true; + } + + void set_children_from_prior_instance() { + assert(get_prior_instance()); + auto &prior = (FixedKVNode&)(*get_prior_instance()); + assert(prior.my_tracker || prior.is_children_empty()); + + if (prior.my_tracker) { + prior.my_tracker->reset_parent(this); + my_tracker = prior.my_tracker; + // All my initial pending children is pointing to the original + // tracker which has been dropped by the above line, so need + // to adjust them to point to the new tracker + adjust_ptracker_for_children(); + } + assert(my_tracker || is_children_empty()); + } + + void adjust_ptracker_for_children() { + auto begin = children.begin(); + auto end = begin + get_node_size(); + ceph_assert(end <= children.end()); + for (auto it = begin; it != end; it++) { + auto child = *it; + if (is_valid_child_ptr(child)) { + set_child_ptracker(child); + } + } + } + + void on_delta_write(paddr_t record_block_offset) final { + // All in-memory relative addrs are necessarily record-relative + assert(get_prior_instance()); + assert(pending_for_transaction); + resolve_relative_addrs(record_block_offset); + } + + virtual uint16_t lower_bound_offset(node_key_t) const = 0; + virtual uint16_t upper_bound_offset(node_key_t) const = 0; + virtual uint16_t child_pos_for_key(node_key_t) const = 0; + + virtual bool validate_stable_children() = 0; + + template<typename iter_t> + uint16_t copy_children_from_stable_source( + FixedKVNode &source, + iter_t foreign_start_it, + iter_t foreign_end_it, + iter_t local_start_it) { + auto foreign_it = foreign_start_it, local_it = local_start_it; + while (foreign_it != foreign_end_it + && local_it.get_offset() < get_node_size()) + { + auto &child = children[local_it.get_offset()]; + if (foreign_it.get_key() == local_it.get_key()) { + // the foreign key is preserved + if (!child) { + child = source.children[foreign_it.get_offset()]; + } + foreign_it++; + local_it++; + } else if (foreign_it.get_key() < local_it.get_key()) { + // the foreign key has been removed, because, if it hasn't, + // there must have been a local key before the one pointed + // by the current "local_it" that's equal to this foreign key + // and has pushed the foreign_it forward. 
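+        // (i.e. skip source entries whose keys no longer exist in this node)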
+ foreign_it++; + } else { + // the local key must be a newly inserted one. + local_it++; + } + } + return local_it.get_offset(); + } + + template<typename Func> + void copy_children_from_stable_sources(Func &&get_iter) { + if (!copy_sources.empty()) { + auto it = --copy_sources.upper_bound(get_node_meta().begin); + auto &cs = *it; + uint16_t start_pos = cs->lower_bound_offset( + get_node_meta().begin); + if (start_pos == cs->get_node_size()) { + it++; + start_pos = 0; + } + uint16_t local_next_pos = 0; + for (; it != copy_sources.end(); it++) { + auto& copy_source = *it; + auto end_pos = copy_source->get_node_size(); + if (copy_source->get_node_meta().is_in_range(get_node_meta().end)) { + end_pos = copy_source->upper_bound_offset(get_node_meta().end); + } + auto local_start_iter = get_iter(*this, local_next_pos); + auto foreign_start_iter = get_iter(*copy_source, start_pos); + auto foreign_end_iter = get_iter(*copy_source, end_pos); + local_next_pos = copy_children_from_stable_source( + *copy_source, foreign_start_iter, foreign_end_iter, local_start_iter); + if (end_pos != copy_source->get_node_size()) { + break; + } + start_pos = 0; + } + } + } + + void on_invalidated(Transaction &t) final { + reset_parent_tracker(); + } + + bool is_rewrite() { + return is_initial_pending() && get_prior_instance(); + } + + void on_initial_write() final { + // All in-memory relative addrs are necessarily block-relative + resolve_relative_addrs(get_paddr()); + if (range.is_root()) { + reset_parent_tracker(); + } + assert(has_parent_tracker() ? (is_parent_valid()) : true); + } + + void set_child_ptracker(ChildableCachedExtent *child) { + if (!this->my_tracker) { + this->my_tracker = new parent_tracker_t(this); + } + child->reset_parent_tracker(this->my_tracker); + } + + void on_clean_read() final { + // From initial write of block, relative addrs are necessarily block-relative + resolve_relative_addrs(get_paddr()); + } + + virtual void resolve_relative_addrs(paddr_t base) = 0; +}; + +/** + * FixedKVInternalNode + * + * Abstracts operations on and layout of internal nodes for the + * FixedKVBTree. 
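+ *
+ * Keys map the lower bounds of child ranges to child paddrs; the on-disk
+ * layout comes from FixedKVNodeLayout and the in-memory child pointers from
+ * FixedKVNode, so this class mainly ties journalled mutations
+ * (update/insert/remove/replace) to the matching child-pointer bookkeeping.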
+ */ +template < + size_t CAPACITY, + typename NODE_KEY, + typename NODE_KEY_LE, + size_t node_size, + typename node_type_t> +struct FixedKVInternalNode + : FixedKVNode<NODE_KEY>, + common::FixedKVNodeLayout< + CAPACITY, + fixed_kv_node_meta_t<NODE_KEY>, + fixed_kv_node_meta_le_t<NODE_KEY_LE>, + NODE_KEY, NODE_KEY_LE, + paddr_t, paddr_le_t> { + using Ref = TCachedExtentRef<node_type_t>; + using base_t = FixedKVNode<NODE_KEY>; + using base_ref = typename FixedKVNode<NODE_KEY>::FixedKVNodeRef; + using node_layout_t = + common::FixedKVNodeLayout< + CAPACITY, + fixed_kv_node_meta_t<NODE_KEY>, + fixed_kv_node_meta_le_t<NODE_KEY_LE>, + NODE_KEY, + NODE_KEY_LE, + paddr_t, + paddr_le_t>; + using internal_const_iterator_t = typename node_layout_t::const_iterator; + using internal_iterator_t = typename node_layout_t::iterator; + using this_type_t = FixedKVInternalNode< + CAPACITY, + NODE_KEY, + NODE_KEY_LE, + node_size, + node_type_t>; + + FixedKVInternalNode(ceph::bufferptr &&ptr) + : FixedKVNode<NODE_KEY>(CAPACITY, std::move(ptr)), + node_layout_t(this->get_bptr().c_str()) {} + FixedKVInternalNode(const FixedKVInternalNode &rhs) + : FixedKVNode<NODE_KEY>(rhs), + node_layout_t(this->get_bptr().c_str()) {} + + bool is_leaf_and_has_children() const final { + return false; + } + + uint16_t get_node_split_pivot() final { + return this->get_split_pivot().get_offset(); + } + + void prepare_commit() final { + if (this->is_initial_pending()) { + if (this->is_rewrite()) { + this->set_children_from_prior_instance(); + } + this->copy_children_from_stable_sources( + [this](base_t &node, uint16_t pos) { + ceph_assert(node.get_type() == this->get_type()); + auto &n = static_cast<this_type_t&>(node); + return n.iter_idx(pos); + } + ); + if (this->is_rewrite()) { + this->reset_prior_instance(); + } else { + this->adjust_ptracker_for_children(); + } + assert(this->validate_stable_children()); + this->copy_sources.clear(); + } + } + + get_child_ret_t<LogicalCachedExtent> + get_logical_child(op_context_t<NODE_KEY>, uint16_t pos) final { + ceph_abort("impossible"); + return get_child_ret_t<LogicalCachedExtent>(child_pos_t(nullptr, 0)); + } + + bool validate_stable_children() final { + LOG_PREFIX(FixedKVInternalNode::validate_stable_children); + if (this->children.empty()) { + return false; + } + + for (auto i : *this) { + auto child = (FixedKVNode<NODE_KEY>*)this->children[i.get_offset()]; + if (child && child->range.begin != i.get_key()) { + SUBERROR(seastore_fixedkv_tree, + "stable child not valid: child {}, child meta{}, key {}", + *child, + child->get_node_meta(), + i.get_key()); + ceph_abort(); + return false; + } + } + return true; + } + + virtual ~FixedKVInternalNode() { + if (this->is_valid() && !this->is_pending()) { + if (this->range.is_root()) { + ceph_assert(this->root_block); + unlink_phy_tree_root_node<NODE_KEY>(this->root_block); + } else { + ceph_assert(this->is_parent_valid()); + auto parent = this->template get_parent_node<FixedKVNode<NODE_KEY>>(); + auto off = parent->lower_bound_offset(this->get_meta().begin); + assert(parent->get_key_from_idx(off) == this->get_meta().begin); + assert(parent->children[off] == this); + parent->children[off] = nullptr; + } + } + } + + uint16_t lower_bound_offset(NODE_KEY key) const final { + return this->lower_bound(key).get_offset(); + } + + uint16_t upper_bound_offset(NODE_KEY key) const final { + return this->upper_bound(key).get_offset(); + } + + uint16_t child_pos_for_key(NODE_KEY key) const final { + auto it = this->upper_bound(key); + assert(it != 
this->begin()); + --it; + return it.get_offset(); + } + + NODE_KEY get_key_from_idx(uint16_t idx) const final { + return this->iter_idx(idx).get_key(); + } + + fixed_kv_node_meta_t<NODE_KEY> get_node_meta() const { + return this->get_meta(); + } + + uint16_t get_node_size() const final { + return this->get_size(); + } + + typename node_layout_t::delta_buffer_t delta_buffer; + typename node_layout_t::delta_buffer_t *maybe_get_delta_buffer() { + return this->is_mutation_pending() + ? &delta_buffer : nullptr; + } + + CachedExtentRef duplicate_for_write(Transaction&) override { + assert(delta_buffer.empty()); + return CachedExtentRef(new node_type_t(*this)); + }; + + void on_replace_prior(Transaction&) final { + ceph_assert(!this->is_rewrite()); + this->set_children_from_prior_instance(); + auto &prior = (this_type_t&)(*this->get_prior_instance()); + auto copied = this->copy_children_from_stable_source( + prior, + prior.begin(), + prior.end(), + this->begin()); + ceph_assert(copied <= get_node_size()); + assert(this->validate_stable_children()); + this->set_parent_tracker_from_prior_instance(); + } + + void update( + internal_const_iterator_t iter, + paddr_t addr, + FixedKVNode<NODE_KEY>* nextent) { + LOG_PREFIX(FixedKVInternalNode::update); + SUBTRACE(seastore_fixedkv_tree, "trans.{}, pos {}, {}", + this->pending_for_transaction, + iter.get_offset(), + *nextent); + this->update_child_ptr(iter, nextent); + return this->journal_update( + iter, + this->maybe_generate_relative(addr), + maybe_get_delta_buffer()); + } + + void insert( + internal_const_iterator_t iter, + NODE_KEY pivot, + paddr_t addr, + FixedKVNode<NODE_KEY>* nextent) { + LOG_PREFIX(FixedKVInternalNode::insert); + SUBTRACE(seastore_fixedkv_tree, "trans.{}, pos {}, key {}, {}", + this->pending_for_transaction, + iter.get_offset(), + pivot, + *nextent); + this->insert_child_ptr(iter, nextent); + return this->journal_insert( + iter, + pivot, + this->maybe_generate_relative(addr), + maybe_get_delta_buffer()); + } + + void remove(internal_const_iterator_t iter) { + LOG_PREFIX(FixedKVInternalNode::remove); + SUBTRACE(seastore_fixedkv_tree, "trans.{}, pos {}, key {}", + this->pending_for_transaction, + iter.get_offset(), + iter.get_key()); + this->remove_child_ptr(iter); + return this->journal_remove( + iter, + maybe_get_delta_buffer()); + } + + void replace( + internal_const_iterator_t iter, + NODE_KEY pivot, + paddr_t addr, + FixedKVNode<NODE_KEY>* nextent) { + LOG_PREFIX(FixedKVInternalNode::replace); + SUBTRACE(seastore_fixedkv_tree, "trans.{}, pos {}, old key {}, key {}, {}", + this->pending_for_transaction, + iter.get_offset(), + iter.get_key(), + pivot, + *nextent); + this->update_child_ptr(iter, nextent); + return this->journal_replace( + iter, + pivot, + this->maybe_generate_relative(addr), + maybe_get_delta_buffer()); + } + + std::tuple<Ref, Ref, NODE_KEY> + make_split_children(op_context_t<NODE_KEY> c) { + auto left = c.cache.template alloc_new_extent<node_type_t>( + c.trans, node_size, placement_hint_t::HOT, INIT_GENERATION); + auto right = c.cache.template alloc_new_extent<node_type_t>( + c.trans, node_size, placement_hint_t::HOT, INIT_GENERATION); + this->split_child_ptrs(*left, *right); + auto pivot = this->split_into(*left, *right); + left->range = left->get_meta(); + right->range = right->get_meta(); + return std::make_tuple( + left, + right, + pivot); + } + + Ref make_full_merge( + op_context_t<NODE_KEY> c, + Ref &right) { + auto replacement = c.cache.template alloc_new_extent<node_type_t>( + c.trans, node_size, 
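+      // hint/generation below: the merged node is allocated hot with the
+      // initial rewrite generation, the same policy used for splits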
placement_hint_t::HOT, INIT_GENERATION); + replacement->merge_child_ptrs(*this, *right); + replacement->merge_from(*this, *right->template cast<node_type_t>()); + replacement->range = replacement->get_meta(); + return replacement; + } + + std::tuple<Ref, Ref, NODE_KEY> + make_balanced( + op_context_t<NODE_KEY> c, + Ref &_right, + bool prefer_left) { + ceph_assert(_right->get_type() == this->get_type()); + auto &right = *_right->template cast<node_type_t>(); + auto replacement_left = c.cache.template alloc_new_extent<node_type_t>( + c.trans, node_size, placement_hint_t::HOT, INIT_GENERATION); + auto replacement_right = c.cache.template alloc_new_extent<node_type_t>( + c.trans, node_size, placement_hint_t::HOT, INIT_GENERATION); + + auto pivot = this->balance_into_new_nodes( + *this, + right, + prefer_left, + *replacement_left, + *replacement_right); + this->balance_child_ptrs( + *this, + right, + prefer_left, + *replacement_left, + *replacement_right); + + replacement_left->range = replacement_left->get_meta(); + replacement_right->range = replacement_right->get_meta(); + return std::make_tuple( + replacement_left, + replacement_right, + pivot); + } + + /** + * Internal relative addresses on read or in memory prior to commit + * are either record or block relative depending on whether this + * physical node is is_initial_pending() or just is_mutable(). + * + * User passes appropriate base depending on lifecycle and + * resolve_relative_addrs fixes up relative internal references + * based on base. + */ + void resolve_relative_addrs(paddr_t base) + { + LOG_PREFIX(FixedKVInternalNode::resolve_relative_addrs); + for (auto i: *this) { + if (i->get_val().is_relative()) { + auto updated = base.add_relative(i->get_val()); + SUBTRACE(seastore_fixedkv_tree, "{} -> {}", i->get_val(), updated); + i->set_val(updated); + } + } + } + + void node_resolve_vals( + internal_iterator_t from, + internal_iterator_t to) const { + if (this->is_initial_pending()) { + for (auto i = from; i != to; ++i) { + if (i->get_val().is_relative()) { + assert(i->get_val().is_block_relative()); + i->set_val(this->get_paddr().add_relative(i->get_val())); + } + } + } + } + void node_unresolve_vals( + internal_iterator_t from, + internal_iterator_t to) const { + if (this->is_initial_pending()) { + for (auto i = from; i != to; ++i) { + if (i->get_val().is_relative()) { + assert(i->get_val().is_record_relative()); + i->set_val(i->get_val().block_relative_to(this->get_paddr())); + } + } + } + } + + std::ostream &_print_detail(std::ostream &out) const + { + out << ", size=" << this->get_size() + << ", meta=" << this->get_meta() + << ", my_tracker=" << (void*)this->my_tracker; + if (this->my_tracker) { + out << ", my_tracker->parent=" << (void*)this->my_tracker->get_parent().get(); + } + return out << ", root_block=" << (void*)this->root_block.get(); + } + + ceph::bufferlist get_delta() { + ceph::buffer::ptr bptr(delta_buffer.get_bytes()); + delta_buffer.copy_out(bptr.c_str(), bptr.length()); + ceph::bufferlist bl; + bl.push_back(bptr); + return bl; + } + + void apply_delta_and_adjust_crc( + paddr_t base, const ceph::bufferlist &_bl) { + assert(_bl.length()); + ceph::bufferlist bl = _bl; + bl.rebuild(); + typename node_layout_t::delta_buffer_t buffer; + buffer.copy_in(bl.front().c_str(), bl.front().length()); + buffer.replay(*this); + this->set_last_committed_crc(this->get_crc32c()); + resolve_relative_addrs(base); + } + + constexpr static size_t get_min_capacity() { + return (node_layout_t::get_capacity() - 1) / 2; + } + + bool 
at_max_capacity() const { + assert(this->get_size() <= node_layout_t::get_capacity()); + return this->get_size() == node_layout_t::get_capacity(); + } + + bool at_min_capacity() const { + assert(this->get_size() >= (get_min_capacity() - 1)); + return this->get_size() <= get_min_capacity(); + } + + bool below_min_capacity() const { + assert(this->get_size() >= (get_min_capacity() - 1)); + return this->get_size() < get_min_capacity(); + } +}; + +template < + size_t CAPACITY, + typename NODE_KEY, + typename NODE_KEY_LE, + typename VAL, + typename VAL_LE, + size_t node_size, + typename node_type_t, + bool has_children> +struct FixedKVLeafNode + : FixedKVNode<NODE_KEY>, + common::FixedKVNodeLayout< + CAPACITY, + fixed_kv_node_meta_t<NODE_KEY>, + fixed_kv_node_meta_le_t<NODE_KEY_LE>, + NODE_KEY, NODE_KEY_LE, + VAL, VAL_LE> { + using Ref = TCachedExtentRef<node_type_t>; + using node_layout_t = + common::FixedKVNodeLayout< + CAPACITY, + fixed_kv_node_meta_t<NODE_KEY>, + fixed_kv_node_meta_le_t<NODE_KEY_LE>, + NODE_KEY, + NODE_KEY_LE, + VAL, + VAL_LE>; + using internal_const_iterator_t = typename node_layout_t::const_iterator; + using this_type_t = FixedKVLeafNode< + CAPACITY, + NODE_KEY, + NODE_KEY_LE, + VAL, + VAL_LE, + node_size, + node_type_t, + has_children>; + using base_t = FixedKVNode<NODE_KEY>; + FixedKVLeafNode(ceph::bufferptr &&ptr) + : FixedKVNode<NODE_KEY>(has_children ? CAPACITY : 0, std::move(ptr)), + node_layout_t(this->get_bptr().c_str()) {} + FixedKVLeafNode(const FixedKVLeafNode &rhs) + : FixedKVNode<NODE_KEY>(rhs), + node_layout_t(this->get_bptr().c_str()) {} + + static constexpr bool do_has_children = has_children; + + bool is_leaf_and_has_children() const final { + return has_children; + } + + uint16_t get_node_split_pivot() final { + return this->get_split_pivot().get_offset(); + } + + get_child_ret_t<LogicalCachedExtent> + get_logical_child(op_context_t<NODE_KEY> c, uint16_t pos) final { + auto child = this->children[pos]; + if (is_valid_child_ptr(child)) { + ceph_assert(child->is_logical()); + return c.cache.template get_extent_viewable_by_trans< + LogicalCachedExtent>(c.trans, (LogicalCachedExtent*)child); + } else if (this->is_pending()) { + auto key = this->iter_idx(pos).get_key(); + auto &sparent = this->get_stable_for_key(key); + auto spos = sparent.child_pos_for_key(key); + auto child = sparent.children[spos]; + if (is_valid_child_ptr(child)) { + ceph_assert(child->is_logical()); + return c.cache.template get_extent_viewable_by_trans< + LogicalCachedExtent>(c.trans, (LogicalCachedExtent*)child); + } else { + return child_pos_t(&sparent, spos); + } + } else { + return child_pos_t(this, pos); + } + } + + bool validate_stable_children() override { + return true; + } + + virtual ~FixedKVLeafNode() { + if (this->is_valid() && !this->is_pending()) { + if (this->range.is_root()) { + ceph_assert(this->root_block); + unlink_phy_tree_root_node<NODE_KEY>(this->root_block); + } else { + ceph_assert(this->is_parent_valid()); + auto parent = this->template get_parent_node<FixedKVNode<NODE_KEY>>(); + auto off = parent->lower_bound_offset(this->get_meta().begin); + assert(parent->get_key_from_idx(off) == this->get_meta().begin); + assert(parent->children[off] == this); + parent->children[off] = nullptr; + } + } + } + + void prepare_commit() final { + if constexpr (has_children) { + if (this->is_initial_pending()) { + if (this->is_rewrite()) { + this->set_children_from_prior_instance(); + } + this->copy_children_from_stable_sources( + [this](base_t &node, uint16_t pos) { + 
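+          // adaptor for copy_children_from_stable_sources(): maps a (source
+          // node, position) pair to that node's layout iterator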
ceph_assert(node.get_type() == this->get_type()); + auto &n = static_cast<this_type_t&>(node); + return n.iter_idx(pos); + } + ); + if (this->is_rewrite()) { + this->reset_prior_instance(); + } else { + this->adjust_ptracker_for_children(); + } + assert(this->validate_stable_children()); + this->copy_sources.clear(); + } + } + assert(this->is_initial_pending() + ? this->copy_sources.empty(): + true); + } + + void on_replace_prior(Transaction&) final { + ceph_assert(!this->is_rewrite()); + if constexpr (has_children) { + this->set_children_from_prior_instance(); + auto &prior = (this_type_t&)(*this->get_prior_instance()); + auto copied = this->copy_children_from_stable_source( + prior, + prior.begin(), + prior.end(), + this->begin()); + ceph_assert(copied <= get_node_size()); + assert(this->validate_stable_children()); + this->set_parent_tracker_from_prior_instance(); + } else { + this->set_parent_tracker_from_prior_instance(); + } + } + + uint16_t lower_bound_offset(NODE_KEY key) const final { + return this->lower_bound(key).get_offset(); + } + + uint16_t upper_bound_offset(NODE_KEY key) const final { + return this->upper_bound(key).get_offset(); + } + + uint16_t child_pos_for_key(NODE_KEY key) const final { + return lower_bound_offset(key); + } + + NODE_KEY get_key_from_idx(uint16_t idx) const final { + return this->iter_idx(idx).get_key(); + } + + fixed_kv_node_meta_t<NODE_KEY> get_node_meta() const { + return this->get_meta(); + } + + uint16_t get_node_size() const final { + return this->get_size(); + } + + typename node_layout_t::delta_buffer_t delta_buffer; + virtual typename node_layout_t::delta_buffer_t *maybe_get_delta_buffer() { + return this->is_mutation_pending() ? &delta_buffer : nullptr; + } + + CachedExtentRef duplicate_for_write(Transaction&) override { + assert(delta_buffer.empty()); + return CachedExtentRef(new node_type_t(*this)); + }; + + virtual void update( + internal_const_iterator_t iter, + VAL val, + LogicalCachedExtent* nextent) = 0; + virtual internal_const_iterator_t insert( + internal_const_iterator_t iter, + NODE_KEY addr, + VAL val, + LogicalCachedExtent* nextent) = 0; + virtual void remove(internal_const_iterator_t iter) = 0; + + std::tuple<Ref, Ref, NODE_KEY> + make_split_children(op_context_t<NODE_KEY> c) { + auto left = c.cache.template alloc_new_extent<node_type_t>( + c.trans, node_size, placement_hint_t::HOT, INIT_GENERATION); + auto right = c.cache.template alloc_new_extent<node_type_t>( + c.trans, node_size, placement_hint_t::HOT, INIT_GENERATION); + if constexpr (has_children) { + this->split_child_ptrs(*left, *right); + } + auto pivot = this->split_into(*left, *right); + left->range = left->get_meta(); + right->range = right->get_meta(); + return std::make_tuple( + left, + right, + pivot); + } + + Ref make_full_merge( + op_context_t<NODE_KEY> c, + Ref &right) { + auto replacement = c.cache.template alloc_new_extent<node_type_t>( + c.trans, node_size, placement_hint_t::HOT, INIT_GENERATION); + if constexpr (has_children) { + replacement->merge_child_ptrs(*this, *right); + } + replacement->merge_from(*this, *right->template cast<node_type_t>()); + replacement->range = replacement->get_meta(); + return replacement; + } + + std::tuple<Ref, Ref, NODE_KEY> + make_balanced( + op_context_t<NODE_KEY> c, + Ref &_right, + bool prefer_left) { + ceph_assert(_right->get_type() == this->get_type()); + auto &right = *_right->template cast<node_type_t>(); + auto replacement_left = c.cache.template alloc_new_extent<node_type_t>( + c.trans, node_size, 
placement_hint_t::HOT, INIT_GENERATION); + auto replacement_right = c.cache.template alloc_new_extent<node_type_t>( + c.trans, node_size, placement_hint_t::HOT, INIT_GENERATION); + + auto pivot = this->balance_into_new_nodes( + *this, + right, + prefer_left, + *replacement_left, + *replacement_right); + if constexpr (has_children) { + this->balance_child_ptrs( + *this, + right, + prefer_left, + *replacement_left, + *replacement_right); + } + + replacement_left->range = replacement_left->get_meta(); + replacement_right->range = replacement_right->get_meta(); + return std::make_tuple( + replacement_left, + replacement_right, + pivot); + } + + ceph::bufferlist get_delta() { + ceph::buffer::ptr bptr(delta_buffer.get_bytes()); + delta_buffer.copy_out(bptr.c_str(), bptr.length()); + ceph::bufferlist bl; + bl.push_back(bptr); + return bl; + } + + void apply_delta_and_adjust_crc( + paddr_t base, const ceph::bufferlist &_bl) { + assert(_bl.length()); + ceph::bufferlist bl = _bl; + bl.rebuild(); + typename node_layout_t::delta_buffer_t buffer; + buffer.copy_in(bl.front().c_str(), bl.front().length()); + buffer.replay(*this); + this->set_last_committed_crc(this->get_crc32c()); + this->resolve_relative_addrs(base); + } + + std::ostream &_print_detail(std::ostream &out) const + { + return out << ", size=" << this->get_size() + << ", meta=" << this->get_meta(); + } + + constexpr static size_t get_min_capacity() { + return (node_layout_t::get_capacity() - 1) / 2; + } + + bool at_max_capacity() const { + assert(this->get_size() <= node_layout_t::get_capacity()); + return this->get_size() == node_layout_t::get_capacity(); + } + + bool at_min_capacity() const { + assert(this->get_size() >= (get_min_capacity() - 1)); + return this->get_size() <= get_min_capacity(); + } + + bool below_min_capacity() const { + assert(this->get_size() >= (get_min_capacity() - 1)); + return this->get_size() < get_min_capacity(); + } +}; + +} // namespace crimson::os::seastore + +#if FMT_VERSION >= 90000 +template <> +struct fmt::formatter< + crimson::os::seastore::FixedKVNode< + crimson::os::seastore::laddr_t>> : fmt::ostream_formatter {}; +template <> +struct fmt::formatter< + crimson::os::seastore::FixedKVNode< + crimson::os::seastore::paddr_t>> : fmt::ostream_formatter {}; +#endif diff --git a/src/crimson/os/seastore/cache.cc b/src/crimson/os/seastore/cache.cc new file mode 100644 index 000000000..4d1dc9296 --- /dev/null +++ b/src/crimson/os/seastore/cache.cc @@ -0,0 +1,2040 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "crimson/os/seastore/cache.h" + +#include <sstream> +#include <string_view> + +#include <seastar/core/metrics.hh> + +#include "crimson/os/seastore/logging.h" +#include "crimson/common/config_proxy.h" +#include "crimson/os/seastore/async_cleaner.h" + +// included for get_extent_by_type +#include "crimson/os/seastore/collection_manager/collection_flat_node.h" +#include "crimson/os/seastore/lba_manager/btree/lba_btree_node.h" +#include "crimson/os/seastore/omap_manager/btree/omap_btree_node_impl.h" +#include "crimson/os/seastore/object_data_handler.h" +#include "crimson/os/seastore/collection_manager/collection_flat_node.h" +#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/seastore.h" +#include "crimson/os/seastore/backref/backref_tree_node.h" +#include "test/crimson/seastore/test_block.h" + +using std::string_view; + +SET_SUBSYS(seastore_cache); + +namespace crimson::os::seastore { + +std::ostream 
&operator<<(std::ostream &out, const backref_entry_t &ent) { + return out << "backref_entry_t{" + << ent.paddr << "~" << ent.len << ", " + << "laddr: " << ent.laddr << ", " + << "type: " << ent.type << ", " + << "seq: " << ent.seq << ", " + << "}"; +} + +Cache::Cache( + ExtentPlacementManager &epm) + : epm(epm), + lru(crimson::common::get_conf<Option::size_t>( + "seastore_cache_lru_size")) +{ + LOG_PREFIX(Cache::Cache); + INFO("created, lru_size={}", lru.get_capacity()); + register_metrics(); + segment_providers_by_device_id.resize(DEVICE_ID_MAX, nullptr); +} + +Cache::~Cache() +{ + LOG_PREFIX(Cache::~Cache); + for (auto &i: extents) { + ERROR("extent is still alive -- {}", i); + } + ceph_assert(extents.empty()); +} + +Cache::retire_extent_ret Cache::retire_extent_addr( + Transaction &t, paddr_t addr, extent_len_t length) +{ + LOG_PREFIX(Cache::retire_extent_addr); + TRACET("retire {}~{}", t, addr, length); + + assert(addr.is_real() && !addr.is_block_relative()); + + CachedExtentRef ext; + auto result = t.get_extent(addr, &ext); + if (result == Transaction::get_extent_ret::PRESENT) { + DEBUGT("retire {}~{} on t -- {}", t, addr, length, *ext); + t.add_to_retired_set(CachedExtentRef(&*ext)); + return retire_extent_iertr::now(); + } else if (result == Transaction::get_extent_ret::RETIRED) { + ERRORT("retire {}~{} failed, already retired -- {}", t, addr, length, *ext); + ceph_abort(); + } + + // any relative addr must have been on the transaction + assert(!addr.is_relative()); + + // absent from transaction + // retiring is not included by the cache hit metrics + ext = query_cache(addr, nullptr); + if (ext) { + DEBUGT("retire {}~{} in cache -- {}", t, addr, length, *ext); + } else { + // add a new placeholder to Cache + ext = CachedExtent::make_cached_extent_ref< + RetiredExtentPlaceholder>(length); + ext->init(CachedExtent::extent_state_t::CLEAN, + addr, + PLACEMENT_HINT_NULL, + NULL_GENERATION, + TRANS_ID_NULL); + DEBUGT("retire {}~{} as placeholder, add extent -- {}", + t, addr, length, *ext); + const auto t_src = t.get_src(); + add_extent(ext, &t_src); + } + t.add_to_read_set(ext); + t.add_to_retired_set(ext); + return retire_extent_iertr::now(); +} + +void Cache::dump_contents() +{ + LOG_PREFIX(Cache::dump_contents); + DEBUG("enter"); + for (auto &&i: extents) { + DEBUG("live {}", i); + } + DEBUG("exit"); +} + +void Cache::register_metrics() +{ + LOG_PREFIX(Cache::register_metrics); + DEBUG(""); + + stats = {}; + + namespace sm = seastar::metrics; + using src_t = Transaction::src_t; + + std::map<src_t, sm::label_instance> labels_by_src { + {src_t::MUTATE, sm::label_instance("src", "MUTATE")}, + {src_t::READ, sm::label_instance("src", "READ")}, + {src_t::TRIM_DIRTY, sm::label_instance("src", "TRIM_DIRTY")}, + {src_t::TRIM_ALLOC, sm::label_instance("src", "TRIM_ALLOC")}, + {src_t::CLEANER_MAIN, sm::label_instance("src", "CLEANER_MAIN")}, + {src_t::CLEANER_COLD, sm::label_instance("src", "CLEANER_COLD")}, + }; + assert(labels_by_src.size() == (std::size_t)src_t::MAX); + + std::map<extent_types_t, sm::label_instance> labels_by_ext { + {extent_types_t::ROOT, sm::label_instance("ext", "ROOT")}, + {extent_types_t::LADDR_INTERNAL, sm::label_instance("ext", "LADDR_INTERNAL")}, + {extent_types_t::LADDR_LEAF, sm::label_instance("ext", "LADDR_LEAF")}, + {extent_types_t::DINK_LADDR_LEAF, sm::label_instance("ext", "DINK_LADDR_LEAF")}, + {extent_types_t::OMAP_INNER, sm::label_instance("ext", "OMAP_INNER")}, + {extent_types_t::OMAP_LEAF, sm::label_instance("ext", "OMAP_LEAF")}, + 
{extent_types_t::ONODE_BLOCK_STAGED, sm::label_instance("ext", "ONODE_BLOCK_STAGED")}, + {extent_types_t::COLL_BLOCK, sm::label_instance("ext", "COLL_BLOCK")}, + {extent_types_t::OBJECT_DATA_BLOCK, sm::label_instance("ext", "OBJECT_DATA_BLOCK")}, + {extent_types_t::RETIRED_PLACEHOLDER, sm::label_instance("ext", "RETIRED_PLACEHOLDER")}, + {extent_types_t::ALLOC_INFO, sm::label_instance("ext", "ALLOC_INFO")}, + {extent_types_t::JOURNAL_TAIL, sm::label_instance("ext", "JOURNAL_TAIL")}, + {extent_types_t::TEST_BLOCK, sm::label_instance("ext", "TEST_BLOCK")}, + {extent_types_t::TEST_BLOCK_PHYSICAL, sm::label_instance("ext", "TEST_BLOCK_PHYSICAL")}, + {extent_types_t::BACKREF_INTERNAL, sm::label_instance("ext", "BACKREF_INTERNAL")}, + {extent_types_t::BACKREF_LEAF, sm::label_instance("ext", "BACKREF_LEAF")} + }; + assert(labels_by_ext.size() == (std::size_t)extent_types_t::NONE); + + /* + * trans_created + */ + for (auto& [src, src_label] : labels_by_src) { + metrics.add_group( + "cache", + { + sm::make_counter( + "trans_created", + get_by_src(stats.trans_created_by_src, src), + sm::description("total number of transaction created"), + {src_label} + ), + } + ); + } + + /* + * cache_query: cache_access and cache_hit + */ + for (auto& [src, src_label] : labels_by_src) { + metrics.add_group( + "cache", + { + sm::make_counter( + "cache_access", + get_by_src(stats.cache_query_by_src, src).access, + sm::description("total number of cache accesses"), + {src_label} + ), + sm::make_counter( + "cache_hit", + get_by_src(stats.cache_query_by_src, src).hit, + sm::description("total number of cache hits"), + {src_label} + ), + } + ); + } + + { + /* + * efforts discarded/committed + */ + auto effort_label = sm::label("effort"); + + // invalidated efforts + using namespace std::literals::string_view_literals; + const string_view invalidated_effort_names[] = { + "READ"sv, + "MUTATE"sv, + "RETIRE"sv, + "FRESH"sv, + "FRESH_OOL_WRITTEN"sv, + }; + for (auto& [src, src_label] : labels_by_src) { + auto& efforts = get_by_src(stats.invalidated_efforts_by_src, src); + for (auto& [ext, ext_label] : labels_by_ext) { + auto& counter = get_by_ext(efforts.num_trans_invalidated, ext); + metrics.add_group( + "cache", + { + sm::make_counter( + "trans_invalidated_by_extent", + counter, + sm::description("total number of transactions invalidated by extents"), + {src_label, ext_label} + ), + } + ); + } + + if (src == src_t::READ) { + // read transaction won't have non-read efforts + auto read_effort_label = effort_label("READ"); + metrics.add_group( + "cache", + { + sm::make_counter( + "invalidated_extents", + efforts.read.num, + sm::description("extents of invalidated transactions"), + {src_label, read_effort_label} + ), + sm::make_counter( + "invalidated_extent_bytes", + efforts.read.bytes, + sm::description("extent bytes of invalidated transactions"), + {src_label, read_effort_label} + ), + } + ); + continue; + } + + // non READ invalidated efforts + for (auto& effort_name : invalidated_effort_names) { + auto& effort = [&effort_name, &efforts]() -> io_stat_t& { + if (effort_name == "READ") { + return efforts.read; + } else if (effort_name == "MUTATE") { + return efforts.mutate; + } else if (effort_name == "RETIRE") { + return efforts.retire; + } else if (effort_name == "FRESH") { + return efforts.fresh; + } else { + assert(effort_name == "FRESH_OOL_WRITTEN"); + return efforts.fresh_ool_written; + } + }(); + metrics.add_group( + "cache", + { + sm::make_counter( + "invalidated_extents", + effort.num, + sm::description("extents of 
invalidated transactions"), + {src_label, effort_label(effort_name)} + ), + sm::make_counter( + "invalidated_extent_bytes", + effort.bytes, + sm::description("extent bytes of invalidated transactions"), + {src_label, effort_label(effort_name)} + ), + } + ); + } // effort_name + + metrics.add_group( + "cache", + { + sm::make_counter( + "trans_invalidated", + efforts.total_trans_invalidated, + sm::description("total number of transactions invalidated"), + {src_label} + ), + sm::make_counter( + "invalidated_delta_bytes", + efforts.mutate_delta_bytes, + sm::description("delta bytes of invalidated transactions"), + {src_label} + ), + sm::make_counter( + "invalidated_ool_records", + efforts.num_ool_records, + sm::description("number of ool-records from invalidated transactions"), + {src_label} + ), + sm::make_counter( + "invalidated_ool_record_bytes", + efforts.ool_record_bytes, + sm::description("bytes of ool-record from invalidated transactions"), + {src_label} + ), + } + ); + } // src + + // committed efforts + const string_view committed_effort_names[] = { + "READ"sv, + "MUTATE"sv, + "RETIRE"sv, + "FRESH_INVALID"sv, + "FRESH_INLINE"sv, + "FRESH_OOL"sv, + }; + for (auto& [src, src_label] : labels_by_src) { + if (src == src_t::READ) { + // READ transaction won't commit + continue; + } + auto& efforts = get_by_src(stats.committed_efforts_by_src, src); + metrics.add_group( + "cache", + { + sm::make_counter( + "trans_committed", + efforts.num_trans, + sm::description("total number of transaction committed"), + {src_label} + ), + sm::make_counter( + "committed_ool_records", + efforts.num_ool_records, + sm::description("number of ool-records from committed transactions"), + {src_label} + ), + sm::make_counter( + "committed_ool_record_metadata_bytes", + efforts.ool_record_metadata_bytes, + sm::description("bytes of ool-record metadata from committed transactions"), + {src_label} + ), + sm::make_counter( + "committed_ool_record_data_bytes", + efforts.ool_record_data_bytes, + sm::description("bytes of ool-record data from committed transactions"), + {src_label} + ), + sm::make_counter( + "committed_inline_record_metadata_bytes", + efforts.inline_record_metadata_bytes, + sm::description("bytes of inline-record metadata from committed transactions" + "(excludes delta buffer)"), + {src_label} + ), + } + ); + for (auto& effort_name : committed_effort_names) { + auto& effort_by_ext = [&efforts, &effort_name]() + -> counter_by_extent_t<io_stat_t>& { + if (effort_name == "READ") { + return efforts.read_by_ext; + } else if (effort_name == "MUTATE") { + return efforts.mutate_by_ext; + } else if (effort_name == "RETIRE") { + return efforts.retire_by_ext; + } else if (effort_name == "FRESH_INVALID") { + return efforts.fresh_invalid_by_ext; + } else if (effort_name == "FRESH_INLINE") { + return efforts.fresh_inline_by_ext; + } else { + assert(effort_name == "FRESH_OOL"); + return efforts.fresh_ool_by_ext; + } + }(); + for (auto& [ext, ext_label] : labels_by_ext) { + auto& effort = get_by_ext(effort_by_ext, ext); + metrics.add_group( + "cache", + { + sm::make_counter( + "committed_extents", + effort.num, + sm::description("extents of committed transactions"), + {src_label, effort_label(effort_name), ext_label} + ), + sm::make_counter( + "committed_extent_bytes", + effort.bytes, + sm::description("extent bytes of committed transactions"), + {src_label, effort_label(effort_name), ext_label} + ), + } + ); + } // ext + } // effort_name + + auto& delta_by_ext = efforts.delta_bytes_by_ext; + for (auto& [ext, ext_label] 
: labels_by_ext) { + auto& value = get_by_ext(delta_by_ext, ext); + metrics.add_group( + "cache", + { + sm::make_counter( + "committed_delta_bytes", + value, + sm::description("delta bytes of committed transactions"), + {src_label, ext_label} + ), + } + ); + } // ext + } // src + + // successful read efforts + metrics.add_group( + "cache", + { + sm::make_counter( + "trans_read_successful", + stats.success_read_efforts.num_trans, + sm::description("total number of successful read transactions") + ), + sm::make_counter( + "successful_read_extents", + stats.success_read_efforts.read.num, + sm::description("extents of successful read transactions") + ), + sm::make_counter( + "successful_read_extent_bytes", + stats.success_read_efforts.read.bytes, + sm::description("extent bytes of successful read transactions") + ), + } + ); + } + + /** + * Cached extents (including placeholders) + * + * Dirty extents + */ + metrics.add_group( + "cache", + { + sm::make_counter( + "cached_extents", + [this] { + return extents.size(); + }, + sm::description("total number of cached extents") + ), + sm::make_counter( + "cached_extent_bytes", + [this] { + return extents.get_bytes(); + }, + sm::description("total bytes of cached extents") + ), + sm::make_counter( + "dirty_extents", + [this] { + return dirty.size(); + }, + sm::description("total number of dirty extents") + ), + sm::make_counter( + "dirty_extent_bytes", + stats.dirty_bytes, + sm::description("total bytes of dirty extents") + ), + sm::make_counter( + "cache_lru_size_bytes", + [this] { + return lru.get_current_contents_bytes(); + }, + sm::description("total bytes pinned by the lru") + ), + sm::make_counter( + "cache_lru_size_extents", + [this] { + return lru.get_current_contents_extents(); + }, + sm::description("total extents pinned by the lru") + ), + } + ); + + /** + * tree stats + */ + auto tree_label = sm::label("tree"); + auto onode_label = tree_label("ONODE"); + auto omap_label = tree_label("OMAP"); + auto lba_label = tree_label("LBA"); + auto backref_label = tree_label("BACKREF"); + auto register_tree_metrics = [&labels_by_src, &onode_label, &omap_label, this]( + const sm::label_instance& tree_label, + uint64_t& tree_depth, + int64_t& tree_extents_num, + counter_by_src_t<tree_efforts_t>& committed_tree_efforts, + counter_by_src_t<tree_efforts_t>& invalidated_tree_efforts) { + metrics.add_group( + "cache", + { + sm::make_counter( + "tree_depth", + tree_depth, + sm::description("the depth of tree"), + {tree_label} + ), + sm::make_counter( + "tree_extents_num", + tree_extents_num, + sm::description("num of extents of the tree"), + {tree_label} + ) + } + ); + for (auto& [src, src_label] : labels_by_src) { + if (src == src_t::READ) { + // READ transaction won't contain any tree inserts and erases + continue; + } + if (is_background_transaction(src) && + (tree_label == onode_label || + tree_label == omap_label)) { + // CLEANER transaction won't contain any onode/omap tree operations + continue; + } + auto& committed_efforts = get_by_src(committed_tree_efforts, src); + auto& invalidated_efforts = get_by_src(invalidated_tree_efforts, src); + metrics.add_group( + "cache", + { + sm::make_counter( + "tree_inserts_committed", + committed_efforts.num_inserts, + sm::description("total number of committed insert operations"), + {tree_label, src_label} + ), + sm::make_counter( + "tree_erases_committed", + committed_efforts.num_erases, + sm::description("total number of committed erase operations"), + {tree_label, src_label} + ), + sm::make_counter( + 
"tree_updates_committed", + committed_efforts.num_updates, + sm::description("total number of committed update operations"), + {tree_label, src_label} + ), + sm::make_counter( + "tree_inserts_invalidated", + invalidated_efforts.num_inserts, + sm::description("total number of invalidated insert operations"), + {tree_label, src_label} + ), + sm::make_counter( + "tree_erases_invalidated", + invalidated_efforts.num_erases, + sm::description("total number of invalidated erase operations"), + {tree_label, src_label} + ), + sm::make_counter( + "tree_updates_invalidated", + invalidated_efforts.num_updates, + sm::description("total number of invalidated update operations"), + {tree_label, src_label} + ), + } + ); + } + }; + register_tree_metrics( + onode_label, + stats.onode_tree_depth, + stats.onode_tree_extents_num, + stats.committed_onode_tree_efforts, + stats.invalidated_onode_tree_efforts); + register_tree_metrics( + omap_label, + stats.omap_tree_depth, + stats.omap_tree_extents_num, + stats.committed_omap_tree_efforts, + stats.invalidated_omap_tree_efforts); + register_tree_metrics( + lba_label, + stats.lba_tree_depth, + stats.lba_tree_extents_num, + stats.committed_lba_tree_efforts, + stats.invalidated_lba_tree_efforts); + register_tree_metrics( + backref_label, + stats.backref_tree_depth, + stats.backref_tree_extents_num, + stats.committed_backref_tree_efforts, + stats.invalidated_backref_tree_efforts); + + /** + * conflict combinations + */ + auto srcs_label = sm::label("srcs"); + auto num_srcs = static_cast<std::size_t>(Transaction::src_t::MAX); + std::size_t srcs_index = 0; + for (uint8_t src2_int = 0; src2_int < num_srcs; ++src2_int) { + auto src2 = static_cast<Transaction::src_t>(src2_int); + for (uint8_t src1_int = src2_int; src1_int < num_srcs; ++src1_int) { + ++srcs_index; + auto src1 = static_cast<Transaction::src_t>(src1_int); + // impossible combinations + // should be consistent with checks in account_conflict() + if ((src1 == Transaction::src_t::READ && + src2 == Transaction::src_t::READ) || + (src1 == Transaction::src_t::TRIM_DIRTY && + src2 == Transaction::src_t::TRIM_DIRTY) || + (src1 == Transaction::src_t::CLEANER_MAIN && + src2 == Transaction::src_t::CLEANER_MAIN) || + (src1 == Transaction::src_t::CLEANER_COLD && + src2 == Transaction::src_t::CLEANER_COLD) || + (src1 == Transaction::src_t::TRIM_ALLOC && + src2 == Transaction::src_t::TRIM_ALLOC)) { + continue; + } + std::ostringstream oss; + oss << src1 << "," << src2; + metrics.add_group( + "cache", + { + sm::make_counter( + "trans_srcs_invalidated", + stats.trans_conflicts_by_srcs[srcs_index - 1], + sm::description("total number conflicted transactions by src pair"), + {srcs_label(oss.str())} + ), + } + ); + } + } + assert(srcs_index == NUM_SRC_COMB); + srcs_index = 0; + for (uint8_t src_int = 0; src_int < num_srcs; ++src_int) { + ++srcs_index; + auto src = static_cast<Transaction::src_t>(src_int); + std::ostringstream oss; + oss << "UNKNOWN," << src; + metrics.add_group( + "cache", + { + sm::make_counter( + "trans_srcs_invalidated", + stats.trans_conflicts_by_unknown[srcs_index - 1], + sm::description("total number conflicted transactions by src pair"), + {srcs_label(oss.str())} + ), + } + ); + } + + /** + * rewrite version + */ + metrics.add_group( + "cache", + { + sm::make_counter( + "version_count_dirty", + stats.committed_dirty_version.num, + sm::description("total number of rewrite-dirty extents") + ), + sm::make_counter( + "version_sum_dirty", + stats.committed_dirty_version.version, + sm::description("sum of the 
version from rewrite-dirty extents") + ), + sm::make_counter( + "version_count_reclaim", + stats.committed_reclaim_version.num, + sm::description("total number of rewrite-reclaim extents") + ), + sm::make_counter( + "version_sum_reclaim", + stats.committed_reclaim_version.version, + sm::description("sum of the version from rewrite-reclaim extents") + ), + } + ); +} + +void Cache::add_extent( + CachedExtentRef ref, + const Transaction::src_t* p_src=nullptr) +{ + assert(ref->is_valid()); + assert(ref->user_hint == PLACEMENT_HINT_NULL); + assert(ref->rewrite_generation == NULL_GENERATION); + extents.insert(*ref); + if (ref->is_dirty()) { + add_to_dirty(ref); + } else { + touch_extent(*ref, p_src); + } +} + +void Cache::mark_dirty(CachedExtentRef ref) +{ + if (ref->is_dirty()) { + assert(ref->primary_ref_list_hook.is_linked()); + return; + } + + lru.remove_from_lru(*ref); + ref->state = CachedExtent::extent_state_t::DIRTY; + add_to_dirty(ref); +} + +void Cache::add_to_dirty(CachedExtentRef ref) +{ + assert(ref->is_dirty()); + assert(!ref->primary_ref_list_hook.is_linked()); + ceph_assert(ref->get_modify_time() != NULL_TIME); + intrusive_ptr_add_ref(&*ref); + dirty.push_back(*ref); + stats.dirty_bytes += ref->get_length(); +} + +void Cache::remove_from_dirty(CachedExtentRef ref) +{ + if (ref->is_dirty()) { + ceph_assert(ref->primary_ref_list_hook.is_linked()); + stats.dirty_bytes -= ref->get_length(); + dirty.erase(dirty.s_iterator_to(*ref)); + intrusive_ptr_release(&*ref); + } else { + ceph_assert(!ref->primary_ref_list_hook.is_linked()); + } +} + +void Cache::remove_extent(CachedExtentRef ref) +{ + assert(ref->is_valid()); + if (ref->is_dirty()) { + remove_from_dirty(ref); + } else if (!ref->is_placeholder()) { + lru.remove_from_lru(*ref); + } + extents.erase(*ref); +} + +void Cache::commit_retire_extent( + Transaction& t, + CachedExtentRef ref) +{ + remove_extent(ref); + + ref->dirty_from_or_retired_at = JOURNAL_SEQ_NULL; + invalidate_extent(t, *ref); +} + +void Cache::commit_replace_extent( + Transaction& t, + CachedExtentRef next, + CachedExtentRef prev) +{ + assert(next->is_dirty()); + assert(next->get_paddr() == prev->get_paddr()); + assert(next->version == prev->version + 1); + extents.replace(*next, *prev); + + if (prev->get_type() == extent_types_t::ROOT) { + assert(prev->is_stable_clean() + || prev->primary_ref_list_hook.is_linked()); + if (prev->is_dirty()) { + stats.dirty_bytes -= prev->get_length(); + dirty.erase(dirty.s_iterator_to(*prev)); + intrusive_ptr_release(&*prev); + } + add_to_dirty(next); + } else if (prev->is_dirty()) { + assert(prev->get_dirty_from() == next->get_dirty_from()); + assert(prev->primary_ref_list_hook.is_linked()); + auto prev_it = dirty.iterator_to(*prev); + dirty.insert(prev_it, *next); + dirty.erase(prev_it); + intrusive_ptr_release(&*prev); + intrusive_ptr_add_ref(&*next); + } else { + lru.remove_from_lru(*prev); + add_to_dirty(next); + } + + next->on_replace_prior(t); + invalidate_extent(t, *prev); +} + +void Cache::invalidate_extent( + Transaction& t, + CachedExtent& extent) +{ + if (!extent.may_conflict()) { + assert(extent.transactions.empty()); + extent.set_invalid(t); + return; + } + + LOG_PREFIX(Cache::invalidate_extent); + bool do_conflict_log = true; + for (auto &&i: extent.transactions) { + if (!i.t->conflicted) { + if (do_conflict_log) { + SUBDEBUGT(seastore_t, "conflict begin -- {}", t, extent); + do_conflict_log = false; + } + assert(!i.t->is_weak()); + account_conflict(t.get_src(), i.t->get_src()); + mark_transaction_conflicted(*i.t, 
extent); + } + } + extent.set_invalid(t); +} + +void Cache::mark_transaction_conflicted( + Transaction& t, CachedExtent& conflicting_extent) +{ + LOG_PREFIX(Cache::mark_transaction_conflicted); + SUBTRACET(seastore_t, "", t); + assert(!t.conflicted); + t.conflicted = true; + + auto& efforts = get_by_src(stats.invalidated_efforts_by_src, + t.get_src()); + ++efforts.total_trans_invalidated; + + auto& counter = get_by_ext(efforts.num_trans_invalidated, + conflicting_extent.get_type()); + ++counter; + + io_stat_t read_stat; + for (auto &i: t.read_set) { + read_stat.increment(i.ref->get_length()); + } + efforts.read.increment_stat(read_stat); + + if (t.get_src() != Transaction::src_t::READ) { + io_stat_t retire_stat; + for (auto &i: t.retired_set) { + retire_stat.increment(i->get_length()); + } + efforts.retire.increment_stat(retire_stat); + + auto& fresh_stat = t.get_fresh_block_stats(); + efforts.fresh.increment_stat(fresh_stat); + + io_stat_t delta_stat; + for (auto &i: t.mutated_block_list) { + if (!i->is_valid()) { + continue; + } + efforts.mutate.increment(i->get_length()); + delta_stat.increment(i->get_delta().length()); + } + efforts.mutate_delta_bytes += delta_stat.bytes; + + for (auto &i: t.pre_alloc_list) { + epm.mark_space_free(i->get_paddr(), i->get_length()); + } + + auto& ool_stats = t.get_ool_write_stats(); + efforts.fresh_ool_written.increment_stat(ool_stats.extents); + efforts.num_ool_records += ool_stats.num_records; + auto ool_record_bytes = (ool_stats.md_bytes + ool_stats.get_data_bytes()); + efforts.ool_record_bytes += ool_record_bytes; + + if (is_background_transaction(t.get_src())) { + // CLEANER transaction won't contain any onode/omap tree operations + assert(t.onode_tree_stats.is_clear()); + assert(t.omap_tree_stats.is_clear()); + } else { + get_by_src(stats.invalidated_onode_tree_efforts, t.get_src() + ).increment(t.onode_tree_stats); + get_by_src(stats.invalidated_omap_tree_efforts, t.get_src() + ).increment(t.omap_tree_stats); + } + + get_by_src(stats.invalidated_lba_tree_efforts, t.get_src() + ).increment(t.lba_tree_stats); + get_by_src(stats.invalidated_backref_tree_efforts, t.get_src() + ).increment(t.backref_tree_stats); + + SUBDEBUGT(seastore_t, + "discard {} read, {} fresh, {} delta, {} retire, {}({}B) ool-records", + t, + read_stat, + fresh_stat, + delta_stat, + retire_stat, + ool_stats.num_records, + ool_record_bytes); + } else { + // read transaction won't have non-read efforts + assert(t.retired_set.empty()); + assert(t.get_fresh_block_stats().is_clear()); + assert(t.mutated_block_list.empty()); + assert(t.get_ool_write_stats().is_clear()); + assert(t.onode_tree_stats.is_clear()); + assert(t.omap_tree_stats.is_clear()); + assert(t.lba_tree_stats.is_clear()); + assert(t.backref_tree_stats.is_clear()); + SUBDEBUGT(seastore_t, "discard {} read", t, read_stat); + } +} + +void Cache::on_transaction_destruct(Transaction& t) +{ + LOG_PREFIX(Cache::on_transaction_destruct); + SUBTRACET(seastore_t, "", t); + if (t.get_src() == Transaction::src_t::READ && + t.conflicted == false) { + io_stat_t read_stat; + for (auto &i: t.read_set) { + read_stat.increment(i.ref->get_length()); + } + SUBDEBUGT(seastore_t, "done {} read", t, read_stat); + + if (!t.is_weak()) { + // exclude weak transaction as it is impossible to conflict + ++stats.success_read_efforts.num_trans; + stats.success_read_efforts.read.increment_stat(read_stat); + } + + // read transaction won't have non-read efforts + assert(t.retired_set.empty()); + assert(t.get_fresh_block_stats().is_clear()); + 
assert(t.mutated_block_list.empty()); + assert(t.onode_tree_stats.is_clear()); + assert(t.omap_tree_stats.is_clear()); + assert(t.lba_tree_stats.is_clear()); + assert(t.backref_tree_stats.is_clear()); + } +} + +CachedExtentRef Cache::alloc_new_extent_by_type( + Transaction &t, ///< [in, out] current transaction + extent_types_t type, ///< [in] type tag + extent_len_t length, ///< [in] length + placement_hint_t hint, ///< [in] user hint + rewrite_gen_t gen ///< [in] rewrite generation +) +{ + LOG_PREFIX(Cache::alloc_new_extent_by_type); + SUBDEBUGT(seastore_cache, "allocate {} {}B, hint={}, gen={}", + t, type, length, hint, rewrite_gen_printer_t{gen}); + switch (type) { + case extent_types_t::ROOT: + ceph_assert(0 == "ROOT is never directly alloc'd"); + return CachedExtentRef(); + case extent_types_t::LADDR_INTERNAL: + return alloc_new_extent<lba_manager::btree::LBAInternalNode>(t, length, hint, gen); + case extent_types_t::LADDR_LEAF: + return alloc_new_extent<lba_manager::btree::LBALeafNode>( + t, length, hint, gen); + case extent_types_t::ONODE_BLOCK_STAGED: + return alloc_new_extent<onode::SeastoreNodeExtent>(t, length, hint, gen); + case extent_types_t::OMAP_INNER: + return alloc_new_extent<omap_manager::OMapInnerNode>(t, length, hint, gen); + case extent_types_t::OMAP_LEAF: + return alloc_new_extent<omap_manager::OMapLeafNode>(t, length, hint, gen); + case extent_types_t::COLL_BLOCK: + return alloc_new_extent<collection_manager::CollectionNode>(t, length, hint, gen); + case extent_types_t::OBJECT_DATA_BLOCK: + return alloc_new_extent<ObjectDataBlock>(t, length, hint, gen); + case extent_types_t::RETIRED_PLACEHOLDER: + ceph_assert(0 == "impossible"); + return CachedExtentRef(); + case extent_types_t::TEST_BLOCK: + return alloc_new_extent<TestBlock>(t, length, hint, gen); + case extent_types_t::TEST_BLOCK_PHYSICAL: + return alloc_new_extent<TestBlockPhysical>(t, length, hint, gen); + case extent_types_t::NONE: { + ceph_assert(0 == "NONE is an invalid extent type"); + return CachedExtentRef(); + } + default: + ceph_assert(0 == "impossible"); + return CachedExtentRef(); + } +} + +CachedExtentRef Cache::duplicate_for_write( + Transaction &t, + CachedExtentRef i) { + LOG_PREFIX(Cache::duplicate_for_write); + assert(i->is_fully_loaded()); + + if (i->is_mutable()) + return i; + + if (i->is_exist_clean()) { + i->version++; + i->state = CachedExtent::extent_state_t::EXIST_MUTATION_PENDING; + i->last_committed_crc = i->get_crc32c(); + // deepcopy the buffer of exist clean extent beacuse it shares + // buffer with original clean extent. 
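+ // descriptive note: copying into a fresh bufferptr gives this extent its own backing memory before it is mutated in place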
+ auto bp = i->get_bptr(); + auto nbp = ceph::bufferptr(bp.c_str(), bp.length()); + i->set_bptr(std::move(nbp)); + + t.add_mutated_extent(i); + DEBUGT("duplicate existing extent {}", t, *i); + return i; + } + + auto ret = i->duplicate_for_write(t); + ret->pending_for_transaction = t.get_trans_id(); + ret->prior_instance = i; + // duplicate_for_write won't occur after ool write finished + assert(!i->prior_poffset); + auto [iter, inserted] = i->mutation_pendings.insert(*ret); + ceph_assert(inserted); + t.add_mutated_extent(ret); + if (ret->get_type() == extent_types_t::ROOT) { + t.root = ret->cast<RootBlock>(); + } else { + ret->last_committed_crc = i->last_committed_crc; + } + + ret->version++; + ret->state = CachedExtent::extent_state_t::MUTATION_PENDING; + DEBUGT("{} -> {}", t, *i, *ret); + return ret; +} + +record_t Cache::prepare_record( + Transaction &t, + const journal_seq_t &journal_head, + const journal_seq_t &journal_dirty_tail) +{ + LOG_PREFIX(Cache::prepare_record); + SUBTRACET(seastore_t, "enter", t); + + auto trans_src = t.get_src(); + assert(!t.is_weak()); + assert(trans_src != Transaction::src_t::READ); + + auto& efforts = get_by_src(stats.committed_efforts_by_src, + trans_src); + + // Should be valid due to interruptible future + io_stat_t read_stat; + for (auto &i: t.read_set) { + if (!i.ref->is_valid()) { + SUBERRORT(seastore_t, + "read_set got invalid extent, aborting -- {}", t, *i.ref); + ceph_abort("no invalid extent allowed in transactions' read_set"); + } + get_by_ext(efforts.read_by_ext, + i.ref->get_type()).increment(i.ref->get_length()); + read_stat.increment(i.ref->get_length()); + } + t.read_set.clear(); + t.write_set.clear(); + + record_t record(trans_src); + auto commit_time = seastar::lowres_system_clock::now(); + + // Add new copy of mutated blocks, set_io_wait to block until written + record.deltas.reserve(t.mutated_block_list.size()); + io_stat_t delta_stat; + for (auto &i: t.mutated_block_list) { + if (!i->is_valid()) { + DEBUGT("invalid mutated extent -- {}", t, *i); + continue; + } + assert(i->is_exist_mutation_pending() || + i->prior_instance); + get_by_ext(efforts.mutate_by_ext, + i->get_type()).increment(i->get_length()); + + auto delta_bl = i->get_delta(); + auto delta_length = delta_bl.length(); + i->set_modify_time(commit_time); + DEBUGT("mutated extent with {}B delta -- {}", + t, delta_length, *i); + if (!i->is_exist_mutation_pending()) { + DEBUGT("commit replace extent ... -- {}, prior={}", + t, *i, *i->prior_instance); + // extent with EXIST_MUTATION_PENDING doesn't have + // prior_instance field so skip these extents. + // the existing extents should be added into Cache + // during complete_commit to sync with gc transaction. 
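+ // descriptive note: for mutation-pending extents the new copy replaces its prior instance in the extent index and dirty list before the delta is journaled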
+ commit_replace_extent(t, i, i->prior_instance); + } + + i->prepare_write(); + i->set_io_wait(); + i->prepare_commit(); + + assert(i->get_version() > 0); + auto final_crc = i->get_crc32c(); + if (i->get_type() == extent_types_t::ROOT) { + SUBTRACET(seastore_t, "writing out root delta {}B -- {}", + t, delta_length, *i); + assert(t.root == i); + root = t.root; + record.push_back( + delta_info_t{ + extent_types_t::ROOT, + P_ADDR_NULL, + L_ADDR_NULL, + 0, + 0, + 0, + t.root->get_version() - 1, + MAX_SEG_SEQ, + segment_type_t::NULL_SEG, + std::move(delta_bl) + }); + } else { + auto sseq = NULL_SEG_SEQ; + auto stype = segment_type_t::NULL_SEG; + + // FIXME: This is specific to the segmented implementation + if (i->get_paddr().get_addr_type() == paddr_types_t::SEGMENT) { + auto sid = i->get_paddr().as_seg_paddr().get_segment_id(); + auto sinfo = get_segment_info(sid); + if (sinfo) { + sseq = sinfo->seq; + stype = sinfo->type; + } + } + + record.push_back( + delta_info_t{ + i->get_type(), + i->get_paddr(), + (i->is_logical() + ? i->cast<LogicalCachedExtent>()->get_laddr() + : L_ADDR_NULL), + i->last_committed_crc, + final_crc, + i->get_length(), + i->get_version() - 1, + sseq, + stype, + std::move(delta_bl) + }); + i->last_committed_crc = final_crc; + } + assert(delta_length); + get_by_ext(efforts.delta_bytes_by_ext, + i->get_type()) += delta_length; + delta_stat.increment(delta_length); + } + + // Transaction is now a go, set up in-memory cache state + // invalidate now invalid blocks + io_stat_t retire_stat; + std::vector<alloc_delta_t> alloc_deltas; + alloc_delta_t rel_delta; + rel_delta.op = alloc_delta_t::op_types_t::CLEAR; + for (auto &i: t.retired_set) { + get_by_ext(efforts.retire_by_ext, + i->get_type()).increment(i->get_length()); + retire_stat.increment(i->get_length()); + DEBUGT("retired and remove extent -- {}", t, *i); + commit_retire_extent(t, i); + if (is_backref_mapped_extent_node(i) + || is_retired_placeholder(i->get_type())) { + rel_delta.alloc_blk_ranges.emplace_back( + i->get_paddr(), + L_ADDR_NULL, + i->get_length(), + i->get_type()); + } + } + alloc_deltas.emplace_back(std::move(rel_delta)); + + record.extents.reserve(t.inline_block_list.size()); + io_stat_t fresh_stat; + io_stat_t fresh_invalid_stat; + alloc_delta_t alloc_delta; + alloc_delta.op = alloc_delta_t::op_types_t::SET; + for (auto &i: t.inline_block_list) { + if (!i->is_valid()) { + DEBUGT("invalid fresh inline extent -- {}", t, *i); + fresh_invalid_stat.increment(i->get_length()); + get_by_ext(efforts.fresh_invalid_by_ext, + i->get_type()).increment(i->get_length()); + } else { + TRACET("fresh inline extent -- {}", t, *i); + } + fresh_stat.increment(i->get_length()); + get_by_ext(efforts.fresh_inline_by_ext, + i->get_type()).increment(i->get_length()); + assert(i->is_inline() || i->get_paddr().is_fake()); + + bufferlist bl; + i->prepare_write(); + i->prepare_commit(); + bl.append(i->get_bptr()); + if (i->get_type() == extent_types_t::ROOT) { + ceph_assert(0 == "ROOT never gets written as a fresh block"); + } + + assert(bl.length() == i->get_length()); + auto modify_time = i->get_modify_time(); + if (modify_time == NULL_TIME) { + modify_time = commit_time; + } + record.push_back(extent_t{ + i->get_type(), + i->is_logical() + ? i->cast<LogicalCachedExtent>()->get_laddr() + : (is_lba_node(i->get_type()) + ? 
i->cast<lba_manager::btree::LBANode>()->get_node_meta().begin + : L_ADDR_NULL), + std::move(bl) + }, + modify_time); + if (i->is_valid() + && is_backref_mapped_extent_node(i)) { + alloc_delta.alloc_blk_ranges.emplace_back( + i->get_paddr(), + i->is_logical() + ? i->cast<LogicalCachedExtent>()->get_laddr() + : (is_lba_node(i->get_type()) + ? i->cast<lba_manager::btree::LBANode>()->get_node_meta().begin + : L_ADDR_NULL), + i->get_length(), + i->get_type()); + } + } + + for (auto &i: t.written_ool_block_list) { + TRACET("fresh ool extent -- {}", t, *i); + ceph_assert(i->is_valid()); + assert(!i->is_inline()); + get_by_ext(efforts.fresh_ool_by_ext, + i->get_type()).increment(i->get_length()); + i->prepare_commit(); + if (is_backref_mapped_extent_node(i)) { + alloc_delta.alloc_blk_ranges.emplace_back( + i->get_paddr(), + i->is_logical() + ? i->cast<LogicalCachedExtent>()->get_laddr() + : i->cast<lba_manager::btree::LBANode>()->get_node_meta().begin, + i->get_length(), + i->get_type()); + } + } + + for (auto &i: t.existing_block_list) { + if (i->is_valid()) { + alloc_delta.alloc_blk_ranges.emplace_back( + i->get_paddr(), + i->cast<LogicalCachedExtent>()->get_laddr(), + i->get_length(), + i->get_type()); + } + } + alloc_deltas.emplace_back(std::move(alloc_delta)); + + for (auto b : alloc_deltas) { + bufferlist bl; + encode(b, bl); + delta_info_t delta; + delta.type = extent_types_t::ALLOC_INFO; + delta.bl = bl; + record.push_back(std::move(delta)); + } + + if (is_background_transaction(trans_src)) { + assert(journal_head != JOURNAL_SEQ_NULL); + assert(journal_dirty_tail != JOURNAL_SEQ_NULL); + journal_seq_t dirty_tail; + auto maybe_dirty_tail = get_oldest_dirty_from(); + if (!maybe_dirty_tail.has_value()) { + dirty_tail = journal_head; + SUBINFOT(seastore_t, "dirty_tail all trimmed, set to head {}, src={}", + t, dirty_tail, trans_src); + } else if (*maybe_dirty_tail == JOURNAL_SEQ_NULL) { + dirty_tail = journal_dirty_tail; + SUBINFOT(seastore_t, "dirty_tail is pending, set to {}, src={}", + t, dirty_tail, trans_src); + } else { + dirty_tail = *maybe_dirty_tail; + } + ceph_assert(dirty_tail != JOURNAL_SEQ_NULL); + journal_seq_t alloc_tail; + auto maybe_alloc_tail = get_oldest_backref_dirty_from(); + if (!maybe_alloc_tail.has_value()) { + // FIXME: the replay point of the allocations requires to be accurate. + // Setting the alloc_tail to get_journal_head() cannot skip replaying the + // last unnecessary record. 
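+ // descriptive note: no backref entries are pending here, so the alloc tail is advanced to the current journal head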
+ alloc_tail = journal_head; + SUBINFOT(seastore_t, "alloc_tail all trimmed, set to head {}, src={}", + t, alloc_tail, trans_src); + } else if (*maybe_alloc_tail == JOURNAL_SEQ_NULL) { + ceph_abort("impossible"); + } else { + alloc_tail = *maybe_alloc_tail; + } + ceph_assert(alloc_tail != JOURNAL_SEQ_NULL); + auto tails = journal_tail_delta_t{alloc_tail, dirty_tail}; + SUBDEBUGT(seastore_t, "update tails as delta {}", t, tails); + bufferlist bl; + encode(tails, bl); + delta_info_t delta; + delta.type = extent_types_t::JOURNAL_TAIL; + delta.bl = bl; + record.push_back(std::move(delta)); + } + + ceph_assert(t.get_fresh_block_stats().num == + t.inline_block_list.size() + + t.written_ool_block_list.size() + + t.num_delayed_invalid_extents + + t.num_allocated_invalid_extents); + + auto& ool_stats = t.get_ool_write_stats(); + ceph_assert(ool_stats.extents.num == t.written_ool_block_list.size()); + + if (record.is_empty()) { + SUBINFOT(seastore_t, + "record to submit is empty, src={}", t, trans_src); + assert(t.onode_tree_stats.is_clear()); + assert(t.omap_tree_stats.is_clear()); + assert(t.lba_tree_stats.is_clear()); + assert(t.backref_tree_stats.is_clear()); + assert(ool_stats.is_clear()); + } + + if (record.modify_time == NULL_TIME) { + record.modify_time = commit_time; + } + + SUBDEBUGT(seastore_t, + "commit H{} dirty_from={}, alloc_from={}, " + "{} read, {} fresh with {} invalid, " + "{} delta, {} retire, {}(md={}B, data={}B) ool-records, " + "{}B md, {}B data, modify_time={}", + t, (void*)&t.get_handle(), + get_oldest_dirty_from().value_or(JOURNAL_SEQ_NULL), + get_oldest_backref_dirty_from().value_or(JOURNAL_SEQ_NULL), + read_stat, + fresh_stat, + fresh_invalid_stat, + delta_stat, + retire_stat, + ool_stats.num_records, + ool_stats.md_bytes, + ool_stats.get_data_bytes(), + record.size.get_raw_mdlength(), + record.size.dlength, + sea_time_point_printer_t{record.modify_time}); + if (is_background_transaction(trans_src)) { + // background transaction won't contain any onode tree operations + assert(t.onode_tree_stats.is_clear()); + assert(t.omap_tree_stats.is_clear()); + } else { + if (t.onode_tree_stats.depth) { + stats.onode_tree_depth = t.onode_tree_stats.depth; + } + if (t.omap_tree_stats.depth) { + stats.omap_tree_depth = t.omap_tree_stats.depth; + } + stats.onode_tree_extents_num += t.onode_tree_stats.extents_num_delta; + ceph_assert(stats.onode_tree_extents_num >= 0); + get_by_src(stats.committed_onode_tree_efforts, trans_src + ).increment(t.onode_tree_stats); + stats.omap_tree_extents_num += t.omap_tree_stats.extents_num_delta; + ceph_assert(stats.omap_tree_extents_num >= 0); + get_by_src(stats.committed_omap_tree_efforts, trans_src + ).increment(t.omap_tree_stats); + } + + if (t.lba_tree_stats.depth) { + stats.lba_tree_depth = t.lba_tree_stats.depth; + } + stats.lba_tree_extents_num += t.lba_tree_stats.extents_num_delta; + ceph_assert(stats.lba_tree_extents_num >= 0); + get_by_src(stats.committed_lba_tree_efforts, trans_src + ).increment(t.lba_tree_stats); + if (t.backref_tree_stats.depth) { + stats.backref_tree_depth = t.backref_tree_stats.depth; + } + stats.backref_tree_extents_num += t.backref_tree_stats.extents_num_delta; + ceph_assert(stats.backref_tree_extents_num >= 0); + get_by_src(stats.committed_backref_tree_efforts, trans_src + ).increment(t.backref_tree_stats); + + ++(efforts.num_trans); + efforts.num_ool_records += ool_stats.num_records; + efforts.ool_record_metadata_bytes += ool_stats.md_bytes; + efforts.ool_record_data_bytes += ool_stats.get_data_bytes(); + 
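+ // descriptive note: inline record metadata is accounted without the delta payload, which is tracked per extent type above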
efforts.inline_record_metadata_bytes += + (record.size.get_raw_mdlength() - record.get_delta_size()); + + auto &rewrite_version_stats = t.get_rewrite_version_stats(); + if (trans_src == Transaction::src_t::TRIM_DIRTY) { + stats.committed_dirty_version.increment_stat(rewrite_version_stats); + } else if (trans_src == Transaction::src_t::CLEANER_MAIN || + trans_src == Transaction::src_t::CLEANER_COLD) { + stats.committed_reclaim_version.increment_stat(rewrite_version_stats); + } else { + assert(rewrite_version_stats.is_clear()); + } + + return record; +} + +void Cache::backref_batch_update( + std::vector<backref_entry_ref> &&list, + const journal_seq_t &seq) +{ + LOG_PREFIX(Cache::backref_batch_update); + DEBUG("inserting {} entries at {}", list.size(), seq); + ceph_assert(seq != JOURNAL_SEQ_NULL); + + for (auto &ent : list) { + backref_entry_mset.insert(*ent); + } + + auto iter = backref_entryrefs_by_seq.find(seq); + if (iter == backref_entryrefs_by_seq.end()) { + backref_entryrefs_by_seq.emplace(seq, std::move(list)); + } else { + iter->second.insert( + iter->second.end(), + std::make_move_iterator(list.begin()), + std::make_move_iterator(list.end())); + } +} + +void Cache::complete_commit( + Transaction &t, + paddr_t final_block_start, + journal_seq_t start_seq) +{ + LOG_PREFIX(Cache::complete_commit); + SUBTRACET(seastore_t, "final_block_start={}, start_seq={}", + t, final_block_start, start_seq); + + std::vector<backref_entry_ref> backref_list; + t.for_each_fresh_block([&](const CachedExtentRef &i) { + if (!i->is_valid()) { + return; + } + + bool is_inline = false; + if (i->is_inline()) { + is_inline = true; + i->set_paddr(final_block_start.add_relative(i->get_paddr())); + } + i->last_committed_crc = i->get_crc32c(); + i->pending_for_transaction = TRANS_ID_NULL; + i->on_initial_write(); + + i->state = CachedExtent::extent_state_t::CLEAN; + DEBUGT("add extent as fresh, inline={} -- {}", + t, is_inline, *i); + const auto t_src = t.get_src(); + i->invalidate_hints(); + add_extent(i, &t_src); + epm.commit_space_used(i->get_paddr(), i->get_length()); + if (is_backref_mapped_extent_node(i)) { + DEBUGT("backref_list new {} len {}", + t, + i->get_paddr(), + i->get_length()); + backref_list.emplace_back( + std::make_unique<backref_entry_t>( + i->get_paddr(), + i->is_logical() + ? i->cast<LogicalCachedExtent>()->get_laddr() + : (is_lba_node(i->get_type()) + ? 
i->cast<lba_manager::btree::LBANode>()->get_node_meta().begin + : L_ADDR_NULL), + i->get_length(), + i->get_type(), + start_seq)); + } else if (is_backref_node(i->get_type())) { + add_backref_extent( + i->get_paddr(), + i->cast<backref::BackrefNode>()->get_node_meta().begin, + i->get_type()); + } else { + ERRORT("{}", t, *i); + ceph_abort("not possible"); + } + }); + + // Add new copy of mutated blocks, set_io_wait to block until written + for (auto &i: t.mutated_block_list) { + if (!i->is_valid()) { + continue; + } + assert(i->is_exist_mutation_pending() || + i->prior_instance); + i->on_delta_write(final_block_start); + i->pending_for_transaction = TRANS_ID_NULL; + i->prior_instance = CachedExtentRef(); + i->state = CachedExtent::extent_state_t::DIRTY; + assert(i->version > 0); + if (i->version == 1 || i->get_type() == extent_types_t::ROOT) { + i->dirty_from_or_retired_at = start_seq; + DEBUGT("commit extent done, become dirty -- {}", t, *i); + } else { + DEBUGT("commit extent done -- {}", t, *i); + } + } + + for (auto &i: t.retired_set) { + epm.mark_space_free(i->get_paddr(), i->get_length()); + } + for (auto &i: t.existing_block_list) { + if (i->is_valid()) { + epm.mark_space_used(i->get_paddr(), i->get_length()); + } + } + + for (auto &i: t.mutated_block_list) { + if (!i->is_valid()) { + continue; + } + i->complete_io(); + } + + last_commit = start_seq; + for (auto &i: t.retired_set) { + i->dirty_from_or_retired_at = start_seq; + if (is_backref_mapped_extent_node(i) + || is_retired_placeholder(i->get_type())) { + DEBUGT("backref_list free {} len {}", + t, + i->get_paddr(), + i->get_length()); + backref_list.emplace_back( + std::make_unique<backref_entry_t>( + i->get_paddr(), + L_ADDR_NULL, + i->get_length(), + i->get_type(), + start_seq)); + } else if (is_backref_node(i->get_type())) { + remove_backref_extent(i->get_paddr()); + } else { + ERRORT("{}", t, *i); + ceph_abort("not possible"); + } + } + + auto existing_stats = t.get_existing_block_stats(); + DEBUGT("total existing blocks num: {}, exist clean num: {}, " + "exist mutation pending num: {}", + t, + existing_stats.valid_num, + existing_stats.clean_num, + existing_stats.mutated_num); + for (auto &i: t.existing_block_list) { + if (i->is_valid()) { + if (i->is_exist_clean()) { + i->state = CachedExtent::extent_state_t::CLEAN; + } else { + assert(i->state == CachedExtent::extent_state_t::DIRTY); + } + DEBUGT("backref_list new existing {} len {}", + t, + i->get_paddr(), + i->get_length()); + backref_list.emplace_back( + std::make_unique<backref_entry_t>( + i->get_paddr(), + i->cast<LogicalCachedExtent>()->get_laddr(), + i->get_length(), + i->get_type(), + start_seq)); + const auto t_src = t.get_src(); + add_extent(i, &t_src); + } + } + if (!backref_list.empty()) { + backref_batch_update(std::move(backref_list), start_seq); + } + + for (auto &i: t.pre_alloc_list) { + if (!i->is_valid()) { + epm.mark_space_free(i->get_paddr(), i->get_length()); + } + } +} + +void Cache::init() +{ + LOG_PREFIX(Cache::init); + if (root) { + // initial creation will do mkfs followed by mount each of which calls init + DEBUG("remove extent -- prv_root={}", *root); + remove_extent(root); + root = nullptr; + } + root = new RootBlock(); + root->init(CachedExtent::extent_state_t::CLEAN, + P_ADDR_ROOT, + PLACEMENT_HINT_NULL, + NULL_GENERATION, + TRANS_ID_NULL); + INFO("init root -- {}", *root); + extents.insert(*root); +} + +Cache::mkfs_iertr::future<> Cache::mkfs(Transaction &t) +{ + LOG_PREFIX(Cache::mkfs); + INFOT("create root", t); + return 
get_root(t).si_then([this, &t](auto croot) { + duplicate_for_write(t, croot); + return mkfs_iertr::now(); + }).handle_error_interruptible( + mkfs_iertr::pass_further{}, + crimson::ct_error::assert_all{ + "Invalid error in Cache::mkfs" + } + ); +} + +Cache::close_ertr::future<> Cache::close() +{ + LOG_PREFIX(Cache::close); + INFO("close with {}({}B) dirty, dirty_from={}, alloc_from={}, " + "{}({}B) lru, totally {}({}B) indexed extents", + dirty.size(), + stats.dirty_bytes, + get_oldest_dirty_from().value_or(JOURNAL_SEQ_NULL), + get_oldest_backref_dirty_from().value_or(JOURNAL_SEQ_NULL), + lru.get_current_contents_extents(), + lru.get_current_contents_bytes(), + extents.size(), + extents.get_bytes()); + root.reset(); + for (auto i = dirty.begin(); i != dirty.end(); ) { + auto ptr = &*i; + stats.dirty_bytes -= ptr->get_length(); + dirty.erase(i++); + intrusive_ptr_release(ptr); + } + backref_extents.clear(); + backref_entryrefs_by_seq.clear(); + assert(stats.dirty_bytes == 0); + lru.clear(); + return close_ertr::now(); +} + +Cache::replay_delta_ret +Cache::replay_delta( + journal_seq_t journal_seq, + paddr_t record_base, + const delta_info_t &delta, + const journal_seq_t &dirty_tail, + const journal_seq_t &alloc_tail, + sea_time_point modify_time) +{ + LOG_PREFIX(Cache::replay_delta); + assert(dirty_tail != JOURNAL_SEQ_NULL); + assert(alloc_tail != JOURNAL_SEQ_NULL); + ceph_assert(modify_time != NULL_TIME); + + // FIXME: This is specific to the segmented implementation + /* The journal may validly contain deltas for extents in + * since released segments. We can detect those cases by + * checking whether the segment in question currently has a + * sequence number > the current journal segment seq. We can + * safetly skip these deltas because the extent must already + * have been rewritten. 
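+ * (The sequence and type recorded in the delta are compared against the segment's current sequence and type just below; a mismatch means the delta is obsolete and is skipped.)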
+ */ + if (delta.paddr != P_ADDR_NULL && + delta.paddr.get_addr_type() == paddr_types_t::SEGMENT) { + auto& seg_addr = delta.paddr.as_seg_paddr(); + auto seg_info = get_segment_info(seg_addr.get_segment_id()); + if (seg_info) { + auto delta_paddr_segment_seq = seg_info->seq; + auto delta_paddr_segment_type = seg_info->type; + if (delta_paddr_segment_seq != delta.ext_seq || + delta_paddr_segment_type != delta.seg_type) { + DEBUG("delta is obsolete, delta_paddr_segment_seq={}," + " delta_paddr_segment_type={} -- {}", + segment_seq_printer_t{delta_paddr_segment_seq}, + delta_paddr_segment_type, + delta); + return replay_delta_ertr::make_ready_future<bool>(false); + } + } + } + + if (delta.type == extent_types_t::JOURNAL_TAIL) { + // this delta should have been dealt with during segment cleaner mounting + return replay_delta_ertr::make_ready_future<bool>(false); + } + + // replay alloc + if (delta.type == extent_types_t::ALLOC_INFO) { + if (journal_seq < alloc_tail) { + DEBUG("journal_seq {} < alloc_tail {}, don't replay {}", + journal_seq, alloc_tail, delta); + return replay_delta_ertr::make_ready_future<bool>(false); + } + + alloc_delta_t alloc_delta; + decode(alloc_delta, delta.bl); + std::vector<backref_entry_ref> backref_list; + for (auto &alloc_blk : alloc_delta.alloc_blk_ranges) { + if (alloc_blk.paddr.is_relative()) { + assert(alloc_blk.paddr.is_record_relative()); + alloc_blk.paddr = record_base.add_relative(alloc_blk.paddr); + } + DEBUG("replay alloc_blk {}~{} {}, journal_seq: {}", + alloc_blk.paddr, alloc_blk.len, alloc_blk.laddr, journal_seq); + backref_list.emplace_back( + std::make_unique<backref_entry_t>( + alloc_blk.paddr, + alloc_blk.laddr, + alloc_blk.len, + alloc_blk.type, + journal_seq)); + } + if (!backref_list.empty()) { + backref_batch_update(std::move(backref_list), journal_seq); + } + return replay_delta_ertr::make_ready_future<bool>(true); + } + + // replay dirty + if (journal_seq < dirty_tail) { + DEBUG("journal_seq {} < dirty_tail {}, don't replay {}", + journal_seq, dirty_tail, delta); + return replay_delta_ertr::make_ready_future<bool>(false); + } + + if (delta.type == extent_types_t::ROOT) { + TRACE("replay root delta at {} {}, remove extent ... -- {}, prv_root={}", + journal_seq, record_base, delta, *root); + remove_extent(root); + root->apply_delta_and_adjust_crc(record_base, delta.bl); + root->dirty_from_or_retired_at = journal_seq; + root->state = CachedExtent::extent_state_t::DIRTY; + DEBUG("replayed root delta at {} {}, add extent -- {}, root={}", + journal_seq, record_base, delta, *root); + root->set_modify_time(modify_time); + add_extent(root); + return replay_delta_ertr::make_ready_future<bool>(true); + } else { + auto _get_extent_if_cached = [this](paddr_t addr) + -> get_extent_ertr::future<CachedExtentRef> { + // replay is not included by the cache hit metrics + auto ret = query_cache(addr, nullptr); + if (ret) { + // no retired-placeholder should be exist yet because no transaction + // has been created. + assert(ret->get_type() != extent_types_t::RETIRED_PLACEHOLDER); + return ret->wait_io().then([ret] { + return ret; + }); + } else { + return seastar::make_ready_future<CachedExtentRef>(); + } + }; + auto extent_fut = (delta.pversion == 0 ? 
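+ // descriptive note: pversion == 0 means this is the first delta against the extent, so it can be read by type (possibly from disk); later deltas are only applied if the extent is still cached, otherwise they are treated as obsolete below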
+ // replay is not included by the cache hit metrics + _get_extent_by_type( + delta.type, + delta.paddr, + delta.laddr, + delta.length, + nullptr, + [](CachedExtent &) {}, + [](CachedExtent &) {}) : + _get_extent_if_cached( + delta.paddr) + ).handle_error( + replay_delta_ertr::pass_further{}, + crimson::ct_error::assert_all{ + "Invalid error in Cache::replay_delta" + } + ); + return extent_fut.safe_then([=, this, &delta](auto extent) { + if (!extent) { + DEBUG("replay extent is not present, so delta is obsolete at {} {} -- {}", + journal_seq, record_base, delta); + assert(delta.pversion > 0); + return replay_delta_ertr::make_ready_future<bool>(true); + } + + DEBUG("replay extent delta at {} {} ... -- {}, prv_extent={}", + journal_seq, record_base, delta, *extent); + + assert(extent->last_committed_crc == delta.prev_crc); + assert(extent->version == delta.pversion); + extent->apply_delta_and_adjust_crc(record_base, delta.bl); + extent->set_modify_time(modify_time); + assert(extent->last_committed_crc == delta.final_crc); + + extent->version++; + if (extent->version == 1) { + extent->dirty_from_or_retired_at = journal_seq; + DEBUG("replayed extent delta at {} {}, become dirty -- {}, extent={}" , + journal_seq, record_base, delta, *extent); + } else { + DEBUG("replayed extent delta at {} {} -- {}, extent={}" , + journal_seq, record_base, delta, *extent); + } + mark_dirty(extent); + return replay_delta_ertr::make_ready_future<bool>(true); + }); + } +} + +Cache::get_next_dirty_extents_ret Cache::get_next_dirty_extents( + Transaction &t, + journal_seq_t seq, + size_t max_bytes) +{ + LOG_PREFIX(Cache::get_next_dirty_extents); + if (dirty.empty()) { + DEBUGT("max_bytes={}B, seq={}, dirty is empty", + t, max_bytes, seq); + } else { + DEBUGT("max_bytes={}B, seq={}, dirty_from={}", + t, max_bytes, seq, dirty.begin()->get_dirty_from()); + } + std::vector<CachedExtentRef> cand; + size_t bytes_so_far = 0; + for (auto i = dirty.begin(); + i != dirty.end() && bytes_so_far < max_bytes; + ++i) { + auto dirty_from = i->get_dirty_from(); + //dirty extents must be fully loaded + assert(i->is_fully_loaded()); + if (unlikely(dirty_from == JOURNAL_SEQ_NULL)) { + ERRORT("got dirty extent with JOURNAL_SEQ_NULL -- {}", t, *i); + ceph_abort(); + } + if (dirty_from < seq) { + TRACET("next extent -- {}", t, *i); + if (!cand.empty() && cand.back()->get_dirty_from() > dirty_from) { + ERRORT("dirty extents are not ordered by dirty_from -- last={}, next={}", + t, *cand.back(), *i); + ceph_abort(); + } + bytes_so_far += i->get_length(); + cand.push_back(&*i); + } else { + break; + } + } + return seastar::do_with( + std::move(cand), + decltype(cand)(), + [FNAME, this, &t](auto &cand, auto &ret) { + return trans_intr::do_for_each( + cand, + [FNAME, this, &t, &ret](auto &ext) { + TRACET("waiting on extent -- {}", t, *ext); + return trans_intr::make_interruptible( + ext->wait_io() + ).then_interruptible([FNAME, this, ext, &t, &ret] { + if (!ext->is_valid()) { + ++(get_by_src(stats.trans_conflicts_by_unknown, t.get_src())); + mark_transaction_conflicted(t, *ext); + return; + } + + CachedExtentRef on_transaction; + auto result = t.get_extent(ext->get_paddr(), &on_transaction); + if (result == Transaction::get_extent_ret::ABSENT) { + DEBUGT("extent is absent on t -- {}", t, *ext); + t.add_to_read_set(ext); + if (ext->get_type() == extent_types_t::ROOT) { + if (t.root) { + assert(&*t.root == &*ext); + ceph_assert(0 == "t.root would have to already be in the read set"); + } else { + assert(&*ext == &*root); + t.root = root; + } + } 
+ ret.push_back(ext); + } else if (result == Transaction::get_extent_ret::PRESENT) { + DEBUGT("extent is present on t -- {}, on t {}", t, *ext, *on_transaction); + ret.push_back(on_transaction); + } else { + assert(result == Transaction::get_extent_ret::RETIRED); + DEBUGT("extent is retired on t -- {}", t, *ext); + } + }); + }).then_interruptible([&ret] { + return std::move(ret); + }); + }); +} + +Cache::get_root_ret Cache::get_root(Transaction &t) +{ + LOG_PREFIX(Cache::get_root); + if (t.root) { + TRACET("root already on t -- {}", t, *t.root); + return t.root->wait_io().then([&t] { + return get_root_iertr::make_ready_future<RootBlockRef>( + t.root); + }); + } else { + DEBUGT("root not on t -- {}", t, *root); + t.root = root; + t.add_to_read_set(root); + return root->wait_io().then([root=root] { + return get_root_iertr::make_ready_future<RootBlockRef>( + root); + }); + } +} + +Cache::get_extent_ertr::future<CachedExtentRef> Cache::_get_extent_by_type( + extent_types_t type, + paddr_t offset, + laddr_t laddr, + extent_len_t length, + const Transaction::src_t* p_src, + extent_init_func_t &&extent_init_func, + extent_init_func_t &&on_cache) +{ + return [=, this, extent_init_func=std::move(extent_init_func)]() mutable { + src_ext_t* p_metric_key = nullptr; + src_ext_t metric_key; + if (p_src) { + metric_key = std::make_pair(*p_src, type); + p_metric_key = &metric_key; + } + + switch (type) { + case extent_types_t::ROOT: + ceph_assert(0 == "ROOT is never directly read"); + return get_extent_ertr::make_ready_future<CachedExtentRef>(); + case extent_types_t::BACKREF_INTERNAL: + return get_extent<backref::BackrefInternalNode>( + offset, length, p_metric_key, std::move(extent_init_func), std::move(on_cache) + ).safe_then([](auto extent) { + return CachedExtentRef(extent.detach(), false /* add_ref */); + }); + case extent_types_t::BACKREF_LEAF: + return get_extent<backref::BackrefLeafNode>( + offset, length, p_metric_key, std::move(extent_init_func), std::move(on_cache) + ).safe_then([](auto extent) { + return CachedExtentRef(extent.detach(), false /* add_ref */); + }); + case extent_types_t::LADDR_INTERNAL: + return get_extent<lba_manager::btree::LBAInternalNode>( + offset, length, p_metric_key, std::move(extent_init_func), std::move(on_cache) + ).safe_then([](auto extent) { + return CachedExtentRef(extent.detach(), false /* add_ref */); + }); + case extent_types_t::LADDR_LEAF: + return get_extent<lba_manager::btree::LBALeafNode>( + offset, length, p_metric_key, std::move(extent_init_func), std::move(on_cache) + ).safe_then([](auto extent) { + return CachedExtentRef(extent.detach(), false /* add_ref */); + }); + case extent_types_t::OMAP_INNER: + return get_extent<omap_manager::OMapInnerNode>( + offset, length, p_metric_key, std::move(extent_init_func), std::move(on_cache) + ).safe_then([](auto extent) { + return CachedExtentRef(extent.detach(), false /* add_ref */); + }); + case extent_types_t::OMAP_LEAF: + return get_extent<omap_manager::OMapLeafNode>( + offset, length, p_metric_key, std::move(extent_init_func), std::move(on_cache) + ).safe_then([](auto extent) { + return CachedExtentRef(extent.detach(), false /* add_ref */); + }); + case extent_types_t::COLL_BLOCK: + return get_extent<collection_manager::CollectionNode>( + offset, length, p_metric_key, std::move(extent_init_func), std::move(on_cache) + ).safe_then([](auto extent) { + return CachedExtentRef(extent.detach(), false /* add_ref */); + }); + case extent_types_t::ONODE_BLOCK_STAGED: + return get_extent<onode::SeastoreNodeExtent>( + 
offset, length, p_metric_key, std::move(extent_init_func), std::move(on_cache) + ).safe_then([](auto extent) { + return CachedExtentRef(extent.detach(), false /* add_ref */); + }); + case extent_types_t::OBJECT_DATA_BLOCK: + return get_extent<ObjectDataBlock>( + offset, length, p_metric_key, std::move(extent_init_func), std::move(on_cache) + ).safe_then([](auto extent) { + return CachedExtentRef(extent.detach(), false /* add_ref */); + }); + case extent_types_t::RETIRED_PLACEHOLDER: + ceph_assert(0 == "impossible"); + return get_extent_ertr::make_ready_future<CachedExtentRef>(); + case extent_types_t::TEST_BLOCK: + return get_extent<TestBlock>( + offset, length, p_metric_key, std::move(extent_init_func), std::move(on_cache) + ).safe_then([](auto extent) { + return CachedExtentRef(extent.detach(), false /* add_ref */); + }); + case extent_types_t::TEST_BLOCK_PHYSICAL: + return get_extent<TestBlockPhysical>( + offset, length, p_metric_key, std::move(extent_init_func), std::move(on_cache) + ).safe_then([](auto extent) { + return CachedExtentRef(extent.detach(), false /* add_ref */); + }); + case extent_types_t::NONE: { + ceph_assert(0 == "NONE is an invalid extent type"); + return get_extent_ertr::make_ready_future<CachedExtentRef>(); + } + default: + ceph_assert(0 == "impossible"); + return get_extent_ertr::make_ready_future<CachedExtentRef>(); + } + }().safe_then([laddr](CachedExtentRef e) { + assert(e->is_logical() == (laddr != L_ADDR_NULL)); + if (e->is_logical()) { + e->cast<LogicalCachedExtent>()->set_laddr(laddr); + } + return get_extent_ertr::make_ready_future<CachedExtentRef>(e); + }); +} + +} diff --git a/src/crimson/os/seastore/cache.h b/src/crimson/os/seastore/cache.h new file mode 100644 index 000000000..c79473f98 --- /dev/null +++ b/src/crimson/os/seastore/cache.h @@ -0,0 +1,1688 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <iostream> + +#include "seastar/core/shared_future.hh" + +#include "include/buffer.h" + +#include "crimson/common/errorator.h" +#include "crimson/os/seastore/cached_extent.h" +#include "crimson/os/seastore/extent_placement_manager.h" +#include "crimson/os/seastore/logging.h" +#include "crimson/os/seastore/random_block_manager.h" +#include "crimson/os/seastore/root_block.h" +#include "crimson/os/seastore/seastore_types.h" +#include "crimson/os/seastore/segment_manager.h" +#include "crimson/os/seastore/transaction.h" + +namespace crimson::os::seastore::backref { +class BtreeBackrefManager; +} + +namespace crimson::os::seastore { + +template < + typename node_key_t, + typename node_val_t, + typename internal_node_t, + typename leaf_node_t, + typename pin_t, + size_t node_size, + bool leaf_has_children> +class FixedKVBtree; +class BackrefManager; +class SegmentProvider; + +struct backref_entry_t { + backref_entry_t( + const paddr_t paddr, + const laddr_t laddr, + const extent_len_t len, + const extent_types_t type, + const journal_seq_t seq) + : paddr(paddr), + laddr(laddr), + len(len), + type(type), + seq(seq) + {} + backref_entry_t(alloc_blk_t alloc_blk) + : paddr(alloc_blk.paddr), + laddr(alloc_blk.laddr), + len(alloc_blk.len), + type(alloc_blk.type) + {} + paddr_t paddr = P_ADDR_NULL; + laddr_t laddr = L_ADDR_NULL; + extent_len_t len = 0; + extent_types_t type = + extent_types_t::ROOT; + journal_seq_t seq; + friend bool operator< ( + const backref_entry_t &l, + const backref_entry_t &r) { + return l.paddr < r.paddr; + } + friend bool operator> ( + const backref_entry_t &l, + 
const backref_entry_t &r) { + return l.paddr > r.paddr; + } + friend bool operator== ( + const backref_entry_t &l, + const backref_entry_t &r) { + return l.paddr == r.paddr; + } + + using set_hook_t = + boost::intrusive::set_member_hook< + boost::intrusive::link_mode< + boost::intrusive::auto_unlink>>; + set_hook_t backref_set_hook; + using backref_set_member_options = boost::intrusive::member_hook< + backref_entry_t, + set_hook_t, + &backref_entry_t::backref_set_hook>; + using multiset_t = boost::intrusive::multiset< + backref_entry_t, + backref_set_member_options, + boost::intrusive::constant_time_size<false>>; + + struct cmp_t { + using is_transparent = paddr_t; + bool operator()( + const backref_entry_t &l, + const backref_entry_t &r) const { + return l.paddr < r.paddr; + } + bool operator()(const paddr_t l, const backref_entry_t &r) const { + return l < r.paddr; + } + bool operator()(const backref_entry_t &l, const paddr_t r) const { + return l.paddr < r; + } + }; +}; + +std::ostream &operator<<(std::ostream &out, const backref_entry_t &ent); + +using backref_entry_ref = std::unique_ptr<backref_entry_t>; +using backref_entry_mset_t = backref_entry_t::multiset_t; +using backref_entry_refs_t = std::vector<backref_entry_ref>; +using backref_entryrefs_by_seq_t = std::map<journal_seq_t, backref_entry_refs_t>; +using backref_entry_query_set_t = std::set< + backref_entry_t, backref_entry_t::cmp_t>; + +/** + * Cache + * + * This component is responsible for buffer management, including + * transaction lifecycle. + * + * Seastore transactions are expressed as an atomic combination of + * 1) newly written blocks + * 2) logical mutations to existing physical blocks + * + * See record_t + * + * As such, any transaction has 3 components: + * 1) read_set: references to extents read during the transaction + * See Transaction::read_set + * 2) write_set: references to extents to be written as: + * a) new physical blocks, see Transaction::fresh_block_list + * b) mutations to existing physical blocks, + * see Transaction::mutated_block_list + * 3) retired_set: extent refs to be retired either due to 2b or + * due to releasing the extent generally. + * + * In the case of 2b, the CachedExtent will have been copied into + * a fresh CachedExtentRef such that the source extent ref is present + * in the read set and the newly allocated extent is present in the + * write_set. + * + * A transaction has 3 phases: + * 1) construction: user calls Cache::get_transaction() and populates + * the returned transaction by calling Cache methods + * 2) submission: user calls Cache::try_start_transaction(). If + * successful, the user may construct a record and submit the + * transaction to the journal. + * 3) completion: once the transaction is durable, the user must call + * Cache::complete_commit() with the block offset to complete + * the transaction. + * + * Internally, in phase 1, the fields in Transaction are filled in. + * - reads may block if the referenced extent is being written + * - once a read obtains a particular CachedExtentRef for a paddr_t, + * it'll always get the same one until overwritten + * - once a paddr_t is overwritten or written, subsequent reads of + * that addr will get the new ref + * + * In phase 2, if all extents in the read set are valid (not expired), + * we can commit (otherwise, we fail and the user must retry).
+ * - Expire all extents in the retired_set (they must all be valid) + * - Remove all extents in the retired_set from Cache::extents + * - Mark all extents in the write_set wait_io(), add promises to + * transaction + * - Merge Transaction::write_set into Cache::extents + * + * After phase 2, the user will submit the record to the journal. + * Once complete, we perform phase 3: + * - For each CachedExtent in block_list, call + * CachedExtent::complete_initial_write(paddr_t) with the block's + * final offset (inferred from the extent's position in the block_list + * and extent lengths). + * - For each block in mutation_list, call + * CachedExtent::delta_written(paddr_t) with the address of the start + * of the record + * - Complete all promises with the final record start paddr_t + * + * + * Cache logs + * + * levels: + * - INFO: major initiation, closing operations + * - DEBUG: major extent related operations, INFO details + * - TRACE: DEBUG details + * - seastore_t logs + */ +class Cache { +public: + using base_ertr = crimson::errorator< + crimson::ct_error::input_output_error>; + using base_iertr = trans_iertr<base_ertr>; + + Cache(ExtentPlacementManager &epm); + ~Cache(); + + /// Creates empty transaction by source + TransactionRef create_transaction( + Transaction::src_t src, + const char* name, + bool is_weak) { + LOG_PREFIX(Cache::create_transaction); + + ++(get_by_src(stats.trans_created_by_src, src)); + + auto ret = std::make_unique<Transaction>( + get_dummy_ordering_handle(), + is_weak, + src, + last_commit, + [this](Transaction& t) { + return on_transaction_destruct(t); + }, + ++next_id + ); + SUBDEBUGT(seastore_t, "created name={}, source={}, is_weak={}", + *ret, name, src, is_weak); + assert(!is_weak || src == Transaction::src_t::READ); + return ret; + } + + /// Resets transaction preserving + void reset_transaction_preserve_handle(Transaction &t) { + LOG_PREFIX(Cache::reset_transaction_preserve_handle); + if (t.did_reset()) { + SUBTRACET(seastore_t, "reset", t); + ++(get_by_src(stats.trans_created_by_src, t.get_src())); + } + t.reset_preserve_handle(last_commit); + } + + /// Declare ref retired in t + void retire_extent(Transaction &t, CachedExtentRef ref) { + LOG_PREFIX(Cache::retire_extent); + SUBDEBUGT(seastore_cache, "retire extent -- {}", t, *ref); + t.add_to_retired_set(ref); + } + + /// Declare paddr retired in t + using retire_extent_iertr = base_iertr; + using retire_extent_ret = base_iertr::future<>; + retire_extent_ret retire_extent_addr( + Transaction &t, paddr_t addr, extent_len_t length); + + /** + * get_root + * + * returns ref to current root or t.root if modified in t + */ + using get_root_iertr = base_iertr; + using get_root_ret = get_root_iertr::future<RootBlockRef>; + get_root_ret get_root(Transaction &t); + + /** + * get_root_fast + * + * returns t.root and assume it is already present/read in t + */ + RootBlockRef get_root_fast(Transaction &t) { + LOG_PREFIX(Cache::get_root_fast); + SUBTRACET(seastore_cache, "root already on t -- {}", t, *t.root); + assert(t.root); + return t.root; + } + + /** + * get_extent + * + * returns ref to extent at offset~length of type T either from + * - extent_set if already in cache + * - disk + */ + using src_ext_t = std::pair<Transaction::src_t, extent_types_t>; + using get_extent_ertr = base_ertr; + template <typename T> + using get_extent_ret = get_extent_ertr::future<TCachedExtentRef<T>>; + template <typename T, typename Func, typename OnCache> + get_extent_ret<T> get_extent( + paddr_t offset, ///< [in] starting addr + 
extent_len_t length, ///< [in] length + const src_ext_t* p_src_ext, ///< [in] cache query metric key + Func &&extent_init_func, ///< [in] init func for extent + OnCache &&on_cache + ) { + LOG_PREFIX(Cache::get_extent); + auto cached = query_cache(offset, p_src_ext); + if (!cached) { + auto ret = CachedExtent::make_cached_extent_ref<T>( + alloc_cache_buf(length)); + ret->init(CachedExtent::extent_state_t::CLEAN_PENDING, + offset, + PLACEMENT_HINT_NULL, + NULL_GENERATION, + TRANS_ID_NULL); + SUBDEBUG(seastore_cache, + "{} {}~{} is absent, add extent and reading ... -- {}", + T::TYPE, offset, length, *ret); + const auto p_src = p_src_ext ? &p_src_ext->first : nullptr; + add_extent(ret, p_src); + on_cache(*ret); + extent_init_func(*ret); + return read_extent<T>( + std::move(ret)); + } + + // extent PRESENT in cache + if (cached->get_type() == extent_types_t::RETIRED_PLACEHOLDER) { + auto ret = CachedExtent::make_cached_extent_ref<T>( + alloc_cache_buf(length)); + ret->init(CachedExtent::extent_state_t::CLEAN_PENDING, + offset, + PLACEMENT_HINT_NULL, + NULL_GENERATION, + TRANS_ID_NULL); + SUBDEBUG(seastore_cache, + "{} {}~{} is absent(placeholder), reading ... -- {}", + T::TYPE, offset, length, *ret); + extents.replace(*ret, *cached); + on_cache(*ret); + + // replace placeholder in transactions + while (!cached->transactions.empty()) { + auto t = cached->transactions.begin()->t; + t->replace_placeholder(*cached, *ret); + } + + cached->state = CachedExtent::extent_state_t::INVALID; + extent_init_func(*ret); + return read_extent<T>( + std::move(ret)); + } else if (!cached->is_fully_loaded()) { + auto ret = TCachedExtentRef<T>(static_cast<T*>(cached.get())); + on_cache(*ret); + SUBDEBUG(seastore_cache, + "{} {}~{} is present without been fully loaded, reading ... 
-- {}", + T::TYPE, offset, length, *ret); + auto bp = alloc_cache_buf(length); + ret->set_bptr(std::move(bp)); + return read_extent<T>( + std::move(ret)); + } else { + SUBTRACE(seastore_cache, + "{} {}~{} is present in cache -- {}", + T::TYPE, offset, length, *cached); + auto ret = TCachedExtentRef<T>(static_cast<T*>(cached.get())); + on_cache(*ret); + return ret->wait_io( + ).then([ret=std::move(ret)]() mutable + -> get_extent_ret<T> { + // ret may be invalid, caller must check + return get_extent_ret<T>( + get_extent_ertr::ready_future_marker{}, + std::move(ret)); + }); + } + } + template <typename T> + get_extent_ret<T> get_extent( + paddr_t offset, ///< [in] starting addr + extent_len_t length, ///< [in] length + const src_ext_t* p_metric_key ///< [in] cache query metric key + ) { + return get_extent<T>( + offset, length, p_metric_key, + [](T &){}, [](T &) {}); + } + + + /** + * get_extent_if_cached + * + * Returns extent at offset if in cache + */ + using get_extent_if_cached_iertr = base_iertr; + using get_extent_if_cached_ret = + get_extent_if_cached_iertr::future<CachedExtentRef>; + get_extent_if_cached_ret get_extent_if_cached( + Transaction &t, + paddr_t offset, + extent_types_t type) { + CachedExtentRef ret; + LOG_PREFIX(Cache::get_extent_if_cached); + auto result = t.get_extent(offset, &ret); + if (result == Transaction::get_extent_ret::RETIRED) { + SUBDEBUGT(seastore_cache, "{} {} is retired on t -- {}", + t, type, offset, *ret); + return get_extent_if_cached_iertr::make_ready_future< + CachedExtentRef>(ret); + } else if (result == Transaction::get_extent_ret::PRESENT) { + if (ret->is_fully_loaded()) { + SUBTRACET(seastore_cache, "{} {} is present on t -- {}", + t, type, offset, *ret); + return ret->wait_io().then([ret] { + return get_extent_if_cached_iertr::make_ready_future< + CachedExtentRef>(ret); + }); + } else { + SUBDEBUGT(seastore_cache, "{} {} is present on t -- {}" + " without being fully loaded", t, type, offset, *ret); + return get_extent_if_cached_iertr::make_ready_future< + CachedExtentRef>(); + } + } + + // get_extent_ret::ABSENT from transaction + auto metric_key = std::make_pair(t.get_src(), type); + ret = query_cache(offset, &metric_key); + if (!ret) { + SUBDEBUGT(seastore_cache, "{} {} is absent", t, type, offset); + return get_extent_if_cached_iertr::make_ready_future<CachedExtentRef>(); + } else if (ret->get_type() == extent_types_t::RETIRED_PLACEHOLDER) { + // retired_placeholder is not really cached yet + SUBDEBUGT(seastore_cache, "{} {} is absent(placeholder)", + t, type, offset); + return get_extent_if_cached_iertr::make_ready_future<CachedExtentRef>(); + } else if (!ret->is_fully_loaded()) { + SUBDEBUGT(seastore_cache, "{} {} is present without " + "being fully loaded", t, type, offset); + return get_extent_if_cached_iertr::make_ready_future<CachedExtentRef>(); + } + + // present in cache(fully loaded) and is not a retired_placeholder + SUBDEBUGT(seastore_cache, "{} {} is present in cache -- {}", + t, type, offset, *ret); + t.add_to_read_set(ret); + touch_extent(*ret); + return ret->wait_io().then([ret] { + return get_extent_if_cached_iertr::make_ready_future< + CachedExtentRef>(ret); + }); + } + + /** + * get_extent + * + * returns ref to extent at offset~length of type T either from + * - t if modified by t + * - extent_set if already in cache + * - disk + * + * t *must not* have retired offset + */ + using get_extent_iertr = base_iertr; + template <typename T, typename Func> + get_extent_iertr::future<TCachedExtentRef<T>> get_extent( + Transaction 
&t, + paddr_t offset, + extent_len_t length, + Func &&extent_init_func) { + CachedExtentRef ret; + LOG_PREFIX(Cache::get_extent); + auto result = t.get_extent(offset, &ret); + if (result == Transaction::get_extent_ret::RETIRED) { + SUBERRORT(seastore_cache, "{} {}~{} is retired on t -- {}", + t, T::TYPE, offset, length, *ret); + ceph_abort("impossible"); + } else if (result == Transaction::get_extent_ret::PRESENT) { + if (ret->is_fully_loaded()) { + SUBTRACET(seastore_cache, "{} {}~{} is present on t -- {}", + t, T::TYPE, offset, length, *ret); + return ret->wait_io().then([ret] { + return seastar::make_ready_future<TCachedExtentRef<T>>( + ret->cast<T>()); + }); + } else { + assert(!ret->is_mutable()); + touch_extent(*ret); + SUBDEBUGT(seastore_cache, "{} {}~{} is present on t without been \ + fully loaded, reading ... {}", t, T::TYPE, offset, length, *ret); + auto bp = alloc_cache_buf(ret->get_length()); + ret->set_bptr(std::move(bp)); + return read_extent<T>( + ret->cast<T>()); + } + } else { + SUBTRACET(seastore_cache, "{} {}~{} is absent on t, query cache ...", + t, T::TYPE, offset, length); + auto f = [&t, this](CachedExtent &ext) { + t.add_to_read_set(CachedExtentRef(&ext)); + touch_extent(ext); + }; + auto metric_key = std::make_pair(t.get_src(), T::TYPE); + return trans_intr::make_interruptible( + get_extent<T>( + offset, length, &metric_key, + std::forward<Func>(extent_init_func), std::move(f)) + ); + } + } + + /* + * get_absent_extent + * + * Mostly the same as Cache::get_extent(), with the only difference + * that get_absent_extent won't search the transaction's context for + * the specific CachedExtent + */ + template <typename T, typename Func> + get_extent_iertr::future<TCachedExtentRef<T>> get_absent_extent( + Transaction &t, + paddr_t offset, + extent_len_t length, + Func &&extent_init_func) { + CachedExtentRef ret; + LOG_PREFIX(Cache::get_absent_extent); + +#ifndef NDEBUG + auto r = t.get_extent(offset, &ret); + if (r != Transaction::get_extent_ret::ABSENT) { + SUBERRORT(seastore_cache, "unexpected non-absent extent {}", t, *ret); + ceph_abort(); + } +#endif + + SUBTRACET(seastore_cache, "{} {}~{} is absent on t, query cache ...", + t, T::TYPE, offset, length); + auto f = [&t, this](CachedExtent &ext) { + t.add_to_read_set(CachedExtentRef(&ext)); + touch_extent(ext); + }; + auto metric_key = std::make_pair(t.get_src(), T::TYPE); + return trans_intr::make_interruptible( + get_extent<T>( + offset, length, &metric_key, + std::forward<Func>(extent_init_func), std::move(f)) + ); + } + + template <typename T> + get_extent_iertr::future<TCachedExtentRef<T>> get_extent( + Transaction &t, + paddr_t offset, + extent_len_t length) { + return get_extent<T>(t, offset, length, [](T &){}); + } + + /* + * get_absent_extent + * + * Mostly the same as Cache::get_extent(), with the only difference + * that get_absent_extent won't search the transaction's context for + * the specific CachedExtent + */ + template <typename T> + get_extent_iertr::future<TCachedExtentRef<T>> get_absent_extent( + Transaction &t, + paddr_t offset, + extent_len_t length) { + return get_absent_extent<T>(t, offset, length, [](T &){}); + } + + get_extent_ertr::future<CachedExtentRef> get_extent_viewable_by_trans( + Transaction &t, + CachedExtentRef extent) + { + auto p_extent = extent->get_transactional_view(t); + if (!p_extent->is_pending_in_trans(t.get_trans_id())) { + t.add_to_read_set(p_extent); + if (!p_extent->is_mutation_pending()) { + touch_extent(*p_extent); + } + } + // user should not see 
RETIRED_PLACEHOLDER extents + ceph_assert(p_extent->get_type() != extent_types_t::RETIRED_PLACEHOLDER); + if (!p_extent->is_fully_loaded()) { + assert(!p_extent->is_mutable()); + touch_extent(*p_extent); + LOG_PREFIX(Cache::get_extent_viewable_by_trans); + SUBDEBUG(seastore_cache, + "{} {}~{} is present without been fully loaded, reading ... -- {}", + p_extent->get_type(), p_extent->get_paddr(),p_extent->get_length(), + *p_extent); + auto bp = alloc_cache_buf(p_extent->get_length()); + p_extent->set_bptr(std::move(bp)); + return read_extent<CachedExtent>(CachedExtentRef(p_extent)); + } + return p_extent->wait_io( + ).then([p_extent] { + return get_extent_ertr::make_ready_future<CachedExtentRef>( + CachedExtentRef(p_extent)); + }); + } + + template <typename T> + get_extent_ertr::future<TCachedExtentRef<T>> get_extent_viewable_by_trans( + Transaction &t, + TCachedExtentRef<T> extent) + { + return get_extent_viewable_by_trans(t, CachedExtentRef(extent.get()) + ).safe_then([](auto p_extent) { + return p_extent->template cast<T>(); + }); + } + + extent_len_t get_block_size() const { + return epm.get_block_size(); + } + +private: + // This is a workaround std::move_only_function not being available, + // not really worth generalizing at this time. + class extent_init_func_t { + struct callable_i { + virtual void operator()(CachedExtent &extent) = 0; + virtual ~callable_i() = default; + }; + template <typename Func> + struct callable_wrapper final : callable_i { + Func func; + callable_wrapper(Func &&func) : func(std::forward<Func>(func)) {} + void operator()(CachedExtent &extent) final { + return func(extent); + } + ~callable_wrapper() final = default; + }; + public: + std::unique_ptr<callable_i> wrapped; + template <typename Func> + extent_init_func_t(Func &&func) : wrapped( + std::make_unique<callable_wrapper<Func>>(std::forward<Func>(func))) + {} + void operator()(CachedExtent &extent) { + return (*wrapped)(extent); + } + }; + get_extent_ertr::future<CachedExtentRef> _get_extent_by_type( + extent_types_t type, + paddr_t offset, + laddr_t laddr, + extent_len_t length, + const Transaction::src_t* p_src, + extent_init_func_t &&extent_init_func, + extent_init_func_t &&on_cache + ); + + using get_extent_by_type_iertr = get_extent_iertr; + using get_extent_by_type_ret = get_extent_by_type_iertr::future< + CachedExtentRef>; + get_extent_by_type_ret _get_extent_by_type( + Transaction &t, + extent_types_t type, + paddr_t offset, + laddr_t laddr, + extent_len_t length, + extent_init_func_t &&extent_init_func + ) { + LOG_PREFIX(Cache::get_extent_by_type); + CachedExtentRef ret; + auto status = t.get_extent(offset, &ret); + if (status == Transaction::get_extent_ret::RETIRED) { + SUBERRORT(seastore_cache, "{} {}~{} {} is retired on t -- {}", + t, type, offset, length, laddr, *ret); + ceph_abort("impossible"); + } else if (status == Transaction::get_extent_ret::PRESENT) { + if (ret->is_fully_loaded()) { + SUBTRACET(seastore_cache, "{} {}~{} {} is present on t -- {}", + t, type, offset, length, laddr, *ret); + return ret->wait_io().then([ret] { + return seastar::make_ready_future<CachedExtentRef>(ret); + }); + } else { + assert(!ret->is_mutable()); + touch_extent(*ret); + SUBDEBUGT(seastore_cache, "{} {}~{} {} is present on t without been \ + fully loaded, reading ...", t, type, offset, length, laddr); + auto bp = alloc_cache_buf(ret->get_length()); + ret->set_bptr(std::move(bp)); + return read_extent<CachedExtent>( + std::move(ret)); + } + } else { + SUBTRACET(seastore_cache, "{} {}~{} {} is absent on t, 
query cache ...", + t, type, offset, length, laddr); + auto f = [&t, this](CachedExtent &ext) { + t.add_to_read_set(CachedExtentRef(&ext)); + touch_extent(ext); + }; + auto src = t.get_src(); + return trans_intr::make_interruptible( + _get_extent_by_type( + type, offset, laddr, length, &src, + std::move(extent_init_func), std::move(f)) + ); + } + } + + get_extent_by_type_ret _get_absent_extent_by_type( + Transaction &t, + extent_types_t type, + paddr_t offset, + laddr_t laddr, + extent_len_t length, + extent_init_func_t &&extent_init_func + ) { + LOG_PREFIX(Cache::_get_absent_extent_by_type); + +#ifndef NDEBUG + CachedExtentRef ret; + auto r = t.get_extent(offset, &ret); + if (r != Transaction::get_extent_ret::ABSENT) { + SUBERRORT(seastore_cache, "unexpected non-absent extent {}", t, *ret); + ceph_abort(); + } +#endif + + SUBTRACET(seastore_cache, "{} {}~{} {} is absent on t, query cache ...", + t, type, offset, length, laddr); + auto f = [&t, this](CachedExtent &ext) { + t.add_to_read_set(CachedExtentRef(&ext)); + touch_extent(ext); + }; + auto src = t.get_src(); + return trans_intr::make_interruptible( + _get_extent_by_type( + type, offset, laddr, length, &src, + std::move(extent_init_func), std::move(f)) + ); + } + + backref_entryrefs_by_seq_t backref_entryrefs_by_seq; + backref_entry_mset_t backref_entry_mset; + + using backref_entry_query_mset_t = std::multiset< + backref_entry_t, backref_entry_t::cmp_t>; + backref_entry_query_mset_t get_backref_entries_in_range( + paddr_t start, + paddr_t end) { + auto start_iter = backref_entry_mset.lower_bound( + start, + backref_entry_t::cmp_t()); + auto end_iter = backref_entry_mset.lower_bound( + end, + backref_entry_t::cmp_t()); + backref_entry_query_mset_t res; + for (auto it = start_iter; + it != end_iter; + it++) { + res.emplace(it->paddr, it->laddr, it->len, it->type, it->seq); + } + return res; + } + + const backref_entry_mset_t& get_backref_entry_mset() { + return backref_entry_mset; + } + + backref_entryrefs_by_seq_t& get_backref_entryrefs_by_seq() { + return backref_entryrefs_by_seq; + } + + const segment_info_t* get_segment_info(segment_id_t sid) { + auto provider = segment_providers_by_device_id[sid.device_id()]; + if (provider) { + return &provider->get_seg_info(sid); + } else { + return nullptr; + } + } + +public: + /** + * get_extent_by_type + * + * Based on type, instantiate the correct concrete type + * and read in the extent at location offset~length. 
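+ *
+ * A minimal usage sketch (illustrative only; the transaction, addresses
+ * and length are hypothetical placeholders supplied by the caller):
+ *
+ *   cache.get_extent_by_type(
+ *     t, extent_types_t::OMAP_LEAF, paddr, laddr, length
+ *   ).si_then([](CachedExtentRef ext) {
+ *     // ext references the concrete OMapLeafNode, reachable through the
+ *     // generic CachedExtent interface or via an explicit cast<>()
+ *   });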
+ */ + template <typename Func> + get_extent_by_type_ret get_extent_by_type( + Transaction &t, ///< [in] transaction + extent_types_t type, ///< [in] type tag + paddr_t offset, ///< [in] starting addr + laddr_t laddr, ///< [in] logical address if logical + extent_len_t length, ///< [in] length + Func &&extent_init_func ///< [in] extent init func + ) { + return _get_extent_by_type( + t, + type, + offset, + laddr, + length, + extent_init_func_t(std::forward<Func>(extent_init_func))); + } + + /* + * get_absent_extent_by_type + * + * Mostly the same as Cache::get_extent_by_type(), with the only difference + * that get_absent_extent_by_type won't search the transaction's context for + * the specific CachedExtent + */ + template <typename Func> + get_extent_by_type_ret get_absent_extent_by_type( + Transaction &t, ///< [in] transaction + extent_types_t type, ///< [in] type tag + paddr_t offset, ///< [in] starting addr + laddr_t laddr, ///< [in] logical address if logical + extent_len_t length, ///< [in] length + Func &&extent_init_func ///< [in] extent init func + ) { + return _get_absent_extent_by_type( + t, + type, + offset, + laddr, + length, + extent_init_func_t(std::forward<Func>(extent_init_func))); + } + + get_extent_by_type_ret get_extent_by_type( + Transaction &t, + extent_types_t type, + paddr_t offset, + laddr_t laddr, + extent_len_t length + ) { + return get_extent_by_type( + t, type, offset, laddr, length, [](CachedExtent &) {}); + } + + + /* + * get_absent_extent_by_type + * + * Mostly the same as Cache::get_extent_by_type(), with the only difference + * that get_absent_extent_by_type won't search the transaction's context for + * the specific CachedExtent + */ + get_extent_by_type_ret get_absent_extent_by_type( + Transaction &t, + extent_types_t type, + paddr_t offset, + laddr_t laddr, + extent_len_t length + ) { + return get_absent_extent_by_type( + t, type, offset, laddr, length, [](CachedExtent &) {}); + } + + void trim_backref_bufs(const journal_seq_t &trim_to) { + LOG_PREFIX(Cache::trim_backref_bufs); + SUBDEBUG(seastore_cache, "trimming to {}", trim_to); + if (!backref_entryrefs_by_seq.empty()) { + SUBDEBUG(seastore_cache, "backref_entryrefs_by_seq {} ~ {}, size={}", + backref_entryrefs_by_seq.rbegin()->first, + backref_entryrefs_by_seq.begin()->first, + backref_entryrefs_by_seq.size()); + assert(backref_entryrefs_by_seq.rbegin()->first >= trim_to); + auto iter = backref_entryrefs_by_seq.upper_bound(trim_to); + backref_entryrefs_by_seq.erase(backref_entryrefs_by_seq.begin(), iter); + } + if (backref_entryrefs_by_seq.empty()) { + SUBDEBUG(seastore_cache, "backref_entryrefs_by_seq all trimmed"); + } + } + + /** + * alloc_new_extent + * + * Allocates a fresh extent. if delayed is true, addr will be alloc'd later. 
+ * Note that epaddr can only be fed by the btree lba unittest for now + */ + template <typename T> + TCachedExtentRef<T> alloc_new_extent( + Transaction &t, ///< [in, out] current transaction + extent_len_t length, ///< [in] length + placement_hint_t hint, ///< [in] user hint +#ifdef UNIT_TESTS_BUILT + rewrite_gen_t gen, ///< [in] rewrite generation + std::optional<paddr_t> epaddr = std::nullopt ///< [in] paddr fed by callers +#else + rewrite_gen_t gen +#endif + ) { + LOG_PREFIX(Cache::alloc_new_extent); + SUBTRACET(seastore_cache, "allocate {} {}B, hint={}, gen={}", + t, T::TYPE, length, hint, rewrite_gen_printer_t{gen}); +#ifdef UNIT_TESTS_BUILT + auto result = epm.alloc_new_extent(t, T::TYPE, length, hint, gen, epaddr); +#else + auto result = epm.alloc_new_extent(t, T::TYPE, length, hint, gen); +#endif + auto ret = CachedExtent::make_cached_extent_ref<T>(std::move(result.bp)); + ret->init(CachedExtent::extent_state_t::INITIAL_WRITE_PENDING, + result.paddr, + hint, + result.gen, + t.get_trans_id()); + t.add_fresh_extent(ret); + SUBDEBUGT(seastore_cache, + "allocated {} {}B extent at {}, hint={}, gen={} -- {}", + t, T::TYPE, length, result.paddr, + hint, rewrite_gen_printer_t{result.gen}, *ret); + return ret; + } + + /** + * alloc_remapped_extent + * + * Allocates an EXIST_CLEAN extent. Use the buffer to fill the new extent + * if buffer exists. + */ + template <typename T> + TCachedExtentRef<T> alloc_remapped_extent( + Transaction &t, + laddr_t remap_laddr, + paddr_t remap_paddr, + extent_len_t remap_length, + laddr_t original_laddr, + std::optional<ceph::bufferptr> &&original_bptr) { + LOG_PREFIX(Cache::alloc_remapped_extent); + assert(remap_laddr >= original_laddr); + TCachedExtentRef<T> ext; + if (original_bptr.has_value()) { + // shallow copy the buffer from original extent + auto nbp = ceph::bufferptr( + *original_bptr, + remap_laddr - original_laddr, + remap_length); + // ExtentPlacementManager::alloc_new_extent will make a new + // (relative/temp) paddr, so make extent directly + ext = CachedExtent::make_cached_extent_ref<T>(std::move(nbp)); + } else { + ext = CachedExtent::make_placeholder_cached_extent_ref<T>(remap_length); + } + + ext->init(CachedExtent::extent_state_t::EXIST_CLEAN, + remap_paddr, + PLACEMENT_HINT_NULL, + NULL_GENERATION, + t.get_trans_id()); + + t.add_fresh_extent(ext); + SUBTRACET(seastore_cache, "allocated {} {}B, hint={}, has ptr? {} -- {}", + t, T::TYPE, remap_length, remap_laddr, original_bptr.has_value(), *ext); + return ext; + } + + /** + * alloc_new_extent + * + * Allocates a fresh extent. addr will be relative until commit. + */ + CachedExtentRef alloc_new_extent_by_type( + Transaction &t, ///< [in, out] current transaction + extent_types_t type, ///< [in] type tag + extent_len_t length, ///< [in] length + placement_hint_t hint, ///< [in] user hint + rewrite_gen_t gen ///< [in] rewrite generation + ); + + /** + * Allocates mutable buffer from extent_set on offset~len + * + * TODO: Note, currently all implementations literally copy the + * buffer. This needn't be true, CachedExtent implementations could + * choose to refer to the same buffer unmodified until commit and just + * buffer the mutations in an ancillary data structure. 
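+ *
+ * A hedged usage sketch (illustrative; `extent` stands for any valid
+ * extent already present in t's read set):
+ *
+ *   auto mut = cache.duplicate_for_write(t, extent);
+ *   // mut is the transaction-local mutable copy; mutations made to its
+ *   // buffer are captured as a delta when the transaction commits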
+ * + * @param current transaction + * @param extent to duplicate + * @return mutable extent + */ + CachedExtentRef duplicate_for_write( + Transaction &t, ///< [in, out] current transaction + CachedExtentRef i ///< [in] ref to existing extent + ); + + /** + * set_segment_providers + * + * Set to provide segment information to help identify out-dated deltas. + * + * FIXME: This is specific to the segmented implementation + */ + void set_segment_providers(std::vector<SegmentProvider*> &&providers) { + segment_providers_by_device_id = std::move(providers); + } + + /** + * prepare_record + * + * Construct the record for Journal from transaction. + */ + record_t prepare_record( + Transaction &t, ///< [in, out] current transaction + const journal_seq_t &journal_head, + const journal_seq_t &journal_dirty_tail + ); + + /** + * complete_commit + * + * Must be called upon completion of write. Releases blocks on mutating + * extents, fills in addresses, and calls relevant callbacks on fresh + * and mutated extents. + */ + void complete_commit( + Transaction &t, ///< [in, out] current transaction + paddr_t final_block_start, ///< [in] offset of initial block + journal_seq_t seq ///< [in] journal commit seq + ); + + /** + * init + */ + void init(); + + /** + * mkfs + * + * Alloc initial root node and add to t. The intention is for other + * components to use t to adjust the resulting root ref prior to commit. + */ + using mkfs_iertr = base_iertr; + mkfs_iertr::future<> mkfs(Transaction &t); + + /** + * close + * + * TODO: should flush dirty blocks + */ + using close_ertr = crimson::errorator< + crimson::ct_error::input_output_error>; + close_ertr::future<> close(); + + /** + * replay_delta + * + * Intended for use in Journal::delta. For each delta, should decode delta, + * read relevant block from disk or cache (using correct type), and call + * CachedExtent::apply_delta marking the extent dirty. + * + * Returns whether the delta is applied. + */ + using replay_delta_ertr = crimson::errorator< + crimson::ct_error::input_output_error>; + using replay_delta_ret = replay_delta_ertr::future<bool>; + replay_delta_ret replay_delta( + journal_seq_t seq, + paddr_t record_block_base, + const delta_info_t &delta, + const journal_seq_t &dirty_tail, + const journal_seq_t &alloc_tail, + sea_time_point modify_time); + + /** + * init_cached_extents + * + * Calls passed lambda for each dirty cached block. Intended for use + * after replay to allow lba_manager (or whichever component) to read in any ancestor + * blocks.
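+ *
+ * A hedged usage sketch (illustrative; `lba_manager` and its
+ * init_cached_extent() stand in for whichever component owns the
+ * liveness information for the extent):
+ *
+ *   cache.init_cached_extents(t, [&](auto &t, auto &e) {
+ *     // must return an interruptible future<bool>: true if the extent
+ *     // is still referenced and should stay cached, false to drop it
+ *     return lba_manager.init_cached_extent(t, e);
+ *   });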
+ */ + using init_cached_extents_iertr = base_iertr; + using init_cached_extents_ret = init_cached_extents_iertr::future<>; + template <typename F> + init_cached_extents_ret init_cached_extents( + Transaction &t, + F &&f) + { + LOG_PREFIX(Cache::init_cached_extents); + SUBINFOT(seastore_cache, + "start with {}({}B) extents, {} dirty, dirty_from={}, alloc_from={}", + t, + extents.size(), + extents.get_bytes(), + dirty.size(), + get_oldest_dirty_from().value_or(JOURNAL_SEQ_NULL), + get_oldest_backref_dirty_from().value_or(JOURNAL_SEQ_NULL)); + + // journal replay should has been finished at this point, + // Cache::root should have been inserted to the dirty list + assert(root->is_dirty()); + std::vector<CachedExtentRef> _dirty; + for (auto &e : extents) { + _dirty.push_back(CachedExtentRef(&e)); + } + return seastar::do_with( + std::forward<F>(f), + std::move(_dirty), + [this, FNAME, &t](auto &f, auto &refs) mutable + { + return trans_intr::do_for_each( + refs, + [this, FNAME, &t, &f](auto &e) + { + SUBTRACET(seastore_cache, "inspecting extent ... -- {}", t, *e); + return f(t, e + ).si_then([this, FNAME, &t, e](bool is_alive) { + if (!is_alive) { + SUBDEBUGT(seastore_cache, "extent is not alive, remove extent -- {}", t, *e); + remove_extent(e); + e->set_invalid(t); + } else { + SUBDEBUGT(seastore_cache, "extent is alive -- {}", t, *e); + } + }); + }); + }).handle_error_interruptible( + init_cached_extents_iertr::pass_further{}, + crimson::ct_error::assert_all{ + "Invalid error in Cache::init_cached_extents" + } + ).si_then([this, FNAME, &t] { + SUBINFOT(seastore_cache, + "finish with {}({}B) extents, {} dirty, dirty_from={}, alloc_from={}", + t, + extents.size(), + extents.get_bytes(), + dirty.size(), + get_oldest_dirty_from().value_or(JOURNAL_SEQ_NULL), + get_oldest_backref_dirty_from().value_or(JOURNAL_SEQ_NULL)); + }); + } + + /** + * update_extent_from_transaction + * + * Updates passed extent based on t. If extent has been retired, + * a null result will be returned. + */ + CachedExtentRef update_extent_from_transaction( + Transaction &t, + CachedExtentRef extent) { + if (extent->get_type() == extent_types_t::ROOT) { + if (t.root) { + return t.root; + } else { + t.add_to_read_set(extent); + t.root = extent->cast<RootBlock>(); + return extent; + } + } else { + auto result = t.get_extent(extent->get_paddr(), &extent); + if (result == Transaction::get_extent_ret::RETIRED) { + return CachedExtentRef(); + } else { + if (result == Transaction::get_extent_ret::ABSENT) { + t.add_to_read_set(extent); + } + return extent; + } + } + } + + /** + * print + * + * Dump summary of contents (TODO) + */ + std::ostream &print( + std::ostream &out) const { + return out; + } + + /** + * get_next_dirty_extents + * + * Returns extents with get_dirty_from() < seq and adds to read set of + * t. 
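+ *
+ * A minimal usage sketch (illustrative; `target` and `max_bytes` would be
+ * chosen by the caller, e.g. a journal trimming policy):
+ *
+ *   cache.get_next_dirty_extents(t, target, max_bytes
+ *   ).si_then([](std::vector<CachedExtentRef> to_flush) {
+ *     // extents with get_dirty_from() < target, ordered by dirty_from and
+ *     // already added to t's read set; candidates for writeback
+ *   });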
+ */ + using get_next_dirty_extents_iertr = base_iertr; + using get_next_dirty_extents_ret = get_next_dirty_extents_iertr::future< + std::vector<CachedExtentRef>>; + get_next_dirty_extents_ret get_next_dirty_extents( + Transaction &t, + journal_seq_t seq, + size_t max_bytes); + + /// returns std::nullopt if no pending alloc-infos + std::optional<journal_seq_t> get_oldest_backref_dirty_from() const { + LOG_PREFIX(Cache::get_oldest_backref_dirty_from); + if (backref_entryrefs_by_seq.empty()) { + SUBDEBUG(seastore_cache, "backref_oldest: null"); + return std::nullopt; + } + auto oldest = backref_entryrefs_by_seq.begin()->first; + SUBDEBUG(seastore_cache, "backref_oldest: {}", oldest); + ceph_assert(oldest != JOURNAL_SEQ_NULL); + return oldest; + } + + /// returns std::nullopt if no dirty extents + /// returns JOURNAL_SEQ_NULL if the oldest dirty extent is still pending + std::optional<journal_seq_t> get_oldest_dirty_from() const { + LOG_PREFIX(Cache::get_oldest_dirty_from); + if (dirty.empty()) { + SUBDEBUG(seastore_cache, "dirty_oldest: null"); + return std::nullopt; + } else { + auto oldest = dirty.begin()->get_dirty_from(); + if (oldest == JOURNAL_SEQ_NULL) { + SUBDEBUG(seastore_cache, "dirty_oldest: pending"); + } else { + SUBDEBUG(seastore_cache, "dirty_oldest: {}", oldest); + } + return oldest; + } + } + + /// Dump live extents + void dump_contents(); + + /** + * backref_extent_entry_t + * + * All the backref extent entries have to be indexed by paddr in memory, + * so they can be retrived by range during cleaning. + * + * See BtreeBackrefManager::retrieve_backref_extents_in_range() + */ + struct backref_extent_entry_t { + backref_extent_entry_t( + paddr_t paddr, + paddr_t key, + extent_types_t type) + : paddr(paddr), key(key), type(type) {} + paddr_t paddr = P_ADDR_NULL; + paddr_t key = P_ADDR_NULL; + extent_types_t type = extent_types_t::ROOT; + struct cmp_t { + using is_transparent = paddr_t; + bool operator()( + const backref_extent_entry_t &l, + const backref_extent_entry_t &r) const { + return l.paddr < r.paddr; + } + bool operator()( + const paddr_t &l, + const backref_extent_entry_t &r) const { + return l < r.paddr; + } + bool operator()( + const backref_extent_entry_t &l, + const paddr_t &r) const { + return l.paddr < r; + } + }; + }; + + void update_tree_extents_num(extent_types_t type, int64_t delta) { + switch (type) { + case extent_types_t::LADDR_INTERNAL: + [[fallthrough]]; + case extent_types_t::DINK_LADDR_LEAF: + [[fallthrough]]; + case extent_types_t::LADDR_LEAF: + stats.lba_tree_extents_num += delta; + ceph_assert(stats.lba_tree_extents_num >= 0); + return; + case extent_types_t::OMAP_INNER: + [[fallthrough]]; + case extent_types_t::OMAP_LEAF: + stats.omap_tree_extents_num += delta; + ceph_assert(stats.lba_tree_extents_num >= 0); + return; + case extent_types_t::ONODE_BLOCK_STAGED: + stats.onode_tree_extents_num += delta; + ceph_assert(stats.onode_tree_extents_num >= 0); + return; + case extent_types_t::BACKREF_INTERNAL: + [[fallthrough]]; + case extent_types_t::BACKREF_LEAF: + stats.backref_tree_extents_num += delta; + ceph_assert(stats.backref_tree_extents_num >= 0); + return; + default: + return; + } + } + + uint64_t get_omap_tree_depth() { + return stats.omap_tree_depth; + } + + /// Update lru for access to ref + void touch_extent( + CachedExtent &ext, + const Transaction::src_t* p_src=nullptr) + { + if (p_src && is_background_transaction(*p_src)) + return; + if (ext.is_stable_clean() && !ext.is_placeholder()) { + lru.move_to_top(ext); + } + } + +private: + 
ExtentPlacementManager& epm; + RootBlockRef root; ///< ref to current root + ExtentIndex extents; ///< set of live extents + + journal_seq_t last_commit = JOURNAL_SEQ_MIN; + + // FIXME: This is specific to the segmented implementation + std::vector<SegmentProvider*> segment_providers_by_device_id; + + transaction_id_t next_id = 0; + + /** + * dirty + * + * holds refs to dirty extents. Ordered by CachedExtent::get_dirty_from(). + */ + CachedExtent::list dirty; + + using backref_extent_entry_query_set_t = + std::set< + backref_extent_entry_t, + backref_extent_entry_t::cmp_t>; + backref_extent_entry_query_set_t backref_extents; + + void add_backref_extent( + paddr_t paddr, + paddr_t key, + extent_types_t type) { + assert(!paddr.is_relative()); + auto [iter, inserted] = backref_extents.emplace(paddr, key, type); + boost::ignore_unused(inserted); + assert(inserted); + } + + void remove_backref_extent(paddr_t paddr) { + auto iter = backref_extents.find(paddr); + if (iter != backref_extents.end()) + backref_extents.erase(iter); + } + + backref_extent_entry_query_set_t get_backref_extents_in_range( + paddr_t start, + paddr_t end) { + auto start_iter = backref_extents.lower_bound(start); + auto end_iter = backref_extents.upper_bound(end); + backref_extent_entry_query_set_t res; + res.insert(start_iter, end_iter); + return res; + } + + friend class crimson::os::seastore::backref::BtreeBackrefManager; + friend class crimson::os::seastore::BackrefManager; + /** + * lru + * + * holds references to recently used extents + */ + class LRU { + // max size (bytes) + const size_t capacity = 0; + + // current size (bytes) + size_t contents = 0; + + CachedExtent::list lru; + + void trim_to_capacity() { + while (contents > capacity) { + assert(lru.size() > 0); + remove_from_lru(lru.front()); + } + } + + void add_to_lru(CachedExtent &extent) { + assert(extent.is_stable_clean() && !extent.is_placeholder()); + + if (!extent.primary_ref_list_hook.is_linked()) { + contents += extent.get_length(); + intrusive_ptr_add_ref(&extent); + lru.push_back(extent); + } + trim_to_capacity(); + } + + public: + LRU(size_t capacity) : capacity(capacity) {} + + size_t get_capacity() const { + return capacity; + } + + size_t get_current_contents_bytes() const { + return contents; + } + + size_t get_current_contents_extents() const { + return lru.size(); + } + + void remove_from_lru(CachedExtent &extent) { + assert(extent.is_stable_clean() && !extent.is_placeholder()); + + if (extent.primary_ref_list_hook.is_linked()) { + lru.erase(lru.s_iterator_to(extent)); + assert(contents >= extent.get_length()); + contents -= extent.get_length(); + intrusive_ptr_release(&extent); + } + } + + void move_to_top(CachedExtent &extent) { + assert(extent.is_stable_clean() && !extent.is_placeholder()); + + if (extent.primary_ref_list_hook.is_linked()) { + lru.erase(lru.s_iterator_to(extent)); + intrusive_ptr_release(&extent); + assert(contents >= extent.get_length()); + contents -= extent.get_length(); + } + add_to_lru(extent); + } + + void clear() { + LOG_PREFIX(Cache::LRU::clear); + for (auto iter = lru.begin(); iter != lru.end();) { + SUBDEBUG(seastore_cache, "clearing {}", *iter); + remove_from_lru(*(iter++)); + } + } + + ~LRU() { + clear(); + } + } lru; + + struct query_counters_t { + uint64_t access = 0; + uint64_t hit = 0; + }; + + template <typename CounterT> + using counter_by_extent_t = std::array<CounterT, EXTENT_TYPES_MAX>; + + struct invalid_trans_efforts_t { + io_stat_t read; + io_stat_t mutate; + uint64_t mutate_delta_bytes = 0; + 
io_stat_t retire; + io_stat_t fresh; + io_stat_t fresh_ool_written; + counter_by_extent_t<uint64_t> num_trans_invalidated; + uint64_t total_trans_invalidated = 0; + uint64_t num_ool_records = 0; + uint64_t ool_record_bytes = 0; + }; + + struct commit_trans_efforts_t { + counter_by_extent_t<io_stat_t> read_by_ext; + counter_by_extent_t<io_stat_t> mutate_by_ext; + counter_by_extent_t<uint64_t> delta_bytes_by_ext; + counter_by_extent_t<io_stat_t> retire_by_ext; + counter_by_extent_t<io_stat_t> fresh_invalid_by_ext; // inline but is already invalid (retired) + counter_by_extent_t<io_stat_t> fresh_inline_by_ext; + counter_by_extent_t<io_stat_t> fresh_ool_by_ext; + uint64_t num_trans = 0; // the number of inline records + uint64_t num_ool_records = 0; + uint64_t ool_record_metadata_bytes = 0; + uint64_t ool_record_data_bytes = 0; + uint64_t inline_record_metadata_bytes = 0; // metadata exclude the delta bytes + }; + + struct success_read_trans_efforts_t { + io_stat_t read; + uint64_t num_trans = 0; + }; + + struct tree_efforts_t { + uint64_t num_inserts = 0; + uint64_t num_erases = 0; + uint64_t num_updates = 0; + + void increment(const Transaction::tree_stats_t& incremental) { + num_inserts += incremental.num_inserts; + num_erases += incremental.num_erases; + num_updates += incremental.num_updates; + } + }; + + template <typename CounterT> + using counter_by_src_t = std::array<CounterT, TRANSACTION_TYPE_MAX>; + + static constexpr std::size_t NUM_SRC_COMB = + TRANSACTION_TYPE_MAX * (TRANSACTION_TYPE_MAX + 1) / 2; + + struct { + counter_by_src_t<uint64_t> trans_created_by_src; + counter_by_src_t<commit_trans_efforts_t> committed_efforts_by_src; + counter_by_src_t<invalid_trans_efforts_t> invalidated_efforts_by_src; + counter_by_src_t<query_counters_t> cache_query_by_src; + success_read_trans_efforts_t success_read_efforts; + uint64_t dirty_bytes = 0; + + uint64_t onode_tree_depth = 0; + int64_t onode_tree_extents_num = 0; + counter_by_src_t<tree_efforts_t> committed_onode_tree_efforts; + counter_by_src_t<tree_efforts_t> invalidated_onode_tree_efforts; + + uint64_t omap_tree_depth = 0; + int64_t omap_tree_extents_num = 0; + counter_by_src_t<tree_efforts_t> committed_omap_tree_efforts; + counter_by_src_t<tree_efforts_t> invalidated_omap_tree_efforts; + + uint64_t lba_tree_depth = 0; + int64_t lba_tree_extents_num = 0; + counter_by_src_t<tree_efforts_t> committed_lba_tree_efforts; + counter_by_src_t<tree_efforts_t> invalidated_lba_tree_efforts; + + uint64_t backref_tree_depth = 0; + int64_t backref_tree_extents_num = 0; + counter_by_src_t<tree_efforts_t> committed_backref_tree_efforts; + counter_by_src_t<tree_efforts_t> invalidated_backref_tree_efforts; + + std::array<uint64_t, NUM_SRC_COMB> trans_conflicts_by_srcs; + counter_by_src_t<uint64_t> trans_conflicts_by_unknown; + + version_stat_t committed_dirty_version; + version_stat_t committed_reclaim_version; + } stats; + + template <typename CounterT> + CounterT& get_by_src( + counter_by_src_t<CounterT>& counters_by_src, + Transaction::src_t src) { + assert(static_cast<std::size_t>(src) < counters_by_src.size()); + return counters_by_src[static_cast<std::size_t>(src)]; + } + + template <typename CounterT> + CounterT& get_by_ext( + counter_by_extent_t<CounterT>& counters_by_ext, + extent_types_t ext) { + auto index = static_cast<uint8_t>(ext); + assert(index < EXTENT_TYPES_MAX); + return counters_by_ext[index]; + } + + void account_conflict(Transaction::src_t src1, Transaction::src_t src2) { + assert(src1 < Transaction::src_t::MAX); + assert(src2 < 
Transaction::src_t::MAX); + if (src1 > src2) { + std::swap(src1, src2); + } + // impossible combinations + // should be consistent with trans_srcs_invalidated in register_metrics() + assert(!(src1 == Transaction::src_t::READ && + src2 == Transaction::src_t::READ)); + assert(!(src1 == Transaction::src_t::TRIM_DIRTY && + src2 == Transaction::src_t::TRIM_DIRTY)); + assert(!(src1 == Transaction::src_t::CLEANER_MAIN && + src2 == Transaction::src_t::CLEANER_MAIN)); + assert(!(src1 == Transaction::src_t::CLEANER_COLD && + src2 == Transaction::src_t::CLEANER_COLD)); + assert(!(src1 == Transaction::src_t::TRIM_ALLOC && + src2 == Transaction::src_t::TRIM_ALLOC)); + + auto src1_value = static_cast<std::size_t>(src1); + auto src2_value = static_cast<std::size_t>(src2); + auto num_srcs = static_cast<std::size_t>(Transaction::src_t::MAX); + auto conflict_index = num_srcs * src1_value + src2_value - + src1_value * (src1_value + 1) / 2; + assert(conflict_index < NUM_SRC_COMB); + ++stats.trans_conflicts_by_srcs[conflict_index]; + } + + seastar::metrics::metric_group metrics; + void register_metrics(); + + /// alloc buffer for cached extent + bufferptr alloc_cache_buf(size_t size) { + // TODO: memory pooling etc + auto bp = ceph::bufferptr( + buffer::create_page_aligned(size)); + bp.zero(); + return bp; + } + + void backref_batch_update( + std::vector<backref_entry_ref> &&, + const journal_seq_t &); + + /// Add extent to extents handling dirty and refcounting + void add_extent(CachedExtentRef ref, const Transaction::src_t* t_src); + + /// Mark exising extent ref dirty -- mainly for replay + void mark_dirty(CachedExtentRef ref); + + /// Add dirty extent to dirty list + void add_to_dirty(CachedExtentRef ref); + + /// Remove from dirty list + void remove_from_dirty(CachedExtentRef ref); + + /// Remove extent from extents handling dirty and refcounting + void remove_extent(CachedExtentRef ref); + + /// Retire extent + void commit_retire_extent(Transaction& t, CachedExtentRef ref); + + /// Replace prev with next + void commit_replace_extent(Transaction& t, CachedExtentRef next, CachedExtentRef prev); + + /// Invalidate extent and mark affected transactions + void invalidate_extent(Transaction& t, CachedExtent& extent); + + /// Mark a valid transaction as conflicted + void mark_transaction_conflicted( + Transaction& t, CachedExtent& conflicting_extent); + + /// Introspect transaction when it is being destructed + void on_transaction_destruct(Transaction& t); + + template <typename T> + get_extent_ret<T> read_extent( + TCachedExtentRef<T>&& extent + ) { + assert(extent->state == CachedExtent::extent_state_t::CLEAN_PENDING || + extent->state == CachedExtent::extent_state_t::EXIST_CLEAN || + extent->state == CachedExtent::extent_state_t::CLEAN); + extent->set_io_wait(); + return epm.read( + extent->get_paddr(), + extent->get_length(), + extent->get_bptr() + ).safe_then( + [extent=std::move(extent)]() mutable { + LOG_PREFIX(Cache::read_extent); + if (likely(extent->state == CachedExtent::extent_state_t::CLEAN_PENDING)) { + extent->state = CachedExtent::extent_state_t::CLEAN; + /* TODO: crc should be checked against LBA manager */ + extent->last_committed_crc = extent->get_crc32c(); + + extent->on_clean_read(); + } else if (extent->state == CachedExtent::extent_state_t::EXIST_CLEAN || + extent->state == CachedExtent::extent_state_t::CLEAN) { + /* TODO: crc should be checked against LBA manager */ + extent->last_committed_crc = extent->get_crc32c(); + } else { + ceph_assert(!extent->is_valid()); + } + 
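+ // in every case, complete the IO wait so readers blocked in wait_io()
+ // are unblocked; they must re-check the extent's validity themselves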
extent->complete_io(); + SUBDEBUG(seastore_cache, "read extent done -- {}", *extent); + return get_extent_ertr::make_ready_future<TCachedExtentRef<T>>( + std::move(extent)); + }, + get_extent_ertr::pass_further{}, + crimson::ct_error::assert_all{ + "Cache::get_extent: invalid error" + } + ); + } + + // Extents in cache may contain placeholders + CachedExtentRef query_cache( + paddr_t offset, + const src_ext_t* p_metric_key) { + query_counters_t* p_counters = nullptr; + if (p_metric_key) { + p_counters = &get_by_src(stats.cache_query_by_src, p_metric_key->first); + ++p_counters->access; + } + if (auto iter = extents.find_offset(offset); + iter != extents.end()) { + if (p_metric_key && + // retired_placeholder is not really cached yet + iter->get_type() != extent_types_t::RETIRED_PLACEHOLDER) { + ++p_counters->hit; + } + return CachedExtentRef(&*iter); + } else { + return CachedExtentRef(); + } + } + + template < + typename node_key_t, + typename node_val_t, + typename internal_node_t, + typename leaf_node_t, + typename pin_t, + size_t node_size, + bool leaf_has_children> + friend class FixedKVBtree; +}; +using CacheRef = std::unique_ptr<Cache>; + +} diff --git a/src/crimson/os/seastore/cached_extent.cc b/src/crimson/os/seastore/cached_extent.cc new file mode 100644 index 000000000..769b0446a --- /dev/null +++ b/src/crimson/os/seastore/cached_extent.cc @@ -0,0 +1,176 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "crimson/os/seastore/cached_extent.h" +#include "crimson/os/seastore/transaction.h" + +#include "crimson/common/log.h" + +#include "crimson/os/seastore/btree/fixed_kv_node.h" + +namespace { + [[maybe_unused]] seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_seastore_tm); + } +} + +namespace crimson::os::seastore { + +#ifdef DEBUG_CACHED_EXTENT_REF + +void intrusive_ptr_add_ref(CachedExtent *ptr) +{ + intrusive_ptr_add_ref( + static_cast<boost::intrusive_ref_counter< + CachedExtent, + boost::thread_unsafe_counter>*>(ptr)); + logger().debug("intrusive_ptr_add_ref: {}", *ptr); +} + +void intrusive_ptr_release(CachedExtent *ptr) +{ + logger().debug("intrusive_ptr_release: {}", *ptr); + intrusive_ptr_release( + static_cast<boost::intrusive_ref_counter< + CachedExtent, + boost::thread_unsafe_counter>*>(ptr)); +} + +#endif + +bool is_backref_mapped_extent_node(const CachedExtentRef &extent) { + return extent->is_logical() + || is_lba_node(extent->get_type()) + || extent->get_type() == extent_types_t::TEST_BLOCK_PHYSICAL; +} + +std::ostream &operator<<(std::ostream &out, CachedExtent::extent_state_t state) +{ + switch (state) { + case CachedExtent::extent_state_t::INITIAL_WRITE_PENDING: + return out << "INITIAL_WRITE_PENDING"; + case CachedExtent::extent_state_t::MUTATION_PENDING: + return out << "MUTATION_PENDING"; + case CachedExtent::extent_state_t::CLEAN_PENDING: + return out << "CLEAN_PENDING"; + case CachedExtent::extent_state_t::CLEAN: + return out << "CLEAN"; + case CachedExtent::extent_state_t::DIRTY: + return out << "DIRTY"; + case CachedExtent::extent_state_t::EXIST_CLEAN: + return out << "EXIST_CLEAN"; + case CachedExtent::extent_state_t::EXIST_MUTATION_PENDING: + return out << "EXIST_MUTATION_PENDING"; + case CachedExtent::extent_state_t::INVALID: + return out << "INVALID"; + default: + return out << "UNKNOWN"; + } +} + +std::ostream &operator<<(std::ostream &out, const CachedExtent &ext) +{ + return ext.print(out); +} + +CachedExtent::~CachedExtent() +{ + if (parent_index) { + 
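+ // ExtentIndex holds no reference to its extents (see ExtentIndex in
+ // cached_extent.h), so an extent still linked into an index must erase
+ // itself here to avoid leaving a dangling pointer behind.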
assert(is_linked()); + parent_index->erase(*this); + } +} +CachedExtent* CachedExtent::get_transactional_view(Transaction &t) { + return get_transactional_view(t.get_trans_id()); +} + +CachedExtent* CachedExtent::get_transactional_view(transaction_id_t tid) { + auto it = mutation_pendings.find(tid, trans_spec_view_t::cmp_t()); + if (it != mutation_pendings.end()) { + return (CachedExtent*)&(*it); + } else { + return this; + } +} + +std::ostream &operator<<(std::ostream &out, const parent_tracker_t &tracker) { + return out << "parent_tracker=" << (void*)&tracker + << ", parent=" << (void*)tracker.get_parent().get(); +} + +std::ostream &ChildableCachedExtent::print_detail(std::ostream &out) const { + if (parent_tracker) { + out << *parent_tracker; + } else { + out << ", parent_tracker=" << (void*)nullptr; + } + _print_detail(out); + return out; +} + +std::ostream &LogicalCachedExtent::_print_detail(std::ostream &out) const +{ + out << ", laddr=" << laddr; + return print_detail_l(out); +} + +void child_pos_t::link_child(ChildableCachedExtent *c) { + get_parent<FixedKVNode<laddr_t>>()->link_child(c, pos); +} + +void CachedExtent::set_invalid(Transaction &t) { + state = extent_state_t::INVALID; + if (trans_view_hook.is_linked()) { + trans_view_hook.unlink(); + } + on_invalidated(t); +} + +LogicalCachedExtent::~LogicalCachedExtent() { + if (has_parent_tracker() && is_valid() && !is_pending()) { + assert(get_parent_node()); + auto parent = get_parent_node<FixedKVNode<laddr_t>>(); + auto off = parent->lower_bound_offset(laddr); + assert(parent->get_key_from_idx(off) == laddr); + assert(parent->children[off] == this); + parent->children[off] = nullptr; + } +} + +void LogicalCachedExtent::on_replace_prior(Transaction &t) { + assert(is_mutation_pending()); + take_prior_parent_tracker(); + assert(get_parent_node()); + auto parent = get_parent_node<FixedKVNode<laddr_t>>(); + //TODO: can this search be avoided? + auto off = parent->lower_bound_offset(laddr); + assert(parent->get_key_from_idx(off) == laddr); + parent->children[off] = this; +} + +parent_tracker_t::~parent_tracker_t() { + // this is parent's tracker, reset it + auto &p = (FixedKVNode<laddr_t>&)*parent; + if (p.my_tracker == this) { + p.my_tracker = nullptr; + } +} + +std::ostream &operator<<(std::ostream &out, const LBAMapping &rhs) +{ + return out << "LBAMapping(" << rhs.get_key() << "~" << rhs.get_length() + << "->" << rhs.get_val(); +} + +std::ostream &operator<<(std::ostream &out, const lba_pin_list_t &rhs) +{ + bool first = true; + out << '['; + for (const auto &i: rhs) { + out << (first ? 
"" : ",") << *i; + first = false; + } + return out << ']'; +} + +} diff --git a/src/crimson/os/seastore/cached_extent.h b/src/crimson/os/seastore/cached_extent.h new file mode 100644 index 000000000..02f8ae46c --- /dev/null +++ b/src/crimson/os/seastore/cached_extent.h @@ -0,0 +1,1304 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <iostream> + +#include <boost/intrusive/list.hpp> +#include <boost/intrusive_ptr.hpp> +#include <boost/smart_ptr/intrusive_ref_counter.hpp> + +#include "seastar/core/shared_future.hh" + +#include "include/buffer.h" +#include "crimson/common/errorator.h" +#include "crimson/common/interruptible_future.h" +#include "crimson/os/seastore/seastore_types.h" + +struct btree_lba_manager_test; + +namespace crimson::os::seastore { + +class Transaction; +class CachedExtent; +using CachedExtentRef = boost::intrusive_ptr<CachedExtent>; +class SegmentedAllocator; +class TransactionManager; +class ExtentPlacementManager; + +template < + typename node_key_t, + typename node_val_t, + typename internal_node_t, + typename leaf_node_t, + typename pin_t, + size_t node_size, + bool leaf_has_children> +class FixedKVBtree; +template <typename, typename> +class BtreeNodeMapping; + +// #define DEBUG_CACHED_EXTENT_REF +#ifdef DEBUG_CACHED_EXTENT_REF + +void intrusive_ptr_add_ref(CachedExtent *); +void intrusive_ptr_release(CachedExtent *); + +#endif + +template <typename T> +using TCachedExtentRef = boost::intrusive_ptr<T>; + +/** + * CachedExtent + */ +namespace onode { + class DummyNodeExtent; + class TestReplayExtent; +} + +template <typename T> +class read_set_item_t { + using set_hook_t = boost::intrusive::set_member_hook< + boost::intrusive::link_mode< + boost::intrusive::auto_unlink>>; + set_hook_t trans_hook; + using set_hook_options = boost::intrusive::member_hook< + read_set_item_t, + set_hook_t, + &read_set_item_t::trans_hook>; + +public: + struct cmp_t { + using is_transparent = paddr_t; + bool operator()(const read_set_item_t<T> &lhs, const read_set_item_t &rhs) const; + bool operator()(const paddr_t &lhs, const read_set_item_t<T> &rhs) const; + bool operator()(const read_set_item_t<T> &lhs, const paddr_t &rhs) const; + }; + + struct trans_cmp_t { + bool operator()( + const read_set_item_t<Transaction> &lhs, + const read_set_item_t<Transaction> &rhs) const { + return lhs.t < rhs.t; + } + bool operator()( + const Transaction *lhs, + const read_set_item_t<Transaction> &rhs) const { + return lhs < rhs.t; + } + bool operator()( + const read_set_item_t<Transaction> &lhs, + const Transaction *rhs) const { + return lhs.t < rhs; + } + }; + + using trans_set_t = boost::intrusive::set< + read_set_item_t, + set_hook_options, + boost::intrusive::constant_time_size<false>, + boost::intrusive::compare<trans_cmp_t>>; + + T *t = nullptr; + CachedExtentRef ref; + + read_set_item_t(T *t, CachedExtentRef ref); + read_set_item_t(const read_set_item_t &) = delete; + read_set_item_t(read_set_item_t &&) = default; + ~read_set_item_t() = default; +}; +template <typename T> +using read_set_t = std::set< + read_set_item_t<T>, + typename read_set_item_t<T>::cmp_t>; + +struct trans_spec_view_t { + // if the extent is pending, contains the id of the owning transaction; + // TRANS_ID_NULL otherwise + transaction_id_t pending_for_transaction = TRANS_ID_NULL; + + struct cmp_t { + bool operator()( + const trans_spec_view_t &lhs, + const trans_spec_view_t &rhs) const + { + return lhs.pending_for_transaction < 
rhs.pending_for_transaction; + } + bool operator()( + const transaction_id_t &lhs, + const trans_spec_view_t &rhs) const + { + return lhs < rhs.pending_for_transaction; + } + bool operator()( + const trans_spec_view_t &lhs, + const transaction_id_t &rhs) const + { + return lhs.pending_for_transaction < rhs; + } + }; + + using trans_view_hook_t = + boost::intrusive::set_member_hook< + boost::intrusive::link_mode< + boost::intrusive::auto_unlink>>; + trans_view_hook_t trans_view_hook; + + using trans_view_member_options = + boost::intrusive::member_hook< + trans_spec_view_t, + trans_view_hook_t, + &trans_spec_view_t::trans_view_hook>; + using trans_view_set_t = boost::intrusive::set< + trans_spec_view_t, + trans_view_member_options, + boost::intrusive::constant_time_size<false>, + boost::intrusive::compare<cmp_t>>; +}; + +class ExtentIndex; +class CachedExtent + : public boost::intrusive_ref_counter< + CachedExtent, boost::thread_unsafe_counter>, + public trans_spec_view_t { + enum class extent_state_t : uint8_t { + INITIAL_WRITE_PENDING, // In Transaction::write_set and fresh_block_list + MUTATION_PENDING, // In Transaction::write_set and mutated_block_list + CLEAN_PENDING, // CLEAN, but not yet read out + CLEAN, // In Cache::extent_index, Transaction::read_set + // during write, contents match disk, version == 0 + DIRTY, // Same as CLEAN, but contents do not match disk, + // version > 0 + EXIST_CLEAN, // Similar to CLEAN, but its metadata not yet + // persisted to disk. + // In Transaction::write_set and existing_block_list. + // After transaction commits, state becomes CLEAN + // and add extent to Cache. Modifing such extents + // will cause state turn to EXIST_MUTATION_PENDING. + EXIST_MUTATION_PENDING,// Similar to MUTATION_PENDING, but its prior_instance + // is empty. + // In Transaction::write_set, existing_block_list and + // mutated_block_list. State becomes DIRTY and it is + // added to Cache after transaction commits. + INVALID // Part of no ExtentIndex set + } state = extent_state_t::INVALID; + friend std::ostream &operator<<(std::ostream &, extent_state_t); + // allow a dummy extent to pretend it is at a specific state + friend class onode::DummyNodeExtent; + friend class onode::TestReplayExtent; + + template < + typename node_key_t, + typename node_val_t, + typename internal_node_t, + typename leaf_node_t, + typename pin_t, + size_t node_size, + bool leaf_has_children> + friend class FixedKVBtree; + uint32_t last_committed_crc = 0; + + // Points at current version while in state MUTATION_PENDING + CachedExtentRef prior_instance; + + // time of the last modification + sea_time_point modify_time = NULL_TIME; + +public: + void init(extent_state_t _state, + paddr_t paddr, + placement_hint_t hint, + rewrite_gen_t gen, + transaction_id_t trans_id) { + assert(gen == NULL_GENERATION || is_rewrite_generation(gen)); + state = _state; + set_paddr(paddr); + user_hint = hint; + rewrite_generation = gen; + pending_for_transaction = trans_id; + } + + void set_modify_time(sea_time_point t) { + modify_time = t; + } + + sea_time_point get_modify_time() const { + return modify_time; + } + + /** + * duplicate_for_write + * + * Implementation should return a fresh CachedExtentRef + * which represents a copy of *this until on_delta_write() + * is complete, at which point the user may assume *this + * will be in state INVALID. As such, the implementation + * may involve a copy of get_bptr(), or an ancillary + * structure which defers updating the actual buffer until + * on_delta_write(). 
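+ *
+ * (For a concrete example, CollectionNode::duplicate_for_write() in
+ * collection_flat_node.h below returns a deep copy built by its copy
+ * constructor; other extents may instead defer the buffer copy as
+ * described above.)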
+ */ + virtual CachedExtentRef duplicate_for_write(Transaction &t) = 0; + + /** + * prepare_write + * + * Called prior to reading buffer. + * Implemenation may use this callback to fully write out + * updates to the buffer. + */ + virtual void prepare_write() {} + + /** + * prepare_commit + * + * Called prior to committing the transaction in which this extent + * is living. + */ + virtual void prepare_commit() {} + + /** + * on_initial_write + * + * Called after commit of extent. State will be CLEAN. + * Implentation may use this call to fixup the buffer + * with the newly available absolute get_paddr(). + */ + virtual void on_initial_write() {} + + /** + * on_clean_read + * + * Called after read of initially written extent. + * State will be CLEAN. Implentation may use this + * call to fixup the buffer with the newly available + * absolute get_paddr(). + */ + virtual void on_clean_read() {} + + /** + * on_delta_write + * + * Called after commit of delta. State will be DIRTY. + * Implentation may use this call to fixup any relative + * references in the the buffer with the passed + * record_block_offset record location. + */ + virtual void on_delta_write(paddr_t record_block_offset) {} + + /** + * on_replace_prior + * + * Called after the extent has replaced a previous one. State + * of the extent must be MUTATION_PENDING. Implementation + * may use this call to synchronize states that must be synchronized + * with the states of Cache and can't wait till transaction + * completes. + */ + virtual void on_replace_prior(Transaction &t) {} + + /** + * on_invalidated + * + * Called after the extent is invalidated, either by Cache::invalidate_extent + * or Transaction::add_to_retired_set. Implementation may use this + * call to adjust states that must be changed immediately once + * invalidated. + */ + virtual void on_invalidated(Transaction &t) {} + /** + * get_type + * + * Returns concrete type. + */ + virtual extent_types_t get_type() const = 0; + + virtual bool is_logical() const { + return false; + } + + virtual bool may_conflict() const { + return true; + } + + friend std::ostream &operator<<(std::ostream &, extent_state_t); + virtual std::ostream &print_detail(std::ostream &out) const { return out; } + std::ostream &print(std::ostream &out) const { + std::string prior_poffset_str = prior_poffset + ? fmt::format("{}", *prior_poffset) + : "nullopt"; + out << "CachedExtent(addr=" << this + << ", type=" << get_type() + << ", version=" << version + << ", dirty_from_or_retired_at=" << dirty_from_or_retired_at + << ", modify_time=" << sea_time_point_printer_t{modify_time} + << ", paddr=" << get_paddr() + << ", prior_paddr=" << prior_poffset_str + << ", length=" << get_length() + << ", state=" << state + << ", last_committed_crc=" << last_committed_crc + << ", refcount=" << use_count() + << ", user_hint=" << user_hint + << ", fully_loaded=" << is_fully_loaded() + << ", rewrite_gen=" << rewrite_gen_printer_t{rewrite_generation}; + if (state != extent_state_t::INVALID && + state != extent_state_t::CLEAN_PENDING) { + print_detail(out); + } + return out << ")"; + } + + /** + * get_delta + * + * Must return a valid delta usable in apply_delta() in submit_transaction + * if state == MUTATION_PENDING. + */ + virtual ceph::bufferlist get_delta() = 0; + + /** + * apply_delta + * + * bl is a delta obtained previously from get_delta. The versions will + * match. Implementation should mutate buffer based on bl. base matches + * the address passed on_delta_write. 
+ * + * Implementation *must* use set_last_committed_crc to update the crc to + * what the crc of the buffer would have been at submission. For physical + * extents that use base to adjust internal record-relative deltas, this + * means that the crc should be of the buffer after applying the delta, + * but before that adjustment. We do it this way because the crc in the + * commit path does not yet know the record base address. + * + * LogicalCachedExtent overrides this method and provides a simpler + * apply_delta override for LogicalCachedExtent implementers. + */ + virtual void apply_delta_and_adjust_crc( + paddr_t base, const ceph::bufferlist &bl) = 0; + + /** + * Called on dirty CachedExtent implementation after replay. + * Implementation should perform any reads/in-memory-setup + * necessary. (for instance, the lba implementation will use this + * to load in lba_manager blocks) + */ + using complete_load_ertr = crimson::errorator< + crimson::ct_error::input_output_error>; + virtual complete_load_ertr::future<> complete_load() { + return complete_load_ertr::now(); + } + + /** + * cast + * + * Returns a TCachedExtentRef of the specified type. + * TODO: add dynamic check that the requested type is actually correct. + */ + template <typename T> + TCachedExtentRef<T> cast() { + return TCachedExtentRef<T>(static_cast<T*>(this)); + } + template <typename T> + TCachedExtentRef<const T> cast() const { + return TCachedExtentRef<const T>(static_cast<const T*>(this)); + } + + /// Returns true if extent can be mutated in an open transaction + bool is_mutable() const { + return state == extent_state_t::INITIAL_WRITE_PENDING || + state == extent_state_t::MUTATION_PENDING || + state == extent_state_t::EXIST_MUTATION_PENDING; + } + + /// Returns true if extent is part of an open transaction + bool is_pending() const { + return is_mutable() || state == extent_state_t::EXIST_CLEAN; + } + + /// Returns true if extent is stable and shared among transactions + bool is_stable() const { + return state == extent_state_t::CLEAN_PENDING || + state == extent_state_t::CLEAN || + state == extent_state_t::DIRTY; + } + + /// Returns true if extent has a pending delta + bool is_mutation_pending() const { + return state == extent_state_t::MUTATION_PENDING; + } + + /// Returns true if extent is a fresh extent + bool is_initial_pending() const { + return state == extent_state_t::INITIAL_WRITE_PENDING; + } + + /// Returns true if extent is clean (does not have deltas on disk) + bool is_clean() const { + ceph_assert(is_valid()); + return state == extent_state_t::INITIAL_WRITE_PENDING || + state == extent_state_t::CLEAN || + state == extent_state_t::CLEAN_PENDING || + state == extent_state_t::EXIST_CLEAN; + } + + // Returs true if extent is stable and clean + bool is_stable_clean() const { + ceph_assert(is_valid()); + return state == extent_state_t::CLEAN || + state == extent_state_t::CLEAN_PENDING; + } + + /// Ruturns true if data is persisted while metadata isn't + bool is_exist_clean() const { + return state == extent_state_t::EXIST_CLEAN; + } + + /// Returns true if the extent with EXTIST_CLEAN is modified + bool is_exist_mutation_pending() const { + return state == extent_state_t::EXIST_MUTATION_PENDING; + } + + /// Returns true if extent is dirty (has deltas on disk) + bool is_dirty() const { + ceph_assert(is_valid()); + return !is_clean(); + } + + /// Returns true if extent has not been superceded or retired + bool is_valid() const { + return state != extent_state_t::INVALID; + } + + /// Returns true if extent or 
prior_instance has been invalidated + bool has_been_invalidated() const { + return !is_valid() || (is_mutation_pending() && !prior_instance->is_valid()); + } + + /// Returns true if extent is a plcaeholder + bool is_placeholder() const { + return get_type() == extent_types_t::RETIRED_PLACEHOLDER; + } + + bool is_pending_io() const { + return !!io_wait_promise; + } + + /// Return journal location of oldest relevant delta, only valid while DIRTY + auto get_dirty_from() const { + ceph_assert(is_dirty()); + return dirty_from_or_retired_at; + } + + /// Return journal location of oldest relevant delta, only valid while RETIRED + auto get_retired_at() const { + ceph_assert(!is_valid()); + return dirty_from_or_retired_at; + } + + /// Return true if extent is fully loaded or is about to be fully loaded (call + /// wait_io() in this case) + bool is_fully_loaded() const { + return ptr.has_value(); + } + + /** + * get_paddr + * + * Returns current address of extent. If is_initial_pending(), address will + * be relative, otherwise address will be absolute. + */ + paddr_t get_paddr() const { return poffset; } + + /// Returns length of extent data in disk + extent_len_t get_length() const { + return length; + } + + extent_len_t get_loaded_length() const { + if (ptr.has_value()) { + return ptr->length(); + } else { + return 0; + } + } + + /// Returns version, get_version() == 0 iff is_clean() + extent_version_t get_version() const { + return version; + } + + /// Returns crc32c of buffer + uint32_t get_crc32c() { + return ceph_crc32c( + 1, + reinterpret_cast<const unsigned char *>(get_bptr().c_str()), + get_length()); + } + + /// Get ref to raw buffer + bufferptr &get_bptr() { + assert(ptr.has_value()); + return *ptr; + } + const bufferptr &get_bptr() const { + assert(ptr.has_value()); + return *ptr; + } + + /// Compare by paddr + friend bool operator< (const CachedExtent &a, const CachedExtent &b) { + return a.poffset < b.poffset; + } + friend bool operator> (const CachedExtent &a, const CachedExtent &b) { + return a.poffset > b.poffset; + } + friend bool operator== (const CachedExtent &a, const CachedExtent &b) { + return a.poffset == b.poffset; + } + + virtual ~CachedExtent(); + + placement_hint_t get_user_hint() const { + return user_hint; + } + + rewrite_gen_t get_rewrite_generation() const { + return rewrite_generation; + } + + void invalidate_hints() { + user_hint = PLACEMENT_HINT_NULL; + rewrite_generation = NULL_GENERATION; + } + + /// assign the target rewrite generation for the followup rewrite + void set_target_rewrite_generation(rewrite_gen_t gen) { + assert(is_target_rewrite_generation(gen)); + + user_hint = placement_hint_t::REWRITE; + rewrite_generation = gen; + } + + bool is_inline() const { + return poffset.is_relative(); + } + + paddr_t get_prior_paddr_and_reset() { + assert(prior_poffset); + auto ret = *prior_poffset; + prior_poffset.reset(); + return ret; + } + + void set_invalid(Transaction &t); + + // a rewrite extent has an invalid prior_instance, + // and a mutation_pending extent has a valid prior_instance + CachedExtentRef get_prior_instance() { + return prior_instance; + } + +private: + template <typename T> + friend class read_set_item_t; + + friend struct paddr_cmp; + friend struct ref_paddr_cmp; + friend class ExtentIndex; + + /// Pointer to containing index (or null) + ExtentIndex *parent_index = nullptr; + + /// hook for intrusive extent_index + boost::intrusive::set_member_hook<> extent_index_hook; + using index_member_options = boost::intrusive::member_hook< + 
CachedExtent, + boost::intrusive::set_member_hook<>, + &CachedExtent::extent_index_hook>; + using index = boost::intrusive::set<CachedExtent, index_member_options>; + friend class ExtentIndex; + friend class Transaction; + + bool is_linked() { + return extent_index_hook.is_linked(); + } + + /// set bufferptr + void set_bptr(ceph::bufferptr &&nptr) { + ptr = nptr; + } + + /// Returns true if the extent part of the open transaction + bool is_pending_in_trans(transaction_id_t id) const { + return is_pending() && pending_for_transaction == id; + } + + /// hook for intrusive ref list (mainly dirty or lru list) + boost::intrusive::list_member_hook<> primary_ref_list_hook; + using primary_ref_list_member_options = boost::intrusive::member_hook< + CachedExtent, + boost::intrusive::list_member_hook<>, + &CachedExtent::primary_ref_list_hook>; + using list = boost::intrusive::list< + CachedExtent, + primary_ref_list_member_options>; + + /** + * dirty_from_or_retired_at + * + * Encodes ordering token for primary_ref_list -- dirty_from when + * dirty or retired_at if retired. + */ + journal_seq_t dirty_from_or_retired_at; + + /// cache data contents, std::nullopt if no data in cache + std::optional<ceph::bufferptr> ptr; + + /// disk data length + extent_len_t length; + + /// number of deltas since initial write + extent_version_t version = 0; + + /// address of original block -- record relative iff is_initial_pending() + paddr_t poffset; + + /// relative address before ool write, used to update mapping + std::optional<paddr_t> prior_poffset = std::nullopt; + + /// used to wait while in-progress commit completes + std::optional<seastar::shared_promise<>> io_wait_promise; + void set_io_wait() { + ceph_assert(!io_wait_promise); + io_wait_promise = seastar::shared_promise<>(); + } + void complete_io() { + ceph_assert(io_wait_promise); + io_wait_promise->set_value(); + io_wait_promise = std::nullopt; + } + + seastar::future<> wait_io() { + if (!io_wait_promise) { + return seastar::now(); + } else { + return io_wait_promise->get_shared_future(); + } + } + + CachedExtent* get_transactional_view(Transaction &t); + CachedExtent* get_transactional_view(transaction_id_t tid); + + read_set_item_t<Transaction>::trans_set_t transactions; + + placement_hint_t user_hint = PLACEMENT_HINT_NULL; + + // the target rewrite generation for the followup rewrite + // or the rewrite generation for the fresh write + rewrite_gen_t rewrite_generation = NULL_GENERATION; + +protected: + trans_view_set_t mutation_pendings; + + CachedExtent(CachedExtent &&other) = delete; + CachedExtent(ceph::bufferptr &&_ptr) : ptr(std::move(_ptr)) { + length = ptr->length(); + assert(length > 0); + } + + /// construct new CachedExtent, will deep copy the buffer + CachedExtent(const CachedExtent &other) + : state(other.state), + dirty_from_or_retired_at(other.dirty_from_or_retired_at), + length(other.get_length()), + version(other.version), + poffset(other.poffset) { + assert((length % CEPH_PAGE_SIZE) == 0); + if (other.is_fully_loaded()) { + ptr.emplace(buffer::create_page_aligned(length)); + other.ptr->copy_out(0, length, ptr->c_str()); + } else { + // the extent must be fully loaded before CoW + assert(length == 0); // in case of root + } + } + + struct share_buffer_t {}; + /// construct new CachedExtent, will shallow copy the buffer + CachedExtent(const CachedExtent &other, share_buffer_t) + : state(other.state), + dirty_from_or_retired_at(other.dirty_from_or_retired_at), + ptr(other.ptr), + length(other.get_length()), + version(other.version), + 
poffset(other.poffset) {} + + // 0 length is only possible for the RootBlock + struct zero_length_t {}; + CachedExtent(zero_length_t) : ptr(ceph::bufferptr(0)), length(0) {}; + + struct retired_placeholder_t{}; + CachedExtent(retired_placeholder_t, extent_len_t _length) + : state(extent_state_t::INVALID), + length(_length) { + assert(length > 0); + } + + /// no buffer extent, for lazy read + CachedExtent(extent_len_t _length) : length(_length) { + assert(length > 0); + } + + friend class Cache; + template <typename T, typename... Args> + static TCachedExtentRef<T> make_cached_extent_ref( + Args&&... args) { + return new T(std::forward<Args>(args)...); + } + + template <typename T> + static TCachedExtentRef<T> make_placeholder_cached_extent_ref( + extent_len_t length) { + return new T(length); + } + + void reset_prior_instance() { + prior_instance.reset(); + } + + /// Sets last_committed_crc + void set_last_committed_crc(uint32_t crc) { + last_committed_crc = crc; + } + + void set_paddr(paddr_t offset, bool need_update_mapping = false) { + if (need_update_mapping) { + assert(!prior_poffset); + prior_poffset = poffset; + } + poffset = offset; + } + + /** + * maybe_generate_relative + * + * There are three kinds of addresses one might want to + * store within an extent: + * - addr for a block within the same transaction relative to the + * physical location of this extent in the + * event that we will read it in the initial read of the extent + * - addr relative to the physical location of the next record to a + * block within that record to contain a delta for this extent in + * the event that we'll read it from a delta and overlay it onto a + * dirty representation of the extent. + * - absolute addr to a block already written outside of the current + * transaction. + * + * This helper checks addr and the current state to create the correct + * reference. 
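+ *
+ * In short: while this extent is INITIAL_WRITE_PENDING, a record-relative
+ * addr is rebased to be relative to this extent's own (still
+ * record-relative) paddr; otherwise a record-relative addr is only legal
+ * on a MUTATION_PENDING extent and is returned unchanged.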
+ */ + paddr_t maybe_generate_relative(paddr_t addr) { + if (is_initial_pending() && addr.is_record_relative()) { + return addr.block_relative_to(get_paddr()); + } else { + ceph_assert(!addr.is_record_relative() || is_mutation_pending()); + return addr; + } + } + + friend class crimson::os::seastore::SegmentedAllocator; + friend class crimson::os::seastore::TransactionManager; + friend class crimson::os::seastore::ExtentPlacementManager; + template <typename, typename> + friend class BtreeNodeMapping; + friend class ::btree_lba_manager_test; +}; + +std::ostream &operator<<(std::ostream &, CachedExtent::extent_state_t); +std::ostream &operator<<(std::ostream &, const CachedExtent&); + +bool is_backref_mapped_extent_node(const CachedExtentRef &extent); + +/// Compare extents by paddr +struct paddr_cmp { + bool operator()(paddr_t lhs, const CachedExtent &rhs) const { + return lhs < rhs.poffset; + } + bool operator()(const CachedExtent &lhs, paddr_t rhs) const { + return lhs.poffset < rhs; + } +}; + +/// Compare extent refs by paddr +struct ref_paddr_cmp { + using is_transparent = paddr_t; + bool operator()(const CachedExtentRef &lhs, const CachedExtentRef &rhs) const { + return lhs->poffset < rhs->poffset; + } + bool operator()(const paddr_t &lhs, const CachedExtentRef &rhs) const { + return lhs < rhs->poffset; + } + bool operator()(const CachedExtentRef &lhs, const paddr_t &rhs) const { + return lhs->poffset < rhs; + } +}; + +template <typename T, typename C> +class addr_extent_list_base_t + : public std::list<std::pair<T, C>> {}; + +using pextent_list_t = addr_extent_list_base_t<paddr_t, CachedExtentRef>; + +template <typename T, typename C, typename Cmp> +class addr_extent_set_base_t + : public std::set<C, Cmp> {}; + +using pextent_set_t = addr_extent_set_base_t< + paddr_t, + CachedExtentRef, + ref_paddr_cmp + >; + +template <typename T> +using t_pextent_list_t = addr_extent_list_base_t<paddr_t, TCachedExtentRef<T>>; + +/** + * ExtentIndex + * + * Index of CachedExtent & by poffset, does not hold a reference, + * user must ensure each extent is removed prior to deletion + */ +class ExtentIndex { + friend class Cache; + CachedExtent::index extent_index; +public: + auto get_overlap(paddr_t addr, extent_len_t len) { + auto bottom = extent_index.upper_bound(addr, paddr_cmp()); + if (bottom != extent_index.begin()) + --bottom; + if (bottom != extent_index.end() && + bottom->get_paddr().add_offset(bottom->get_length()) <= addr) + ++bottom; + + auto top = extent_index.lower_bound(addr.add_offset(len), paddr_cmp()); + return std::make_pair( + bottom, + top + ); + } + + void clear() { + struct cached_extent_disposer { + void operator() (CachedExtent* extent) { + extent->parent_index = nullptr; + } + }; + extent_index.clear_and_dispose(cached_extent_disposer()); + bytes = 0; + } + + void insert(CachedExtent &extent) { + // sanity check + ceph_assert(!extent.parent_index); + auto [a, b] = get_overlap( + extent.get_paddr(), + extent.get_length()); + ceph_assert(a == b); + + [[maybe_unused]] auto [iter, inserted] = extent_index.insert(extent); + assert(inserted); + extent.parent_index = this; + + bytes += extent.get_length(); + } + + void erase(CachedExtent &extent) { + assert(extent.parent_index); + assert(extent.is_linked()); + [[maybe_unused]] auto erased = extent_index.erase( + extent_index.s_iterator_to(extent)); + extent.parent_index = nullptr; + + assert(erased); + bytes -= extent.get_length(); + } + + void replace(CachedExtent &to, CachedExtent &from) { + assert(to.get_length() == 
from.get_length()); + extent_index.replace_node(extent_index.s_iterator_to(from), to); + from.parent_index = nullptr; + to.parent_index = this; + } + + bool empty() const { + return extent_index.empty(); + } + + auto find_offset(paddr_t offset) { + return extent_index.find(offset, paddr_cmp()); + } + + auto begin() { + return extent_index.begin(); + } + + auto end() { + return extent_index.end(); + } + + auto size() const { + return extent_index.size(); + } + + auto get_bytes() const { + return bytes; + } + + ~ExtentIndex() { + assert(extent_index.empty()); + assert(bytes == 0); + } + +private: + uint64_t bytes = 0; +}; + +class ChildableCachedExtent; +class LogicalCachedExtent; + +class child_pos_t { +public: + child_pos_t(CachedExtentRef stable_parent, uint16_t pos) + : stable_parent(stable_parent), pos(pos) {} + + template <typename parent_t> + TCachedExtentRef<parent_t> get_parent() { + ceph_assert(stable_parent); + return stable_parent->template cast<parent_t>(); + } + uint16_t get_pos() { + return pos; + } + void link_child(ChildableCachedExtent *c); +private: + CachedExtentRef stable_parent; + uint16_t pos = std::numeric_limits<uint16_t>::max(); +}; + +using get_child_ertr = crimson::errorator< + crimson::ct_error::input_output_error>; +template <typename T> +struct get_child_ret_t { + std::variant<child_pos_t, get_child_ertr::future<TCachedExtentRef<T>>> ret; + get_child_ret_t(child_pos_t pos) + : ret(std::move(pos)) {} + get_child_ret_t(get_child_ertr::future<TCachedExtentRef<T>> child) + : ret(std::move(child)) {} + + bool has_child() const { + return ret.index() == 1; + } + + child_pos_t &get_child_pos() { + ceph_assert(ret.index() == 0); + return std::get<0>(ret); + } + + get_child_ertr::future<TCachedExtentRef<T>> &get_child_fut() { + ceph_assert(ret.index() == 1); + return std::get<1>(ret); + } +}; + +template <typename key_t, typename> +class PhysicalNodeMapping; + +template <typename key_t, typename val_t> +using PhysicalNodeMappingRef = std::unique_ptr<PhysicalNodeMapping<key_t, val_t>>; + +template <typename key_t, typename val_t> +class PhysicalNodeMapping { +public: + virtual extent_len_t get_length() const = 0; + virtual extent_types_t get_type() const = 0; + virtual val_t get_val() const = 0; + virtual key_t get_key() const = 0; + virtual PhysicalNodeMappingRef<key_t, val_t> duplicate() const = 0; + virtual bool has_been_invalidated() const = 0; + virtual CachedExtentRef get_parent() const = 0; + virtual uint16_t get_pos() const = 0; + // An lba pin may be indirect, see comments in lba_manager/btree/btree_lba_manager.h + virtual bool is_indirect() const { return false; } + virtual key_t get_intermediate_key() const { return min_max_t<key_t>::null; } + virtual key_t get_intermediate_base() const { return min_max_t<key_t>::null; } + virtual extent_len_t get_intermediate_length() const { return 0; } + // The start offset of the pin, must be 0 if the pin is not indirect + virtual extent_len_t get_intermediate_offset() const { + return std::numeric_limits<extent_len_t>::max(); + } + + virtual get_child_ret_t<LogicalCachedExtent> + get_logical_extent(Transaction &t) = 0; + + void link_child(ChildableCachedExtent *c) { + ceph_assert(child_pos); + child_pos->link_child(c); + } + + virtual ~PhysicalNodeMapping() {} +protected: + std::optional<child_pos_t> child_pos = std::nullopt; +}; + +using LBAMapping = PhysicalNodeMapping<laddr_t, paddr_t>; +using LBAMappingRef = PhysicalNodeMappingRef<laddr_t, paddr_t>; + +std::ostream &operator<<(std::ostream &out, const LBAMapping 
&rhs); + +using lba_pin_list_t = std::list<LBAMappingRef>; + +std::ostream &operator<<(std::ostream &out, const lba_pin_list_t &rhs); + +using BackrefMapping = PhysicalNodeMapping<paddr_t, laddr_t>; +using BackrefMappingRef = PhysicalNodeMappingRef<paddr_t, laddr_t>; + +using backref_pin_list_t = std::list<BackrefMappingRef>; + +/** + * RetiredExtentPlaceholder + * + * Cache::retire_extent_addr(Transaction&, paddr_t, extent_len_t) can retire an + * extent not currently in cache. In that case, in order to detect transaction + * invalidation, we need to add a placeholder to the cache to create the + * mapping back to the transaction. And whenever there is a transaction tries + * to read the placeholder extent out, Cache is responsible to replace the + * placeholder by the real one. Anyway, No placeholder extents should escape + * the Cache interface boundary. + */ +class RetiredExtentPlaceholder : public CachedExtent { + +public: + RetiredExtentPlaceholder(extent_len_t length) + : CachedExtent(CachedExtent::retired_placeholder_t{}, length) {} + + CachedExtentRef duplicate_for_write(Transaction&) final { + ceph_assert(0 == "Should never happen for a placeholder"); + return CachedExtentRef(); + } + + ceph::bufferlist get_delta() final { + ceph_assert(0 == "Should never happen for a placeholder"); + return ceph::bufferlist(); + } + + static constexpr extent_types_t TYPE = extent_types_t::RETIRED_PLACEHOLDER; + extent_types_t get_type() const final { + return TYPE; + } + + void apply_delta_and_adjust_crc( + paddr_t base, const ceph::bufferlist &bl) final { + ceph_assert(0 == "Should never happen for a placeholder"); + } + + bool is_logical() const final { + return false; + } + + std::ostream &print_detail(std::ostream &out) const final { + return out << ", RetiredExtentPlaceholder"; + } + + void on_delta_write(paddr_t record_block_offset) final { + ceph_assert(0 == "Should never happen for a placeholder"); + } +}; + +class parent_tracker_t + : public boost::intrusive_ref_counter< + parent_tracker_t, boost::thread_unsafe_counter> { +public: + parent_tracker_t(CachedExtentRef parent) + : parent(parent) {} + parent_tracker_t(CachedExtent* parent) + : parent(parent) {} + ~parent_tracker_t(); + template <typename T = CachedExtent> + TCachedExtentRef<T> get_parent() const { + ceph_assert(parent); + if constexpr (std::is_same_v<T, CachedExtent>) { + return parent; + } else { + return parent->template cast<T>(); + } + } + void reset_parent(CachedExtentRef p) { + parent = p; + } + bool is_valid() const { + return parent && parent->is_valid(); + } +private: + CachedExtentRef parent; +}; + +std::ostream &operator<<(std::ostream &, const parent_tracker_t &); + +using parent_tracker_ref = boost::intrusive_ptr<parent_tracker_t>; + +class ChildableCachedExtent : public CachedExtent { +public: + template <typename... T> + ChildableCachedExtent(T&&... t) : CachedExtent(std::forward<T>(t)...) 
{} + bool has_parent_tracker() const { + return (bool)parent_tracker; + } + void reset_parent_tracker(parent_tracker_t *p = nullptr) { + parent_tracker.reset(p); + } + bool is_parent_valid() const { + return parent_tracker && parent_tracker->is_valid(); + } + template <typename T = CachedExtent> + TCachedExtentRef<T> get_parent_node() const { + assert(parent_tracker); + return parent_tracker->template get_parent<T>(); + } + void take_prior_parent_tracker() { + auto &prior = (ChildableCachedExtent&)(*get_prior_instance()); + parent_tracker = prior.parent_tracker; + } + std::ostream &print_detail(std::ostream &out) const final; +private: + parent_tracker_ref parent_tracker; + virtual std::ostream &_print_detail(std::ostream &out) const { + return out; + } +}; +/** + * LogicalCachedExtent + * + * CachedExtent with associated lba mapping. + * + * Users of TransactionManager should be using extents derived from + * LogicalCachedExtent. + */ +class LogicalCachedExtent : public ChildableCachedExtent { +public: + template <typename... T> + LogicalCachedExtent(T&&... t) + : ChildableCachedExtent(std::forward<T>(t)...) + {} + + bool has_laddr() const { + return laddr != L_ADDR_NULL; + } + + laddr_t get_laddr() const { + assert(laddr != L_ADDR_NULL); + return laddr; + } + + void set_laddr(laddr_t nladdr) { + laddr = nladdr; + } + + void maybe_set_intermediate_laddr(LBAMapping &mapping) { + laddr = mapping.is_indirect() + ? mapping.get_intermediate_base() + : mapping.get_key(); + } + + void apply_delta_and_adjust_crc( + paddr_t base, const ceph::bufferlist &bl) final { + apply_delta(bl); + set_last_committed_crc(get_crc32c()); + } + + bool is_logical() const final { + return true; + } + + std::ostream &_print_detail(std::ostream &out) const final; + + void on_replace_prior(Transaction &t) final; + + virtual ~LogicalCachedExtent(); +protected: + + virtual void apply_delta(const ceph::bufferlist &bl) = 0; + virtual std::ostream &print_detail_l(std::ostream &out) const { + return out; + } + + virtual void logical_on_delta_write() {} + + void on_delta_write(paddr_t record_block_offset) final { + assert(is_exist_mutation_pending() || + get_prior_instance()); + logical_on_delta_write(); + } + +private: + // the logical address of the extent, and if shared, + // it is the intermediate_base, see BtreeLBAMapping comments. 
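+ // (maybe_set_intermediate_laddr() above selects the mapping's
+ // intermediate_base when the mapping is indirect.)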
+ laddr_t laddr = L_ADDR_NULL; +}; + +using LogicalCachedExtentRef = TCachedExtentRef<LogicalCachedExtent>; +struct ref_laddr_cmp { + using is_transparent = laddr_t; + bool operator()(const LogicalCachedExtentRef &lhs, + const LogicalCachedExtentRef &rhs) const { + return lhs->get_laddr() < rhs->get_laddr(); + } + bool operator()(const laddr_t &lhs, + const LogicalCachedExtentRef &rhs) const { + return lhs < rhs->get_laddr(); + } + bool operator()(const LogicalCachedExtentRef &lhs, + const laddr_t &rhs) const { + return lhs->get_laddr() < rhs; + } +}; + +template <typename T> +read_set_item_t<T>::read_set_item_t(T *t, CachedExtentRef ref) + : t(t), ref(ref) +{} + +template <typename T> +inline bool read_set_item_t<T>::cmp_t::operator()( + const read_set_item_t<T> &lhs, const read_set_item_t<T> &rhs) const { + return lhs.ref->poffset < rhs.ref->poffset; +} +template <typename T> +inline bool read_set_item_t<T>::cmp_t::operator()( + const paddr_t &lhs, const read_set_item_t<T> &rhs) const { + return lhs < rhs.ref->poffset; +} +template <typename T> +inline bool read_set_item_t<T>::cmp_t::operator()( + const read_set_item_t<T> &lhs, const paddr_t &rhs) const { + return lhs.ref->poffset < rhs; +} + +using lextent_set_t = addr_extent_set_base_t< + laddr_t, + LogicalCachedExtentRef, + ref_laddr_cmp + >; + +template <typename T> +using lextent_list_t = addr_extent_list_base_t< + laddr_t, TCachedExtentRef<T>>; + +} + +#if FMT_VERSION >= 90000 +template <> struct fmt::formatter<crimson::os::seastore::lba_pin_list_t> : fmt::ostream_formatter {}; +template <> struct fmt::formatter<crimson::os::seastore::CachedExtent> : fmt::ostream_formatter {}; +template <> struct fmt::formatter<crimson::os::seastore::LogicalCachedExtent> : fmt::ostream_formatter {}; +template <> struct fmt::formatter<crimson::os::seastore::LBAMapping> : fmt::ostream_formatter {}; +#endif diff --git a/src/crimson/os/seastore/collection_manager.cc b/src/crimson/os/seastore/collection_manager.cc new file mode 100644 index 000000000..4f5b58d01 --- /dev/null +++ b/src/crimson/os/seastore/collection_manager.cc @@ -0,0 +1,14 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +// +#include "crimson/os/seastore/collection_manager.h" +#include "crimson/os/seastore/transaction_manager.h" +#include "crimson/os/seastore/collection_manager/flat_collection_manager.h" + +namespace crimson::os::seastore::collection_manager { + +CollectionManagerRef create_coll_manager(TransactionManager &trans_manager) { + return CollectionManagerRef(new FlatCollectionManager(trans_manager)); +} + +} diff --git a/src/crimson/os/seastore/collection_manager.h b/src/crimson/os/seastore/collection_manager.h new file mode 100644 index 000000000..37913abb4 --- /dev/null +++ b/src/crimson/os/seastore/collection_manager.h @@ -0,0 +1,84 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <seastar/core/future.hh> + +#include "osd/osd_types.h" + +#include "crimson/os/seastore/seastore_types.h" +#include "crimson/os/seastore/transaction_manager.h" + +namespace crimson::os::seastore { + +struct coll_info_t { + unsigned split_bits; + + coll_info_t(unsigned bits) + : split_bits(bits) {} + + bool operator==(const coll_info_t &rhs) const { + return split_bits == rhs.split_bits; + } +}; + +/// Interface for maintaining set of collections +class CollectionManager { +public: + using base_iertr = TransactionManager::read_extent_iertr; + + /// Initialize 
collection manager instance for an empty store + using mkfs_iertr = TransactionManager::alloc_extent_iertr; + using mkfs_ret = mkfs_iertr::future<coll_root_t>; + virtual mkfs_ret mkfs( + Transaction &t) = 0; + + /// Create collection + using create_iertr = base_iertr; + using create_ret = create_iertr::future<>; + virtual create_ret create( + coll_root_t &root, + Transaction &t, + coll_t cid, + coll_info_t info + ) = 0; + + /// List collections with info + using list_iertr = base_iertr; + using list_ret_bare = std::vector<std::pair<coll_t, coll_info_t>>; + using list_ret = list_iertr::future<list_ret_bare>; + virtual list_ret list( + const coll_root_t &root, + Transaction &t) = 0; + + /// Remove cid + using remove_iertr = base_iertr; + using remove_ret = remove_iertr::future<>; + virtual remove_ret remove( + const coll_root_t &coll_root, + Transaction &t, + coll_t cid) = 0; + + /// Update info for cid + using update_iertr = base_iertr; + using update_ret = base_iertr::future<>; + virtual update_ret update( + const coll_root_t &coll_root, + Transaction &t, + coll_t cid, + coll_info_t info + ) = 0; + + virtual ~CollectionManager() {} +}; +using CollectionManagerRef = std::unique_ptr<CollectionManager>; + +namespace collection_manager { +/* creat CollectionMapManager for Collection */ +CollectionManagerRef create_coll_manager( + TransactionManager &trans_manager); + +} + +} diff --git a/src/crimson/os/seastore/collection_manager/collection_flat_node.cc b/src/crimson/os/seastore/collection_manager/collection_flat_node.cc new file mode 100644 index 000000000..ed17e2b12 --- /dev/null +++ b/src/crimson/os/seastore/collection_manager/collection_flat_node.cc @@ -0,0 +1,120 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "include/buffer.h" +#include "osd/osd_types.h" +#include "crimson/os/seastore/transaction_manager.h" +#include "crimson/os/seastore/collection_manager/collection_flat_node.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_seastore); + } +} + +namespace crimson::os::seastore::collection_manager { + +void delta_t::replay(coll_map_t &l) const +{ + switch (op) { + case op_t::INSERT: { + l.insert(coll, bits); + break; + } + case op_t::UPDATE: { + l.update(coll, bits); + break; + } + case op_t::REMOVE: { + l.erase(coll); + break; + } + case op_t::INVALID: { + assert(0 == "impossible"); + break; + } + __builtin_unreachable(); + } +} + + +std::ostream &CollectionNode::print_detail_l(std::ostream &out) const +{ + return out; +} + +CollectionNode::list_ret +CollectionNode::list() +{ + logger().debug("CollectionNode:{}, {}", __func__, *this); + CollectionManager::list_ret_bare list_result; + for (auto &[coll, bits] : decoded) { + list_result.emplace_back(coll, bits); + } + return list_ret( + interruptible::ready_future_marker{}, + std::move(list_result)); +} + +CollectionNode::create_ret +CollectionNode::create(coll_context_t cc, coll_t coll, unsigned bits) +{ + logger().debug("CollectionNode:{}", __func__); + if (!is_mutable()) { + auto mut = cc.tm.get_mutable_extent(cc.t, this)->cast<CollectionNode>(); + return mut->create(cc, coll, bits); + } + logger().debug("CollectionNode::create {} {} {}", coll, bits, *this); + auto [iter, inserted] = decoded.insert(coll, bits); + assert(inserted); + if (encoded_sizeof((base_coll_map_t&)decoded) > get_bptr().length()) { + decoded.erase(iter); + return create_ret( + interruptible::ready_future_marker{}, + create_result_t::OVERFLOW); + } else { + if 
(auto buffer = maybe_get_delta_buffer(); buffer) { + buffer->insert(coll, bits); + } + copy_to_node(); + return create_ret( + interruptible::ready_future_marker{}, + create_result_t::SUCCESS); + } +} + +CollectionNode::update_ret +CollectionNode::update(coll_context_t cc, coll_t coll, unsigned bits) +{ + logger().debug("trans.{} CollectionNode:{} {} {}", + cc.t.get_trans_id(), __func__, coll, bits); + if (!is_mutable()) { + auto mut = cc.tm.get_mutable_extent(cc.t, this)->cast<CollectionNode>(); + return mut->update(cc, coll, bits); + } + if (auto buffer = maybe_get_delta_buffer(); buffer) { + buffer->update(coll, bits); + } + decoded.update(coll, bits); + copy_to_node(); + return seastar::now(); +} + +CollectionNode::remove_ret +CollectionNode::remove(coll_context_t cc, coll_t coll) +{ + logger().debug("trans.{} CollectionNode:{} {}", + cc.t.get_trans_id(),__func__, coll); + if (!is_mutable()) { + auto mut = cc.tm.get_mutable_extent(cc.t, this)->cast<CollectionNode>(); + return mut->remove(cc, coll); + } + if (auto buffer = maybe_get_delta_buffer(); buffer) { + buffer->remove(coll); + } + decoded.remove(coll); + copy_to_node(); + return seastar::now(); +} + +} diff --git a/src/crimson/os/seastore/collection_manager/collection_flat_node.h b/src/crimson/os/seastore/collection_manager/collection_flat_node.h new file mode 100644 index 000000000..2690fb5fd --- /dev/null +++ b/src/crimson/os/seastore/collection_manager/collection_flat_node.h @@ -0,0 +1,184 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "crimson/os/seastore/seastore_types.h" +#include "crimson/os/seastore/transaction_manager.h" +#include "crimson/os/seastore/collection_manager.h" + +namespace crimson::os::seastore::collection_manager { +struct coll_context_t { + TransactionManager &tm; + Transaction &t; +}; + +using base_coll_map_t = std::map<denc_coll_t, uint32_t>; +struct coll_map_t : base_coll_map_t { + auto insert(coll_t coll, unsigned bits) { + return emplace( + std::make_pair(denc_coll_t{coll}, bits) + ); + } + + void update(coll_t coll, unsigned bits) { + (*this)[denc_coll_t{coll}] = bits; + } + + void remove(coll_t coll) { + erase(denc_coll_t{coll}); + } +}; + +struct delta_t { + enum class op_t : uint_fast8_t { + INSERT, + UPDATE, + REMOVE, + INVALID + } op = op_t::INVALID; + + denc_coll_t coll; + uint32_t bits = 0; + + DENC(delta_t, v, p) { + DENC_START(1, 1, p); + denc(v.op, p); + denc(v.coll, p); + denc(v.bits, p); + DENC_FINISH(p); + } + + void replay(coll_map_t &l) const; +}; +} +WRITE_CLASS_DENC(crimson::os::seastore::collection_manager::delta_t) + +namespace crimson::os::seastore::collection_manager { +class delta_buffer_t { + std::vector<delta_t> buffer; +public: + bool empty() const { + return buffer.empty(); + } + + void insert(coll_t coll, uint32_t bits) { + buffer.push_back(delta_t{delta_t::op_t::INSERT, denc_coll_t(coll), bits}); + } + void update(coll_t coll, uint32_t bits) { + buffer.push_back(delta_t{delta_t::op_t::UPDATE, denc_coll_t(coll), bits}); + } + void remove(coll_t coll) { + buffer.push_back(delta_t{delta_t::op_t::REMOVE, denc_coll_t(coll), 0}); + } + void replay(coll_map_t &l) { + for (auto &i: buffer) { + i.replay(l); + } + } + + void clear() { buffer.clear(); } + + DENC(delta_buffer_t, v, p) { + DENC_START(1, 1, p); + denc(v.buffer, p); + DENC_FINISH(p); + } +}; +} +WRITE_CLASS_DENC(crimson::os::seastore::collection_manager::delta_buffer_t) + +namespace crimson::os::seastore::collection_manager { + +struct 
CollectionNode + : LogicalCachedExtent { + using CollectionNodeRef = TCachedExtentRef<CollectionNode>; + + explicit CollectionNode(ceph::bufferptr &&ptr) + : LogicalCachedExtent(std::move(ptr)) {} + explicit CollectionNode(const CollectionNode &other) + : LogicalCachedExtent(other), + decoded(other.decoded) {} + + static constexpr extent_types_t type = extent_types_t::COLL_BLOCK; + + coll_map_t decoded; + delta_buffer_t delta_buffer; + + CachedExtentRef duplicate_for_write(Transaction&) final { + assert(delta_buffer.empty()); + return CachedExtentRef(new CollectionNode(*this)); + } + delta_buffer_t *maybe_get_delta_buffer() { + return is_mutation_pending() ? &delta_buffer : nullptr; + } + + using list_iertr = CollectionManager::list_iertr; + using list_ret = CollectionManager::list_ret; + list_ret list(); + + + enum class create_result_t : uint8_t { + SUCCESS, + OVERFLOW + }; + using create_iertr = CollectionManager::create_iertr; + using create_ret = create_iertr::future<create_result_t>; + create_ret create(coll_context_t cc, coll_t coll, unsigned bits); + + using remove_iertr = CollectionManager::remove_iertr; + using remove_ret = CollectionManager::remove_ret; + remove_ret remove(coll_context_t cc, coll_t coll); + + using update_iertr = CollectionManager::update_iertr; + using update_ret = CollectionManager::update_ret; + update_ret update(coll_context_t cc, coll_t coll, unsigned bits); + + void on_clean_read() final { + bufferlist bl; + bl.append(get_bptr()); + auto iter = bl.cbegin(); + decode((base_coll_map_t&)decoded, iter); + } + + void copy_to_node() { + bufferlist bl; + encode((base_coll_map_t&)decoded, bl); + auto iter = bl.begin(); + auto size = encoded_sizeof((base_coll_map_t&)decoded); + assert(size <= get_bptr().length()); + get_bptr().zero(); + iter.copy(size, get_bptr().c_str()); + + } + + ceph::bufferlist get_delta() final { + assert(!delta_buffer.empty()); + ceph::bufferlist bl; + encode(delta_buffer, bl); + delta_buffer.clear(); + return bl; + } + + void apply_delta(const ceph::bufferlist &bl) final { + assert(bl.length()); + delta_buffer_t buffer; + auto bptr = bl.begin(); + decode(buffer, bptr); + buffer.replay(decoded); + copy_to_node(); + } + + static constexpr extent_types_t TYPE = extent_types_t::COLL_BLOCK; + extent_types_t get_type() const final { + return TYPE; + } + + std::ostream &print_detail_l(std::ostream &out) const final; +}; +using CollectionNodeRef = CollectionNode::CollectionNodeRef; +} + +#if FMT_VERSION >= 90000 +template <> struct fmt::formatter<crimson::os::seastore::collection_manager::CollectionNode> : fmt::ostream_formatter {}; +#endif diff --git a/src/crimson/os/seastore/collection_manager/flat_collection_manager.cc b/src/crimson/os/seastore/collection_manager/flat_collection_manager.cc new file mode 100644 index 000000000..decb095f6 --- /dev/null +++ b/src/crimson/os/seastore/collection_manager/flat_collection_manager.cc @@ -0,0 +1,133 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <string.h> + +#include "crimson/common/log.h" + +#include "include/buffer.h" +#include "include/stringify.h" +#include "crimson/os/seastore/collection_manager/flat_collection_manager.h" +#include "crimson/os/seastore/collection_manager/collection_flat_node.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_seastore); + } +} + +namespace crimson::os::seastore::collection_manager { + +constexpr static extent_len_t MIN_FLAT_BLOCK_SIZE = 4<<10; +[[maybe_unused]] 
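+// presumably only referenced by the assert in FlatCollectionManager::create(),
+// which compiles away in release builds, hence the [[maybe_unused]] above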
constexpr static extent_len_t MAX_FLAT_BLOCK_SIZE = 4<<20; + +FlatCollectionManager::FlatCollectionManager( + TransactionManager &tm) + : tm(tm) {} + +FlatCollectionManager::mkfs_ret +FlatCollectionManager::mkfs(Transaction &t) +{ + + logger().debug("FlatCollectionManager: {}", __func__); + return tm.alloc_extent<CollectionNode>( + t, L_ADDR_MIN, MIN_FLAT_BLOCK_SIZE + ).si_then([](auto&& root_extent) { + coll_root_t coll_root = coll_root_t( + root_extent->get_laddr(), + MIN_FLAT_BLOCK_SIZE + ); + return mkfs_iertr::make_ready_future<coll_root_t>(coll_root); + }); +} + +FlatCollectionManager::get_root_ret +FlatCollectionManager::get_coll_root(const coll_root_t &coll_root, Transaction &t) +{ + logger().debug("FlatCollectionManager: {}", __func__); + assert(coll_root.get_location() != L_ADDR_NULL); + auto cc = get_coll_context(t); + return cc.tm.read_extent<CollectionNode>( + cc.t, + coll_root.get_location(), + coll_root.get_size() + ).si_then([](auto&& e) { + return get_root_iertr::make_ready_future<CollectionNodeRef>(std::move(e)); + }); +} + +FlatCollectionManager::create_ret +FlatCollectionManager::create(coll_root_t &coll_root, Transaction &t, + coll_t cid, coll_info_t info) +{ + logger().debug("FlatCollectionManager: {}", __func__); + return get_coll_root(coll_root, t + ).si_then([=, this, &coll_root, &t] (auto &&extent) { + return extent->create( + get_coll_context(t), cid, info.split_bits + ).si_then([=, this, &coll_root, &t] (auto ret) { + switch (ret) { + case CollectionNode::create_result_t::OVERFLOW: { + logger().debug("FlatCollectionManager: {} overflow!", __func__); + auto new_size = coll_root.get_size() * 2; // double each time + + // TODO return error probably, but such a nonsensically large number of + // collections would create a ton of other problems as well + assert(new_size < MAX_FLAT_BLOCK_SIZE); + return tm.alloc_extent<CollectionNode>( + t, L_ADDR_MIN, new_size + ).si_then([=, this, &coll_root, &t] (auto &&root_extent) { + coll_root.update(root_extent->get_laddr(), root_extent->get_length()); + + root_extent->decoded = extent->decoded; + return root_extent->create( + get_coll_context(t), cid, info.split_bits + ).si_then([=, this, &t](auto result) { + assert(result == CollectionNode::create_result_t::SUCCESS); + return tm.dec_ref(t, extent->get_laddr()); + }).si_then([] (auto) { + return create_iertr::make_ready_future<>(); + }); + }); + } + case CollectionNode::create_result_t::SUCCESS: { + return create_iertr::make_ready_future<>(); + } + } + __builtin_unreachable(); + }); + }); +} + +FlatCollectionManager::list_ret +FlatCollectionManager::list(const coll_root_t &coll_root, Transaction &t) +{ + logger().debug("FlatCollectionManager: {}", __func__); + return get_coll_root(coll_root, t) + .si_then([] (auto extent) { + return extent->list(); + }); +} + +FlatCollectionManager::update_ret +FlatCollectionManager::update(const coll_root_t &coll_root, Transaction &t, + coll_t cid, coll_info_t info) +{ + logger().debug("FlatCollectionManager: {}", __func__); + return get_coll_root(coll_root, t) + .si_then([this, &t, cid, info] (auto extent) { + return extent->update(get_coll_context(t), cid, info.split_bits); + }); +} + +FlatCollectionManager::remove_ret +FlatCollectionManager::remove(const coll_root_t &coll_root, Transaction &t, + coll_t cid ) +{ + logger().debug("FlatCollectionManager: {}", __func__); + return get_coll_root(coll_root, t).si_then([this, &t, cid] (auto extent) { + return extent->remove(get_coll_context(t), cid); + }); +} + +} diff --git 
a/src/crimson/os/seastore/collection_manager/flat_collection_manager.h b/src/crimson/os/seastore/collection_manager/flat_collection_manager.h new file mode 100644 index 000000000..1321ec1d8 --- /dev/null +++ b/src/crimson/os/seastore/collection_manager/flat_collection_manager.h @@ -0,0 +1,41 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "include/ceph_assert.h" + +#include "crimson/os/seastore/collection_manager.h" +#include "crimson/os/seastore/collection_manager/collection_flat_node.h" +#include "crimson/os/seastore/seastore_types.h" +#include "crimson/os/seastore/transaction_manager.h" + +namespace crimson::os::seastore::collection_manager { + +class FlatCollectionManager : public CollectionManager { + TransactionManager &tm; + + coll_context_t get_coll_context(Transaction &t) { + return coll_context_t{tm, t}; + } + + using get_root_iertr = base_iertr; + using get_root_ret = get_root_iertr::future<CollectionNodeRef>; + get_root_ret get_coll_root(const coll_root_t &coll_root, Transaction &t); + +public: + explicit FlatCollectionManager(TransactionManager &tm); + + mkfs_ret mkfs(Transaction &t) final; + + create_ret create(coll_root_t &coll_root, Transaction &t, coll_t cid, + coll_info_t info) final; + + list_ret list(const coll_root_t &coll_root, Transaction &t) final; + + remove_ret remove(const coll_root_t &coll_root, Transaction &t, coll_t cid) final; + + update_ret update(const coll_root_t &coll_root, Transaction &t, coll_t cid, coll_info_t info) final; +}; +using FlatCollectionManagerRef = std::unique_ptr<FlatCollectionManager>; +} diff --git a/src/crimson/os/seastore/device.cc b/src/crimson/os/seastore/device.cc new file mode 100644 index 000000000..c3bda82a7 --- /dev/null +++ b/src/crimson/os/seastore/device.cc @@ -0,0 +1,51 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#include "device.h" + +#include "segment_manager.h" +#include "random_block_manager.h" +#include "random_block_manager/rbm_device.h" + +namespace crimson::os::seastore { + +std::ostream& operator<<(std::ostream& out, const device_spec_t& ds) +{ + return out << "device_spec(" + << "magic=" << ds.magic + << ", dtype=" << ds.dtype + << ", " << device_id_printer_t{ds.id} + << ")"; +} + +std::ostream& operator<<(std::ostream& out, const device_config_t& conf) +{ + out << "device_config_t(" + << "major_dev=" << conf.major_dev + << ", spec=" << conf.spec + << ", meta=" << conf.meta + << ", secondary("; + for (const auto& [k, v] : conf.secondary_devices) { + out << device_id_printer_t{k} + << ": " << v << ", "; + } + return out << "))"; +} + +seastar::future<DeviceRef> +Device::make_device(const std::string& device, device_type_t dtype) +{ + if (get_default_backend_of_device(dtype) == backend_type_t::SEGMENTED) { + return SegmentManager::get_segment_manager(device, dtype + ).then([](DeviceRef ret) { + return ret; + }); + } + assert(get_default_backend_of_device(dtype) == backend_type_t::RANDOM_BLOCK); + return get_rb_device(device + ).then([](DeviceRef ret) { + return ret; + }); +} + +} diff --git a/src/crimson/os/seastore/device.h b/src/crimson/os/seastore/device.h new file mode 100644 index 000000000..ceb1ede64 --- /dev/null +++ b/src/crimson/os/seastore/device.h @@ -0,0 +1,175 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <memory> + +#include "include/buffer_fwd.h" + +#include 
"crimson/common/errorator.h" +#include "crimson/os/seastore/seastore_types.h" + +namespace crimson::os::seastore { + +using magic_t = uint64_t; + +struct device_spec_t { + magic_t magic = 0; + device_type_t dtype = device_type_t::NONE; + device_id_t id = DEVICE_ID_NULL; + DENC(device_spec_t, v, p) { + DENC_START(1, 1, p); + denc(v.magic, p); + denc(v.dtype, p); + denc(v.id, p); + DENC_FINISH(p); + } +}; + +std::ostream& operator<<(std::ostream&, const device_spec_t&); + +using secondary_device_set_t = + std::map<device_id_t, device_spec_t>; + +struct device_config_t { + bool major_dev = false; + device_spec_t spec; + seastore_meta_t meta; + secondary_device_set_t secondary_devices; + DENC(device_config_t, v, p) { + DENC_START(1, 1, p); + denc(v.major_dev, p); + denc(v.spec, p); + denc(v.meta, p); + denc(v.secondary_devices, p); + DENC_FINISH(p); + } + static device_config_t create_primary( + uuid_d new_osd_fsid, + device_id_t id, + device_type_t d_type, + secondary_device_set_t sds) { + return device_config_t{ + true, + device_spec_t{ + (magic_t)std::rand(), + d_type, + id}, + seastore_meta_t{new_osd_fsid}, + sds}; + } + static device_config_t create_secondary( + uuid_d new_osd_fsid, + device_id_t id, + device_type_t d_type, + magic_t magic) { + return device_config_t{ + false, + device_spec_t{ + magic, + d_type, + id}, + seastore_meta_t{new_osd_fsid}, + secondary_device_set_t()}; + } +}; + +std::ostream& operator<<(std::ostream&, const device_config_t&); + +class Device; +using DeviceRef = std::unique_ptr<Device>; + +/** + * Device + * + * Represents a general device regardless of the underlying medium. + */ +class Device { +// interfaces used by device +public: + virtual ~Device() {} + + virtual seastar::future<> start() { + return seastar::now(); + } + + virtual seastar::future<> stop() { + return seastar::now(); + } + // called on the shard to get this shard device; + virtual Device& get_sharded_device() { + return *this; + } + + using access_ertr = crimson::errorator< + crimson::ct_error::input_output_error, + crimson::ct_error::permission_denied, + crimson::ct_error::enoent>; + + using mkfs_ertr = access_ertr; + using mkfs_ret = mkfs_ertr::future<>; + virtual mkfs_ret mkfs(device_config_t) = 0; + + using mount_ertr = access_ertr; + using mount_ret = access_ertr::future<>; + virtual mount_ret mount() = 0; + + static seastar::future<DeviceRef> make_device( + const std::string &device, + device_type_t dtype); + +// interfaces used by each device shard +public: + virtual device_id_t get_device_id() const = 0; + + virtual magic_t get_magic() const = 0; + + virtual device_type_t get_device_type() const = 0; + + virtual backend_type_t get_backend_type() const = 0; + + virtual const seastore_meta_t &get_meta() const = 0; + + virtual extent_len_t get_block_size() const = 0; + + virtual std::size_t get_available_size() const = 0; + + virtual secondary_device_set_t& get_secondary_devices() = 0; + + using close_ertr = crimson::errorator< + crimson::ct_error::input_output_error>; + virtual close_ertr::future<> close() = 0; + + using read_ertr = crimson::errorator< + crimson::ct_error::input_output_error, + crimson::ct_error::invarg, + crimson::ct_error::enoent, + crimson::ct_error::erange>; + virtual read_ertr::future<> read( + paddr_t addr, + size_t len, + ceph::bufferptr &out) = 0; + + read_ertr::future<ceph::bufferptr> read( + paddr_t addr, + size_t len + ) { + auto ptrref = std::make_unique<ceph::bufferptr>( + buffer::create_page_aligned(len)); + return read(addr, len, *ptrref + 
).safe_then([ptrref=std::move(ptrref)]() mutable { + return read_ertr::make_ready_future<bufferptr>(std::move(*ptrref)); + }); + } +}; + +} + +WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::device_spec_t) +WRITE_CLASS_DENC(crimson::os::seastore::device_config_t) + +#if FMT_VERSION >= 90000 +template <> struct fmt::formatter<crimson::os::seastore::device_config_t> : fmt::ostream_formatter {}; +template <> struct fmt::formatter<crimson::os::seastore::device_spec_t> : fmt::ostream_formatter {}; +#endif diff --git a/src/crimson/os/seastore/extent_placement_manager.cc b/src/crimson/os/seastore/extent_placement_manager.cc new file mode 100644 index 000000000..b7aabefc6 --- /dev/null +++ b/src/crimson/os/seastore/extent_placement_manager.cc @@ -0,0 +1,808 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab expandtab + +#include "crimson/os/seastore/extent_placement_manager.h" + +#include "crimson/common/config_proxy.h" +#include "crimson/os/seastore/logging.h" + +SET_SUBSYS(seastore_epm); + +namespace crimson::os::seastore { + +SegmentedOolWriter::SegmentedOolWriter( + data_category_t category, + rewrite_gen_t gen, + SegmentProvider& sp, + SegmentSeqAllocator &ssa) + : segment_allocator(nullptr, category, gen, sp, ssa), + record_submitter(crimson::common::get_conf<uint64_t>( + "seastore_journal_iodepth_limit"), + crimson::common::get_conf<uint64_t>( + "seastore_journal_batch_capacity"), + crimson::common::get_conf<Option::size_t>( + "seastore_journal_batch_flush_size"), + crimson::common::get_conf<double>( + "seastore_journal_batch_preferred_fullness"), + segment_allocator) +{ +} + +SegmentedOolWriter::alloc_write_ertr::future<> +SegmentedOolWriter::write_record( + Transaction& t, + record_t&& record, + std::list<LogicalCachedExtentRef>&& extents, + bool with_atomic_roll_segment) +{ + LOG_PREFIX(SegmentedOolWriter::write_record); + assert(extents.size()); + assert(extents.size() == record.extents.size()); + assert(!record.deltas.size()); + + // account transactional ool writes before write() + auto& stats = t.get_ool_write_stats(); + stats.extents.num += extents.size(); + stats.extents.bytes += record.size.dlength; + stats.md_bytes += record.size.get_raw_mdlength(); + stats.num_records += 1; + + return record_submitter.submit( + std::move(record), + with_atomic_roll_segment + ).safe_then([this, FNAME, &t, extents=std::move(extents) + ](record_locator_t ret) mutable { + DEBUGT("{} finish with {} and {} extents", + t, segment_allocator.get_name(), + ret, extents.size()); + paddr_t extent_addr = ret.record_block_base; + for (auto& extent : extents) { + TRACET("{} ool extent written at {} -- {}", + t, segment_allocator.get_name(), + extent_addr, *extent); + t.update_delayed_ool_extent_addr(extent, extent_addr); + extent_addr = extent_addr.as_seg_paddr().add_offset( + extent->get_length()); + } + }); +} + +SegmentedOolWriter::alloc_write_iertr::future<> +SegmentedOolWriter::do_write( + Transaction& t, + std::list<LogicalCachedExtentRef>& extents) +{ + LOG_PREFIX(SegmentedOolWriter::do_write); + assert(!extents.empty()); + if (!record_submitter.is_available()) { + DEBUGT("{} extents={} wait ...", + t, segment_allocator.get_name(), + extents.size()); + return trans_intr::make_interruptible( + record_submitter.wait_available() + ).si_then([this, &t, &extents] { + return do_write(t, extents); + }); + } + record_t record(TRANSACTION_TYPE_NULL); + std::list<LogicalCachedExtentRef> pending_extents; + auto commit_time = seastar::lowres_system_clock::now(); 
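+ // Batch the extents into a single record: roll to a new segment when the next + // extent would overflow the current one, and submit as soon as the batch is full.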
+ + for (auto it = extents.begin(); it != extents.end();) { + auto& extent = *it; + record_size_t wouldbe_rsize = record.size; + wouldbe_rsize.account_extent(extent->get_bptr().length()); + using action_t = journal::RecordSubmitter::action_t; + action_t action = record_submitter.check_action(wouldbe_rsize); + if (action == action_t::ROLL) { + auto num_extents = pending_extents.size(); + DEBUGT("{} extents={} submit {} extents and roll, unavailable ...", + t, segment_allocator.get_name(), + extents.size(), num_extents); + auto fut_write = alloc_write_ertr::now(); + if (num_extents > 0) { + assert(record_submitter.check_action(record.size) != + action_t::ROLL); + fut_write = write_record( + t, std::move(record), std::move(pending_extents), + true/* with_atomic_roll_segment */); + } + return trans_intr::make_interruptible( + record_submitter.roll_segment( + ).safe_then([fut_write=std::move(fut_write)]() mutable { + return std::move(fut_write); + }) + ).si_then([this, &t, &extents] { + return do_write(t, extents); + }); + } + + TRACET("{} extents={} add extent to record -- {}", + t, segment_allocator.get_name(), + extents.size(), *extent); + ceph::bufferlist bl; + extent->prepare_write(); + bl.append(extent->get_bptr()); + assert(bl.length() == extent->get_length()); + auto modify_time = extent->get_modify_time(); + if (modify_time == NULL_TIME) { + modify_time = commit_time; + } + record.push_back( + extent_t{ + extent->get_type(), + extent->get_laddr(), + std::move(bl)}, + modify_time); + pending_extents.push_back(extent); + it = extents.erase(it); + + assert(record_submitter.check_action(record.size) == action); + if (action == action_t::SUBMIT_FULL) { + DEBUGT("{} extents={} submit {} extents ...", + t, segment_allocator.get_name(), + extents.size(), pending_extents.size()); + return trans_intr::make_interruptible( + write_record(t, std::move(record), std::move(pending_extents)) + ).si_then([this, &t, &extents] { + if (!extents.empty()) { + return do_write(t, extents); + } else { + return alloc_write_iertr::now(); + } + }); + } + // SUBMIT_NOT_FULL: evaluate the next extent + } + + auto num_extents = pending_extents.size(); + DEBUGT("{} submit the rest {} extents ...", + t, segment_allocator.get_name(), + num_extents); + assert(num_extents > 0); + return trans_intr::make_interruptible( + write_record(t, std::move(record), std::move(pending_extents))); +} + +SegmentedOolWriter::alloc_write_iertr::future<> +SegmentedOolWriter::alloc_write_ool_extents( + Transaction& t, + std::list<LogicalCachedExtentRef>& extents) +{ + if (extents.empty()) { + return alloc_write_iertr::now(); + } + return seastar::with_gate(write_guard, [this, &t, &extents] { + return do_write(t, extents); + }); +} + +void ExtentPlacementManager::init( + JournalTrimmerImplRef &&trimmer, + AsyncCleanerRef &&cleaner, + AsyncCleanerRef &&cold_cleaner) +{ + writer_refs.clear(); + auto cold_segment_cleaner = dynamic_cast<SegmentCleaner*>(cold_cleaner.get()); + dynamic_max_rewrite_generation = MIN_COLD_GENERATION - 1; + if (cold_segment_cleaner) { + dynamic_max_rewrite_generation = MAX_REWRITE_GENERATION; + } + + if (trimmer->get_journal_type() == journal_type_t::SEGMENTED) { + auto segment_cleaner = dynamic_cast<SegmentCleaner*>(cleaner.get()); + ceph_assert(segment_cleaner != nullptr); + auto num_writers = generation_to_writer(dynamic_max_rewrite_generation + 1); + + data_writers_by_gen.resize(num_writers, {}); + for (rewrite_gen_t gen = OOL_GENERATION; gen < MIN_COLD_GENERATION; ++gen) { + 
writer_refs.emplace_back(std::make_unique<SegmentedOolWriter>( + data_category_t::DATA, gen, *segment_cleaner, + *ool_segment_seq_allocator)); + data_writers_by_gen[generation_to_writer(gen)] = writer_refs.back().get(); + } + + md_writers_by_gen.resize(num_writers, {}); + for (rewrite_gen_t gen = OOL_GENERATION; gen < MIN_COLD_GENERATION; ++gen) { + writer_refs.emplace_back(std::make_unique<SegmentedOolWriter>( + data_category_t::METADATA, gen, *segment_cleaner, + *ool_segment_seq_allocator)); + md_writers_by_gen[generation_to_writer(gen)] = writer_refs.back().get(); + } + + for (auto *device : segment_cleaner->get_segment_manager_group() + ->get_segment_managers()) { + add_device(device); + } + } else { + assert(trimmer->get_journal_type() == journal_type_t::RANDOM_BLOCK); + auto rb_cleaner = dynamic_cast<RBMCleaner*>(cleaner.get()); + ceph_assert(rb_cleaner != nullptr); + auto num_writers = generation_to_writer(dynamic_max_rewrite_generation + 1); + data_writers_by_gen.resize(num_writers, {}); + md_writers_by_gen.resize(num_writers, {}); + writer_refs.emplace_back(std::make_unique<RandomBlockOolWriter>( + rb_cleaner)); + // TODO: implement eviction in RBCleaner and introduce further writers + data_writers_by_gen[generation_to_writer(OOL_GENERATION)] = writer_refs.back().get(); + md_writers_by_gen[generation_to_writer(OOL_GENERATION)] = writer_refs.back().get(); + for (auto *rb : rb_cleaner->get_rb_group()->get_rb_managers()) { + add_device(rb->get_device()); + } + } + + if (cold_segment_cleaner) { + for (rewrite_gen_t gen = MIN_COLD_GENERATION; gen < REWRITE_GENERATIONS; ++gen) { + writer_refs.emplace_back(std::make_unique<SegmentedOolWriter>( + data_category_t::DATA, gen, *cold_segment_cleaner, + *ool_segment_seq_allocator)); + data_writers_by_gen[generation_to_writer(gen)] = writer_refs.back().get(); + } + for (rewrite_gen_t gen = MIN_COLD_GENERATION; gen < REWRITE_GENERATIONS; ++gen) { + writer_refs.emplace_back(std::make_unique<SegmentedOolWriter>( + data_category_t::METADATA, gen, *cold_segment_cleaner, + *ool_segment_seq_allocator)); + md_writers_by_gen[generation_to_writer(gen)] = writer_refs.back().get(); + } + for (auto *device : cold_segment_cleaner->get_segment_manager_group() + ->get_segment_managers()) { + add_device(device); + } + } + + background_process.init(std::move(trimmer), + std::move(cleaner), + std::move(cold_cleaner)); + if (cold_segment_cleaner) { + ceph_assert(get_main_backend_type() == backend_type_t::SEGMENTED); + ceph_assert(background_process.has_cold_tier()); + } else { + ceph_assert(!background_process.has_cold_tier()); + } +} + +void ExtentPlacementManager::set_primary_device(Device *device) +{ + ceph_assert(primary_device == nullptr); + primary_device = device; + ceph_assert(devices_by_id[device->get_device_id()] == device); +} + +ExtentPlacementManager::open_ertr::future<> +ExtentPlacementManager::open_for_write() +{ + LOG_PREFIX(ExtentPlacementManager::open_for_write); + INFO("started with {} devices", num_devices); + ceph_assert(primary_device != nullptr); + return crimson::do_for_each(data_writers_by_gen, [](auto &writer) { + if (writer) { + return writer->open(); + } + return open_ertr::now(); + }).safe_then([this] { + return crimson::do_for_each(md_writers_by_gen, [](auto &writer) { + if (writer) { + return writer->open(); + } + return open_ertr::now(); + }); + }); +} + +ExtentPlacementManager::dispatch_result_t +ExtentPlacementManager::dispatch_delayed_extents(Transaction &t) +{ + dispatch_result_t res; + res.delayed_extents = 
t.get_delayed_alloc_list(); + + // init projected usage + for (auto &extent : t.get_inline_block_list()) { + if (extent->is_valid()) { + res.usage.inline_usage += extent->get_length(); + res.usage.cleaner_usage.main_usage += extent->get_length(); + } + } + + for (auto &extent : res.delayed_extents) { + if (dispatch_delayed_extent(extent)) { + res.usage.inline_usage += extent->get_length(); + res.usage.cleaner_usage.main_usage += extent->get_length(); + t.mark_delayed_extent_inline(extent); + } else { + if (extent->get_rewrite_generation() < MIN_COLD_GENERATION) { + res.usage.cleaner_usage.main_usage += extent->get_length(); + } else { + assert(background_process.has_cold_tier()); + res.usage.cleaner_usage.cold_ool_usage += extent->get_length(); + } + t.mark_delayed_extent_ool(extent); + auto writer_ptr = get_writer( + extent->get_user_hint(), + get_extent_category(extent->get_type()), + extent->get_rewrite_generation()); + res.alloc_map[writer_ptr].emplace_back(extent); + } + } + return res; +} + +ExtentPlacementManager::alloc_paddr_iertr::future<> +ExtentPlacementManager::write_delayed_ool_extents( + Transaction& t, + extents_by_writer_t& alloc_map) { + return trans_intr::do_for_each(alloc_map, [&t](auto& p) { + auto writer = p.first; + auto& extents = p.second; + return writer->alloc_write_ool_extents(t, extents); + }); +} + +ExtentPlacementManager::alloc_paddr_iertr::future<> +ExtentPlacementManager::write_preallocated_ool_extents( + Transaction &t, + std::list<LogicalCachedExtentRef> extents) +{ + LOG_PREFIX(ExtentPlacementManager::write_preallocated_ool_extents); + DEBUGT("start with {} allocated extents", + t, extents.size()); + assert(writer_refs.size()); + return seastar::do_with( + std::map<ExtentOolWriter*, std::list<LogicalCachedExtentRef>>(), + [this, &t, extents=std::move(extents)](auto& alloc_map) { + for (auto& extent : extents) { + auto writer_ptr = get_writer( + extent->get_user_hint(), + get_extent_category(extent->get_type()), + extent->get_rewrite_generation()); + alloc_map[writer_ptr].emplace_back(extent); + } + return trans_intr::do_for_each(alloc_map, [&t](auto& p) { + auto writer = p.first; + auto& extents = p.second; + return writer->alloc_write_ool_extents(t, extents); + }); + }); +} + +ExtentPlacementManager::close_ertr::future<> +ExtentPlacementManager::close() +{ + LOG_PREFIX(ExtentPlacementManager::close); + INFO("started"); + return crimson::do_for_each(data_writers_by_gen, [](auto &writer) { + if (writer) { + return writer->close(); + } + return close_ertr::now(); + }).safe_then([this] { + return crimson::do_for_each(md_writers_by_gen, [](auto &writer) { + if (writer) { + return writer->close(); + } + return close_ertr::now(); + }); + }); +} + +void ExtentPlacementManager::BackgroundProcess::log_state(const char *caller) const +{ + LOG_PREFIX(BackgroundProcess::log_state); + DEBUG("caller {}, {}, {}", + caller, + JournalTrimmerImpl::stat_printer_t{*trimmer, true}, + AsyncCleaner::stat_printer_t{*main_cleaner, true}); + if (has_cold_tier()) { + DEBUG("caller {}, cold_cleaner: {}", + caller, + AsyncCleaner::stat_printer_t{*cold_cleaner, true}); + } +} + +void ExtentPlacementManager::BackgroundProcess::start_background() +{ + LOG_PREFIX(BackgroundProcess::start_background); + INFO("{}, {}", + JournalTrimmerImpl::stat_printer_t{*trimmer, true}, + AsyncCleaner::stat_printer_t{*main_cleaner, true}); + if (has_cold_tier()) { + INFO("cold_cleaner: {}", + AsyncCleaner::stat_printer_t{*cold_cleaner, true}); + } + ceph_assert(trimmer->check_is_ready()); + 
ceph_assert(state == state_t::SCAN_SPACE); + assert(!is_running()); + process_join = seastar::now(); + state = state_t::RUNNING; + assert(is_running()); + process_join = run(); +} + +seastar::future<> +ExtentPlacementManager::BackgroundProcess::stop_background() +{ + return seastar::futurize_invoke([this] { + if (!is_running()) { + if (state != state_t::HALT) { + state = state_t::STOP; + } + return seastar::now(); + } + auto ret = std::move(*process_join); + process_join.reset(); + state = state_t::HALT; + assert(!is_running()); + do_wake_background(); + return ret; + }).then([this] { + LOG_PREFIX(BackgroundProcess::stop_background); + INFO("done, {}, {}", + JournalTrimmerImpl::stat_printer_t{*trimmer, true}, + AsyncCleaner::stat_printer_t{*main_cleaner, true}); + if (has_cold_tier()) { + INFO("done, cold_cleaner: {}", + AsyncCleaner::stat_printer_t{*cold_cleaner, true}); + } + // run_until_halt() can be called at HALT + }); +} + +seastar::future<> +ExtentPlacementManager::BackgroundProcess::run_until_halt() +{ + ceph_assert(state == state_t::HALT); + assert(!is_running()); + if (is_running_until_halt) { + return seastar::now(); + } + is_running_until_halt = true; + return seastar::do_until( + [this] { + log_state("run_until_halt"); + assert(is_running_until_halt); + if (background_should_run()) { + return false; + } else { + is_running_until_halt = false; + return true; + } + }, + [this] { + return do_background_cycle(); + } + ); +} + +seastar::future<> +ExtentPlacementManager::BackgroundProcess::reserve_projected_usage( + io_usage_t usage) +{ + if (!is_ready()) { + return seastar::now(); + } + ceph_assert(!blocking_io); + // The pipeline configuration prevents another IO from entering + // prepare until the prior one exits and clears this. + ++stats.io_count; + + auto res = try_reserve_io(usage); + if (res.is_successful()) { + return seastar::now(); + } else { + abort_io_usage(usage, res); + if (!res.reserve_inline_success) { + ++stats.io_blocked_count_trim; + } + if (!res.cleaner_result.is_successful()) { + ++stats.io_blocked_count_clean; + } + ++stats.io_blocking_num; + ++stats.io_blocked_count; + stats.io_blocked_sum += stats.io_blocking_num; + + return seastar::repeat([this, usage] { + blocking_io = seastar::promise<>(); + return blocking_io->get_future( + ).then([this, usage] { + ceph_assert(!blocking_io); + auto res = try_reserve_io(usage); + if (res.is_successful()) { + assert(stats.io_blocking_num == 1); + --stats.io_blocking_num; + return seastar::make_ready_future<seastar::stop_iteration>( + seastar::stop_iteration::yes); + } else { + abort_io_usage(usage, res); + return seastar::make_ready_future<seastar::stop_iteration>( + seastar::stop_iteration::no); + } + }); + }); + } +} + +seastar::future<> +ExtentPlacementManager::BackgroundProcess::run() +{ + assert(is_running()); + return seastar::repeat([this] { + if (!is_running()) { + log_state("run(exit)"); + return seastar::make_ready_future<seastar::stop_iteration>( + seastar::stop_iteration::yes); + } + return seastar::futurize_invoke([this] { + if (background_should_run()) { + log_state("run(background)"); + return do_background_cycle(); + } else { + log_state("run(block)"); + ceph_assert(!blocking_background); + blocking_background = seastar::promise<>(); + return blocking_background->get_future(); + } + }).then([] { + return seastar::stop_iteration::no; + }); + }); +} + +/** + * Reservation Process + * + * Most of transctions need to reserve its space usage before performing the + * ool writes and committing transactions. 
If the space reservation is + * unsuccessful, the current transaction is blocked, and waits for new + * background transactions to finish. + * + * The following are the reservation requirements for each transaction type: + * 1. MUTATE transaction: + * (1) inline usage on the trimmer, + * (2) inline usage with OOL usage on the main cleaner, + * (3) cold OOL usage to the cold cleaner(if it exists). + * 2. TRIM_DIRTY/TRIM_ALLOC transaction: + * (1) all extents usage on the main cleaner, + * (2) usage on the cold cleaner(if it exists) + * 3. CLEANER_MAIN: + * (1) cleaned extents size on the cold cleaner(if it exists). + * 4. CLEANER_COLD transction does not require space reservation. + * + * The reserve implementation should satisfy the following conditions: + * 1. The reservation should be atomic. If a reservation involves several reservations, + * such as the MUTATE transaction that needs to reserve space on both the trimmer + * and cleaner at the same time, the successful condition is that all of its + * sub-reservations succeed. If one or more operations fail, the entire reservation + * fails, and the successful operation should be reverted. + * 2. The reserve/block relationship should form a DAG to avoid deadlock. For example, + * TRIM_ALLOC transaction might be blocked by cleaner due to the failure of reserving + * on the cleaner. In such cases, the cleaner must not reserve space on the trimmer + * since the trimmer is already blocked by itself. + * + * Finally the reserve relationship can be represented as follows: + * + * +-------------------------+----------------+ + * | | | + * | v v + * MUTATE ---> TRIM_* ---> CLEANER_MAIN ---> CLEANER_COLD + * | ^ + * | | + * +--------------------------------+ + */ +bool ExtentPlacementManager::BackgroundProcess::try_reserve_cold(std::size_t usage) +{ + if (has_cold_tier()) { + return cold_cleaner->try_reserve_projected_usage(usage); + } else { + assert(usage == 0); + return true; + } +} +void ExtentPlacementManager::BackgroundProcess::abort_cold_usage( + std::size_t usage, bool success) +{ + if (has_cold_tier() && success) { + cold_cleaner->release_projected_usage(usage); + } +} + +reserve_cleaner_result_t +ExtentPlacementManager::BackgroundProcess::try_reserve_cleaner( + const cleaner_usage_t &usage) +{ + return { + main_cleaner->try_reserve_projected_usage(usage.main_usage), + try_reserve_cold(usage.cold_ool_usage) + }; +} + +void ExtentPlacementManager::BackgroundProcess::abort_cleaner_usage( + const cleaner_usage_t &usage, + const reserve_cleaner_result_t &result) +{ + if (result.reserve_main_success) { + main_cleaner->release_projected_usage(usage.main_usage); + } + abort_cold_usage(usage.cold_ool_usage, result.reserve_cold_success); +} + +reserve_io_result_t +ExtentPlacementManager::BackgroundProcess::try_reserve_io( + const io_usage_t &usage) +{ + return { + trimmer->try_reserve_inline_usage(usage.inline_usage), + try_reserve_cleaner(usage.cleaner_usage) + }; +} + +void ExtentPlacementManager::BackgroundProcess::abort_io_usage( + const io_usage_t &usage, + const reserve_io_result_t &result) +{ + if (result.reserve_inline_success) { + trimmer->release_inline_usage(usage.inline_usage); + } + abort_cleaner_usage(usage.cleaner_usage, result.cleaner_result); +} + +seastar::future<> +ExtentPlacementManager::BackgroundProcess::do_background_cycle() +{ + assert(is_ready()); + bool should_trim = trimmer->should_trim(); + bool proceed_trim = false; + auto trim_size = trimmer->get_trim_size_per_cycle(); + cleaner_usage_t trim_usage{ + trim_size, + // 
We take a cautious policy here that the trimmer also reserves + // the max value on cold cleaner even if no extents will be rewritten + // to the cold tier. Cleaner also takes the same policy. + // The reason is that we don't know the exact value of reservation until + // the construction of trimmer transaction completes after which the reservation + // might fail then the trimmer is possible to be invalidated by cleaner. + // Reserving the max size at first could help us avoid these trouble. + has_cold_tier() ? trim_size : 0 + }; + + reserve_cleaner_result_t trim_reserve_res; + if (should_trim) { + trim_reserve_res = try_reserve_cleaner(trim_usage); + if (trim_reserve_res.is_successful()) { + proceed_trim = true; + } else { + abort_cleaner_usage(trim_usage, trim_reserve_res); + } + } + + if (proceed_trim) { + return trimmer->trim( + ).finally([this, trim_usage] { + abort_cleaner_usage(trim_usage, {true, true}); + }); + } else { + bool should_clean_main = + main_cleaner_should_run() || + // make sure cleaner will start + // when the trimmer should run but + // failed to reserve space. + (should_trim && !proceed_trim && + !trim_reserve_res.reserve_main_success); + bool proceed_clean_main = false; + + auto main_cold_usage = main_cleaner->get_reclaim_size_per_cycle(); + if (should_clean_main) { + if (has_cold_tier()) { + proceed_clean_main = try_reserve_cold(main_cold_usage); + } else { + proceed_clean_main = true; + } + } + + bool proceed_clean_cold = false; + if (has_cold_tier() && + (cold_cleaner->should_clean_space() || + (should_trim && !proceed_trim && + !trim_reserve_res.reserve_cold_success) || + (should_clean_main && !proceed_clean_main))) { + proceed_clean_cold = true; + } + + if (!proceed_clean_main && !proceed_clean_cold) { + ceph_abort("no background process will start"); + } + return seastar::when_all( + [this, proceed_clean_main, main_cold_usage] { + if (!proceed_clean_main) { + return seastar::now(); + } + return main_cleaner->clean_space( + ).handle_error( + crimson::ct_error::assert_all{ + "do_background_cycle encountered invalid error in main clean_space" + } + ).finally([this, main_cold_usage] { + abort_cold_usage(main_cold_usage, true); + }); + }, + [this, proceed_clean_cold] { + if (!proceed_clean_cold) { + return seastar::now(); + } + return cold_cleaner->clean_space( + ).handle_error( + crimson::ct_error::assert_all{ + "do_background_cycle encountered invalid error in cold clean_space" + } + ); + } + ).discard_result(); + } +} + +void ExtentPlacementManager::BackgroundProcess::register_metrics() +{ + namespace sm = seastar::metrics; + metrics.add_group("background_process", { + sm::make_counter("io_count", stats.io_count, + sm::description("the sum of IOs")), + sm::make_counter("io_blocked_count", stats.io_blocked_count, + sm::description("IOs that are blocked by gc")), + sm::make_counter("io_blocked_count_trim", stats.io_blocked_count_trim, + sm::description("IOs that are blocked by trimming")), + sm::make_counter("io_blocked_count_clean", stats.io_blocked_count_clean, + sm::description("IOs that are blocked by cleaning")), + sm::make_counter("io_blocked_sum", stats.io_blocked_sum, + sm::description("the sum of blocking IOs")) + }); +} + +RandomBlockOolWriter::alloc_write_iertr::future<> +RandomBlockOolWriter::alloc_write_ool_extents( + Transaction& t, + std::list<LogicalCachedExtentRef>& extents) +{ + if (extents.empty()) { + return alloc_write_iertr::now(); + } + return seastar::with_gate(write_guard, [this, &t, &extents] { + return do_write(t, extents); + }); +} + 
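The reservation helpers above (try_reserve_io, try_reserve_cleaner and the matching abort_* functions) implement the all-or-nothing rule described in the Reservation Process comment: every sub-reservation is attempted, and if any of them fails, the ones that succeeded are released so the blocked transaction can retry without leaking projected usage. The snippet below is a minimal, self-contained sketch of that pattern in plain C++; budget_t, the fixed limits and the printed demo are illustrative stand-ins, not SeaStore types.

#include <cassert>
#include <cstddef>
#include <cstdio>

// Hypothetical byte budget; stands in for the trimmer or a cleaner.
struct budget_t {
  std::size_t limit;
  std::size_t projected = 0;

  bool try_reserve(std::size_t usage) {
    if (projected + usage > limit) {
      return false;
    }
    projected += usage;
    return true;
  }
  void release(std::size_t usage) {
    assert(projected >= usage);
    projected -= usage;
  }
};

// All-or-nothing reservation across two budgets: on any failure, the
// sub-reservations that did succeed are rolled back before returning.
bool try_reserve_io(budget_t &inline_budget, budget_t &ool_budget,
                    std::size_t inline_usage, std::size_t ool_usage) {
  const bool inline_ok = inline_budget.try_reserve(inline_usage);
  const bool ool_ok = ool_budget.try_reserve(ool_usage);
  if (inline_ok && ool_ok) {
    return true;
  }
  if (inline_ok) {
    inline_budget.release(inline_usage);
  }
  if (ool_ok) {
    ool_budget.release(ool_usage);
  }
  return false;
}

int main() {
  budget_t trimmer{1 << 20};   // 1 MiB of inline budget
  budget_t cleaner{1 << 20};   // 1 MiB of out-of-line budget
  std::printf("first:  %d\n", try_reserve_io(trimmer, cleaner, 512 << 10, 512 << 10));
  std::printf("second: %d\n", try_reserve_io(trimmer, cleaner, 768 << 10, 1 << 10));
  // The second call fails and leaves both budgets exactly as before it ran.
  return 0;
}

Keeping the revert step next to the reservation is what lets the reserve/block relationships described above remain a DAG: a blocked caller never holds a partial reservation that another background process would have to wait on.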
+RandomBlockOolWriter::alloc_write_iertr::future<> +RandomBlockOolWriter::do_write( + Transaction& t, + std::list<LogicalCachedExtentRef>& extents) +{ + LOG_PREFIX(RandomBlockOolWriter::do_write); + assert(!extents.empty()); + DEBUGT("start with {} allocated extents", + t, extents.size()); + return trans_intr::do_for_each(extents, + [this, &t, FNAME](auto& ex) { + auto paddr = ex->get_paddr(); + assert(paddr.is_absolute()); + RandomBlockManager * rbm = rb_cleaner->get_rbm(paddr); + assert(rbm); + TRACE("extent {}, allocated addr {}", fmt::ptr(ex.get()), paddr); + auto& stats = t.get_ool_write_stats(); + stats.extents.num += 1; + stats.extents.bytes += ex->get_length(); + stats.num_records += 1; + + ex->prepare_write(); + return rbm->write(paddr, + ex->get_bptr() + ).handle_error( + alloc_write_iertr::pass_further{}, + crimson::ct_error::assert_all{ + "Invalid error when writing record"} + ).safe_then([&t, &ex, paddr, FNAME]() { + TRACET("ool extent written at {} -- {}", + t, paddr, *ex); + t.mark_allocated_extent_ool(ex); + return alloc_write_iertr::now(); + }); + }); +} + +} diff --git a/src/crimson/os/seastore/extent_placement_manager.h b/src/crimson/os/seastore/extent_placement_manager.h new file mode 100644 index 000000000..b94c03ec3 --- /dev/null +++ b/src/crimson/os/seastore/extent_placement_manager.h @@ -0,0 +1,915 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab expandtab + +#pragma once + +#include "seastar/core/gate.hh" + +#include "crimson/os/seastore/async_cleaner.h" +#include "crimson/os/seastore/cached_extent.h" +#include "crimson/os/seastore/journal/segment_allocator.h" +#include "crimson/os/seastore/journal/record_submitter.h" +#include "crimson/os/seastore/transaction.h" +#include "crimson/os/seastore/random_block_manager.h" +#include "crimson/os/seastore/random_block_manager/block_rb_manager.h" +#include "crimson/os/seastore/randomblock_manager_group.h" + +class transaction_manager_test_t; + +namespace crimson::os::seastore { + +/** + * ExtentOolWriter + * + * Write the extents as out-of-line and allocate the physical addresses. + * Different writers write extents to different locations. + */ +class ExtentOolWriter { + using base_ertr = crimson::errorator< + crimson::ct_error::input_output_error>; +public: + virtual ~ExtentOolWriter() {} + + using open_ertr = base_ertr; + virtual open_ertr::future<> open() = 0; + + virtual paddr_t alloc_paddr(extent_len_t length) = 0; + + using alloc_write_ertr = base_ertr; + using alloc_write_iertr = trans_iertr<alloc_write_ertr>; + virtual alloc_write_iertr::future<> alloc_write_ool_extents( + Transaction &t, + std::list<LogicalCachedExtentRef> &extents) = 0; + + using close_ertr = base_ertr; + virtual close_ertr::future<> close() = 0; +}; +using ExtentOolWriterRef = std::unique_ptr<ExtentOolWriter>; + +/** + * SegmentedOolWriter + * + * Different writers write extents to different out-of-line segments provided + * by the SegmentProvider. 
+ */ +class SegmentedOolWriter : public ExtentOolWriter { +public: + SegmentedOolWriter(data_category_t category, + rewrite_gen_t gen, + SegmentProvider &sp, + SegmentSeqAllocator &ssa); + + open_ertr::future<> open() final { + return record_submitter.open(false).discard_result(); + } + + alloc_write_iertr::future<> alloc_write_ool_extents( + Transaction &t, + std::list<LogicalCachedExtentRef> &extents) final; + + close_ertr::future<> close() final { + return write_guard.close().then([this] { + return record_submitter.close(); + }).safe_then([this] { + write_guard = seastar::gate(); + }); + } + + paddr_t alloc_paddr(extent_len_t length) final { + return make_delayed_temp_paddr(0); + } + +private: + alloc_write_iertr::future<> do_write( + Transaction& t, + std::list<LogicalCachedExtentRef> &extent); + + alloc_write_ertr::future<> write_record( + Transaction& t, + record_t&& record, + std::list<LogicalCachedExtentRef> &&extents, + bool with_atomic_roll_segment=false); + + journal::SegmentAllocator segment_allocator; + journal::RecordSubmitter record_submitter; + seastar::gate write_guard; +}; + + +class RandomBlockOolWriter : public ExtentOolWriter { +public: + RandomBlockOolWriter(RBMCleaner* rb_cleaner) : + rb_cleaner(rb_cleaner) {} + + using open_ertr = ExtentOolWriter::open_ertr; + open_ertr::future<> open() final { + return open_ertr::now(); + } + + alloc_write_iertr::future<> alloc_write_ool_extents( + Transaction &t, + std::list<LogicalCachedExtentRef> &extents) final; + + close_ertr::future<> close() final { + return write_guard.close().then([this] { + write_guard = seastar::gate(); + return close_ertr::now(); + }); + } + + paddr_t alloc_paddr(extent_len_t length) final { + assert(rb_cleaner); + return rb_cleaner->alloc_paddr(length); + } + +private: + alloc_write_iertr::future<> do_write( + Transaction& t, + std::list<LogicalCachedExtentRef> &extent); + + RBMCleaner* rb_cleaner; + seastar::gate write_guard; +}; + +struct cleaner_usage_t { + // The size of all extents write to the main devices, including inline extents + // and out-of-line extents. + std::size_t main_usage = 0; + // The size of extents write to the cold devices + std::size_t cold_ool_usage = 0; +}; + +struct reserve_cleaner_result_t { + bool reserve_main_success = true; + bool reserve_cold_success = true; + + bool is_successful() const { + return reserve_main_success && + reserve_cold_success; + } +}; + +/** + * io_usage_t + * + * io_usage_t describes the space usage consumed by client IO. + */ +struct io_usage_t { + // The total size of all inlined extents, not including deltas and other metadata + // produced by Cache::prepare_record. 
+ std::size_t inline_usage = 0; + cleaner_usage_t cleaner_usage; + friend std::ostream &operator<<(std::ostream &out, const io_usage_t &usage) { + return out << "io_usage_t(" + << "inline_usage=" << usage.inline_usage + << ", main_cleaner_usage=" << usage.cleaner_usage.main_usage + << ", cold_cleaner_usage=" << usage.cleaner_usage.cold_ool_usage + << ")"; + } +}; + +struct reserve_io_result_t { + bool reserve_inline_success = true; + reserve_cleaner_result_t cleaner_result; + + bool is_successful() const { + return reserve_inline_success && + cleaner_result.is_successful(); + } +}; + +class ExtentPlacementManager { +public: + ExtentPlacementManager() + : ool_segment_seq_allocator( + std::make_unique<SegmentSeqAllocator>(segment_type_t::OOL)) + { + devices_by_id.resize(DEVICE_ID_MAX, nullptr); + } + + void init(JournalTrimmerImplRef &&, AsyncCleanerRef &&, AsyncCleanerRef &&); + + SegmentSeqAllocator &get_ool_segment_seq_allocator() const { + return *ool_segment_seq_allocator; + } + + void set_primary_device(Device *device); + + void set_extent_callback(ExtentCallbackInterface *cb) { + background_process.set_extent_callback(cb); + } + + journal_type_t get_journal_type() const { + return background_process.get_journal_type(); + } + + extent_len_t get_block_size() const { + assert(primary_device != nullptr); + // assume all the devices have the same block size + return primary_device->get_block_size(); + } + + Device& get_primary_device() { + assert(primary_device != nullptr); + return *primary_device; + } + + store_statfs_t get_stat() const { + return background_process.get_stat(); + } + + using mount_ertr = crimson::errorator< + crimson::ct_error::input_output_error>; + using mount_ret = mount_ertr::future<>; + mount_ret mount() { + return background_process.mount(); + } + + using open_ertr = ExtentOolWriter::open_ertr; + open_ertr::future<> open_for_write(); + + void start_scan_space() { + return background_process.start_scan_space(); + } + + void start_background() { + return background_process.start_background(); + } + + struct alloc_result_t { + paddr_t paddr; + bufferptr bp; + rewrite_gen_t gen; + }; + alloc_result_t alloc_new_extent( + Transaction& t, + extent_types_t type, + extent_len_t length, + placement_hint_t hint, +#ifdef UNIT_TESTS_BUILT + rewrite_gen_t gen, + std::optional<paddr_t> external_paddr = std::nullopt +#else + rewrite_gen_t gen +#endif + ) { + assert(hint < placement_hint_t::NUM_HINTS); + assert(is_target_rewrite_generation(gen)); + assert(gen == INIT_GENERATION || hint == placement_hint_t::REWRITE); + + data_category_t category = get_extent_category(type); + gen = adjust_generation(category, type, hint, gen); + + // XXX: bp might be extended to point to different memory (e.g. PMem) + // according to the allocator. 
+ auto bp = ceph::bufferptr( + buffer::create_page_aligned(length)); + bp.zero(); + paddr_t addr; +#ifdef UNIT_TESTS_BUILT + if (unlikely(external_paddr.has_value())) { + assert(external_paddr->is_fake()); + addr = *external_paddr; + } else if (gen == INLINE_GENERATION) { +#else + if (gen == INLINE_GENERATION) { +#endif + addr = make_record_relative_paddr(0); + } else if (category == data_category_t::DATA) { + assert(data_writers_by_gen[generation_to_writer(gen)]); + addr = data_writers_by_gen[ + generation_to_writer(gen)]->alloc_paddr(length); + } else { + assert(category == data_category_t::METADATA); + assert(md_writers_by_gen[generation_to_writer(gen)]); + addr = md_writers_by_gen[ + generation_to_writer(gen)]->alloc_paddr(length); + } + return {addr, std::move(bp), gen}; + } + + /** + * dispatch_result_t + * + * ool extents are placed in alloc_map and passed to + * EPM::write_delayed_ool_extents, + * delayed_extents is used to update lba mapping. + * usage is used to reserve projected space + */ + using extents_by_writer_t = + std::map<ExtentOolWriter*, std::list<LogicalCachedExtentRef>>; + struct dispatch_result_t { + extents_by_writer_t alloc_map; + std::list<LogicalCachedExtentRef> delayed_extents; + io_usage_t usage; + }; + + /** + * dispatch_delayed_extents + * + * Performs delayed allocation + */ + dispatch_result_t dispatch_delayed_extents(Transaction& t); + + /** + * write_delayed_ool_extents + * + * Do writes for out-of-line extents. + */ + using alloc_paddr_iertr = ExtentOolWriter::alloc_write_iertr; + alloc_paddr_iertr::future<> write_delayed_ool_extents( + Transaction& t, + extents_by_writer_t& alloc_map); + + /** + * write_preallocated_ool_extents + * + * Performs ool writes for extents with pre-allocated addresses. + * See Transaction::pre_alloc_list + */ + alloc_paddr_iertr::future<> write_preallocated_ool_extents( + Transaction &t, + std::list<LogicalCachedExtentRef> extents); + + seastar::future<> stop_background() { + return background_process.stop_background(); + } + + using close_ertr = ExtentOolWriter::close_ertr; + close_ertr::future<> close(); + + using read_ertr = Device::read_ertr; + read_ertr::future<> read( + paddr_t addr, + size_t len, + ceph::bufferptr &out + ) { + assert(devices_by_id[addr.get_device_id()] != nullptr); + return devices_by_id[addr.get_device_id()]->read(addr, len, out); + } + + void mark_space_used(paddr_t addr, extent_len_t len) { + background_process.mark_space_used(addr, len); + } + + void mark_space_free(paddr_t addr, extent_len_t len) { + background_process.mark_space_free(addr, len); + } + + void commit_space_used(paddr_t addr, extent_len_t len) { + return background_process.commit_space_used(addr, len); + } + + seastar::future<> reserve_projected_usage(io_usage_t usage) { + return background_process.reserve_projected_usage(usage); + } + + void release_projected_usage(const io_usage_t &usage) { + background_process.release_projected_usage(usage); + } + + backend_type_t get_main_backend_type() const { + if (!background_process.is_no_background()) { + return background_process.get_main_backend_type(); + } + // for test + assert(primary_device); + return primary_device->get_backend_type(); + } + + // Testing interfaces + + void test_init_no_background(Device *test_device) { + assert(test_device->get_backend_type() == backend_type_t::SEGMENTED); + add_device(test_device); + set_primary_device(test_device); + } + + bool check_usage() { + return background_process.check_usage(); + } + + seastar::future<> run_background_work_until_halt() { 
+ return background_process.run_until_halt(); + } + +private: + rewrite_gen_t adjust_generation( + data_category_t category, + extent_types_t type, + placement_hint_t hint, + rewrite_gen_t gen) { + if (type == extent_types_t::ROOT) { + gen = INLINE_GENERATION; + } else if (get_main_backend_type() == backend_type_t::SEGMENTED && + is_lba_backref_node(type)) { + gen = INLINE_GENERATION; + } else if (hint == placement_hint_t::COLD) { + assert(gen == INIT_GENERATION); + if (background_process.has_cold_tier()) { + gen = MIN_COLD_GENERATION; + } else { + gen = MIN_REWRITE_GENERATION; + } + } else if (gen == INIT_GENERATION) { + if (category == data_category_t::METADATA) { + if (get_main_backend_type() == backend_type_t::SEGMENTED) { + // with SEGMENTED, default not to ool metadata extents to reduce + // padding overhead. + // TODO: improve padding so we can default to the ool path. + gen = INLINE_GENERATION; + } else { + // with RBM, all extents must be OOL + assert(get_main_backend_type() == + backend_type_t::RANDOM_BLOCK); + gen = OOL_GENERATION; + } + } else { + assert(category == data_category_t::DATA); + gen = OOL_GENERATION; + } + } else if (background_process.has_cold_tier()) { + gen = background_process.adjust_generation(gen); + } + + if (gen > dynamic_max_rewrite_generation) { + gen = dynamic_max_rewrite_generation; + } + + return gen; + } + + void add_device(Device *device) { + auto device_id = device->get_device_id(); + ceph_assert(devices_by_id[device_id] == nullptr); + devices_by_id[device_id] = device; + ++num_devices; + } + + /** + * dispatch_delayed_extent + * + * Specify the extent inline or ool + * return true indicates inline otherwise ool + */ + bool dispatch_delayed_extent(LogicalCachedExtentRef& extent) { + // TODO: all delayed extents are ool currently + boost::ignore_unused(extent); + return false; + } + + ExtentOolWriter* get_writer(placement_hint_t hint, + data_category_t category, + rewrite_gen_t gen) { + assert(hint < placement_hint_t::NUM_HINTS); + assert(is_rewrite_generation(gen)); + assert(gen != INLINE_GENERATION); + assert(gen <= dynamic_max_rewrite_generation); + if (category == data_category_t::DATA) { + return data_writers_by_gen[generation_to_writer(gen)]; + } else { + assert(category == data_category_t::METADATA); + return md_writers_by_gen[generation_to_writer(gen)]; + } + } + + /** + * BackgroundProcess + * + * Background process to schedule background transactions. 
+ * + * TODO: device tiering + */ + class BackgroundProcess : public BackgroundListener { + public: + BackgroundProcess() = default; + + void init(JournalTrimmerImplRef &&_trimmer, + AsyncCleanerRef &&_cleaner, + AsyncCleanerRef &&_cold_cleaner) { + trimmer = std::move(_trimmer); + trimmer->set_background_callback(this); + main_cleaner = std::move(_cleaner); + main_cleaner->set_background_callback(this); + if (_cold_cleaner) { + cold_cleaner = std::move(_cold_cleaner); + cold_cleaner->set_background_callback(this); + + cleaners_by_device_id.resize(DEVICE_ID_MAX, nullptr); + for (auto id : main_cleaner->get_device_ids()) { + cleaners_by_device_id[id] = main_cleaner.get(); + } + for (auto id : cold_cleaner->get_device_ids()) { + cleaners_by_device_id[id] = cold_cleaner.get(); + } + + eviction_state.init( + crimson::common::get_conf<double>( + "seastore_multiple_tiers_stop_evict_ratio"), + crimson::common::get_conf<double>( + "seastore_multiple_tiers_default_evict_ratio"), + crimson::common::get_conf<double>( + "seastore_multiple_tiers_fast_evict_ratio")); + } + } + + journal_type_t get_journal_type() const { + return trimmer->get_journal_type(); + } + + bool has_cold_tier() const { + return cold_cleaner.get() != nullptr; + } + + void set_extent_callback(ExtentCallbackInterface *cb) { + trimmer->set_extent_callback(cb); + main_cleaner->set_extent_callback(cb); + if (has_cold_tier()) { + cold_cleaner->set_extent_callback(cb); + } + } + + store_statfs_t get_stat() const { + auto stat = main_cleaner->get_stat(); + if (has_cold_tier()) { + stat.add(cold_cleaner->get_stat()); + } + return stat; + } + + using mount_ret = ExtentPlacementManager::mount_ret; + mount_ret mount() { + ceph_assert(state == state_t::STOP); + state = state_t::MOUNT; + trimmer->reset(); + stats = {}; + register_metrics(); + return main_cleaner->mount( + ).safe_then([this] { + return has_cold_tier() ? 
cold_cleaner->mount() : mount_ertr::now(); + }); + } + + void start_scan_space() { + ceph_assert(state == state_t::MOUNT); + state = state_t::SCAN_SPACE; + ceph_assert(main_cleaner->check_usage_is_empty()); + ceph_assert(!has_cold_tier() || + cold_cleaner->check_usage_is_empty()); + } + + void start_background(); + + void mark_space_used(paddr_t addr, extent_len_t len) { + if (state < state_t::SCAN_SPACE) { + return; + } + + if (!has_cold_tier()) { + assert(main_cleaner); + main_cleaner->mark_space_used(addr, len); + } else { + auto id = addr.get_device_id(); + assert(id < cleaners_by_device_id.size()); + auto cleaner = cleaners_by_device_id[id]; + assert(cleaner); + cleaner->mark_space_used(addr, len); + } + } + + void mark_space_free(paddr_t addr, extent_len_t len) { + if (state < state_t::SCAN_SPACE) { + return; + } + + if (!has_cold_tier()) { + assert(main_cleaner); + main_cleaner->mark_space_free(addr, len); + } else { + auto id = addr.get_device_id(); + assert(id < cleaners_by_device_id.size()); + auto cleaner = cleaners_by_device_id[id]; + assert(cleaner); + cleaner->mark_space_free(addr, len); + } + } + + void commit_space_used(paddr_t addr, extent_len_t len) { + if (state < state_t::SCAN_SPACE) { + return; + } + + if (!has_cold_tier()) { + assert(main_cleaner); + main_cleaner->commit_space_used(addr, len); + } else { + auto id = addr.get_device_id(); + assert(id < cleaners_by_device_id.size()); + auto cleaner = cleaners_by_device_id[id]; + assert(cleaner); + cleaner->commit_space_used(addr, len); + } + } + + rewrite_gen_t adjust_generation(rewrite_gen_t gen) { + if (has_cold_tier()) { + return eviction_state.adjust_generation_with_eviction(gen); + } else { + return gen; + } + } + + seastar::future<> reserve_projected_usage(io_usage_t usage); + + void release_projected_usage(const io_usage_t &usage) { + if (is_ready()) { + trimmer->release_inline_usage(usage.inline_usage); + main_cleaner->release_projected_usage(usage.cleaner_usage.main_usage); + if (has_cold_tier()) { + cold_cleaner->release_projected_usage(usage.cleaner_usage.cold_ool_usage); + } + } + } + + seastar::future<> stop_background(); + backend_type_t get_main_backend_type() const { + return get_journal_type(); + } + + // Testing interfaces + + bool check_usage() { + return main_cleaner->check_usage() && + (!has_cold_tier() || cold_cleaner->check_usage()); + } + + seastar::future<> run_until_halt(); + + bool is_no_background() const { + return !trimmer || !main_cleaner; + } + + protected: + state_t get_state() const final { + return state; + } + + void maybe_wake_background() final { + if (!is_running()) { + return; + } + if (background_should_run()) { + do_wake_background(); + } + } + + void maybe_wake_blocked_io() final { + if (!is_ready()) { + return; + } + if (!should_block_io() && blocking_io) { + blocking_io->set_value(); + blocking_io = std::nullopt; + } + } + + private: + // reserve helpers + bool try_reserve_cold(std::size_t usage); + void abort_cold_usage(std::size_t usage, bool success); + + reserve_cleaner_result_t try_reserve_cleaner(const cleaner_usage_t &usage); + void abort_cleaner_usage(const cleaner_usage_t &usage, + const reserve_cleaner_result_t &result); + + reserve_io_result_t try_reserve_io(const io_usage_t &usage); + void abort_io_usage(const io_usage_t &usage, + const reserve_io_result_t &result); + + bool is_running() const { + if (state == state_t::RUNNING) { + assert(process_join); + return true; + } else { + assert(!process_join); + return false; + } + } + + void log_state(const char 
*caller) const; + + seastar::future<> run(); + + void do_wake_background() { + if (blocking_background) { + blocking_background->set_value(); + blocking_background = std::nullopt; + } + } + + // background_should_run() should be atomic with do_background_cycle() + // to make sure the condition is consistent. + bool background_should_run() { + assert(is_ready()); + maybe_update_eviction_mode(); + return main_cleaner_should_run() + || cold_cleaner_should_run() + || trimmer->should_trim(); + } + + bool main_cleaner_should_run() const { + assert(is_ready()); + return main_cleaner->should_clean_space() || + (has_cold_tier() && + main_cleaner->can_clean_space() && + eviction_state.is_fast_mode()); + } + + bool cold_cleaner_should_run() const { + assert(is_ready()); + return has_cold_tier() && + cold_cleaner->should_clean_space(); + } + + bool should_block_io() const { + assert(is_ready()); + return trimmer->should_block_io_on_trim() || + main_cleaner->should_block_io_on_clean() || + (has_cold_tier() && + cold_cleaner->should_block_io_on_clean()); + } + + void maybe_update_eviction_mode() { + if (has_cold_tier()) { + auto main_alive_ratio = main_cleaner->get_stat().get_used_raw_ratio(); + eviction_state.maybe_update_eviction_mode(main_alive_ratio); + } + } + + struct eviction_state_t { + enum class eviction_mode_t { + STOP, // generation greater than or equal to MIN_COLD_GENERATION + // will be set to MIN_COLD_GENERATION - 1, which means + // no extents will be evicted. + DEFAULT, // generation incremented with each rewrite. Extents will + // be evicted when generation reaches MIN_COLD_GENERATION. + FAST, // map all generations located in + // [MIN_REWRITE_GENERATION, MIN_COLD_GENERATIOIN) to + // MIN_COLD_GENERATION. + }; + + eviction_mode_t eviction_mode; + double stop_evict_ratio; + double default_evict_ratio; + double fast_evict_ratio; + + void init(double stop_ratio, + double default_ratio, + double fast_ratio) { + ceph_assert(0 <= stop_ratio); + ceph_assert(stop_ratio < default_ratio); + ceph_assert(default_ratio < fast_ratio); + ceph_assert(fast_ratio <= 1); + eviction_mode = eviction_mode_t::STOP; + stop_evict_ratio = stop_ratio; + default_evict_ratio = default_ratio; + fast_evict_ratio = fast_ratio; + } + + bool is_stop_mode() const { + return eviction_mode == eviction_mode_t::STOP; + } + + bool is_default_mode() const { + return eviction_mode == eviction_mode_t::DEFAULT; + } + + bool is_fast_mode() const { + return eviction_mode == eviction_mode_t::FAST; + } + + rewrite_gen_t adjust_generation_with_eviction(rewrite_gen_t gen) { + rewrite_gen_t ret = gen; + switch(eviction_mode) { + case eviction_mode_t::STOP: + if (gen == MIN_COLD_GENERATION) { + ret = MIN_COLD_GENERATION - 1; + } + break; + case eviction_mode_t::DEFAULT: + break; + case eviction_mode_t::FAST: + if (gen >= MIN_REWRITE_GENERATION && gen < MIN_COLD_GENERATION) { + ret = MIN_COLD_GENERATION; + } + break; + default: + ceph_abort("impossible"); + } + return ret; + } + + // We change the state of eviction_mode according to the alive ratio + // of the main cleaner. 
+ // + // Use A, B, C, D to represent the state of alive ratio: + // A: alive ratio <= stop_evict_ratio + // B: alive ratio <= default_evict_ratio + // C: alive ratio <= fast_evict_ratio + // D: alive ratio > fast_evict_ratio + // + // and use X, Y, Z to shorten the state of eviction_mode_t: + // X: STOP + // Y: DEFAULT + // Z: FAST + // + // Then we can use a form like (A && X) to describe the current state + // of the main cleaner, which indicates the alive ratio is less than or + // equal to stop_evict_ratio and current eviction mode is STOP. + // + // all valid state transitions show as follow: + // (A && X) => (B && X) => (C && Y) => (D && Z) => + // (C && Z) => (B && Y) => (A && X) + // `--> (C && Y) => ... + // + // when the system restarts, the init state is (_ && X), the + // transitions should be: + // (_ && X) -> (A && X) => normal transition + // -> (B && X) => normal transition + // -> (C && X) => (C && Y) => normal transition + // -> (D && X) => (D && Z) => normal transition + void maybe_update_eviction_mode(double main_alive_ratio) { + if (main_alive_ratio <= stop_evict_ratio) { + eviction_mode = eviction_mode_t::STOP; + } else if (main_alive_ratio <= default_evict_ratio) { + if (eviction_mode > eviction_mode_t::DEFAULT) { + eviction_mode = eviction_mode_t::DEFAULT; + } + } else if (main_alive_ratio <= fast_evict_ratio) { + if (eviction_mode < eviction_mode_t::DEFAULT) { + eviction_mode = eviction_mode_t::DEFAULT; + } + } else { + assert(main_alive_ratio > fast_evict_ratio); + eviction_mode = eviction_mode_t::FAST; + } + } + }; + + seastar::future<> do_background_cycle(); + + void register_metrics(); + + struct { + uint64_t io_blocking_num = 0; + uint64_t io_count = 0; + uint64_t io_blocked_count = 0; + uint64_t io_blocked_count_trim = 0; + uint64_t io_blocked_count_clean = 0; + uint64_t io_blocked_sum = 0; + } stats; + seastar::metrics::metric_group metrics; + + JournalTrimmerImplRef trimmer; + AsyncCleanerRef main_cleaner; + + /* + * cold tier (optional, see has_cold_tier()) + */ + AsyncCleanerRef cold_cleaner; + std::vector<AsyncCleaner*> cleaners_by_device_id; + + std::optional<seastar::future<>> process_join; + std::optional<seastar::promise<>> blocking_background; + std::optional<seastar::promise<>> blocking_io; + bool is_running_until_halt = false; + state_t state = state_t::STOP; + eviction_state_t eviction_state; + + friend class ::transaction_manager_test_t; + }; + + std::vector<ExtentOolWriterRef> writer_refs; + std::vector<ExtentOolWriter*> data_writers_by_gen; + // gen 0 METADATA writer is the journal writer + std::vector<ExtentOolWriter*> md_writers_by_gen; + + std::vector<Device*> devices_by_id; + Device* primary_device = nullptr; + std::size_t num_devices = 0; + + rewrite_gen_t dynamic_max_rewrite_generation = REWRITE_GENERATIONS; + BackgroundProcess background_process; + // TODO: drop once paddr->journal_seq_t is introduced + SegmentSeqAllocatorRef ool_segment_seq_allocator; + + friend class ::transaction_manager_test_t; +}; + +using ExtentPlacementManagerRef = std::unique_ptr<ExtentPlacementManager>; + +} + +#if FMT_VERSION >= 90000 +template <> struct fmt::formatter<crimson::os::seastore::io_usage_t> : fmt::ostream_formatter {}; +#endif diff --git a/src/crimson/os/seastore/extentmap_manager.cc b/src/crimson/os/seastore/extentmap_manager.cc new file mode 100644 index 000000000..b0dc1b8c8 --- /dev/null +++ b/src/crimson/os/seastore/extentmap_manager.cc @@ -0,0 +1,33 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 
smarttab +#include <experimental/iterator> +#include <iostream> + +#include "crimson/os/seastore/transaction_manager.h" +#include "crimson/os/seastore/extentmap_manager.h" +#include "crimson/os/seastore/extentmap_manager/btree/btree_extentmap_manager.h" +namespace crimson::os::seastore::extentmap_manager { + +ExtentMapManagerRef create_extentmap_manager( + TransactionManager &trans_manager) { + return ExtentMapManagerRef(new BtreeExtentMapManager(trans_manager)); +} + +} + +namespace crimson::os::seastore { + +std::ostream &operator<<(std::ostream &out, const extent_mapping_t &rhs) +{ + return out << "extent_mapping_t (" << rhs.logical_offset << "~" << rhs.length + << "->" << rhs.laddr << ")"; +} + +std::ostream &operator<<(std::ostream &out, const extent_map_list_t &rhs) +{ + out << '['; + std::copy(std::begin(rhs), std::end(rhs), std::experimental::make_ostream_joiner(out, ", ")); + return out << ']'; +} + +} diff --git a/src/crimson/os/seastore/journal.cc b/src/crimson/os/seastore/journal.cc new file mode 100644 index 000000000..d4714cf3f --- /dev/null +++ b/src/crimson/os/seastore/journal.cc @@ -0,0 +1,25 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "journal.h" +#include "journal/segmented_journal.h" +#include "journal/circular_bounded_journal.h" + +namespace crimson::os::seastore::journal { + +JournalRef make_segmented( + SegmentProvider &provider, + JournalTrimmer &trimmer) +{ + return std::make_unique<SegmentedJournal>(provider, trimmer); +} + +JournalRef make_circularbounded( + JournalTrimmer &trimmer, + crimson::os::seastore::random_block_device::RBMDevice* device, + std::string path) +{ + return std::make_unique<CircularBoundedJournal>(trimmer, device, path); +} + +} diff --git a/src/crimson/os/seastore/journal.h b/src/crimson/os/seastore/journal.h new file mode 100644 index 000000000..18c0797a8 --- /dev/null +++ b/src/crimson/os/seastore/journal.h @@ -0,0 +1,122 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <memory> + +#include "crimson/os/seastore/ordering_handle.h" +#include "crimson/os/seastore/seastore_types.h" +#include "crimson/os/seastore/segment_seq_allocator.h" + +namespace crimson::os::seastore { + +namespace random_block_device { +class RBMDevice; +} + +class SegmentManagerGroup; +class SegmentProvider; +class JournalTrimmer; + +class Journal { +public: + virtual JournalTrimmer &get_trimmer() = 0; + /** + * initializes journal for mkfs writes -- must run prior to calls + * to submit_record. + */ + using open_for_mkfs_ertr = crimson::errorator< + crimson::ct_error::input_output_error + >; + using open_for_mkfs_ret = open_for_mkfs_ertr::future<journal_seq_t>; + virtual open_for_mkfs_ret open_for_mkfs() = 0; + + /** + * initializes journal for new writes -- must run prior to calls + * to submit_record. Should be called after replay if not a new + * Journal. 
+ */ + using open_for_mount_ertr = open_for_mkfs_ertr; + using open_for_mount_ret = open_for_mkfs_ret; + virtual open_for_mount_ret open_for_mount() = 0; + + /// close journal + using close_ertr = crimson::errorator< + crimson::ct_error::input_output_error>; + virtual close_ertr::future<> close() = 0; + + /** + * submit_record + * + * write record with the ordering handle + */ + using submit_record_ertr = crimson::errorator< + crimson::ct_error::erange, + crimson::ct_error::input_output_error + >; + using submit_record_ret = submit_record_ertr::future< + record_locator_t + >; + virtual submit_record_ret submit_record( + record_t &&record, + OrderingHandle &handle + ) = 0; + + /** + * flush + * + * Wait for all outstanding IOs on handle to commit. + * Note, flush() machinery must go through the same pipeline + * stages and locks as submit_record. + */ + virtual seastar::future<> flush(OrderingHandle &handle) = 0; + + /// sets write pipeline reference + virtual void set_write_pipeline(WritePipeline *_write_pipeline) = 0; + + /** + * Read deltas and pass to delta_handler + * + * record_block_start (argument to delta_handler) is the start of the + * of the first block in the record + */ + using replay_ertr = crimson::errorator< + crimson::ct_error::input_output_error, + crimson::ct_error::invarg, + crimson::ct_error::enoent, + crimson::ct_error::erange>; + using replay_ret = replay_ertr::future<>; + using delta_handler_t = std::function< + replay_ertr::future<bool>( + const record_locator_t&, + const delta_info_t&, + const journal_seq_t&, // dirty_tail + const journal_seq_t&, // alloc_tail + sea_time_point modify_time)>; + virtual replay_ret replay( + delta_handler_t &&delta_handler) = 0; + + virtual seastar::future<> finish_commit( + transaction_type_t type) = 0; + + virtual ~Journal() {} + + virtual journal_type_t get_type() = 0; +}; +using JournalRef = std::unique_ptr<Journal>; + +namespace journal { + +JournalRef make_segmented( + SegmentProvider &provider, + JournalTrimmer &trimmer); + +JournalRef make_circularbounded( + JournalTrimmer &trimmer, + crimson::os::seastore::random_block_device::RBMDevice* device, + std::string path); + +} + +} diff --git a/src/crimson/os/seastore/journal/circular_bounded_journal.cc b/src/crimson/os/seastore/journal/circular_bounded_journal.cc new file mode 100644 index 000000000..ec41bfab1 --- /dev/null +++ b/src/crimson/os/seastore/journal/circular_bounded_journal.cc @@ -0,0 +1,387 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <boost/iterator/counting_iterator.hpp> + +#include "crimson/common/errorator-loop.h" +#include "include/intarith.h" +#include "crimson/os/seastore/async_cleaner.h" +#include "crimson/os/seastore/journal/circular_bounded_journal.h" +#include "crimson/os/seastore/logging.h" +#include "crimson/os/seastore/journal/circular_journal_space.h" + +SET_SUBSYS(seastore_journal); + +namespace crimson::os::seastore::journal { + +CircularBoundedJournal::CircularBoundedJournal( + JournalTrimmer &trimmer, + RBMDevice* device, + const std::string &path) + : trimmer(trimmer), path(path), + cjs(device), + record_submitter(crimson::common::get_conf<uint64_t>( + "seastore_journal_iodepth_limit"), + crimson::common::get_conf<uint64_t>( + "seastore_journal_batch_capacity"), + crimson::common::get_conf<Option::size_t>( + "seastore_journal_batch_flush_size"), + crimson::common::get_conf<double>( + "seastore_journal_batch_preferred_fullness"), + cjs) + {} + 
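+// Usage sketch (illustrative only; caller-side names such as "trimmer",
+// "rbm_device" and "pipeline" are assumed): the journal is normally obtained
+// through the factory declared in journal.h rather than by constructing this
+// class directly, e.g.
+//
+//   JournalRef j = journal::make_circularbounded(trimmer, rbm_device, "");
+//   j->set_write_pipeline(&pipeline);
+//   return j->open_for_mount();  // or open_for_mkfs() on a fresh device
+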
+CircularBoundedJournal::open_for_mkfs_ret +CircularBoundedJournal::open_for_mkfs() +{ + return record_submitter.open(true + ).safe_then([this](auto ret) { + return open_for_mkfs_ret( + open_for_mkfs_ertr::ready_future_marker{}, + get_written_to()); + }); +} + +CircularBoundedJournal::open_for_mount_ret +CircularBoundedJournal::open_for_mount() +{ + return record_submitter.open(false + ).safe_then([this](auto ret) { + return open_for_mount_ret( + open_for_mount_ertr::ready_future_marker{}, + get_written_to()); + }); +} + +CircularBoundedJournal::close_ertr::future<> CircularBoundedJournal::close() +{ + return record_submitter.close(); +} + +CircularBoundedJournal::submit_record_ret +CircularBoundedJournal::submit_record( + record_t &&record, + OrderingHandle &handle) +{ + LOG_PREFIX(CircularBoundedJournal::submit_record); + DEBUG("H{} {} start ...", (void*)&handle, record); + assert(write_pipeline); + return do_submit_record(std::move(record), handle); +} + +CircularBoundedJournal::submit_record_ret +CircularBoundedJournal::do_submit_record( + record_t &&record, + OrderingHandle &handle) +{ + LOG_PREFIX(CircularBoundedJournal::do_submit_record); + if (!record_submitter.is_available()) { + DEBUG("H{} wait ...", (void*)&handle); + return record_submitter.wait_available( + ).safe_then([this, record=std::move(record), &handle]() mutable { + return do_submit_record(std::move(record), handle); + }); + } + auto action = record_submitter.check_action(record.size); + if (action == RecordSubmitter::action_t::ROLL) { + return record_submitter.roll_segment( + ).safe_then([this, record=std::move(record), &handle]() mutable { + return do_submit_record(std::move(record), handle); + }); + } + + DEBUG("H{} submit {} ...", + (void*)&handle, + action == RecordSubmitter::action_t::SUBMIT_FULL ? 
+ "FULL" : "NOT_FULL"); + auto submit_fut = record_submitter.submit(std::move(record)); + return handle.enter(write_pipeline->device_submission + ).then([submit_fut=std::move(submit_fut)]() mutable { + return std::move(submit_fut); + }).safe_then([FNAME, this, &handle](record_locator_t result) { + return handle.enter(write_pipeline->finalize + ).then([FNAME, this, result, &handle] { + DEBUG("H{} finish with {}", (void*)&handle, result); + auto new_committed_to = result.write_result.get_end_seq(); + record_submitter.update_committed_to(new_committed_to); + return result; + }); + }); +} + +Journal::replay_ret CircularBoundedJournal::replay_segment( + cbj_delta_handler_t &handler, scan_valid_records_cursor& cursor) +{ + LOG_PREFIX(Journal::replay_segment); + return seastar::do_with( + RecordScanner::found_record_handler_t( + [this, &handler, FNAME]( + record_locator_t locator, + const record_group_header_t& r_header, + const bufferlist& mdbuf) + -> RecordScanner::scan_valid_records_ertr::future<> + { + auto maybe_record_deltas_list = try_decode_deltas( + r_header, mdbuf, locator.record_block_base); + if (!maybe_record_deltas_list) { + // This should be impossible, we did check the crc on the mdbuf + ERROR("unable to decode deltas for record {} at {}", + r_header, locator.record_block_base); + return crimson::ct_error::input_output_error::make(); + } + assert(locator.write_result.start_seq != JOURNAL_SEQ_NULL); + auto cursor_addr = convert_paddr_to_abs_addr(locator.write_result.start_seq.offset); + DEBUG("{} at {}", r_header, cursor_addr); + journal_seq_t start_seq = locator.write_result.start_seq; + auto write_result = write_result_t{ + start_seq, + r_header.mdlength + r_header.dlength + }; + auto expected_seq = locator.write_result.start_seq.segment_seq; + cursor_addr += (r_header.mdlength + r_header.dlength); + if (cursor_addr >= get_journal_end()) { + cursor_addr = get_records_start(); + ++expected_seq; + paddr_t addr = convert_abs_addr_to_paddr( + cursor_addr, + get_device_id()); + write_result.start_seq.offset = addr; + write_result.start_seq.segment_seq = expected_seq; + } + paddr_t addr = convert_abs_addr_to_paddr( + cursor_addr, + get_device_id()); + set_written_to( + journal_seq_t{expected_seq, addr}); + return seastar::do_with( + std::move(*maybe_record_deltas_list), + [write_result, + &handler, + FNAME](auto& record_deltas_list) { + return crimson::do_for_each( + record_deltas_list, + [write_result, + &handler, FNAME](record_deltas_t& record_deltas) { + auto locator = record_locator_t{ + record_deltas.record_block_base, + write_result + }; + DEBUG("processing {} deltas at block_base {}", + record_deltas.deltas.size(), + locator); + return crimson::do_for_each( + record_deltas.deltas, + [locator, + &handler](auto& p) { + auto& modify_time = p.first; + auto& delta = p.second; + return handler( + locator, + delta, + modify_time).discard_result(); + }); + }); + }); + }), + [=, this, &cursor](auto &dhandler) { + return scan_valid_records( + cursor, + cjs.get_cbj_header().magic, + std::numeric_limits<size_t>::max(), + dhandler).safe_then([](auto){} + ).handle_error( + replay_ertr::pass_further{}, + crimson::ct_error::assert_all{ + "shouldn't meet with any other error other replay_ertr" + } + ); + } + ); +} + + +Journal::replay_ret CircularBoundedJournal::scan_valid_record_delta( + cbj_delta_handler_t &&handler, journal_seq_t tail) +{ + LOG_PREFIX(Journal::scan_valid_record_delta); + INFO("starting at {} ", tail); + return seastar::do_with( + scan_valid_records_cursor(tail), + 
std::move(handler),
+    bool(false),
+    [this] (auto &cursor, auto &handler, auto &rolled) {
+    return crimson::repeat([this, &handler, &cursor, &rolled]()
+      -> replay_ertr::future<seastar::stop_iteration>
+    {
+      return replay_segment(handler, cursor
+      ).safe_then([this, &cursor, &rolled] {
+	if (!rolled) {
+	  cursor.last_valid_header_found = false;
+	}
+	if (!cursor.is_complete()) {
+	  try_read_rolled_header(cursor);
+	  rolled = true;
+	  return replay_ertr::make_ready_future<
+	    seastar::stop_iteration>(seastar::stop_iteration::no);
+	}
+	return replay_ertr::make_ready_future<
+	  seastar::stop_iteration>(seastar::stop_iteration::yes);
+      });
+    });
+  });
+}
+
+RecordScanner::read_ret CircularBoundedJournal::read(paddr_t start, size_t len)
+{
+  LOG_PREFIX(CircularBoundedJournal::read);
+  rbm_abs_addr addr = convert_paddr_to_abs_addr(start);
+  DEBUG("reading data from addr {} read length {}", addr, len);
+  auto bptr = bufferptr(ceph::buffer::create_page_aligned(len));
+  return cjs.read(addr, bptr
+  ).safe_then([bptr=std::move(bptr)]() {
+    return read_ret(
+      RecordScanner::read_ertr::ready_future_marker{},
+      std::move(bptr)
+    );
+  });
+}
+
+bool CircularBoundedJournal::is_record_segment_seq_invalid(
+  scan_valid_records_cursor &cursor,
+  record_group_header_t &r_header)
+{
+  LOG_PREFIX(CircularBoundedJournal::is_record_segment_seq_invalid);
+  auto print_invalid = [FNAME](auto &r_header) {
+    DEBUG("invalid header: {}", r_header);
+    return true;
+  };
+  if (cursor.seq.offset == convert_abs_addr_to_paddr(
+      get_records_start(), get_device_id())) {
+    if ((r_header.committed_to.segment_seq == NULL_SEG_SEQ &&
+	 cursor.seq.segment_seq != 0) ||
+	r_header.committed_to.segment_seq != cursor.seq.segment_seq - 1) {
+      return print_invalid(r_header);
+    }
+  } else if (r_header.committed_to.segment_seq != cursor.seq.segment_seq) {
+    /*
+     * Assuming that seastore issues several records using submit_record()
+     * as shown in the following example.
+     *
+     * Example )
+     * a. submit_record(a);
+     * b. submit_record(b);
+     * c. submit_record(c);
+     * d. roll to begin
+     * e. submit_record(d);
+     * f. submit_record(e);
+     * g. submit_record(f);
+     *
+     * In this example, we need to consider the following two cases.
+     * case 1)
+     *   records a - e were issued in a batched manner
+     * case 2)
+     *   When submit_record(e) starts at step f, submit(b) has completed its finalize phase,
+     *   so the header of e's committed_to points to the end of b.
+     *
+     * To handle these cases correctly, the following condition is added.
+ */ + if ((r_header.committed_to.offset >= cursor.last_committed.offset && + r_header.committed_to.segment_seq == cursor.last_committed.segment_seq) && + r_header.committed_to.segment_seq == cursor.seq.segment_seq - 1) { + return false; + } + return print_invalid(r_header); + } + return false; +} + +Journal::replay_ret CircularBoundedJournal::replay( + delta_handler_t &&delta_handler) +{ + /* + * read records from last applied record prior to written_to, and replay + */ + LOG_PREFIX(CircularBoundedJournal::replay); + return cjs.read_header( + ).handle_error( + open_for_mount_ertr::pass_further{}, + crimson::ct_error::assert_all{ + "Invalid error read_header" + }).safe_then([this, FNAME, delta_handler=std::move(delta_handler)](auto p) + mutable { + auto &[head, bl] = *p; + cjs.set_cbj_header(head); + DEBUG("header : {}", cjs.get_cbj_header()); + cjs.set_initialized(true); + return seastar::do_with( + std::move(delta_handler), + std::map<paddr_t, journal_seq_t>(), + [this](auto &d_handler, auto &map) { + auto build_paddr_seq_map = [&map]( + const auto &offsets, + const auto &e, + sea_time_point modify_time) + { + if (e.type == extent_types_t::ALLOC_INFO) { + alloc_delta_t alloc_delta; + decode(alloc_delta, e.bl); + if (alloc_delta.op == alloc_delta_t::op_types_t::CLEAR) { + for (auto &alloc_blk : alloc_delta.alloc_blk_ranges) { + map[alloc_blk.paddr] = offsets.write_result.start_seq; + } + } + } + return replay_ertr::make_ready_future<bool>(true); + }; + auto tail = get_dirty_tail() <= get_alloc_tail() ? + get_dirty_tail() : get_alloc_tail(); + set_written_to(tail); + // The first pass to build the paddr->journal_seq_t map + // from extent allocations + return scan_valid_record_delta(std::move(build_paddr_seq_map), tail + ).safe_then([this, &map, &d_handler, tail]() { + auto call_d_handler_if_valid = [this, &map, &d_handler]( + const auto &offsets, + const auto &e, + sea_time_point modify_time) + { + if (map.find(e.paddr) == map.end() || + map[e.paddr] <= offsets.write_result.start_seq) { + return d_handler( + offsets, + e, + get_dirty_tail(), + get_alloc_tail(), + modify_time + ); + } + return replay_ertr::make_ready_future<bool>(true); + }; + // The second pass to replay deltas + return scan_valid_record_delta(std::move(call_d_handler_if_valid), tail); + }); + }).safe_then([this]() { + // make sure that committed_to is JOURNAL_SEQ_NULL if jounal is the initial state + if (get_written_to() != + journal_seq_t{0, + convert_abs_addr_to_paddr(get_records_start(), + get_device_id())}) { + record_submitter.update_committed_to(get_written_to()); + } + trimmer.update_journal_tails( + get_dirty_tail(), + get_alloc_tail()); + }); + }); +} + +seastar::future<> CircularBoundedJournal::finish_commit(transaction_type_t type) { + if (is_trim_transaction(type)) { + return update_journal_tail( + trimmer.get_dirty_tail(), + trimmer.get_alloc_tail()); + } + return seastar::now(); +} + +} diff --git a/src/crimson/os/seastore/journal/circular_bounded_journal.h b/src/crimson/os/seastore/journal/circular_bounded_journal.h new file mode 100644 index 000000000..debe535ae --- /dev/null +++ b/src/crimson/os/seastore/journal/circular_bounded_journal.h @@ -0,0 +1,224 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "crimson/common/log.h" + +#include <boost/intrusive_ptr.hpp> + +#include <seastar/core/future.hh> + +#include "include/ceph_assert.h" +#include "include/buffer.h" +#include "include/denc.h" + +#include 
"crimson/osd/exceptions.h" +#include "crimson/os/seastore/journal.h" +#include "include/uuid.h" +#include "crimson/os/seastore/random_block_manager.h" +#include "crimson/os/seastore/random_block_manager/rbm_device.h" +#include <list> +#include "crimson/os/seastore/journal/record_submitter.h" +#include "crimson/os/seastore/journal/circular_journal_space.h" +#include "crimson/os/seastore/record_scanner.h" + +namespace crimson::os::seastore::journal { + +using RBMDevice = random_block_device::RBMDevice; + +/** + * CircularBoundedJournal + * + * + * CircularBoundedJournal (CBJournal) is the journal that works like circular + * queue. With CBJournal, Seastore will append some of the records if the size + * of the record is small (most likely metadata), at which point the head + * (written_to) will be moved. Then, eventually, Seastore applies the records + * in CBjournal to RBM (TODO). + * + * - Commit time + * After submit_record is done, written_to is increased(this in-memory value) + * ---written_to represents where the new record will be appended. Note that + * applied_to is not changed here. + * + * - Replay time + * At replay time, CBJournal begins to replay records in CBjournal by reading + * records from dirty_tail. Then, CBJournal examines whether the records is valid + * one by one, at which point written_to is recovered + * if the valid record is founded. Note that applied_to is stored + * permanently when the apply work---applying the records in CBJournal to RBM--- + * is done by CBJournal (TODO). + * + * TODO: apply records from CircularBoundedJournal to RandomBlockManager + * + */ + +constexpr uint64_t DEFAULT_BLOCK_SIZE = 4096; + +class CircularBoundedJournal : public Journal, RecordScanner { +public: + CircularBoundedJournal( + JournalTrimmer &trimmer, RBMDevice* device, const std::string &path); + ~CircularBoundedJournal() {} + + JournalTrimmer &get_trimmer() final { + return trimmer; + } + + open_for_mkfs_ret open_for_mkfs() final; + + open_for_mount_ret open_for_mount() final; + + close_ertr::future<> close() final; + + journal_type_t get_type() final { + return journal_type_t::RANDOM_BLOCK; + } + + submit_record_ret submit_record( + record_t &&record, + OrderingHandle &handle + ) final; + + seastar::future<> flush( + OrderingHandle &handle + ) final { + // TODO + return seastar::now(); + } + + replay_ret replay(delta_handler_t &&delta_handler) final; + + rbm_abs_addr get_rbm_addr(journal_seq_t seq) const { + return convert_paddr_to_abs_addr(seq.offset); + } + + /** + * + * CircularBoundedJournal write + * + * NVMe will support a large block write (< 512KB) with atomic write unit command. + * With this command, we expect that the most of incoming data can be stored + * as a single write call, which has lower overhead than existing + * way that uses a combination of system calls such as write() and sync(). 
+ * + */ + + seastar::future<> update_journal_tail( + journal_seq_t dirty, + journal_seq_t alloc) { + return cjs.update_journal_tail(dirty, alloc); + } + journal_seq_t get_dirty_tail() const { + return cjs.get_dirty_tail(); + } + journal_seq_t get_alloc_tail() const { + return cjs.get_alloc_tail(); + } + + void set_write_pipeline(WritePipeline *_write_pipeline) final { + write_pipeline = _write_pipeline; + } + + device_id_t get_device_id() const { + return cjs.get_device_id(); + } + extent_len_t get_block_size() const { + return cjs.get_block_size(); + } + + rbm_abs_addr get_journal_end() const { + return cjs.get_journal_end(); + } + + void set_written_to(journal_seq_t seq) { + cjs.set_written_to(seq); + } + + journal_seq_t get_written_to() { + return cjs.get_written_to(); + } + + rbm_abs_addr get_records_start() const { + return cjs.get_records_start(); + } + + seastar::future<> finish_commit(transaction_type_t type) final; + + using cbj_delta_handler_t = std::function< + replay_ertr::future<bool>( + const record_locator_t&, + const delta_info_t&, + sea_time_point modify_time)>; + + Journal::replay_ret scan_valid_record_delta( + cbj_delta_handler_t &&delta_handler, + journal_seq_t tail); + + submit_record_ret do_submit_record(record_t &&record, OrderingHandle &handle); + + void try_read_rolled_header(scan_valid_records_cursor &cursor) { + paddr_t addr = convert_abs_addr_to_paddr( + get_records_start(), + get_device_id()); + cursor.seq.offset = addr; + cursor.seq.segment_seq += 1; + } + + void initialize_cursor(scan_valid_records_cursor& cursor) final { + cursor.block_size = get_block_size(); + }; + + Journal::replay_ret replay_segment( + cbj_delta_handler_t &handler, scan_valid_records_cursor& cursor); + + read_ret read(paddr_t start, size_t len) final; + + bool is_record_segment_seq_invalid(scan_valid_records_cursor &cursor, + record_group_header_t &h) final; + + int64_t get_segment_end_offset(paddr_t addr) final { + return get_journal_end(); + } + + // Test interfaces + + CircularJournalSpace& get_cjs() { + return cjs; + } + + read_validate_record_metadata_ret test_read_validate_record_metadata( + scan_valid_records_cursor &cursor, + segment_nonce_t nonce) + { + return read_validate_record_metadata(cursor, nonce); + } + + void test_initialize_cursor(scan_valid_records_cursor &cursor) + { + initialize_cursor(cursor); + } + +private: + JournalTrimmer &trimmer; + std::string path; + WritePipeline *write_pipeline = nullptr; + /** + * initialized + * + * true after open_device_read_header, set to false in close(). + * Indicates that device is open and in-memory header is valid. 
+ */ + bool initialized = false; + + // start address where the newest record will be written + // should be in range [get_records_start(), get_journal_end()) + // written_to.segment_seq is circulation seq to track + // the sequence to written records + CircularJournalSpace cjs; + RecordSubmitter record_submitter; +}; + +} + diff --git a/src/crimson/os/seastore/journal/circular_journal_space.cc b/src/crimson/os/seastore/journal/circular_journal_space.cc new file mode 100644 index 000000000..123bb9135 --- /dev/null +++ b/src/crimson/os/seastore/journal/circular_journal_space.cc @@ -0,0 +1,249 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab expandtab + +#include "circular_journal_space.h" + +#include <fmt/format.h> +#include <fmt/os.h> + +#include "crimson/os/seastore/logging.h" +#include "crimson/os/seastore/async_cleaner.h" +#include "crimson/os/seastore/journal/circular_bounded_journal.h" + +SET_SUBSYS(seastore_journal); + +namespace crimson::os::seastore::journal { + +std::ostream &operator<<(std::ostream &out, + const CircularJournalSpace::cbj_header_t &header) +{ + return out << "cbj_header_t(" + << "dirty_tail=" << header.dirty_tail + << ", alloc_tail=" << header.alloc_tail + << ", magic=" << header.magic + << ")"; +} + +CircularJournalSpace::CircularJournalSpace(RBMDevice * device) : device(device) {} + +bool CircularJournalSpace::needs_roll(std::size_t length) const { + if (length + get_rbm_addr(get_written_to()) > get_journal_end()) { + return true; + } + return false; +} + +extent_len_t CircularJournalSpace::get_block_size() const { + return device->get_block_size(); +} + +CircularJournalSpace::roll_ertr::future<> CircularJournalSpace::roll() { + paddr_t paddr = convert_abs_addr_to_paddr( + get_records_start(), + get_device_id()); + auto seq = get_written_to(); + seq.segment_seq++; + assert(seq.segment_seq < MAX_SEG_SEQ); + set_written_to( + journal_seq_t{seq.segment_seq, paddr}); + return roll_ertr::now(); +} + +CircularJournalSpace::write_ret +CircularJournalSpace::write(ceph::bufferlist&& to_write) { + LOG_PREFIX(CircularJournalSpace::write); + assert(get_written_to().segment_seq != NULL_SEG_SEQ); + auto encoded_size = to_write.length(); + if (encoded_size > get_records_available_size()) { + ceph_abort("should be impossible with EPM reservation"); + } + assert(encoded_size + get_rbm_addr(get_written_to()) + < get_journal_end()); + + journal_seq_t j_seq = get_written_to(); + auto target = get_rbm_addr(get_written_to()); + auto new_written_to = target + encoded_size; + assert(new_written_to < get_journal_end()); + paddr_t paddr = convert_abs_addr_to_paddr( + new_written_to, + get_device_id()); + set_written_to( + journal_seq_t{get_written_to().segment_seq, paddr}); + DEBUG("{}, target {}", to_write.length(), target); + + auto write_result = write_result_t{ + j_seq, + encoded_size + }; + return device_write_bl(target, to_write + ).safe_then([this, target, + length=encoded_size, + write_result, + FNAME] { + DEBUG("commit target {} used_size {} written length {}", + target, get_records_used_size(), length); + return write_result; + }).handle_error( + base_ertr::pass_further{}, + crimson::ct_error::assert_all{ "Invalid error" } + ); +} + +segment_nonce_t calc_new_nonce( + uint32_t crc, + unsigned char const *data, + unsigned length) +{ + crc &= std::numeric_limits<uint32_t>::max() >> 1; + return ceph_crc32c(crc, data, length); +} + +CircularJournalSpace::open_ret CircularJournalSpace::open(bool is_mkfs) { + std::ostringstream 
oss; + oss << device_id_printer_t{get_device_id()}; + print_name = oss.str(); + + if (is_mkfs) { + LOG_PREFIX(CircularJournalSpace::open); + assert(device); + ceph::bufferlist bl; + CircularJournalSpace::cbj_header_t head; + assert(device->get_journal_size()); + head.dirty_tail = + journal_seq_t{0, + convert_abs_addr_to_paddr( + get_records_start(), + device->get_device_id())}; + head.alloc_tail = head.dirty_tail; + auto meta = device->get_meta(); + head.magic = calc_new_nonce( + std::rand() % std::numeric_limits<uint32_t>::max(), + reinterpret_cast<const unsigned char *>(meta.seastore_id.bytes()), + sizeof(meta.seastore_id.uuid)); + encode(head, bl); + header = head; + set_written_to(head.dirty_tail); + initialized = true; + DEBUG( + "initialize header block in CircularJournalSpace length {}, head: {}", + bl.length(), header); + return write_header( + ).safe_then([this]() { + return open_ret( + open_ertr::ready_future_marker{}, + get_written_to()); + }).handle_error( + open_ertr::pass_further{}, + crimson::ct_error::assert_all{ + "Invalid error write_header" + } + ); + } + ceph_assert(initialized); + if (written_to.segment_seq == NULL_SEG_SEQ) { + written_to.segment_seq = 0; + } + return open_ret( + open_ertr::ready_future_marker{}, + get_written_to()); +} + +ceph::bufferlist CircularJournalSpace::encode_header() +{ + bufferlist bl; + encode(header, bl); + auto header_crc_filler = bl.append_hole(sizeof(checksum_t)); + auto bliter = bl.cbegin(); + auto header_crc = bliter.crc32c( + ceph::encoded_sizeof_bounded<cbj_header_t>(), + -1); + ceph_le32 header_crc_le; + header_crc_le = header_crc; + header_crc_filler.copy_in( + sizeof(checksum_t), + reinterpret_cast<const char *>(&header_crc_le)); + return bl; +} + +CircularJournalSpace::write_ertr::future<> CircularJournalSpace::device_write_bl( + rbm_abs_addr offset, bufferlist &bl) +{ + LOG_PREFIX(CircularJournalSpace::device_write_bl); + auto length = bl.length(); + if (offset + length > get_journal_end()) { + return crimson::ct_error::erange::make(); + } + DEBUG( + "overwrite in CircularJournalSpace, offset {}, length {}", + offset, + length); + return device->writev(offset, bl + ).handle_error( + write_ertr::pass_further{}, + crimson::ct_error::assert_all{ "Invalid error device->write" } + ); +} + +CircularJournalSpace::read_header_ret +CircularJournalSpace::read_header() +{ + LOG_PREFIX(CircularJournalSpace::read_header); + assert(device); + auto bptr = bufferptr(ceph::buffer::create_page_aligned( + device->get_block_size())); + DEBUG("reading {}", device->get_shard_journal_start()); + return device->read(device->get_shard_journal_start(), bptr + ).safe_then([bptr, FNAME]() mutable + -> read_header_ret { + bufferlist bl; + bl.append(bptr); + auto bp = bl.cbegin(); + cbj_header_t cbj_header; + try { + decode(cbj_header, bp); + } catch (ceph::buffer::error &e) { + ERROR("unable to read header block"); + return crimson::ct_error::enoent::make(); + } + auto bliter = bl.cbegin(); + auto test_crc = bliter.crc32c( + ceph::encoded_sizeof_bounded<cbj_header_t>(), + -1); + ceph_le32 recorded_crc_le; + decode(recorded_crc_le, bliter); + uint32_t recorded_crc = recorded_crc_le; + if (test_crc != recorded_crc) { + ERROR("error, header crc mismatch."); + return read_header_ret( + read_header_ertr::ready_future_marker{}, + std::nullopt); + } + return read_header_ret( + read_header_ertr::ready_future_marker{}, + std::make_pair(cbj_header, bl) + ); + }); +} + +CircularJournalSpace::write_ertr::future<> +CircularJournalSpace::write_header() +{ + 
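+  // The header is encoded by encode_header() with a trailing crc32c, padded
+  // below into a single page-aligned block, and written at
+  // device->get_shard_journal_start(), i.e. the block that precedes the
+  // record area starting at get_records_start().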
LOG_PREFIX(CircularJournalSpace::write_header); + ceph::bufferlist bl = encode_header(); + ceph_assert(bl.length() <= get_block_size()); + DEBUG( + "sync header of CircularJournalSpace, length {}", + bl.length()); + assert(device); + auto iter = bl.begin(); + assert(bl.length() < get_block_size()); + bufferptr bp = bufferptr(ceph::buffer::create_page_aligned(get_block_size())); + iter.copy(bl.length(), bp.c_str()); + return device->write(device->get_shard_journal_start(), std::move(bp) + ).handle_error( + write_ertr::pass_further{}, + crimson::ct_error::assert_all{ "Invalid error device->write" } + ); +} + +} diff --git a/src/crimson/os/seastore/journal/circular_journal_space.h b/src/crimson/os/seastore/journal/circular_journal_space.h new file mode 100644 index 000000000..c88b65ad5 --- /dev/null +++ b/src/crimson/os/seastore/journal/circular_journal_space.h @@ -0,0 +1,261 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab expandtab + +#pragma once + +#include <optional> +#include <seastar/core/circular_buffer.hh> +#include <seastar/core/metrics.hh> +#include <seastar/core/shared_future.hh> + +#include "include/buffer.h" + +#include "crimson/common/errorator.h" +#include "crimson/os/seastore/journal.h" +#include "crimson/os/seastore/random_block_manager.h" +#include "crimson/os/seastore/random_block_manager/rbm_device.h" +#include "crimson/os/seastore/journal/record_submitter.h" +#include "crimson/os/seastore/async_cleaner.h" + +namespace crimson::os::seastore { + class SegmentProvider; + class JournalTrimmer; +} + +namespace crimson::os::seastore::journal { + +class CircularBoundedJournal; +class CircularJournalSpace : public JournalAllocator { + + public: + const std::string& get_name() const final { + return print_name; + } + + extent_len_t get_block_size() const final; + + bool can_write() const final { + return (device != nullptr); + } + + segment_nonce_t get_nonce() const final { + return header.magic; + } + + bool needs_roll(std::size_t length) const final; + + roll_ertr::future<> roll() final; + + write_ret write(ceph::bufferlist&& to_write) final; + + void update_modify_time(record_t& record) final {} + + close_ertr::future<> close() final { + return write_header( + ).safe_then([this]() -> close_ertr::future<> { + initialized = false; + return close_ertr::now(); + }).handle_error( + Journal::open_for_mount_ertr::pass_further{}, + crimson::ct_error::assert_all{ + "Invalid error write_header" + } + ); + } + + open_ret open(bool is_mkfs) final; + + public: + CircularJournalSpace(RBMDevice * device); + + struct cbj_header_t; + using write_ertr = Journal::submit_record_ertr; + /* + * device_write_bl + * + * @param device address to write + * @param bufferlist to write + * + */ + write_ertr::future<> device_write_bl(rbm_abs_addr offset, ceph::bufferlist &bl); + + using read_ertr = crimson::errorator< + crimson::ct_error::input_output_error, + crimson::ct_error::invarg, + crimson::ct_error::enoent, + crimson::ct_error::erange>; + using read_header_ertr = read_ertr; + using read_header_ret = read_header_ertr::future< + std::optional<std::pair<cbj_header_t, bufferlist>> + >; + /* + * read_header + * + * read header block from given absolute address + * + * @param absolute address + * + */ + read_header_ret read_header(); + + ceph::bufferlist encode_header(); + + write_ertr::future<> write_header(); + + + /** + * CircularBoundedJournal structure + * + * +-------------------------------------------------------+ + * | header | record | 
record | record | record | ... | + * +-------------------------------------------------------+ + * ^-----------block aligned-----------------^ + * <----fixed----> + */ + + struct cbj_header_t { + // start offset of CircularBoundedJournal in the device + journal_seq_t dirty_tail; + journal_seq_t alloc_tail; + segment_nonce_t magic; + + DENC(cbj_header_t, v, p) { + DENC_START(1, 1, p); + denc(v.dirty_tail, p); + denc(v.alloc_tail, p); + denc(v.magic, p); + DENC_FINISH(p); + } + }; + + /** + * + * Write position for CircularBoundedJournal + * + * | written to rbm | written length to CircularBoundedJournal | new write | + * ----------------->------------------------------------------------> + * ^ ^ + * applied_to written_to + * + */ + + journal_seq_t get_written_to() const { + return written_to; + } + rbm_abs_addr get_rbm_addr(journal_seq_t seq) const { + return convert_paddr_to_abs_addr(seq.offset); + } + void set_written_to(journal_seq_t seq) { + rbm_abs_addr addr = convert_paddr_to_abs_addr(seq.offset); + assert(addr >= get_records_start()); + assert(addr < get_journal_end()); + written_to = seq; + } + device_id_t get_device_id() const { + return device->get_device_id(); + } + + journal_seq_t get_dirty_tail() const { + return header.dirty_tail; + } + journal_seq_t get_alloc_tail() const { + return header.alloc_tail; + } + + /* + Size-related interfaces + +---------------------------------------------------------+ + | header | record | record | record | record | ... | + +---------------------------------------------------------+ + ^ ^ ^ + | | | + get_journal_start | get_journal_end + get_records_start + <-- get_records_total_size + block_size --> + <--------------- get_journal_size ------------------------> + */ + + size_t get_records_used_size() const { + auto rbm_written_to = get_rbm_addr(get_written_to()); + auto rbm_tail = get_rbm_addr(get_dirty_tail()); + return rbm_written_to >= rbm_tail ? 
+ rbm_written_to - rbm_tail : + rbm_written_to + get_records_total_size() + get_block_size() + - rbm_tail; + } + size_t get_records_total_size() const { + assert(device); + // a block is for header and a block is reserved to denote the end + return device->get_journal_size() - (2 * get_block_size()); + } + rbm_abs_addr get_records_start() const { + assert(device); + return device->get_shard_journal_start() + get_block_size(); + } + size_t get_records_available_size() const { + return get_records_total_size() - get_records_used_size(); + } + bool is_available_size(uint64_t size) { + auto rbm_written_to = get_rbm_addr(get_written_to()); + auto rbm_tail = get_rbm_addr(get_dirty_tail()); + if (rbm_written_to > rbm_tail && + (get_journal_end() - rbm_written_to) < size && + size > (get_records_used_size() - + (get_journal_end() - rbm_written_to))) { + return false; + } + return get_records_available_size() >= size; + } + rbm_abs_addr get_journal_end() const { + assert(device); + return device->get_shard_journal_start() + device->get_journal_size(); + } + + read_ertr::future<> read( + uint64_t offset, + bufferptr &bptr) { + assert(device); + return device->read(offset, bptr); + } + + seastar::future<> update_journal_tail( + journal_seq_t dirty, + journal_seq_t alloc) { + header.dirty_tail = dirty; + header.alloc_tail = alloc; + return write_header( + ).handle_error( + crimson::ct_error::assert_all{ + "encountered invalid error in update_journal_tail" + }); + } + + void set_initialized(bool init) { + initialized = init; + } + + void set_cbj_header(cbj_header_t& head) { + header = head; + } + + cbj_header_t get_cbj_header() { + return header; + } + + private: + std::string print_name; + cbj_header_t header; + RBMDevice* device; + journal_seq_t written_to; + bool initialized = false; +}; + +std::ostream &operator<<(std::ostream &out, const CircularJournalSpace::cbj_header_t &header); + +} + +WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::journal::CircularJournalSpace::cbj_header_t) + +#if FMT_VERSION >= 90000 +template <> struct fmt::formatter<crimson::os::seastore::journal::CircularJournalSpace::cbj_header_t> : fmt::ostream_formatter {}; +#endif diff --git a/src/crimson/os/seastore/journal/record_submitter.cc b/src/crimson/os/seastore/journal/record_submitter.cc new file mode 100644 index 000000000..5ca53b436 --- /dev/null +++ b/src/crimson/os/seastore/journal/record_submitter.cc @@ -0,0 +1,533 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab expandtab + +#include "record_submitter.h" + +#include <fmt/format.h> +#include <fmt/os.h> + +#include "crimson/os/seastore/logging.h" +#include "crimson/os/seastore/async_cleaner.h" + +SET_SUBSYS(seastore_journal); + +namespace crimson::os::seastore::journal { + +RecordBatch::add_pending_ret +RecordBatch::add_pending( + const std::string& name, + record_t&& record, + extent_len_t block_size) +{ + LOG_PREFIX(RecordBatch::add_pending); + auto new_size = get_encoded_length_after(record, block_size); + auto dlength_offset = pending.size.dlength; + TRACE("{} batches={}, write_size={}, dlength_offset={} ...", + name, + pending.get_size() + 1, + new_size.get_encoded_length(), + dlength_offset); + assert(state != state_t::SUBMITTING); + assert(evaluate_submit(record.size, block_size).submit_size == new_size); + + pending.push_back( + std::move(record), block_size); + assert(pending.size == new_size); + if (state == state_t::EMPTY) { + assert(!io_promise.has_value()); + io_promise = 
seastar::shared_promise<maybe_promise_result_t>(); + } else { + assert(io_promise.has_value()); + } + state = state_t::PENDING; + + return io_promise->get_shared_future( + ).then([dlength_offset, FNAME, &name + ](auto maybe_promise_result) -> add_pending_ret { + if (!maybe_promise_result.has_value()) { + ERROR("{} write failed", name); + return crimson::ct_error::input_output_error::make(); + } + auto write_result = maybe_promise_result->write_result; + auto submit_result = record_locator_t{ + write_result.start_seq.offset.add_offset( + maybe_promise_result->mdlength + dlength_offset), + write_result + }; + TRACE("{} write finish with {}", name, submit_result); + return add_pending_ret( + add_pending_ertr::ready_future_marker{}, + submit_result); + }); +} + +std::pair<ceph::bufferlist, record_group_size_t> +RecordBatch::encode_batch( + const journal_seq_t& committed_to, + segment_nonce_t segment_nonce) +{ + assert(state == state_t::PENDING); + assert(pending.get_size() > 0); + assert(io_promise.has_value()); + + state = state_t::SUBMITTING; + submitting_size = pending.get_size(); + auto gsize = pending.size; + submitting_length = gsize.get_encoded_length(); + submitting_mdlength = gsize.get_mdlength(); + auto bl = encode_records(pending, committed_to, segment_nonce); + // Note: pending is cleared here + assert(bl.length() == submitting_length); + return std::make_pair(bl, gsize); +} + +void RecordBatch::set_result( + maybe_result_t maybe_write_result) +{ + maybe_promise_result_t result; + if (maybe_write_result.has_value()) { + assert(maybe_write_result->length == submitting_length); + result = promise_result_t{ + *maybe_write_result, + submitting_mdlength + }; + } + assert(state == state_t::SUBMITTING); + assert(io_promise.has_value()); + + state = state_t::EMPTY; + submitting_size = 0; + submitting_length = 0; + submitting_mdlength = 0; + io_promise->set_value(result); + io_promise.reset(); +} + +std::pair<ceph::bufferlist, record_group_size_t> +RecordBatch::submit_pending_fast( + record_t&& record, + extent_len_t block_size, + const journal_seq_t& committed_to, + segment_nonce_t segment_nonce) +{ + auto new_size = get_encoded_length_after(record, block_size); + std::ignore = new_size; + assert(state == state_t::EMPTY); + assert(evaluate_submit(record.size, block_size).submit_size == new_size); + + auto group = record_group_t(std::move(record), block_size); + auto size = group.size; + assert(size == new_size); + auto bl = encode_records(group, committed_to, segment_nonce); + assert(bl.length() == size.get_encoded_length()); + return std::make_pair(std::move(bl), size); +} + +RecordSubmitter::RecordSubmitter( + std::size_t io_depth, + std::size_t batch_capacity, + std::size_t batch_flush_size, + double preferred_fullness, + JournalAllocator& ja) + : io_depth_limit{io_depth}, + preferred_fullness{preferred_fullness}, + journal_allocator{ja}, + batches(new RecordBatch[io_depth + 1]) +{ + LOG_PREFIX(RecordSubmitter); + INFO("{} io_depth_limit={}, batch_capacity={}, batch_flush_size={}, " + "preferred_fullness={}", + get_name(), io_depth, batch_capacity, + batch_flush_size, preferred_fullness); + ceph_assert(io_depth > 0); + ceph_assert(batch_capacity > 0); + ceph_assert(preferred_fullness >= 0 && + preferred_fullness <= 1); + free_batch_ptrs.reserve(io_depth + 1); + for (std::size_t i = 0; i <= io_depth; ++i) { + batches[i].initialize(i, batch_capacity, batch_flush_size); + free_batch_ptrs.push_back(&batches[i]); + } + pop_free_batch(); +} + +bool RecordSubmitter::is_available() const +{ + 
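+  // Available means no pending roll/flush is blocking submissions
+  // (wait_available_promise is unset) and no previous write has failed;
+  // the debug-only block below re-checks the invariants implied by that state.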
auto ret = !wait_available_promise.has_value() && + !has_io_error; +#ifndef NDEBUG + if (ret) { + // unconditional invariants + ceph_assert(journal_allocator.can_write()); + ceph_assert(p_current_batch != nullptr); + ceph_assert(!p_current_batch->is_submitting()); + // the current batch accepts a further write + ceph_assert(!p_current_batch->needs_flush()); + if (!p_current_batch->is_empty()) { + auto submit_length = + p_current_batch->get_submit_size().get_encoded_length(); + ceph_assert(!journal_allocator.needs_roll(submit_length)); + } + // I'm not rolling + } +#endif + return ret; +} + +RecordSubmitter::wa_ertr::future<> +RecordSubmitter::wait_available() +{ + LOG_PREFIX(RecordSubmitter::wait_available); + assert(!is_available()); + if (has_io_error) { + ERROR("{} I/O is failed before wait", get_name()); + return crimson::ct_error::input_output_error::make(); + } + return wait_available_promise->get_shared_future( + ).then([FNAME, this]() -> wa_ertr::future<> { + if (has_io_error) { + ERROR("{} I/O is failed after wait", get_name()); + return crimson::ct_error::input_output_error::make(); + } + return wa_ertr::now(); + }); +} + +RecordSubmitter::action_t +RecordSubmitter::check_action( + const record_size_t& rsize) const +{ + assert(is_available()); + auto eval = p_current_batch->evaluate_submit( + rsize, journal_allocator.get_block_size()); + if (journal_allocator.needs_roll(eval.submit_size.get_encoded_length())) { + return action_t::ROLL; + } else if (eval.is_full) { + return action_t::SUBMIT_FULL; + } else { + return action_t::SUBMIT_NOT_FULL; + } +} + +RecordSubmitter::roll_segment_ertr::future<> +RecordSubmitter::roll_segment() +{ + LOG_PREFIX(RecordSubmitter::roll_segment); + ceph_assert(p_current_batch->needs_flush() || + is_available()); + // #1 block concurrent submissions due to rolling + wait_available_promise = seastar::shared_promise<>(); + ceph_assert(!wait_unfull_flush_promise.has_value()); + return [FNAME, this] { + if (p_current_batch->is_pending()) { + if (state == state_t::FULL) { + DEBUG("{} wait flush ...", get_name()); + wait_unfull_flush_promise = seastar::promise<>(); + return wait_unfull_flush_promise->get_future(); + } else { // IDLE/PENDING + DEBUG("{} flush", get_name()); + flush_current_batch(); + return seastar::now(); + } + } else { + assert(p_current_batch->is_empty()); + return seastar::now(); + } + }().then_wrapped([FNAME, this](auto fut) { + if (fut.failed()) { + ERROR("{} rolling is skipped unexpectedly, available", get_name()); + has_io_error = true; + wait_available_promise->set_value(); + wait_available_promise.reset(); + return roll_segment_ertr::now(); + } else { + // start rolling in background + std::ignore = journal_allocator.roll( + ).safe_then([FNAME, this] { + // good + DEBUG("{} rolling done, available", get_name()); + assert(!has_io_error); + wait_available_promise->set_value(); + wait_available_promise.reset(); + }).handle_error( + crimson::ct_error::all_same_way([FNAME, this](auto e) { + ERROR("{} got error {}, available", get_name(), e); + has_io_error = true; + wait_available_promise->set_value(); + wait_available_promise.reset(); + }) + ).handle_exception([FNAME, this](auto e) { + ERROR("{} got exception {}, available", get_name(), e); + has_io_error = true; + wait_available_promise->set_value(); + wait_available_promise.reset(); + }); + // wait for background rolling + return wait_available(); + } + }); +} + +RecordSubmitter::submit_ret +RecordSubmitter::submit( + record_t&& record, + bool with_atomic_roll_segment) +{ + 
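+  // Two paths below: if the current batch is empty and the record alone
+  // would already warrant a flush, encode and write it directly (fast path);
+  // otherwise add it to the current batch and, depending on the state,
+  // flush the batch now or wait for the io-depth to drop first.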
LOG_PREFIX(RecordSubmitter::submit); + ceph_assert(is_available()); + assert(check_action(record.size) != action_t::ROLL); + journal_allocator.update_modify_time(record); + auto eval = p_current_batch->evaluate_submit( + record.size, journal_allocator.get_block_size()); + bool needs_flush = ( + state == state_t::IDLE || + eval.submit_size.get_fullness() > preferred_fullness || + // RecordBatch::needs_flush() + eval.is_full || + p_current_batch->get_num_records() + 1 >= + p_current_batch->get_batch_capacity()); + if (p_current_batch->is_empty() && + needs_flush && + state != state_t::FULL) { + // fast path with direct write + increment_io(); + auto [to_write, sizes] = p_current_batch->submit_pending_fast( + std::move(record), + journal_allocator.get_block_size(), + get_committed_to(), + journal_allocator.get_nonce()); + DEBUG("{} fast submit {}, committed_to={}, outstanding_io={} ...", + get_name(), sizes, get_committed_to(), num_outstanding_io); + account_submission(1, sizes); + return journal_allocator.write(std::move(to_write) + ).safe_then([mdlength = sizes.get_mdlength()](auto write_result) { + return record_locator_t{ + write_result.start_seq.offset.add_offset(mdlength), + write_result + }; + }).finally([this] { + decrement_io_with_flush(); + }); + } + // indirect batched write + auto write_fut = p_current_batch->add_pending( + get_name(), + std::move(record), + journal_allocator.get_block_size()); + if (needs_flush) { + if (state == state_t::FULL) { + // #2 block concurrent submissions due to lack of resource + DEBUG("{} added with {} pending, outstanding_io={}, unavailable, wait flush ...", + get_name(), + p_current_batch->get_num_records(), + num_outstanding_io); + if (with_atomic_roll_segment) { + // wait_available_promise and wait_unfull_flush_promise + // need to be delegated to the follow-up atomic roll_segment(); + assert(p_current_batch->is_pending()); + } else { + wait_available_promise = seastar::shared_promise<>(); + ceph_assert(!wait_unfull_flush_promise.has_value()); + wait_unfull_flush_promise = seastar::promise<>(); + // flush and mark available in background + std::ignore = wait_unfull_flush_promise->get_future( + ).finally([FNAME, this] { + DEBUG("{} flush done, available", get_name()); + wait_available_promise->set_value(); + wait_available_promise.reset(); + }); + } + } else { + DEBUG("{} added pending, flush", get_name()); + flush_current_batch(); + } + } else { + // will flush later + DEBUG("{} added with {} pending, outstanding_io={}", + get_name(), + p_current_batch->get_num_records(), + num_outstanding_io); + assert(!p_current_batch->needs_flush()); + } + return write_fut; +} + +RecordSubmitter::open_ret +RecordSubmitter::open(bool is_mkfs) +{ + return journal_allocator.open(is_mkfs + ).safe_then([this](journal_seq_t ret) { + LOG_PREFIX(RecordSubmitter::open); + DEBUG("{} register metrics", get_name()); + stats = {}; + namespace sm = seastar::metrics; + std::vector<sm::label_instance> label_instances; + label_instances.push_back(sm::label_instance("submitter", get_name())); + metrics.add_group( + "journal", + { + sm::make_counter( + "record_num", + stats.record_batch_stats.num_io, + sm::description("total number of records submitted"), + label_instances + ), + sm::make_counter( + "record_batch_num", + stats.record_batch_stats.num_io_grouped, + sm::description("total number of records batched"), + label_instances + ), + sm::make_counter( + "io_num", + stats.io_depth_stats.num_io, + sm::description("total number of io submitted"), + label_instances + ), + 
sm::make_counter( + "io_depth_num", + stats.io_depth_stats.num_io_grouped, + sm::description("total number of io depth"), + label_instances + ), + sm::make_counter( + "record_group_padding_bytes", + stats.record_group_padding_bytes, + sm::description("bytes of metadata padding when write record groups"), + label_instances + ), + sm::make_counter( + "record_group_metadata_bytes", + stats.record_group_metadata_bytes, + sm::description("bytes of raw metadata when write record groups"), + label_instances + ), + sm::make_counter( + "record_group_data_bytes", + stats.record_group_data_bytes, + sm::description("bytes of data when write record groups"), + label_instances + ), + } + ); + return ret; + }); +} + +RecordSubmitter::close_ertr::future<> +RecordSubmitter::close() +{ + committed_to = JOURNAL_SEQ_NULL; + ceph_assert(state == state_t::IDLE); + ceph_assert(num_outstanding_io == 0); + ceph_assert(p_current_batch != nullptr); + ceph_assert(p_current_batch->is_empty()); + ceph_assert(!wait_available_promise.has_value()); + has_io_error = false; + ceph_assert(!wait_unfull_flush_promise.has_value()); + metrics.clear(); + return journal_allocator.close(); +} + +void RecordSubmitter::update_state() +{ + if (num_outstanding_io == 0) { + state = state_t::IDLE; + } else if (num_outstanding_io < io_depth_limit) { + state = state_t::PENDING; + } else if (num_outstanding_io == io_depth_limit) { + state = state_t::FULL; + } else { + ceph_abort("fatal error: io-depth overflow"); + } +} + +void RecordSubmitter::decrement_io_with_flush() +{ + LOG_PREFIX(RecordSubmitter::decrement_io_with_flush); + assert(num_outstanding_io > 0); + auto prv_state = state; + --num_outstanding_io; + update_state(); + + if (prv_state == state_t::FULL) { + if (wait_unfull_flush_promise.has_value()) { + DEBUG("{} flush, resolve wait_unfull_flush_promise", get_name()); + assert(!p_current_batch->is_empty()); + assert(wait_available_promise.has_value()); + flush_current_batch(); + wait_unfull_flush_promise->set_value(); + wait_unfull_flush_promise.reset(); + return; + } + } else { + ceph_assert(!wait_unfull_flush_promise.has_value()); + } + + auto needs_flush = ( + !p_current_batch->is_empty() && ( + state == state_t::IDLE || + p_current_batch->get_submit_size().get_fullness() > preferred_fullness || + p_current_batch->needs_flush() + )); + if (needs_flush) { + DEBUG("{} flush", get_name()); + flush_current_batch(); + } +} + +void RecordSubmitter::account_submission( + std::size_t num, + const record_group_size_t& size) +{ + stats.record_group_padding_bytes += + (size.get_mdlength() - size.get_raw_mdlength()); + stats.record_group_metadata_bytes += size.get_raw_mdlength(); + stats.record_group_data_bytes += size.dlength; + stats.record_batch_stats.increment(num); +} + +void RecordSubmitter::finish_submit_batch( + RecordBatch* p_batch, + maybe_result_t maybe_result) +{ + assert(p_batch->is_submitting()); + p_batch->set_result(maybe_result); + free_batch_ptrs.push_back(p_batch); + decrement_io_with_flush(); +} + +void RecordSubmitter::flush_current_batch() +{ + LOG_PREFIX(RecordSubmitter::flush_current_batch); + RecordBatch* p_batch = p_current_batch; + assert(p_batch->is_pending()); + p_current_batch = nullptr; + pop_free_batch(); + + increment_io(); + auto num = p_batch->get_num_records(); + auto [to_write, sizes] = p_batch->encode_batch( + get_committed_to(), journal_allocator.get_nonce()); + DEBUG("{} {} records, {}, committed_to={}, outstanding_io={} ...", + get_name(), num, sizes, get_committed_to(), num_outstanding_io); + 
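+  // The batched write completes in the background; both success and failure
+  // are funneled through finish_submit_batch(), which resolves the batch's
+  // shared promise and returns it to free_batch_ptrs.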
account_submission(num, sizes); + std::ignore = journal_allocator.write(std::move(to_write) + ).safe_then([this, p_batch, FNAME, num, sizes=sizes](auto write_result) { + TRACE("{} {} records, {}, write done with {}", + get_name(), num, sizes, write_result); + finish_submit_batch(p_batch, write_result); + }).handle_error( + crimson::ct_error::all_same_way([this, p_batch, FNAME, num, sizes=sizes](auto e) { + ERROR("{} {} records, {}, got error {}", + get_name(), num, sizes, e); + finish_submit_batch(p_batch, std::nullopt); + }) + ).handle_exception([this, p_batch, FNAME, num, sizes=sizes](auto e) { + ERROR("{} {} records, {}, got exception {}", + get_name(), num, sizes, e); + finish_submit_batch(p_batch, std::nullopt); + }); +} + +} diff --git a/src/crimson/os/seastore/journal/record_submitter.h b/src/crimson/os/seastore/journal/record_submitter.h new file mode 100644 index 000000000..eedd2dd8c --- /dev/null +++ b/src/crimson/os/seastore/journal/record_submitter.h @@ -0,0 +1,347 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab expandtab + +#pragma once + +#include <optional> +#include <seastar/core/circular_buffer.hh> +#include <seastar/core/metrics.hh> +#include <seastar/core/shared_future.hh> + +#include "include/buffer.h" + +#include "crimson/common/errorator.h" +#include "crimson/os/seastore/segment_manager_group.h" +#include "crimson/os/seastore/segment_seq_allocator.h" + +namespace crimson::os::seastore { + class SegmentProvider; + class JournalTrimmer; +} + +namespace crimson::os::seastore::journal { + +class JournalAllocator { +public: + using base_ertr = crimson::errorator< + crimson::ct_error::input_output_error>; + virtual const std::string& get_name() const = 0; + + virtual void update_modify_time(record_t& record) = 0; + + virtual extent_len_t get_block_size() const = 0; + + using close_ertr = base_ertr; + virtual close_ertr::future<> close() = 0; + + virtual segment_nonce_t get_nonce() const = 0; + + using write_ertr = base_ertr; + using write_ret = write_ertr::future<write_result_t>; + virtual write_ret write(ceph::bufferlist&& to_write) = 0; + + virtual bool can_write() const = 0; + + using roll_ertr = base_ertr; + virtual roll_ertr::future<> roll() = 0; + + virtual bool needs_roll(std::size_t length) const = 0; + + using open_ertr = base_ertr; + using open_ret = open_ertr::future<journal_seq_t>; + virtual open_ret open(bool is_mkfs) = 0; + +}; + +/** + * RecordBatch + * + * Maintain a batch of records for submit. 
+ */ +class RecordBatch { + enum class state_t { + EMPTY = 0, + PENDING, + SUBMITTING + }; + +public: + RecordBatch() = default; + RecordBatch(RecordBatch&&) = delete; + RecordBatch(const RecordBatch&) = delete; + RecordBatch& operator=(RecordBatch&&) = delete; + RecordBatch& operator=(const RecordBatch&) = delete; + + bool is_empty() const { + return state == state_t::EMPTY; + } + + bool is_pending() const { + return state == state_t::PENDING; + } + + bool is_submitting() const { + return state == state_t::SUBMITTING; + } + + std::size_t get_index() const { + return index; + } + + std::size_t get_num_records() const { + return pending.get_size(); + } + + std::size_t get_batch_capacity() const { + return batch_capacity; + } + + const record_group_size_t& get_submit_size() const { + assert(state != state_t::EMPTY); + return pending.size; + } + + bool needs_flush() const { + assert(state != state_t::SUBMITTING); + assert(pending.get_size() <= batch_capacity); + if (state == state_t::EMPTY) { + return false; + } else { + assert(state == state_t::PENDING); + return (pending.get_size() >= batch_capacity || + pending.size.get_encoded_length() > batch_flush_size); + } + } + + struct evaluation_t { + record_group_size_t submit_size; + bool is_full; + }; + evaluation_t evaluate_submit( + const record_size_t& rsize, + extent_len_t block_size) const { + assert(!needs_flush()); + auto submit_size = pending.size.get_encoded_length_after( + rsize, block_size); + bool is_full = submit_size.get_encoded_length() > batch_flush_size; + return {submit_size, is_full}; + } + + void initialize(std::size_t i, + std::size_t _batch_capacity, + std::size_t _batch_flush_size) { + ceph_assert(_batch_capacity > 0); + index = i; + batch_capacity = _batch_capacity; + batch_flush_size = _batch_flush_size; + pending.reserve(batch_capacity); + } + + // Add to the batch, the future will be resolved after the batch is + // written. + // + // Set write_result_t::write_length to 0 if the record is not the first one + // in the batch. + using add_pending_ertr = JournalAllocator::write_ertr; + using add_pending_ret = add_pending_ertr::future<record_locator_t>; + add_pending_ret add_pending( + const std::string& name, + record_t&&, + extent_len_t block_size); + + // Encode the batched records for write. + std::pair<ceph::bufferlist, record_group_size_t> encode_batch( + const journal_seq_t& committed_to, + segment_nonce_t segment_nonce); + + // Set the write result and reset for reuse + using maybe_result_t = std::optional<write_result_t>; + void set_result(maybe_result_t maybe_write_end_seq); + + // The fast path that is equivalent to submit a single record as a batch. + // + // Essentially, equivalent to the combined logic of: + // add_pending(), encode_batch() and set_result() above without + // the intervention of the shared io_promise. + // + // Note the current RecordBatch can be reused afterwards. 
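+  //
+  // (RecordSubmitter::submit() takes this path when the current batch is
+  // empty and the record by itself already warrants an immediate flush.)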
+ std::pair<ceph::bufferlist, record_group_size_t> submit_pending_fast( + record_t&&, + extent_len_t block_size, + const journal_seq_t& committed_to, + segment_nonce_t segment_nonce); + +private: + record_group_size_t get_encoded_length_after( + const record_t& record, + extent_len_t block_size) const { + return pending.size.get_encoded_length_after( + record.size, block_size); + } + + state_t state = state_t::EMPTY; + std::size_t index = 0; + std::size_t batch_capacity = 0; + std::size_t batch_flush_size = 0; + + record_group_t pending; + std::size_t submitting_size = 0; + extent_len_t submitting_length = 0; + extent_len_t submitting_mdlength = 0; + + struct promise_result_t { + write_result_t write_result; + extent_len_t mdlength; + }; + using maybe_promise_result_t = std::optional<promise_result_t>; + std::optional<seastar::shared_promise<maybe_promise_result_t> > io_promise; +}; + +/** + * RecordSubmitter + * + * Submit records concurrently with RecordBatch with SegmentAllocator. + * + * Configurations and controls: + * - io_depth: the io-depth limit to SegmentAllocator; + * - batch_capacity: the number limit of records in a RecordBatch; + * - batch_flush_size: the bytes threshold to force flush a RecordBatch to + * control the maximum latency; + * - preferred_fullness: the fullness threshold to flush a RecordBatch; + */ +class RecordSubmitter { + enum class state_t { + IDLE = 0, // outstanding_io == 0 + PENDING, // outstanding_io < io_depth_limit + FULL // outstanding_io == io_depth_limit + // OVERFLOW: outstanding_io > io_depth_limit is impossible + }; + + struct grouped_io_stats { + uint64_t num_io = 0; + uint64_t num_io_grouped = 0; + + void increment(uint64_t num_grouped_io) { + ++num_io; + num_io_grouped += num_grouped_io; + } + }; + + using base_ertr = crimson::errorator< + crimson::ct_error::input_output_error>; + +public: + RecordSubmitter(std::size_t io_depth, + std::size_t batch_capacity, + std::size_t batch_flush_size, + double preferred_fullness, + JournalAllocator&); + + const std::string& get_name() const { + return journal_allocator.get_name(); + } + + journal_seq_t get_committed_to() const { + return committed_to; + } + + // whether is available to submit a record + bool is_available() const; + + // wait for available if cannot submit, should check is_available() again + // when the future is resolved. 
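+ //
+ // Minimal caller-side retry sketch (compare
+ // SegmentedJournal::do_submit_record()):
+ //   if (!submitter.is_available()) {
+ //     return submitter.wait_available(
+ //     ).safe_then([&] { /* re-check is_available() and retry */ });
+ //   }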
+ using wa_ertr = base_ertr; + wa_ertr::future<> wait_available(); + + // when available, check for the submit action + // according to the pending record size + enum class action_t { + ROLL, + SUBMIT_FULL, + SUBMIT_NOT_FULL + }; + action_t check_action(const record_size_t&) const; + + // when available, roll the segment if needed + using roll_segment_ertr = base_ertr; + roll_segment_ertr::future<> roll_segment(); + + // when available, submit the record if possible + using submit_ertr = base_ertr; + using submit_ret = submit_ertr::future<record_locator_t>; + submit_ret submit(record_t&&, bool with_atomic_roll_segment=false); + + void update_committed_to(const journal_seq_t& new_committed_to) { + assert(new_committed_to != JOURNAL_SEQ_NULL); + assert(committed_to == JOURNAL_SEQ_NULL || + committed_to <= new_committed_to); + committed_to = new_committed_to; + } + + // open for write, generate the correct print name, and register metrics + using open_ertr = base_ertr; + using open_ret = open_ertr::future<journal_seq_t>; + open_ret open(bool is_mkfs); + + using close_ertr = base_ertr; + close_ertr::future<> close(); + +private: + void update_state(); + + void increment_io() { + ++num_outstanding_io; + stats.io_depth_stats.increment(num_outstanding_io); + update_state(); + } + + void decrement_io_with_flush(); + + void pop_free_batch() { + assert(p_current_batch == nullptr); + assert(!free_batch_ptrs.empty()); + p_current_batch = free_batch_ptrs.front(); + assert(p_current_batch->is_empty()); + assert(p_current_batch == &batches[p_current_batch->get_index()]); + free_batch_ptrs.pop_front(); + } + + void account_submission(std::size_t, const record_group_size_t&); + + using maybe_result_t = RecordBatch::maybe_result_t; + void finish_submit_batch(RecordBatch*, maybe_result_t); + + void flush_current_batch(); + + state_t state = state_t::IDLE; + std::size_t num_outstanding_io = 0; + std::size_t io_depth_limit; + double preferred_fullness; + + JournalAllocator& journal_allocator; + // committed_to may be in a previous journal segment + journal_seq_t committed_to = JOURNAL_SEQ_NULL; + + std::unique_ptr<RecordBatch[]> batches; + // should not be nullptr after constructed + RecordBatch* p_current_batch = nullptr; + seastar::circular_buffer<RecordBatch*> free_batch_ptrs; + + // blocked for rolling or lack of resource + std::optional<seastar::shared_promise<> > wait_available_promise; + bool has_io_error = false; + // when needs flush but io depth is full, + // wait for decrement_io_with_flush() + std::optional<seastar::promise<> > wait_unfull_flush_promise; + + struct { + grouped_io_stats record_batch_stats; + grouped_io_stats io_depth_stats; + uint64_t record_group_padding_bytes = 0; + uint64_t record_group_metadata_bytes = 0; + uint64_t record_group_data_bytes = 0; + } stats; + seastar::metrics::metric_group metrics; +}; + +} diff --git a/src/crimson/os/seastore/journal/segment_allocator.cc b/src/crimson/os/seastore/journal/segment_allocator.cc new file mode 100644 index 000000000..61e1be585 --- /dev/null +++ b/src/crimson/os/seastore/journal/segment_allocator.cc @@ -0,0 +1,283 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab expandtab + +#include "segment_allocator.h" + +#include <fmt/format.h> +#include <fmt/os.h> + +#include "crimson/os/seastore/logging.h" +#include "crimson/os/seastore/async_cleaner.h" + +SET_SUBSYS(seastore_journal); + +namespace crimson::os::seastore::journal { + +SegmentAllocator::SegmentAllocator( + JournalTrimmer 
*trimmer, + data_category_t category, + rewrite_gen_t gen, + SegmentProvider &sp, + SegmentSeqAllocator &ssa) + : print_name{fmt::format("{}_G{}", category, gen)}, + type{trimmer == nullptr ? + segment_type_t::OOL : + segment_type_t::JOURNAL}, + category{category}, + gen{gen}, + segment_provider{sp}, + sm_group{*sp.get_segment_manager_group()}, + segment_seq_allocator(ssa), + trimmer{trimmer} +{ + reset(); +} + +segment_nonce_t calc_new_nonce( + segment_type_t type, + uint32_t crc, + unsigned char const *data, + unsigned length) +{ + crc &= std::numeric_limits<uint32_t>::max() >> 1; + crc |= static_cast<uint32_t>(type) << 31; + return ceph_crc32c(crc, data, length); +} + +SegmentAllocator::open_ret +SegmentAllocator::do_open(bool is_mkfs) +{ + LOG_PREFIX(SegmentAllocator::do_open); + ceph_assert(!current_segment); + segment_seq_t new_segment_seq = + segment_seq_allocator.get_and_inc_next_segment_seq(); + auto meta = sm_group.get_meta(); + current_segment_nonce = calc_new_nonce( + type, + new_segment_seq, + reinterpret_cast<const unsigned char *>(meta.seastore_id.bytes()), + sizeof(meta.seastore_id.uuid)); + auto new_segment_id = segment_provider.allocate_segment( + new_segment_seq, type, category, gen); + ceph_assert(new_segment_id != NULL_SEG_ID); + return sm_group.open(new_segment_id + ).handle_error( + open_ertr::pass_further{}, + crimson::ct_error::assert_all{ + "Invalid error in SegmentAllocator::do_open open" + } + ).safe_then([this, is_mkfs, FNAME, new_segment_seq](auto sref) { + // initialize new segment + segment_id_t segment_id = sref->get_segment_id(); + journal_seq_t dirty_tail; + journal_seq_t alloc_tail; + if (type == segment_type_t::JOURNAL) { + dirty_tail = trimmer->get_dirty_tail(); + alloc_tail = trimmer->get_alloc_tail(); + if (is_mkfs) { + ceph_assert(dirty_tail == JOURNAL_SEQ_NULL); + ceph_assert(alloc_tail == JOURNAL_SEQ_NULL); + auto mkfs_seq = journal_seq_t{ + new_segment_seq, + paddr_t::make_seg_paddr(segment_id, 0) + }; + dirty_tail = mkfs_seq; + alloc_tail = mkfs_seq; + } else { + ceph_assert(dirty_tail != JOURNAL_SEQ_NULL); + ceph_assert(alloc_tail != JOURNAL_SEQ_NULL); + } + } else { // OOL + ceph_assert(!is_mkfs); + dirty_tail = JOURNAL_SEQ_NULL; + alloc_tail = JOURNAL_SEQ_NULL; + } + auto header = segment_header_t{ + new_segment_seq, + segment_id, + dirty_tail, + alloc_tail, + current_segment_nonce, + type, + category, + gen}; + INFO("{} writing header {}", print_name, header); + + auto header_length = get_block_size(); + bufferlist bl; + encode(header, bl); + bufferptr bp(ceph::buffer::create_page_aligned(header_length)); + bp.zero(); + auto iter = bl.cbegin(); + iter.copy(bl.length(), bp.c_str()); + bl.clear(); + bl.append(bp); + + ceph_assert(sref->get_write_ptr() == 0); + assert((unsigned)header_length == bl.length()); + written_to = header_length; + auto new_journal_seq = journal_seq_t{ + new_segment_seq, + paddr_t::make_seg_paddr(segment_id, written_to)}; + segment_provider.update_segment_avail_bytes( + type, new_journal_seq.offset); + return sref->write(0, std::move(bl) + ).handle_error( + open_ertr::pass_further{}, + crimson::ct_error::assert_all{ + "Invalid error in SegmentAllocator::do_open write" + } + ).safe_then([this, + FNAME, + new_journal_seq, + sref=std::move(sref)]() mutable { + ceph_assert(!current_segment); + current_segment = std::move(sref); + DEBUG("{} rolled new segment id={}", + print_name, current_segment->get_segment_id()); + ceph_assert(new_journal_seq.segment_seq == + 
segment_provider.get_seg_info(current_segment->get_segment_id()).seq); + return new_journal_seq; + }); + }); +} + +SegmentAllocator::open_ret +SegmentAllocator::open(bool is_mkfs) +{ + LOG_PREFIX(SegmentAllocator::open); + auto& device_ids = sm_group.get_device_ids(); + ceph_assert(device_ids.size()); + std::ostringstream oss; + for (auto& device_id : device_ids) { + oss << device_id_printer_t{device_id} << "_"; + } + oss << fmt::format("{}_G{}", category, gen); + print_name = oss.str(); + + DEBUG("{}", print_name); + return do_open(is_mkfs); +} + +SegmentAllocator::roll_ertr::future<> +SegmentAllocator::roll() +{ + ceph_assert(can_write()); + return close_segment().safe_then([this] { + return do_open(false).discard_result(); + }); +} + +SegmentAllocator::write_ret +SegmentAllocator::write(ceph::bufferlist&& to_write) +{ + LOG_PREFIX(SegmentAllocator::write); + assert(can_write()); + auto write_length = to_write.length(); + auto write_start_offset = written_to; + auto write_start_seq = journal_seq_t{ + segment_provider.get_seg_info(current_segment->get_segment_id()).seq, + paddr_t::make_seg_paddr( + current_segment->get_segment_id(), write_start_offset) + }; + TRACE("{} {}~{}", print_name, write_start_seq, write_length); + assert(write_length > 0); + assert((write_length % get_block_size()) == 0); + assert(!needs_roll(write_length)); + + auto write_result = write_result_t{ + write_start_seq, + write_length + }; + written_to += write_length; + segment_provider.update_segment_avail_bytes( + type, + paddr_t::make_seg_paddr( + current_segment->get_segment_id(), written_to) + ); + return current_segment->write( + write_start_offset, std::move(to_write) + ).handle_error( + write_ertr::pass_further{}, + crimson::ct_error::assert_all{ + "Invalid error in SegmentAllocator::write" + } + ).safe_then([write_result, cs=current_segment] { + return write_result; + }); +} + +SegmentAllocator::close_ertr::future<> +SegmentAllocator::close() +{ + return [this] { + LOG_PREFIX(SegmentAllocator::close); + if (current_segment) { + DEBUG("{} close current segment", print_name); + return close_segment(); + } else { + INFO("{} no current segment", print_name); + return close_segment_ertr::now(); + } + }().finally([this] { + reset(); + }); +} + +SegmentAllocator::close_segment_ertr::future<> +SegmentAllocator::close_segment() +{ + LOG_PREFIX(SegmentAllocator::close_segment); + assert(can_write()); + // Note: make sure no one can access the current segment once closing + auto seg_to_close = std::move(current_segment); + auto close_segment_id = seg_to_close->get_segment_id(); + auto close_seg_info = segment_provider.get_seg_info(close_segment_id); + ceph_assert((close_seg_info.modify_time == NULL_TIME && + close_seg_info.num_extents == 0) || + (close_seg_info.modify_time != NULL_TIME && + close_seg_info.num_extents != 0)); + auto tail = segment_tail_t{ + close_seg_info.seq, + close_segment_id, + current_segment_nonce, + type, + timepoint_to_mod(close_seg_info.modify_time), + close_seg_info.num_extents}; + ceph::bufferlist bl; + encode(tail, bl); + INFO("{} close segment {}, written_to={}", + print_name, + tail, + written_to); + + bufferptr bp(ceph::buffer::create_page_aligned(get_block_size())); + bp.zero(); + auto iter = bl.cbegin(); + iter.copy(bl.length(), bp.c_str()); + bl.clear(); + bl.append(bp); + + assert(bl.length() == sm_group.get_rounded_tail_length()); + + auto p_seg_to_close = seg_to_close.get(); + return p_seg_to_close->advance_wp( + sm_group.get_segment_size() - sm_group.get_rounded_tail_length() + 
).safe_then([this, FNAME, bl=std::move(bl), p_seg_to_close]() mutable { + DEBUG("Writing tail info to segment {}", p_seg_to_close->get_segment_id()); + return p_seg_to_close->write( + sm_group.get_segment_size() - sm_group.get_rounded_tail_length(), + std::move(bl)); + }).safe_then([p_seg_to_close] { + return p_seg_to_close->close(); + }).safe_then([this, seg_to_close=std::move(seg_to_close)] { + segment_provider.close_segment(seg_to_close->get_segment_id()); + }).handle_error( + close_segment_ertr::pass_further{}, + crimson::ct_error::assert_all { + "Invalid error in SegmentAllocator::close_segment" + }); + +} + +} diff --git a/src/crimson/os/seastore/journal/segment_allocator.h b/src/crimson/os/seastore/journal/segment_allocator.h new file mode 100644 index 000000000..292c23070 --- /dev/null +++ b/src/crimson/os/seastore/journal/segment_allocator.h @@ -0,0 +1,131 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab expandtab + +#pragma once + +#include <optional> +#include <seastar/core/circular_buffer.hh> +#include <seastar/core/metrics.hh> +#include <seastar/core/shared_future.hh> + +#include "include/buffer.h" + +#include "crimson/common/errorator.h" +#include "crimson/os/seastore/segment_manager_group.h" +#include "crimson/os/seastore/segment_seq_allocator.h" +#include "crimson/os/seastore/journal/record_submitter.h" +#include "crimson/os/seastore/async_cleaner.h" + +namespace crimson::os::seastore { + class SegmentProvider; + class JournalTrimmer; +} + +namespace crimson::os::seastore::journal { + +/** + * SegmentAllocator + * + * Maintain an available segment for writes. + */ +class SegmentAllocator : public JournalAllocator { + + public: + // SegmentAllocator specific methods + SegmentAllocator(JournalTrimmer *trimmer, + data_category_t category, + rewrite_gen_t gen, + SegmentProvider &sp, + SegmentSeqAllocator &ssa); + + segment_id_t get_segment_id() const { + assert(can_write()); + return current_segment->get_segment_id(); + } + + extent_len_t get_max_write_length() const { + return sm_group.get_segment_size() - + sm_group.get_rounded_header_length() - + sm_group.get_rounded_tail_length(); + } + + public: + // overriding methods + const std::string& get_name() const final { + return print_name; + } + + extent_len_t get_block_size() const final { + return sm_group.get_block_size(); + } + + bool can_write() const final { + return !!current_segment; + } + + segment_nonce_t get_nonce() const final { + assert(can_write()); + return current_segment_nonce; + } + + // returns true iff the current segment has insufficient space + bool needs_roll(std::size_t length) const final { + assert(can_write()); + assert(current_segment->get_write_capacity() == + sm_group.get_segment_size()); + auto write_capacity = current_segment->get_write_capacity() - + sm_group.get_rounded_tail_length(); + return length + written_to > std::size_t(write_capacity); + } + + // open for write and generate the correct print name + open_ret open(bool is_mkfs) final; + + // close the current segment and initialize next one + roll_ertr::future<> roll() final; + + // write the buffer, return the write result + // + // May be called concurrently, but writes may complete in any order. + // If rolling/opening, no write is allowed. 
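+ //
+ // Rough caller-side sketch (illustrative; the real driver is
+ // RecordSubmitter):
+ //   if (allocator.needs_roll(bl.length())) {
+ //     return allocator.roll(
+ //     ).safe_then([&allocator, bl=std::move(bl)]() mutable {
+ //       return allocator.write(std::move(bl));
+ //     });
+ //   }
+ //   return allocator.write(std::move(bl));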
+ write_ret write(ceph::bufferlist&& to_write) final; + + using close_ertr = base_ertr; + close_ertr::future<> close() final; + + void update_modify_time(record_t& record) final { + segment_provider.update_modify_time( + get_segment_id(), + record.modify_time, + record.extents.size()); + } + + private: + open_ret do_open(bool is_mkfs); + + void reset() { + current_segment.reset(); + written_to = 0; + + current_segment_nonce = 0; + } + + using close_segment_ertr = base_ertr; + close_segment_ertr::future<> close_segment(); + + // device id is not available during construction, + // so generate the print_name later. + std::string print_name; + const segment_type_t type; // JOURNAL or OOL + const data_category_t category; + const rewrite_gen_t gen; + SegmentProvider &segment_provider; + SegmentManagerGroup &sm_group; + SegmentRef current_segment; + segment_off_t written_to; + SegmentSeqAllocator &segment_seq_allocator; + segment_nonce_t current_segment_nonce; + JournalTrimmer *trimmer; +}; + +} diff --git a/src/crimson/os/seastore/journal/segmented_journal.cc b/src/crimson/os/seastore/journal/segmented_journal.cc new file mode 100644 index 000000000..58df91374 --- /dev/null +++ b/src/crimson/os/seastore/journal/segmented_journal.cc @@ -0,0 +1,433 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <iostream> + +#include <boost/iterator/counting_iterator.hpp> + +#include "include/intarith.h" + +#include "segmented_journal.h" + +#include "crimson/common/config_proxy.h" +#include "crimson/os/seastore/logging.h" + +SET_SUBSYS(seastore_journal); + +/* + * format: + * - H<handle-addr> information + * + * levels: + * - INFO: major initiation, closing, rolling and replay operations + * - DEBUG: INFO details, major submit operations + * - TRACE: DEBUG details + */ + +namespace crimson::os::seastore::journal { + +SegmentedJournal::SegmentedJournal( + SegmentProvider &segment_provider, + JournalTrimmer &trimmer) + : segment_seq_allocator( + new SegmentSeqAllocator(segment_type_t::JOURNAL)), + journal_segment_allocator(&trimmer, + data_category_t::METADATA, + INLINE_GENERATION, + segment_provider, + *segment_seq_allocator), + record_submitter(crimson::common::get_conf<uint64_t>( + "seastore_journal_iodepth_limit"), + crimson::common::get_conf<uint64_t>( + "seastore_journal_batch_capacity"), + crimson::common::get_conf<Option::size_t>( + "seastore_journal_batch_flush_size"), + crimson::common::get_conf<double>( + "seastore_journal_batch_preferred_fullness"), + journal_segment_allocator), + sm_group(*segment_provider.get_segment_manager_group()), + trimmer{trimmer} +{ +} + +SegmentedJournal::open_for_mkfs_ret +SegmentedJournal::open_for_mkfs() +{ + return record_submitter.open(true); +} + +SegmentedJournal::open_for_mount_ret +SegmentedJournal::open_for_mount() +{ + return record_submitter.open(false); +} + +SegmentedJournal::close_ertr::future<> SegmentedJournal::close() +{ + LOG_PREFIX(Journal::close); + INFO("closing, committed_to={}", + record_submitter.get_committed_to()); + return record_submitter.close(); +} + +SegmentedJournal::prep_replay_segments_fut +SegmentedJournal::prep_replay_segments( + std::vector<std::pair<segment_id_t, segment_header_t>> segments) +{ + LOG_PREFIX(Journal::prep_replay_segments); + if (segments.empty()) { + ERROR("no journal segments for replay"); + return crimson::ct_error::input_output_error::make(); + } + std::sort( + segments.begin(), + segments.end(), + [](const auto <, const auto &rt) { + return 
lt.second.segment_seq < + rt.second.segment_seq; + }); + + segment_seq_allocator->set_next_segment_seq( + segments.rbegin()->second.segment_seq + 1); + std::for_each( + segments.begin(), + segments.end(), + [FNAME](auto &seg) + { + if (seg.first != seg.second.physical_segment_id || + seg.second.get_type() != segment_type_t::JOURNAL) { + ERROR("illegal journal segment for replay -- {}", seg.second); + ceph_abort(); + } + }); + + auto last_segment_id = segments.rbegin()->first; + auto last_header = segments.rbegin()->second; + return scan_last_segment(last_segment_id, last_header + ).safe_then([this, FNAME, segments=std::move(segments)] { + INFO("dirty_tail={}, alloc_tail={}", + trimmer.get_dirty_tail(), + trimmer.get_alloc_tail()); + auto journal_tail = trimmer.get_journal_tail(); + auto journal_tail_paddr = journal_tail.offset; + ceph_assert(journal_tail != JOURNAL_SEQ_NULL); + ceph_assert(journal_tail_paddr != P_ADDR_NULL); + auto from = std::find_if( + segments.begin(), + segments.end(), + [&journal_tail_paddr](const auto &seg) -> bool { + auto& seg_addr = journal_tail_paddr.as_seg_paddr(); + return seg.first == seg_addr.get_segment_id(); + }); + if (from->second.segment_seq != journal_tail.segment_seq) { + ERROR("journal_tail {} does not match {}", + journal_tail, from->second); + ceph_abort(); + } + + auto num_segments = segments.end() - from; + INFO("{} segments to replay", num_segments); + auto ret = replay_segments_t(num_segments); + std::transform( + from, segments.end(), ret.begin(), + [this](const auto &p) { + auto ret = journal_seq_t{ + p.second.segment_seq, + paddr_t::make_seg_paddr( + p.first, + sm_group.get_block_size()) + }; + return std::make_pair(ret, p.second); + }); + ret[0].first.offset = journal_tail_paddr; + return prep_replay_segments_fut( + replay_ertr::ready_future_marker{}, + std::move(ret)); + }); +} + +SegmentedJournal::scan_last_segment_ertr::future<> +SegmentedJournal::scan_last_segment( + const segment_id_t &segment_id, + const segment_header_t &segment_header) +{ + LOG_PREFIX(SegmentedJournal::scan_last_segment); + assert(segment_id == segment_header.physical_segment_id); + trimmer.update_journal_tails( + segment_header.dirty_tail, segment_header.alloc_tail); + auto seq = journal_seq_t{ + segment_header.segment_seq, + paddr_t::make_seg_paddr(segment_id, 0) + }; + INFO("scanning journal tail deltas -- {}", segment_header); + return seastar::do_with( + scan_valid_records_cursor(seq), + SegmentManagerGroup::found_record_handler_t( + [FNAME, this]( + record_locator_t locator, + const record_group_header_t& record_group_header, + const bufferlist& mdbuf + ) -> SegmentManagerGroup::scan_valid_records_ertr::future<> + { + DEBUG("decoding {} at {}", record_group_header, locator); + bool has_tail_delta = false; + auto maybe_headers = try_decode_record_headers( + record_group_header, mdbuf); + if (!maybe_headers) { + // This should be impossible, we did check the crc on the mdbuf + ERROR("unable to decode headers from {} at {}", + record_group_header, locator); + ceph_abort(); + } + for (auto &record_header : *maybe_headers) { + ceph_assert(is_valid_transaction(record_header.type)); + if (is_background_transaction(record_header.type)) { + has_tail_delta = true; + } + } + if (has_tail_delta) { + bool found_delta = false; + auto maybe_record_deltas_list = try_decode_deltas( + record_group_header, mdbuf, locator.record_block_base); + if (!maybe_record_deltas_list) { + ERROR("unable to decode deltas from {} at {}", + record_group_header, locator); + ceph_abort(); + } + 
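// Walk every decoded delta in this record group; only JOURNAL_TAIL
+ // deltas matter here -- they carry the latest dirty/alloc tails to
+ // feed back into the trimmer via update_journal_tails().
+ 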
for (auto &record_deltas : *maybe_record_deltas_list) { + for (auto &[ctime, delta] : record_deltas.deltas) { + if (delta.type == extent_types_t::JOURNAL_TAIL) { + found_delta = true; + journal_tail_delta_t tail_delta; + decode(tail_delta, delta.bl); + auto start_seq = locator.write_result.start_seq; + DEBUG("got {}, at {}", tail_delta, start_seq); + ceph_assert(tail_delta.dirty_tail != JOURNAL_SEQ_NULL); + ceph_assert(tail_delta.alloc_tail != JOURNAL_SEQ_NULL); + trimmer.update_journal_tails( + tail_delta.dirty_tail, tail_delta.alloc_tail); + } + } + } + ceph_assert(found_delta); + } + return seastar::now(); + }), + [this, nonce=segment_header.segment_nonce](auto &cursor, auto &handler) + { + return sm_group.scan_valid_records( + cursor, + nonce, + std::numeric_limits<std::size_t>::max(), + handler).discard_result(); + }); +} + +SegmentedJournal::replay_ertr::future<> +SegmentedJournal::replay_segment( + journal_seq_t seq, + segment_header_t header, + delta_handler_t &handler, + replay_stats_t &stats) +{ + LOG_PREFIX(Journal::replay_segment); + INFO("starting at {} -- {}", seq, header); + return seastar::do_with( + scan_valid_records_cursor(seq), + SegmentManagerGroup::found_record_handler_t( + [&handler, this, &stats]( + record_locator_t locator, + const record_group_header_t& header, + const bufferlist& mdbuf) + -> SegmentManagerGroup::scan_valid_records_ertr::future<> + { + LOG_PREFIX(Journal::replay_segment); + ++stats.num_record_groups; + auto maybe_record_deltas_list = try_decode_deltas( + header, mdbuf, locator.record_block_base); + if (!maybe_record_deltas_list) { + // This should be impossible, we did check the crc on the mdbuf + ERROR("unable to decode deltas for record {} at {}", + header, locator); + return crimson::ct_error::input_output_error::make(); + } + + return seastar::do_with( + std::move(*maybe_record_deltas_list), + [write_result=locator.write_result, + this, + FNAME, + &handler, + &stats](auto& record_deltas_list) + { + return crimson::do_for_each( + record_deltas_list, + [write_result, + this, + FNAME, + &handler, + &stats](record_deltas_t& record_deltas) + { + ++stats.num_records; + auto locator = record_locator_t{ + record_deltas.record_block_base, + write_result + }; + DEBUG("processing {} deltas at block_base {}", + record_deltas.deltas.size(), + locator); + return crimson::do_for_each( + record_deltas.deltas, + [locator, + this, + &handler, + &stats](auto &p) + { + auto& modify_time = p.first; + auto& delta = p.second; + return handler( + locator, + delta, + trimmer.get_dirty_tail(), + trimmer.get_alloc_tail(), + modify_time + ).safe_then([&stats, delta_type=delta.type](bool is_applied) { + if (is_applied) { + // see Cache::replay_delta() + assert(delta_type != extent_types_t::JOURNAL_TAIL); + if (delta_type == extent_types_t::ALLOC_INFO) { + ++stats.num_alloc_deltas; + } else { + ++stats.num_dirty_deltas; + } + } + }); + }); + }); + }); + }), + [=, this](auto &cursor, auto &dhandler) { + return sm_group.scan_valid_records( + cursor, + header.segment_nonce, + std::numeric_limits<size_t>::max(), + dhandler).safe_then([](auto){} + ).handle_error( + replay_ertr::pass_further{}, + crimson::ct_error::assert_all{ + "shouldn't meet with any other error other replay_ertr" + } + ); + } + ); +} + +SegmentedJournal::replay_ret SegmentedJournal::replay( + delta_handler_t &&delta_handler) +{ + LOG_PREFIX(Journal::replay); + return sm_group.find_journal_segment_headers( + ).safe_then([this, FNAME, delta_handler=std::move(delta_handler)] + (auto &&segment_headers) mutable 
-> replay_ret { + INFO("got {} segments", segment_headers.size()); + return seastar::do_with( + std::move(delta_handler), + replay_segments_t(), + replay_stats_t(), + [this, segment_headers=std::move(segment_headers), FNAME] + (auto &handler, auto &segments, auto &stats) mutable -> replay_ret { + return prep_replay_segments(std::move(segment_headers) + ).safe_then([this, &handler, &segments, &stats](auto replay_segs) mutable { + segments = std::move(replay_segs); + return crimson::do_for_each(segments,[this, &handler, &stats](auto i) mutable { + return replay_segment(i.first, i.second, handler, stats); + }); + }).safe_then([&stats, FNAME] { + INFO("replay done, record_groups={}, records={}, " + "alloc_deltas={}, dirty_deltas={}", + stats.num_record_groups, + stats.num_records, + stats.num_alloc_deltas, + stats.num_dirty_deltas); + }); + }); + }); +} + +seastar::future<> SegmentedJournal::flush(OrderingHandle &handle) +{ + LOG_PREFIX(SegmentedJournal::flush); + DEBUG("H{} flush ...", (void*)&handle); + assert(write_pipeline); + return handle.enter(write_pipeline->device_submission + ).then([this, &handle] { + return handle.enter(write_pipeline->finalize); + }).then([FNAME, &handle] { + DEBUG("H{} flush done", (void*)&handle); + }); +} + +SegmentedJournal::submit_record_ret +SegmentedJournal::do_submit_record( + record_t &&record, + OrderingHandle &handle) +{ + LOG_PREFIX(SegmentedJournal::do_submit_record); + if (!record_submitter.is_available()) { + DEBUG("H{} wait ...", (void*)&handle); + return record_submitter.wait_available( + ).safe_then([this, record=std::move(record), &handle]() mutable { + return do_submit_record(std::move(record), handle); + }); + } + auto action = record_submitter.check_action(record.size); + if (action == RecordSubmitter::action_t::ROLL) { + DEBUG("H{} roll, unavailable ...", (void*)&handle); + return record_submitter.roll_segment( + ).safe_then([this, record=std::move(record), &handle]() mutable { + return do_submit_record(std::move(record), handle); + }); + } else { // SUBMIT_FULL/NOT_FULL + DEBUG("H{} submit {} ...", + (void*)&handle, + action == RecordSubmitter::action_t::SUBMIT_FULL ? 
+ "FULL" : "NOT_FULL"); + auto submit_fut = record_submitter.submit(std::move(record)); + return handle.enter(write_pipeline->device_submission + ).then([submit_fut=std::move(submit_fut)]() mutable { + return std::move(submit_fut); + }).safe_then([FNAME, this, &handle](record_locator_t result) { + return handle.enter(write_pipeline->finalize + ).then([FNAME, this, result, &handle] { + DEBUG("H{} finish with {}", (void*)&handle, result); + auto new_committed_to = result.write_result.get_end_seq(); + record_submitter.update_committed_to(new_committed_to); + return result; + }); + }); + } +} + +SegmentedJournal::submit_record_ret +SegmentedJournal::submit_record( + record_t &&record, + OrderingHandle &handle) +{ + LOG_PREFIX(SegmentedJournal::submit_record); + DEBUG("H{} {} start ...", (void*)&handle, record); + assert(write_pipeline); + auto expected_size = record_group_size_t( + record.size, + journal_segment_allocator.get_block_size() + ).get_encoded_length(); + auto max_record_length = journal_segment_allocator.get_max_write_length(); + if (expected_size > max_record_length) { + ERROR("H{} {} exceeds max record size {}", + (void*)&handle, record, max_record_length); + return crimson::ct_error::erange::make(); + } + + return do_submit_record(std::move(record), handle); +} + +} diff --git a/src/crimson/os/seastore/journal/segmented_journal.h b/src/crimson/os/seastore/journal/segmented_journal.h new file mode 100644 index 000000000..3d580817c --- /dev/null +++ b/src/crimson/os/seastore/journal/segmented_journal.h @@ -0,0 +1,105 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <seastar/core/future.hh> + +#include "include/ceph_assert.h" +#include "include/buffer.h" +#include "include/denc.h" + +#include "crimson/os/seastore/async_cleaner.h" +#include "crimson/os/seastore/journal.h" +#include "crimson/os/seastore/segment_manager_group.h" +#include "crimson/os/seastore/ordering_handle.h" +#include "crimson/os/seastore/seastore_types.h" +#include "crimson/osd/exceptions.h" +#include "segment_allocator.h" +#include "crimson/os/seastore/segment_seq_allocator.h" +#include "record_submitter.h" + +namespace crimson::os::seastore::journal { +/** + * Manages stream of atomically written records to a SegmentManager. 
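+ *
+ * Rough usage sketch (illustrative only; in practice the wiring lives in
+ * the transaction manager layer and error handling is omitted here):
+ *
+ *   SegmentedJournal journal(segment_provider, trimmer);
+ *   journal.set_write_pipeline(&pipeline);
+ *   return journal.open_for_mount(
+ *   ).safe_then([&journal, &handle, record=std::move(record)](auto seq) mutable {
+ *     return journal.submit_record(std::move(record), handle);
+ *   });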
+ */ +class SegmentedJournal : public Journal { +public: + SegmentedJournal( + SegmentProvider &segment_provider, + JournalTrimmer &trimmer); + ~SegmentedJournal() {} + + JournalTrimmer &get_trimmer() final { + return trimmer; + } + + open_for_mkfs_ret open_for_mkfs() final; + + open_for_mount_ret open_for_mount() final; + + close_ertr::future<> close() final; + + submit_record_ret submit_record( + record_t &&record, + OrderingHandle &handle) final; + + seastar::future<> flush(OrderingHandle &handle) final; + + replay_ret replay(delta_handler_t &&delta_handler) final; + + void set_write_pipeline(WritePipeline *_write_pipeline) final { + write_pipeline = _write_pipeline; + } + + journal_type_t get_type() final { + return journal_type_t::SEGMENTED; + } + seastar::future<> finish_commit(transaction_type_t type) { + return seastar::now(); + } + +private: + submit_record_ret do_submit_record( + record_t &&record, + OrderingHandle &handle + ); + + SegmentSeqAllocatorRef segment_seq_allocator; + SegmentAllocator journal_segment_allocator; + RecordSubmitter record_submitter; + SegmentManagerGroup &sm_group; + JournalTrimmer &trimmer; + WritePipeline* write_pipeline = nullptr; + + /// return ordered vector of segments to replay + using replay_segments_t = std::vector< + std::pair<journal_seq_t, segment_header_t>>; + using prep_replay_segments_fut = replay_ertr::future< + replay_segments_t>; + prep_replay_segments_fut prep_replay_segments( + std::vector<std::pair<segment_id_t, segment_header_t>> segments); + + /// scan the last segment for tail deltas + using scan_last_segment_ertr = replay_ertr; + scan_last_segment_ertr::future<> scan_last_segment( + const segment_id_t&, const segment_header_t&); + + struct replay_stats_t { + std::size_t num_record_groups = 0; + std::size_t num_records = 0; + std::size_t num_alloc_deltas = 0; + std::size_t num_dirty_deltas = 0; + }; + + /// replays records starting at start through end of segment + replay_ertr::future<> + replay_segment( + journal_seq_t start, ///< [in] starting addr, seq + segment_header_t header, ///< [in] segment header + delta_handler_t &delta_handler, ///< [in] processes deltas in order + replay_stats_t &stats ///< [out] replay stats + ); +}; + +} diff --git a/src/crimson/os/seastore/lba_manager.cc b/src/crimson/os/seastore/lba_manager.cc new file mode 100644 index 000000000..d113bbd1e --- /dev/null +++ b/src/crimson/os/seastore/lba_manager.cc @@ -0,0 +1,31 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "crimson/os/seastore/lba_manager.h" +#include "crimson/os/seastore/lba_manager/btree/btree_lba_manager.h" + +namespace crimson::os::seastore { + +LBAManager::update_mappings_ret +LBAManager::update_mappings( + Transaction& t, + const std::list<LogicalCachedExtentRef>& extents) +{ + return trans_intr::do_for_each(extents, + [this, &t](auto &extent) { + return update_mapping( + t, + extent->get_laddr(), + extent->get_prior_paddr_and_reset(), + extent->get_paddr(), + nullptr // all the extents should have already been + // added to the fixed_kv_btree + ); + }); +} + +LBAManagerRef lba_manager::create_lba_manager(Cache &cache) { + return LBAManagerRef(new btree::BtreeLBAManager(cache)); +} + +} diff --git a/src/crimson/os/seastore/lba_manager.h b/src/crimson/os/seastore/lba_manager.h new file mode 100644 index 000000000..d7adf2304 --- /dev/null +++ b/src/crimson/os/seastore/lba_manager.h @@ -0,0 +1,237 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// 
vim: ts=8 sw=2 smarttab + +#pragma once + +#include <iostream> + +#include <boost/intrusive_ptr.hpp> +#include <boost/smart_ptr/intrusive_ref_counter.hpp> + +#include <seastar/core/future.hh> + +#include "include/ceph_assert.h" +#include "include/buffer_fwd.h" +#include "include/interval_set.h" +#include "common/interval_map.h" + +#include "crimson/osd/exceptions.h" + +#include "crimson/os/seastore/cache.h" +#include "crimson/os/seastore/seastore_types.h" + +namespace crimson::os::seastore { + +/** + * Abstract interface for managing the logical to physical mapping + */ +class LBAManager { +public: + using base_iertr = Cache::base_iertr; + + using mkfs_iertr = base_iertr; + using mkfs_ret = mkfs_iertr::future<>; + virtual mkfs_ret mkfs( + Transaction &t + ) = 0; + + /** + * Fetches mappings for laddr_t in range [offset, offset + len) + * + * Future will not resolve until all pins have resolved (set_paddr called) + * For indirect lba mappings, get_mappings will always retrieve the original + * lba value. + */ + using get_mappings_iertr = base_iertr; + using get_mappings_ret = get_mappings_iertr::future<lba_pin_list_t>; + virtual get_mappings_ret get_mappings( + Transaction &t, + laddr_t offset, extent_len_t length) = 0; + + /** + * Fetches mappings for a list of laddr_t in range [offset, offset + len) + * + * Future will not resolve until all pins have resolved (set_paddr called) + * For indirect lba mappings, get_mappings will always retrieve the original + * lba value. + */ + virtual get_mappings_ret get_mappings( + Transaction &t, + laddr_list_t &&extent_lisk) = 0; + + /** + * Fetches the mapping for laddr_t + * + * Future will not resolve until the pin has resolved (set_paddr called) + * For indirect lba mappings, get_mapping will always retrieve the original + * lba value. + */ + using get_mapping_iertr = base_iertr::extend< + crimson::ct_error::enoent>; + using get_mapping_ret = get_mapping_iertr::future<LBAMappingRef>; + virtual get_mapping_ret get_mapping( + Transaction &t, + laddr_t offset) = 0; + + /** + * Allocates a new mapping referenced by LBARef + * + * Offset will be relative to the block offset of the record + * This mapping will block from transaction submission until set_paddr + * is called on the LBAMapping. 
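+   *
+   * Illustrative call (a sketch; variable names assumed):
+   *
+   *   lba_manager->alloc_extent(t, hint_laddr, extent->get_length(),
+   *                             extent->get_paddr(), *extent
+   *   ).si_then([](LBAMappingRef pin) { /* record the new mapping */ });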
+ */ + using alloc_extent_iertr = base_iertr; + using alloc_extent_ret = alloc_extent_iertr::future<LBAMappingRef>; + virtual alloc_extent_ret alloc_extent( + Transaction &t, + laddr_t hint, + extent_len_t len, + paddr_t addr, + LogicalCachedExtent &nextent) = 0; + + virtual alloc_extent_ret clone_extent( + Transaction &t, + laddr_t hint, + extent_len_t len, + laddr_t intermediate_key, + paddr_t actual_addr, + laddr_t intermediate_base) = 0; + + virtual alloc_extent_ret reserve_region( + Transaction &t, + laddr_t hint, + extent_len_t len) = 0; + + struct ref_update_result_t { + unsigned refcount = 0; + pladdr_t addr; + extent_len_t length = 0; + }; + using ref_iertr = base_iertr::extend< + crimson::ct_error::enoent>; + using ref_ret = ref_iertr::future<ref_update_result_t>; + + /** + * Decrements ref count on extent + * + * @return returns resulting refcount + */ + virtual ref_ret decref_extent( + Transaction &t, + laddr_t addr, + bool cascade_remove) = 0; + + /** + * Increments ref count on extent + * + * @return returns resulting refcount + */ + virtual ref_ret incref_extent( + Transaction &t, + laddr_t addr) = 0; + + /** + * Increments ref count on extent + * + * @return returns resulting refcount + */ + virtual ref_ret incref_extent( + Transaction &t, + laddr_t addr, + int delta) = 0; + + /** + * Should be called after replay on each cached extent. + * Implementation must initialize the LBAMapping on any + * LogicalCachedExtent's and may also read in any dependent + * structures, etc. + * + * @return returns whether the extent is alive + */ + using init_cached_extent_iertr = base_iertr; + using init_cached_extent_ret = init_cached_extent_iertr::future<bool>; + virtual init_cached_extent_ret init_cached_extent( + Transaction &t, + CachedExtentRef e) = 0; + + using check_child_trackers_ret = base_iertr::future<>; + virtual check_child_trackers_ret check_child_trackers(Transaction &t) = 0; + + /** + * Calls f for each mapping in [begin, end) + */ + using scan_mappings_iertr = base_iertr; + using scan_mappings_ret = scan_mappings_iertr::future<>; + using scan_mappings_func_t = std::function< + void(laddr_t, paddr_t, extent_len_t)>; + virtual scan_mappings_ret scan_mappings( + Transaction &t, + laddr_t begin, + laddr_t end, + scan_mappings_func_t &&f) = 0; + + /** + * rewrite_extent + * + * rewrite extent into passed transaction + */ + using rewrite_extent_iertr = base_iertr; + using rewrite_extent_ret = rewrite_extent_iertr::future<>; + virtual rewrite_extent_ret rewrite_extent( + Transaction &t, + CachedExtentRef extent) = 0; + + /** + * update_mapping + * + * update lba mapping for a delayed allocated extent + */ + using update_mapping_iertr = base_iertr; + using update_mapping_ret = base_iertr::future<>; + virtual update_mapping_ret update_mapping( + Transaction& t, + laddr_t laddr, + paddr_t prev_addr, + paddr_t paddr, + LogicalCachedExtent *nextent) = 0; + + /** + * update_mappings + * + * update lba mappings for delayed allocated extents + */ + using update_mappings_iertr = update_mapping_iertr; + using update_mappings_ret = update_mapping_ret; + update_mappings_ret update_mappings( + Transaction& t, + const std::list<LogicalCachedExtentRef>& extents); + + /** + * get_physical_extent_if_live + * + * Returns extent at addr/laddr if still live (if laddr + * still points at addr). Extent must be an internal, physical + * extent. + * + * Returns a null CachedExtentRef if extent is not live. 
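+   *
+   * Illustrative check (a sketch; typically used when deciding whether a
+   * physical btree node still needs to be rewritten):
+   *
+   *   lba_manager->get_physical_extent_if_live(t, type, addr, laddr, len
+   *   ).si_then([](CachedExtentRef ext) {
+   *     if (!ext) { /* the extent is dead; nothing to rewrite */ }
+   *   });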
+ */ + using get_physical_extent_if_live_iertr = base_iertr; + using get_physical_extent_if_live_ret = + get_physical_extent_if_live_iertr::future<CachedExtentRef>; + virtual get_physical_extent_if_live_ret get_physical_extent_if_live( + Transaction &t, + extent_types_t type, + paddr_t addr, + laddr_t laddr, + extent_len_t len) = 0; + + virtual ~LBAManager() {} +}; +using LBAManagerRef = std::unique_ptr<LBAManager>; + +class Cache; +namespace lba_manager { +LBAManagerRef create_lba_manager(Cache &cache); +} + +} diff --git a/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc new file mode 100644 index 000000000..a607cd612 --- /dev/null +++ b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc @@ -0,0 +1,761 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <sys/mman.h> +#include <string.h> + +#include <seastar/core/metrics.hh> + +#include "include/buffer.h" +#include "crimson/os/seastore/lba_manager/btree/btree_lba_manager.h" +#include "crimson/os/seastore/lba_manager/btree/lba_btree_node.h" +#include "crimson/os/seastore/logging.h" + +SET_SUBSYS(seastore_lba); +/* + * levels: + * - INFO: mkfs + * - DEBUG: modification operations + * - TRACE: read operations, DEBUG details + */ + +namespace crimson::os::seastore { + +template <typename T> +Transaction::tree_stats_t& get_tree_stats(Transaction &t) +{ + return t.get_lba_tree_stats(); +} + +template Transaction::tree_stats_t& +get_tree_stats< + crimson::os::seastore::lba_manager::btree::LBABtree>( + Transaction &t); + +template <typename T> +phy_tree_root_t& get_phy_tree_root(root_t &r) +{ + return r.lba_root; +} + +template phy_tree_root_t& +get_phy_tree_root< + crimson::os::seastore::lba_manager::btree::LBABtree>(root_t &r); + +template <> +const get_phy_tree_root_node_ret get_phy_tree_root_node< + crimson::os::seastore::lba_manager::btree::LBABtree>( + const RootBlockRef &root_block, op_context_t<laddr_t> c) +{ + auto lba_root = root_block->lba_root_node; + if (lba_root) { + ceph_assert(lba_root->is_initial_pending() + == root_block->is_pending()); + return {true, + trans_intr::make_interruptible( + c.cache.get_extent_viewable_by_trans(c.trans, lba_root))}; + } else if (root_block->is_pending()) { + auto &prior = static_cast<RootBlock&>(*root_block->get_prior_instance()); + lba_root = prior.lba_root_node; + if (lba_root) { + return {true, + trans_intr::make_interruptible( + c.cache.get_extent_viewable_by_trans(c.trans, lba_root))}; + } else { + return {false, + trans_intr::make_interruptible( + Cache::get_extent_ertr::make_ready_future< + CachedExtentRef>())}; + } + } else { + return {false, + trans_intr::make_interruptible( + Cache::get_extent_ertr::make_ready_future< + CachedExtentRef>())}; + } +} + +template <typename ROOT> +void link_phy_tree_root_node(RootBlockRef &root_block, ROOT* lba_root) { + root_block->lba_root_node = lba_root; + ceph_assert(lba_root != nullptr); + lba_root->root_block = root_block; +} + +template void link_phy_tree_root_node( + RootBlockRef &root_block, lba_manager::btree::LBAInternalNode* lba_root); +template void link_phy_tree_root_node( + RootBlockRef &root_block, lba_manager::btree::LBALeafNode* lba_root); +template void link_phy_tree_root_node( + RootBlockRef &root_block, lba_manager::btree::LBANode* lba_root); + +template <> +void unlink_phy_tree_root_node<laddr_t>(RootBlockRef &root_block) { + root_block->lba_root_node = nullptr; +} + +} + +namespace 
crimson::os::seastore::lba_manager::btree { + +BtreeLBAManager::mkfs_ret +BtreeLBAManager::mkfs( + Transaction &t) +{ + LOG_PREFIX(BtreeLBAManager::mkfs); + INFOT("start", t); + return cache.get_root(t).si_then([this, &t](auto croot) { + assert(croot->is_mutation_pending()); + croot->get_root().lba_root = LBABtree::mkfs(croot, get_context(t)); + return mkfs_iertr::now(); + }).handle_error_interruptible( + mkfs_iertr::pass_further{}, + crimson::ct_error::assert_all{ + "Invalid error in BtreeLBAManager::mkfs" + } + ); +} + +BtreeLBAManager::get_mappings_ret +BtreeLBAManager::get_mappings( + Transaction &t, + laddr_t offset, extent_len_t length) +{ + LOG_PREFIX(BtreeLBAManager::get_mappings); + TRACET("{}~{}", t, offset, length); + auto c = get_context(t); + return with_btree_state<LBABtree, lba_pin_list_t>( + cache, + c, + [c, offset, length, FNAME, this](auto &btree, auto &ret) { + return seastar::do_with( + std::list<BtreeLBAMappingRef>(), + [offset, length, c, FNAME, this, &ret, &btree](auto &pin_list) { + return LBABtree::iterate_repeat( + c, + btree.upper_bound_right(c, offset), + [&pin_list, offset, length, c, FNAME](auto &pos) { + if (pos.is_end() || pos.get_key() >= (offset + length)) { + TRACET("{}~{} done with {} results", + c.trans, offset, length, pin_list.size()); + return LBABtree::iterate_repeat_ret_inner( + interruptible::ready_future_marker{}, + seastar::stop_iteration::yes); + } + TRACET("{}~{} got {}, {}, repeat ...", + c.trans, offset, length, pos.get_key(), pos.get_val()); + ceph_assert((pos.get_key() + pos.get_val().len) > offset); + pin_list.push_back(pos.get_pin(c)); + return LBABtree::iterate_repeat_ret_inner( + interruptible::ready_future_marker{}, + seastar::stop_iteration::no); + }).si_then([this, &ret, c, &pin_list] { + return _get_original_mappings(c, pin_list + ).si_then([&ret](auto _ret) { + ret = std::move(_ret); + }); + }); + }); + }); +} + +BtreeLBAManager::_get_original_mappings_ret +BtreeLBAManager::_get_original_mappings( + op_context_t<laddr_t> c, + std::list<BtreeLBAMappingRef> &pin_list) +{ + return seastar::do_with( + lba_pin_list_t(), + [this, c, &pin_list](auto &ret) { + return trans_intr::do_for_each( + pin_list, + [this, c, &ret](auto &pin) { + LOG_PREFIX(BtreeLBAManager::get_mappings); + if (pin->get_raw_val().is_paddr()) { + ret.emplace_back(std::move(pin)); + return get_mappings_iertr::now(); + } + TRACET( + "getting original mapping for indirect mapping {}~{}", + c.trans, pin->get_key(), pin->get_length()); + return this->get_mappings( + c.trans, pin->get_raw_val().get_laddr(), pin->get_length() + ).si_then([&pin, &ret, c](auto new_pin_list) { + LOG_PREFIX(BtreeLBAManager::get_mappings); + assert(new_pin_list.size() == 1); + auto &new_pin = new_pin_list.front(); + auto intermediate_key = pin->get_raw_val().get_laddr(); + assert(!new_pin->is_indirect()); + assert(new_pin->get_key() <= intermediate_key); + assert(new_pin->get_key() + new_pin->get_length() >= + intermediate_key + pin->get_length()); + + TRACET("Got mapping {}~{} for indirect mapping {}~{}, " + "intermediate_key {}", + c.trans, + new_pin->get_key(), new_pin->get_length(), + pin->get_key(), pin->get_length(), + pin->get_raw_val().get_laddr()); + auto &btree_new_pin = static_cast<BtreeLBAMapping&>(*new_pin); + btree_new_pin.set_key_for_indirect( + pin->get_key(), + pin->get_length(), + pin->get_raw_val().get_laddr()); + ret.emplace_back(std::move(new_pin)); + return seastar::now(); + }).handle_error_interruptible( + crimson::ct_error::input_output_error::pass_further{}, + 
crimson::ct_error::assert_all("unexpected enoent") + ); + } + ).si_then([&ret] { + return std::move(ret); + }); + }); +} + + +BtreeLBAManager::get_mappings_ret +BtreeLBAManager::get_mappings( + Transaction &t, + laddr_list_t &&list) +{ + LOG_PREFIX(BtreeLBAManager::get_mappings); + TRACET("{}", t, list); + auto l = std::make_unique<laddr_list_t>(std::move(list)); + auto retptr = std::make_unique<lba_pin_list_t>(); + auto &ret = *retptr; + return trans_intr::do_for_each( + l->begin(), + l->end(), + [this, &t, &ret](const auto &p) { + return this->get_mappings(t, p.first, p.second).si_then( + [&ret](auto res) { + ret.splice(ret.end(), res, res.begin(), res.end()); + return get_mappings_iertr::now(); + }); + }).si_then([l=std::move(l), retptr=std::move(retptr)]() mutable { + return std::move(*retptr); + }); +} + +BtreeLBAManager::get_mapping_ret +BtreeLBAManager::get_mapping( + Transaction &t, + laddr_t offset) +{ + LOG_PREFIX(BtreeLBAManager::get_mapping); + TRACET("{}", t, offset); + return _get_mapping(t, offset + ).si_then([](auto pin) { + return get_mapping_iertr::make_ready_future<LBAMappingRef>(std::move(pin)); + }); +} + +BtreeLBAManager::_get_mapping_ret +BtreeLBAManager::_get_mapping( + Transaction &t, + laddr_t offset) +{ + LOG_PREFIX(BtreeLBAManager::_get_mapping); + TRACET("{}", t, offset); + auto c = get_context(t); + return with_btree_ret<LBABtree, BtreeLBAMappingRef>( + cache, + c, + [FNAME, c, offset, this](auto &btree) { + return btree.lower_bound( + c, offset + ).si_then([FNAME, offset, c](auto iter) -> _get_mapping_ret { + if (iter.is_end() || iter.get_key() != offset) { + ERRORT("laddr={} doesn't exist", c.trans, offset); + return crimson::ct_error::enoent::make(); + } else { + TRACET("{} got {}, {}", + c.trans, offset, iter.get_key(), iter.get_val()); + auto e = iter.get_pin(c); + return _get_mapping_ret( + interruptible::ready_future_marker{}, + std::move(e)); + } + }).si_then([this, c](auto pin) -> _get_mapping_ret { + if (pin->get_raw_val().is_laddr()) { + return seastar::do_with( + std::move(pin), + [this, c](auto &pin) { + return _get_mapping( + c.trans, pin->get_raw_val().get_laddr() + ).si_then([&pin](auto new_pin) { + ceph_assert(pin->get_length() == new_pin->get_length()); + new_pin->set_key_for_indirect( + pin->get_key(), + pin->get_length()); + return new_pin; + }); + }); + } else { + return get_mapping_iertr::make_ready_future<BtreeLBAMappingRef>(std::move(pin)); + } + }); + }); +} + +BtreeLBAManager::alloc_extent_ret +BtreeLBAManager::_alloc_extent( + Transaction &t, + laddr_t hint, + extent_len_t len, + pladdr_t addr, + paddr_t actual_addr, + laddr_t intermediate_base, + LogicalCachedExtent* nextent) +{ + struct state_t { + laddr_t last_end; + + std::optional<typename LBABtree::iterator> insert_iter; + std::optional<typename LBABtree::iterator> ret; + + state_t(laddr_t hint) : last_end(hint) {} + }; + + LOG_PREFIX(BtreeLBAManager::_alloc_extent); + TRACET("{}~{}, hint={}", t, addr, len, hint); + auto c = get_context(t); + ++stats.num_alloc_extents; + auto lookup_attempts = stats.num_alloc_extents_iter_nexts; + return crimson::os::seastore::with_btree_state<LBABtree, state_t>( + cache, + c, + hint, + [this, FNAME, c, hint, len, addr, lookup_attempts, + &t, nextent](auto &btree, auto &state) { + return LBABtree::iterate_repeat( + c, + btree.upper_bound_right(c, hint), + [this, &state, len, addr, &t, hint, FNAME, lookup_attempts](auto &pos) { + ++stats.num_alloc_extents_iter_nexts; + if (pos.is_end()) { + DEBUGT("{}~{}, hint={}, state: end, done with {} 
attempts, insert at {}", + t, addr, len, hint, + stats.num_alloc_extents_iter_nexts - lookup_attempts, + state.last_end); + state.insert_iter = pos; + return typename LBABtree::iterate_repeat_ret_inner( + interruptible::ready_future_marker{}, + seastar::stop_iteration::yes); + } else if (pos.get_key() >= (state.last_end + len)) { + DEBUGT("{}~{}, hint={}, state: {}~{}, done with {} attempts, insert at {} -- {}", + t, addr, len, hint, + pos.get_key(), pos.get_val().len, + stats.num_alloc_extents_iter_nexts - lookup_attempts, + state.last_end, + pos.get_val()); + state.insert_iter = pos; + return typename LBABtree::iterate_repeat_ret_inner( + interruptible::ready_future_marker{}, + seastar::stop_iteration::yes); + } else { + state.last_end = pos.get_key() + pos.get_val().len; + TRACET("{}~{}, hint={}, state: {}~{}, repeat ... -- {}", + t, addr, len, hint, + pos.get_key(), pos.get_val().len, + pos.get_val()); + return typename LBABtree::iterate_repeat_ret_inner( + interruptible::ready_future_marker{}, + seastar::stop_iteration::no); + } + }).si_then([FNAME, c, addr, len, hint, &btree, &state, nextent] { + return btree.insert( + c, + *state.insert_iter, + state.last_end, + lba_map_val_t{len, pladdr_t(addr), 1, 0}, + nextent + ).si_then([&state, FNAME, c, addr, len, hint, nextent](auto &&p) { + auto [iter, inserted] = std::move(p); + TRACET("{}~{}, hint={}, inserted at {}", + c.trans, addr, len, hint, state.last_end); + if (nextent) { + ceph_assert(addr.is_paddr()); + nextent->set_laddr(iter.get_key()); + } + ceph_assert(inserted); + state.ret = iter; + }); + }); + }).si_then([c, actual_addr, addr, intermediate_base](auto &&state) { + auto ret_pin = state.ret->get_pin(c); + if (actual_addr != P_ADDR_NULL) { + ceph_assert(addr.is_laddr()); + ret_pin->set_paddr(actual_addr); + ret_pin->set_intermediate_base(intermediate_base); + } else { + ceph_assert(addr.is_paddr()); + } + return alloc_extent_iertr::make_ready_future<LBAMappingRef>( + std::move(ret_pin)); + }); +} + +static bool is_lba_node(const CachedExtent &e) +{ + return is_lba_node(e.get_type()); +} + +BtreeLBAManager::base_iertr::template future<> +_init_cached_extent( + op_context_t<laddr_t> c, + const CachedExtentRef &e, + LBABtree &btree, + bool &ret) +{ + if (e->is_logical()) { + auto logn = e->cast<LogicalCachedExtent>(); + return btree.lower_bound( + c, + logn->get_laddr() + ).si_then([e, c, logn, &ret](auto iter) { + LOG_PREFIX(BtreeLBAManager::init_cached_extent); + if (!iter.is_end() && + iter.get_key() == logn->get_laddr() && + iter.get_val().pladdr.is_paddr() && + iter.get_val().pladdr.get_paddr() == logn->get_paddr()) { + assert(!iter.get_leaf_node()->is_pending()); + iter.get_leaf_node()->link_child(logn.get(), iter.get_leaf_pos()); + logn->set_laddr(iter.get_pin(c)->get_key()); + ceph_assert(iter.get_val().len == e->get_length()); + DEBUGT("logical extent {} live", c.trans, *logn); + ret = true; + } else { + DEBUGT("logical extent {} not live", c.trans, *logn); + ret = false; + } + }); + } else { + return btree.init_cached_extent(c, e + ).si_then([&ret](bool is_alive) { + ret = is_alive; + }); + } +} + +BtreeLBAManager::init_cached_extent_ret +BtreeLBAManager::init_cached_extent( + Transaction &t, + CachedExtentRef e) +{ + LOG_PREFIX(BtreeLBAManager::init_cached_extent); + TRACET("{}", t, *e); + return seastar::do_with(bool(), [this, e, &t](bool &ret) { + auto c = get_context(t); + return with_btree<LBABtree>( + cache, c, + [c, e, &ret](auto &btree) -> base_iertr::future<> { + LOG_PREFIX(BtreeLBAManager::init_cached_extent); 
+ DEBUGT("extent {}", c.trans, *e); + return _init_cached_extent(c, e, btree, ret); + } + ).si_then([&ret] { return ret; }); + }); +} + +BtreeLBAManager::check_child_trackers_ret +BtreeLBAManager::check_child_trackers( + Transaction &t) { + auto c = get_context(t); + return with_btree<LBABtree>( + cache, c, + [c](auto &btree) { + return btree.check_child_trackers(c); + }); +} + +BtreeLBAManager::scan_mappings_ret +BtreeLBAManager::scan_mappings( + Transaction &t, + laddr_t begin, + laddr_t end, + scan_mappings_func_t &&f) +{ + LOG_PREFIX(BtreeLBAManager::scan_mappings); + DEBUGT("begin: {}, end: {}", t, begin, end); + + auto c = get_context(t); + return with_btree<LBABtree>( + cache, + c, + [c, f=std::move(f), begin, end](auto &btree) mutable { + return LBABtree::iterate_repeat( + c, + btree.upper_bound_right(c, begin), + [f=std::move(f), begin, end](auto &pos) { + if (pos.is_end() || pos.get_key() >= end) { + return typename LBABtree::iterate_repeat_ret_inner( + interruptible::ready_future_marker{}, + seastar::stop_iteration::yes); + } + ceph_assert((pos.get_key() + pos.get_val().len) > begin); + f(pos.get_key(), pos.get_val().pladdr.get_paddr(), pos.get_val().len); + return LBABtree::iterate_repeat_ret_inner( + interruptible::ready_future_marker{}, + seastar::stop_iteration::no); + }); + }); +} + +BtreeLBAManager::rewrite_extent_ret +BtreeLBAManager::rewrite_extent( + Transaction &t, + CachedExtentRef extent) +{ + LOG_PREFIX(BtreeLBAManager::rewrite_extent); + if (extent->has_been_invalidated()) { + ERRORT("extent has been invalidated -- {}", t, *extent); + ceph_abort(); + } + assert(!extent->is_logical()); + + if (is_lba_node(*extent)) { + DEBUGT("rewriting lba extent -- {}", t, *extent); + auto c = get_context(t); + return with_btree<LBABtree>( + cache, + c, + [c, extent](auto &btree) mutable { + return btree.rewrite_extent(c, extent); + }); + } else { + DEBUGT("skip non lba extent -- {}", t, *extent); + return rewrite_extent_iertr::now(); + } +} + +BtreeLBAManager::update_mapping_ret +BtreeLBAManager::update_mapping( + Transaction& t, + laddr_t laddr, + paddr_t prev_addr, + paddr_t addr, + LogicalCachedExtent *nextent) +{ + LOG_PREFIX(BtreeLBAManager::update_mapping); + TRACET("laddr={}, paddr {} => {}", t, laddr, prev_addr, addr); + return _update_mapping( + t, + laddr, + [prev_addr, addr]( + const lba_map_val_t &in) { + assert(!addr.is_null()); + lba_map_val_t ret = in; + ceph_assert(in.pladdr.is_paddr()); + ceph_assert(in.pladdr.get_paddr() == prev_addr); + ret.pladdr = addr; + return ret; + }, + nextent + ).si_then([&t, laddr, prev_addr, addr, FNAME](auto result) { + DEBUGT("laddr={}, paddr {} => {} done -- {}", + t, laddr, prev_addr, addr, result); + }, + update_mapping_iertr::pass_further{}, + /* ENOENT in particular should be impossible */ + crimson::ct_error::assert_all{ + "Invalid error in BtreeLBAManager::update_mapping" + } + ); +} + +BtreeLBAManager::get_physical_extent_if_live_ret +BtreeLBAManager::get_physical_extent_if_live( + Transaction &t, + extent_types_t type, + paddr_t addr, + laddr_t laddr, + extent_len_t len) +{ + LOG_PREFIX(BtreeLBAManager::get_physical_extent_if_live); + DEBUGT("{}, laddr={}, paddr={}, length={}", + t, type, laddr, addr, len); + ceph_assert(is_lba_node(type)); + auto c = get_context(t); + return with_btree_ret<LBABtree, CachedExtentRef>( + cache, + c, + [c, type, addr, laddr, len](auto &btree) { + if (type == extent_types_t::LADDR_INTERNAL) { + return btree.get_internal_if_live(c, addr, laddr, len); + } else { + assert(type == 
extent_types_t::LADDR_LEAF || + type == extent_types_t::DINK_LADDR_LEAF); + return btree.get_leaf_if_live(c, addr, laddr, len); + } + }); +} + +void BtreeLBAManager::register_metrics() +{ + LOG_PREFIX(BtreeLBAManager::register_metrics); + DEBUG("start"); + stats = {}; + namespace sm = seastar::metrics; + metrics.add_group( + "LBA", + { + sm::make_counter( + "alloc_extents", + stats.num_alloc_extents, + sm::description("total number of lba alloc_extent operations") + ), + sm::make_counter( + "alloc_extents_iter_nexts", + stats.num_alloc_extents_iter_nexts, + sm::description("total number of iterator next operations during extent allocation") + ), + } + ); +} + +BtreeLBAManager::ref_iertr::future<std::optional<std::pair<paddr_t, extent_len_t>>> +BtreeLBAManager::_decref_intermediate( + Transaction &t, + laddr_t addr, + extent_len_t len) +{ + auto c = get_context(t); + return with_btree<LBABtree>( + cache, + c, + [c, addr, len](auto &btree) mutable { + return btree.upper_bound_right( + c, addr + ).si_then([&btree, addr, len, c](auto iter) { + return seastar::do_with( + std::move(iter), + [&btree, addr, len, c](auto &iter) { + ceph_assert(!iter.is_end()); + ceph_assert(iter.get_key() <= addr); + auto val = iter.get_val(); + ceph_assert(iter.get_key() + val.len >= addr + len); + ceph_assert(val.pladdr.is_paddr()); + ceph_assert(val.refcount >= 1); + val.refcount -= 1; + + LOG_PREFIX(BtreeLBAManager::_decref_intermediate); + TRACET("decreased refcount of intermediate key {} -- {}", + c.trans, + iter.get_key(), + val); + + if (!val.refcount) { + return btree.remove(c, iter + ).si_then([val] { + return std::make_optional< + std::pair<paddr_t, extent_len_t>>( + val.pladdr.get_paddr(), val.len); + }); + } else { + return btree.update(c, iter, val, nullptr + ).si_then([](auto) { + return seastar::make_ready_future< + std::optional<std::pair<paddr_t, extent_len_t>>>(std::nullopt); + }); + } + }); + }); + }); +} + +BtreeLBAManager::update_refcount_ret +BtreeLBAManager::update_refcount( + Transaction &t, + laddr_t addr, + int delta, + bool cascade_remove) +{ + LOG_PREFIX(BtreeLBAManager::update_refcount); + TRACET("laddr={}, delta={}", t, addr, delta); + return _update_mapping( + t, + addr, + [delta](const lba_map_val_t &in) { + lba_map_val_t out = in; + ceph_assert((int)out.refcount + delta >= 0); + out.refcount += delta; + return out; + }, + nullptr + ).si_then([&t, addr, delta, FNAME, this, cascade_remove](auto result) { + DEBUGT("laddr={}, delta={} done -- {}", t, addr, delta, result); + auto fut = ref_iertr::make_ready_future< + std::optional<std::pair<paddr_t, extent_len_t>>>(); + if (!result.refcount && result.pladdr.is_laddr() && cascade_remove) { + fut = _decref_intermediate( + t, + result.pladdr.get_laddr(), + result.len + ); + } + return fut.si_then([result](auto removed) { + if (result.pladdr.is_laddr() + && removed) { + return ref_update_result_t{ + result.refcount, + removed->first, + removed->second}; + } else { + return ref_update_result_t{ + result.refcount, + result.pladdr, + result.len + }; + } + }); + }); +} + +BtreeLBAManager::_update_mapping_ret +BtreeLBAManager::_update_mapping( + Transaction &t, + laddr_t addr, + update_func_t &&f, + LogicalCachedExtent* nextent) +{ + auto c = get_context(t); + return with_btree_ret<LBABtree, lba_map_val_t>( + cache, + c, + [f=std::move(f), c, addr, nextent](auto &btree) mutable { + return btree.lower_bound( + c, addr + ).si_then([&btree, f=std::move(f), c, addr, nextent](auto iter) + -> _update_mapping_ret { + if (iter.is_end() || iter.get_key() 
!= addr) { + LOG_PREFIX(BtreeLBAManager::_update_mapping); + ERRORT("laddr={} doesn't exist", c.trans, addr); + return crimson::ct_error::enoent::make(); + } + + auto ret = f(iter.get_val()); + if (ret.refcount == 0) { + return btree.remove( + c, + iter + ).si_then([ret] { + return ret; + }); + } else { + return btree.update( + c, + iter, + ret, + nextent + ).si_then([ret](auto) { + return ret; + }); + } + }); + }); +} + +} diff --git a/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h new file mode 100644 index 000000000..892600ed0 --- /dev/null +++ b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h @@ -0,0 +1,396 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <iostream> + +#include <boost/intrusive_ptr.hpp> +#include <boost/smart_ptr/intrusive_ref_counter.hpp> +#include <seastar/core/future.hh> + +#include "include/ceph_assert.h" +#include "include/buffer_fwd.h" +#include "include/interval_set.h" +#include "common/interval_map.h" +#include "crimson/osd/exceptions.h" + +#include "crimson/os/seastore/btree/fixed_kv_btree.h" +#include "crimson/os/seastore/seastore_types.h" +#include "crimson/os/seastore/lba_manager.h" +#include "crimson/os/seastore/cache.h" + +#include "crimson/os/seastore/lba_manager/btree/lba_btree_node.h" +#include "crimson/os/seastore/btree/btree_range_pin.h" + +namespace crimson::os::seastore::lba_manager::btree { + +class BtreeLBAMapping : public BtreeNodeMapping<laddr_t, paddr_t> { +// To support cloning, there are two kinds of lba mappings: +// 1. physical lba mapping: the pladdr in the value of which is the paddr of +// the corresponding extent; +// 2. indirect lba mapping: the pladdr in the value of which is an laddr pointing +// to the physical lba mapping that's pointing to the actual paddr of the +// extent being searched; +// +// Accordingly, BtreeLBAMapping may also work under two modes: indirect or direct +// 1. BtreeLBAMappings that come from querying an indirect lba mapping in the lba tree +// are indirect; +// 2. BtreeLBAMappings that come from querying a physical lba mapping in the lba tree +// are direct. +// +// For direct BtreeLBAMappings, there are two important fields: +// 1. key: the laddr of the lba mapping being queried; +// 2. paddr: the paddr recorded in the value of the lba mapping being queried. +// For indirect BtreeLBAMappings, BtreeLBAMapping has five important fields: +// 1. key: the laddr key of the lba entry being queried; +// 2. intermediate_key: the laddr within the scope of the physical lba mapping +// that the current indirect lba mapping points to; although an indirect mapping +// initially points to the start of the physical lba mapping, it may point to +// another laddr after a remap +// 3. intermediate_base: the laddr key of the physical lba mapping; intermediate_key +// and intermediate_base are the same when doing cloning +// 4. intermediate_offset: intermediate_key - intermediate_base +// 5. paddr: the paddr recorded in the physical lba mapping pointed to by the +// indirect lba mapping being queried; +// +// NOTE THAT, for direct BtreeLBAMappings, their intermediate_keys are the same as +// their keys. 
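// A worked example of the indirect fields (all addresses below are
// hypothetical): suppose a physical lba mapping covers laddr 0x1000 with
// length 0x2000, and an extent is cloned to laddr 0x5000. The clone is
// recorded as an indirect mapping whose value holds laddr 0x1000; querying it
// yields a BtreeLBAMapping with key=0x5000, intermediate_base=0x1000,
// intermediate_key=0x1000 and intermediate_offset=0. If a later remap makes
// the clone refer to 0x1800 inside the same physical mapping, intermediate_key
// becomes 0x1800 and intermediate_offset becomes 0x800, while
// intermediate_base remains 0x1000 and paddr still comes from the physical
// mapping's value.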
+public: + BtreeLBAMapping(op_context_t<laddr_t> ctx) + : BtreeNodeMapping(ctx) {} + BtreeLBAMapping( + op_context_t<laddr_t> c, + CachedExtentRef parent, + uint16_t pos, + lba_map_val_t &val, + lba_node_meta_t meta) + : BtreeNodeMapping( + c, + parent, + pos, + val.pladdr.is_paddr() ? val.pladdr.get_paddr() : P_ADDR_NULL, + val.len, + meta), + key(meta.begin), + indirect(val.pladdr.is_laddr() ? true : false), + intermediate_key(indirect ? val.pladdr.get_laddr() : L_ADDR_NULL), + intermediate_length(indirect ? val.len : 0), + raw_val(val.pladdr), + map_val(val) + {} + + lba_map_val_t get_map_val() const { + return map_val; + } + + bool is_indirect() const final { + return indirect; + } + + void set_key_for_indirect( + laddr_t new_key, + extent_len_t length, + laddr_t interkey = L_ADDR_NULL) + { + turn_indirect(interkey); + key = new_key; + intermediate_length = len; + len = length; + } + + laddr_t get_key() const final { + return key; + } + + pladdr_t get_raw_val() const { + return raw_val; + } + + void set_paddr(paddr_t addr) { + value = addr; + } + + laddr_t get_intermediate_key() const final { + assert(is_indirect()); + assert(intermediate_key != L_ADDR_NULL); + return intermediate_key; + } + + laddr_t get_intermediate_base() const final { + assert(is_indirect()); + assert(intermediate_base != L_ADDR_NULL); + return intermediate_base; + } + + extent_len_t get_intermediate_offset() const final { + assert(intermediate_key >= intermediate_base); + assert((intermediate_key == L_ADDR_NULL) + == (intermediate_base == L_ADDR_NULL)); + return intermediate_key - intermediate_base; + } + + extent_len_t get_intermediate_length() const final { + assert(is_indirect()); + assert(intermediate_length); + return intermediate_length; + } + + void set_intermediate_base(laddr_t base) { + intermediate_base = base; + } + +protected: + std::unique_ptr<BtreeNodeMapping<laddr_t, paddr_t>> _duplicate( + op_context_t<laddr_t> ctx) const final { + auto pin = std::unique_ptr<BtreeLBAMapping>(new BtreeLBAMapping(ctx)); + pin->key = key; + pin->intermediate_base = intermediate_base; + pin->intermediate_key = intermediate_key; + pin->indirect = indirect; + pin->raw_val = raw_val; + pin->map_val = map_val; + return pin; + } +private: + void turn_indirect(laddr_t interkey) { + assert(value.is_paddr()); + intermediate_base = key; + intermediate_key = (interkey == L_ADDR_NULL ? 
key : interkey); + indirect = true; + } + laddr_t key = L_ADDR_NULL; + bool indirect = false; + laddr_t intermediate_key = L_ADDR_NULL; + laddr_t intermediate_base = L_ADDR_NULL; + extent_len_t intermediate_length = 0; + pladdr_t raw_val; + lba_map_val_t map_val; +}; + +using BtreeLBAMappingRef = std::unique_ptr<BtreeLBAMapping>; + +using LBABtree = FixedKVBtree< + laddr_t, lba_map_val_t, LBAInternalNode, + LBALeafNode, BtreeLBAMapping, LBA_BLOCK_SIZE, true>; + +/** + * BtreeLBAManager + * + * Uses a wandering btree to track two things: + * 1) lba state including laddr_t -> paddr_t mapping + * 2) reverse paddr_t -> laddr_t mapping for gc (TODO) + * + * Generally, any transaction will involve + * 1) deltas against lba tree nodes + * 2) new lba tree nodes + * - Note, there must necessarily be a delta linking + * these new nodes into the tree -- might be a + * bootstrap_state_t delta if new root + * + * get_mappings, alloc_extent_*, etc populate a Transaction + * which then gets submitted + */ +class BtreeLBAManager : public LBAManager { +public: + BtreeLBAManager(Cache &cache) + : cache(cache) + { + register_metrics(); + } + + mkfs_ret mkfs( + Transaction &t) final; + + get_mappings_ret get_mappings( + Transaction &t, + laddr_t offset, extent_len_t length) final; + + get_mappings_ret get_mappings( + Transaction &t, + laddr_list_t &&list) final; + + get_mapping_ret get_mapping( + Transaction &t, + laddr_t offset) final; + + alloc_extent_ret reserve_region( + Transaction &t, + laddr_t hint, + extent_len_t len) + { + return _alloc_extent( + t, + hint, + len, + P_ADDR_ZERO, + P_ADDR_NULL, + L_ADDR_NULL, + nullptr); + } + + alloc_extent_ret clone_extent( + Transaction &t, + laddr_t hint, + extent_len_t len, + laddr_t intermediate_key, + paddr_t actual_addr, + laddr_t intermediate_base) + { + return _alloc_extent( + t, + hint, + len, + intermediate_key, + actual_addr, + intermediate_base, + nullptr); + } + + alloc_extent_ret alloc_extent( + Transaction &t, + laddr_t hint, + extent_len_t len, + paddr_t addr, + LogicalCachedExtent &ext) final + { + return _alloc_extent( + t, + hint, + len, + addr, + P_ADDR_NULL, + L_ADDR_NULL, + &ext); + } + + ref_ret decref_extent( + Transaction &t, + laddr_t addr, + bool cascade_remove) final { + return update_refcount(t, addr, -1, cascade_remove); + } + + ref_ret incref_extent( + Transaction &t, + laddr_t addr) final { + return update_refcount(t, addr, 1, false); + } + + ref_ret incref_extent( + Transaction &t, + laddr_t addr, + int delta) final { + ceph_assert(delta > 0); + return update_refcount(t, addr, delta, false); + } + + /** + * init_cached_extent + * + * Checks whether e is live (reachable from lba tree) and drops or initializes + * accordingly. + * + * Returns if e is live. 
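 *
 * A minimal caller sketch (hypothetical names, error handling elided):
 *
 *   lba_manager.init_cached_extent(t, extent
 *   ).si_then([extent](bool live) {
 *     if (!live) {
 *       // extent is unreachable from the lba tree and can be dropped
 *     }
 *   });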
+ */ + init_cached_extent_ret init_cached_extent( + Transaction &t, + CachedExtentRef e) final; + + check_child_trackers_ret check_child_trackers(Transaction &t) final; + + scan_mappings_ret scan_mappings( + Transaction &t, + laddr_t begin, + laddr_t end, + scan_mappings_func_t &&f) final; + + rewrite_extent_ret rewrite_extent( + Transaction &t, + CachedExtentRef extent) final; + + update_mapping_ret update_mapping( + Transaction& t, + laddr_t laddr, + paddr_t prev_addr, + paddr_t paddr, + LogicalCachedExtent*) final; + + get_physical_extent_if_live_ret get_physical_extent_if_live( + Transaction &t, + extent_types_t type, + paddr_t addr, + laddr_t laddr, + extent_len_t len) final; +private: + Cache &cache; + + + struct { + uint64_t num_alloc_extents = 0; + uint64_t num_alloc_extents_iter_nexts = 0; + } stats; + + op_context_t<laddr_t> get_context(Transaction &t) { + return op_context_t<laddr_t>{cache, t}; + } + + seastar::metrics::metric_group metrics; + void register_metrics(); + + /** + * update_refcount + * + * Updates refcount, returns resulting refcount + */ + using update_refcount_ret = ref_ret; + update_refcount_ret update_refcount( + Transaction &t, + laddr_t addr, + int delta, + bool cascade_remove); + + /** + * _update_mapping + * + * Updates mapping, removes if f returns nullopt + */ + using _update_mapping_iertr = ref_iertr; + using _update_mapping_ret = ref_iertr::future<lba_map_val_t>; + using update_func_t = std::function< + lba_map_val_t(const lba_map_val_t &v) + >; + _update_mapping_ret _update_mapping( + Transaction &t, + laddr_t addr, + update_func_t &&f, + LogicalCachedExtent*); + + alloc_extent_ret _alloc_extent( + Transaction &t, + laddr_t hint, + extent_len_t len, + pladdr_t addr, + paddr_t actual_addr, + laddr_t intermediate_base, + LogicalCachedExtent*); + + using _get_mapping_ret = get_mapping_iertr::future<BtreeLBAMappingRef>; + _get_mapping_ret _get_mapping( + Transaction &t, + laddr_t offset); + + using _get_original_mappings_ret = get_mappings_ret; + _get_original_mappings_ret _get_original_mappings( + op_context_t<laddr_t> c, + std::list<BtreeLBAMappingRef> &pin_list); + + ref_iertr::future<std::optional<std::pair<paddr_t, extent_len_t>>> + _decref_intermediate( + Transaction &t, + laddr_t addr, + extent_len_t len); +}; +using BtreeLBAManagerRef = std::unique_ptr<BtreeLBAManager>; + +} diff --git a/src/crimson/os/seastore/lba_manager/btree/lba_btree_node.cc b/src/crimson/os/seastore/lba_manager/btree/lba_btree_node.cc new file mode 100644 index 000000000..66dc94394 --- /dev/null +++ b/src/crimson/os/seastore/lba_manager/btree/lba_btree_node.cc @@ -0,0 +1,55 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <sys/mman.h> +#include <string.h> + +#include <memory> +#include <string.h> + +#include "include/buffer.h" +#include "include/byteorder.h" + +#include "crimson/os/seastore/lba_manager/btree/lba_btree_node.h" +#include "crimson/os/seastore/logging.h" + +SET_SUBSYS(seastore_lba); + +namespace crimson::os::seastore::lba_manager::btree { + +std::ostream& operator<<(std::ostream& out, const lba_map_val_t& v) +{ + return out << "lba_map_val_t(" + << v.pladdr + << "~" << v.len + << ", refcount=" << v.refcount + << ", checksum=" << v.checksum + << ")"; +} + +std::ostream &LBALeafNode::_print_detail(std::ostream &out) const +{ + out << ", size=" << this->get_size() + << ", meta=" << this->get_meta() + << ", my_tracker=" << (void*)this->my_tracker; + if (this->my_tracker) { + out << ", 
my_tracker->parent=" << (void*)this->my_tracker->get_parent().get(); + } + return out << ", root_block=" << (void*)this->root_block.get(); +} + +void LBALeafNode::resolve_relative_addrs(paddr_t base) +{ + LOG_PREFIX(LBALeafNode::resolve_relative_addrs); + for (auto i: *this) { + auto val = i->get_val(); + if (val.pladdr.is_paddr() && + val.pladdr.get_paddr().is_relative()) { + val.pladdr = base.add_relative(val.pladdr.get_paddr()); + TRACE("{} -> {}", i->get_val().pladdr, val.pladdr); + i->set_val(val); + } + } +} + +} diff --git a/src/crimson/os/seastore/lba_manager/btree/lba_btree_node.h b/src/crimson/os/seastore/lba_manager/btree/lba_btree_node.h new file mode 100644 index 000000000..ffce2c1b5 --- /dev/null +++ b/src/crimson/os/seastore/lba_manager/btree/lba_btree_node.h @@ -0,0 +1,294 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <sys/mman.h> +#include <memory> +#include <string.h> + + +#include "include/buffer.h" + +#include "crimson/common/fixed_kv_node_layout.h" +#include "crimson/common/errorator.h" +#include "crimson/os/seastore/lba_manager.h" +#include "crimson/os/seastore/seastore_types.h" +#include "crimson/os/seastore/cache.h" +#include "crimson/os/seastore/cached_extent.h" + +#include "crimson/os/seastore/btree/btree_range_pin.h" +#include "crimson/os/seastore/btree/fixed_kv_btree.h" +#include "crimson/os/seastore/btree/fixed_kv_node.h" + +namespace crimson::os::seastore::lba_manager::btree { + +using base_iertr = LBAManager::base_iertr; +using LBANode = FixedKVNode<laddr_t>; + +/** + * lba_map_val_t + * + * struct representing a single lba mapping + */ +struct lba_map_val_t { + extent_len_t len = 0; ///< length of mapping + pladdr_t pladdr; ///< physical addr of mapping or + // laddr of a physical lba mapping(see btree_lba_manager.h) + uint32_t refcount = 0; ///< refcount + uint32_t checksum = 0; ///< checksum of original block written at paddr (TODO) + + lba_map_val_t() = default; + lba_map_val_t( + extent_len_t len, + pladdr_t pladdr, + uint32_t refcount, + uint32_t checksum) + : len(len), pladdr(pladdr), refcount(refcount), checksum(checksum) {} + bool operator==(const lba_map_val_t&) const = default; +}; + +std::ostream& operator<<(std::ostream& out, const lba_map_val_t&); + +constexpr size_t LBA_BLOCK_SIZE = 4096; + +using lba_node_meta_t = fixed_kv_node_meta_t<laddr_t>; + +using lba_node_meta_le_t = fixed_kv_node_meta_le_t<laddr_le_t>; + +/** + * LBAInternalNode + * + * Abstracts operations on and layout of internal nodes for the + * LBA Tree. + * + * Layout (4k): + * size : uint32_t[1] 4b + * (padding) : 4b + * meta : lba_node_meta_le_t[3] (1*24)b + * keys : laddr_t[255] (254*8)b + * values : paddr_t[255] (254*8)b + * = 4096 + + * TODO: make the above capacity calculation part of FixedKVNodeLayout + * TODO: the above alignment probably isn't portable without further work + */ +constexpr size_t INTERNAL_NODE_CAPACITY = 254; +struct LBAInternalNode + : FixedKVInternalNode< + INTERNAL_NODE_CAPACITY, + laddr_t, laddr_le_t, + LBA_BLOCK_SIZE, + LBAInternalNode> { + using Ref = TCachedExtentRef<LBAInternalNode>; + using internal_iterator_t = const_iterator; + template <typename... T> + LBAInternalNode(T&&... t) : + FixedKVInternalNode(std::forward<T>(t)...) 
{} + + static constexpr extent_types_t TYPE = extent_types_t::LADDR_INTERNAL; + + extent_types_t get_type() const final { + return TYPE; + } +}; +using LBAInternalNodeRef = LBAInternalNode::Ref; + +/** + * LBALeafNode + * + * Abstracts operations on and layout of leaf nodes for the + * LBA Tree. + * + * Layout (4k): + * size : uint32_t[1] 4b + * (padding) : 4b + * meta : lba_node_meta_le_t[3] (1*24)b + * keys : laddr_t[170] (140*8)b + * values : lba_map_val_t[170] (140*21)b + * = 4092 + * + * TODO: update FixedKVNodeLayout to handle the above calculation + * TODO: the above alignment probably isn't portable without further work + */ +constexpr size_t LEAF_NODE_CAPACITY = 140; + +/** + * lba_map_val_le_t + * + * On disk layout for lba_map_val_t. + */ +struct lba_map_val_le_t { + extent_len_le_t len = init_extent_len_le(0); + pladdr_le_t pladdr; + ceph_le32 refcount{0}; + ceph_le32 checksum{0}; + + lba_map_val_le_t() = default; + lba_map_val_le_t(const lba_map_val_le_t &) = default; + explicit lba_map_val_le_t(const lba_map_val_t &val) + : len(init_extent_len_le(val.len)), + pladdr(pladdr_le_t(val.pladdr)), + refcount(val.refcount), + checksum(val.checksum) {} + + operator lba_map_val_t() const { + return lba_map_val_t{ len, pladdr, refcount, checksum }; + } +}; + +struct LBALeafNode + : FixedKVLeafNode< + LEAF_NODE_CAPACITY, + laddr_t, laddr_le_t, + lba_map_val_t, lba_map_val_le_t, + LBA_BLOCK_SIZE, + LBALeafNode, + true> { + using Ref = TCachedExtentRef<LBALeafNode>; + using parent_type_t = FixedKVLeafNode< + LEAF_NODE_CAPACITY, + laddr_t, laddr_le_t, + lba_map_val_t, lba_map_val_le_t, + LBA_BLOCK_SIZE, + LBALeafNode, + true>; + using internal_const_iterator_t = + typename parent_type_t::node_layout_t::const_iterator; + using internal_iterator_t = + typename parent_type_t::node_layout_t::iterator; + template <typename... T> + LBALeafNode(T&&... t) : + parent_type_t(std::forward<T>(t)...) 
{} + + static constexpr extent_types_t TYPE = extent_types_t::LADDR_LEAF; + + bool validate_stable_children() final { + LOG_PREFIX(LBALeafNode::validate_stable_children); + if (this->children.empty()) { + return false; + } + + for (auto i : *this) { + auto child = (LogicalCachedExtent*)this->children[i.get_offset()]; + if (is_valid_child_ptr(child) && child->get_laddr() != i.get_key()) { + SUBERROR(seastore_fixedkv_tree, + "stable child not valid: child {}, key {}", + *child, + i.get_key()); + ceph_abort(); + return false; + } + } + return true; + } + + void update( + internal_const_iterator_t iter, + lba_map_val_t val, + LogicalCachedExtent* nextent) final { + LOG_PREFIX(LBALeafNode::update); + if (nextent) { + SUBTRACE(seastore_fixedkv_tree, "trans.{}, pos {}, {}", + this->pending_for_transaction, + iter.get_offset(), + *nextent); + // child-ptr may already be correct, see LBAManager::update_mappings() + this->update_child_ptr(iter, nextent); + } + if (val.pladdr.is_paddr()) { + val.pladdr = maybe_generate_relative(val.pladdr.get_paddr()); + } + return this->journal_update( + iter, + val, + this->maybe_get_delta_buffer()); + } + + internal_const_iterator_t insert( + internal_const_iterator_t iter, + laddr_t addr, + lba_map_val_t val, + LogicalCachedExtent* nextent) final { + LOG_PREFIX(LBALeafNode::insert); + SUBTRACE(seastore_fixedkv_tree, "trans.{}, pos {}, key {}, extent {}", + this->pending_for_transaction, + iter.get_offset(), + addr, + (void*)nextent); + this->insert_child_ptr(iter, nextent); + if (val.pladdr.is_paddr()) { + val.pladdr = maybe_generate_relative(val.pladdr.get_paddr()); + } + this->journal_insert( + iter, + addr, + val, + this->maybe_get_delta_buffer()); + return iter; + } + + void remove(internal_const_iterator_t iter) final { + LOG_PREFIX(LBALeafNode::remove); + SUBTRACE(seastore_fixedkv_tree, "trans.{}, pos {}, key {}", + this->pending_for_transaction, + iter.get_offset(), + iter.get_key()); + assert(iter != this->end()); + this->remove_child_ptr(iter); + return this->journal_remove( + iter, + this->maybe_get_delta_buffer()); + } + + // See LBAInternalNode, same concept + void resolve_relative_addrs(paddr_t base); + void node_resolve_vals( + internal_iterator_t from, + internal_iterator_t to) const final + { + if (this->is_initial_pending()) { + for (auto i = from; i != to; ++i) { + auto val = i->get_val(); + if (val.pladdr.is_paddr() + && val.pladdr.get_paddr().is_relative()) { + assert(val.pladdr.get_paddr().is_block_relative()); + val.pladdr = this->get_paddr().add_relative(val.pladdr.get_paddr()); + i->set_val(val); + } + } + } + } + void node_unresolve_vals( + internal_iterator_t from, + internal_iterator_t to) const final + { + if (this->is_initial_pending()) { + for (auto i = from; i != to; ++i) { + auto val = i->get_val(); + if (val.pladdr.is_paddr() + && val.pladdr.get_paddr().is_relative()) { + assert(val.pladdr.get_paddr().is_record_relative()); + val.pladdr = val.pladdr.get_paddr().block_relative_to(this->get_paddr()); + i->set_val(val); + } + } + } + } + + extent_types_t get_type() const final { + return TYPE; + } + + std::ostream &_print_detail(std::ostream &out) const final; +}; +using LBALeafNodeRef = TCachedExtentRef<LBALeafNode>; + +} + +#if FMT_VERSION >= 90000 +template <> struct fmt::formatter<crimson::os::seastore::lba_manager::btree::lba_node_meta_t> : fmt::ostream_formatter {}; +template <> struct fmt::formatter<crimson::os::seastore::lba_manager::btree::lba_map_val_t> : fmt::ostream_formatter {}; +template <> struct 
fmt::formatter<crimson::os::seastore::lba_manager::btree::LBAInternalNode> : fmt::ostream_formatter {}; +template <> struct fmt::formatter<crimson::os::seastore::lba_manager::btree::LBALeafNode> : fmt::ostream_formatter {}; +#endif diff --git a/src/crimson/os/seastore/logging.h b/src/crimson/os/seastore/logging.h new file mode 100644 index 000000000..3f12ee72c --- /dev/null +++ b/src/crimson/os/seastore/logging.h @@ -0,0 +1,30 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <fmt/format.h> + +#include "crimson/common/log.h" + +#define LOGT(level_, MSG, t, ...) \ + LOCAL_LOGGER.log(level_, "{} trans.{} {}: " MSG, (void*)&t, \ + (t).get_trans_id(), FNAME , ##__VA_ARGS__) +#define SUBLOGT(subname_, level_, MSG, t, ...) \ + LOGGER(subname_).log(level_, "{} trans.{} {}: " MSG, (void*)&t, \ + (t).get_trans_id(), FNAME , ##__VA_ARGS__) + +#define TRACET(...) LOGT(seastar::log_level::trace, __VA_ARGS__) +#define SUBTRACET(subname_, ...) SUBLOGT(subname_, seastar::log_level::trace, __VA_ARGS__) + +#define DEBUGT(...) LOGT(seastar::log_level::debug, __VA_ARGS__) +#define SUBDEBUGT(subname_, ...) SUBLOGT(subname_, seastar::log_level::debug, __VA_ARGS__) + +#define INFOT(...) LOGT(seastar::log_level::info, __VA_ARGS__) +#define SUBINFOT(subname_, ...) SUBLOGT(subname_, seastar::log_level::info, __VA_ARGS__) + +#define WARNT(...) LOGT(seastar::log_level::warn, __VA_ARGS__) +#define SUBWARNT(subname_, ...) SUBLOGT(subname_, seastar::log_level::warn, __VA_ARGS__) + +#define ERRORT(...) LOGT(seastar::log_level::error, __VA_ARGS__) +#define SUBERRORT(subname_, ...) SUBLOGT(subname_, seastar::log_level::error, __VA_ARGS__) diff --git a/src/crimson/os/seastore/object_data_handler.cc b/src/crimson/os/seastore/object_data_handler.cc new file mode 100644 index 000000000..0d852696b --- /dev/null +++ b/src/crimson/os/seastore/object_data_handler.cc @@ -0,0 +1,1638 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <utility> +#include <functional> + +#include "crimson/common/log.h" + +#include "crimson/os/seastore/object_data_handler.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_seastore_odata); + } +} + +SET_SUBSYS(seastore_odata); + +namespace crimson::os::seastore { +#define assert_aligned(x) ceph_assert(((x)%ctx.tm.get_block_size()) == 0) + +using context_t = ObjectDataHandler::context_t; +using get_iertr = ObjectDataHandler::write_iertr; + +/** + * extent_to_write_t + * + * Encapsulates smallest write operations in overwrite. + * Indicates a zero/existing extent or a data extent based on whether + * to_write is populate. + * Should be handled by prepare_ops_list. 
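 *
 * For illustration, the three variants are built via the factory helpers
 * declared below (arguments are hypothetical):
 *
 *   extent_to_write_t::create_data(addr, bl);                      // DATA
 *   extent_to_write_t::create_zero(addr, len);                     // ZERO
 *   extent_to_write_t::create_existing(std::move(pin), addr, len); // EXISTING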
+ */ +struct extent_to_write_t { + enum class type_t { + DATA, + ZERO, + EXISTING, + }; + type_t type; + + /// pin of original extent, not nullptr if type == EXISTING + LBAMappingRef pin; + + laddr_t addr; + extent_len_t len; + + /// non-nullopt if and only if type == DATA + std::optional<bufferlist> to_write; + + extent_to_write_t(const extent_to_write_t &) = delete; + extent_to_write_t(extent_to_write_t &&) = default; + + bool is_data() const { + return type == type_t::DATA; + } + + bool is_zero() const { + return type == type_t::ZERO; + } + + bool is_existing() const { + return type == type_t::EXISTING; + } + + laddr_t get_end_addr() const { + return addr + len; + } + + static extent_to_write_t create_data( + laddr_t addr, bufferlist to_write) { + return extent_to_write_t(addr, to_write); + } + + static extent_to_write_t create_zero( + laddr_t addr, extent_len_t len) { + return extent_to_write_t(addr, len); + } + + static extent_to_write_t create_existing( + LBAMappingRef &&pin, laddr_t addr, extent_len_t len) { + assert(pin); + return extent_to_write_t(std::move(pin), addr, len); + } + +private: + extent_to_write_t(laddr_t addr, bufferlist to_write) + : type(type_t::DATA), addr(addr), len(to_write.length()), + to_write(to_write) {} + + extent_to_write_t(laddr_t addr, extent_len_t len) + : type(type_t::ZERO), addr(addr), len(len) {} + + extent_to_write_t(LBAMappingRef &&pin, laddr_t addr, extent_len_t len) + : type(type_t::EXISTING), pin(std::move(pin)), addr(addr), len(len) {} +}; +using extent_to_write_list_t = std::list<extent_to_write_t>; + +// Encapsulates extents to be written out using do_remappings. +struct extent_to_remap_t { + enum class type_t { + REMAP, + OVERWRITE + }; + type_t type; + /// pin of original extent + LBAMappingRef pin; + /// offset of remapped extent or overwrite part of overwrite extent. + /// overwrite part of overwrite extent might correspond to mutiple + /// fresh write extent. 
+ extent_len_t new_offset; + /// length of remapped extent or overwrite part of overwrite extent + extent_len_t new_len; + + extent_to_remap_t(const extent_to_remap_t &) = delete; + extent_to_remap_t(extent_to_remap_t &&) = default; + + bool is_remap() const { + return type == type_t::REMAP; + } + + bool is_overwrite() const { + assert((new_offset != 0) && (pin->get_length() != new_offset + new_len)); + return type == type_t::OVERWRITE; + } + + using remap_entry = TransactionManager::remap_entry; + remap_entry create_remap_entry() { + assert(is_remap()); + return remap_entry( + new_offset, + new_len); + } + + remap_entry create_left_remap_entry() { + assert(is_overwrite()); + return remap_entry( + 0, + new_offset); + } + + remap_entry create_right_remap_entry() { + assert(is_overwrite()); + return remap_entry( + new_offset + new_len, + pin->get_length() - new_offset - new_len); + } + + static extent_to_remap_t create_remap( + LBAMappingRef &&pin, extent_len_t new_offset, extent_len_t new_len) { + return extent_to_remap_t(type_t::REMAP, + std::move(pin), new_offset, new_len); + } + + static extent_to_remap_t create_overwrite( + LBAMappingRef &&pin, extent_len_t new_offset, extent_len_t new_len) { + return extent_to_remap_t(type_t::OVERWRITE, + std::move(pin), new_offset, new_len); + } + +private: + extent_to_remap_t(type_t type, + LBAMappingRef &&pin, extent_len_t new_offset, extent_len_t new_len) + : type(type), + pin(std::move(pin)), new_offset(new_offset), new_len(new_len) {} +}; +using extent_to_remap_list_t = std::list<extent_to_remap_t>; + +// Encapsulates extents to be written out using do_insertions. +struct extent_to_insert_t { + enum class type_t { + DATA, + ZERO + }; + type_t type; + /// laddr of new extent + laddr_t addr; + /// length of new extent + extent_len_t len; + /// non-nullopt if type == DATA + std::optional<bufferlist> bl; + + extent_to_insert_t(const extent_to_insert_t &) = default; + extent_to_insert_t(extent_to_insert_t &&) = default; + + bool is_data() const { + return type == type_t::DATA; + } + + bool is_zero() const { + return type == type_t::ZERO; + } + + static extent_to_insert_t create_data( + laddr_t addr, extent_len_t len, std::optional<bufferlist> bl) { + return extent_to_insert_t(addr, len, bl); + } + + static extent_to_insert_t create_zero( + laddr_t addr, extent_len_t len) { + return extent_to_insert_t(addr, len); + } + +private: + extent_to_insert_t(laddr_t addr, extent_len_t len, + std::optional<bufferlist> bl) + :type(type_t::DATA), addr(addr), len(len), bl(bl) {} + + extent_to_insert_t(laddr_t addr, extent_len_t len) + :type(type_t::ZERO), addr(addr), len(len) {} +}; +using extent_to_insert_list_t = std::list<extent_to_insert_t>; + +// Encapsulates extents to be retired in do_removals. +using extent_to_remove_list_t = std::list<LBAMappingRef>; + +struct overwrite_ops_t { + extent_to_remap_list_t to_remap; + extent_to_insert_list_t to_insert; + extent_to_remove_list_t to_remove; +}; + +// prepare to_remap, to_retire, to_insert list +overwrite_ops_t prepare_ops_list( + lba_pin_list_t &pins_to_remove, + extent_to_write_list_t &to_write) { + assert(pins_to_remove.size() != 0); + overwrite_ops_t ops; + ops.to_remove.swap(pins_to_remove); + if (to_write.empty()) { + logger().debug("empty to_write"); + return ops; + } + long unsigned int visitted = 0; + auto& front = to_write.front(); + auto& back = to_write.back(); + + // prepare overwrite, happens in one original extent. 
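  // Example (hypothetical addresses): a single original pin 0x0~0x4000 whose
  // middle 0x1000~0x2000 is being rewritten arrives here as
  // to_write = [EXISTING 0x0~0x1000, DATA 0x1000~0x2000, EXISTING 0x3000~0x1000].
  // The branch below turns the front pin into one OVERWRITE remap with
  // new_offset=0x1000 and new_len=0x2000; the DATA region is still handled by
  // the to_insert loop further down.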
+ if (ops.to_remove.size() == 1 && + front.is_existing() && back.is_existing()) { + visitted += 2; + assert(to_write.size() > 2); + assert(front.addr == front.pin->get_key()); + assert(back.addr > back.pin->get_key()); + ops.to_remap.push_back(extent_to_remap_t::create_overwrite( + std::move(front.pin), + front.len, + back.addr - front.addr - front.len)); + ops.to_remove.pop_front(); + } else { + // prepare to_remap, happens in one or multiple extents + if (front.is_existing()) { + visitted++; + assert(to_write.size() > 1); + assert(front.addr == front.pin->get_key()); + ops.to_remap.push_back(extent_to_remap_t::create_remap( + std::move(front.pin), + 0, + front.len)); + ops.to_remove.pop_front(); + } + if (back.is_existing()) { + visitted++; + assert(to_write.size() > 1); + assert(back.addr + back.len == + back.pin->get_key() + back.pin->get_length()); + ops.to_remap.push_back(extent_to_remap_t::create_remap( + std::move(back.pin), + back.addr - back.pin->get_key(), + back.len)); + ops.to_remove.pop_back(); + } + } + + // prepare to_insert + for (auto ®ion : to_write) { + if (region.is_data()) { + visitted++; + assert(region.to_write.has_value()); + ops.to_insert.push_back(extent_to_insert_t::create_data( + region.addr, region.len, region.to_write)); + } else if (region.is_zero()) { + visitted++; + assert(!(region.to_write.has_value())); + ops.to_insert.push_back(extent_to_insert_t::create_zero( + region.addr, region.len)); + } + } + + logger().debug( + "to_remap list size: {}" + " to_insert list size: {}" + " to_remove list size: {}", + ops.to_remap.size(), ops.to_insert.size(), ops.to_remove.size()); + assert(visitted == to_write.size()); + return ops; +} + +/** + * append_extent_to_write + * + * Appends passed extent_to_write_t maintaining invariant that the + * list may not contain consecutive zero elements by checking and + * combining them. + */ +void append_extent_to_write( + extent_to_write_list_t &to_write, extent_to_write_t &&to_append) +{ + assert(to_write.empty() || + to_write.back().get_end_addr() == to_append.addr); + if (to_write.empty() || + to_write.back().is_data() || + to_append.is_data() || + to_write.back().type != to_append.type) { + to_write.push_back(std::move(to_append)); + } else { + to_write.back().len += to_append.len; + } +} + +/** + * splice_extent_to_write + * + * splices passed extent_to_write_list_t maintaining invariant that the + * list may not contain consecutive zero elements by checking and + * combining them. 
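 *
 * For example (hypothetical addresses), if to_write currently ends with
 * ZERO 0x1000~0x1000 and the spliced list starts with ZERO 0x2000~0x1000,
 * append_extent_to_write combines them into a single ZERO 0x1000~0x2000
 * entry instead of keeping two adjacent zero elements.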
+ */ +void splice_extent_to_write( + extent_to_write_list_t &to_write, extent_to_write_list_t &&to_splice) +{ + if (!to_splice.empty()) { + append_extent_to_write(to_write, std::move(to_splice.front())); + to_splice.pop_front(); + to_write.splice(to_write.end(), std::move(to_splice)); + } +} + +/// Creates remap extents in to_remap +ObjectDataHandler::write_ret do_remappings( + context_t ctx, + extent_to_remap_list_t &to_remap) +{ + return trans_intr::do_for_each( + to_remap, + [ctx](auto ®ion) { + if (region.is_remap()) { + return ctx.tm.remap_pin<ObjectDataBlock, 1>( + ctx.t, + std::move(region.pin), + std::array{ + region.create_remap_entry() + } + ).si_then([®ion](auto pins) { + ceph_assert(pins.size() == 1); + ceph_assert(region.new_len == pins[0]->get_length()); + return ObjectDataHandler::write_iertr::now(); + }); + } else if (region.is_overwrite()) { + return ctx.tm.remap_pin<ObjectDataBlock, 2>( + ctx.t, + std::move(region.pin), + std::array{ + region.create_left_remap_entry(), + region.create_right_remap_entry() + } + ).si_then([®ion](auto pins) { + ceph_assert(pins.size() == 2); + ceph_assert(region.pin->get_key() == pins[0]->get_key()); + ceph_assert(region.pin->get_key() + pins[0]->get_length() + + region.new_len == pins[1]->get_key()); + return ObjectDataHandler::write_iertr::now(); + }); + } else { + ceph_abort("impossible"); + return ObjectDataHandler::write_iertr::now(); + } + }); +} + +ObjectDataHandler::write_ret do_removals( + context_t ctx, + lba_pin_list_t &to_remove) +{ + return trans_intr::do_for_each( + to_remove, + [ctx](auto &pin) { + LOG_PREFIX(object_data_handler.cc::do_removals); + DEBUGT("decreasing ref: {}", + ctx.t, + pin->get_key()); + return ctx.tm.dec_ref( + ctx.t, + pin->get_key() + ).si_then( + [](auto){}, + ObjectDataHandler::write_iertr::pass_further{}, + crimson::ct_error::assert_all{ + "object_data_handler::do_removals invalid error" + } + ); + }); +} + +/// Creates zero/data extents in to_insert +ObjectDataHandler::write_ret do_insertions( + context_t ctx, + extent_to_insert_list_t &to_insert) +{ + return trans_intr::do_for_each( + to_insert, + [ctx](auto ®ion) { + LOG_PREFIX(object_data_handler.cc::do_insertions); + if (region.is_data()) { + assert_aligned(region.addr); + assert_aligned(region.len); + ceph_assert(region.len == region.bl->length()); + DEBUGT("allocating extent: {}~{}", + ctx.t, + region.addr, + region.len); + return ctx.tm.alloc_extent<ObjectDataBlock>( + ctx.t, + region.addr, + region.len + ).si_then([®ion](auto extent) { + if (extent->get_laddr() != region.addr) { + logger().debug( + "object_data_handler::do_insertions alloc got addr {}," + " should have been {}", + extent->get_laddr(), + region.addr); + } + ceph_assert(extent->get_laddr() == region.addr); + ceph_assert(extent->get_length() == region.len); + auto iter = region.bl->cbegin(); + iter.copy(region.len, extent->get_bptr().c_str()); + return ObjectDataHandler::write_iertr::now(); + }); + } else if (region.is_zero()) { + DEBUGT("reserving: {}~{}", + ctx.t, + region.addr, + region.len); + return ctx.tm.reserve_region( + ctx.t, + region.addr, + region.len + ).si_then([FNAME, ctx, ®ion](auto pin) { + ceph_assert(pin->get_length() == region.len); + if (pin->get_key() != region.addr) { + ERRORT( + "inconsistent laddr: pin: {} region {}", + ctx.t, + pin->get_key(), + region.addr); + } + ceph_assert(pin->get_key() == region.addr); + return ObjectDataHandler::write_iertr::now(); + }); + } else { + ceph_abort("impossible"); + return ObjectDataHandler::write_iertr::now(); + } + 
}); +} + +enum class overwrite_operation_t { + UNKNOWN, + OVERWRITE_ZERO, // fill unaligned data with zero + MERGE_EXISTING, // if present, merge data with the clean/pending extent + SPLIT_EXISTING, // split the existing extent, and fill unaligned data +}; + +std::ostream& operator<<( + std::ostream &out, + const overwrite_operation_t &operation) +{ + switch (operation) { + case overwrite_operation_t::UNKNOWN: + return out << "UNKNOWN"; + case overwrite_operation_t::OVERWRITE_ZERO: + return out << "OVERWRITE_ZERO"; + case overwrite_operation_t::MERGE_EXISTING: + return out << "MERGE_EXISTING"; + case overwrite_operation_t::SPLIT_EXISTING: + return out << "SPLIT_EXISTING"; + default: + return out << "!IMPOSSIBLE_OPERATION"; + } +} + +/** + * overwrite_plan_t + * + * |<--------------------------pins_size---------------------------------------------->| + * pin_begin(aligned) pin_end(aligned) + * |<------aligned_data_size-------------------------->| (aligned-bl) + * aligned_data_begin aligned_data_end + * |<-data_size->| (bl) + * data_begin end + * left(l) right(r) + * |<l_extent_size>|<l_alignment_size>| |<r_alignment_size>|<r_extent_size>| + * |<-----------left_size------------>| |<-----------right_size----------->| + * + * |<-----(existing left extent/pin)----->| |<-----(existing right extent/pin)----->| + * left_paddr right_paddr + */ +struct overwrite_plan_t { + // addresses + laddr_t pin_begin; + laddr_t pin_end; + paddr_t left_paddr; + paddr_t right_paddr; + laddr_t data_begin; + laddr_t data_end; + laddr_t aligned_data_begin; + laddr_t aligned_data_end; + + // operations + overwrite_operation_t left_operation; + overwrite_operation_t right_operation; + + // helper member + extent_len_t block_size; + +public: + extent_len_t get_left_size() const { + return data_begin - pin_begin; + } + + extent_len_t get_left_extent_size() const { + return aligned_data_begin - pin_begin; + } + + extent_len_t get_left_alignment_size() const { + return data_begin - aligned_data_begin; + } + + extent_len_t get_right_size() const { + return pin_end - data_end; + } + + extent_len_t get_right_extent_size() const { + return pin_end - aligned_data_end; + } + + extent_len_t get_right_alignment_size() const { + return aligned_data_end - data_end; + } + + extent_len_t get_aligned_data_size() const { + return aligned_data_end - aligned_data_begin; + } + + extent_len_t get_pins_size() const { + return pin_end - pin_begin; + } + + friend std::ostream& operator<<( + std::ostream& out, + const overwrite_plan_t& overwrite_plan) { + return out << "overwrite_plan_t(" + << "pin_begin=" << overwrite_plan.pin_begin + << ", pin_end=" << overwrite_plan.pin_end + << ", left_paddr=" << overwrite_plan.left_paddr + << ", right_paddr=" << overwrite_plan.right_paddr + << ", data_begin=" << overwrite_plan.data_begin + << ", data_end=" << overwrite_plan.data_end + << ", aligned_data_begin=" << overwrite_plan.aligned_data_begin + << ", aligned_data_end=" << overwrite_plan.aligned_data_end + << ", left_operation=" << overwrite_plan.left_operation + << ", right_operation=" << overwrite_plan.right_operation + << ", block_size=" << overwrite_plan.block_size + << ")"; + } + + overwrite_plan_t(laddr_t offset, + extent_len_t len, + const lba_pin_list_t& pins, + extent_len_t block_size, + Transaction& t) : + pin_begin(pins.front()->get_key()), + pin_end(pins.back()->get_key() + pins.back()->get_length()), + left_paddr(pins.front()->get_val()), + right_paddr(pins.back()->get_val()), + data_begin(offset), + data_end(offset + len), + 
aligned_data_begin(p2align((uint64_t)data_begin, (uint64_t)block_size)), + aligned_data_end(p2roundup((uint64_t)data_end, (uint64_t)block_size)), + left_operation(overwrite_operation_t::UNKNOWN), + right_operation(overwrite_operation_t::UNKNOWN), + block_size(block_size) { + validate(); + evaluate_operations(t); + assert(left_operation != overwrite_operation_t::UNKNOWN); + assert(right_operation != overwrite_operation_t::UNKNOWN); + } + +private: + // refer to overwrite_plan_t description + void validate() const { + ceph_assert(pin_begin % block_size == 0); + ceph_assert(pin_end % block_size == 0); + ceph_assert(aligned_data_begin % block_size == 0); + ceph_assert(aligned_data_end % block_size == 0); + + ceph_assert(pin_begin <= aligned_data_begin); + ceph_assert(aligned_data_begin <= data_begin); + ceph_assert(data_begin <= data_end); + ceph_assert(data_end <= aligned_data_end); + ceph_assert(aligned_data_end <= pin_end); + } + + /* + * When trying to modify a portion of an object data block, follow + * the read-full-extent-then-merge-new-data strategy, if the write + * amplification caused by it is not greater than + * seastore_obj_data_write_amplification; otherwise, split the + * original extent into at most three parts: origin-left, part-to-be-modified + * and origin-right. + */ + void evaluate_operations(Transaction& t) { + auto actual_write_size = get_pins_size(); + auto aligned_data_size = get_aligned_data_size(); + auto left_ext_size = get_left_extent_size(); + auto right_ext_size = get_right_extent_size(); + + auto can_merge = [](Transaction& t, paddr_t paddr) { + CachedExtentRef ext; + if (paddr.is_relative() || paddr.is_delayed()) { + return true; + } else if (t.get_extent(paddr, &ext) == + Transaction::get_extent_ret::PRESENT) { + // FIXME: there is no need to lookup the cache if the pin can + // be associated with the extent state + if (ext->is_mutable()) { + return true; + } + } + return false; + }; + if (left_paddr.is_zero()) { + actual_write_size -= left_ext_size; + left_ext_size = 0; + left_operation = overwrite_operation_t::OVERWRITE_ZERO; + } else if (can_merge(t, left_paddr)) { + aligned_data_size += left_ext_size; + left_ext_size = 0; + left_operation = overwrite_operation_t::MERGE_EXISTING; + } + + if (right_paddr.is_zero()) { + actual_write_size -= right_ext_size; + right_ext_size = 0; + right_operation = overwrite_operation_t::OVERWRITE_ZERO; + } else if (can_merge(t, right_paddr)) { + aligned_data_size += right_ext_size; + right_ext_size = 0; + right_operation = overwrite_operation_t::MERGE_EXISTING; + } + + while (left_operation == overwrite_operation_t::UNKNOWN || + right_operation == overwrite_operation_t::UNKNOWN) { + if (((double)actual_write_size / (double)aligned_data_size) <= + crimson::common::get_conf<double>("seastore_obj_data_write_amplification")) { + break; + } + if (left_ext_size == 0 && right_ext_size == 0) { + break; + } + if (left_ext_size >= right_ext_size) { + // split left + assert(left_operation == overwrite_operation_t::UNKNOWN); + actual_write_size -= left_ext_size; + left_ext_size = 0; + left_operation = overwrite_operation_t::SPLIT_EXISTING; + } else { // left_ext_size < right_ext_size + // split right + assert(right_operation == overwrite_operation_t::UNKNOWN); + actual_write_size -= right_ext_size; + right_ext_size = 0; + right_operation = overwrite_operation_t::SPLIT_EXISTING; + } + } + + if (left_operation == overwrite_operation_t::UNKNOWN) { + // no split left, so merge with left + left_operation = 
overwrite_operation_t::MERGE_EXISTING; + } + + if (right_operation == overwrite_operation_t::UNKNOWN) { + // no split right, so merge with right + right_operation = overwrite_operation_t::MERGE_EXISTING; + } + } +}; + +} // namespace crimson::os::seastore + +#if FMT_VERSION >= 90000 +template<> struct fmt::formatter<crimson::os::seastore::overwrite_plan_t> : fmt::ostream_formatter {}; +#endif + +namespace crimson::os::seastore { + +/** + * operate_left + * + * Proceed overwrite_plan.left_operation. + */ +using operate_ret_bare = std::pair< + std::optional<extent_to_write_t>, + std::optional<bufferptr>>; +using operate_ret = get_iertr::future<operate_ret_bare>; +operate_ret operate_left(context_t ctx, LBAMappingRef &pin, const overwrite_plan_t &overwrite_plan) +{ + if (overwrite_plan.get_left_size() == 0) { + return get_iertr::make_ready_future<operate_ret_bare>( + std::nullopt, + std::nullopt); + } + + if (overwrite_plan.left_operation == overwrite_operation_t::OVERWRITE_ZERO) { + assert(pin->get_val().is_zero()); + auto zero_extent_len = overwrite_plan.get_left_extent_size(); + assert_aligned(zero_extent_len); + auto zero_prepend_len = overwrite_plan.get_left_alignment_size(); + return get_iertr::make_ready_future<operate_ret_bare>( + (zero_extent_len == 0 + ? std::nullopt + : std::make_optional(extent_to_write_t::create_zero( + overwrite_plan.pin_begin, zero_extent_len))), + (zero_prepend_len == 0 + ? std::nullopt + : std::make_optional(bufferptr( + ceph::buffer::create(zero_prepend_len, 0)))) + ); + } else if (overwrite_plan.left_operation == overwrite_operation_t::MERGE_EXISTING) { + auto prepend_len = overwrite_plan.get_left_size(); + if (prepend_len == 0) { + return get_iertr::make_ready_future<operate_ret_bare>( + std::nullopt, + std::nullopt); + } else { + extent_len_t off = pin->get_intermediate_offset(); + return ctx.tm.read_pin<ObjectDataBlock>( + ctx.t, pin->duplicate() + ).si_then([prepend_len, off](auto left_extent) { + return get_iertr::make_ready_future<operate_ret_bare>( + std::nullopt, + std::make_optional(bufferptr( + left_extent->get_bptr(), + off, + prepend_len))); + }); + } + } else { + assert(overwrite_plan.left_operation == overwrite_operation_t::SPLIT_EXISTING); + + auto extent_len = overwrite_plan.get_left_extent_size(); + assert(extent_len); + std::optional<extent_to_write_t> left_to_write_extent = + std::make_optional(extent_to_write_t::create_existing( + pin->duplicate(), + pin->get_key(), + extent_len)); + + auto prepend_len = overwrite_plan.get_left_alignment_size(); + if (prepend_len == 0) { + return get_iertr::make_ready_future<operate_ret_bare>( + std::move(left_to_write_extent), + std::nullopt); + } else { + extent_len_t off = pin->get_intermediate_offset(); + return ctx.tm.read_pin<ObjectDataBlock>( + ctx.t, pin->duplicate() + ).si_then([prepend_offset=extent_len + off, prepend_len, + left_to_write_extent=std::move(left_to_write_extent)] + (auto left_extent) mutable { + return get_iertr::make_ready_future<operate_ret_bare>( + std::move(left_to_write_extent), + std::make_optional(bufferptr( + left_extent->get_bptr(), + prepend_offset, + prepend_len))); + }); + } + } +}; + +/** + * operate_right + * + * Proceed overwrite_plan.right_operation. 
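 *
 * As with operate_left, the result is a pair: an optional extent_to_write_t
 * describing the tail of the original mapping to keep (SPLIT_EXISTING) or
 * re-reserve as zeros (OVERWRITE_ZERO), and an optional bufferptr holding the
 * bytes the caller appends to the write buffer to pad it out to a block
 * boundary (or, for MERGE_EXISTING, to the end of the original pin).
 *
 * A hypothetical case with 4096-byte blocks, a right pin 0x0~0x4000 and
 * data_end=0x2800: MERGE_EXISTING returns no extent plus a 0x1800-byte
 * bufferptr of existing data; SPLIT_EXISTING returns EXISTING 0x3000~0x1000
 * plus a 0x800-byte bufferptr covering 0x2800~0x3000.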
+ */ +operate_ret operate_right(context_t ctx, LBAMappingRef &pin, const overwrite_plan_t &overwrite_plan) +{ + if (overwrite_plan.get_right_size() == 0) { + return get_iertr::make_ready_future<operate_ret_bare>( + std::nullopt, + std::nullopt); + } + + auto right_pin_begin = pin->get_key(); + assert(overwrite_plan.data_end >= right_pin_begin); + if (overwrite_plan.right_operation == overwrite_operation_t::OVERWRITE_ZERO) { + assert(pin->get_val().is_zero()); + auto zero_suffix_len = overwrite_plan.get_right_alignment_size(); + auto zero_extent_len = overwrite_plan.get_right_extent_size(); + assert_aligned(zero_extent_len); + return get_iertr::make_ready_future<operate_ret_bare>( + (zero_extent_len == 0 + ? std::nullopt + : std::make_optional(extent_to_write_t::create_zero( + overwrite_plan.aligned_data_end, zero_extent_len))), + (zero_suffix_len == 0 + ? std::nullopt + : std::make_optional(bufferptr( + ceph::buffer::create(zero_suffix_len, 0)))) + ); + } else if (overwrite_plan.right_operation == overwrite_operation_t::MERGE_EXISTING) { + auto append_len = overwrite_plan.get_right_size(); + if (append_len == 0) { + return get_iertr::make_ready_future<operate_ret_bare>( + std::nullopt, + std::nullopt); + } else { + auto append_offset = + overwrite_plan.data_end + - right_pin_begin + + pin->get_intermediate_offset(); + return ctx.tm.read_pin<ObjectDataBlock>( + ctx.t, pin->duplicate() + ).si_then([append_offset, append_len](auto right_extent) { + return get_iertr::make_ready_future<operate_ret_bare>( + std::nullopt, + std::make_optional(bufferptr( + right_extent->get_bptr(), + append_offset, + append_len))); + }); + } + } else { + assert(overwrite_plan.right_operation == overwrite_operation_t::SPLIT_EXISTING); + + auto extent_len = overwrite_plan.get_right_extent_size(); + assert(extent_len); + std::optional<extent_to_write_t> right_to_write_extent = + std::make_optional(extent_to_write_t::create_existing( + pin->duplicate(), + overwrite_plan.aligned_data_end, + extent_len)); + + auto append_len = overwrite_plan.get_right_alignment_size(); + if (append_len == 0) { + return get_iertr::make_ready_future<operate_ret_bare>( + std::move(right_to_write_extent), + std::nullopt); + } else { + auto append_offset = + overwrite_plan.data_end + - right_pin_begin + + pin->get_intermediate_offset(); + return ctx.tm.read_pin<ObjectDataBlock>( + ctx.t, pin->duplicate() + ).si_then([append_offset, append_len, + right_to_write_extent=std::move(right_to_write_extent)] + (auto right_extent) mutable { + return get_iertr::make_ready_future<operate_ret_bare>( + std::move(right_to_write_extent), + std::make_optional(bufferptr( + right_extent->get_bptr(), + append_offset, + append_len))); + }); + } + } +}; + +template <typename F> +auto with_object_data( + ObjectDataHandler::context_t ctx, + F &&f) +{ + return seastar::do_with( + ctx.onode.get_layout().object_data.get(), + std::forward<F>(f), + [ctx](auto &object_data, auto &f) { + return std::invoke(f, object_data + ).si_then([ctx, &object_data] { + if (object_data.must_update()) { + ctx.onode.get_mutable_layout(ctx.t).object_data.update(object_data); + } + return seastar::now(); + }); + }); +} + +template <typename F> +auto with_objects_data( + ObjectDataHandler::context_t ctx, + F &&f) +{ + ceph_assert(ctx.d_onode); + return seastar::do_with( + ctx.onode.get_layout().object_data.get(), + ctx.d_onode->get_layout().object_data.get(), + std::forward<F>(f), + [ctx](auto &object_data, auto &d_object_data, auto &f) { + return std::invoke(f, object_data, d_object_data 
+ ).si_then([ctx, &object_data, &d_object_data] { + if (object_data.must_update()) { + ctx.onode.get_mutable_layout(ctx.t).object_data.update(object_data); + } + if (d_object_data.must_update()) { + ctx.d_onode->get_mutable_layout( + ctx.t).object_data.update(d_object_data); + } + return seastar::now(); + }); + }); +} + +ObjectDataHandler::write_ret ObjectDataHandler::prepare_data_reservation( + context_t ctx, + object_data_t &object_data, + extent_len_t size) +{ + LOG_PREFIX(ObjectDataHandler::prepare_data_reservation); + ceph_assert(size <= max_object_size); + if (!object_data.is_null()) { + ceph_assert(object_data.get_reserved_data_len() == max_object_size); + DEBUGT("reservation present: {}~{}", + ctx.t, + object_data.get_reserved_data_base(), + object_data.get_reserved_data_len()); + return write_iertr::now(); + } else { + DEBUGT("reserving: {}~{}", + ctx.t, + ctx.onode.get_data_hint(), + max_object_size); + return ctx.tm.reserve_region( + ctx.t, + ctx.onode.get_data_hint(), + max_object_size + ).si_then([max_object_size=max_object_size, &object_data](auto pin) { + ceph_assert(pin->get_length() == max_object_size); + object_data.update_reserved( + pin->get_key(), + pin->get_length()); + return write_iertr::now(); + }); + } +} + +ObjectDataHandler::clear_ret ObjectDataHandler::trim_data_reservation( + context_t ctx, object_data_t &object_data, extent_len_t size) +{ + ceph_assert(!object_data.is_null()); + ceph_assert(size <= object_data.get_reserved_data_len()); + return seastar::do_with( + lba_pin_list_t(), + extent_to_write_list_t(), + [ctx, size, &object_data](auto &pins, auto &to_write) { + LOG_PREFIX(ObjectDataHandler::trim_data_reservation); + DEBUGT("object_data: {}~{}", + ctx.t, + object_data.get_reserved_data_base(), + object_data.get_reserved_data_len()); + return ctx.tm.get_pins( + ctx.t, + object_data.get_reserved_data_base() + size, + object_data.get_reserved_data_len() - size + ).si_then([ctx, size, &pins, &object_data, &to_write](auto _pins) { + _pins.swap(pins); + ceph_assert(pins.size()); + if (!size) { + // no need to reserve region if we are truncating the object's + // size to 0 + return clear_iertr::now(); + } + auto &pin = *pins.front(); + ceph_assert(pin.get_key() >= object_data.get_reserved_data_base()); + ceph_assert( + pin.get_key() <= object_data.get_reserved_data_base() + size); + auto pin_offset = pin.get_key() - + object_data.get_reserved_data_base(); + if ((pin.get_key() == (object_data.get_reserved_data_base() + size)) || + (pin.get_val().is_zero())) { + /* First pin is exactly at the boundary or is a zero pin. Either way, + * remove all pins and add a single zero pin to the end. 
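 * For example (hypothetical values): with a reserved region 0x0~0x100000
 * being truncated to size 0x8000, a first pin starting exactly at 0x8000
 * results in a single ZERO entry 0x8000~0xF8000 standing in for all of the
 * removed pins.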
*/ + to_write.push_back(extent_to_write_t::create_zero( + pin.get_key(), + object_data.get_reserved_data_len() - pin_offset)); + return clear_iertr::now(); + } else { + /* First pin overlaps the boundary and has data, remap it + * if aligned or rewrite it if not aligned to size */ + auto roundup_size = p2roundup(size, ctx.tm.get_block_size()); + auto append_len = roundup_size - size; + if (append_len == 0) { + LOG_PREFIX(ObjectDataHandler::trim_data_reservation); + TRACET("First pin overlaps the boundary and has aligned data" + "create existing at addr:{}, len:{}", + ctx.t, pin.get_key(), size - pin_offset); + to_write.push_back(extent_to_write_t::create_existing( + pin.duplicate(), + pin.get_key(), + size - pin_offset)); + to_write.push_back(extent_to_write_t::create_zero( + object_data.get_reserved_data_base() + roundup_size, + object_data.get_reserved_data_len() - roundup_size)); + return clear_iertr::now(); + } else { + return ctx.tm.read_pin<ObjectDataBlock>( + ctx.t, + pin.duplicate() + ).si_then([ctx, size, pin_offset, append_len, roundup_size, + &pin, &object_data, &to_write](auto extent) { + bufferlist bl; + bl.append( + bufferptr( + extent->get_bptr(), + pin.get_intermediate_offset(), + size - pin_offset + )); + bl.append_zero(append_len); + LOG_PREFIX(ObjectDataHandler::trim_data_reservation); + TRACET("First pin overlaps the boundary and has unaligned data" + "create data at addr:{}, len:{}", + ctx.t, pin.get_key(), bl.length()); + to_write.push_back(extent_to_write_t::create_data( + pin.get_key(), + bl)); + to_write.push_back(extent_to_write_t::create_zero( + object_data.get_reserved_data_base() + roundup_size, + object_data.get_reserved_data_len() - roundup_size)); + return clear_iertr::now(); + }); + } + } + }).si_then([ctx, size, &to_write, &object_data, &pins] { + return seastar::do_with( + prepare_ops_list(pins, to_write), + [ctx, size, &object_data](auto &ops) { + return do_remappings(ctx, ops.to_remap + ).si_then([ctx, &ops] { + return do_removals(ctx, ops.to_remove); + }).si_then([ctx, &ops] { + return do_insertions(ctx, ops.to_insert); + }).si_then([size, &object_data] { + if (size == 0) { + object_data.clear(); + } + return ObjectDataHandler::clear_iertr::now(); + }); + }); + }); + }); +} + +/** + * get_to_writes_with_zero_buffer + * + * Returns extent_to_write_t's reflecting a zero region extending + * from offset~len with headptr optionally on the left and tailptr + * optionally on the right. + */ +extent_to_write_list_t get_to_writes_with_zero_buffer( + const extent_len_t block_size, + laddr_t offset, extent_len_t len, + std::optional<bufferptr> &&headptr, std::optional<bufferptr> &&tailptr) +{ + auto zero_left = p2roundup(offset, (laddr_t)block_size); + auto zero_right = p2align(offset + len, (laddr_t)block_size); + auto left = headptr ? (offset - headptr->length()) : offset; + auto right = tailptr ? + (offset + len + tailptr->length()) : + (offset + len); + + assert( + (headptr && ((zero_left - left) == + p2roundup(headptr->length(), block_size))) ^ + (!headptr && (zero_left == left))); + assert( + (tailptr && ((right - zero_right) == + p2roundup(tailptr->length(), block_size))) ^ + (!tailptr && (right == zero_right))); + + assert(right > left); + assert((left % block_size) == 0); + assert((right % block_size) == 0); + + // zero region too small for a reserved section, + // headptr and tailptr in same extent + if (zero_right <= zero_left) { + bufferlist bl; + if (headptr) { + bl.append(*headptr); + } + bl.append_zero( + right - left - bl.length() - (tailptr ? 
tailptr->length() : 0)); + if (tailptr) { + bl.append(*tailptr); + } + assert(bl.length() % block_size == 0); + assert(bl.length() == (right - left)); + extent_to_write_list_t ret; + ret.push_back(extent_to_write_t::create_data(left, bl)); + return ret; + } else { + // reserved section between ends, headptr and tailptr in different extents + extent_to_write_list_t ret; + if (headptr) { + bufferlist headbl; + headbl.append(*headptr); + headbl.append_zero(zero_left - left - headbl.length()); + assert(headbl.length() % block_size == 0); + assert(headbl.length() > 0); + ret.push_back(extent_to_write_t::create_data(left, headbl)); + } + // reserved zero region + ret.push_back(extent_to_write_t::create_zero(zero_left, zero_right - zero_left)); + assert(ret.back().len % block_size == 0); + assert(ret.back().len > 0); + if (tailptr) { + bufferlist tailbl; + tailbl.append(*tailptr); + tailbl.append_zero(right - zero_right - tailbl.length()); + assert(tailbl.length() % block_size == 0); + assert(tailbl.length() > 0); + ret.push_back(extent_to_write_t::create_data(zero_right, tailbl)); + } + return ret; + } +} + +/** + * get_to_writes + * + * Returns extent_to_write_t's from bl. + * + * TODO: probably add some kind of upper limit on extent size. + */ +extent_to_write_list_t get_to_writes(laddr_t offset, bufferlist &bl) +{ + auto ret = extent_to_write_list_t(); + ret.push_back(extent_to_write_t::create_data(offset, bl)); + return ret; +}; + +ObjectDataHandler::write_ret ObjectDataHandler::overwrite( + context_t ctx, + laddr_t offset, + extent_len_t len, + std::optional<bufferlist> &&bl, + lba_pin_list_t &&_pins) +{ + if (bl.has_value()) { + assert(bl->length() == len); + } + overwrite_plan_t overwrite_plan(offset, len, _pins, ctx.tm.get_block_size(), ctx.t); + return seastar::do_with( + std::move(_pins), + extent_to_write_list_t(), + [ctx, len, offset, overwrite_plan, bl=std::move(bl)] + (auto &pins, auto &to_write) mutable + { + LOG_PREFIX(ObjectDataHandler::overwrite); + DEBUGT("overwrite: {}~{}", + ctx.t, + offset, + len); + ceph_assert(pins.size() >= 1); + DEBUGT("overwrite: split overwrite_plan {}", ctx.t, overwrite_plan); + + return operate_left( + ctx, + pins.front(), + overwrite_plan + ).si_then([ctx, len, offset, overwrite_plan, bl=std::move(bl), + &to_write, &pins](auto p) mutable { + auto &[left_extent, headptr] = p; + if (left_extent) { + ceph_assert(left_extent->addr == overwrite_plan.pin_begin); + append_extent_to_write(to_write, std::move(*left_extent)); + } + if (headptr) { + assert(headptr->length() > 0); + } + return operate_right( + ctx, + pins.back(), + overwrite_plan + ).si_then([ctx, len, offset, + pin_begin=overwrite_plan.pin_begin, + pin_end=overwrite_plan.pin_end, + bl=std::move(bl), headptr=std::move(headptr), + &to_write, &pins](auto p) mutable { + auto &[right_extent, tailptr] = p; + if (bl.has_value()) { + auto write_offset = offset; + bufferlist write_bl; + if (headptr) { + write_bl.append(*headptr); + write_offset -= headptr->length(); + assert_aligned(write_offset); + } + write_bl.claim_append(*bl); + if (tailptr) { + write_bl.append(*tailptr); + assert_aligned(write_bl.length()); + } + splice_extent_to_write( + to_write, + get_to_writes(write_offset, write_bl)); + } else { + splice_extent_to_write( + to_write, + get_to_writes_with_zero_buffer( + ctx.tm.get_block_size(), + offset, + len, + std::move(headptr), + std::move(tailptr))); + } + if (right_extent) { + ceph_assert(right_extent->get_end_addr() == pin_end); + append_extent_to_write(to_write, 
std::move(*right_extent)); + } + assert(to_write.size()); + assert(pin_begin == to_write.front().addr); + assert(pin_end == to_write.back().get_end_addr()); + + return seastar::do_with( + prepare_ops_list(pins, to_write), + [ctx](auto &ops) { + return do_remappings(ctx, ops.to_remap + ).si_then([ctx, &ops] { + return do_removals(ctx, ops.to_remove); + }).si_then([ctx, &ops] { + return do_insertions(ctx, ops.to_insert); + }); + }); + }); + }); + }); +} + +ObjectDataHandler::zero_ret ObjectDataHandler::zero( + context_t ctx, + objaddr_t offset, + extent_len_t len) +{ + return with_object_data( + ctx, + [this, ctx, offset, len](auto &object_data) { + LOG_PREFIX(ObjectDataHandler::zero); + DEBUGT("zero to {}~{}, object_data: {}~{}, is_null {}", + ctx.t, + offset, + len, + object_data.get_reserved_data_base(), + object_data.get_reserved_data_len(), + object_data.is_null()); + return prepare_data_reservation( + ctx, + object_data, + p2roundup(offset + len, ctx.tm.get_block_size()) + ).si_then([this, ctx, offset, len, &object_data] { + auto logical_offset = object_data.get_reserved_data_base() + offset; + return ctx.tm.get_pins( + ctx.t, + logical_offset, + len + ).si_then([this, ctx, logical_offset, len](auto pins) { + return overwrite( + ctx, logical_offset, len, + std::nullopt, std::move(pins)); + }); + }); + }); +} + +ObjectDataHandler::write_ret ObjectDataHandler::write( + context_t ctx, + objaddr_t offset, + const bufferlist &bl) +{ + return with_object_data( + ctx, + [this, ctx, offset, &bl](auto &object_data) { + LOG_PREFIX(ObjectDataHandler::write); + DEBUGT("writing to {}~{}, object_data: {}~{}, is_null {}", + ctx.t, + offset, + bl.length(), + object_data.get_reserved_data_base(), + object_data.get_reserved_data_len(), + object_data.is_null()); + return prepare_data_reservation( + ctx, + object_data, + p2roundup(offset + bl.length(), ctx.tm.get_block_size()) + ).si_then([this, ctx, offset, &object_data, &bl] { + auto logical_offset = object_data.get_reserved_data_base() + offset; + return ctx.tm.get_pins( + ctx.t, + logical_offset, + bl.length() + ).si_then([this, ctx,logical_offset, &bl]( + auto pins) { + return overwrite( + ctx, logical_offset, bl.length(), + bufferlist(bl), std::move(pins)); + }); + }); + }); +} + +ObjectDataHandler::read_ret ObjectDataHandler::read( + context_t ctx, + objaddr_t obj_offset, + extent_len_t len) +{ + return seastar::do_with( + bufferlist(), + [ctx, obj_offset, len](auto &ret) { + return with_object_data( + ctx, + [ctx, obj_offset, len, &ret](const auto &object_data) { + LOG_PREFIX(ObjectDataHandler::read); + DEBUGT("reading {}~{}", + ctx.t, + object_data.get_reserved_data_base(), + object_data.get_reserved_data_len()); + /* Assumption: callers ensure that onode size is <= reserved + * size and that len is adjusted here prior to call */ + ceph_assert(!object_data.is_null()); + ceph_assert((obj_offset + len) <= object_data.get_reserved_data_len()); + ceph_assert(len > 0); + laddr_t loffset = + object_data.get_reserved_data_base() + obj_offset; + return ctx.tm.get_pins( + ctx.t, + loffset, + len + ).si_then([ctx, loffset, len, &ret](auto _pins) { + // offset~len falls within reserved region and len > 0 + ceph_assert(_pins.size() >= 1); + ceph_assert((*_pins.begin())->get_key() <= loffset); + return seastar::do_with( + std::move(_pins), + loffset, + [ctx, loffset, len, &ret](auto &pins, auto ¤t) { + return trans_intr::do_for_each( + pins, + [ctx, loffset, len, ¤t, &ret](auto &pin) + -> read_iertr::future<> { + ceph_assert(current <= (loffset + len)); + 
ceph_assert( + (loffset + len) > pin->get_key()); + laddr_t end = std::min( + pin->get_key() + pin->get_length(), + loffset + len); + if (pin->get_val().is_zero()) { + ceph_assert(end > current); // See LBAManager::get_mappings + ret.append_zero(end - current); + current = end; + return seastar::now(); + } else { + LOG_PREFIX(ObjectDataHandler::read); + auto key = pin->get_key(); + bool is_indirect = pin->is_indirect(); + extent_len_t off = pin->get_intermediate_offset(); + DEBUGT("reading {}~{}, indirect: {}, " + "intermediate offset: {}, current: {}, end: {}", + ctx.t, + key, + pin->get_length(), + is_indirect, + off, + current, + end); + return ctx.tm.read_pin<ObjectDataBlock>( + ctx.t, + std::move(pin) + ).si_then([&ret, ¤t, end, key, off, + is_indirect](auto extent) { + ceph_assert( + is_indirect + ? (key - off + extent->get_length()) >= end + : (extent->get_laddr() + extent->get_length()) >= end); + ceph_assert(end > current); + ret.append( + bufferptr( + extent->get_bptr(), + off + current - (is_indirect ? key : extent->get_laddr()), + end - current)); + current = end; + return seastar::now(); + }).handle_error_interruptible( + read_iertr::pass_further{}, + crimson::ct_error::assert_all{ + "ObjectDataHandler::read hit invalid error" + } + ); + } + }); + }); + }); + }).si_then([&ret] { + return std::move(ret); + }); + }); +} + +ObjectDataHandler::fiemap_ret ObjectDataHandler::fiemap( + context_t ctx, + objaddr_t obj_offset, + extent_len_t len) +{ + return seastar::do_with( + std::map<uint64_t, uint64_t>(), + [ctx, obj_offset, len](auto &ret) { + return with_object_data( + ctx, + [ctx, obj_offset, len, &ret](const auto &object_data) { + LOG_PREFIX(ObjectDataHandler::fiemap); + DEBUGT( + "{}~{}, reservation {}~{}", + ctx.t, + obj_offset, + len, + object_data.get_reserved_data_base(), + object_data.get_reserved_data_len()); + /* Assumption: callers ensure that onode size is <= reserved + * size and that len is adjusted here prior to call */ + ceph_assert(!object_data.is_null()); + ceph_assert((obj_offset + len) <= object_data.get_reserved_data_len()); + ceph_assert(len > 0); + laddr_t loffset = + object_data.get_reserved_data_base() + obj_offset; + return ctx.tm.get_pins( + ctx.t, + loffset, + len + ).si_then([loffset, len, &object_data, &ret](auto &&pins) { + ceph_assert(pins.size() >= 1); + ceph_assert((*pins.begin())->get_key() <= loffset); + for (auto &&i: pins) { + if (!(i->get_val().is_zero())) { + auto ret_left = std::max(i->get_key(), loffset); + auto ret_right = std::min( + i->get_key() + i->get_length(), + loffset + len); + assert(ret_right > ret_left); + ret.emplace( + std::make_pair( + ret_left - object_data.get_reserved_data_base(), + ret_right - ret_left + )); + } + } + }); + }).si_then([&ret] { + return std::move(ret); + }); + }); +} + +ObjectDataHandler::truncate_ret ObjectDataHandler::truncate( + context_t ctx, + objaddr_t offset) +{ + return with_object_data( + ctx, + [this, ctx, offset](auto &object_data) { + LOG_PREFIX(ObjectDataHandler::truncate); + DEBUGT("truncating {}~{} offset: {}", + ctx.t, + object_data.get_reserved_data_base(), + object_data.get_reserved_data_len(), + offset); + if (offset < object_data.get_reserved_data_len()) { + return trim_data_reservation(ctx, object_data, offset); + } else if (offset > object_data.get_reserved_data_len()) { + return prepare_data_reservation( + ctx, + object_data, + p2roundup(offset, ctx.tm.get_block_size())); + } else { + return truncate_iertr::now(); + } + }); +} + +ObjectDataHandler::clear_ret 
ObjectDataHandler::clear( + context_t ctx) +{ + return with_object_data( + ctx, + [this, ctx](auto &object_data) { + LOG_PREFIX(ObjectDataHandler::clear); + DEBUGT("clearing: {}~{}", + ctx.t, + object_data.get_reserved_data_base(), + object_data.get_reserved_data_len()); + if (object_data.is_null()) { + return clear_iertr::now(); + } + return trim_data_reservation(ctx, object_data, 0); + }); +} + +ObjectDataHandler::clone_ret ObjectDataHandler::clone_extents( + context_t ctx, + object_data_t &object_data, + lba_pin_list_t &pins, + laddr_t data_base) +{ + LOG_PREFIX(ObjectDataHandler::clone_extents); + TRACET(" object_data: {}~{}, data_base: {}", + ctx.t, + object_data.get_reserved_data_base(), + object_data.get_reserved_data_len(), + data_base); + return ctx.tm.dec_ref( + ctx.t, + object_data.get_reserved_data_base() + ).si_then( + [&pins, &object_data, ctx, data_base](auto) mutable { + return seastar::do_with( + (extent_len_t)0, + [&object_data, ctx, data_base, &pins](auto &last_pos) { + return trans_intr::do_for_each( + pins, + [&last_pos, &object_data, ctx, data_base](auto &pin) { + auto offset = pin->get_key() - data_base; + ceph_assert(offset == last_pos); + auto fut = TransactionManager::alloc_extent_iertr + ::make_ready_future<LBAMappingRef>(); + auto addr = object_data.get_reserved_data_base() + offset; + if (pin->get_val().is_zero()) { + fut = ctx.tm.reserve_region(ctx.t, addr, pin->get_length()); + } else { + fut = ctx.tm.clone_pin(ctx.t, addr, *pin); + } + return fut.si_then( + [&pin, &last_pos, offset](auto) { + last_pos = offset + pin->get_length(); + return seastar::now(); + }).handle_error_interruptible( + crimson::ct_error::input_output_error::pass_further(), + crimson::ct_error::assert_all("not possible") + ); + }).si_then([&last_pos, &object_data, ctx] { + if (last_pos != object_data.get_reserved_data_len()) { + return ctx.tm.reserve_region( + ctx.t, + object_data.get_reserved_data_base() + last_pos, + object_data.get_reserved_data_len() - last_pos + ).si_then([](auto) { + return seastar::now(); + }); + } + return TransactionManager::reserve_extent_iertr::now(); + }); + }); + }, + ObjectDataHandler::write_iertr::pass_further{}, + crimson::ct_error::assert_all{ + "object_data_handler::clone invalid error" + } + ); +} + +ObjectDataHandler::clone_ret ObjectDataHandler::clone( + context_t ctx) +{ + // the whole clone procedure can be seperated into the following steps: + // 1. let clone onode(d_object_data) take the head onode's + // object data base; + // 2. reserve a new region in lba tree for the head onode; + // 3. clone all extents of the clone onode, see transaction_manager.h + // for the details of clone_pin; + // 4. reserve the space between the head onode's size and its reservation + // length. 
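+ // In short: both the head onode and the clone end up with fresh
+ // reservations below, the original extents are cloned into each via
+ // clone_extents(), and the source mappings are finally retired with
+ // do_removals().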
+ return with_objects_data( + ctx, + [ctx, this](auto &object_data, auto &d_object_data) { + ceph_assert(d_object_data.is_null()); + if (object_data.is_null()) { + return clone_iertr::now(); + } + return prepare_data_reservation( + ctx, + d_object_data, + object_data.get_reserved_data_len() + ).si_then([&object_data, &d_object_data, ctx, this] { + assert(!object_data.is_null()); + auto base = object_data.get_reserved_data_base(); + auto len = object_data.get_reserved_data_len(); + object_data.clear(); + LOG_PREFIX(ObjectDataHandler::clone); + DEBUGT("cloned obj reserve_data_base: {}, len {}", + ctx.t, + d_object_data.get_reserved_data_base(), + d_object_data.get_reserved_data_len()); + return prepare_data_reservation( + ctx, + object_data, + d_object_data.get_reserved_data_len() + ).si_then([&d_object_data, ctx, &object_data, base, len, this] { + LOG_PREFIX("ObjectDataHandler::clone"); + DEBUGT("head obj reserve_data_base: {}, len {}", + ctx.t, + object_data.get_reserved_data_base(), + object_data.get_reserved_data_len()); + return ctx.tm.get_pins(ctx.t, base, len + ).si_then([ctx, &object_data, &d_object_data, base, this](auto pins) { + return seastar::do_with( + std::move(pins), + [ctx, &object_data, &d_object_data, base, this](auto &pins) { + return clone_extents(ctx, object_data, pins, base + ).si_then([ctx, &d_object_data, base, &pins, this] { + return clone_extents(ctx, d_object_data, pins, base); + }).si_then([&pins, ctx] { + return do_removals(ctx, pins); + }); + }); + }); + }); + }); + }); +} + +} // namespace crimson::os::seastore diff --git a/src/crimson/os/seastore/object_data_handler.h b/src/crimson/os/seastore/object_data_handler.h new file mode 100644 index 000000000..b5f432d5a --- /dev/null +++ b/src/crimson/os/seastore/object_data_handler.h @@ -0,0 +1,156 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <iostream> +#include <limits> + +#include "include/buffer.h" + +#include "test/crimson/seastore/test_block.h" // TODO + +#include "crimson/os/seastore/onode.h" +#include "crimson/os/seastore/transaction_manager.h" +#include "crimson/os/seastore/transaction.h" + +namespace crimson::os::seastore { + +struct ObjectDataBlock : crimson::os::seastore::LogicalCachedExtent { + using Ref = TCachedExtentRef<ObjectDataBlock>; + + explicit ObjectDataBlock(ceph::bufferptr &&ptr) + : LogicalCachedExtent(std::move(ptr)) {} + explicit ObjectDataBlock(const ObjectDataBlock &other) + : LogicalCachedExtent(other) {} + explicit ObjectDataBlock(extent_len_t length) + : LogicalCachedExtent(length) {} + + CachedExtentRef duplicate_for_write(Transaction&) final { + return CachedExtentRef(new ObjectDataBlock(*this)); + }; + + static constexpr extent_types_t TYPE = extent_types_t::OBJECT_DATA_BLOCK; + extent_types_t get_type() const final { + return TYPE; + } + + ceph::bufferlist get_delta() final { + /* Currently, we always allocate fresh ObjectDataBlock's rather than + * mutating existing ones. 
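+ * Consequently get_delta() and apply_delta() are never expected to be
+ * called and simply assert.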
*/ + ceph_assert(0 == "Should be impossible"); + } + + void apply_delta(const ceph::bufferlist &bl) final { + // See get_delta() + ceph_assert(0 == "Should be impossible"); + } +}; +using ObjectDataBlockRef = TCachedExtentRef<ObjectDataBlock>; + +class ObjectDataHandler { +public: + using base_iertr = TransactionManager::base_iertr; + + ObjectDataHandler(uint32_t mos) : max_object_size(mos) {} + + struct context_t { + TransactionManager &tm; + Transaction &t; + Onode &onode; + Onode *d_onode = nullptr; // The desination node in case of clone + }; + + /// Writes bl to [offset, offset + bl.length()) + using write_iertr = base_iertr; + using write_ret = write_iertr::future<>; + write_ret write( + context_t ctx, + objaddr_t offset, + const bufferlist &bl); + + using zero_iertr = base_iertr; + using zero_ret = zero_iertr::future<>; + zero_ret zero( + context_t ctx, + objaddr_t offset, + extent_len_t len); + + /// Reads data in [offset, offset + len) + using read_iertr = base_iertr; + using read_ret = read_iertr::future<bufferlist>; + read_ret read( + context_t ctx, + objaddr_t offset, + extent_len_t len); + + /// sparse read data, get range interval in [offset, offset + len) + using fiemap_iertr = base_iertr; + using fiemap_ret = fiemap_iertr::future<std::map<uint64_t, uint64_t>>; + fiemap_ret fiemap( + context_t ctx, + objaddr_t offset, + extent_len_t len); + + /// Clears data past offset + using truncate_iertr = base_iertr; + using truncate_ret = truncate_iertr::future<>; + truncate_ret truncate( + context_t ctx, + objaddr_t offset); + + /// Clears data and reservation + using clear_iertr = base_iertr; + using clear_ret = clear_iertr::future<>; + clear_ret clear(context_t ctx); + + /// Clone data of an Onode + using clone_iertr = base_iertr; + using clone_ret = clone_iertr::future<>; + clone_ret clone(context_t ctx); + +private: + /// Updates region [_offset, _offset + bl.length) to bl + write_ret overwrite( + context_t ctx, ///< [in] ctx + laddr_t offset, ///< [in] write offset + extent_len_t len, ///< [in] len to write, len == bl->length() if bl + std::optional<bufferlist> &&bl, ///< [in] buffer to write, empty for zeros + lba_pin_list_t &&pins ///< [in] set of pins overlapping above region + ); + + /// Ensures object_data reserved region is prepared + write_ret prepare_data_reservation( + context_t ctx, + object_data_t &object_data, + extent_len_t size); + + /// Trims data past size + clear_ret trim_data_reservation( + context_t ctx, + object_data_t &object_data, + extent_len_t size); + + clone_ret clone_extents( + context_t ctx, + object_data_t &object_data, + lba_pin_list_t &pins, + laddr_t data_base); + +private: + /** + * max_object_size + * + * For now, we allocate a fixed region of laddr space of size max_object_size + * for any object. In the future, once we have the ability to remap logical + * mappings (necessary for clone), we'll add the ability to grow and shrink + * these regions and remove this assumption. 
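+ * The effective value is supplied via the constructor
+ * (ObjectDataHandler(uint32_t mos)); the in-class default of 0 is only a
+ * placeholder.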
+ */ + const uint32_t max_object_size = 0; +}; + +} + +#if FMT_VERSION >= 90000 +template <> struct fmt::formatter<crimson::os::seastore::ObjectDataBlock> : fmt::ostream_formatter {}; +#endif diff --git a/src/crimson/os/seastore/omap_manager.cc b/src/crimson/os/seastore/omap_manager.cc new file mode 100644 index 000000000..7ad37a2e9 --- /dev/null +++ b/src/crimson/os/seastore/omap_manager.cc @@ -0,0 +1,42 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#include <experimental/iterator> +#include <iostream> + +#include "crimson/os/seastore/transaction_manager.h" +#include "crimson/os/seastore/omap_manager.h" +#include "crimson/os/seastore/omap_manager/btree/btree_omap_manager.h" + +namespace crimson::os::seastore::omap_manager { + +OMapManagerRef create_omap_manager(TransactionManager &trans_manager) { + return OMapManagerRef(new BtreeOMapManager(trans_manager)); +} + +} + +namespace std { +std::ostream &operator<<(std::ostream &out, const std::pair<std::string, std::string> &rhs) +{ + return out << "key_value_map (" << rhs.first<< "->" << rhs.second << ")"; +} +} + +namespace crimson::os::seastore { + +std::ostream &operator<<(std::ostream &out, const std::list<std::string> &rhs) +{ + out << '['; + std::copy(std::begin(rhs), std::end(rhs), std::experimental::make_ostream_joiner(out, ", ")); + return out << ']'; +} + +std::ostream &operator<<(std::ostream &out, const std::vector<std::pair<std::string, std::string>> &rhs) +{ + out << '['; + std::ostream_iterator<std::pair<std::string, std::string>> out_it(out, ", "); + std::copy(rhs.begin(), rhs.end(), out_it); + return out << ']'; +} + +} diff --git a/src/crimson/os/seastore/omap_manager.h b/src/crimson/os/seastore/omap_manager.h new file mode 100644 index 000000000..fc4e03e2b --- /dev/null +++ b/src/crimson/os/seastore/omap_manager.h @@ -0,0 +1,210 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <iostream> + +#include <boost/intrusive_ptr.hpp> +#include <boost/smart_ptr/intrusive_ref_counter.hpp> + +#include <seastar/core/future.hh> + +#include "crimson/osd/exceptions.h" +#include "crimson/os/seastore/seastore_types.h" +#include "crimson/os/seastore/transaction_manager.h" + +#define OMAP_INNER_BLOCK_SIZE 4096 +#define OMAP_LEAF_BLOCK_SIZE 8192 + +namespace crimson::os::seastore { + +std::ostream &operator<<(std::ostream &out, const std::list<std::string> &rhs); +std::ostream &operator<<(std::ostream &out, const std::map<std::string, std::string> &rhs); + +class OMapManager { + /* all OMapManager API use reference to transfer input string parameters, + * the upper caller should guarantee the referenced string values alive (not freed) + * until these functions future resolved. + */ +public: + using base_iertr = TransactionManager::base_iertr; + + /** + * allocate omap tree root node + * + * @param Transaction &t, current transaction + * @retval return the omap_root_t structure. + */ + using initialize_omap_iertr = base_iertr; + using initialize_omap_ret = initialize_omap_iertr::future<omap_root_t>; + virtual initialize_omap_ret initialize_omap(Transaction &t, laddr_t hint) = 0; + + /** + * get value(string) by key(string) + * + * @param omap_root_t &omap_root, omap btree root information + * @param Transaction &t, current transaction + * @param string &key, omap string key + * @retval return string key->string value mapping pair. 
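+ * (the value is returned as a std::optional<bufferlist>;
+ * std::nullopt indicates the key was not found)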
+ */ + using omap_get_value_iertr = base_iertr; + using omap_get_value_ret = omap_get_value_iertr::future< + std::optional<bufferlist>>; + virtual omap_get_value_ret omap_get_value( + const omap_root_t &omap_root, + Transaction &t, + const std::string &key) = 0; + + /** + * set key value mapping in omap + * + * @param omap_root_t &omap_root, omap btree root information + * @param Transaction &t, current transaction + * @param string &key, omap string key + * @param string &value, mapped value corresponding key + */ + using omap_set_key_iertr = base_iertr; + using omap_set_key_ret = omap_set_key_iertr::future<>; + virtual omap_set_key_ret omap_set_key( + omap_root_t &omap_root, + Transaction &t, + const std::string &key, + const ceph::bufferlist &value) = 0; + + using omap_set_keys_iertr = base_iertr; + using omap_set_keys_ret = omap_set_keys_iertr::future<>; + virtual omap_set_keys_ret omap_set_keys( + omap_root_t &omap_root, + Transaction &t, + std::map<std::string, ceph::bufferlist>&& keys) = 0; + + /** + * remove key value mapping in omap tree + * + * @param omap_root_t &omap_root, omap btree root information + * @param Transaction &t, current transaction + * @param string &key, omap string key + */ + using omap_rm_key_iertr = base_iertr; + using omap_rm_key_ret = omap_rm_key_iertr::future<>; + virtual omap_rm_key_ret omap_rm_key( + omap_root_t &omap_root, + Transaction &t, + const std::string &key) = 0; + + /** + * omap_list + * + * Scans key/value pairs in order. + * + * @param omap_root: omap btree root information + * @param t: current transaction + * @param first: range start, nullopt sorts before any string, + * behavior based on config.inclusive, + * must alive during the call + * @param last: range end, nullopt sorts after any string, + * behavior based on config.inclusive, + * must alive during the call + * @param config: see below for params + * @retval listed key->value and bool indicating complete + */ + struct omap_list_config_t { + /// max results to return + size_t max_result_size = 128; + + /// true denotes behavior like lower_bound, upper_bound otherwise + /// range start behavior + bool first_inclusive = false; + /// range end behavior + bool last_inclusive = false; + + omap_list_config_t( + size_t max_result_size, + bool first_inclusive, + bool last_inclusive) + : max_result_size(max_result_size), + first_inclusive(first_inclusive), + last_inclusive(last_inclusive) {} + omap_list_config_t() {} + omap_list_config_t(const omap_list_config_t &) = default; + omap_list_config_t(omap_list_config_t &&) = default; + omap_list_config_t &operator=(const omap_list_config_t &) = default; + omap_list_config_t &operator=(omap_list_config_t &&) = default; + + auto with_max(size_t max) { + this->max_result_size = max; + return *this; + } + + auto without_max() { + this->max_result_size = std::numeric_limits<size_t>::max(); + return *this; + } + + auto with_inclusive( + bool first_inclusive, + bool last_inclusive) { + this->first_inclusive = first_inclusive; + this->last_inclusive = last_inclusive; + return *this; + } + + auto with_reduced_max(size_t reduced_by) const { + assert(reduced_by <= max_result_size); + return omap_list_config_t( + max_result_size - reduced_by, + first_inclusive, + last_inclusive); + } + }; + using omap_list_iertr = base_iertr; + using omap_list_bare_ret = std::tuple< + bool, + std::map<std::string, bufferlist, std::less<>>>; + using omap_list_ret = omap_list_iertr::future<omap_list_bare_ret>; + virtual omap_list_ret omap_list( + const omap_root_t &omap_root, 
+ Transaction &t, + const std::optional<std::string> &first, + const std::optional<std::string> &last, + omap_list_config_t config = omap_list_config_t()) = 0; + + /** + * remove key value mappings in a key range from omap tree + * + * @param omap_root_t &omap_root, omap btree root information + * @param Transaction &t, current transaction + * @param string &first, range start + * @param string &last, range end + */ + using omap_rm_key_range_iertr = base_iertr; + using omap_rm_key_range_ret = omap_rm_key_range_iertr::future<>; + virtual omap_rm_key_range_ret omap_rm_key_range( + omap_root_t &omap_root, + Transaction &t, + const std::string &first, + const std::string &last, + omap_list_config_t config) = 0; + + /** + * clear all omap tree key->value mapping + * + * @param omap_root_t &omap_root, omap btree root information + * @param Transaction &t, current transaction + */ + using omap_clear_iertr = base_iertr; + using omap_clear_ret = omap_clear_iertr::future<>; + virtual omap_clear_ret omap_clear(omap_root_t &omap_root, Transaction &t) = 0; + + virtual ~OMapManager() {} +}; +using OMapManagerRef = std::unique_ptr<OMapManager>; + +namespace omap_manager { + +OMapManagerRef create_omap_manager ( + TransactionManager &trans_manager); +} + +} diff --git a/src/crimson/os/seastore/omap_manager/btree/btree_omap_manager.cc b/src/crimson/os/seastore/omap_manager/btree/btree_omap_manager.cc new file mode 100644 index 000000000..1782d7ee6 --- /dev/null +++ b/src/crimson/os/seastore/omap_manager/btree/btree_omap_manager.cc @@ -0,0 +1,293 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <string.h> + +#include "crimson/common/log.h" + +#include "crimson/os/seastore/seastore_types.h" +#include "crimson/os/seastore/omap_manager/btree/btree_omap_manager.h" +#include "crimson/os/seastore/omap_manager/btree/omap_btree_node_impl.h" + +SET_SUBSYS(seastore_omap); + +namespace crimson::os::seastore::omap_manager { + +BtreeOMapManager::BtreeOMapManager( + TransactionManager &tm) + : tm(tm) {} + +BtreeOMapManager::initialize_omap_ret +BtreeOMapManager::initialize_omap(Transaction &t, laddr_t hint) +{ + LOG_PREFIX(BtreeOMapManager::initialize_omap); + DEBUGT("hint: {}", t, hint); + return tm.alloc_extent<OMapLeafNode>(t, hint, OMAP_LEAF_BLOCK_SIZE) + .si_then([hint, &t](auto&& root_extent) { + root_extent->set_size(0); + omap_node_meta_t meta{1}; + root_extent->set_meta(meta); + omap_root_t omap_root; + omap_root.update(root_extent->get_laddr(), 1, hint); + t.get_omap_tree_stats().depth = 1u; + t.get_omap_tree_stats().extents_num_delta++; + return initialize_omap_iertr::make_ready_future<omap_root_t>(omap_root); + }); +} + +BtreeOMapManager::get_root_ret +BtreeOMapManager::get_omap_root(omap_context_t oc, const omap_root_t &omap_root) +{ + assert(omap_root.get_location() != L_ADDR_NULL); + laddr_t laddr = omap_root.get_location(); + return omap_load_extent(oc, laddr, omap_root.get_depth()); +} + +BtreeOMapManager::handle_root_split_ret +BtreeOMapManager::handle_root_split( + omap_context_t oc, + omap_root_t &omap_root, + const OMapNode::mutation_result_t& mresult) +{ + LOG_PREFIX(BtreeOMapManager::handle_root_split); + DEBUGT("{}", oc.t, omap_root); + return oc.tm.alloc_extent<OMapInnerNode>(oc.t, omap_root.hint, + OMAP_INNER_BLOCK_SIZE) + .si_then([&omap_root, mresult, oc](auto&& nroot) -> handle_root_split_ret { + auto [left, right, pivot] = *(mresult.split_tuple); + omap_node_meta_t meta{omap_root.depth + 1}; + nroot->set_meta(meta); + 
nroot->journal_inner_insert(nroot->iter_begin(), left->get_laddr(), + "", nroot->maybe_get_delta_buffer()); + nroot->journal_inner_insert(nroot->iter_begin() + 1, right->get_laddr(), + pivot, nroot->maybe_get_delta_buffer()); + omap_root.update(nroot->get_laddr(), omap_root.get_depth() + 1, omap_root.hint); + oc.t.get_omap_tree_stats().depth = omap_root.depth; + ++(oc.t.get_omap_tree_stats().extents_num_delta); + return seastar::now(); + }); +} + +BtreeOMapManager::handle_root_merge_ret +BtreeOMapManager::handle_root_merge( + omap_context_t oc, + omap_root_t &omap_root, + OMapNode::mutation_result_t mresult) +{ + LOG_PREFIX(BtreeOMapManager::handle_root_merge); + DEBUGT("{}", oc.t, omap_root); + auto root = *(mresult.need_merge); + auto iter = root->cast<OMapInnerNode>()->iter_begin(); + omap_root.update( + iter->get_val(), + omap_root.depth -= 1, + omap_root.hint); + oc.t.get_omap_tree_stats().depth = omap_root.depth; + oc.t.get_omap_tree_stats().extents_num_delta--; + return oc.tm.dec_ref(oc.t, root->get_laddr() + ).si_then([](auto &&ret) -> handle_root_merge_ret { + return seastar::now(); + }).handle_error_interruptible( + handle_root_merge_iertr::pass_further{}, + crimson::ct_error::assert_all{ + "Invalid error in handle_root_merge" + } + ); +} + +BtreeOMapManager::omap_get_value_ret +BtreeOMapManager::omap_get_value( + const omap_root_t &omap_root, + Transaction &t, + const std::string &key) +{ + LOG_PREFIX(BtreeOMapManager::omap_get_value); + DEBUGT("key={}", t, key); + return get_omap_root( + get_omap_context(t, omap_root.hint), + omap_root + ).si_then([this, &t, &key, &omap_root](auto&& extent) { + return extent->get_value(get_omap_context(t, omap_root.hint), key); + }).si_then([](auto &&e) { + return omap_get_value_ret( + interruptible::ready_future_marker{}, + std::move(e)); + }); +} + +BtreeOMapManager::omap_set_keys_ret +BtreeOMapManager::omap_set_keys( + omap_root_t &omap_root, + Transaction &t, + std::map<std::string, ceph::bufferlist>&& keys) +{ + return seastar::do_with(std::move(keys), [&, this](auto& keys) { + return trans_intr::do_for_each( + keys.begin(), + keys.end(), + [&, this](auto &p) { + return omap_set_key(omap_root, t, p.first, p.second); + }); + }); +} + +BtreeOMapManager::omap_set_key_ret +BtreeOMapManager::omap_set_key( + omap_root_t &omap_root, + Transaction &t, + const std::string &key, + const ceph::bufferlist &value) +{ + LOG_PREFIX(BtreeOMapManager::omap_set_key); + DEBUGT("{} -> {}", t, key, value); + return get_omap_root( + get_omap_context(t, omap_root.hint), + omap_root + ).si_then([this, &t, &key, &value, &omap_root](auto root) { + return root->insert(get_omap_context(t, omap_root.hint), key, value); + }).si_then([this, &omap_root, &t](auto mresult) -> omap_set_key_ret { + if (mresult.status == mutation_status_t::SUCCESS) + return seastar::now(); + else if (mresult.status == mutation_status_t::WAS_SPLIT) + return handle_root_split(get_omap_context(t, omap_root.hint), omap_root, mresult); + else + return seastar::now(); + }); +} + +BtreeOMapManager::omap_rm_key_ret +BtreeOMapManager::omap_rm_key( + omap_root_t &omap_root, + Transaction &t, + const std::string &key) +{ + LOG_PREFIX(BtreeOMapManager::omap_rm_key); + DEBUGT("{}", t, key); + return get_omap_root( + get_omap_context(t, omap_root.hint), + omap_root + ).si_then([this, &t, &key, &omap_root](auto root) { + return root->rm_key(get_omap_context(t, omap_root.hint), key); + }).si_then([this, &omap_root, &t](auto mresult) -> omap_rm_key_ret { + if (mresult.status == mutation_status_t::SUCCESS) { + 
return seastar::now(); + } else if (mresult.status == mutation_status_t::WAS_SPLIT) { + return handle_root_split(get_omap_context(t, omap_root.hint), omap_root, mresult); + } else if (mresult.status == mutation_status_t::NEED_MERGE) { + auto root = *(mresult.need_merge); + if (root->get_node_size() == 1 && omap_root.depth != 1) { + return handle_root_merge(get_omap_context(t, omap_root.hint), omap_root, mresult); + } else { + return seastar::now(); + } + } else { + return seastar::now(); + } + }); + +} + +BtreeOMapManager::omap_rm_key_range_ret +BtreeOMapManager::omap_rm_key_range( + omap_root_t &omap_root, + Transaction &t, + const std::string &first, + const std::string &last, + omap_list_config_t config) +{ + LOG_PREFIX(BtreeOMapManager::omap_rm_key_range); + DEBUGT("{} ~ {}", t, first, last); + assert(first <= last); + return seastar::do_with( + std::make_optional<std::string>(first), + std::make_optional<std::string>(last), + [this, &omap_root, &t, config](auto &first, auto &last) { + return omap_list( + omap_root, + t, + first, + last, + config); + }).si_then([this, &omap_root, &t](auto results) { + LOG_PREFIX(BtreeOMapManager::omap_rm_key_range); + auto &[complete, kvs] = results; + std::vector<std::string> keys; + for (const auto& [k, _] : kvs) { + keys.push_back(k); + } + DEBUGT("total {} keys to remove", t, keys.size()); + return seastar::do_with( + std::move(keys), + [this, &omap_root, &t](auto& keys) { + return trans_intr::do_for_each( + keys.begin(), + keys.end(), + [this, &omap_root, &t](auto& key) { + return omap_rm_key(omap_root, t, key); + }); + }); + }); +} + +BtreeOMapManager::omap_list_ret +BtreeOMapManager::omap_list( + const omap_root_t &omap_root, + Transaction &t, + const std::optional<std::string> &first, + const std::optional<std::string> &last, + omap_list_config_t config) +{ + LOG_PREFIX(BtreeOMapManager::omap_list); + if (first && last) { + DEBUGT("{}, first: {}, last: {}", t, omap_root, *first, *last); + assert(last >= first); + } else if (first) { + DEBUGT("{}, first: {}", t, omap_root, *first); + } else if (last) { + DEBUGT("{}, last: {}", t, omap_root, *last); + } else { + DEBUGT("{}", t, omap_root); + } + + return get_omap_root( + get_omap_context(t, omap_root.hint), + omap_root + ).si_then([this, config, &t, &first, &last, &omap_root](auto extent) { + return extent->list( + get_omap_context(t, omap_root.hint), + first, + last, + config); + }); +} + +BtreeOMapManager::omap_clear_ret +BtreeOMapManager::omap_clear( + omap_root_t &omap_root, + Transaction &t) +{ + LOG_PREFIX(BtreeOMapManager::omap_clear); + DEBUGT("{}", t, omap_root); + return get_omap_root( + get_omap_context(t, omap_root.hint), + omap_root + ).si_then([this, &t, &omap_root](auto extent) { + return extent->clear(get_omap_context(t, omap_root.hint)); + }).si_then([this, &omap_root, &t] { + return tm.dec_ref( + t, omap_root.get_location() + ).si_then([&omap_root] (auto ret) { + omap_root.update( + L_ADDR_NULL, + 0, L_ADDR_MIN); + return omap_clear_iertr::now(); + }); + }).handle_error_interruptible( + omap_clear_iertr::pass_further{}, + crimson::ct_error::assert_all{ + "Invalid error in BtreeOMapManager::omap_clear" + } + ); +} + +} diff --git a/src/crimson/os/seastore/omap_manager/btree/btree_omap_manager.h b/src/crimson/os/seastore/omap_manager/btree/btree_omap_manager.h new file mode 100644 index 000000000..7fcba64c0 --- /dev/null +++ b/src/crimson/os/seastore/omap_manager/btree/btree_omap_manager.h @@ -0,0 +1,111 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab + +#pragma once +#include <boost/intrusive_ptr.hpp> +#include <boost/smart_ptr/intrusive_ref_counter.hpp> +#include <seastar/core/future.hh> + +#include "include/ceph_assert.h" +#include "crimson/osd/exceptions.h" + +#include "crimson/os/seastore/omap_manager.h" +#include "crimson/os/seastore/omap_manager/btree/omap_btree_node.h" +#include "crimson/os/seastore/seastore_types.h" +#include "crimson/os/seastore/transaction_manager.h" + +namespace crimson::os::seastore::omap_manager { +/** + * BtreeOMapManager + * + * Uses a btree to track : + * string -> string mapping for each onode omap + */ + +class BtreeOMapManager : public OMapManager { + TransactionManager &tm; + + omap_context_t get_omap_context( + Transaction &t, laddr_t addr_min) { + return omap_context_t{tm, t, addr_min}; + } + + /* get_omap_root + * + * load omap tree root node + */ + using get_root_iertr = base_iertr; + using get_root_ret = get_root_iertr::future<OMapNodeRef>; + static get_root_ret get_omap_root( + omap_context_t c, + const omap_root_t &omap_root); + + /* handle_root_split + * + * root has been split and needs to update omap_root_t + */ + using handle_root_split_iertr = base_iertr; + using handle_root_split_ret = handle_root_split_iertr::future<>; + handle_root_split_ret handle_root_split( + omap_context_t c, + omap_root_t &omap_root, + const OMapNode::mutation_result_t& mresult); + + /* handle_root_merge + * + * root node has only one item and it is not leaf node, need remove a layer + */ + using handle_root_merge_iertr = base_iertr; + using handle_root_merge_ret = handle_root_merge_iertr::future<>; + handle_root_merge_ret handle_root_merge( + omap_context_t oc, + omap_root_t &omap_root, + OMapNode:: mutation_result_t mresult); + +public: + explicit BtreeOMapManager(TransactionManager &tm); + + initialize_omap_ret initialize_omap(Transaction &t, laddr_t hint) final; + + omap_get_value_ret omap_get_value( + const omap_root_t &omap_root, + Transaction &t, + const std::string &key) final; + + omap_set_key_ret omap_set_key( + omap_root_t &omap_root, + Transaction &t, + const std::string &key, const ceph::bufferlist &value) final; + + omap_set_keys_ret omap_set_keys( + omap_root_t &omap_root, + Transaction &t, + std::map<std::string, ceph::bufferlist>&& keys) final; + + omap_rm_key_ret omap_rm_key( + omap_root_t &omap_root, + Transaction &t, + const std::string &key) final; + + omap_rm_key_range_ret omap_rm_key_range( + omap_root_t &omap_root, + Transaction &t, + const std::string &first, + const std::string &last, + omap_list_config_t config) final; + + omap_list_ret omap_list( + const omap_root_t &omap_root, + Transaction &t, + const std::optional<std::string> &first, + const std::optional<std::string> &last, + omap_list_config_t config = omap_list_config_t()) final; + + omap_clear_ret omap_clear( + omap_root_t &omap_root, + Transaction &t) final; + +}; +using BtreeOMapManagerRef = std::unique_ptr<BtreeOMapManager>; + +} diff --git a/src/crimson/os/seastore/omap_manager/btree/omap_btree_node.h b/src/crimson/os/seastore/omap_manager/btree/omap_btree_node.h new file mode 100644 index 000000000..795daeddb --- /dev/null +++ b/src/crimson/os/seastore/omap_manager/btree/omap_btree_node.h @@ -0,0 +1,122 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#pragma once + +#include <string> +#include <vector> + +//#include <boost/iterator/counting_iterator.hpp> + +#include "crimson/common/log.h" +#include "crimson/os/seastore/seastore_types.h" 
+#include "crimson/os/seastore/transaction_manager.h" +#include "crimson/os/seastore/omap_manager.h" +#include "crimson/os/seastore/omap_manager/btree/omap_types.h" + +namespace crimson::os::seastore::omap_manager{ + +struct omap_context_t { + TransactionManager &tm; + Transaction &t; + laddr_t hint; +}; + +enum class mutation_status_t : uint8_t { + SUCCESS = 0, + WAS_SPLIT = 1, + NEED_MERGE = 2, + FAIL = 3 +}; + +struct OMapNode : LogicalCachedExtent { + using base_iertr = OMapManager::base_iertr; + + using OMapNodeRef = TCachedExtentRef<OMapNode>; + + struct mutation_result_t { + mutation_status_t status; + /// Only populated if WAS_SPLIT, indicates the newly created left and right nodes + /// from splitting the target entry during insertion. + std::optional<std::tuple<OMapNodeRef, OMapNodeRef, std::string>> split_tuple; + /// only sopulated if need merged, indicate which entry need be doing merge in upper layer. + std::optional<OMapNodeRef> need_merge; + + mutation_result_t(mutation_status_t s, std::optional<std::tuple<OMapNodeRef, + OMapNodeRef, std::string>> tuple, std::optional<OMapNodeRef> n_merge) + : status(s), + split_tuple(tuple), + need_merge(n_merge) {} + }; + + OMapNode(ceph::bufferptr &&ptr) : LogicalCachedExtent(std::move(ptr)) {} + OMapNode(const OMapNode &other) + : LogicalCachedExtent(other) {} + + using get_value_iertr = base_iertr; + using get_value_ret = OMapManager::omap_get_value_ret; + virtual get_value_ret get_value( + omap_context_t oc, + const std::string &key) = 0; + + using insert_iertr = base_iertr; + using insert_ret = insert_iertr::future<mutation_result_t>; + virtual insert_ret insert( + omap_context_t oc, + const std::string &key, + const ceph::bufferlist &value) = 0; + + using rm_key_iertr = base_iertr; + using rm_key_ret = rm_key_iertr::future<mutation_result_t>; + virtual rm_key_ret rm_key( + omap_context_t oc, + const std::string &key) = 0; + + using omap_list_config_t = OMapManager::omap_list_config_t; + using list_iertr = base_iertr; + using list_bare_ret = OMapManager::omap_list_bare_ret; + using list_ret = OMapManager::omap_list_ret; + virtual list_ret list( + omap_context_t oc, + const std::optional<std::string> &first, + const std::optional<std::string> &last, + omap_list_config_t config) = 0; + + using clear_iertr = base_iertr; + using clear_ret = clear_iertr::future<>; + virtual clear_ret clear(omap_context_t oc) = 0; + + using full_merge_iertr = base_iertr; + using full_merge_ret = full_merge_iertr::future<OMapNodeRef>; + virtual full_merge_ret make_full_merge( + omap_context_t oc, + OMapNodeRef right) = 0; + + using make_balanced_iertr = base_iertr; + using make_balanced_ret = make_balanced_iertr::future + <std::tuple<OMapNodeRef, OMapNodeRef, std::string>>; + virtual make_balanced_ret make_balanced( + omap_context_t oc, + OMapNodeRef _right) = 0; + + virtual omap_node_meta_t get_node_meta() const = 0; + virtual bool extent_will_overflow( + size_t ksize, + std::optional<size_t> vsize) const = 0; + virtual bool can_merge(OMapNodeRef right) const = 0; + virtual bool extent_is_below_min() const = 0; + virtual uint32_t get_node_size() = 0; + + virtual ~OMapNode() = default; +}; + +using OMapNodeRef = OMapNode::OMapNodeRef; + +using omap_load_extent_iertr = OMapNode::base_iertr; +omap_load_extent_iertr::future<OMapNodeRef> +omap_load_extent(omap_context_t oc, laddr_t laddr, depth_t depth); + +} + +#if FMT_VERSION >= 90000 +template <> struct fmt::formatter<crimson::os::seastore::omap_manager::OMapNode> : fmt::ostream_formatter {}; +#endif diff 
--git a/src/crimson/os/seastore/omap_manager/btree/omap_btree_node_impl.cc b/src/crimson/os/seastore/omap_manager/btree/omap_btree_node_impl.cc new file mode 100644 index 000000000..4db58414a --- /dev/null +++ b/src/crimson/os/seastore/omap_manager/btree/omap_btree_node_impl.cc @@ -0,0 +1,738 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <algorithm> +#include <string.h> +#include "include/buffer.h" +#include "include/byteorder.h" +#include "crimson/os/seastore/transaction_manager.h" +#include "crimson/os/seastore/omap_manager/btree/omap_btree_node.h" +#include "crimson/os/seastore/omap_manager/btree/omap_btree_node_impl.h" +#include "seastar/core/thread.hh" + +SET_SUBSYS(seastore_omap); + +namespace crimson::os::seastore::omap_manager { + +std::ostream &operator<<(std::ostream &out, const omap_inner_key_t &rhs) +{ + return out << "omap_inner_key (" << rhs.key_off<< " - " << rhs.key_len + << " - " << rhs.laddr << ")"; +} + +std::ostream &operator<<(std::ostream &out, const omap_leaf_key_t &rhs) +{ + return out << "omap_leaf_key_t (" << rhs.key_off<< " - " << rhs.key_len + << " - " << rhs.val_len << ")"; +} + +std::ostream &OMapInnerNode::print_detail_l(std::ostream &out) const +{ + return out << ", size=" << get_size() + << ", depth=" << get_meta().depth; +} + +using dec_ref_iertr = OMapInnerNode::base_iertr; +using dec_ref_ret = dec_ref_iertr::future<>; +template <typename T> +dec_ref_ret dec_ref(omap_context_t oc, T&& addr) { + return oc.tm.dec_ref(oc.t, std::forward<T>(addr)).handle_error_interruptible( + dec_ref_iertr::pass_further{}, + crimson::ct_error::assert_all{ + "Invalid error in OMapInnerNode helper dec_ref" + } + ).si_then([](auto &&e) {}); +} + +/** + * make_split_insert + * + * insert an entry at iter, with the address of key. + * will result in a split outcome encoded in the returned mutation_result_t + */ +OMapInnerNode::make_split_insert_ret +OMapInnerNode::make_split_insert( + omap_context_t oc, + internal_iterator_t iter, + std::string key, + laddr_t laddr) +{ + LOG_PREFIX(OMapInnerNode::make_split_insert); + DEBUGT("this: {}, key: {}", oc.t, *this, key); + return make_split_children(oc).si_then([=] (auto tuple) { + auto [left, right, pivot] = tuple; + if (pivot > key) { + auto liter = left->iter_idx(iter.get_index()); + left->journal_inner_insert(liter, laddr, key, + left->maybe_get_delta_buffer()); + } else { //right + auto riter = right->iter_idx(iter.get_index() - left->get_node_size()); + right->journal_inner_insert(riter, laddr, key, + right->maybe_get_delta_buffer()); + } + ++(oc.t.get_omap_tree_stats().extents_num_delta); + return make_split_insert_ret( + interruptible::ready_future_marker{}, + mutation_result_t(mutation_status_t::WAS_SPLIT, tuple, std::nullopt)); + }); + +} + + +OMapInnerNode::handle_split_ret +OMapInnerNode::handle_split( + omap_context_t oc, + internal_iterator_t iter, + mutation_result_t mresult) +{ + LOG_PREFIX(OMapInnerNode::handle_split); + DEBUGT("this: {}", oc.t, *this); + if (!is_mutable()) { + auto mut = oc.tm.get_mutable_extent(oc.t, this)->cast<OMapInnerNode>(); + auto mut_iter = mut->iter_idx(iter.get_index()); + return mut->handle_split(oc, mut_iter, mresult); + } + auto [left, right, pivot] = *(mresult.split_tuple); + //update operation will not cause node overflow, so we can do it first. 
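+ // (the left child reuses the existing slot; the pivot and right child are
+ // inserted after it, splitting this node first if that insert would
+ // overflow)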
+ journal_inner_update(iter, left->get_laddr(), maybe_get_delta_buffer()); + bool overflow = extent_will_overflow(pivot.size(), std::nullopt); + if (!overflow) { + journal_inner_insert(iter + 1, right->get_laddr(), pivot, + maybe_get_delta_buffer()); + return insert_ret( + interruptible::ready_future_marker{}, + mutation_result_t(mutation_status_t::SUCCESS, std::nullopt, std::nullopt)); + } else { + return make_split_insert(oc, iter + 1, pivot, right->get_laddr()) + .si_then([this, oc] (auto m_result) { + return dec_ref(oc, get_laddr()) + .si_then([m_result = std::move(m_result)] { + return insert_ret( + interruptible::ready_future_marker{}, + m_result); + }); + }); + } +} + +OMapInnerNode::get_value_ret +OMapInnerNode::get_value( + omap_context_t oc, + const std::string &key) +{ + LOG_PREFIX(OMapInnerNode::get_value); + DEBUGT("key = {}, this: {}", oc.t, key, *this); + auto child_pt = get_containing_child(key); + assert(child_pt != iter_cend()); + auto laddr = child_pt->get_val(); + return omap_load_extent(oc, laddr, get_meta().depth - 1).si_then( + [oc, &key] (auto extent) { + return extent->get_value(oc, key); + }).finally([ref = OMapNodeRef(this)] {}); +} + +OMapInnerNode::insert_ret +OMapInnerNode::insert( + omap_context_t oc, + const std::string &key, + const ceph::bufferlist &value) +{ + LOG_PREFIX(OMapInnerNode::insert); + DEBUGT("{}->{}, this: {}", oc.t, key, value, *this); + auto child_pt = get_containing_child(key); + assert(child_pt != iter_cend()); + auto laddr = child_pt->get_val(); + return omap_load_extent(oc, laddr, get_meta().depth - 1).si_then( + [oc, &key, &value] (auto extent) { + return extent->insert(oc, key, value); + }).si_then([this, oc, child_pt] (auto mresult) { + if (mresult.status == mutation_status_t::SUCCESS) { + return insert_iertr::make_ready_future<mutation_result_t>(mresult); + } else if (mresult.status == mutation_status_t::WAS_SPLIT) { + return handle_split(oc, child_pt, mresult); + } else { + return insert_ret( + interruptible::ready_future_marker{}, + mutation_result_t(mutation_status_t::SUCCESS, std::nullopt, std::nullopt)); + } + }); +} + +OMapInnerNode::rm_key_ret +OMapInnerNode::rm_key(omap_context_t oc, const std::string &key) +{ + LOG_PREFIX(OMapInnerNode::rm_key); + DEBUGT("key={}, this: {}", oc.t, key, *this); + auto child_pt = get_containing_child(key); + assert(child_pt != iter_cend()); + auto laddr = child_pt->get_val(); + return omap_load_extent(oc, laddr, get_meta().depth - 1).si_then( + [this, oc, &key, child_pt] (auto extent) { + return extent->rm_key(oc, key) + .si_then([this, oc, child_pt, extent = std::move(extent)] (auto mresult) { + switch (mresult.status) { + case mutation_status_t::SUCCESS: + case mutation_status_t::FAIL: + return rm_key_iertr::make_ready_future<mutation_result_t>(mresult); + case mutation_status_t::NEED_MERGE: { + if (get_node_size() >1) + return merge_entry(oc, child_pt, *(mresult.need_merge)); + else + return rm_key_ret( + interruptible::ready_future_marker{}, + mutation_result_t(mutation_status_t::SUCCESS, + std::nullopt, std::nullopt)); + } + case mutation_status_t::WAS_SPLIT: + return handle_split(oc, child_pt, mresult); + default: + return rm_key_iertr::make_ready_future<mutation_result_t>(mresult); + } + }); + }); +} + +OMapInnerNode::list_ret +OMapInnerNode::list( + omap_context_t oc, + const std::optional<std::string> &first, + const std::optional<std::string> &last, + omap_list_config_t config) +{ + LOG_PREFIX(OMapInnerNode::list); + if (first && last) { + DEBUGT("first: {}, last: {}, this: {}", oc.t, 
*first, *last, *this); + assert(*first <= *last); + } else if (first) { + DEBUGT("first: {}, this: {}", oc.t, *first, *this); + } else if (last) { + DEBUGT("last: {}, this: {}", oc.t, *last, *this); + } else { + DEBUGT("this: {}", oc.t, *this); + } + + auto first_iter = first ? + get_containing_child(*first) : + iter_cbegin(); + auto last_iter = last ? + get_containing_child(*last) + 1: + iter_cend(); + assert(first_iter != iter_cend()); + + return seastar::do_with( + first_iter, + last_iter, + iter_t(first_iter), + list_bare_ret(false, {}), + [this, &first, &last, oc, config]( + auto &fiter, + auto &liter, + auto &iter, + auto &ret) + { + auto &complete = std::get<0>(ret); + auto &result = std::get<1>(ret); + return trans_intr::repeat( + [&, config, oc, this]() -> list_iertr::future<seastar::stop_iteration> + { + if (iter == liter) { + complete = true; + return list_iertr::make_ready_future<seastar::stop_iteration>( + seastar::stop_iteration::yes); + } + assert(result.size() < config.max_result_size); + auto laddr = iter->get_val(); + return omap_load_extent( + oc, laddr, + get_meta().depth - 1 + ).si_then([&, config, oc](auto &&extent) { + return seastar::do_with( + iter == fiter ? first : std::optional<std::string>(std::nullopt), + iter == liter - 1 ? last : std::optional<std::string>(std::nullopt), + [&result, extent = std::move(extent), config, oc]( + auto &nfirst, + auto &nlast) { + return extent->list( + oc, + nfirst, + nlast, + config.with_reduced_max(result.size())); + }).si_then([&, config](auto &&child_ret) mutable { + boost::ignore_unused(config); // avoid clang warning; + auto &[child_complete, child_result] = child_ret; + if (result.size() && child_result.size()) { + assert(child_result.begin()->first > result.rbegin()->first); + } + if (child_result.size() && first && iter == fiter) { + if (config.first_inclusive) { + assert(child_result.begin()->first >= *first); + } else { + assert(child_result.begin()->first > *first); + } + } + if (child_result.size() && last && iter == liter - 1) { + auto biter = --(child_result.end()); + if (config.last_inclusive) { + assert(biter->first <= *last); + } else { + assert(biter->first < *last); + } + } + result.merge(std::move(child_result)); + if (result.size() == config.max_result_size) { + return list_iertr::make_ready_future<seastar::stop_iteration>( + seastar::stop_iteration::yes); + } + ++iter; + assert(child_complete); + return list_iertr::make_ready_future<seastar::stop_iteration>( + seastar::stop_iteration::no); + }); + }); + }).si_then([&ret, ref = OMapNodeRef(this)] { + return list_iertr::make_ready_future<list_bare_ret>(std::move(ret)); + }); + }); +} + +OMapInnerNode::clear_ret +OMapInnerNode::clear(omap_context_t oc) +{ + LOG_PREFIX(OMapInnerNode::clear); + DEBUGT("this: {}", oc.t, *this); + return trans_intr::do_for_each(iter_begin(), iter_end(), + [oc, this](auto iter) { + auto laddr = iter->get_val(); + auto ndepth = get_meta().depth - 1; + if (ndepth > 1) { + return omap_load_extent(oc, laddr, ndepth + ).si_then([oc](auto &&extent) { + return extent->clear(oc); + }).si_then([oc, laddr] { + return dec_ref(oc, laddr); + }).si_then([ref = OMapNodeRef(this)] { + return clear_iertr::now(); + }); + } else { + assert(ndepth == 1); + return dec_ref(oc, laddr + ).si_then([ref = OMapNodeRef(this)] { + return clear_iertr::now(); + }); + } + }); +} + +OMapInnerNode::split_children_ret +OMapInnerNode:: make_split_children(omap_context_t oc) +{ + LOG_PREFIX(OMapInnerNode::make_split_children); + DEBUGT("this: {}", oc.t, *this); + 
return oc.tm.alloc_extents<OMapInnerNode>(oc.t, oc.hint, + OMAP_INNER_BLOCK_SIZE, 2) + .si_then([this, oc] (auto &&ext_pair) { + LOG_PREFIX(OMapInnerNode::make_split_children); + auto left = ext_pair.front(); + auto right = ext_pair.back(); + DEBUGT("this: {}, split into: l {} r {}", oc.t, *this, *left, *right); + return split_children_ret( + interruptible::ready_future_marker{}, + std::make_tuple(left, right, split_into(*left, *right))); + }); +} + +OMapInnerNode::full_merge_ret +OMapInnerNode::make_full_merge(omap_context_t oc, OMapNodeRef right) +{ + LOG_PREFIX(OMapInnerNode::make_full_merge); + DEBUGT("", oc.t); + return oc.tm.alloc_extent<OMapInnerNode>(oc.t, oc.hint, + OMAP_INNER_BLOCK_SIZE) + .si_then([this, right] (auto &&replacement) { + replacement->merge_from(*this, *right->cast<OMapInnerNode>()); + return full_merge_ret( + interruptible::ready_future_marker{}, + std::move(replacement)); + }); +} + +OMapInnerNode::make_balanced_ret +OMapInnerNode::make_balanced(omap_context_t oc, OMapNodeRef _right) +{ + LOG_PREFIX(OMapInnerNode::make_balanced); + DEBUGT("l: {}, r: {}", oc.t, *this, *_right); + ceph_assert(_right->get_type() == TYPE); + return oc.tm.alloc_extents<OMapInnerNode>(oc.t, oc.hint, + OMAP_INNER_BLOCK_SIZE, 2) + .si_then([this, _right] (auto &&replacement_pair){ + auto replacement_left = replacement_pair.front(); + auto replacement_right = replacement_pair.back(); + auto &right = *_right->cast<OMapInnerNode>(); + return make_balanced_ret( + interruptible::ready_future_marker{}, + std::make_tuple(replacement_left, replacement_right, + balance_into_new_nodes(*this, right, + *replacement_left, *replacement_right))); + }); +} + +OMapInnerNode::merge_entry_ret +OMapInnerNode::merge_entry( + omap_context_t oc, + internal_iterator_t iter, + OMapNodeRef entry) +{ + LOG_PREFIX(OMapInnerNode::merge_entry); + DEBUGT("{}, parent: {}", oc.t, *entry, *this); + if (!is_mutable()) { + auto mut = oc.tm.get_mutable_extent(oc.t, this)->cast<OMapInnerNode>(); + auto mut_iter = mut->iter_idx(iter->get_index()); + return mut->merge_entry(oc, mut_iter, entry); + } + auto is_left = (iter + 1) == iter_cend(); + auto donor_iter = is_left ? iter - 1 : iter + 1; + return omap_load_extent(oc, donor_iter->get_val(), get_meta().depth - 1 + ).si_then([=, this](auto &&donor) mutable { + LOG_PREFIX(OMapInnerNode::merge_entry); + auto [l, r] = is_left ? + std::make_pair(donor, entry) : std::make_pair(entry, donor); + auto [liter, riter] = is_left ? 
+ std::make_pair(donor_iter, iter) : std::make_pair(iter, donor_iter); + if (l->can_merge(r)) { + DEBUGT("make_full_merge l {} r {}", oc.t, *l, *r); + assert(entry->extent_is_below_min()); + return l->make_full_merge(oc, r + ).si_then([liter=liter, riter=riter, l=l, r=r, oc, this] + (auto &&replacement) { + LOG_PREFIX(OMapInnerNode::merge_entry); + DEBUGT("to update parent: {}", oc.t, *this); + journal_inner_update( + liter, + replacement->get_laddr(), + maybe_get_delta_buffer()); + journal_inner_remove(riter, maybe_get_delta_buffer()); + //retire extent + std::vector<laddr_t> dec_laddrs {l->get_laddr(), r->get_laddr()}; + return dec_ref(oc, dec_laddrs + ).si_then([this, oc] { + --(oc.t.get_omap_tree_stats().extents_num_delta); + if (extent_is_below_min()) { + return merge_entry_ret( + interruptible::ready_future_marker{}, + mutation_result_t(mutation_status_t::NEED_MERGE, + std::nullopt, this)); + } else { + return merge_entry_ret( + interruptible::ready_future_marker{}, + mutation_result_t(mutation_status_t::SUCCESS, + std::nullopt, std::nullopt)); + } + }); + }); + } else { + DEBUGT("balanced l {} r {}", oc.t, *l, *r); + return l->make_balanced(oc, r + ).si_then([liter=liter, riter=riter, l=l, r=r, oc, this](auto tuple) { + LOG_PREFIX(OMapInnerNode::merge_entry); + DEBUGT("to update parent: {}", oc.t, *this); + auto [replacement_l, replacement_r, replacement_pivot] = tuple; + //update operation will not cause node overflow, so we can do it first + journal_inner_update( + liter, + replacement_l->get_laddr(), + maybe_get_delta_buffer()); + bool overflow = extent_will_overflow(replacement_pivot.size(), + std::nullopt); + if (!overflow) { + journal_inner_remove(riter, maybe_get_delta_buffer()); + journal_inner_insert( + riter, + replacement_r->get_laddr(), + replacement_pivot, + maybe_get_delta_buffer()); + std::vector<laddr_t> dec_laddrs{l->get_laddr(), r->get_laddr()}; + return dec_ref(oc, dec_laddrs + ).si_then([] { + return merge_entry_ret( + interruptible::ready_future_marker{}, + mutation_result_t(mutation_status_t::SUCCESS, + std::nullopt, std::nullopt)); + }); + } else { + DEBUGT("balanced and split l {} r {}", oc.t, *l, *r); + //use remove and insert instead of replace, + //remove operation will not cause node split, so we can do it first + journal_inner_remove(riter, maybe_get_delta_buffer()); + return make_split_insert(oc, riter, replacement_pivot, + replacement_r->get_laddr() + ).si_then([this, oc, l = l, r = r](auto mresult) { + std::vector<laddr_t> dec_laddrs{ + l->get_laddr(), + r->get_laddr(), + get_laddr()}; + return dec_ref(oc, dec_laddrs + ).si_then([mresult = std::move(mresult)] { + return merge_entry_ret( + interruptible::ready_future_marker{}, mresult); + }); + }); + } + }); + } + }); + +} + +OMapInnerNode::internal_iterator_t +OMapInnerNode::get_containing_child(const std::string &key) +{ + auto iter = std::find_if(iter_begin(), iter_end(), + [&key](auto it) { return it.contains(key); }); + return iter; +} + +std::ostream &OMapLeafNode::print_detail_l(std::ostream &out) const +{ + return out << ", size=" << get_size() + << ", depth=" << get_meta().depth; +} + +OMapLeafNode::get_value_ret +OMapLeafNode::get_value(omap_context_t oc, const std::string &key) +{ + LOG_PREFIX(OMapLeafNode::get_value); + DEBUGT("key = {}, this: {}", oc.t, key, *this); + auto ite = find_string_key(key); + if (ite != iter_end()) { + auto value = ite->get_val(); + return get_value_ret( + interruptible::ready_future_marker{}, + value); + } else { + return get_value_ret( + 
interruptible::ready_future_marker{}, + std::nullopt); + } +} + +OMapLeafNode::insert_ret +OMapLeafNode::insert( + omap_context_t oc, + const std::string &key, + const ceph::bufferlist &value) +{ + LOG_PREFIX(OMapLeafNode::insert); + DEBUGT("{} -> {}, this: {}", oc.t, key, value, *this); + bool overflow = extent_will_overflow(key.size(), value.length()); + if (!overflow) { + if (!is_mutable()) { + auto mut = oc.tm.get_mutable_extent(oc.t, this)->cast<OMapLeafNode>(); + return mut->insert(oc, key, value); + } + auto replace_pt = find_string_key(key); + if (replace_pt != iter_end()) { + ++(oc.t.get_omap_tree_stats().num_updates); + journal_leaf_update(replace_pt, key, value, maybe_get_delta_buffer()); + } else { + ++(oc.t.get_omap_tree_stats().num_inserts); + auto insert_pt = string_lower_bound(key); + journal_leaf_insert(insert_pt, key, value, maybe_get_delta_buffer()); + + DEBUGT("inserted {}, this: {}", oc.t, insert_pt.get_key(), *this); + } + return insert_ret( + interruptible::ready_future_marker{}, + mutation_result_t(mutation_status_t::SUCCESS, std::nullopt, std::nullopt)); + } else { + return make_split_children(oc).si_then([this, oc, &key, &value] (auto tuple) { + auto [left, right, pivot] = tuple; + auto replace_pt = find_string_key(key); + if (replace_pt != iter_end()) { + ++(oc.t.get_omap_tree_stats().num_updates); + if (key < pivot) { //left + auto mut_iter = left->iter_idx(replace_pt->get_index()); + left->journal_leaf_update(mut_iter, key, value, left->maybe_get_delta_buffer()); + } else if (key >= pivot) { //right + auto mut_iter = right->iter_idx(replace_pt->get_index() - left->get_node_size()); + right->journal_leaf_update(mut_iter, key, value, right->maybe_get_delta_buffer()); + } + } else { + ++(oc.t.get_omap_tree_stats().num_inserts); + auto insert_pt = string_lower_bound(key); + if (key < pivot) { //left + auto mut_iter = left->iter_idx(insert_pt->get_index()); + left->journal_leaf_insert(mut_iter, key, value, left->maybe_get_delta_buffer()); + } else { + auto mut_iter = right->iter_idx(insert_pt->get_index() - left->get_node_size()); + right->journal_leaf_insert(mut_iter, key, value, right->maybe_get_delta_buffer()); + } + } + ++(oc.t.get_omap_tree_stats().extents_num_delta); + return dec_ref(oc, get_laddr()) + .si_then([tuple = std::move(tuple)] { + return insert_ret( + interruptible::ready_future_marker{}, + mutation_result_t(mutation_status_t::WAS_SPLIT, tuple, std::nullopt)); + }); + }); + } +} + +OMapLeafNode::rm_key_ret +OMapLeafNode::rm_key(omap_context_t oc, const std::string &key) +{ + LOG_PREFIX(OMapLeafNode::rm_key); + DEBUGT("{}, this: {}", oc.t, key, *this); + auto rm_pt = find_string_key(key); + if (!is_mutable() && rm_pt != iter_end()) { + auto mut = oc.tm.get_mutable_extent(oc.t, this)->cast<OMapLeafNode>(); + return mut->rm_key(oc, key); + } + + if (rm_pt != iter_end()) { + ++(oc.t.get_omap_tree_stats().num_erases); + journal_leaf_remove(rm_pt, maybe_get_delta_buffer()); + if (extent_is_below_min()) { + return rm_key_ret( + interruptible::ready_future_marker{}, + mutation_result_t(mutation_status_t::NEED_MERGE, std::nullopt, + this->cast<OMapNode>())); + } else { + return rm_key_ret( + interruptible::ready_future_marker{}, + mutation_result_t(mutation_status_t::SUCCESS, std::nullopt, std::nullopt)); + } + } else { + return rm_key_ret( + interruptible::ready_future_marker{}, + mutation_result_t(mutation_status_t::FAIL, std::nullopt, std::nullopt)); + } + +} + +OMapLeafNode::list_ret +OMapLeafNode::list( + omap_context_t oc, + const 
std::optional<std::string> &first, + const std::optional<std::string> &last, + omap_list_config_t config) +{ + LOG_PREFIX(OMapLeafNode::list); + DEBUGT( + "first {} last {} max_result_size {} first_inclusive {} \ + last_inclusive {}, this: {}", + oc.t, + first ? first->c_str() : "", + last ? last->c_str() : "", + config.max_result_size, + config.first_inclusive, + config.last_inclusive, + *this + ); + auto ret = list_bare_ret(false, {}); + auto &[complete, result] = ret; + auto iter = first ? + (config.first_inclusive ? + string_lower_bound(*first) : + string_upper_bound(*first)) : + iter_begin(); + auto liter = last ? + (config.last_inclusive ? + string_upper_bound(*last) : + string_lower_bound(*last)) : + iter_end(); + + for (; iter != liter && result.size() < config.max_result_size; iter++) { + result.emplace(std::make_pair(iter->get_key(), iter->get_val())); + } + + complete = (iter == liter); + + return list_iertr::make_ready_future<list_bare_ret>( + std::move(ret)); +} + +OMapLeafNode::clear_ret +OMapLeafNode::clear(omap_context_t oc) +{ + return clear_iertr::now(); +} + +OMapLeafNode::split_children_ret +OMapLeafNode::make_split_children(omap_context_t oc) +{ + LOG_PREFIX(OMapLeafNode::make_split_children); + DEBUGT("this: {}", oc.t, *this); + return oc.tm.alloc_extents<OMapLeafNode>(oc.t, oc.hint, OMAP_LEAF_BLOCK_SIZE, 2) + .si_then([this] (auto &&ext_pair) { + auto left = ext_pair.front(); + auto right = ext_pair.back(); + return split_children_ret( + interruptible::ready_future_marker{}, + std::make_tuple(left, right, split_into(*left, *right))); + }); +} + +OMapLeafNode::full_merge_ret +OMapLeafNode::make_full_merge(omap_context_t oc, OMapNodeRef right) +{ + ceph_assert(right->get_type() == TYPE); + LOG_PREFIX(OMapLeafNode::make_full_merge); + DEBUGT("this: {}", oc.t, *this); + return oc.tm.alloc_extent<OMapLeafNode>(oc.t, oc.hint, OMAP_LEAF_BLOCK_SIZE) + .si_then([this, right] (auto &&replacement) { + replacement->merge_from(*this, *right->cast<OMapLeafNode>()); + return full_merge_ret( + interruptible::ready_future_marker{}, + std::move(replacement)); + }); +} + +OMapLeafNode::make_balanced_ret +OMapLeafNode::make_balanced(omap_context_t oc, OMapNodeRef _right) +{ + ceph_assert(_right->get_type() == TYPE); + LOG_PREFIX(OMapLeafNode::make_balanced); + DEBUGT("this: {}", oc.t, *this); + return oc.tm.alloc_extents<OMapLeafNode>(oc.t, oc.hint, OMAP_LEAF_BLOCK_SIZE, 2) + .si_then([this, _right] (auto &&replacement_pair) { + auto replacement_left = replacement_pair.front(); + auto replacement_right = replacement_pair.back(); + auto &right = *_right->cast<OMapLeafNode>(); + return make_balanced_ret( + interruptible::ready_future_marker{}, + std::make_tuple( + replacement_left, replacement_right, + balance_into_new_nodes( + *this, right, + *replacement_left, *replacement_right))); + }); +} + + +omap_load_extent_iertr::future<OMapNodeRef> +omap_load_extent(omap_context_t oc, laddr_t laddr, depth_t depth) +{ + ceph_assert(depth > 0); + if (depth > 1) { + return oc.tm.read_extent<OMapInnerNode>(oc.t, laddr, + OMAP_INNER_BLOCK_SIZE) + .handle_error_interruptible( + omap_load_extent_iertr::pass_further{}, + crimson::ct_error::assert_all{ "Invalid error in omap_load_extent" } + ).si_then( + [](auto&& e) { + return seastar::make_ready_future<OMapNodeRef>(std::move(e)); + }); + } else { + return oc.tm.read_extent<OMapLeafNode>(oc.t, laddr, OMAP_LEAF_BLOCK_SIZE + ).handle_error_interruptible( + omap_load_extent_iertr::pass_further{}, + crimson::ct_error::assert_all{ "Invalid error in 
omap_load_extent" } + ).si_then( + [](auto&& e) { + return seastar::make_ready_future<OMapNodeRef>(std::move(e)); + }); + } +} +} diff --git a/src/crimson/os/seastore/omap_manager/btree/omap_btree_node_impl.h b/src/crimson/os/seastore/omap_manager/btree/omap_btree_node_impl.h new file mode 100644 index 000000000..a2b51bbb0 --- /dev/null +++ b/src/crimson/os/seastore/omap_manager/btree/omap_btree_node_impl.h @@ -0,0 +1,250 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <string.h> + +#include "include/buffer.h" + +#include "crimson/common/errorator.h" +#include "crimson/os/seastore/omap_manager.h" +#include "crimson/os/seastore/seastore_types.h" +#include "crimson/os/seastore/omap_manager/btree/string_kv_node_layout.h" +#include "crimson/os/seastore/omap_manager/btree/omap_types.h" +#include "crimson/os/seastore/omap_manager/btree/omap_btree_node.h" + +namespace crimson::os::seastore::omap_manager { + +/** + * OMapInnerNode + * + * Abstracts operations on and layout of internal nodes for the + * omap Tree. + * + * Layout (4k): + * num_entries: meta : keys : values : + */ + +struct OMapInnerNode + : OMapNode, + StringKVInnerNodeLayout { + using OMapInnerNodeRef = TCachedExtentRef<OMapInnerNode>; + using internal_iterator_t = const_iterator; + template <typename... T> + OMapInnerNode(T&&... t) : + OMapNode(std::forward<T>(t)...), + StringKVInnerNodeLayout(get_bptr().c_str()) {} + + omap_node_meta_t get_node_meta() const final { return get_meta(); } + bool extent_will_overflow(size_t ksize, std::optional<size_t> vsize) const { + return is_overflow(ksize); + } + bool can_merge(OMapNodeRef right) const { + return !is_overflow(*right->cast<OMapInnerNode>()); + } + bool extent_is_below_min() const { return below_min(); } + uint32_t get_node_size() { return get_size(); } + + CachedExtentRef duplicate_for_write(Transaction&) final { + assert(delta_buffer.empty()); + return CachedExtentRef(new OMapInnerNode(*this)); + } + + delta_inner_buffer_t delta_buffer; + delta_inner_buffer_t *maybe_get_delta_buffer() { + return is_mutation_pending() ? 
&delta_buffer : nullptr; + } + + get_value_ret get_value(omap_context_t oc, const std::string &key) final; + + insert_ret insert( + omap_context_t oc, + const std::string &key, + const ceph::bufferlist &value) final; + + rm_key_ret rm_key( + omap_context_t oc, + const std::string &key) final; + + list_ret list( + omap_context_t oc, + const std::optional<std::string> &first, + const std::optional<std::string> &last, + omap_list_config_t config) final; + + clear_ret clear(omap_context_t oc) final; + + using split_children_iertr = base_iertr; + using split_children_ret = split_children_iertr::future + <std::tuple<OMapInnerNodeRef, OMapInnerNodeRef, std::string>>; + split_children_ret make_split_children(omap_context_t oc); + + full_merge_ret make_full_merge( + omap_context_t oc, OMapNodeRef right) final; + + make_balanced_ret make_balanced( + omap_context_t oc, OMapNodeRef right) final; + + using make_split_insert_iertr = base_iertr; + using make_split_insert_ret = make_split_insert_iertr::future<mutation_result_t>; + make_split_insert_ret make_split_insert( + omap_context_t oc, internal_iterator_t iter, + std::string key, laddr_t laddr); + + using merge_entry_iertr = base_iertr; + using merge_entry_ret = merge_entry_iertr::future<mutation_result_t>; + merge_entry_ret merge_entry( + omap_context_t oc, + internal_iterator_t iter, OMapNodeRef entry); + + using handle_split_iertr = base_iertr; + using handle_split_ret = handle_split_iertr::future<mutation_result_t>; + handle_split_ret handle_split( + omap_context_t oc, internal_iterator_t iter, + mutation_result_t mresult); + + std::ostream &print_detail_l(std::ostream &out) const final; + + static constexpr extent_types_t TYPE = extent_types_t::OMAP_INNER; + extent_types_t get_type() const final { + return TYPE; + } + + ceph::bufferlist get_delta() final { + ceph::bufferlist bl; + if (!delta_buffer.empty()) { + encode(delta_buffer, bl); + delta_buffer.clear(); + } + return bl; + } + + void apply_delta(const ceph::bufferlist &bl) final { + assert(bl.length()); + delta_inner_buffer_t buffer; + auto bptr = bl.cbegin(); + decode(buffer, bptr); + buffer.replay(*this); + } + + internal_iterator_t get_containing_child(const std::string &key); +}; +using OMapInnerNodeRef = OMapInnerNode::OMapInnerNodeRef; + +/** + * OMapLeafNode + * + * Abstracts operations on and layout of leaf nodes for the + * OMap Tree. + * + * Layout (4k): + * num_entries: meta : keys : values : + */ + +struct OMapLeafNode + : OMapNode, + StringKVLeafNodeLayout { + + using OMapLeafNodeRef = TCachedExtentRef<OMapLeafNode>; + using internal_iterator_t = const_iterator; + template <typename... T> + OMapLeafNode(T&&... t) : + OMapNode(std::forward<T>(t)...), + StringKVLeafNodeLayout(get_bptr().c_str()) {} + + omap_node_meta_t get_node_meta() const final { return get_meta(); } + bool extent_will_overflow( + size_t ksize, std::optional<size_t> vsize) const { + return is_overflow(ksize, *vsize); + } + bool can_merge(OMapNodeRef right) const { + return !is_overflow(*right->cast<OMapLeafNode>()); + } + bool extent_is_below_min() const { return below_min(); } + uint32_t get_node_size() { return get_size(); } + + CachedExtentRef duplicate_for_write(Transaction&) final { + assert(delta_buffer.empty()); + return CachedExtentRef(new OMapLeafNode(*this)); + } + + delta_leaf_buffer_t delta_buffer; + delta_leaf_buffer_t *maybe_get_delta_buffer() { + return is_mutation_pending() ? 
&delta_buffer : nullptr; + } + + get_value_ret get_value( + omap_context_t oc, const std::string &key) final; + + insert_ret insert( + omap_context_t oc, + const std::string &key, + const ceph::bufferlist &value) final; + + rm_key_ret rm_key( + omap_context_t oc, const std::string &key) final; + + list_ret list( + omap_context_t oc, + const std::optional<std::string> &first, + const std::optional<std::string> &last, + omap_list_config_t config) final; + + clear_ret clear( + omap_context_t oc) final; + + using split_children_iertr = base_iertr; + using split_children_ret = split_children_iertr::future + <std::tuple<OMapLeafNodeRef, OMapLeafNodeRef, std::string>>; + split_children_ret make_split_children( + omap_context_t oc); + + full_merge_ret make_full_merge( + omap_context_t oc, + OMapNodeRef right) final; + + make_balanced_ret make_balanced( + omap_context_t oc, + OMapNodeRef _right) final; + + static constexpr extent_types_t TYPE = extent_types_t::OMAP_LEAF; + extent_types_t get_type() const final { + return TYPE; + } + + ceph::bufferlist get_delta() final { + ceph::bufferlist bl; + if (!delta_buffer.empty()) { + encode(delta_buffer, bl); + delta_buffer.clear(); + } + return bl; + } + + void apply_delta(const ceph::bufferlist &_bl) final { + assert(_bl.length()); + ceph::bufferlist bl = _bl; + bl.rebuild(); + delta_leaf_buffer_t buffer; + auto bptr = bl.cbegin(); + decode(buffer, bptr); + buffer.replay(*this); + } + + std::ostream &print_detail_l(std::ostream &out) const final; + + std::pair<internal_iterator_t, internal_iterator_t> + get_leaf_entries(std::string &key); + +}; +using OMapLeafNodeRef = OMapLeafNode::OMapLeafNodeRef; + +std::ostream &operator<<(std::ostream &out, const omap_inner_key_t &rhs); +std::ostream &operator<<(std::ostream &out, const omap_leaf_key_t &rhs); +} + +#if FMT_VERSION >= 90000 +template <> struct fmt::formatter<crimson::os::seastore::omap_manager::OMapInnerNode> : fmt::ostream_formatter {}; +template <> struct fmt::formatter<crimson::os::seastore::omap_manager::OMapLeafNode> : fmt::ostream_formatter {}; +#endif diff --git a/src/crimson/os/seastore/omap_manager/btree/omap_types.h b/src/crimson/os/seastore/omap_manager/btree/omap_types.h new file mode 100644 index 000000000..9e0d10e03 --- /dev/null +++ b/src/crimson/os/seastore/omap_manager/btree/omap_types.h @@ -0,0 +1,157 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once +#include "crimson/os/seastore/seastore_types.h" + +namespace crimson::os::seastore::omap_manager { + +struct omap_node_meta_t { + depth_t depth = 0; + + std::pair<omap_node_meta_t, omap_node_meta_t> split_into() const { + return std::make_pair( + omap_node_meta_t{depth}, + omap_node_meta_t{depth}); + } + + static omap_node_meta_t merge_from( + const omap_node_meta_t &lhs, const omap_node_meta_t &rhs) { + assert(lhs.depth == rhs.depth); + return omap_node_meta_t{lhs.depth}; + } + + static std::pair<omap_node_meta_t, omap_node_meta_t> + rebalance(const omap_node_meta_t &lhs, const omap_node_meta_t &rhs) { + assert(lhs.depth == rhs.depth); + return std::make_pair( + omap_node_meta_t{lhs.depth}, + omap_node_meta_t{lhs.depth}); + } +}; + +struct omap_node_meta_le_t { + depth_le_t depth = init_depth_le(0); + + omap_node_meta_le_t() = default; + omap_node_meta_le_t(const omap_node_meta_le_t &) = default; + explicit omap_node_meta_le_t(const omap_node_meta_t &val) + : depth(init_depth_le(val.depth)) {} + + operator omap_node_meta_t() const { + return omap_node_meta_t{ depth 
}; + } +}; + +struct omap_inner_key_t { + uint16_t key_off = 0; + uint16_t key_len = 0; + laddr_t laddr = 0; + + omap_inner_key_t() = default; + omap_inner_key_t(uint16_t off, uint16_t len, laddr_t addr) + : key_off(off), key_len(len), laddr(addr) {} + + inline bool operator==(const omap_inner_key_t b) const { + return key_off == b.key_off && key_len == b.key_len && laddr == b.laddr; + } + inline bool operator!=(const omap_inner_key_t b) const { + return key_off != b.key_off || key_len != b.key_len || laddr != b.laddr; + } + DENC(omap_inner_key_t, v, p) { + DENC_START(1, 1, p); + denc(v.key_off, p); + denc(v.key_len, p); + denc(v.laddr, p); + DENC_FINISH(p); + } +}; + +struct omap_inner_key_le_t { + ceph_le16 key_off{0}; + ceph_le16 key_len{0}; + laddr_le_t laddr{0}; + + omap_inner_key_le_t() = default; + omap_inner_key_le_t(const omap_inner_key_le_t &) = default; + explicit omap_inner_key_le_t(const omap_inner_key_t &key) + : key_off(key.key_off), + key_len(key.key_len), + laddr(key.laddr) {} + + operator omap_inner_key_t() const { + return omap_inner_key_t{uint16_t(key_off), uint16_t(key_len), laddr_t(laddr)}; + } + + omap_inner_key_le_t& operator=(omap_inner_key_t key) { + key_off = key.key_off; + key_len = key.key_len; + laddr = laddr_le_t(key.laddr); + return *this; + } + + inline bool operator==(const omap_inner_key_le_t b) const { + return key_off == b.key_off && key_len == b.key_len && laddr == b.laddr; + } +}; + +struct omap_leaf_key_t { + uint16_t key_off = 0; + uint16_t key_len = 0; + uint16_t val_len = 0; + + omap_leaf_key_t() = default; + omap_leaf_key_t(uint16_t k_off, uint16_t k_len, uint16_t v_len) + : key_off(k_off), key_len(k_len), val_len(v_len) {} + + inline bool operator==(const omap_leaf_key_t b) const { + return key_off == b.key_off && key_len == b.key_len && + val_len == b.val_len; + } + inline bool operator!=(const omap_leaf_key_t b) const { + return key_off != b.key_off || key_len != b.key_len || + val_len != b.val_len; + } + + DENC(omap_leaf_key_t, v, p) { + DENC_START(1, 1, p); + denc(v.key_off, p); + denc(v.key_len, p); + denc(v.val_len, p); + DENC_FINISH(p); + } +}; + +struct omap_leaf_key_le_t { + ceph_le16 key_off{0}; + ceph_le16 key_len{0}; + ceph_le16 val_len{0}; + + omap_leaf_key_le_t() = default; + omap_leaf_key_le_t(const omap_leaf_key_le_t &) = default; + explicit omap_leaf_key_le_t(const omap_leaf_key_t &key) + : key_off(key.key_off), + key_len(key.key_len), + val_len(key.val_len) {} + + operator omap_leaf_key_t() const { + return omap_leaf_key_t{uint16_t(key_off), uint16_t(key_len), + uint16_t(val_len)}; + } + + omap_leaf_key_le_t& operator=(omap_leaf_key_t key) { + key_off = key.key_off; + key_len = key.key_len; + val_len = key.val_len; + return *this; + } + + inline bool operator==(const omap_leaf_key_le_t b) const { + return key_off == b.key_off && key_len == b.key_len && + val_len == b.val_len; + } +}; + +} +WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::omap_manager::omap_inner_key_t) +WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::omap_manager::omap_leaf_key_t) diff --git a/src/crimson/os/seastore/omap_manager/btree/string_kv_node_layout.h b/src/crimson/os/seastore/omap_manager/btree/string_kv_node_layout.h new file mode 100644 index 000000000..72b13fedf --- /dev/null +++ b/src/crimson/os/seastore/omap_manager/btree/string_kv_node_layout.h @@ -0,0 +1,1550 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <iostream> +#include <string> + +#include 
"include/byteorder.h" +#include "include/denc.h" +#include "include/encoding.h" + +#include "crimson/common/layout.h" +#include "crimson/common/fixed_kv_node_layout.h" +#include "crimson/os/seastore/omap_manager.h" +#include "crimson/os/seastore/omap_manager/btree/omap_types.h" + +namespace crimson::os::seastore::omap_manager { +class StringKVInnerNodeLayout; +class StringKVLeafNodeLayout; + +/** + * copy_from_foreign + * + * Copy from another node entries to this node. + * [from_src, to_src) is another node entry range. + * tgt is this node entry to copy to. + * tgt and from_src must be from different nodes. + * from_src and to_src must be in the same node. + */ +template <typename iterator, typename const_iterator> +static void copy_from_foreign( + iterator tgt, + const_iterator from_src, + const_iterator to_src) { + assert(tgt->node != from_src->node); + assert(to_src->node == from_src->node); + if (from_src == to_src) + return; + + auto to_copy = from_src->get_right_ptr_end() - to_src->get_right_ptr_end(); + assert(to_copy > 0); + memcpy( + tgt->get_right_ptr_end() - to_copy, + to_src->get_right_ptr_end(), + to_copy); + memcpy( + tgt->get_node_key_ptr(), + from_src->get_node_key_ptr(), + to_src->get_node_key_ptr() - from_src->get_node_key_ptr()); + + auto offset_diff = tgt->get_right_offset_end() - from_src->get_right_offset_end(); + for (auto i = tgt; i != tgt + (to_src - from_src); ++i) { + i->update_offset(offset_diff); + } +} + +/** + * copy_from_local + * + * Copies entries from [from_src, to_src) to tgt. + * tgt, from_src, and to_src must be from the same node. + */ +template <typename iterator> +static void copy_from_local( + unsigned len, + iterator tgt, + iterator from_src, + iterator to_src) { + assert(tgt->node == from_src->node); + assert(to_src->node == from_src->node); + + auto to_copy = from_src->get_right_ptr_end() - to_src->get_right_ptr_end(); + assert(to_copy > 0); + int adjust_offset = tgt > from_src? 
-len : len; + memmove(to_src->get_right_ptr_end() + adjust_offset, + to_src->get_right_ptr_end(), + to_copy); + + for ( auto ite = from_src; ite < to_src; ite++) { + ite->update_offset(-adjust_offset); + } + memmove(tgt->get_node_key_ptr(), from_src->get_node_key_ptr(), + to_src->get_node_key_ptr() - from_src->get_node_key_ptr()); +} + +struct delta_inner_t { + enum class op_t : uint_fast8_t { + INSERT, + UPDATE, + REMOVE, + } op; + std::string key; + laddr_t addr; + + DENC(delta_inner_t, v, p) { + DENC_START(1, 1, p); + denc(v.op, p); + denc(v.key, p); + denc(v.addr, p); + DENC_FINISH(p); + } + + void replay(StringKVInnerNodeLayout &l); + bool operator==(const delta_inner_t &rhs) const { + return op == rhs.op && + key == rhs.key && + addr == rhs.addr; + } +}; +} +WRITE_CLASS_DENC(crimson::os::seastore::omap_manager::delta_inner_t) + +namespace crimson::os::seastore::omap_manager { +struct delta_leaf_t { + enum class op_t : uint_fast8_t { + INSERT, + UPDATE, + REMOVE, + } op; + std::string key; + ceph::bufferlist val; + + DENC(delta_leaf_t, v, p) { + DENC_START(1, 1, p); + denc(v.op, p); + denc(v.key, p); + denc(v.val, p); + DENC_FINISH(p); + } + + void replay(StringKVLeafNodeLayout &l); + bool operator==(const delta_leaf_t &rhs) const { + return op == rhs.op && + key == rhs.key && + val == rhs.val; + } +}; +} +WRITE_CLASS_DENC(crimson::os::seastore::omap_manager::delta_leaf_t) + +namespace crimson::os::seastore::omap_manager { +class delta_inner_buffer_t { + std::vector<delta_inner_t> buffer; +public: + bool empty() const { + return buffer.empty(); + } + void insert( + const std::string &key, + laddr_t addr) { + buffer.push_back( + delta_inner_t{ + delta_inner_t::op_t::INSERT, + key, + addr + }); + } + void update( + const std::string &key, + laddr_t addr) { + buffer.push_back( + delta_inner_t{ + delta_inner_t::op_t::UPDATE, + key, + addr + }); + } + void remove(const std::string &key) { + buffer.push_back( + delta_inner_t{ + delta_inner_t::op_t::REMOVE, + key, + L_ADDR_NULL + }); + } + + void replay(StringKVInnerNodeLayout &node) { + for (auto &i: buffer) { + i.replay(node); + } + } + + void clear() { + buffer.clear(); + } + + DENC(delta_inner_buffer_t, v, p) { + DENC_START(1, 1, p); + denc(v.buffer, p); + DENC_FINISH(p); + } + + bool operator==(const delta_inner_buffer_t &rhs) const { + return buffer == rhs.buffer; + } +}; +} +WRITE_CLASS_DENC(crimson::os::seastore::omap_manager::delta_inner_buffer_t) + +namespace crimson::os::seastore::omap_manager { +class delta_leaf_buffer_t { + std::vector<delta_leaf_t> buffer; +public: + bool empty() const { + return buffer.empty(); + } + void insert( + const std::string &key, + const ceph::bufferlist &val) { + buffer.push_back( + delta_leaf_t{ + delta_leaf_t::op_t::INSERT, + key, + val + }); + } + void update( + const std::string &key, + const ceph::bufferlist &val) { + buffer.push_back( + delta_leaf_t{ + delta_leaf_t::op_t::UPDATE, + key, + val + }); + } + void remove(const std::string &key) { + buffer.push_back( + delta_leaf_t{ + delta_leaf_t::op_t::REMOVE, + key, + bufferlist() + }); + } + + void replay(StringKVLeafNodeLayout &node) { + for (auto &i: buffer) { + i.replay(node); + } + } + + void clear() { + buffer.clear(); + } + + DENC(delta_leaf_buffer_t, v, p) { + DENC_START(1, 1, p); + denc(v.buffer, p); + DENC_FINISH(p); + } + + bool operator==(const delta_leaf_buffer_t &rhs) const { + return buffer == rhs.buffer; + } +}; +} +WRITE_CLASS_DENC(crimson::os::seastore::omap_manager::delta_leaf_buffer_t) + +namespace 
crimson::os::seastore::omap_manager { +/** + * StringKVInnerNodeLayout + * + * Uses absl::container_internal::Layout for the actual key memory layout. + * + * The primary interface exposed is centered on the iterator + * and related methods. + * + * Also included are helpers for doing splits and merges as for a btree. + * + * layout diagram: + * + * # <----------------------------- node range --------------------------------------------> # + * # #<~># free space # + * # <------------- left part -----------------------------> # <~# <----- right keys -----> # + * # # <------------ left keys --------------> #~> # # + * # # keys [2, n) |<~># #<~>| right keys [2, n) # + * # # <--- key 0 ----> | <--- key 1 ----> | # # | <- k1 -> | <-- k0 --> # + * # # | | # # | | # + * # num_ | meta # key | key | val | key | key | val | # # | key | key # + * # keys | depth # off | len | laddr| off | len | laddr| # # | buff | buff # + * # | # 0 | 0 | 0 | 1 | 1 | 1 |...#...#...| key 1 | key 0 # + * # | | | <- off --+----------> # + * # | | ^ | <- off --> # + * | | | ^ + * | +----------------------------------+ | + * +----------------------------------------------------------------+ + */ +class StringKVInnerNodeLayout { + char *buf = nullptr; + + using L = absl::container_internal::Layout<ceph_le32, omap_node_meta_le_t, omap_inner_key_le_t>; + static constexpr L layout{1, 1, 1}; // = L::Partial(1, 1, 1); + friend class delta_inner_t; +public: + template <bool is_const> + class iter_t { + friend class StringKVInnerNodeLayout; + + template <typename iterator, typename const_iterator> + friend void copy_from_foreign(iterator, const_iterator, const_iterator); + template <typename iterator> + friend void copy_from_local(unsigned, iterator, iterator, iterator); + + using parent_t = typename crimson::common::maybe_const_t<StringKVInnerNodeLayout, is_const>::type; + + mutable parent_t node; + uint16_t index; + + iter_t( + parent_t parent, + uint16_t index) : node(parent), index(index) {} + + public: + using iterator_category = std::input_iterator_tag; + using value_type = StringKVInnerNodeLayout; + using difference_type = std::ptrdiff_t; + using pointer = StringKVInnerNodeLayout*; + using reference = iter_t&; + + iter_t(const iter_t &) = default; + iter_t(iter_t &&) = default; + iter_t &operator=(const iter_t &) = default; + iter_t &operator=(iter_t &&) = default; + + operator iter_t<!is_const>() const { + static_assert(!is_const); + return iter_t<!is_const>(node, index); + } + + iter_t &operator*() { return *this; } + iter_t *operator->() { return this; } + + iter_t operator++(int) { + auto ret = *this; + ++index; + return ret; + } + + iter_t &operator++() { + ++index; + return *this; + } + + iter_t operator--(int) { + auto ret = *this; + assert(index > 0); + --index; + return ret; + } + + iter_t &operator--() { + assert(index > 0); + --index; + return *this; + } + + uint16_t operator-(const iter_t &rhs) const { + assert(rhs.node == node); + return index - rhs.index; + } + + iter_t operator+(uint16_t off) const { + return iter_t(node, index + off); + } + iter_t operator-(uint16_t off) const { + return iter_t(node, index - off); + } + + uint16_t operator<(const iter_t &rhs) const { + assert(rhs.node == node); + return index < rhs.index; + } + + uint16_t operator>(const iter_t &rhs) const { + assert(rhs.node == node); + return index > rhs.index; + } + + friend bool operator==(const iter_t &lhs, const iter_t &rhs) { + assert(lhs.node == rhs.node); + return lhs.index == rhs.index; + } + + private: + omap_inner_key_t 
get_node_key() const { + omap_inner_key_le_t kint = node->get_node_key_ptr()[index]; + return omap_inner_key_t(kint); + } + auto get_node_key_ptr() const { + return reinterpret_cast< + typename crimson::common::maybe_const_t<char, is_const>::type>( + node->get_node_key_ptr() + index); + } + + uint32_t get_node_val_offset() const { + return get_node_key().key_off; + } + auto get_node_val_ptr() const { + auto tail = node->buf + OMAP_INNER_BLOCK_SIZE; + if (*this == node->iter_end()) + return tail; + else { + return tail - get_node_val_offset(); + } + } + + int get_right_offset_end() const { + if (index == 0) + return 0; + else + return (*this - 1)->get_node_val_offset(); + } + auto get_right_ptr_end() const { + return node->buf + OMAP_INNER_BLOCK_SIZE - get_right_offset_end(); + } + + void update_offset(int offset) { + static_assert(!is_const); + auto key = get_node_key(); + assert(offset + key.key_off >= 0); + key.key_off += offset; + set_node_key(key); + } + + void set_node_key(omap_inner_key_t _lb) { + static_assert(!is_const); + omap_inner_key_le_t lb; + lb = _lb; + node->get_node_key_ptr()[index] = lb; + } + + void set_node_val(const std::string &str) { + static_assert(!is_const); + assert(str.size() == get_node_key().key_len); + assert(get_node_key().key_off >= str.size()); + assert(get_node_key().key_off < OMAP_INNER_BLOCK_SIZE); + assert(str.size() < OMAP_INNER_BLOCK_SIZE); + ::memcpy(get_node_val_ptr(), str.data(), str.size()); + } + + public: + uint16_t get_index() const { + return index; + } + + std::string get_key() const { + return std::string( + get_node_val_ptr(), + get_node_key().key_len); + } + + laddr_t get_val() const { + return get_node_key().laddr; + } + + bool contains(std::string_view key) const { + assert(*this != node->iter_end()); + auto next = *this + 1; + if (next == node->iter_end()) { + return get_key() <= key; + } else { + return (get_key() <= key) && (next->get_key() > key); + } + } + }; + using const_iterator = iter_t<true>; + using iterator = iter_t<false>; + +public: + void journal_inner_insert( + const_iterator _iter, + const laddr_t laddr, + const std::string &key, + delta_inner_buffer_t *recorder) { + auto iter = iterator(this, _iter.index); + if (recorder) { + recorder->insert( + key, + laddr); + } + inner_insert(iter, key, laddr); + } + + void journal_inner_update( + const_iterator _iter, + const laddr_t laddr, + delta_inner_buffer_t *recorder) { + auto iter = iterator(this, _iter.index); + auto key = iter->get_key(); + if (recorder) { + recorder->update(key, laddr); + } + inner_update(iter, laddr); + } + + void journal_inner_remove( + const_iterator _iter, + delta_inner_buffer_t *recorder) { + auto iter = iterator(this, _iter.index); + if (recorder) { + recorder->remove(iter->get_key()); + } + inner_remove(iter); + } + + StringKVInnerNodeLayout(char *buf) : + buf(buf) {} + + uint32_t get_size() const { + ceph_le32 &size = *layout.template Pointer<0>(buf); + return uint32_t(size); + } + + /** + * set_size + * + * Set size representation to match size + */ + void set_size(uint32_t size) { + ceph_le32 s; + s = size; + *layout.template Pointer<0>(buf) = s; + } + + const_iterator iter_cbegin() const { + return const_iterator( + this, + 0); + } + const_iterator iter_begin() const { + return iter_cbegin(); + } + + const_iterator iter_cend() const { + return const_iterator( + this, + get_size()); + } + const_iterator iter_end() const { + return iter_cend(); + } + + iterator iter_begin() { + return iterator( + this, + 0); + } + + iterator iter_end() { + return 
iterator( + this, + get_size()); + } + + const_iterator iter_idx(uint16_t off) const { + return const_iterator( + this, + off); + } + + const_iterator string_lower_bound(std::string_view str) const { + auto it = std::lower_bound(boost::make_counting_iterator<uint16_t>(0), + boost::make_counting_iterator<uint16_t>(get_size()), + str, + [this](uint16_t i, std::string_view str) { + const_iterator iter(this, i); + return iter->get_key() < str; + }); + return const_iterator(this, *it); + } + + iterator string_lower_bound(std::string_view str) { + const auto &tref = *this; + return iterator(this, tref.string_lower_bound(str).index); + } + + const_iterator string_upper_bound(std::string_view str) const { + auto it = std::upper_bound(boost::make_counting_iterator<uint16_t>(0), + boost::make_counting_iterator<uint16_t>(get_size()), + str, + [this](std::string_view str, uint16_t i) { + const_iterator iter(this, i); + return str < iter->get_key(); + }); + return const_iterator(this, *it); + } + + iterator string_upper_bound(std::string_view str) { + const auto &tref = *this; + return iterator(this, tref.string_upper_bound(str).index); + } + + const_iterator find_string_key(std::string_view str) const { + auto ret = iter_begin(); + for (; ret != iter_end(); ++ret) { + std::string s = ret->get_key(); + if (s == str) + break; + } + return ret; + } + + iterator find_string_key(std::string_view str) { + const auto &tref = *this; + return iterator(this, tref.find_string_key(str).index); + } + + const_iterator get_split_pivot() const { + uint32_t total_size = omap_inner_key_t( + get_node_key_ptr()[get_size()-1]).key_off; + uint32_t pivot_size = total_size / 2; + uint32_t size = 0; + for (auto ite = iter_begin(); ite < iter_end(); ite++) { + auto node_key = ite->get_node_key(); + size += node_key.key_len; + if (size >= pivot_size){ + return ite; + } + } + return iter_end(); + } + + + /** + * get_meta/set_meta + * + * Enables stashing a templated type within the layout. 
+ * Cannot be modified after initial write as it is not represented + * in delta_t + */ + omap_node_meta_t get_meta() const { + omap_node_meta_le_t &metaint = *layout.template Pointer<1>(buf); + return omap_node_meta_t(metaint); + } + void set_meta(const omap_node_meta_t &meta) { + *layout.template Pointer<1>(buf) = omap_node_meta_le_t(meta); + } + + uint32_t used_space() const { + uint32_t count = get_size(); + if (count) { + omap_inner_key_t last_key = omap_inner_key_t(get_node_key_ptr()[count-1]); + return last_key.key_off + count * sizeof(omap_inner_key_le_t); + } else { + return 0; + } + } + + uint32_t free_space() const { + return capacity() - used_space(); + } + + uint16_t capacity() const { + return OMAP_INNER_BLOCK_SIZE + - (reinterpret_cast<char*>(layout.template Pointer<2>(buf)) + - reinterpret_cast<char*>(layout.template Pointer<0>(buf))); + } + + bool is_overflow(size_t ksize) const { + return free_space() < (sizeof(omap_inner_key_le_t) + ksize); + } + + bool is_overflow(const StringKVInnerNodeLayout &rhs) const { + return free_space() < rhs.used_space(); + } + + bool below_min() const { + return free_space() > (capacity() / 2); + } + + bool operator==(const StringKVInnerNodeLayout &rhs) const { + if (get_size() != rhs.get_size()) { + return false; + } + + auto iter = iter_begin(); + auto iter2 = rhs.iter_begin(); + while (iter != iter_end()) { + if (iter->get_key() != iter2->get_key() || + iter->get_val() != iter2->get_val()) { + return false; + } + iter++; + iter2++; + } + return true; + } + + /** + * split_into + * + * Takes *this and splits its contents into left and right. + */ + std::string split_into( + StringKVInnerNodeLayout &left, + StringKVInnerNodeLayout &right) const { + auto piviter = get_split_pivot(); + assert(piviter != iter_end()); + + copy_from_foreign(left.iter_begin(), iter_begin(), piviter); + left.set_size(piviter - iter_begin()); + + copy_from_foreign(right.iter_begin(), piviter, iter_end()); + right.set_size(iter_end() - piviter); + + auto [lmeta, rmeta] = get_meta().split_into(); + left.set_meta(lmeta); + right.set_meta(rmeta); + + return piviter->get_key(); + } + + /** + * merge_from + * + * Takes two nodes and copies their contents into *this. 
+ * + * precondition: left.size() + right.size() < CAPACITY + */ + void merge_from( + const StringKVInnerNodeLayout &left, + const StringKVInnerNodeLayout &right) { + copy_from_foreign( + iter_end(), + left.iter_begin(), + left.iter_end()); + set_size(left.get_size()); + + copy_from_foreign( + iter_end(), + right.iter_begin(), + right.iter_end()); + set_size(left.get_size() + right.get_size()); + set_meta(omap_node_meta_t::merge_from(left.get_meta(), right.get_meta())); + } + + /** + * balance_into_new_nodes + * + * Takes the contents of left and right and copies them into + * replacement_left and replacement_right such that + * the size of replacement_left just >= 1/2 of (left + right) + */ + static std::string balance_into_new_nodes( + const StringKVInnerNodeLayout &left, + const StringKVInnerNodeLayout &right, + StringKVInnerNodeLayout &replacement_left, + StringKVInnerNodeLayout &replacement_right) + { + uint32_t left_size = omap_inner_key_t(left.get_node_key_ptr()[left.get_size()-1]).key_off; + uint32_t right_size = omap_inner_key_t(right.get_node_key_ptr()[right.get_size()-1]).key_off; + uint32_t total = left_size + right_size; + uint32_t pivot_size = total / 2; + uint32_t pivot_idx = 0; + if (pivot_size < left_size) { + uint32_t size = 0; + for (auto ite = left.iter_begin(); ite < left.iter_end(); ite++) { + auto node_key = ite->get_node_key(); + size += node_key.key_len; + if (size >= pivot_size){ + pivot_idx = ite.get_index(); + break; + } + } + } else { + uint32_t more_size = pivot_size - left_size; + uint32_t size = 0; + for (auto ite = right.iter_begin(); ite < right.iter_end(); ite++) { + auto node_key = ite->get_node_key(); + size += node_key.key_len; + if (size >= more_size){ + pivot_idx = ite.get_index() + left.get_size(); + break; + } + } + } + + auto replacement_pivot = pivot_idx >= left.get_size() ? 
+ right.iter_idx(pivot_idx - left.get_size())->get_key() : + left.iter_idx(pivot_idx)->get_key(); + + if (pivot_size < left_size) { + copy_from_foreign( + replacement_left.iter_end(), + left.iter_begin(), + left.iter_idx(pivot_idx)); + replacement_left.set_size(pivot_idx); + + copy_from_foreign( + replacement_right.iter_end(), + left.iter_idx(pivot_idx), + left.iter_end()); + replacement_right.set_size(left.get_size() - pivot_idx); + + copy_from_foreign( + replacement_right.iter_end(), + right.iter_begin(), + right.iter_end()); + replacement_right.set_size(right.get_size() + left.get_size()- pivot_idx); + } else { + copy_from_foreign( + replacement_left.iter_end(), + left.iter_begin(), + left.iter_end()); + replacement_left.set_size(left.get_size()); + + copy_from_foreign( + replacement_left.iter_end(), + right.iter_begin(), + right.iter_idx(pivot_idx - left.get_size())); + replacement_left.set_size(pivot_idx); + + copy_from_foreign( + replacement_right.iter_end(), + right.iter_idx(pivot_idx - left.get_size()), + right.iter_end()); + replacement_right.set_size(right.get_size() + left.get_size() - pivot_idx); + } + + auto [lmeta, rmeta] = omap_node_meta_t::rebalance( + left.get_meta(), right.get_meta()); + replacement_left.set_meta(lmeta); + replacement_right.set_meta(rmeta); + return replacement_pivot; + } + +private: + void inner_insert( + iterator iter, + const std::string &key, + laddr_t val) { + if (iter != iter_begin()) { + assert((iter - 1)->get_key() < key); + } + if (iter != iter_end()) { + assert(iter->get_key() > key); + } + assert(!is_overflow(key.size())); + + if (iter != iter_end()) { + copy_from_local(key.size(), iter + 1, iter, iter_end()); + } + + omap_inner_key_t nkey; + nkey.key_len = key.size(); + nkey.laddr = val; + if (iter != iter_begin()) { + auto pkey = (iter - 1).get_node_key(); + nkey.key_off = nkey.key_len + pkey.key_off; + } else { + nkey.key_off = nkey.key_len; + } + + iter->set_node_key(nkey); + set_size(get_size() + 1); + iter->set_node_val(key); + } + + void inner_update( + iterator iter, + laddr_t addr) { + assert(iter != iter_end()); + auto node_key = iter->get_node_key(); + node_key.laddr = addr; + iter->set_node_key(node_key); + } + + void inner_remove(iterator iter) { + assert(iter != iter_end()); + if ((iter + 1) != iter_end()) + copy_from_local(iter->get_node_key().key_len, iter, iter + 1, iter_end()); + set_size(get_size() - 1); + } + + /** + * get_key_ptr + * + * Get pointer to start of key array + */ + omap_inner_key_le_t *get_node_key_ptr() { + return L::Partial(1, 1, get_size()).template Pointer<2>(buf); + } + const omap_inner_key_le_t *get_node_key_ptr() const { + return L::Partial(1, 1, get_size()).template Pointer<2>(buf); + } + +}; + +/** + * StringKVLeafNodeLayout + * + * layout diagram: + * + * # <----------------------------- node range -------------------------------------------------> # + * # #<~># free space # + * # <------------- left part ---------------------------> # <~# <----- right key-value pairs --> # + * # # <------------ left keys ------------> #~> # # + * # # keys [2, n) |<~># #<~>| right kvs [2, n) # + * # # <--- key 0 ---> | <--- key 1 ---> | # # | <-- kv 1 --> | <-- kv 0 --> # + * # # | | # # | | # + * # num_ | meta # key | key | val | key | key | val | # # | key | val | key | val # + * # keys | depth # off | len | len | off | len | len | # # | buff | buff | buff | buff # + * # # 0 | 0 | 0 | 1 | 1 | 1 |...#...#...| key 1 | val 1| key 0 | val 0 # + * # | | | <--- off ----+-------------> # + * # | | ^ | <--- off ---> # + * | 
| | ^ + * | +-----------------------------------+ | + * +-------------------------------------------------------------------+ + */ +class StringKVLeafNodeLayout { + char *buf = nullptr; + + using L = absl::container_internal::Layout<ceph_le32, omap_node_meta_le_t, omap_leaf_key_le_t>; + static constexpr L layout{1, 1, 1}; // = L::Partial(1, 1, 1); + friend class delta_leaf_t; + +public: + template <bool is_const> + class iter_t { + friend class StringKVLeafNodeLayout; + using parent_t = typename crimson::common::maybe_const_t<StringKVLeafNodeLayout, is_const>::type; + + template <typename iterator, typename const_iterator> + friend void copy_from_foreign(iterator, const_iterator, const_iterator); + template <typename iterator> + friend void copy_from_local(unsigned, iterator, iterator, iterator); + + parent_t node; + uint16_t index; + + iter_t( + parent_t parent, + uint16_t index) : node(parent), index(index) {} + + public: + iter_t(const iter_t &) = default; + iter_t(iter_t &&) = default; + iter_t &operator=(const iter_t &) = default; + iter_t &operator=(iter_t &&) = default; + + operator iter_t<!is_const>() const { + static_assert(!is_const); + return iter_t<!is_const>(node, index); + } + + iter_t &operator*() { return *this; } + iter_t *operator->() { return this; } + + iter_t operator++(int) { + auto ret = *this; + ++index; + return ret; + } + + iter_t &operator++() { + ++index; + return *this; + } + + uint16_t operator-(const iter_t &rhs) const { + assert(rhs.node == node); + return index - rhs.index; + } + + iter_t operator+(uint16_t off) const { + return iter_t( + node, + index + off); + } + iter_t operator-(uint16_t off) const { + return iter_t( + node, + index - off); + } + + uint16_t operator<(const iter_t &rhs) const { + assert(rhs.node == node); + return index < rhs.index; + } + + uint16_t operator>(const iter_t &rhs) const { + assert(rhs.node == node); + return index > rhs.index; + } + + bool operator==(const iter_t &rhs) const { + assert(node == rhs.node); + return rhs.index == index; + } + + bool operator!=(const iter_t &rhs) const { + assert(node == rhs.node); + return index != rhs.index; + } + + private: + omap_leaf_key_t get_node_key() const { + omap_leaf_key_le_t kint = node->get_node_key_ptr()[index]; + return omap_leaf_key_t(kint); + } + auto get_node_key_ptr() const { + return reinterpret_cast< + typename crimson::common::maybe_const_t<char, is_const>::type>( + node->get_node_key_ptr() + index); + } + + uint32_t get_node_val_offset() const { + return get_node_key().key_off; + } + auto get_node_val_ptr() const { + auto tail = node->buf + OMAP_LEAF_BLOCK_SIZE; + if (*this == node->iter_end()) + return tail; + else { + return tail - get_node_val_offset(); + } + } + + int get_right_offset_end() const { + if (index == 0) + return 0; + else + return (*this - 1)->get_node_val_offset(); + } + auto get_right_ptr_end() const { + return node->buf + OMAP_LEAF_BLOCK_SIZE - get_right_offset_end(); + } + + void update_offset(int offset) { + auto key = get_node_key(); + assert(offset + key.key_off >= 0); + key.key_off += offset; + set_node_key(key); + } + + void set_node_key(omap_leaf_key_t _lb) const { + static_assert(!is_const); + omap_leaf_key_le_t lb; + lb = _lb; + node->get_node_key_ptr()[index] = lb; + } + + void set_node_val(const std::string &key, const ceph::bufferlist &val) { + static_assert(!is_const); + auto node_key = get_node_key(); + assert(key.size() == node_key.key_len); + assert(val.length() == node_key.val_len); + ::memcpy(get_node_val_ptr(), key.data(), 
key.size()); + auto bliter = val.begin(); + bliter.copy(node_key.val_len, get_node_val_ptr() + node_key.key_len); + } + + public: + uint16_t get_index() const { + return index; + } + + std::string get_key() const { + return std::string( + get_node_val_ptr(), + get_node_key().key_len); + } + + std::string get_str_val() const { + auto node_key = get_node_key(); + return std::string( + get_node_val_ptr() + node_key.key_len, + get_node_key().val_len); + } + + ceph::bufferlist get_val() const { + auto node_key = get_node_key(); + ceph::bufferlist bl; + ceph::bufferptr bptr( + get_node_val_ptr() + node_key.key_len, + get_node_key().val_len); + bl.append(bptr); + return bl; + } + }; + using const_iterator = iter_t<true>; + using iterator = iter_t<false>; + +public: + void journal_leaf_insert( + const_iterator _iter, + const std::string &key, + const ceph::bufferlist &val, + delta_leaf_buffer_t *recorder) { + auto iter = iterator(this, _iter.index); + if (recorder) { + recorder->insert( + key, + val); + } + leaf_insert(iter, key, val); + } + + void journal_leaf_update( + const_iterator _iter, + const std::string &key, + const ceph::bufferlist &val, + delta_leaf_buffer_t *recorder) { + auto iter = iterator(this, _iter.index); + if (recorder) { + recorder->remove(iter->get_key()); + recorder->insert(key, val); + } + leaf_update(iter, key, val); + } + + void journal_leaf_remove( + const_iterator _iter, + delta_leaf_buffer_t *recorder) { + auto iter = iterator(this, _iter.index); + if (recorder) { + recorder->remove(iter->get_key()); + } + leaf_remove(iter); + } + + StringKVLeafNodeLayout(char *buf) : + buf(buf) {} + + const_iterator iter_begin() const { + return const_iterator( + this, + 0); + } + + const_iterator iter_end() const { + return const_iterator( + this, + get_size()); + } + + iterator iter_begin() { + return iterator( + this, + 0); + } + + iterator iter_end() { + return iterator( + this, + get_size()); + } + + const_iterator iter_idx(uint16_t off) const { + return const_iterator( + this, + off); + } + + const_iterator string_lower_bound(std::string_view str) const { + uint16_t start = 0, end = get_size(); + while (start != end) { + unsigned mid = (start + end) / 2; + const_iterator iter(this, mid); + std::string s = iter->get_key(); + if (s < str) { + start = ++mid; + } else if (s > str) { + end = mid; + } else { + return iter; + } + } + return const_iterator(this, start); + } + + iterator string_lower_bound(std::string_view str) { + const auto &tref = *this; + return iterator(this, tref.string_lower_bound(str).index); + } + + const_iterator string_upper_bound(std::string_view str) const { + auto ret = iter_begin(); + for (; ret != iter_end(); ++ret) { + std::string s = ret->get_key(); + if (s > str) + break; + } + return ret; + } + + iterator string_upper_bound(std::string_view str) { + const auto &tref = *this; + return iterator(this, tref.string_upper_bound(str).index); + } + + const_iterator find_string_key(std::string_view str) const { + auto ret = iter_begin(); + for (; ret != iter_end(); ++ret) { + std::string s = ret->get_key(); + if (s == str) + break; + } + return ret; + } + iterator find_string_key(std::string_view str) { + const auto &tref = *this; + return iterator(this, tref.find_string_key(str).index); + } + + const_iterator get_split_pivot() const { + uint32_t total_size = omap_leaf_key_t(get_node_key_ptr()[get_size()-1]).key_off; + uint32_t pivot_size = total_size / 2; + uint32_t size = 0; + for (auto ite = iter_begin(); ite < iter_end(); ite++) { + auto node_key = 
ite->get_node_key(); + size += node_key.key_len + node_key.val_len; + if (size >= pivot_size){ + return ite; + } + } + return iter_end(); + } + + uint32_t get_size() const { + ceph_le32 &size = *layout.template Pointer<0>(buf); + return uint32_t(size); + } + + /** + * set_size + * + * Set size representation to match size + */ + void set_size(uint32_t size) { + ceph_le32 s; + s = size; + *layout.template Pointer<0>(buf) = s; + } + + /** + * get_meta/set_meta + * + * Enables stashing a templated type within the layout. + * Cannot be modified after initial write as it is not represented + * in delta_t + */ + omap_node_meta_t get_meta() const { + omap_node_meta_le_t &metaint = *layout.template Pointer<1>(buf); + return omap_node_meta_t(metaint); + } + void set_meta(const omap_node_meta_t &meta) { + *layout.template Pointer<1>(buf) = omap_node_meta_le_t(meta); + } + + uint32_t used_space() const { + uint32_t count = get_size(); + if (count) { + omap_leaf_key_t last_key = omap_leaf_key_t(get_node_key_ptr()[count-1]); + return last_key.key_off + count * sizeof(omap_leaf_key_le_t); + } else { + return 0; + } + } + + uint32_t free_space() const { + return capacity() - used_space(); + } + + uint32_t capacity() const { + return OMAP_LEAF_BLOCK_SIZE + - (reinterpret_cast<char*>(layout.template Pointer<2>(buf)) + - reinterpret_cast<char*>(layout.template Pointer<0>(buf))); + } + + bool is_overflow(size_t ksize, size_t vsize) const { + return free_space() < (sizeof(omap_leaf_key_le_t) + ksize + vsize); + } + + bool is_overflow(const StringKVLeafNodeLayout &rhs) const { + return free_space() < rhs.used_space(); + } + + bool below_min() const { + return free_space() > (capacity() / 2); + } + + bool operator==(const StringKVLeafNodeLayout &rhs) const { + if (get_size() != rhs.get_size()) { + return false; + } + + auto iter = iter_begin(); + auto iter2 = rhs.iter_begin(); + while (iter != iter_end()) { + if(iter->get_key() != iter2->get_key() || + iter->get_val() != iter2->get_val()) { + return false; + } + iter++; + iter2++; + } + return true; + } + + /** + * split_into + * + * Takes *this and splits its contents into left and right. + */ + std::string split_into( + StringKVLeafNodeLayout &left, + StringKVLeafNodeLayout &right) const { + auto piviter = get_split_pivot(); + assert (piviter != iter_end()); + + copy_from_foreign(left.iter_begin(), iter_begin(), piviter); + left.set_size(piviter - iter_begin()); + + copy_from_foreign(right.iter_begin(), piviter, iter_end()); + right.set_size(iter_end() - piviter); + + auto [lmeta, rmeta] = get_meta().split_into(); + left.set_meta(lmeta); + right.set_meta(rmeta); + + return piviter->get_key(); + } + + /** + * merge_from + * + * Takes two nodes and copies their contents into *this. + * + * precondition: left.size() + right.size() < CAPACITY + */ + void merge_from( + const StringKVLeafNodeLayout &left, + const StringKVLeafNodeLayout &right) + { + copy_from_foreign( + iter_end(), + left.iter_begin(), + left.iter_end()); + set_size(left.get_size()); + copy_from_foreign( + iter_end(), + right.iter_begin(), + right.iter_end()); + set_size(left.get_size() + right.get_size()); + set_meta(omap_node_meta_t::merge_from(left.get_meta(), right.get_meta())); + } + + /** + * balance_into_new_nodes + * + * Takes the contents of left and right and copies them into + * replacement_left and replacement_right such that + * the size of replacement_left side just >= 1/2 of the total size (left + right). 
+ */ + static std::string balance_into_new_nodes( + const StringKVLeafNodeLayout &left, + const StringKVLeafNodeLayout &right, + StringKVLeafNodeLayout &replacement_left, + StringKVLeafNodeLayout &replacement_right) + { + uint32_t left_size = omap_leaf_key_t(left.get_node_key_ptr()[left.get_size()-1]).key_off; + uint32_t right_size = omap_leaf_key_t(right.get_node_key_ptr()[right.get_size()-1]).key_off; + uint32_t total = left_size + right_size; + uint32_t pivot_size = total / 2; + uint32_t pivot_idx = 0; + if (pivot_size < left_size) { + uint32_t size = 0; + for (auto ite = left.iter_begin(); ite < left.iter_end(); ite++) { + auto node_key = ite->get_node_key(); + size += node_key.key_len + node_key.val_len; + if (size >= pivot_size){ + pivot_idx = ite.get_index(); + break; + } + } + } else { + uint32_t more_size = pivot_size - left_size; + uint32_t size = 0; + for (auto ite = right.iter_begin(); ite < right.iter_end(); ite++) { + auto node_key = ite->get_node_key(); + size += node_key.key_len + node_key.val_len; + if (size >= more_size){ + pivot_idx = ite.get_index() + left.get_size(); + break; + } + } + } + + auto replacement_pivot = pivot_idx >= left.get_size() ? + right.iter_idx(pivot_idx - left.get_size())->get_key() : + left.iter_idx(pivot_idx)->get_key(); + + if (pivot_size < left_size) { + copy_from_foreign( + replacement_left.iter_end(), + left.iter_begin(), + left.iter_idx(pivot_idx)); + replacement_left.set_size(pivot_idx); + + copy_from_foreign( + replacement_right.iter_end(), + left.iter_idx(pivot_idx), + left.iter_end()); + replacement_right.set_size(left.get_size() - pivot_idx); + + copy_from_foreign( + replacement_right.iter_end(), + right.iter_begin(), + right.iter_end()); + replacement_right.set_size(right.get_size() + left.get_size() - pivot_idx); + } else { + copy_from_foreign( + replacement_left.iter_end(), + left.iter_begin(), + left.iter_end()); + replacement_left.set_size(left.get_size()); + + copy_from_foreign( + replacement_left.iter_end(), + right.iter_begin(), + right.iter_idx(pivot_idx - left.get_size())); + replacement_left.set_size(pivot_idx); + + copy_from_foreign( + replacement_right.iter_end(), + right.iter_idx(pivot_idx - left.get_size()), + right.iter_end()); + replacement_right.set_size(right.get_size() + left.get_size() - pivot_idx); + } + + auto [lmeta, rmeta] = omap_node_meta_t::rebalance( + left.get_meta(), right.get_meta()); + replacement_left.set_meta(lmeta); + replacement_right.set_meta(rmeta); + return replacement_pivot; + } + +private: + void leaf_insert( + iterator iter, + const std::string &key, + const bufferlist &val) { + if (iter != iter_begin()) { + assert((iter - 1)->get_key() < key); + } + if (iter != iter_end()) { + assert(iter->get_key() > key); + } + assert(!is_overflow(key.size(), val.length())); + omap_leaf_key_t node_key; + if (iter == iter_begin()) { + node_key.key_off = key.size() + val.length(); + node_key.key_len = key.size(); + node_key.val_len = val.length(); + } else { + node_key.key_off = (iter - 1)->get_node_key().key_off + + (key.size() + val.length()); + node_key.key_len = key.size(); + node_key.val_len = val.length(); + } + if (get_size() != 0 && iter != iter_end()) + copy_from_local(node_key.key_len + node_key.val_len, iter + 1, iter, iter_end()); + + iter->set_node_key(node_key); + set_size(get_size() + 1); + iter->set_node_val(key, val); + } + + void leaf_update( + iterator iter, + const std::string &key, + const ceph::bufferlist &val) { + assert(iter != iter_end()); + leaf_remove(iter); + 
assert(!is_overflow(key.size(), val.length())); + leaf_insert(iter, key, val); + } + + void leaf_remove(iterator iter) { + assert(iter != iter_end()); + if ((iter + 1) != iter_end()) { + omap_leaf_key_t key = iter->get_node_key(); + copy_from_local(key.key_len + key.val_len, iter, iter + 1, iter_end()); + } + set_size(get_size() - 1); + } + + /** + * get_key_ptr + * + * Get pointer to start of key array + */ + omap_leaf_key_le_t *get_node_key_ptr() { + return L::Partial(1, 1, get_size()).template Pointer<2>(buf); + } + const omap_leaf_key_le_t *get_node_key_ptr() const { + return L::Partial(1, 1, get_size()).template Pointer<2>(buf); + } + +}; + +inline void delta_inner_t::replay(StringKVInnerNodeLayout &l) { + switch (op) { + case op_t::INSERT: { + l.inner_insert(l.string_lower_bound(key), key, addr); + break; + } + case op_t::UPDATE: { + auto iter = l.find_string_key(key); + assert(iter != l.iter_end()); + l.inner_update(iter, addr); + break; + } + case op_t::REMOVE: { + auto iter = l.find_string_key(key); + assert(iter != l.iter_end()); + l.inner_remove(iter); + break; + } + default: + assert(0 == "Impossible"); + } +} + +inline void delta_leaf_t::replay(StringKVLeafNodeLayout &l) { + switch (op) { + case op_t::INSERT: { + l.leaf_insert(l.string_lower_bound(key), key, val); + break; + } + case op_t::UPDATE: { + auto iter = l.find_string_key(key); + assert(iter != l.iter_end()); + l.leaf_update(iter, key, val); + break; + } + case op_t::REMOVE: { + auto iter = l.find_string_key(key); + assert(iter != l.iter_end()); + l.leaf_remove(iter); + break; + } + default: + assert(0 == "Impossible"); + } +} + +} diff --git a/src/crimson/os/seastore/onode.cc b/src/crimson/os/seastore/onode.cc new file mode 100644 index 000000000..96b69fb7c --- /dev/null +++ b/src/crimson/os/seastore/onode.cc @@ -0,0 +1,18 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#include "onode.h" +#include <iostream> + +namespace crimson::os::seastore { + +std::ostream& operator<<(std::ostream &out, const Onode &rhs) +{ + auto &layout = rhs.get_layout(); + return out << "Onode(" + << "size=" << static_cast<uint32_t>(layout.size) + << ")"; +} + +} + diff --git a/src/crimson/os/seastore/onode.h b/src/crimson/os/seastore/onode.h new file mode 100644 index 000000000..069daa3df --- /dev/null +++ b/src/crimson/os/seastore/onode.h @@ -0,0 +1,89 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <iosfwd> + +#include <boost/intrusive_ptr.hpp> +#include <boost/smart_ptr/intrusive_ref_counter.hpp> + +#include "include/byteorder.h" +#include "seastore_types.h" + +namespace crimson::os::seastore { + +struct onode_layout_t { + // The expected decode size of object_info_t without oid. + static constexpr int MAX_OI_LENGTH = 232; + // We might want to move the ss field out of onode_layout_t. + // The reason is that ss_attr may grow to relative large, as + // its clone_overlap may grow to a large size, if applications + // set objects to a relative large size(for the purpose of reducing + // the number of objects per OSD, so that all objects' metadata + // can be cached in memory) and do many modifications between + // snapshots. + // TODO: implement flexible-sized onode value to store inline ss_attr + // effectively. 
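+  // Until then, MAX_SS_LENGTH below stays at a single byte, so there is
+  // essentially no room for inline snapset data in this layout.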
+ static constexpr int MAX_SS_LENGTH = 1; + + ceph_le32 size{0}; + ceph_le32 oi_size{0}; + ceph_le32 ss_size{0}; + omap_root_le_t omap_root; + omap_root_le_t xattr_root; + + object_data_le_t object_data; + + char oi[MAX_OI_LENGTH]; + char ss[MAX_SS_LENGTH]; +} __attribute__((packed)); + +class Transaction; + +/** + * Onode + * + * Interface manipulated by seastore. OnodeManager implementations should + * return objects derived from this interface with layout referencing + * internal representation of onode_layout_t. + */ +class Onode : public boost::intrusive_ref_counter< + Onode, + boost::thread_unsafe_counter> +{ +protected: + virtual laddr_t get_hint() const = 0; + const uint32_t default_metadata_offset = 0; + const uint32_t default_metadata_range = 0; +public: + Onode(uint32_t ddr, uint32_t dmr) + : default_metadata_offset(ddr), + default_metadata_range(dmr) + {} + + virtual bool is_alive() const = 0; + virtual const onode_layout_t &get_layout() const = 0; + virtual onode_layout_t &get_mutable_layout(Transaction &t) = 0; + virtual ~Onode() = default; + + laddr_t get_metadata_hint(uint64_t block_size) const { + assert(default_metadata_offset); + assert(default_metadata_range); + uint64_t range_blocks = default_metadata_range / block_size; + return get_hint() + default_metadata_offset + + (((uint32_t)std::rand() % range_blocks) * block_size); + } + laddr_t get_data_hint() const { + return get_hint(); + } +}; + + +std::ostream& operator<<(std::ostream &out, const Onode &rhs); +using OnodeRef = boost::intrusive_ptr<Onode>; +} + +#if FMT_VERSION >= 90000 +template<> struct fmt::formatter<crimson::os::seastore::Onode> : fmt::ostream_formatter {}; +#endif diff --git a/src/crimson/os/seastore/onode_manager.h b/src/crimson/os/seastore/onode_manager.h new file mode 100644 index 000000000..123c9e4f8 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager.h @@ -0,0 +1,86 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <iostream> + +#include <boost/intrusive_ptr.hpp> +#include <boost/smart_ptr/intrusive_ref_counter.hpp> +#include <seastar/core/future.hh> + +#include "include/buffer_fwd.h" +#include "include/ceph_assert.h" +#include "common/hobject.h" + +#include "crimson/common/errorator.h" +#include "crimson/os/seastore/onode.h" +#include "crimson/os/seastore/seastore_types.h" +#include "crimson/os/seastore/transaction_manager.h" +#include "crimson/osd/exceptions.h" + +namespace crimson::os::seastore { + +class OnodeManager { + using base_iertr = TransactionManager::base_iertr; +public: + using mkfs_iertr = base_iertr; + using mkfs_ret = mkfs_iertr::future<>; + virtual mkfs_ret mkfs(Transaction &t) = 0; + + using contains_onode_iertr = base_iertr; + using contains_onode_ret = contains_onode_iertr::future<bool>; + virtual contains_onode_ret contains_onode( + Transaction &trans, + const ghobject_t &hoid) = 0; + + using get_onode_iertr = base_iertr::extend< + crimson::ct_error::enoent>; + using get_onode_ret = get_onode_iertr::future< + OnodeRef>; + virtual get_onode_ret get_onode( + Transaction &trans, + const ghobject_t &hoid) = 0; + + using get_or_create_onode_iertr = base_iertr::extend< + crimson::ct_error::value_too_large>; + using get_or_create_onode_ret = get_or_create_onode_iertr::future< + OnodeRef>; + virtual get_or_create_onode_ret get_or_create_onode( + Transaction &trans, + const ghobject_t &hoid) = 0; + + using get_or_create_onodes_iertr = base_iertr::extend< + crimson::ct_error::value_too_large>; + 
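+  // Batched variant of get_or_create_onode(): resolves all hoids within the
+  // same transaction and returns the onodes in input order.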
using get_or_create_onodes_ret = get_or_create_onodes_iertr::future< + std::vector<OnodeRef>>; + virtual get_or_create_onodes_ret get_or_create_onodes( + Transaction &trans, + const std::vector<ghobject_t> &hoids) = 0; + + using write_dirty_iertr = base_iertr; + using write_dirty_ret = write_dirty_iertr::future<>; + virtual write_dirty_ret write_dirty( + Transaction &trans, + const std::vector<OnodeRef> &onodes) = 0; + + using erase_onode_iertr = base_iertr; + using erase_onode_ret = erase_onode_iertr::future<>; + virtual erase_onode_ret erase_onode( + Transaction &trans, + OnodeRef &onode) = 0; + + using list_onodes_iertr = base_iertr; + using list_onodes_bare_ret = std::tuple<std::vector<ghobject_t>, ghobject_t>; + using list_onodes_ret = list_onodes_iertr::future<list_onodes_bare_ret>; + virtual list_onodes_ret list_onodes( + Transaction &trans, + const ghobject_t& start, + const ghobject_t& end, + uint64_t limit) = 0; + + virtual ~OnodeManager() {} +}; +using OnodeManagerRef = std::unique_ptr<OnodeManager>; + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/fltree_onode_manager.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/fltree_onode_manager.cc new file mode 100644 index 000000000..bff27ab65 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/fltree_onode_manager.cc @@ -0,0 +1,183 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#include "crimson/os/seastore/logging.h" + +#include "crimson/os/seastore/onode_manager/staged-fltree/fltree_onode_manager.h" + +SET_SUBSYS(seastore_onode); + +namespace crimson::os::seastore::onode { + +FLTreeOnodeManager::contains_onode_ret FLTreeOnodeManager::contains_onode( + Transaction &trans, + const ghobject_t &hoid) +{ + return tree.contains(trans, hoid); +} + +FLTreeOnodeManager::get_onode_ret FLTreeOnodeManager::get_onode( + Transaction &trans, + const ghobject_t &hoid) +{ + LOG_PREFIX(FLTreeOnodeManager::get_onode); + return tree.find( + trans, hoid + ).si_then([this, &hoid, &trans, FNAME](auto cursor) + -> get_onode_ret { + if (cursor == tree.end()) { + DEBUGT("no entry for {}", trans, hoid); + return crimson::ct_error::enoent::make(); + } + auto val = OnodeRef(new FLTreeOnode( + default_data_reservation, + default_metadata_range, + cursor.value())); + return get_onode_iertr::make_ready_future<OnodeRef>( + val + ); + }); +} + +FLTreeOnodeManager::get_or_create_onode_ret +FLTreeOnodeManager::get_or_create_onode( + Transaction &trans, + const ghobject_t &hoid) +{ + LOG_PREFIX(FLTreeOnodeManager::get_or_create_onode); + return tree.insert( + trans, hoid, + OnodeTree::tree_value_config_t{sizeof(onode_layout_t)} + ).si_then([this, &trans, &hoid, FNAME](auto p) + -> get_or_create_onode_ret { + auto [cursor, created] = std::move(p); + auto val = OnodeRef(new FLTreeOnode( + default_data_reservation, + default_metadata_range, + cursor.value())); + if (created) { + DEBUGT("created onode for entry for {}", trans, hoid); + val->get_mutable_layout(trans) = onode_layout_t{}; + } + return get_or_create_onode_iertr::make_ready_future<OnodeRef>( + val + ); + }); +} + +FLTreeOnodeManager::get_or_create_onodes_ret +FLTreeOnodeManager::get_or_create_onodes( + Transaction &trans, + const std::vector<ghobject_t> &hoids) +{ + return seastar::do_with( + std::vector<OnodeRef>(), + [this, &hoids, &trans](auto &ret) { + ret.reserve(hoids.size()); + return trans_intr::do_for_each( + hoids, + [this, &trans, &ret](auto &hoid) { + return get_or_create_onode(trans, hoid 
+ ).si_then([&ret](auto &&onoderef) { + ret.push_back(std::move(onoderef)); + }); + }).si_then([&ret] { + return std::move(ret); + }); + }); +} + +FLTreeOnodeManager::write_dirty_ret FLTreeOnodeManager::write_dirty( + Transaction &trans, + const std::vector<OnodeRef> &onodes) +{ + return trans_intr::do_for_each( + onodes, + [&trans](auto &onode) -> eagain_ifuture<> { + if (!onode) { + return eagain_iertr::make_ready_future<>(); + } + auto &flonode = static_cast<FLTreeOnode&>(*onode); + if (!flonode.is_alive()) { + return eagain_iertr::make_ready_future<>(); + } + switch (flonode.status) { + case FLTreeOnode::status_t::MUTATED: { + flonode.populate_recorder(trans); + return eagain_iertr::make_ready_future<>(); + } + case FLTreeOnode::status_t::STABLE: { + return eagain_iertr::make_ready_future<>(); + } + default: + __builtin_unreachable(); + } + }); +} + +FLTreeOnodeManager::erase_onode_ret FLTreeOnodeManager::erase_onode( + Transaction &trans, + OnodeRef &onode) +{ + auto &flonode = static_cast<FLTreeOnode&>(*onode); + assert(flonode.is_alive()); + if (flonode.status == FLTreeOnode::status_t::MUTATED) { + flonode.populate_recorder(trans); + } + flonode.mark_delete(); + return tree.erase(trans, flonode); +} + +FLTreeOnodeManager::list_onodes_ret FLTreeOnodeManager::list_onodes( + Transaction &trans, + const ghobject_t& start, + const ghobject_t& end, + uint64_t limit) +{ + return tree.lower_bound(trans, start + ).si_then([this, &trans, end, limit] (auto&& cursor) { + using crimson::os::seastore::onode::full_key_t; + return seastar::do_with( + limit, + std::move(cursor), + list_onodes_bare_ret(), + [this, &trans, end] (auto& to_list, auto& current_cursor, auto& ret) { + return trans_intr::repeat( + [this, &trans, end, &to_list, ¤t_cursor, &ret] () + -> eagain_ifuture<seastar::stop_iteration> { + if (current_cursor.is_end()) { + std::get<1>(ret) = ghobject_t::get_max(); + return seastar::make_ready_future<seastar::stop_iteration>( + seastar::stop_iteration::yes); + } else if (current_cursor.get_ghobj() >= end) { + std::get<1>(ret) = end; + return seastar::make_ready_future<seastar::stop_iteration>( + seastar::stop_iteration::yes); + } + if (to_list == 0) { + std::get<1>(ret) = current_cursor.get_ghobj(); + return seastar::make_ready_future<seastar::stop_iteration>( + seastar::stop_iteration::yes); + } + std::get<0>(ret).emplace_back(current_cursor.get_ghobj()); + return tree.get_next(trans, current_cursor + ).si_then([&to_list, ¤t_cursor] (auto&& next_cursor) mutable { + // we intentionally hold the current_cursor during get_next() to + // accelerate tree lookup. 
+ --to_list; + current_cursor = next_cursor; + return seastar::make_ready_future<seastar::stop_iteration>( + seastar::stop_iteration::no); + }); + }).si_then([&ret] () mutable { + return seastar::make_ready_future<list_onodes_bare_ret>( + std::move(ret)); + // return ret; + }); + }); + }); +} + +FLTreeOnodeManager::~FLTreeOnodeManager() {} + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/fltree_onode_manager.h b/src/crimson/os/seastore/onode_manager/staged-fltree/fltree_onode_manager.h new file mode 100644 index 000000000..09998fbfa --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/fltree_onode_manager.h @@ -0,0 +1,171 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "crimson/os/seastore/onode_manager.h" +#include "crimson/os/seastore/onode_manager/staged-fltree/value.h" +#include "crimson/os/seastore/onode_manager/staged-fltree/tree.h" + +namespace crimson::os::seastore::onode { + +struct FLTreeOnode final : Onode, Value { + static constexpr tree_conf_t TREE_CONF = { + value_magic_t::ONODE, + 256, // max_ns_size + // same to option osd_max_object_namespace_len + 2048, // max_oid_size + // same to option osd_max_object_name_len + 1200, // max_value_payload_size + // see crimson::os::seastore::onode_layout_t + 8192, // internal_node_size + // see the formula in validate_tree_config + 16384 // leaf_node_size + // see the formula in validate_tree_config + }; + + enum class status_t { + STABLE, + MUTATED, + DELETED + } status = status_t::STABLE; + + FLTreeOnode(FLTreeOnode&&) = default; + FLTreeOnode& operator=(FLTreeOnode&&) = delete; + + FLTreeOnode(const FLTreeOnode&) = default; + FLTreeOnode& operator=(const FLTreeOnode&) = delete; + + template <typename... T> + FLTreeOnode(uint32_t ddr, uint32_t dmr, T&&... args) + : Onode(ddr, dmr), + Value(std::forward<T>(args)...) {} + + template <typename... T> + FLTreeOnode(T&&... args) + : Onode(0, 0), + Value(std::forward<T>(args)...) 
{} + + struct Recorder : public ValueDeltaRecorder { + Recorder(bufferlist &bl) : ValueDeltaRecorder(bl) {} + + value_magic_t get_header_magic() const final { + return TREE_CONF.value_magic; + } + + void apply_value_delta( + ceph::bufferlist::const_iterator &bliter, + NodeExtentMutable &value, + laddr_t) final { + assert(value.get_length() == sizeof(onode_layout_t)); + bliter.copy(value.get_length(), value.get_write()); + } + + void record_delta(NodeExtentMutable &value) { + // TODO: probably could use versioning, etc + assert(value.get_length() == sizeof(onode_layout_t)); + ceph::buffer::ptr bptr(value.get_length()); + memcpy(bptr.c_str(), value.get_read(), value.get_length()); + get_encoded(value).append(bptr); + } + }; + + bool is_alive() const { + return status != status_t::DELETED; + } + const onode_layout_t &get_layout() const final { + assert(status != status_t::DELETED); + return *read_payload<onode_layout_t>(); + } + + onode_layout_t &get_mutable_layout(Transaction &t) final { + assert(status != status_t::DELETED); + auto p = prepare_mutate_payload< + onode_layout_t, + Recorder>(t); + status = status_t::MUTATED; + return *reinterpret_cast<onode_layout_t*>(p.first.get_write()); + }; + + void populate_recorder(Transaction &t) { + assert(status == status_t::MUTATED); + auto p = prepare_mutate_payload< + onode_layout_t, + Recorder>(t); + if (p.second) { + p.second->record_delta( + p.first); + } + status = status_t::STABLE; + } + + void mark_delete() { + assert(status != status_t::DELETED); + status = status_t::DELETED; + } + + laddr_t get_hint() const final { + return Value::get_hint(); + } + ~FLTreeOnode() final {} +}; + +using OnodeTree = Btree<FLTreeOnode>; + +using crimson::common::get_conf; + +class FLTreeOnodeManager : public crimson::os::seastore::OnodeManager { + OnodeTree tree; + + uint32_t default_data_reservation = 0; + uint32_t default_metadata_offset = 0; + uint32_t default_metadata_range = 0; +public: + FLTreeOnodeManager(TransactionManager &tm) : + tree(NodeExtentManager::create_seastore(tm)), + default_data_reservation( + get_conf<uint64_t>("seastore_default_max_object_size")), + default_metadata_offset(default_data_reservation), + default_metadata_range( + get_conf<uint64_t>("seastore_default_object_metadata_reservation")) + {} + + mkfs_ret mkfs(Transaction &t) { + return tree.mkfs(t); + } + + contains_onode_ret contains_onode( + Transaction &trans, + const ghobject_t &hoid) final; + + get_onode_ret get_onode( + Transaction &trans, + const ghobject_t &hoid) final; + + get_or_create_onode_ret get_or_create_onode( + Transaction &trans, + const ghobject_t &hoid) final; + + get_or_create_onodes_ret get_or_create_onodes( + Transaction &trans, + const std::vector<ghobject_t> &hoids) final; + + write_dirty_ret write_dirty( + Transaction &trans, + const std::vector<OnodeRef> &onodes) final; + + erase_onode_ret erase_onode( + Transaction &trans, + OnodeRef &onode) final; + + list_onodes_ret list_onodes( + Transaction &trans, + const ghobject_t& start, + const ghobject_t& end, + uint64_t limit) final; + + ~FLTreeOnodeManager(); +}; +using FLTreeOnodeManagerRef = std::unique_ptr<FLTreeOnodeManager>; + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/fwd.h b/src/crimson/os/seastore/onode_manager/staged-fltree/fwd.h new file mode 100644 index 000000000..43f8b87ed --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/fwd.h @@ -0,0 +1,196 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + 
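+// fwd.h collects the forward declarations, scalar type aliases and small
+// inline helpers (MatchKindCMP comparisons, node-size validation,
+// tree_stats_t) shared across the staged-fltree onode manager.
+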
+#pragma once + +#include <algorithm> +#include <cstring> +#include <limits> +#include <memory> +#include <ostream> +#include <string> + +#include "crimson/common/errorator.h" +#include "crimson/os/seastore/cached_extent.h" +#include "crimson/os/seastore/seastore_types.h" +#include "crimson/os/seastore/transaction.h" + +namespace crimson::os::seastore::onode { + +using eagain_iertr = trans_iertr< + crimson::errorator<crimson::ct_error::input_output_error> >; +template <class ValueT=void> +using eagain_ifuture = eagain_iertr::future<ValueT>; + +using crimson::os::seastore::Transaction; +using crimson::os::seastore::TransactionRef; +using crimson::os::seastore::laddr_t; +using crimson::os::seastore::L_ADDR_MIN; +using crimson::os::seastore::L_ADDR_NULL; +using crimson::os::seastore::extent_len_t; + +class DeltaRecorder; +class NodeExtent; +class NodeExtentManager; +class RootNodeTracker; +struct ValueBuilder; +using DeltaRecorderURef = std::unique_ptr<DeltaRecorder>; +using NodeExtentRef = crimson::os::seastore::TCachedExtentRef<NodeExtent>; +using NodeExtentManagerURef = std::unique_ptr<NodeExtentManager>; +using RootNodeTrackerURef = std::unique_ptr<RootNodeTracker>; +struct context_t { + NodeExtentManager& nm; + const ValueBuilder& vb; + Transaction& t; +}; + +class LeafNodeImpl; +class InternalNodeImpl; +class NodeImpl; +using LeafNodeImplURef = std::unique_ptr<LeafNodeImpl>; +using InternalNodeImplURef = std::unique_ptr<InternalNodeImpl>; +using NodeImplURef = std::unique_ptr<NodeImpl>; + +using level_t = uint8_t; +constexpr auto MAX_LEVEL = std::numeric_limits<level_t>::max(); + +// a type only to index within a node, 32 bits should be enough +using index_t = uint32_t; +constexpr auto INDEX_END = std::numeric_limits<index_t>::max(); +constexpr auto INDEX_LAST = INDEX_END - 0x4; +constexpr auto INDEX_UPPER_BOUND = INDEX_END - 0x8; +inline bool is_valid_index(index_t index) { return index < INDEX_UPPER_BOUND; } + +// we support up to 64 KiB tree nodes +using node_offset_t = uint16_t; +constexpr node_offset_t DISK_BLOCK_SIZE = 1u << 12; +constexpr auto MAX_NODE_SIZE = + (extent_len_t)std::numeric_limits<node_offset_t>::max() + 1; +inline bool is_valid_node_size(extent_len_t node_size) { + return (node_size > 0 && + node_size <= MAX_NODE_SIZE && + node_size % DISK_BLOCK_SIZE == 0); +} + +using string_size_t = uint16_t; + +enum class MatchKindBS : int8_t { NE = -1, EQ = 0 }; + +enum class MatchKindCMP : int8_t { LT = -1, EQ = 0, GT }; +inline MatchKindCMP toMatchKindCMP(int value) { + if (value > 0) { + return MatchKindCMP::GT; + } else if (value < 0) { + return MatchKindCMP::LT; + } else { + return MatchKindCMP::EQ; + } +} +template <typename Type> +MatchKindCMP toMatchKindCMP(const Type& l, const Type& r) { + if (l > r) { + return MatchKindCMP::GT; + } else if (l < r) { + return MatchKindCMP::LT; + } else { + return MatchKindCMP::EQ; + } +} + +inline MatchKindCMP toMatchKindCMP( + std::string_view l, std::string_view r) { + return toMatchKindCMP(l.compare(r)); +} + +inline MatchKindCMP reverse(MatchKindCMP cmp) { + if (cmp == MatchKindCMP::LT) { + return MatchKindCMP::GT; + } else if (cmp == MatchKindCMP::GT) { + return MatchKindCMP::LT; + } else { + return cmp; + } +} + +struct tree_stats_t { + size_t size_persistent_leaf = 0; + size_t size_persistent_internal = 0; + size_t size_filled_leaf = 0; + size_t size_filled_internal = 0; + size_t size_logical_leaf = 0; + size_t size_logical_internal = 0; + size_t size_overhead_leaf = 0; + size_t size_overhead_internal = 0; + size_t 
size_value_leaf = 0; + size_t size_value_internal = 0; + unsigned num_kvs_leaf = 0; + unsigned num_kvs_internal = 0; + unsigned num_nodes_leaf = 0; + unsigned num_nodes_internal = 0; + unsigned height = 0; + + size_t size_persistent() const { + return size_persistent_leaf + size_persistent_internal; } + size_t size_filled() const { + return size_filled_leaf + size_filled_internal; } + size_t size_logical() const { + return size_logical_leaf + size_logical_internal; } + size_t size_overhead() const { + return size_overhead_leaf + size_overhead_internal; } + size_t size_value() const { + return size_value_leaf + size_value_internal; } + unsigned num_kvs() const { + return num_kvs_leaf + num_kvs_internal; } + unsigned num_nodes() const { + return num_nodes_leaf + num_nodes_internal; } + + double ratio_fullness() const { + return (double)size_filled() / size_persistent(); } + double ratio_key_compression() const { + return (double)(size_filled() - size_value()) / (size_logical() - size_value()); } + double ratio_overhead() const { + return (double)size_overhead() / size_filled(); } + double ratio_keys_leaf() const { + return (double)num_kvs_leaf / num_kvs(); } + double ratio_nodes_leaf() const { + return (double)num_nodes_leaf / num_nodes(); } + double ratio_filled_leaf() const { + return (double)size_filled_leaf / size_filled(); } +}; +inline std::ostream& operator<<(std::ostream& os, const tree_stats_t& stats) { + os << "Tree stats:" + << "\n height = " << stats.height + << "\n num values = " << stats.num_kvs_leaf + << "\n num nodes = " << stats.num_nodes() + << " (leaf=" << stats.num_nodes_leaf + << ", internal=" << stats.num_nodes_internal << ")" + << "\n size persistent = " << stats.size_persistent() << "B" + << "\n size filled = " << stats.size_filled() << "B" + << " (value=" << stats.size_value_leaf << "B" + << ", rest=" << stats.size_filled() - stats.size_value_leaf << "B)" + << "\n size logical = " << stats.size_logical() << "B" + << "\n size overhead = " << stats.size_overhead() << "B" + << "\n ratio fullness = " << stats.ratio_fullness() + << "\n ratio keys leaf = " << stats.ratio_keys_leaf() + << "\n ratio nodes leaf = " << stats.ratio_nodes_leaf() + << "\n ratio filled leaf = " << stats.ratio_filled_leaf() + << "\n ratio key compression = " << stats.ratio_key_compression(); + assert(stats.num_kvs_internal + 1 == stats.num_nodes()); + return os; +} + +template <typename PtrType> +void reset_ptr(PtrType& ptr, const char* origin_base, + const char* new_base, extent_len_t node_size) { + assert((const char*)ptr > origin_base); + assert((const char*)ptr - origin_base < (int)node_size); + ptr = reinterpret_cast<PtrType>( + (const char*)ptr - origin_base + new_base); +} + +} + +#if FMT_VERSION >= 90000 +template<> +struct fmt::formatter<crimson::os::seastore::onode::tree_stats_t> : fmt::ostream_formatter {}; +#endif diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/node.cc new file mode 100644 index 000000000..6f08f4d3c --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node.cc @@ -0,0 +1,2282 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#include "node.h" + +#include <cassert> +#include <exception> +#include <sstream> + +#include "common/likely.h" +#include "crimson/common/utility.h" +#include "crimson/os/seastore/logging.h" + +#include "node_extent_manager.h" +#include "node_impl.h" +#include "stages/node_stage_layout.h" + 
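The node code below leans on the small comparison and size helpers defined in fwd.h above. As a quick illustration of their semantics, here is a minimal sketch added by the editor (not part of the upstream commit); it assumes the Ceph source tree is on the include path and only exercises functions visible in fwd.h:

// Illustrative only: exercises the inline helpers declared in fwd.h.
#include <cassert>
#include <string_view>
#include "crimson/os/seastore/onode_manager/staged-fltree/fwd.h"

inline void fwd_helpers_demo() {
  using namespace crimson::os::seastore::onode;
  // Lexicographic string comparison mapped onto LT/EQ/GT.
  assert(toMatchKindCMP(std::string_view{"abc"}, std::string_view{"abd"}) ==
         MatchKindCMP::LT);
  // The generic overload works for any ordered type.
  assert(toMatchKindCMP(2u, 1u) == MatchKindCMP::GT);
  // reverse() flips LT/GT and keeps EQ.
  assert(reverse(MatchKindCMP::LT) == MatchKindCMP::GT);
  // Node sizes must be positive, 4 KiB aligned and at most 64 KiB.
  assert(is_valid_node_size(16384));
  assert(!is_valid_node_size(16384 + 1));
}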
+SET_SUBSYS(seastore_onode); + +namespace fmt { +template <typename T> +const void* ptr(const ::boost::intrusive_ptr<T>& p) { + return p.get(); +} +} + +namespace crimson::os::seastore::onode { +/* + * tree_cursor_t + */ + +// create from insert +tree_cursor_t::tree_cursor_t(Ref<LeafNode> node, const search_position_t& pos) + : ref_leaf_node{node}, position{pos}, cache{ref_leaf_node} +{ + assert(is_tracked()); + ref_leaf_node->do_track_cursor<true>(*this); + // do not account updates for the inserted values + is_mutated = true; +} + +// create from lookup +tree_cursor_t::tree_cursor_t( + Ref<LeafNode> node, const search_position_t& pos, + const key_view_t& key_view, const value_header_t* p_value_header) + : ref_leaf_node{node}, position{pos}, cache{ref_leaf_node} +{ + assert(is_tracked()); + update_cache_same_node(key_view, p_value_header); + ref_leaf_node->do_track_cursor<true>(*this); +} + +// lookup reaches the end, contain leaf node for further insert +tree_cursor_t::tree_cursor_t(Ref<LeafNode> node) + : ref_leaf_node{node}, position{search_position_t::end()}, cache{ref_leaf_node} +{ + assert(is_end()); + assert(ref_leaf_node->is_level_tail()); +} + +// create an invalid tree_cursor_t +tree_cursor_t::~tree_cursor_t() +{ + if (is_tracked()) { + ref_leaf_node->do_untrack_cursor(*this); + } +} + +eagain_ifuture<Ref<tree_cursor_t>> +tree_cursor_t::get_next(context_t c) +{ + assert(is_tracked()); + return ref_leaf_node->get_next_cursor(c, position); +} + +void tree_cursor_t::assert_next_to( + const tree_cursor_t& prv, value_magic_t magic) const +{ +#ifndef NDEBUG + assert(!prv.is_end()); + if (is_end()) { + assert(ref_leaf_node == prv.ref_leaf_node); + assert(ref_leaf_node->is_level_tail()); + } else if (is_tracked()) { + auto key = get_key_view(magic); + auto prv_key = prv.get_key_view(magic); + assert(key > prv_key); + if (ref_leaf_node == prv.ref_leaf_node) { + position.assert_next_to(prv.position); + } else { + assert(!prv.ref_leaf_node->is_level_tail()); + assert(position == search_position_t::begin()); + } + } else { + assert(is_invalid()); + ceph_abort("impossible"); + } +#endif +} + +template <bool FORCE_MERGE> +eagain_ifuture<Ref<tree_cursor_t>> +tree_cursor_t::erase(context_t c, bool get_next) +{ + assert(is_tracked()); + return ref_leaf_node->erase<FORCE_MERGE>(c, position, get_next); +} +template eagain_ifuture<Ref<tree_cursor_t>> +tree_cursor_t::erase<true>(context_t, bool); +template eagain_ifuture<Ref<tree_cursor_t>> +tree_cursor_t::erase<false>(context_t, bool); + +std::strong_ordering tree_cursor_t::compare_to( + const tree_cursor_t& o, value_magic_t magic) const +{ + if (!is_tracked() && !o.is_tracked()) { + return std::strong_ordering::equal; + } else if (!is_tracked()) { + return std::strong_ordering::greater; + } else if (!o.is_tracked()) { + return std::strong_ordering::less; + } + + assert(is_tracked() && o.is_tracked()); + // all tracked cursors are singletons + if (this == &o) { + return std::strong_ordering::equal; + } + + std::strong_ordering ret = std::strong_ordering::equal; + if (ref_leaf_node == o.ref_leaf_node) { + ret = position <=> o.position; + } else { + auto key = get_key_view(magic); + auto o_key = o.get_key_view(magic); + ret = key <=> o_key; + } + assert(ret != 0); + return ret; +} + +eagain_ifuture<> +tree_cursor_t::extend_value(context_t c, value_size_t extend_size) +{ + assert(is_tracked()); + return ref_leaf_node->extend_value(c, position, extend_size); +} + +eagain_ifuture<> +tree_cursor_t::trim_value(context_t c, value_size_t trim_size) +{ + 
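+  // As with extend_value() above, simply delegate to the leaf node that
+  // owns the tracked position.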
assert(is_tracked()); + return ref_leaf_node->trim_value(c, position, trim_size); +} + +template <bool VALIDATE> +void tree_cursor_t::update_track( + Ref<LeafNode> node, const search_position_t& pos) +{ + // I must be already untracked + assert(is_tracked()); + assert(!ref_leaf_node->check_is_tracking(*this)); + // track the new node and new pos + assert(!pos.is_end()); + ref_leaf_node = node; + position = pos; + // we lazy update the key/value information until user asked + cache.invalidate(); + ref_leaf_node->do_track_cursor<VALIDATE>(*this); +} +template void tree_cursor_t::update_track<true>(Ref<LeafNode>, const search_position_t&); +template void tree_cursor_t::update_track<false>(Ref<LeafNode>, const search_position_t&); + +void tree_cursor_t::update_cache_same_node(const key_view_t& key_view, + const value_header_t* p_value_header) const +{ + assert(is_tracked()); + cache.update_all(ref_leaf_node->get_version(), key_view, p_value_header); + cache.validate_is_latest(position); +} + +void tree_cursor_t::invalidate() +{ + assert(is_tracked()); + ref_leaf_node.reset(); + assert(is_invalid()); + // I must be removed from LeafNode +} + +/* + * tree_cursor_t::Cache + */ + +tree_cursor_t::Cache::Cache(Ref<LeafNode>& ref_leaf_node) + : ref_leaf_node{ref_leaf_node} {} + +void tree_cursor_t::Cache::update_all(const node_version_t& current_version, + const key_view_t& _key_view, + const value_header_t* _p_value_header) +{ + assert(_p_value_header); + + needs_update_all = false; + version = current_version; + + p_node_base = ref_leaf_node->read(); + key_view = _key_view; + p_value_header = _p_value_header; + assert((const char*)p_value_header > p_node_base); + assert((const char*)p_value_header - p_node_base < + (int)ref_leaf_node->get_node_size()); + + value_payload_mut.reset(); + p_value_recorder = nullptr; +} + +void tree_cursor_t::Cache::maybe_duplicate(const node_version_t& current_version) +{ + assert(!needs_update_all); + assert(version.layout == current_version.layout); + if (version.state == current_version.state) { + // cache is already latest. + } else if (version.state < current_version.state) { + // the extent has been copied but the layout has not been changed. + assert(p_node_base != nullptr); + assert(key_view.has_value()); + assert(p_value_header != nullptr); + + auto current_p_node_base = ref_leaf_node->read(); + assert(current_p_node_base != p_node_base); + auto node_size = ref_leaf_node->get_node_size(); + + version.state = current_version.state; + reset_ptr(p_value_header, p_node_base, + current_p_node_base, node_size); + key_view->reset_to(p_node_base, current_p_node_base, node_size); + value_payload_mut.reset(); + p_value_recorder = nullptr; + + p_node_base = current_p_node_base; + } else { + // It is impossible to change state backwards, see node_types.h. 
+ ceph_abort("impossible"); + } +} + +void tree_cursor_t::Cache::make_latest( + value_magic_t magic, const search_position_t& pos) +{ + auto current_version = ref_leaf_node->get_version(); + if (needs_update_all || version.layout != current_version.layout) { + auto [_key_view, _p_value_header] = ref_leaf_node->get_kv(pos); + update_all(current_version, _key_view, _p_value_header); + } else { + maybe_duplicate(current_version); + } + assert(p_value_header->magic == magic); + validate_is_latest(pos); +} + +void tree_cursor_t::Cache::validate_is_latest(const search_position_t& pos) const +{ +#ifndef NDEBUG + assert(!needs_update_all); + assert(version == ref_leaf_node->get_version()); + + auto [_key_view, _p_value_header] = ref_leaf_node->get_kv(pos); + assert(p_node_base == ref_leaf_node->read()); + assert(key_view ==_key_view); + assert(p_value_header == _p_value_header); +#endif +} + +std::pair<NodeExtentMutable&, ValueDeltaRecorder*> +tree_cursor_t::Cache::prepare_mutate_value_payload( + context_t c, const search_position_t& pos) +{ + make_latest(c.vb.get_header_magic(), pos); + if (!value_payload_mut.has_value()) { + assert(!p_value_recorder); + auto value_mutable = ref_leaf_node->prepare_mutate_value_payload(c); + auto current_version = ref_leaf_node->get_version(); + maybe_duplicate(current_version); + value_payload_mut = p_value_header->get_payload_mutable(value_mutable.first); + p_value_recorder = value_mutable.second; + validate_is_latest(pos); + } + return {*value_payload_mut, p_value_recorder}; +} + +/* + * Node + */ + +Node::Node(NodeImplURef&& impl) : impl{std::move(impl)} {} + +Node::~Node() +{ + if (!is_tracked()) { + // possible scenarios: + // a. I'm erased; + // b. Eagain happened after the node extent is allocated/loaded + // and before the node is initialized correctly; + } else { + assert(!impl->is_extent_retired()); + if (is_root()) { + super->do_untrack_root(*this); + } else { + _parent_info->ptr->do_untrack_child(*this); + } + } +} + +level_t Node::level() const +{ + return impl->level(); +} + +eagain_ifuture<Node::search_result_t> Node::lower_bound( + context_t c, const key_hobj_t& key) +{ + return seastar::do_with( + MatchHistory(), [this, c, &key](auto& history) { + return lower_bound_tracked(c, key, history); + } + ); +} + +eagain_ifuture<std::pair<Ref<tree_cursor_t>, bool>> Node::insert( + context_t c, + const key_hobj_t& key, + value_config_t vconf, + Ref<Node>&& this_ref) +{ + return seastar::do_with( + MatchHistory(), [this, c, &key, vconf, + this_ref = std::move(this_ref)] (auto& history) mutable { + return lower_bound_tracked(c, key, history + ).si_then([c, &key, vconf, &history, + this_ref = std::move(this_ref)] (auto result) mutable { + // the cursor in the result should already hold the root node upwards + this_ref.reset(); + if (result.match() == MatchKindBS::EQ) { + return eagain_iertr::make_ready_future<std::pair<Ref<tree_cursor_t>, bool>>( + std::make_pair(result.p_cursor, false)); + } else { + auto leaf_node = result.p_cursor->get_leaf_node(); + return leaf_node->insert_value( + c, key, vconf, result.p_cursor->get_position(), history, result.mstat + ).si_then([](auto p_cursor) { + return seastar::make_ready_future<std::pair<Ref<tree_cursor_t>, bool>>( + std::make_pair(p_cursor, true)); + }); + } + }); + } + ); +} + +eagain_ifuture<std::size_t> Node::erase( + context_t c, + const key_hobj_t& key, + Ref<Node>&& this_ref) +{ + return lower_bound(c, key + ).si_then([c, this_ref = std::move(this_ref)] (auto result) mutable { + // the cursor in the result 
should already hold the root node upwards + this_ref.reset(); + if (result.match() != MatchKindBS::EQ) { + return eagain_iertr::make_ready_future<std::size_t>(0); + } + auto ref_cursor = result.p_cursor; + return ref_cursor->erase(c, false + ).si_then([ref_cursor] (auto next_cursor) { + assert(ref_cursor->is_invalid()); + assert(!next_cursor); + return std::size_t(1); + }); + }); +} + +eagain_ifuture<tree_stats_t> Node::get_tree_stats(context_t c) +{ + return seastar::do_with( + tree_stats_t(), [this, c](auto& stats) { + return do_get_tree_stats(c, stats).si_then([&stats] { + return stats; + }); + } + ); +} + +std::ostream& Node::dump(std::ostream& os) const +{ + return impl->dump(os); +} + +std::ostream& Node::dump_brief(std::ostream& os) const +{ + return impl->dump_brief(os); +} + +const std::string& Node::get_name() const +{ + return impl->get_name(); +} + +void Node::test_make_destructable( + context_t c, NodeExtentMutable& mut, Super::URef&& _super) +{ + impl->test_set_tail(mut); + make_root(c, std::move(_super)); +} + +eagain_ifuture<> Node::mkfs(context_t c, RootNodeTracker& root_tracker) +{ + LOG_PREFIX(OTree::Node::mkfs); + return LeafNode::allocate_root(c, root_tracker + ).si_then([c, FNAME](auto ret) { + c.t.get_onode_tree_stats().extents_num_delta++; + INFOT("allocated root {}", c.t, ret->get_name()); + }); +} + +eagain_ifuture<Ref<Node>> Node::load_root(context_t c, RootNodeTracker& root_tracker) +{ + LOG_PREFIX(OTree::Node::load_root); + return c.nm.get_super(c.t, root_tracker + ).handle_error_interruptible( + eagain_iertr::pass_further{}, + crimson::ct_error::input_output_error::handle([FNAME, c] { + ERRORT("EIO during get_super()", c.t); + ceph_abort("fatal error"); + }) + ).si_then([c, &root_tracker, FNAME](auto&& _super) { + assert(_super); + auto root_addr = _super->get_root_laddr(); + assert(root_addr != L_ADDR_NULL); + TRACET("loading root_addr={:x} ...", c.t, root_addr); + return Node::load(c, root_addr, true + ).si_then([c, _super = std::move(_super), + &root_tracker, FNAME](auto root) mutable { + TRACET("loaded {}", c.t, root->get_name()); + assert(root->impl->field_type() == field_type_t::N0); + root->as_root(std::move(_super)); + std::ignore = c; // as only used in an assert + std::ignore = root_tracker; + assert(root == root_tracker.get_root(c.t)); + return seastar::make_ready_future<Ref<Node>>(root); + }); + }); +} + +void Node::make_root(context_t c, Super::URef&& _super) +{ + _super->write_root_laddr(c, impl->laddr()); + as_root(std::move(_super)); + c.t.get_onode_tree_stats().depth = static_cast<uint64_t>(level()) + 1; +} + +void Node::as_root(Super::URef&& _super) +{ + assert(!is_tracked()); + assert(_super->get_root_laddr() == impl->laddr()); + assert(impl->is_level_tail()); + super = std::move(_super); + super->do_track_root(*this); + assert(is_root()); +} + +Super::URef Node::deref_super() +{ + assert(is_root()); + assert(super->get_root_laddr() == impl->laddr()); + assert(impl->is_level_tail()); + super->do_untrack_root(*this); + auto ret = std::move(super); + assert(!is_tracked()); + return ret; +} + +eagain_ifuture<> Node::upgrade_root(context_t c, laddr_t hint) +{ + LOG_PREFIX(OTree::Node::upgrade_root); + assert(impl->field_type() == field_type_t::N0); + auto super_to_move = deref_super(); + return InternalNode::allocate_root( + c, hint, impl->level(), impl->laddr(), std::move(super_to_move) + ).si_then([this, c, FNAME](auto new_root) { + as_child(search_position_t::end(), new_root); + INFOT("upgraded from {} to {}", + c.t, get_name(), 
new_root->get_name()); + }); +} + +template <bool VALIDATE> +void Node::as_child(const search_position_t& pos, Ref<InternalNode> parent_node) +{ + assert(!is_tracked() || !is_root()); +#ifndef NDEBUG + // Although I might have an outdated _parent_info during fixing, + // I must be already untracked. + if (_parent_info.has_value()) { + assert(!_parent_info->ptr->check_is_tracking(*this)); + } +#endif + _parent_info = parent_info_t{pos, parent_node}; + parent_info().ptr->do_track_child<VALIDATE>(*this); + assert(!is_root()); +} +template void Node::as_child<true>(const search_position_t&, Ref<InternalNode>); +template void Node::as_child<false>(const search_position_t&, Ref<InternalNode>); + +Ref<InternalNode> Node::deref_parent() +{ + assert(!is_root()); + auto parent_ref = std::move(parent_info().ptr); + parent_ref->do_untrack_child(*this); + _parent_info.reset(); + assert(!is_tracked()); + return parent_ref; +} + +eagain_ifuture<> Node::apply_split_to_parent( + context_t c, + Ref<Node>&& this_ref, + Ref<Node>&& split_right, + bool update_right_index) +{ + assert(!is_root()); + assert(this == this_ref.get()); + // TODO(cross-node string dedup) + return parent_info().ptr->apply_child_split( + c, std::move(this_ref), std::move(split_right), update_right_index); +} + +eagain_ifuture<Ref<tree_cursor_t>> +Node::get_next_cursor_from_parent(context_t c) +{ + assert(!impl->is_level_tail()); + assert(!is_root()); + return parent_info().ptr->get_next_cursor(c, parent_info().position); +} + +template <bool FORCE_MERGE> +eagain_ifuture<> +Node::try_merge_adjacent( + context_t c, bool update_parent_index, Ref<Node>&& this_ref) +{ + LOG_PREFIX(OTree::Node::try_merge_adjacent); + assert(this == this_ref.get()); + impl->validate_non_empty(); + assert(!is_root()); + if constexpr (!FORCE_MERGE) { + if (!impl->is_size_underflow() && + !impl->has_single_value()) { + // skip merge + if (update_parent_index) { + return fix_parent_index(c, std::move(this_ref), false); + } else { + parent_info().ptr->validate_child_tracked(*this); + return eagain_iertr::now(); + } + } + } + + return parent_info().ptr->get_child_peers(c, parent_info().position + ).si_then([c, this_ref = std::move(this_ref), this, FNAME, + update_parent_index] (auto lr_nodes) mutable -> eagain_ifuture<> { + auto& [lnode, rnode] = lr_nodes; + Ref<Node> left_for_merge; + Ref<Node> right_for_merge; + Ref<Node>* p_this_ref; + bool is_left; + if (!lnode && !rnode) { + // XXX: this is possible before node rebalance is implemented, + // when its parent cannot merge with its peers and has only one child + // (this node). 
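+      // With no peer on either side there is nothing to merge; fall through
+      // to the index-fix (or no-op) path at the bottom of this continuation.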
+ p_this_ref = &this_ref; + } else if (!lnode) { + left_for_merge = std::move(this_ref); + p_this_ref = &left_for_merge; + right_for_merge = std::move(rnode); + is_left = true; + } else if (!rnode) { + left_for_merge = std::move(lnode); + right_for_merge = std::move(this_ref); + p_this_ref = &right_for_merge; + is_left = false; + } else { // lnode && rnode + if (lnode->impl->free_size() > rnode->impl->free_size()) { + left_for_merge = std::move(lnode); + right_for_merge = std::move(this_ref); + p_this_ref = &right_for_merge; + is_left = false; + } else { // lnode free size <= rnode free size + left_for_merge = std::move(this_ref); + p_this_ref = &left_for_merge; + right_for_merge = std::move(rnode); + is_left = true; + } + } + + if (left_for_merge) { + assert(right_for_merge); + auto [merge_stage, merge_size] = left_for_merge->impl->evaluate_merge( + *right_for_merge->impl); + if (merge_size <= left_for_merge->impl->total_size()) { + // proceed merge + bool update_index_after_merge; + if (is_left) { + update_index_after_merge = false; + } else { + update_index_after_merge = update_parent_index; + } + DEBUGT("merge {} and {} at merge_stage={}, merge_size={}B, " + "update_index={}, is_left={} ...", + c.t, left_for_merge->get_name(), right_for_merge->get_name(), + merge_stage, merge_size, update_index_after_merge, is_left); + // we currently cannot generate delta depends on another extent content, + // so use rebuild_extent() as a workaround to rebuild the node from a + // fresh extent, thus no need to generate delta. + auto left_addr = left_for_merge->impl->laddr(); + return left_for_merge->rebuild_extent(c + ).si_then([c, update_index_after_merge, + left_addr, + merge_stage = merge_stage, + merge_size = merge_size, + left_for_merge = std::move(left_for_merge), + right_for_merge = std::move(right_for_merge)] (auto left_mut) mutable { + if (left_for_merge->impl->node_type() == node_type_t::LEAF) { + auto& left = *static_cast<LeafNode*>(left_for_merge.get()); + left.on_layout_change(); + } + search_position_t left_last_pos = left_for_merge->impl->merge( + left_mut, *right_for_merge->impl, merge_stage, merge_size); + left_for_merge->track_merge(right_for_merge, merge_stage, left_last_pos); + --(c.t.get_onode_tree_stats().extents_num_delta); + return left_for_merge->parent_info().ptr->apply_children_merge( + c, std::move(left_for_merge), left_addr, + std::move(right_for_merge), update_index_after_merge); + }); + } else { + // size will overflow if merge + } + } + + // cannot merge + if (update_parent_index) { + return fix_parent_index(c, std::move(*p_this_ref), false); + } else { + parent_info().ptr->validate_child_tracked(*this); + return eagain_iertr::now(); + } + // XXX: rebalance + }); +} +template eagain_ifuture<> Node::try_merge_adjacent<true>(context_t, bool, Ref<Node>&&); +template eagain_ifuture<> Node::try_merge_adjacent<false>(context_t, bool, Ref<Node>&&); + +eagain_ifuture<> Node::erase_node(context_t c, Ref<Node>&& this_ref) +{ + // To erase a node: + // 1. I'm supposed to have already untracked any children or cursors + // 2. unlink parent/super --ptr-> me + // 3. unlink me --ref-> parent/super + // 4. retire extent + // 5. 
destruct node + assert(this_ref.get() == this); + assert(!is_tracking()); + assert(!is_root()); + assert(this_ref->use_count() == 1); + return parent_info().ptr->erase_child(c, std::move(this_ref)); +} + +template <bool FORCE_MERGE> +eagain_ifuture<> Node::fix_parent_index( + context_t c, Ref<Node>&& this_ref, bool check_downgrade) +{ + assert(!is_root()); + assert(this == this_ref.get()); + return parent_info().ptr->fix_index<FORCE_MERGE>( + c, std::move(this_ref), check_downgrade); +} +template eagain_ifuture<> Node::fix_parent_index<true>(context_t, Ref<Node>&&, bool); +template eagain_ifuture<> Node::fix_parent_index<false>(context_t, Ref<Node>&&, bool); + +eagain_ifuture<Ref<Node>> Node::load( + context_t c, laddr_t addr, bool expect_is_level_tail) +{ + LOG_PREFIX(OTree::Node::load); + return c.nm.read_extent(c.t, addr + ).handle_error_interruptible( + eagain_iertr::pass_further{}, + crimson::ct_error::input_output_error::handle( + [FNAME, c, addr, expect_is_level_tail] { + ERRORT("EIO -- addr={:x}, is_level_tail={}", + c.t, addr, expect_is_level_tail); + ceph_abort("fatal error"); + }), + crimson::ct_error::invarg::handle( + [FNAME, c, addr, expect_is_level_tail] { + ERRORT("EINVAL -- addr={:x}, is_level_tail={}", + c.t, addr, expect_is_level_tail); + ceph_abort("fatal error"); + }), + crimson::ct_error::enoent::handle( + [FNAME, c, addr, expect_is_level_tail] { + ERRORT("ENOENT -- addr={:x}, is_level_tail={}", + c.t, addr, expect_is_level_tail); + ceph_abort("fatal error"); + }), + crimson::ct_error::erange::handle( + [FNAME, c, addr, expect_is_level_tail] { + ERRORT("ERANGE -- addr={:x}, is_level_tail={}", + c.t, addr, expect_is_level_tail); + ceph_abort("fatal error"); + }) + ).si_then([FNAME, c, addr, expect_is_level_tail](auto extent) + -> eagain_ifuture<Ref<Node>> { + assert(extent); + auto header = extent->get_header(); + auto field_type = header.get_field_type(); + if (!field_type) { + ERRORT("load addr={:x}, is_level_tail={} error, " + "got invalid header -- {}", + c.t, addr, expect_is_level_tail, fmt::ptr(extent)); + ceph_abort("fatal error"); + } + if (header.get_is_level_tail() != expect_is_level_tail) { + ERRORT("load addr={:x}, is_level_tail={} error, " + "is_level_tail mismatch -- {}", + c.t, addr, expect_is_level_tail, fmt::ptr(extent)); + ceph_abort("fatal error"); + } + + auto node_type = header.get_node_type(); + if (node_type == node_type_t::LEAF) { + if (extent->get_length() != c.vb.get_leaf_node_size()) { + ERRORT("load addr={:x}, is_level_tail={} error, " + "leaf length mismatch -- {}", + c.t, addr, expect_is_level_tail, fmt::ptr(extent)); + ceph_abort("fatal error"); + } + auto impl = LeafNodeImpl::load(extent, *field_type); + auto *derived_ptr = impl.get(); + return eagain_iertr::make_ready_future<Ref<Node>>( + new LeafNode(derived_ptr, std::move(impl))); + } else if (node_type == node_type_t::INTERNAL) { + if (extent->get_length() != c.vb.get_internal_node_size()) { + ERRORT("load addr={:x}, is_level_tail={} error, " + "internal length mismatch -- {}", + c.t, addr, expect_is_level_tail, fmt::ptr(extent)); + ceph_abort("fatal error"); + } + auto impl = InternalNodeImpl::load(extent, *field_type); + auto *derived_ptr = impl.get(); + return eagain_iertr::make_ready_future<Ref<Node>>( + new InternalNode(derived_ptr, std::move(impl))); + } else { + ceph_abort("impossible path"); + } + }); +} + +eagain_ifuture<NodeExtentMutable> Node::rebuild_extent(context_t c) +{ + LOG_PREFIX(OTree::Node::rebuild_extent); + DEBUGT("{} ...", c.t, get_name()); + 
assert(!is_root()); + // assume I'm already ref counted by caller + + // note: laddr can be changed after rebuild, but we don't fix the parent + // mapping as it is part of the merge process. + return impl->rebuild_extent(c); +} + +eagain_ifuture<> Node::retire(context_t c, Ref<Node>&& this_ref) +{ + LOG_PREFIX(OTree::Node::retire); + DEBUGT("{} ...", c.t, get_name()); + assert(this_ref.get() == this); + assert(!is_tracking()); + assert(!is_tracked()); + assert(this_ref->use_count() == 1); + + return impl->retire_extent(c + ).si_then([this_ref = std::move(this_ref)]{ /* deallocate node */}); +} + +void Node::make_tail(context_t c) +{ + LOG_PREFIX(OTree::Node::make_tail); + assert(!impl->is_level_tail()); + assert(!impl->is_keys_empty()); + DEBUGT("{} ...", c.t, get_name()); + impl->prepare_mutate(c); + auto tail_pos = impl->make_tail(); + if (impl->node_type() == node_type_t::INTERNAL) { + auto& node = *static_cast<InternalNode*>(this); + node.track_make_tail(tail_pos); + } +} + +/* + * InternalNode + */ + +InternalNode::InternalNode(InternalNodeImpl* impl, NodeImplURef&& impl_ref) + : Node(std::move(impl_ref)), impl{impl} {} + +eagain_ifuture<Ref<tree_cursor_t>> +InternalNode::get_next_cursor(context_t c, const search_position_t& pos) +{ + impl->validate_non_empty(); + if (pos.is_end()) { + assert(impl->is_level_tail()); + return get_next_cursor_from_parent(c); + } + + search_position_t next_pos = pos; + const laddr_packed_t* p_child_addr = nullptr; + impl->get_next_slot(next_pos, nullptr, &p_child_addr); + if (next_pos.is_end() && !impl->is_level_tail()) { + return get_next_cursor_from_parent(c); + } else { + if (next_pos.is_end()) { + p_child_addr = impl->get_tail_value(); + } + assert(p_child_addr); + return get_or_track_child(c, next_pos, p_child_addr->value + ).si_then([c](auto child) { + return child->lookup_smallest(c); + }); + } +} + +eagain_ifuture<> InternalNode::apply_child_split( + context_t c, Ref<Node>&& left_child, Ref<Node>&& right_child, + bool update_right_index) +{ + LOG_PREFIX(OTree::InternalNode::apply_child_split); + auto& left_pos = left_child->parent_info().position; + +#ifndef NDEBUG + assert(left_child->parent_info().ptr.get() == this); + assert(!left_child->impl->is_level_tail()); + if (left_pos.is_end()) { + assert(impl->is_level_tail()); + assert(right_child->impl->is_level_tail()); + assert(!update_right_index); + } + + // right_child has not assigned parent yet + assert(!right_child->is_tracked()); +#endif + + impl->prepare_mutate(c); + + DEBUGT("apply {}'s child {} to split to {}, update_index={} ...", + c.t, get_name(), left_child->get_name(), + right_child->get_name(), update_right_index); + + // update layout from left_pos => left_child_addr to right_child_addr + auto left_child_addr = left_child->impl->laddr(); + auto right_child_addr = right_child->impl->laddr(); + impl->replace_child_addr(left_pos, right_child_addr, left_child_addr); + + // update track from left_pos => left_child to right_child + replace_track(right_child, left_child, update_right_index); + + auto left_key = *left_child->impl->get_pivot_index(); + Ref<Node> this_ref = this; + return insert_or_split( + c, left_pos, left_key, left_child, + (update_right_index ? right_child : nullptr) + ).si_then([this, c, + this_ref = std::move(this_ref)] (auto split_right) mutable { + if (split_right) { + // even if update_right_index could be true, + // we haven't fixed the right_child index of this node yet, + // so my parent index should be correct now. 
+ return apply_split_to_parent( + c, std::move(this_ref), std::move(split_right), false); + } else { + return eagain_iertr::now(); + } + }).si_then([c, update_right_index, + right_child = std::move(right_child)] () mutable { + if (update_right_index) { + // XXX: might not need to call validate_tracked_children() in fix_index() + return right_child->fix_parent_index(c, std::move(right_child), false); + } else { + // there is no need to call try_merge_adjacent() because + // the filled size of the inserted node or the split right node + // won't be reduced if update_right_index is false. + return eagain_iertr::now(); + } + }); +} + +eagain_ifuture<> InternalNode::erase_child(context_t c, Ref<Node>&& child_ref) +{ + LOG_PREFIX(OTree::InternalNode::erase_child); + // this is a special version of recursive merge + impl->validate_non_empty(); + assert(child_ref->use_count() == 1); + validate_child_tracked(*child_ref); + + // fix the child's previous node as the new tail, + // and trigger prv_child_ref->try_merge_adjacent() at the end + bool fix_tail = (child_ref->parent_info().position.is_end() && + !impl->is_keys_empty()); + return eagain_iertr::now().si_then([c, this, fix_tail] { + if (fix_tail) { + search_position_t new_tail_pos; + const laddr_packed_t* new_tail_p_addr = nullptr; + impl->get_largest_slot(&new_tail_pos, nullptr, &new_tail_p_addr); + return get_or_track_child(c, new_tail_pos, new_tail_p_addr->value); + } else { + return eagain_iertr::make_ready_future<Ref<Node>>(); + } + }).si_then([c, this, child_ref = std::move(child_ref), FNAME] + (auto&& new_tail_child) mutable { + auto child_pos = child_ref->parent_info().position; + if (new_tail_child) { + DEBUGT("erase {}'s child {} at pos({}), " + "and fix new child tail {} at pos({}) ...", + c.t, get_name(), child_ref->get_name(), child_pos, + new_tail_child->get_name(), new_tail_child->parent_info().position); + assert(!new_tail_child->impl->is_level_tail()); + new_tail_child->make_tail(c); + assert(new_tail_child->impl->is_level_tail()); + if (new_tail_child->impl->node_type() == node_type_t::LEAF) { + // no need to proceed merge because the filled size is not changed + new_tail_child.reset(); + } + } else { + DEBUGT("erase {}'s child {} at pos({}) ...", + c.t, get_name(), child_ref->get_name(), child_pos); + } + + Ref<Node> this_ref = child_ref->deref_parent(); + assert(this_ref == this); + return child_ref->retire(c, std::move(child_ref) + ).si_then([c, this, child_pos, FNAME, + this_ref = std::move(this_ref)] () mutable { + if (impl->has_single_value()) { + // fast path without mutating the extent + DEBUGT("{} has one value left, erase ...", c.t, get_name()); +#ifndef NDEBUG + if (impl->is_level_tail()) { + assert(child_pos.is_end()); + } else { + assert(child_pos == search_position_t::begin()); + } +#endif + + if (is_root()) { + // Note: if merge/split works as expected, we should never encounter the + // situation when the internal root has <=1 children: + // + // A newly created internal root (see Node::upgrade_root()) will have 2 + // children after split is finished. + // + // When merge happens, children will try to merge each other, and if the + // root detects there is only one child left, the root will be + // down-graded to the only child. + // + // In order to preserve the invariant, we need to make sure the new + // internal root also has at least 2 children. 
+ ceph_abort("trying to erase the last item from the internal root node"); + } + + // track erase + assert(tracked_child_nodes.empty()); + + // no child should be referencing this node now, this_ref is the last one. + assert(this_ref->use_count() == 1); + return Node::erase_node(c, std::move(this_ref)); + } + + impl->prepare_mutate(c); + auto [erase_stage, next_or_last_pos] = impl->erase(child_pos); + if (child_pos.is_end()) { + // next_or_last_pos as last_pos + track_make_tail(next_or_last_pos); + } else { + // next_or_last_pos as next_pos + track_erase(child_pos, erase_stage); + } + validate_tracked_children(); + + if (is_root()) { + return try_downgrade_root(c, std::move(this_ref)); + } else { + bool update_parent_index; + if (impl->is_level_tail()) { + update_parent_index = false; + } else { + // next_or_last_pos as next_pos + next_or_last_pos.is_end() ? update_parent_index = true + : update_parent_index = false; + } + return try_merge_adjacent(c, update_parent_index, std::move(this_ref)); + } + }).si_then([c, new_tail_child = std::move(new_tail_child)] () mutable { + // finally, check if the new tail child needs to merge + if (new_tail_child && !new_tail_child->is_root()) { + assert(new_tail_child->impl->is_level_tail()); + return new_tail_child->try_merge_adjacent( + c, false, std::move(new_tail_child)); + } else { + return eagain_iertr::now(); + } + }); + }); +} + +template <bool FORCE_MERGE> +eagain_ifuture<> InternalNode::fix_index( + context_t c, Ref<Node>&& child, bool check_downgrade) +{ + LOG_PREFIX(OTree::InternalNode::fix_index); + impl->validate_non_empty(); + + validate_child_inconsistent(*child); + auto& child_pos = child->parent_info().position; + Ref<Node> this_ref = child->deref_parent(); + assert(this_ref == this); + validate_tracked_children(); + + impl->prepare_mutate(c); + + key_view_t new_key = *child->impl->get_pivot_index(); + DEBUGT("fix {}'s index of child {} at pos({}), new_key={} ...", + c.t, get_name(), child->get_name(), child_pos, new_key); + + // erase the incorrect item + auto [erase_stage, next_pos] = impl->erase(child_pos); + track_erase(child_pos, erase_stage); + validate_tracked_children(); + + // find out whether there is a need to fix parent index recursively + bool update_parent_index; + if (impl->is_level_tail()) { + update_parent_index = false; + } else { + next_pos.is_end() ? update_parent_index = true + : update_parent_index = false; + } + + return insert_or_split(c, next_pos, new_key, child + ).si_then([this, c, update_parent_index, check_downgrade, + this_ref = std::move(this_ref)] (auto split_right) mutable { + if (split_right) { + // after split, the parent index to the split_right will be incorrect + // if update_parent_index is true. + return apply_split_to_parent( + c, std::move(this_ref), std::move(split_right), update_parent_index); + } else { + // no split path + if (is_root()) { + if (check_downgrade) { + return try_downgrade_root(c, std::move(this_ref)); + } else { + // no need to call try_downgrade_root() because the number of keys + // has not changed, and I must have at least 2 keys. + assert(!impl->is_keys_empty()); + return eagain_iertr::now(); + } + } else { + // for non-root, maybe need merge adjacent or fix parent, + // because the filled node size may be reduced. 
+ return try_merge_adjacent<FORCE_MERGE>( + c, update_parent_index, std::move(this_ref)); + } + } + }); +} + +template <bool FORCE_MERGE> +eagain_ifuture<> InternalNode::apply_children_merge( + context_t c, Ref<Node>&& left_child, laddr_t origin_left_addr, + Ref<Node>&& right_child, bool update_index) +{ + LOG_PREFIX(OTree::InternalNode::apply_children_merge); + auto left_pos = left_child->parent_info().position; + auto left_addr = left_child->impl->laddr(); + auto& right_pos = right_child->parent_info().position; + auto right_addr = right_child->impl->laddr(); + DEBUGT("apply {}'s child {} (was {:#x}) at pos({}), " + "to merge with {} at pos({}), update_index={} ...", + c.t, get_name(), left_child->get_name(), origin_left_addr, left_pos, + right_child->get_name(), right_pos, update_index); + +#ifndef NDEBUG + assert(left_child->parent_info().ptr == this); + assert(!left_pos.is_end()); + const laddr_packed_t* p_value_left; + impl->get_slot(left_pos, nullptr, &p_value_left); + assert(p_value_left->value == origin_left_addr); + + assert(right_child->use_count() == 1); + assert(right_child->parent_info().ptr == this); + const laddr_packed_t* p_value_right; + if (right_pos.is_end()) { + assert(right_child->impl->is_level_tail()); + assert(left_child->impl->is_level_tail()); + assert(impl->is_level_tail()); + assert(!update_index); + p_value_right = impl->get_tail_value(); + } else { + assert(!right_child->impl->is_level_tail()); + assert(!left_child->impl->is_level_tail()); + impl->get_slot(right_pos, nullptr, &p_value_right); + } + assert(p_value_right->value == right_addr); +#endif + + // XXX: we may jump to try_downgrade_root() without mutating this node. + + // update layout from right_pos => right_addr to left_addr + impl->prepare_mutate(c); + impl->replace_child_addr(right_pos, left_addr, right_addr); + + // update track from right_pos => right_child to left_child + left_child->deref_parent(); + replace_track(left_child, right_child, update_index); + + // erase left_pos from layout + auto [erase_stage, next_pos] = impl->erase(left_pos); + track_erase<false>(left_pos, erase_stage); + assert(next_pos == left_child->parent_info().position); + + // All good to retire the right_child. + // I'm already ref-counted by left_child. 
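+ // Its contents have already been merged into left_child, and right_pos in
+ // my layout now maps to left_addr (see replace_child_addr() above).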
+ return right_child->retire(c, std::move(right_child) + ).si_then([c, this, update_index, + left_child = std::move(left_child)] () mutable { + if (update_index) { + // I'm all good but: + // - my number of keys is reduced by 1 + // - my size may underflow, but try_merge_adjacent() is already part of fix_index() + return left_child->fix_parent_index<FORCE_MERGE>(c, std::move(left_child), true); + } else { + validate_tracked_children(); + Ref<Node> this_ref = this; + left_child.reset(); + // I'm all good but: + // - my number of keys is reduced by 1 + // - my size may underflow + if (is_root()) { + return try_downgrade_root(c, std::move(this_ref)); + } else { + return try_merge_adjacent<FORCE_MERGE>( + c, false, std::move(this_ref)); + } + } + }); +} +template eagain_ifuture<> InternalNode::apply_children_merge<true>( + context_t, Ref<Node>&&, laddr_t, Ref<Node>&&, bool); +template eagain_ifuture<> InternalNode::apply_children_merge<false>( + context_t, Ref<Node>&&, laddr_t, Ref<Node>&&, bool); + +eagain_ifuture<std::pair<Ref<Node>, Ref<Node>>> InternalNode::get_child_peers( + context_t c, const search_position_t& pos) +{ + // assume I'm already ref counted by caller + search_position_t prev_pos; + const laddr_packed_t* prev_p_child_addr = nullptr; + search_position_t next_pos; + const laddr_packed_t* next_p_child_addr = nullptr; + + if (pos.is_end()) { + assert(impl->is_level_tail()); + if (!impl->is_keys_empty()) { + // got previous child only + impl->get_largest_slot(&prev_pos, nullptr, &prev_p_child_addr); + assert(prev_pos < pos); + assert(prev_p_child_addr != nullptr); + } else { + // no keys, so no peer children + } + } else { // !pos.is_end() + if (pos != search_position_t::begin()) { + // got previous child + prev_pos = pos; + impl->get_prev_slot(prev_pos, nullptr, &prev_p_child_addr); + assert(prev_pos < pos); + assert(prev_p_child_addr != nullptr); + } else { + // is already the first child, so no previous child + } + + next_pos = pos; + impl->get_next_slot(next_pos, nullptr, &next_p_child_addr); + if (next_pos.is_end()) { + if (impl->is_level_tail()) { + // the next child is the tail + next_p_child_addr = impl->get_tail_value(); + assert(pos < next_pos); + assert(next_p_child_addr != nullptr); + } else { + // next child doesn't exist + assert(next_p_child_addr == nullptr); + } + } else { + // got the next child + assert(pos < next_pos); + assert(next_p_child_addr != nullptr); + } + } + + return eagain_iertr::now().si_then([this, c, prev_pos, prev_p_child_addr] { + if (prev_p_child_addr != nullptr) { + return get_or_track_child(c, prev_pos, prev_p_child_addr->value); + } else { + return eagain_iertr::make_ready_future<Ref<Node>>(); + } + }).si_then([this, c, next_pos, next_p_child_addr] (Ref<Node> lnode) { + if (next_p_child_addr != nullptr) { + return get_or_track_child(c, next_pos, next_p_child_addr->value + ).si_then([lnode] (Ref<Node> rnode) { + return seastar::make_ready_future<std::pair<Ref<Node>, Ref<Node>>>( + lnode, rnode); + }); + } else { + return eagain_iertr::make_ready_future<std::pair<Ref<Node>, Ref<Node>>>( + lnode, nullptr); + } + }); +} + +eagain_ifuture<Ref<InternalNode>> InternalNode::allocate_root( + context_t c, laddr_t hint, level_t old_root_level, + laddr_t old_root_addr, Super::URef&& super) +{ + // support tree height up to 256 + ceph_assert(old_root_level < MAX_LEVEL); + return InternalNode::allocate(c, hint, field_type_t::N0, true, old_root_level + 1 + ).si_then([c, old_root_addr, + super = std::move(super)](auto fresh_node) mutable { + auto root = 
fresh_node.node; + assert(root->impl->is_keys_empty()); + auto p_value = root->impl->get_tail_value(); + fresh_node.mut.copy_in_absolute( + const_cast<laddr_packed_t*>(p_value), old_root_addr); + root->make_root_from(c, std::move(super), old_root_addr); + ++(c.t.get_onode_tree_stats().extents_num_delta); + return root; + }); +} + +eagain_ifuture<Ref<tree_cursor_t>> +InternalNode::lookup_smallest(context_t c) +{ + impl->validate_non_empty(); + auto position = search_position_t::begin(); + const laddr_packed_t* p_child_addr; + impl->get_slot(position, nullptr, &p_child_addr); + return get_or_track_child(c, position, p_child_addr->value + ).si_then([c](auto child) { + return child->lookup_smallest(c); + }); +} + +eagain_ifuture<Ref<tree_cursor_t>> +InternalNode::lookup_largest(context_t c) +{ + // NOTE: unlike LeafNode::lookup_largest(), this only works for the tail + // internal node to return the tail child address. + impl->validate_non_empty(); + assert(impl->is_level_tail()); + auto p_child_addr = impl->get_tail_value(); + return get_or_track_child(c, search_position_t::end(), p_child_addr->value + ).si_then([c](auto child) { + return child->lookup_largest(c); + }); +} + +eagain_ifuture<Node::search_result_t> +InternalNode::lower_bound_tracked( + context_t c, const key_hobj_t& key, MatchHistory& history) +{ + auto result = impl->lower_bound(key, history); + return get_or_track_child(c, result.position, result.p_value->value + ).si_then([c, &key, &history](auto child) { + // XXX(multi-type): pass result.mstat to child + return child->lower_bound_tracked(c, key, history); + }); +} + +eagain_ifuture<> InternalNode::do_get_tree_stats( + context_t c, tree_stats_t& stats) +{ + impl->validate_non_empty(); + auto nstats = impl->get_stats(); + stats.size_persistent_internal += nstats.size_persistent; + stats.size_filled_internal += nstats.size_filled; + stats.size_logical_internal += nstats.size_logical; + stats.size_overhead_internal += nstats.size_overhead; + stats.size_value_internal += nstats.size_value; + stats.num_kvs_internal += nstats.num_kvs; + stats.num_nodes_internal += 1; + + Ref<Node> this_ref = this; + return seastar::do_with( + search_position_t(), (const laddr_packed_t*)(nullptr), + [this, this_ref, c, &stats](auto& pos, auto& p_child_addr) { + pos = search_position_t::begin(); + impl->get_slot(pos, nullptr, &p_child_addr); + return trans_intr::repeat( + [this, this_ref, c, &stats, &pos, &p_child_addr]() + -> eagain_ifuture<seastar::stop_iteration> { + return get_or_track_child(c, pos, p_child_addr->value + ).si_then([c, &stats](auto child) { + return child->do_get_tree_stats(c, stats); + }).si_then([this, this_ref, &pos, &p_child_addr] { + if (pos.is_end()) { + return seastar::stop_iteration::yes; + } else { + impl->get_next_slot(pos, nullptr, &p_child_addr); + if (pos.is_end()) { + if (impl->is_level_tail()) { + p_child_addr = impl->get_tail_value(); + return seastar::stop_iteration::no; + } else { + return seastar::stop_iteration::yes; + } + } else { + return seastar::stop_iteration::no; + } + } + }); + }); + } + ); +} + +void InternalNode::track_merge( + Ref<Node> _right_node, match_stage_t stage, search_position_t& left_last_pos) +{ + assert(level() == _right_node->level()); + assert(impl->node_type() == _right_node->impl->node_type()); + auto& right_node = *static_cast<InternalNode*>(_right_node.get()); + if (right_node.tracked_child_nodes.empty()) { + return; + } + + match_stage_t curr_stage = STAGE_BOTTOM; + + // prepare the initial left_last_pos for offset + while 
(curr_stage < stage) { + left_last_pos.index_by_stage(curr_stage) = 0; + ++curr_stage; + } + ++left_last_pos.index_by_stage(curr_stage); + + // fix the tracked child nodes of right_node, stage by stage. + auto& right_tracked_children = right_node.tracked_child_nodes; + auto rit = right_tracked_children.begin(); + while (curr_stage <= STAGE_TOP) { + auto right_pos_until = search_position_t::begin(); + right_pos_until.index_by_stage(curr_stage) = INDEX_UPPER_BOUND; + auto rend = right_tracked_children.lower_bound(right_pos_until); + while (rit != rend) { + auto new_pos = rit->second->parent_info().position; + assert(new_pos == rit->first); + assert(rit->second->parent_info().ptr == &right_node); + new_pos += left_last_pos; + auto p_child = rit->second; + rit = right_tracked_children.erase(rit); + p_child->as_child(new_pos, this); + } + left_last_pos.index_by_stage(curr_stage) = 0; + ++curr_stage; + } + + // fix the end tracked child node of right_node, if exists. + if (rit != right_tracked_children.end()) { + assert(rit->first == search_position_t::end()); + assert(rit->second->parent_info().position == search_position_t::end()); + assert(right_node.impl->is_level_tail()); + assert(impl->is_level_tail()); + auto p_child = rit->second; + rit = right_tracked_children.erase(rit); + p_child->as_child(search_position_t::end(), this); + } + assert(right_tracked_children.empty()); + + validate_tracked_children(); +} + +eagain_ifuture<> InternalNode::test_clone_root( + context_t c_other, RootNodeTracker& tracker_other) const +{ + assert(is_root()); + assert(impl->is_level_tail()); + assert(impl->field_type() == field_type_t::N0); + Ref<const Node> this_ref = this; + return InternalNode::allocate(c_other, L_ADDR_MIN, field_type_t::N0, true, impl->level() + ).si_then([this, c_other, &tracker_other](auto fresh_other) { + impl->test_copy_to(fresh_other.mut); + auto cloned_root = fresh_other.node; + return c_other.nm.get_super(c_other.t, tracker_other + ).handle_error_interruptible( + eagain_iertr::pass_further{}, + crimson::ct_error::assert_all{"Invalid error during test clone"} + ).si_then([c_other, cloned_root](auto&& super_other) { + assert(super_other); + cloned_root->make_root_new(c_other, std::move(super_other)); + return cloned_root; + }); + }).si_then([this_ref, this, c_other](auto cloned_root) { + // clone tracked children + // In some unit tests, the children are stubbed out that they + // don't exist in NodeExtentManager, and are only tracked in memory. 
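+ // Each tracked child is cloned under cloned_root via test_clone_non_root().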
+ return trans_intr::do_for_each( + tracked_child_nodes.begin(), + tracked_child_nodes.end(), + [this_ref, c_other, cloned_root](auto& kv) { + assert(kv.first == kv.second->parent_info().position); + return kv.second->test_clone_non_root(c_other, cloned_root); + } + ); + }); +} + +eagain_ifuture<> InternalNode::try_downgrade_root( + context_t c, Ref<Node>&& this_ref) +{ + LOG_PREFIX(OTree::InternalNode::try_downgrade_root); + assert(this_ref.get() == this); + assert(is_root()); + assert(impl->is_level_tail()); + if (!impl->is_keys_empty()) { + // I have more than 1 values, no need to downgrade + return eagain_iertr::now(); + } + + // proceed downgrade root to the only child + laddr_t child_addr = impl->get_tail_value()->value; + return get_or_track_child(c, search_position_t::end(), child_addr + ).si_then([c, this, FNAME, + this_ref = std::move(this_ref)] (auto child) mutable { + INFOT("downgrade {} to new root {}", + c.t, get_name(), child->get_name()); + // Invariant, see InternalNode::erase_child() + // the new internal root should have at least 2 children. + assert(child->impl->is_level_tail()); + if (child->impl->node_type() == node_type_t::INTERNAL) { + ceph_assert(!child->impl->is_keys_empty()); + } + + assert(tracked_child_nodes.size() == 1); + child->deref_parent(); + auto super_to_move = deref_super(); + child->make_root_from(c, std::move(super_to_move), impl->laddr()); + --(c.t.get_onode_tree_stats().extents_num_delta); + return retire(c, std::move(this_ref)); + }); +} + +eagain_ifuture<Ref<InternalNode>> InternalNode::insert_or_split( + context_t c, + const search_position_t& pos, + const key_view_t& insert_key, + Ref<Node> insert_child, + Ref<Node> outdated_child) +{ + LOG_PREFIX(OTree::InternalNode::insert_or_split); + // XXX: check the insert_child is unlinked from this node +#ifndef NDEBUG + auto _insert_key = *insert_child->impl->get_pivot_index(); + assert(insert_key == _insert_key); +#endif + auto insert_value = insert_child->impl->laddr(); + auto insert_pos = pos; + DEBUGT("insert {} with insert_key={}, insert_child={}, insert_pos({}), " + "outdated_child={} ...", + c.t, get_name(), insert_key, insert_child->get_name(), + insert_pos, (outdated_child ? "True" : "False")); + auto [insert_stage, insert_size] = impl->evaluate_insert( + insert_key, insert_value, insert_pos); + auto free_size = impl->free_size(); + if (free_size >= insert_size) { + // proceed to insert + [[maybe_unused]] auto p_value = impl->insert( + insert_key, insert_value, insert_pos, insert_stage, insert_size); + assert(impl->free_size() == free_size - insert_size); + assert(insert_pos <= pos); + assert(p_value->value == insert_value); + + if (outdated_child) { + track_insert<false>(insert_pos, insert_stage, insert_child); + validate_child_inconsistent(*outdated_child); +#ifndef NDEBUG + do_untrack_child(*outdated_child); + validate_tracked_children(); + do_track_child<false>(*outdated_child); +#endif + } else { + track_insert(insert_pos, insert_stage, insert_child); + validate_tracked_children(); + } + + return eagain_iertr::make_ready_future<Ref<InternalNode>>(nullptr); + } + + // proceed to split with insert + // assume I'm already ref-counted by caller + laddr_t left_hint, right_hint; + { + key_view_t left_key; + impl->get_slot(search_position_t::begin(), &left_key, nullptr); + left_hint = left_key.get_hint(); + key_view_t right_key; + impl->get_largest_slot(nullptr, &right_key, nullptr); + right_hint = right_key.get_hint(); + } + return (is_root() ? 
upgrade_root(c, left_hint) : eagain_iertr::now() + ).si_then([this, c, right_hint] { + return InternalNode::allocate( + c, right_hint, impl->field_type(), impl->is_level_tail(), impl->level()); + }).si_then([this, insert_key, insert_child, insert_pos, + insert_stage=insert_stage, insert_size=insert_size, + outdated_child, c, FNAME](auto fresh_right) mutable { + // I'm the left_node and need to split into the right_node + auto right_node = fresh_right.node; + DEBUGT("proceed split {} to fresh {} with insert_child={}," + " outdated_child={} ...", + c.t, get_name(), right_node->get_name(), + insert_child->get_name(), + (outdated_child ? outdated_child->get_name() : "N/A")); + auto insert_value = insert_child->impl->laddr(); + auto [split_pos, is_insert_left, p_value] = impl->split_insert( + fresh_right.mut, *right_node->impl, insert_key, insert_value, + insert_pos, insert_stage, insert_size); + assert(p_value->value == insert_value); + track_split(split_pos, right_node); + + if (outdated_child) { + if (is_insert_left) { + track_insert<false>(insert_pos, insert_stage, insert_child); + } else { + right_node->template track_insert<false>(insert_pos, insert_stage, insert_child); + } +#ifndef NDEBUG + auto& _parent = outdated_child->parent_info().ptr; + _parent->validate_child_inconsistent(*outdated_child); + _parent->do_untrack_child(*outdated_child); + validate_tracked_children(); + right_node->validate_tracked_children(); + _parent->do_track_child<false>(*outdated_child); +#endif + } else { + if (is_insert_left) { + track_insert(insert_pos, insert_stage, insert_child); + } else { + right_node->track_insert(insert_pos, insert_stage, insert_child); + } + validate_tracked_children(); + right_node->validate_tracked_children(); + } + ++(c.t.get_onode_tree_stats().extents_num_delta); + return right_node; + }); +} + +eagain_ifuture<Ref<Node>> InternalNode::get_or_track_child( + context_t c, const search_position_t& position, laddr_t child_addr) +{ + LOG_PREFIX(OTree::InternalNode::get_or_track_child); + Ref<Node> this_ref = this; + return [this, position, child_addr, c, FNAME] { + auto found = tracked_child_nodes.find(position); + if (found != tracked_child_nodes.end()) { + TRACET("loaded child tracked {} at pos({}) addr={:x}", + c.t, found->second->get_name(), position, child_addr); + return eagain_iertr::make_ready_future<Ref<Node>>(found->second); + } + // the child is not loaded yet + TRACET("loading child at pos({}) addr={:x} ...", + c.t, position, child_addr); + bool level_tail = position.is_end(); + return Node::load(c, child_addr, level_tail + ).si_then([this, position, c, FNAME] (auto child) { + TRACET("loaded child untracked {}", + c.t, child->get_name()); + if (child->level() + 1 != level()) { + ERRORT("loaded child {} error from parent {} at pos({}), level mismatch", + c.t, child->get_name(), get_name(), position); + ceph_abort("fatal error"); + } + child->as_child(position, this); + return child; + }); + }().si_then([this_ref, this, position, child_addr] (auto child) { + assert(child_addr == child->impl->laddr()); + assert(position == child->parent_info().position); + std::ignore = position; + std::ignore = child_addr; + validate_child_tracked(*child); + return child; + }); +} + +template <bool VALIDATE> +void InternalNode::track_insert( + const search_position_t& insert_pos, match_stage_t insert_stage, + Ref<Node> insert_child, Ref<Node> nxt_child) +{ + // update tracks + auto pos_upper_bound = insert_pos; + pos_upper_bound.index_by_stage(insert_stage) = INDEX_UPPER_BOUND; + auto first = 
tracked_child_nodes.lower_bound(insert_pos); + auto last = tracked_child_nodes.lower_bound(pos_upper_bound); + std::vector<Node*> nodes; + std::for_each(first, last, [&nodes](auto& kv) { + nodes.push_back(kv.second); + }); + tracked_child_nodes.erase(first, last); + for (auto& node : nodes) { + auto _pos = node->parent_info().position; + assert(!_pos.is_end()); + ++_pos.index_by_stage(insert_stage); + node->as_child<VALIDATE>(_pos, this); + } + // track insert + insert_child->as_child(insert_pos, this); + +#ifndef NDEBUG + // validate left_child is before right_child + if (nxt_child) { + auto iter = tracked_child_nodes.find(insert_pos); + ++iter; + assert(iter->second == nxt_child); + } +#endif +} +template void InternalNode::track_insert<true>(const search_position_t&, match_stage_t, Ref<Node>, Ref<Node>); +template void InternalNode::track_insert<false>(const search_position_t&, match_stage_t, Ref<Node>, Ref<Node>); + +void InternalNode::replace_track( + Ref<Node> new_child, Ref<Node> old_child, bool is_new_child_outdated) +{ + assert(!new_child->is_tracked()); + auto& pos = old_child->parent_info().position; + auto this_ref = old_child->deref_parent(); + assert(this_ref == this); + if (is_new_child_outdated) { + // we need to keep track of the outdated child through + // insert and split. + new_child->as_child<false>(pos, this); + } else { + new_child->as_child(pos, this); + } + +#ifndef NDEBUG + if (is_new_child_outdated) { + validate_child_inconsistent(*new_child); + } else { + validate_child_tracked(*new_child); + } +#endif +} + +void InternalNode::track_split( + const search_position_t& split_pos, Ref<InternalNode> right_node) +{ + auto iter = tracked_child_nodes.lower_bound(split_pos); + while (iter != tracked_child_nodes.end()) { + auto new_pos = iter->first; + auto p_node = iter->second; + iter = tracked_child_nodes.erase(iter); + new_pos -= split_pos; + p_node->as_child<false>(new_pos, right_node); + } +} + +template <bool VALIDATE> +void InternalNode::track_erase( + const search_position_t& erase_pos, match_stage_t erase_stage) +{ + auto first = tracked_child_nodes.lower_bound(erase_pos); + assert(first == tracked_child_nodes.end() || + first->first != erase_pos); + auto pos_upper_bound = erase_pos; + pos_upper_bound.index_by_stage(erase_stage) = INDEX_UPPER_BOUND; + auto last = tracked_child_nodes.lower_bound(pos_upper_bound); + std::vector<Node*> p_nodes; + std::for_each(first, last, [&p_nodes](auto& kv) { + p_nodes.push_back(kv.second); + }); + tracked_child_nodes.erase(first, last); + for (auto& p_node: p_nodes) { + auto new_pos = p_node->parent_info().position; + assert(new_pos.index_by_stage(erase_stage) > 0); + --new_pos.index_by_stage(erase_stage); + p_node->as_child<VALIDATE>(new_pos, this); + } +} +template void InternalNode::track_erase<true>(const search_position_t&, match_stage_t); +template void InternalNode::track_erase<false>(const search_position_t&, match_stage_t); + +void InternalNode::track_make_tail(const search_position_t& last_pos) +{ + // assume I'm ref counted by the caller. 
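+ // Re-key the child tracked at last_pos (if it is loaded) to the end
+ // position, since that child has just become the level tail.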
+ assert(impl->is_level_tail()); + assert(!last_pos.is_end()); + assert(tracked_child_nodes.find(search_position_t::end()) == + tracked_child_nodes.end()); + auto last_it = tracked_child_nodes.find(last_pos); + if (last_it != tracked_child_nodes.end()) { + assert(std::next(last_it) == tracked_child_nodes.end()); + auto p_last_child = last_it->second; + tracked_child_nodes.erase(last_it); + p_last_child->as_child(search_position_t::end(), this); + } else { + assert(tracked_child_nodes.lower_bound(last_pos) == + tracked_child_nodes.end()); + } +} + +void InternalNode::validate_child(const Node& child) const +{ +#ifndef NDEBUG + assert(impl->level() - 1 == child.impl->level()); + assert(this == child.parent_info().ptr); + auto& child_pos = child.parent_info().position; + if (child_pos.is_end()) { + assert(impl->is_level_tail()); + assert(child.impl->is_level_tail()); + assert(impl->get_tail_value()->value == child.impl->laddr()); + } else { + assert(!child.impl->is_level_tail()); + key_view_t index_key; + const laddr_packed_t* p_child_addr; + impl->get_slot(child_pos, &index_key, &p_child_addr); + assert(index_key == *child.impl->get_pivot_index()); + assert(p_child_addr->value == child.impl->laddr()); + } + // XXX(multi-type) + assert(impl->field_type() <= child.impl->field_type()); +#endif +} + +void InternalNode::validate_child_inconsistent(const Node& child) const +{ +#ifndef NDEBUG + assert(impl->level() - 1 == child.impl->level()); + assert(check_is_tracking(child)); + auto& child_pos = child.parent_info().position; + // the tail value has no key to fix + assert(!child_pos.is_end()); + assert(!child.impl->is_level_tail()); + + key_view_t current_key; + const laddr_packed_t* p_value; + impl->get_slot(child_pos, ¤t_key, &p_value); + key_view_t new_key = *child.impl->get_pivot_index(); + assert(current_key != new_key); + assert(p_value->value == child.impl->laddr()); +#endif +} + +eagain_ifuture<InternalNode::fresh_node_t> InternalNode::allocate( + context_t c, laddr_t hint, field_type_t field_type, bool is_level_tail, level_t level) +{ + return InternalNodeImpl::allocate(c, hint, field_type, is_level_tail, level + ).si_then([](auto&& fresh_impl) { + auto *derived_ptr = fresh_impl.impl.get(); + auto node = Ref<InternalNode>(new InternalNode( + derived_ptr, std::move(fresh_impl.impl))); + return fresh_node_t{node, fresh_impl.mut}; + }); +} + +/* + * LeafNode + */ + +LeafNode::LeafNode(LeafNodeImpl* impl, NodeImplURef&& impl_ref) + : Node(std::move(impl_ref)), impl{impl} {} + +bool LeafNode::is_level_tail() const +{ + return impl->is_level_tail(); +} + +node_version_t LeafNode::get_version() const +{ + return {layout_version, impl->get_extent_state()}; +} + +const char* LeafNode::read() const +{ + return impl->read(); +} + +extent_len_t LeafNode::get_node_size() const +{ + return impl->get_node_size(); +} + +std::tuple<key_view_t, const value_header_t*> +LeafNode::get_kv(const search_position_t& pos) const +{ + key_view_t key_view; + const value_header_t* p_value_header; + impl->get_slot(pos, &key_view, &p_value_header); + return {key_view, p_value_header}; +} + +eagain_ifuture<Ref<tree_cursor_t>> +LeafNode::get_next_cursor(context_t c, const search_position_t& pos) +{ + impl->validate_non_empty(); + search_position_t next_pos = pos; + key_view_t index_key; + const value_header_t* p_value_header = nullptr; + impl->get_next_slot(next_pos, &index_key, &p_value_header); + if (next_pos.is_end()) { + if (unlikely(is_level_tail())) { + return eagain_iertr::make_ready_future<Ref<tree_cursor_t>>( + 
tree_cursor_t::create_end(this)); + } else { + return get_next_cursor_from_parent(c); + } + } else { + return eagain_iertr::make_ready_future<Ref<tree_cursor_t>>( + get_or_track_cursor(next_pos, index_key, p_value_header)); + } +} + +template <bool FORCE_MERGE> +eagain_ifuture<Ref<tree_cursor_t>> +LeafNode::erase(context_t c, const search_position_t& pos, bool get_next) +{ + LOG_PREFIX(OTree::LeafNode::erase); + assert(!pos.is_end()); + assert(!impl->is_keys_empty()); + Ref<Node> this_ref = this; + DEBUGT("erase {}'s pos({}), get_next={} ...", + c.t, get_name(), pos, get_next); + ++(c.t.get_onode_tree_stats().num_erases); + + // get the next cursor + return eagain_iertr::now().si_then([c, &pos, get_next, this] { + if (get_next) { + return get_next_cursor(c, pos); + } else { + return eagain_iertr::make_ready_future<Ref<tree_cursor_t>>(); + } + }).si_then([c, &pos, this_ref = std::move(this_ref), + this, FNAME] (Ref<tree_cursor_t> next_cursor) mutable { + if (next_cursor && next_cursor->is_end()) { + // reset the node reference from the end cursor + next_cursor.reset(); + } + return eagain_iertr::now().si_then( + [c, &pos, this_ref = std::move(this_ref), this, FNAME] () mutable { + assert_moveable(this_ref); +#ifndef NDEBUG + assert(!impl->is_keys_empty()); + if (impl->has_single_value()) { + assert(pos == search_position_t::begin()); + } +#endif + if (!is_root() && impl->has_single_value()) { + // we need to keep the root as an empty leaf node + // fast path without mutating the extent + // track_erase + DEBUGT("{} has one value left, erase ...", c.t, get_name()); + assert(tracked_cursors.size() == 1); + auto iter = tracked_cursors.begin(); + assert(iter->first == pos); + iter->second->invalidate(); + tracked_cursors.clear(); + + // no cursor should be referencing this node now, this_ref is the last one. + assert(this_ref->use_count() == 1); + return Node::erase_node(c, std::move(this_ref)); + } + + on_layout_change(); + impl->prepare_mutate(c); + auto [erase_stage, next_pos] = impl->erase(pos); + track_erase(pos, erase_stage); + validate_tracked_cursors(); + + if (is_root()) { + return eagain_iertr::now(); + } else { + bool update_parent_index; + if (impl->is_level_tail()) { + update_parent_index = false; + } else { + next_pos.is_end() ? 
update_parent_index = true + : update_parent_index = false; + } + return try_merge_adjacent<FORCE_MERGE>( + c, update_parent_index, std::move(this_ref)); + } + }).si_then([next_cursor] { + return next_cursor; + }); + }); +} +template eagain_ifuture<Ref<tree_cursor_t>> +LeafNode::erase<true>(context_t, const search_position_t&, bool); +template eagain_ifuture<Ref<tree_cursor_t>> +LeafNode::erase<false>(context_t, const search_position_t&, bool); + +eagain_ifuture<> LeafNode::extend_value( + context_t c, const search_position_t& pos, value_size_t extend_size) +{ + ceph_abort("not implemented"); + return eagain_iertr::now(); +} + +eagain_ifuture<> LeafNode::trim_value( + context_t c, const search_position_t& pos, value_size_t trim_size) +{ + ceph_abort("not implemented"); + return eagain_iertr::now(); +} + +std::pair<NodeExtentMutable&, ValueDeltaRecorder*> +LeafNode::prepare_mutate_value_payload(context_t c) +{ + return impl->prepare_mutate_value_payload(c); +} + +eagain_ifuture<Ref<tree_cursor_t>> +LeafNode::lookup_smallest(context_t) +{ + if (unlikely(impl->is_keys_empty())) { + assert(is_root()); + return seastar::make_ready_future<Ref<tree_cursor_t>>( + tree_cursor_t::create_end(this)); + } + auto pos = search_position_t::begin(); + key_view_t index_key; + const value_header_t* p_value_header; + impl->get_slot(pos, &index_key, &p_value_header); + return seastar::make_ready_future<Ref<tree_cursor_t>>( + get_or_track_cursor(pos, index_key, p_value_header)); +} + +eagain_ifuture<Ref<tree_cursor_t>> +LeafNode::lookup_largest(context_t) +{ + if (unlikely(impl->is_keys_empty())) { + assert(is_root()); + return seastar::make_ready_future<Ref<tree_cursor_t>>( + tree_cursor_t::create_end(this)); + } + search_position_t pos; + key_view_t index_key; + const value_header_t* p_value_header = nullptr; + impl->get_largest_slot(&pos, &index_key, &p_value_header); + return seastar::make_ready_future<Ref<tree_cursor_t>>( + get_or_track_cursor(pos, index_key, p_value_header)); +} + +eagain_ifuture<Node::search_result_t> +LeafNode::lower_bound_tracked( + context_t c, const key_hobj_t& key, MatchHistory& history) +{ + key_view_t index_key; + auto result = impl->lower_bound(key, history, &index_key); + Ref<tree_cursor_t> cursor; + if (result.position.is_end()) { + assert(!result.p_value); + cursor = tree_cursor_t::create_end(this); + } else { + cursor = get_or_track_cursor(result.position, index_key, result.p_value); + } + search_result_t ret{cursor, result.mstat}; + ret.validate_input_key(key, c.vb.get_header_magic()); + return seastar::make_ready_future<search_result_t>(ret); +} + +eagain_ifuture<> LeafNode::do_get_tree_stats(context_t, tree_stats_t& stats) +{ + auto nstats = impl->get_stats(); + stats.size_persistent_leaf += nstats.size_persistent; + stats.size_filled_leaf += nstats.size_filled; + stats.size_logical_leaf += nstats.size_logical; + stats.size_overhead_leaf += nstats.size_overhead; + stats.size_value_leaf += nstats.size_value; + stats.num_kvs_leaf += nstats.num_kvs; + stats.num_nodes_leaf += 1; + return eagain_iertr::now(); +} + +void LeafNode::track_merge( + Ref<Node> _right_node, match_stage_t stage, search_position_t& left_last_pos) +{ + assert(level() == _right_node->level()); + // assert(impl->node_type() == _right_node->impl->node_type()); + auto& right_node = *static_cast<LeafNode*>(_right_node.get()); + if (right_node.tracked_cursors.empty()) { + return; + } + + match_stage_t curr_stage = STAGE_BOTTOM; + + // prepare the initial left_last_pos for offset + while (curr_stage < stage) { 
+ left_last_pos.index_by_stage(curr_stage) = 0; + ++curr_stage; + } + ++left_last_pos.index_by_stage(curr_stage); + + // fix the tracked child nodes of right_node, stage by stage. + auto& right_tracked_cursors = right_node.tracked_cursors; + auto rit = right_tracked_cursors.begin(); + while (curr_stage <= STAGE_TOP) { + auto right_pos_until = search_position_t::begin(); + right_pos_until.index_by_stage(curr_stage) = INDEX_UPPER_BOUND; + auto rend = right_tracked_cursors.lower_bound(right_pos_until); + while (rit != rend) { + auto new_pos = rit->second->get_position(); + assert(new_pos == rit->first); + assert(rit->second->get_leaf_node().get() == &right_node); + new_pos += left_last_pos; + auto p_cursor = rit->second; + rit = right_tracked_cursors.erase(rit); + p_cursor->update_track<true>(this, new_pos); + } + left_last_pos.index_by_stage(curr_stage) = 0; + ++curr_stage; + } + assert(right_tracked_cursors.empty()); + + validate_tracked_cursors(); +} + +eagain_ifuture<> LeafNode::test_clone_root( + context_t c_other, RootNodeTracker& tracker_other) const +{ + assert(is_root()); + assert(impl->is_level_tail()); + assert(impl->field_type() == field_type_t::N0); + Ref<const Node> this_ref = this; + return LeafNode::allocate(c_other, L_ADDR_MIN, field_type_t::N0, true + ).si_then([this, c_other, &tracker_other](auto fresh_other) { + impl->test_copy_to(fresh_other.mut); + auto cloned_root = fresh_other.node; + return c_other.nm.get_super(c_other.t, tracker_other + ).handle_error_interruptible( + eagain_iertr::pass_further{}, + crimson::ct_error::assert_all{"Invalid error during test clone"} + ).si_then([c_other, cloned_root](auto&& super_other) { + assert(super_other); + cloned_root->make_root_new(c_other, std::move(super_other)); + }); + }).si_then([this_ref]{}); +} + +eagain_ifuture<Ref<tree_cursor_t>> LeafNode::insert_value( + context_t c, const key_hobj_t& key, value_config_t vconf, + const search_position_t& pos, const MatchHistory& history, + match_stat_t mstat) +{ + LOG_PREFIX(OTree::LeafNode::insert_value); +#ifndef NDEBUG + if (pos.is_end()) { + assert(impl->is_level_tail()); + } +#endif + DEBUGT("insert {} with insert_key={}, insert_value={}, insert_pos({}), " + "history={}, mstat({}) ...", + c.t, get_name(), key, vconf, pos, history, mstat); + ++(c.t.get_onode_tree_stats().num_inserts); + search_position_t insert_pos = pos; + auto [insert_stage, insert_size] = impl->evaluate_insert( + key, vconf, history, mstat, insert_pos); + auto free_size = impl->free_size(); + if (free_size >= insert_size) { + // proceed to insert + on_layout_change(); + impl->prepare_mutate(c); + auto p_value_header = impl->insert(key, vconf, insert_pos, insert_stage, insert_size); + assert(impl->free_size() == free_size - insert_size); + assert(insert_pos <= pos); + assert(p_value_header->payload_size == vconf.payload_size); + auto ret = track_insert(insert_pos, insert_stage, p_value_header); + validate_tracked_cursors(); + return eagain_iertr::make_ready_future<Ref<tree_cursor_t>>(ret); + } + // split and insert + Ref<Node> this_ref = this; + laddr_t left_hint, right_hint; + { + key_view_t left_key; + impl->get_slot(search_position_t::begin(), &left_key, nullptr); + left_hint = left_key.get_hint(); + key_view_t right_key; + impl->get_largest_slot(nullptr, &right_key, nullptr); + right_hint = right_key.get_hint(); + } + return (is_root() ? 
upgrade_root(c, left_hint) : eagain_iertr::now() + ).si_then([this, c, right_hint] { + return LeafNode::allocate(c, right_hint, impl->field_type(), impl->is_level_tail()); + }).si_then([this_ref = std::move(this_ref), this, c, &key, vconf, FNAME, + insert_pos, insert_stage=insert_stage, insert_size=insert_size](auto fresh_right) mutable { + auto right_node = fresh_right.node; + DEBUGT("proceed split {} to fresh {} ...", + c.t, get_name(), right_node->get_name()); + // no need to bump version for right node, as it is fresh + on_layout_change(); + impl->prepare_mutate(c); + auto [split_pos, is_insert_left, p_value_header] = impl->split_insert( + fresh_right.mut, *right_node->impl, key, vconf, + insert_pos, insert_stage, insert_size); + assert(p_value_header->payload_size == vconf.payload_size); + track_split(split_pos, right_node); + Ref<tree_cursor_t> ret; + if (is_insert_left) { + ret = track_insert(insert_pos, insert_stage, p_value_header); + } else { + ret = right_node->track_insert(insert_pos, insert_stage, p_value_header); + } + validate_tracked_cursors(); + right_node->validate_tracked_cursors(); + + ++(c.t.get_onode_tree_stats().extents_num_delta); + return apply_split_to_parent( + c, std::move(this_ref), std::move(right_node), false + ).si_then([ret] { + return ret; + }); + // TODO (optimize) + // try to acquire space from siblings before split... see btrfs + }); +} + +eagain_ifuture<Ref<LeafNode>> LeafNode::allocate_root( + context_t c, RootNodeTracker& root_tracker) +{ + LOG_PREFIX(OTree::LeafNode::allocate_root); + return LeafNode::allocate(c, L_ADDR_MIN, field_type_t::N0, true + ).si_then([c, &root_tracker, FNAME](auto fresh_node) { + auto root = fresh_node.node; + return c.nm.get_super(c.t, root_tracker + ).handle_error_interruptible( + eagain_iertr::pass_further{}, + crimson::ct_error::input_output_error::handle([FNAME, c] { + ERRORT("EIO during get_super()", c.t); + ceph_abort("fatal error"); + }) + ).si_then([c, root](auto&& super) { + assert(super); + root->make_root_new(c, std::move(super)); + return root; + }); + }); +} + +Ref<tree_cursor_t> LeafNode::get_or_track_cursor( + const search_position_t& position, + const key_view_t& key, const value_header_t* p_value_header) +{ + assert(!position.is_end()); + assert(p_value_header); + Ref<tree_cursor_t> p_cursor; + auto found = tracked_cursors.find(position); + if (found == tracked_cursors.end()) { + p_cursor = tree_cursor_t::create_tracked( + this, position, key, p_value_header); + } else { + p_cursor = found->second; + assert(p_cursor->get_leaf_node() == this); + assert(p_cursor->get_position() == position); + p_cursor->update_cache_same_node(key, p_value_header); + } + return p_cursor; +} + +void LeafNode::validate_cursor(const tree_cursor_t& cursor) const +{ +#ifndef NDEBUG + assert(this == cursor.get_leaf_node().get()); + assert(cursor.is_tracked()); + assert(!impl->is_extent_retired()); + + // We need to make sure user has freed all the cursors before submitting the + // according transaction. Otherwise the below checks will have undefined + // behaviors. 
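+ // The checks below re-read the key and value header from the extent and
+ // compare them against the cursor's cached view.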
+ auto [key, p_value_header] = get_kv(cursor.get_position()); + auto magic = p_value_header->magic; + assert(key == cursor.get_key_view(magic)); + assert(p_value_header == cursor.read_value_header(magic)); +#endif +} + +Ref<tree_cursor_t> LeafNode::track_insert( + const search_position_t& insert_pos, match_stage_t insert_stage, + const value_header_t* p_value_header) +{ + // update cursor position + auto pos_upper_bound = insert_pos; + pos_upper_bound.index_by_stage(insert_stage) = INDEX_UPPER_BOUND; + auto first = tracked_cursors.lower_bound(insert_pos); + auto last = tracked_cursors.lower_bound(pos_upper_bound); + std::vector<tree_cursor_t*> p_cursors; + std::for_each(first, last, [&p_cursors](auto& kv) { + p_cursors.push_back(kv.second); + }); + tracked_cursors.erase(first, last); + for (auto& p_cursor : p_cursors) { + search_position_t new_pos = p_cursor->get_position(); + ++new_pos.index_by_stage(insert_stage); + p_cursor->update_track<true>(this, new_pos); + } + + // track insert + // TODO: getting key_view_t from stage::proceed_insert() and + // stage::append_insert() has not supported yet + return tree_cursor_t::create_inserted( + this, insert_pos); +} + +void LeafNode::track_split( + const search_position_t& split_pos, Ref<LeafNode> right_node) +{ + // update cursor ownership and position + auto iter = tracked_cursors.lower_bound(split_pos); + while (iter != tracked_cursors.end()) { + auto new_pos = iter->first; + auto p_cursor = iter->second; + iter = tracked_cursors.erase(iter); + new_pos -= split_pos; + p_cursor->update_track<false>(right_node, new_pos); + } +} + +void LeafNode::track_erase( + const search_position_t& erase_pos, match_stage_t erase_stage) +{ + // erase tracking and invalidate the erased cursor + auto to_erase = tracked_cursors.find(erase_pos); + assert(to_erase != tracked_cursors.end()); + to_erase->second->invalidate(); + auto first = tracked_cursors.erase(to_erase); + + // update cursor position + assert(first == tracked_cursors.lower_bound(erase_pos)); + auto pos_upper_bound = erase_pos; + pos_upper_bound.index_by_stage(erase_stage) = INDEX_UPPER_BOUND; + auto last = tracked_cursors.lower_bound(pos_upper_bound); + std::vector<tree_cursor_t*> p_cursors; + std::for_each(first, last, [&p_cursors](auto& kv) { + p_cursors.push_back(kv.second); + }); + tracked_cursors.erase(first, last); + for (auto& p_cursor : p_cursors) { + search_position_t new_pos = p_cursor->get_position(); + assert(new_pos.index_by_stage(erase_stage) > 0); + --new_pos.index_by_stage(erase_stage); + p_cursor->update_track<true>(this, new_pos); + } +} + +eagain_ifuture<LeafNode::fresh_node_t> LeafNode::allocate( + context_t c, laddr_t hint, field_type_t field_type, bool is_level_tail) +{ + return LeafNodeImpl::allocate(c, hint, field_type, is_level_tail + ).si_then([](auto&& fresh_impl) { + auto *derived_ptr = fresh_impl.impl.get(); + auto node = Ref<LeafNode>(new LeafNode( + derived_ptr, std::move(fresh_impl.impl))); + return fresh_node_t{node, fresh_impl.mut}; + }); +} + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node.h new file mode 100644 index 000000000..0b764172e --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node.h @@ -0,0 +1,743 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <compare> +#include <map> +#include <memory> +#include <ostream> +#include <boost/smart_ptr/intrusive_ref_counter.hpp> 
+ +#include "crimson/common/type_helpers.h" + +#include "node_extent_mutable.h" +#include "stages/key_layout.h" +#include "stages/stage_types.h" +#include "super.h" +#include "value.h" + +/** + * Tree example (2 levels): + * + * Root node keys: [ 3 7 ] + * values: [p1 p2 p3] + * / | \ + * ------- | ------- + * | | | + * V V V + * Leaf node keys: [ 1 2 3] [ 4 5 7] [ 9 11 12] + * values: [v1 v2 v3] [v4 v5 v6] [v7 v8 v9] + * + * Tree structure properties: + * - As illustrated above, the parent key is strictly equal to its left child's + * largest key; + * - If a tree is indexing multiple seastore transactions, each transaction + * will be mapped to a Super which points to a distinct root node. So the + * transactions are isolated at tree level. However, tree nodes from + * different transactions can reference the same seastore CachedExtent before + * modification; + * - The resources of the transactional tree are tracked by tree_cursor_ts held + * by users. As long as any cursor is alive, the according tree hierarchy is + * alive and keeps tracked. See the reversed resource management sections + * below; + */ + +namespace crimson::os::seastore::onode { + +class LeafNode; +class InternalNode; + +using layout_version_t = uint32_t; +struct node_version_t { + layout_version_t layout; + nextent_state_t state; + + bool operator==(const node_version_t& rhs) const { + return (layout == rhs.layout && state == rhs.state); + } + bool operator!=(const node_version_t& rhs) const { + return !(*this == rhs); + } +}; + +/** + * tree_cursor_t + * + * A cursor points to a position (LeafNode and search_position_t) of the tree + * where it can find the according key and value pair. The position is updated + * by LeafNode insert/split/delete/merge internally and is kept valid. It also + * caches the key-value information for a specific node layout version. + * + * Exposes public interfaces for Btree::Cursor. + */ +class tree_cursor_t final + : public boost::intrusive_ref_counter< + tree_cursor_t, boost::thread_unsafe_counter> { + public: + ~tree_cursor_t(); + tree_cursor_t(const tree_cursor_t&) = delete; + tree_cursor_t(tree_cursor_t&&) = delete; + tree_cursor_t& operator=(const tree_cursor_t&) = delete; + tree_cursor_t& operator=(tree_cursor_t&&) = delete; + + // public to Btree + + /** + * is_end + * + * Represents one-past-the-last of all the sorted key-value + * pairs in the tree. An end cursor won't contain valid key-value + * information. + */ + bool is_end() const { return !!ref_leaf_node && position.is_end(); } + + /** + * is_tracked + * + * Represents a key-value pair stored in the tree, which is always tracked + * across insert/split/erase/merge operations. + */ + bool is_tracked() const { return !!ref_leaf_node && !position.is_end(); } + + /** + * is_invalid + * + * Represents an invalid cursor which was once valid and tracked by the tree + * but is now erased and untracked. User may still hold an invalid cursor. + */ + bool is_invalid() const { return !ref_leaf_node; } + + /// Returns the key view in tree if it is not an end cursor. + const key_view_t& get_key_view(value_magic_t magic) const { + assert(is_tracked()); + return cache.get_key_view(magic, position); + } + + /// Returns the next tree_cursor_t in tree, can be end if there's no next. + eagain_ifuture<Ref<tree_cursor_t>> get_next(context_t); + + /// Check that this is next to prv + void assert_next_to(const tree_cursor_t&, value_magic_t) const; + + /// Erases the key-value pair from tree. 
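+ ///
+ /// If get_next is true, the returned cursor points to the key-value pair
+ /// that followed the erased one (nullptr if the erased pair was the last);
+ /// otherwise nullptr is returned.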
+ template <bool FORCE_MERGE = false> + eagain_ifuture<Ref<tree_cursor_t>> erase(context_t, bool get_next); + + std::strong_ordering compare_to(const tree_cursor_t&, value_magic_t) const; + + // public to Value + + /// Get the latest value_header_t pointer for read. + const value_header_t* read_value_header(value_magic_t magic) const { + assert(is_tracked()); + return cache.get_p_value_header(magic, position); + } + + /// Prepare the node extent to be mutable and recorded. + std::pair<NodeExtentMutable&, ValueDeltaRecorder*> + prepare_mutate_value_payload(context_t c) { + assert(is_tracked()); + if (!is_mutated) { + is_mutated = true; + ++(c.t.get_onode_tree_stats().num_updates); + } + return cache.prepare_mutate_value_payload(c, position); + } + + /// Extends the size of value payload. + eagain_ifuture<> extend_value(context_t, value_size_t); + + /// Trim and shrink the value payload. + eagain_ifuture<> trim_value(context_t, value_size_t); + + static Ref<tree_cursor_t> get_invalid() { + Ref<tree_cursor_t> INVALID = new tree_cursor_t(); + return INVALID; + } + + private: + // create from insert + tree_cursor_t(Ref<LeafNode>, const search_position_t&); + // create from lookup + tree_cursor_t(Ref<LeafNode>, const search_position_t&, + const key_view_t&, const value_header_t*); + // lookup reaches the end, contain leaf node for further insert + tree_cursor_t(Ref<LeafNode>); + // create an invalid tree_cursor_t + tree_cursor_t() : cache{ref_leaf_node} {} + + const search_position_t& get_position() const { return position; } + Ref<LeafNode> get_leaf_node() const { return ref_leaf_node; } + template <bool VALIDATE> + void update_track(Ref<LeafNode>, const search_position_t&); + void update_cache_same_node(const key_view_t&, + const value_header_t*) const; + void invalidate(); + + static Ref<tree_cursor_t> create_inserted( + Ref<LeafNode> node, const search_position_t& pos) { + return new tree_cursor_t(node, pos); + } + + static Ref<tree_cursor_t> create_tracked( + Ref<LeafNode> node, const search_position_t& pos, + const key_view_t& key, const value_header_t* p_header) { + return new tree_cursor_t(node, pos, key, p_header); + } + + static Ref<tree_cursor_t> create_end(Ref<LeafNode> node) { + return new tree_cursor_t(node); + } + + /** + * Reversed resource management (tree_cursor_t) + * + * tree_cursor_t holds a reference to the LeafNode, so the LeafNode will be + * alive as long as any of it's cursors is still referenced by user. + */ + Ref<LeafNode> ref_leaf_node; + search_position_t position; + + // account 1 update even if there are multiple updates to the same value + bool is_mutated = false; + + /** Cache + * + * Cached memory pointers or views which may be outdated due to + * extent copy-on-write or asynchronous leaf node updates. 
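+ *
+ * Validity is tracked with a node_version_t (layout version plus extent
+ * state); stale views are refreshed lazily by make_latest() on access.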
+ */ + class Cache { + public: + Cache(Ref<LeafNode>&); + void validate_is_latest(const search_position_t&) const; + void invalidate() { needs_update_all = true; } + void update_all(const node_version_t&, const key_view_t&, const value_header_t*); + const key_view_t& get_key_view( + value_magic_t magic, const search_position_t& pos) { + make_latest(magic, pos); + return *key_view; + } + const value_header_t* get_p_value_header( + value_magic_t magic, const search_position_t& pos) { + make_latest(magic, pos); + return p_value_header; + } + std::pair<NodeExtentMutable&, ValueDeltaRecorder*> + prepare_mutate_value_payload(context_t, const search_position_t&); + + private: + void maybe_duplicate(const node_version_t&); + void make_latest(value_magic_t, const search_position_t&); + + // metadata about how cache is valid + Ref<LeafNode>& ref_leaf_node; + bool needs_update_all = true; + node_version_t version; + + // cached key value info + const char* p_node_base = nullptr; + std::optional<key_view_t> key_view; + const value_header_t* p_value_header = nullptr; + + // cached data-structures to update value payload + std::optional<NodeExtentMutable> value_payload_mut; + ValueDeltaRecorder* p_value_recorder = nullptr; + }; + mutable Cache cache; + + friend class LeafNode; + friend class Node; // get_position(), get_leaf_node() +}; + +/** + * Node + * + * An abstracted class for both InternalNode and LeafNode. + * + * Exposes public interfaces for Btree. + */ +class Node + : public boost::intrusive_ref_counter< + Node, boost::thread_unsafe_counter> { + public: + // public to Btree + struct search_result_t { + bool is_end() const { return p_cursor->is_end(); } + Ref<tree_cursor_t> p_cursor; + match_stat_t mstat; + + MatchKindBS match() const { + assert(mstat >= MSTAT_MIN && mstat <= MSTAT_MAX); + return (mstat == MSTAT_EQ ? MatchKindBS::EQ : MatchKindBS::NE); + } + + void validate_input_key(const key_hobj_t& key, value_magic_t magic) const { +#ifndef NDEBUG + if (match() == MatchKindBS::EQ) { + assert(key == p_cursor->get_key_view(magic)); + } else { + assert(match() == MatchKindBS::NE); + if (p_cursor->is_tracked()) { + assert(key < p_cursor->get_key_view(magic)); + } else if (p_cursor->is_end()) { + // good + } else { + assert(p_cursor->is_invalid()); + ceph_abort("impossible"); + } + } +#endif + } + }; + + virtual ~Node(); + Node(const Node&) = delete; + Node(Node&&) = delete; + Node& operator=(const Node&) = delete; + Node& operator=(Node&&) = delete; + + /** + * level + * + * A positive value denotes the level (or height) of this node in tree. + * 0 means LeafNode, positive means InternalNode. + */ + level_t level() const; + + /** + * lookup_smallest + * + * Returns a cursor pointing to the smallest key in the sub-tree formed by + * this node. + * + * Returns an end cursor if it is an empty root node. + */ + virtual eagain_ifuture<Ref<tree_cursor_t>> lookup_smallest(context_t) = 0; + + /** + * lookup_largest + * + * Returns a cursor pointing to the largest key in the sub-tree formed by + * this node. + * + * Returns an end cursor if it is an empty root node. + */ + virtual eagain_ifuture<Ref<tree_cursor_t>> lookup_largest(context_t) = 0; + + /** + * lower_bound + * + * Returns a cursor pointing to the first element in the range [first, last) + * of the sub-tree which does not compare less than the input key. The + * result also denotes whether the pointed key is equal to the input key. 
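+ *
+ * For example, with keys [2 5 8], lower_bound(5) points at 5 with
+ * MatchKindBS::EQ, while lower_bound(6) points at 8 with MatchKindBS::NE.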
+ * + * Returns an end cursor with MatchKindBS::NE if: + * - It is an empty root node; + * - Or the input key is larger than all the keys in the sub-tree; + */ + eagain_ifuture<search_result_t> lower_bound(context_t c, const key_hobj_t& key); + + /** + * insert + * + * Try to insert a key-value pair into the sub-tree formed by this node. + * + * Returns a boolean denoting whether the insertion is successful: + * - If true, the returned cursor points to the inserted element in tree; + * - If false, the returned cursor points to the conflicting element in tree; + */ + eagain_ifuture<std::pair<Ref<tree_cursor_t>, bool>> insert( + context_t, const key_hobj_t&, value_config_t, Ref<Node>&&); + + /** + * erase + * + * Removes a key-value pair from the sub-tree formed by this node. + * + * Returns the number of erased key-value pairs (0 or 1). + */ + eagain_ifuture<std::size_t> erase(context_t, const key_hobj_t&, Ref<Node>&&); + + /// Recursively collects the statistics of the sub-tree formed by this node + eagain_ifuture<tree_stats_t> get_tree_stats(context_t); + + /// Returns an ostream containing a dump of all the elements in the node. + std::ostream& dump(std::ostream&) const; + + /// Returns an ostream containing an one-line summary of this node. + std::ostream& dump_brief(std::ostream&) const; + + /// Print the node name + const std::string& get_name() const; + + /// Initializes the tree by allocating an empty root node. + static eagain_ifuture<> mkfs(context_t, RootNodeTracker&); + + /// Loads the tree root. The tree must be initialized. + static eagain_ifuture<Ref<Node>> load_root(context_t, RootNodeTracker&); + + // Only for unit test purposes. + void test_make_destructable(context_t, NodeExtentMutable&, Super::URef&&); + virtual eagain_ifuture<> test_clone_root(context_t, RootNodeTracker&) const = 0; + + protected: + virtual eagain_ifuture<> test_clone_non_root(context_t, Ref<InternalNode>) const { + ceph_abort("impossible path"); + } + virtual eagain_ifuture<search_result_t> lower_bound_tracked( + context_t, const key_hobj_t&, MatchHistory&) = 0; + virtual eagain_ifuture<> do_get_tree_stats(context_t, tree_stats_t&) = 0; + + virtual bool is_tracking() const = 0; + + virtual void track_merge(Ref<Node>, match_stage_t, search_position_t&) = 0; + + protected: + Node(NodeImplURef&&); + + bool is_tracked() const { + assert(!(super && _parent_info.has_value())); + return (super || _parent_info.has_value()); + } + + bool is_root() const { + assert(is_tracked()); + return !_parent_info.has_value(); + } + + // as root + void make_root(context_t c, Super::URef&& _super); + void make_root_new(context_t c, Super::URef&& _super) { + assert(_super->get_root_laddr() == L_ADDR_NULL); + make_root(c, std::move(_super)); + } + void make_root_from(context_t c, Super::URef&& _super, laddr_t from_addr) { + assert(_super->get_root_laddr() == from_addr); + make_root(c, std::move(_super)); + } + void as_root(Super::URef&& _super); + eagain_ifuture<> upgrade_root(context_t, laddr_t); + + Super::URef deref_super(); + + // as child/non-root + template <bool VALIDATE = true> + void as_child(const search_position_t&, Ref<InternalNode>); + + struct parent_info_t { + search_position_t position; + Ref<InternalNode> ptr; + }; + const parent_info_t& parent_info() const { return *_parent_info; } + + Ref<InternalNode> deref_parent(); + + eagain_ifuture<> apply_split_to_parent(context_t, Ref<Node>&&, Ref<Node>&&, bool); + eagain_ifuture<Ref<tree_cursor_t>> get_next_cursor_from_parent(context_t); + template <bool FORCE_MERGE 
= false> + eagain_ifuture<> try_merge_adjacent(context_t, bool, Ref<Node>&&); + eagain_ifuture<> erase_node(context_t, Ref<Node>&&); + template <bool FORCE_MERGE = false> + eagain_ifuture<> fix_parent_index(context_t, Ref<Node>&&, bool); + eagain_ifuture<NodeExtentMutable> rebuild_extent(context_t); + eagain_ifuture<> retire(context_t, Ref<Node>&&); + void make_tail(context_t); + + private: + /** + * Reversed resource management (Node) + * + * Root Node holds a reference to its parent Super class, so its parent + * will be alive as long as this root node is alive. + * + * None-root Node holds a reference to its parent Node, so its parent will + * be alive as long as any of it's children is alive. + */ + // as root + Super::URef super; + // as child/non-root + std::optional<parent_info_t> _parent_info; + + private: + static eagain_ifuture<Ref<Node>> load(context_t, laddr_t, bool expect_is_level_tail); + + NodeImplURef impl; + friend class InternalNode; +}; +inline std::ostream& operator<<(std::ostream& os, const Node& node) { + return node.dump_brief(os); +} + +/** + * InternalNode + * + * A concrete implementation of Node class that represents an internal tree + * node. Its level is always positive and its values are logical block + * addresses to its child nodes. An internal node cannot be empty. + */ +class InternalNode final : public Node { + public: + // public to Node + InternalNode(InternalNodeImpl*, NodeImplURef&&); + ~InternalNode() override { assert(tracked_child_nodes.empty()); } + InternalNode(const InternalNode&) = delete; + InternalNode(InternalNode&&) = delete; + InternalNode& operator=(const InternalNode&) = delete; + InternalNode& operator=(InternalNode&&) = delete; + + eagain_ifuture<Ref<tree_cursor_t>> get_next_cursor(context_t, const search_position_t&); + + eagain_ifuture<> apply_child_split(context_t, Ref<Node>&& left, Ref<Node>&& right, bool); + + template <bool VALIDATE> + void do_track_child(Node& child) { + if constexpr (VALIDATE) { + validate_child(child); + } + auto& child_pos = child.parent_info().position; + assert(tracked_child_nodes.find(child_pos) == tracked_child_nodes.end()); + tracked_child_nodes[child_pos] = &child; + } + + void do_untrack_child(const Node& child) { + assert(check_is_tracking(child)); + auto& child_pos = child.parent_info().position; + [[maybe_unused]] auto removed = tracked_child_nodes.erase(child_pos); + assert(removed); + } + + bool check_is_tracking(const Node& child) const { + auto& child_pos = child.parent_info().position; + auto found = tracked_child_nodes.find(child_pos); + if (found != tracked_child_nodes.end() && found->second == &child) { + assert(child.parent_info().ptr == this); + return true; + } else { + return false; + } + } + + eagain_ifuture<std::pair<Ref<Node>, Ref<Node>>> get_child_peers( + context_t, const search_position_t&); + + eagain_ifuture<> erase_child(context_t, Ref<Node>&&); + + template <bool FORCE_MERGE = false> + eagain_ifuture<> fix_index(context_t, Ref<Node>&&, bool); + + template <bool FORCE_MERGE = false> + eagain_ifuture<> apply_children_merge( + context_t, Ref<Node>&& left, laddr_t, Ref<Node>&& right, bool update_index); + + void validate_child_tracked(const Node& child) const { + validate_child(child); + assert(tracked_child_nodes.find(child.parent_info().position) != + tracked_child_nodes.end()); + assert(tracked_child_nodes.find(child.parent_info().position)->second == &child); + } + + void validate_child_inconsistent(const Node& child) const; + + void validate_tracked_children() const { +#ifndef 
NDEBUG + for (auto& kv : tracked_child_nodes) { + assert(kv.first == kv.second->parent_info().position); + validate_child(*kv.second); + } +#endif + } + + void track_make_tail(const search_position_t&); + + static eagain_ifuture<Ref<InternalNode>> allocate_root( + context_t, laddr_t, level_t, laddr_t, Super::URef&&); + + protected: + eagain_ifuture<Ref<tree_cursor_t>> lookup_smallest(context_t) override; + eagain_ifuture<Ref<tree_cursor_t>> lookup_largest(context_t) override; + eagain_ifuture<search_result_t> lower_bound_tracked( + context_t, const key_hobj_t&, MatchHistory&) override; + eagain_ifuture<> do_get_tree_stats(context_t, tree_stats_t&) override; + bool is_tracking() const override { + return !tracked_child_nodes.empty(); + } + void track_merge(Ref<Node>, match_stage_t, search_position_t&) override; + + eagain_ifuture<> test_clone_root(context_t, RootNodeTracker&) const override; + + private: + eagain_ifuture<> try_downgrade_root(context_t, Ref<Node>&&); + + eagain_ifuture<Ref<InternalNode>> insert_or_split( + context_t, const search_position_t&, const key_view_t&, Ref<Node>, + Ref<Node> outdated_child=nullptr); + + // XXX: extract a common tracker for InternalNode to track Node, + // and LeafNode to track tree_cursor_t. + eagain_ifuture<Ref<Node>> get_or_track_child(context_t, const search_position_t&, laddr_t); + template <bool VALIDATE = true> + void track_insert( + const search_position_t&, match_stage_t, Ref<Node>, Ref<Node> nxt_child = nullptr); + void replace_track(Ref<Node> new_child, Ref<Node> old_child, bool); + void track_split(const search_position_t&, Ref<InternalNode>); + template <bool VALIDATE = true> + void track_erase(const search_position_t&, match_stage_t); + void validate_child(const Node& child) const; + + struct fresh_node_t { + Ref<InternalNode> node; + NodeExtentMutable mut; + std::pair<Ref<Node>, NodeExtentMutable> make_pair() { + return std::make_pair(Ref<Node>(node), mut); + } + }; + static eagain_ifuture<fresh_node_t> allocate(context_t, laddr_t, field_type_t, bool, level_t); + + private: + /** + * Reversed resource management (InternalNode) + * + * InternalNode keeps track of its child nodes which are still alive in + * memory, and their positions will be updated throughout + * insert/split/delete/merge operations of this node. + */ + // XXX: leverage intrusive data structure to control memory overhead + std::map<search_position_t, Node*> tracked_child_nodes; + InternalNodeImpl* impl; +}; + +/** + * LeafNode + * + * A concrete implementation of Node class that represents a leaf tree node. + * Its level is always 0. A leaf node can only be empty if it is root. + */ +class LeafNode final : public Node { + public: + // public to tree_cursor_t + ~LeafNode() override { assert(tracked_cursors.empty()); } + LeafNode(const LeafNode&) = delete; + LeafNode(LeafNode&&) = delete; + LeafNode& operator=(const LeafNode&) = delete; + LeafNode& operator=(LeafNode&&) = delete; + + bool is_level_tail() const; + node_version_t get_version() const; + const char* read() const; + extent_len_t get_node_size() const; + std::tuple<key_view_t, const value_header_t*> get_kv(const search_position_t&) const; + eagain_ifuture<Ref<tree_cursor_t>> get_next_cursor(context_t, const search_position_t&); + + /** + * erase + * + * Removes a key-value pair from the position. + * + * If get_next is true, returns the cursor pointing to the next key-value + * pair that followed the erased element, which can be nullptr if it is the end. 
+ */ + template <bool FORCE_MERGE> + eagain_ifuture<Ref<tree_cursor_t>> erase( + context_t, const search_position_t&, bool get_next); + + template <bool VALIDATE> + void do_track_cursor(tree_cursor_t& cursor) { + if constexpr (VALIDATE) { + validate_cursor(cursor); + } + auto& cursor_pos = cursor.get_position(); + assert(tracked_cursors.find(cursor_pos) == tracked_cursors.end()); + tracked_cursors.emplace(cursor_pos, &cursor); + } + void do_untrack_cursor(const tree_cursor_t& cursor) { + validate_cursor(cursor); + auto& cursor_pos = cursor.get_position(); + assert(check_is_tracking(cursor)); + [[maybe_unused]] auto removed = tracked_cursors.erase(cursor_pos); + assert(removed); + } + bool check_is_tracking(const tree_cursor_t& cursor) const { + auto& cursor_pos = cursor.get_position(); + auto found = tracked_cursors.find(cursor_pos); + if (found != tracked_cursors.end() && found->second == &cursor) { + assert(cursor.ref_leaf_node == this); + return true; + } else { + return false; + } + } + + eagain_ifuture<> extend_value(context_t, const search_position_t&, value_size_t); + eagain_ifuture<> trim_value(context_t, const search_position_t&, value_size_t); + + std::pair<NodeExtentMutable&, ValueDeltaRecorder*> + prepare_mutate_value_payload(context_t); + + protected: + eagain_ifuture<Ref<tree_cursor_t>> lookup_smallest(context_t) override; + eagain_ifuture<Ref<tree_cursor_t>> lookup_largest(context_t) override; + eagain_ifuture<search_result_t> lower_bound_tracked( + context_t, const key_hobj_t&, MatchHistory&) override; + eagain_ifuture<> do_get_tree_stats(context_t, tree_stats_t&) override; + bool is_tracking() const override { + return !tracked_cursors.empty(); + } + void track_merge(Ref<Node>, match_stage_t, search_position_t&) override; + + eagain_ifuture<> test_clone_root(context_t, RootNodeTracker&) const override; + + private: + LeafNode(LeafNodeImpl*, NodeImplURef&&); + eagain_ifuture<Ref<tree_cursor_t>> insert_value( + context_t, const key_hobj_t&, value_config_t, + const search_position_t&, const MatchHistory&, + match_stat_t mstat); + static eagain_ifuture<Ref<LeafNode>> allocate_root(context_t, RootNodeTracker&); + friend class Node; + + private: + // XXX: extract a common tracker for InternalNode to track Node, + // and LeafNode to track tree_cursor_t. + Ref<tree_cursor_t> get_or_track_cursor( + const search_position_t&, const key_view_t&, const value_header_t*); + Ref<tree_cursor_t> track_insert( + const search_position_t&, match_stage_t, const value_header_t*); + void track_split(const search_position_t&, Ref<LeafNode>); + void track_erase(const search_position_t&, match_stage_t); + void validate_tracked_cursors() const { +#ifndef NDEBUG + for (auto& kv : tracked_cursors) { + assert(kv.first == kv.second->get_position()); + validate_cursor(*kv.second); + } +#endif + } + void validate_cursor(const tree_cursor_t& cursor) const; + // invalidate p_value pointers in tree_cursor_t + void on_layout_change() { ++layout_version; } + + struct fresh_node_t { + Ref<LeafNode> node; + NodeExtentMutable mut; + std::pair<Ref<Node>, NodeExtentMutable> make_pair() { + return std::make_pair(Ref<Node>(node), mut); + } + }; + static eagain_ifuture<fresh_node_t> allocate(context_t, laddr_t, field_type_t, bool); + + private: + /** + * Reversed resource management (LeafNode) + * + * LeafNode keeps track of the referencing cursors which are still alive in + * memory, and their positions will be updated throughout + * insert/split/delete/merge operations of this node. 
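+ * + * Put differently (an illustrative note, not new behavior): a tree_cursor_t + * keeps its LeafNode alive through its ref_leaf_node reference, while the + * LeafNode only stores raw tree_cursor_t* back-pointers in tracked_cursors + * so it can fix cursor positions after splits/merges; see do_track_cursor() + * and do_untrack_cursor() above.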
+ */ + // XXX: leverage intrusive data structure to control memory overhead + std::map<search_position_t, tree_cursor_t*> tracked_cursors; + LeafNodeImpl* impl; + layout_version_t layout_version = 0; +}; + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_delta_recorder.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_delta_recorder.h new file mode 100644 index 000000000..ea26195de --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_delta_recorder.h @@ -0,0 +1,55 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "include/buffer.h" +#include "node_types.h" +#include "value.h" + +namespace crimson::os::seastore::onode { + +/** + * DeltaRecorder + * + * An abstracted class to encapsulate different implementations to apply delta + * to a specific node layout. + */ +class DeltaRecorder { + public: + virtual ~DeltaRecorder() { + /* May be non-empty if transaction is abandoned without + * being submitted -- conflicts are a particularly common + * example (denoted generally by returning crimson::ct_error::eagain). + */ + } + + bool is_empty() const { + return encoded.length() == 0; + } + + ceph::bufferlist get_delta() { + return std::move(encoded); + } + + ValueDeltaRecorder* get_value_recorder() const { + assert(value_recorder); + return value_recorder.get(); + } + + virtual node_type_t node_type() const = 0; + virtual field_type_t field_type() const = 0; + virtual void apply_delta(ceph::bufferlist::const_iterator&, + NodeExtentMutable&, + const NodeExtent&) = 0; + + protected: + DeltaRecorder() = default; + DeltaRecorder(const ValueBuilder& vb) + : value_recorder{vb.build_value_recorder(encoded)} {} + + ceph::bufferlist encoded; + std::unique_ptr<ValueDeltaRecorder> value_recorder; +}; + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_accessor.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_accessor.h new file mode 100644 index 000000000..1a03036d3 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_accessor.h @@ -0,0 +1,619 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "crimson/os/seastore/logging.h" + +#include "node_extent_manager.h" +#include "node_delta_recorder.h" +#include "node_layout_replayable.h" +#include "value.h" + +#ifndef NDEBUG +#include "node_extent_manager/test_replay.h" +#endif + +namespace crimson::os::seastore::onode { + +/** + * DeltaRecorderT + * + * Responsible to encode and decode delta, and apply delta for a specific node + * layout. 
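+ * + * A rough sketch of the intended flow (illustrative only; the real call + * sites are in NodeExtentAccessorT below): + * + *   // while mutating a MUTATION_PENDING extent, record the logical op + *   recorder.encode_insert<KeyT::HOBJ>(key, value, insert_pos, insert_stage, insert_size); + * + *   // during replay, decode the recorded ops and re-apply them to the extent + *   auto p = recorded_deltas.cbegin(); + *   recorder.apply_delta(p, mut, node_extent); + * + * Here recorded_deltas, mut and node_extent are placeholder names for the + * journaled bufferlist, the mutable extent view and the extent being + * replayed.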
+ */ +template <typename FieldType, node_type_t NODE_TYPE> +class DeltaRecorderT final: public DeltaRecorder { + public: + using layout_t = NodeLayoutReplayableT<FieldType, NODE_TYPE>; + using node_stage_t = typename layout_t::node_stage_t; + using position_t = typename layout_t::position_t; + using StagedIterator = typename layout_t::StagedIterator; + using value_input_t = typename layout_t::value_input_t; + static constexpr auto FIELD_TYPE = layout_t::FIELD_TYPE; + + ~DeltaRecorderT() override = default; + + template <KeyT KT> + void encode_insert( + const full_key_t<KT>& key, + const value_input_t& value, + const position_t& insert_pos, + const match_stage_t& insert_stage, + const node_offset_t& insert_size) { + ceph::encode(node_delta_op_t::INSERT, encoded); + encode_key(key, encoded); + encode_value(value, encoded); + insert_pos.encode(encoded); + ceph::encode(insert_stage, encoded); + ceph::encode(insert_size, encoded); + } + + void encode_split( + const StagedIterator& split_at, + const char* p_node_start) { + ceph::encode(node_delta_op_t::SPLIT, encoded); + split_at.encode(p_node_start, encoded); + } + + template <KeyT KT> + void encode_split_insert( + const StagedIterator& split_at, + const full_key_t<KT>& key, + const value_input_t& value, + const position_t& insert_pos, + const match_stage_t& insert_stage, + const node_offset_t& insert_size, + const char* p_node_start) { + ceph::encode(node_delta_op_t::SPLIT_INSERT, encoded); + split_at.encode(p_node_start, encoded); + encode_key(key, encoded); + encode_value(value, encoded); + insert_pos.encode(encoded); + ceph::encode(insert_stage, encoded); + ceph::encode(insert_size, encoded); + } + + void encode_update_child_addr( + const laddr_t new_addr, + const laddr_packed_t* p_addr, + const char* p_node_start, + extent_len_t node_size) { + ceph::encode(node_delta_op_t::UPDATE_CHILD_ADDR, encoded); + ceph::encode(new_addr, encoded); + int node_offset = reinterpret_cast<const char*>(p_addr) - p_node_start; + assert(node_offset > 0 && node_offset < (int)node_size); + ceph::encode(static_cast<node_offset_t>(node_offset), encoded); + } + + void encode_erase( + const position_t& erase_pos) { + ceph::encode(node_delta_op_t::ERASE, encoded); + erase_pos.encode(encoded); + } + + void encode_make_tail() { + ceph::encode(node_delta_op_t::MAKE_TAIL, encoded); + } + + static DeltaRecorderURef create_for_encode(const ValueBuilder& v_builder) { + return std::unique_ptr<DeltaRecorder>(new DeltaRecorderT(v_builder)); + } + + static DeltaRecorderURef create_for_replay() { + return std::unique_ptr<DeltaRecorder>(new DeltaRecorderT()); + } + + protected: + DeltaRecorderT() : DeltaRecorder() {} + DeltaRecorderT(const ValueBuilder& vb) : DeltaRecorder(vb) {} + node_type_t node_type() const override { return NODE_TYPE; } + field_type_t field_type() const override { return FIELD_TYPE; } + void apply_delta(ceph::bufferlist::const_iterator& delta, + NodeExtentMutable& mut, + const NodeExtent& node) override { + LOG_PREFIX(OTree::Extent::Replay); + assert(is_empty()); + node_stage_t stage(reinterpret_cast<const FieldType*>(mut.get_read()), + mut.get_length()); + node_delta_op_t op; + try { + ceph::decode(op, delta); + switch (op) { + case node_delta_op_t::INSERT: { + SUBDEBUG(seastore_onode, "decoding INSERT ..."); + auto key = key_hobj_t::decode(delta); + auto value = decode_value(delta); + auto insert_pos = position_t::decode(delta); + match_stage_t insert_stage; + ceph::decode(insert_stage, delta); + node_offset_t insert_size; + ceph::decode(insert_size, 
delta); + SUBDEBUG(seastore_onode, + "apply {}, {}, insert_pos({}), insert_stage={}, " + "insert_size={}B ...", + key, value, insert_pos, insert_stage, insert_size); + layout_t::template insert<KeyT::HOBJ>( + mut, stage, key, value, insert_pos, insert_stage, insert_size); + break; + } + case node_delta_op_t::SPLIT: { + SUBDEBUG(seastore_onode, "decoding SPLIT ..."); + auto split_at = StagedIterator::decode( + mut.get_read(), mut.get_length(), delta); + SUBDEBUG(seastore_onode, "apply split_at={} ...", split_at); + layout_t::split(mut, stage, split_at); + break; + } + case node_delta_op_t::SPLIT_INSERT: { + SUBDEBUG(seastore_onode, "decoding SPLIT_INSERT ..."); + auto split_at = StagedIterator::decode( + mut.get_read(), mut.get_length(), delta); + auto key = key_hobj_t::decode(delta); + auto value = decode_value(delta); + auto insert_pos = position_t::decode(delta); + match_stage_t insert_stage; + ceph::decode(insert_stage, delta); + node_offset_t insert_size; + ceph::decode(insert_size, delta); + SUBDEBUG(seastore_onode, + "apply split_at={}, {}, {}, insert_pos({}), insert_stage={}, " + "insert_size={}B ...", + split_at, key, value, insert_pos, insert_stage, insert_size); + layout_t::template split_insert<KeyT::HOBJ>( + mut, stage, split_at, key, value, insert_pos, insert_stage, insert_size); + break; + } + case node_delta_op_t::UPDATE_CHILD_ADDR: { + SUBDEBUG(seastore_onode, "decoding UPDATE_CHILD_ADDR ..."); + laddr_t new_addr; + ceph::decode(new_addr, delta); + node_offset_t update_offset; + ceph::decode(update_offset, delta); + auto p_addr = reinterpret_cast<laddr_packed_t*>( + mut.get_write() + update_offset); + SUBDEBUG(seastore_onode, + "apply {:#x} to offset {:#x} ...", + new_addr, update_offset); + layout_t::update_child_addr(mut, new_addr, p_addr); + break; + } + case node_delta_op_t::ERASE: { + SUBDEBUG(seastore_onode, "decoding ERASE ..."); + auto erase_pos = position_t::decode(delta); + SUBDEBUG(seastore_onode, "apply erase_pos({}) ...", erase_pos); + layout_t::erase(mut, stage, erase_pos); + break; + } + case node_delta_op_t::MAKE_TAIL: { + SUBDEBUG(seastore_onode, "decoded MAKE_TAIL, apply ..."); + layout_t::make_tail(mut, stage); + break; + } + case node_delta_op_t::SUBOP_UPDATE_VALUE: { + SUBDEBUG(seastore_onode, "decoding SUBOP_UPDATE_VALUE ..."); + node_offset_t value_header_offset; + ceph::decode(value_header_offset, delta); + auto p_header = mut.get_read() + value_header_offset; + auto p_header_ = reinterpret_cast<const value_header_t*>(p_header); + SUBDEBUG(seastore_onode, "update {} at {:#x} ...", *p_header_, value_header_offset); + auto payload_mut = p_header_->get_payload_mutable(mut); + auto value_addr = node.get_laddr() + payload_mut.get_node_offset(); + get_value_replayer(p_header_->magic)->apply_value_delta( + delta, payload_mut, value_addr); + break; + } + default: + SUBERROR(seastore_onode, + "got unknown op {} when replay {}", + op, node); + ceph_abort("fatal error"); + } + } catch (buffer::error& e) { + SUBERROR(seastore_onode, + "got decode error {} when replay {}", + e.what(), node); + ceph_abort("fatal error"); + } + } + + private: + ValueDeltaRecorder* get_value_replayer(value_magic_t magic) { + // Replay procedure is independent of Btree and happens at lower level in + // seastore. There is no ValueBuilder so the recoder needs to build the + // ValueDeltaRecorder by itself. 
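+ // All value deltas replayed against a single node are expected to share + // the same value magic; a mismatch is treated as fatal below.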
+ if (value_replayer) { + if (value_replayer->get_header_magic() != magic) { + ceph_abort_msgf("OTree::Extent::Replay: value magic mismatch %x != %x", + value_replayer->get_header_magic(), magic); + } + } else { + value_replayer = build_value_recorder_by_type(encoded, magic); + if (!value_replayer) { + ceph_abort_msgf("OTree::Extent::Replay: got unexpected value magic = %x", + magic); + } + } + return value_replayer.get(); + } + + void encode_value(const value_input_t& value, ceph::bufferlist& encoded) const { + if constexpr (std::is_same_v<value_input_t, laddr_t>) { + // NODE_TYPE == node_type_t::INTERNAL + ceph::encode(value, encoded); + } else if constexpr (std::is_same_v<value_input_t, value_config_t>) { + // NODE_TYPE == node_type_t::LEAF + value.encode(encoded); + } else { + ceph_abort("impossible path"); + } + } + + value_input_t decode_value(ceph::bufferlist::const_iterator& delta) const { + if constexpr (std::is_same_v<value_input_t, laddr_t>) { + // NODE_TYPE == node_type_t::INTERNAL + laddr_t value; + ceph::decode(value, delta); + return value; + } else if constexpr (std::is_same_v<value_input_t, value_config_t>) { + // NODE_TYPE == node_type_t::LEAF + return value_config_t::decode(delta); + } else { + ceph_abort("impossible path"); + } + } + + std::unique_ptr<ValueDeltaRecorder> value_replayer; +}; + +/** + * NodeExtentAccessorT + * + * This component is responsible to reference and mutate the underlying + * NodeExtent, record mutation parameters when needed, and apply the recorded + * modifications for a specific node layout. + * + * For possible internal states, see node_types.h. + */ +template <typename FieldType, node_type_t NODE_TYPE> +class NodeExtentAccessorT { + public: + using layout_t = NodeLayoutReplayableT<FieldType, NODE_TYPE>; + using node_stage_t = typename layout_t::node_stage_t; + using position_t = typename layout_t::position_t; + using recorder_t = DeltaRecorderT<FieldType, NODE_TYPE>; + using StagedIterator = typename layout_t::StagedIterator; + using value_input_t = typename layout_t::value_input_t; + using value_t = typename layout_t::value_t; + static constexpr auto FIELD_TYPE = layout_t::FIELD_TYPE; + + NodeExtentAccessorT(NodeExtentRef extent) + : extent{extent}, + node_stage{reinterpret_cast<const FieldType*>(extent->get_read()), + extent->get_length()} { + assert(is_valid_node_size(extent->get_length())); + if (extent->is_initial_pending()) { + state = nextent_state_t::FRESH; + mut.emplace(extent->get_mutable()); + assert(extent->get_recorder() == nullptr); + recorder = nullptr; + } else if (extent->is_mutation_pending()) { + state = nextent_state_t::MUTATION_PENDING; + mut.emplace(extent->get_mutable()); + auto p_recorder = extent->get_recorder(); + assert(p_recorder != nullptr); + assert(p_recorder->node_type() == NODE_TYPE); + assert(p_recorder->field_type() == FIELD_TYPE); + recorder = static_cast<recorder_t*>(p_recorder); + } else if (!extent->is_mutable() && extent->is_valid()) { + state = nextent_state_t::READ_ONLY; + // mut is empty + assert(extent->get_recorder() == nullptr || + extent->get_recorder()->is_empty()); + recorder = nullptr; + } else { + // extent is invalid or retired + ceph_abort("impossible path"); + } +#ifndef NDEBUG + auto ref_recorder = recorder_t::create_for_replay(); + test_recorder = static_cast<recorder_t*>(ref_recorder.get()); + test_extent = TestReplayExtent::create( + get_length(), std::move(ref_recorder)); +#endif + } + ~NodeExtentAccessorT() = default; + NodeExtentAccessorT(const NodeExtentAccessorT&) = delete; + 
NodeExtentAccessorT(NodeExtentAccessorT&&) = delete; + NodeExtentAccessorT& operator=(const NodeExtentAccessorT&) = delete; + NodeExtentAccessorT& operator=(NodeExtentAccessorT&&) = delete; + + const node_stage_t& read() const { return node_stage; } + laddr_t get_laddr() const { return extent->get_laddr(); } + extent_len_t get_length() const { + auto len = extent->get_length(); + assert(is_valid_node_size(len)); + return len; + } + nextent_state_t get_state() const { + assert(!is_retired()); + // we cannot rely on the underlying extent state because + // FRESH/MUTATION_PENDING can become DIRTY after transaction submission. + return state; + } + + bool is_retired() const { + if (extent) { + return false; + } else { + return true; + } + } + + // must be called before any mutate attempts. + // for the safety of mixed read and mutate, call before read. + void prepare_mutate(context_t c) { + assert(!is_retired()); + if (state == nextent_state_t::READ_ONLY) { + assert(!extent->is_mutable()); + auto ref_recorder = recorder_t::create_for_encode(c.vb); + recorder = static_cast<recorder_t*>(ref_recorder.get()); + extent = extent->mutate(c, std::move(ref_recorder)); + state = nextent_state_t::MUTATION_PENDING; + assert(extent->is_mutation_pending()); + node_stage = node_stage_t(reinterpret_cast<const FieldType*>(extent->get_read()), + get_length()); + assert(recorder == static_cast<recorder_t*>(extent->get_recorder())); + mut.emplace(extent->get_mutable()); + } + assert(extent->is_mutable()); + } + + template <KeyT KT> + const value_t* insert_replayable( + const full_key_t<KT>& key, + const value_input_t& value, + position_t& insert_pos, + match_stage_t& insert_stage, + node_offset_t& insert_size) { + assert(extent->is_mutable()); + assert(state != nextent_state_t::READ_ONLY); + if (state == nextent_state_t::MUTATION_PENDING) { + recorder->template encode_insert<KT>( + key, value, insert_pos, insert_stage, insert_size); + } +#ifndef NDEBUG + test_extent->prepare_replay(extent); + test_recorder->template encode_insert<KT>( + key, value, insert_pos, insert_stage, insert_size); +#endif + auto ret = layout_t::template insert<KT>( + *mut, read(), key, value, + insert_pos, insert_stage, insert_size); +#ifndef NDEBUG + test_extent->replay_and_verify(extent); +#endif + return ret; + } + + void split_replayable(StagedIterator& split_at) { + assert(extent->is_mutable()); + assert(state != nextent_state_t::READ_ONLY); + if (state == nextent_state_t::MUTATION_PENDING) { + recorder->encode_split(split_at, read().p_start()); + } +#ifndef NDEBUG + test_extent->prepare_replay(extent); + test_recorder->encode_split(split_at, read().p_start()); +#endif + layout_t::split(*mut, read(), split_at); +#ifndef NDEBUG + test_extent->replay_and_verify(extent); +#endif + } + + template <KeyT KT> + const value_t* split_insert_replayable( + StagedIterator& split_at, + const full_key_t<KT>& key, + const value_input_t& value, + position_t& insert_pos, + match_stage_t& insert_stage, + node_offset_t& insert_size) { + assert(extent->is_mutable()); + assert(state != nextent_state_t::READ_ONLY); + if (state == nextent_state_t::MUTATION_PENDING) { + recorder->template encode_split_insert<KT>( + split_at, key, value, insert_pos, insert_stage, insert_size, + read().p_start()); + } +#ifndef NDEBUG + test_extent->prepare_replay(extent); + test_recorder->template encode_split_insert<KT>( + split_at, key, value, insert_pos, insert_stage, insert_size, + read().p_start()); +#endif + auto ret = layout_t::template split_insert<KT>( + *mut, read(), 
split_at, key, value, + insert_pos, insert_stage, insert_size); +#ifndef NDEBUG + test_extent->replay_and_verify(extent); +#endif + return ret; + } + + void update_child_addr_replayable( + const laddr_t new_addr, laddr_packed_t* p_addr) { + assert(extent->is_mutable()); + assert(state != nextent_state_t::READ_ONLY); + if (state == nextent_state_t::MUTATION_PENDING) { + recorder->encode_update_child_addr( + new_addr, p_addr, read().p_start(), get_length()); + } +#ifndef NDEBUG + test_extent->prepare_replay(extent); + test_recorder->encode_update_child_addr( + new_addr, p_addr, read().p_start(), get_length()); +#endif + layout_t::update_child_addr(*mut, new_addr, p_addr); +#ifndef NDEBUG + test_extent->replay_and_verify(extent); +#endif + } + + std::tuple<match_stage_t, position_t> erase_replayable(const position_t& pos) { + assert(extent->is_mutable()); + assert(state != nextent_state_t::READ_ONLY); + if (state == nextent_state_t::MUTATION_PENDING) { + recorder->encode_erase(pos); + } +#ifndef NDEBUG + test_extent->prepare_replay(extent); + test_recorder->encode_erase(pos); +#endif + auto ret = layout_t::erase(*mut, read(), pos); +#ifndef NDEBUG + test_extent->replay_and_verify(extent); +#endif + return ret; + } + + position_t make_tail_replayable() { + assert(extent->is_mutable()); + assert(state != nextent_state_t::READ_ONLY); + if (state == nextent_state_t::MUTATION_PENDING) { + recorder->encode_make_tail(); + } +#ifndef NDEBUG + test_extent->prepare_replay(extent); + test_recorder->encode_make_tail(); +#endif + auto ret = layout_t::make_tail(*mut, read()); +#ifndef NDEBUG + test_extent->replay_and_verify(extent); +#endif + return ret; + } + + std::pair<NodeExtentMutable&, ValueDeltaRecorder*> + prepare_mutate_value_payload(context_t c) { + prepare_mutate(c); + ValueDeltaRecorder* p_value_recorder = nullptr; + if (state == nextent_state_t::MUTATION_PENDING) { + p_value_recorder = recorder->get_value_recorder(); + } + return {*mut, p_value_recorder}; + } + + void test_copy_to(NodeExtentMutable& to) const { + assert(extent->get_length() == to.get_length()); + std::memcpy(to.get_write(), extent->get_read(), get_length()); + } + + eagain_ifuture<NodeExtentMutable> rebuild(context_t c, laddr_t hint) { + LOG_PREFIX(OTree::Extent::rebuild); + assert(!is_retired()); + if (state == nextent_state_t::FRESH) { + assert(extent->is_initial_pending()); + // already fresh and no need to record + return eagain_iertr::make_ready_future<NodeExtentMutable>(*mut); + } + assert(!extent->is_initial_pending()); + auto alloc_size = get_length(); + return c.nm.alloc_extent(c.t, hint, alloc_size + ).handle_error_interruptible( + eagain_iertr::pass_further{}, + crimson::ct_error::input_output_error::handle( + [FNAME, c, alloc_size, l_to_discard = extent->get_laddr()] { + SUBERRORT(seastore_onode, + "EIO during allocate -- node_size={}, to_discard={:x}", + c.t, alloc_size, l_to_discard); + ceph_abort("fatal error"); + }) + ).si_then([this, c, FNAME] (auto fresh_extent) { + SUBDEBUGT(seastore_onode, + "update addr from {:#x} to {:#x} ...", + c.t, extent->get_laddr(), fresh_extent->get_laddr()); + assert(fresh_extent); + assert(fresh_extent->is_initial_pending()); + assert(fresh_extent->get_recorder() == nullptr); + assert(get_length() == fresh_extent->get_length()); + auto fresh_mut = fresh_extent->get_mutable(); + std::memcpy(fresh_mut.get_write(), extent->get_read(), get_length()); + NodeExtentRef to_discard = extent; + + extent = fresh_extent; + node_stage = node_stage_t(reinterpret_cast<const 
FieldType*>(extent->get_read()), + get_length()); + state = nextent_state_t::FRESH; + mut.emplace(fresh_mut); + recorder = nullptr; + + return c.nm.retire_extent(c.t, to_discard + ).handle_error_interruptible( + eagain_iertr::pass_further{}, + crimson::ct_error::input_output_error::handle( + [FNAME, c, l_to_discard = to_discard->get_laddr(), + l_fresh = fresh_extent->get_laddr()] { + SUBERRORT(seastore_onode, + "EIO during retire -- to_disgard={:x}, fresh={:x}", + c.t, l_to_discard, l_fresh); + ceph_abort("fatal error"); + }), + crimson::ct_error::enoent::handle( + [FNAME, c, l_to_discard = to_discard->get_laddr(), + l_fresh = fresh_extent->get_laddr()] { + SUBERRORT(seastore_onode, + "ENOENT during retire -- to_disgard={:x}, fresh={:x}", + c.t, l_to_discard, l_fresh); + ceph_abort("fatal error"); + }) + ); + }).si_then([this, c] { + boost::ignore_unused(c); // avoid clang warning; + assert(!c.t.is_conflicted()); + return *mut; + }); + } + + eagain_ifuture<> retire(context_t c) { + LOG_PREFIX(OTree::Extent::retire); + assert(!is_retired()); + auto addr = extent->get_laddr(); + return c.nm.retire_extent(c.t, std::move(extent) + ).handle_error_interruptible( + eagain_iertr::pass_further{}, + crimson::ct_error::input_output_error::handle( + [FNAME, c, addr] { + SUBERRORT(seastore_onode, "EIO -- addr={:x}", c.t, addr); + ceph_abort("fatal error"); + }), + crimson::ct_error::enoent::handle( + [FNAME, c, addr] { + SUBERRORT(seastore_onode, "ENOENT -- addr={:x}", c.t, addr); + ceph_abort("fatal error"); + }) +#ifndef NDEBUG + ).si_then([c] { + assert(!c.t.is_conflicted()); + } +#endif + ); + } + + private: + NodeExtentRef extent; + node_stage_t node_stage; + nextent_state_t state; + std::optional<NodeExtentMutable> mut; + // owned by extent + recorder_t* recorder; + +#ifndef NDEBUG + // verify record replay using a different memory block + TestReplayExtent::Ref test_extent; + recorder_t* test_recorder; +#endif +}; + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager.cc new file mode 100644 index 000000000..8e6f16a74 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager.cc @@ -0,0 +1,32 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#include "node_extent_manager.h" + +#include "node_extent_manager/dummy.h" +#include "node_extent_manager/seastore.h" + +namespace crimson::os::seastore::onode { + +NodeExtentManagerURef NodeExtentManager::create_dummy(bool is_sync) +{ + if (is_sync) { + return NodeExtentManagerURef(new DummyNodeExtentManager<true>()); + } else { + return NodeExtentManagerURef(new DummyNodeExtentManager<false>()); + } +} + +NodeExtentManagerURef NodeExtentManager::create_seastore( + TransactionManager &tm, laddr_t min_laddr, double p_eagain) +{ + if (p_eagain == 0.0) { + return NodeExtentManagerURef( + new SeastoreNodeExtentManager<false>(tm, min_laddr, p_eagain)); + } else { + return NodeExtentManagerURef( + new SeastoreNodeExtentManager<true>(tm, min_laddr, p_eagain)); + } +} + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager.h new file mode 100644 index 000000000..f8772929c --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager.h @@ -0,0 +1,105 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// 
vim: ts=8 sw=2 smarttab + +#pragma once + +#include "crimson/common/type_helpers.h" +#include "crimson/os/seastore/cached_extent.h" +#include "crimson/os/seastore/transaction_manager.h" + +#include "fwd.h" +#include "node_extent_mutable.h" +#include "node_types.h" +#include "stages/node_stage_layout.h" +#include "super.h" + +/** + * node_extent_manager.h + * + * Contains general interfaces for different backends (Dummy and Seastore). + */ + +namespace crimson::os::seastore::onode { + +using crimson::os::seastore::LogicalCachedExtent; +class NodeExtent : public LogicalCachedExtent { + public: + virtual ~NodeExtent() = default; + const node_header_t& get_header() const { + return *reinterpret_cast<const node_header_t*>(get_read()); + } + const char* get_read() const { + return get_bptr().c_str(); + } + NodeExtentMutable get_mutable() { + assert(is_mutable()); + return do_get_mutable(); + } + + virtual DeltaRecorder* get_recorder() const = 0; + virtual NodeExtentRef mutate(context_t, DeltaRecorderURef&&) = 0; + + protected: + template <typename... T> + NodeExtent(T&&... t) : LogicalCachedExtent(std::forward<T>(t)...) {} + + NodeExtentMutable do_get_mutable() { + return NodeExtentMutable(get_bptr().c_str(), get_length()); + } + + std::ostream& print_detail_l(std::ostream& out) const final { + return out << ", fltree_header=" << get_header(); + } + + /** + * Abstracted interfaces to implement: + * - CacheExtent::duplicate_for_write() -> CachedExtentRef + * - CacheExtent::get_type() -> extent_types_t + * - CacheExtent::get_delta() -> ceph::bufferlist + * - LogicalCachedExtent::apply_delta(const ceph::bufferlist) -> void + */ +}; + +using crimson::os::seastore::TransactionManager; +class NodeExtentManager { + using base_iertr = TransactionManager::base_iertr; + public: + virtual ~NodeExtentManager() = default; + + virtual bool is_read_isolated() const = 0; + + using read_iertr = base_iertr::extend< + crimson::ct_error::invarg, + crimson::ct_error::enoent, + crimson::ct_error::erange>; + virtual read_iertr::future<NodeExtentRef> read_extent( + Transaction&, laddr_t) = 0; + + using alloc_iertr = base_iertr; + virtual alloc_iertr::future<NodeExtentRef> alloc_extent( + Transaction&, laddr_t hint, extent_len_t) = 0; + + using retire_iertr = base_iertr::extend< + crimson::ct_error::enoent>; + virtual retire_iertr::future<> retire_extent( + Transaction&, NodeExtentRef) = 0; + + using getsuper_iertr = base_iertr; + virtual getsuper_iertr::future<Super::URef> get_super( + Transaction&, RootNodeTracker&) = 0; + + virtual std::ostream& print(std::ostream& os) const = 0; + + static NodeExtentManagerURef create_dummy(bool is_sync); + static NodeExtentManagerURef create_seastore( + TransactionManager &tm, laddr_t min_laddr = L_ADDR_MIN, double p_eagain = 0.0); +}; +inline std::ostream& operator<<(std::ostream& os, const NodeExtentManager& nm) { + return nm.print(os); +} + +} + +#if FMT_VERSION >= 90000 +template <> struct fmt::formatter<crimson::os::seastore::onode::NodeExtent> : fmt::ostream_formatter {}; +#endif diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/dummy.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/dummy.h new file mode 100644 index 000000000..24df8b548 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/dummy.h @@ -0,0 +1,196 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <chrono> +#include 
<seastar/core/sleep.hh> + +#include "include/buffer_raw.h" +#include "crimson/os/seastore/logging.h" + +#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager.h" + +/** + * dummy.h + * + * Dummy backend implementations for test purposes. + */ + +namespace crimson::os::seastore::onode { + +class DummySuper final: public Super { + public: + DummySuper(Transaction& t, RootNodeTracker& tracker, laddr_t* p_root_laddr) + : Super(t, tracker), p_root_laddr{p_root_laddr} {} + ~DummySuper() override = default; + protected: + laddr_t get_root_laddr() const override { return *p_root_laddr; } + void write_root_laddr(context_t c, laddr_t addr) override { + LOG_PREFIX(OTree::Dummy); + SUBDEBUGT(seastore_onode, "update root {:#x} ...", c.t, addr); + *p_root_laddr = addr; + } + private: + laddr_t* p_root_laddr; +}; + +class DummyNodeExtent final: public NodeExtent { + public: + DummyNodeExtent(ceph::bufferptr &&ptr) : NodeExtent(std::move(ptr)) { + state = extent_state_t::INITIAL_WRITE_PENDING; + } + DummyNodeExtent(const DummyNodeExtent& other) = delete; + ~DummyNodeExtent() override = default; + + void retire() { + assert(state == extent_state_t::INITIAL_WRITE_PENDING); + state = extent_state_t::INVALID; + bufferptr empty_bptr; + get_bptr().swap(empty_bptr); + } + + protected: + NodeExtentRef mutate(context_t, DeltaRecorderURef&&) override { + ceph_abort("impossible path"); } + DeltaRecorder* get_recorder() const override { + return nullptr; } + CachedExtentRef duplicate_for_write(Transaction&) override { + ceph_abort("impossible path"); } + extent_types_t get_type() const override { + return extent_types_t::TEST_BLOCK; } + ceph::bufferlist get_delta() override { + ceph_abort("impossible path"); } + void apply_delta(const ceph::bufferlist&) override { + ceph_abort("impossible path"); } +}; + +template <bool SYNC> +class DummyNodeExtentManager final: public NodeExtentManager { + static constexpr size_t ALIGNMENT = 4096; + public: + ~DummyNodeExtentManager() override = default; + std::size_t size() const { return allocate_map.size(); } + + protected: + bool is_read_isolated() const override { return false; } + + read_iertr::future<NodeExtentRef> read_extent( + Transaction& t, laddr_t addr) override { + SUBTRACET(seastore_onode, "reading at {:#x} ...", t, addr); + if constexpr (SYNC) { + return read_extent_sync(t, addr); + } else { + using namespace std::chrono_literals; + return seastar::sleep(1us).then([this, &t, addr] { + return read_extent_sync(t, addr); + }); + } + } + + alloc_iertr::future<NodeExtentRef> alloc_extent( + Transaction& t, laddr_t hint, extent_len_t len) override { + SUBTRACET(seastore_onode, "allocating {}B with hint {:#x} ...", t, len, hint); + if constexpr (SYNC) { + return alloc_extent_sync(t, len); + } else { + using namespace std::chrono_literals; + return seastar::sleep(1us).then([this, &t, len] { + return alloc_extent_sync(t, len); + }); + } + } + + retire_iertr::future<> retire_extent( + Transaction& t, NodeExtentRef extent) override { + SUBTRACET(seastore_onode, + "retiring {}B at {:#x} -- {} ...", + t, extent->get_length(), extent->get_laddr(), *extent); + if constexpr (SYNC) { + return retire_extent_sync(t, extent); + } else { + using namespace std::chrono_literals; + return seastar::sleep(1us).then([this, &t, extent] { + return retire_extent_sync(t, extent); + }); + } + } + + getsuper_iertr::future<Super::URef> get_super( + Transaction& t, RootNodeTracker& tracker) override { + SUBTRACET(seastore_onode, "get root ...", t); + if constexpr (SYNC) { + 
return get_super_sync(t, tracker); + } else { + using namespace std::chrono_literals; + return seastar::sleep(1us).then([this, &t, &tracker] { + return get_super_sync(t, tracker); + }); + } + } + + std::ostream& print(std::ostream& os) const override { + return os << "DummyNodeExtentManager(sync=" << SYNC << ")"; + } + + private: + read_iertr::future<NodeExtentRef> read_extent_sync( + Transaction& t, laddr_t addr) { + auto iter = allocate_map.find(addr); + assert(iter != allocate_map.end()); + auto extent = iter->second; + SUBTRACET(seastore_onode, + "read {}B at {:#x} -- {}", + t, extent->get_length(), extent->get_laddr(), *extent); + assert(extent->get_laddr() == addr); + return read_iertr::make_ready_future<NodeExtentRef>(extent); + } + + alloc_iertr::future<NodeExtentRef> alloc_extent_sync( + Transaction& t, extent_len_t len) { + assert(len % ALIGNMENT == 0); + auto r = ceph::buffer::create_aligned(len, ALIGNMENT); + auto addr = reinterpret_cast<laddr_t>(r->get_data()); + auto bp = ceph::bufferptr(std::move(r)); + auto extent = Ref<DummyNodeExtent>(new DummyNodeExtent(std::move(bp))); + extent->set_laddr(addr); + assert(allocate_map.find(extent->get_laddr()) == allocate_map.end()); + allocate_map.insert({extent->get_laddr(), extent}); + SUBDEBUGT(seastore_onode, + "allocated {}B at {:#x} -- {}", + t, extent->get_length(), extent->get_laddr(), *extent); + assert(extent->get_length() == len); + return alloc_iertr::make_ready_future<NodeExtentRef>(extent); + } + + retire_iertr::future<> retire_extent_sync( + Transaction& t, NodeExtentRef _extent) { + auto& extent = static_cast<DummyNodeExtent&>(*_extent.get()); + auto addr = extent.get_laddr(); + auto len = extent.get_length(); + extent.retire(); + auto iter = allocate_map.find(addr); + assert(iter != allocate_map.end()); + allocate_map.erase(iter); + SUBDEBUGT(seastore_onode, "retired {}B at {:#x}", t, len, addr); + return retire_iertr::now(); + } + + getsuper_iertr::future<Super::URef> get_super_sync( + Transaction& t, RootNodeTracker& tracker) { + SUBTRACET(seastore_onode, "got root {:#x}", t, root_laddr); + return getsuper_iertr::make_ready_future<Super::URef>( + Super::URef(new DummySuper(t, tracker, &root_laddr))); + } + + static LOG_PREFIX(OTree::Dummy); + + std::map<laddr_t, Ref<DummyNodeExtent>> allocate_map; + laddr_t root_laddr = L_ADDR_NULL; +}; + +} + +#if FMT_VERSION >= 90000 +template <> struct fmt::formatter<crimson::os::seastore::onode::DummyNodeExtent> : fmt::ostream_formatter {}; +#endif diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/seastore.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/seastore.cc new file mode 100644 index 000000000..3b52c5dc0 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/seastore.cc @@ -0,0 +1,90 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "seastore.h" + +#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_accessor.h" +#include "crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage_layout.h" + +namespace { +LOG_PREFIX(OTree::Seastore); +} + +SET_SUBSYS(seastore_onode); + +namespace crimson::os::seastore::onode { + +static DeltaRecorderURef create_replay_recorder( + node_type_t node_type, field_type_t field_type) +{ + if (node_type == node_type_t::LEAF) { + if (field_type == field_type_t::N0) { + return DeltaRecorderT<node_fields_0_t, node_type_t::LEAF>::create_for_replay(); + } else if 
(field_type == field_type_t::N1) { + return DeltaRecorderT<node_fields_1_t, node_type_t::LEAF>::create_for_replay(); + } else if (field_type == field_type_t::N2) { + return DeltaRecorderT<node_fields_2_t, node_type_t::LEAF>::create_for_replay(); + } else if (field_type == field_type_t::N3) { + return DeltaRecorderT<leaf_fields_3_t, node_type_t::LEAF>::create_for_replay(); + } else { + ceph_abort("impossible path"); + } + } else if (node_type == node_type_t::INTERNAL) { + if (field_type == field_type_t::N0) { + return DeltaRecorderT<node_fields_0_t, node_type_t::INTERNAL>::create_for_replay(); + } else if (field_type == field_type_t::N1) { + return DeltaRecorderT<node_fields_1_t, node_type_t::INTERNAL>::create_for_replay(); + } else if (field_type == field_type_t::N2) { + return DeltaRecorderT<node_fields_2_t, node_type_t::INTERNAL>::create_for_replay(); + } else if (field_type == field_type_t::N3) { + return DeltaRecorderT<internal_fields_3_t, node_type_t::INTERNAL>::create_for_replay(); + } else { + ceph_abort("impossible path"); + } + } else { + ceph_abort("impossible path"); + } +} + +NodeExtentRef SeastoreNodeExtent::mutate( + context_t c, DeltaRecorderURef&& _recorder) +{ + DEBUGT("mutate {} ...", c.t, *this); + auto p_handle = static_cast<TransactionManagerHandle*>(&c.nm); + auto extent = p_handle->tm.get_mutable_extent(c.t, this); + auto ret = extent->cast<SeastoreNodeExtent>(); + // A replayed extent may already have an empty recorder, we discard it for + // simplicity. + assert(!ret->recorder || ret->recorder->is_empty()); + ret->recorder = std::move(_recorder); + return ret; +} + +void SeastoreNodeExtent::apply_delta(const ceph::bufferlist& bl) +{ + DEBUG("replay {} ...", *this); + if (!recorder) { + auto header = get_header(); + auto field_type = header.get_field_type(); + if (!field_type.has_value()) { + ERROR("replay got invalid node -- {}", *this); + ceph_abort("fatal error"); + } + auto node_type = header.get_node_type(); + recorder = create_replay_recorder(node_type, *field_type); + } else { +#ifndef NDEBUG + auto header = get_header(); + assert(recorder->node_type() == header.get_node_type()); + assert(recorder->field_type() == *header.get_field_type()); +#endif + } + auto mut = do_get_mutable(); + auto p = bl.cbegin(); + while (p != bl.end()) { + recorder->apply_delta(p, mut, *this); + } + DEBUG("relay done!"); +} + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/seastore.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/seastore.h new file mode 100644 index 000000000..f7cfa8c21 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/seastore.h @@ -0,0 +1,223 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <random> + +#include "crimson/os/seastore/logging.h" + +#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager.h" +#include "crimson/os/seastore/onode_manager/staged-fltree/node_delta_recorder.h" + +/** + * seastore.h + * + * Seastore backend implementations. 
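+ * + * Illustrative construction (a sketch; see NodeExtentManager::create_seastore() + * in node_extent_manager.h): + * + *   auto nm = NodeExtentManager::create_seastore(tm);  // no eagain injection + *   auto test_nm = NodeExtentManager::create_seastore(tm, L_ADDR_MIN, 0.1);  // injects eagain + * + * A p_eagain of 0.0 selects SeastoreNodeExtentManager<false>; a non-zero + * probability selects the INJECT_EAGAIN variant used by tests.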
+ */ + +namespace crimson::os::seastore::onode { + +class SeastoreSuper final: public Super { + public: + SeastoreSuper(Transaction& t, RootNodeTracker& tracker, + laddr_t root_addr, TransactionManager& tm) + : Super(t, tracker), root_addr{root_addr}, tm{tm} {} + ~SeastoreSuper() override = default; + protected: + laddr_t get_root_laddr() const override { + return root_addr; + } + void write_root_laddr(context_t c, laddr_t addr) override { + LOG_PREFIX(OTree::Seastore); + SUBDEBUGT(seastore_onode, "update root {:#x} ...", c.t, addr); + root_addr = addr; + tm.write_onode_root(c.t, addr); + } + private: + laddr_t root_addr; + TransactionManager &tm; +}; + +class SeastoreNodeExtent final: public NodeExtent { + public: + SeastoreNodeExtent(ceph::bufferptr &&ptr) + : NodeExtent(std::move(ptr)) {} + SeastoreNodeExtent(const SeastoreNodeExtent& other) + : NodeExtent(other) {} + ~SeastoreNodeExtent() override = default; + + constexpr static extent_types_t TYPE = extent_types_t::ONODE_BLOCK_STAGED; + extent_types_t get_type() const override { + return TYPE; + } + + protected: + NodeExtentRef mutate(context_t, DeltaRecorderURef&&) override; + + DeltaRecorder* get_recorder() const override { + return recorder.get(); + } + + CachedExtentRef duplicate_for_write(Transaction&) override { + return CachedExtentRef(new SeastoreNodeExtent(*this)); + } + ceph::bufferlist get_delta() override { + assert(recorder); + return recorder->get_delta(); + } + void apply_delta(const ceph::bufferlist&) override; + + private: + DeltaRecorderURef recorder; +}; + +class TransactionManagerHandle : public NodeExtentManager { + public: + TransactionManagerHandle(TransactionManager &tm) : tm{tm} {} + TransactionManager &tm; +}; + +template <bool INJECT_EAGAIN=false> +class SeastoreNodeExtentManager final: public TransactionManagerHandle { + public: + SeastoreNodeExtentManager( + TransactionManager &tm, laddr_t min, double p_eagain) + : TransactionManagerHandle(tm), addr_min{min}, p_eagain{p_eagain} { + if constexpr (INJECT_EAGAIN) { + assert(p_eagain > 0.0 && p_eagain < 1.0); + } else { + assert(p_eagain == 0.0); + } + } + + ~SeastoreNodeExtentManager() override = default; + + void set_generate_eagain(bool enable) { + generate_eagain = enable; + } + + protected: + bool is_read_isolated() const override { return true; } + + read_iertr::future<NodeExtentRef> read_extent( + Transaction& t, laddr_t addr) override { + SUBTRACET(seastore_onode, "reading at {:#x} ...", t, addr); + if constexpr (INJECT_EAGAIN) { + if (trigger_eagain()) { + SUBDEBUGT(seastore_onode, "reading at {:#x}: trigger eagain", t, addr); + t.test_set_conflict(); + return read_iertr::make_ready_future<NodeExtentRef>(); + } + } + return tm.read_extent<SeastoreNodeExtent>(t, addr + ).si_then([addr, &t](auto&& e) -> read_iertr::future<NodeExtentRef> { + SUBTRACET(seastore_onode, + "read {}B at {:#x} -- {}", + t, e->get_length(), e->get_laddr(), *e); + assert(e->get_laddr() == addr); + std::ignore = addr; + return read_iertr::make_ready_future<NodeExtentRef>(e); + }); + } + + alloc_iertr::future<NodeExtentRef> alloc_extent( + Transaction& t, laddr_t hint, extent_len_t len) override { + SUBTRACET(seastore_onode, "allocating {}B with hint {:#x} ...", t, len, hint); + if constexpr (INJECT_EAGAIN) { + if (trigger_eagain()) { + SUBDEBUGT(seastore_onode, "allocating {}B: trigger eagain", t, len); + t.test_set_conflict(); + return alloc_iertr::make_ready_future<NodeExtentRef>(); + } + } + return tm.alloc_extent<SeastoreNodeExtent>(t, hint, len + ).si_then([len, &t](auto 
extent) { + SUBDEBUGT(seastore_onode, + "allocated {}B at {:#x} -- {}", + t, extent->get_length(), extent->get_laddr(), *extent); + if (!extent->is_initial_pending()) { + SUBERRORT(seastore_onode, + "allocated {}B but got invalid extent: {}", + t, len, *extent); + ceph_abort("fatal error"); + } + assert(extent->get_length() == len); + std::ignore = len; + return NodeExtentRef(extent); + }); + } + + retire_iertr::future<> retire_extent( + Transaction& t, NodeExtentRef _extent) override { + LogicalCachedExtentRef extent = _extent; + auto addr = extent->get_laddr(); + auto len = extent->get_length(); + SUBDEBUGT(seastore_onode, + "retiring {}B at {:#x} -- {} ...", + t, len, addr, *extent); + if constexpr (INJECT_EAGAIN) { + if (trigger_eagain()) { + SUBDEBUGT(seastore_onode, + "retiring {}B at {:#x} -- {} : trigger eagain", + t, len, addr, *extent); + t.test_set_conflict(); + return retire_iertr::now(); + } + } + return tm.dec_ref(t, extent).si_then([addr, len, &t] (unsigned cnt) { + assert(cnt == 0); + SUBTRACET(seastore_onode, "retired {}B at {:#x} ...", t, len, addr); + }); + } + + getsuper_iertr::future<Super::URef> get_super( + Transaction& t, RootNodeTracker& tracker) override { + SUBTRACET(seastore_onode, "get root ...", t); + if constexpr (INJECT_EAGAIN) { + if (trigger_eagain()) { + SUBDEBUGT(seastore_onode, "get root: trigger eagain", t); + t.test_set_conflict(); + return getsuper_iertr::make_ready_future<Super::URef>(); + } + } + return tm.read_onode_root(t).si_then([this, &t, &tracker](auto root_addr) { + SUBTRACET(seastore_onode, "got root {:#x}", t, root_addr); + return Super::URef(new SeastoreSuper(t, tracker, root_addr, tm)); + }); + } + + std::ostream& print(std::ostream& os) const override { + os << "SeastoreNodeExtentManager"; + if constexpr (INJECT_EAGAIN) { + os << "(p_eagain=" << p_eagain << ")"; + } + return os; + } + + private: + static LOG_PREFIX(OTree::Seastore); + + const laddr_t addr_min; + + // XXX: conditional members by INJECT_EAGAIN + bool trigger_eagain() { + if (generate_eagain) { + double dice = rd(); + assert(rd.min() == 0); + dice /= rd.max(); + return dice <= p_eagain; + } else { + return false; + } + } + bool generate_eagain = true; + std::random_device rd; + double p_eagain; +}; + +} + +#if FMT_VERSION >= 90000 +template <> struct fmt::formatter<crimson::os::seastore::onode::SeastoreNodeExtent> : fmt::ostream_formatter {}; +#endif diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/test_replay.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/test_replay.h new file mode 100644 index 000000000..bce74e381 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/test_replay.h @@ -0,0 +1,67 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "crimson/os/seastore/onode_manager/staged-fltree/node_delta_recorder.h" +#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager.h" + +/** test_replay.h + * + * A special version of NodeExtent to help verify delta encode, decode and + * replay in recorder_t under debug build. 
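+ * + * Intended usage, as exercised by NodeExtentAccessorT (a sketch): + * + *   test_extent->prepare_replay(extent);     // snapshot the pre-mutation image + *   // ... mutate extent and encode the same delta into the test recorder ... + *   test_extent->replay_and_verify(extent);  // replay the delta and byte-compare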
+ */ + +namespace crimson::os::seastore::onode { + +class TestReplayExtent final: public NodeExtent { + public: + using Ref = crimson::os::seastore::TCachedExtentRef<TestReplayExtent>; + + void prepare_replay(NodeExtentRef from_extent) { + assert(get_length() == from_extent->get_length()); + auto mut = do_get_mutable(); + std::memcpy(mut.get_write(), from_extent->get_read(), get_length()); + } + + void replay_and_verify(NodeExtentRef replayed_extent) { + assert(get_length() == replayed_extent->get_length()); + auto mut = do_get_mutable(); + auto bl = recorder->get_delta(); + assert(bl.length()); + auto p = bl.cbegin(); + recorder->apply_delta(p, mut, *this); + assert(p == bl.end()); + auto cmp = std::memcmp(get_read(), replayed_extent->get_read(), get_length()); + ceph_assert(cmp == 0 && "replay mismatch!"); + } + + static Ref create(extent_len_t length, DeltaRecorderURef&& recorder) { + auto r = ceph::buffer::create_aligned(length, 4096); + auto bp = ceph::bufferptr(std::move(r)); + return new TestReplayExtent(std::move(bp), std::move(recorder)); + } + + protected: + NodeExtentRef mutate(context_t, DeltaRecorderURef&&) override { + ceph_abort("impossible path"); } + DeltaRecorder* get_recorder() const override { + ceph_abort("impossible path"); } + CachedExtentRef duplicate_for_write(Transaction&) override { + ceph_abort("impossible path"); } + extent_types_t get_type() const override { + return extent_types_t::TEST_BLOCK; } + ceph::bufferlist get_delta() override { + ceph_abort("impossible path"); } + void apply_delta(const ceph::bufferlist&) override { + ceph_abort("impossible path"); } + + private: + TestReplayExtent(ceph::bufferptr&& ptr, DeltaRecorderURef&& recorder) + : NodeExtent(std::move(ptr)), recorder(std::move(recorder)) { + state = extent_state_t::MUTATION_PENDING; + } + DeltaRecorderURef recorder; +}; + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.h new file mode 100644 index 000000000..6f92ca9ed --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.h @@ -0,0 +1,113 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#include <cstring> + +#include "fwd.h" + +#pragma once + +namespace crimson::os::seastore::onode { + +/** + * NodeExtentMutable + * + * A thin wrapper of NodeExtent to make sure that only the newly allocated + * or the duplicated NodeExtent is mutable, and the memory modifications are + * safe within the extent range. 
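+ * + * Illustrative usage (a sketch; the offsets and the node_header variable + * below are made up for the example): + * + *   auto mut = node_extent->get_mutable();   // only valid for a mutable extent + *   mut.copy_in_relative(0u, node_header);   // write a header at offset 0 + *   mut.set_relative(64u, 0, 16u);           // zero 16 bytes at offset 64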
+ */ +class NodeExtentMutable { + public: + void copy_in_absolute(void* dst, const void* src, extent_len_t len) { + assert(is_safe(dst, len)); + std::memcpy(dst, src, len); + } + template <typename T> + void copy_in_absolute(void* dst, const T& src) { + copy_in_absolute(dst, &src, sizeof(T)); + } + + const void* copy_in_relative( + extent_len_t dst_offset, const void* src, extent_len_t len) { + auto dst = get_write() + dst_offset; + copy_in_absolute(dst, src, len); + return dst; + } + template <typename T> + const T* copy_in_relative( + extent_len_t dst_offset, const T& src) { + auto dst = copy_in_relative(dst_offset, &src, sizeof(T)); + return static_cast<const T*>(dst); + } + + void shift_absolute(const void* src, extent_len_t len, int offset) { + assert(is_safe(src, len)); + char* to = (char*)src + offset; + assert(is_safe(to, len)); + if (len != 0) { + std::memmove(to, src, len); + } + } + void shift_relative(extent_len_t src_offset, extent_len_t len, int offset) { + shift_absolute(get_write() + src_offset, len, offset); + } + + void set_absolute(void* dst, int value, extent_len_t len) { + assert(is_safe(dst, len)); + std::memset(dst, value, len); + } + void set_relative(extent_len_t dst_offset, int value, extent_len_t len) { + auto dst = get_write() + dst_offset; + set_absolute(dst, value, len); + } + + template <typename T> + void validate_inplace_update(const T& updated) { + assert(is_safe(&updated, sizeof(T))); + } + + const char* get_read() const { return p_start; } + char* get_write() { return p_start; } + extent_len_t get_length() const { +#ifndef NDEBUG + if (node_offset == 0) { + assert(is_valid_node_size(length)); + } +#endif + return length; + } + node_offset_t get_node_offset() const { return node_offset; } + + NodeExtentMutable get_mutable_absolute(const void* dst, node_offset_t len) const { + assert(node_offset == 0); + assert(is_safe(dst, len)); + assert((const char*)dst != get_read()); + auto ret = *this; + node_offset_t offset = (const char*)dst - get_read(); + assert(offset != 0); + ret.p_start += offset; + ret.length = len; + ret.node_offset = offset; + return ret; + } + NodeExtentMutable get_mutable_relative( + node_offset_t offset, node_offset_t len) const { + return get_mutable_absolute(get_read() + offset, len); + } + + private: + NodeExtentMutable(char* p_start, extent_len_t length) + : p_start{p_start}, length{length} {} + bool is_safe(const void* src, extent_len_t len) const { + return ((const char*)src >= p_start) && + ((const char*)src + len <= p_start + length); + } + + char* p_start; + extent_len_t length; + node_offset_t node_offset = 0; + + friend class NodeExtent; +}; + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_impl.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/node_impl.cc new file mode 100644 index 000000000..5db0f83dd --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_impl.cc @@ -0,0 +1,80 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#include "node_impl.h" +#include "node_layout.h" + +namespace crimson::os::seastore::onode { + +#ifdef UNIT_TESTS_BUILT +last_split_info_t last_split = {}; +#endif + +// XXX: branchless allocation +eagain_ifuture<InternalNodeImpl::fresh_impl_t> +InternalNodeImpl::allocate( + context_t c, laddr_t hint, field_type_t type, bool is_level_tail, level_t level) +{ + if (type == field_type_t::N0) { + return InternalNode0::allocate(c, hint, is_level_tail, level); + } else if (type == field_type_t::N1) 
{ + return InternalNode1::allocate(c, hint, is_level_tail, level); + } else if (type == field_type_t::N2) { + return InternalNode2::allocate(c, hint, is_level_tail, level); + } else if (type == field_type_t::N3) { + return InternalNode3::allocate(c, hint, is_level_tail, level); + } else { + ceph_abort("impossible path"); + } +} + +eagain_ifuture<LeafNodeImpl::fresh_impl_t> +LeafNodeImpl::allocate( + context_t c, laddr_t hint, field_type_t type, bool is_level_tail) +{ + if (type == field_type_t::N0) { + return LeafNode0::allocate(c, hint, is_level_tail, 0); + } else if (type == field_type_t::N1) { + return LeafNode1::allocate(c, hint, is_level_tail, 0); + } else if (type == field_type_t::N2) { + return LeafNode2::allocate(c, hint, is_level_tail, 0); + } else if (type == field_type_t::N3) { + return LeafNode3::allocate(c, hint, is_level_tail, 0); + } else { + ceph_abort("impossible path"); + } +} + +InternalNodeImplURef InternalNodeImpl::load( + NodeExtentRef extent, field_type_t type) +{ + if (type == field_type_t::N0) { + return InternalNode0::load(extent); + } else if (type == field_type_t::N1) { + return InternalNode1::load(extent); + } else if (type == field_type_t::N2) { + return InternalNode2::load(extent); + } else if (type == field_type_t::N3) { + return InternalNode3::load(extent); + } else { + ceph_abort("impossible path"); + } +} + +LeafNodeImplURef LeafNodeImpl::load( + NodeExtentRef extent, field_type_t type) +{ + if (type == field_type_t::N0) { + return LeafNode0::load(extent); + } else if (type == field_type_t::N1) { + return LeafNode1::load(extent); + } else if (type == field_type_t::N2) { + return LeafNode2::load(extent); + } else if (type == field_type_t::N3) { + return LeafNode3::load(extent); + } else { + ceph_abort("impossible path"); + } +} + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_impl.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_impl.h new file mode 100644 index 000000000..cf452618b --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_impl.h @@ -0,0 +1,270 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <ostream> + +#include "node_extent_mutable.h" +#include "node_types.h" +#include "stages/stage_types.h" + +namespace crimson::os::seastore::onode { + +#ifdef UNIT_TESTS_BUILT +enum class InsertType { BEGIN, LAST, MID }; +struct split_expectation_t { + match_stage_t split_stage; + match_stage_t insert_stage; + bool is_insert_left; + InsertType insert_type; +}; +struct last_split_info_t { + search_position_t split_pos; + match_stage_t insert_stage; + bool is_insert_left; + InsertType insert_type; + bool match(const split_expectation_t& e) const { + match_stage_t split_stage; + if (split_pos.nxt.nxt.index == 0) { + if (split_pos.nxt.index == 0) { + split_stage = 2; + } else { + split_stage = 1; + } + } else { + split_stage = 0; + } + return split_stage == e.split_stage && + insert_stage == e.insert_stage && + is_insert_left == e.is_insert_left && + insert_type == e.insert_type; + } + bool match_split_pos(const search_position_t& pos) const { + return split_pos == pos; + } +}; +extern last_split_info_t last_split; +#endif + +struct key_hobj_t; +struct key_view_t; +class NodeExtentMutable; + +/** + * NodeImpl + * + * Hides type specific node layout implementations for Node. 
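+ *
+ * A caller holds the node through this type-erased interface (or the
+ * InternalNodeImpl/LeafNodeImpl refinements below) and never names the
+ * concrete NodeLayoutT instantiation. A rough, hypothetical usage sketch:
+ *
+ *   NodeImplURef impl = ...;           // from the subclass load()/allocate()
+ *   if (impl->is_size_underflow()) {
+ *     // candidate for merging, see evaluate_merge()/merge()
+ *   }
+ *   impl->dump(std::cout);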
+ */ +class NodeImpl { + public: + virtual ~NodeImpl() = default; + + virtual node_type_t node_type() const = 0; + virtual field_type_t field_type() const = 0; + virtual laddr_t laddr() const = 0; + virtual const char* read() const = 0; + virtual extent_len_t get_node_size() const = 0; + virtual nextent_state_t get_extent_state() const = 0; + virtual void prepare_mutate(context_t) = 0; + virtual bool is_level_tail() const = 0; + + /* Invariants for num_keys and num_values: + * - for leaf node and non-tail internal node, num_keys == num_values; + * - for tail internal node, num_keys + 1 == num_values; + * - all node must have at least 1 value, except the root leaf node; + * - the root internal node must have more than 1 values; + */ + virtual void validate_non_empty() const = 0; + virtual bool is_keys_empty() const = 0; + // under the assumption that node is not empty + virtual bool has_single_value() const = 0; + + virtual level_t level() const = 0; + virtual node_offset_t free_size() const = 0; + virtual extent_len_t total_size() const = 0; + virtual bool is_extent_retired() const = 0; + virtual std::optional<key_view_t> get_pivot_index() const = 0; + virtual bool is_size_underflow() const = 0; + + virtual std::tuple<match_stage_t, search_position_t> erase(const search_position_t&) = 0; + virtual std::tuple<match_stage_t, std::size_t> evaluate_merge(NodeImpl&) = 0; + virtual search_position_t merge(NodeExtentMutable&, NodeImpl&, match_stage_t, extent_len_t) = 0; + virtual eagain_ifuture<NodeExtentMutable> rebuild_extent(context_t) = 0; + virtual eagain_ifuture<> retire_extent(context_t) = 0; + virtual search_position_t make_tail() = 0; + + virtual node_stats_t get_stats() const = 0; + virtual std::ostream& dump(std::ostream&) const = 0; + virtual std::ostream& dump_brief(std::ostream&) const = 0; + virtual const std::string& get_name() const = 0; + virtual void validate_layout() const = 0; + + virtual void test_copy_to(NodeExtentMutable&) const = 0; + virtual void test_set_tail(NodeExtentMutable&) = 0; + + protected: + NodeImpl() = default; +}; + +/** + * InternalNodeImpl + * + * Hides type specific node layout implementations for InternalNode. 
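+ *
+ * For an internal node the per-slot value is a child address
+ * (laddr_packed_t), and the concrete implementations behind this interface
+ * are the InternalNode0..InternalNode3 aliases of NodeLayoutT in
+ * node_layout.h, one per field_type_t.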
+ */ +class InternalNodeImpl : public NodeImpl { + public: + struct internal_marker_t {}; + virtual ~InternalNodeImpl() = default; + + #pragma GCC diagnostic ignored "-Woverloaded-virtual" + virtual void get_slot(const search_position_t&, // IN + key_view_t* = nullptr, // OUT + const laddr_packed_t** = nullptr) const { // OUT + ceph_abort("impossible path"); + } + + #pragma GCC diagnostic ignored "-Woverloaded-virtual" + virtual void get_prev_slot(search_position_t&, // IN&OUT + key_view_t* = nullptr, // OUT + const laddr_packed_t** = nullptr) const { // OUT + ceph_abort("impossible path"); + } + + #pragma GCC diagnostic ignored "-Woverloaded-virtual" + virtual void get_next_slot(search_position_t&, // IN&OUT + key_view_t* = nullptr, // OUT + const laddr_packed_t** = nullptr) const { // OUT + ceph_abort("impossible path"); + } + + #pragma GCC diagnostic ignored "-Woverloaded-virtual" + virtual void get_largest_slot(search_position_t* = nullptr, // OUT + key_view_t* = nullptr, // OUT + const laddr_packed_t** = nullptr) const { // OUT + ceph_abort("impossible path"); + } + + #pragma GCC diagnostic ignored "-Woverloaded-virtual" + virtual lookup_result_t<node_type_t::INTERNAL> lower_bound( + const key_hobj_t&, MatchHistory&, + key_view_t* = nullptr, internal_marker_t = {}) const { + ceph_abort("impossible path"); + } + + #pragma GCC diagnostic ignored "-Woverloaded-virtual" + virtual const laddr_packed_t* insert( + const key_view_t&, const laddr_t&, search_position_t&, match_stage_t&, node_offset_t&) { + ceph_abort("impossible path"); + } + + #pragma GCC diagnostic ignored "-Woverloaded-virtual" + virtual std::tuple<search_position_t, bool, const laddr_packed_t*> split_insert( + NodeExtentMutable&, NodeImpl&, const key_view_t&, const laddr_t&, + search_position_t&, match_stage_t&, node_offset_t&) { + ceph_abort("impossible path"); + } + + virtual const laddr_packed_t* get_tail_value() const = 0; + + virtual void replace_child_addr(const search_position_t&, laddr_t dst, laddr_t src) = 0; + + virtual std::tuple<match_stage_t, node_offset_t> evaluate_insert( + const key_view_t&, const laddr_t&, search_position_t&) const = 0; + + struct fresh_impl_t { + InternalNodeImplURef impl; + NodeExtentMutable mut; + std::pair<NodeImplURef, NodeExtentMutable> make_pair() { + return {std::move(impl), mut}; + } + }; + static eagain_ifuture<fresh_impl_t> allocate(context_t, laddr_t, field_type_t, bool, level_t); + + static InternalNodeImplURef load(NodeExtentRef, field_type_t); + + protected: + InternalNodeImpl() = default; +}; + +/** + * LeafNodeImpl + * + * Hides type specific node layout implementations for LeafNode. 
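+ *
+ * For a leaf node the per-slot value is a value payload addressed by
+ * value_header_t and created from a value_config_t. The concrete
+ * implementations are the LeafNode0..LeafNode3 aliases of NodeLayoutT in
+ * node_layout.h.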
+ */ +class LeafNodeImpl : public NodeImpl { + public: + struct leaf_marker_t {}; + virtual ~LeafNodeImpl() = default; + + #pragma GCC diagnostic ignored "-Woverloaded-virtual" + virtual void get_slot(const search_position_t&, // IN + key_view_t* = nullptr, // OUT + const value_header_t** = nullptr) const { // OUT + ceph_abort("impossible path"); + } + + #pragma GCC diagnostic ignored "-Woverloaded-virtual" + virtual void get_prev_slot(search_position_t&, // IN&OUT + key_view_t* = nullptr, // OUT + const value_header_t** = nullptr) const { // OUT + ceph_abort("impossible path"); + } + + #pragma GCC diagnostic ignored "-Woverloaded-virtual" + virtual void get_next_slot(search_position_t&, // IN&OUT + key_view_t* = nullptr, // OUT + const value_header_t** = nullptr) const { // OUT + ceph_abort("impossible path"); + } + + #pragma GCC diagnostic ignored "-Woverloaded-virtual" + virtual void get_largest_slot(search_position_t* = nullptr, // OUT + key_view_t* = nullptr, // OUT + const value_header_t** = nullptr) const { // OUT + ceph_abort("impossible path"); + } + + #pragma GCC diagnostic ignored "-Woverloaded-virtual" + virtual lookup_result_t<node_type_t::LEAF> lower_bound( + const key_hobj_t&, MatchHistory&, + key_view_t* = nullptr, leaf_marker_t = {}) const { + ceph_abort("impossible path"); + } + + #pragma GCC diagnostic ignored "-Woverloaded-virtual" + virtual const value_header_t* insert( + const key_hobj_t&, const value_config_t&, search_position_t&, match_stage_t&, node_offset_t&) { + ceph_abort("impossible path"); + } + + #pragma GCC diagnostic ignored "-Woverloaded-virtual" + virtual std::tuple<search_position_t, bool, const value_header_t*> split_insert( + NodeExtentMutable&, NodeImpl&, const key_hobj_t&, const value_config_t&, + search_position_t&, match_stage_t&, node_offset_t&) { + ceph_abort("impossible path"); + } + + virtual std::tuple<match_stage_t, node_offset_t> evaluate_insert( + const key_hobj_t&, const value_config_t&, + const MatchHistory&, match_stat_t, search_position_t&) const = 0; + + virtual std::pair<NodeExtentMutable&, ValueDeltaRecorder*> + prepare_mutate_value_payload(context_t) = 0; + + struct fresh_impl_t { + LeafNodeImplURef impl; + NodeExtentMutable mut; + std::pair<NodeImplURef, NodeExtentMutable> make_pair() { + return {std::move(impl), mut}; + } + }; + static eagain_ifuture<fresh_impl_t> allocate(context_t, laddr_t, field_type_t, bool); + + static LeafNodeImplURef load(NodeExtentRef, field_type_t); + + protected: + LeafNodeImpl() = default; +}; + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_layout.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_layout.h new file mode 100644 index 000000000..783a0c6cc --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_layout.h @@ -0,0 +1,948 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <ostream> +#include <sstream> + +#include "common/likely.h" +#include "crimson/os/seastore/logging.h" + +#include "node_extent_accessor.h" +#include "node_impl.h" +#include "stages/node_stage_layout.h" + +namespace crimson::os::seastore::onode { + +template <node_type_t NODE_TYPE> struct insert_key_type; +template <> struct insert_key_type<node_type_t::INTERNAL> { + static constexpr auto type = KeyT::VIEW; }; +template <> struct insert_key_type<node_type_t::LEAF> { + static constexpr auto type = KeyT::HOBJ; }; + +template <node_type_t NODE_TYPE> struct node_impl_type; +template <> struct 
node_impl_type<node_type_t::INTERNAL> { + using type = InternalNodeImpl; }; +template <> struct node_impl_type<node_type_t::LEAF> { + using type = LeafNodeImpl; }; + +template <node_type_t NODE_TYPE> struct node_marker_type; +template <> struct node_marker_type<node_type_t::INTERNAL> { + using type = InternalNodeImpl::internal_marker_t; }; +template <> struct node_marker_type<node_type_t::LEAF> { + using type = LeafNodeImpl::leaf_marker_t; }; + +/** + * NodeLayoutT + * + * Contains templated and concrete implementations for both InternalNodeImpl + * and LeafNodeImpl under a specific node layout. + */ +template <typename FieldType, node_type_t NODE_TYPE> +class NodeLayoutT final : public InternalNodeImpl, public LeafNodeImpl { + public: + using URef = std::unique_ptr<NodeLayoutT>; + using extent_t = NodeExtentAccessorT<FieldType, NODE_TYPE>; + using parent_t = typename node_impl_type<NODE_TYPE>::type; + using marker_t = typename node_marker_type<NODE_TYPE>::type; + using node_stage_t = typename extent_t::node_stage_t; + using stage_t = node_to_stage_t<node_stage_t>; + using position_t = typename extent_t::position_t; + using value_input_t = typename extent_t::value_input_t; + using value_t = typename extent_t::value_t; + static constexpr auto FIELD_TYPE = extent_t::FIELD_TYPE; + static constexpr auto KEY_TYPE = insert_key_type<NODE_TYPE>::type; + static constexpr auto STAGE = stage_t::STAGE; + + NodeLayoutT(const NodeLayoutT&) = delete; + NodeLayoutT(NodeLayoutT&&) = delete; + NodeLayoutT& operator=(const NodeLayoutT&) = delete; + NodeLayoutT& operator=(NodeLayoutT&&) = delete; + ~NodeLayoutT() override = default; + + static URef load(NodeExtentRef extent) { + std::unique_ptr<NodeLayoutT> ret(new NodeLayoutT(extent)); + return ret; + } + + static eagain_ifuture<typename parent_t::fresh_impl_t> allocate( + context_t c, laddr_t hint, bool is_level_tail, level_t level) { + LOG_PREFIX(OTree::Layout::allocate); + extent_len_t extent_size; + if constexpr (NODE_TYPE == node_type_t::LEAF) { + extent_size = c.vb.get_leaf_node_size(); + } else { + extent_size = c.vb.get_internal_node_size(); + } + return c.nm.alloc_extent(c.t, hint, extent_size + ).handle_error_interruptible( + eagain_iertr::pass_further{}, + crimson::ct_error::input_output_error::handle( + [FNAME, c, extent_size, is_level_tail, level] { + SUBERRORT(seastore_onode, + "EIO -- extent_size={}, is_level_tail={}, level={}", + c.t, extent_size, is_level_tail, level); + ceph_abort("fatal error"); + }) + ).si_then([is_level_tail, level](auto extent) { + assert(extent); + assert(extent->is_initial_pending()); + auto mut = extent->get_mutable(); + node_stage_t::bootstrap_extent( + mut, FIELD_TYPE, NODE_TYPE, is_level_tail, level); + return typename parent_t::fresh_impl_t{ + std::unique_ptr<parent_t>(new NodeLayoutT(extent)), mut}; + }); + } + + protected: + /* + * NodeImpl + */ + node_type_t node_type() const override { return NODE_TYPE; } + field_type_t field_type() const override { return FIELD_TYPE; } + laddr_t laddr() const override { return extent.get_laddr(); } + const char* read() const override { return extent.read().p_start(); } + extent_len_t get_node_size() const override { return extent.get_length(); } + nextent_state_t get_extent_state() const override { return extent.get_state(); } + void prepare_mutate(context_t c) override { return extent.prepare_mutate(c); } + bool is_level_tail() const override { return extent.read().is_level_tail(); } + + void validate_non_empty() const override { + if constexpr (NODE_TYPE == 
node_type_t::INTERNAL) { + if (is_level_tail()) { + return; + } + } + assert(!is_keys_empty()); + } + + bool is_keys_empty() const override { return extent.read().keys() == 0; } + + bool has_single_value() const override { + validate_non_empty(); + if constexpr (NODE_TYPE == node_type_t::INTERNAL) { + return ((is_level_tail() && is_keys_empty()) || + (!is_level_tail() && stage_t::is_keys_one(extent.read()))); + } else { + return stage_t::is_keys_one(extent.read()); + } + } + + level_t level() const override { return extent.read().level(); } + node_offset_t free_size() const override { return extent.read().free_size(); } + extent_len_t total_size() const override { return extent.read().total_size(); } + bool is_extent_retired() const override { return extent.is_retired(); } + + std::optional<key_view_t> get_pivot_index() const override { + if (is_level_tail()) { + return std::nullopt; + } + assert(!is_keys_empty()); + key_view_t pivot_index; + stage_t::template get_largest_slot<false, true, false>( + extent.read(), nullptr, &pivot_index, nullptr); + return {pivot_index}; + } + + bool is_size_underflow() const override { + /** + * There might be 2 node-merge strategies: + * + * The first is to rebalance and merge nodes and prefer tree fullness as + * much as possible in order to save space and improve key density for + * lookup, in exchange for the effort of frequent merge, split and + * rebalance. These operations cannot benefit from seastore deltas because + * they are allocating fresh extents which need to be written into the + * journal as a whole, making write amplification much larger. + * + * The second is to delay rebalance and merge. When submitting the transaction, + * simple insert and erase only need to append a delta including just enough + * information about the inserted/erased item. The downside is that tree fullness + * is not as good as with the first strategy.
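+ *
+ * (Illustration: with the current 4 KiB node block size, an insert or
+ * erase delta only journals the affected item, whereas a merge or
+ * rebalance rewrites whole fresh extents and journals them in full,
+ * i.e. several KiB per operation.)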
+ * + * Currently the decision is the second way by delaying merge until the + * node is 1/4 full, so that: + * - After a split operation (making the node at least 1/2 full): + * - The next merge need to erase items taking at least 1/4 space; + * - The next split need to insert items taking at most 1/2 space; + * - After a merge operation (making the node at least 1/2 full): + * - The next merge need to erase items taking at least 1/4 space; + * - The next split need to insert items taking at most 1/2 space; + * - TODO: before node rebalance is implemented, the node size can be below + * the underflow limit if it cannot be merged with peers; + */ + auto& node_stage = extent.read(); + size_t empty_size = node_stage.size_before(0); + size_t filled_kv_size = filled_size() - empty_size; + size_t full_kv_size = node_stage.total_size() - empty_size; + return filled_kv_size <= full_kv_size / 4; + } + + std::tuple<match_stage_t, search_position_t> + erase(const search_position_t& pos) override { + LOG_PREFIX(OTree::Layout::erase); + SUBDEBUG(seastore_onode, "begin at erase_pos({}) ...", pos); + if (unlikely(LOGGER(seastore_onode).is_enabled(seastar::log_level::trace))) { + std::ostringstream sos; + dump(sos); + SUBTRACE(seastore_onode, "-- dump\n{}", sos.str()); + } + auto [stage, next_or_last_pos] = extent.erase_replayable(cast_down<STAGE>(pos)); + SUBDEBUG(seastore_onode, "done at erase_stage={}, n/l_pos({})", stage, next_or_last_pos); + if (unlikely(LOGGER(seastore_onode).is_enabled(seastar::log_level::trace))) { + std::ostringstream sos; + dump(sos); + SUBTRACE(seastore_onode, "-- dump\n{}", sos.str()); + } +#ifndef NDEBUG + if (!is_keys_empty()) { + validate_layout(); + } +#endif + return {stage, normalize(std::move(next_or_last_pos))}; + } + + std::tuple<match_stage_t, std::size_t> evaluate_merge( + NodeImpl& _right_node) override { + auto& left_node_stage = extent.read(); + auto& right_node = dynamic_cast<NodeLayoutT&>(_right_node); + auto& right_node_stage = right_node.extent.read(); + + assert(NODE_TYPE == _right_node.node_type()); + assert(FIELD_TYPE == _right_node.field_type()); + assert(!is_level_tail()); + assert(!is_keys_empty()); + + match_stage_t merge_stage; + node_offset_t size_comp; + if (right_node.is_keys_empty()) { + if constexpr (NODE_TYPE == node_type_t::INTERNAL) { + assert(right_node.is_level_tail()); + merge_stage = STAGE; + size_comp = right_node_stage.header_size(); + } else { + ceph_abort("impossible path"); + } + } else { + key_view_t left_pivot_index; + stage_t::template get_largest_slot<false, true, false>( + left_node_stage, nullptr, &left_pivot_index, nullptr); + std::tie(merge_stage, size_comp) = stage_t::evaluate_merge( + left_pivot_index, right_node_stage); + } + auto size_left = filled_size(); + auto size_right = right_node.filled_size(); + assert(size_right > size_comp); + std::size_t merge_size = size_left + size_right - size_comp; + return {merge_stage, merge_size}; + } + + search_position_t merge( + NodeExtentMutable& mut, + NodeImpl& _right_node, + match_stage_t merge_stage, + extent_len_t merge_size) override { + LOG_PREFIX(OTree::Layout::merge); + + auto& left_node_stage = extent.read(); + auto& right_node = dynamic_cast<NodeLayoutT&>(_right_node); + auto& right_node_stage = right_node.extent.read(); + if (unlikely(LOGGER(seastore_onode).is_enabled(seastar::log_level::debug))) { + { + std::ostringstream sos; + dump(sos); + SUBDEBUG(seastore_onode, "-- left node dump\n{}", sos.str()); + } + { + std::ostringstream sos; + right_node.dump(sos); + 
SUBDEBUG(seastore_onode, "-- right node dump\n{}", sos.str()); + } + } + + assert(NODE_TYPE == _right_node.node_type()); + assert(FIELD_TYPE == _right_node.field_type()); + assert(!is_level_tail()); + assert(!is_keys_empty()); + + if (right_node.is_level_tail()) { + node_stage_t::update_is_level_tail(mut, left_node_stage, true); + build_name(); + } + position_t left_last_pos; + stage_t::template get_largest_slot<true, false, false>( + left_node_stage, &left_last_pos, nullptr, nullptr); + + if (right_node.is_keys_empty()) { + if constexpr (NODE_TYPE == node_type_t::INTERNAL) { + assert(right_node.is_level_tail()); + laddr_t tail_value = right_node_stage.get_end_p_laddr()->value; + auto p_write = left_node_stage.get_end_p_laddr(); + mut.copy_in_absolute((void*)p_write, tail_value); + } else { + ceph_abort("impossible path"); + } + } else { + typename stage_t::template StagedAppender<KeyT::VIEW> left_appender; + left_appender.init_tail(&mut, left_node_stage, merge_stage); + + typename stage_t::StagedIterator right_append_at; + right_append_at.set(right_node_stage); + + auto pos_end = position_t::end(); + stage_t::template append_until<KeyT::VIEW>( + right_append_at, left_appender, pos_end, STAGE); + assert(right_append_at.is_end()); + left_appender.wrap(); + } + + if (unlikely(LOGGER(seastore_onode).is_enabled(seastar::log_level::debug))) { + std::ostringstream sos; + dump(sos); + SUBDEBUG(seastore_onode, "-- merged node dump\n{}", sos.str()); + } + assert(merge_size == filled_size()); + return normalize(std::move(left_last_pos)); + } + + eagain_ifuture<NodeExtentMutable> + rebuild_extent(context_t c) override { + assert(!is_keys_empty()); + key_view_t first_index; + stage_t::template get_slot<true, false>( + extent.read(), position_t::begin(), &first_index, nullptr); + auto hint = first_index.get_hint(); + return extent.rebuild(c, hint).si_then([this] (auto mut) { + // addr may change + build_name(); + return mut; + }); + } + + eagain_ifuture<> retire_extent(context_t c) override { + return extent.retire(c); + } + + search_position_t make_tail() override { + auto&& ret = extent.make_tail_replayable(); + // is_level_tail is changed + build_name(); + return normalize(std::move(ret)); + } + + node_stats_t get_stats() const override { + node_stats_t stats; + auto& node_stage = extent.read(); + key_view_t index_key; + if (!is_keys_empty()) { + stage_t::get_stats(node_stage, stats, index_key); + } + stats.size_persistent = extent.get_length(); + stats.size_filled = filled_size(); + if constexpr (NODE_TYPE == node_type_t::INTERNAL) { + if (is_level_tail()) { + stats.size_logical += sizeof(value_t); + stats.size_value += sizeof(value_t); + stats.num_kvs += 1; + } + } + return stats; + } + + std::ostream& dump(std::ostream& os) const override { + auto& node_stage = extent.read(); + auto p_start = node_stage.p_start(); + dump_brief(os); + auto stats = get_stats(); + os << " num_kvs=" << stats.num_kvs + << ", logical=" << stats.size_logical + << "B, overhead=" << stats.size_overhead + << "B, value=" << stats.size_value << "B"; + os << ":\n header: " << node_stage_t::header_size() << "B"; + size_t size = 0u; + if (!is_keys_empty()) { + stage_t::dump(node_stage, os, " ", size, p_start); + } else { + size += node_stage_t::header_size(); + if (NODE_TYPE == node_type_t::LEAF || !node_stage.is_level_tail()) { + os << " empty!"; + } + } + if constexpr (NODE_TYPE == node_type_t::INTERNAL) { + if (node_stage.is_level_tail()) { + size += sizeof(laddr_t); + auto value_ptr = node_stage.get_end_p_laddr(); + int 
offset = reinterpret_cast<const char*>(value_ptr) - p_start; + os << "\n tail value: 0x" + << std::hex << value_ptr->value << std::dec + << " " << size << "B" + << " @" << offset << "B"; + } + } + assert(size == filled_size()); + return os; + } + + std::ostream& dump_brief(std::ostream& os) const override { + os << name + << "(filled=" << filled_size() << "B" + << ", free=" << extent.read().free_size() << "B)"; + return os; + } + + const std::string& get_name() const override { return name; } + + void validate_layout() const override { +#ifndef NDEBUG + stage_t::validate(extent.read()); +#endif + } + + void test_copy_to(NodeExtentMutable& to) const override { + extent.test_copy_to(to); + } + + void test_set_tail(NodeExtentMutable& mut) override { + node_stage_t::update_is_level_tail(mut, extent.read(), true); + build_name(); + } + + /* + * Common + */ + void get_slot(const search_position_t& pos, + key_view_t* p_index_key = nullptr, + const value_t** pp_value = nullptr) const override { + assert(!is_keys_empty()); + assert(!pos.is_end()); + if (p_index_key && pp_value) { + stage_t::template get_slot<true, true>( + extent.read(), cast_down<STAGE>(pos), p_index_key, pp_value); + } else if (!p_index_key && pp_value) { + stage_t::template get_slot<false, true>( + extent.read(), cast_down<STAGE>(pos), nullptr, pp_value); + } else if (p_index_key && !pp_value) { + stage_t::template get_slot<true, false>( + extent.read(), cast_down<STAGE>(pos), p_index_key, nullptr); + } else { + ceph_abort("impossible path"); + } +#ifndef NDEBUG + if (pp_value) { + assert((const char*)(*pp_value) - extent.read().p_start() < + extent.get_length()); + } +#endif + } + + void get_prev_slot(search_position_t& pos, + key_view_t* p_index_key = nullptr, + const value_t** pp_value = nullptr) const override { + assert(!is_keys_empty()); + assert(!pos.is_end()); + auto& _pos = cast_down<STAGE>(pos); +#ifndef NDEBUG + auto nxt_pos = _pos; +#endif + if (!p_index_key && pp_value) { + stage_t::template get_prev_slot<false, true>( + extent.read(), _pos, nullptr, pp_value); + } else { + ceph_abort("not implemented"); + } +#ifndef NDEBUG + auto _nxt_pos = _pos; + stage_t::template get_next_slot<false, false>( + extent.read(), _nxt_pos, nullptr, nullptr); + assert(nxt_pos == _nxt_pos); +#endif + } + + void get_next_slot(search_position_t& pos, + key_view_t* p_index_key = nullptr, + const value_t** pp_value = nullptr) const override { + assert(!is_keys_empty()); + assert(!pos.is_end()); + bool find_next; + if (p_index_key && pp_value) { + find_next = stage_t::template get_next_slot<true, true>( + extent.read(), cast_down<STAGE>(pos), p_index_key, pp_value); + } else if (!p_index_key && pp_value) { + find_next = stage_t::template get_next_slot<false, true>( + extent.read(), cast_down<STAGE>(pos), nullptr, pp_value); + } else { + ceph_abort("not implemented"); + } + if (find_next) { + pos = search_position_t::end(); + } + } + + void get_largest_slot(search_position_t* p_pos = nullptr, + key_view_t* p_index_key = nullptr, + const value_t** pp_value = nullptr) const override { + assert(!is_keys_empty()); + if (p_pos && p_index_key && pp_value) { + stage_t::template get_largest_slot<true, true, true>( + extent.read(), &cast_down_fill_0<STAGE>(*p_pos), p_index_key, pp_value); + } else if (!p_pos && p_index_key && !pp_value) { + stage_t::template get_largest_slot<false, true, false>( + extent.read(), nullptr, p_index_key, nullptr); + } else if (p_pos && !p_index_key && pp_value) { + stage_t::template get_largest_slot<true, false, true>( + 
extent.read(), &cast_down_fill_0<STAGE>(*p_pos), nullptr, pp_value); + } else if (p_pos && !p_index_key && !pp_value) { + stage_t::template get_largest_slot<true, false, false>( + extent.read(), &cast_down_fill_0<STAGE>(*p_pos), nullptr, nullptr); + } else { + ceph_abort("not implemented"); + } + } + + + lookup_result_t<NODE_TYPE> lower_bound( + const key_hobj_t& key, MatchHistory& history, + key_view_t* index_key=nullptr, marker_t={}) const override { + auto& node_stage = extent.read(); + if constexpr (NODE_TYPE == node_type_t::LEAF) { + if (unlikely(is_keys_empty())) { + history.set<STAGE_LEFT>(MatchKindCMP::LT); + return lookup_result_t<NODE_TYPE>::end(); + } + } + assert(!is_keys_empty()); + + typename stage_t::result_t result_raw; + if (index_key) { + result_raw = stage_t::template lower_bound<true>( + node_stage, key, history, index_key); +#ifndef NDEBUG + if (!result_raw.is_end()) { + key_view_t index; + stage_t::template get_slot<true, false>( + node_stage, result_raw.position, &index, nullptr); + assert(index == *index_key); + } +#endif + } else { + result_raw = stage_t::lower_bound(node_stage, key, history); + } +#ifndef NDEBUG + if (result_raw.is_end()) { + assert(result_raw.mstat == MSTAT_END); + } else { + key_view_t index; + stage_t::template get_slot<true, false>( + node_stage, result_raw.position, &index, nullptr); + assert_mstat(key, index, result_raw.mstat); + } +#endif + + // calculate MSTAT_LT3 + if constexpr (FIELD_TYPE == field_type_t::N0) { + // currently only internal node checks mstat + if constexpr (NODE_TYPE == node_type_t::INTERNAL) { + if (result_raw.mstat == MSTAT_LT2) { + auto cmp = + key <=> node_stage[result_raw.position.index].shard_pool; + assert(cmp != std::strong_ordering::greater); + if (cmp != 0) { + result_raw.mstat = MSTAT_LT3; + } + } + } + } + + auto result = normalize(std::move(result_raw)); + if (result.is_end()) { + assert(node_stage.is_level_tail()); + assert(result.p_value == nullptr); + if constexpr (NODE_TYPE == node_type_t::INTERNAL) { + result.p_value = node_stage.get_end_p_laddr(); + } + } else { + assert(result.p_value != nullptr); + } + return result; + } + + const value_t* insert( + const full_key_t<KEY_TYPE>& key, const value_input_t& value, + search_position_t& insert_pos, match_stage_t& insert_stage, + node_offset_t& insert_size) override { + LOG_PREFIX(OTree::Layout::insert); + SUBDEBUG(seastore_onode, + "begin at insert_pos({}), insert_stage={}, insert_size={}B ...", + insert_pos, insert_stage, insert_size); + if (unlikely(LOGGER(seastore_onode).is_enabled(seastar::log_level::trace))) { + std::ostringstream sos; + dump(sos); + SUBTRACE(seastore_onode, "-- dump\n{}", sos.str()); + } + auto ret = extent.template insert_replayable<KEY_TYPE>( + key, value, cast_down<STAGE>(insert_pos), insert_stage, insert_size); + SUBDEBUG(seastore_onode, + "done at insert_pos({}), insert_stage={}, insert_size={}B", + insert_pos, insert_stage, insert_size); + if (unlikely(LOGGER(seastore_onode).is_enabled(seastar::log_level::trace))) { + std::ostringstream sos; + dump(sos); + SUBTRACE(seastore_onode, "-- dump\n{}", sos.str()); + } + validate_layout(); +#ifndef NDEBUG + key_view_t index; + get_slot(insert_pos, &index, nullptr); + assert(index == key); +#endif + return ret; + } + + std::tuple<search_position_t, bool, const value_t*> split_insert( + NodeExtentMutable& right_mut, NodeImpl& _right_impl, + const full_key_t<KEY_TYPE>& key, const value_input_t& value, + search_position_t& _insert_pos, match_stage_t& insert_stage, + node_offset_t& 
insert_size) override { + LOG_PREFIX(OTree::Layout::split_insert); + assert(_right_impl.node_type() == NODE_TYPE); + assert(_right_impl.field_type() == FIELD_TYPE); + auto& right_impl = dynamic_cast<NodeLayoutT&>(_right_impl); + SUBDEBUG(seastore_onode, + "begin at insert_pos({}), insert_stage={}, insert_size={}B ...", + _insert_pos, insert_stage, insert_size); + if (unlikely(LOGGER(seastore_onode).is_enabled(seastar::log_level::debug))) { + std::ostringstream sos; + dump(sos); + SUBDEBUG(seastore_onode, "-- dump\n{}", sos.str()); + } +#ifdef UNIT_TESTS_BUILT + auto insert_stage_pre = insert_stage; +#endif + + auto& insert_pos = cast_down<STAGE>(_insert_pos); + auto& node_stage = extent.read(); + typename stage_t::StagedIterator split_at; + bool is_insert_left; + size_t split_size; + size_t target_split_size; + { + size_t empty_size = node_stage.size_before(0); + size_t filled_kv_size = filled_size() - empty_size; + /** NODE_BLOCK_SIZE considerations + * + * Generally, + * target_split_size = (filled_size + insert_size) / 2 + * We can have two locate_split() strategies: + * A. the simpler one is to locate the largest split position where + * the estimated left_node_size <= target_split_size; + * B. the fair one takes a further step to calculate the next slot of + * P KiB, and if left_node_size + P/2 < target_split_size, compensate + * the split position to include the next slot; + * + * Say that the node_block_size = N KiB, the largest allowed + * insert_size = 1/I * N KiB (I > 1). We want to identify the minimal 'I' + * that won't lead to "double split" effect, meaning after a split, + * the right node size is still larger than N KiB and need to split + * again. I think "double split" makes split much more complicated and + * we can no longer identify whether the node is safe under concurrent + * operations. + * + * We need to evaluate the worst case in order to identify 'I'. This means: + * - filled_size ~= N KiB + * - insert_size == N/I KiB + * - target_split_size ~= (I+1)/2I * N KiB + * To simplify the below calculations, node_block_size is normalized to 1. + * + * With strategy A, the worst case is when left_node_size cannot include + * the next slot that will just overflow the target_split_size: + * - left_node_size + 1/I ~= (I+1)/2I + * - left_node_size ~= (I-1)/2I + * - right_node_size ~= 1 + 1/I - left_node_size ~= (I+3)/2I + * The right_node_size cannot larger than the node_block_size in the + * worst case, which means (I+3)/2I < 1, so I > 3, meaning the largest + * possible insert_size must be smaller than 1/3 of the node_block_size. + * + * With strategy B, the worst case is when left_node_size cannot include + * the next slot that will just overflow the threshold + * target_split_size - 1/2I, thus: + * - left_node_size ~= (I+1)/2I - 1/2I ~= 1/2 + * - right_node_size ~= 1 + 1/I - 1/2 ~= (I+2)/2I < node_block_size(1) + * - I > 2 + * This means the largest possible insert_size must be smaller than 1/2 of + * the node_block_size, which is better than strategy A. + * + * In order to avoid "double split", there is another side-effect we need + * to take into consideration: if split happens with snap-gen indexes, the + * according ns-oid string needs to be copied to the right node. That is + * to say: right_node_size + string_size < node_block_size. + * + * Say that the largest allowed string size is 1/S of the largest allowed + * insert_size N/I KiB. 
If we go with stragety B, and when split happens + * with snap-gen indexes and split just overflow the target_split_size: + * - left_node_size ~= target_split_size - 1/2 * (1/I - 1/IS) + * ~= 1/2 + 1/2IS + * - right_node_size ~= 1 + 1/I - left_node_size + 1/IS + * ~= 1/2 + 1/I + 1/2IS < 1 + * - I > 2 + 1/S (S > 1) + * + * Now back to NODE_BLOCK_SIZE calculation, if we have limits of at most + * X KiB ns-oid string and Y KiB of value to store in this BTree, then: + * - largest_insert_size ~= X+Y KiB + * - 1/S == X/(X+Y) + * - I > (3X+2Y)/(X+Y) + * - node_block_size(N) == I * insert_size > 3X+2Y KiB + * + * In conclusion, + * (TODO) the current node block size (4 KiB) is too small to + * store entire 2 KiB ns-oid string. We need to consider a larger + * node_block_size. + * + * We are setting X = Y = 640 B in order not to break the current + * implementations with 4KiB node. + * + * (TODO) Implement smarter logics to check when "double split" happens. + */ + target_split_size = empty_size + (filled_kv_size + insert_size) / 2; + assert(insert_size < (node_stage.total_size() - empty_size) / 2); + + std::optional<bool> _is_insert_left; + split_at.set(node_stage); + split_size = 0; + bool locate_nxt = stage_t::recursively_locate_split_inserted( + split_size, 0, target_split_size, insert_pos, + insert_stage, insert_size, _is_insert_left, split_at); + is_insert_left = *_is_insert_left; + SUBDEBUG(seastore_onode, + "-- located split_at({}), insert_pos({}), is_insert_left={}, " + "split_size={}B(target={}B, current={}B)", + split_at, insert_pos, is_insert_left, + split_size, target_split_size, filled_size()); + // split_size can be larger than target_split_size in strategy B + // assert(split_size <= target_split_size); + if (locate_nxt) { + assert(insert_stage == STAGE); + assert(split_at.get().is_last()); + split_at.set_end(); + assert(insert_pos.index == split_at.index()); + } + } + + auto append_at = split_at; + // TODO(cross-node string dedup) + typename stage_t::template StagedAppender<KEY_TYPE> right_appender; + right_appender.init_empty(&right_mut, right_mut.get_write()); + const value_t* p_value = nullptr; + if (!is_insert_left) { + // right node: append [start(append_at), insert_pos) + stage_t::template append_until<KEY_TYPE>( + append_at, right_appender, insert_pos, insert_stage); + SUBDEBUG(seastore_onode, + "-- right appended until " + "insert_pos({}), insert_stage={}, insert/append the rest ...", + insert_pos, insert_stage); + // right node: append [insert_pos(key, value)] + bool is_front_insert = (insert_pos == position_t::begin()); + [[maybe_unused]] bool is_end = stage_t::template append_insert<KEY_TYPE>( + key, value, append_at, right_appender, + is_front_insert, insert_stage, p_value); + assert(append_at.is_end() == is_end); + } else { + SUBDEBUG(seastore_onode, "-- right appending ..."); + } + + // right node: append (insert_pos, end) + auto pos_end = position_t::end(); + stage_t::template append_until<KEY_TYPE>( + append_at, right_appender, pos_end, STAGE); + assert(append_at.is_end()); + right_appender.wrap(); + if (unlikely(LOGGER(seastore_onode).is_enabled(seastar::log_level::debug))) { + std::ostringstream sos; + right_impl.dump(sos); + SUBDEBUG(seastore_onode, "-- right node dump\n{}", sos.str()); + } + right_impl.validate_layout(); + + // mutate left node + if (is_insert_left) { + SUBDEBUG(seastore_onode, + "-- left trim/insert at insert_pos({}), insert_stage={} ...", + insert_pos, insert_stage); + p_value = extent.template split_insert_replayable<KEY_TYPE>( + split_at, 
key, value, insert_pos, insert_stage, insert_size); +#ifndef NDEBUG + key_view_t index; + get_slot(_insert_pos, &index, nullptr); + assert(index == key); +#endif + } else { + SUBDEBUG(seastore_onode, "-- left trim ..."); +#ifndef NDEBUG + key_view_t index; + right_impl.get_slot(_insert_pos, &index, nullptr); + assert(index == key); +#endif + extent.split_replayable(split_at); + } + if (right_impl.is_level_tail()) { + // is_level_tail of left is changed by split/split_insert + build_name(); + } + if (unlikely(LOGGER(seastore_onode).is_enabled(seastar::log_level::debug))) { + std::ostringstream sos; + dump(sos); + SUBDEBUG(seastore_onode, "-- left node dump\n{}", sos.str()); + } + validate_layout(); + assert(p_value); + + auto split_pos = normalize(split_at.get_pos()); + SUBDEBUG(seastore_onode, + "done at insert_pos({}), insert_stage={}, insert_size={}B, " + "split_at({}), is_insert_left={}, split_size={}B(target={}B)", + _insert_pos, insert_stage, insert_size, split_pos, + is_insert_left, split_size, target_split_size); + assert(split_size == filled_size()); + +#ifdef UNIT_TESTS_BUILT + InsertType insert_type; + search_position_t last_pos; + if (is_insert_left) { + stage_t::template get_largest_slot<true, false, false>( + extent.read(), &cast_down_fill_0<STAGE>(last_pos), nullptr, nullptr); + } else { + node_stage_t right_stage{reinterpret_cast<FieldType*>(right_mut.get_write()), + right_mut.get_length()}; + stage_t::template get_largest_slot<true, false, false>( + right_stage, &cast_down_fill_0<STAGE>(last_pos), nullptr, nullptr); + } + if (_insert_pos == search_position_t::begin()) { + insert_type = InsertType::BEGIN; + } else if (_insert_pos == last_pos) { + insert_type = InsertType::LAST; + } else { + insert_type = InsertType::MID; + } + last_split = {split_pos, insert_stage_pre, is_insert_left, insert_type}; +#endif + return {split_pos, is_insert_left, p_value}; + } + + /* + * InternalNodeImpl + */ + const laddr_packed_t* get_tail_value() const override { + if constexpr (NODE_TYPE == node_type_t::INTERNAL) { + assert(is_level_tail()); + return extent.read().get_end_p_laddr(); + } else { + ceph_abort("impossible path"); + } + } + + void replace_child_addr( + const search_position_t& pos, laddr_t dst, laddr_t src) override { + if constexpr (NODE_TYPE == node_type_t::INTERNAL) { + LOG_PREFIX(OTree::Layout::replace_child_addr); + SUBDEBUG(seastore_onode, "update from {:#x} to {:#x} at pos({}) ...", src, dst, pos); + const laddr_packed_t* p_value; + if (pos.is_end()) { + assert(is_level_tail()); + p_value = get_tail_value(); + } else { + get_slot(pos, nullptr, &p_value); + } + assert(p_value->value == src); + extent.update_child_addr_replayable(dst, const_cast<laddr_packed_t*>(p_value)); + } else { + ceph_abort("impossible path"); + } + } + + std::tuple<match_stage_t, node_offset_t> evaluate_insert( + const key_view_t& key, const laddr_t& value, + search_position_t& insert_pos) const override { + if constexpr (NODE_TYPE == node_type_t::INTERNAL) { + auto& node_stage = extent.read(); + match_stage_t insert_stage; + node_offset_t insert_size; + if (unlikely(is_keys_empty())) { + assert(insert_pos.is_end()); + insert_stage = STAGE; + insert_size = stage_t::insert_size(key, value); + } else { + std::tie(insert_stage, insert_size) = stage_t::evaluate_insert( + node_stage, key, value, cast_down<STAGE>(insert_pos), false); + } + return {insert_stage, insert_size}; + } else { + ceph_abort("impossible path"); + } + } + + /* + * LeafNodeImpl + */ + std::tuple<match_stage_t, node_offset_t> 
evaluate_insert( + const key_hobj_t& key, const value_config_t& value, + const MatchHistory& history, match_stat_t mstat, + search_position_t& insert_pos) const override { + if constexpr (NODE_TYPE == node_type_t::LEAF) { + if (unlikely(is_keys_empty())) { + assert(insert_pos.is_end()); + assert(is_level_tail()); + return {STAGE, stage_t::insert_size(key, value)}; + } else { + return stage_t::evaluate_insert( + key, value, history, mstat, cast_down<STAGE>(insert_pos)); + } + } else { + ceph_abort("impossible path"); + } + } + + std::pair<NodeExtentMutable&, ValueDeltaRecorder*> + prepare_mutate_value_payload(context_t c) { + return extent.prepare_mutate_value_payload(c); + } + + private: + NodeLayoutT(NodeExtentRef extent) : extent{extent} { + build_name(); + } + + extent_len_t filled_size() const { + auto& node_stage = extent.read(); + auto ret = node_stage.size_before(node_stage.keys()); + assert(ret == node_stage.total_size() - node_stage.free_size()); + return ret; + } + + // rebuild the name whenever addr, type, level, tail is changed + void build_name() { + // XXX: maybe also include the extent state + std::ostringstream sos; + sos << "Node" << NODE_TYPE << FIELD_TYPE + << "@0x" << std::hex << extent.get_laddr() + << "+" << extent.get_length() << std::dec + << "Lv" << (unsigned)level() + << (is_level_tail() ? "$" : ""); + name = sos.str(); + } + + extent_t extent; + std::string name = "Node-N/A"; +}; + +using InternalNode0 = NodeLayoutT<node_fields_0_t, node_type_t::INTERNAL>; +using InternalNode1 = NodeLayoutT<node_fields_1_t, node_type_t::INTERNAL>; +using InternalNode2 = NodeLayoutT<node_fields_2_t, node_type_t::INTERNAL>; +using InternalNode3 = NodeLayoutT<internal_fields_3_t, node_type_t::INTERNAL>; +using LeafNode0 = NodeLayoutT<node_fields_0_t, node_type_t::LEAF>; +using LeafNode1 = NodeLayoutT<node_fields_1_t, node_type_t::LEAF>; +using LeafNode2 = NodeLayoutT<node_fields_2_t, node_type_t::LEAF>; +using LeafNode3 = NodeLayoutT<leaf_fields_3_t, node_type_t::LEAF>; + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_layout_replayable.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_layout_replayable.h new file mode 100644 index 000000000..d8a18231e --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_layout_replayable.h @@ -0,0 +1,138 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "node_extent_mutable.h" +#include "stages/node_stage.h" +#include "stages/stage.h" + +namespace crimson::os::seastore::onode { + +/** + * NodeLayoutReplayableT + * + * Contains templated logics to modify the layout of a NodeExtend which are + * also replayable. Used by NodeExtentAccessorT at runtime and by + * DeltaRecorderT during replay. 
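+ *
+ * Informally (pseudocode, not the actual recorder API), both paths funnel
+ * through the same static helpers so that runtime mutation and delta replay
+ * produce identical layouts:
+ *
+ *   // runtime, via NodeExtentAccessorT: mutate and record
+ *   NodeLayoutReplayableT<...>::insert<KT>(mut, node_stage, key, value,
+ *                                          insert_pos, insert_stage, insert_size);
+ *   record_delta(node_delta_op_t::INSERT, key, value, insert_pos, ...);
+ *
+ *   // replay, via DeltaRecorderT: decode the delta and re-apply
+ *   NodeLayoutReplayableT<...>::insert<KT>(mut, node_stage, key, value,
+ *                                          insert_pos, insert_stage, insert_size);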
+ */ +template <typename FieldType, node_type_t NODE_TYPE> +struct NodeLayoutReplayableT { + using node_stage_t = node_extent_t<FieldType, NODE_TYPE>; + using stage_t = node_to_stage_t<node_stage_t>; + using position_t = typename stage_t::position_t; + using StagedIterator = typename stage_t::StagedIterator; + using value_input_t = value_input_type_t<NODE_TYPE>; + using value_t = value_type_t<NODE_TYPE>; + static constexpr auto FIELD_TYPE = FieldType::FIELD_TYPE; + + template <KeyT KT> + static const value_t* insert( + NodeExtentMutable& mut, + const node_stage_t& node_stage, + const full_key_t<KT>& key, + const value_input_t& value, + position_t& insert_pos, + match_stage_t& insert_stage, + node_offset_t& insert_size) { + auto p_value = stage_t::template proceed_insert<KT, false>( + mut, node_stage, key, value, insert_pos, insert_stage, insert_size); + return p_value; + } + + static void split( + NodeExtentMutable& mut, + const node_stage_t& node_stage, + StagedIterator& split_at) { + node_stage_t::update_is_level_tail(mut, node_stage, false); + stage_t::trim(mut, split_at); + } + + template <KeyT KT> + static const value_t* split_insert( + NodeExtentMutable& mut, + const node_stage_t& node_stage, + StagedIterator& split_at, + const full_key_t<KT>& key, + const value_input_t& value, + position_t& insert_pos, + match_stage_t& insert_stage, + node_offset_t& insert_size) { + node_stage_t::update_is_level_tail(mut, node_stage, false); + stage_t::trim(mut, split_at); + auto p_value = stage_t::template proceed_insert<KT, true>( + mut, node_stage, key, value, insert_pos, insert_stage, insert_size); + return p_value; + } + + static void update_child_addr( + NodeExtentMutable& mut, const laddr_t new_addr, laddr_packed_t* p_addr) { + assert(NODE_TYPE == node_type_t::INTERNAL); + mut.copy_in_absolute(p_addr, new_addr); + } + + static std::tuple<match_stage_t, position_t> erase( + NodeExtentMutable& mut, + const node_stage_t& node_stage, + const position_t& _erase_pos) { + if (_erase_pos.is_end()) { + // must be internal node + assert(node_stage.is_level_tail()); + // return erase_stage, last_pos + return update_last_to_tail(mut, node_stage); + } + + assert(node_stage.keys() != 0); + position_t erase_pos = _erase_pos; + auto erase_stage = stage_t::erase(mut, node_stage, erase_pos); + // return erase_stage, next_pos + return {erase_stage, erase_pos}; + } + + static position_t make_tail( + NodeExtentMutable& mut, + const node_stage_t& node_stage) { + assert(!node_stage.is_level_tail()); + if constexpr (NODE_TYPE == node_type_t::INTERNAL) { + auto [r_stage, r_last_pos] = update_last_to_tail(mut, node_stage); + std::ignore = r_stage; + return r_last_pos; + } else { + node_stage_t::update_is_level_tail(mut, node_stage, true); + // no need to calculate the last pos + return position_t::end(); + } + } + + private: + static std::tuple<match_stage_t, position_t> update_last_to_tail( + NodeExtentMutable& mut, + const node_stage_t& node_stage) { + if constexpr (NODE_TYPE == node_type_t::INTERNAL) { + assert(node_stage.keys() != 0); + position_t last_pos; + laddr_t last_value; + { + const laddr_packed_t* p_last_value; + stage_t::template get_largest_slot<true, false, true>( + node_stage, &last_pos, nullptr, &p_last_value); + last_value = p_last_value->value; + } + + auto erase_pos = last_pos; + auto erase_stage = stage_t::erase(mut, node_stage, erase_pos); + assert(erase_pos.is_end()); + + node_stage_t::update_is_level_tail(mut, node_stage, true); + auto p_last_value = const_cast<laddr_packed_t*>( + 
node_stage.get_end_p_laddr()); + mut.copy_in_absolute(p_last_value, last_value); + // return erase_stage, last_pos + return {erase_stage, last_pos}; + } else { + ceph_abort("impossible path"); + } + } +}; + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_types.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_types.h new file mode 100644 index 000000000..22c140b59 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_types.h @@ -0,0 +1,145 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <cassert> +#include <ostream> + +#include "fwd.h" + +namespace crimson::os::seastore::onode { + +constexpr uint8_t FIELD_TYPE_MAGIC = 0x25; +enum class field_type_t : uint8_t { + N0 = FIELD_TYPE_MAGIC, + N1, + N2, + N3, + _MAX +}; +inline uint8_t to_unsigned(field_type_t type) { + auto value = static_cast<uint8_t>(type); + assert(value >= FIELD_TYPE_MAGIC); + assert(value < static_cast<uint8_t>(field_type_t::_MAX)); + return value - FIELD_TYPE_MAGIC; +} +inline std::ostream& operator<<(std::ostream &os, field_type_t type) { + const char* const names[] = {"0", "1", "2", "3"}; + auto index = to_unsigned(type); + os << names[index]; + return os; +} + +enum class node_type_t : uint8_t { + LEAF = 0, + INTERNAL +}; +inline std::ostream& operator<<(std::ostream &os, const node_type_t& type) { + const char* const names[] = {"L", "I"}; + auto index = static_cast<uint8_t>(type); + assert(index <= 1u); + os << names[index]; + return os; +} + +struct laddr_packed_t { + laddr_t value; +} __attribute__((packed)); +inline std::ostream& operator<<(std::ostream& os, const laddr_packed_t& laddr) { + return os << "laddr_packed(0x" << std::hex << laddr.value << std::dec << ")"; +} + +using match_stat_t = int8_t; +constexpr match_stat_t MSTAT_END = -2; // index is search_position_t::end() +constexpr match_stat_t MSTAT_EQ = -1; // key == index +constexpr match_stat_t MSTAT_LT0 = 0; // key == index [pool/shard crush ns/oid]; key < index [snap/gen] +constexpr match_stat_t MSTAT_LT1 = 1; // key == index [pool/shard crush]; key < index [ns/oid] +constexpr match_stat_t MSTAT_LT2 = 2; // key < index [pool/shard crush ns/oid] || + // key == index [pool/shard]; key < index [crush] +constexpr match_stat_t MSTAT_LT3 = 3; // key < index [pool/shard] +constexpr match_stat_t MSTAT_MIN = MSTAT_END; +constexpr match_stat_t MSTAT_MAX = MSTAT_LT3; + +enum class node_delta_op_t : uint8_t { + INSERT, + SPLIT, + SPLIT_INSERT, + UPDATE_CHILD_ADDR, + ERASE, + MAKE_TAIL, + SUBOP_UPDATE_VALUE = 0xff, +}; + +/** nextent_state_t + * + * The possible states of tree node extent(NodeExtentAccessorT). + * + * State transition implies the following capabilities is changed: + * - mutability is changed; + * - whether to record; + * - memory has been copied; + * + * load()----+ + * | + * alloc() v + * | +--> [READ_ONLY] ---------+ + * | | | | + * | | prepare_mutate() | + * | | | | + * | v v v + * | +--> [MUTATION_PENDING]---+ + * | | | + * | | rebuild() + * | | | + * | v v + * +------->+--> [FRESH] <------------+ + * + * Note that NodeExtentAccessorT might still be MUTATION_PENDING/FRESH while + * the internal extent has become DIRTY after the transaction submission is + * started while nodes destruction and validation has not been completed yet. 
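+ *
+ * Roughly, per state (informal summary of the diagram above):
+ * - READ_ONLY:        the extent is unmodified; prepare_mutate() is required
+ *                     before any write so that mutations can be recorded;
+ * - MUTATION_PENDING: mutations are applied in memory and each one must also
+ *                     be recorded as a delta for journal replay;
+ * - FRESH:            a newly allocated or rebuilt extent that will be
+ *                     written out as a whole, so no delta recording is needed.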
+ */ +enum class nextent_state_t : uint8_t { + READ_ONLY = 0, // requires mutate for recording + // CLEAN/DIRTY + MUTATION_PENDING, // can mutate, needs recording + // MUTATION_PENDING + FRESH, // can mutate, no recording + // INITIAL_WRITE_PENDING +}; + +} + +template <> struct fmt::formatter<crimson::os::seastore::onode::node_delta_op_t> + : fmt::formatter<std::string_view> { + using node_delta_op_t = crimson::os::seastore::onode::node_delta_op_t; + // parse is inherited from formatter<string_view>. + template <typename FormatContext> + auto format(node_delta_op_t op, FormatContext& ctx) { + std::string_view name = "unknown"; + switch (op) { + case node_delta_op_t::INSERT: + name = "insert"; + break; + case node_delta_op_t::SPLIT: + name = "split"; + break; + case node_delta_op_t::SPLIT_INSERT: + name = "split_insert"; + break; + case node_delta_op_t::UPDATE_CHILD_ADDR: + name = "update_child_addr"; + break; + case node_delta_op_t::ERASE: + name = "erase"; + break; + case node_delta_op_t::MAKE_TAIL: + name = "make_tail"; + break; + case node_delta_op_t::SUBOP_UPDATE_VALUE: + name = "subop_update_value"; + break; + } + return formatter<string_view>::format(name, ctx); + } +}; diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/item_iterator_stage.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/item_iterator_stage.cc new file mode 100644 index 000000000..9252fb99a --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/item_iterator_stage.cc @@ -0,0 +1,202 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#include "item_iterator_stage.h" + +#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.h" + +namespace crimson::os::seastore::onode { + +#define ITER_T item_iterator_t<NODE_TYPE> +#define ITER_INST(NT) item_iterator_t<NT> + +template <node_type_t NODE_TYPE> +template <IsFullKey Key> +memory_range_t ITER_T::insert_prefix( + NodeExtentMutable& mut, const ITER_T& iter, const Key& key, + bool is_end, node_offset_t size, const char* p_left_bound) +{ + // 1. insert range + char* p_insert; + if (is_end) { + assert(!iter.has_next()); + p_insert = const_cast<char*>(iter.p_start()); + } else { + p_insert = const_cast<char*>(iter.p_end()); + } + char* p_insert_front = p_insert - size; + + // 2. shift memory + const char* p_shift_start = p_left_bound; + const char* p_shift_end = p_insert; + mut.shift_absolute(p_shift_start, + p_shift_end - p_shift_start, + -(int)size); + + // 3. 
append header + p_insert -= sizeof(node_offset_t); + node_offset_t back_offset = (p_insert - p_insert_front); + mut.copy_in_absolute(p_insert, back_offset); + ns_oid_view_t::append(mut, key, p_insert); + + return {p_insert_front, p_insert}; +} +#define IP_TEMPLATE(NT, Key) \ + template memory_range_t ITER_INST(NT)::insert_prefix<Key>( \ + NodeExtentMutable&, const ITER_INST(NT)&, const Key&, \ + bool, node_offset_t, const char*) +IP_TEMPLATE(node_type_t::LEAF, key_view_t); +IP_TEMPLATE(node_type_t::INTERNAL, key_view_t); +IP_TEMPLATE(node_type_t::LEAF, key_hobj_t); +IP_TEMPLATE(node_type_t::INTERNAL, key_hobj_t); + +template <node_type_t NODE_TYPE> +void ITER_T::update_size( + NodeExtentMutable& mut, const ITER_T& iter, int change) +{ + node_offset_t offset = iter.get_back_offset(); + int new_size = change + offset; + assert(new_size > 0 && new_size < (int)mut.get_length()); + mut.copy_in_absolute( + (void*)iter.get_item_range().p_end, node_offset_t(new_size)); +} + +template <node_type_t NODE_TYPE> +node_offset_t ITER_T::trim_until(NodeExtentMutable& mut, const ITER_T& iter) +{ + assert(iter.index() != 0); + size_t ret = iter.p_end() - iter.p_items_start; + assert(ret < mut.get_length()); + return ret; +} + +template <node_type_t NODE_TYPE> +node_offset_t ITER_T::trim_at( + NodeExtentMutable& mut, const ITER_T& iter, node_offset_t trimmed) +{ + size_t trim_size = iter.p_start() - iter.p_items_start + trimmed; + assert(trim_size < mut.get_length()); + assert(iter.get_back_offset() > trimmed); + node_offset_t new_offset = iter.get_back_offset() - trimmed; + mut.copy_in_absolute((void*)iter.item_range.p_end, new_offset); + return trim_size; +} + +template <node_type_t NODE_TYPE> +node_offset_t ITER_T::erase( + NodeExtentMutable& mut, const ITER_T& iter, const char* p_left_bound) +{ + node_offset_t erase_size = iter.p_end() - iter.p_start(); + const char* p_shift_start = p_left_bound; + assert(p_left_bound <= iter.p_start()); + extent_len_t shift_len = iter.p_start() - p_left_bound; + int shift_off = erase_size; + mut.shift_absolute(p_shift_start, shift_len, shift_off); + return erase_size; +} + +#define ITER_TEMPLATE(NT) template class ITER_INST(NT) +ITER_TEMPLATE(node_type_t::LEAF); +ITER_TEMPLATE(node_type_t::INTERNAL); + +#define APPEND_T ITER_T::Appender<KT> + +template <node_type_t NODE_TYPE> +template <KeyT KT> +APPEND_T::Appender(NodeExtentMutable* p_mut, + const item_iterator_t& iter, + bool open) : p_mut{p_mut} +{ + assert(!iter.has_next()); + if (open) { + p_append = const_cast<char*>(iter.get_key().p_start()); + p_offset_while_open = const_cast<char*>(iter.item_range.p_end); + } else { + // XXX: this doesn't need to advance the iter to last + p_append = const_cast<char*>(iter.p_items_start); + } +} + +template <node_type_t NODE_TYPE> +template <KeyT KT> +bool APPEND_T::append(const ITER_T& src, index_t& items) +{ + auto p_end = src.p_end(); + bool append_till_end = false; + if (is_valid_index(items)) { + for (auto i = 1u; i <= items; ++i) { + if (!src.has_next()) { + assert(i == items); + append_till_end = true; + break; + } + ++src; + } + } else { + if (items == INDEX_END) { + append_till_end = true; + } else { + assert(items == INDEX_LAST); + } + items = 0; + while (src.has_next()) { + ++src; + ++items; + } + if (append_till_end) { + ++items; + } + } + + const char* p_start; + if (append_till_end) { + p_start = src.p_start(); + } else { + p_start = src.p_end(); + } + assert(p_end >= p_start); + size_t append_size = p_end - p_start; + p_append -= append_size; + 
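+  // the items region is packed towards lower addresses, so first move
+  // p_append back by append_size, then copy the selected source range in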
p_mut->copy_in_absolute(p_append, p_start, append_size); + return append_till_end; +} + +template <node_type_t NODE_TYPE> +template <KeyT KT> +std::tuple<NodeExtentMutable*, char*> +APPEND_T::open_nxt(const key_get_type& partial_key) +{ + p_append -= sizeof(node_offset_t); + p_offset_while_open = p_append; + ns_oid_view_t::append(*p_mut, partial_key, p_append); + return {p_mut, p_append}; +} + +template <node_type_t NODE_TYPE> +template <KeyT KT> +std::tuple<NodeExtentMutable*, char*> +APPEND_T::open_nxt(const full_key_t<KT>& key) +{ + p_append -= sizeof(node_offset_t); + p_offset_while_open = p_append; + ns_oid_view_t::append(*p_mut, key, p_append); + return {p_mut, p_append}; +} + +template <node_type_t NODE_TYPE> +template <KeyT KT> +void APPEND_T::wrap_nxt(char* _p_append) +{ + assert(_p_append < p_append); + p_mut->copy_in_absolute( + p_offset_while_open, node_offset_t(p_offset_while_open - _p_append)); + p_append = _p_append; +} + +#define APPEND_TEMPLATE(NT, KT) template class ITER_INST(NT)::Appender<KT> +APPEND_TEMPLATE(node_type_t::LEAF, KeyT::VIEW); +APPEND_TEMPLATE(node_type_t::INTERNAL, KeyT::VIEW); +APPEND_TEMPLATE(node_type_t::LEAF, KeyT::HOBJ); +APPEND_TEMPLATE(node_type_t::INTERNAL, KeyT::HOBJ); + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/item_iterator_stage.h b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/item_iterator_stage.h new file mode 100644 index 000000000..9d12474ac --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/item_iterator_stage.h @@ -0,0 +1,193 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "crimson/os/seastore/onode_manager/staged-fltree/node_types.h" +#include "key_layout.h" +#include "stage_types.h" + +namespace crimson::os::seastore::onode { + +class NodeExtentMutable; + +/** + * item_iterator_t + * + * The STAGE_STRING implementation for node N0/N1, implements staged contract + * as an iterative container to resolve crush hash conflicts. 
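+ * Items are indexed by their ns/oid strings; they are laid out and iterated
+ * from right (p_items_end) to left (p_items_start).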
+ * + * The layout of the container to index ns, oid strings storing n items: + * + * # <--------- container range ---------> # + * #<~># items [i+1, n) # + * # # items [0, i) #<~># + * # # <------ item i -------------> # # + * # # <--- item_range ---> | # # + * # # | # # + * # # next-stage | ns-oid | back_ # # + * # # container | strings | offset # # + * #...# range | | #...# + * ^ ^ | ^ + * | | | | + * | +---------------------------+ | + * + p_items_start p_items_end + */ +template <node_type_t NODE_TYPE> +class item_iterator_t { + using value_input_t = value_input_type_t<NODE_TYPE>; + using value_t = value_type_t<NODE_TYPE>; + public: + item_iterator_t(const container_range_t& range) + : node_size{range.node_size}, + p_items_start(range.range.p_start), + p_items_end(range.range.p_end) { + assert(is_valid_node_size(node_size)); + assert(p_items_start < p_items_end); + next_item_range(p_items_end); + } + + const char* p_start() const { return item_range.p_start; } + const char* p_end() const { return item_range.p_end + sizeof(node_offset_t); } + const memory_range_t& get_item_range() const { return item_range; } + node_offset_t get_back_offset() const { return back_offset; } + + // container type system + using key_get_type = const ns_oid_view_t&; + static constexpr auto CONTAINER_TYPE = ContainerType::ITERATIVE; + index_t index() const { return _index; } + key_get_type get_key() const { + if (!key.has_value()) { + key = ns_oid_view_t(item_range.p_end); + assert(item_range.p_start < (*key).p_start()); + } + return *key; + } + node_offset_t size() const { + size_t ret = item_range.p_end - item_range.p_start + sizeof(node_offset_t); + assert(ret < node_size); + return ret; + }; + node_offset_t size_to_nxt() const { + size_t ret = get_key().size() + sizeof(node_offset_t); + assert(ret < node_size); + return ret; + } + node_offset_t size_overhead() const { + return sizeof(node_offset_t) + get_key().size_overhead(); + } + container_range_t get_nxt_container() const { + return {{item_range.p_start, get_key().p_start()}, node_size}; + } + bool has_next() const { + assert(p_items_start <= item_range.p_start); + return p_items_start < item_range.p_start; + } + const item_iterator_t<NODE_TYPE>& operator++() const { + assert(has_next()); + next_item_range(item_range.p_start); + key.reset(); + ++_index; + return *this; + } + void encode(const char* p_node_start, ceph::bufferlist& encoded) const { + int start_offset = p_items_start - p_node_start; + int stage_size = p_items_end - p_items_start; + assert(start_offset > 0); + assert(stage_size > 0); + assert(start_offset + stage_size <= (int)node_size); + ceph::encode(static_cast<node_offset_t>(start_offset), encoded); + ceph::encode(static_cast<node_offset_t>(stage_size), encoded); + ceph::encode(_index, encoded); + } + + static item_iterator_t decode(const char* p_node_start, + extent_len_t node_size, + ceph::bufferlist::const_iterator& delta) { + node_offset_t start_offset; + ceph::decode(start_offset, delta); + node_offset_t stage_size; + ceph::decode(stage_size, delta); + assert(start_offset > 0); + assert(stage_size > 0); + assert((unsigned)start_offset + stage_size <= node_size); + index_t index; + ceph::decode(index, delta); + + item_iterator_t ret({{p_node_start + start_offset, + p_node_start + start_offset + stage_size}, + node_size}); + while (index > 0) { + ++ret; + --index; + } + return ret; + } + + static node_offset_t header_size() { return 0u; } + + template <IsFullKey Key> + static node_offset_t estimate_insert( + const Key& key, const 
value_input_t&) { + return ns_oid_view_t::estimate_size(key) + sizeof(node_offset_t); + } + + template <IsFullKey Key> + static memory_range_t insert_prefix( + NodeExtentMutable& mut, const item_iterator_t<NODE_TYPE>& iter, + const Key& key, bool is_end, + node_offset_t size, const char* p_left_bound); + + static void update_size( + NodeExtentMutable& mut, const item_iterator_t<NODE_TYPE>& iter, int change); + + static node_offset_t trim_until(NodeExtentMutable&, const item_iterator_t<NODE_TYPE>&); + static node_offset_t trim_at( + NodeExtentMutable&, const item_iterator_t<NODE_TYPE>&, node_offset_t trimmed); + + static node_offset_t erase( + NodeExtentMutable&, const item_iterator_t<NODE_TYPE>&, const char*); + + template <KeyT KT> + class Appender; + + private: + void next_item_range(const char* p_end) const { + auto p_item_end = p_end - sizeof(node_offset_t); + assert(p_items_start < p_item_end); + back_offset = reinterpret_cast<const node_offset_packed_t*>(p_item_end)->value; + assert(back_offset); + const char* p_item_start = p_item_end - back_offset; + assert(p_items_start <= p_item_start); + item_range = {p_item_start, p_item_end}; + } + + extent_len_t node_size; + const char* p_items_start; + const char* p_items_end; + mutable memory_range_t item_range; + mutable node_offset_t back_offset; + mutable std::optional<ns_oid_view_t> key; + mutable index_t _index = 0u; +}; + +template <node_type_t NODE_TYPE> +template <KeyT KT> +class item_iterator_t<NODE_TYPE>::Appender { + public: + Appender(NodeExtentMutable* p_mut, char* p_append) + : p_mut{p_mut}, p_append{p_append} {} + Appender(NodeExtentMutable*, const item_iterator_t&, bool open); + bool append(const item_iterator_t<NODE_TYPE>& src, index_t& items); + char* wrap() { return p_append; } + std::tuple<NodeExtentMutable*, char*> open_nxt(const key_get_type&); + std::tuple<NodeExtentMutable*, char*> open_nxt(const full_key_t<KT>&); + void wrap_nxt(char* _p_append); + + private: + NodeExtentMutable* p_mut; + char* p_append; + char* p_offset_while_open; +}; + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/key_layout.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/key_layout.cc new file mode 100644 index 000000000..4bf717dc2 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/key_layout.cc @@ -0,0 +1,34 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#include "key_layout.h" + +#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.h" + +namespace crimson::os::seastore::onode { + +void string_key_view_t::append_str( + NodeExtentMutable& mut, std::string_view str, char*& p_append) +{ + assert(is_valid_size(str.length())); + p_append -= sizeof(string_size_t); + string_size_t len = str.length(); + mut.copy_in_absolute(p_append, len); + p_append -= len; + mut.copy_in_absolute(p_append, str.data(), len); +} + +void string_key_view_t::append_dedup( + NodeExtentMutable& mut, const Type& dedup_type, char*& p_append) +{ + p_append -= sizeof(string_size_t); + if (dedup_type == Type::MIN) { + mut.copy_in_absolute(p_append, MARKER_MIN); + } else if (dedup_type == Type::MAX) { + mut.copy_in_absolute(p_append, MARKER_MAX); + } else { + ceph_abort("impossible path"); + } +} + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/key_layout.h b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/key_layout.h new file mode 100644 index 000000000..fcd485355 --- /dev/null +++ 
b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/key_layout.h @@ -0,0 +1,910 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <cassert> +#include <limits> +#include <optional> +#include <ostream> + +#include "common/hobject.h" +#include "crimson/os/seastore/onode.h" +#include "crimson/os/seastore/onode_manager/staged-fltree/fwd.h" + +namespace crimson::os::seastore::onode { + +using shard_t = int8_t; +using pool_t = int64_t; +// Note: this is the reversed version of the object hash +using crush_hash_t = uint32_t; +using snap_t = uint64_t; +using gen_t = uint64_t; +static_assert(sizeof(shard_t) == sizeof(ghobject_t().shard_id.id)); +static_assert(sizeof(pool_t) == sizeof(ghobject_t().hobj.pool)); +static_assert(sizeof(crush_hash_t) == sizeof(ghobject_t().hobj.get_bitwise_key_u32())); +static_assert(sizeof(snap_t) == sizeof(ghobject_t().hobj.snap.val)); +static_assert(sizeof(gen_t) == sizeof(ghobject_t().generation)); + +constexpr auto MAX_SHARD = std::numeric_limits<shard_t>::max(); +constexpr auto MAX_POOL = std::numeric_limits<pool_t>::max(); +constexpr auto MAX_CRUSH = std::numeric_limits<crush_hash_t>::max(); +constexpr auto MAX_SNAP = std::numeric_limits<snap_t>::max(); +constexpr auto MAX_GEN = std::numeric_limits<gen_t>::max(); + +class NodeExtentMutable; +class key_view_t; +class key_hobj_t; +enum class KeyT { VIEW, HOBJ }; +template <KeyT> struct _full_key_type; +template<> struct _full_key_type<KeyT::VIEW> { using type = key_view_t; }; +template<> struct _full_key_type<KeyT::HOBJ> { using type = key_hobj_t; }; +template <KeyT type> +using full_key_t = typename _full_key_type<type>::type; + +static laddr_t get_lba_hint(shard_t shard, pool_t pool, crush_hash_t crush) { + // FIXME: It is possible that PGs from different pools share the same prefix + // if the mask 0xFF is not long enough, resulting in unexpected transaction + // conflicts. + return ((uint64_t)(shard & 0XFF)<<56 | + (uint64_t)(pool & 0xFF)<<48 | + (uint64_t)(crush )<<16); +} + +struct node_offset_packed_t { + node_offset_t value; +} __attribute__((packed)); + +/** + * string_key_view_t + * + * The layout to store a char array as an oid or an ns string, which may be + * compressed. + * + * (TODO) If compressed, the physical block only stores an unsigned int of + * string_size_t, with value MARKER_MIN denoting Type::MIN, and value + * MARKER_MAX denoting Type::MAX. + * + * If not compressed (Type::STR), the physical block stores the char array and + * a valid string_size_t value. 
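+ * + * For example (an illustrative sketch): appending the 3-char string "abc" via
+ * append_str() stores the characters followed by a string_size_t holding 3,
+ * and string_key_view_t(p_end) recovers them by reading the size just below
+ * p_end and deriving p_key = p_length - 3.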
+ */ +struct string_key_view_t { + enum class Type {MIN, STR, MAX}; + static constexpr auto MARKER_MAX = std::numeric_limits<string_size_t>::max(); + static constexpr auto MARKER_MIN = std::numeric_limits<string_size_t>::max() - 1; + static constexpr auto VALID_UPPER_BOUND = std::numeric_limits<string_size_t>::max() - 2; + static bool is_valid_size(size_t size) { + return size <= VALID_UPPER_BOUND; + } + + string_key_view_t(const char* p_end) { + p_length = p_end - sizeof(string_size_t); + std::memcpy(&length, p_length, sizeof(string_size_t)); + if (is_valid_size(length)) { + auto _p_key = p_length - length; + p_key = static_cast<const char*>(_p_key); + } else { + assert(length == MARKER_MAX || length == MARKER_MIN); + p_key = nullptr; + } + } + Type type() const { + if (length == MARKER_MIN) { + return Type::MIN; + } else if (length == MARKER_MAX) { + return Type::MAX; + } else { + assert(is_valid_size(length)); + return Type::STR; + } + } + const char* p_start() const { + if (p_key) { + return p_key; + } else { + return p_length; + } + } + const char* p_next_end() const { + if (p_key) { + return p_start(); + } else { + return p_length + sizeof(string_size_t); + } + } + node_offset_t size() const { + size_t ret = length + sizeof(string_size_t); + assert(ret < MAX_NODE_SIZE); + return ret; + } + node_offset_t size_logical() const { + assert(type() == Type::STR); + assert(is_valid_size(length)); + return length; + } + node_offset_t size_overhead() const { + assert(type() == Type::STR); + return sizeof(string_size_t); + } + + std::string_view to_string_view() const { + assert(type() == Type::STR); + assert(is_valid_size(length)); + return {p_key, length}; + } + bool operator==(const string_key_view_t& x) const { + if (type() == x.type() && type() != Type::STR) + return true; + if (type() != x.type()) + return false; + if (length != x.length) + return false; + return (memcmp(p_key, x.p_key, length) == 0); + } + bool operator!=(const string_key_view_t& x) const { return !(*this == x); } + + void reset_to(const char* origin_base, + const char* new_base, + extent_len_t node_size) { + reset_ptr(p_key, origin_base, new_base, node_size); + reset_ptr(p_length, origin_base, new_base, node_size); +#ifndef NDEBUG + string_size_t current_length; + std::memcpy(&current_length, p_length, sizeof(string_size_t)); + assert(length == current_length); +#endif + } + + static void append_str( + NodeExtentMutable&, std::string_view, char*& p_append); + + static void test_append_str(std::string_view str, char*& p_append) { + assert(is_valid_size(str.length())); + p_append -= sizeof(string_size_t); + string_size_t len = str.length(); + std::memcpy(p_append, &len, sizeof(string_size_t)); + p_append -= len; + std::memcpy(p_append, str.data(), len); + } + + static void append_dedup( + NodeExtentMutable&, const Type& dedup_type, char*& p_append); + + static void test_append_dedup(const Type& dedup_type, char*& p_append) { + p_append -= sizeof(string_size_t); + string_size_t len; + if (dedup_type == Type::MIN) { + len = MARKER_MIN; + } else if (dedup_type == Type::MAX) { + len = MARKER_MAX; + } else { + ceph_abort("impossible path"); + } + std::memcpy(p_append, &len, sizeof(string_size_t)); + } + + const char* p_key; + const char* p_length; + // TODO: remove if p_length is aligned + string_size_t length; +}; + +/** + * string_view_masked_t + * + * A common class to hide the underlying string implementation regardless of a + * string_key_view_t (maybe compressed), a string/string_view, or a compressed + * string. 
It leverages this consistent class to do compare, print, convert and + * append operations. + */ +class string_view_masked_t { + public: + using Type = string_key_view_t::Type; + explicit string_view_masked_t(const string_key_view_t& index) + : type{index.type()} { + if (type == Type::STR) { + view = index.to_string_view(); + } + } + explicit string_view_masked_t(std::string_view str) + : type{Type::STR}, view{str} { + assert(string_key_view_t::is_valid_size(view.size())); + } + + Type get_type() const { return type; } + std::string_view to_string_view() const { + assert(get_type() == Type::STR); + return view; + } + string_size_t size() const { + assert(get_type() == Type::STR); + assert(string_key_view_t::is_valid_size(view.size())); + return view.size(); + } + bool operator==(const string_view_masked_t& x) const { + if (get_type() == x.get_type() && get_type() != Type::STR) + return true; + if (get_type() != x.get_type()) + return false; + if (size() != x.size()) + return false; + return (memcmp(view.data(), x.view.data(), size()) == 0); + } + auto operator<=>(std::string_view rhs) const { + using Type = string_view_masked_t::Type; + assert(string_key_view_t::is_valid_size(rhs.size())); + auto lhs_type = get_type(); + if (lhs_type == Type::MIN) { + return std::strong_ordering::less; + } else if (lhs_type == Type::MAX) { + return std::strong_ordering::greater; + } else { // lhs_type == Type::STR + assert(string_key_view_t::is_valid_size(size())); + return to_string_view() <=> rhs; + } + } + void encode(ceph::bufferlist& bl) const { + if (get_type() == Type::MIN) { + ceph::encode(string_key_view_t::MARKER_MIN, bl); + } else if (get_type() == Type::MAX) { + ceph::encode(string_key_view_t::MARKER_MAX, bl); + } else { + ceph::encode(size(), bl); + ceph::encode_nohead(view, bl); + } + } + static auto min() { return string_view_masked_t{Type::MIN}; } + static auto max() { return string_view_masked_t{Type::MAX}; } + static string_view_masked_t decode( + std::string& str_storage, ceph::bufferlist::const_iterator& delta) { + string_size_t size; + ceph::decode(size, delta); + if (size == string_key_view_t::MARKER_MIN) { + return min(); + } else if (size == string_key_view_t::MARKER_MAX) { + return max(); + } else { + ceph::decode_nohead(size, str_storage, delta); + return string_view_masked_t(str_storage); + } + } + + private: + explicit string_view_masked_t(Type type) + : type{type} {} + + Type type; + std::string_view view; +}; + +inline auto operator<=>(const string_view_masked_t& l, const string_view_masked_t& r) { + using Type = string_view_masked_t::Type; + auto l_type = l.get_type(); + auto r_type = r.get_type(); + if (l_type == Type::STR && r_type == Type::STR) { + assert(string_key_view_t::is_valid_size(l.size())); + assert(string_key_view_t::is_valid_size(r.size())); + return l.to_string_view() <=> r.to_string_view(); + } else if (l_type == r_type) { + return std::strong_ordering::equal; + } else if (l_type == Type::MIN || r_type == Type::MAX) { + return std::strong_ordering::less; + } else { // l_type == Type::MAX || r_type == Type::MIN + return std::strong_ordering::greater; + } +} + +inline std::ostream& operator<<(std::ostream& os, const string_view_masked_t& masked) { + using Type = string_view_masked_t::Type; + auto type = masked.get_type(); + if (type == Type::MIN) { + return os << "MIN"; + } else if (type == Type::MAX) { + return os << "MAX"; + } else { // type == Type::STR + auto view = masked.to_string_view(); + if (view.length() <= 12) { + os << "\"" << view << "\""; + } else { 
+ os << "\"" << std::string_view(view.data(), 4) << ".." + << std::string_view(view.data() + view.length() - 2, 2) + << "/" << view.length() << "B\""; + } + return os; + } +} + +struct ns_oid_view_t { + using Type = string_key_view_t::Type; + + ns_oid_view_t(const char* p_end) : nspace(p_end), oid(nspace.p_next_end()) {} + Type type() const { return oid.type(); } + const char* p_start() const { return oid.p_start(); } + node_offset_t size() const { + if (type() == Type::STR) { + size_t ret = nspace.size() + oid.size(); + assert(ret < MAX_NODE_SIZE); + return ret; + } else { + return sizeof(string_size_t); + } + } + node_offset_t size_logical() const { + assert(type() == Type::STR); + return nspace.size_logical() + oid.size_logical(); + } + node_offset_t size_overhead() const { + assert(type() == Type::STR); + return nspace.size_overhead() + oid.size_overhead(); + } + bool operator==(const ns_oid_view_t& x) const { + return (string_view_masked_t{nspace} == string_view_masked_t{x.nspace} && + string_view_masked_t{oid} == string_view_masked_t{x.oid}); + } + + void reset_to(const char* origin_base, + const char* new_base, + extent_len_t node_size) { + nspace.reset_to(origin_base, new_base, node_size); + oid.reset_to(origin_base, new_base, node_size); + } + + template <typename Key> + static node_offset_t estimate_size(const Key& key); + + template <typename Key> + static void append(NodeExtentMutable&, + const Key& key, + char*& p_append); + + static void append(NodeExtentMutable& mut, + const ns_oid_view_t& view, + char*& p_append) { + if (view.type() == Type::STR) { + string_key_view_t::append_str(mut, view.nspace.to_string_view(), p_append); + string_key_view_t::append_str(mut, view.oid.to_string_view(), p_append); + } else { + string_key_view_t::append_dedup(mut, view.type(), p_append); + } + } + + template <typename Key> + static void test_append(const Key& key, char*& p_append); + + string_key_view_t nspace; + string_key_view_t oid; +}; +inline std::ostream& operator<<(std::ostream& os, const ns_oid_view_t& ns_oid) { + return os << string_view_masked_t{ns_oid.nspace} << "," + << string_view_masked_t{ns_oid.oid}; +} +inline auto operator<=>(const ns_oid_view_t& l, const ns_oid_view_t& r) { + auto ret = (string_view_masked_t{l.nspace} <=> string_view_masked_t{r.nspace}); + if (ret != 0) + return ret; + return string_view_masked_t{l.oid} <=> string_view_masked_t{r.oid}; +} + +inline const ghobject_t _MIN_OID() { + assert(ghobject_t().is_min()); + // don't extern _MIN_OID + return ghobject_t(); +} + +/* + * Unfortunately the ghobject_t representation as a tree key doesn't have a max + * field, so we define our own _MAX_OID and translate it from/to + * ghobject_t::get_max() if necessary. + */ +inline const ghobject_t _MAX_OID() { + auto ret = ghobject_t(shard_id_t(MAX_SHARD), MAX_POOL, MAX_CRUSH, + "MAX", "MAX", MAX_SNAP, MAX_GEN); + assert(ret.hobj.get_hash() == ret.hobj.get_bitwise_key_u32()); + return ret; +} + +// the valid key stored in tree should be in the range of (_MIN_OID, _MAX_OID) +template <typename Key> +bool is_valid_key(const Key& key); + +/** + * key_hobj_t + * + * A specialized implementation of a full_key_t storing a ghobject_t passed + * from the user. 
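+ * + * A minimal usage sketch (illustrative only; the values are arbitrary):
+ *
+ *   ghobject_t ghobj(shard_id_t(0), 1, 0xA0B0C0D0, "ns", "oid", 2, 3);
+ *   key_hobj_t key(ghobj);
+ *   laddr_t hint = key.get_hint();  // packed from shard/pool/crush
+ *   assert(key.is_valid());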
+ */ +class key_hobj_t { + public: + explicit key_hobj_t(const ghobject_t& _ghobj) { + if (_ghobj.is_max()) { + ghobj = _MAX_OID(); + } else { + // including when _ghobj.is_min() + ghobj = _ghobj; + } + // I can be in the range of [_MIN_OID, _MAX_OID] + assert(ghobj >= _MIN_OID()); + assert(ghobj <= _MAX_OID()); + } + /* + * common interfaces as a full_key_t + */ + shard_t shard() const { + return ghobj.shard_id; + } + pool_t pool() const { + return ghobj.hobj.pool; + } + crush_hash_t crush() const { + // Note: this is the reversed version of the object hash + return ghobj.hobj.get_bitwise_key_u32(); + } + laddr_t get_hint() const { + return get_lba_hint(shard(), pool(), crush()); + } + std::string_view nspace() const { + // TODO(cross-node string dedup) + return ghobj.hobj.nspace; + } + string_view_masked_t nspace_masked() const { + // TODO(cross-node string dedup) + return string_view_masked_t{nspace()}; + } + std::string_view oid() const { + // TODO(cross-node string dedup) + return ghobj.hobj.oid.name; + } + string_view_masked_t oid_masked() const { + // TODO(cross-node string dedup) + return string_view_masked_t{oid()}; + } + ns_oid_view_t::Type dedup_type() const { + return _dedup_type; + } + snap_t snap() const { + return ghobj.hobj.snap; + } + gen_t gen() const { + return ghobj.generation; + } + + std::ostream& dump(std::ostream& os) const { + os << "key_hobj(" << (int)shard() << "," + << pool() << ",0x" << std::hex << crush() << std::dec << "; " + << string_view_masked_t{nspace()} << "," + << string_view_masked_t{oid()} << "; " + << snap() << "," << gen() << ")"; + return os; + } + + bool is_valid() const { + return is_valid_key(*this); + } + + static key_hobj_t decode(ceph::bufferlist::const_iterator& delta) { + shard_t shard; + ceph::decode(shard, delta); + pool_t pool; + ceph::decode(pool, delta); + // Note: this is the reversed version of the object hash + crush_hash_t crush; + ceph::decode(crush, delta); + std::string nspace; + [[maybe_unused]] auto nspace_masked = string_view_masked_t::decode(nspace, delta); + // TODO(cross-node string dedup) + assert(nspace_masked.get_type() == string_view_masked_t::Type::STR); + std::string oid; + [[maybe_unused]] auto oid_masked = string_view_masked_t::decode(oid, delta); + // TODO(cross-node string dedup) + assert(oid_masked.get_type() == string_view_masked_t::Type::STR); + snap_t snap; + ceph::decode(snap, delta); + gen_t gen; + ceph::decode(gen, delta); + return key_hobj_t(ghobject_t( + shard_id_t(shard), pool, crush, nspace, oid, snap, gen)); + } + + private: + ns_oid_view_t::Type _dedup_type = ns_oid_view_t::Type::STR; + ghobject_t ghobj; +}; +inline std::ostream& operator<<(std::ostream& os, const key_hobj_t& key) { + return key.dump(os); +} + +struct shard_pool_t; +struct crush_t; +struct shard_pool_crush_t; +struct snap_gen_t; + +/** + * key_view_t + * + * A specialized implementation of a full_key_t pointing to the locations + * storing the full key in a tree node. 
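+ * + * The component pointers are populated incrementally via set()/replace(); the
+ * has_*() predicates report which components are currently attached.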
+ */ +class key_view_t { + public: + /** + * common interfaces as a full_key_t + */ + inline shard_t shard() const; + inline pool_t pool() const; + inline crush_hash_t crush() const; + laddr_t get_hint() const { + return get_lba_hint(shard(), pool(), crush()); + } + std::string_view nspace() const { + // TODO(cross-node string dedup) + return ns_oid_view().nspace.to_string_view(); + } + string_view_masked_t nspace_masked() const { + // TODO(cross-node string dedup) + return string_view_masked_t{ns_oid_view().nspace}; + } + std::string_view oid() const { + // TODO(cross-node string dedup) + return ns_oid_view().oid.to_string_view(); + } + string_view_masked_t oid_masked() const { + // TODO(cross-node string dedup) + return string_view_masked_t{ns_oid_view().oid}; + } + ns_oid_view_t::Type dedup_type() const { + return ns_oid_view().type(); + } + inline snap_t snap() const; + inline gen_t gen() const; + + /** + * key_view_t specific interfaces + */ + bool has_shard_pool() const { + return p_shard_pool != nullptr; + } + bool has_crush() const { + return p_crush != nullptr; + } + bool has_ns_oid() const { + return p_ns_oid.has_value(); + } + bool has_snap_gen() const { + return p_snap_gen != nullptr; + } + + const shard_pool_t& shard_pool_packed() const { + assert(has_shard_pool()); + return *p_shard_pool; + } + const crush_t& crush_packed() const { + assert(has_crush()); + return *p_crush; + } + const ns_oid_view_t& ns_oid_view() const { + assert(has_ns_oid()); + return *p_ns_oid; + } + const snap_gen_t& snap_gen_packed() const { + assert(has_snap_gen()); + return *p_snap_gen; + } + + size_t size_logical() const { + return sizeof(shard_t) + sizeof(pool_t) + sizeof(crush_hash_t) + + sizeof(snap_t) + sizeof(gen_t) + ns_oid_view().size_logical(); + } + + ghobject_t to_ghobj() const { + assert(is_valid_key(*this)); + return ghobject_t( + shard_id_t(shard()), pool(), crush(), + std::string(nspace()), std::string(oid()), snap(), gen()); + } + + void replace(const crush_t& key) { p_crush = &key; } + void set(const crush_t& key) { + assert(!has_crush()); + replace(key); + } + inline void replace(const shard_pool_crush_t& key); + inline void set(const shard_pool_crush_t& key); + void replace(const ns_oid_view_t& key) { p_ns_oid = key; } + void set(const ns_oid_view_t& key) { + assert(!has_ns_oid()); + replace(key); + } + void replace(const snap_gen_t& key) { p_snap_gen = &key; } + void set(const snap_gen_t& key) { + assert(!has_snap_gen()); + replace(key); + } + + void reset_to(const char* origin_base, + const char* new_base, + extent_len_t node_size) { + if (p_shard_pool != nullptr) { + reset_ptr(p_shard_pool, origin_base, new_base, node_size); + } + if (p_crush != nullptr) { + reset_ptr(p_crush, origin_base, new_base, node_size); + } + if (p_ns_oid.has_value()) { + p_ns_oid->reset_to(origin_base, new_base, node_size); + } + if (p_snap_gen != nullptr) { + reset_ptr(p_snap_gen, origin_base, new_base, node_size); + } + } + + std::ostream& dump(std::ostream& os) const { + os << "key_view("; + if (has_shard_pool()) { + os << (int)shard() << "," << pool() << ","; + } else { + os << "X,X,"; + } + if (has_crush()) { + os << "0x" << std::hex << crush() << std::dec << "; "; + } else { + os << "X; "; + } + if (has_ns_oid()) { + os << ns_oid_view() << "; "; + } else { + os << "X,X; "; + } + if (has_snap_gen()) { + os << snap() << "," << gen() << ")"; + } else { + os << "X,X)"; + } + return os; + } + + private: + const shard_pool_t* p_shard_pool = nullptr; + const crush_t* p_crush = nullptr; + 
std::optional<ns_oid_view_t> p_ns_oid; + const snap_gen_t* p_snap_gen = nullptr; +}; + +template<typename T> +concept IsFullKey = std::same_as<T, key_hobj_t> || std::same_as<T, key_view_t>; + +// TODO: consider alignments +struct shard_pool_t { + auto operator<=>(const shard_pool_t&) const = default; + + pool_t pool() const { return _pool; } + + template <IsFullKey Key> + static shard_pool_t from_key(const Key& key) { + if constexpr (std::same_as<Key, key_view_t>) { + return key.shard_pool_packed(); + } else { + return {key.shard(), key.pool()}; + } + } + + shard_t shard; + pool_t _pool; +} __attribute__((packed)); +inline std::ostream& operator<<(std::ostream& os, const shard_pool_t& sp) { + return os << (int)sp.shard << "," << sp.pool(); +} + +// Note: this is the reversed version of the object hash +struct crush_t { + auto operator<=>(const crush_t&) const = default; + + template <IsFullKey Key> + static crush_t from_key(const Key& key) { + if constexpr (std::same_as<Key, key_view_t>) { + return key.crush_packed(); + } else { + return {key.crush()}; + } + } + + crush_hash_t crush; +} __attribute__((packed)); +inline std::ostream& operator<<(std::ostream& os, const crush_t& c) { + return os << "0x" << std::hex << c.crush << std::dec; +} + +struct shard_pool_crush_t { + auto operator<=>(const shard_pool_crush_t&) const = default; + + template <IsFullKey Key> + static shard_pool_crush_t from_key(const Key& key) { + return {shard_pool_t::from_key(key), crush_t::from_key(key)}; + } + + shard_pool_t shard_pool; + crush_t crush; +} __attribute__((packed)); +inline std::ostream& operator<<(std::ostream& os, const shard_pool_crush_t& spc) { + return os << spc.shard_pool << ",0x" << std::hex << spc.crush << std::dec; +} + +struct snap_gen_t { + auto operator<=>(const snap_gen_t&) const = default; + + template <IsFullKey Key> + static snap_gen_t from_key(const Key& key) { + if constexpr (std::same_as<Key, key_view_t>) { + return key.snap_gen_packed(); + } else { + return {key.snap(), key.gen()}; + } + } + + snap_t snap; + gen_t gen; +} __attribute__((packed)); +inline std::ostream& operator<<(std::ostream& os, const snap_gen_t& sg) { + return os << sg.snap << "," << sg.gen; +} + +shard_t key_view_t::shard() const { + return shard_pool_packed().shard; +} + +pool_t key_view_t::pool() const { + return shard_pool_packed().pool(); +} + +crush_hash_t key_view_t::crush() const { + return crush_packed().crush; +} + +snap_t key_view_t::snap() const { + return snap_gen_packed().snap; +} + +gen_t key_view_t::gen() const { + return snap_gen_packed().gen; +} + +void key_view_t::replace(const shard_pool_crush_t& key) { + p_shard_pool = &key.shard_pool; +} + +void key_view_t::set(const shard_pool_crush_t& key) { + set(key.crush); + assert(!has_shard_pool()); + replace(key); +} + +template <IsFullKey Key> +void encode_key(const Key& key, ceph::bufferlist& bl) { + ceph::encode(key.shard(), bl); + ceph::encode(key.pool(), bl); + ceph::encode(key.crush(), bl); + key.nspace_masked().encode(bl); + key.oid_masked().encode(bl); + ceph::encode(key.snap(), bl); + ceph::encode(key.gen(), bl); +} + +template<IsFullKey LHS, IsFullKey RHS> +std::strong_ordering operator<=>(const LHS& lhs, const RHS& rhs) noexcept { + auto ret = lhs.shard() <=> rhs.shard(); + if (ret != 0) + return ret; + ret = lhs.pool() <=> rhs.pool(); + if (ret != 0) + return ret; + ret = lhs.crush() <=> rhs.crush(); + if (ret != 0) + return ret; + ret = lhs.nspace() <=> rhs.nspace(); + if (ret != 0) + return ret; + ret = lhs.oid() <=> rhs.oid(); + if (ret 
!= 0) + return ret; + ret = lhs.snap() <=> rhs.snap(); + if (ret != 0) + return ret; + return lhs.gen() <=> rhs.gen(); +} + +template <typename Key> +bool is_valid_key(const Key& key) { + static_assert(IsFullKey<Key>); + return (key > key_hobj_t(ghobject_t()) && + key < key_hobj_t(ghobject_t::get_max())); +} + +inline std::ostream& operator<<(std::ostream& os, const key_view_t& key) { + return key.dump(os); +} + +template <IsFullKey T> +auto operator<=>(const T& key, const shard_pool_t& target) { + auto ret = key.shard() <=> target.shard; + if (ret != 0) + return ret; + return key.pool() <=> target.pool(); +} + +template <IsFullKey T> +auto operator<=>(const T& key, const crush_t& target) { + return key.crush() <=> target.crush; +} + +template <IsFullKey T> +auto operator<=>(const T& key, const shard_pool_crush_t& target) { + auto ret = key <=> target.shard_pool; + if (ret != 0) + return ret; + return key <=> target.crush; +} + +template <IsFullKey T> +auto operator<=>(const T& key, const ns_oid_view_t& target) { + auto ret = key.nspace() <=> string_view_masked_t{target.nspace}; + if (ret != 0) + return ret; + return key.oid() <=> string_view_masked_t{target.oid}; +} + +template <IsFullKey T> +auto operator<=>(const T& key, const snap_gen_t& target) { + auto ret = key.snap() <=> target.snap; + if (ret != 0) + return ret; + return key.gen() <=> target.gen; +} + +template <IsFullKey LHS, typename RHS> +bool operator==(LHS lhs, RHS rhs) { + return lhs <=> rhs == 0; +} + +template <typename Key> +node_offset_t ns_oid_view_t::estimate_size(const Key& key) { + static_assert(IsFullKey<Key>); + if constexpr (std::same_as<Key, key_view_t>) { + return key.ns_oid_view().size(); + } else { + if (key.dedup_type() != Type::STR) { + // size after deduplication + return sizeof(string_size_t); + } else { + return 2 * sizeof(string_size_t) + key.nspace().size() + key.oid().size(); + } + } +} + +template <typename Key> +void ns_oid_view_t::append( + NodeExtentMutable& mut, const Key& key, char*& p_append) { + static_assert(IsFullKey<Key>); + if (key.dedup_type() == Type::STR) { + string_key_view_t::append_str(mut, key.nspace(), p_append); + string_key_view_t::append_str(mut, key.oid(), p_append); + } else { + string_key_view_t::append_dedup(mut, key.dedup_type(), p_append); + } +} + +template <typename Key> +void ns_oid_view_t::test_append(const Key& key, char*& p_append) { + static_assert(IsFullKey<Key>); + if (key.dedup_type() == Type::STR) { + string_key_view_t::test_append_str(key.nspace(), p_append); + string_key_view_t::test_append_str(key.oid(), p_append); + } else { + string_key_view_t::test_append_dedup(key.dedup_type(), p_append); + } +} + +} + +#if FMT_VERSION >= 90000 +template <> struct fmt::formatter<crimson::os::seastore::onode::key_hobj_t> : fmt::ostream_formatter {}; +template <> struct fmt::formatter<crimson::os::seastore::onode::key_view_t> : fmt::ostream_formatter {}; +#endif diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage.cc new file mode 100644 index 000000000..3ed401c37 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage.cc @@ -0,0 +1,420 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#include "node_stage.h" + +#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.h" +#include "node_stage_layout.h" + +namespace crimson::os::seastore::onode { + +#define 
NODE_T node_extent_t<FieldType, NODE_TYPE> +#define NODE_INST(FT, NT) node_extent_t<FT, NT> + +template <typename FieldType, node_type_t NODE_TYPE> +const char* NODE_T::p_left_bound() const +{ + if constexpr (std::is_same_v<FieldType, internal_fields_3_t>) { + // N3 internal node doesn't have the right part + return nullptr; + } else { + auto ret = p_start() + + fields().get_item_end_offset(keys(), node_size); + if constexpr (NODE_TYPE == node_type_t::INTERNAL) { + if (is_level_tail()) { + ret -= sizeof(laddr_t); + } + } + return ret; + } +} + +template <typename FieldType, node_type_t NODE_TYPE> +node_offset_t NODE_T::size_to_nxt_at(index_t index) const +{ + assert(index < keys()); + if constexpr (FIELD_TYPE == field_type_t::N0 || + FIELD_TYPE == field_type_t::N1) { + return FieldType::estimate_insert_one(); + } else if constexpr (FIELD_TYPE == field_type_t::N2) { + auto p_end = p_start() + + p_fields->get_item_end_offset(index, node_size); + return FieldType::estimate_insert_one() + ns_oid_view_t(p_end).size(); + } else { + ceph_abort("N3 node is not nested"); + } +} + +template <typename FieldType, node_type_t NODE_TYPE> +container_range_t NODE_T::get_nxt_container(index_t index) const +{ + if constexpr (std::is_same_v<FieldType, internal_fields_3_t>) { + ceph_abort("N3 internal node doesn't have the right part"); + } else { + auto item_start_offset = p_fields->get_item_start_offset( + index, node_size); + auto item_end_offset = p_fields->get_item_end_offset( + index, node_size); + assert(item_start_offset < item_end_offset); + auto item_p_start = p_start() + item_start_offset; + auto item_p_end = p_start() + item_end_offset; + if constexpr (FIELD_TYPE == field_type_t::N2) { + // range for sub_items_t<NODE_TYPE> + item_p_end = ns_oid_view_t(item_p_end).p_start(); + assert(item_p_start < item_p_end); + } else { + // range for item_iterator_t<NODE_TYPE> + } + return {{item_p_start, item_p_end}, node_size}; + } +} + +template <typename FieldType, node_type_t NODE_TYPE> +void NODE_T::bootstrap_extent( + NodeExtentMutable& mut, + field_type_t field_type, node_type_t node_type, + bool is_level_tail, level_t level) +{ + node_header_t::bootstrap_extent( + mut, field_type, node_type, is_level_tail, level); + mut.copy_in_relative( + sizeof(node_header_t), typename FieldType::num_keys_t(0u)); +} + +template <typename FieldType, node_type_t NODE_TYPE> +void NODE_T::update_is_level_tail( + NodeExtentMutable& mut, const node_extent_t& extent, bool value) +{ + assert(mut.get_length() == extent.node_size); + assert(mut.get_read() == extent.p_start()); + node_header_t::update_is_level_tail(mut, extent.p_fields->header, value); +} + +template <typename FieldType, node_type_t NODE_TYPE> +template <IsFullKey Key> +memory_range_t NODE_T::insert_prefix_at( + NodeExtentMutable& mut, const node_extent_t& node, const Key& key, + index_t index, node_offset_t size, const char* p_left_bound) +{ + assert(mut.get_length() == node.node_size); + assert(mut.get_read() == node.p_start()); + if constexpr (FIELD_TYPE == field_type_t::N0 || + FIELD_TYPE == field_type_t::N1) { + assert(index <= node.keys()); + assert(p_left_bound == node.p_left_bound()); + assert(size > FieldType::estimate_insert_one()); + auto size_right = size - FieldType::estimate_insert_one(); + const char* p_insert = node.p_start() + + node.fields().get_item_end_offset(index, mut.get_length()); + const char* p_insert_front = p_insert - size_right; + FieldType::insert_at(mut, key, node.fields(), index, size_right); + mut.shift_absolute(p_left_bound, + 
p_insert - p_left_bound, + -(int)size_right); + return {p_insert_front, p_insert}; + } else if constexpr (FIELD_TYPE == field_type_t::N2) { + ceph_abort("not implemented"); + } else { + ceph_abort("impossible"); + } +} +#define IPA_TEMPLATE(FT, NT, Key) \ + template memory_range_t NODE_INST(FT, NT)::insert_prefix_at<Key>( \ + NodeExtentMutable&, const node_extent_t&, const Key&, \ + index_t, node_offset_t, const char*) +IPA_TEMPLATE(node_fields_0_t, node_type_t::INTERNAL, key_view_t); +IPA_TEMPLATE(node_fields_1_t, node_type_t::INTERNAL, key_view_t); +IPA_TEMPLATE(node_fields_2_t, node_type_t::INTERNAL, key_view_t); +IPA_TEMPLATE(node_fields_0_t, node_type_t::LEAF, key_view_t); +IPA_TEMPLATE(node_fields_1_t, node_type_t::LEAF, key_view_t); +IPA_TEMPLATE(node_fields_2_t, node_type_t::LEAF, key_view_t); +IPA_TEMPLATE(node_fields_0_t, node_type_t::INTERNAL, key_hobj_t); +IPA_TEMPLATE(node_fields_1_t, node_type_t::INTERNAL, key_hobj_t); +IPA_TEMPLATE(node_fields_2_t, node_type_t::INTERNAL, key_hobj_t); +IPA_TEMPLATE(node_fields_0_t, node_type_t::LEAF, key_hobj_t); +IPA_TEMPLATE(node_fields_1_t, node_type_t::LEAF, key_hobj_t); +IPA_TEMPLATE(node_fields_2_t, node_type_t::LEAF, key_hobj_t); + +template <typename FieldType, node_type_t NODE_TYPE> +void NODE_T::update_size_at( + NodeExtentMutable& mut, const node_extent_t& node, index_t index, int change) +{ + assert(mut.get_length() == node.node_size); + assert(mut.get_read() == node.p_start()); + assert(index < node.keys()); + FieldType::update_size_at(mut, node.fields(), index, change); +} + +template <typename FieldType, node_type_t NODE_TYPE> +node_offset_t NODE_T::trim_until( + NodeExtentMutable& mut, const node_extent_t& node, index_t index) +{ + assert(mut.get_length() == node.node_size); + assert(mut.get_read() == node.p_start()); + assert(!node.is_level_tail()); + auto keys = node.keys(); + assert(index <= keys); + if (index == keys) { + return 0; + } + if constexpr (std::is_same_v<FieldType, internal_fields_3_t>) { + ceph_abort("not implemented"); + } else { + mut.copy_in_absolute( + (void*)&node.p_fields->num_keys, num_keys_t(index)); + } + // no need to calculate trim size for node + return 0; +} + +template <typename FieldType, node_type_t NODE_TYPE> +node_offset_t NODE_T::trim_at( + NodeExtentMutable& mut, const node_extent_t& node, + index_t index, node_offset_t trimmed) +{ + assert(mut.get_length() == node.node_size); + assert(mut.get_read() == node.p_start()); + assert(!node.is_level_tail()); + assert(index < node.keys()); + if constexpr (std::is_same_v<FieldType, internal_fields_3_t>) { + ceph_abort("not implemented"); + } else { + extent_len_t node_size = mut.get_length(); + node_offset_t offset = node.p_fields->get_item_start_offset( + index, node_size); + size_t new_offset = offset + trimmed; + assert(new_offset < node.p_fields->get_item_end_offset(index, node_size)); + mut.copy_in_absolute(const_cast<void*>(node.p_fields->p_offset(index)), + node_offset_t(new_offset)); + mut.copy_in_absolute( + (void*)&node.p_fields->num_keys, num_keys_t(index + 1)); + } + // no need to calculate trim size for node + return 0; +} + +template <typename FieldType, node_type_t NODE_TYPE> +node_offset_t NODE_T::erase_at( + NodeExtentMutable& mut, const node_extent_t& node, + index_t index, const char* p_left_bound) +{ + assert(mut.get_length() == node.node_size); + assert(mut.get_read() == node.p_start()); + if constexpr (FIELD_TYPE == field_type_t::N0 || + FIELD_TYPE == field_type_t::N1) { + assert(node.keys() > 0); + assert(index < 
node.keys()); + assert(p_left_bound == node.p_left_bound()); + return FieldType::erase_at(mut, node.fields(), index, p_left_bound); + } else { + ceph_abort("not implemented"); + } +} + +#define NODE_TEMPLATE(FT, NT) template class NODE_INST(FT, NT) +NODE_TEMPLATE(node_fields_0_t, node_type_t::INTERNAL); +NODE_TEMPLATE(node_fields_1_t, node_type_t::INTERNAL); +NODE_TEMPLATE(node_fields_2_t, node_type_t::INTERNAL); +NODE_TEMPLATE(internal_fields_3_t, node_type_t::INTERNAL); +NODE_TEMPLATE(node_fields_0_t, node_type_t::LEAF); +NODE_TEMPLATE(node_fields_1_t, node_type_t::LEAF); +NODE_TEMPLATE(node_fields_2_t, node_type_t::LEAF); +NODE_TEMPLATE(leaf_fields_3_t, node_type_t::LEAF); + +#define APPEND_T node_extent_t<FieldType, NODE_TYPE>::Appender<KT> + +template <typename FieldType, node_type_t NODE_TYPE> +template <KeyT KT> +APPEND_T::Appender(NodeExtentMutable* p_mut, const node_extent_t& node, bool open) + : p_mut{p_mut}, p_start{p_mut->get_write()} +{ + assert(p_start == node.p_start()); + assert(node.keys()); + assert(node.node_size == p_mut->get_length()); + extent_len_t node_size = node.node_size; + if (open) { + // seek as open_nxt() + if constexpr (FIELD_TYPE == field_type_t::N0 || + FIELD_TYPE == field_type_t::N1) { + p_append_left = p_start + node.fields().get_key_start_offset( + node.keys() - 1, node_size); + p_append_left += sizeof(typename FieldType::key_t); + p_append_right = p_start + + node.fields().get_item_end_offset(node.keys() - 1, + node_size); + } else if constexpr (FIELD_TYPE == field_type_t::N2) { + ceph_abort("not implemented"); + } else { + ceph_abort("impossible path"); + } + num_keys = node.keys() - 1; + } else { + if constexpr (std::is_same_v<FieldType, internal_fields_3_t>) { + std::ignore = node_size; + ceph_abort("not implemented"); + } else { + p_append_left = p_start + node.fields().get_key_start_offset( + node.keys(), node_size); + p_append_right = p_start + + node.fields().get_item_end_offset(node.keys(), + node_size); + } + num_keys = node.keys(); + } +} + +template <typename FieldType, node_type_t NODE_TYPE> +template <KeyT KT> +void APPEND_T::append(const node_extent_t& src, index_t from, index_t items) +{ + assert(from <= src.keys()); + if (p_src == nullptr) { + p_src = &src; + } else { + assert(p_src == &src); + } + assert(p_src->node_size == p_mut->get_length()); + extent_len_t node_size = src.node_size; + if (items == 0) { + return; + } + assert(from < src.keys()); + assert(from + items <= src.keys()); + num_keys += items; + if constexpr (std::is_same_v<FieldType, internal_fields_3_t>) { + std::ignore = node_size; + ceph_abort("not implemented"); + } else { + // append left part forwards + node_offset_t offset_left_start = src.fields().get_key_start_offset( + from, node_size); + node_offset_t offset_left_end = src.fields().get_key_start_offset( + from + items, node_size); + node_offset_t left_size = offset_left_end - offset_left_start; + if (num_keys == 0) { + // no need to adjust offset + assert(from == 0); + assert(p_start + offset_left_start == p_append_left); + p_mut->copy_in_absolute(p_append_left, + src.p_start() + offset_left_start, left_size); + } else { + node_offset_t step_size = FieldType::estimate_insert_one(); + extent_len_t offset_base = src.fields().get_item_end_offset( + from, node_size); + int offset_change = p_append_right - p_start - offset_base; + auto p_offset_dst = p_append_left; + if constexpr (FIELD_TYPE != field_type_t::N2) { + // copy keys + p_mut->copy_in_absolute(p_append_left, + src.p_start() + offset_left_start, 
left_size); + // point to offset for update + p_offset_dst += sizeof(typename FieldType::key_t); + } + for (auto i = from; i < from + items; ++i) { + int new_offset = src.fields().get_item_start_offset(i, node_size) + + offset_change; + assert(new_offset > 0); + assert(new_offset < (int)node_size); + p_mut->copy_in_absolute(p_offset_dst, node_offset_t(new_offset)); + p_offset_dst += step_size; + } + assert(p_append_left + left_size + sizeof(typename FieldType::key_t) == + p_offset_dst); + } + p_append_left += left_size; + + // append right part backwards + auto offset_right_start = src.fields().get_item_end_offset( + from + items, node_size); + auto offset_right_end = src.fields().get_item_end_offset( + from, node_size); + int right_size = offset_right_end - offset_right_start; + assert(right_size > 0); + assert(right_size < (int)node_size); + p_append_right -= right_size; + p_mut->copy_in_absolute(p_append_right, + src.p_start() + offset_right_start, node_offset_t(right_size)); + } +} + +template <typename FieldType, node_type_t NODE_TYPE> +template <KeyT KT> +void APPEND_T::append( + const full_key_t<KT>& key, const value_input_t& value, const value_t*& p_value) +{ + if constexpr (FIELD_TYPE == field_type_t::N3) { + ceph_abort("not implemented"); + } else { + ceph_abort("should not happen"); + } +} + +template <typename FieldType, node_type_t NODE_TYPE> +template <KeyT KT> +std::tuple<NodeExtentMutable*, char*> +APPEND_T::open_nxt(const key_get_type& partial_key) +{ + if constexpr (FIELD_TYPE == field_type_t::N0 || + FIELD_TYPE == field_type_t::N1) { + FieldType::append_key(*p_mut, partial_key, p_append_left); + } else if constexpr (FIELD_TYPE == field_type_t::N2) { + FieldType::append_key(*p_mut, partial_key, p_append_right); + } else { + ceph_abort("impossible path"); + } + return {p_mut, p_append_right}; +} + +template <typename FieldType, node_type_t NODE_TYPE> +template <KeyT KT> +std::tuple<NodeExtentMutable*, char*> +APPEND_T::open_nxt(const full_key_t<KT>& key) +{ + if constexpr (FIELD_TYPE == field_type_t::N0 || + FIELD_TYPE == field_type_t::N1) { + FieldType::append_key(*p_mut, key, p_append_left); + } else if constexpr (FIELD_TYPE == field_type_t::N2) { + FieldType::append_key(*p_mut, key, p_append_right); + } else { + ceph_abort("impossible path"); + } + return {p_mut, p_append_right}; +} + +template <typename FieldType, node_type_t NODE_TYPE> +template <KeyT KT> +char* APPEND_T::wrap() +{ + assert(p_append_left <= p_append_right); + assert(p_src); + if constexpr (NODE_TYPE == node_type_t::INTERNAL) { + if (p_src->is_level_tail()) { + laddr_t tail_value = p_src->get_end_p_laddr()->value; + p_append_right -= sizeof(laddr_t); + assert(p_append_left <= p_append_right); + p_mut->copy_in_absolute(p_append_right, tail_value); + } + } + p_mut->copy_in_absolute(p_start + offsetof(FieldType, num_keys), num_keys); + return p_append_left; +} + +#define APPEND_TEMPLATE(FT, NT, KT) template class node_extent_t<FT, NT>::Appender<KT> +APPEND_TEMPLATE(node_fields_0_t, node_type_t::INTERNAL, KeyT::VIEW); +APPEND_TEMPLATE(node_fields_1_t, node_type_t::INTERNAL, KeyT::VIEW); +APPEND_TEMPLATE(node_fields_2_t, node_type_t::INTERNAL, KeyT::VIEW); +APPEND_TEMPLATE(internal_fields_3_t, node_type_t::INTERNAL, KeyT::VIEW); +APPEND_TEMPLATE(node_fields_0_t, node_type_t::LEAF, KeyT::VIEW); +APPEND_TEMPLATE(node_fields_1_t, node_type_t::LEAF, KeyT::VIEW); +APPEND_TEMPLATE(node_fields_2_t, node_type_t::LEAF, KeyT::VIEW); +APPEND_TEMPLATE(leaf_fields_3_t, node_type_t::LEAF, KeyT::VIEW); 
+APPEND_TEMPLATE(node_fields_0_t, node_type_t::INTERNAL, KeyT::HOBJ); +APPEND_TEMPLATE(node_fields_1_t, node_type_t::INTERNAL, KeyT::HOBJ); +APPEND_TEMPLATE(node_fields_2_t, node_type_t::INTERNAL, KeyT::HOBJ); +APPEND_TEMPLATE(internal_fields_3_t, node_type_t::INTERNAL, KeyT::HOBJ); +APPEND_TEMPLATE(node_fields_0_t, node_type_t::LEAF, KeyT::HOBJ); +APPEND_TEMPLATE(node_fields_1_t, node_type_t::LEAF, KeyT::HOBJ); +APPEND_TEMPLATE(node_fields_2_t, node_type_t::LEAF, KeyT::HOBJ); +APPEND_TEMPLATE(leaf_fields_3_t, node_type_t::LEAF, KeyT::HOBJ); + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage.h b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage.h new file mode 100644 index 000000000..5615998f8 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage.h @@ -0,0 +1,232 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "crimson/os/seastore/onode_manager/staged-fltree/node_types.h" +#include "key_layout.h" +#include "stage_types.h" + +namespace crimson::os::seastore::onode { + +class NodeExtentMutable; + +/** + * node_extent_t + * + * The top indexing stage implementation for node N0/N1/N2/N3, implements + * staged contract as an indexable container, and provides access to node + * header. + * + * The specific field layout are defined by FieldType which are + * node_fields_0_t, node_fields_1_t, node_fields_2_t, internal_fields_3_t and + * leaf_fields_3_t. Diagrams see node_stage_layout.h. + */ +template <typename FieldType, node_type_t _NODE_TYPE> +class node_extent_t { + public: + using value_input_t = value_input_type_t<_NODE_TYPE>; + using value_t = value_type_t<_NODE_TYPE>; + using num_keys_t = typename FieldType::num_keys_t; + static constexpr node_type_t NODE_TYPE = _NODE_TYPE; + static constexpr field_type_t FIELD_TYPE = FieldType::FIELD_TYPE; + + // TODO: remove + node_extent_t() = default; + + node_extent_t(const FieldType* p_fields, extent_len_t node_size) + : p_fields{p_fields}, node_size{node_size} { + assert(is_valid_node_size(node_size)); + validate(*p_fields); + } + + const char* p_start() const { return fields_start(*p_fields); } + + bool is_level_tail() const { return p_fields->is_level_tail(); } + level_t level() const { return p_fields->header.level; } + node_offset_t free_size() const { + return p_fields->template free_size_before<NODE_TYPE>( + keys(), node_size); + } + extent_len_t total_size() const { + return p_fields->total_size(node_size); + } + const char* p_left_bound() const; + template <node_type_t T = NODE_TYPE> + std::enable_if_t<T == node_type_t::INTERNAL, const laddr_packed_t*> + get_end_p_laddr() const { + assert(is_level_tail()); + if constexpr (FIELD_TYPE == field_type_t::N3) { + return p_fields->get_p_child_addr(keys(), node_size); + } else { + auto offset_start = p_fields->get_item_end_offset( + keys(), node_size); + assert(offset_start <= node_size); + offset_start -= sizeof(laddr_packed_t); + auto p_addr = p_start() + offset_start; + return reinterpret_cast<const laddr_packed_t*>(p_addr); + } + } + + // container type system + using key_get_type = typename FieldType::key_get_type; + static constexpr auto CONTAINER_TYPE = ContainerType::INDEXABLE; + index_t keys() const { return p_fields->num_keys; } + key_get_type operator[] (index_t index) const { + return p_fields->get_key(index, node_size); + } + extent_len_t size_before(index_t index) const { + auto free_size = p_fields->template 
free_size_before<NODE_TYPE>( + index, node_size); + assert(total_size() >= free_size); + return total_size() - free_size; + } + node_offset_t size_to_nxt_at(index_t index) const; + node_offset_t size_overhead_at(index_t index) const { + return FieldType::ITEM_OVERHEAD; } + container_range_t get_nxt_container(index_t index) const; + + template <typename T = FieldType> + std::enable_if_t<T::FIELD_TYPE == field_type_t::N3, const value_t*> + get_p_value(index_t index) const { + assert(index < keys()); + if constexpr (NODE_TYPE == node_type_t::INTERNAL) { + return p_fields->get_p_child_addr(index, node_size); + } else { + auto range = get_nxt_container(index).range; + auto ret = reinterpret_cast<const value_header_t*>(range.p_start); + assert(range.p_start + ret->allocation_size() == range.p_end); + return ret; + } + } + + void encode(const char* p_node_start, ceph::bufferlist& encoded) const { + assert(p_node_start == p_start()); + // nothing to encode as the container range is the entire extent + } + + static node_extent_t decode(const char* p_node_start, + extent_len_t node_size, + ceph::bufferlist::const_iterator& delta) { + // nothing to decode + return node_extent_t( + reinterpret_cast<const FieldType*>(p_node_start), + node_size); + } + + static void validate(const FieldType& fields) { +#ifndef NDEBUG + assert(fields.header.get_node_type() == NODE_TYPE); + assert(fields.header.get_field_type() == FieldType::FIELD_TYPE); + if constexpr (NODE_TYPE == node_type_t::INTERNAL) { + assert(fields.header.level > 0u); + } else { + assert(fields.header.level == 0u); + } +#endif + } + + static void bootstrap_extent( + NodeExtentMutable&, field_type_t, node_type_t, bool, level_t); + + static void update_is_level_tail(NodeExtentMutable&, const node_extent_t&, bool); + + static node_offset_t header_size() { return FieldType::HEADER_SIZE; } + + template <IsFullKey Key> + static node_offset_t estimate_insert( + const Key& key, const value_input_t& value) { + auto size = FieldType::estimate_insert_one(); + if constexpr (FIELD_TYPE == field_type_t::N2) { + size += ns_oid_view_t::estimate_size(key); + } else if constexpr (FIELD_TYPE == field_type_t::N3 && + NODE_TYPE == node_type_t::LEAF) { + size += value.allocation_size(); + } + return size; + } + + template <IsFullKey Key> + static const value_t* insert_at( + NodeExtentMutable& mut, const node_extent_t&, + const Key& key, const value_input_t& value, + index_t index, node_offset_t size, const char* p_left_bound) { + if constexpr (FIELD_TYPE == field_type_t::N3) { + ceph_abort("not implemented"); + } else { + ceph_abort("impossible"); + } + } + + template <IsFullKey Key> + static memory_range_t insert_prefix_at( + NodeExtentMutable&, const node_extent_t&, + const Key& key, + index_t index, node_offset_t size, const char* p_left_bound); + + static void update_size_at( + NodeExtentMutable&, const node_extent_t&, index_t index, int change); + + static node_offset_t trim_until( + NodeExtentMutable&, const node_extent_t&, index_t index); + static node_offset_t trim_at(NodeExtentMutable&, const node_extent_t&, + index_t index, node_offset_t trimmed); + + static node_offset_t erase_at(NodeExtentMutable&, const node_extent_t&, + index_t index, const char* p_left_bound); + + template <KeyT KT> + class Appender; + + private: + const FieldType& fields() const { return *p_fields; } + const FieldType* p_fields; + extent_len_t node_size; +}; + +template <typename FieldType, node_type_t NODE_TYPE> +template <KeyT KT> +class node_extent_t<FieldType, NODE_TYPE>::Appender { 
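+  // Builds up a node by appending keys and items, either copied from a source
+  // node or opened for a new key; it keeps separate append cursors for the key
+  // area (growing rightwards from the header) and the item area (growing
+  // leftwards from the node end).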
+ public: + Appender(NodeExtentMutable* p_mut, char* p_append) + : p_mut{p_mut}, p_start{p_append} { +#ifndef NDEBUG + auto p_fields = reinterpret_cast<const FieldType*>(p_append); + assert(*(p_fields->header.get_field_type()) == FIELD_TYPE); + assert(p_fields->header.get_node_type() == NODE_TYPE); + assert(p_fields->num_keys == 0); +#endif + p_append_left = p_start + FieldType::HEADER_SIZE; + p_append_right = p_start + p_mut->get_length(); + } + Appender(NodeExtentMutable*, const node_extent_t&, bool open = false); + void append(const node_extent_t& src, index_t from, index_t items); + void append(const full_key_t<KT>&, const value_input_t&, const value_t*&); + char* wrap(); + std::tuple<NodeExtentMutable*, char*> open_nxt(const key_get_type&); + std::tuple<NodeExtentMutable*, char*> open_nxt(const full_key_t<KT>&); + void wrap_nxt(char* p_append) { + if constexpr (FIELD_TYPE != field_type_t::N3) { + assert(p_append < p_append_right); + assert(p_append_left < p_append); + p_append_right = p_append; + auto new_offset = p_append - p_start; + assert(new_offset > 0); + assert(new_offset < p_mut->get_length()); + FieldType::append_offset(*p_mut, new_offset, p_append_left); + ++num_keys; + } else { + ceph_abort("not implemented"); + } + } + + private: + const node_extent_t* p_src = nullptr; + NodeExtentMutable* p_mut; + char* p_start; + char* p_append_left; + char* p_append_right; + num_keys_t num_keys = 0; +}; + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage_layout.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage_layout.cc new file mode 100644 index 000000000..a0752e0fc --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage_layout.cc @@ -0,0 +1,153 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#include "node_stage_layout.h" + +#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.h" + +namespace crimson::os::seastore::onode { + +void node_header_t::bootstrap_extent( + NodeExtentMutable& mut, + field_type_t field_type, node_type_t node_type, + bool is_level_tail, level_t level) +{ + node_header_t header; + header.set_field_type(field_type); + header.set_node_type(node_type); + header.set_is_level_tail(is_level_tail); + header.level = level; + mut.copy_in_relative(0, header); +} + +void node_header_t::update_is_level_tail( + NodeExtentMutable& mut, const node_header_t& header, bool value) +{ + auto& _header = const_cast<node_header_t&>(header); + _header.set_is_level_tail(value); + mut.validate_inplace_update(_header); +} + +#define F013_T _node_fields_013_t<SlotType> +#define F013_INST(ST) _node_fields_013_t<ST> + +template <typename SlotType> +void F013_T::update_size_at( + NodeExtentMutable& mut, const me_t& node, index_t index, int change) +{ + assert(index <= node.num_keys); + [[maybe_unused]] extent_len_t node_size = mut.get_length(); +#ifndef NDEBUG + // check underflow + if (change < 0 && index != node.num_keys) { + assert(node.get_item_start_offset(index, node_size) < + node.get_item_end_offset(index, node_size)); + } +#endif + for (const auto* p_slot = &node.slots[index]; + p_slot < &node.slots[node.num_keys]; + ++p_slot) { + node_offset_t offset = p_slot->right_offset; + int new_offset = offset - change; + assert(new_offset > 0); + assert(new_offset < (int)node_size); + mut.copy_in_absolute( + (void*)&(p_slot->right_offset), + node_offset_t(new_offset)); + } +#ifndef NDEBUG + // check overflow + if 
(change > 0 && index != node.num_keys) { + assert(node.num_keys > 0); + assert(node.get_key_start_offset(node.num_keys, node_size) <= + node.slots[node.num_keys - 1].right_offset); + } +#endif +} + +template <typename SlotType> +void F013_T::append_key( + NodeExtentMutable& mut, const key_t& key, char*& p_append) +{ + mut.copy_in_absolute(p_append, key); + p_append += sizeof(key_t); +} + +template <typename SlotType> +void F013_T::append_offset( + NodeExtentMutable& mut, node_offset_t offset_to_right, char*& p_append) +{ + mut.copy_in_absolute(p_append, offset_to_right); + p_append += sizeof(node_offset_t); +} + +template <typename SlotType> +template <IsFullKey Key> +void F013_T::insert_at( + NodeExtentMutable& mut, const Key& key, + const me_t& node, index_t index, node_offset_t size_right) +{ + assert(index <= node.num_keys); + extent_len_t node_size = mut.get_length(); + update_size_at(mut, node, index, size_right); + auto p_insert = const_cast<char*>(fields_start(node)) + + node.get_key_start_offset(index, node_size); + auto p_shift_end = fields_start(node) + + node.get_key_start_offset(node.num_keys, node_size); + mut.shift_absolute(p_insert, p_shift_end - p_insert, estimate_insert_one()); + mut.copy_in_absolute((void*)&node.num_keys, num_keys_t(node.num_keys + 1)); + append_key(mut, key_t::from_key(key), p_insert); + int new_offset = node.get_item_end_offset(index, node_size) - size_right; + assert(new_offset > 0); + assert(new_offset < (int)node_size); + append_offset(mut, new_offset, p_insert); +} +#define IA_TEMPLATE(ST, KT) template void F013_INST(ST):: \ + insert_at<KT>(NodeExtentMutable&, const KT&, \ + const F013_INST(ST)&, index_t, node_offset_t) +IA_TEMPLATE(slot_0_t, key_view_t); +IA_TEMPLATE(slot_1_t, key_view_t); +IA_TEMPLATE(slot_3_t, key_view_t); +IA_TEMPLATE(slot_0_t, key_hobj_t); +IA_TEMPLATE(slot_1_t, key_hobj_t); +IA_TEMPLATE(slot_3_t, key_hobj_t); + +template <typename SlotType> +node_offset_t F013_T::erase_at( + NodeExtentMutable& mut, const me_t& node, index_t index, const char* p_left_bound) +{ + extent_len_t node_size = mut.get_length(); + auto offset_item_start = node.get_item_start_offset(index, node_size); + auto offset_item_end = node.get_item_end_offset(index, node_size); + assert(offset_item_start < offset_item_end); + auto erase_size = offset_item_end - offset_item_start; + // fix and shift the left part + update_size_at(mut, node, index + 1, -erase_size); + const char* p_shift_start = fields_start(node) + + node.get_key_start_offset(index + 1, node_size); + extent_len_t shift_len = sizeof(SlotType) * (node.num_keys - index - 1); + int shift_off = -(int)sizeof(SlotType); + mut.shift_absolute(p_shift_start, shift_len, shift_off); + // shift the right part + p_shift_start = p_left_bound; + shift_len = fields_start(node) + offset_item_start - p_left_bound; + shift_off = erase_size; + mut.shift_absolute(p_shift_start, shift_len, shift_off); + // fix num_keys + mut.copy_in_absolute((void*)&node.num_keys, num_keys_t(node.num_keys - 1)); + return erase_size; +} + +#define F013_TEMPLATE(ST) template struct F013_INST(ST) +F013_TEMPLATE(slot_0_t); +F013_TEMPLATE(slot_1_t); +F013_TEMPLATE(slot_3_t); + +void node_fields_2_t::append_offset( + NodeExtentMutable& mut, node_offset_t offset_to_right, char*& p_append) +{ + mut.copy_in_absolute(p_append, offset_to_right); + p_append += sizeof(node_offset_t); +} + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage_layout.h 
b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage_layout.h new file mode 100644 index 000000000..1ed4865a6 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage_layout.h @@ -0,0 +1,406 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "key_layout.h" +#include "crimson/os/seastore/onode_manager/staged-fltree/node_types.h" + +namespace crimson::os::seastore::onode { + +class NodeExtentMutable; + +struct node_header_t { + static constexpr unsigned FIELD_TYPE_BITS = 6u; + static_assert(static_cast<uint8_t>(field_type_t::_MAX) <= 1u << FIELD_TYPE_BITS); + static constexpr unsigned NODE_TYPE_BITS = 1u; + static constexpr unsigned B_LEVEL_TAIL_BITS = 1u; + using bits_t = uint8_t; + + node_header_t() {} + std::optional<field_type_t> get_field_type() const { + if (field_type >= FIELD_TYPE_MAGIC && + field_type < static_cast<uint8_t>(field_type_t::_MAX)) { + return static_cast<field_type_t>(field_type); + } else { + return std::nullopt; + } + } + node_type_t get_node_type() const { + return static_cast<node_type_t>(node_type); + } + bool get_is_level_tail() const { + return is_level_tail; + } + + static void bootstrap_extent( + NodeExtentMutable&, field_type_t, node_type_t, bool, level_t); + + static void update_is_level_tail(NodeExtentMutable&, const node_header_t&, bool); + + bits_t field_type : FIELD_TYPE_BITS; + bits_t node_type : NODE_TYPE_BITS; + bits_t is_level_tail : B_LEVEL_TAIL_BITS; + static_assert(sizeof(bits_t) * 8 == + FIELD_TYPE_BITS + NODE_TYPE_BITS + B_LEVEL_TAIL_BITS); + level_t level; + + private: + void set_field_type(field_type_t type) { + field_type = static_cast<uint8_t>(type); + } + void set_node_type(node_type_t type) { + node_type = static_cast<uint8_t>(type); + } + void set_is_level_tail(bool value) { + is_level_tail = static_cast<uint8_t>(value); + } +} __attribute__((packed)); +inline std::ostream& operator<<(std::ostream& os, const node_header_t& header) { + auto field_type = header.get_field_type(); + if (field_type.has_value()) { + os << "header" << header.get_node_type() << *field_type + << "(is_level_tail=" << header.get_is_level_tail() + << ", level=" << (unsigned)header.level << ")"; + } else { + os << "header(INVALID)"; + } + return os; +} + +template <typename FixedKeyType, field_type_t _FIELD_TYPE> +struct _slot_t { + using key_t = FixedKeyType; + static constexpr field_type_t FIELD_TYPE = _FIELD_TYPE; + static constexpr node_offset_t OVERHEAD = sizeof(_slot_t) - sizeof(key_t); + + key_t key; + node_offset_t right_offset; +} __attribute__((packed)); +using slot_0_t = _slot_t<shard_pool_crush_t, field_type_t::N0>; +using slot_1_t = _slot_t<crush_t, field_type_t::N1>; +using slot_3_t = _slot_t<snap_gen_t, field_type_t::N3>; + +struct node_range_t { + extent_len_t start; + extent_len_t end; +}; + +template <typename FieldType> +const char* fields_start(const FieldType& node) { + return reinterpret_cast<const char*>(&node); +} + +template <node_type_t NODE_TYPE, typename FieldType> +node_range_t fields_free_range_before( + const FieldType& node, index_t index, extent_len_t node_size) { + assert(index <= node.num_keys); + extent_len_t offset_start = node.get_key_start_offset(index, node_size); + extent_len_t offset_end = node.get_item_end_offset(index, node_size); + if constexpr (NODE_TYPE == node_type_t::INTERNAL) { + if (node.is_level_tail() && index == node.num_keys) { + offset_end -= sizeof(laddr_t); + } + } + 
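As a rough illustration of the node_header_t packing defined above (this sketch is not part of the upstream commit), the header fits the field type, node type and level-tail flag into a single byte, followed by one level byte; the demo_* names below are hypothetical stand-ins.

// Minimal standalone mirror of the 6/1/1-bit header split, 2 bytes on disk.
#include <cstdint>
#include <cstdio>

struct demo_header_t {
  uint8_t field_type : 6;     // room for field_type_t::N0.._MAX
  uint8_t node_type : 1;      // LEAF or INTERNAL
  uint8_t is_level_tail : 1;  // right-most node of its level
  uint8_t level;              // 0 for leaves, >0 for internal nodes
} __attribute__((packed));

static_assert(sizeof(demo_header_t) == 2, "header stays a 2-byte extent prefix");

int main() {
  demo_header_t h{/*field_type=*/0, /*node_type=*/1, /*is_level_tail=*/1, /*level=*/3};
  std::printf("level=%u tail=%u\n", (unsigned)h.level, (unsigned)h.is_level_tail);
}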
assert(offset_start <= offset_end); + assert(offset_end - offset_start < node_size); + return {offset_start, offset_end}; +} + +/** + * _node_fields_013_t (node_fields_0_t, node_fields_1_t, leaf_fields_3_t + * + * The STAGE_LEFT layout implementation for node N0/N1, or the STAGE_RIGHT + * layout implementation for leaf node N3. + * + * The node layout storing n slots: + * + * # <----------------------------- node range --------------------------------------> # + * # #<~># free space # + * # <----- left part -----------------------------> # <~# <----- right slots -------> # + * # # <---- left slots -------------> #~> # # + * # # slots [2, n) |<~># #<~>| right slots [2, n) # + * # # <- slot 0 -> | <- slot 1 -> | # # | <-- s1 --> | <-- s0 --> # + * # # | | # # | | # + * # | num_ # | right | | right | # # | next-stage | next-stage # + * # header | keys # key | offset | key | offset | # # | container | container # + * # | # 0 | 0 | 1 | 1 |...#...#...| or onode 1 | or onode 0 # + * | | ^ ^ + * | | | | + * | +----------------+ | + * +--------------------------------------------+ + */ +template <typename SlotType> +struct _node_fields_013_t { + // should be enough to index all keys under 64 KiB node + using num_keys_t = uint16_t; + using key_t = typename SlotType::key_t; + using key_get_type = const key_t&; + using me_t = _node_fields_013_t<SlotType>; + static constexpr field_type_t FIELD_TYPE = SlotType::FIELD_TYPE; + static constexpr node_offset_t HEADER_SIZE = + sizeof(node_header_t) + sizeof(num_keys_t); + static constexpr node_offset_t ITEM_OVERHEAD = SlotType::OVERHEAD; + + bool is_level_tail() const { return header.get_is_level_tail(); } + extent_len_t total_size(extent_len_t node_size) const { + return node_size; + } + key_get_type get_key( + index_t index, extent_len_t node_size) const { + assert(index < num_keys); + return slots[index].key; + } + node_offset_t get_key_start_offset( + index_t index, extent_len_t node_size) const { + assert(index <= num_keys); + auto offset = HEADER_SIZE + sizeof(SlotType) * index; + assert(offset < node_size); + return offset; + } + node_offset_t get_item_start_offset( + index_t index, extent_len_t node_size) const { + assert(index < num_keys); + auto offset = slots[index].right_offset; + assert(offset < node_size); + return offset; + } + const void* p_offset(index_t index) const { + assert(index < num_keys); + return &slots[index].right_offset; + } + extent_len_t get_item_end_offset( + index_t index, extent_len_t node_size) const { + return index == 0 ? 
node_size + : get_item_start_offset(index - 1, node_size); + } + template <node_type_t NODE_TYPE> + node_offset_t free_size_before( + index_t index, extent_len_t node_size) const { + auto range = fields_free_range_before<NODE_TYPE>(*this, index, node_size); + return range.end - range.start; + } + + static node_offset_t estimate_insert_one() { return sizeof(SlotType); } + template <IsFullKey Key> + static void insert_at( + NodeExtentMutable&, const Key& key, + const me_t& node, index_t index, node_offset_t size_right); + static node_offset_t erase_at(NodeExtentMutable&, const me_t&, index_t, const char*); + static void update_size_at( + NodeExtentMutable&, const me_t& node, index_t index, int change); + static void append_key( + NodeExtentMutable&, const key_t& key, char*& p_append); + template <IsFullKey Key> + static void append_key( + NodeExtentMutable& mut, const Key& key, char*& p_append) { + append_key(mut, key_t::from_key(key), p_append); + } + static void append_offset( + NodeExtentMutable& mut, node_offset_t offset_to_right, char*& p_append); + + node_header_t header; + num_keys_t num_keys = 0u; + SlotType slots[]; +} __attribute__((packed)); +using node_fields_0_t = _node_fields_013_t<slot_0_t>; +using node_fields_1_t = _node_fields_013_t<slot_1_t>; + +/** + * node_fields_2_t + * + * The STAGE_STRING layout implementation for node N2. + * + * The node layout storing n slots: + * + * # <--------------------------------- node range ----------------------------------------> # + * # #<~># free space # + * # <------- left part ---------------> # <~# <--------- right slots ---------------------> # + * # # <---- offsets ----> #~> #<~>| slots [2, n) # + * # # offsets [2, n) |<~># # | <----- slot 1 ----> | <----- slot 0 ----> # + * # # | # # | | # + * # | num_ # offset | offset | # # | next-stage | ns-oid | next-stage | ns-oid # + * # header | keys # 0 | 1 |...#...#...| container1 | 1 | container0 | 0 # + * | | ^ ^ + * | | | | + * | +----------------+ | + * +-----------------------------------------------+ + */ +struct node_fields_2_t { + // should be enough to index all keys under 64 KiB node + using num_keys_t = uint16_t; + using key_t = ns_oid_view_t; + using key_get_type = key_t; + static constexpr field_type_t FIELD_TYPE = field_type_t::N2; + static constexpr node_offset_t HEADER_SIZE = + sizeof(node_header_t) + sizeof(num_keys_t); + static constexpr node_offset_t ITEM_OVERHEAD = sizeof(node_offset_t); + + bool is_level_tail() const { return header.get_is_level_tail(); } + extent_len_t total_size(extent_len_t node_size) const { + return node_size; + } + key_get_type get_key( + index_t index, extent_len_t node_size) const { + assert(index < num_keys); + auto item_end_offset = get_item_end_offset(index, node_size); + const char* p_start = fields_start(*this); + return key_t(p_start + item_end_offset); + } + node_offset_t get_key_start_offset( + index_t index, extent_len_t node_size) const { + assert(index <= num_keys); + auto offset = HEADER_SIZE + sizeof(node_offset_t) * num_keys; + assert(offset < node_size); + return offset; + } + node_offset_t get_item_start_offset( + index_t index, extent_len_t node_size) const { + assert(index < num_keys); + auto offset = offsets[index]; + assert(offset < node_size); + return offset; + } + const void* p_offset(index_t index) const { + assert(index < num_keys); + return &offsets[index]; + } + extent_len_t get_item_end_offset( + index_t index, extent_len_t node_size) const { + return index == 0 ? 
node_size + : get_item_start_offset(index - 1, node_size); + } + template <node_type_t NODE_TYPE> + node_offset_t free_size_before( + index_t index, extent_len_t node_size) const { + auto range = fields_free_range_before<NODE_TYPE>(*this, index, node_size); + return range.end - range.start; + } + + static node_offset_t estimate_insert_one() { return sizeof(node_offset_t); } + template <IsFullKey Key> + static void insert_at( + NodeExtentMutable& mut, const Key& key, + const node_fields_2_t& node, index_t index, node_offset_t size_right) { + ceph_abort("not implemented"); + } + static void update_size_at( + NodeExtentMutable& mut, const node_fields_2_t& node, index_t index, int change) { + ceph_abort("not implemented"); + } + static void append_key( + NodeExtentMutable& mut, const key_t& key, char*& p_append) { + ns_oid_view_t::append(mut, key, p_append); + } + template <IsFullKey Key> + static void append_key( + NodeExtentMutable& mut, const Key& key, char*& p_append) { + ns_oid_view_t::append(mut, key, p_append); + } + static void append_offset( + NodeExtentMutable& mut, node_offset_t offset_to_right, char*& p_append); + + node_header_t header; + num_keys_t num_keys = 0u; + node_offset_t offsets[]; +} __attribute__((packed)); + +/** + * internal_fields_3_t + * + * The STAGE_RIGHT layout implementation for N2. + * + * The node layout storing 3 children: + * + * # <---------------- node range ---------------------------> # + * # # <-- keys ---> # <---- laddrs -----------> # + * # free space: # |<~># |<~># + * # # | # | # + * # | num_ # key | key | # laddr | laddr | laddr | # + * # header | keys # 0 | 1 |...# 0 | 1 | 2 |...# + */ +struct internal_fields_3_t { + using key_get_type = const snap_gen_t&; + // should be enough to index all keys under 64 KiB node + using num_keys_t = uint16_t; + static constexpr field_type_t FIELD_TYPE = field_type_t::N3; + static constexpr node_offset_t HEADER_SIZE = + sizeof(node_header_t) + sizeof(num_keys_t); + static constexpr node_offset_t ITEM_SIZE = + sizeof(snap_gen_t) + sizeof(laddr_t); + static constexpr node_offset_t ITEM_OVERHEAD = 0u; + + bool is_level_tail() const { return header.get_is_level_tail(); } + extent_len_t total_size(extent_len_t node_size) const { + if (is_level_tail()) { + return node_size - sizeof(snap_gen_t); + } else { + return node_size; + } + } + key_get_type get_key( + index_t index, extent_len_t node_size) const { + assert(index < num_keys); + return keys[index]; + } + template <node_type_t NODE_TYPE> + std::enable_if_t<NODE_TYPE == node_type_t::INTERNAL, node_offset_t> + free_size_before(index_t index, extent_len_t node_size) const { + assert(index <= num_keys); + assert(num_keys <= get_max_num_keys(node_size)); + extent_len_t free = total_size(node_size) - HEADER_SIZE - + index * ITEM_SIZE; + if (is_level_tail() && index == num_keys) { + free -= sizeof(laddr_t); + } + return free; + } + + const laddr_packed_t* get_p_child_addr( + index_t index, extent_len_t node_size) const { +#ifndef NDEBUG + if (is_level_tail()) { + assert(index <= num_keys); + } else { + assert(index < num_keys); + } +#endif + auto p_addrs = reinterpret_cast<const laddr_packed_t*>( + &keys[get_num_keys_limit(node_size)]); + auto ret = p_addrs + index; + assert((const char*)ret < fields_start(*this) + node_size); + return ret; + } + + static node_offset_t estimate_insert_one() { return ITEM_SIZE; } + + template <IsFullKey Key> + static void insert_at( + NodeExtentMutable& mut, const Key& key, + const internal_fields_3_t& node, + index_t index, node_offset_t 
size_right) { + ceph_abort("not implemented"); + } + static void update_size_at( + NodeExtentMutable& mut, const internal_fields_3_t& node, + index_t index, int change) { + ceph_abort("not implemented"); + } + + node_header_t header; + num_keys_t num_keys = 0u; + snap_gen_t keys[]; + + private: + num_keys_t get_max_num_keys(extent_len_t node_size) const { + auto num_limit = get_num_keys_limit(node_size); + return (is_level_tail() ? num_limit - 1 : num_limit); + } + static num_keys_t get_num_keys_limit(extent_len_t node_size) { + return (node_size - HEADER_SIZE) / ITEM_SIZE; + } +} __attribute__((packed)); + +using leaf_fields_3_t = _node_fields_013_t<slot_3_t>; + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage.h b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage.h new file mode 100644 index 000000000..7185b15ee --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage.h @@ -0,0 +1,2488 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <cassert> +#include <compare> +#include <optional> +#include <ostream> +#include <sstream> +#include <type_traits> + +#include "common/likely.h" + +#include "sub_items_stage.h" +#include "item_iterator_stage.h" + +namespace crimson::os::seastore::onode { + +struct search_result_bs_t { + index_t index; + MatchKindBS match; +}; +template <typename FGetKey> +search_result_bs_t binary_search( + const key_hobj_t& key, + index_t begin, index_t end, FGetKey&& f_get_key) { + assert(begin <= end); + while (begin < end) { + auto total = begin + end; + auto mid = total >> 1; + // do not copy if return value is reference + decltype(f_get_key(mid)) target = f_get_key(mid); + auto match = key <=> target; + if (match == std::strong_ordering::less) { + end = mid; + } else if (match == std::strong_ordering::greater) { + begin = mid + 1; + } else { + return {mid, MatchKindBS::EQ}; + } + } + return {begin , MatchKindBS::NE}; +} + +template <typename PivotType, typename FGet> +search_result_bs_t binary_search_r( + index_t rend, index_t rbegin, FGet&& f_get, const PivotType& key) { + assert(rend <= rbegin); + while (rend < rbegin) { + auto total = rend + rbegin + 1; + auto mid = total >> 1; + // do not copy if return value is reference + decltype(f_get(mid)) target = f_get(mid); + int match = target - key; + if (match < 0) { + rend = mid; + } else if (match > 0) { + rbegin = mid - 1; + } else { + return {mid, MatchKindBS::EQ}; + } + } + return {rbegin, MatchKindBS::NE}; +} + +inline bool matchable(field_type_t type, match_stat_t mstat) { + assert(mstat >= MSTAT_MIN && mstat <= MSTAT_MAX); + /* + * compressed prefix by field type: + * N0: NONE + * N1: pool/shard + * N2: pool/shard crush + * N3: pool/shard crush ns/oid + * + * if key matches the node's compressed prefix, return true + * else, return false + */ +#ifndef NDEBUG + if (mstat == MSTAT_END) { + assert(type == field_type_t::N0); + } +#endif + return mstat + to_unsigned(type) < 4; +} + +inline void assert_mstat( + const key_hobj_t& key, + const key_view_t& index, + match_stat_t mstat) { + assert(mstat >= MSTAT_MIN && mstat <= MSTAT_LT2); + // key < index ... 
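The binary_search()/binary_search_r() helpers above follow a lower-bound pattern that reports both the landing index and whether the key matched exactly. A simplified, standalone version over plain int keys (not part of the upstream commit; all demo_* names are hypothetical) behaves like this:

#include <cassert>
#include <cstddef>

struct demo_result_t { std::size_t index; bool equal; };

template <typename FGetKey>
demo_result_t demo_binary_search(int key, std::size_t begin, std::size_t end,
                                 FGetKey&& get_key) {
  while (begin < end) {
    std::size_t mid = (begin + end) >> 1;
    int target = get_key(mid);
    if (key < target)      end = mid;
    else if (key > target) begin = mid + 1;
    else                   return {mid, true};   // MatchKindBS::EQ
  }
  return {begin, false};                         // MatchKindBS::NE, insertion point
}

int main() {
  const int keys[] = {2, 5, 9, 12};
  auto get = [&](std::size_t i) { return keys[i]; };
  assert(demo_binary_search(9, 0, 4, get).equal);
  auto miss = demo_binary_search(7, 0, 4, get);
  assert(!miss.equal && miss.index == 2);        // 7 would be inserted before 9
}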
+ switch (mstat) { + case MSTAT_EQ: + break; + case MSTAT_LT0: + assert(key < index.snap_gen_packed()); + break; + case MSTAT_LT1: + assert(key < index.ns_oid_view()); + break; + case MSTAT_LT2: + if (index.has_shard_pool()) { + assert((key < shard_pool_crush_t{ + index.shard_pool_packed(), index.crush_packed()})); + } else { + assert(key < index.crush_packed()); + } + break; + default: + ceph_abort("impossible path"); + } + // key == index ... + switch (mstat) { + case MSTAT_EQ: + assert(key == index.snap_gen_packed()); + case MSTAT_LT0: + if (!index.has_ns_oid()) + break; + assert(index.ns_oid_view().type() == ns_oid_view_t::Type::MAX || + key == index.ns_oid_view()); + case MSTAT_LT1: + if (!index.has_crush()) + break; + assert(key == index.crush_packed()); + if (!index.has_shard_pool()) + break; + assert(key == index.shard_pool_packed()); + default: + break; + } +} + +#define NXT_STAGE_T staged<next_param_t> + +enum class TrimType { BEFORE, AFTER, AT }; + +/** + * staged + * + * Implements recursive logic that modifies or reads the node layout + * (N0/N1/N2/N3 * LEAF/INTERNAL) with the multi-stage design. The specific + * stage implementation is flexible. So the implementations for different + * stages can be assembled independently, as long as they follow the + * definitions of container interfaces. + * + * Multi-stage is designed to index different portions of onode keys + * stage-by-stage. There are at most 3 stages for a node: + * - STAGE_LEFT: index shard-pool-crush for N0, or index crush for N1 node; + * - STAGE_STRING: index ns-oid for N0/N1/N2 nodes; + * - STAGE_RIGHT: index snap-gen for N0/N1/N2/N3 nodes; + * + * The intention is to consolidate the high-level indexing implementations at + * the level of stage, so we don't need to write them repeatedly for every + * stage and for every node type. + */ +template <typename Params> +struct staged { + static_assert(Params::STAGE >= STAGE_BOTTOM); + static_assert(Params::STAGE <= STAGE_TOP); + using container_t = typename Params::container_t; + using key_get_type = typename container_t::key_get_type; + using next_param_t = typename Params::next_param_t; + using position_t = staged_position_t<Params::STAGE>; + using result_t = staged_result_t<Params::NODE_TYPE, Params::STAGE>; + using value_input_t = value_input_type_t<Params::NODE_TYPE>; + using value_t = value_type_t<Params::NODE_TYPE>; + static constexpr auto CONTAINER_TYPE = container_t::CONTAINER_TYPE; + static constexpr bool IS_BOTTOM = (Params::STAGE == STAGE_BOTTOM); + static constexpr auto NODE_TYPE = Params::NODE_TYPE; + static constexpr auto STAGE = Params::STAGE; + + template <bool is_exclusive> + static void _left_or_right(index_t& split_index, index_t insert_index, + std::optional<bool>& is_insert_left) { + assert(!is_insert_left.has_value()); + assert(is_valid_index(split_index)); + if constexpr (is_exclusive) { + if (split_index <= insert_index) { + // ...[s_index-1] |!| (i_index) [s_index]... + // offset i_position to right + is_insert_left = false; + } else { + // ...[s_index-1] (i_index)) |?[s_index]| ... + // ...(i_index)...[s_index-1] |?[s_index]| ... + is_insert_left = true; + --split_index; + } + } else { + if (split_index < insert_index) { + // ...[s_index-1] |?[s_index]| ...[(i_index)[s_index_k]... + is_insert_left = false; + } else if (split_index > insert_index) { + // ...[(i_index)s_index-1] |?[s_index]| ... + // ...[(i_index)s_index_k]...[s_index-1] |?[s_index]| ... + is_insert_left = true; + } else { + // ...[s_index-1] |?[(i_index)s_index]| ... 
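To make the stage-by-stage indexing described in the comment above concrete, here is an editorial sketch (not part of the upstream commit) of the comparison order: the coarser key portion decides first, and only an equal portion falls through to the next stage. The demo_* types are simplified stand-ins for the real key views.

#include <cassert>
#include <string>
#include <tuple>
#include <utility>

struct demo_key_t {
  std::tuple<int, int, unsigned> left;      // STAGE_LEFT: shard, pool, crush
  std::pair<std::string, std::string> str;  // STAGE_STRING: namespace, oid
  std::pair<unsigned, unsigned> right;      // STAGE_RIGHT: snap, gen
};

// Descend to the next stage only while the current stage compares equal,
// mirroring how lower_bound() recurses into the next-stage container.
int demo_compare(const demo_key_t& a, const demo_key_t& b) {
  if (a.left != b.left)   return a.left < b.left ? -1 : 1;
  if (a.str != b.str)     return a.str < b.str ? -1 : 1;
  if (a.right != b.right) return a.right < b.right ? -1 : 1;
  return 0;
}

int main() {
  demo_key_t a{{0, 1, 42u}, {"ns", "oid1"}, {1u, 0u}};
  demo_key_t b{{0, 1, 42u}, {"ns", "oid2"}, {0u, 0u}};
  assert(demo_compare(a, b) < 0);  // decided at STAGE_STRING; STAGE_RIGHT never inspected
}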
+ // i_to_left = std::nullopt; + } + } + } + + template <ContainerType CTYPE, typename Enable = void> class _iterator_t; + template <ContainerType CTYPE> + class _iterator_t<CTYPE, std::enable_if_t<CTYPE == ContainerType::INDEXABLE>> { + /* + * indexable container type system: + * CONTAINER_TYPE = ContainerType::INDEXABLE + * keys() const -> index_t + * operator[](index_t) const -> key_get_type + * size_before(index_t) const -> extent_len_t + * size_overhead_at(index_t) const -> node_offset_t + * (IS_BOTTOM) get_p_value(index_t) const -> const value_t* + * (!IS_BOTTOM) size_to_nxt_at(index_t) const -> node_offset_t + * (!IS_BOTTOM) get_nxt_container(index_t) const + * encode(p_node_start, encoded) + * decode(p_node_start, node_size, delta) -> container_t + * static: + * header_size() -> node_offset_t + * estimate_insert(key, value) -> node_offset_t + * (IS_BOTTOM) insert_at(mut, src, key, value, + * index, size, p_left_bound) -> const value_t* + * (!IS_BOTTOM) insert_prefix_at(mut, src, key, + * index, size, p_left_bound) -> memory_range_t + * (!IS_BOTTOM) update_size_at(mut, src, index, size) + * trim_until(mut, container, index) -> trim_size + * (!IS_BOTTOM) trim_at(mut, container, index, trimmed) -> trim_size + * erase_at(mut, container, index, p_left_bound) -> erase_size + * + * Appender::append(const container_t& src, from, items) + */ + public: + using me_t = _iterator_t<CTYPE>; + + _iterator_t(const container_t& container) : container{container} { + assert(container.keys()); + } + + index_t index() const { + return _index; + } + key_get_type get_key() const { + assert(!is_end()); + return container[_index]; + } + node_offset_t size_to_nxt() const { + assert(!is_end()); + return container.size_to_nxt_at(_index); + } + template <typename T = typename NXT_STAGE_T::container_t> + std::enable_if_t<!IS_BOTTOM, T> get_nxt_container() const { + assert(!is_end()); + return container.get_nxt_container(_index); + } + template <typename T = value_t> + std::enable_if_t<IS_BOTTOM, const T*> get_p_value() const { + assert(!is_end()); + return container.get_p_value(_index); + } + bool is_last() const { + return _index + 1 == container.keys(); + } + bool is_end() const { return _index == container.keys(); } + node_offset_t size() const { + assert(!is_end()); + assert(header_size() == container.size_before(0)); + assert(container.size_before(_index + 1) > container.size_before(_index)); + return container.size_before(_index + 1) - + container.size_before(_index); + } + node_offset_t size_overhead() const { + assert(!is_end()); + return container.size_overhead_at(_index); + } + + me_t& operator++() { + assert(!is_end()); + assert(!is_last()); + ++_index; + return *this; + } + void seek_at(index_t index) { + assert(index < container.keys()); + seek_till_end(index); + } + void seek_till_end(index_t index) { + assert(!is_end()); + assert(this->index() == 0); + assert(index <= container.keys()); + _index = index; + } + void seek_last() { + assert(!is_end()); + assert(index() == 0); + _index = container.keys() - 1; + } + void set_end() { + assert(!is_end()); + assert(is_last()); + ++_index; + } + // Note: possible to return an end iterator + MatchKindBS seek(const key_hobj_t& key, bool exclude_last) { + assert(!is_end()); + assert(index() == 0); + index_t end_index = container.keys(); + if (exclude_last) { + assert(end_index); + --end_index; + assert(key < container[end_index]); + } + auto ret = binary_search(key, _index, end_index, + [this] (index_t index) { return container[index]; }); + _index = 
ret.index; + return ret.match; + } + + template <IsFullKey Key, typename T = value_t> + std::enable_if_t<IS_BOTTOM, const T*> insert( + NodeExtentMutable& mut, + const Key& key, + const value_input_t& value, + node_offset_t insert_size, + const char* p_left_bound) { + return container_t::insert_at( + mut, container, key, value, _index, insert_size, p_left_bound); + } + + template <IsFullKey Key, typename T = memory_range_t> + std::enable_if_t<!IS_BOTTOM, T> insert_prefix( + NodeExtentMutable& mut, const Key& key, + node_offset_t size, const char* p_left_bound) { + return container_t::insert_prefix_at( + mut, container, key, _index, size, p_left_bound); + } + + template <typename T = void> + std::enable_if_t<!IS_BOTTOM, T> + update_size(NodeExtentMutable& mut, int insert_size) { + assert(!is_end()); + container_t::update_size_at(mut, container, _index, insert_size); + } + + // Note: possible to return an end iterator when is_exclusive is true + template <bool is_exclusive> + size_t seek_split_inserted( + size_t start_size, size_t extra_size, size_t target_size, + index_t& insert_index, size_t insert_size, + std::optional<bool>& is_insert_left) { + assert(!is_end()); + assert(index() == 0); + // replace insert_index placeholder + if constexpr (!is_exclusive) { + if (insert_index == INDEX_LAST) { + insert_index = container.keys() - 1; + } + } else { + if (insert_index == INDEX_END) { + insert_index = container.keys(); + } + } + assert(insert_index <= container.keys()); + + auto start_size_1 = start_size + extra_size; + auto f_get_used_size = [this, start_size, start_size_1, + insert_index, insert_size] (index_t index) { + size_t current_size; + if (unlikely(index == 0)) { + current_size = start_size; + } else { + current_size = start_size_1; + if (index > insert_index) { + current_size += insert_size; + if constexpr (is_exclusive) { + --index; + } + } + // already includes header size + current_size += container.size_before(index); + } + return current_size; + }; + index_t s_end; + if constexpr (is_exclusive) { + s_end = container.keys(); + } else { + s_end = container.keys() - 1; + } + _index = binary_search_r(0, s_end, f_get_used_size, target_size).index; + size_t current_size = f_get_used_size(_index); + assert(current_size <= target_size); + + _left_or_right<is_exclusive>(_index, insert_index, is_insert_left); + return current_size; + } + + size_t seek_split(size_t start_size, size_t extra_size, size_t target_size) { + assert(!is_end()); + assert(index() == 0); + auto start_size_1 = start_size + extra_size; + auto f_get_used_size = [this, start_size, start_size_1] (index_t index) { + size_t current_size; + if (unlikely(index == 0)) { + current_size = start_size; + } else { + // already includes header size + current_size = start_size_1 + container.size_before(index); + } + return current_size; + }; + _index = binary_search_r( + 0, container.keys() - 1, f_get_used_size, target_size).index; + size_t current_size = f_get_used_size(_index); + assert(current_size <= target_size); + return current_size; + } + + // Note: possible to return an end iterater if to_index == INDEX_END + template <KeyT KT> + void copy_out_until( + typename container_t::template Appender<KT>& appender, index_t& to_index) { + auto num_keys = container.keys(); + index_t items; + if (to_index == INDEX_END) { + items = num_keys - _index; + appender.append(container, _index, items); + _index = num_keys; + to_index = _index; + } else if (to_index == INDEX_LAST) { + assert(!is_end()); + items = num_keys - 1 - _index; + 
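The split-point search in seek_split_inserted()/seek_split() above effectively answers "what is the largest prefix of entries whose accumulated size, including the header, still fits the target?". A loop-based standalone sketch of that accounting (not part of the upstream commit; demo_* names are hypothetical):

#include <cassert>
#include <cstddef>
#include <vector>

std::size_t demo_seek_split(const std::vector<std::size_t>& entry_sizes,
                            std::size_t header_size, std::size_t target_size) {
  std::size_t used = header_size;
  std::size_t split_index = 0;
  for (std::size_t sz : entry_sizes) {
    if (used + sz > target_size)
      break;                     // entry at split_index moves to the right node
    used += sz;
    ++split_index;
  }
  return split_index;            // entries [0, split_index) stay on the left
}

int main() {
  // 8-byte header, entries of 100/200/300/50 bytes, 512-byte budget for the left node.
  assert(demo_seek_split({100, 200, 300, 50}, 8, 512) == 2);
}

The indexable variant above additionally biases the result around the pending insert position (insert_index/insert_size) and drives the same accounting through binary_search_r() rather than a linear scan.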
appender.append(container, _index, items); + _index = num_keys - 1; + to_index = _index; + } else { + assert(_index <= to_index); + assert(to_index <= num_keys); + items = to_index - _index; + appender.append(container, _index, items); + _index = to_index; + } + } + + node_offset_t trim_until(NodeExtentMutable& mut) { + return container_t::trim_until(mut, container, _index); + } + + template <typename T = node_offset_t> + std::enable_if_t<!IS_BOTTOM, T> + trim_at(NodeExtentMutable& mut, node_offset_t trimmed) { + return container_t::trim_at(mut, container, _index, trimmed); + } + + node_offset_t erase(NodeExtentMutable& mut, const char* p_left_bound) { + assert(!is_end()); + return container_t::erase_at(mut, container, _index, p_left_bound); + } + + template <KeyT KT> + typename container_t::template Appender<KT> + get_appender(NodeExtentMutable* p_mut) { + assert(_index + 1 == container.keys()); + return typename container_t::template Appender<KT>(p_mut, container); + } + + template <KeyT KT> + typename container_t::template Appender<KT> + get_appender_opened(NodeExtentMutable* p_mut) { + if constexpr (!IS_BOTTOM) { + assert(_index + 1 == container.keys()); + return typename container_t::template Appender<KT>(p_mut, container, true); + } else { + ceph_abort("impossible path"); + } + } + + void encode(const char* p_node_start, ceph::bufferlist& encoded) const { + container.encode(p_node_start, encoded); + ceph::encode(_index, encoded); + } + + static me_t decode(const char* p_node_start, + extent_len_t node_size, + ceph::bufferlist::const_iterator& delta) { + auto container = container_t::decode( + p_node_start, node_size, delta); + auto ret = me_t(container); + index_t index; + ceph::decode(index, delta); + ret.seek_till_end(index); + return ret; + } + + static node_offset_t header_size() { + return container_t::header_size(); + } + + template <IsFullKey Key> + static node_offset_t estimate_insert( + const Key& key, const value_input_t& value) { + return container_t::estimate_insert(key, value); + } + + private: + container_t container; + index_t _index = 0; + }; + + template <ContainerType CTYPE> + class _iterator_t<CTYPE, std::enable_if_t<CTYPE == ContainerType::ITERATIVE>> { + /* + * iterative container type system (!IS_BOTTOM): + * CONTAINER_TYPE = ContainerType::ITERATIVE + * index() const -> index_t + * get_key() const -> key_get_type + * size() const -> node_offset_t + * size_to_nxt() const -> node_offset_t + * size_overhead() const -> node_offset_t + * get_nxt_container() const + * has_next() const -> bool + * encode(p_node_start, encoded) + * decode(p_node_start, node_length, delta) -> container_t + * operator++() + * static: + * header_size() -> node_offset_t + * estimate_insert(key, value) -> node_offset_t + * insert_prefix(mut, src, key, is_end, size, p_left_bound) -> memory_range_t + * update_size(mut, src, size) + * trim_until(mut, container) -> trim_size + * trim_at(mut, container, trimmed) -> trim_size + * erase(mut, container, p_left_bound) -> erase_size + */ + // currently the iterative iterator is only implemented with STAGE_STRING + // for in-node space efficiency + static_assert(STAGE == STAGE_STRING); + public: + using me_t = _iterator_t<CTYPE>; + + _iterator_t(const container_t& container) : container{container} {} + + index_t index() const { + if (is_end()) { + return container.index() + 1; + } else { + return container.index(); + } + } + key_get_type get_key() const { + assert(!is_end()); + return container.get_key(); + } + node_offset_t size_to_nxt() const { + 
assert(!is_end()); + return container.size_to_nxt(); + } + const typename NXT_STAGE_T::container_t get_nxt_container() const { + assert(!is_end()); + return container.get_nxt_container(); + } + bool is_last() const { + assert(!is_end()); + return !container.has_next(); + } + bool is_end() const { +#ifndef NDEBUG + if (_is_end) { + assert(!container.has_next()); + } +#endif + return _is_end; + } + node_offset_t size() const { + assert(!is_end()); + return container.size(); + } + node_offset_t size_overhead() const { + assert(!is_end()); + return container.size_overhead(); + } + + me_t& operator++() { + assert(!is_end()); + assert(!is_last()); + ++container; + return *this; + } + void seek_at(index_t index) { + assert(!is_end()); + assert(this->index() == 0); + while (index > 0) { + assert(container.has_next()); + ++container; + --index; + } + } + void seek_till_end(index_t index) { + assert(!is_end()); + assert(this->index() == 0); + while (index > 0) { + if (!container.has_next()) { + assert(index == 1); + set_end(); + break; + } + ++container; + --index; + } + } + void seek_last() { + assert(!is_end()); + assert(index() == 0); + while (container.has_next()) { + ++container; + } + } + void set_end() { + assert(!is_end()); + assert(is_last()); + _is_end = true; + } + // Note: possible to return an end iterator + MatchKindBS seek(const key_hobj_t& key, bool exclude_last) { + assert(!is_end()); + assert(index() == 0); + do { + if (exclude_last && is_last()) { + assert(key < get_key()); + return MatchKindBS::NE; + } + auto match = key <=> get_key(); + if (match == std::strong_ordering::less) { + return MatchKindBS::NE; + } else if (match == std::strong_ordering::equal) { + return MatchKindBS::EQ; + } else { + if (container.has_next()) { + ++container; + } else { + // end + break; + } + } + } while (true); + assert(!exclude_last); + set_end(); + return MatchKindBS::NE; + } + + template <IsFullKey Key> + memory_range_t insert_prefix( + NodeExtentMutable& mut, const Key& key, + node_offset_t size, const char* p_left_bound) { + return container_t::insert_prefix( + mut, container, key, is_end(), size, p_left_bound); + } + + void update_size(NodeExtentMutable& mut, int insert_size) { + assert(!is_end()); + container_t::update_size(mut, container, insert_size); + } + + // Note: possible to return an end iterator when is_exclusive is true + // insert_index can still be INDEX_LAST or INDEX_END + template <bool is_exclusive> + size_t seek_split_inserted( + size_t start_size, size_t extra_size, size_t target_size, + index_t& insert_index, size_t insert_size, + std::optional<bool>& is_insert_left) { + assert(!is_end()); + assert(index() == 0); + size_t current_size = start_size; + index_t split_index = 0; + extra_size += header_size(); + do { + if constexpr (!is_exclusive) { + if (is_last()) { + assert(split_index == index()); + if (insert_index == INDEX_LAST) { + insert_index = index(); + } + assert(insert_index <= index()); + break; + } + } + + size_t nxt_size = current_size; + if (split_index == 0) { + nxt_size += extra_size; + } + if (split_index == insert_index) { + nxt_size += insert_size; + if constexpr (is_exclusive) { + if (nxt_size > target_size) { + break; + } + current_size = nxt_size; + ++split_index; + } + } + nxt_size += size(); + if (nxt_size > target_size) { + break; + } + current_size = nxt_size; + + if constexpr (is_exclusive) { + if (is_last()) { + assert(split_index == index()); + set_end(); + split_index = index(); + if (insert_index == INDEX_END) { + insert_index = index(); + } + 
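For reference, the iterative container contract listed in the class comment above (index()/get_key()/has_next()/operator++ and friends) can be satisfied by a very small cursor type; the following standalone sketch is not part of the upstream commit and backs the cursor with a plain array instead of an on-disk item chain.

#include <cassert>
#include <cstddef>

class demo_iterative_container {
 public:
  demo_iterative_container(const int* keys, std::size_t n)
      : keys_{keys}, num_keys_{n} {}
  std::size_t index() const { return index_; }
  int get_key() const { return keys_[index_]; }
  bool has_next() const { return index_ + 1 < num_keys_; }
  demo_iterative_container& operator++() { ++index_; return *this; }
 private:
  const int* keys_;
  std::size_t num_keys_;
  std::size_t index_ = 0;
};

int main() {
  const int keys[] = {3, 7, 11};
  demo_iterative_container it{keys, 3};
  while (it.has_next()) ++it;      // seek_last(): walk forward until the last key
  assert(it.index() == 2 && it.get_key() == 11);
}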
assert(insert_index == index()); + break; + } else { + ++(*this); + ++split_index; + } + } else { + ++(*this); + ++split_index; + } + } while (true); + assert(current_size <= target_size); + + _left_or_right<is_exclusive>(split_index, insert_index, is_insert_left); + assert(split_index == index()); + return current_size; + } + + size_t seek_split(size_t start_size, size_t extra_size, size_t target_size) { + assert(!is_end()); + assert(index() == 0); + size_t current_size = start_size; + do { + if (is_last()) { + break; + } + + size_t nxt_size = current_size; + if (index() == 0) { + nxt_size += extra_size; + } + nxt_size += size(); + if (nxt_size > target_size) { + break; + } + current_size = nxt_size; + ++(*this); + } while (true); + assert(current_size <= target_size); + return current_size; + } + + // Note: possible to return an end iterater if to_index == INDEX_END + template <KeyT KT> + void copy_out_until( + typename container_t::template Appender<KT>& appender, index_t& to_index) { + if (is_end()) { + assert(!container.has_next()); + if (to_index == INDEX_END) { + to_index = index(); + } + assert(to_index == index()); + return; + } + index_t items; + if (to_index == INDEX_END || to_index == INDEX_LAST) { + items = to_index; + } else { + assert(is_valid_index(to_index)); + assert(index() <= to_index); + items = to_index - index(); + } + if (appender.append(container, items)) { + set_end(); + } + to_index = index(); + } + + node_offset_t trim_until(NodeExtentMutable& mut) { + if (is_end()) { + return 0; + } + return container_t::trim_until(mut, container); + } + + node_offset_t trim_at(NodeExtentMutable& mut, node_offset_t trimmed) { + assert(!is_end()); + return container_t::trim_at(mut, container, trimmed); + } + + node_offset_t erase(NodeExtentMutable& mut, const char* p_left_bound) { + assert(!is_end()); + return container_t::erase(mut, container, p_left_bound); + } + + template <KeyT KT> + typename container_t::template Appender<KT> + get_appender(NodeExtentMutable* p_mut) { + return typename container_t::template Appender<KT>(p_mut, container, false); + } + + template <KeyT KT> + typename container_t::template Appender<KT> + get_appender_opened(NodeExtentMutable* p_mut) { + if constexpr (!IS_BOTTOM) { + return typename container_t::template Appender<KT>(p_mut, container, true); + } else { + ceph_abort("impossible path"); + } + } + + void encode(const char* p_node_start, ceph::bufferlist& encoded) const { + container.encode(p_node_start, encoded); + uint8_t is_end = _is_end; + ceph::encode(is_end, encoded); + } + + static me_t decode(const char* p_node_start, + extent_len_t node_size, + ceph::bufferlist::const_iterator& delta) { + auto container = container_t::decode( + p_node_start, node_size, delta); + auto ret = me_t(container); + uint8_t is_end; + ceph::decode(is_end, delta); + if (is_end) { + ret.set_end(); + } + return ret; + } + + static node_offset_t header_size() { + return container_t::header_size(); + } + + template <IsFullKey Key> + static node_offset_t estimate_insert(const Key& key, + const value_input_t& value) { + return container_t::estimate_insert(key, value); + } + + private: + container_t container; + bool _is_end = false; + }; + + /* + * iterator_t encapsulates both indexable and iterative implementations + * from a *non-empty* container. 
+ * cstr(const container_t&) + * access: + * index() -> index_t + * get_key() -> key_get_type (const reference or value type) + * is_last() -> bool + * is_end() -> bool + * size() -> node_offset_t + * size_overhead() -> node_offset_t + * (IS_BOTTOM) get_p_value() -> const value_t* + * (!IS_BOTTOM) get_nxt_container() -> container_range_t + * (!IS_BOTTOM) size_to_nxt() -> node_offset_t + * seek: + * operator++() -> iterator_t& + * seek_at(index) + * seek_till_end(index) + * seek_last() + * set_end() + * seek(key, exclude_last) -> MatchKindBS + * insert: + * (IS_BOTTOM) insert(mut, key, value, size, p_left_bound) -> p_value + * (!IS_BOTTOM) insert_prefix(mut, key, size, p_left_bound) -> memory_range_t + * (!IS_BOTTOM) update_size(mut, size) + * split: + * seek_split_inserted<bool is_exclusive>( + * start_size, extra_size, target_size, insert_index, insert_size, + * std::optional<bool>& is_insert_left) + * -> insert to left/right/unknown (!exclusive) + * -> insert to left/right (exclusive, can be end) + * -> split_size + * seek_split(start_size, extra_size, target_size) -> split_size + * copy_out_until(appender, to_index) (can be end) + * trim_until(mut) -> trim_size + * (!IS_BOTTOM) trim_at(mut, trimmed) -> trim_size + * erase: + * erase(mut, p_left_bound) -> erase_size + * merge: + * get_appender(p_mut) -> Appender + * (!IS_BOTTOM)get_appender_opened(p_mut) -> Appender + * denc: + * encode(p_node_start, encoded) + * decode(p_node_start, node_size, delta) -> iterator_t + * static: + * header_size() -> node_offset_t + * estimate_insert(key, value) -> node_offset_t + */ + using iterator_t = _iterator_t<CONTAINER_TYPE>; + /* TODO: detailed comments + * - trim_until(mut) -> trim_size + * * keep 0 to i - 1, and remove the rest, return the size trimmed. + * * if this is the end iterator, do nothing and return 0. + * * if this is the start iterator, normally needs to go to the higher + * stage to trim the entire container. + * - trim_at(mut, trimmed) -> trim_size + * * trim happens inside the current iterator, causing the size reduced by + * <trimmed>, return the total size trimmed. + */ + + /* + * Lookup internals (hide?) 
+ */ + + static bool is_keys_one( + const container_t& container) { // IN + auto iter = iterator_t(container); + iter.seek_last(); + if (iter.index() == 0) { + if constexpr (IS_BOTTOM) { + // ok, there is only 1 key + return true; + } else { + auto nxt_container = iter.get_nxt_container(); + return NXT_STAGE_T::is_keys_one(nxt_container); + } + } else { + // more than 1 keys + return false; + } + } + + template <bool GET_KEY> + static result_t smallest_result( + const iterator_t& iter, key_view_t* p_index_key) { + static_assert(!IS_BOTTOM); + assert(!iter.is_end()); + auto nxt_container = iter.get_nxt_container(); + auto pos_smallest = NXT_STAGE_T::position_t::begin(); + const value_t* p_value; + NXT_STAGE_T::template get_slot<GET_KEY, true>( + nxt_container, pos_smallest, p_index_key, &p_value); + if constexpr (GET_KEY) { + assert(p_index_key); + p_index_key->set(iter.get_key()); + } else { + assert(!p_index_key); + } + return result_t{{iter.index(), pos_smallest}, p_value, STAGE}; + } + + template <bool GET_KEY> + static result_t nxt_lower_bound( + const key_hobj_t& key, iterator_t& iter, + MatchHistory& history, key_view_t* index_key) { + static_assert(!IS_BOTTOM); + assert(!iter.is_end()); + auto nxt_container = iter.get_nxt_container(); + auto nxt_result = NXT_STAGE_T::template lower_bound<GET_KEY>( + nxt_container, key, history, index_key); + if (nxt_result.is_end()) { + if (iter.is_last()) { + return result_t::end(); + } else { + return smallest_result<GET_KEY>(++iter, index_key); + } + } else { + if constexpr (GET_KEY) { + index_key->set(iter.get_key()); + } + return result_t::from_nxt(iter.index(), nxt_result); + } + } + + template <bool GET_POS, bool GET_KEY, bool GET_VAL> + static void get_largest_slot( + const container_t& container, // IN + position_t* p_position, // OUT + key_view_t* p_index_key, // OUT + const value_t** pp_value) { // OUT + auto iter = iterator_t(container); + iter.seek_last(); + if constexpr (GET_KEY) { + assert(p_index_key); + p_index_key->set(iter.get_key()); + } else { + assert(!p_index_key); + } + if constexpr (GET_POS) { + assert(p_position); + p_position->index = iter.index(); + } else { + assert(!p_position); + } + if constexpr (IS_BOTTOM) { + if constexpr (GET_VAL) { + assert(pp_value); + *pp_value = iter.get_p_value(); + } else { + assert(!pp_value); + } + } else { + auto nxt_container = iter.get_nxt_container(); + if constexpr (GET_POS) { + NXT_STAGE_T::template get_largest_slot<true, GET_KEY, GET_VAL>( + nxt_container, &p_position->nxt, p_index_key, pp_value); + } else { + NXT_STAGE_T::template get_largest_slot<false, GET_KEY, GET_VAL>( + nxt_container, nullptr, p_index_key, pp_value); + } + } + } + + template <bool GET_KEY, bool GET_VAL> + static void get_slot( + const container_t& container, // IN + const position_t& pos, // IN + key_view_t* p_index_key, // OUT + const value_t** pp_value) { // OUT + auto iter = iterator_t(container); + iter.seek_at(pos.index); + + if constexpr (GET_KEY) { + assert(p_index_key); + p_index_key->set(iter.get_key()); + } else { + assert(!p_index_key); + } + + if constexpr (!IS_BOTTOM) { + auto nxt_container = iter.get_nxt_container(); + NXT_STAGE_T::template get_slot<GET_KEY, GET_VAL>( + nxt_container, pos.nxt, p_index_key, pp_value); + } else { + if constexpr (GET_VAL) { + assert(pp_value); + *pp_value = iter.get_p_value(); + } else { + assert(!pp_value); + } + } + } + + template <bool GET_KEY = false> + static result_t lower_bound( + const container_t& container, + const key_hobj_t& key, + MatchHistory& history, 
+ key_view_t* index_key = nullptr) { + bool exclude_last = false; + if (history.get<STAGE>().has_value()) { + if (*history.get<STAGE>() == MatchKindCMP::EQ) { + // lookup is short-circuited + if constexpr (!IS_BOTTOM) { + assert(history.get<STAGE - 1>().has_value()); + if (history.is_GT<STAGE - 1>()) { + auto iter = iterator_t(container); + bool test_key_equal; + if constexpr (STAGE == STAGE_STRING) { + // TODO(cross-node string dedup) + // test_key_equal = (iter.get_key().type() == ns_oid_view_t::Type::MIN); + auto cmp = key <=> iter.get_key(); + assert(cmp != std::strong_ordering::greater); + test_key_equal = (cmp == 0); + } else { + auto cmp = key <=> iter.get_key(); + // From history, key[stage] == parent[stage][index - 1] + // which should be the smallest possible value for all + // index[stage][*] + assert(cmp != std::strong_ordering::greater); + test_key_equal = (cmp == 0); + } + if (test_key_equal) { + return nxt_lower_bound<GET_KEY>(key, iter, history, index_key); + } else { + // key[stage] < index[stage][left-most] + return smallest_result<GET_KEY>(iter, index_key); + } + } + } + // IS_BOTTOM || !history.is_GT<STAGE - 1>() + auto iter = iterator_t(container); + iter.seek_last(); + if constexpr (STAGE == STAGE_STRING) { + // TODO(cross-node string dedup) + // assert(iter.get_key().type() == ns_oid_view_t::Type::MAX); + assert(key == iter.get_key()); + } else { + assert(key == iter.get_key()); + } + if constexpr (GET_KEY) { + index_key->set(iter.get_key()); + } + if constexpr (IS_BOTTOM) { + auto value_ptr = iter.get_p_value(); + return result_t{{iter.index()}, value_ptr, MSTAT_EQ}; + } else { + auto nxt_container = iter.get_nxt_container(); + auto nxt_result = NXT_STAGE_T::template lower_bound<GET_KEY>( + nxt_container, key, history, index_key); + // !history.is_GT<STAGE - 1>() means + // key[stage+1 ...] <= index[stage+1 ...][*] + assert(!nxt_result.is_end()); + return result_t::from_nxt(iter.index(), nxt_result); + } + } else if (*history.get<STAGE>() == MatchKindCMP::LT) { + exclude_last = true; + } + } + auto iter = iterator_t(container); + auto bs_match = iter.seek(key, exclude_last); + if (iter.is_end()) { + assert(!exclude_last); + assert(bs_match == MatchKindBS::NE); + history.set<STAGE>(MatchKindCMP::GT); + return result_t::end(); + } + history.set<STAGE>(bs_match == MatchKindBS::EQ ? + MatchKindCMP::EQ : MatchKindCMP::LT); + if constexpr (IS_BOTTOM) { + if constexpr (GET_KEY) { + index_key->set(iter.get_key()); + } + auto value_ptr = iter.get_p_value(); + return result_t{{iter.index()}, value_ptr, + (bs_match == MatchKindBS::EQ ? 
MSTAT_EQ : MSTAT_LT0)}; + } else { + if (bs_match == MatchKindBS::EQ) { + return nxt_lower_bound<GET_KEY>(key, iter, history, index_key); + } else { + return smallest_result<GET_KEY>(iter, index_key); + } + } + } + + template <IsFullKey Key> + static node_offset_t insert_size(const Key& key, + const value_input_t& value) { + if constexpr (IS_BOTTOM) { + return iterator_t::estimate_insert(key, value); + } else { + return iterator_t::estimate_insert(key, value) + + NXT_STAGE_T::iterator_t::header_size() + + NXT_STAGE_T::insert_size(key, value); + } + } + + template <IsFullKey Key> + static node_offset_t insert_size_at(match_stage_t stage, + const Key& key, + const value_input_t& value) { + if (stage == STAGE) { + return insert_size(key, value); + } else { + assert(stage < STAGE); + return NXT_STAGE_T::template insert_size_at(stage, key, value); + } + } + + template <typename T = std::tuple<match_stage_t, node_offset_t>> + static std::enable_if_t<NODE_TYPE == node_type_t::INTERNAL, T> evaluate_insert( + const container_t& container, const key_view_t& key, + const value_input_t& value, position_t& position, bool evaluate_last) { + auto iter = iterator_t(container); + auto& index = position.index; + if (evaluate_last || index == INDEX_END) { + iter.seek_last(); + index = iter.index(); + // evaluate the previous index + } else { + assert(is_valid_index(index)); + // evaluate the current index + iter.seek_at(index); + auto match = key <=> iter.get_key(); + if (match == 0) { + if constexpr (IS_BOTTOM) { + ceph_abort("insert conflict at current index!"); + } else { + // insert into the current index + auto nxt_container = iter.get_nxt_container(); + return NXT_STAGE_T::evaluate_insert( + nxt_container, key, value, position.nxt, false); + } + } else { + assert(match == std::strong_ordering::less); + if (index == 0) { + // already the first index, so insert at the current index + return {STAGE, insert_size(key, value)}; + } + --index; + iter = iterator_t(container); + iter.seek_at(index); + // proceed to evaluate the previous index + } + } + + // XXX(multi-type): when key is from a different type of node + auto match = key <=> iter.get_key(); + if (match == std::strong_ordering::greater) { + // key doesn't match both indexes, so insert at the current index + ++index; + return {STAGE, insert_size(key, value)}; + } else { + assert(match == std::strong_ordering::equal); + if constexpr (IS_BOTTOM) { + // ceph_abort? 
+ ceph_abort("insert conflict at the previous index!"); + } else { + // insert into the previous index + auto nxt_container = iter.get_nxt_container(); + return NXT_STAGE_T::evaluate_insert( + nxt_container, key, value, position.nxt, true); + } + } + } + + template <typename T = bool> + static std::enable_if_t<NODE_TYPE == node_type_t::LEAF, T> + compensate_insert_position_at(match_stage_t stage, position_t& position) { + auto& index = position.index; + if (stage == STAGE) { + assert(index == 0); + // insert at the end of the current stage + index = INDEX_END; + return true; + } else { + if constexpr (IS_BOTTOM) { + ceph_abort("impossible path"); + } else { + assert(stage < STAGE); + bool compensate = NXT_STAGE_T:: + compensate_insert_position_at(stage, position.nxt); + if (compensate) { + assert(is_valid_index(index)); + if (index == 0) { + // insert into the *last* index of the current stage + index = INDEX_LAST; + return true; + } else { + --index; + return false; + } + } else { + return false; + } + } + } + } + + static void patch_insert_end(position_t& insert_pos, match_stage_t insert_stage) { + assert(insert_stage <= STAGE); + if (insert_stage == STAGE) { + insert_pos.index = INDEX_END; + } else if constexpr (!IS_BOTTOM) { + insert_pos.index = INDEX_LAST; + NXT_STAGE_T::patch_insert_end(insert_pos.nxt, insert_stage); + } + } + + template <typename T = std::tuple<match_stage_t, node_offset_t>> + static std::enable_if_t<NODE_TYPE == node_type_t::LEAF, T> evaluate_insert( + const key_hobj_t& key, const value_config_t& value, + const MatchHistory& history, match_stat_t mstat, position_t& position) { + match_stage_t insert_stage = STAGE_TOP; + while (*history.get_by_stage(insert_stage) == MatchKindCMP::EQ) { + assert(insert_stage != STAGE_BOTTOM && "insert conflict!"); + --insert_stage; + } + + if (history.is_GT()) { + if (position.is_end()) { + // no need to compensate insert position + assert(insert_stage <= STAGE && "impossible insert stage"); + } else if (position == position_t::begin()) { + // I must be short-circuited by staged::smallest_result() + // in staged::lower_bound(), so we need to rely on mstat instead + assert(mstat >= MSTAT_LT0 && mstat <= MSTAT_LT3); + if (mstat == MSTAT_LT0) { + insert_stage = STAGE_RIGHT; + } else if (mstat == MSTAT_LT1) { + insert_stage = STAGE_STRING; + } else { + insert_stage = STAGE_LEFT; + } + // XXX(multi-type): need to upgrade node type before inserting an + // incompatible index at front. 
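As a worked illustration of how evaluate_insert() above turns the per-stage match history into an insert stage (this sketch is not part of the upstream commit; the demo_* names and the 3-element array are simplified stand-ins for MatchHistory):

#include <array>
#include <cassert>

enum demo_cmp_t { LT, EQ, GT };
constexpr unsigned DEMO_STAGE_RIGHT = 0, DEMO_STAGE_STRING = 1, DEMO_STAGE_LEFT = 2;

// Walk from the top stage down while the history says EQ; the first non-equal
// stage is where a new entry has to be opened.
unsigned demo_insert_stage(const std::array<demo_cmp_t, 3>& history_by_stage) {
  unsigned stage = DEMO_STAGE_LEFT;   // STAGE_TOP
  while (history_by_stage[stage] == EQ) {
    assert(stage != DEMO_STAGE_RIGHT && "insert conflict: full key already exists");
    --stage;
  }
  return stage;
}

int main() {
  // shard/pool/crush and ns/oid matched, snap/gen differed:
  // only a fresh STAGE_RIGHT entry is needed.
  assert(demo_insert_stage({LT, EQ, EQ}) == DEMO_STAGE_RIGHT);
  // nothing matched at the top stage: a whole new STAGE_LEFT entry is needed.
  assert(demo_insert_stage({GT, LT, LT}) == DEMO_STAGE_LEFT);
}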
+ assert(insert_stage <= STAGE && "incompatible insert"); + } else { + assert(insert_stage <= STAGE && "impossible insert stage"); + [[maybe_unused]] bool ret = compensate_insert_position_at(insert_stage, position); + assert(!ret); + } + } + + if (position.is_end()) { + patch_insert_end(position, insert_stage); + } + + node_offset_t insert_size = insert_size_at(insert_stage, key, value); + + return {insert_stage, insert_size}; + } + + template <KeyT KT> + static const value_t* insert_new( + NodeExtentMutable& mut, const memory_range_t& range, + const full_key_t<KT>& key, const value_input_t& value) { + char* p_insert = const_cast<char*>(range.p_end); + const value_t* p_value = nullptr; + StagedAppender<KT> appender; + appender.init_empty(&mut, p_insert); + appender.append(key, value, p_value); + [[maybe_unused]] const char* p_insert_front = appender.wrap(); + assert(p_insert_front == range.p_start); + return p_value; + } + + template <KeyT KT, bool SPLIT> + static const value_t* proceed_insert_recursively( + NodeExtentMutable& mut, const container_t& container, + const full_key_t<KT>& key, const value_input_t& value, + position_t& position, match_stage_t& stage, + node_offset_t& _insert_size, const char* p_left_bound) { + // proceed insert from right to left + assert(stage <= STAGE); + auto iter = iterator_t(container); + auto& index = position.index; + + bool do_insert = false; + if (stage == STAGE) { + if (index == INDEX_END) { + iter.seek_last(); + iter.set_end(); + index = iter.index(); + } else { + assert(is_valid_index(index)); + iter.seek_till_end(index); + } + do_insert = true; + } else { // stage < STAGE + if (index == INDEX_LAST) { + iter.seek_last(); + index = iter.index(); + } else { + assert(is_valid_index(index)); + iter.seek_till_end(index); + } + if constexpr (SPLIT) { + if (iter.is_end()) { + // insert at the higher stage due to split + do_insert = true; + _insert_size = insert_size(key, value); + stage = STAGE; + } + } else { + assert(!iter.is_end()); + } + } + + if (do_insert) { + if constexpr (!IS_BOTTOM) { + position.nxt = position_t::nxt_t::begin(); + } + assert(_insert_size == insert_size(key, value)); + if constexpr (IS_BOTTOM) { + return iter.insert( + mut, key, value, _insert_size, p_left_bound); + } else { + auto range = iter.insert_prefix( + mut, key, _insert_size, p_left_bound); + return NXT_STAGE_T::template insert_new<KT>(mut, range, key, value); + } + } else { + if constexpr (!IS_BOTTOM) { + auto nxt_container = iter.get_nxt_container(); + auto p_value = NXT_STAGE_T::template proceed_insert_recursively<KT, SPLIT>( + mut, nxt_container, key, value, + position.nxt, stage, _insert_size, p_left_bound); + iter.update_size(mut, _insert_size); + return p_value; + } else { + ceph_abort("impossible path"); + } + } + } + + template <KeyT KT, bool SPLIT> + static const value_t* proceed_insert( + NodeExtentMutable& mut, const container_t& container, + const full_key_t<KT>& key, const value_input_t& value, + position_t& position, match_stage_t& stage, node_offset_t& _insert_size) { + auto p_left_bound = container.p_left_bound(); + if (unlikely(!container.keys())) { + if (position.is_end()) { + position = position_t::begin(); + assert(stage == STAGE); + assert(_insert_size == insert_size(key, value)); + } else if (position == position_t::begin()) { + // when insert into a trimmed and empty left node + stage = STAGE; + _insert_size = insert_size(key, value); + } else { + ceph_abort("impossible path"); + } + if constexpr (IS_BOTTOM) { + return container_t::insert_at( + mut, 
container, key, value, 0, _insert_size, p_left_bound); + } else { + auto range = container_t::template insert_prefix_at( + mut, container, key, 0, _insert_size, p_left_bound); + return NXT_STAGE_T::template insert_new<KT>(mut, range, key, value); + } + } else { + return proceed_insert_recursively<KT, SPLIT>( + mut, container, key, value, + position, stage, _insert_size, p_left_bound); + } + } + + static std::ostream& dump(const container_t& container, + std::ostream& os, + const std::string& prefix, + size_t& size, + const char* p_start) { + auto iter = iterator_t(container); + assert(!iter.is_end()); + std::string prefix_blank(prefix.size(), ' '); + const std::string* p_prefix = &prefix; + size += iterator_t::header_size(); + do { + std::ostringstream sos; + sos << *p_prefix << iter.get_key() << ": "; + std::string i_prefix = sos.str(); + if constexpr (!IS_BOTTOM) { + auto nxt_container = iter.get_nxt_container(); + size += iter.size_to_nxt(); + NXT_STAGE_T::dump(nxt_container, os, i_prefix, size, p_start); + } else { + auto value_ptr = iter.get_p_value(); + int offset = reinterpret_cast<const char*>(value_ptr) - p_start; + size += iter.size(); + os << "\n" << i_prefix; + if constexpr (NODE_TYPE == node_type_t::LEAF) { + os << *value_ptr; + } else { + os << "0x" << std::hex << value_ptr->value << std::dec; + } + os << " " << size << "B" + << " @" << offset << "B"; + } + if (iter.is_last()) { + break; + } else { + ++iter; + p_prefix = &prefix_blank; + } + } while (true); + return os; + } + + static void validate(const container_t& container) { + auto iter = iterator_t(container); + assert(!iter.is_end()); + auto key = iter.get_key(); + do { + if constexpr (!IS_BOTTOM) { + auto nxt_container = iter.get_nxt_container(); + NXT_STAGE_T::validate(nxt_container); + } + if (iter.is_last()) { + break; + } else { + ++iter; + assert(key < iter.get_key()); + key = iter.get_key(); + } + } while (true); + } + + static void get_stats(const container_t& container, node_stats_t& stats, + key_view_t& index_key) { + auto iter = iterator_t(container); + assert(!iter.is_end()); + stats.size_overhead += iterator_t::header_size(); + do { + index_key.replace(iter.get_key()); + stats.size_overhead += iter.size_overhead(); + if constexpr (!IS_BOTTOM) { + auto nxt_container = iter.get_nxt_container(); + NXT_STAGE_T::get_stats(nxt_container, stats, index_key); + } else { + ++stats.num_kvs; + size_t kv_logical_size = index_key.size_logical(); + size_t value_size; + if constexpr (NODE_TYPE == node_type_t::LEAF) { + value_size = iter.get_p_value()->allocation_size(); + } else { + value_size = sizeof(value_t); + } + stats.size_value += value_size; + kv_logical_size += value_size; + stats.size_logical += kv_logical_size; + } + if (iter.is_last()) { + break; + } else { + ++iter; + } + } while (true); + } + + template <bool GET_KEY, bool GET_VAL> + static bool get_next_slot( + const container_t& container, // IN + position_t& pos, // IN&OUT + key_view_t* p_index_key, // OUT + const value_t** pp_value) { // OUT + auto iter = iterator_t(container); + assert(!iter.is_end()); + iter.seek_at(pos.index); + bool find_next; + if constexpr (!IS_BOTTOM) { + auto nxt_container = iter.get_nxt_container(); + find_next = NXT_STAGE_T::template get_next_slot<GET_KEY, GET_VAL>( + nxt_container, pos.nxt, p_index_key, pp_value); + } else { + find_next = true; + } + + if (find_next) { + if (iter.is_last()) { + return true; + } else { + pos.index = iter.index() + 1; + if constexpr (!IS_BOTTOM) { + pos.nxt = NXT_STAGE_T::position_t::begin(); + 
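// moving to the next index at this stage restarts the lower-stage position at + // its beginning before get_slot() below fills in the requested key/value +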
} + get_slot<GET_KEY, GET_VAL>( + container, pos, p_index_key, pp_value); + return false; + } + } else { // !find_next && !IS_BOTTOM + if constexpr (GET_KEY) { + assert(p_index_key); + p_index_key->set(iter.get_key()); + } else { + assert(!p_index_key); + } + return false; + } + } + + template <bool GET_KEY, bool GET_VAL> + static void get_prev_slot( + const container_t& container, // IN + position_t& pos, // IN&OUT + key_view_t* p_index_key, // OUT + const value_t** pp_value) { // OUT + assert(pos != position_t::begin()); + assert(!pos.is_end()); + auto& index = pos.index; + auto iter = iterator_t(container); + if constexpr (!IS_BOTTOM) { + auto& nxt_pos = pos.nxt; + if (nxt_pos == NXT_STAGE_T::position_t::begin()) { + assert(index); + --index; + iter.seek_at(index); + auto nxt_container = iter.get_nxt_container(); + NXT_STAGE_T::template get_largest_slot<true, GET_KEY, GET_VAL>( + nxt_container, &nxt_pos, p_index_key, pp_value); + } else { + iter.seek_at(index); + auto nxt_container = iter.get_nxt_container(); + NXT_STAGE_T::template get_prev_slot<GET_KEY, GET_VAL>( + nxt_container, nxt_pos, p_index_key, pp_value); + } + } else { + assert(index); + --index; + iter.seek_at(index); + if constexpr (GET_VAL) { + assert(pp_value); + *pp_value = iter.get_p_value(); + } else { + assert(!pp_value); + } + } + if constexpr (GET_KEY) { + p_index_key->set(iter.get_key()); + } else { + assert(!p_index_key); + } + } + + struct _BaseEmpty {}; + class _BaseWithNxtIterator { + protected: + typename NXT_STAGE_T::StagedIterator _nxt; + }; + class StagedIterator + : std::conditional_t<IS_BOTTOM, _BaseEmpty, _BaseWithNxtIterator> { + public: + StagedIterator() = default; + bool valid() const { return iter.has_value(); } + index_t index() const { + return iter->index(); + } + bool is_end() const { return iter->is_end(); } + bool in_progress() const { + assert(valid()); + assert(!is_end()); + if constexpr (!IS_BOTTOM) { + if (this->_nxt.valid()) { + if (this->_nxt.index() == 0) { + return this->_nxt.in_progress(); + } else { + return true; + } + } else { + return false; + } + } else { + return false; + } + } + key_get_type get_key() const { return iter->get_key(); } + + iterator_t& get() { return *iter; } + void set(const container_t& container) { + assert(!valid()); + iter = iterator_t(container); + } + void set_end() { iter->set_end(); } + typename NXT_STAGE_T::StagedIterator& nxt() { + if constexpr (!IS_BOTTOM) { + if (!this->_nxt.valid()) { + auto nxt_container = iter->get_nxt_container(); + this->_nxt.set(nxt_container); + } + return this->_nxt; + } else { + ceph_abort("impossible path"); + } + } + typename NXT_STAGE_T::StagedIterator& get_nxt() { + if constexpr (!IS_BOTTOM) { + return this->_nxt; + } else { + ceph_abort("impossible path"); + } + } + StagedIterator& operator++() { + if (iter->is_last()) { + iter->set_end(); + } else { + ++(*iter); + } + if constexpr (!IS_BOTTOM) { + this->_nxt.reset(); + } + return *this; + } + void reset() { + if (valid()) { + iter.reset(); + if constexpr (!IS_BOTTOM) { + this->_nxt.reset(); + } + } + } + + template<typename OutputIt> + auto do_format_to(OutputIt out, bool is_top) const { + if (valid()) { + if (iter->is_end()) { + return fmt::format_to(out, "END"); + } else { + out = fmt::format_to(out, "{}", index()); + } + } else { + if (is_top) { + return fmt::format_to(out, "invalid StagedIterator!"); + } else { + out = fmt::format_to(out, "0!"); + } + } + if constexpr (!IS_BOTTOM) { + out = fmt::format_to(out, ", "); + return this->_nxt.do_format_to(out, false); + } 
else { + return out; + } + } + + position_t get_pos() const { + if (valid()) { + if constexpr (IS_BOTTOM) { + return position_t{index()}; + } else { + return position_t{index(), this->_nxt.get_pos()}; + } + } else { + return position_t::begin(); + } + } + void encode(const char* p_node_start, ceph::bufferlist& encoded) const { + uint8_t present = static_cast<bool>(iter); + ceph::encode(present, encoded); + if (iter.has_value()) { + iter->encode(p_node_start, encoded); + if constexpr (!IS_BOTTOM) { + this->_nxt.encode(p_node_start, encoded); + } + } + } + static StagedIterator decode(const char* p_node_start, + extent_len_t node_size, + ceph::bufferlist::const_iterator& delta) { + StagedIterator ret; + uint8_t present; + ceph::decode(present, delta); + if (present) { + ret.iter = iterator_t::decode( + p_node_start, node_size, delta); + if constexpr (!IS_BOTTOM) { + ret._nxt = NXT_STAGE_T::StagedIterator::decode( + p_node_start, node_size, delta); + } + } + return ret; + } + private: + std::optional<iterator_t> iter; + }; + + static bool recursively_locate_split( + size_t& current_size, size_t extra_size, + size_t target_size, StagedIterator& split_at) { + assert(current_size <= target_size); + iterator_t& split_iter = split_at.get(); + current_size = split_iter.seek_split(current_size, extra_size, target_size); + assert(current_size <= target_size); + assert(!split_iter.is_end()); + if (split_iter.index() == 0) { + extra_size += iterator_t::header_size(); + } else { + extra_size = 0; + } + bool locate_nxt; + if constexpr (!IS_BOTTOM) { + locate_nxt = NXT_STAGE_T::recursively_locate_split( + current_size, extra_size + split_iter.size_to_nxt(), + target_size, split_at.nxt()); + } else { // IS_BOTTOM + // located upper_bound, fair split strategy + size_t nxt_size = split_iter.size() + extra_size; + assert(current_size + nxt_size > target_size); + if (current_size + nxt_size/2 < target_size) { + // include next + current_size += nxt_size; + locate_nxt = true; + } else { + // exclude next + locate_nxt = false; + } + } + if (locate_nxt) { + if (split_iter.is_last()) { + return true; + } else { + ++split_at; + return false; + } + } else { + return false; + } + } + + static bool recursively_locate_split_inserted( + size_t& current_size, size_t extra_size, size_t target_size, + position_t& insert_pos, match_stage_t insert_stage, size_t insert_size, + std::optional<bool>& is_insert_left, StagedIterator& split_at) { + assert(current_size <= target_size); + assert(!is_insert_left.has_value()); + iterator_t& split_iter = split_at.get(); + auto& insert_index = insert_pos.index; + if (insert_stage == STAGE) { + current_size = split_iter.template seek_split_inserted<true>( + current_size, extra_size, target_size, + insert_index, insert_size, is_insert_left); + assert(is_insert_left.has_value()); + assert(current_size <= target_size); + if (split_iter.index() == 0) { + if (insert_index == 0) { + if (*is_insert_left == false) { + extra_size += iterator_t::header_size(); + } else { + extra_size = 0; + } + } else { + extra_size += iterator_t::header_size(); + } + } else { + extra_size = 0; + } + if (*is_insert_left == false && split_iter.index() == insert_index) { + // split_iter can be end + // found the lower-bound of target_size + // ...[s_index-1] |!| (i_index) [s_index]... 
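+ // (in these diagrams, [k] is the existing item at index k, (i_index) is the + // entry being inserted, and |!| marks the candidate split boundary)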
+ + // located upper-bound, fair split strategy + // look at the next slot (the insert item) + size_t nxt_size = insert_size + extra_size; + assert(current_size + nxt_size > target_size); + if (current_size + nxt_size/2 < target_size) { + // include next + *is_insert_left = true; + current_size += nxt_size; + if (split_iter.is_end()) { + // ...[s_index-1] (i_index) |!| + return true; + } else { + return false; + } + } else { + // exclude next + return false; + } + } else { + // Already considered insert effect in the current stage. + // Look into the next stage to identify the target_size lower-bound w/o + // insert effect. + assert(!split_iter.is_end()); + bool locate_nxt; + if constexpr (!IS_BOTTOM) { + locate_nxt = NXT_STAGE_T::recursively_locate_split( + current_size, extra_size + split_iter.size_to_nxt(), + target_size, split_at.nxt()); + } else { // IS_BOTTOM + // located upper-bound, fair split strategy + // look at the next slot + size_t nxt_size = split_iter.size() + extra_size; + assert(current_size + nxt_size > target_size); + if (current_size + nxt_size/2 < target_size) { + // include next + current_size += nxt_size; + locate_nxt = true; + } else { + // exclude next + locate_nxt = false; + } + } + if (locate_nxt) { + if (split_iter.is_last()) { + auto end_index = split_iter.index() + 1; + if (insert_index == INDEX_END) { + insert_index = end_index; + } + assert(insert_index <= end_index); + if (insert_index == end_index) { + assert(*is_insert_left == false); + split_iter.set_end(); + // ...[s_index-1] |!| (i_index) + return false; + } else { + assert(*is_insert_left == true); + return true; + } + } else { + ++split_at; + return false; + } + } else { + return false; + } + } + } else { + if constexpr (!IS_BOTTOM) { + assert(insert_stage < STAGE); + current_size = split_iter.template seek_split_inserted<false>( + current_size, extra_size, target_size, + insert_index, insert_size, is_insert_left); + assert(!split_iter.is_end()); + assert(current_size <= target_size); + if (split_iter.index() == 0) { + extra_size += iterator_t::header_size(); + } else { + extra_size = 0; + } + bool locate_nxt; + if (!is_insert_left.has_value()) { + // Considered insert effect in the current stage, and insert happens + // in the lower stage. + // Look into the next stage to identify the target_size lower-bound w/ + // insert effect. + assert(split_iter.index() == insert_index); + locate_nxt = NXT_STAGE_T::recursively_locate_split_inserted( + current_size, extra_size + split_iter.size_to_nxt(), target_size, + insert_pos.nxt, insert_stage, insert_size, + is_insert_left, split_at.nxt()); + assert(is_insert_left.has_value()); +#ifndef NDEBUG + if (locate_nxt) { + assert(*is_insert_left == true); + } +#endif + } else { + // is_insert_left.has_value() == true + // Insert will *not* happen in the lower stage. 
+ // Need to look into the next stage to identify the target_size + // lower-bound w/ insert effect + assert(split_iter.index() != insert_index); + locate_nxt = NXT_STAGE_T::recursively_locate_split( + current_size, extra_size + split_iter.size_to_nxt(), + target_size, split_at.nxt()); +#ifndef NDEBUG + if (split_iter.index() < insert_index) { + assert(*is_insert_left == false); + } else { + assert(*is_insert_left == true); + } +#endif + } + if (locate_nxt) { + if (split_iter.is_last()) { + return true; + } else { + ++split_at; + return false; + } + } else { + return false; + } + } else { + ceph_abort("impossible path"); + return false;; + } + } + } + + /* + * container appender type system + * container_t::Appender(NodeExtentMutable& mut, char* p_append) + * append(const container_t& src, index_t from, index_t items) + * wrap() -> char* + * IF !IS_BOTTOM: + * open_nxt(const key_get_type&) + * open_nxt(const full_key_t&) + * -> std::tuple<NodeExtentMutable&, char*> + * wrap_nxt(char* p_append) + * ELSE + * append(const full_key_t& key, const value_input_t& value) + */ + template <KeyT KT> + struct _BaseWithNxtAppender { + typename NXT_STAGE_T::template StagedAppender<KT> _nxt; + }; + template <KeyT KT> + class StagedAppender + : std::conditional_t<IS_BOTTOM, _BaseEmpty, _BaseWithNxtAppender<KT>> { + public: + StagedAppender() = default; + ~StagedAppender() { + assert(!require_wrap_nxt); + assert(!valid()); + } + bool valid() const { return appender.has_value(); } + index_t index() const { + assert(valid()); + return _index; + } + bool in_progress() const { return require_wrap_nxt; } + // TODO: pass by reference + void init_empty(NodeExtentMutable* p_mut, char* p_start) { + assert(!valid()); + appender = typename container_t::template Appender<KT>(p_mut, p_start); + _index = 0; + } + void init_tail(NodeExtentMutable* p_mut, + const container_t& container, + match_stage_t stage) { + assert(!valid()); + auto iter = iterator_t(container); + iter.seek_last(); + if (stage == STAGE) { + appender = iter.template get_appender<KT>(p_mut); + _index = iter.index() + 1; + if constexpr (!IS_BOTTOM) { + assert(!this->_nxt.valid()); + } + } else { + assert(stage < STAGE); + if constexpr (!IS_BOTTOM) { + appender = iter.template get_appender_opened<KT>(p_mut); + _index = iter.index(); + require_wrap_nxt = true; + auto nxt_container = iter.get_nxt_container(); + this->_nxt.init_tail(p_mut, nxt_container, stage); + } else { + ceph_abort("impossible path"); + } + } + } + // possible to make src_iter end if to_index == INDEX_END + void append_until(StagedIterator& src_iter, index_t& to_index) { + assert(!require_wrap_nxt); + auto s_index = src_iter.index(); + src_iter.get().template copy_out_until<KT>(*appender, to_index); + assert(src_iter.index() == to_index); + assert(to_index >= s_index); + auto increment = (to_index - s_index); + if (increment) { + _index += increment; + if constexpr (!IS_BOTTOM) { + src_iter.get_nxt().reset(); + } + } + } + void append(const full_key_t<KT>& key, + const value_input_t& value, const value_t*& p_value) { + assert(!require_wrap_nxt); + if constexpr (!IS_BOTTOM) { + auto& nxt = open_nxt(key); + nxt.append(key, value, p_value); + wrap_nxt(); + } else { + appender->append(key, value, p_value); + ++_index; + } + } + char* wrap() { + assert(valid()); + assert(_index > 0); + if constexpr (!IS_BOTTOM) { + if (require_wrap_nxt) { + wrap_nxt(); + } + } + auto ret = appender->wrap(); + appender.reset(); + return ret; + } + typename NXT_STAGE_T::template StagedAppender<KT>& + 
open_nxt(key_get_type paritial_key) { + assert(!require_wrap_nxt); + if constexpr (!IS_BOTTOM) { + require_wrap_nxt = true; + auto [p_mut, p_append] = appender->open_nxt(paritial_key); + this->_nxt.init_empty(p_mut, p_append); + return this->_nxt; + } else { + ceph_abort("impossible path"); + } + } + typename NXT_STAGE_T::template StagedAppender<KT>& + open_nxt(const full_key_t<KT>& key) { + assert(!require_wrap_nxt); + if constexpr (!IS_BOTTOM) { + require_wrap_nxt = true; + auto [p_mut, p_append] = appender->open_nxt(key); + this->_nxt.init_empty(p_mut, p_append); + return this->_nxt; + } else { + ceph_abort("impossible path"); + } + } + typename NXT_STAGE_T::template StagedAppender<KT>& get_nxt() { + if constexpr (!IS_BOTTOM) { + assert(require_wrap_nxt); + return this->_nxt; + } else { + ceph_abort("impossible path"); + } + } + void wrap_nxt() { + if constexpr (!IS_BOTTOM) { + assert(require_wrap_nxt); + require_wrap_nxt = false; + auto p_append = this->_nxt.wrap(); + appender->wrap_nxt(p_append); + ++_index; + } else { + ceph_abort("impossible path"); + } + } + private: + std::optional<typename container_t::template Appender<KT>> appender; + index_t _index; + bool require_wrap_nxt = false; + }; + + template <KeyT KT> + static void _append_range( + StagedIterator& src_iter, StagedAppender<KT>& appender, index_t& to_index) { + if (src_iter.is_end()) { + // append done + assert(to_index == INDEX_END); + to_index = src_iter.index(); + } else if constexpr (!IS_BOTTOM) { + if (appender.in_progress()) { + // appender has appended something at the current item, + // cannot append the current item as-a-whole + index_t to_index_nxt = INDEX_END; + NXT_STAGE_T::template _append_range<KT>( + src_iter.nxt(), appender.get_nxt(), to_index_nxt); + ++src_iter; + appender.wrap_nxt(); + } else if (src_iter.in_progress()) { + // src_iter is not at the beginning of the current item, + // cannot append the current item as-a-whole + index_t to_index_nxt = INDEX_END; + NXT_STAGE_T::template _append_range<KT>( + src_iter.get_nxt(), appender.open_nxt(src_iter.get_key()), to_index_nxt); + ++src_iter; + appender.wrap_nxt(); + } else { + // we can safely append the current item as-a-whole + } + } + appender.append_until(src_iter, to_index); + } + + template <KeyT KT> + static void _append_into(StagedIterator& src_iter, StagedAppender<KT>& appender, + position_t& position, match_stage_t stage) { + assert(position.index == src_iter.index()); + // reaches the last item + if (stage == STAGE) { + // done, end recursion + if constexpr (!IS_BOTTOM) { + position.nxt = position_t::nxt_t::begin(); + } + } else { + assert(stage < STAGE); + // proceed append in the next stage + NXT_STAGE_T::template append_until<KT>( + src_iter.nxt(), appender.open_nxt(src_iter.get_key()), + position.nxt, stage); + } + } + + template <KeyT KT> + static void append_until(StagedIterator& src_iter, StagedAppender<KT>& appender, + position_t& position, match_stage_t stage) { + index_t from_index = src_iter.index(); + index_t& to_index = position.index; + assert(from_index <= to_index); + if constexpr (IS_BOTTOM) { + assert(stage == STAGE); + appender.append_until(src_iter, to_index); + } else { + assert(stage <= STAGE); + if (src_iter.index() == to_index) { + _append_into<KT>(src_iter, appender, position, stage); + } else { + if (to_index == INDEX_END) { + assert(stage == STAGE); + } else if (to_index == INDEX_LAST) { + assert(stage < STAGE); + } + _append_range<KT>(src_iter, appender, to_index); + _append_into<KT>(src_iter, appender, position, 
stage); + } + } + to_index -= from_index; + } + + template <KeyT KT> + static bool append_insert( + const full_key_t<KT>& key, const value_input_t& value, + StagedIterator& src_iter, StagedAppender<KT>& appender, + bool is_front_insert, match_stage_t& stage, const value_t*& p_value) { + assert(src_iter.valid()); + if (stage == STAGE) { + appender.append(key, value, p_value); + if (src_iter.is_end()) { + return true; + } else { + return false; + } + } else { + assert(stage < STAGE); + if constexpr (!IS_BOTTOM) { + auto nxt_is_end = NXT_STAGE_T::template append_insert<KT>( + key, value, src_iter.get_nxt(), appender.get_nxt(), + is_front_insert, stage, p_value); + if (nxt_is_end) { + appender.wrap_nxt(); + ++src_iter; + if (is_front_insert) { + stage = STAGE; + } + if (src_iter.is_end()) { + return true; + } + } + return false; + } else { + ceph_abort("impossible path"); + } + } + } + + /* TrimType: + * BEFORE: remove the entire container, normally means the corresponding higher + * stage iterator needs to be trimmed as-a-whole. + * AFTER: retain the entire container, normally means the trim should + * start from the next iterator at the higher stage. + * AT: trim happens in the current container, and the corresponding higher + * stage iterator needs to be adjusted by the trimmed size. + */ + static std::tuple<TrimType, node_offset_t> + recursively_trim(NodeExtentMutable& mut, StagedIterator& trim_at) { + if (!trim_at.valid()) { + return {TrimType::BEFORE, 0u}; + } + if (trim_at.is_end()) { + return {TrimType::AFTER, 0u}; + } + + auto& iter = trim_at.get(); + if constexpr (!IS_BOTTOM) { + auto [type, trimmed] = NXT_STAGE_T::recursively_trim( + mut, trim_at.get_nxt()); + node_offset_t trim_size; + if (type == TrimType::AFTER) { + if (iter.is_last()) { + return {TrimType::AFTER, 0u}; + } + ++trim_at; + trim_size = iter.trim_until(mut); + } else if (type == TrimType::BEFORE) { + if (iter.index() == 0) { + return {TrimType::BEFORE, 0u}; + } + trim_size = iter.trim_until(mut); + } else { + trim_size = iter.trim_at(mut, trimmed); + } + return {TrimType::AT, trim_size}; + } else { + if (iter.index() == 0) { + return {TrimType::BEFORE, 0u}; + } else { + auto trimmed = iter.trim_until(mut); + return {TrimType::AT, trimmed}; + } + } + } + + static void trim(NodeExtentMutable& mut, StagedIterator& trim_at) { + auto [type, trimmed] = recursively_trim(mut, trim_at); + if (type == TrimType::BEFORE) { + assert(trim_at.valid()); + auto& iter = trim_at.get(); + iter.trim_until(mut); + } + } + + static std::optional<std::tuple<match_stage_t, node_offset_t, bool>> + proceed_erase_recursively( + NodeExtentMutable& mut, + const container_t& container, // IN + const char* p_left_bound, // IN + position_t& pos) { // IN&OUT + auto iter = iterator_t(container); + auto& index = pos.index; + assert(is_valid_index(index)); + iter.seek_at(index); + bool is_last = iter.is_last(); + + if constexpr (!IS_BOTTOM) { + auto nxt_container = iter.get_nxt_container(); + auto ret = NXT_STAGE_T::proceed_erase_recursively( + mut, nxt_container, p_left_bound, pos.nxt); + if (ret.has_value()) { + // erased at lower level + auto [r_stage, r_erase_size, r_done] = *ret; + assert(r_erase_size != 0); + iter.update_size(mut, -r_erase_size); + if (r_done) { + // done, the next_pos is calculated + return ret; + } else { + if (is_last) { + // need to find the next pos at upper stage + return ret; + } else { + // done, calculate the next pos + ++index; + pos.nxt = NXT_STAGE_T::position_t::begin(); + return {{r_stage, r_erase_size, true}}; + } + } + }
+ // not erased at lower level + } + + // not erased yet + if (index == 0 && is_last) { + // need to erase from the upper stage + return std::nullopt; + } else { + auto erase_size = iter.erase(mut, p_left_bound); + assert(erase_size != 0); + if (is_last) { + // need to find the next pos at upper stage + return {{STAGE, erase_size, false}}; + } else { + // done, calculate the next pos (should be correct already) + if constexpr (!IS_BOTTOM) { + assert(pos.nxt == NXT_STAGE_T::position_t::begin()); + } + return {{STAGE, erase_size, true}}; + } + } + } + + static match_stage_t erase( + NodeExtentMutable& mut, + const container_t& node_stage, // IN + position_t& erase_pos) { // IN&OUT + auto p_left_bound = node_stage.p_left_bound(); + auto ret = proceed_erase_recursively( + mut, node_stage, p_left_bound, erase_pos); + if (ret.has_value()) { + auto [r_stage, r_erase_size, r_done] = *ret; + std::ignore = r_erase_size; + if (r_done) { + assert(!erase_pos.is_end()); + return r_stage; + } else { + // erased the last kv + erase_pos = position_t::end(); + return r_stage; + } + } else { + assert(node_stage.keys() == 1); + node_stage.erase_at(mut, node_stage, 0, p_left_bound); + erase_pos = position_t::end(); + return STAGE; + } + } + + static std::tuple<match_stage_t, node_offset_t> evaluate_merge( + const key_view_t& left_pivot_index, + const container_t& right_container) { + auto r_iter = iterator_t(right_container); + r_iter.seek_at(0); + node_offset_t compensate = r_iter.header_size(); + auto cmp = left_pivot_index <=> r_iter.get_key(); + if (cmp == std::strong_ordering::equal) { + if constexpr (!IS_BOTTOM) { + // the index is equal, compensate and look at the lower stage + compensate += r_iter.size_to_nxt(); + auto r_nxt_container = r_iter.get_nxt_container(); + auto [ret_stage, ret_compensate] = NXT_STAGE_T::evaluate_merge( + left_pivot_index, r_nxt_container); + compensate += ret_compensate; + return {ret_stage, compensate}; + } else { + ceph_abort("impossible path: left_pivot_key == right_first_key"); + } + } else if (cmp == std::strong_ordering::less) { + // ok, do merge here + return {STAGE, compensate}; + } else { + ceph_abort("impossible path: left_pivot_key > right_first_key"); + } + } +}; + +/** + * Configurations for struct staged + * + * staged_params_* assembles different container_t implementations (defined by + * staged::_iterator_t) by STAGE, and constructs the final multi-stage + * implementations for different node layouts defined by + * node_extent_t<FieldType, NODE_TYPE>. + * + * The specialized implementations for different layouts are accessible through + * the helper type node_to_stage_t<node_extent_t<FieldType, NODE_TYPE>>.
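+ * + * Each staged<Params> layer implements lookup, insert, split, merge and erase + * for its own stage, recursing into Params::next_param_t for the stages below.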
+ * + * Specifically, the settings of 8 layouts are: + * + * The layout (N0, LEAF/INTERNAL) has 3 stages: + * - STAGE_LEFT: node_extent_t<node_fields_0_t, LEAF/INTERNAL> + * - STAGE_STRING: item_iterator_t<LEAF/INTERNAL> + * - STAGE_RIGHT: sub_items_t<LEAF/INTERNAL> + * + * The layout (N1, LEAF/INTERNAL) has 3 stages: + * - STAGE_LEFT: node_extent_t<node_fields_1_t, LEAF/INTERNAL> + * - STAGE_STRING: item_iterator_t<LEAF/INTERNAL> + * - STAGE_RIGHT: sub_items_t<LEAF/INTERNAL> + * + * The layout (N2, LEAF/INTERNAL) has 2 stages: + * - STAGE_STRING: node_extent_t<node_fields_2_t, LEAF/INTERNAL> + * - STAGE_RIGHT: sub_items_t<LEAF/INTERNAL> + * + * The layout (N3, LEAF) has 1 stage: + * - STAGE_RIGHT: node_extent_t<leaf_fields_3_t, LEAF> + * + * The layout (N3, INTERNAL) has 1 stage: + * - STAGE_RIGHT: node_extent_t<internal_fields_3_t, INTERNAL> + */ + +template <node_type_t _NODE_TYPE> +struct staged_params_subitems { + using container_t = sub_items_t<_NODE_TYPE>; + static constexpr auto NODE_TYPE = _NODE_TYPE; + static constexpr auto STAGE = STAGE_RIGHT; + + // dummy type in order to make our type system work + // any better solution to get rid of this? + using next_param_t = staged_params_subitems<NODE_TYPE>; +}; + +template <node_type_t _NODE_TYPE> +struct staged_params_item_iterator { + using container_t = item_iterator_t<_NODE_TYPE>; + static constexpr auto NODE_TYPE = _NODE_TYPE; + static constexpr auto STAGE = STAGE_STRING; + + using next_param_t = staged_params_subitems<NODE_TYPE>; +}; + +template <typename NodeType> +struct staged_params_node_01 { + using container_t = NodeType; + static constexpr auto NODE_TYPE = NodeType::NODE_TYPE; + static constexpr auto STAGE = STAGE_LEFT; + + using next_param_t = staged_params_item_iterator<NODE_TYPE>; +}; + +template <typename NodeType> +struct staged_params_node_2 { + using container_t = NodeType; + static constexpr auto NODE_TYPE = NodeType::NODE_TYPE; + static constexpr auto STAGE = STAGE_STRING; + + using next_param_t = staged_params_subitems<NODE_TYPE>; +}; + +template <typename NodeType> +struct staged_params_node_3 { + using container_t = NodeType; + static constexpr auto NODE_TYPE = NodeType::NODE_TYPE; + static constexpr auto STAGE = STAGE_RIGHT; + + // dummy type in order to make our type system work + // any better solution to get rid of this? 
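+ // (it only exists to satisfy staged<>'s NXT_STAGE_T lookup; every use is + // guarded by the IS_BOTTOM branches, so the recursion never follows it)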
+ using next_param_t = staged_params_node_3<NodeType>; +}; + +template <typename NodeType, typename Enable = void> struct _node_to_stage_t; +template <typename NodeType> +struct _node_to_stage_t<NodeType, + std::enable_if_t<NodeType::FIELD_TYPE == field_type_t::N0 || + NodeType::FIELD_TYPE == field_type_t::N1>> { + using type = staged<staged_params_node_01<NodeType>>; +}; +template <typename NodeType> +struct _node_to_stage_t<NodeType, + std::enable_if_t<NodeType::FIELD_TYPE == field_type_t::N2>> { + using type = staged<staged_params_node_2<NodeType>>; +}; +template <typename NodeType> +struct _node_to_stage_t<NodeType, + std::enable_if_t<NodeType::FIELD_TYPE == field_type_t::N3>> { + using type = staged<staged_params_node_3<NodeType>>; +}; +template <typename NodeType> +using node_to_stage_t = typename _node_to_stage_t<NodeType>::type; + +} + +template<typename T> +concept HasDoFormatTo = requires(T x, std::back_insert_iterator<fmt::memory_buffer> out) { + { x.do_format_to(out, true) } -> std::same_as<decltype(out)>; +}; +template <HasDoFormatTo T> struct fmt::formatter<T> : fmt::formatter<std::string_view> { + template <typename FormatContext> + auto format(const T& staged_iterator, FormatContext& ctx) { + return staged_iterator.do_format_to(ctx.out(), true); + } +}; diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage_types.h b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage_types.h new file mode 100644 index 000000000..3c1b32a41 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage_types.h @@ -0,0 +1,442 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <cassert> +#include <optional> +#include <ostream> + +#include "crimson/os/seastore/onode_manager/staged-fltree/fwd.h" +#include "crimson/os/seastore/onode_manager/staged-fltree/node_types.h" +#include "crimson/os/seastore/onode_manager/staged-fltree/value.h" + +namespace crimson::os::seastore::onode { + +using match_stage_t = int8_t; +constexpr match_stage_t STAGE_LEFT = 2; // shard/pool/crush +constexpr match_stage_t STAGE_STRING = 1; // nspace/oid +constexpr match_stage_t STAGE_RIGHT = 0; // snap/gen +constexpr auto STAGE_TOP = STAGE_LEFT; +constexpr auto STAGE_BOTTOM = STAGE_RIGHT; +constexpr bool is_valid_stage(match_stage_t stage) { + return std::clamp(stage, STAGE_BOTTOM, STAGE_TOP) == stage; +} +// TODO: replace by +// using match_history_t = int8_t; +// left_m, str_m, right_m +// 3: GT, +// 2: EQ, GT, +// 1: EQ, EQ, GT +// 0: EQ, EQ, EQ +// -1: EQ, EQ, LT +// -2: EQ, LT, +// -3: LT, + +struct MatchHistory { + template <match_stage_t STAGE> + const std::optional<MatchKindCMP>& get() const { + static_assert(is_valid_stage(STAGE)); + if constexpr (STAGE == STAGE_RIGHT) { + return right_match; + } else if (STAGE == STAGE_STRING) { + return string_match; + } else { + return left_match; + } + } + + const std::optional<MatchKindCMP>& + get_by_stage(match_stage_t stage) const { + assert(is_valid_stage(stage)); + if (stage == STAGE_RIGHT) { + return right_match; + } else if (stage == STAGE_STRING) { + return string_match; + } else { + return left_match; + } + } + + template <match_stage_t STAGE = STAGE_TOP> + const bool is_GT() const; + + template <match_stage_t STAGE> + void set(MatchKindCMP match) { + static_assert(is_valid_stage(STAGE)); + if constexpr (STAGE < STAGE_TOP) { + assert(*get<STAGE + 1>() == MatchKindCMP::EQ); + } + assert(!get<STAGE>().has_value() || *get<STAGE>() != 
MatchKindCMP::EQ); + const_cast<std::optional<MatchKindCMP>&>(get<STAGE>()) = match; + } + + std::ostream& dump(std::ostream& os) const { + os << "history("; + dump_each(os, left_match) << ", "; + dump_each(os, string_match) << ", "; + dump_each(os, right_match) << ")"; + return os; + } + + std::ostream& dump_each( + std::ostream& os, const std::optional<MatchKindCMP>& match) const { + if (!match.has_value()) { + return os << "--"; + } else if (*match == MatchKindCMP::LT) { + return os << "LT"; + } else if (*match == MatchKindCMP::EQ) { + return os << "EQ"; + } else if (*match == MatchKindCMP::GT) { + return os << "GT"; + } else { + ceph_abort("impossble path"); + } + } + + std::optional<MatchKindCMP> left_match; + std::optional<MatchKindCMP> string_match; + std::optional<MatchKindCMP> right_match; +}; +inline std::ostream& operator<<(std::ostream& os, const MatchHistory& pos) { + return pos.dump(os); +} + +template <match_stage_t STAGE> +struct _check_GT_t { + static bool eval(const MatchHistory* history) { + return history->get<STAGE>() && + (*history->get<STAGE>() == MatchKindCMP::GT || + (*history->get<STAGE>() == MatchKindCMP::EQ && + _check_GT_t<STAGE - 1>::eval(history))); + } +}; +template <> +struct _check_GT_t<STAGE_RIGHT> { + static bool eval(const MatchHistory* history) { + return history->get<STAGE_RIGHT>() && + *history->get<STAGE_RIGHT>() == MatchKindCMP::GT; + } +}; +template <match_stage_t STAGE> +const bool MatchHistory::is_GT() const { + static_assert(is_valid_stage(STAGE)); + if constexpr (STAGE < STAGE_TOP) { + assert(get<STAGE + 1>() == MatchKindCMP::EQ); + } + return _check_GT_t<STAGE>::eval(this); +} + +template <match_stage_t STAGE> +struct staged_position_t { + static_assert(is_valid_stage(STAGE)); + using me_t = staged_position_t<STAGE>; + using nxt_t = staged_position_t<STAGE - 1>; + bool is_end() const { + if (index == INDEX_END) { + return true; + } else { + assert(is_valid_index(index)); + return false; + } + } + index_t& index_by_stage(match_stage_t stage) { + assert(stage <= STAGE); + if (STAGE == stage) { + return index; + } else { + return nxt.index_by_stage(stage); + } + } + + auto operator<=>(const me_t& o) const = default; + + void assert_next_to(const me_t& prv) const { +#ifndef NDEBUG + if (is_end()) { + assert(!prv.is_end()); + } else if (index == prv.index) { + assert(!nxt.is_end()); + nxt.assert_next_to(prv.nxt); + } else if (index == prv.index + 1) { + assert(!prv.nxt.is_end()); + assert(nxt == nxt_t::begin()); + } else { + assert(false); + } +#endif + } + + me_t& operator-=(const me_t& o) { + assert(is_valid_index(o.index)); + assert(index >= o.index); + if (index != INDEX_END) { + assert(is_valid_index(index)); + index -= o.index; + if (index == 0) { + nxt -= o.nxt; + } + } + return *this; + } + + me_t& operator+=(const me_t& o) { + assert(is_valid_index(index)); + assert(is_valid_index(o.index)); + index += o.index; + nxt += o.nxt; + return *this; + } + + void encode(ceph::bufferlist& encoded) const { + ceph::encode(index, encoded); + nxt.encode(encoded); + } + + static me_t decode(ceph::bufferlist::const_iterator& delta) { + me_t ret; + ceph::decode(ret.index, delta); + ret.nxt = nxt_t::decode(delta); + return ret; + } + + static me_t begin() { return {0u, nxt_t::begin()}; } + static me_t end() { + return {INDEX_END, nxt_t::end()}; + } + + index_t index; + nxt_t nxt; +}; +template <match_stage_t STAGE> +std::ostream& operator<<(std::ostream& os, const staged_position_t<STAGE>& pos) { + if (pos.index == INDEX_END) { + os << "END"; + } else if 
(pos.index == INDEX_LAST) { + os << "LAST"; + } else { + os << pos.index; + assert(is_valid_index(pos.index)); + } + return os << ", " << pos.nxt; +} + +template <> +struct staged_position_t<STAGE_BOTTOM> { + using me_t = staged_position_t<STAGE_BOTTOM>; + bool is_end() const { + if (index == INDEX_END) { + return true; + } else { + assert(is_valid_index(index)); + return false; + } + } + index_t& index_by_stage(match_stage_t stage) { + assert(stage == STAGE_BOTTOM); + return index; + } + + auto operator<=>(const me_t&) const = default; + + me_t& operator-=(const me_t& o) { + assert(is_valid_index(o.index)); + assert(index >= o.index); + if (index != INDEX_END) { + assert(is_valid_index(index)); + index -= o.index; + } + return *this; + } + + me_t& operator+=(const me_t& o) { + assert(is_valid_index(index)); + assert(is_valid_index(o.index)); + index += o.index; + return *this; + } + + void assert_next_to(const me_t& prv) const { +#ifndef NDEBUG + if (is_end()) { + assert(!prv.is_end()); + } else { + assert(index == prv.index + 1); + } +#endif + } + + void encode(ceph::bufferlist& encoded) const { + ceph::encode(index, encoded); + } + + static me_t decode(ceph::bufferlist::const_iterator& delta) { + me_t ret; + ceph::decode(ret.index, delta); + return ret; + } + + static me_t begin() { return {0u}; } + static me_t end() { return {INDEX_END}; } + + index_t index; +}; +template <> +inline std::ostream& operator<<(std::ostream& os, const staged_position_t<STAGE_BOTTOM>& pos) { + if (pos.index == INDEX_END) { + os << "END"; + } else if (pos.index == INDEX_LAST) { + os << "LAST"; + } else { + os << pos.index; + assert(is_valid_index(pos.index)); + } + return os; +} + +using search_position_t = staged_position_t<STAGE_TOP>; + +template <match_stage_t STAGE> +const staged_position_t<STAGE>& cast_down(const search_position_t& pos) { + if constexpr (STAGE == STAGE_LEFT) { + return pos; + } else if constexpr (STAGE == STAGE_STRING) { +#ifndef NDEBUG + if (pos.is_end()) { + assert(pos.nxt.is_end()); + } else { + assert(pos.index == 0u); + } +#endif + return pos.nxt; + } else if constexpr (STAGE == STAGE_RIGHT) { +#ifndef NDEBUG + if (pos.is_end()) { + assert(pos.nxt.nxt.is_end()); + } else { + assert(pos.index == 0u); + assert(pos.nxt.index == 0u); + } +#endif + return pos.nxt.nxt; + } else { + ceph_abort("impossible path"); + } +} + +template <match_stage_t STAGE> +staged_position_t<STAGE>& cast_down(search_position_t& pos) { + const search_position_t& _pos = pos; + return const_cast<staged_position_t<STAGE>&>(cast_down<STAGE>(_pos)); +} + +template <match_stage_t STAGE> +staged_position_t<STAGE>& cast_down_fill_0(search_position_t& pos) { + if constexpr (STAGE == STAGE_LEFT) { + return pos; + } if constexpr (STAGE == STAGE_STRING) { + pos.index = 0; + return pos.nxt; + } else if constexpr (STAGE == STAGE_RIGHT) { + pos.index = 0; + pos.nxt.index = 0; + return pos.nxt.nxt; + } else { + ceph_abort("impossible path"); + } +} + +inline search_position_t&& normalize(search_position_t&& pos) { return std::move(pos); } + +template <match_stage_t STAGE, typename = std::enable_if_t<STAGE != STAGE_TOP>> +search_position_t normalize(staged_position_t<STAGE>&& pos) { + if (pos.is_end()) { + return search_position_t::end(); + } + if constexpr (STAGE == STAGE_STRING) { + return {0u, std::move(pos)}; + } else if (STAGE == STAGE_RIGHT) { + return {0u, {0u, std::move(pos)}}; + } else { + ceph_abort("impossible path"); + } +} + +struct memory_range_t { + const char* p_start; + const char* p_end; +}; + +struct 
container_range_t { + memory_range_t range; + extent_len_t node_size; +}; + +enum class ContainerType { ITERATIVE, INDEXABLE }; + +// the input type to construct the value during insert. +template <node_type_t> struct value_input_type; +template<> struct value_input_type<node_type_t::INTERNAL> { using type = laddr_t; }; +template<> struct value_input_type<node_type_t::LEAF> { using type = value_config_t; }; +template <node_type_t NODE_TYPE> +using value_input_type_t = typename value_input_type<NODE_TYPE>::type; + +template <node_type_t> struct value_type; +template<> struct value_type<node_type_t::INTERNAL> { using type = laddr_packed_t; }; +template<> struct value_type<node_type_t::LEAF> { using type = value_header_t; }; +template <node_type_t NODE_TYPE> +using value_type_t = typename value_type<NODE_TYPE>::type; + +template <node_type_t NODE_TYPE, match_stage_t STAGE> +struct staged_result_t { + using me_t = staged_result_t<NODE_TYPE, STAGE>; + bool is_end() const { return position.is_end(); } + + static me_t end() { + return {staged_position_t<STAGE>::end(), nullptr, MSTAT_END}; + } + template <typename T = me_t> + static std::enable_if_t<STAGE != STAGE_BOTTOM, T> from_nxt( + index_t index, const staged_result_t<NODE_TYPE, STAGE - 1>& nxt_stage_result) { + return {{index, nxt_stage_result.position}, + nxt_stage_result.p_value, + nxt_stage_result.mstat}; + } + + staged_position_t<STAGE> position; + const value_type_t<NODE_TYPE>* p_value; + match_stat_t mstat; +}; + +template <node_type_t NODE_TYPE> +using lookup_result_t = staged_result_t<NODE_TYPE, STAGE_TOP>; + +template <node_type_t NODE_TYPE> +lookup_result_t<NODE_TYPE>&& normalize( + lookup_result_t<NODE_TYPE>&& result) { return std::move(result); } + +template <node_type_t NODE_TYPE, match_stage_t STAGE, + typename = std::enable_if_t<STAGE != STAGE_TOP>> +lookup_result_t<NODE_TYPE> normalize( + staged_result_t<NODE_TYPE, STAGE>&& result) { + // FIXME: assert result.mstat correct + return {normalize(std::move(result.position)), result.p_value, result.mstat}; +} + +struct node_stats_t { + size_t size_persistent = 0; + size_t size_filled = 0; + // filled by staged::get_stats() + size_t size_logical = 0; + size_t size_overhead = 0; + size_t size_value = 0; + unsigned num_kvs = 0; +}; + +} + +#if FMT_VERSION >= 90000 +template <crimson::os::seastore::onode::match_stage_t S> +struct fmt::formatter<crimson::os::seastore::onode::staged_position_t<S>> : fmt::ostream_formatter {}; +template <> struct fmt::formatter<crimson::os::seastore::onode::MatchHistory> : fmt::ostream_formatter {}; +#endif diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/sub_items_stage.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/sub_items_stage.cc new file mode 100644 index 000000000..28e6f7102 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/sub_items_stage.cc @@ -0,0 +1,329 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#include "sub_items_stage.h" + +#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.h" + +namespace crimson::os::seastore::onode { + +template <IsFullKey Key> +const laddr_packed_t* internal_sub_items_t::insert_at( + NodeExtentMutable& mut, const internal_sub_items_t& sub_items, + const Key& key, const laddr_t& value, + index_t index, node_offset_t size, const char* p_left_bound) +{ + assert(index <= sub_items.keys()); + assert(size == estimate_insert(key, value)); + const char* p_shift_start = 
p_left_bound; + const char* p_shift_end = reinterpret_cast<const char*>( + sub_items.p_first_item + 1 - index); + mut.shift_absolute(p_shift_start, p_shift_end - p_shift_start, -(int)size); + + auto p_insert = const_cast<char*>(p_shift_end) - size; + auto item = internal_sub_item_t{ + snap_gen_t::from_key(key), laddr_packed_t{value}}; + mut.copy_in_absolute(p_insert, item); + return &reinterpret_cast<internal_sub_item_t*>(p_insert)->value; +} +#define IA_TEMPLATE(Key) \ + template const laddr_packed_t* internal_sub_items_t::insert_at<Key>( \ + NodeExtentMutable&, const internal_sub_items_t&, const Key&, \ + const laddr_t&, index_t, node_offset_t, const char*) +IA_TEMPLATE(key_view_t); +IA_TEMPLATE(key_hobj_t); + +node_offset_t internal_sub_items_t::trim_until( + NodeExtentMutable& mut, internal_sub_items_t& items, index_t index) +{ + assert(index != 0); + auto keys = items.keys(); + assert(index <= keys); + size_t ret = sizeof(internal_sub_item_t) * (keys - index); + assert(ret < mut.get_length()); + return ret; +} + +node_offset_t internal_sub_items_t::erase_at( + NodeExtentMutable& mut, const internal_sub_items_t& sub_items, + index_t index, const char* p_left_bound) +{ + assert(index < sub_items.keys()); + node_offset_t erase_size = sizeof(internal_sub_item_t); + const char* p_shift_start = p_left_bound; + const char* p_shift_end = reinterpret_cast<const char*>( + sub_items.p_first_item - index); + mut.shift_absolute(p_shift_start, p_shift_end - p_shift_start, erase_size); + return erase_size; +} + +template <KeyT KT> +void internal_sub_items_t::Appender<KT>::append( + const internal_sub_items_t& src, index_t from, index_t items) +{ + assert(from <= src.keys()); + if (items == 0) { + return; + } + assert(from < src.keys()); + assert(from + items <= src.keys()); + node_offset_t size = sizeof(internal_sub_item_t) * items; + p_append -= size; + p_mut->copy_in_absolute(p_append, src.p_first_item + 1 - from - items, size); +} + +template <KeyT KT> +void internal_sub_items_t::Appender<KT>::append( + const full_key_t<KT>& key, const laddr_t& value, + const laddr_packed_t*& p_value) +{ + p_append -= sizeof(internal_sub_item_t); + auto item = internal_sub_item_t{ + snap_gen_t::from_key(key), laddr_packed_t{value}}; + p_mut->copy_in_absolute(p_append, item); + p_value = &reinterpret_cast<internal_sub_item_t*>(p_append)->value; +} + +template <IsFullKey Key> +const value_header_t* leaf_sub_items_t::insert_at( + NodeExtentMutable& mut, const leaf_sub_items_t& sub_items, + const Key& key, const value_config_t& value, + index_t index, node_offset_t size, const char* p_left_bound) +{ + assert(index <= sub_items.keys()); + assert(size == estimate_insert(key, value)); + // a. [... item(index)] << size + const char* p_shift_start = p_left_bound; + const char* p_shift_end = sub_items.get_item_end(index); + mut.shift_absolute(p_shift_start, p_shift_end - p_shift_start, -(int)size); + + // b. insert item + auto p_insert = const_cast<char*>(p_shift_end - size); + auto p_value = reinterpret_cast<value_header_t*>(p_insert); + p_value->initiate(mut, value); + p_insert += value.allocation_size(); + mut.copy_in_absolute(p_insert, snap_gen_t::from_key(key)); + assert(p_insert + sizeof(snap_gen_t) + sizeof(node_offset_t) == p_shift_end); + + // c. 
compensate affected offsets + auto item_size = value.allocation_size() + sizeof(snap_gen_t); + for (auto i = index; i < sub_items.keys(); ++i) { + const node_offset_packed_t& offset_i = sub_items.get_offset(i); + mut.copy_in_absolute((void*)&offset_i, node_offset_t(offset_i.value + item_size)); + } + + // d. [item(index-1) ... item(0) ... offset(index)] <<< sizeof(node_offset_t) + const char* p_offset = (index == 0 ? + (const char*)&sub_items.get_offset(0) + sizeof(node_offset_t) : + (const char*)&sub_items.get_offset(index - 1)); + p_shift_start = p_shift_end; + p_shift_end = p_offset; + mut.shift_absolute(p_shift_start, p_shift_end - p_shift_start, -(int)sizeof(node_offset_t)); + + // e. insert offset + node_offset_t offset_to_item_start = item_size + sub_items.get_offset_to_end(index); + mut.copy_in_absolute( + const_cast<char*>(p_shift_end) - sizeof(node_offset_t), offset_to_item_start); + + // f. update num_sub_keys + mut.copy_in_absolute((void*)sub_items.p_num_keys, num_keys_t(sub_items.keys() + 1)); + + return p_value; +} +template const value_header_t* leaf_sub_items_t::insert_at<key_hobj_t>( + NodeExtentMutable&, const leaf_sub_items_t&, const key_hobj_t&, + const value_config_t&, index_t, node_offset_t, const char*); + +node_offset_t leaf_sub_items_t::trim_until( + NodeExtentMutable& mut, leaf_sub_items_t& items, index_t index) +{ + assert(index != 0); + auto keys = items.keys(); + assert(index <= keys); + if (index == keys) { + return 0; + } + index_t trim_items = keys - index; + const char* p_items_start = items.p_start(); + const char* p_shift_start = items.get_item_end(index); + const char* p_shift_end = items.get_item_end(0); + size_t size_trim_offsets = sizeof(node_offset_t) * trim_items; + mut.shift_absolute(p_shift_start, p_shift_end - p_shift_start, + size_trim_offsets); + mut.copy_in_absolute((void*)items.p_num_keys, num_keys_t(index)); + size_t ret = size_trim_offsets + (p_shift_start - p_items_start); + assert(ret < mut.get_length()); + return ret; +} + +node_offset_t leaf_sub_items_t::erase_at( + NodeExtentMutable& mut, const leaf_sub_items_t& sub_items, + index_t index, const char* p_left_bound) +{ + assert(sub_items.keys() > 0); + assert(index < sub_items.keys()); + auto p_item_start = sub_items.get_item_start(index); + auto p_item_end = sub_items.get_item_end(index); + assert(p_item_start < p_item_end); + node_offset_t item_erase_size = p_item_end - p_item_start; + node_offset_t erase_size = item_erase_size + sizeof(node_offset_t); + auto p_offset_end = (const char*)&sub_items.get_offset(index); + + // a. compensate affected offset[n] ... offset[index+1] + for (auto i = index + 1; i < sub_items.keys(); ++i) { + const node_offset_packed_t& offset_i = sub_items.get_offset(i); + mut.copy_in_absolute((void*)&offset_i, node_offset_t(offset_i.value - item_erase_size)); + } + + // b. kv[index-1] ... kv[0] ... offset[index+1] >> sizeof(node_offset_t) + mut.shift_absolute(p_item_end, p_offset_end - p_item_end, sizeof(node_offset_t)); + + // c. ... kv[n] ... kv[index+1] >> item_erase_size + mut.shift_absolute(p_left_bound, p_item_start - p_left_bound, erase_size); + + // d. update num_keys + mut.copy_in_absolute((void*)sub_items.p_num_keys, num_keys_t(sub_items.keys() - 1)); + + return erase_size; +} + +template class internal_sub_items_t::Appender<KeyT::VIEW>; +template class internal_sub_items_t::Appender<KeyT::HOBJ>; + +// helper type for the visitor +template<class... Ts> struct overloaded : Ts... 
{ using Ts::operator()...; }; +// explicit deduction guide +template<class... Ts> overloaded(Ts...) -> overloaded<Ts...>; + +template <KeyT KT> +void leaf_sub_items_t::Appender<KT>::append( + const leaf_sub_items_t& src, index_t from, index_t items) +{ + if (p_append) { + // append from empty + assert(cnt <= APPENDER_LIMIT); + assert(from <= src.keys()); + if (items == 0) { + return; + } + if (op_src) { + assert(*op_src == src); + } else { + op_src = src; + } + assert(from < src.keys()); + assert(from + items <= src.keys()); + appends[cnt] = range_items_t{from, items}; + ++cnt; + } else { + // append from existing + assert(op_dst.has_value()); + assert(!p_appended); + assert(from == 0); + assert(items); + assert(items == src.keys()); + + num_keys_t num_keys = op_dst->keys(); + node_offset_t compensate = op_dst->get_offset(num_keys - 1).value; + const char* p_items_start = op_dst->p_start(); + const char* p_items_end = op_dst->p_items_end; + + // update dst num_keys + num_keys += items; + p_mut->copy_in_absolute((char*)op_dst->p_num_keys, num_keys); + + // shift dst items + std::size_t src_offsets_size = sizeof(node_offset_t) * items; + p_mut->shift_absolute(p_items_start, + p_items_end - p_items_start, + -(int)src_offsets_size); + + // fill offsets from src + node_offset_t offset; + char* p_cur_offset = const_cast<char*>(p_items_end); + for (auto i = from; i < from + items; ++i) { + offset = src.get_offset(i).value + compensate; + p_cur_offset -= sizeof(node_offset_t); + p_mut->copy_in_absolute(p_cur_offset, offset); + } + + // fill items from src + auto p_src_items_start = src.get_item_end(from + items); + std::size_t src_items_size = src.get_item_end(from) - p_src_items_start; + p_appended = const_cast<char*>(p_items_start) - src_offsets_size - src_items_size; + p_mut->copy_in_absolute(p_appended, p_src_items_start, src_items_size); + } +} + +template <KeyT KT> +char* leaf_sub_items_t::Appender<KT>::wrap() +{ + if (op_dst.has_value()) { + // append from existing + assert(p_appended); + return p_appended; + } + // append from empty + assert(p_append); + auto p_cur = p_append; + num_keys_t num_keys = 0; + for (auto i = 0u; i < cnt; ++i) { + auto& a = appends[i]; + std::visit(overloaded { + [&] (const range_items_t& arg) { num_keys += arg.items; }, + [&] (const kv_item_t& arg) { ++num_keys; } + }, a); + } + assert(num_keys); + p_cur -= sizeof(num_keys_t); + p_mut->copy_in_absolute(p_cur, num_keys); + + node_offset_t last_offset = 0; + for (auto i = 0u; i < cnt; ++i) { + auto& a = appends[i]; + std::visit(overloaded { + [&] (const range_items_t& arg) { + int compensate = (last_offset - op_src->get_offset_to_end(arg.from)); + node_offset_t offset; + for (auto i = arg.from; i < arg.from + arg.items; ++i) { + offset = op_src->get_offset(i).value + compensate; + p_cur -= sizeof(node_offset_t); + p_mut->copy_in_absolute(p_cur, offset); + } + last_offset = offset; + }, + [&] (const kv_item_t& arg) { + last_offset += sizeof(snap_gen_t) + arg.value_config.allocation_size(); + p_cur -= sizeof(node_offset_t); + p_mut->copy_in_absolute(p_cur, last_offset); + } + }, a); + } + + for (auto i = 0u; i < cnt; ++i) { + auto& a = appends[i]; + std::visit(overloaded { + [&] (const range_items_t& arg) { + auto _p_start = op_src->get_item_end(arg.from + arg.items); + size_t _len = op_src->get_item_end(arg.from) - _p_start; + p_cur -= _len; + p_mut->copy_in_absolute(p_cur, _p_start, _len); + }, + [&] (const kv_item_t& arg) { + assert(pp_value); + p_cur -= sizeof(snap_gen_t); + p_mut->copy_in_absolute(p_cur, 
snap_gen_t::from_key(*arg.p_key)); + p_cur -= arg.value_config.allocation_size(); + auto p_value = reinterpret_cast<value_header_t*>(p_cur); + p_value->initiate(*p_mut, arg.value_config); + *pp_value = p_value; + } + }, a); + } + return p_cur; +} + +template class leaf_sub_items_t::Appender<KeyT::VIEW>; +template class leaf_sub_items_t::Appender<KeyT::HOBJ>; + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/sub_items_stage.h b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/sub_items_stage.h new file mode 100644 index 000000000..e3d1fd7c5 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/sub_items_stage.h @@ -0,0 +1,368 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <variant> + +#include "crimson/os/seastore/onode_manager/staged-fltree/node_types.h" +#include "key_layout.h" +#include "stage_types.h" + +namespace crimson::os::seastore::onode { + +class NodeExtentMutable; + +struct internal_sub_item_t { + const snap_gen_t& get_key() const { return key; } + const laddr_packed_t* get_p_value() const { return &value; } + + snap_gen_t key; + laddr_packed_t value; +} __attribute__((packed)); + +/** + * internal_sub_items_t + * + * The STAGE_RIGHT implementation for internal node N0/N1/N2, implements staged + * contract as an indexable container to index snap-gen to child node + * addresses. + * + * The layout of the contaner storing n sub-items: + * + * # <--------- container range -----------> # + * #<~># sub-items [2, n) # + * # # <- sub-item 1 -> # <- sub-item 0 -> # + * #...# snap-gen | laddr # snap-gen | laddr # + * ^ + * | + * p_first_item + + */ +class internal_sub_items_t { + public: + using num_keys_t = index_t; + + internal_sub_items_t(const container_range_t& _range) + : node_size{_range.node_size} { + assert(is_valid_node_size(node_size)); + auto& range = _range.range; + assert(range.p_start < range.p_end); + assert((range.p_end - range.p_start) % sizeof(internal_sub_item_t) == 0); + num_items = (range.p_end - range.p_start) / sizeof(internal_sub_item_t); + assert(num_items > 0); + auto _p_first_item = range.p_end - sizeof(internal_sub_item_t); + p_first_item = reinterpret_cast<const internal_sub_item_t*>(_p_first_item); + } + + // container type system + using key_get_type = const snap_gen_t&; + static constexpr auto CONTAINER_TYPE = ContainerType::INDEXABLE; + num_keys_t keys() const { return num_items; } + key_get_type operator[](index_t index) const { + assert(index < num_items); + return (p_first_item - index)->get_key(); + } + node_offset_t size_before(index_t index) const { + size_t ret = index * sizeof(internal_sub_item_t); + assert(ret < node_size); + return ret; + } + const laddr_packed_t* get_p_value(index_t index) const { + assert(index < num_items); + return (p_first_item - index)->get_p_value(); + } + node_offset_t size_overhead_at(index_t index) const { return 0u; } + void encode(const char* p_node_start, ceph::bufferlist& encoded) const { + auto p_end = reinterpret_cast<const char*>(p_first_item) + + sizeof(internal_sub_item_t); + auto p_start = p_end - num_items * sizeof(internal_sub_item_t); + int start_offset = p_start - p_node_start; + int stage_size = p_end - p_start; + assert(start_offset > 0); + assert(stage_size > 0); + assert(start_offset + stage_size < (int)node_size); + ceph::encode(static_cast<node_offset_t>(start_offset), encoded); + ceph::encode(static_cast<node_offset_t>(stage_size), encoded); + } + + 
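// Inverse of encode() above: rebuilds the sub-items view from the recorded (start_offset, stage_size) pair. + 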
static internal_sub_items_t decode( + const char* p_node_start, + extent_len_t node_size, + ceph::bufferlist::const_iterator& delta) { + node_offset_t start_offset; + ceph::decode(start_offset, delta); + node_offset_t stage_size; + ceph::decode(stage_size, delta); + assert(start_offset > 0); + assert(stage_size > 0); + assert((unsigned)start_offset + stage_size < node_size); + return internal_sub_items_t({{p_node_start + start_offset, + p_node_start + start_offset + stage_size}, + node_size}); + } + + static node_offset_t header_size() { return 0u; } + + template <IsFullKey Key> + static node_offset_t estimate_insert( + const Key&, const laddr_t&) { + return sizeof(internal_sub_item_t); + } + + template <IsFullKey Key> + static const laddr_packed_t* insert_at( + NodeExtentMutable&, const internal_sub_items_t&, + const Key&, const laddr_t&, + index_t index, node_offset_t size, const char* p_left_bound); + + static node_offset_t trim_until(NodeExtentMutable&, internal_sub_items_t&, index_t); + + static node_offset_t erase_at( + NodeExtentMutable&, const internal_sub_items_t&, index_t, const char*); + + template <KeyT KT> + class Appender; + + private: + extent_len_t node_size; + index_t num_items; + const internal_sub_item_t* p_first_item; +}; + +template <KeyT KT> +class internal_sub_items_t::Appender { + public: + Appender(NodeExtentMutable* p_mut, char* p_append) + : p_mut{p_mut}, p_append{p_append} {} + Appender(NodeExtentMutable* p_mut, const internal_sub_items_t& sub_items) + : p_mut{p_mut}, + p_append{(char*)(sub_items.p_first_item + 1 - sub_items.keys())} { + assert(sub_items.keys()); + } + void append(const internal_sub_items_t& src, index_t from, index_t items); + void append(const full_key_t<KT>&, const laddr_t&, const laddr_packed_t*&); + char* wrap() { return p_append; } + private: + NodeExtentMutable* p_mut; + char* p_append; +}; + +/** + * leaf_sub_items_t + * + * The STAGE_RIGHT implementation for leaf node N0/N1/N2, implements staged + * contract as an indexable container to index snap-gen to value_header_t. 
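+ * Values are variable-sized, so per-item offsets and a num_keys field are kept at the right end of the container (see the layout below).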
+ * + * The layout of the contaner storing n sub-items: + * + * # <------------------------ container range -------------------------------> # + * # <---------- sub-items ----------------> # <--- offsets ---------# # + * #<~># sub-items [2, n) #<~>| offsets [2, n) # # + * # # <- sub-item 1 -> # <- sub-item 0 -> # | # # + * #...# snap-gen | value # snap-gen | value #...| offset1 | offset0 # num_keys # + * ^ ^ ^ + * | | | + * p_items_end + p_offsets + | + * p_num_keys + + */ +class leaf_sub_items_t { + public: + // should be enough to index all keys under 64 KiB node + using num_keys_t = uint16_t; + + // TODO: remove if num_keys_t is aligned + struct num_keys_packed_t { + num_keys_t value; + } __attribute__((packed)); + + leaf_sub_items_t(const container_range_t& _range) + : node_size{_range.node_size} { + assert(is_valid_node_size(node_size)); + auto& range = _range.range; + assert(range.p_start < range.p_end); + auto _p_num_keys = range.p_end - sizeof(num_keys_t); + assert(range.p_start < _p_num_keys); + p_num_keys = reinterpret_cast<const num_keys_packed_t*>(_p_num_keys); + assert(keys()); + auto _p_offsets = _p_num_keys - sizeof(node_offset_t); + assert(range.p_start < _p_offsets); + p_offsets = reinterpret_cast<const node_offset_packed_t*>(_p_offsets); + p_items_end = reinterpret_cast<const char*>(&get_offset(keys() - 1)); + assert(range.p_start < p_items_end); + assert(range.p_start == p_start()); + } + + bool operator==(const leaf_sub_items_t& x) { + return (p_num_keys == x.p_num_keys && + p_offsets == x.p_offsets && + p_items_end == x.p_items_end); + } + + const char* p_start() const { return get_item_end(keys()); } + + const node_offset_packed_t& get_offset(index_t index) const { + assert(index < keys()); + return *(p_offsets - index); + } + + const node_offset_t get_offset_to_end(index_t index) const { + assert(index <= keys()); + return index == 0 ? 
0 : get_offset(index - 1).value; + } + + const char* get_item_start(index_t index) const { + return p_items_end - get_offset(index).value; + } + + const char* get_item_end(index_t index) const { + return p_items_end - get_offset_to_end(index); + } + + // container type system + using key_get_type = const snap_gen_t&; + static constexpr auto CONTAINER_TYPE = ContainerType::INDEXABLE; + num_keys_t keys() const { return p_num_keys->value; } + key_get_type operator[](index_t index) const { + assert(index < keys()); + auto pointer = get_item_end(index); + assert(get_item_start(index) < pointer); + pointer -= sizeof(snap_gen_t); + assert(get_item_start(index) < pointer); + return *reinterpret_cast<const snap_gen_t*>(pointer); + } + node_offset_t size_before(index_t index) const { + assert(index <= keys()); + size_t ret; + if (index == 0) { + ret = sizeof(num_keys_t); + } else { + --index; + ret = sizeof(num_keys_t) + + (index + 1) * sizeof(node_offset_t) + + get_offset(index).value; + } + assert(ret < node_size); + return ret; + } + node_offset_t size_overhead_at(index_t index) const { return sizeof(node_offset_t); } + const value_header_t* get_p_value(index_t index) const { + assert(index < keys()); + auto pointer = get_item_start(index); + auto value = reinterpret_cast<const value_header_t*>(pointer); + assert(pointer + value->allocation_size() + sizeof(snap_gen_t) == + get_item_end(index)); + return value; + } + void encode(const char* p_node_start, ceph::bufferlist& encoded) const { + auto p_end = reinterpret_cast<const char*>(p_num_keys) + + sizeof(num_keys_t); + int start_offset = p_start() - p_node_start; + int stage_size = p_end - p_start(); + assert(start_offset > 0); + assert(stage_size > 0); + assert(start_offset + stage_size < (int)node_size); + ceph::encode(static_cast<node_offset_t>(start_offset), encoded); + ceph::encode(static_cast<node_offset_t>(stage_size), encoded); + } + + static leaf_sub_items_t decode( + const char* p_node_start, + extent_len_t node_size, + ceph::bufferlist::const_iterator& delta) { + node_offset_t start_offset; + ceph::decode(start_offset, delta); + node_offset_t stage_size; + ceph::decode(stage_size, delta); + assert(start_offset > 0); + assert(stage_size > 0); + assert((unsigned)start_offset + stage_size < node_size); + return leaf_sub_items_t({{p_node_start + start_offset, + p_node_start + start_offset + stage_size}, + node_size}); + } + + static node_offset_t header_size() { return sizeof(num_keys_t); } + + template <IsFullKey Key> + static node_offset_t estimate_insert( + const Key&, const value_config_t& value) { + return value.allocation_size() + sizeof(snap_gen_t) + sizeof(node_offset_t); + } + + template <IsFullKey Key> + static const value_header_t* insert_at( + NodeExtentMutable&, const leaf_sub_items_t&, + const Key&, const value_config_t&, + index_t index, node_offset_t size, const char* p_left_bound); + + static node_offset_t trim_until(NodeExtentMutable&, leaf_sub_items_t&, index_t index); + + static node_offset_t erase_at( + NodeExtentMutable&, const leaf_sub_items_t&, index_t, const char*); + + template <KeyT KT> + class Appender; + + private: + extent_len_t node_size; + const num_keys_packed_t* p_num_keys; + const node_offset_packed_t* p_offsets; + const char* p_items_end; +}; + +constexpr index_t APPENDER_LIMIT = 3u; + +template <KeyT KT> +class leaf_sub_items_t::Appender { + struct range_items_t { + index_t from; + index_t items; + }; + struct kv_item_t { + const full_key_t<KT>* p_key; + value_config_t value_config; + }; + using var_t = 
std::variant<range_items_t, kv_item_t>; + + public: + Appender(NodeExtentMutable* p_mut, char* p_append) + : p_mut{p_mut}, p_append{p_append} { + } + Appender(NodeExtentMutable* p_mut, const leaf_sub_items_t& sub_items) + : p_mut{p_mut} , op_dst(sub_items) { + assert(sub_items.keys()); + } + + void append(const leaf_sub_items_t& src, index_t from, index_t items); + void append(const full_key_t<KT>& key, + const value_config_t& value, const value_header_t*& p_value) { + // append from empty + assert(p_append); + assert(pp_value == nullptr); + assert(cnt <= APPENDER_LIMIT); + appends[cnt] = kv_item_t{&key, value}; + ++cnt; + pp_value = &p_value; + } + char* wrap(); + + private: + NodeExtentMutable* p_mut; + // append from empty + std::optional<leaf_sub_items_t> op_src; + const value_header_t** pp_value = nullptr; + char* p_append = nullptr; + var_t appends[APPENDER_LIMIT]; + index_t cnt = 0; + // append from existing + std::optional<leaf_sub_items_t> op_dst; + char* p_appended = nullptr; +}; + +template <node_type_t> struct _sub_items_t; +template<> struct _sub_items_t<node_type_t::INTERNAL> { using type = internal_sub_items_t; }; +template<> struct _sub_items_t<node_type_t::LEAF> { using type = leaf_sub_items_t; }; +template <node_type_t NODE_TYPE> +using sub_items_t = typename _sub_items_t<NODE_TYPE>::type; + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/super.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/super.cc new file mode 100644 index 000000000..09f20db3a --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/super.cc @@ -0,0 +1,28 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#include "super.h" +#include "node.h" + +namespace crimson::os::seastore::onode { + +Ref<Node> RootNodeTrackerIsolated::get_root(Transaction& t) const +{ + auto iter = tracked_supers.find(&t); + if (iter == tracked_supers.end()) { + return nullptr; + } else { + return iter->second->get_p_root(); + } +} + +Ref<Node> RootNodeTrackerShared::get_root(Transaction&) const +{ + if (is_clean()) { + return nullptr; + } else { + return tracked_super->get_p_root(); + } +} + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/super.h b/src/crimson/os/seastore/onode_manager/staged-fltree/super.h new file mode 100644 index 000000000..5eefee9ff --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/super.h @@ -0,0 +1,143 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <memory> + +#include "crimson/common/type_helpers.h" + +#include "fwd.h" + +namespace crimson::os::seastore::onode { + +class Node; +class Super; + +/** + * RootNodeTracker + * + * An abstracted tracker to get the root node by Transaction. + */ +class RootNodeTracker { + public: + virtual ~RootNodeTracker() = default; + virtual bool is_clean() const = 0; + virtual Ref<Node> get_root(Transaction&) const = 0; + static RootNodeTrackerURef create(bool read_isolated); + protected: + RootNodeTracker() = default; + RootNodeTracker(const RootNodeTracker&) = delete; + RootNodeTracker(RootNodeTracker&&) = delete; + RootNodeTracker& operator=(const RootNodeTracker&) = delete; + RootNodeTracker& operator=(RootNodeTracker&&) = delete; + virtual void do_track_super(Transaction&, Super&) = 0; + virtual void do_untrack_super(Transaction&, Super&) = 0; + friend class Super; +}; + +/** + * Super + * + * The parent of root node. 
It contains the relationship between a Transaction + * and a root node address. + */ +class Super { + public: + using URef = std::unique_ptr<Super>; + Super(const Super&) = delete; + Super(Super&&) = delete; + Super& operator=(const Super&) = delete; + Super& operator=(Super&&) = delete; + virtual ~Super() { + assert(tracked_root_node == nullptr); + tracker.do_untrack_super(t, *this); + } + + virtual laddr_t get_root_laddr() const = 0; + virtual void write_root_laddr(context_t, laddr_t) = 0; + + void do_track_root(Node& root) { + assert(tracked_root_node == nullptr); + tracked_root_node = &root; + } + void do_untrack_root(Node& root) { + assert(tracked_root_node == &root); + tracked_root_node = nullptr; + } + Node* get_p_root() const { + assert(tracked_root_node != nullptr); + return tracked_root_node; + } + + protected: + Super(Transaction& t, RootNodeTracker& tracker) + : t{t}, tracker{tracker} { + tracker.do_track_super(t, *this); + } + + private: + Transaction& t; + RootNodeTracker& tracker; + Node* tracked_root_node = nullptr; +}; + +/** + * RootNodeTrackerIsolated + * + * A concrete RootNodeTracker implementation which provides root node isolation + * between Transactions for Seastore backend. + */ +class RootNodeTrackerIsolated final : public RootNodeTracker { + public: + ~RootNodeTrackerIsolated() override { assert(is_clean()); } + protected: + bool is_clean() const override { + return tracked_supers.empty(); + } + void do_track_super(Transaction& t, Super& super) override { + assert(tracked_supers.find(&t) == tracked_supers.end()); + tracked_supers[&t] = &super; + } + void do_untrack_super(Transaction& t, Super& super) override { + [[maybe_unused]] auto removed = tracked_supers.erase(&t); + assert(removed); + } + ::Ref<Node> get_root(Transaction& t) const override; + std::map<Transaction*, Super*> tracked_supers; +}; + +/** + * RootNodeTrackerShared + * + * A concrete RootNodeTracker implementation which has no isolation between + * Transactions for Dummy backend. 
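+ * All transactions observe the same tracked Super and therefore the same root.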
+ */ +class RootNodeTrackerShared final : public RootNodeTracker { + public: + ~RootNodeTrackerShared() override { assert(is_clean()); } + protected: + bool is_clean() const override { + return tracked_super == nullptr; + } + void do_track_super(Transaction&, Super& super) override { + assert(is_clean()); + tracked_super = &super; + } + void do_untrack_super(Transaction&, Super& super) override { + assert(tracked_super == &super); + tracked_super = nullptr; + } + ::Ref<Node> get_root(Transaction&) const override; + Super* tracked_super = nullptr; +}; + +inline RootNodeTrackerURef RootNodeTracker::create(bool read_isolated) { + if (read_isolated) { + return RootNodeTrackerURef(new RootNodeTrackerIsolated()); + } else { + return RootNodeTrackerURef(new RootNodeTrackerShared()); + } +} + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/tree.h b/src/crimson/os/seastore/onode_manager/staged-fltree/tree.h new file mode 100644 index 000000000..7385e080c --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/tree.h @@ -0,0 +1,387 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <ostream> + +#include "common/hobject.h" +#include "crimson/common/type_helpers.h" +#include "crimson/os/seastore/logging.h" + +#include "fwd.h" +#include "node.h" +#include "node_extent_manager.h" +#include "stages/key_layout.h" +#include "super.h" +#include "value.h" + +/** + * tree.h + * + * A special-purpose and b-tree-based implementation that: + * - Fulfills requirements of OnodeManager to index ordered onode key-values; + * - Runs above seastore block and transaction layer; + * - Specially optimized for onode key structures and seastore + * delta/transaction semantics; + * + * Note: Cursor/Value are transactional, they cannot be used outside the scope + * of the according transaction, or the behavior is undefined. + */ + +namespace crimson::os::seastore::onode { + +class Node; +class tree_cursor_t; + +template <typename ValueImpl> +class Btree { + public: + Btree(NodeExtentManagerURef&& _nm) + : nm{std::move(_nm)}, + root_tracker{RootNodeTracker::create(nm->is_read_isolated())} {} + ~Btree() { assert(root_tracker->is_clean()); } + + Btree(const Btree&) = delete; + Btree(Btree&&) = delete; + Btree& operator=(const Btree&) = delete; + Btree& operator=(Btree&&) = delete; + + eagain_ifuture<> mkfs(Transaction& t) { + return Node::mkfs(get_context(t), *root_tracker); + } + + class Cursor { + public: + Cursor(const Cursor&) = default; + Cursor(Cursor&&) noexcept = default; + Cursor& operator=(const Cursor&) = default; + Cursor& operator=(Cursor&&) = default; + ~Cursor() = default; + + bool is_end() const { + if (p_cursor->is_tracked()) { + return false; + } else if (p_cursor->is_invalid()) { + return true; + } else { + // we don't actually store end cursor because it will hold a reference + // to an end leaf node and is not kept updated. + assert(p_cursor->is_end()); + ceph_abort("impossible"); + } + } + + /// Invalidate the Cursor before submitting transaction. 
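+ /// (Simply drops the reference to the underlying tree_cursor_t.)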
+ void invalidate() { + p_cursor.reset(); + } + + // XXX: return key_view_t to avoid unecessary ghobject_t constructions + ghobject_t get_ghobj() const { + assert(!is_end()); + auto view = p_cursor->get_key_view( + p_tree->value_builder.get_header_magic()); + assert(view.nspace().size() <= + p_tree->value_builder.get_max_ns_size()); + assert(view.oid().size() <= + p_tree->value_builder.get_max_oid_size()); + return view.to_ghobj(); + } + + ValueImpl value() { + assert(!is_end()); + return p_tree->value_builder.build_value( + *p_tree->nm, p_tree->value_builder, p_cursor); + } + + bool operator==(const Cursor& o) const { return operator<=>(o) == 0; } + + eagain_ifuture<Cursor> get_next(Transaction& t) { + assert(!is_end()); + auto this_obj = *this; + return p_cursor->get_next(p_tree->get_context(t) + ).si_then([this_obj] (Ref<tree_cursor_t> next_cursor) { + next_cursor->assert_next_to( + *this_obj.p_cursor, this_obj.p_tree->value_builder.get_header_magic()); + auto ret = Cursor{this_obj.p_tree, next_cursor}; + assert(this_obj < ret); + return ret; + }); + } + + template <bool FORCE_MERGE = false> + eagain_ifuture<Cursor> erase(Transaction& t) { + assert(!is_end()); + auto this_obj = *this; + return p_cursor->erase<FORCE_MERGE>(p_tree->get_context(t), true + ).si_then([this_obj, this] (Ref<tree_cursor_t> next_cursor) { + assert(p_cursor->is_invalid()); + if (next_cursor) { + assert(!next_cursor->is_end()); + return Cursor{p_tree, next_cursor}; + } else { + return Cursor{p_tree}; + } + }); + } + + private: + Cursor(Btree* p_tree, Ref<tree_cursor_t> _p_cursor) : p_tree(p_tree) { + if (_p_cursor->is_invalid()) { + // we don't create Cursor from an invalid tree_cursor_t. + ceph_abort("impossible"); + } else if (_p_cursor->is_end()) { + // we don't actually store end cursor because it will hold a reference + // to an end leaf node and is not kept updated. 
+ } else { + assert(_p_cursor->is_tracked()); + p_cursor = _p_cursor; + } + } + Cursor(Btree* p_tree) : p_tree{p_tree} {} + + std::strong_ordering operator<=>(const Cursor& o) const { + assert(p_tree == o.p_tree); + return p_cursor->compare_to( + *o.p_cursor, p_tree->value_builder.get_header_magic()); + } + + static Cursor make_end(Btree* p_tree) { + return {p_tree}; + } + + Btree* p_tree; + Ref<tree_cursor_t> p_cursor = tree_cursor_t::get_invalid(); + + friend class Btree; + }; + + /* + * lookup + */ + + eagain_ifuture<Cursor> begin(Transaction& t) { + return get_root(t).si_then([this, &t](auto root) { + return root->lookup_smallest(get_context(t)); + }).si_then([this](auto cursor) { + return Cursor{this, cursor}; + }); + } + + eagain_ifuture<Cursor> last(Transaction& t) { + return get_root(t).si_then([this, &t](auto root) { + return root->lookup_largest(get_context(t)); + }).si_then([this](auto cursor) { + return Cursor(this, cursor); + }); + } + + Cursor end() { + return Cursor::make_end(this); + } + + eagain_ifuture<bool> contains(Transaction& t, const ghobject_t& obj) { + return seastar::do_with( + key_hobj_t{obj}, + [this, &t](auto& key) -> eagain_ifuture<bool> { + return get_root(t).si_then([this, &t, &key](auto root) { + // TODO: improve lower_bound() + return root->lower_bound(get_context(t), key); + }).si_then([](auto result) { + return MatchKindBS::EQ == result.match(); + }); + } + ); + } + + eagain_ifuture<Cursor> find(Transaction& t, const ghobject_t& obj) { + return seastar::do_with( + key_hobj_t{obj}, + [this, &t](auto& key) -> eagain_ifuture<Cursor> { + return get_root(t).si_then([this, &t, &key](auto root) { + // TODO: improve lower_bound() + return root->lower_bound(get_context(t), key); + }).si_then([this](auto result) { + if (result.match() == MatchKindBS::EQ) { + return Cursor(this, result.p_cursor); + } else { + return Cursor::make_end(this); + } + }); + } + ); + } + + /** + * lower_bound + * + * Returns a Cursor pointing to the element that is equal to the key, or the + * first element larger than the key, or the end Cursor if that element + * doesn't exist. 
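+ * (Semantics follow std::map::lower_bound.)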
+ */ + eagain_ifuture<Cursor> lower_bound(Transaction& t, const ghobject_t& obj) { + return seastar::do_with( + key_hobj_t{obj}, + [this, &t](auto& key) -> eagain_ifuture<Cursor> { + return get_root(t).si_then([this, &t, &key](auto root) { + return root->lower_bound(get_context(t), key); + }).si_then([this](auto result) { + return Cursor(this, result.p_cursor); + }); + } + ); + } + + eagain_ifuture<Cursor> get_next(Transaction& t, Cursor& cursor) { + return cursor.get_next(t); + } + + /* + * modifiers + */ + + struct tree_value_config_t { + value_size_t payload_size = 256; + }; + using insert_iertr = eagain_iertr::extend< + crimson::ct_error::value_too_large>; + insert_iertr::future<std::pair<Cursor, bool>> + insert(Transaction& t, const ghobject_t& obj, tree_value_config_t _vconf) { + LOG_PREFIX(OTree::insert); + if (_vconf.payload_size > value_builder.get_max_value_payload_size()) { + SUBERRORT(seastore_onode, "value payload size {} too large to insert {}", + t, _vconf.payload_size, key_hobj_t{obj}); + return crimson::ct_error::value_too_large::make(); + } + if (obj.hobj.nspace.size() > value_builder.get_max_ns_size()) { + SUBERRORT(seastore_onode, "namespace size {} too large to insert {}", + t, obj.hobj.nspace.size(), key_hobj_t{obj}); + return crimson::ct_error::value_too_large::make(); + } + if (obj.hobj.oid.name.size() > value_builder.get_max_oid_size()) { + SUBERRORT(seastore_onode, "oid size {} too large to insert {}", + t, obj.hobj.oid.name.size(), key_hobj_t{obj}); + return crimson::ct_error::value_too_large::make(); + } + value_config_t vconf{value_builder.get_header_magic(), _vconf.payload_size}; + return seastar::do_with( + key_hobj_t{obj}, + [this, &t, vconf](auto& key) -> eagain_ifuture<std::pair<Cursor, bool>> { + ceph_assert(key.is_valid()); + return get_root(t).si_then([this, &t, &key, vconf](auto root) { + return root->insert(get_context(t), key, vconf, std::move(root)); + }).si_then([this](auto ret) { + auto& [cursor, success] = ret; + return std::make_pair(Cursor(this, cursor), success); + }); + } + ); + } + + eagain_ifuture<std::size_t> erase(Transaction& t, const ghobject_t& obj) { + return seastar::do_with( + key_hobj_t{obj}, + [this, &t](auto& key) -> eagain_ifuture<std::size_t> { + return get_root(t).si_then([this, &t, &key](auto root) { + return root->erase(get_context(t), key, std::move(root)); + }); + } + ); + } + + eagain_ifuture<Cursor> erase(Transaction& t, Cursor& pos) { + return pos.erase(t); + } + + eagain_ifuture<> erase(Transaction& t, Value& value) { + assert(value.is_tracked()); + auto ref_cursor = value.p_cursor; + return ref_cursor->erase(get_context(t), false + ).si_then([ref_cursor] (auto next_cursor) { + assert(ref_cursor->is_invalid()); + assert(!next_cursor); + }); + } + + /* + * stats + */ + + eagain_ifuture<size_t> height(Transaction& t) { + return get_root(t).si_then([](auto root) { + return size_t(root->level() + 1); + }); + } + + eagain_ifuture<tree_stats_t> get_stats_slow(Transaction& t) { + return get_root(t).si_then([this, &t](auto root) { + unsigned height = root->level() + 1; + return root->get_tree_stats(get_context(t) + ).si_then([height](auto stats) { + stats.height = height; + return seastar::make_ready_future<tree_stats_t>(stats); + }); + }); + } + + std::ostream& dump(Transaction& t, std::ostream& os) { + auto root = root_tracker->get_root(t); + if (root) { + root->dump(os); + } else { + os << "empty tree!"; + } + return os; + } + + std::ostream& print(std::ostream& os) const { + return os << "BTree-" << *nm; + } + + /* + * 
test_only + */ + + bool test_is_clean() const { + return root_tracker->is_clean(); + } + + eagain_ifuture<> test_clone_from( + Transaction& t, Transaction& t_from, Btree& from) { + // Note: assume the tree to clone is tracked correctly in memory. + // In some unit tests, parts of the tree are stubbed out that they + // should not be loaded from NodeExtentManager. + return from.get_root(t_from + ).si_then([this, &t](auto root_from) { + return root_from->test_clone_root(get_context(t), *root_tracker); + }); + } + + private: + context_t get_context(Transaction& t) { + return {*nm, value_builder, t}; + } + + eagain_ifuture<Ref<Node>> get_root(Transaction& t) { + auto root = root_tracker->get_root(t); + if (root) { + return seastar::make_ready_future<Ref<Node>>(root); + } else { + return Node::load_root(get_context(t), *root_tracker); + } + } + + NodeExtentManagerURef nm; + const ValueBuilderImpl<ValueImpl> value_builder; + RootNodeTrackerURef root_tracker; + + friend class DummyChildPool; +}; + +template <typename ValueImpl> +inline std::ostream& operator<<(std::ostream& os, const Btree<ValueImpl>& tree) { + return tree.print(os); +} + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/tree_utils.h b/src/crimson/os/seastore/onode_manager/staged-fltree/tree_utils.h new file mode 100644 index 000000000..3ad3564a6 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/tree_utils.h @@ -0,0 +1,565 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <cassert> +#include <cstring> +#include <random> +#include <string> +#include <sstream> +#include <utility> +#include <vector> + +#include <seastar/core/thread.hh> + +#include "crimson/common/log.h" +#include "stages/key_layout.h" +#include "tree.h" + +/** + * tree_utils.h + * + * Contains shared logic for unit tests and perf tool. + */ + +namespace crimson::os::seastore::onode { + +/** + * templates to work with tree utility classes: + * + * struct ValueItem { + * <public members> + * + * value_size_t get_payload_size() const; + * static ValueItem create(std::size_t expected_size, std::size_t id); + * }; + * std::ostream& operator<<(std::ostream& os, const ValueItem& item); + * + * class ValueImpl final : public Value { + * ... 
+ * + * using item_t = ValueItem; + * void initialize(Transaction& t, const item_t& item); + * void validate(const item_t& item); + * }; + * + */ + +template <typename CursorType> +void initialize_cursor_from_item( + Transaction& t, + const ghobject_t& key, + const typename decltype(std::declval<CursorType>().value())::item_t& item, + CursorType& cursor, + bool insert_success) { + ceph_assert(insert_success); + ceph_assert(!cursor.is_end()); + ceph_assert(cursor.get_ghobj() == key); + auto tree_value = cursor.value(); + tree_value.initialize(t, item); +} + + +template <typename CursorType> +void validate_cursor_from_item( + const ghobject_t& key, + const typename decltype(std::declval<CursorType>().value())::item_t& item, + CursorType& cursor) { + ceph_assert(!cursor.is_end()); + ceph_assert(cursor.get_ghobj() == key); + auto tree_value = cursor.value(); + tree_value.validate(item); +} + +template <typename ValueItem> +class Values { + public: + Values(size_t n) { + for (size_t i = 1; i <= n; ++i) { + auto item = create(i * 8); + values.push_back(item); + } + } + + Values(std::vector<size_t> sizes) { + for (auto& size : sizes) { + auto item = create(size); + values.push_back(item); + } + } + + ~Values() = default; + + ValueItem create(size_t size) { + return ValueItem::create(size, id++); + } + + ValueItem pick() const { + auto index = rd() % values.size(); + return values[index]; + } + + private: + std::size_t id = 0; + mutable std::random_device rd; + std::vector<ValueItem> values; +}; + +template <typename ValueItem> +class KVPool { + public: + struct kv_t { + ghobject_t key; + ValueItem value; + }; + using kv_vector_t = std::vector<kv_t>; + using kvptr_vector_t = std::vector<kv_t*>; + using iterator_t = typename kvptr_vector_t::iterator; + + size_t size() const { + return kvs.size(); + } + + iterator_t begin() { + return serial_p_kvs.begin(); + } + iterator_t end() { + return serial_p_kvs.end(); + } + iterator_t random_begin() { + return random_p_kvs.begin(); + } + iterator_t random_end() { + return random_p_kvs.end(); + } + + void shuffle() { + std::shuffle(random_p_kvs.begin(), random_p_kvs.end(), std::default_random_engine{}); + } + + void erase_from_random(iterator_t begin, iterator_t end) { + random_p_kvs.erase(begin, end); + kv_vector_t new_kvs; + for (auto p_kv : random_p_kvs) { + new_kvs.emplace_back(*p_kv); + } + std::sort(new_kvs.begin(), new_kvs.end(), [](auto& l, auto& r) { + return l.key < r.key; + }); + + kvs.swap(new_kvs); + serial_p_kvs.resize(kvs.size()); + random_p_kvs.resize(kvs.size()); + init(); + } + + static KVPool create_raw_range( + const std::vector<size_t>& ns_sizes, + const std::vector<size_t>& oid_sizes, + const std::vector<size_t>& value_sizes, + const std::pair<index_t, index_t>& range2, + const std::pair<index_t, index_t>& range1, + const std::pair<index_t, index_t>& range0) { + ceph_assert(range2.first < range2.second); + ceph_assert(range2.second - 1 <= MAX_SHARD); + ceph_assert(range2.second - 1 <= MAX_CRUSH); + ceph_assert(range1.first < range1.second); + ceph_assert(range1.second - 1 <= 9); + ceph_assert(range0.first < range0.second); + + kv_vector_t kvs; + std::random_device rd; + Values<ValueItem> values{value_sizes}; + for (index_t i = range2.first; i < range2.second; ++i) { + for (index_t j = range1.first; j < range1.second; ++j) { + size_t ns_size; + size_t oid_size; + if (j == 0) { + // store ns0, oid0 as empty strings for test purposes + ns_size = 0; + oid_size = 0; + } else { + ns_size = ns_sizes[rd() % ns_sizes.size()]; + oid_size = 
oid_sizes[rd() % oid_sizes.size()]; + assert(ns_size && oid_size); + } + for (index_t k = range0.first; k < range0.second; ++k) { + kvs.emplace_back( + kv_t{make_raw_oid(i, j, k, ns_size, oid_size), values.pick()} + ); + } + } + } + return KVPool(std::move(kvs)); + } + + static KVPool create_range( + const std::pair<index_t, index_t>& range_i, + const std::vector<size_t>& value_sizes, + const uint64_t block_size) { + kv_vector_t kvs; + std::random_device rd; + for (index_t i = range_i.first; i < range_i.second; ++i) { + auto value_size = value_sizes[rd() % value_sizes.size()]; + kvs.emplace_back( + kv_t{make_oid(i), ValueItem::create(value_size, i, block_size)} + ); + } + return KVPool(std::move(kvs)); + } + + private: + KVPool(kv_vector_t&& _kvs) + : kvs(std::move(_kvs)), serial_p_kvs(kvs.size()), random_p_kvs(kvs.size()) { + init(); + } + + void init() { + std::transform(kvs.begin(), kvs.end(), serial_p_kvs.begin(), + [] (kv_t& item) { return &item; }); + std::transform(kvs.begin(), kvs.end(), random_p_kvs.begin(), + [] (kv_t& item) { return &item; }); + shuffle(); + } + + static ghobject_t make_raw_oid( + index_t index2, index_t index1, index_t index0, + size_t ns_size, size_t oid_size) { + assert(index1 < 10); + std::ostringstream os_ns; + std::ostringstream os_oid; + if (index1 == 0) { + assert(!ns_size); + assert(!oid_size); + } else { + os_ns << "ns" << index1; + auto current_size = (size_t)os_ns.tellp(); + assert(ns_size >= current_size); + os_ns << std::string(ns_size - current_size, '_'); + + os_oid << "oid" << index1; + current_size = (size_t)os_oid.tellp(); + assert(oid_size >= current_size); + os_oid << std::string(oid_size - current_size, '_'); + } + + return ghobject_t(shard_id_t(index2), index2, index2, + os_ns.str(), os_oid.str(), index0, index0); + } + + static ghobject_t make_oid(index_t i) { + std::stringstream ss; + ss << "object_" << i; + auto ret = ghobject_t( + hobject_t( + sobject_t(ss.str(), CEPH_NOSNAP))); + ret.set_shard(shard_id_t(0)); + ret.hobj.nspace = "asdf"; + return ret; + } + + kv_vector_t kvs; + kvptr_vector_t serial_p_kvs; + kvptr_vector_t random_p_kvs; +}; + +template <bool TRACK, typename ValueImpl> +class TreeBuilder { + public: + using BtreeImpl = Btree<ValueImpl>; + using BtreeCursor = typename BtreeImpl::Cursor; + using ValueItem = typename ValueImpl::item_t; + using iterator_t = typename KVPool<ValueItem>::iterator_t; + + TreeBuilder(KVPool<ValueItem>& kvs, NodeExtentManagerURef&& nm) + : kvs{kvs} { + tree.emplace(std::move(nm)); + } + + eagain_ifuture<> bootstrap(Transaction& t) { + std::ostringstream oss; +#ifndef NDEBUG + oss << "debug=on, "; +#else + oss << "debug=off, "; +#endif +#ifdef UNIT_TESTS_BUILT + oss << "UNIT_TEST_BUILT=on, "; +#else + oss << "UNIT_TEST_BUILT=off, "; +#endif + if constexpr (TRACK) { + oss << "track=on, "; + } else { + oss << "track=off, "; + } + oss << *tree; + logger().warn("TreeBuilder: {}, bootstrapping ...", oss.str()); + return tree->mkfs(t); + } + + eagain_ifuture<BtreeCursor> insert_one( + Transaction& t, const iterator_t& iter_rd) { + auto p_kv = *iter_rd; + logger().debug("[{}] insert {} -> {}", + iter_rd - kvs.random_begin(), + key_hobj_t{p_kv->key}, + p_kv->value); + return tree->insert( + t, p_kv->key, {p_kv->value.get_payload_size()} + ).si_then([&t, this, p_kv](auto ret) { + boost::ignore_unused(this); // avoid clang warning; + auto success = ret.second; + auto cursor = std::move(ret.first); + initialize_cursor_from_item(t, p_kv->key, p_kv->value, cursor, success); +#ifndef NDEBUG + 
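// Debug-only cross-check: re-find the key and verify it matches the freshly inserted cursor. + 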
validate_cursor_from_item(p_kv->key, p_kv->value, cursor); + return tree->find(t, p_kv->key + ).si_then([cursor, p_kv](auto cursor_) mutable { + assert(!cursor_.is_end()); + ceph_assert(cursor_.get_ghobj() == p_kv->key); + ceph_assert(cursor_.value() == cursor.value()); + validate_cursor_from_item(p_kv->key, p_kv->value, cursor_); + return cursor; + }); +#else + return eagain_iertr::make_ready_future<BtreeCursor>(cursor); +#endif + }).handle_error_interruptible( + [] (const crimson::ct_error::value_too_large& e) { + ceph_abort("impossible path"); + }, + crimson::ct_error::pass_further_all{} + ); + } + + eagain_ifuture<> insert(Transaction& t) { + auto ref_kv_iter = seastar::make_lw_shared<iterator_t>(); + *ref_kv_iter = kvs.random_begin(); + auto cursors = seastar::make_lw_shared<std::vector<BtreeCursor>>(); + logger().warn("start inserting {} kvs ...", kvs.size()); + auto start_time = mono_clock::now(); + return trans_intr::repeat([&t, this, cursors, ref_kv_iter, + start_time]() + -> eagain_ifuture<seastar::stop_iteration> { + if (*ref_kv_iter == kvs.random_end()) { + std::chrono::duration<double> duration = mono_clock::now() - start_time; + logger().warn("Insert done! {}s", duration.count()); + return seastar::make_ready_future<seastar::stop_iteration>( + seastar::stop_iteration::yes); + } else { + return insert_one(t, *ref_kv_iter + ).si_then([cursors, ref_kv_iter] (auto cursor) { + if constexpr (TRACK) { + cursors->emplace_back(cursor); + } + ++(*ref_kv_iter); + return seastar::stop_iteration::no; + }); + } + }).si_then([&t, this, cursors, ref_kv_iter] { + if (!cursors->empty()) { + logger().info("Verifing tracked cursors ..."); + *ref_kv_iter = kvs.random_begin(); + return seastar::do_with( + cursors->begin(), + [&t, this, cursors, ref_kv_iter] (auto& c_iter) { + return trans_intr::repeat( + [&t, this, &c_iter, cursors, ref_kv_iter] () + -> eagain_ifuture<seastar::stop_iteration> { + if (*ref_kv_iter == kvs.random_end()) { + logger().info("Verify done!"); + return seastar::make_ready_future<seastar::stop_iteration>( + seastar::stop_iteration::yes); + } + assert(c_iter != cursors->end()); + auto p_kv = **ref_kv_iter; + // validate values in tree keep intact + return tree->find(t, p_kv->key).si_then([&c_iter, ref_kv_iter](auto cursor) { + auto p_kv = **ref_kv_iter; + validate_cursor_from_item(p_kv->key, p_kv->value, cursor); + // validate values in cursors keep intact + validate_cursor_from_item(p_kv->key, p_kv->value, *c_iter); + ++(*ref_kv_iter); + ++c_iter; + return seastar::stop_iteration::no; + }); + }); + }); + } else { + return eagain_iertr::now(); + } + }); + } + + eagain_ifuture<> erase_one( + Transaction& t, const iterator_t& iter_rd) { + auto p_kv = *iter_rd; + logger().debug("[{}] erase {} -> {}", + iter_rd - kvs.random_begin(), + key_hobj_t{p_kv->key}, + p_kv->value); + return tree->erase(t, p_kv->key + ).si_then([&t, this, p_kv] (auto size) { + boost::ignore_unused(t); // avoid clang warning; + boost::ignore_unused(this); + boost::ignore_unused(p_kv); + ceph_assert(size == 1); +#ifndef NDEBUG + return tree->contains(t, p_kv->key + ).si_then([] (bool ret) { + ceph_assert(ret == false); + }); +#else + return eagain_iertr::now(); +#endif + }); + } + + eagain_ifuture<> erase(Transaction& t, std::size_t erase_size) { + assert(erase_size <= kvs.size()); + kvs.shuffle(); + auto erase_end = kvs.random_begin() + erase_size; + auto ref_kv_iter = seastar::make_lw_shared<iterator_t>(); + auto cursors = seastar::make_lw_shared<std::map<ghobject_t, BtreeCursor>>(); + return 
eagain_iertr::now().si_then([&t, this, cursors, ref_kv_iter] { + (void)this; // silence clang warning for !TRACK + (void)t; // silence clang warning for !TRACK + if constexpr (TRACK) { + logger().info("Tracking cursors before erase ..."); + *ref_kv_iter = kvs.begin(); + auto start_time = mono_clock::now(); + return trans_intr::repeat( + [&t, this, cursors, ref_kv_iter, start_time] () + -> eagain_ifuture<seastar::stop_iteration> { + if (*ref_kv_iter == kvs.end()) { + std::chrono::duration<double> duration = mono_clock::now() - start_time; + logger().info("Track done! {}s", duration.count()); + return seastar::make_ready_future<seastar::stop_iteration>( + seastar::stop_iteration::yes); + } + auto p_kv = **ref_kv_iter; + return tree->find(t, p_kv->key).si_then([cursors, ref_kv_iter](auto cursor) { + auto p_kv = **ref_kv_iter; + validate_cursor_from_item(p_kv->key, p_kv->value, cursor); + cursors->emplace(p_kv->key, cursor); + ++(*ref_kv_iter); + return seastar::stop_iteration::no; + }); + }); + } else { + return eagain_iertr::now(); + } + }).si_then([&t, this, ref_kv_iter, erase_end] { + *ref_kv_iter = kvs.random_begin(); + logger().warn("start erasing {}/{} kvs ...", + erase_end - kvs.random_begin(), kvs.size()); + auto start_time = mono_clock::now(); + return trans_intr::repeat([&t, this, ref_kv_iter, + start_time, erase_end] () + -> eagain_ifuture<seastar::stop_iteration> { + if (*ref_kv_iter == erase_end) { + std::chrono::duration<double> duration = mono_clock::now() - start_time; + logger().warn("Erase done! {}s", duration.count()); + return seastar::make_ready_future<seastar::stop_iteration>( + seastar::stop_iteration::yes); + } else { + return erase_one(t, *ref_kv_iter + ).si_then([ref_kv_iter] { + ++(*ref_kv_iter); + return seastar::stop_iteration::no; + }); + } + }); + }).si_then([this, cursors, ref_kv_iter, erase_end] { + if constexpr (TRACK) { + logger().info("Verifing tracked cursors ..."); + *ref_kv_iter = kvs.random_begin(); + while (*ref_kv_iter != erase_end) { + auto p_kv = **ref_kv_iter; + auto c_it = cursors->find(p_kv->key); + ceph_assert(c_it != cursors->end()); + ceph_assert(c_it->second.is_end()); + cursors->erase(c_it); + ++(*ref_kv_iter); + } + } + kvs.erase_from_random(kvs.random_begin(), erase_end); + if constexpr (TRACK) { + *ref_kv_iter = kvs.begin(); + for (auto& [k, c] : *cursors) { + assert(*ref_kv_iter != kvs.end()); + auto p_kv = **ref_kv_iter; + validate_cursor_from_item(p_kv->key, p_kv->value, c); + ++(*ref_kv_iter); + } + logger().info("Verify done!"); + } + }); + } + + eagain_ifuture<> get_stats(Transaction& t) { + return tree->get_stats_slow(t + ).si_then([](auto stats) { + logger().warn("{}", stats); + }); + } + + eagain_ifuture<std::size_t> height(Transaction& t) { + return tree->height(t); + } + + void reload(NodeExtentManagerURef&& nm) { + tree.emplace(std::move(nm)); + } + + eagain_ifuture<> validate_one( + Transaction& t, const iterator_t& iter_seq) { + assert(iter_seq != kvs.end()); + auto next_iter = iter_seq + 1; + auto p_kv = *iter_seq; + return tree->find(t, p_kv->key + ).si_then([p_kv, &t] (auto cursor) { + validate_cursor_from_item(p_kv->key, p_kv->value, cursor); + return cursor.get_next(t); + }).si_then([next_iter, this] (auto cursor) { + if (next_iter == kvs.end()) { + ceph_assert(cursor.is_end()); + } else { + auto p_kv = *next_iter; + validate_cursor_from_item(p_kv->key, p_kv->value, cursor); + } + }); + } + + eagain_ifuture<> validate(Transaction& t) { + logger().info("Verifing inserted ..."); + return seastar::do_with( + kvs.begin(), + 
[this, &t] (auto &iter) { + return trans_intr::repeat( + [this, &t, &iter]() ->eagain_iertr::future<seastar::stop_iteration> { + if (iter == kvs.end()) { + return seastar::make_ready_future<seastar::stop_iteration>( + seastar::stop_iteration::yes); + } + return validate_one(t, iter).si_then([&iter] { + ++iter; + return seastar::make_ready_future<seastar::stop_iteration>( + seastar::stop_iteration::no); + }); + }); + }); + } + + private: + static seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_test); + } + + KVPool<ValueItem>& kvs; + std::optional<BtreeImpl> tree; +}; + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/value.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/value.cc new file mode 100644 index 000000000..694480d4e --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/value.cc @@ -0,0 +1,164 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#include "value.h" + +#include "node.h" +#include "node_delta_recorder.h" +#include "node_layout.h" + +// value implementations +#include "test/crimson/seastore/onode_tree/test_value.h" +#include "crimson/os/seastore/onode_manager/staged-fltree/fltree_onode_manager.h" + +namespace crimson::os::seastore::onode { + +ceph::bufferlist& +ValueDeltaRecorder::get_encoded(NodeExtentMutable& payload_mut) +{ + ceph::encode(node_delta_op_t::SUBOP_UPDATE_VALUE, encoded); + node_offset_t offset = payload_mut.get_node_offset(); + assert(offset > sizeof(value_header_t)); + offset -= sizeof(value_header_t); + ceph::encode(offset, encoded); + return encoded; +} + +Value::Value(NodeExtentManager& nm, + const ValueBuilder& vb, + Ref<tree_cursor_t>& p_cursor) + : nm{nm}, vb{vb}, p_cursor{p_cursor} {} + +Value::~Value() {} + +bool Value::is_tracked() const +{ + assert(!p_cursor->is_end()); + return p_cursor->is_tracked(); +} + +void Value::invalidate() +{ + p_cursor.reset(); +} + +eagain_ifuture<> Value::extend(Transaction& t, value_size_t extend_size) +{ + assert(is_tracked()); + [[maybe_unused]] auto target_size = get_payload_size() + extend_size; + return p_cursor->extend_value(get_context(t), extend_size) +#ifndef NDEBUG + .si_then([this, target_size] { + assert(target_size == get_payload_size()); + }) +#endif + ; +} + +eagain_ifuture<> Value::trim(Transaction& t, value_size_t trim_size) +{ + assert(is_tracked()); + assert(get_payload_size() > trim_size); + [[maybe_unused]] auto target_size = get_payload_size() - trim_size; + return p_cursor->trim_value(get_context(t), trim_size) +#ifndef NDEBUG + .si_then([this, target_size] { + assert(target_size == get_payload_size()); + }) +#endif + ; +} + +const value_header_t* Value::read_value_header() const +{ + auto ret = p_cursor->read_value_header(vb.get_header_magic()); + assert(ret->payload_size <= vb.get_max_value_payload_size()); + return ret; +} + +std::pair<NodeExtentMutable&, ValueDeltaRecorder*> +Value::do_prepare_mutate_payload(Transaction& t) +{ + return p_cursor->prepare_mutate_value_payload(get_context(t)); +} + +laddr_t Value::get_hint() const +{ + return p_cursor->get_key_view(vb.get_header_magic()).get_hint(); +} + +std::unique_ptr<ValueDeltaRecorder> +build_value_recorder_by_type(ceph::bufferlist& encoded, + const value_magic_t& magic) +{ + std::unique_ptr<ValueDeltaRecorder> ret; + switch (magic) { + case value_magic_t::ONODE: + ret = std::make_unique<FLTreeOnode::Recorder>(encoded); + break; + case value_magic_t::TEST_UNBOUND: + ret = 
std::make_unique<UnboundedValue::Recorder>(encoded); + break; + case value_magic_t::TEST_BOUNDED: + ret = std::make_unique<BoundedValue::Recorder>(encoded); + break; + case value_magic_t::TEST_EXTENDED: + ret = std::make_unique<ExtendedValue::Recorder>(encoded); + break; + default: + ret = nullptr; + break; + } + assert(!ret || ret->get_header_magic() == magic); + return ret; +} + +void validate_tree_config(const tree_conf_t& conf) +{ + ceph_assert(conf.max_ns_size < + string_key_view_t::VALID_UPPER_BOUND); + ceph_assert(conf.max_oid_size < + string_key_view_t::VALID_UPPER_BOUND); + ceph_assert(is_valid_node_size(conf.internal_node_size)); + ceph_assert(is_valid_node_size(conf.leaf_node_size)); + + if (conf.do_split_check) { + // In hope to comply with 3 * (oid + ns) + 2 * value < node + // + // see node_layout.h for NODE_BLOCK_SIZE considerations + // + // The below calculations also consider the internal indexing overhead in + // order to be accurate, so the equation has become: + // node-header-size + 2 * max-full-insert-size + + // max-ns/oid-split-overhead <= node-size + + auto obj = ghobject_t{shard_id_t{0}, 0, 0, "", "", 0, 0}; + key_hobj_t key(obj); + auto max_str_size = conf.max_ns_size + conf.max_oid_size; +#define _STAGE_T(NodeType) node_to_stage_t<typename NodeType::node_stage_t> +#define NXT_T(StageType) staged<typename StageType::next_param_t> + + laddr_t i_value{0}; + auto insert_size_2 = + _STAGE_T(InternalNode0)::insert_size(key, i_value); + auto insert_size_0 = + NXT_T(NXT_T(_STAGE_T(InternalNode0)))::insert_size(key, i_value); + unsigned internal_size_bound = sizeof(node_header_t) + + (insert_size_2 + max_str_size) * 2 + + (insert_size_2 - insert_size_0 + max_str_size); + ceph_assert(internal_size_bound <= conf.internal_node_size); + + value_config_t l_value; + l_value.payload_size = conf.max_value_payload_size; + insert_size_2 = + _STAGE_T(LeafNode0)::insert_size(key, l_value); + insert_size_0 = + NXT_T(NXT_T(_STAGE_T(LeafNode0)))::insert_size(key, l_value); + unsigned leaf_size_bound = sizeof(node_header_t) + + (insert_size_2 + max_str_size) * 2 + + (insert_size_2 - insert_size_0 + max_str_size); + ceph_assert(leaf_size_bound <= conf.leaf_node_size); + } +} + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/value.h b/src/crimson/os/seastore/onode_manager/staged-fltree/value.h new file mode 100644 index 000000000..d9f0c231a --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/value.h @@ -0,0 +1,337 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <ostream> + +#include "include/buffer.h" +#include "crimson/common/type_helpers.h" + +#include "fwd.h" +#include "node_extent_mutable.h" + +namespace crimson::os::seastore::onode { + +// value size up to 64 KiB +using value_size_t = uint16_t; +enum class value_magic_t : uint8_t { + ONODE = 0x52, + TEST_UNBOUND, + TEST_BOUNDED, + TEST_EXTENDED, +}; +inline std::ostream& operator<<(std::ostream& os, const value_magic_t& magic) { + switch (magic) { + case value_magic_t::ONODE: + return os << "ONODE"; + case value_magic_t::TEST_UNBOUND: + return os << "TEST_UNBOUND"; + case value_magic_t::TEST_BOUNDED: + return os << "TEST_BOUNDED"; + case value_magic_t::TEST_EXTENDED: + return os << "TEST_EXTENDED"; + default: + return os << "UNKNOWN(" << magic << ")"; + } +} + +/** + * value_config_t + * + * Parameters to create a value. 
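+ * (payload_size excludes the value_header_t; see allocation_size().)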
+ */ +struct value_config_t { + value_magic_t magic; + value_size_t payload_size; + + value_size_t allocation_size() const; + + void encode(ceph::bufferlist& encoded) const { + ceph::encode(magic, encoded); + ceph::encode(payload_size, encoded); + } + + static value_config_t decode(ceph::bufferlist::const_iterator& delta) { + value_magic_t magic; + ceph::decode(magic, delta); + value_size_t payload_size; + ceph::decode(payload_size, delta); + return {magic, payload_size}; + } +}; +inline std::ostream& operator<<(std::ostream& os, const value_config_t& conf) { + return os << "ValueConf(" << conf.magic + << ", " << conf.payload_size << "B)"; +} + +/** + * value_header_t + * + * The header structure in value layout. + * + * Value layout: + * + * # <- alloc size -> # + * # header | payload # + */ +struct value_header_t { + value_magic_t magic; + value_size_t payload_size; + + bool operator==(const value_header_t& rhs) const { + return (magic == rhs.magic && payload_size == rhs.payload_size); + } + bool operator!=(const value_header_t& rhs) const { + return !(*this == rhs); + } + + value_size_t allocation_size() const { + return payload_size + sizeof(value_header_t); + } + + const char* get_payload() const { + return reinterpret_cast<const char*>(this) + sizeof(value_header_t); + } + + NodeExtentMutable get_payload_mutable(NodeExtentMutable& node) const { + return node.get_mutable_absolute(get_payload(), payload_size); + } + + char* get_payload() { + return reinterpret_cast<char*>(this) + sizeof(value_header_t); + } + + void initiate(NodeExtentMutable& mut, const value_config_t& config) { + value_header_t header{config.magic, config.payload_size}; + mut.copy_in_absolute(this, header); + mut.set_absolute(get_payload(), 0, config.payload_size); + } + + static value_size_t estimate_allocation_size(value_size_t payload_size) { + return payload_size + sizeof(value_header_t); + } +} __attribute__((packed)); +inline std::ostream& operator<<(std::ostream& os, const value_header_t& header) { + return os << "Value(" << header.magic + << ", " << header.payload_size << "B)"; +} + +inline value_size_t value_config_t::allocation_size() const { + return value_header_t::estimate_allocation_size(payload_size); +} + +/** + * ValueDeltaRecorder + * + * An abstracted class to handle user-defined value delta encode, decode and + * replay. + */ +class ValueDeltaRecorder { + public: + virtual ~ValueDeltaRecorder() = default; + ValueDeltaRecorder(const ValueDeltaRecorder&) = delete; + ValueDeltaRecorder(ValueDeltaRecorder&&) = delete; + ValueDeltaRecorder& operator=(const ValueDeltaRecorder&) = delete; + ValueDeltaRecorder& operator=(ValueDeltaRecorder&&) = delete; + + /// Returns the value header magic for validation purpose. + virtual value_magic_t get_header_magic() const = 0; + + /// Called by DeltaRecorderT to apply user-defined value delta. + virtual void apply_value_delta(ceph::bufferlist::const_iterator&, + NodeExtentMutable&, + laddr_t) = 0; + + protected: + ValueDeltaRecorder(ceph::bufferlist& encoded) : encoded{encoded} {} + + /// Get the delta buffer to encode user-defined value delta. + ceph::bufferlist& get_encoded(NodeExtentMutable&); + + private: + ceph::bufferlist& encoded; +}; + +/** + * tree_conf_t + * + * Hard limits and compile-time configurations. 
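+ * (Checked by validate_tree_config() when a ValueBuilderImpl is constructed.)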
+ */ +struct tree_conf_t { + value_magic_t value_magic; + string_size_t max_ns_size; + string_size_t max_oid_size; + value_size_t max_value_payload_size; + extent_len_t internal_node_size; + extent_len_t leaf_node_size; + bool do_split_check = true; +}; + +class tree_cursor_t; +/** + * Value + * + * Value is a stateless view of the underlying value header and payload content + * stored in a tree leaf node, with the support to implement user-defined value + * deltas and to extend and trim the underlying payload data (not implemented + * yet). + * + * In the current implementation, we don't guarantee any alignment for value + * payload due to unaligned node layout and the according merge and split + * operations. + */ +class Value { + public: + virtual ~Value(); + Value(const Value&) = default; + Value(Value&&) = default; + Value& operator=(const Value&) = delete; + Value& operator=(Value&&) = delete; + + /// Returns whether the Value is still tracked in tree. + bool is_tracked() const; + + /// Invalidate the Value before submitting transaction. + void invalidate(); + + /// Returns the value payload size. + value_size_t get_payload_size() const { + assert(is_tracked()); + return read_value_header()->payload_size; + } + + laddr_t get_hint() const; + + bool operator==(const Value& v) const { return p_cursor == v.p_cursor; } + bool operator!=(const Value& v) const { return !(*this == v); } + + protected: + Value(NodeExtentManager&, const ValueBuilder&, Ref<tree_cursor_t>&); + + /// Extends the payload size. + eagain_ifuture<> extend(Transaction&, value_size_t extend_size); + + /// Trim and shrink the payload. + eagain_ifuture<> trim(Transaction&, value_size_t trim_size); + + /// Get the permission to mutate the payload with the optional value recorder. + template <typename PayloadT, typename ValueDeltaRecorderT> + std::pair<NodeExtentMutable&, ValueDeltaRecorderT*> + prepare_mutate_payload(Transaction& t) { + assert(is_tracked()); + assert(sizeof(PayloadT) <= get_payload_size()); + + auto value_mutable = do_prepare_mutate_payload(t); + assert(value_mutable.first.get_write() == + const_cast<const Value*>(this)->template read_payload<char>()); + assert(value_mutable.first.get_length() == get_payload_size()); + return {value_mutable.first, + static_cast<ValueDeltaRecorderT*>(value_mutable.second)}; + } + + /// Get the latest payload pointer for read. + template <typename PayloadT> + const PayloadT* read_payload() const { + assert(is_tracked()); + // see Value documentation + static_assert(alignof(PayloadT) == 1); + assert(sizeof(PayloadT) <= get_payload_size()); + return reinterpret_cast<const PayloadT*>(read_value_header()->get_payload()); + } + + private: + const value_header_t* read_value_header() const; + context_t get_context(Transaction& t) { + return {nm, vb, t}; + } + + std::pair<NodeExtentMutable&, ValueDeltaRecorder*> + do_prepare_mutate_payload(Transaction&); + + NodeExtentManager& nm; + const ValueBuilder& vb; + Ref<tree_cursor_t> p_cursor; + + template <typename ValueImpl> + friend class Btree; +}; + +/** + * ValueBuilder + * + * For tree nodes to build values without the need to depend on the actual + * implementation. 
+ */ +struct ValueBuilder { + virtual value_magic_t get_header_magic() const = 0; + virtual string_size_t get_max_ns_size() const = 0; + virtual string_size_t get_max_oid_size() const = 0; + virtual value_size_t get_max_value_payload_size() const = 0; + virtual extent_len_t get_internal_node_size() const = 0; + virtual extent_len_t get_leaf_node_size() const = 0; + virtual std::unique_ptr<ValueDeltaRecorder> + build_value_recorder(ceph::bufferlist&) const = 0; +}; + +/** + * ValueBuilderImpl + * + * The concrete ValueBuilder implementation in Btree. + */ +template <typename ValueImpl> +struct ValueBuilderImpl final : public ValueBuilder { + ValueBuilderImpl() { + validate_tree_config(ValueImpl::TREE_CONF); + } + + value_magic_t get_header_magic() const { + return ValueImpl::TREE_CONF.value_magic; + } + string_size_t get_max_ns_size() const override { + return ValueImpl::TREE_CONF.max_ns_size; + } + string_size_t get_max_oid_size() const override { + return ValueImpl::TREE_CONF.max_oid_size; + } + value_size_t get_max_value_payload_size() const override { + return ValueImpl::TREE_CONF.max_value_payload_size; + } + extent_len_t get_internal_node_size() const override { + return ValueImpl::TREE_CONF.internal_node_size; + } + extent_len_t get_leaf_node_size() const override { + return ValueImpl::TREE_CONF.leaf_node_size; + } + + std::unique_ptr<ValueDeltaRecorder> + build_value_recorder(ceph::bufferlist& encoded) const override { + std::unique_ptr<ValueDeltaRecorder> ret = + std::make_unique<typename ValueImpl::Recorder>(encoded); + assert(ret->get_header_magic() == get_header_magic()); + return ret; + } + + ValueImpl build_value(NodeExtentManager& nm, + const ValueBuilder& vb, + Ref<tree_cursor_t>& p_cursor) const { + assert(vb.get_header_magic() == get_header_magic()); + return ValueImpl(nm, vb, p_cursor); + } +}; + +void validate_tree_config(const tree_conf_t& conf); + +/** + * Get the value recorder by type (the magic value) when the ValueBuilder is + * unavailable. 
+ */ +std::unique_ptr<ValueDeltaRecorder> +build_value_recorder_by_type(ceph::bufferlist& encoded, const value_magic_t& magic); + +} + +#if FMT_VERSION >= 90000 +template <> struct fmt::formatter<crimson::os::seastore::onode::value_config_t> : fmt::ostream_formatter {}; +template <> struct fmt::formatter<crimson::os::seastore::onode::value_header_t> : fmt::ostream_formatter {}; +#endif diff --git a/src/crimson/os/seastore/ordering_handle.h b/src/crimson/os/seastore/ordering_handle.h new file mode 100644 index 000000000..a7802fda3 --- /dev/null +++ b/src/crimson/os/seastore/ordering_handle.h @@ -0,0 +1,181 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <seastar/core/shared_mutex.hh> + +#include "crimson/common/operation.h" +#include "crimson/osd/osd_operation.h" + +namespace crimson::os::seastore { + +struct WritePipeline { + struct ReserveProjectedUsage : OrderedExclusivePhaseT<ReserveProjectedUsage> { + constexpr static auto type_name = "WritePipeline::reserve_projected_usage"; + } reserve_projected_usage; + struct OolWrites : UnorderedStageT<OolWrites> { + constexpr static auto type_name = "UnorderedStage::ool_writes_stage"; + } ool_writes; + struct Prepare : OrderedExclusivePhaseT<Prepare> { + constexpr static auto type_name = "WritePipeline::prepare_phase"; + } prepare; + struct DeviceSubmission : OrderedConcurrentPhaseT<DeviceSubmission> { + constexpr static auto type_name = "WritePipeline::device_submission_phase"; + } device_submission; + struct Finalize : OrderedExclusivePhaseT<Finalize> { + constexpr static auto type_name = "WritePipeline::finalize_phase"; + } finalize; + + using BlockingEvents = std::tuple< + ReserveProjectedUsage::BlockingEvent, + OolWrites::BlockingEvent, + Prepare::BlockingEvent, + DeviceSubmission::BlockingEvent, + Finalize::BlockingEvent + >; +}; + +/** + * PlaceholderOperation + * + * Once seastore is more complete, I expect to update the externally + * facing interfaces to permit passing the osd level operation through. + * Until then (and for tests likely permanently) we'll use this unregistered + * placeholder for the pipeline phases necessary for journal correctness. 
+ */ +class PlaceholderOperation : public crimson::osd::PhasedOperationT<PlaceholderOperation> { +public: + constexpr static auto type = 0U; + constexpr static auto type_name = + "crimson::os::seastore::PlaceholderOperation"; + + static PlaceholderOperation::IRef create() { + return IRef{new PlaceholderOperation()}; + } + + PipelineHandle handle; + WritePipeline::BlockingEvents tracking_events; + + PipelineHandle& get_handle() { + return handle; + } +private: + void dump_detail(ceph::Formatter *f) const final {} + void print(std::ostream &) const final {} +}; + +struct OperationProxy { + OperationRef op; + OperationProxy(OperationRef op) : op(std::move(op)) {} + + virtual seastar::future<> enter(WritePipeline::ReserveProjectedUsage&) = 0; + virtual seastar::future<> enter(WritePipeline::OolWrites&) = 0; + virtual seastar::future<> enter(WritePipeline::Prepare&) = 0; + virtual seastar::future<> enter(WritePipeline::DeviceSubmission&) = 0; + virtual seastar::future<> enter(WritePipeline::Finalize&) = 0; + + virtual void exit() = 0; + virtual seastar::future<> complete() = 0; + + virtual ~OperationProxy() = default; +}; + +template <typename OpT> +struct OperationProxyT : OperationProxy { + OperationProxyT(typename OpT::IRef op) : OperationProxy(op) {} + + OpT* that() { + return static_cast<OpT*>(op.get()); + } + const OpT* that() const { + return static_cast<const OpT*>(op.get()); + } + + seastar::future<> enter(WritePipeline::ReserveProjectedUsage& s) final { + return that()->enter_stage(s); + } + seastar::future<> enter(WritePipeline::OolWrites& s) final { + return that()->enter_stage(s); + } + seastar::future<> enter(WritePipeline::Prepare& s) final { + return that()->enter_stage(s); + } + seastar::future<> enter(WritePipeline::DeviceSubmission& s) final { + return that()->enter_stage(s); + } + seastar::future<> enter(WritePipeline::Finalize& s) final { + return that()->enter_stage(s); + } + + void exit() final { + return that()->handle.exit(); + } + seastar::future<> complete() final { + return that()->handle.complete(); + } +}; + +struct OrderingHandle { + // we can easily optimize this dynalloc out as all concretes are + // supposed to have exactly the same size. + std::unique_ptr<OperationProxy> op; + seastar::shared_mutex *collection_ordering_lock = nullptr; + + // in the future we might add further constructors / template to type + // erasure while extracting the location of tracking events. 
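+  // In short: an OrderingHandle pins a single transaction's traversal of the
+  // WritePipeline. enter()/exit()/complete() below just forward to the
+  // type-erased OperationProxy, while take_collection_lock() and
+  // maybe_release_collection_lock() serialize transactions touching the same
+  // collection via the shared_mutex handed in (released in the destructor).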
+ OrderingHandle(std::unique_ptr<OperationProxy> op) : op(std::move(op)) {} + OrderingHandle(OrderingHandle &&other) + : op(std::move(other.op)), + collection_ordering_lock(other.collection_ordering_lock) { + other.collection_ordering_lock = nullptr; + } + + seastar::future<> take_collection_lock(seastar::shared_mutex &mutex) { + ceph_assert(!collection_ordering_lock); + collection_ordering_lock = &mutex; + return collection_ordering_lock->lock(); + } + + void maybe_release_collection_lock() { + if (collection_ordering_lock) { + collection_ordering_lock->unlock(); + collection_ordering_lock = nullptr; + } + } + + template <typename T> + seastar::future<> enter(T &t) { + return op->enter(t); + } + + void exit() { + op->exit(); + } + + seastar::future<> complete() { + return op->complete(); + } + + ~OrderingHandle() { + maybe_release_collection_lock(); + } +}; + +inline OrderingHandle get_dummy_ordering_handle() { + using PlaceholderOpProxy = OperationProxyT<PlaceholderOperation>; + return OrderingHandle{ + std::make_unique<PlaceholderOpProxy>(PlaceholderOperation::create())}; +} + +} // namespace crimson::os::seastore + +namespace crimson { + template <> + struct EventBackendRegistry<os::seastore::PlaceholderOperation> { + static std::tuple<> get_backends() { + return {}; + } + }; +} // namespace crimson + diff --git a/src/crimson/os/seastore/random_block_manager.cc b/src/crimson/os/seastore/random_block_manager.cc new file mode 100644 index 000000000..749edc97f --- /dev/null +++ b/src/crimson/os/seastore/random_block_manager.cc @@ -0,0 +1,21 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "crimson/os/seastore/logging.h" +#include "crimson/os/seastore/random_block_manager.h" +#include "crimson/os/seastore/random_block_manager/nvme_block_device.h" +#include "crimson/os/seastore/random_block_manager/rbm_device.h" + +namespace crimson::os::seastore { + +seastar::future<random_block_device::RBMDeviceRef> +get_rb_device( + const std::string &device) +{ + return seastar::make_ready_future<random_block_device::RBMDeviceRef>( + std::make_unique< + random_block_device::nvme::NVMeBlockDevice + >(device + "/block")); +} + +} diff --git a/src/crimson/os/seastore/random_block_manager.h b/src/crimson/os/seastore/random_block_manager.h new file mode 100644 index 000000000..d9be1b5e6 --- /dev/null +++ b/src/crimson/os/seastore/random_block_manager.h @@ -0,0 +1,176 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <iosfwd> + +#include <boost/intrusive_ptr.hpp> +#include <boost/smart_ptr/intrusive_ref_counter.hpp> +#include <seastar/core/future.hh> + +#include "include/ceph_assert.h" +#include "crimson/os/seastore/seastore_types.h" +#include "include/buffer_fwd.h" +#include "crimson/osd/exceptions.h" + +#include "crimson/os/seastore/transaction.h" + +#include "crimson/common/layout.h" +#include "include/buffer.h" +#include "crimson/os/seastore/device.h" + +namespace crimson::os::seastore { + +struct rbm_shard_info_t { + std::size_t size = 0; + uint64_t start_offset = 0; + + DENC(rbm_shard_info_t, v, p) { + DENC_START(1, 1, p); + denc(v.size, p); + denc(v.start_offset, p); + DENC_FINISH(p); + } +}; + +struct rbm_metadata_header_t { + size_t size = 0; + size_t block_size = 0; + uint64_t feature = 0; + uint64_t journal_size = 0; + checksum_t crc = 0; + device_config_t config; + unsigned int shard_num = 0; + std::vector<rbm_shard_info_t> shard_infos; + + 
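+  // The DENC() block below generates the bounded encode/decode used to
+  // persist this superblock; the matching WRITE_CLASS_DENC_BOUNDED()
+  // registration sits at the end of this header.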
DENC(rbm_metadata_header_t, v, p) { + DENC_START(1, 1, p); + denc(v.size, p); + denc(v.block_size, p); + denc(v.feature, p); + + denc(v.journal_size, p); + denc(v.crc, p); + denc(v.config, p); + denc(v.shard_num, p); + denc(v.shard_infos, p); + DENC_FINISH(p); + } + + void validate() const { + ceph_assert(shard_num == seastar::smp::count); + ceph_assert(block_size > 0); + for (unsigned int i = 0; i < seastar::smp::count; i ++) { + ceph_assert(shard_infos[i].size > block_size && + shard_infos[i].size % block_size == 0); + ceph_assert_always(shard_infos[i].size <= DEVICE_OFF_MAX); + ceph_assert(journal_size > 0 && + journal_size % block_size == 0); + ceph_assert(shard_infos[i].start_offset < size && + shard_infos[i].start_offset % block_size == 0); + } + ceph_assert(config.spec.magic != 0); + ceph_assert(get_default_backend_of_device(config.spec.dtype) == + backend_type_t::RANDOM_BLOCK); + ceph_assert(config.spec.id <= DEVICE_ID_MAX_VALID); + } +}; + +enum class rbm_extent_state_t { + FREE, // not allocated + RESERVED, // extent is reserved by alloc_new_extent, but is not persistent + ALLOCATED, // extent is persistent +}; + +class Device; +using rbm_abs_addr = uint64_t; +constexpr rbm_abs_addr RBM_START_ADDRESS = 0; +class RandomBlockManager { +public: + + using read_ertr = crimson::errorator< + crimson::ct_error::input_output_error, + crimson::ct_error::invarg, + crimson::ct_error::enoent, + crimson::ct_error::erange>; + virtual read_ertr::future<> read(paddr_t addr, bufferptr &buffer) = 0; + + using write_ertr = crimson::errorator< + crimson::ct_error::input_output_error, + crimson::ct_error::invarg, + crimson::ct_error::ebadf, + crimson::ct_error::enospc, + crimson::ct_error::erange + >; + virtual write_ertr::future<> write(paddr_t addr, bufferptr &buf) = 0; + + using open_ertr = crimson::errorator< + crimson::ct_error::input_output_error, + crimson::ct_error::invarg, + crimson::ct_error::enoent>; + virtual open_ertr::future<> open() = 0; + + using close_ertr = crimson::errorator< + crimson::ct_error::input_output_error, + crimson::ct_error::invarg>; + virtual close_ertr::future<> close() = 0; + + using allocate_ertr = crimson::errorator< + crimson::ct_error::input_output_error, + crimson::ct_error::invarg, + crimson::ct_error::enospc + >; + using allocate_ret = allocate_ertr::future<paddr_t>; + // allocator, return start addr of allocated blocks + virtual paddr_t alloc_extent(size_t size) = 0; + + virtual void mark_space_used(paddr_t paddr, size_t len) = 0; + virtual void mark_space_free(paddr_t paddr, size_t len) = 0; + + virtual void complete_allocation(paddr_t addr, size_t size) = 0; + + virtual size_t get_size() const = 0; + virtual extent_len_t get_block_size() const = 0; + virtual uint64_t get_free_blocks() const = 0; + virtual device_id_t get_device_id() const = 0; + virtual const seastore_meta_t &get_meta() const = 0; + virtual Device* get_device() = 0; + virtual paddr_t get_start() = 0; + virtual rbm_extent_state_t get_extent_state(paddr_t addr, size_t size) = 0; + virtual size_t get_journal_size() const = 0; + virtual ~RandomBlockManager() {} +}; +using RandomBlockManagerRef = std::unique_ptr<RandomBlockManager>; + +inline rbm_abs_addr convert_paddr_to_abs_addr(const paddr_t& paddr) { + const blk_paddr_t& blk_addr = paddr.as_blk_paddr(); + return blk_addr.get_device_off(); +} + +inline paddr_t convert_abs_addr_to_paddr(rbm_abs_addr addr, device_id_t d_id) { + return paddr_t::make_blk_paddr(d_id, addr); +} + +namespace random_block_device { + class RBMDevice; +} + 
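+// Usage sketch (illustrative only): for a block-device address the two inline
+// helpers above are inverses, assuming `paddr` belongs to device `d_id`:
+//   rbm_abs_addr off = convert_paddr_to_abs_addr(paddr);
+//   paddr_t back     = convert_abs_addr_to_paddr(off, d_id);  // back == paddr
+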
+seastar::future<std::unique_ptr<random_block_device::RBMDevice>> + get_rb_device(const std::string &device); + +std::ostream &operator<<(std::ostream &out, const rbm_metadata_header_t &header); +std::ostream &operator<<(std::ostream &out, const rbm_shard_info_t &shard); +} + +WRITE_CLASS_DENC_BOUNDED( + crimson::os::seastore::rbm_shard_info_t +) +WRITE_CLASS_DENC_BOUNDED( + crimson::os::seastore::rbm_metadata_header_t +) + +#if FMT_VERSION >= 90000 +template<> struct fmt::formatter<crimson::os::seastore::rbm_metadata_header_t> : fmt::ostream_formatter {}; +template<> struct fmt::formatter<crimson::os::seastore::rbm_shard_info_t> : fmt::ostream_formatter {}; +#endif diff --git a/src/crimson/os/seastore/random_block_manager/avlallocator.cc b/src/crimson/os/seastore/random_block_manager/avlallocator.cc new file mode 100644 index 000000000..28137a23d --- /dev/null +++ b/src/crimson/os/seastore/random_block_manager/avlallocator.cc @@ -0,0 +1,201 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +// +#include "avlallocator.h" +#include "crimson/os/seastore/logging.h" + +SET_SUBSYS(seastore_device); + +namespace crimson::os::seastore { + +void AvlAllocator::mark_extent_used(rbm_abs_addr addr, size_t size) +{ + LOG_PREFIX(AvlAllocator::mark_extent_used); + DEBUG("addr: {}, size: {}, avail: {}", addr, size, available_size); + _remove_from_tree(addr, size); +} + +void AvlAllocator::init(rbm_abs_addr addr, size_t size, size_t b_size) +{ + LOG_PREFIX(AvlAllocator::init); + DEBUG("addr: {}, size: {}", addr, size); + auto r = new extent_range_t{ addr, addr + size }; + extent_tree.insert(*r); + extent_size_tree.insert(*r); + available_size = size; + block_size = b_size; + total_size = size; + base_addr = addr; +} + +void AvlAllocator::_remove_from_tree(rbm_abs_addr start, rbm_abs_addr size) +{ + LOG_PREFIX(AvlAllocator::_remove_from_tree); + rbm_abs_addr end = start + size; + + ceph_assert(size != 0); + ceph_assert(size <= available_size); + + auto rs = extent_tree.find(extent_range_t{start, end}, extent_tree.key_comp()); + DEBUG("rs start: {}, rs end: {}", rs->start, rs->end); + ceph_assert(rs != extent_tree.end()); + ceph_assert(rs->start <= start); + ceph_assert(rs->end >= end); + + bool left_over = (rs->start != start); + bool right_over = (rs->end != end); + + _extent_size_tree_rm(*rs); + + if (left_over && right_over) { + auto old_right_end = rs->end; + auto insert_pos = rs; + ceph_assert(insert_pos != extent_tree.end()); + ++insert_pos; + rs->end = start; + + auto r = new extent_range_t{end, old_right_end}; + extent_tree.insert_before(insert_pos, *r); + extent_size_tree.insert(*r); + available_size += r->length(); + _extent_size_tree_try_insert(*rs); + } else if (left_over) { + assert(is_aligned(start, block_size)); + rs->end = start; + _extent_size_tree_try_insert(*rs); + } else if (right_over) { + assert(is_aligned(end, block_size)); + rs->start = end; + _extent_size_tree_try_insert(*rs); + } else { + extent_tree.erase_and_dispose(rs, dispose_rs{}); + } +} + +rbm_abs_addr AvlAllocator::find_block(size_t size) +{ + const auto comp = extent_size_tree.key_comp(); + auto iter = extent_size_tree.lower_bound( + extent_range_t{base_addr, base_addr + size}, comp); + for (; iter != extent_size_tree.end(); ++iter) { + assert(is_aligned(iter->start, block_size)); + rbm_abs_addr off = iter->start; + if (off + size <= iter->end) { + return off; + } + } + return total_size; +} + +void AvlAllocator::_add_to_tree(rbm_abs_addr start, rbm_abs_addr size) +{ + 
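+  // Returning [start, start + size) to the free-space trees: find the
+  // neighbouring free ranges and coalesce with the one ending at `start`
+  // and/or the one starting at `end`; only when neither neighbour touches the
+  // range is a fresh extent_range_t inserted. available_size grows by the
+  // freed length in every case.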
LOG_PREFIX(AvlAllocator::_add_to_tree); + ceph_assert(size != 0); + DEBUG("addr: {}, size: {}", start, size); + + rbm_abs_addr end = start + size; + + auto rs_after = extent_tree.upper_bound(extent_range_t{start, end}, + extent_tree.key_comp()); + + auto rs_before = extent_tree.end(); + if (rs_after != extent_tree.begin()) { + rs_before = std::prev(rs_after); + } + + bool merge_before = (rs_before != extent_tree.end() && rs_before->end == start); + bool merge_after = (rs_after != extent_tree.end() && rs_after->start == end); + + if (merge_before && merge_after) { + _extent_size_tree_rm(*rs_before); + _extent_size_tree_rm(*rs_after); + rs_after->start = rs_before->start; + extent_tree.erase_and_dispose(rs_before, dispose_rs{}); + _extent_size_tree_try_insert(*rs_after); + } else if (merge_before) { + _extent_size_tree_rm(*rs_before); + rs_before->end = end; + _extent_size_tree_try_insert(*rs_before); + } else if (merge_after) { + _extent_size_tree_rm(*rs_after); + rs_after->start = start; + _extent_size_tree_try_insert(*rs_after); + } else { + auto r = new extent_range_t{start, end}; + extent_tree.insert(*r); + extent_size_tree.insert(*r); + available_size += r->length(); + } +} + +std::optional<interval_set<rbm_abs_addr>> AvlAllocator::alloc_extent( + size_t size) +{ + LOG_PREFIX(AvlAllocator::alloc_extent); + if (available_size < size) { + return std::nullopt; + } + if (extent_size_tree.empty()) { + return std::nullopt; + } + ceph_assert(size > 0); + ceph_assert(is_aligned(size, block_size)); + + interval_set<rbm_abs_addr> result; + + auto try_to_alloc_block = [this, &result, FNAME] (uint64_t alloc_size) -> uint64_t + { + rbm_abs_addr start = find_block(alloc_size); + if (start != base_addr + total_size) { + _remove_from_tree(start, alloc_size); + DEBUG("allocate addr: {}, allocate size: {}, available size: {}", + start, alloc_size, available_size); + result.insert(start, alloc_size); + return alloc_size; + } + return 0; + }; + + auto alloc = std::min(max_alloc_size, size); + rbm_abs_addr ret = try_to_alloc_block(alloc); + if (ret == 0) { + return std::nullopt; + } + + assert(!result.empty()); + assert(result.num_intervals() == 1); + for (auto p : result) { + INFO("result start: {}, end: {}", p.first, p.first + p.second); + if (detailed) { + assert(!reserved_extent_tracker.contains(p.first, p.second)); + reserved_extent_tracker.insert(p.first, p.second); + } + } + return result; +} + +void AvlAllocator::free_extent(rbm_abs_addr addr, size_t size) +{ + assert(total_size); + assert(total_size > available_size); + _add_to_tree(addr, size); + if (detailed && reserved_extent_tracker.contains(addr, size)) { + reserved_extent_tracker.erase(addr, size); + } +} + +bool AvlAllocator::is_free_extent(rbm_abs_addr start, size_t size) +{ + rbm_abs_addr end = start + size; + ceph_assert(size != 0); + if (start < base_addr || base_addr + total_size < end) { + return false; + } + + auto rs = extent_tree.find(extent_range_t{start, end}, extent_tree.key_comp()); + if (rs != extent_tree.end() && rs->start <= start && rs->end >= end) { + return true; + } + return false; +} +} diff --git a/src/crimson/os/seastore/random_block_manager/avlallocator.h b/src/crimson/os/seastore/random_block_manager/avlallocator.h new file mode 100644 index 000000000..d1a4fabca --- /dev/null +++ b/src/crimson/os/seastore/random_block_manager/avlallocator.h @@ -0,0 +1,174 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab expandtab + +#pragma once + +#include "extent_allocator.h" 
+#include "include/ceph_assert.h" +#include "include/buffer_fwd.h" +#include "crimson/osd/exceptions.h" + +#include "crimson/os/seastore/transaction.h" +#include <string.h> +#include "include/buffer.h" + +#include <boost/intrusive/avl_set.hpp> +#include <optional> +#include <vector> + +namespace crimson::os::seastore { + +struct extent_range_t { + rbm_abs_addr start; + rbm_abs_addr end; + + extent_range_t(rbm_abs_addr start, rbm_abs_addr end) : + start(start), end(end) + {} + + struct before_t { + template<typename KeyLeft, typename KeyRight> + bool operator()(const KeyLeft& lhs, const KeyRight& rhs) const { + return lhs.end <= rhs.start; + } + }; + boost::intrusive::avl_set_member_hook<> offset_hook; + + struct shorter_t { + template<typename KeyType> + bool operator()(const extent_range_t& lhs, const KeyType& rhs) const { + auto lhs_size = lhs.length(); + auto rhs_size = rhs.end - rhs.start; + if (lhs_size < rhs_size) { + return true; + } else if (lhs_size > rhs_size) { + return false; + } else { + return lhs.start < rhs.start; + } + } + }; + + size_t length() const { + return end - start; + } + boost::intrusive::avl_set_member_hook<> size_hook; +}; + +/* + * This is the simplest version of avlallocator from bluestore's avlallocator + */ +class AvlAllocator : public ExtentAllocator { +public: + AvlAllocator(bool detailed) : + detailed(detailed) {} + std::optional<interval_set<rbm_abs_addr>> alloc_extent( + size_t size) final; + + void free_extent(rbm_abs_addr addr, size_t size) final; + void mark_extent_used(rbm_abs_addr addr, size_t size) final; + void init(rbm_abs_addr addr, size_t size, size_t b_size); + + struct dispose_rs { + void operator()(extent_range_t* p) + { + delete p; + } + }; + + ~AvlAllocator() { + close(); + } + + void close() { + if (!detailed) { + assert(reserved_extent_tracker.size() == 0); + } + extent_size_tree.clear(); + extent_tree.clear_and_dispose(dispose_rs{}); + total_size = 0; + block_size = 0; + available_size = 0; + base_addr = 0; + } + + uint64_t get_available_size() const final { + return available_size; + } + + uint64_t get_max_alloc_size() const final { + return max_alloc_size; + } + + bool is_free_extent(rbm_abs_addr start, size_t size); + + void complete_allocation(rbm_abs_addr start, size_t size) final { + if (detailed) { + assert(reserved_extent_tracker.contains(start, size)); + reserved_extent_tracker.erase(start, size); + } + } + + bool is_reserved_extent(rbm_abs_addr start, size_t size) { + if (detailed) { + return reserved_extent_tracker.contains(start, size); + } + return false; + } + + rbm_extent_state_t get_extent_state(rbm_abs_addr addr, size_t size) final { + if (is_reserved_extent(addr, size)) { + return rbm_extent_state_t::RESERVED; + } else if (is_free_extent(addr, size)) { + return rbm_extent_state_t::FREE; + } + return rbm_extent_state_t::ALLOCATED; + } + +private: + void _add_to_tree(rbm_abs_addr start, size_t size); + + void _extent_size_tree_rm(extent_range_t& r) { + ceph_assert(available_size >= r.length()); + available_size -= r.length(); + extent_size_tree.erase(r); + } + + void _extent_size_tree_try_insert(extent_range_t& r) { + extent_size_tree.insert(r); + available_size += r.length(); + } + + void _remove_from_tree(rbm_abs_addr start, rbm_abs_addr size); + rbm_abs_addr find_block(size_t size); + + using extent_tree_t = + boost::intrusive::avl_set< + extent_range_t, + boost::intrusive::compare<extent_range_t::before_t>, + boost::intrusive::member_hook< + extent_range_t, + boost::intrusive::avl_set_member_hook<>, + 
&extent_range_t::offset_hook>>; + extent_tree_t extent_tree; + + using extent_size_tree_t = + boost::intrusive::avl_set< + extent_range_t, + boost::intrusive::compare<extent_range_t::shorter_t>, + boost::intrusive::member_hook< + extent_range_t, + boost::intrusive::avl_set_member_hook<>, + &extent_range_t::size_hook>>; + extent_size_tree_t extent_size_tree; + + uint64_t block_size = 0; + uint64_t available_size = 0; + uint64_t total_size = 0; + uint64_t base_addr = 0; + uint64_t max_alloc_size = 4 << 20; + bool detailed; + interval_set<rbm_abs_addr> reserved_extent_tracker; +}; + +} diff --git a/src/crimson/os/seastore/random_block_manager/block_rb_manager.cc b/src/crimson/os/seastore/random_block_manager/block_rb_manager.cc new file mode 100644 index 000000000..511b70a2e --- /dev/null +++ b/src/crimson/os/seastore/random_block_manager/block_rb_manager.cc @@ -0,0 +1,176 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <sys/mman.h> +#include <string.h> + +#include "crimson/os/seastore/logging.h" + +#include "include/buffer.h" +#include "rbm_device.h" +#include "include/interval_set.h" +#include "include/intarith.h" +#include "block_rb_manager.h" + +SET_SUBSYS(seastore_device); + +namespace crimson::os::seastore { + +device_config_t get_rbm_ephemeral_device_config( + std::size_t index, std::size_t num_devices) +{ + assert(num_devices > index); + magic_t magic = 0xfffa; + auto type = device_type_t::RANDOM_BLOCK_EPHEMERAL; + bool is_major_device; + secondary_device_set_t secondary_devices; + if (index == 0) { + is_major_device = true; + for (std::size_t secondary_index = index + 1; + secondary_index < num_devices; + ++secondary_index) { + device_id_t secondary_id = static_cast<device_id_t>(secondary_index); + secondary_devices.insert({ + secondary_index, device_spec_t{magic, type, secondary_id} + }); + } + } else { // index > 0 + is_major_device = false; + } + + device_id_t id = static_cast<device_id_t>(DEVICE_ID_RANDOM_BLOCK_MIN + index); + seastore_meta_t meta = {}; + return {is_major_device, + device_spec_t{magic, type, id}, + meta, + secondary_devices}; +} + +paddr_t BlockRBManager::alloc_extent(size_t size) +{ + LOG_PREFIX(BlockRBManager::alloc_extent); + assert(allocator); + auto alloc = allocator->alloc_extent(size); + ceph_assert((*alloc).num_intervals() == 1); + auto extent = (*alloc).begin(); + ceph_assert(size == extent.get_len()); + paddr_t paddr = convert_abs_addr_to_paddr( + extent.get_start(), + device->get_device_id()); + DEBUG("allocated addr: {}, size: {}, requested size: {}", + paddr, extent.get_len(), size); + return paddr; +} + +void BlockRBManager::complete_allocation( + paddr_t paddr, size_t size) +{ + assert(allocator); + rbm_abs_addr addr = convert_paddr_to_abs_addr(paddr); + allocator->complete_allocation(addr, size); +} + +BlockRBManager::open_ertr::future<> BlockRBManager::open() +{ + assert(device); + assert(device->get_available_size() > 0); + assert(device->get_block_size() > 0); + auto ool_start = get_start_rbm_addr(); + allocator->init( + ool_start, + device->get_shard_end() - + ool_start, + device->get_block_size()); + return open_ertr::now(); +} + +BlockRBManager::write_ertr::future<> BlockRBManager::write( + paddr_t paddr, + bufferptr &bptr) +{ + LOG_PREFIX(BlockRBManager::write); + ceph_assert(device); + rbm_abs_addr addr = convert_paddr_to_abs_addr(paddr); + rbm_abs_addr start = device->get_shard_start(); + rbm_abs_addr end = device->get_shard_end(); + if (addr < start || addr + 
bptr.length() > end) { + ERROR("out of range: start {}, end {}, addr {}, length {}", + start, end, addr, bptr.length()); + return crimson::ct_error::erange::make(); + } + bufferptr bp = bufferptr(ceph::buffer::create_page_aligned(bptr.length())); + bp.copy_in(0, bptr.length(), bptr.c_str()); + return device->write( + addr, + std::move(bp)); +} + +BlockRBManager::read_ertr::future<> BlockRBManager::read( + paddr_t paddr, + bufferptr &bptr) +{ + LOG_PREFIX(BlockRBManager::read); + ceph_assert(device); + rbm_abs_addr addr = convert_paddr_to_abs_addr(paddr); + rbm_abs_addr start = device->get_shard_start(); + rbm_abs_addr end = device->get_shard_end(); + if (addr < start || addr + bptr.length() > end) { + ERROR("out of range: start {}, end {}, addr {}, length {}", + start, end, addr, bptr.length()); + return crimson::ct_error::erange::make(); + } + return device->read( + addr, + bptr); +} + +BlockRBManager::close_ertr::future<> BlockRBManager::close() +{ + ceph_assert(device); + allocator->close(); + return device->close(); +} + +BlockRBManager::write_ertr::future<> BlockRBManager::write( + rbm_abs_addr addr, + bufferlist &bl) +{ + LOG_PREFIX(BlockRBManager::write); + ceph_assert(device); + bufferptr bptr; + try { + bptr = bufferptr(ceph::buffer::create_page_aligned(bl.length())); + auto iter = bl.cbegin(); + iter.copy(bl.length(), bptr.c_str()); + } catch (const std::exception &e) { + DEBUG("write: exception creating aligned buffer {}", e); + ceph_assert(0 == "unhandled exception"); + } + return device->write( + addr, + std::move(bptr)); +} + +std::ostream &operator<<(std::ostream &out, const rbm_metadata_header_t &header) +{ + out << " rbm_metadata_header_t(size=" << header.size + << ", block_size=" << header.block_size + << ", feature=" << header.feature + << ", journal_size=" << header.journal_size + << ", crc=" << header.crc + << ", config=" << header.config + << ", shard_num=" << header.shard_num; + for (auto p : header.shard_infos) { + out << p; + } + return out << ")"; +} + +std::ostream &operator<<(std::ostream &out, const rbm_shard_info_t &shard) +{ + out << " rbm_shard_info_t(size=" << shard.size + << ", start_offset=" << shard.start_offset; + return out << ")"; +} + +} diff --git a/src/crimson/os/seastore/random_block_manager/block_rb_manager.h b/src/crimson/os/seastore/random_block_manager/block_rb_manager.h new file mode 100644 index 000000000..b686820d0 --- /dev/null +++ b/src/crimson/os/seastore/random_block_manager/block_rb_manager.h @@ -0,0 +1,142 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <iosfwd> + +#include <boost/intrusive_ptr.hpp> +#include <boost/smart_ptr/intrusive_ref_counter.hpp> +#include <seastar/core/future.hh> + +#include "include/ceph_assert.h" +#include "crimson/os/seastore/seastore_types.h" +#include "include/buffer_fwd.h" +#include "crimson/osd/exceptions.h" + +#include "crimson/os/seastore/transaction.h" +#include "rbm_device.h" +#include "crimson/os/seastore/random_block_manager.h" + +#include "crimson/common/layout.h" +#include "include/buffer.h" +#include "include/uuid.h" +#include "avlallocator.h" + + +namespace crimson::os::seastore { + +using RBMDevice = random_block_device::RBMDevice; +using RBMDeviceRef = std::unique_ptr<RBMDevice>; + +device_config_t get_rbm_ephemeral_device_config( + std::size_t index, std::size_t num_devices); + +class BlockRBManager final : public RandomBlockManager { +public: + /* + * Ondisk layout (TODO) + * + * 
--------------------------------------------------------------------------- + * | rbm_metadata_header_t | metadatas | ... | data blocks | + * --------------------------------------------------------------------------- + */ + + read_ertr::future<> read(paddr_t addr, bufferptr &buffer) final; + write_ertr::future<> write(paddr_t addr, bufferptr &buf) final; + open_ertr::future<> open() final; + close_ertr::future<> close() final; + + /* + * alloc_extent + * + * The role of this function is to find out free blocks the transaction requires. + * To do so, alloc_extent() looks into both in-memory allocator + * and freebitmap blocks. + * + * TODO: multiple allocation + * + */ + paddr_t alloc_extent(size_t size) final; // allocator, return blocks + + void complete_allocation(paddr_t addr, size_t size) final; + + size_t get_start_rbm_addr() const { + return device->get_shard_journal_start() + device->get_journal_size(); + } + size_t get_size() const final { + return device->get_shard_end() - get_start_rbm_addr(); + }; + extent_len_t get_block_size() const final { return device->get_block_size(); } + + BlockRBManager(RBMDevice * device, std::string path, bool detailed) + : device(device), path(path) { + allocator.reset(new AvlAllocator(detailed)); + } + + write_ertr::future<> write(rbm_abs_addr addr, bufferlist &bl); + + device_id_t get_device_id() const final { + assert(device); + return device->get_device_id(); + } + + uint64_t get_free_blocks() const final { + // TODO: return correct free blocks after block allocator is introduced + assert(device); + return get_size() / get_block_size(); + } + const seastore_meta_t &get_meta() const final { + return device->get_meta(); + } + RBMDevice* get_device() { + return device; + } + + void mark_space_used(paddr_t paddr, size_t len) final { + assert(allocator); + rbm_abs_addr addr = convert_paddr_to_abs_addr(paddr); + assert(addr >= get_start_rbm_addr() && + addr + len <= device->get_shard_end()); + allocator->mark_extent_used(addr, len); + } + + void mark_space_free(paddr_t paddr, size_t len) final { + assert(allocator); + rbm_abs_addr addr = convert_paddr_to_abs_addr(paddr); + assert(addr >= get_start_rbm_addr() && + addr + len <= device->get_shard_end()); + allocator->free_extent(addr, len); + } + + paddr_t get_start() final { + return convert_abs_addr_to_paddr( + get_start_rbm_addr(), + device->get_device_id()); + } + + rbm_extent_state_t get_extent_state(paddr_t paddr, size_t size) final { + assert(allocator); + rbm_abs_addr addr = convert_paddr_to_abs_addr(paddr); + assert(addr >= get_start_rbm_addr() && + addr + size <= device->get_shard_end()); + return allocator->get_extent_state(addr, size); + } + + size_t get_journal_size() const final { + return device->get_journal_size(); + } + +private: + /* + * this contains the number of bitmap blocks, free blocks and + * rbm specific information + */ + ExtentAllocatorRef allocator; + RBMDevice * device; + std::string path; + int stream_id; // for multi-stream +}; +using BlockRBManagerRef = std::unique_ptr<BlockRBManager>; + +} diff --git a/src/crimson/os/seastore/random_block_manager/extent_allocator.h b/src/crimson/os/seastore/random_block_manager/extent_allocator.h new file mode 100644 index 000000000..8a3e62c6d --- /dev/null +++ b/src/crimson/os/seastore/random_block_manager/extent_allocator.h @@ -0,0 +1,75 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab expandtab + +#pragma once + +#include <boost/intrusive_ptr.hpp> +#include 
<boost/smart_ptr/intrusive_ref_counter.hpp> +#include <seastar/core/future.hh> +#include "crimson/os/seastore/seastore_types.h" +#include "crimson/os/seastore/random_block_manager.h" +#include "include/interval_set.h" + +namespace crimson::os::seastore { + +class ExtentAllocator { +public: + /** + * alloc_extent + * + * Allocate continous region as much as given size. + * Note that the inital state of extent is RESERVED after alloc_extent(). + * see rbm_extent_state_t in random_block_manager.h + * + * @param size + * @return nullopt or the address range (rbm_abs_addr, len) + */ + virtual std::optional<interval_set<rbm_abs_addr>> alloc_extent( + size_t size) = 0; + /** + * free_extent + * + * free given region + * + * @param rbm_abs_addr + * @param size + */ + virtual void free_extent(rbm_abs_addr addr, size_t size) = 0; + /** + * mark_extent_used + * + * This marks given region as used without alloc_extent. + * + * @param rbm_abs_addr + * @param size + */ + virtual void mark_extent_used(rbm_abs_addr addr, size_t size) = 0; + /** + * init + * + * Initialize the address space the ExtentAllocator will manage + * + * @param start address (rbm_abs_addr) + * @param total size + * @param block size + */ + virtual void init(rbm_abs_addr addr, size_t size, size_t b_size) = 0; + virtual uint64_t get_available_size() const = 0; + virtual uint64_t get_max_alloc_size() const = 0; + virtual void close() = 0; + /** + * complete_allocation + * + * This changes this extent state from RESERVED to ALLOCATED + * + * @param start address + * @param size + */ + virtual void complete_allocation(rbm_abs_addr start, size_t size) = 0; + virtual rbm_extent_state_t get_extent_state(rbm_abs_addr addr, size_t size) = 0; + virtual ~ExtentAllocator() {} +}; +using ExtentAllocatorRef = std::unique_ptr<ExtentAllocator>; + + +} diff --git a/src/crimson/os/seastore/random_block_manager/nvme_block_device.cc b/src/crimson/os/seastore/random_block_manager/nvme_block_device.cc new file mode 100644 index 000000000..6437f06a4 --- /dev/null +++ b/src/crimson/os/seastore/random_block_manager/nvme_block_device.cc @@ -0,0 +1,280 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <sys/mman.h> +#include <string.h> + +#include <fcntl.h> + +#include "crimson/common/log.h" +#include "crimson/common/errorator-loop.h" + +#include "include/buffer.h" +#include "rbm_device.h" +#include "nvme_block_device.h" +#include "block_rb_manager.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_seastore_tm); + } +} + +namespace crimson::os::seastore::random_block_device::nvme { + +NVMeBlockDevice::mkfs_ret NVMeBlockDevice::mkfs(device_config_t config) { + using crimson::common::get_conf; + return shard_devices.local().do_primary_mkfs(config, + seastar::smp::count, + get_conf<Option::size_t>("seastore_cbjournal_size") + ); +} + +open_ertr::future<> NVMeBlockDevice::open( + const std::string &in_path, + seastar::open_flags mode) { + return seastar::do_with(in_path, [this, mode](auto& in_path) { + return seastar::file_stat(in_path).then([this, mode, in_path](auto stat) { + return seastar::open_file_dma(in_path, mode).then([=, this](auto file) { + device = std::move(file); + logger().debug("open"); + // Get SSD's features from identify_controller and namespace command. + // Do identify_controller first, and then identify_namespace. 
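+        // Both identify calls go through the NVMe admin ioctl; if the device
+        // cannot answer (e.g. it is not a real NVMe namespace), the
+        // handle_error() fallback below still opens it for IO with the
+        // default write granularity/alignment and a single stream.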
+ return identify_controller(device).safe_then([this, in_path, mode]( + auto id_controller_data) { + support_multistream = id_controller_data.oacs.support_directives; + if (support_multistream) { + stream_id_count = WRITE_LIFE_MAX; + } + awupf = id_controller_data.awupf + 1; + return identify_namespace(device).safe_then([this, in_path, mode] ( + auto id_namespace_data) { + atomic_write_unit = awupf * super.block_size; + data_protection_type = id_namespace_data.dps.protection_type; + data_protection_enabled = (data_protection_type > 0); + if (id_namespace_data.nsfeat.opterf == 1){ + // NPWG and NPWA is 0'based value + write_granularity = super.block_size * (id_namespace_data.npwg + 1); + write_alignment = super.block_size * (id_namespace_data.npwa + 1); + } + return open_for_io(in_path, mode); + }); + }).handle_error(crimson::ct_error::input_output_error::handle([this, in_path, mode]{ + logger().error("open: id ctrlr failed. open without ioctl"); + return open_for_io(in_path, mode); + }), crimson::ct_error::pass_further_all{}); + }); + }); + }); +} + +open_ertr::future<> NVMeBlockDevice::open_for_io( + const std::string& in_path, + seastar::open_flags mode) { + io_device.resize(stream_id_count); + return seastar::do_for_each(io_device, [=, this](auto &target_device) { + return seastar::open_file_dma(in_path, mode).then([this]( + auto file) { + assert(io_device.size() > stream_index_to_open); + io_device[stream_index_to_open] = std::move(file); + return io_device[stream_index_to_open].fcntl( + F_SET_FILE_RW_HINT, + (uintptr_t)&stream_index_to_open).then([this](auto ret) { + stream_index_to_open++; + return seastar::now(); + }); + }); + }); +} + +NVMeBlockDevice::mount_ret NVMeBlockDevice::mount() +{ + logger().debug(" mount "); + return shard_devices.invoke_on_all([](auto &local_device) { + return local_device.do_shard_mount( + ).handle_error( + crimson::ct_error::assert_all{ + "Invalid error in RBMDevice::do_mount" + }); + }); +} + +write_ertr::future<> NVMeBlockDevice::write( + uint64_t offset, + bufferptr &&bptr, + uint16_t stream) { + logger().debug( + "block: write offset {} len {}", + offset, + bptr.length()); + auto length = bptr.length(); + + assert((length % super.block_size) == 0); + uint16_t supported_stream = stream; + if (stream >= stream_id_count) { + supported_stream = WRITE_LIFE_NOT_SET; + } + return seastar::do_with( + std::move(bptr), + [this, offset, length, supported_stream] (auto& bptr) { + return io_device[supported_stream].dma_write( + offset, bptr.c_str(), length).handle_exception( + [](auto e) -> write_ertr::future<size_t> { + logger().error("write: dma_write got error{}", e); + return crimson::ct_error::input_output_error::make(); + }).then([length](auto result) -> write_ertr::future<> { + if (result != length) { + logger().error("write: dma_write got error with not proper length"); + return crimson::ct_error::input_output_error::make(); + } + return write_ertr::now(); + }); + }); +} + +read_ertr::future<> NVMeBlockDevice::read( + uint64_t offset, + bufferptr &bptr) { + logger().debug( + "block: read offset {} len {}", + offset, + bptr.length()); + auto length = bptr.length(); + + assert((length % super.block_size) == 0); + + return device.dma_read(offset, bptr.c_str(), length).handle_exception( + [](auto e) -> read_ertr::future<size_t> { + logger().error("read: dma_read got error{}", e); + return crimson::ct_error::input_output_error::make(); + }).then([length](auto result) -> read_ertr::future<> { + if (result != length) { + logger().error("read: dma_read got 
error with not proper length"); + return crimson::ct_error::input_output_error::make(); + } + return read_ertr::now(); + }); +} + +write_ertr::future<> NVMeBlockDevice::writev( + uint64_t offset, + ceph::bufferlist bl, + uint16_t stream) { + logger().debug( + "block: write offset {} len {}", + offset, + bl.length()); + + uint16_t supported_stream = stream; + if (stream >= stream_id_count) { + supported_stream = WRITE_LIFE_NOT_SET; + } + bl.rebuild_aligned(super.block_size); + + return seastar::do_with( + bl.prepare_iovs(), + std::move(bl), + [this, supported_stream, offset](auto& iovs, auto& bl) + { + return write_ertr::parallel_for_each( + iovs, + [this, supported_stream, offset](auto& p) mutable + { + auto off = offset + p.offset; + auto len = p.length; + auto& iov = p.iov; + return io_device[supported_stream].dma_write(off, std::move(iov) + ).handle_exception( + [this, off, len](auto e) -> write_ertr::future<size_t> + { + logger().error("{} poffset={}~{} dma_write got error -- {}", + device_id_printer_t{get_device_id()}, off, len, e); + return crimson::ct_error::input_output_error::make(); + }).then([this, off, len](size_t written) -> write_ertr::future<> { + if (written != len) { + logger().error("{} poffset={}~{} dma_write len={} inconsistent", + device_id_printer_t{get_device_id()}, off, len, written); + return crimson::ct_error::input_output_error::make(); + } + return write_ertr::now(); + }); + }); + }); +} + +Device::close_ertr::future<> NVMeBlockDevice::close() { + logger().debug(" close "); + stream_index_to_open = WRITE_LIFE_NOT_SET; + return device.close().then([this]() { + return seastar::do_for_each(io_device, [](auto target_device) { + return target_device.close(); + }); + }); +} + +nvme_command_ertr::future<nvme_identify_controller_data_t> +NVMeBlockDevice::identify_controller(seastar::file f) { + return seastar::do_with( + nvme_admin_command_t(), + nvme_identify_controller_data_t(), + [this, f](auto &admin_command, auto &data) { + admin_command.common.opcode = nvme_admin_command_t::OPCODE_IDENTIFY; + admin_command.common.addr = (uint64_t)&data; + admin_command.common.data_len = sizeof(data); + admin_command.identify.cns = nvme_identify_command_t::CNS_CONTROLLER; + + return pass_admin(admin_command, f).safe_then([&data](auto status) { + return seastar::make_ready_future<nvme_identify_controller_data_t>( + std::move(data)); + }); + }); +} + +discard_ertr::future<> NVMeBlockDevice::discard(uint64_t offset, uint64_t len) { + return device.discard(offset, len); +} + +nvme_command_ertr::future<nvme_identify_namespace_data_t> +NVMeBlockDevice::identify_namespace(seastar::file f) { + return get_nsid(f).safe_then([this, f](auto nsid) { + return seastar::do_with( + nvme_admin_command_t(), + nvme_identify_namespace_data_t(), + [this, nsid, f](auto &admin_command, auto &data) { + admin_command.common.opcode = nvme_admin_command_t::OPCODE_IDENTIFY; + admin_command.common.addr = (uint64_t)&data; + admin_command.common.data_len = sizeof(data); + admin_command.common.nsid = nsid; + admin_command.identify.cns = nvme_identify_command_t::CNS_NAMESPACE; + + return pass_admin(admin_command, f).safe_then([&data](auto status){ + return seastar::make_ready_future<nvme_identify_namespace_data_t>( + std::move(data)); + }); + }); + }); +} + +nvme_command_ertr::future<int> NVMeBlockDevice::get_nsid(seastar::file f) { + return f.ioctl(NVME_IOCTL_ID, nullptr).handle_exception( + [](auto e)->nvme_command_ertr::future<int> { + logger().error("pass_admin: ioctl failed"); + return 
crimson::ct_error::input_output_error::make(); + }); +} + +nvme_command_ertr::future<int> NVMeBlockDevice::pass_admin( + nvme_admin_command_t& admin_cmd, seastar::file f) { + return f.ioctl(NVME_IOCTL_ADMIN_CMD, &admin_cmd).handle_exception( + [](auto e)->nvme_command_ertr::future<int> { + logger().error("pass_admin: ioctl failed"); + return crimson::ct_error::input_output_error::make(); + }); +} + +nvme_command_ertr::future<int> NVMeBlockDevice::pass_through_io( + nvme_io_command_t& io_cmd) { + return device.ioctl(NVME_IOCTL_IO_CMD, &io_cmd); +} + +} diff --git a/src/crimson/os/seastore/random_block_manager/nvme_block_device.h b/src/crimson/os/seastore/random_block_manager/nvme_block_device.h new file mode 100644 index 000000000..ed8f99be8 --- /dev/null +++ b/src/crimson/os/seastore/random_block_manager/nvme_block_device.h @@ -0,0 +1,360 @@ +//-*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <memory> +#include <vector> + +#include <seastar/core/file.hh> +#include <linux/nvme_ioctl.h> + +#include "crimson/osd/exceptions.h" +#include "crimson/common/layout.h" +#include "rbm_device.h" + +namespace ceph { + namespace buffer { + class bufferptr; + } +} + +namespace crimson::os::seastore::random_block_device::nvme { +/* + * NVMe protocol structures (nvme_XX, identify_XX) + * + * All structures relative to NVMe protocol are following NVMe protocol v1.4 + * (latest). NVMe is protocol for fast interfacing between user and SSD device. + * We selectively adopted features among various NVMe features to ease + * implementation. And also, NVMeBlockDevice provides generic command submission + * APIs for IO and Admin commands. Please use pass_through_io() and pass_admin() + * to do it. + * + * For more information about NVMe protocol, refer https://nvmexpress.org/ + */ +struct nvme_identify_command_t { + uint32_t common_dw[10]; + + uint32_t cns : 8; + uint32_t reserved : 8; + uint32_t cnt_id : 16; + + static const uint8_t CNS_NAMESPACE = 0x00; + static const uint8_t CNS_CONTROLLER = 0x01; +}; + +struct nvme_admin_command_t { + union { + nvme_passthru_cmd common; + nvme_identify_command_t identify; + }; + + static const uint8_t OPCODE_IDENTIFY = 0x06; +}; + +// Optional Admin Command Support (OACS) +// Indicates optional commands are supported by SSD or not +struct oacs_t { + uint16_t unused : 5; + uint16_t support_directives : 1; // Support multi-stream + uint16_t unused2 : 10; +}; + +struct nvme_identify_controller_data_t { + union { + struct { + uint8_t unused[256]; // [255:0] + oacs_t oacs; // [257:256] + uint8_t unused2[270]; // [527:258] + uint16_t awupf; // [529:528] + }; + uint8_t raw[4096]; + }; +}; + +// End-to-end Data Protection Capabilities (DPC) +// Indicates type of E2E data protection supported by SSD +struct dpc_t { + uint8_t support_type1 : 1; + uint8_t support_type2 : 1; + uint8_t support_type3 : 1; + uint8_t support_first_meta : 1; + uint8_t support_last_meta : 1; + uint8_t reserved : 3; +}; + +// End-to-end Data Protection Type Settings (DPS) +// Indicates enabled type of E2E data protection +struct dps_t { + uint8_t protection_type : 3; + uint8_t protection_info : 1; + uint8_t reserved : 4; +}; + +// Namespace Features (NSFEAT) +// Indicates features of namespace +struct nsfeat_t { + uint8_t thinp : 1; + uint8_t nsabp : 1; + uint8_t dae : 1; + uint8_t uid_reuse : 1; + uint8_t opterf : 1; // Support NPWG, NPWA + uint8_t reserved : 3; +}; + +// LBA Format (LBAF) +// Indicates LBA format (metadata size, data 
size, performance) +struct lbaf_t { + uint32_t ms : 16; + uint32_t lbads : 8; + uint32_t rp : 2; + uint32_t reserved : 6; +}; + +struct nvme_identify_namespace_data_t { + union { + struct { + uint8_t unused[24]; // [23:0] + nsfeat_t nsfeat; // [24] + uint8_t unused2[3]; // [27:25] + dpc_t dpc; // [28] + dps_t dps; // [29] + uint8_t unused3[34]; // [63:30] + uint16_t npwg; // [65:64] + uint16_t npwa; // [67:66] + uint8_t unused4[60]; // [127:68] + lbaf_t lbaf0; // [131:128] + }; + uint8_t raw[4096]; + }; +}; + +struct nvme_rw_command_t { + uint32_t common_dw[10]; + + uint64_t s_lba; + + uint32_t nlb : 16; // 0's based value + uint32_t reserved : 4; + uint32_t d_type : 4; + uint32_t reserved2 : 2; + uint32_t prinfo_prchk : 3; + uint32_t prinfo_pract : 1; + uint32_t fua : 1; + uint32_t lr : 1; + + uint32_t reserved3 : 16; + uint32_t dspec : 16; + + static const uint32_t DTYPE_STREAM = 1; +}; + +struct nvme_io_command_t { + union { + nvme_passthru_cmd common; + nvme_rw_command_t rw; + }; + static const uint8_t OPCODE_WRITE = 0x01; + static const uint8_t OPCODE_READ = 0x01; +}; + +/* + * Implementation of NVMeBlockDevice with POSIX APIs + * + * NVMeBlockDevice provides NVMe SSD interfaces through POSIX APIs which is + * generally available at most operating environment. + */ +class NVMeBlockDevice : public RBMDevice { +public: + + /* + * Service NVMe device relative size + * + * size : total size of device in byte. + * + * block_size : IO unit size in byte. Caller should follow every IO command + * aligned with block size. + * + * preffered_write_granularity(PWG), preffered_write_alignment(PWA) : IO unit + * size for write in byte. Caller should request every write IO sized multiple + * times of PWG and aligned starting address by PWA. Available only if NVMe + * Device supports NVMe protocol 1.4 or later versions. + * atomic_write_unit : The maximum size of write whose atomicity is guranteed + * by SSD even on power failure. The write equal to or smaller than + * atomic_write_unit does not require fsync(). + */ + + NVMeBlockDevice(std::string device_path) : device_path(device_path) {} + ~NVMeBlockDevice() = default; + + open_ertr::future<> open( + const std::string &in_path, + seastar::open_flags mode) override; + + write_ertr::future<> write( + uint64_t offset, + bufferptr &&bptr, + uint16_t stream = 0) override; + + using RBMDevice::read; + read_ertr::future<> read( + uint64_t offset, + bufferptr &bptr) final; + + close_ertr::future<> close() override; + + discard_ertr::future<> discard( + uint64_t offset, + uint64_t len) override; + + mount_ret mount() final; + + mkfs_ret mkfs(device_config_t config) final; + + write_ertr::future<> writev( + uint64_t offset, + ceph::bufferlist bl, + uint16_t stream = 0) final; + + stat_device_ret stat_device() final { + return seastar::file_stat(device_path, seastar::follow_symlink::yes + ).handle_exception([](auto e) -> stat_device_ret { + return crimson::ct_error::input_output_error::make(); + }).then([this](auto stat) { + return seastar::open_file_dma( + device_path, + seastar::open_flags::rw | seastar::open_flags::dsync + ).then([this, stat](auto file) mutable { + return file.size().then([this, stat, file](auto size) mutable { + stat.size = size; + return identify_namespace(file + ).safe_then([stat] (auto id_namespace_data) mutable { + // LBA format provides LBA size which is power of 2. LBA is the + // minimum size of read and write. 
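+              // lbads is log2 of the LBA data size (NVMe spec), so the shift
+              // below yields bytes (lbads == 12 -> 4096 B); the result is
+              // then clamped up to RBM_SUPERBLOCK_SIZE if it is smaller.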
+ stat.block_size = (1 << id_namespace_data.lbaf0.lbads); + if (stat.block_size < RBM_SUPERBLOCK_SIZE) { + stat.block_size = RBM_SUPERBLOCK_SIZE; + } + return stat_device_ret( + read_ertr::ready_future_marker{}, + stat + ); + }).handle_error(crimson::ct_error::input_output_error::handle( + [stat]{ + return stat_device_ret( + read_ertr::ready_future_marker{}, + stat + ); + }), crimson::ct_error::pass_further_all{}); + }).safe_then([file](auto st) mutable { + return file.close( + ).then([st] { + return stat_device_ret( + read_ertr::ready_future_marker{}, + st + ); + }); + }); + }); + }); + } + + std::string get_device_path() const final { + return device_path; + } + + seastar::future<> start() final { + return shard_devices.start(device_path); + } + + seastar::future<> stop() final { + return shard_devices.stop(); + } + + Device& get_sharded_device() final { + return shard_devices.local(); + } + + uint64_t get_preffered_write_granularity() const { return write_granularity; } + uint64_t get_preffered_write_alignment() const { return write_alignment; } + uint64_t get_atomic_write_unit() const { return atomic_write_unit; } + /* + * End-to-End Data Protection + * + * NVMe device keeps track of data integrity similar with checksum. Client can + * offload checksuming to NVMe device to reduce its CPU utilization. If data + * protection is enabled, checksum is calculated on every write and used to + * verify data on every read. + */ + bool is_data_protection_enabled() const { return data_protection_enabled; } + + /* + * Data Health + * + * Returns list of LBAs which have almost corrupted data. Data of the LBAs + * will be corrupted very soon. Caller can overwrite, unmap or refresh data to + * protect data + */ + virtual nvme_command_ertr::future<std::list<uint64_t>> get_data_health() { + std::list<uint64_t> fragile_lbas; + return nvme_command_ertr::future<std::list<uint64_t>>( + nvme_command_ertr::ready_future_marker{}, + fragile_lbas + ); + } + + /* + * Recovery Level + * + * Regulate magnitude of SSD-internal data recovery. Caller can get good read + * latency with lower magnitude. + */ + virtual nvme_command_ertr::future<> set_data_recovery_level( + uint32_t level) { return nvme_command_ertr::now(); } + /* + * For passsing through nvme IO or Admin command to SSD + * Caller can construct and execute its own nvme command + */ + nvme_command_ertr::future<int> pass_admin( + nvme_admin_command_t& admin_cmd, seastar::file f); + nvme_command_ertr::future<int> pass_through_io( + nvme_io_command_t& io_cmd); + + bool support_multistream = false; + uint8_t data_protection_type = 0; + + /* + * Predictable Latency + * + * NVMe device can guarantee IO latency within pre-defined time window. This + * functionality will be analyzed soon. + */ + +private: + // identify_controller/namespace are used to get SSD internal information such + // as supported features, NPWG and NPWA + nvme_command_ertr::future<nvme_identify_controller_data_t> + identify_controller(seastar::file f); + nvme_command_ertr::future<nvme_identify_namespace_data_t> + identify_namespace(seastar::file f); + nvme_command_ertr::future<int> get_nsid(seastar::file f); + open_ertr::future<> open_for_io( + const std::string& in_path, + seastar::open_flags mode); + + seastar::file device; + std::vector<seastar::file> io_device; + uint32_t stream_index_to_open = WRITE_LIFE_NOT_SET; + uint32_t stream_id_count = 1; // stream is disabled, defaultly. 
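+  // awupf caches AWUPF ("atomic write unit, power fail") from the identify
+  // data; the controller reports it 0-based in logical blocks, so open()
+  // stores id_controller_data.awupf + 1 and multiplies by the block size to
+  // obtain atomic_write_unit.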
+ uint32_t awupf = 0; + + uint64_t write_granularity = 4096; + uint64_t write_alignment = 4096; + uint32_t atomic_write_unit = 4096; + + bool data_protection_enabled = false; + std::string device_path; + seastar::sharded<NVMeBlockDevice> shard_devices; +}; + +} diff --git a/src/crimson/os/seastore/random_block_manager/rbm_device.cc b/src/crimson/os/seastore/random_block_manager/rbm_device.cc new file mode 100644 index 000000000..cea6c30a7 --- /dev/null +++ b/src/crimson/os/seastore/random_block_manager/rbm_device.cc @@ -0,0 +1,271 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <sys/mman.h> +#include <string.h> + +#include <fcntl.h> + +#include "crimson/common/log.h" +#include "crimson/common/errorator-loop.h" + +#include "include/buffer.h" +#include "rbm_device.h" +#include "nvme_block_device.h" +#include "block_rb_manager.h" + +namespace crimson::os::seastore::random_block_device { +#include "crimson/os/seastore/logging.h" +SET_SUBSYS(seastore_device); + +RBMDevice::mkfs_ret RBMDevice::do_primary_mkfs(device_config_t config, + int shard_num, size_t journal_size) { + LOG_PREFIX(RBMDevice::do_primary_mkfs); + return stat_device( + ).handle_error( + mkfs_ertr::pass_further{}, + crimson::ct_error::assert_all{ + "Invalid error stat_device in RBMDevice::do_primary_mkfs"} + ).safe_then( + [this, FNAME, config=std::move(config), shard_num, journal_size](auto st) { + super.block_size = st.block_size; + super.size = st.size; + super.feature |= RBM_BITMAP_BLOCK_CRC; + super.config = std::move(config); + super.journal_size = journal_size; + ceph_assert_always(super.journal_size > 0); + ceph_assert_always(super.size >= super.journal_size); + ceph_assert_always(shard_num > 0); + + std::vector<rbm_shard_info_t> shard_infos(shard_num); + for (int i = 0; i < shard_num; i++) { + uint64_t aligned_size = + (super.size / shard_num) - + ((super.size / shard_num) % super.block_size); + shard_infos[i].size = aligned_size; + shard_infos[i].start_offset = i * aligned_size; + assert(shard_infos[i].size > super.journal_size); + } + super.shard_infos = shard_infos; + super.shard_num = shard_num; + shard_info = shard_infos[seastar::this_shard_id()]; + DEBUG("super {} ", super); + + // write super block + return open(get_device_path(), + seastar::open_flags::rw | seastar::open_flags::dsync + ).handle_error( + mkfs_ertr::pass_further{}, + crimson::ct_error::assert_all{ + "Invalid error open in RBMDevice::do_primary_mkfs"} + ).safe_then([this] { + return write_rbm_header( + ).safe_then([this] { + return close(); + }).handle_error( + mkfs_ertr::pass_further{}, + crimson::ct_error::assert_all{ + "Invalid error write_rbm_header in RBMDevice::do_primary_mkfs" + }); + }); + }); +} + +write_ertr::future<> RBMDevice::write_rbm_header() +{ + bufferlist meta_b_header; + super.crc = 0; + encode(super, meta_b_header); + // If NVMeDevice supports data protection, CRC for checksum is not required + // NVMeDevice is expected to generate and store checksum internally. + // CPU overhead for CRC might be saved. 
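+  // With data protection enabled the CRC is set to the sentinel
+  // (checksum_t)-1; read_rbm_header() asserts on that sentinel instead of
+  // verifying a checksum.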
+ if (is_data_protection_enabled()) { + super.crc = -1; + } else { + super.crc = meta_b_header.crc32c(-1); + } + + bufferlist bl; + encode(super, bl); + auto iter = bl.begin(); + auto bp = bufferptr(ceph::buffer::create_page_aligned(super.block_size)); + assert(bl.length() < super.block_size); + iter.copy(bl.length(), bp.c_str()); + return write(RBM_START_ADDRESS, std::move(bp)); +} + +read_ertr::future<rbm_metadata_header_t> RBMDevice::read_rbm_header( + rbm_abs_addr addr) +{ + LOG_PREFIX(RBMDevice::read_rbm_header); + assert(super.block_size > 0); + return seastar::do_with( + bufferptr(ceph::buffer::create_page_aligned(super.block_size)), + [this, addr, FNAME](auto &bptr) { + return read( + addr, + bptr + ).safe_then([length=bptr.length(), this, bptr, FNAME]() + -> read_ertr::future<rbm_metadata_header_t> { + bufferlist bl; + bl.append(bptr); + auto p = bl.cbegin(); + rbm_metadata_header_t super_block; + try { + decode(super_block, p); + } + catch (ceph::buffer::error& e) { + DEBUG("read_rbm_header: unable to decode rbm super block {}", + e.what()); + return crimson::ct_error::enoent::make(); + } + checksum_t crc = super_block.crc; + bufferlist meta_b_header; + super_block.crc = 0; + encode(super_block, meta_b_header); + assert(ceph::encoded_sizeof<rbm_metadata_header_t>(super_block) < + super_block.block_size); + + // Do CRC verification only if data protection is not supported. + if (is_data_protection_enabled() == false) { + if (meta_b_header.crc32c(-1) != crc) { + DEBUG("bad crc on super block, expected {} != actual {} ", + meta_b_header.crc32c(-1), crc); + return crimson::ct_error::input_output_error::make(); + } + } else { + ceph_assert_always(crc == (checksum_t)-1); + } + super_block.crc = crc; + super = super_block; + DEBUG("got {} ", super); + return read_ertr::future<rbm_metadata_header_t>( + read_ertr::ready_future_marker{}, + super_block + ); + }); + }); +} + +RBMDevice::mount_ret RBMDevice::do_shard_mount() +{ + return open(get_device_path(), + seastar::open_flags::rw | seastar::open_flags::dsync + ).safe_then([this] { + return stat_device( + ).handle_error( + mount_ertr::pass_further{}, + crimson::ct_error::assert_all{ + "Invalid error stat_device in RBMDevice::do_shard_mount"} + ).safe_then([this](auto st) { + assert(st.block_size > 0); + super.block_size = st.block_size; + return read_rbm_header(RBM_START_ADDRESS + ).safe_then([this](auto s) { + LOG_PREFIX(RBMDevice::do_shard_mount); + shard_info = s.shard_infos[seastar::this_shard_id()]; + INFO("{} read {}", device_id_printer_t{get_device_id()}, shard_info); + s.validate(); + return seastar::now(); + }); + }); + }).handle_error( + mount_ertr::pass_further{}, + crimson::ct_error::assert_all{ + "Invalid error mount in RBMDevice::do_shard_mount"} + ); +} + +EphemeralRBMDeviceRef create_test_ephemeral(uint64_t journal_size, uint64_t data_size) { + return EphemeralRBMDeviceRef( + new EphemeralRBMDevice(journal_size + data_size + + random_block_device::RBMDevice::get_shard_reserved_size(), + EphemeralRBMDevice::TEST_BLOCK_SIZE)); +} + +open_ertr::future<> EphemeralRBMDevice::open( + const std::string &in_path, + seastar::open_flags mode) { + LOG_PREFIX(EphemeralRBMDevice::open); + if (buf) { + return open_ertr::now(); + } + + DEBUG( + "Initializing test memory device {}", + size); + + void* addr = ::mmap( + nullptr, + size, + PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, + -1, + 0); + + buf = (char*)addr; + + ::memset(buf, 0, size); + return open_ertr::now(); +} + +write_ertr::future<> EphemeralRBMDevice::write( + 
uint64_t offset, + bufferptr &&bptr, + uint16_t stream) { + LOG_PREFIX(EphemeralRBMDevice::write); + ceph_assert(buf); + DEBUG( + "EphemeralRBMDevice: write offset {} len {}", + offset, + bptr.length()); + + ::memcpy(buf + offset, bptr.c_str(), bptr.length()); + + return write_ertr::now(); +} + +read_ertr::future<> EphemeralRBMDevice::read( + uint64_t offset, + bufferptr &bptr) { + LOG_PREFIX(EphemeralRBMDevice::read); + ceph_assert(buf); + DEBUG( + "EphemeralRBMDevice: read offset {} len {}", + offset, + bptr.length()); + + bptr.copy_in(0, bptr.length(), buf + offset); + return read_ertr::now(); +} + +Device::close_ertr::future<> EphemeralRBMDevice::close() { + LOG_PREFIX(EphemeralRBMDevice::close); + DEBUG(" close "); + return close_ertr::now(); +} + +write_ertr::future<> EphemeralRBMDevice::writev( + uint64_t offset, + ceph::bufferlist bl, + uint16_t stream) { + LOG_PREFIX(EphemeralRBMDevice::writev); + ceph_assert(buf); + DEBUG( + "EphemeralRBMDevice: write offset {} len {}", + offset, + bl.length()); + + bl.begin().copy(bl.length(), buf + offset); + return write_ertr::now(); +} + +EphemeralRBMDevice::mount_ret EphemeralRBMDevice::mount() { + return do_shard_mount(); +} + +EphemeralRBMDevice::mkfs_ret EphemeralRBMDevice::mkfs(device_config_t config) { + return do_primary_mkfs(config, 1, DEFAULT_TEST_CBJOURNAL_SIZE); +} + +} + diff --git a/src/crimson/os/seastore/random_block_manager/rbm_device.h b/src/crimson/os/seastore/random_block_manager/rbm_device.h new file mode 100644 index 000000000..501d9f913 --- /dev/null +++ b/src/crimson/os/seastore/random_block_manager/rbm_device.h @@ -0,0 +1,261 @@ +//-*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "crimson/os/seastore/seastore_types.h" +#include "crimson/os/seastore/random_block_manager.h" +#include "crimson/os/seastore/device.h" + +namespace ceph { + namespace buffer { + class bufferptr; + } +} + +namespace crimson::os::seastore::random_block_device { + +// from blk/BlockDevice.h +#if defined(__linux__) +#if !defined(F_SET_FILE_RW_HINT) +#define F_LINUX_SPECIFIC_BASE 1024 +#define F_SET_FILE_RW_HINT (F_LINUX_SPECIFIC_BASE + 14) +#endif +// These values match Linux definition +// https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/include/uapi/linux/fcntl.h#n56 +#define WRITE_LIFE_NOT_SET 0 // No hint information set +#define WRITE_LIFE_NONE 1 // No hints about write life time +#define WRITE_LIFE_SHORT 2 // Data written has a short life time +#define WRITE_LIFE_MEDIUM 3 // Data written has a medium life time +#define WRITE_LIFE_LONG 4 // Data written has a long life time +#define WRITE_LIFE_EXTREME 5 // Data written has an extremely long life time +#define WRITE_LIFE_MAX 6 +#else +// On systems don't have WRITE_LIFE_* only use one FD +// And all files are created equal +#define WRITE_LIFE_NOT_SET 0 // No hint information set +#define WRITE_LIFE_NONE 0 // No hints about write life time +#define WRITE_LIFE_SHORT 0 // Data written has a short life time +#define WRITE_LIFE_MEDIUM 0 // Data written has a medium life time +#define WRITE_LIFE_LONG 0 // Data written has a long life time +#define WRITE_LIFE_EXTREME 0 // Data written has an extremely long life time +#define WRITE_LIFE_MAX 1 +#endif + +using read_ertr = crimson::errorator< + crimson::ct_error::input_output_error, + crimson::ct_error::invarg, + crimson::ct_error::enoent, + crimson::ct_error::erange>; + +using write_ertr = crimson::errorator< + crimson::ct_error::input_output_error, + 
crimson::ct_error::invarg,
+ crimson::ct_error::ebadf,
+ crimson::ct_error::enospc>;
+
+using open_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error,
+ crimson::ct_error::invarg,
+ crimson::ct_error::enoent>;
+
+using nvme_command_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error>;
+
+using discard_ertr = crimson::errorator<
+ crimson::ct_error::input_output_error>;
+
+constexpr uint32_t RBM_SUPERBLOCK_SIZE = 4096;
+enum {
+ // TODO: This allows the device to manage crc on a block by itself
+ RBM_NVME_END_TO_END_PROTECTION = 1,
+ RBM_BITMAP_BLOCK_CRC = 2,
+};
+
+class RBMDevice : public Device {
+public:
+ using Device::read;
+ read_ertr::future<> read (
+ paddr_t addr,
+ size_t len,
+ ceph::bufferptr &out) final {
+ uint64_t rbm_addr = convert_paddr_to_abs_addr(addr);
+ return read(rbm_addr, out);
+ }
+protected:
+ rbm_metadata_header_t super;
+ rbm_shard_info_t shard_info;
+public:
+ RBMDevice() {}
+ virtual ~RBMDevice() = default;
+
+ template <typename T>
+ static std::unique_ptr<T> create() {
+ return std::make_unique<T>();
+ }
+
+ device_id_t get_device_id() const {
+ return super.config.spec.id;
+ }
+
+ magic_t get_magic() const final {
+ return super.config.spec.magic;
+ }
+
+ device_type_t get_device_type() const final {
+ return device_type_t::RANDOM_BLOCK_SSD;
+ }
+
+ backend_type_t get_backend_type() const final {
+ return backend_type_t::RANDOM_BLOCK;
+ }
+
+ const seastore_meta_t &get_meta() const final {
+ return super.config.meta;
+ }
+
+ secondary_device_set_t& get_secondary_devices() final {
+ return super.config.secondary_devices;
+ }
+ std::size_t get_available_size() const { return super.size; }
+ extent_len_t get_block_size() const { return super.block_size; }
+
+ virtual read_ertr::future<> read(
+ uint64_t offset,
+ bufferptr &bptr) = 0;
+
+ /*
+ * Multi-stream write
+ *
+ * Gives the device a hint about the classification of data with similar
+ * lifetimes. Data written with the same stream value is managed together
+ * inside the SSD for better write performance.
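+ *
+ * A minimal usage sketch (offsets, buffers and stream ids here are
+ * illustrative placeholders, not values used by SeaStore): short-lived
+ * journal writes and long-lived object data could be issued on separate
+ * streams so the device can group them into different NAND blocks:
+ *
+ *   device->write(journal_off, std::move(journal_bp), 1); // stream 1
+ *   device->write(object_off, std::move(object_bp), 2);   // stream 2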
+ */ + virtual write_ertr::future<> write( + uint64_t offset, + bufferptr &&bptr, + uint16_t stream = 0) = 0; + + virtual discard_ertr::future<> discard( + uint64_t offset, + uint64_t len) { return seastar::now(); } + + virtual open_ertr::future<> open( + const std::string& path, + seastar::open_flags mode) = 0; + + virtual write_ertr::future<> writev( + uint64_t offset, + ceph::bufferlist bl, + uint16_t stream = 0) = 0; + + bool is_data_protection_enabled() const { return false; } + + mkfs_ret do_mkfs(device_config_t); + + // shard 0 mkfs + mkfs_ret do_primary_mkfs(device_config_t, int shard_num, size_t journal_size); + + mount_ret do_mount(); + + mount_ret do_shard_mount(); + + write_ertr::future<> write_rbm_header(); + + read_ertr::future<rbm_metadata_header_t> read_rbm_header(rbm_abs_addr addr); + + using stat_device_ret = + read_ertr::future<seastar::stat_data>; + virtual stat_device_ret stat_device() = 0; + + virtual std::string get_device_path() const = 0; + + uint64_t get_journal_size() const { + return super.journal_size; + } + + static rbm_abs_addr get_shard_reserved_size() { + return RBM_SUPERBLOCK_SIZE; + } + + rbm_abs_addr get_shard_journal_start() { + return shard_info.start_offset + get_shard_reserved_size(); + } + + uint64_t get_shard_start() const { + return shard_info.start_offset; + } + + uint64_t get_shard_end() const { + return shard_info.start_offset + shard_info.size; + } +}; +using RBMDeviceRef = std::unique_ptr<RBMDevice>; + +constexpr uint64_t DEFAULT_TEST_CBJOURNAL_SIZE = 1 << 26; + +class EphemeralRBMDevice : public RBMDevice { +public: + uint64_t size = 0; + uint64_t block_size = 0; + constexpr static uint32_t TEST_BLOCK_SIZE = 4096; + + EphemeralRBMDevice(size_t size, uint64_t block_size) : + size(size), block_size(block_size), buf(nullptr) { + } + ~EphemeralRBMDevice() { + if (buf) { + ::munmap(buf, size); + buf = nullptr; + } + } + + std::size_t get_available_size() const final { return size; } + extent_len_t get_block_size() const final { return block_size; } + + mount_ret mount() final; + mkfs_ret mkfs(device_config_t config) final; + + open_ertr::future<> open( + const std::string &in_path, + seastar::open_flags mode) override; + + write_ertr::future<> write( + uint64_t offset, + bufferptr &&bptr, + uint16_t stream = 0) override; + + using RBMDevice::read; + read_ertr::future<> read( + uint64_t offset, + bufferptr &bptr) override; + + close_ertr::future<> close() override; + + write_ertr::future<> writev( + uint64_t offset, + ceph::bufferlist bl, + uint16_t stream = 0) final; + + stat_device_ret stat_device() final { + seastar::stat_data stat; + stat.block_size = block_size; + stat.size = size; + return stat_device_ret( + read_ertr::ready_future_marker{}, + stat + ); + } + + std::string get_device_path() const final { + return ""; + } + + char *buf; +}; +using EphemeralRBMDeviceRef = std::unique_ptr<EphemeralRBMDevice>; +EphemeralRBMDeviceRef create_test_ephemeral( + uint64_t journal_size = DEFAULT_TEST_CBJOURNAL_SIZE, + uint64_t data_size = DEFAULT_TEST_CBJOURNAL_SIZE); + +} diff --git a/src/crimson/os/seastore/randomblock_manager_group.h b/src/crimson/os/seastore/randomblock_manager_group.h new file mode 100644 index 000000000..77d9cf797 --- /dev/null +++ b/src/crimson/os/seastore/randomblock_manager_group.h @@ -0,0 +1,71 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab expandtab + +#pragma once + +#include <set> + +#include "crimson/common/errorator.h" +#include 
"crimson/os/seastore/seastore_types.h" +#include "crimson/os/seastore/random_block_manager.h" +#include "crimson/os/seastore/random_block_manager/block_rb_manager.h" + +namespace crimson::os::seastore { + +class RBMDeviceGroup { +public: + RBMDeviceGroup() { + rb_devices.resize(DEVICE_ID_MAX); + } + + const std::set<device_id_t>& get_device_ids() const { + return device_ids; + } + + std::vector<RandomBlockManager*> get_rb_managers() const { + assert(device_ids.size()); + std::vector<RandomBlockManager*> ret; + for (auto& device_id : device_ids) { + auto rb_device = rb_devices[device_id].get(); + assert(rb_device->get_device_id() == device_id); + ret.emplace_back(rb_device); + } + return ret; + } + + void add_rb_manager(RandomBlockManagerRef rbm) { + auto device_id = rbm->get_device_id(); + ceph_assert(!has_device(device_id)); + rb_devices[device_id] = std::move(rbm); + device_ids.insert(device_id); + } + + void reset() { + rb_devices.clear(); + rb_devices.resize(DEVICE_ID_MAX); + device_ids.clear(); + } + + auto get_block_size() const { + assert(device_ids.size()); + return rb_devices[*device_ids.begin()]->get_block_size(); + } + + const seastore_meta_t &get_meta() const { + assert(device_ids.size()); + return rb_devices[*device_ids.begin()]->get_meta(); + } + +private: + bool has_device(device_id_t id) const { + assert(id <= DEVICE_ID_MAX_VALID); + return device_ids.count(id) >= 1; + } + + std::vector<RandomBlockManagerRef> rb_devices; + std::set<device_id_t> device_ids; +}; + +using RBMDeviceGroupRef = std::unique_ptr<RBMDeviceGroup>; + +} diff --git a/src/crimson/os/seastore/record_scanner.cc b/src/crimson/os/seastore/record_scanner.cc new file mode 100644 index 000000000..9778bbb77 --- /dev/null +++ b/src/crimson/os/seastore/record_scanner.cc @@ -0,0 +1,239 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab expandtab + +#include "crimson/os/seastore/record_scanner.h" + +#include "crimson/os/seastore/logging.h" + +SET_SUBSYS(seastore_journal); + +namespace crimson::os::seastore { + +RecordScanner::scan_valid_records_ret +RecordScanner::scan_valid_records( + scan_valid_records_cursor &cursor, + segment_nonce_t nonce, + size_t budget, + found_record_handler_t &handler) +{ + LOG_PREFIX(RecordScanner::scan_valid_records); + initialize_cursor(cursor); + DEBUG("starting at {}, budget={}", cursor, budget); + auto retref = std::make_unique<size_t>(0); + auto &budget_used = *retref; + return crimson::repeat( + [=, &cursor, &budget_used, &handler, this]() mutable + -> scan_valid_records_ertr::future<seastar::stop_iteration> { + return [=, &handler, &cursor, &budget_used, this] { + if (!cursor.last_valid_header_found) { + return read_validate_record_metadata(cursor, nonce + ).safe_then([=, &cursor](auto md) { + if (!md) { + cursor.last_valid_header_found = true; + if (cursor.is_complete()) { + INFO("complete at {}, invalid record group metadata", + cursor); + } else { + DEBUG("found invalid record group metadata at {}, " + "processing {} pending record groups", + cursor.seq, + cursor.pending_record_groups.size()); + } + return scan_valid_records_ertr::now(); + } else { + auto& [header, md_bl] = *md; + DEBUG("found valid {} at {}", header, cursor.seq); + cursor.emplace_record_group(header, std::move(md_bl)); + return scan_valid_records_ertr::now(); + } + }).safe_then([=, &cursor, &budget_used, &handler, this] { + DEBUG("processing committed record groups until {}, {} pending", + cursor.last_committed, + cursor.pending_record_groups.size()); + 
return crimson::repeat( + [=, &budget_used, &cursor, &handler, this] { + if (cursor.pending_record_groups.empty()) { + /* This is only possible if the segment is empty. + * A record's last_commited must be prior to its own + * location since it itself cannot yet have been committed + * at its own time of submission. Thus, the most recently + * read record must always fall after cursor.last_committed */ + return scan_valid_records_ertr::make_ready_future< + seastar::stop_iteration>(seastar::stop_iteration::yes); + } + auto &next = cursor.pending_record_groups.front(); + journal_seq_t next_seq = {cursor.seq.segment_seq, next.offset}; + if (cursor.last_committed == JOURNAL_SEQ_NULL || + next_seq > cursor.last_committed) { + return scan_valid_records_ertr::make_ready_future< + seastar::stop_iteration>(seastar::stop_iteration::yes); + } + return consume_next_records(cursor, handler, budget_used + ).safe_then([] { + return scan_valid_records_ertr::make_ready_future< + seastar::stop_iteration>(seastar::stop_iteration::no); + }); + }); + }); + } else { + assert(!cursor.pending_record_groups.empty()); + auto &next = cursor.pending_record_groups.front(); + return read_validate_data(next.offset, next.header + ).safe_then([this, FNAME, &budget_used, &cursor, &handler, &next](auto valid) { + if (!valid) { + INFO("complete at {}, invalid record group data at {}, {}", + cursor, next.offset, next.header); + cursor.pending_record_groups.clear(); + return scan_valid_records_ertr::now(); + } + return consume_next_records(cursor, handler, budget_used); + }); + } + }().safe_then([=, &budget_used, &cursor] { + if (cursor.is_complete() || budget_used >= budget) { + DEBUG("finish at {}, budget_used={}, budget={}", + cursor, budget_used, budget); + return seastar::stop_iteration::yes; + } else { + return seastar::stop_iteration::no; + } + }); + }).safe_then([retref=std::move(retref)]() mutable -> scan_valid_records_ret { + return scan_valid_records_ret( + scan_valid_records_ertr::ready_future_marker{}, + std::move(*retref)); + }); +} + +RecordScanner::read_validate_record_metadata_ret +RecordScanner::read_validate_record_metadata( + scan_valid_records_cursor &cursor, + segment_nonce_t nonce) +{ + LOG_PREFIX(RecordScanner::read_validate_record_metadata); + paddr_t start = cursor.seq.offset; + auto block_size = cursor.get_block_size(); + if (get_segment_off(cursor.seq.offset) + block_size > get_segment_end_offset(cursor.seq.offset)) { + DEBUG("failed -- record group header block {}~4096 > segment_size {}", + start, get_segment_end_offset(cursor.seq.offset)); + return read_validate_record_metadata_ret( + read_validate_record_metadata_ertr::ready_future_marker{}, + std::nullopt); + } + TRACE("reading record group header block {}~4096", start); + return read(start, block_size + ).safe_then([=](bufferptr bptr) mutable + -> read_validate_record_metadata_ret { + bufferlist bl; + bl.append(bptr); + auto maybe_header = try_decode_records_header(bl, nonce); + if (!maybe_header.has_value()) { + return read_validate_record_metadata_ret( + read_validate_record_metadata_ertr::ready_future_marker{}, + std::nullopt); + } + + auto& header = *maybe_header; + if (header.mdlength < block_size || + header.mdlength % block_size != 0 || + header.dlength % block_size != 0 || + (header.committed_to != JOURNAL_SEQ_NULL && + get_segment_off(header.committed_to.offset) % + cursor.get_block_size() != 0) || + (get_segment_off(cursor.seq.offset) + header.mdlength + header.dlength > + get_segment_end_offset(cursor.seq.offset))) { + ERROR("failed, 
invalid record group header {}", header); + return crimson::ct_error::input_output_error::make(); + } + + if (is_record_segment_seq_invalid(cursor, header)) { + return read_validate_record_metadata_ret( + read_validate_record_metadata_ertr::ready_future_marker{}, + std::nullopt); + } + + if (header.mdlength == block_size) { + return read_validate_record_metadata_ret( + read_validate_record_metadata_ertr::ready_future_marker{}, + std::make_pair(std::move(header), std::move(bl)) + ); + } + + paddr_t rest_start = cursor.seq.offset.add_offset(block_size); + auto rest_len = header.mdlength - block_size; + TRACE("reading record group header rest {}~{}", rest_start, rest_len); + return read(rest_start, rest_len + ).safe_then([header=std::move(header), bl=std::move(bl) + ](auto&& bptail) mutable { + bl.push_back(bptail); + return read_validate_record_metadata_ret( + read_validate_record_metadata_ertr::ready_future_marker{}, + std::make_pair(std::move(header), std::move(bl))); + }); + }).safe_then([](auto p) { + if (p && validate_records_metadata(p->second)) { + return read_validate_record_metadata_ret( + read_validate_record_metadata_ertr::ready_future_marker{}, + std::move(*p) + ); + } else { + return read_validate_record_metadata_ret( + read_validate_record_metadata_ertr::ready_future_marker{}, + std::nullopt); + } + }); + +} + +RecordScanner::read_validate_data_ret RecordScanner::read_validate_data( + paddr_t record_base, + const record_group_header_t &header) +{ + LOG_PREFIX(RecordScanner::read_validate_data); + auto data_addr = record_base.add_offset(header.mdlength); + TRACE("reading record group data blocks {}~{}", data_addr, header.dlength); + return read( + data_addr, + header.dlength + ).safe_then([=, &header](auto bptr) { + bufferlist bl; + bl.append(bptr); + return validate_records_data(header, bl); + }); +} + +RecordScanner::consume_record_group_ertr::future<> +RecordScanner::consume_next_records( + scan_valid_records_cursor& cursor, + found_record_handler_t& handler, + std::size_t& budget_used) +{ + LOG_PREFIX(RecordScanner::consume_next_records); + auto& next = cursor.pending_record_groups.front(); + auto total_length = next.header.dlength + next.header.mdlength; + budget_used += total_length; + auto locator = record_locator_t{ + next.offset.add_offset(next.header.mdlength), + write_result_t{ + journal_seq_t{ + cursor.seq.segment_seq, + next.offset + }, + total_length + } + }; + DEBUG("processing {} at {}, budget_used={}", + next.header, locator, budget_used); + return handler( + locator, + next.header, + next.mdbuffer + ).safe_then([FNAME, &cursor] { + cursor.pop_record_group(); + if (cursor.is_complete()) { + INFO("complete at {}, no more record group", cursor); + } + }); +} + +} diff --git a/src/crimson/os/seastore/record_scanner.h b/src/crimson/os/seastore/record_scanner.h new file mode 100644 index 000000000..2cbc7c562 --- /dev/null +++ b/src/crimson/os/seastore/record_scanner.h @@ -0,0 +1,83 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab expandtab + +#pragma once + +#include "crimson/common/errorator.h" +#include "crimson/os/seastore/seastore_types.h" +#include "crimson/os/seastore/segment_manager.h" + + +namespace crimson::os::seastore { + +class RecordScanner { +public: + using read_ertr = SegmentManager::read_ertr; + using scan_valid_records_ertr = read_ertr; + using scan_valid_records_ret = scan_valid_records_ertr::future< + size_t>; + using found_record_handler_t = std::function< + 
scan_valid_records_ertr::future<>( + record_locator_t record_locator, + // callee may assume header and bl will remain valid until + // returned future resolves + const record_group_header_t &header, + const bufferlist &mdbuf)>; + scan_valid_records_ret scan_valid_records( + scan_valid_records_cursor &cursor, ///< [in, out] cursor, updated during call + segment_nonce_t nonce, ///< [in] nonce for segment + size_t budget, ///< [in] max budget to use + found_record_handler_t &handler ///< [in] handler for records + ); ///< @return used budget + + device_off_t get_segment_off(paddr_t addr) const { + if (addr.get_addr_type() == paddr_types_t::SEGMENT) { + auto& seg_addr = addr.as_seg_paddr(); + return seg_addr.get_segment_off(); + } + assert(addr.get_addr_type() == paddr_types_t::RANDOM_BLOCK); + auto& blk_addr = addr.as_blk_paddr(); + return blk_addr.get_device_off(); + } + +protected: + /// read record metadata for record starting at start + using read_validate_record_metadata_ertr = read_ertr; + using read_validate_record_metadata_ret = + read_validate_record_metadata_ertr::future< + std::optional<std::pair<record_group_header_t, bufferlist>> + >; + read_validate_record_metadata_ret read_validate_record_metadata( + scan_valid_records_cursor &cursor, + segment_nonce_t nonce); + + /// read and validate data + using read_validate_data_ertr = read_ertr; + using read_validate_data_ret = read_validate_data_ertr::future<bool>; + read_validate_data_ret read_validate_data( + paddr_t record_base, + const record_group_header_t &header ///< caller must ensure lifetime through + /// future resolution + ); + + virtual bool is_record_segment_seq_invalid(scan_valid_records_cursor &cursor, + record_group_header_t &h) = 0; + + virtual int64_t get_segment_end_offset(paddr_t addr) = 0; + + using read_ret = read_ertr::future<bufferptr>; + virtual read_ret read(paddr_t start, size_t len) = 0; + + using consume_record_group_ertr = scan_valid_records_ertr; + consume_record_group_ertr::future<> consume_next_records( + scan_valid_records_cursor& cursor, + found_record_handler_t& handler, + std::size_t& budget_used); + + virtual void initialize_cursor(scan_valid_records_cursor &cursor) = 0; + + virtual ~RecordScanner() {} + +}; + +} diff --git a/src/crimson/os/seastore/root_block.cc b/src/crimson/os/seastore/root_block.cc new file mode 100644 index 000000000..dc928e81b --- /dev/null +++ b/src/crimson/os/seastore/root_block.cc @@ -0,0 +1,27 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "crimson/os/seastore/root_block.h" +#include "crimson/os/seastore/lba_manager/btree/lba_btree_node.h" +#include "crimson/os/seastore/backref/backref_tree_node.h" + +namespace crimson::os::seastore { + +void RootBlock::on_replace_prior(Transaction &t) { + if (!lba_root_node) { + auto &prior = static_cast<RootBlock&>(*get_prior_instance()); + lba_root_node = prior.lba_root_node; + if (lba_root_node) { + ((lba_manager::btree::LBANode*)lba_root_node)->root_block = this; + } + } + if (!backref_root_node) { + auto &prior = static_cast<RootBlock&>(*get_prior_instance()); + backref_root_node = prior.backref_root_node; + if (backref_root_node) { + ((backref::BackrefNode*)backref_root_node)->root_block = this; + } + } +} + +} // namespace crimson::os::seastore diff --git a/src/crimson/os/seastore/root_block.h b/src/crimson/os/seastore/root_block.h new file mode 100644 index 000000000..0e45519ce --- /dev/null +++ b/src/crimson/os/seastore/root_block.h @@ -0,0 +1,109 @@ +// -*- 
mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "crimson/os/seastore/cached_extent.h" + +namespace crimson::os::seastore { + +/** + * RootBlock + * + * Holds the physical addresses of all metadata roots. + * In-memory values may be + * - absolute: reference to block which predates the current transaction + * - record_relative: reference to block updated in this transaction + * if !pending() + * + * Journal replay only considers deltas and must always discover the most + * recent value for the RootBlock. Because the contents of root_t above are + * very small, it's simplest to stash the entire root_t value into the delta + * and never actually write the RootBlock to a physical location (safe since + * nothing references the location of the RootBlock). + * + * As a result, Cache treats the root differently in a few ways including: + * - state will only ever be DIRTY or MUTATION_PENDING + * - RootBlock's never show up in the transaction fresh or dirty lists -- + * there's a special Transaction::root member for when the root needs to + * be mutated. + * + * TODO: Journal trimming will need to be aware of the most recent RootBlock + * delta location, or, even easier, just always write one out with the + * mutation which changes the journal trim bound. + */ +struct RootBlock : CachedExtent { + constexpr static extent_len_t SIZE = 4<<10; + using Ref = TCachedExtentRef<RootBlock>; + + root_t root; + + CachedExtent* lba_root_node = nullptr; + CachedExtent* backref_root_node = nullptr; + + RootBlock() : CachedExtent(zero_length_t()) {}; + + RootBlock(const RootBlock &rhs) + : CachedExtent(rhs), + root(rhs.root), + lba_root_node(nullptr), + backref_root_node(nullptr) + {} + + CachedExtentRef duplicate_for_write(Transaction&) final { + return CachedExtentRef(new RootBlock(*this)); + }; + + static constexpr extent_types_t TYPE = extent_types_t::ROOT; + extent_types_t get_type() const final { + return extent_types_t::ROOT; + } + + void on_replace_prior(Transaction &t) final; + + /// dumps root as delta + ceph::bufferlist get_delta() final { + ceph::bufferlist bl; + ceph::buffer::ptr bptr(sizeof(root_t)); + *reinterpret_cast<root_t*>(bptr.c_str()) = root; + bl.append(bptr); + return bl; + } + + /// overwrites root + void apply_delta_and_adjust_crc(paddr_t base, const ceph::bufferlist &_bl) final { + assert(_bl.length() == sizeof(root_t)); + ceph::bufferlist bl = _bl; + bl.rebuild(); + root = *reinterpret_cast<const root_t*>(bl.front().c_str()); + root.adjust_addrs_from_base(base); + } + + /// Patches relative addrs in memory based on record commit addr + void on_delta_write(paddr_t record_block_offset) final { + root.adjust_addrs_from_base(record_block_offset); + } + + complete_load_ertr::future<> complete_load() final { + ceph_abort_msg("Root is only written via deltas"); + } + + void on_initial_write() final { + ceph_abort_msg("Root is only written via deltas"); + } + + root_t &get_root() { return root; } + + std::ostream &print_detail(std::ostream &out) const final { + return out << ", root_block(lba_root_node=" << (void*)lba_root_node + << ", backref_root_node=" << (void*)backref_root_node + << ")"; + } +}; +using RootBlockRef = RootBlock::Ref; + +} + +#if FMT_VERSION >= 90000 +template <> struct fmt::formatter<crimson::os::seastore::RootBlock> : fmt::ostream_formatter {}; +#endif diff --git a/src/crimson/os/seastore/seastore.cc b/src/crimson/os/seastore/seastore.cc new file mode 100644 index 000000000..897a063e0 --- /dev/null 
+++ b/src/crimson/os/seastore/seastore.cc @@ -0,0 +1,2135 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "seastore.h" + +#include <algorithm> + +#include <boost/algorithm/string/trim.hpp> +#include <fmt/format.h> +#include <fmt/ostream.h> + +#include <seastar/core/file.hh> +#include <seastar/core/fstream.hh> +#include <seastar/core/metrics.hh> +#include <seastar/core/shared_mutex.hh> + +#include "common/safe_io.h" +#include "include/stringify.h" +#include "os/Transaction.h" + +#include "crimson/common/buffer_io.h" + +#include "crimson/os/futurized_collection.h" + +#include "crimson/os/seastore/backref_manager.h" +#include "crimson/os/seastore/async_cleaner.h" +#include "crimson/os/seastore/collection_manager/flat_collection_manager.h" +#include "crimson/os/seastore/onode_manager/staged-fltree/fltree_onode_manager.h" +#include "crimson/os/seastore/omap_manager/btree/btree_omap_manager.h" +#include "crimson/os/seastore/onode_manager.h" +#include "crimson/os/seastore/object_data_handler.h" + + +using std::string; +using crimson::common::local_conf; + +template <> struct fmt::formatter<crimson::os::seastore::op_type_t> + : fmt::formatter<std::string_view> { + using op_type_t = crimson::os::seastore::op_type_t; + // parse is inherited from formatter<string_view>. + template <typename FormatContext> + auto format(op_type_t op, FormatContext& ctx) { + std::string_view name = "unknown"; + switch (op) { + case op_type_t::TRANSACTION: + name = "transaction"; + break; + case op_type_t::READ: + name = "read"; + break; + case op_type_t::WRITE: + name = "write"; + break; + case op_type_t::GET_ATTR: + name = "get_attr"; + break; + case op_type_t::GET_ATTRS: + name = "get_attrs"; + break; + case op_type_t::STAT: + name = "stat"; + break; + case op_type_t::OMAP_GET_VALUES: + name = "omap_get_values"; + break; + case op_type_t::OMAP_LIST: + name = "omap_list"; + break; + case op_type_t::MAX: + name = "unknown"; + break; + } + return formatter<string_view>::format(name, ctx); + } +}; + +SET_SUBSYS(seastore); + +namespace crimson::os::seastore { + +class FileMDStore final : public SeaStore::MDStore { + std::string root; +public: + FileMDStore(const std::string& root) : root(root) {} + + write_meta_ret write_meta( + const std::string& key, const std::string& value) final { + std::string path = fmt::format("{}/{}", root, key); + ceph::bufferlist bl; + bl.append(value + "\n"); + return crimson::write_file(std::move(bl), path); + } + + read_meta_ret read_meta(const std::string& key) final { + std::string path = fmt::format("{}/{}", root, key); + return seastar::file_exists( + path + ).then([path] (bool exist) { + if (exist) { + return crimson::read_file(path) + .then([] (auto tmp_buf) { + std::string v = {tmp_buf.get(), tmp_buf.size()}; + std::size_t pos = v.find("\n"); + std::string str = v.substr(0, pos); + return seastar::make_ready_future<std::optional<std::string>>(str); + }); + } else { + return seastar::make_ready_future<std::optional<std::string>>(std::nullopt); + } + }); + } +}; + +using crimson::common::get_conf; + +SeaStore::Shard::Shard( + std::string root, + Device* dev, + bool is_test) + :root(root), + max_object_size( + get_conf<uint64_t>("seastore_default_max_object_size")), + is_test(is_test), + throttler( + get_conf<uint64_t>("seastore_max_concurrent_transactions")) +{ + device = &(dev->get_sharded_device()); + register_metrics(); +} + +SeaStore::SeaStore( + const std::string& root, + MDStoreRef mdstore) + : root(root), + 
mdstore(std::move(mdstore)) +{ +} + +SeaStore::~SeaStore() = default; + +void SeaStore::Shard::register_metrics() +{ + namespace sm = seastar::metrics; + using op_type_t = crimson::os::seastore::op_type_t; + std::pair<op_type_t, sm::label_instance> labels_by_op_type[] = { + {op_type_t::TRANSACTION, sm::label_instance("latency", "TRANSACTION")}, + {op_type_t::READ, sm::label_instance("latency", "READ")}, + {op_type_t::WRITE, sm::label_instance("latency", "WRITE")}, + {op_type_t::GET_ATTR, sm::label_instance("latency", "GET_ATTR")}, + {op_type_t::GET_ATTRS, sm::label_instance("latency", "GET_ATTRS")}, + {op_type_t::STAT, sm::label_instance("latency", "STAT")}, + {op_type_t::OMAP_GET_VALUES, sm::label_instance("latency", "OMAP_GET_VALUES")}, + {op_type_t::OMAP_LIST, sm::label_instance("latency", "OMAP_LIST")}, + }; + + for (auto& [op_type, label] : labels_by_op_type) { + auto desc = fmt::format("latency of seastore operation (optype={})", + op_type); + metrics.add_group( + "seastore", + { + sm::make_histogram( + "op_lat", [this, op_type=op_type] { + return get_latency(op_type); + }, + sm::description(desc), + {label} + ), + } + ); + } + + metrics.add_group( + "seastore", + { + sm::make_gauge( + "concurrent_transactions", + [this] { + return throttler.get_current(); + }, + sm::description("transactions that are running inside seastore") + ), + sm::make_gauge( + "pending_transactions", + [this] { + return throttler.get_pending(); + }, + sm::description("transactions waiting to get " + "through seastore's throttler") + ) + } + ); +} + +seastar::future<> SeaStore::start() +{ + ceph_assert(seastar::this_shard_id() == primary_core); +#ifndef NDEBUG + bool is_test = true; +#else + bool is_test = false; +#endif + using crimson::common::get_conf; + std::string type = get_conf<std::string>("seastore_main_device_type"); + device_type_t d_type = string_to_device_type(type); + assert(d_type == device_type_t::SSD || + d_type == device_type_t::RANDOM_BLOCK_SSD); + + ceph_assert(root != ""); + return Device::make_device(root, d_type + ).then([this](DeviceRef device_obj) { + device = std::move(device_obj); + return device->start(); + }).then([this, is_test] { + ceph_assert(device); + return shard_stores.start(root, device.get(), is_test); + }); +} + +seastar::future<> SeaStore::test_start(DeviceRef device_obj) +{ + ceph_assert(device_obj); + ceph_assert(root == ""); + device = std::move(device_obj); + return shard_stores.start_single(root, device.get(), true); +} + +seastar::future<> SeaStore::stop() +{ + ceph_assert(seastar::this_shard_id() == primary_core); + return seastar::do_for_each(secondaries, [](auto& sec_dev) { + return sec_dev->stop(); + }).then([this] { + secondaries.clear(); + if (device) { + return device->stop(); + } else { + return seastar::now(); + } + }).then([this] { + return shard_stores.stop(); + }); +} + +SeaStore::mount_ertr::future<> SeaStore::test_mount() +{ + ceph_assert(seastar::this_shard_id() == primary_core); + return shard_stores.local().mount_managers(); +} + +SeaStore::mount_ertr::future<> SeaStore::mount() +{ + ceph_assert(seastar::this_shard_id() == primary_core); + return device->mount( + ).safe_then([this] { + auto sec_devices = device->get_sharded_device().get_secondary_devices(); + return crimson::do_for_each(sec_devices, [this](auto& device_entry) { + device_id_t id = device_entry.first; + magic_t magic = device_entry.second.magic; + device_type_t dtype = device_entry.second.dtype; + std::string path = + fmt::format("{}/block.{}.{}", root, dtype, std::to_string(id)); + 
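// Each secondary device lives at <root>/block.<device_type>.<id>, the same naming scheme SeaStore::mkfs scans for below.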
return Device::make_device(path, dtype + ).then([this, path, magic](DeviceRef sec_dev) { + return sec_dev->start( + ).then([this, magic, sec_dev = std::move(sec_dev)]() mutable { + return sec_dev->mount( + ).safe_then([this, sec_dev=std::move(sec_dev), magic]() mutable { + boost::ignore_unused(magic); // avoid clang warning; + assert(sec_dev->get_sharded_device().get_magic() == magic); + secondaries.emplace_back(std::move(sec_dev)); + }); + }).safe_then([this] { + return set_secondaries(); + }); + }); + }).safe_then([this] { + return shard_stores.invoke_on_all([](auto &local_store) { + return local_store.mount_managers(); + }); + }); + }).handle_error( + crimson::ct_error::assert_all{ + "Invalid error in SeaStore::mount" + } + ); +} + +seastar::future<> SeaStore::Shard::mount_managers() +{ + init_managers(); + return transaction_manager->mount( + ).handle_error( + crimson::ct_error::assert_all{ + "Invalid error in mount_managers" + }); +} + +seastar::future<> SeaStore::umount() +{ + ceph_assert(seastar::this_shard_id() == primary_core); + return shard_stores.invoke_on_all([](auto &local_store) { + return local_store.umount(); + }); +} + +seastar::future<> SeaStore::Shard::umount() +{ + return [this] { + if (transaction_manager) { + return transaction_manager->close(); + } else { + return TransactionManager::close_ertr::now(); + } + }().safe_then([this] { + return crimson::do_for_each( + secondaries, + [](auto& sec_dev) -> SegmentManager::close_ertr::future<> + { + return sec_dev->close(); + }); + }).safe_then([this] { + return device->close(); + }).safe_then([this] { + secondaries.clear(); + transaction_manager.reset(); + collection_manager.reset(); + onode_manager.reset(); + }).handle_error( + crimson::ct_error::assert_all{ + "Invalid error in SeaStore::umount" + } + ); +} + +seastar::future<> SeaStore::write_fsid(uuid_d new_osd_fsid) +{ + ceph_assert(seastar::this_shard_id() == primary_core); + LOG_PREFIX(SeaStore::write_fsid); + return read_meta("fsid").then([this, FNAME, new_osd_fsid] (auto tuple) { + auto [ret, fsid] = tuple; + std::string str_fsid = stringify(new_osd_fsid); + if (ret == -1) { + return write_meta("fsid", stringify(new_osd_fsid)); + } else if (ret == 0 && fsid != str_fsid) { + ERROR("on-disk fsid {} != provided {}", + fsid, stringify(new_osd_fsid)); + throw std::runtime_error("store fsid error"); + } else { + return seastar::now(); + } + }); +} + +seastar::future<> +SeaStore::Shard::mkfs_managers() +{ + init_managers(); + return transaction_manager->mkfs( + ).safe_then([this] { + init_managers(); + return transaction_manager->mount(); + }).safe_then([this] { + return repeat_eagain([this] { + return transaction_manager->with_transaction_intr( + Transaction::src_t::MUTATE, + "mkfs_seastore", + [this](auto& t) + { + return onode_manager->mkfs(t + ).si_then([this, &t] { + return collection_manager->mkfs(t); + }).si_then([this, &t](auto coll_root) { + transaction_manager->write_collection_root( + t, coll_root); + return transaction_manager->submit_transaction(t); + }); + }); + }); + }).handle_error( + crimson::ct_error::assert_all{ + "Invalid error in Shard::mkfs_managers" + } + ); +} + +seastar::future<> SeaStore::set_secondaries() +{ + auto sec_dev_ite = secondaries.rbegin(); + Device* sec_dev = sec_dev_ite->get(); + return shard_stores.invoke_on_all([sec_dev](auto &local_store) { + local_store.set_secondaries(sec_dev->get_sharded_device()); + }); +} + +SeaStore::mkfs_ertr::future<> SeaStore::test_mkfs(uuid_d new_osd_fsid) +{ + ceph_assert(seastar::this_shard_id() == 
primary_core); + return read_meta("mkfs_done").then([this, new_osd_fsid] (auto tuple) { + auto [done, value] = tuple; + if (done == 0) { + return seastar::now(); + } + return shard_stores.local().mkfs_managers( + ).then([this, new_osd_fsid] { + return prepare_meta(new_osd_fsid); + }); + }); +} + +seastar::future<> SeaStore::prepare_meta(uuid_d new_osd_fsid) +{ + ceph_assert(seastar::this_shard_id() == primary_core); + return write_fsid(new_osd_fsid).then([this] { + return read_meta("type").then([this] (auto tuple) { + auto [ret, type] = tuple; + if (ret == 0 && type == "seastore") { + return seastar::now(); + } else if (ret == 0 && type != "seastore") { + LOG_PREFIX(SeaStore::prepare_meta); + ERROR("expected seastore, but type is {}", type); + throw std::runtime_error("store type error"); + } else { + return write_meta("type", "seastore"); + } + }); + }).then([this] { + return write_meta("mkfs_done", "yes"); + }); +} + +SeaStore::mkfs_ertr::future<> SeaStore::mkfs(uuid_d new_osd_fsid) +{ + ceph_assert(seastar::this_shard_id() == primary_core); + return read_meta("mkfs_done").then([this, new_osd_fsid] (auto tuple) { + auto [done, value] = tuple; + if (done == 0) { + return seastar::now(); + } else { + return seastar::do_with( + secondary_device_set_t(), + [this, new_osd_fsid](auto& sds) { + auto fut = seastar::now(); + LOG_PREFIX(SeaStore::mkfs); + DEBUG("root: {}", root); + if (!root.empty()) { + fut = seastar::open_directory(root + ).then([this, &sds, new_osd_fsid](seastar::file rdir) mutable { + std::unique_ptr<seastar::file> root_f = + std::make_unique<seastar::file>(std::move(rdir)); + auto sub = root_f->list_directory( + [this, &sds, new_osd_fsid](auto de) mutable -> seastar::future<> + { + LOG_PREFIX(SeaStore::mkfs); + DEBUG("found file: {}", de.name); + if (de.name.find("block.") == 0 + && de.name.length() > 6 /* 6 for "block." 
*/) { + std::string entry_name = de.name; + auto dtype_end = entry_name.find_first_of('.', 6); + device_type_t dtype = + string_to_device_type( + entry_name.substr(6, dtype_end - 6)); + if (dtype == device_type_t::NONE) { + // invalid device type + return seastar::now(); + } + auto id = std::stoi(entry_name.substr(dtype_end + 1)); + std::string path = fmt::format("{}/{}", root, entry_name); + return Device::make_device(path, dtype + ).then([this, &sds, id, dtype, new_osd_fsid](DeviceRef sec_dev) { + auto p_sec_dev = sec_dev.get(); + secondaries.emplace_back(std::move(sec_dev)); + return p_sec_dev->start( + ).then([&sds, id, dtype, new_osd_fsid, p_sec_dev]() { + magic_t magic = (magic_t)std::rand(); + sds.emplace( + (device_id_t)id, + device_spec_t{magic, dtype, (device_id_t)id}); + return p_sec_dev->mkfs(device_config_t::create_secondary( + new_osd_fsid, id, dtype, magic) + ).handle_error(crimson::ct_error::assert_all{"not possible"}); + }); + }).then([this] { + return set_secondaries(); + }); + } + return seastar::now(); + }); + return sub.done().then([root_f=std::move(root_f)] {}); + }); + } + return fut.then([this, &sds, new_osd_fsid] { + device_id_t id = 0; + device_type_t d_type = device->get_device_type(); + assert(d_type == device_type_t::SSD || + d_type == device_type_t::RANDOM_BLOCK_SSD); + if (d_type == device_type_t::RANDOM_BLOCK_SSD) { + id = static_cast<device_id_t>(DEVICE_ID_RANDOM_BLOCK_MIN); + } + + return device->mkfs( + device_config_t::create_primary(new_osd_fsid, id, d_type, sds) + ); + }).safe_then([this] { + return crimson::do_for_each(secondaries, [](auto& sec_dev) { + return sec_dev->mount(); + }); + }); + }).safe_then([this] { + return device->mount(); + }).safe_then([this] { + return shard_stores.invoke_on_all([] (auto &local_store) { + return local_store.mkfs_managers(); + }); + }).safe_then([this, new_osd_fsid] { + return prepare_meta(new_osd_fsid); + }).safe_then([this] { + return umount(); + }).handle_error( + crimson::ct_error::assert_all{ + "Invalid error in SeaStore::mkfs" + } + ); + } + }); +} + +using coll_core_t = FuturizedStore::coll_core_t; +seastar::future<std::vector<coll_core_t>> +SeaStore::list_collections() +{ + ceph_assert(seastar::this_shard_id() == primary_core); + return shard_stores.map([](auto &local_store) { + return local_store.list_collections(); + }).then([](std::vector<std::vector<coll_core_t>> results) { + std::vector<coll_core_t> collections; + for (auto& colls : results) { + collections.insert(collections.end(), colls.begin(), colls.end()); + } + return seastar::make_ready_future<std::vector<coll_core_t>>( + std::move(collections)); + }); +} + +store_statfs_t SeaStore::Shard::stat() const +{ + return transaction_manager->store_stat(); +} + +seastar::future<store_statfs_t> SeaStore::stat() const +{ + ceph_assert(seastar::this_shard_id() == primary_core); + LOG_PREFIX(SeaStore::stat); + DEBUG(""); + return shard_stores.map_reduce0( + [](const SeaStore::Shard &local_store) { + return local_store.stat(); + }, + store_statfs_t(), + [](auto &&ss, auto &&ret) { + ss.add(ret); + return std::move(ss); + } + ).then([](store_statfs_t ss) { + return seastar::make_ready_future<store_statfs_t>(std::move(ss)); + }); +} + +TransactionManager::read_extent_iertr::future<std::optional<unsigned>> +SeaStore::Shard::get_coll_bits(CollectionRef ch, Transaction &t) const +{ + return transaction_manager->read_collection_root(t) + .si_then([this, ch, &t](auto coll_root) { + return collection_manager->list(coll_root, t); + }).si_then([ch](auto colls) { + auto 
it = std::find_if(colls.begin(), colls.end(), + [ch](const std::pair<coll_t, coll_info_t>& element) { + return element.first == ch->get_cid(); + }); + if (it != colls.end()) { + return TransactionManager::read_extent_iertr::make_ready_future< + std::optional<unsigned>>(it->second.split_bits); + } else { + return TransactionManager::read_extent_iertr::make_ready_future< + std::optional<unsigned>>(std::nullopt); + } + }); +} + +col_obj_ranges_t +SeaStore::get_objs_range(CollectionRef ch, unsigned bits) +{ + col_obj_ranges_t obj_ranges; + spg_t pgid; + constexpr uint32_t MAX_HASH = std::numeric_limits<uint32_t>::max(); + const std::string_view MAX_NSPACE = "\xff"; + if (ch->get_cid().is_pg(&pgid)) { + obj_ranges.obj_begin.shard_id = pgid.shard; + obj_ranges.temp_begin = obj_ranges.obj_begin; + + obj_ranges.obj_begin.hobj.pool = pgid.pool(); + obj_ranges.temp_begin.hobj.pool = -2ll - pgid.pool(); + + obj_ranges.obj_end = obj_ranges.obj_begin; + obj_ranges.temp_end = obj_ranges.temp_begin; + + uint32_t reverse_hash = hobject_t::_reverse_bits(pgid.ps()); + obj_ranges.obj_begin.hobj.set_bitwise_key_u32(reverse_hash); + obj_ranges.temp_begin.hobj.set_bitwise_key_u32(reverse_hash); + + uint64_t end_hash = reverse_hash + (1ull << (32 - bits)); + if (end_hash > MAX_HASH) { + // make sure end hobj is even greater than the maximum possible hobj + obj_ranges.obj_end.hobj.set_bitwise_key_u32(MAX_HASH); + obj_ranges.temp_end.hobj.set_bitwise_key_u32(MAX_HASH); + obj_ranges.obj_end.hobj.nspace = MAX_NSPACE; + } else { + obj_ranges.obj_end.hobj.set_bitwise_key_u32(end_hash); + obj_ranges.temp_end.hobj.set_bitwise_key_u32(end_hash); + } + } else { + obj_ranges.obj_begin.shard_id = shard_id_t::NO_SHARD; + obj_ranges.obj_begin.hobj.pool = -1ull; + + obj_ranges.obj_end = obj_ranges.obj_begin; + obj_ranges.obj_begin.hobj.set_bitwise_key_u32(0); + obj_ranges.obj_end.hobj.set_bitwise_key_u32(MAX_HASH); + obj_ranges.obj_end.hobj.nspace = MAX_NSPACE; + // no separate temp section + obj_ranges.temp_begin = obj_ranges.obj_end; + obj_ranges.temp_end = obj_ranges.obj_end; + } + + obj_ranges.obj_begin.generation = 0; + obj_ranges.obj_end.generation = 0; + obj_ranges.temp_begin.generation = 0; + obj_ranges.temp_end.generation = 0; + return obj_ranges; +} + +static std::list<std::pair<ghobject_t, ghobject_t>> +get_ranges(CollectionRef ch, + ghobject_t start, + ghobject_t end, + col_obj_ranges_t obj_ranges) +{ + ceph_assert(start <= end); + std::list<std::pair<ghobject_t, ghobject_t>> ranges; + if (start < obj_ranges.temp_end) { + ranges.emplace_back( + std::max(obj_ranges.temp_begin, start), + std::min(obj_ranges.temp_end, end)); + } + if (end > obj_ranges.obj_begin) { + ranges.emplace_back( + std::max(obj_ranges.obj_begin, start), + std::min(obj_ranges.obj_end, end)); + } + return ranges; +} + +seastar::future<std::tuple<std::vector<ghobject_t>, ghobject_t>> +SeaStore::Shard::list_objects(CollectionRef ch, + const ghobject_t& start, + const ghobject_t& end, + uint64_t limit) const +{ + ceph_assert(start <= end); + using list_iertr = OnodeManager::list_onodes_iertr; + using RetType = typename OnodeManager::list_onodes_bare_ret; + return seastar::do_with( + RetType(std::vector<ghobject_t>(), start), + std::move(limit), + [this, ch, start, end](auto& ret, auto& limit) { + return repeat_eagain([this, ch, start, end, &limit, &ret] { + return transaction_manager->with_transaction_intr( + Transaction::src_t::READ, + "list_objects", + [this, ch, start, end, &limit, &ret](auto &t) + { + return get_coll_bits( + ch, t + 
).si_then([this, ch, &t, start, end, &limit, &ret](auto bits) { + if (!bits) { + return list_iertr::make_ready_future< + OnodeManager::list_onodes_bare_ret + >(std::make_tuple( + std::vector<ghobject_t>(), + ghobject_t::get_max())); + } else { + auto filter = SeaStore::get_objs_range(ch, *bits); + using list_iertr = OnodeManager::list_onodes_iertr; + using repeat_ret = list_iertr::future<seastar::stop_iteration>; + return trans_intr::repeat( + [this, &t, &ret, &limit, + filter, ranges = get_ranges(ch, start, end, filter) + ]() mutable -> repeat_ret { + if (limit == 0 || ranges.empty()) { + return list_iertr::make_ready_future< + seastar::stop_iteration + >(seastar::stop_iteration::yes); + } + auto ite = ranges.begin(); + auto pstart = ite->first; + auto pend = ite->second; + ranges.pop_front(); + return onode_manager->list_onodes( + t, pstart, pend, limit + ).si_then([&limit, &ret, pend](auto &&_ret) mutable { + auto &next_objects = std::get<0>(_ret); + auto &ret_objects = std::get<0>(ret); + ret_objects.insert( + ret_objects.end(), + next_objects.begin(), + next_objects.end()); + std::get<1>(ret) = std::get<1>(_ret); + assert(limit >= next_objects.size()); + limit -= next_objects.size(); + assert(limit == 0 || + std::get<1>(_ret) == pend || + std::get<1>(_ret) == ghobject_t::get_max()); + return list_iertr::make_ready_future< + seastar::stop_iteration + >(seastar::stop_iteration::no); + }); + }).si_then([&ret] { + return list_iertr::make_ready_future< + OnodeManager::list_onodes_bare_ret>(std::move(ret)); + }); + } + }); + }).safe_then([&ret](auto&& _ret) { + ret = std::move(_ret); + }); + }).safe_then([&ret] { + return std::move(ret); + }).handle_error( + crimson::ct_error::assert_all{ + "Invalid error in SeaStore::list_objects" + } + ); + }); +} + +seastar::future<CollectionRef> +SeaStore::Shard::create_new_collection(const coll_t& cid) +{ + LOG_PREFIX(SeaStore::create_new_collection); + DEBUG("{}", cid); + return seastar::make_ready_future<CollectionRef>(_get_collection(cid)); +} + +seastar::future<CollectionRef> +SeaStore::Shard::open_collection(const coll_t& cid) +{ + LOG_PREFIX(SeaStore::open_collection); + DEBUG("{}", cid); + return list_collections().then([cid, this] (auto colls_cores) { + if (auto found = std::find(colls_cores.begin(), + colls_cores.end(), + std::make_pair(cid, seastar::this_shard_id())); + found != colls_cores.end()) { + return seastar::make_ready_future<CollectionRef>(_get_collection(cid)); + } else { + return seastar::make_ready_future<CollectionRef>(); + } + }); +} + +seastar::future<std::vector<coll_core_t>> +SeaStore::Shard::list_collections() +{ + return seastar::do_with( + std::vector<coll_core_t>(), + [this](auto &ret) { + return repeat_eagain([this, &ret] { + return transaction_manager->with_transaction_intr( + Transaction::src_t::READ, + "list_collections", + [this, &ret](auto& t) + { + return transaction_manager->read_collection_root(t + ).si_then([this, &t](auto coll_root) { + return collection_manager->list(coll_root, t); + }).si_then([&ret](auto colls) { + ret.resize(colls.size()); + std::transform( + colls.begin(), colls.end(), ret.begin(), + [](auto p) { + return std::make_pair(p.first, seastar::this_shard_id()); + }); + }); + }); + }).safe_then([&ret] { + return seastar::make_ready_future<std::vector<coll_core_t>>(ret); + }); + } + ).handle_error( + crimson::ct_error::assert_all{ + "Invalid error in SeaStore::list_collections" + } + ); +} + +SeaStore::Shard::read_errorator::future<ceph::bufferlist> +SeaStore::Shard::read( + CollectionRef ch, + 
const ghobject_t& oid, + uint64_t offset, + size_t len, + uint32_t op_flags) +{ + LOG_PREFIX(SeaStore::read); + DEBUG("oid {} offset {} len {}", oid, offset, len); + return repeat_with_onode<ceph::bufferlist>( + ch, + oid, + Transaction::src_t::READ, + "read_obj", + op_type_t::READ, + [=, this](auto &t, auto &onode) -> ObjectDataHandler::read_ret { + size_t size = onode.get_layout().size; + + if (offset >= size) { + return seastar::make_ready_future<ceph::bufferlist>(); + } + + size_t corrected_len = (len == 0) ? + size - offset : + std::min(size - offset, len); + + return ObjectDataHandler(max_object_size).read( + ObjectDataHandler::context_t{ + *transaction_manager, + t, + onode, + }, + offset, + corrected_len); + }); +} + +SeaStore::Shard::read_errorator::future<ceph::bufferlist> +SeaStore::Shard::readv( + CollectionRef ch, + const ghobject_t& _oid, + interval_set<uint64_t>& m, + uint32_t op_flags) +{ + return seastar::do_with( + _oid, + ceph::bufferlist{}, + [=, this, &m](auto &oid, auto &ret) { + return crimson::do_for_each( + m, + [=, this, &oid, &ret](auto &p) { + return read( + ch, oid, p.first, p.second, op_flags + ).safe_then([&ret](auto bl) { + ret.claim_append(bl); + }); + }).safe_then([&ret] { + return read_errorator::make_ready_future<ceph::bufferlist> + (std::move(ret)); + }); + }); + return read_errorator::make_ready_future<ceph::bufferlist>(); +} + +using crimson::os::seastore::omap_manager::BtreeOMapManager; + +SeaStore::Shard::get_attr_errorator::future<ceph::bufferlist> +SeaStore::Shard::get_attr( + CollectionRef ch, + const ghobject_t& oid, + std::string_view name) const +{ + auto c = static_cast<SeastoreCollection*>(ch.get()); + LOG_PREFIX(SeaStore::get_attr); + DEBUG("{} {}", c->get_cid(), oid); + return repeat_with_onode<ceph::bufferlist>( + c, + oid, + Transaction::src_t::READ, + "get_attr", + op_type_t::GET_ATTR, + [=, this](auto &t, auto& onode) -> _omap_get_value_ret { + auto& layout = onode.get_layout(); + if (name == OI_ATTR && layout.oi_size) { + ceph::bufferlist bl; + bl.append(ceph::bufferptr(&layout.oi[0], layout.oi_size)); + return seastar::make_ready_future<ceph::bufferlist>(std::move(bl)); + } + if (name == SS_ATTR && layout.ss_size) { + ceph::bufferlist bl; + bl.append(ceph::bufferptr(&layout.ss[0], layout.ss_size)); + return seastar::make_ready_future<ceph::bufferlist>(std::move(bl)); + } + return _omap_get_value( + t, + layout.xattr_root.get( + onode.get_metadata_hint(device->get_block_size())), + name); + } + ).handle_error(crimson::ct_error::input_output_error::handle([FNAME] { + ERROR("EIO when getting attrs"); + abort(); + }), crimson::ct_error::pass_further_all{}); +} + +SeaStore::Shard::get_attrs_ertr::future<SeaStore::Shard::attrs_t> +SeaStore::Shard::get_attrs( + CollectionRef ch, + const ghobject_t& oid) +{ + LOG_PREFIX(SeaStore::get_attrs); + auto c = static_cast<SeastoreCollection*>(ch.get()); + DEBUG("{} {}", c->get_cid(), oid); + return repeat_with_onode<attrs_t>( + c, + oid, + Transaction::src_t::READ, + "get_addrs", + op_type_t::GET_ATTRS, + [=, this](auto &t, auto& onode) { + auto& layout = onode.get_layout(); + return omap_list(onode, layout.xattr_root, t, std::nullopt, + OMapManager::omap_list_config_t().with_inclusive(false, false) + ).si_then([&layout](auto p) { + auto& attrs = std::get<1>(p); + ceph::bufferlist bl; + if (layout.oi_size) { + bl.append(ceph::bufferptr(&layout.oi[0], layout.oi_size)); + attrs.emplace(OI_ATTR, std::move(bl)); + } + if (layout.ss_size) { + bl.clear(); + bl.append(ceph::bufferptr(&layout.ss[0], 
layout.ss_size)); + attrs.emplace(SS_ATTR, std::move(bl)); + } + return seastar::make_ready_future<omap_values_t>(std::move(attrs)); + }); + } + ).handle_error(crimson::ct_error::input_output_error::handle([FNAME] { + ERROR("EIO when getting attrs"); + abort(); + }), crimson::ct_error::pass_further_all{}); +} + +seastar::future<struct stat> SeaStore::Shard::stat( + CollectionRef c, + const ghobject_t& oid) +{ + LOG_PREFIX(SeaStore::stat); + return repeat_with_onode<struct stat>( + c, + oid, + Transaction::src_t::READ, + "stat", + op_type_t::STAT, + [=, this](auto &t, auto &onode) { + struct stat st; + auto &olayout = onode.get_layout(); + st.st_size = olayout.size; + st.st_blksize = device->get_block_size(); + st.st_blocks = (st.st_size + st.st_blksize - 1) / st.st_blksize; + st.st_nlink = 1; + DEBUGT("cid {}, oid {}, return size {}", t, c->get_cid(), oid, st.st_size); + return seastar::make_ready_future<struct stat>(st); + } + ).handle_error( + crimson::ct_error::assert_all{ + "Invalid error in SeaStore::stat" + } + ); +} + +SeaStore::Shard::get_attr_errorator::future<ceph::bufferlist> +SeaStore::Shard::omap_get_header( + CollectionRef ch, + const ghobject_t& oid) +{ + return get_attr(ch, oid, OMAP_HEADER_XATTR_KEY); +} + +SeaStore::Shard::read_errorator::future<SeaStore::Shard::omap_values_t> +SeaStore::Shard::omap_get_values( + CollectionRef ch, + const ghobject_t &oid, + const omap_keys_t &keys) +{ + auto c = static_cast<SeastoreCollection*>(ch.get()); + return repeat_with_onode<omap_values_t>( + c, + oid, + Transaction::src_t::READ, + "omap_get_values", + op_type_t::OMAP_GET_VALUES, + [this, keys](auto &t, auto &onode) { + omap_root_t omap_root = onode.get_layout().omap_root.get( + onode.get_metadata_hint(device->get_block_size())); + return _omap_get_values( + t, + std::move(omap_root), + keys); + }); +} + +SeaStore::Shard::_omap_get_value_ret +SeaStore::Shard::_omap_get_value( + Transaction &t, + omap_root_t &&root, + std::string_view key) const +{ + return seastar::do_with( + BtreeOMapManager(*transaction_manager), + std::move(root), + std::string(key), + [&t](auto &manager, auto& root, auto& key) -> _omap_get_value_ret { + if (root.is_null()) { + return crimson::ct_error::enodata::make(); + } + return manager.omap_get_value(root, t, key + ).si_then([](auto opt) -> _omap_get_value_ret { + if (!opt) { + return crimson::ct_error::enodata::make(); + } + return seastar::make_ready_future<ceph::bufferlist>(std::move(*opt)); + }); + } + ); +} + +SeaStore::Shard::_omap_get_values_ret +SeaStore::Shard::_omap_get_values( + Transaction &t, + omap_root_t &&omap_root, + const omap_keys_t &keys) const +{ + if (omap_root.is_null()) { + return seastar::make_ready_future<omap_values_t>(); + } + return seastar::do_with( + BtreeOMapManager(*transaction_manager), + std::move(omap_root), + omap_values_t(), + [&](auto &manager, auto &root, auto &ret) { + return trans_intr::do_for_each( + keys.begin(), + keys.end(), + [&](auto &key) { + return manager.omap_get_value( + root, + t, + key + ).si_then([&ret, &key](auto &&p) { + if (p) { + bufferlist bl; + bl.append(*p); + ret.emplace( + std::move(key), + std::move(bl)); + } + return seastar::now(); + }); + } + ).si_then([&ret] { + return std::move(ret); + }); + } + ); +} + +SeaStore::Shard::omap_list_ret +SeaStore::Shard::omap_list( + Onode &onode, + const omap_root_le_t& omap_root, + Transaction& t, + const std::optional<std::string>& start, + OMapManager::omap_list_config_t config) const +{ + auto root = omap_root.get( + 
onode.get_metadata_hint(device->get_block_size())); + if (root.is_null()) { + return seastar::make_ready_future<omap_list_bare_ret>( + true, omap_values_t{} + ); + } + return seastar::do_with( + BtreeOMapManager(*transaction_manager), + root, + start, + std::optional<std::string>(std::nullopt), + [&t, config](auto &manager, auto &root, auto &start, auto &end) { + return manager.omap_list(root, t, start, end, config); + }); +} + +SeaStore::Shard::omap_get_values_ret_t +SeaStore::Shard::omap_get_values( + CollectionRef ch, + const ghobject_t &oid, + const std::optional<string> &start) +{ + auto c = static_cast<SeastoreCollection*>(ch.get()); + LOG_PREFIX(SeaStore::omap_get_values); + DEBUG("{} {}", c->get_cid(), oid); + using ret_bare_t = std::tuple<bool, SeaStore::Shard::omap_values_t>; + return repeat_with_onode<ret_bare_t>( + c, + oid, + Transaction::src_t::READ, + "omap_list", + op_type_t::OMAP_LIST, + [this, start](auto &t, auto &onode) { + return omap_list( + onode, + onode.get_layout().omap_root, + t, + start, + OMapManager::omap_list_config_t().with_inclusive(false, false)); + }); +} + +SeaStore::Shard::_fiemap_ret SeaStore::Shard::_fiemap( + Transaction &t, + Onode &onode, + uint64_t off, + uint64_t len) const +{ + return seastar::do_with( + ObjectDataHandler(max_object_size), + [=, this, &t, &onode] (auto &objhandler) { + return objhandler.fiemap( + ObjectDataHandler::context_t{ + *transaction_manager, + t, + onode, + }, + off, + len); + }); +} + +SeaStore::Shard::read_errorator::future<std::map<uint64_t, uint64_t>> +SeaStore::Shard::fiemap( + CollectionRef ch, + const ghobject_t& oid, + uint64_t off, + uint64_t len) +{ + LOG_PREFIX(SeaStore::fiemap); + DEBUG("oid: {}, off: {}, len: {} ", oid, off, len); + return repeat_with_onode<std::map<uint64_t, uint64_t>>( + ch, + oid, + Transaction::src_t::READ, + "fiemap_read", + op_type_t::READ, + [=, this](auto &t, auto &onode) -> _fiemap_ret { + size_t size = onode.get_layout().size; + if (off >= size) { + INFOT("fiemap offset is over onode size!", t); + return seastar::make_ready_future<std::map<uint64_t, uint64_t>>(); + } + size_t adjust_len = (len == 0) ? 
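+      // a zero len means "read to the end of the object", matching read()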
+ size - off: + std::min(size - off, len); + return _fiemap(t, onode, off, adjust_len); + }); +} + +void SeaStore::Shard::on_error(ceph::os::Transaction &t) { + LOG_PREFIX(SeaStore::on_error); + ERROR(" transaction dump:\n"); + JSONFormatter f(true); + f.open_object_section("transaction"); + t.dump(&f); + f.close_section(); + std::stringstream str; + f.flush(str); + ERROR("{}", str.str()); + abort(); +} + +seastar::future<> SeaStore::Shard::do_transaction_no_callbacks( + CollectionRef _ch, + ceph::os::Transaction&& _t) +{ + // repeat_with_internal_context ensures ordering via collection lock + return repeat_with_internal_context( + _ch, + std::move(_t), + Transaction::src_t::MUTATE, + "do_transaction", + op_type_t::TRANSACTION, + [this](auto &ctx) { + return with_trans_intr(*ctx.transaction, [&, this](auto &t) { +#ifndef NDEBUG + LOG_PREFIX(SeaStore::Shard::do_transaction_no_callbacks); + TRACET(" transaction dump:\n", t); + JSONFormatter f(true); + f.open_object_section("transaction"); + ctx.ext_transaction.dump(&f); + f.close_section(); + std::stringstream str; + f.flush(str); + TRACET("{}", t, str.str()); +#endif + return seastar::do_with( + std::vector<OnodeRef>(ctx.iter.objects.size()), + std::vector<OnodeRef>(ctx.iter.objects.size()), + [this, &ctx](auto& onodes, auto& d_onodes) mutable { + return trans_intr::repeat( + [this, &ctx, &onodes, &d_onodes]() mutable + -> tm_iertr::future<seastar::stop_iteration> + { + if (ctx.iter.have_op()) { + return _do_transaction_step( + ctx, ctx.ch, onodes, d_onodes, ctx.iter + ).si_then([] { + return seastar::make_ready_future<seastar::stop_iteration>( + seastar::stop_iteration::no); + }); + } else { + return seastar::make_ready_future<seastar::stop_iteration>( + seastar::stop_iteration::yes); + }; + }).si_then([this, &ctx, &d_onodes] { + return onode_manager->write_dirty(*ctx.transaction, d_onodes); + }); + }).si_then([this, &ctx] { + return transaction_manager->submit_transaction(*ctx.transaction); + }); + }); + }); +} + + +seastar::future<> SeaStore::Shard::flush(CollectionRef ch) +{ + return seastar::do_with( + get_dummy_ordering_handle(), + [this, ch](auto &handle) { + return handle.take_collection_lock( + static_cast<SeastoreCollection&>(*ch).ordering_lock + ).then([this, &handle] { + return transaction_manager->flush(handle); + }); + }); +} + +SeaStore::Shard::tm_ret +SeaStore::Shard::_do_transaction_step( + internal_context_t &ctx, + CollectionRef &col, + std::vector<OnodeRef> &onodes, + std::vector<OnodeRef> &d_onodes, + ceph::os::Transaction::iterator &i) +{ + auto op = i.decode_op(); + + using ceph::os::Transaction; + if (op->op == Transaction::OP_NOP) + return tm_iertr::now(); + + switch (op->op) { + case Transaction::OP_RMCOLL: + { + coll_t cid = i.get_cid(op->cid); + return _remove_collection(ctx, cid); + } + case Transaction::OP_MKCOLL: + { + coll_t cid = i.get_cid(op->cid); + return _create_collection(ctx, cid, op->split_bits); + } + case Transaction::OP_COLL_HINT: + { + ceph::bufferlist hint; + i.decode_bl(hint); + return tm_iertr::now(); + } + } + + using onode_iertr = OnodeManager::get_onode_iertr::extend< + crimson::ct_error::value_too_large>; + auto fut = onode_iertr::make_ready_future<OnodeRef>(OnodeRef()); + bool create = false; + if (op->op == Transaction::OP_TOUCH || + op->op == Transaction::OP_CREATE || + op->op == Transaction::OP_WRITE || + op->op == Transaction::OP_ZERO) { + create = true; + } + if (!onodes[op->oid]) { + if (!create) { + fut = onode_manager->get_onode(*ctx.transaction, i.get_oid(op->oid)); + } else { + 
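+      // creation-capable ops (TOUCH/CREATE/WRITE/ZERO) may target an object
+      // that does not exist yet, so allocate its onode on demand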
fut = onode_manager->get_or_create_onode( + *ctx.transaction, i.get_oid(op->oid)); + } + } + return fut.si_then([&, op](auto get_onode) { + OnodeRef &o = onodes[op->oid]; + if (!o) { + assert(get_onode); + o = get_onode; + d_onodes[op->oid] = get_onode; + } + if (op->op == Transaction::OP_CLONE && !d_onodes[op->dest_oid]) { + //TODO: use when_all_succeed after making onode tree + // support parallel extents loading + return onode_manager->get_or_create_onode( + *ctx.transaction, i.get_oid(op->dest_oid) + ).si_then([&, op](auto dest_onode) { + assert(dest_onode); + auto &d_o = onodes[op->dest_oid]; + assert(!d_o); + assert(!d_onodes[op->dest_oid]); + d_o = dest_onode; + d_onodes[op->dest_oid] = dest_onode; + return seastar::now(); + }); + } else { + return OnodeManager::get_or_create_onode_iertr::now(); + } + }).si_then([&, op, this]() -> tm_ret { + LOG_PREFIX(SeaStore::_do_transaction_step); + try { + switch (op->op) { + case Transaction::OP_REMOVE: + { + TRACET("removing {}", *ctx.transaction, i.get_oid(op->oid)); + return _remove(ctx, onodes[op->oid] + ).si_then([&onodes, &d_onodes, op] { + onodes[op->oid].reset(); + d_onodes[op->oid].reset(); + }); + } + case Transaction::OP_CREATE: + case Transaction::OP_TOUCH: + { + return _touch(ctx, onodes[op->oid]); + } + case Transaction::OP_WRITE: + { + uint64_t off = op->off; + uint64_t len = op->len; + uint32_t fadvise_flags = i.get_fadvise_flags(); + ceph::bufferlist bl; + i.decode_bl(bl); + return _write( + ctx, onodes[op->oid], off, len, std::move(bl), + fadvise_flags); + } + case Transaction::OP_TRUNCATE: + { + uint64_t off = op->off; + return _truncate(ctx, onodes[op->oid], off); + } + case Transaction::OP_SETATTR: + { + std::string name = i.decode_string(); + std::map<std::string, bufferlist> to_set; + ceph::bufferlist& bl = to_set[name]; + i.decode_bl(bl); + return _setattrs(ctx, onodes[op->oid], std::move(to_set)); + } + case Transaction::OP_SETATTRS: + { + std::map<std::string, bufferlist> to_set; + i.decode_attrset(to_set); + return _setattrs(ctx, onodes[op->oid], std::move(to_set)); + } + case Transaction::OP_RMATTR: + { + std::string name = i.decode_string(); + return _rmattr(ctx, onodes[op->oid], name); + } + case Transaction::OP_RMATTRS: + { + return _rmattrs(ctx, onodes[op->oid]); + } + case Transaction::OP_OMAP_SETKEYS: + { + std::map<std::string, ceph::bufferlist> aset; + i.decode_attrset(aset); + return _omap_set_values(ctx, onodes[op->oid], std::move(aset)); + } + case Transaction::OP_OMAP_SETHEADER: + { + ceph::bufferlist bl; + i.decode_bl(bl); + return _omap_set_header(ctx, onodes[op->oid], std::move(bl)); + } + case Transaction::OP_OMAP_RMKEYS: + { + omap_keys_t keys; + i.decode_keyset(keys); + return _omap_rmkeys(ctx, onodes[op->oid], std::move(keys)); + } + case Transaction::OP_OMAP_RMKEYRANGE: + { + string first, last; + first = i.decode_string(); + last = i.decode_string(); + return _omap_rmkeyrange( + ctx, onodes[op->oid], + std::move(first), std::move(last)); + } + case Transaction::OP_OMAP_CLEAR: + { + return _omap_clear(ctx, onodes[op->oid]); + } + case Transaction::OP_ZERO: + { + objaddr_t off = op->off; + extent_len_t len = op->len; + return _zero(ctx, onodes[op->oid], off, len); + } + case Transaction::OP_SETALLOCHINT: + { + // TODO + return tm_iertr::now(); + } + case Transaction::OP_CLONE: + { + TRACET("cloning {} to {}", + *ctx.transaction, + i.get_oid(op->oid), + i.get_oid(op->dest_oid)); + return _clone(ctx, onodes[op->oid], d_onodes[op->dest_oid]); + } + default: + ERROR("bad op {}", 
static_cast<unsigned>(op->op)); + return crimson::ct_error::input_output_error::make(); + } + } catch (std::exception &e) { + ERROR("got exception {}", e); + return crimson::ct_error::input_output_error::make(); + } + }).handle_error_interruptible( + tm_iertr::pass_further{}, + crimson::ct_error::enoent::handle([op] { + //OMAP_CLEAR, TRUNCATE, REMOVE etc ops will tolerate absent onode. + if (op->op == Transaction::OP_CLONERANGE || + op->op == Transaction::OP_CLONE || + op->op == Transaction::OP_CLONERANGE2 || + op->op == Transaction::OP_COLL_ADD || + op->op == Transaction::OP_SETATTR || + op->op == Transaction::OP_SETATTRS || + op->op == Transaction::OP_RMATTR || + op->op == Transaction::OP_OMAP_SETKEYS || + op->op == Transaction::OP_OMAP_RMKEYS || + op->op == Transaction::OP_OMAP_RMKEYRANGE || + op->op == Transaction::OP_OMAP_SETHEADER) { + ceph_abort_msg("unexpected enoent error"); + } + return seastar::now(); + }), + crimson::ct_error::assert_all{ + "Invalid error in SeaStore::do_transaction_step" + } + ); +} + +SeaStore::Shard::tm_ret +SeaStore::Shard::_remove( + internal_context_t &ctx, + OnodeRef &onode) +{ + LOG_PREFIX(SeaStore::_remove); + DEBUGT("onode={}", *ctx.transaction, *onode); + auto fut = BtreeOMapManager::omap_clear_iertr::now(); + auto omap_root = onode->get_layout().omap_root.get( + onode->get_metadata_hint(device->get_block_size())); + if (omap_root.get_location() != L_ADDR_NULL) { + fut = seastar::do_with( + BtreeOMapManager(*transaction_manager), + onode->get_layout().omap_root.get( + onode->get_metadata_hint(device->get_block_size())), + [&ctx, onode](auto &omap_manager, auto &omap_root) { + return omap_manager.omap_clear( + omap_root, + *ctx.transaction + ); + }); + } + return fut.si_then([this, &ctx, onode] { + return seastar::do_with( + ObjectDataHandler(max_object_size), + [=, this, &ctx](auto &objhandler) { + return objhandler.clear( + ObjectDataHandler::context_t{ + *transaction_manager, + *ctx.transaction, + *onode, + }); + }); + }).si_then([this, &ctx, onode]() mutable { + return onode_manager->erase_onode(*ctx.transaction, onode); + }).handle_error_interruptible( + crimson::ct_error::input_output_error::pass_further(), + crimson::ct_error::assert_all( + "Invalid error in SeaStore::_remove" + ) + ); +} + +SeaStore::Shard::tm_ret +SeaStore::Shard::_touch( + internal_context_t &ctx, + OnodeRef &onode) +{ + LOG_PREFIX(SeaStore::_touch); + DEBUGT("onode={}", *ctx.transaction, *onode); + return tm_iertr::now(); +} + +SeaStore::Shard::tm_ret +SeaStore::Shard::_write( + internal_context_t &ctx, + OnodeRef &onode, + uint64_t offset, size_t len, + ceph::bufferlist &&_bl, + uint32_t fadvise_flags) +{ + LOG_PREFIX(SeaStore::_write); + DEBUGT("onode={} {}~{}", *ctx.transaction, *onode, offset, len); + { + auto &object_size = onode->get_mutable_layout(*ctx.transaction).size; + object_size = std::max<uint64_t>( + offset + len, + object_size); + } + return seastar::do_with( + std::move(_bl), + ObjectDataHandler(max_object_size), + [=, this, &ctx, &onode](auto &bl, auto &objhandler) { + return objhandler.write( + ObjectDataHandler::context_t{ + *transaction_manager, + *ctx.transaction, + *onode, + }, + offset, + bl); + }); +} + +SeaStore::Shard::tm_ret +SeaStore::Shard::_clone( + internal_context_t &ctx, + OnodeRef &onode, + OnodeRef &d_onode) +{ + LOG_PREFIX(SeaStore::_clone); + DEBUGT("onode={} d_onode={}", *ctx.transaction, *onode, *d_onode); + return seastar::do_with( + ObjectDataHandler(max_object_size), + [this, &ctx, &onode, &d_onode](auto &objHandler) { + //TODO: 
currently, we only care about object data, leaving cloning + // of xattr/omap for future work + auto &object_size = onode->get_layout().size; + auto &d_object_size = d_onode->get_mutable_layout(*ctx.transaction).size; + d_object_size = object_size; + return objHandler.clone( + ObjectDataHandler::context_t{ + *transaction_manager, + *ctx.transaction, + *onode, + d_onode.get()}); + }); +} + +SeaStore::Shard::tm_ret +SeaStore::Shard::_zero( + internal_context_t &ctx, + OnodeRef &onode, + objaddr_t offset, + extent_len_t len) +{ + LOG_PREFIX(SeaStore::_zero); + DEBUGT("onode={} {}~{}", *ctx.transaction, *onode, offset, len); + if (offset + len >= max_object_size) { + return crimson::ct_error::input_output_error::make(); + } + auto &object_size = onode->get_mutable_layout(*ctx.transaction).size; + object_size = std::max<uint64_t>(offset + len, object_size); + return seastar::do_with( + ObjectDataHandler(max_object_size), + [=, this, &ctx, &onode](auto &objhandler) { + return objhandler.zero( + ObjectDataHandler::context_t{ + *transaction_manager, + *ctx.transaction, + *onode, + }, + offset, + len); + }); +} + +SeaStore::Shard::omap_set_kvs_ret +SeaStore::Shard::_omap_set_kvs( + OnodeRef &onode, + const omap_root_le_t& omap_root, + Transaction& t, + omap_root_le_t& mutable_omap_root, + std::map<std::string, ceph::bufferlist>&& kvs) +{ + return seastar::do_with( + BtreeOMapManager(*transaction_manager), + omap_root.get(onode->get_metadata_hint(device->get_block_size())), + [&, keys=std::move(kvs)](auto &omap_manager, auto &root) { + tm_iertr::future<> maybe_create_root = + !root.is_null() ? + tm_iertr::now() : + omap_manager.initialize_omap( + t, onode->get_metadata_hint(device->get_block_size()) + ).si_then([&root](auto new_root) { + root = new_root; + }); + return maybe_create_root.si_then( + [&, keys=std::move(keys)]() mutable { + return omap_manager.omap_set_keys(root, t, std::move(keys)); + }).si_then([&] { + return tm_iertr::make_ready_future<omap_root_t>(std::move(root)); + }).si_then([&mutable_omap_root](auto root) { + if (root.must_update()) { + mutable_omap_root.update(root); + } + }); + } + ); +} + +SeaStore::Shard::tm_ret +SeaStore::Shard::_omap_set_values( + internal_context_t &ctx, + OnodeRef &onode, + std::map<std::string, ceph::bufferlist> &&aset) +{ + LOG_PREFIX(SeaStore::_omap_set_values); + DEBUGT("{} {} keys", *ctx.transaction, *onode, aset.size()); + return _omap_set_kvs( + onode, + onode->get_layout().omap_root, + *ctx.transaction, + onode->get_mutable_layout(*ctx.transaction).omap_root, + std::move(aset)); +} + +SeaStore::Shard::tm_ret +SeaStore::Shard::_omap_set_header( + internal_context_t &ctx, + OnodeRef &onode, + ceph::bufferlist &&header) +{ + LOG_PREFIX(SeaStore::_omap_set_header); + DEBUGT("{} {} bytes", *ctx.transaction, *onode, header.length()); + std::map<std::string, bufferlist> to_set; + to_set[OMAP_HEADER_XATTR_KEY] = header; + return _setattrs(ctx, onode,std::move(to_set)); +} + +SeaStore::Shard::tm_ret +SeaStore::Shard::_omap_clear( + internal_context_t &ctx, + OnodeRef &onode) +{ + LOG_PREFIX(SeaStore::_omap_clear); + DEBUGT("{} {} keys", *ctx.transaction, *onode); + return _xattr_rmattr(ctx, onode, std::string(OMAP_HEADER_XATTR_KEY)) + .si_then([this, &ctx, &onode]() -> tm_ret { + if (auto omap_root = onode->get_layout().omap_root.get( + onode->get_metadata_hint(device->get_block_size())); + omap_root.is_null()) { + return seastar::now(); + } else { + return seastar::do_with( + BtreeOMapManager(*transaction_manager), + onode->get_layout().omap_root.get( + 
onode->get_metadata_hint(device->get_block_size())), + [&ctx, &onode]( + auto &omap_manager, + auto &omap_root) { + return omap_manager.omap_clear( + omap_root, + *ctx.transaction) + .si_then([&] { + if (omap_root.must_update()) { + onode->get_mutable_layout(*ctx.transaction + ).omap_root.update(omap_root); + } + }); + }); + } + }); +} + +SeaStore::Shard::tm_ret +SeaStore::Shard::_omap_rmkeys( + internal_context_t &ctx, + OnodeRef &onode, + omap_keys_t &&keys) +{ + LOG_PREFIX(SeaStore::_omap_rmkeys); + DEBUGT("{} {} keys", *ctx.transaction, *onode, keys.size()); + auto omap_root = onode->get_layout().omap_root.get( + onode->get_metadata_hint(device->get_block_size())); + if (omap_root.is_null()) { + return seastar::now(); + } else { + return seastar::do_with( + BtreeOMapManager(*transaction_manager), + onode->get_layout().omap_root.get( + onode->get_metadata_hint(device->get_block_size())), + std::move(keys), + [&ctx, &onode]( + auto &omap_manager, + auto &omap_root, + auto &keys) { + return trans_intr::do_for_each( + keys.begin(), + keys.end(), + [&](auto &p) { + return omap_manager.omap_rm_key( + omap_root, + *ctx.transaction, + p); + } + ).si_then([&] { + if (omap_root.must_update()) { + onode->get_mutable_layout(*ctx.transaction + ).omap_root.update(omap_root); + } + }); + } + ); + } +} + +SeaStore::Shard::tm_ret +SeaStore::Shard::_omap_rmkeyrange( + internal_context_t &ctx, + OnodeRef &onode, + std::string first, + std::string last) +{ + LOG_PREFIX(SeaStore::_omap_rmkeyrange); + DEBUGT("{} first={} last={}", *ctx.transaction, *onode, first, last); + if (first > last) { + ERRORT("range error, first: {} > last:{}", *ctx.transaction, first, last); + ceph_abort(); + } + auto omap_root = onode->get_layout().omap_root.get( + onode->get_metadata_hint(device->get_block_size())); + if (omap_root.is_null()) { + return seastar::now(); + } else { + return seastar::do_with( + BtreeOMapManager(*transaction_manager), + onode->get_layout().omap_root.get( + onode->get_metadata_hint(device->get_block_size())), + std::move(first), + std::move(last), + [&ctx, &onode]( + auto &omap_manager, + auto &omap_root, + auto &first, + auto &last) { + auto config = OMapManager::omap_list_config_t() + .with_inclusive(true, false) + .without_max(); + return omap_manager.omap_rm_key_range( + omap_root, + *ctx.transaction, + first, + last, + config + ).si_then([&] { + if (omap_root.must_update()) { + onode->get_mutable_layout(*ctx.transaction + ).omap_root.update(omap_root); + } + }); + }); + } +} + +SeaStore::Shard::tm_ret +SeaStore::Shard::_truncate( + internal_context_t &ctx, + OnodeRef &onode, + uint64_t size) +{ + LOG_PREFIX(SeaStore::_truncate); + DEBUGT("onode={} size={}", *ctx.transaction, *onode, size); + onode->get_mutable_layout(*ctx.transaction).size = size; + return seastar::do_with( + ObjectDataHandler(max_object_size), + [=, this, &ctx, &onode](auto &objhandler) { + return objhandler.truncate( + ObjectDataHandler::context_t{ + *transaction_manager, + *ctx.transaction, + *onode + }, + size); + }); +} + +SeaStore::Shard::tm_ret +SeaStore::Shard::_setattrs( + internal_context_t &ctx, + OnodeRef &onode, + std::map<std::string, bufferlist>&& aset) +{ + LOG_PREFIX(SeaStore::_setattrs); + DEBUGT("onode={}", *ctx.transaction, *onode); + + auto fut = tm_iertr::now(); + auto& layout = onode->get_mutable_layout(*ctx.transaction); + if (auto it = aset.find(OI_ATTR); it != aset.end()) { + auto& val = it->second; + if (likely(val.length() <= onode_layout_t::MAX_OI_LENGTH)) { + maybe_inline_memcpy( + &layout.oi[0], + 
val.c_str(), + val.length(), + onode_layout_t::MAX_OI_LENGTH); + + if (!layout.oi_size) { + // if oi was not in the layout, it probably exists in the omap, + // need to remove it first + fut = _xattr_rmattr(ctx, onode, OI_ATTR); + } + layout.oi_size = val.length(); + aset.erase(it); + } else { + layout.oi_size = 0; + } + } + + if (auto it = aset.find(SS_ATTR); it != aset.end()) { + auto& val = it->second; + if (likely(val.length() <= onode_layout_t::MAX_SS_LENGTH)) { + maybe_inline_memcpy( + &layout.ss[0], + val.c_str(), + val.length(), + onode_layout_t::MAX_SS_LENGTH); + + if (!layout.ss_size) { + fut = _xattr_rmattr(ctx, onode, SS_ATTR); + } + layout.ss_size = val.length(); + + aset.erase(it); + } else { + layout.ss_size = 0; + } + } + + if (aset.empty()) { + return fut; + } + + return fut.si_then( + [this, onode, &ctx, &layout, + aset=std::move(aset)]() mutable { + return _omap_set_kvs( + onode, + onode->get_layout().xattr_root, + *ctx.transaction, + layout.xattr_root, + std::move(aset)); + }); +} + +SeaStore::Shard::tm_ret +SeaStore::Shard::_rmattr( + internal_context_t &ctx, + OnodeRef &onode, + std::string name) +{ + LOG_PREFIX(SeaStore::_rmattr); + DEBUGT("onode={}", *ctx.transaction, *onode); + auto& layout = onode->get_mutable_layout(*ctx.transaction); + if ((name == OI_ATTR) && (layout.oi_size > 0)) { + memset(&layout.oi[0], 0, layout.oi_size); + layout.oi_size = 0; + return tm_iertr::now(); + } else if ((name == SS_ATTR) && (layout.ss_size > 0)) { + memset(&layout.ss[0], 0, layout.ss_size); + layout.ss_size = 0; + return tm_iertr::now(); + } else { + return _xattr_rmattr( + ctx, + onode, + std::move(name)); + } +} + +SeaStore::Shard::tm_ret +SeaStore::Shard::_xattr_rmattr( + internal_context_t &ctx, + OnodeRef &onode, + std::string &&name) +{ + LOG_PREFIX(SeaStore::_xattr_rmattr); + DEBUGT("onode={}", *ctx.transaction, *onode); + auto xattr_root = onode->get_layout().xattr_root.get( + onode->get_metadata_hint(device->get_block_size())); + if (xattr_root.is_null()) { + return seastar::now(); + } else { + return seastar::do_with( + BtreeOMapManager(*transaction_manager), + onode->get_layout().xattr_root.get( + onode->get_metadata_hint(device->get_block_size())), + std::move(name), + [&ctx, &onode](auto &omap_manager, auto &xattr_root, auto &name) { + return omap_manager.omap_rm_key(xattr_root, *ctx.transaction, name) + .si_then([&] { + if (xattr_root.must_update()) { + onode->get_mutable_layout(*ctx.transaction + ).xattr_root.update(xattr_root); + } + }); + }); + } +} + +SeaStore::Shard::tm_ret +SeaStore::Shard::_rmattrs( + internal_context_t &ctx, + OnodeRef &onode) +{ + LOG_PREFIX(SeaStore::_rmattrs); + DEBUGT("onode={}", *ctx.transaction, *onode); + auto& layout = onode->get_mutable_layout(*ctx.transaction); + memset(&layout.oi[0], 0, layout.oi_size); + layout.oi_size = 0; + memset(&layout.ss[0], 0, layout.ss_size); + layout.ss_size = 0; + return _xattr_clear(ctx, onode); +} + +SeaStore::Shard::tm_ret +SeaStore::Shard::_xattr_clear( + internal_context_t &ctx, + OnodeRef &onode) +{ + LOG_PREFIX(SeaStore::_xattr_clear); + DEBUGT("onode={}", *ctx.transaction, *onode); + auto xattr_root = onode->get_layout().xattr_root.get( + onode->get_metadata_hint(device->get_block_size())); + if (xattr_root.is_null()) { + return seastar::now(); + } else { + return seastar::do_with( + BtreeOMapManager(*transaction_manager), + onode->get_layout().xattr_root.get( + onode->get_metadata_hint(device->get_block_size())), + [&ctx, &onode](auto &omap_manager, auto &xattr_root) { + return 
omap_manager.omap_clear(xattr_root, *ctx.transaction) + .si_then([&] { + if (xattr_root.must_update()) { + onode->get_mutable_layout(*ctx.transaction + ).xattr_root.update(xattr_root); + } + }); + }); + } +} + +SeaStore::Shard::tm_ret +SeaStore::Shard::_create_collection( + internal_context_t &ctx, + const coll_t& cid, int bits) +{ + return transaction_manager->read_collection_root( + *ctx.transaction + ).si_then([=, this, &ctx](auto _cmroot) { + return seastar::do_with( + _cmroot, + [=, this, &ctx](auto &cmroot) { + return collection_manager->create( + cmroot, + *ctx.transaction, + cid, + bits + ).si_then([this, &ctx, &cmroot] { + if (cmroot.must_update()) { + transaction_manager->write_collection_root( + *ctx.transaction, + cmroot); + } + }); + } + ); + }).handle_error_interruptible( + tm_iertr::pass_further{}, + crimson::ct_error::assert_all{ + "Invalid error in SeaStore::_create_collection" + } + ); +} + +SeaStore::Shard::tm_ret +SeaStore::Shard::_remove_collection( + internal_context_t &ctx, + const coll_t& cid) +{ + return transaction_manager->read_collection_root( + *ctx.transaction + ).si_then([=, this, &ctx](auto _cmroot) { + return seastar::do_with( + _cmroot, + [=, this, &ctx](auto &cmroot) { + return collection_manager->remove( + cmroot, + *ctx.transaction, + cid + ).si_then([this, &ctx, &cmroot] { + // param here denotes whether it already existed, probably error + if (cmroot.must_update()) { + transaction_manager->write_collection_root( + *ctx.transaction, + cmroot); + } + }); + }); + }).handle_error_interruptible( + tm_iertr::pass_further{}, + crimson::ct_error::assert_all{ + "Invalid error in SeaStore::_create_collection" + } + ); +} + +boost::intrusive_ptr<SeastoreCollection> +SeaStore::Shard::_get_collection(const coll_t& cid) +{ + return new SeastoreCollection{cid}; +} + +seastar::future<> SeaStore::Shard::write_meta( + const std::string& key, + const std::string& value) +{ + LOG_PREFIX(SeaStore::write_meta); + DEBUG("key: {}; value: {}", key, value); + return seastar::do_with( + key, value, + [this, FNAME](auto& key, auto& value) { + return repeat_eagain([this, FNAME, &key, &value] { + return transaction_manager->with_transaction_intr( + Transaction::src_t::MUTATE, + "write_meta", + [this, FNAME, &key, &value](auto& t) + { + DEBUGT("Have transaction, key: {}; value: {}", t, key, value); + return transaction_manager->update_root_meta( + t, key, value + ).si_then([this, &t] { + return transaction_manager->submit_transaction(t); + }); + }); + }); + }).handle_error( + crimson::ct_error::assert_all{"Invalid error in SeaStore::write_meta"} + ); +} + +seastar::future<std::tuple<int, std::string>> +SeaStore::read_meta(const std::string& key) +{ + ceph_assert(seastar::this_shard_id() == primary_core); + LOG_PREFIX(SeaStore::read_meta); + DEBUG("key: {}", key); + return mdstore->read_meta(key).safe_then([](auto v) { + if (v) { + return std::make_tuple(0, std::move(*v)); + } else { + return std::make_tuple(-1, std::string("")); + } + }).handle_error( + crimson::ct_error::assert_all{ + "Invalid error in SeaStore::read_meta" + } + ); +} + +uuid_d SeaStore::Shard::get_fsid() const +{ + return device->get_meta().seastore_id; +} + +void SeaStore::Shard::init_managers() +{ + transaction_manager.reset(); + collection_manager.reset(); + onode_manager.reset(); + + transaction_manager = make_transaction_manager( + device, secondaries, is_test); + collection_manager = std::make_unique<collection_manager::FlatCollectionManager>( + *transaction_manager); + onode_manager = 
std::make_unique<crimson::os::seastore::onode::FLTreeOnodeManager>( + *transaction_manager); +} + +std::unique_ptr<SeaStore> make_seastore( + const std::string &device) +{ + auto mdstore = std::make_unique<FileMDStore>(device); + return std::make_unique<SeaStore>( + device, + std::move(mdstore)); +} + +std::unique_ptr<SeaStore> make_test_seastore( + SeaStore::MDStoreRef mdstore) +{ + return std::make_unique<SeaStore>( + "", + std::move(mdstore)); +} + +} diff --git a/src/crimson/os/seastore/seastore.h b/src/crimson/os/seastore/seastore.h new file mode 100644 index 000000000..876fadca8 --- /dev/null +++ b/src/crimson/os/seastore/seastore.h @@ -0,0 +1,531 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <string> +#include <unordered_map> +#include <map> +#include <typeinfo> +#include <vector> + +#include <optional> +#include <seastar/core/future.hh> +#include <seastar/core/metrics_types.hh> + +#include "include/uuid.h" + +#include "os/Transaction.h" +#include "crimson/common/throttle.h" +#include "crimson/os/futurized_collection.h" +#include "crimson/os/futurized_store.h" + +#include "crimson/os/seastore/device.h" +#include "crimson/os/seastore/transaction.h" +#include "crimson/os/seastore/onode_manager.h" +#include "crimson/os/seastore/omap_manager.h" +#include "crimson/os/seastore/collection_manager.h" +#include "crimson/os/seastore/object_data_handler.h" + +namespace crimson::os::seastore { + +class Onode; +using OnodeRef = boost::intrusive_ptr<Onode>; +class TransactionManager; + +enum class op_type_t : uint8_t { + TRANSACTION = 0, + READ, + WRITE, + GET_ATTR, + GET_ATTRS, + STAT, + OMAP_GET_VALUES, + OMAP_LIST, + MAX +}; + +class SeastoreCollection final : public FuturizedCollection { +public: + template <typename... T> + SeastoreCollection(T&&... args) : + FuturizedCollection(std::forward<T>(args)...) {} + + seastar::shared_mutex ordering_lock; +}; + +/** + * col_obj_ranges_t + * + * Represents the two ghobject_t ranges spanned by a PG collection. + * Temp objects will be within [temp_begin, temp_end) and normal objects + * will be in [obj_begin, obj_end). 
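+ * get_objs_range() below derives these bounds from the collection and its
+ * hash-bit count.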
+ */ +struct col_obj_ranges_t { + ghobject_t temp_begin; + ghobject_t temp_end; + ghobject_t obj_begin; + ghobject_t obj_end; +}; + +class SeaStore final : public FuturizedStore { +public: + class MDStore { + public: + using base_iertr = crimson::errorator< + crimson::ct_error::input_output_error + >; + + using write_meta_ertr = base_iertr; + using write_meta_ret = write_meta_ertr::future<>; + virtual write_meta_ret write_meta( + const std::string &key, + const std::string &val + ) = 0; + + using read_meta_ertr = base_iertr; + using read_meta_ret = write_meta_ertr::future<std::optional<std::string>>; + virtual read_meta_ret read_meta(const std::string &key) = 0; + + virtual ~MDStore() {} + }; + using MDStoreRef = std::unique_ptr<MDStore>; + + class Shard : public FuturizedStore::Shard { + public: + Shard( + std::string root, + Device* device, + bool is_test); + ~Shard() = default; + + seastar::future<struct stat> stat( + CollectionRef c, + const ghobject_t& oid) final; + + read_errorator::future<ceph::bufferlist> read( + CollectionRef c, + const ghobject_t& oid, + uint64_t offset, + size_t len, + uint32_t op_flags = 0) final; + + read_errorator::future<ceph::bufferlist> readv( + CollectionRef c, + const ghobject_t& oid, + interval_set<uint64_t>& m, + uint32_t op_flags = 0) final; + + get_attr_errorator::future<ceph::bufferlist> get_attr( + CollectionRef c, + const ghobject_t& oid, + std::string_view name) const final; + + get_attrs_ertr::future<attrs_t> get_attrs( + CollectionRef c, + const ghobject_t& oid) final; + + read_errorator::future<omap_values_t> omap_get_values( + CollectionRef c, + const ghobject_t& oid, + const omap_keys_t& keys) final; + + /// Retrieves paged set of values > start (if present) + using omap_get_values_ret_bare_t = std::tuple<bool, omap_values_t>; + using omap_get_values_ret_t = read_errorator::future< + omap_get_values_ret_bare_t>; + omap_get_values_ret_t omap_get_values( + CollectionRef c, ///< [in] collection + const ghobject_t &oid, ///< [in] oid + const std::optional<std::string> &start ///< [in] start, empty for begin + ) final; ///< @return <done, values> values.empty() iff done + + get_attr_errorator::future<bufferlist> omap_get_header( + CollectionRef c, + const ghobject_t& oid) final; + + seastar::future<std::tuple<std::vector<ghobject_t>, ghobject_t>> list_objects( + CollectionRef c, + const ghobject_t& start, + const ghobject_t& end, + uint64_t limit) const final; + + seastar::future<CollectionRef> create_new_collection(const coll_t& cid) final; + seastar::future<CollectionRef> open_collection(const coll_t& cid) final; + + seastar::future<> do_transaction_no_callbacks( + CollectionRef ch, + ceph::os::Transaction&& txn) final; + + /* Note, flush() machinery must go through the same pipeline + * stages and locks as do_transaction. 
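+   * It takes the collection's ordering lock on a dummy handle and then
+   * waits for the TransactionManager to flush.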
*/ + seastar::future<> flush(CollectionRef ch) final; + + read_errorator::future<std::map<uint64_t, uint64_t>> fiemap( + CollectionRef ch, + const ghobject_t& oid, + uint64_t off, + uint64_t len) final; + + unsigned get_max_attr_name_length() const final { + return 256; + } + + // only exposed to SeaStore + public: + seastar::future<> umount(); + // init managers and mount transaction_manager + seastar::future<> mount_managers(); + + void set_secondaries(Device& sec_dev) { + secondaries.emplace_back(&sec_dev); + } + + using coll_core_t = FuturizedStore::coll_core_t; + seastar::future<std::vector<coll_core_t>> list_collections(); + + seastar::future<> write_meta(const std::string& key, + const std::string& value); + + store_statfs_t stat() const; + + uuid_d get_fsid() const; + + seastar::future<> mkfs_managers(); + + void init_managers(); + + private: + struct internal_context_t { + CollectionRef ch; + ceph::os::Transaction ext_transaction; + + internal_context_t( + CollectionRef ch, + ceph::os::Transaction &&_ext_transaction, + TransactionRef &&transaction) + : ch(ch), ext_transaction(std::move(_ext_transaction)), + transaction(std::move(transaction)), + iter(ext_transaction.begin()) {} + + TransactionRef transaction; + + ceph::os::Transaction::iterator iter; + std::chrono::steady_clock::time_point begin_timestamp = std::chrono::steady_clock::now(); + + void reset_preserve_handle(TransactionManager &tm) { + tm.reset_transaction_preserve_handle(*transaction); + iter = ext_transaction.begin(); + } + }; + + TransactionManager::read_extent_iertr::future<std::optional<unsigned>> + get_coll_bits(CollectionRef ch, Transaction &t) const; + + static void on_error(ceph::os::Transaction &t); + + template <typename F> + auto repeat_with_internal_context( + CollectionRef ch, + ceph::os::Transaction &&t, + Transaction::src_t src, + const char* tname, + op_type_t op_type, + F &&f) { + return seastar::do_with( + internal_context_t( + ch, std::move(t), + transaction_manager->create_transaction(src, tname)), + std::forward<F>(f), + [this, op_type](auto &ctx, auto &f) { + return ctx.transaction->get_handle().take_collection_lock( + static_cast<SeastoreCollection&>(*(ctx.ch)).ordering_lock + ).then([this] { + return throttler.get(1); + }).then([&, this] { + return repeat_eagain([&, this] { + ctx.reset_preserve_handle(*transaction_manager); + return std::invoke(f, ctx); + }).handle_error( + crimson::ct_error::eagain::pass_further{}, + crimson::ct_error::all_same_way([&ctx](auto e) { + on_error(ctx.ext_transaction); + }) + ); + }).then([this, op_type, &ctx] { + add_latency_sample(op_type, + std::chrono::steady_clock::now() - ctx.begin_timestamp); + }).finally([this] { + throttler.put(); + }); + }); + } + + template <typename Ret, typename F> + auto repeat_with_onode( + CollectionRef ch, + const ghobject_t &oid, + Transaction::src_t src, + const char* tname, + op_type_t op_type, + F &&f) const { + auto begin_time = std::chrono::steady_clock::now(); + return seastar::do_with( + oid, Ret{}, std::forward<F>(f), + [this, src, op_type, begin_time, tname + ](auto &oid, auto &ret, auto &f) + { + return repeat_eagain([&, this, src, tname] { + return transaction_manager->with_transaction_intr( + src, + tname, + [&, this](auto& t) + { + return onode_manager->get_onode(t, oid + ).si_then([&](auto onode) { + return seastar::do_with(std::move(onode), [&](auto& onode) { + return f(t, *onode); + }); + }).si_then([&ret](auto _ret) { + ret = _ret; + }); + }); + }).safe_then([&ret, op_type, begin_time, this] { + 
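+          // repeat_with_onode() is const, so cast away constness to record
+          // the per-op latency sample in the shard-local stats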
const_cast<Shard*>(this)->add_latency_sample(op_type, + std::chrono::steady_clock::now() - begin_time); + return seastar::make_ready_future<Ret>(ret); + }); + }); + } + + using _fiemap_ret = ObjectDataHandler::fiemap_ret; + _fiemap_ret _fiemap( + Transaction &t, + Onode &onode, + uint64_t off, + uint64_t len) const; + + using _omap_get_value_iertr = OMapManager::base_iertr::extend< + crimson::ct_error::enodata + >; + using _omap_get_value_ret = _omap_get_value_iertr::future<ceph::bufferlist>; + _omap_get_value_ret _omap_get_value( + Transaction &t, + omap_root_t &&root, + std::string_view key) const; + + using _omap_get_values_iertr = OMapManager::base_iertr; + using _omap_get_values_ret = _omap_get_values_iertr::future<omap_values_t>; + _omap_get_values_ret _omap_get_values( + Transaction &t, + omap_root_t &&root, + const omap_keys_t &keys) const; + + friend class SeaStoreOmapIterator; + + using omap_list_bare_ret = OMapManager::omap_list_bare_ret; + using omap_list_ret = OMapManager::omap_list_ret; + omap_list_ret omap_list( + Onode &onode, + const omap_root_le_t& omap_root, + Transaction& t, + const std::optional<std::string>& start, + OMapManager::omap_list_config_t config) const; + + using tm_iertr = TransactionManager::base_iertr; + using tm_ret = tm_iertr::future<>; + tm_ret _do_transaction_step( + internal_context_t &ctx, + CollectionRef &col, + std::vector<OnodeRef> &onodes, + std::vector<OnodeRef> &d_onodes, + ceph::os::Transaction::iterator &i); + + tm_ret _remove( + internal_context_t &ctx, + OnodeRef &onode); + tm_ret _touch( + internal_context_t &ctx, + OnodeRef &onode); + tm_ret _write( + internal_context_t &ctx, + OnodeRef &onode, + uint64_t offset, size_t len, + ceph::bufferlist &&bl, + uint32_t fadvise_flags); + tm_ret _clone( + internal_context_t &ctx, + OnodeRef &onode, + OnodeRef &d_onode); + tm_ret _zero( + internal_context_t &ctx, + OnodeRef &onode, + objaddr_t offset, extent_len_t len); + tm_ret _omap_set_values( + internal_context_t &ctx, + OnodeRef &onode, + std::map<std::string, ceph::bufferlist> &&aset); + tm_ret _omap_set_header( + internal_context_t &ctx, + OnodeRef &onode, + ceph::bufferlist &&header); + tm_ret _omap_clear( + internal_context_t &ctx, + OnodeRef &onode); + tm_ret _omap_rmkeys( + internal_context_t &ctx, + OnodeRef &onode, + omap_keys_t &&aset); + tm_ret _omap_rmkeyrange( + internal_context_t &ctx, + OnodeRef &onode, + std::string first, + std::string last); + tm_ret _truncate( + internal_context_t &ctx, + OnodeRef &onode, uint64_t size); + tm_ret _setattrs( + internal_context_t &ctx, + OnodeRef &onode, + std::map<std::string,bufferlist>&& aset); + tm_ret _rmattr( + internal_context_t &ctx, + OnodeRef &onode, + std::string name); + tm_ret _rmattrs( + internal_context_t &ctx, + OnodeRef &onode); + tm_ret _xattr_rmattr( + internal_context_t &ctx, + OnodeRef &onode, + std::string &&name); + tm_ret _xattr_clear( + internal_context_t &ctx, + OnodeRef &onode); + tm_ret _create_collection( + internal_context_t &ctx, + const coll_t& cid, int bits); + tm_ret _remove_collection( + internal_context_t &ctx, + const coll_t& cid); + using omap_set_kvs_ret = tm_iertr::future<>; + omap_set_kvs_ret _omap_set_kvs( + OnodeRef &onode, + const omap_root_le_t& omap_root, + Transaction& t, + omap_root_le_t& mutable_omap_root, + std::map<std::string, ceph::bufferlist>&& kvs); + + boost::intrusive_ptr<SeastoreCollection> _get_collection(const coll_t& cid); + + static constexpr auto LAT_MAX = static_cast<std::size_t>(op_type_t::MAX); + + struct { + 
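+      // one latency histogram per op_type_t, indexed via get_latency()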
std::array<seastar::metrics::histogram, LAT_MAX> op_lat; + } stats; + + seastar::metrics::histogram& get_latency( + op_type_t op_type) { + assert(static_cast<std::size_t>(op_type) < stats.op_lat.size()); + return stats.op_lat[static_cast<std::size_t>(op_type)]; + } + + void add_latency_sample(op_type_t op_type, + std::chrono::steady_clock::duration dur) { + seastar::metrics::histogram& lat = get_latency(op_type); + lat.sample_count++; + lat.sample_sum += std::chrono::duration_cast<std::chrono::milliseconds>(dur).count(); + } + + private: + std::string root; + Device* device; + const uint32_t max_object_size; + bool is_test; + + std::vector<Device*> secondaries; + TransactionManagerRef transaction_manager; + CollectionManagerRef collection_manager; + OnodeManagerRef onode_manager; + + common::Throttle throttler; + + seastar::metrics::metric_group metrics; + void register_metrics(); + }; + +public: + SeaStore( + const std::string& root, + MDStoreRef mdstore); + ~SeaStore(); + + seastar::future<> start() final; + seastar::future<> stop() final; + + mount_ertr::future<> mount() final; + seastar::future<> umount() final; + + mkfs_ertr::future<> mkfs(uuid_d new_osd_fsid) final; + seastar::future<store_statfs_t> stat() const final; + + uuid_d get_fsid() const final { + ceph_assert(seastar::this_shard_id() == primary_core); + return shard_stores.local().get_fsid(); + } + + seastar::future<> write_meta( + const std::string& key, + const std::string& value) final { + ceph_assert(seastar::this_shard_id() == primary_core); + return shard_stores.local().write_meta( + key, value).then([this, key, value] { + return mdstore->write_meta(key, value); + }).handle_error( + crimson::ct_error::assert_all{"Invalid error in SeaStore::write_meta"} + ); + } + + seastar::future<std::tuple<int, std::string>> read_meta(const std::string& key) final; + + seastar::future<std::vector<coll_core_t>> list_collections() final; + + FuturizedStore::Shard& get_sharded_store() final { + return shard_stores.local(); + } + + static col_obj_ranges_t + get_objs_range(CollectionRef ch, unsigned bits); + +// for test +public: + mount_ertr::future<> test_mount(); + mkfs_ertr::future<> test_mkfs(uuid_d new_osd_fsid); + + DeviceRef get_primary_device_ref() { + return std::move(device); + } + + seastar::future<> test_start(DeviceRef dev); + +private: + seastar::future<> write_fsid(uuid_d new_osd_fsid); + + seastar::future<> prepare_meta(uuid_d new_osd_fsid); + + seastar::future<> set_secondaries(); + +private: + std::string root; + MDStoreRef mdstore; + DeviceRef device; + std::vector<DeviceRef> secondaries; + seastar::sharded<SeaStore::Shard> shard_stores; +}; + +std::unique_ptr<SeaStore> make_seastore( + const std::string &device); + +std::unique_ptr<SeaStore> make_test_seastore( + SeaStore::MDStoreRef mdstore); +} diff --git a/src/crimson/os/seastore/seastore_types.cc b/src/crimson/os/seastore/seastore_types.cc new file mode 100644 index 000000000..0acfdb74e --- /dev/null +++ b/src/crimson/os/seastore/seastore_types.cc @@ -0,0 +1,874 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "crimson/os/seastore/seastore_types.h" +#include "crimson/common/log.h" + +namespace { + +seastar::logger& journal_logger() { + return crimson::get_logger(ceph_subsys_seastore_journal); +} + +} + +namespace crimson::os::seastore { + +bool is_aligned(uint64_t offset, uint64_t alignment) +{ + return (offset % alignment) == 0; +} + +std::ostream& operator<<(std::ostream &out, const omap_root_t 
&root) +{ + return out << "omap_root{addr=" << root.addr + << ", depth=" << root.depth + << ", hint=" << root.hint + << ", mutated=" << root.mutated + << "}"; +} + +std::ostream& operator<<(std::ostream& out, const seastore_meta_t& meta) +{ + return out << meta.seastore_id; +} + +std::ostream &operator<<(std::ostream &out, const device_id_printer_t &id) +{ + auto _id = id.id; + if (_id == DEVICE_ID_NULL) { + return out << "Dev(NULL)"; + } else if (_id == DEVICE_ID_RECORD_RELATIVE) { + return out << "Dev(RR)"; + } else if (_id == DEVICE_ID_BLOCK_RELATIVE) { + return out << "Dev(BR)"; + } else if (_id == DEVICE_ID_DELAYED) { + return out << "Dev(DELAYED)"; + } else if (_id == DEVICE_ID_FAKE) { + return out << "Dev(FAKE)"; + } else if (_id == DEVICE_ID_ZERO) { + return out << "Dev(ZERO)"; + } else if (_id == DEVICE_ID_ROOT) { + return out << "Dev(ROOT)"; + } else { + return out << "Dev(" << (unsigned)_id << ")"; + } +} + +std::ostream &operator<<(std::ostream &out, const segment_id_t &segment) +{ + if (segment == NULL_SEG_ID) { + return out << "Seg[NULL]"; + } else { + return out << "Seg[" << device_id_printer_t{segment.device_id()} + << "," << segment.device_segment_id() + << "]"; + } +} + +std::ostream& operator<<(std::ostream& out, segment_type_t t) +{ + switch(t) { + case segment_type_t::JOURNAL: + return out << "JOURNAL"; + case segment_type_t::OOL: + return out << "OOL"; + case segment_type_t::NULL_SEG: + return out << "NULL_SEG"; + default: + return out << "INVALID_SEGMENT_TYPE!"; + } +} + +std::ostream& operator<<(std::ostream& out, segment_seq_printer_t seq) +{ + if (seq.seq == NULL_SEG_SEQ) { + return out << "sseq(NULL)"; + } else { + return out << "sseq(" << seq.seq << ")"; + } +} + +std::ostream &operator<<(std::ostream &out, const pladdr_t &pladdr) +{ + if (pladdr.is_laddr()) { + return out << pladdr.get_laddr(); + } else { + return out << pladdr.get_paddr(); + } +} + +std::ostream &operator<<(std::ostream &out, const paddr_t &rhs) +{ + auto id = rhs.get_device_id(); + out << "paddr<"; + if (rhs == P_ADDR_NULL) { + out << "NULL"; + } else if (rhs == P_ADDR_MIN) { + out << "MIN"; + } else if (rhs == P_ADDR_ZERO) { + out << "ZERO"; + } else if (has_device_off(id)) { + auto &s = rhs.as_res_paddr(); + out << device_id_printer_t{id} + << "," + << s.get_device_off(); + } else if (rhs.get_addr_type() == paddr_types_t::SEGMENT) { + auto &s = rhs.as_seg_paddr(); + out << s.get_segment_id() + << "," + << s.get_segment_off(); + } else if (rhs.get_addr_type() == paddr_types_t::RANDOM_BLOCK) { + auto &s = rhs.as_blk_paddr(); + out << device_id_printer_t{s.get_device_id()} + << "," + << s.get_device_off(); + } else { + out << "INVALID!"; + } + return out << ">"; +} + +journal_seq_t journal_seq_t::add_offset( + journal_type_t type, + device_off_t off, + device_off_t roll_start, + device_off_t roll_size) const +{ + assert(offset.is_absolute()); + assert(off <= DEVICE_OFF_MAX && off >= DEVICE_OFF_MIN); + assert(roll_start >= 0); + assert(roll_size > 0); + + segment_seq_t jseq = segment_seq; + device_off_t joff; + if (type == journal_type_t::SEGMENTED) { + joff = offset.as_seg_paddr().get_segment_off(); + } else { + assert(type == journal_type_t::RANDOM_BLOCK); + auto boff = offset.as_blk_paddr().get_device_off(); + joff = boff; + } + auto roll_end = roll_start + roll_size; + assert(joff >= roll_start); + assert(joff <= roll_end); + + if (off >= 0) { + device_off_t new_jseq = jseq + (off / roll_size); + joff += (off % roll_size); + if (joff >= roll_end) { + ++new_jseq; + joff -= roll_size; + } + 
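+    // e.g. with roll_start=0 and roll_size=100, advancing (seq=3, joff=80)
+    // by off=30 gives joff=110, which rolls over to (seq=4, joff=10)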
assert(new_jseq < MAX_SEG_SEQ); + jseq = static_cast<segment_seq_t>(new_jseq); + } else { + device_off_t mod = (-off) / roll_size; + joff -= ((-off) % roll_size); + if (joff < roll_start) { + ++mod; + joff += roll_size; + } + if (jseq >= mod) { + jseq -= mod; + } else { + return JOURNAL_SEQ_MIN; + } + } + assert(joff >= roll_start); + assert(joff < roll_end); + return journal_seq_t{jseq, make_block_relative_paddr(joff)}; +} + +device_off_t journal_seq_t::relative_to( + journal_type_t type, + const journal_seq_t& r, + device_off_t roll_start, + device_off_t roll_size) const +{ + assert(offset.is_absolute()); + assert(r.offset.is_absolute()); + assert(roll_start >= 0); + assert(roll_size > 0); + + device_off_t ret = static_cast<device_off_t>(segment_seq) - r.segment_seq; + ret *= roll_size; + if (type == journal_type_t::SEGMENTED) { + ret += (static_cast<device_off_t>(offset.as_seg_paddr().get_segment_off()) - + static_cast<device_off_t>(r.offset.as_seg_paddr().get_segment_off())); + } else { + assert(type == journal_type_t::RANDOM_BLOCK); + ret += offset.as_blk_paddr().get_device_off() - + r.offset.as_blk_paddr().get_device_off(); + } + assert(ret <= DEVICE_OFF_MAX && ret >= DEVICE_OFF_MIN); + return ret; +} + +std::ostream &operator<<(std::ostream &out, const journal_seq_t &seq) +{ + if (seq == JOURNAL_SEQ_NULL) { + return out << "JOURNAL_SEQ_NULL"; + } else if (seq == JOURNAL_SEQ_MIN) { + return out << "JOURNAL_SEQ_MIN"; + } else { + return out << "jseq(" + << segment_seq_printer_t{seq.segment_seq} + << ", " << seq.offset + << ")"; + } +} + +std::ostream &operator<<(std::ostream &out, extent_types_t t) +{ + switch (t) { + case extent_types_t::ROOT: + return out << "ROOT"; + case extent_types_t::LADDR_INTERNAL: + return out << "LADDR_INTERNAL"; + case extent_types_t::LADDR_LEAF: + return out << "LADDR_LEAF"; + case extent_types_t::DINK_LADDR_LEAF: + return out << "LADDR_LEAF"; + case extent_types_t::ONODE_BLOCK_STAGED: + return out << "ONODE_BLOCK_STAGED"; + case extent_types_t::OMAP_INNER: + return out << "OMAP_INNER"; + case extent_types_t::OMAP_LEAF: + return out << "OMAP_LEAF"; + case extent_types_t::COLL_BLOCK: + return out << "COLL_BLOCK"; + case extent_types_t::OBJECT_DATA_BLOCK: + return out << "OBJECT_DATA_BLOCK"; + case extent_types_t::RETIRED_PLACEHOLDER: + return out << "RETIRED_PLACEHOLDER"; + case extent_types_t::TEST_BLOCK: + return out << "TEST_BLOCK"; + case extent_types_t::TEST_BLOCK_PHYSICAL: + return out << "TEST_BLOCK_PHYSICAL"; + case extent_types_t::BACKREF_INTERNAL: + return out << "BACKREF_INTERNAL"; + case extent_types_t::BACKREF_LEAF: + return out << "BACKREF_LEAF"; + case extent_types_t::NONE: + return out << "NONE"; + default: + return out << "UNKNOWN"; + } +} + +std::ostream &operator<<(std::ostream &out, rewrite_gen_printer_t gen) +{ + if (gen.gen == NULL_GENERATION) { + return out << "GEN_NULL"; + } else if (gen.gen == INIT_GENERATION) { + return out << "GEN_INIT"; + } else if (gen.gen == INLINE_GENERATION) { + return out << "GEN_INL"; + } else if (gen.gen == OOL_GENERATION) { + return out << "GEN_OOL"; + } else if (gen.gen > REWRITE_GENERATIONS) { + return out << "GEN_INVALID(" << (unsigned)gen.gen << ")!"; + } else { + return out << "GEN(" << (unsigned)gen.gen << ")"; + } +} + +std::ostream &operator<<(std::ostream &out, data_category_t c) +{ + switch (c) { + case data_category_t::METADATA: + return out << "MD"; + case data_category_t::DATA: + return out << "DATA"; + default: + return out << "INVALID_CATEGORY!"; + } +} + +std::ostream 
&operator<<(std::ostream &out, sea_time_point_printer_t tp) +{ + if (tp.tp == NULL_TIME) { + return out << "tp(NULL)"; + } + auto time = seastar::lowres_system_clock::to_time_t(tp.tp); + char buf[32]; + std::strftime(buf, sizeof(buf), "%Y-%m-%d %H:%M:%S", std::localtime(&time)); + return out << "tp(" << buf << ")"; +} + +std::ostream &operator<<(std::ostream &out, mod_time_point_printer_t tp) { + auto time = mod_to_timepoint(tp.tp); + return out << "mod_" << sea_time_point_printer_t{time}; +} + +std::ostream &operator<<(std::ostream &out, const laddr_list_t &rhs) +{ + bool first = false; + for (auto &i: rhs) { + out << (first ? '[' : ',') << '(' << i.first << ',' << i.second << ')'; + first = true; + } + return out << ']'; +} +std::ostream &operator<<(std::ostream &out, const paddr_list_t &rhs) +{ + bool first = false; + for (auto &i: rhs) { + out << (first ? '[' : ',') << '(' << i.first << ',' << i.second << ')'; + first = true; + } + return out << ']'; +} + +std::ostream &operator<<(std::ostream &out, const delta_info_t &delta) +{ + return out << "delta_info_t(" + << "type: " << delta.type + << ", paddr: " << delta.paddr + << ", laddr: " << delta.laddr + << ", prev_crc: " << delta.prev_crc + << ", final_crc: " << delta.final_crc + << ", length: " << delta.length + << ", pversion: " << delta.pversion + << ", ext_seq: " << delta.ext_seq + << ", seg_type: " << delta.seg_type + << ")"; +} + +std::ostream &operator<<(std::ostream &out, const journal_tail_delta_t &delta) +{ + return out << "journal_tail_delta_t(" + << "alloc_tail=" << delta.alloc_tail + << ", dirty_tail=" << delta.dirty_tail + << ")"; +} + +std::ostream &operator<<(std::ostream &out, const extent_info_t &info) +{ + return out << "extent_info_t(" + << "type: " << info.type + << ", addr: " << info.addr + << ", len: " << info.len + << ")"; +} + +std::ostream &operator<<(std::ostream &out, const segment_header_t &header) +{ + return out << "segment_header_t(" + << header.physical_segment_id + << " " << header.type + << " " << segment_seq_printer_t{header.segment_seq} + << " " << header.category + << " " << rewrite_gen_printer_t{header.generation} + << ", dirty_tail=" << header.dirty_tail + << ", alloc_tail=" << header.alloc_tail + << ", segment_nonce=" << header.segment_nonce + << ")"; +} + +std::ostream &operator<<(std::ostream &out, const segment_tail_t &tail) +{ + return out << "segment_tail_t(" + << tail.physical_segment_id + << " " << tail.type + << " " << segment_seq_printer_t{tail.segment_seq} + << ", segment_nonce=" << tail.segment_nonce + << ", modify_time=" << mod_time_point_printer_t{tail.modify_time} + << ", num_extents=" << tail.num_extents + << ")"; +} + +extent_len_t record_size_t::get_raw_mdlength() const +{ + // empty record is allowed to submit + return plain_mdlength + + ceph::encoded_sizeof_bounded<record_header_t>(); +} + +void record_size_t::account_extent(extent_len_t extent_len) +{ + assert(extent_len); + plain_mdlength += ceph::encoded_sizeof_bounded<extent_info_t>(); + dlength += extent_len; +} + +void record_size_t::account(const delta_info_t& delta) +{ + assert(delta.bl.length()); + plain_mdlength += ceph::encoded_sizeof(delta); +} + +std::ostream &operator<<(std::ostream &os, transaction_type_t type) +{ + switch (type) { + case transaction_type_t::MUTATE: + return os << "MUTATE"; + case transaction_type_t::READ: + return os << "READ"; + case transaction_type_t::TRIM_DIRTY: + return os << "TRIM_DIRTY"; + case transaction_type_t::TRIM_ALLOC: + return os << "TRIM_ALLOC"; + case 
transaction_type_t::CLEANER_MAIN: + return os << "CLEANER_MAIN"; + case transaction_type_t::CLEANER_COLD: + return os << "CLEANER_COLD"; + case transaction_type_t::MAX: + return os << "TRANS_TYPE_NULL"; + default: + return os << "INVALID_TRANS_TYPE(" + << static_cast<std::size_t>(type) + << ")"; + } +} + +std::ostream &operator<<(std::ostream& out, const record_size_t& rsize) +{ + return out << "record_size_t(" + << "raw_md=" << rsize.get_raw_mdlength() + << ", data=" << rsize.dlength + << ")"; +} + +std::ostream &operator<<(std::ostream& out, const record_t& r) +{ + return out << "record_t(" + << "type=" << r.type + << ", num_extents=" << r.extents.size() + << ", num_deltas=" << r.deltas.size() + << ", modify_time=" << sea_time_point_printer_t{r.modify_time} + << ")"; +} + +std::ostream &operator<<(std::ostream& out, const record_header_t& r) +{ + return out << "record_header_t(" + << "type=" << r.type + << ", num_extents=" << r.extents + << ", num_deltas=" << r.deltas + << ", modify_time=" << mod_time_point_printer_t{r.modify_time} + << ")"; +} + +std::ostream& operator<<(std::ostream& out, const record_group_header_t& h) +{ + return out << "record_group_header_t(" + << "num_records=" << h.records + << ", mdlength=" << h.mdlength + << ", dlength=" << h.dlength + << ", nonce=" << h.segment_nonce + << ", committed_to=" << h.committed_to + << ", data_crc=" << h.data_crc + << ")"; +} + +extent_len_t record_group_size_t::get_raw_mdlength() const +{ + return plain_mdlength + + sizeof(checksum_t) + + ceph::encoded_sizeof_bounded<record_group_header_t>(); +} + +void record_group_size_t::account( + const record_size_t& rsize, + extent_len_t _block_size) +{ + // empty record is allowed to submit + assert(_block_size > 0); + assert(rsize.dlength % _block_size == 0); + assert(block_size == 0 || block_size == _block_size); + plain_mdlength += rsize.get_raw_mdlength(); + dlength += rsize.dlength; + block_size = _block_size; +} + +std::ostream& operator<<(std::ostream& out, const record_group_size_t& size) +{ + return out << "record_group_size_t(" + << "raw_md=" << size.get_raw_mdlength() + << ", data=" << size.dlength + << ", block_size=" << size.block_size + << ", fullness=" << size.get_fullness() + << ")"; +} + +std::ostream& operator<<(std::ostream& out, const record_group_t& rg) +{ + return out << "record_group_t(" + << "num_records=" << rg.records.size() + << ", " << rg.size + << ")"; +} + +ceph::bufferlist encode_record( + record_t&& record, + extent_len_t block_size, + const journal_seq_t& committed_to, + segment_nonce_t current_segment_nonce) +{ + record_group_t record_group(std::move(record), block_size); + return encode_records( + record_group, + committed_to, + current_segment_nonce); +} + +ceph::bufferlist encode_records( + record_group_t& record_group, + const journal_seq_t& committed_to, + segment_nonce_t current_segment_nonce) +{ + assert(record_group.size.block_size > 0); + assert(record_group.records.size() > 0); + + bufferlist data_bl; + for (auto& r: record_group.records) { + for (auto& i: r.extents) { + assert(i.bl.length()); + data_bl.append(i.bl); + } + } + + bufferlist bl; + record_group_header_t header{ + static_cast<extent_len_t>(record_group.records.size()), + record_group.size.get_mdlength(), + record_group.size.dlength, + current_segment_nonce, + committed_to, + data_bl.crc32c(-1) + }; + encode(header, bl); + + auto metadata_crc_filler = bl.append_hole(sizeof(checksum_t)); + + for (auto& r: record_group.records) { + record_header_t rheader{ + r.type, + 
(extent_len_t)r.deltas.size(), + (extent_len_t)r.extents.size(), + timepoint_to_mod(r.modify_time) + }; + encode(rheader, bl); + } + for (auto& r: record_group.records) { + for (const auto& i: r.extents) { + encode(extent_info_t(i), bl); + } + } + for (auto& r: record_group.records) { + for (const auto& i: r.deltas) { + encode(i, bl); + } + } + ceph_assert(bl.length() == record_group.size.get_raw_mdlength()); + + auto aligned_mdlength = record_group.size.get_mdlength(); + if (bl.length() != aligned_mdlength) { + assert(bl.length() < aligned_mdlength); + bl.append_zero(aligned_mdlength - bl.length()); + } + + auto bliter = bl.cbegin(); + auto metadata_crc = bliter.crc32c( + ceph::encoded_sizeof_bounded<record_group_header_t>(), + -1); + bliter += sizeof(checksum_t); /* metadata crc hole */ + metadata_crc = bliter.crc32c( + bliter.get_remaining(), + metadata_crc); + ceph_le32 metadata_crc_le; + metadata_crc_le = metadata_crc; + metadata_crc_filler.copy_in( + sizeof(checksum_t), + reinterpret_cast<const char *>(&metadata_crc_le)); + + bl.claim_append(data_bl); + ceph_assert(bl.length() == record_group.size.get_encoded_length()); + + record_group.clear(); + return bl; +} + +std::optional<record_group_header_t> +try_decode_records_header( + const ceph::bufferlist& header_bl, + segment_nonce_t expected_nonce) +{ + auto bp = header_bl.cbegin(); + record_group_header_t header; + try { + decode(header, bp); + } catch (ceph::buffer::error &e) { + journal_logger().debug( + "try_decode_records_header: failed, " + "cannot decode record_group_header_t, got {}.", + e.what()); + return std::nullopt; + } + if (header.segment_nonce != expected_nonce) { + journal_logger().debug( + "try_decode_records_header: failed, record_group_header nonce mismatch, " + "read {}, expected {}!", + header.segment_nonce, + expected_nonce); + return std::nullopt; + } + return header; +} + +bool validate_records_metadata( + const ceph::bufferlist& md_bl) +{ + auto bliter = md_bl.cbegin(); + auto test_crc = bliter.crc32c( + ceph::encoded_sizeof_bounded<record_group_header_t>(), + -1); + ceph_le32 recorded_crc_le; + decode(recorded_crc_le, bliter); + uint32_t recorded_crc = recorded_crc_le; + test_crc = bliter.crc32c( + bliter.get_remaining(), + test_crc); + bool success = (test_crc == recorded_crc); + if (!success) { + journal_logger().debug( + "validate_records_metadata: failed, metadata crc mismatch."); + } + return success; +} + +bool validate_records_data( + const record_group_header_t& header, + const ceph::bufferlist& data_bl) +{ + bool success = (data_bl.crc32c(-1) == header.data_crc); + if (!success) { + journal_logger().debug( + "validate_records_data: failed, data crc mismatch!"); + } + return success; +} + +std::optional<std::vector<record_header_t>> +try_decode_record_headers( + const record_group_header_t& header, + const ceph::bufferlist& md_bl) +{ + auto bliter = md_bl.cbegin(); + bliter += ceph::encoded_sizeof_bounded<record_group_header_t>(); + bliter += sizeof(checksum_t); /* metadata crc hole */ + std::vector<record_header_t> record_headers(header.records); + for (auto &&i: record_headers) { + try { + decode(i, bliter); + } catch (ceph::buffer::error &e) { + journal_logger().debug( + "try_decode_record_headers: failed, " + "cannot decode record_header_t, got {}.", + e.what()); + return std::nullopt; + } + } + return record_headers; +} + +std::optional<std::vector<record_extent_infos_t> > +try_decode_extent_infos( + const record_group_header_t& header, + const ceph::bufferlist& md_bl) +{ + auto maybe_headers = 
try_decode_record_headers(header, md_bl); + if (!maybe_headers) { + return std::nullopt; + } + + auto bliter = md_bl.cbegin(); + bliter += ceph::encoded_sizeof_bounded<record_group_header_t>(); + bliter += sizeof(checksum_t); /* metadata crc hole */ + bliter += (ceph::encoded_sizeof_bounded<record_header_t>() * + maybe_headers->size()); + + std::vector<record_extent_infos_t> record_extent_infos( + maybe_headers->size()); + auto result_iter = record_extent_infos.begin(); + for (auto& h: *maybe_headers) { + result_iter->header = h; + result_iter->extent_infos.resize(h.extents); + for (auto& i: result_iter->extent_infos) { + try { + decode(i, bliter); + } catch (ceph::buffer::error &e) { + journal_logger().debug( + "try_decode_extent_infos: failed, " + "cannot decode extent_info_t, got {}.", + e.what()); + return std::nullopt; + } + } + ++result_iter; + } + return record_extent_infos; +} + +std::optional<std::vector<record_deltas_t> > +try_decode_deltas( + const record_group_header_t& header, + const ceph::bufferlist& md_bl, + paddr_t record_block_base) +{ + auto maybe_record_extent_infos = try_decode_extent_infos(header, md_bl); + if (!maybe_record_extent_infos) { + return std::nullopt; + } + + auto bliter = md_bl.cbegin(); + bliter += ceph::encoded_sizeof_bounded<record_group_header_t>(); + bliter += sizeof(checksum_t); /* metadata crc hole */ + bliter += (ceph::encoded_sizeof_bounded<record_header_t>() * + maybe_record_extent_infos->size()); + for (auto& r: *maybe_record_extent_infos) { + bliter += (ceph::encoded_sizeof_bounded<extent_info_t>() * + r.extent_infos.size()); + } + + std::vector<record_deltas_t> record_deltas( + maybe_record_extent_infos->size()); + auto result_iter = record_deltas.begin(); + for (auto& r: *maybe_record_extent_infos) { + result_iter->record_block_base = record_block_base; + result_iter->deltas.resize(r.header.deltas); + for (auto& i: result_iter->deltas) { + try { + decode(i.second, bliter); + i.first = mod_to_timepoint(r.header.modify_time); + } catch (ceph::buffer::error &e) { + journal_logger().debug( + "try_decode_deltas: failed, " + "cannot decode delta_info_t, got {}.", + e.what()); + return std::nullopt; + } + } + for (auto& i: r.extent_infos) { + record_block_base = record_block_base.add_offset(i.len); + } + ++result_iter; + } + return record_deltas; +} + +std::ostream& operator<<(std::ostream& out, placement_hint_t h) +{ + switch (h) { + case placement_hint_t::HOT: + return out << "Hint(HOT)"; + case placement_hint_t::COLD: + return out << "Hint(COLD)"; + case placement_hint_t::REWRITE: + return out << "Hint(REWRITE)"; + case PLACEMENT_HINT_NULL: + return out << "Hint(NULL)"; + default: + return out << "INVALID_PLACEMENT_HINT_TYPE!"; + } +} + +bool can_delay_allocation(device_type_t type) { + // Some types of device may not support delayed allocation, for example PMEM. + // All types of device currently support delayed allocation. 
+ return true; +} + +device_type_t string_to_device_type(std::string type) { + if (type == "HDD") { + return device_type_t::HDD; + } + if (type == "SSD") { + return device_type_t::SSD; + } + if (type == "ZBD") { + return device_type_t::ZBD; + } + if (type == "RANDOM_BLOCK_SSD") { + return device_type_t::RANDOM_BLOCK_SSD; + } + return device_type_t::NONE; +} + +std::ostream& operator<<(std::ostream& out, device_type_t t) +{ + switch (t) { + case device_type_t::NONE: + return out << "NONE"; + case device_type_t::HDD: + return out << "HDD"; + case device_type_t::SSD: + return out << "SSD"; + case device_type_t::ZBD: + return out << "ZBD"; + case device_type_t::EPHEMERAL_COLD: + return out << "EPHEMERAL_COLD"; + case device_type_t::EPHEMERAL_MAIN: + return out << "EPHEMERAL_MAIN"; + case device_type_t::RANDOM_BLOCK_SSD: + return out << "RANDOM_BLOCK_SSD"; + case device_type_t::RANDOM_BLOCK_EPHEMERAL: + return out << "RANDOM_BLOCK_EPHEMERAL"; + default: + return out << "INVALID_DEVICE_TYPE!"; + } +} + +std::ostream& operator<<(std::ostream& out, backend_type_t btype) { + if (btype == backend_type_t::SEGMENTED) { + return out << "SEGMENTED"; + } else { + return out << "RANDOM_BLOCK"; + } +} + +std::ostream& operator<<(std::ostream& out, const write_result_t& w) +{ + return out << "write_result_t(" + << "start=" << w.start_seq + << ", length=" << w.length + << ")"; +} + +std::ostream& operator<<(std::ostream& out, const record_locator_t& l) +{ + return out << "record_locator_t(" + << "block_base=" << l.record_block_base + << ", " << l.write_result + << ")"; +} + +void scan_valid_records_cursor::emplace_record_group( + const record_group_header_t& header, ceph::bufferlist&& md_bl) +{ + auto new_committed_to = header.committed_to; + ceph_assert(last_committed == JOURNAL_SEQ_NULL || + last_committed <= new_committed_to); + last_committed = new_committed_to; + pending_record_groups.emplace_back( + seq.offset, + header, + std::move(md_bl)); + increment_seq(header.dlength + header.mdlength); + ceph_assert(new_committed_to == JOURNAL_SEQ_NULL || + new_committed_to < seq); +} + +std::ostream& operator<<(std::ostream& out, const scan_valid_records_cursor& c) +{ + return out << "cursor(last_valid_header_found=" << c.last_valid_header_found + << ", seq=" << c.seq + << ", last_committed=" << c.last_committed + << ", pending_record_groups=" << c.pending_record_groups.size() + << ", num_consumed_records=" << c.num_consumed_records + << ")"; +} + +} diff --git a/src/crimson/os/seastore/seastore_types.h b/src/crimson/os/seastore/seastore_types.h new file mode 100644 index 000000000..0b4ad8536 --- /dev/null +++ b/src/crimson/os/seastore/seastore_types.h @@ -0,0 +1,2254 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <limits> +#include <numeric> +#include <optional> +#include <iostream> +#include <vector> +#include <boost/core/ignore_unused.hpp> + +#include <seastar/core/lowres_clock.hh> + +#include "include/byteorder.h" +#include "include/denc.h" +#include "include/buffer.h" +#include "include/intarith.h" +#include "include/interval_set.h" +#include "include/uuid.h" + +namespace crimson::os::seastore { + +/* using a special xattr key "omap_header" to store omap header */ + const std::string OMAP_HEADER_XATTR_KEY = "omap_header"; + +using transaction_id_t = uint64_t; +constexpr transaction_id_t TRANS_ID_NULL = 0; + +/* + * Note: NULL value is usually the default and max value. 
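 * For example (editorial note), in the definitions below: L_ADDR_NULL == L_ADDR_MAX,
 * P_ADDR_NULL == P_ADDR_MAX, DEVICE_ID_NULL == DEVICE_ID_MAX, NULL_SEG_SEQ == MAX_SEG_SEQ
 * and JOURNAL_SEQ_NULL == JOURNAL_SEQ_MAX; TRANS_ID_NULL (0) above is an exception.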
+ */ + +using depth_t = uint32_t; +using depth_le_t = ceph_le32; + +inline depth_le_t init_depth_le(uint32_t i) { + return ceph_le32(i); +} + +using checksum_t = uint32_t; + +// Immutable metadata for seastore to set at mkfs time +struct seastore_meta_t { + uuid_d seastore_id; + + DENC(seastore_meta_t, v, p) { + DENC_START(1, 1, p); + denc(v.seastore_id, p); + DENC_FINISH(p); + } +}; + +std::ostream& operator<<(std::ostream& out, const seastore_meta_t& meta); + +bool is_aligned(uint64_t offset, uint64_t alignment); + +// identifies a specific physical device within seastore +using device_id_t = uint8_t; + +constexpr auto DEVICE_ID_BITS = std::numeric_limits<device_id_t>::digits; + +constexpr device_id_t DEVICE_ID_MAX = std::numeric_limits<device_id_t>::max(); +constexpr device_id_t DEVICE_ID_NULL = DEVICE_ID_MAX; +constexpr device_id_t DEVICE_ID_RECORD_RELATIVE = DEVICE_ID_MAX - 1; +constexpr device_id_t DEVICE_ID_BLOCK_RELATIVE = DEVICE_ID_MAX - 2; +constexpr device_id_t DEVICE_ID_DELAYED = DEVICE_ID_MAX - 3; +// for tests which generate fake paddrs +constexpr device_id_t DEVICE_ID_FAKE = DEVICE_ID_MAX - 4; +constexpr device_id_t DEVICE_ID_ZERO = DEVICE_ID_MAX - 5; +constexpr device_id_t DEVICE_ID_ROOT = DEVICE_ID_MAX - 6; +constexpr device_id_t DEVICE_ID_MAX_VALID = DEVICE_ID_MAX - 7; +constexpr device_id_t DEVICE_ID_MAX_VALID_SEGMENT = DEVICE_ID_MAX >> 1; +constexpr device_id_t DEVICE_ID_SEGMENTED_MIN = 0; +constexpr device_id_t DEVICE_ID_RANDOM_BLOCK_MIN = + 1 << (std::numeric_limits<device_id_t>::digits - 1); + +struct device_id_printer_t { + device_id_t id; +}; + +std::ostream &operator<<(std::ostream &out, const device_id_printer_t &id); + +// 1 bit in paddr_t to identify the absolute physical address type +enum class paddr_types_t { + SEGMENT = 0, + RANDOM_BLOCK = 1, + RESERVED = 2 +}; + +constexpr paddr_types_t device_id_to_paddr_type(device_id_t id) { + if (id > DEVICE_ID_MAX_VALID) { + return paddr_types_t::RESERVED; + } else if ((id & 0x80) == 0) { + return paddr_types_t::SEGMENT; + } else { + return paddr_types_t::RANDOM_BLOCK; + } +} + +constexpr bool has_device_off(device_id_t id) { + return id == DEVICE_ID_RECORD_RELATIVE || + id == DEVICE_ID_BLOCK_RELATIVE || + id == DEVICE_ID_DELAYED || + id == DEVICE_ID_FAKE || + id == DEVICE_ID_ROOT; +} + +// internal segment id type of segment_id_t below, with the top +// "DEVICE_ID_BITS" bits representing the device id of the segment. 
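(Editorial sketch, not part of the upstream diff: assuming the definitions around this point, DEVICE_ID_BITS == 8 and SEGMENT_ID_BITS == 32, so DEVICE_SEGMENT_ID_BITS == 24. pack_segment_id below is a hypothetical stand-in for the private segment_id_t::make_internal helper.)

#include <cstdint>
// layout: [ device_id : 8 bits | device_segment_id : 24 bits ]
constexpr uint32_t pack_segment_id(uint8_t device_id, uint32_t device_segment_id) {
  // 24 == SEGMENT_ID_BITS - DEVICE_ID_BITS
  return (uint32_t(device_id) << 24) | device_segment_id;
}
static_assert(pack_segment_id(0x02, 0x000123) == 0x02000123);
static_assert((pack_segment_id(0x02, 0x000123) >> 24) == 0x02);        // == device_id()
static_assert((pack_segment_id(0x02, 0x000123) & 0xffffff) == 0x123);  // == device_segment_id()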
+using internal_segment_id_t = uint32_t; +constexpr auto SEGMENT_ID_BITS = std::numeric_limits<internal_segment_id_t>::digits; + +// segment ids without a device id encapsulated +using device_segment_id_t = uint32_t; +constexpr auto DEVICE_SEGMENT_ID_BITS = SEGMENT_ID_BITS - DEVICE_ID_BITS; +constexpr device_segment_id_t DEVICE_SEGMENT_ID_MAX = (1 << DEVICE_SEGMENT_ID_BITS) - 1; + +// Identifies segment location on disk, see SegmentManager, +struct segment_id_t { +public: + // segment_id_t() == MAX_SEG_ID == NULL_SEG_ID + segment_id_t() + : segment_id_t(DEVICE_ID_MAX_VALID_SEGMENT, DEVICE_SEGMENT_ID_MAX) {} + + segment_id_t(device_id_t id, device_segment_id_t _segment) + : segment_id_t(make_internal(id, _segment)) {} + + segment_id_t(internal_segment_id_t _segment) + : segment(_segment) { + assert(device_id_to_paddr_type(device_id()) == paddr_types_t::SEGMENT); + } + + [[gnu::always_inline]] + constexpr device_id_t device_id() const { + return static_cast<device_id_t>(segment >> DEVICE_SEGMENT_ID_BITS); + } + + [[gnu::always_inline]] + constexpr device_segment_id_t device_segment_id() const { + constexpr internal_segment_id_t _SEGMENT_ID_MASK = (1u << DEVICE_SEGMENT_ID_BITS) - 1; + return segment & _SEGMENT_ID_MASK; + } + + bool operator==(const segment_id_t& other) const { + return segment == other.segment; + } + bool operator!=(const segment_id_t& other) const { + return segment != other.segment; + } + bool operator<(const segment_id_t& other) const { + return segment < other.segment; + } + bool operator<=(const segment_id_t& other) const { + return segment <= other.segment; + } + bool operator>(const segment_id_t& other) const { + return segment > other.segment; + } + bool operator>=(const segment_id_t& other) const { + return segment >= other.segment; + } + + DENC(segment_id_t, v, p) { + denc(v.segment, p); + } + + static constexpr segment_id_t create_const( + device_id_t id, device_segment_id_t segment) { + return segment_id_t(id, segment, const_t{}); + } + +private: + struct const_t {}; + constexpr segment_id_t(device_id_t id, device_segment_id_t _segment, const_t) + : segment(make_internal(id, _segment)) {} + + constexpr static inline internal_segment_id_t make_internal( + device_id_t d_id, + device_segment_id_t s_id) { + return static_cast<internal_segment_id_t>(s_id) | + (static_cast<internal_segment_id_t>(d_id) << DEVICE_SEGMENT_ID_BITS); + } + + internal_segment_id_t segment; + + friend struct segment_id_le_t; + friend struct paddr_t; +}; + +std::ostream &operator<<(std::ostream &out, const segment_id_t&); + +// ondisk type of segment_id_t +struct __attribute((packed)) segment_id_le_t { + ceph_le32 segment = ceph_le32(segment_id_t().segment); + + segment_id_le_t(const segment_id_t id) : + segment(ceph_le32(id.segment)) {} + + operator segment_id_t() const { + return segment_id_t(segment); + } +}; + +constexpr segment_id_t MIN_SEG_ID = segment_id_t::create_const(0, 0); +// segment_id_t() == MAX_SEG_ID == NULL_SEG_ID +constexpr segment_id_t MAX_SEG_ID = + segment_id_t::create_const(DEVICE_ID_MAX_VALID_SEGMENT, DEVICE_SEGMENT_ID_MAX); +constexpr segment_id_t NULL_SEG_ID = MAX_SEG_ID; + +/* Monotonically increasing segment seq, uniquely identifies + * the incarnation of a segment */ +using segment_seq_t = uint64_t; +static constexpr segment_seq_t MAX_SEG_SEQ = + std::numeric_limits<segment_seq_t>::max(); +static constexpr segment_seq_t NULL_SEG_SEQ = MAX_SEG_SEQ; + +enum class segment_type_t : uint8_t { + JOURNAL = 0, + OOL, + NULL_SEG, +}; + +std::ostream& operator<<(std::ostream& 
out, segment_type_t t); + +struct segment_seq_printer_t { + segment_seq_t seq; +}; + +std::ostream& operator<<(std::ostream& out, segment_seq_printer_t seq); + +/** + * segment_map_t + * + * Compact templated mapping from a segment_id_t to a value type. + */ +template <typename T> +class segment_map_t { +public: + segment_map_t() { + // initializes top vector with 0 length vectors to indicate that they + // are not yet present + device_to_segments.resize(DEVICE_ID_MAX_VALID); + } + void add_device(device_id_t device, std::size_t segments, const T& init) { + ceph_assert(device <= DEVICE_ID_MAX_VALID); + ceph_assert(device_to_segments[device].size() == 0); + ceph_assert(segments > 0); + device_to_segments[device].resize(segments, init); + total_segments += segments; + } + void clear() { + device_to_segments.clear(); + device_to_segments.resize(DEVICE_ID_MAX_VALID); + total_segments = 0; + } + + T& operator[](segment_id_t id) { + assert(id.device_segment_id() < device_to_segments[id.device_id()].size()); + return device_to_segments[id.device_id()][id.device_segment_id()]; + } + const T& operator[](segment_id_t id) const { + assert(id.device_segment_id() < device_to_segments[id.device_id()].size()); + return device_to_segments[id.device_id()][id.device_segment_id()]; + } + + bool contains(segment_id_t id) { + bool b = id.device_id() < device_to_segments.size(); + if (!b) { + return b; + } + b = id.device_segment_id() < device_to_segments[id.device_id()].size(); + return b; + } + + auto begin() { + return iterator<false>::lower_bound(*this, 0, 0); + } + auto begin() const { + return iterator<true>::lower_bound(*this, 0, 0); + } + + auto end() { + return iterator<false>::end_iterator(*this); + } + auto end() const { + return iterator<true>::end_iterator(*this); + } + + auto device_begin(device_id_t id) { + auto ret = iterator<false>::lower_bound(*this, id, 0); + assert(ret->first.device_id() == id); + return ret; + } + auto device_end(device_id_t id) { + return iterator<false>::lower_bound(*this, id + 1, 0); + } + + size_t size() const { + return total_segments; + } + +private: + template <bool is_const = false> + class iterator { + /// points at set being iterated over + std::conditional_t< + is_const, + const segment_map_t &, + segment_map_t &> parent; + + /// points at current device, or DEVICE_ID_MAX_VALID if is_end() + device_id_t device_id; + + /// segment at which we are pointing, 0 if is_end() + device_segment_id_t device_segment_id; + + /// holds referent for operator* and operator-> when !is_end() + std::optional< + std::pair< + const segment_id_t, + std::conditional_t<is_const, const T&, T&> + >> current; + + bool is_end() const { + return device_id == DEVICE_ID_MAX_VALID; + } + + void find_valid() { + assert(!is_end()); + auto &device_vec = parent.device_to_segments[device_id]; + if (device_vec.size() == 0 || + device_segment_id == device_vec.size()) { + while (++device_id < DEVICE_ID_MAX_VALID && + parent.device_to_segments[device_id].size() == 0); + device_segment_id = 0; + } + if (is_end()) { + current = std::nullopt; + } else { + current.emplace( + segment_id_t{device_id, device_segment_id}, + parent.device_to_segments[device_id][device_segment_id] + ); + } + } + + iterator( + decltype(parent) &parent, + device_id_t device_id, + device_segment_id_t device_segment_id) + : parent(parent), device_id(device_id), + device_segment_id(device_segment_id) {} + + public: + static iterator lower_bound( + decltype(parent) &parent, + device_id_t device_id, + device_segment_id_t 
device_segment_id) { + if (device_id == DEVICE_ID_MAX_VALID) { + return end_iterator(parent); + } else { + auto ret = iterator{parent, device_id, device_segment_id}; + ret.find_valid(); + return ret; + } + } + + static iterator end_iterator( + decltype(parent) &parent) { + return iterator{parent, DEVICE_ID_MAX_VALID, 0}; + } + + iterator<is_const>& operator++() { + assert(!is_end()); + ++device_segment_id; + find_valid(); + return *this; + } + + bool operator==(iterator<is_const> rit) { + return (device_id == rit.device_id && + device_segment_id == rit.device_segment_id); + } + + bool operator!=(iterator<is_const> rit) { + return !(*this == rit); + } + + template <bool c = is_const, std::enable_if_t<c, int> = 0> + const std::pair<const segment_id_t, const T&> *operator->() { + assert(!is_end()); + return &*current; + } + template <bool c = is_const, std::enable_if_t<!c, int> = 0> + std::pair<const segment_id_t, T&> *operator->() { + assert(!is_end()); + return &*current; + } + + using reference = std::conditional_t< + is_const, const std::pair<const segment_id_t, const T&>&, + std::pair<const segment_id_t, T&>&>; + reference operator*() { + assert(!is_end()); + return *current; + } + }; + + /** + * device_to_segments + * + * device -> segment -> T mapping. device_to_segments[d].size() > 0 iff + * device <d> has been added. + */ + std::vector<std::vector<T>> device_to_segments; + + /// total number of added segments + size_t total_segments = 0; +}; + +/** + * paddr_t + * + * <segment, offset> offset on disk, see SegmentManager + * + * May be absolute, record_relative, or block_relative. + * + * Blocks get read independently of the surrounding record, + * so paddrs embedded directly within a block need to refer + * to other blocks within the same record by a block_relative + * addr relative to the block's own offset. By contrast, + * deltas to existing blocks need to use record_relative + * addrs relative to the first block of the record. + * + * Fresh extents during a transaction are refered to by + * record_relative paddrs. + */ + +using internal_paddr_t = uint64_t; +constexpr auto PADDR_BITS = std::numeric_limits<internal_paddr_t>::digits; + +/** + * device_off_t + * + * Offset within a device, may be negative for relative offsets. + */ +using device_off_t = int64_t; +using u_device_off_t = uint64_t; +constexpr auto DEVICE_OFF_BITS = PADDR_BITS - DEVICE_ID_BITS; +constexpr auto DEVICE_OFF_MAX = + std::numeric_limits<device_off_t>::max() >> DEVICE_ID_BITS; +constexpr auto DEVICE_OFF_MIN = -(DEVICE_OFF_MAX + 1); + +/** + * segment_off_t + * + * Offset within a segment on disk, may be negative for relative offsets. 
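 * (Editorial note: SEGMENT_OFF_BITS is 32, so an absolute segment paddr packs
 * [ segment_id : 32 | segment_off : 32 ] into the 64-bit internal_paddr; see the
 * static_assert just below.)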
+ */ +using segment_off_t = int32_t; +using u_segment_off_t = uint32_t; +constexpr auto SEGMENT_OFF_MAX = std::numeric_limits<segment_off_t>::max(); +constexpr auto SEGMENT_OFF_MIN = std::numeric_limits<segment_off_t>::min(); +constexpr auto SEGMENT_OFF_BITS = std::numeric_limits<u_segment_off_t>::digits; +static_assert(PADDR_BITS == SEGMENT_ID_BITS + SEGMENT_OFF_BITS); + +constexpr auto DEVICE_ID_MASK = + ((internal_paddr_t(1) << DEVICE_ID_BITS) - 1) << DEVICE_OFF_BITS; +constexpr auto DEVICE_OFF_MASK = + std::numeric_limits<u_device_off_t>::max() >> DEVICE_ID_BITS; +constexpr auto SEGMENT_ID_MASK = + ((internal_paddr_t(1) << SEGMENT_ID_BITS) - 1) << SEGMENT_OFF_BITS; +constexpr auto SEGMENT_OFF_MASK = + (internal_paddr_t(1) << SEGMENT_OFF_BITS) - 1; + +constexpr internal_paddr_t encode_device_off(device_off_t off) { + return static_cast<internal_paddr_t>(off) & DEVICE_OFF_MASK; +} + +constexpr device_off_t decode_device_off(internal_paddr_t addr) { + if (addr & (1ull << (DEVICE_OFF_BITS - 1))) { + return static_cast<device_off_t>(addr | DEVICE_ID_MASK); + } else { + return static_cast<device_off_t>(addr & DEVICE_OFF_MASK); + } +} + +struct seg_paddr_t; +struct blk_paddr_t; +struct res_paddr_t; +struct pladdr_t; +struct paddr_t { +public: + // P_ADDR_MAX == P_ADDR_NULL == paddr_t{} + paddr_t() : paddr_t(DEVICE_ID_MAX, device_off_t(0)) {} + + static paddr_t make_seg_paddr( + segment_id_t seg, + segment_off_t offset) { + return paddr_t(seg, offset); + } + + static paddr_t make_seg_paddr( + device_id_t device, + device_segment_id_t seg, + segment_off_t offset) { + return paddr_t(segment_id_t(device, seg), offset); + } + + static paddr_t make_blk_paddr( + device_id_t device, + device_off_t offset) { + assert(device_id_to_paddr_type(device) == paddr_types_t::RANDOM_BLOCK); + return paddr_t(device, offset); + } + + static paddr_t make_res_paddr( + device_id_t device, + device_off_t offset) { + assert(device_id_to_paddr_type(device) == paddr_types_t::RESERVED); + return paddr_t(device, offset); + } + + void swap(paddr_t &other) { + std::swap(internal_paddr, other.internal_paddr); + } + + device_id_t get_device_id() const { + return static_cast<device_id_t>(internal_paddr >> DEVICE_OFF_BITS); + } + + paddr_types_t get_addr_type() const { + return device_id_to_paddr_type(get_device_id()); + } + + paddr_t add_offset(device_off_t o) const; + + paddr_t add_relative(paddr_t o) const; + + paddr_t add_block_relative(paddr_t o) const { + // special version mainly for documentation purposes + assert(o.is_block_relative()); + return add_relative(o); + } + + paddr_t add_record_relative(paddr_t o) const { + // special version mainly for documentation purposes + assert(o.is_record_relative()); + return add_relative(o); + } + + /** + * maybe_relative_to + * + * Helper for the case where an in-memory paddr_t may be + * either block_relative or absolute (not record_relative). + * + * base must be either absolute or record_relative. + */ + paddr_t maybe_relative_to(paddr_t base) const { + assert(!base.is_block_relative()); + if (is_block_relative()) { + return base.add_block_relative(*this); + } else { + return *this; + } + } + + /** + * block_relative_to + * + * Only defined for record_relative paddr_ts. Yields a + * block_relative address. 
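 * (Editorial example: a record_relative paddr at offset 0x3000, taken
 * block_relative_to a record_relative base at offset 0x1000, yields a
 * block_relative paddr at offset 0x2000.)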
+ */ + paddr_t block_relative_to(paddr_t rhs) const; + + // To be compatible with laddr_t operator+ + paddr_t operator+(device_off_t o) const { + return add_offset(o); + } + + seg_paddr_t& as_seg_paddr(); + const seg_paddr_t& as_seg_paddr() const; + blk_paddr_t& as_blk_paddr(); + const blk_paddr_t& as_blk_paddr() const; + res_paddr_t& as_res_paddr(); + const res_paddr_t& as_res_paddr() const; + + bool is_delayed() const { + return get_device_id() == DEVICE_ID_DELAYED; + } + bool is_block_relative() const { + return get_device_id() == DEVICE_ID_BLOCK_RELATIVE; + } + bool is_record_relative() const { + return get_device_id() == DEVICE_ID_RECORD_RELATIVE; + } + bool is_relative() const { + return is_block_relative() || is_record_relative(); + } + /// Denotes special null addr + bool is_null() const { + return get_device_id() == DEVICE_ID_NULL; + } + /// Denotes special zero addr + bool is_zero() const { + return get_device_id() == DEVICE_ID_ZERO; + } + /// Denotes the root addr + bool is_root() const { + return get_device_id() == DEVICE_ID_ROOT; + } + + /** + * is_real + * + * indicates whether addr reflects a physical location, absolute, relative, + * or delayed. FAKE segments also count as real so as to reflect the way in + * which unit tests use them. + */ + bool is_real() const { + return !is_zero() && !is_null() && !is_root(); + } + + bool is_absolute() const { + return get_addr_type() != paddr_types_t::RESERVED; + } + + bool is_fake() const { + return get_device_id() == DEVICE_ID_FAKE; + } + + auto operator<=>(const paddr_t &) const = default; + + DENC(paddr_t, v, p) { + DENC_START(1, 1, p); + denc(v.internal_paddr, p); + DENC_FINISH(p); + } + + constexpr static paddr_t create_const( + device_id_t d_id, device_off_t offset) { + return paddr_t(d_id, offset, const_construct_t()); + } + +protected: + internal_paddr_t internal_paddr; + +private: + // as seg + paddr_t(segment_id_t seg, segment_off_t offset) + : paddr_t((static_cast<internal_paddr_t>(seg.segment) << SEGMENT_OFF_BITS) | + static_cast<u_segment_off_t>(offset)) {} + + // as blk or res + paddr_t(device_id_t d_id, device_off_t offset) + : paddr_t((static_cast<internal_paddr_t>(d_id) << DEVICE_OFF_BITS) | + encode_device_off(offset)) { + assert(offset >= DEVICE_OFF_MIN); + assert(offset <= DEVICE_OFF_MAX); + assert(get_addr_type() != paddr_types_t::SEGMENT); + } + + paddr_t(internal_paddr_t val); + + struct const_construct_t {}; + constexpr paddr_t(device_id_t d_id, device_off_t offset, const_construct_t) + : internal_paddr((static_cast<internal_paddr_t>(d_id) << DEVICE_OFF_BITS) | + static_cast<u_device_off_t>(offset)) {} + + friend struct paddr_le_t; + friend struct pladdr_le_t; + +}; + +std::ostream &operator<<(std::ostream &out, const paddr_t &rhs); + +struct seg_paddr_t : public paddr_t { + seg_paddr_t(const seg_paddr_t&) = delete; + seg_paddr_t(seg_paddr_t&) = delete; + seg_paddr_t& operator=(const seg_paddr_t&) = delete; + seg_paddr_t& operator=(seg_paddr_t&) = delete; + + segment_id_t get_segment_id() const { + return segment_id_t(static_cast<internal_segment_id_t>( + internal_paddr >> SEGMENT_OFF_BITS)); + } + + segment_off_t get_segment_off() const { + return segment_off_t(internal_paddr & SEGMENT_OFF_MASK); + } + + void set_segment_off(segment_off_t off) { + assert(off >= 0); + internal_paddr = (internal_paddr & SEGMENT_ID_MASK); + internal_paddr |= static_cast<u_segment_off_t>(off); + } + + paddr_t add_offset(device_off_t o) const { + device_off_t off = get_segment_off() + o; + assert(off >= 0); + assert(off <= 
SEGMENT_OFF_MAX); + return paddr_t::make_seg_paddr( + get_segment_id(), static_cast<segment_off_t>(off)); + } +}; + +struct blk_paddr_t : public paddr_t { + blk_paddr_t(const blk_paddr_t&) = delete; + blk_paddr_t(blk_paddr_t&) = delete; + blk_paddr_t& operator=(const blk_paddr_t&) = delete; + blk_paddr_t& operator=(blk_paddr_t&) = delete; + + device_off_t get_device_off() const { + return decode_device_off(internal_paddr); + } + + void set_device_off(device_off_t off) { + assert(off >= 0); + assert(off <= DEVICE_OFF_MAX); + internal_paddr = (internal_paddr & DEVICE_ID_MASK); + internal_paddr |= encode_device_off(off); + } + + paddr_t add_offset(device_off_t o) const { + assert(o >= DEVICE_OFF_MIN); + assert(o <= DEVICE_OFF_MAX); + auto off = get_device_off() + o; + return paddr_t::make_blk_paddr(get_device_id(), off); + } +}; + +struct res_paddr_t : public paddr_t { + res_paddr_t(const res_paddr_t&) = delete; + res_paddr_t(res_paddr_t&) = delete; + res_paddr_t& operator=(const res_paddr_t&) = delete; + res_paddr_t& operator=(res_paddr_t&) = delete; + + device_off_t get_device_off() const { + return decode_device_off(internal_paddr); + } + + void set_device_off(device_off_t off) { + assert(has_device_off(get_device_id())); + assert(off >= DEVICE_OFF_MIN); + assert(off <= DEVICE_OFF_MAX); + internal_paddr = (internal_paddr & DEVICE_ID_MASK); + internal_paddr |= encode_device_off(off); + } + + paddr_t add_offset(device_off_t o) const { + assert(has_device_off(get_device_id())); + assert(o >= DEVICE_OFF_MIN); + assert(o <= DEVICE_OFF_MAX); + auto off = get_device_off() + o; + return paddr_t::make_res_paddr(get_device_id(), off); + } + + paddr_t block_relative_to(const res_paddr_t &rhs) const { + assert(rhs.is_record_relative() && is_record_relative()); + auto off = get_device_off() - rhs.get_device_off(); + return paddr_t::make_res_paddr(DEVICE_ID_BLOCK_RELATIVE, off); + } +}; + +constexpr paddr_t P_ADDR_MIN = paddr_t::create_const(0, 0); +// P_ADDR_MAX == P_ADDR_NULL == paddr_t{} +constexpr paddr_t P_ADDR_MAX = paddr_t::create_const(DEVICE_ID_MAX, 0); +constexpr paddr_t P_ADDR_NULL = P_ADDR_MAX; +constexpr paddr_t P_ADDR_ZERO = paddr_t::create_const(DEVICE_ID_ZERO, 0); +constexpr paddr_t P_ADDR_ROOT = paddr_t::create_const(DEVICE_ID_ROOT, 0); + +inline paddr_t make_record_relative_paddr(device_off_t off) { + return paddr_t::make_res_paddr(DEVICE_ID_RECORD_RELATIVE, off); +} +inline paddr_t make_block_relative_paddr(device_off_t off) { + return paddr_t::make_res_paddr(DEVICE_ID_BLOCK_RELATIVE, off); +} +inline paddr_t make_fake_paddr(device_off_t off) { + return paddr_t::make_res_paddr(DEVICE_ID_FAKE, off); +} +inline paddr_t make_delayed_temp_paddr(device_off_t off) { + return paddr_t::make_res_paddr(DEVICE_ID_DELAYED, off); +} + +inline const seg_paddr_t& paddr_t::as_seg_paddr() const { + assert(get_addr_type() == paddr_types_t::SEGMENT); + return *static_cast<const seg_paddr_t*>(this); +} + +inline seg_paddr_t& paddr_t::as_seg_paddr() { + assert(get_addr_type() == paddr_types_t::SEGMENT); + return *static_cast<seg_paddr_t*>(this); +} + +inline const blk_paddr_t& paddr_t::as_blk_paddr() const { + assert(get_addr_type() == paddr_types_t::RANDOM_BLOCK); + return *static_cast<const blk_paddr_t*>(this); +} + +inline blk_paddr_t& paddr_t::as_blk_paddr() { + assert(get_addr_type() == paddr_types_t::RANDOM_BLOCK); + return *static_cast<blk_paddr_t*>(this); +} + +inline const res_paddr_t& paddr_t::as_res_paddr() const { + assert(get_addr_type() == paddr_types_t::RESERVED); + return 
*static_cast<const res_paddr_t*>(this); +} + +inline res_paddr_t& paddr_t::as_res_paddr() { + assert(get_addr_type() == paddr_types_t::RESERVED); + return *static_cast<res_paddr_t*>(this); +} + +inline paddr_t::paddr_t(internal_paddr_t val) : internal_paddr(val) { +#ifndef NDEBUG + auto type = get_addr_type(); + if (type == paddr_types_t::SEGMENT) { + assert(as_seg_paddr().get_segment_off() >= 0); + } else if (type == paddr_types_t::RANDOM_BLOCK) { + assert(as_blk_paddr().get_device_off() >= 0); + } else { + assert(type == paddr_types_t::RESERVED); + if (!has_device_off(get_device_id())) { + assert(as_res_paddr().get_device_off() == 0); + } + } +#endif +} + +#define PADDR_OPERATION(a_type, base, func) \ + if (get_addr_type() == a_type) { \ + return static_cast<const base*>(this)->func; \ + } + +inline paddr_t paddr_t::add_offset(device_off_t o) const { + PADDR_OPERATION(paddr_types_t::SEGMENT, seg_paddr_t, add_offset(o)) + PADDR_OPERATION(paddr_types_t::RANDOM_BLOCK, blk_paddr_t, add_offset(o)) + PADDR_OPERATION(paddr_types_t::RESERVED, res_paddr_t, add_offset(o)) + ceph_assert(0 == "not supported type"); + return P_ADDR_NULL; +} + +inline paddr_t paddr_t::add_relative(paddr_t o) const { + assert(o.is_relative()); + auto &res_o = o.as_res_paddr(); + return add_offset(res_o.get_device_off()); +} + +inline paddr_t paddr_t::block_relative_to(paddr_t rhs) const { + return as_res_paddr().block_relative_to(rhs.as_res_paddr()); +} + +struct __attribute((packed)) paddr_le_t { + ceph_le64 internal_paddr = + ceph_le64(P_ADDR_NULL.internal_paddr); + + using orig_type = paddr_t; + + paddr_le_t() = default; + paddr_le_t(const paddr_t &addr) : internal_paddr(ceph_le64(addr.internal_paddr)) {} + + operator paddr_t() const { + return paddr_t{internal_paddr}; + } +}; + +using objaddr_t = uint32_t; +constexpr objaddr_t OBJ_ADDR_MAX = std::numeric_limits<objaddr_t>::max(); +constexpr objaddr_t OBJ_ADDR_NULL = OBJ_ADDR_MAX; + +enum class placement_hint_t { + HOT = 0, // The default user hint that expects mutations or retirement + COLD, // Expect no mutations and no retirement in the near future + REWRITE, // Hint for the internal rewrites + NUM_HINTS // Constant for number of hints or as NULL +}; + +constexpr auto PLACEMENT_HINT_NULL = placement_hint_t::NUM_HINTS; + +std::ostream& operator<<(std::ostream& out, placement_hint_t h); + +enum class device_type_t : uint8_t { + NONE = 0, + HDD, + SSD, + ZBD, // ZNS SSD or SMR HDD + EPHEMERAL_COLD, + EPHEMERAL_MAIN, + RANDOM_BLOCK_SSD, + RANDOM_BLOCK_EPHEMERAL, + NUM_TYPES +}; + +std::ostream& operator<<(std::ostream& out, device_type_t t); + +bool can_delay_allocation(device_type_t type); +device_type_t string_to_device_type(std::string type); + +enum class backend_type_t { + SEGMENTED, // SegmentManager: SSD, ZBD, HDD + RANDOM_BLOCK // RBMDevice: RANDOM_BLOCK_SSD +}; + +std::ostream& operator<<(std::ostream& out, backend_type_t); +using journal_type_t = backend_type_t; + +constexpr backend_type_t get_default_backend_of_device(device_type_t dtype) { + assert(dtype != device_type_t::NONE && + dtype != device_type_t::NUM_TYPES); + if (dtype >= device_type_t::HDD && + dtype <= device_type_t::EPHEMERAL_MAIN) { + return backend_type_t::SEGMENTED; + } else { + return backend_type_t::RANDOM_BLOCK; + } +} + +/** + * Monotonically increasing identifier for the location of a + * journal_record. 
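 * (Editorial note: see journal_seq_t::cmp() below for the exact ordering;
 * segment_seq is compared first, then the paddr offset.)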
+ */ +// JOURNAL_SEQ_NULL == JOURNAL_SEQ_MAX == journal_seq_t{} +struct journal_seq_t { + segment_seq_t segment_seq = NULL_SEG_SEQ; + paddr_t offset = P_ADDR_NULL; + + void swap(journal_seq_t &other) { + std::swap(segment_seq, other.segment_seq); + std::swap(offset, other.offset); + } + + // produces a pseudo journal_seq_t relative to this by offset + journal_seq_t add_offset( + journal_type_t type, + device_off_t off, + device_off_t roll_start, + device_off_t roll_size) const; + + device_off_t relative_to( + journal_type_t type, + const journal_seq_t& r, + device_off_t roll_start, + device_off_t roll_size) const; + + DENC(journal_seq_t, v, p) { + DENC_START(1, 1, p); + denc(v.segment_seq, p); + denc(v.offset, p); + DENC_FINISH(p); + } + + bool operator==(const journal_seq_t &o) const { return cmp(o) == 0; } + bool operator!=(const journal_seq_t &o) const { return cmp(o) != 0; } + bool operator<(const journal_seq_t &o) const { return cmp(o) < 0; } + bool operator<=(const journal_seq_t &o) const { return cmp(o) <= 0; } + bool operator>(const journal_seq_t &o) const { return cmp(o) > 0; } + bool operator>=(const journal_seq_t &o) const { return cmp(o) >= 0; } + +private: + int cmp(const journal_seq_t &other) const { + if (segment_seq > other.segment_seq) { + return 1; + } else if (segment_seq < other.segment_seq) { + return -1; + } + using ret_t = std::pair<device_off_t, segment_id_t>; + auto to_pair = [](const paddr_t &addr) -> ret_t { + if (addr.get_addr_type() == paddr_types_t::SEGMENT) { + auto &seg_addr = addr.as_seg_paddr(); + return ret_t(seg_addr.get_segment_off(), seg_addr.get_segment_id()); + } else if (addr.get_addr_type() == paddr_types_t::RANDOM_BLOCK) { + auto &blk_addr = addr.as_blk_paddr(); + return ret_t(blk_addr.get_device_off(), MAX_SEG_ID); + } else if (addr.get_addr_type() == paddr_types_t::RESERVED) { + auto &res_addr = addr.as_res_paddr(); + return ret_t(res_addr.get_device_off(), MAX_SEG_ID); + } else { + assert(0 == "impossible"); + return ret_t(0, MAX_SEG_ID); + } + }; + auto left = to_pair(offset); + auto right = to_pair(other.offset); + if (left > right) { + return 1; + } else if (left < right) { + return -1; + } else { + return 0; + } + } +}; + +std::ostream &operator<<(std::ostream &out, const journal_seq_t &seq); + +constexpr journal_seq_t JOURNAL_SEQ_MIN{ + 0, + P_ADDR_MIN +}; +constexpr journal_seq_t JOURNAL_SEQ_MAX{ + MAX_SEG_SEQ, + P_ADDR_MAX +}; +// JOURNAL_SEQ_NULL == JOURNAL_SEQ_MAX == journal_seq_t{} +constexpr journal_seq_t JOURNAL_SEQ_NULL = JOURNAL_SEQ_MAX; + +// logical addr, see LBAManager, TransactionManager +using laddr_t = uint64_t; +constexpr laddr_t L_ADDR_MIN = std::numeric_limits<laddr_t>::min(); +constexpr laddr_t L_ADDR_MAX = std::numeric_limits<laddr_t>::max(); +constexpr laddr_t L_ADDR_NULL = L_ADDR_MAX; +constexpr laddr_t L_ADDR_ROOT = L_ADDR_MAX - 1; +constexpr laddr_t L_ADDR_LBAT = L_ADDR_MAX - 2; + +struct __attribute((packed)) laddr_le_t { + ceph_le64 laddr = ceph_le64(L_ADDR_NULL); + + using orig_type = laddr_t; + + laddr_le_t() = default; + laddr_le_t(const laddr_le_t &) = default; + explicit laddr_le_t(const laddr_t &addr) + : laddr(ceph_le64(addr)) {} + + operator laddr_t() const { + return laddr_t(laddr); + } + laddr_le_t& operator=(laddr_t addr) { + ceph_le64 val; + val = addr; + laddr = val; + return *this; + } +}; + +constexpr uint64_t PL_ADDR_NULL = std::numeric_limits<uint64_t>::max(); + +struct pladdr_t { + std::variant<laddr_t, paddr_t> pladdr; + + pladdr_t() = default; + pladdr_t(const pladdr_t &) = default; + 
pladdr_t(laddr_t laddr) + : pladdr(laddr) {} + pladdr_t(paddr_t paddr) + : pladdr(paddr) {} + + bool is_laddr() const { + return pladdr.index() == 0; + } + + bool is_paddr() const { + return pladdr.index() == 1; + } + + pladdr_t& operator=(paddr_t paddr) { + pladdr = paddr; + return *this; + } + + pladdr_t& operator=(laddr_t laddr) { + pladdr = laddr; + return *this; + } + + bool operator==(const pladdr_t &) const = default; + + paddr_t get_paddr() const { + assert(pladdr.index() == 1); + return paddr_t(std::get<1>(pladdr)); + } + + laddr_t get_laddr() const { + assert(pladdr.index() == 0); + return laddr_t(std::get<0>(pladdr)); + } + +}; + +std::ostream &operator<<(std::ostream &out, const pladdr_t &pladdr); + +enum class addr_type_t : uint8_t { + PADDR=0, + LADDR=1, + MAX=2 // or NONE +}; + +struct __attribute((packed)) pladdr_le_t { + ceph_le64 pladdr = ceph_le64(PL_ADDR_NULL); + addr_type_t addr_type = addr_type_t::MAX; + + pladdr_le_t() = default; + pladdr_le_t(const pladdr_le_t &) = default; + explicit pladdr_le_t(const pladdr_t &addr) + : pladdr( + ceph_le64( + addr.is_laddr() ? + std::get<0>(addr.pladdr) : + std::get<1>(addr.pladdr).internal_paddr)), + addr_type( + addr.is_laddr() ? + addr_type_t::LADDR : + addr_type_t::PADDR) + {} + + operator pladdr_t() const { + if (addr_type == addr_type_t::LADDR) { + return pladdr_t(laddr_t(pladdr)); + } else { + assert(addr_type == addr_type_t::PADDR); + return pladdr_t(paddr_t(pladdr)); + } + } +}; + +template <typename T> +struct min_max_t {}; + +template <> +struct min_max_t<laddr_t> { + static constexpr laddr_t max = L_ADDR_MAX; + static constexpr laddr_t min = L_ADDR_MIN; + static constexpr laddr_t null = L_ADDR_NULL; +}; + +template <> +struct min_max_t<paddr_t> { + static constexpr paddr_t max = P_ADDR_MAX; + static constexpr paddr_t min = P_ADDR_MIN; + static constexpr paddr_t null = P_ADDR_NULL; +}; + +// logical offset, see LBAManager, TransactionManager +using extent_len_t = uint32_t; +constexpr extent_len_t EXTENT_LEN_MAX = + std::numeric_limits<extent_len_t>::max(); + +using extent_len_le_t = ceph_le32; +inline extent_len_le_t init_extent_len_le(extent_len_t len) { + return ceph_le32(len); +} + +struct laddr_list_t : std::list<std::pair<laddr_t, extent_len_t>> { + template <typename... T> + laddr_list_t(T&&... args) + : std::list<std::pair<laddr_t, extent_len_t>>(std::forward<T>(args)...) {} +}; +struct paddr_list_t : std::list<std::pair<paddr_t, extent_len_t>> { + template <typename... T> + paddr_list_t(T&&... args) + : std::list<std::pair<paddr_t, extent_len_t>>(std::forward<T>(args)...) {} +}; + +std::ostream &operator<<(std::ostream &out, const laddr_list_t &rhs); +std::ostream &operator<<(std::ostream &out, const paddr_list_t &rhs); + +/* identifies type of extent, used for interpretting deltas, managing + * writeback. 
+ * + * Note that any new extent type needs to be added to + * Cache::get_extent_by_type in cache.cc + */ +enum class extent_types_t : uint8_t { + ROOT = 0, + LADDR_INTERNAL = 1, + LADDR_LEAF = 2, + DINK_LADDR_LEAF = 3, // should only be used for unitttests + OMAP_INNER = 4, + OMAP_LEAF = 5, + ONODE_BLOCK_STAGED = 6, + COLL_BLOCK = 7, + OBJECT_DATA_BLOCK = 8, + RETIRED_PLACEHOLDER = 9, + // the following two types are not extent types, + // they are just used to indicates paddr allocation deltas + ALLOC_INFO = 10, + JOURNAL_TAIL = 11, + // Test Block Types + TEST_BLOCK = 12, + TEST_BLOCK_PHYSICAL = 13, + BACKREF_INTERNAL = 14, + BACKREF_LEAF = 15, + // None and the number of valid extent_types_t + NONE = 16, +}; +using extent_types_le_t = uint8_t; +constexpr auto EXTENT_TYPES_MAX = static_cast<uint8_t>(extent_types_t::NONE); + +constexpr size_t BACKREF_NODE_SIZE = 4096; + +std::ostream &operator<<(std::ostream &out, extent_types_t t); + +constexpr bool is_logical_type(extent_types_t type) { + switch (type) { + case extent_types_t::ROOT: + case extent_types_t::LADDR_INTERNAL: + case extent_types_t::LADDR_LEAF: + case extent_types_t::BACKREF_INTERNAL: + case extent_types_t::BACKREF_LEAF: + return false; + default: + return true; + } +} + +constexpr bool is_retired_placeholder(extent_types_t type) +{ + return type == extent_types_t::RETIRED_PLACEHOLDER; +} + +constexpr bool is_lba_node(extent_types_t type) +{ + return type == extent_types_t::LADDR_INTERNAL || + type == extent_types_t::LADDR_LEAF || + type == extent_types_t::DINK_LADDR_LEAF; +} + +constexpr bool is_backref_node(extent_types_t type) +{ + return type == extent_types_t::BACKREF_INTERNAL || + type == extent_types_t::BACKREF_LEAF; +} + +constexpr bool is_lba_backref_node(extent_types_t type) +{ + return is_lba_node(type) || is_backref_node(type); +} + +std::ostream &operator<<(std::ostream &out, extent_types_t t); + +/** + * rewrite_gen_t + * + * The goal is to group the similar aged extents in the same segment for better + * bimodel utilization distribution, and also to the same device tier. For EPM, + * it has the flexibility to make placement decisions by re-assigning the + * generation. And each non-inline generation will be statically mapped to a + * writer in EPM. + * + * All the fresh and dirty extents start with INIT_GENERATION upon allocation, + * and they will be assigned to INLINE/OOL generation by EPM before the initial + * writes. After that, the generation can only be increased upon rewrite. + * + * Note, although EPM can re-assign the generations according to the tiering + * status, it cannot decrease the generation for the correctness of space + * reservation. It may choose to assign a larger generation if the extent is + * hinted cold, or if want to evict extents to the cold tier. And it may choose + * to not increase the generation if want to keep the hot tier as filled as + * possible. 
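 * (Editorial walk-through of the constants below: a new extent starts at
 * INIT_GENERATION (0); EPM then picks INLINE_GENERATION (1, journal) or
 * OOL_GENERATION (2) for the initial write; rewrites advance it from
 * MIN_REWRITE_GENERATION (3) up to MAX_REWRITE_GENERATION (7), with
 * MIN_COLD_GENERATION (5) marking where cold-tier generations start when a
 * cold tier exists. generation_to_writer(gen) is simply gen - OOL_GENERATION,
 * so OOL_GENERATION maps to writer 0.)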
+ */ +using rewrite_gen_t = uint8_t; + +// INIT_GENERATION requires EPM decision to INLINE/OOL_GENERATION +constexpr rewrite_gen_t INIT_GENERATION = 0; +constexpr rewrite_gen_t INLINE_GENERATION = 1; // to the journal +constexpr rewrite_gen_t OOL_GENERATION = 2; + +// All the rewritten extents start with MIN_REWRITE_GENERATION +constexpr rewrite_gen_t MIN_REWRITE_GENERATION = 3; +// without cold tier, the largest generation is less than MIN_COLD_GENERATION +constexpr rewrite_gen_t MIN_COLD_GENERATION = 5; +constexpr rewrite_gen_t MAX_REWRITE_GENERATION = 7; +constexpr rewrite_gen_t REWRITE_GENERATIONS = MAX_REWRITE_GENERATION + 1; +constexpr rewrite_gen_t NULL_GENERATION = + std::numeric_limits<rewrite_gen_t>::max(); + +struct rewrite_gen_printer_t { + rewrite_gen_t gen; +}; + +std::ostream &operator<<(std::ostream &out, rewrite_gen_printer_t gen); + +constexpr std::size_t generation_to_writer(rewrite_gen_t gen) { + // caller to assert the gen is in the reasonable range + return gen - OOL_GENERATION; +} + +// before EPM decision +constexpr bool is_target_rewrite_generation(rewrite_gen_t gen) { + return gen == INIT_GENERATION || + (gen >= MIN_REWRITE_GENERATION && + gen <= REWRITE_GENERATIONS); +} + +// after EPM decision +constexpr bool is_rewrite_generation(rewrite_gen_t gen) { + return gen >= INLINE_GENERATION && + gen < REWRITE_GENERATIONS; +} + +enum class data_category_t : uint8_t { + METADATA = 0, + DATA, + NUM +}; + +std::ostream &operator<<(std::ostream &out, data_category_t c); + +constexpr data_category_t get_extent_category(extent_types_t type) { + if (type == extent_types_t::OBJECT_DATA_BLOCK || + type == extent_types_t::TEST_BLOCK) { + return data_category_t::DATA; + } else { + return data_category_t::METADATA; + } +} + +// type for extent modification time, milliseconds since the epoch +using sea_time_point = seastar::lowres_system_clock::time_point; +using sea_duration = seastar::lowres_system_clock::duration; +using mod_time_point_t = int64_t; + +constexpr mod_time_point_t +timepoint_to_mod(const sea_time_point &t) { + return std::chrono::duration_cast<std::chrono::milliseconds>( + t.time_since_epoch()).count(); +} + +constexpr sea_time_point +mod_to_timepoint(mod_time_point_t t) { + return sea_time_point(std::chrono::duration_cast<sea_duration>( + std::chrono::milliseconds(t))); +} + +constexpr auto NULL_TIME = sea_time_point(); +constexpr auto NULL_MOD_TIME = timepoint_to_mod(NULL_TIME); + +struct sea_time_point_printer_t { + sea_time_point tp; +}; +std::ostream &operator<<(std::ostream &out, sea_time_point_printer_t tp); + +struct mod_time_point_printer_t { + mod_time_point_t tp; +}; +std::ostream &operator<<(std::ostream &out, mod_time_point_printer_t tp); + +constexpr sea_time_point +get_average_time(const sea_time_point& t1, std::size_t n1, + const sea_time_point& t2, std::size_t n2) { + assert(t1 != NULL_TIME); + assert(t2 != NULL_TIME); + auto new_size = n1 + n2; + assert(new_size > 0); + auto c1 = t1.time_since_epoch().count(); + auto c2 = t2.time_since_epoch().count(); + auto c_ret = c1 / new_size * n1 + c2 / new_size * n2; + return sea_time_point(sea_duration(c_ret)); +} + +/* description of a new physical extent */ +struct extent_t { + extent_types_t type; ///< type of extent + laddr_t addr; ///< laddr of extent (L_ADDR_NULL for non-logical) + ceph::bufferlist bl; ///< payload, bl.length() == length, aligned +}; + +using extent_version_t = uint32_t; + +/* description of a mutation to a physical extent */ +struct delta_info_t { + extent_types_t type = 
extent_types_t::NONE; ///< delta type + paddr_t paddr; ///< physical address + laddr_t laddr = L_ADDR_NULL; ///< logical address + uint32_t prev_crc = 0; + uint32_t final_crc = 0; + extent_len_t length = 0; ///< extent length + extent_version_t pversion; ///< prior version + segment_seq_t ext_seq; ///< seq of the extent's segment + segment_type_t seg_type; + ceph::bufferlist bl; ///< payload + + DENC(delta_info_t, v, p) { + DENC_START(1, 1, p); + denc(v.type, p); + denc(v.paddr, p); + denc(v.laddr, p); + denc(v.prev_crc, p); + denc(v.final_crc, p); + denc(v.length, p); + denc(v.pversion, p); + denc(v.ext_seq, p); + denc(v.seg_type, p); + denc(v.bl, p); + DENC_FINISH(p); + } + + bool operator==(const delta_info_t &rhs) const { + return ( + type == rhs.type && + paddr == rhs.paddr && + laddr == rhs.laddr && + prev_crc == rhs.prev_crc && + final_crc == rhs.final_crc && + length == rhs.length && + pversion == rhs.pversion && + ext_seq == rhs.ext_seq && + bl == rhs.bl + ); + } +}; + +std::ostream &operator<<(std::ostream &out, const delta_info_t &delta); + +/* contains the latest journal tail information */ +struct journal_tail_delta_t { + journal_seq_t alloc_tail; + journal_seq_t dirty_tail; + + DENC(journal_tail_delta_t, v, p) { + DENC_START(1, 1, p); + denc(v.alloc_tail, p); + denc(v.dirty_tail, p); + DENC_FINISH(p); + } +}; + +std::ostream &operator<<(std::ostream &out, const journal_tail_delta_t &delta); + +class object_data_t { + laddr_t reserved_data_base = L_ADDR_NULL; + extent_len_t reserved_data_len = 0; + + bool dirty = false; +public: + object_data_t( + laddr_t reserved_data_base, + extent_len_t reserved_data_len) + : reserved_data_base(reserved_data_base), + reserved_data_len(reserved_data_len) {} + + laddr_t get_reserved_data_base() const { + return reserved_data_base; + } + + extent_len_t get_reserved_data_len() const { + return reserved_data_len; + } + + bool is_null() const { + return reserved_data_base == L_ADDR_NULL; + } + + bool must_update() const { + return dirty; + } + + void update_reserved( + laddr_t base, + extent_len_t len) { + dirty = true; + reserved_data_base = base; + reserved_data_len = len; + } + + void update_len( + extent_len_t len) { + dirty = true; + reserved_data_len = len; + } + + void clear() { + dirty = true; + reserved_data_base = L_ADDR_NULL; + reserved_data_len = 0; + } +}; + +struct __attribute__((packed)) object_data_le_t { + laddr_le_t reserved_data_base = laddr_le_t(L_ADDR_NULL); + extent_len_le_t reserved_data_len = init_extent_len_le(0); + + void update(const object_data_t &nroot) { + reserved_data_base = nroot.get_reserved_data_base(); + reserved_data_len = init_extent_len_le(nroot.get_reserved_data_len()); + } + + object_data_t get() const { + return object_data_t( + reserved_data_base, + reserved_data_len); + } +}; + +struct omap_root_t { + laddr_t addr = L_ADDR_NULL; + depth_t depth = 0; + laddr_t hint = L_ADDR_MIN; + bool mutated = false; + + omap_root_t() = default; + omap_root_t(laddr_t addr, depth_t depth, laddr_t addr_min) + : addr(addr), + depth(depth), + hint(addr_min) {} + + omap_root_t(const omap_root_t &o) = default; + omap_root_t(omap_root_t &&o) = default; + omap_root_t &operator=(const omap_root_t &o) = default; + omap_root_t &operator=(omap_root_t &&o) = default; + + bool is_null() const { + return addr == L_ADDR_NULL; + } + + bool must_update() const { + return mutated; + } + + void update(laddr_t _addr, depth_t _depth, laddr_t _hint) { + mutated = true; + addr = _addr; + depth = _depth; + hint = _hint; + } + + laddr_t 
get_location() const { + return addr; + } + + depth_t get_depth() const { + return depth; + } + + laddr_t get_hint() const { + return hint; + } +}; +std::ostream &operator<<(std::ostream &out, const omap_root_t &root); + +class __attribute__((packed)) omap_root_le_t { + laddr_le_t addr = laddr_le_t(L_ADDR_NULL); + depth_le_t depth = init_depth_le(0); + +public: + omap_root_le_t() = default; + + omap_root_le_t(laddr_t addr, depth_t depth) + : addr(addr), depth(init_depth_le(depth)) {} + + omap_root_le_t(const omap_root_le_t &o) = default; + omap_root_le_t(omap_root_le_t &&o) = default; + omap_root_le_t &operator=(const omap_root_le_t &o) = default; + omap_root_le_t &operator=(omap_root_le_t &&o) = default; + + void update(const omap_root_t &nroot) { + addr = nroot.get_location(); + depth = init_depth_le(nroot.get_depth()); + } + + omap_root_t get(laddr_t hint) const { + return omap_root_t(addr, depth, hint); + } +}; + +/** + * phy_tree_root_t + */ +class __attribute__((packed)) phy_tree_root_t { + paddr_le_t root_addr; + depth_le_t depth = init_extent_len_le(0); + +public: + phy_tree_root_t() = default; + + phy_tree_root_t(paddr_t addr, depth_t depth) + : root_addr(addr), depth(init_depth_le(depth)) {} + + phy_tree_root_t(const phy_tree_root_t &o) = default; + phy_tree_root_t(phy_tree_root_t &&o) = default; + phy_tree_root_t &operator=(const phy_tree_root_t &o) = default; + phy_tree_root_t &operator=(phy_tree_root_t &&o) = default; + + paddr_t get_location() const { + return root_addr; + } + + void set_location(paddr_t location) { + root_addr = location; + } + + depth_t get_depth() const { + return depth; + } + + void set_depth(depth_t ndepth) { + depth = ndepth; + } + + void adjust_addrs_from_base(paddr_t base) { + paddr_t _root_addr = root_addr; + if (_root_addr.is_relative()) { + root_addr = base.add_record_relative(_root_addr); + } + } +}; + +class coll_root_t { + laddr_t addr = L_ADDR_NULL; + extent_len_t size = 0; + + bool mutated = false; + +public: + coll_root_t() = default; + coll_root_t(laddr_t addr, extent_len_t size) : addr(addr), size(size) {} + + coll_root_t(const coll_root_t &o) = default; + coll_root_t(coll_root_t &&o) = default; + coll_root_t &operator=(const coll_root_t &o) = default; + coll_root_t &operator=(coll_root_t &&o) = default; + + bool must_update() const { + return mutated; + } + + void update(laddr_t _addr, extent_len_t _s) { + mutated = true; + addr = _addr; + size = _s; + } + + laddr_t get_location() const { + return addr; + } + + extent_len_t get_size() const { + return size; + } +}; + +/** + * coll_root_le_t + * + * Information for locating CollectionManager information, to be embedded + * in root block. + */ +class __attribute__((packed)) coll_root_le_t { + laddr_le_t addr; + extent_len_le_t size = init_extent_len_le(0); + +public: + coll_root_le_t() = default; + + coll_root_le_t(laddr_t laddr, extent_len_t size) + : addr(laddr), size(init_extent_len_le(size)) {} + + + coll_root_le_t(const coll_root_le_t &o) = default; + coll_root_le_t(coll_root_le_t &&o) = default; + coll_root_le_t &operator=(const coll_root_le_t &o) = default; + coll_root_le_t &operator=(coll_root_le_t &&o) = default; + + void update(const coll_root_t &nroot) { + addr = nroot.get_location(); + size = init_extent_len_le(nroot.get_size()); + } + + coll_root_t get() const { + return coll_root_t(addr, size); + } +}; + +using lba_root_t = phy_tree_root_t; +using backref_root_t = phy_tree_root_t; + +/** + * root_t + * + * Contains information required to find metadata roots. 
+ * TODO: generalize this to permit more than one lba_manager implementation + */ +struct __attribute__((packed)) root_t { + using meta_t = std::map<std::string, std::string>; + + static constexpr int MAX_META_LENGTH = 1024; + + backref_root_t backref_root; + lba_root_t lba_root; + laddr_le_t onode_root; + coll_root_le_t collection_root; + + char meta[MAX_META_LENGTH]; + + root_t() { + set_meta(meta_t{}); + } + + void adjust_addrs_from_base(paddr_t base) { + lba_root.adjust_addrs_from_base(base); + backref_root.adjust_addrs_from_base(base); + } + + meta_t get_meta() { + bufferlist bl; + bl.append(ceph::buffer::create_static(MAX_META_LENGTH, meta)); + meta_t ret; + auto iter = bl.cbegin(); + decode(ret, iter); + return ret; + } + + void set_meta(const meta_t &m) { + ceph::bufferlist bl; + encode(m, bl); + ceph_assert(bl.length() < MAX_META_LENGTH); + bl.rebuild(); + auto &bptr = bl.front(); + ::memset(meta, 0, MAX_META_LENGTH); + ::memcpy(meta, bptr.c_str(), bl.length()); + } +}; + +struct alloc_blk_t { + alloc_blk_t( + paddr_t paddr, + laddr_t laddr, + extent_len_t len, + extent_types_t type) + : paddr(paddr), laddr(laddr), len(len), type(type) + {} + + explicit alloc_blk_t() = default; + + paddr_t paddr = P_ADDR_NULL; + laddr_t laddr = L_ADDR_NULL; + extent_len_t len = 0; + extent_types_t type = extent_types_t::ROOT; + DENC(alloc_blk_t, v, p) { + DENC_START(1, 1, p); + denc(v.paddr, p); + denc(v.laddr, p); + denc(v.len, p); + denc(v.type, p); + DENC_FINISH(p); + } +}; + +// use absolute address +struct alloc_delta_t { + enum class op_types_t : uint8_t { + NONE = 0, + SET = 1, + CLEAR = 2 + }; + std::vector<alloc_blk_t> alloc_blk_ranges; + op_types_t op = op_types_t::NONE; + + alloc_delta_t() = default; + + DENC(alloc_delta_t, v, p) { + DENC_START(1, 1, p); + denc(v.alloc_blk_ranges, p); + denc(v.op, p); + DENC_FINISH(p); + } +}; + +struct extent_info_t { + extent_types_t type = extent_types_t::NONE; + laddr_t addr = L_ADDR_NULL; + extent_len_t len = 0; + + extent_info_t() = default; + extent_info_t(const extent_t &et) + : type(et.type), addr(et.addr), + len(et.bl.length()) + {} + + DENC(extent_info_t, v, p) { + DENC_START(1, 1, p); + denc(v.type, p); + denc(v.addr, p); + denc(v.len, p); + DENC_FINISH(p); + } +}; +std::ostream &operator<<(std::ostream &out, const extent_info_t &header); + +using segment_nonce_t = uint32_t; + +/** + * Segment header + * + * Every segment contains and encode segment_header_t in the first block. 
+ * Our strategy for finding the journal replay point is: + * 1) Find the segment with the highest journal_segment_seq + * 2) Get dirty_tail and alloc_tail from the segment header + * 3) Scan forward to update tails from journal_tail_delta_t + * 4) Replay from the latest tails + */ +struct segment_header_t { + segment_seq_t segment_seq; + segment_id_t physical_segment_id; // debugging + + journal_seq_t dirty_tail; + journal_seq_t alloc_tail; + segment_nonce_t segment_nonce; + + segment_type_t type; + + data_category_t category; + rewrite_gen_t generation; + + segment_type_t get_type() const { + return type; + } + + DENC(segment_header_t, v, p) { + DENC_START(1, 1, p); + denc(v.segment_seq, p); + denc(v.physical_segment_id, p); + denc(v.dirty_tail, p); + denc(v.alloc_tail, p); + denc(v.segment_nonce, p); + denc(v.type, p); + denc(v.category, p); + denc(v.generation, p); + DENC_FINISH(p); + } +}; +std::ostream &operator<<(std::ostream &out, const segment_header_t &header); + +struct segment_tail_t { + segment_seq_t segment_seq; + segment_id_t physical_segment_id; // debugging + + segment_nonce_t segment_nonce; + + segment_type_t type; + + mod_time_point_t modify_time; + std::size_t num_extents; + + segment_type_t get_type() const { + return type; + } + + DENC(segment_tail_t, v, p) { + DENC_START(1, 1, p); + denc(v.segment_seq, p); + denc(v.physical_segment_id, p); + denc(v.segment_nonce, p); + denc(v.type, p); + denc(v.modify_time, p); + denc(v.num_extents, p); + DENC_FINISH(p); + } +}; +std::ostream &operator<<(std::ostream &out, const segment_tail_t &tail); + +enum class transaction_type_t : uint8_t { + MUTATE = 0, + READ, // including weak and non-weak read transactions + TRIM_DIRTY, + TRIM_ALLOC, + CLEANER_MAIN, + CLEANER_COLD, + MAX +}; + +static constexpr auto TRANSACTION_TYPE_NULL = transaction_type_t::MAX; + +static constexpr auto TRANSACTION_TYPE_MAX = static_cast<std::size_t>( + transaction_type_t::MAX); + +std::ostream &operator<<(std::ostream &os, transaction_type_t type); + +constexpr bool is_valid_transaction(transaction_type_t type) { + return type < transaction_type_t::MAX; +} + +constexpr bool is_background_transaction(transaction_type_t type) { + return (type >= transaction_type_t::TRIM_DIRTY && + type < transaction_type_t::MAX); +} + +constexpr bool is_trim_transaction(transaction_type_t type) { + return (type == transaction_type_t::TRIM_DIRTY || + type == transaction_type_t::TRIM_ALLOC); +} + +struct record_size_t { + extent_len_t plain_mdlength = 0; // mdlength without the record header + extent_len_t dlength = 0; + + extent_len_t get_raw_mdlength() const; + + bool is_empty() const { + return plain_mdlength == 0 && + dlength == 0; + } + + void account_extent(extent_len_t extent_len); + + void account(const extent_t& extent) { + account_extent(extent.bl.length()); + } + + void account(const delta_info_t& delta); + + bool operator==(const record_size_t &) const = default; +}; +std::ostream &operator<<(std::ostream&, const record_size_t&); + +struct record_t { + transaction_type_t type = TRANSACTION_TYPE_NULL; + std::vector<extent_t> extents; + std::vector<delta_info_t> deltas; + record_size_t size; + sea_time_point modify_time = NULL_TIME; + + record_t(transaction_type_t type) : type{type} { } + + // unit test only + record_t() { + type = transaction_type_t::MUTATE; + } + + // unit test only + record_t(std::vector<extent_t>&& _extents, + std::vector<delta_info_t>&& _deltas) { + auto modify_time = seastar::lowres_system_clock::now(); + for (auto& e: _extents) { + 
push_back(std::move(e), modify_time); + } + for (auto& d: _deltas) { + push_back(std::move(d)); + } + type = transaction_type_t::MUTATE; + } + + bool is_empty() const { + return extents.size() == 0 && + deltas.size() == 0; + } + + std::size_t get_delta_size() const { + auto delta_size = std::accumulate( + deltas.begin(), deltas.end(), 0, + [](uint64_t sum, auto& delta) { + return sum + delta.bl.length(); + } + ); + return delta_size; + } + + void push_back(extent_t&& extent, sea_time_point &t) { + ceph_assert(t != NULL_TIME); + if (extents.size() == 0) { + assert(modify_time == NULL_TIME); + modify_time = t; + } else { + modify_time = get_average_time(modify_time, extents.size(), t, 1); + } + size.account(extent); + extents.push_back(std::move(extent)); + } + + void push_back(delta_info_t&& delta) { + size.account(delta); + deltas.push_back(std::move(delta)); + } +}; +std::ostream &operator<<(std::ostream&, const record_t&); + +struct record_header_t { + transaction_type_t type; + uint32_t deltas; // number of deltas + uint32_t extents; // number of extents + mod_time_point_t modify_time; + + DENC(record_header_t, v, p) { + DENC_START(1, 1, p); + denc(v.type, p); + denc(v.deltas, p); + denc(v.extents, p); + denc(v.modify_time, p); + DENC_FINISH(p); + } +}; +std::ostream &operator<<(std::ostream&, const record_header_t&); + +struct record_group_header_t { + uint32_t records; + extent_len_t mdlength; // block aligned, length of metadata + extent_len_t dlength; // block aligned, length of data + segment_nonce_t segment_nonce;// nonce of containing segment + journal_seq_t committed_to; // records prior to committed_to have been + // fully written, maybe in another segment. + checksum_t data_crc; // crc of data payload + + + DENC(record_group_header_t, v, p) { + DENC_START(1, 1, p); + denc(v.records, p); + denc(v.mdlength, p); + denc(v.dlength, p); + denc(v.segment_nonce, p); + denc(v.committed_to, p); + denc(v.data_crc, p); + DENC_FINISH(p); + } +}; +std::ostream& operator<<(std::ostream&, const record_group_header_t&); + +struct record_group_size_t { + extent_len_t plain_mdlength = 0; // mdlength without the group header + extent_len_t dlength = 0; + extent_len_t block_size = 0; + + record_group_size_t() = default; + record_group_size_t( + const record_size_t& rsize, + extent_len_t block_size) { + account(rsize, block_size); + } + + extent_len_t get_raw_mdlength() const; + + extent_len_t get_mdlength() const { + assert(block_size > 0); + return p2roundup(get_raw_mdlength(), block_size); + } + + extent_len_t get_encoded_length() const { + assert(block_size > 0); + assert(dlength % block_size == 0); + return get_mdlength() + dlength; + } + + record_group_size_t get_encoded_length_after( + const record_size_t& rsize, + extent_len_t block_size) const { + record_group_size_t tmp = *this; + tmp.account(rsize, block_size); + return tmp; + } + + double get_fullness() const { + assert(block_size > 0); + return ((double)(get_raw_mdlength() + dlength) / + get_encoded_length()); + } + + void account(const record_size_t& rsize, + extent_len_t block_size); + + bool operator==(const record_group_size_t &) const = default; +}; +std::ostream& operator<<(std::ostream&, const record_group_size_t&); + +struct record_group_t { + std::vector<record_t> records; + record_group_size_t size; + + record_group_t() = default; + record_group_t( + record_t&& record, + extent_len_t block_size) { + push_back(std::move(record), block_size); + } + + std::size_t get_size() const { + return records.size(); + } + + void 
push_back( + record_t&& record, + extent_len_t block_size) { + size.account(record.size, block_size); + records.push_back(std::move(record)); + assert(size.get_encoded_length() < SEGMENT_OFF_MAX); + } + + void reserve(std::size_t limit) { + records.reserve(limit); + } + + void clear() { + records.clear(); + size = {}; + } +}; +std::ostream& operator<<(std::ostream&, const record_group_t&); + +ceph::bufferlist encode_record( + record_t&& record, + extent_len_t block_size, + const journal_seq_t& committed_to, + segment_nonce_t current_segment_nonce); + +ceph::bufferlist encode_records( + record_group_t& record_group, + const journal_seq_t& committed_to, + segment_nonce_t current_segment_nonce); + +std::optional<record_group_header_t> +try_decode_records_header( + const ceph::bufferlist& header_bl, + segment_nonce_t expected_nonce); + +bool validate_records_metadata( + const ceph::bufferlist& md_bl); + +bool validate_records_data( + const record_group_header_t& header, + const ceph::bufferlist& data_bl); + +struct record_extent_infos_t { + record_header_t header; + std::vector<extent_info_t> extent_infos; +}; +std::optional<std::vector<record_extent_infos_t> > +try_decode_extent_infos( + const record_group_header_t& header, + const ceph::bufferlist& md_bl); +std::optional<std::vector<record_header_t>> +try_decode_record_headers( + const record_group_header_t& header, + const ceph::bufferlist& md_bl); + +struct record_deltas_t { + paddr_t record_block_base; + std::vector<std::pair<sea_time_point, delta_info_t>> deltas; +}; +std::optional<std::vector<record_deltas_t> > +try_decode_deltas( + const record_group_header_t& header, + const ceph::bufferlist& md_bl, + paddr_t record_block_base); + +struct write_result_t { + journal_seq_t start_seq; + extent_len_t length; + + journal_seq_t get_end_seq() const { + return journal_seq_t{ + start_seq.segment_seq, + start_seq.offset.add_offset(length)}; + } +}; +std::ostream& operator<<(std::ostream&, const write_result_t&); + +struct record_locator_t { + paddr_t record_block_base; + write_result_t write_result; +}; +std::ostream& operator<<(std::ostream&, const record_locator_t&); + +/// scan segment for end incrementally +struct scan_valid_records_cursor { + bool last_valid_header_found = false; + journal_seq_t seq; + journal_seq_t last_committed; + std::size_t num_consumed_records = 0; + extent_len_t block_size = 0; + + struct found_record_group_t { + paddr_t offset; + record_group_header_t header; + bufferlist mdbuffer; + + found_record_group_t( + paddr_t offset, + const record_group_header_t &header, + const bufferlist &mdbuffer) + : offset(offset), header(header), mdbuffer(mdbuffer) {} + }; + std::deque<found_record_group_t> pending_record_groups; + + bool is_complete() const { + return last_valid_header_found && pending_record_groups.empty(); + } + + segment_id_t get_segment_id() const { + return seq.offset.as_seg_paddr().get_segment_id(); + } + + segment_off_t get_segment_offset() const { + return seq.offset.as_seg_paddr().get_segment_off(); + } + + extent_len_t get_block_size() const { + return block_size; + } + + void increment_seq(segment_off_t off) { + seq.offset = seq.offset.add_offset(off); + } + + void emplace_record_group(const record_group_header_t&, ceph::bufferlist&&); + + void pop_record_group() { + assert(!pending_record_groups.empty()); + ++num_consumed_records; + pending_record_groups.pop_front(); + } + + scan_valid_records_cursor( + journal_seq_t seq) + : seq(seq) {} +}; +std::ostream& operator<<(std::ostream&, const 
scan_valid_records_cursor&); + +} + +WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::seastore_meta_t) +WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::segment_id_t) +WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::paddr_t) +WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::journal_seq_t) +WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::delta_info_t) +WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::journal_tail_delta_t) +WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::record_header_t) +WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::record_group_header_t) +WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::extent_info_t) +WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::segment_header_t) +WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::alloc_blk_t) +WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::alloc_delta_t) +WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::segment_tail_t) + +#if FMT_VERSION >= 90000 +template <> struct fmt::formatter<crimson::os::seastore::data_category_t> : fmt::ostream_formatter {}; +template <> struct fmt::formatter<crimson::os::seastore::delta_info_t> : fmt::ostream_formatter {}; +template <> struct fmt::formatter<crimson::os::seastore::device_id_printer_t> : fmt::ostream_formatter {}; +template <> struct fmt::formatter<crimson::os::seastore::extent_types_t> : fmt::ostream_formatter {}; +template <> struct fmt::formatter<crimson::os::seastore::journal_seq_t> : fmt::ostream_formatter {}; +template <> struct fmt::formatter<crimson::os::seastore::journal_tail_delta_t> : fmt::ostream_formatter {}; +template <> struct fmt::formatter<crimson::os::seastore::laddr_list_t> : fmt::ostream_formatter {}; +template <> struct fmt::formatter<crimson::os::seastore::omap_root_t> : fmt::ostream_formatter {}; +template <> struct fmt::formatter<crimson::os::seastore::paddr_list_t> : fmt::ostream_formatter {}; +template <> struct fmt::formatter<crimson::os::seastore::paddr_t> : fmt::ostream_formatter {}; +template <> struct fmt::formatter<crimson::os::seastore::pladdr_t> : fmt::ostream_formatter {}; +template <> struct fmt::formatter<crimson::os::seastore::placement_hint_t> : fmt::ostream_formatter {}; +template <> struct fmt::formatter<crimson::os::seastore::device_type_t> : fmt::ostream_formatter {}; +template <> struct fmt::formatter<crimson::os::seastore::record_group_header_t> : fmt::ostream_formatter {}; +template <> struct fmt::formatter<crimson::os::seastore::record_group_size_t> : fmt::ostream_formatter {}; +template <> struct fmt::formatter<crimson::os::seastore::record_header_t> : fmt::ostream_formatter {}; +template <> struct fmt::formatter<crimson::os::seastore::record_locator_t> : fmt::ostream_formatter {}; +template <> struct fmt::formatter<crimson::os::seastore::record_t> : fmt::ostream_formatter {}; +template <> struct fmt::formatter<crimson::os::seastore::rewrite_gen_printer_t> : fmt::ostream_formatter {}; +template <> struct fmt::formatter<crimson::os::seastore::scan_valid_records_cursor> : fmt::ostream_formatter {}; +template <> struct fmt::formatter<crimson::os::seastore::sea_time_point_printer_t> : fmt::ostream_formatter {}; +template <> struct fmt::formatter<crimson::os::seastore::segment_header_t> : fmt::ostream_formatter {}; +template <> struct fmt::formatter<crimson::os::seastore::segment_id_t> : fmt::ostream_formatter {}; +template <> struct fmt::formatter<crimson::os::seastore::segment_seq_printer_t> : fmt::ostream_formatter {}; +template <> struct fmt::formatter<crimson::os::seastore::segment_tail_t> : fmt::ostream_formatter {}; +template <> struct 
fmt::formatter<crimson::os::seastore::segment_type_t> : fmt::ostream_formatter {}; +template <> struct fmt::formatter<crimson::os::seastore::transaction_type_t> : fmt::ostream_formatter {}; +template <> struct fmt::formatter<crimson::os::seastore::write_result_t> : fmt::ostream_formatter {}; +template <> struct fmt::formatter<ceph::buffer::list> : fmt::ostream_formatter {}; +#endif diff --git a/src/crimson/os/seastore/segment_manager.cc b/src/crimson/os/seastore/segment_manager.cc new file mode 100644 index 000000000..1be9cce5f --- /dev/null +++ b/src/crimson/os/seastore/segment_manager.cc @@ -0,0 +1,107 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "crimson/os/seastore/segment_manager.h" +#include "crimson/os/seastore/segment_manager/block.h" +#include "crimson/os/seastore/logging.h" + +#ifdef HAVE_ZNS +#include "crimson/os/seastore/segment_manager/zbd.h" +SET_SUBSYS(seastore_device); +#endif + + +namespace crimson::os::seastore { + +std::ostream& operator<<(std::ostream& out, const block_shard_info_t& sf) +{ + out << "(" + << "size=" << sf.size + << ", segments=" <<sf.segments + << ", tracker_offset=" <<sf.tracker_offset + << ", first_segment_offset=" <<sf.first_segment_offset + <<")"; + return out; +} + +std::ostream& operator<<(std::ostream& out, const block_sm_superblock_t& sb) +{ + out << "superblock(" + << "shard_num=" << sb.shard_num + << ", segment_size=" << sb.segment_size + << ", block_size=" << sb.block_size + << ", shard_info:"; + for (auto &sf : sb.shard_infos) { + out << sf + << ","; + } + out << "config=" << sb.config + << ")"; + return out; +} + +std::ostream& operator<<(std::ostream &out, Segment::segment_state_t s) +{ + using state_t = Segment::segment_state_t; + switch (s) { + case state_t::EMPTY: + return out << "EMPTY"; + case state_t::OPEN: + return out << "OPEN"; + case state_t::CLOSED: + return out << "CLOSED"; + default: + return out << "INVALID_SEGMENT_STATE!"; + } +} + +seastar::future<crimson::os::seastore::SegmentManagerRef> +SegmentManager::get_segment_manager( + const std::string &device, device_type_t dtype) +{ +#ifdef HAVE_ZNS +LOG_PREFIX(SegmentManager::get_segment_manager); + return seastar::do_with( + static_cast<size_t>(0), + [FNAME, + dtype, + device](auto &nr_zones) { + return seastar::open_file_dma( + device + "/block", + seastar::open_flags::rw + ).then([FNAME, + dtype, + device, + &nr_zones](auto file) { + return seastar::do_with( + file, + [&nr_zones](auto &f) -> seastar::future<int> { + ceph_assert(f); + return f.ioctl(BLKGETNRZONES, (void *)&nr_zones); + }); + }).then([FNAME, + dtype, + device, + &nr_zones](auto ret) -> crimson::os::seastore::SegmentManagerRef { + crimson::os::seastore::SegmentManagerRef sm; + INFO("Found {} zones.", nr_zones); + if (nr_zones != 0) { + return std::make_unique< + segment_manager::zbd::ZBDSegmentManager + >(device + "/block"); + } else { + return std::make_unique< + segment_manager::block::BlockSegmentManager + >(device + "/block", dtype); + } + }); + }); +#else + return seastar::make_ready_future<crimson::os::seastore::SegmentManagerRef>( + std::make_unique< + segment_manager::block::BlockSegmentManager + >(device + "/block", dtype)); +#endif +} + +} diff --git a/src/crimson/os/seastore/segment_manager.h b/src/crimson/os/seastore/segment_manager.h new file mode 100644 index 000000000..719fa6075 --- /dev/null +++ b/src/crimson/os/seastore/segment_manager.h @@ -0,0 +1,216 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <iosfwd> + +#include <boost/intrusive_ptr.hpp> +#include <boost/smart_ptr/intrusive_ref_counter.hpp> +#include <boost/iterator/counting_iterator.hpp> +#include <seastar/core/future.hh> + +#include "include/buffer_fwd.h" +#include "include/ceph_assert.h" + +#include "crimson/common/config_proxy.h" +#include "crimson/os/seastore/seastore_types.h" +#include "crimson/osd/exceptions.h" +#include "device.h" + +namespace crimson::os::seastore { + +using std::vector; +struct block_shard_info_t { + std::size_t size; + std::size_t segments; + uint64_t tracker_offset; + uint64_t first_segment_offset; + + DENC(block_shard_info_t, v, p) { + DENC_START(1, 1, p); + denc(v.size, p); + denc(v.segments, p); + denc(v.tracker_offset, p); + denc(v.first_segment_offset, p); + DENC_FINISH(p); + } +}; + +struct block_sm_superblock_t { + unsigned int shard_num = 0; + size_t segment_size = 0; + size_t block_size = 0; + + std::vector<block_shard_info_t> shard_infos; + + device_config_t config; + + DENC(block_sm_superblock_t, v, p) { + DENC_START(1, 1, p); + denc(v.shard_num, p); + denc(v.segment_size, p); + denc(v.block_size, p); + denc(v.shard_infos, p); + denc(v.config, p); + DENC_FINISH(p); + } + + void validate() const { + ceph_assert(shard_num == seastar::smp::count); + ceph_assert(block_size > 0); + ceph_assert(segment_size > 0 && + segment_size % block_size == 0); + ceph_assert_always(segment_size <= SEGMENT_OFF_MAX); + for (unsigned int i = 0; i < seastar::smp::count; i ++) { + ceph_assert(shard_infos[i].size > segment_size && + shard_infos[i].size % block_size == 0); + ceph_assert_always(shard_infos[i].size <= DEVICE_OFF_MAX); + ceph_assert(shard_infos[i].segments > 0); + ceph_assert_always(shard_infos[i].segments <= DEVICE_SEGMENT_ID_MAX); + ceph_assert(shard_infos[i].tracker_offset > 0 && + shard_infos[i].tracker_offset % block_size == 0); + ceph_assert(shard_infos[i].first_segment_offset > shard_infos[i].tracker_offset && + shard_infos[i].first_segment_offset % block_size == 0); + } + ceph_assert(config.spec.magic != 0); + ceph_assert(get_default_backend_of_device(config.spec.dtype) == + backend_type_t::SEGMENTED); + ceph_assert(config.spec.id <= DEVICE_ID_MAX_VALID); + if (!config.major_dev) { + ceph_assert(config.secondary_devices.size() == 0); + } + for (const auto& [k, v] : config.secondary_devices) { + ceph_assert(k != config.spec.id); + ceph_assert(k <= DEVICE_ID_MAX_VALID); + ceph_assert(k == v.id); + ceph_assert(v.magic != 0); + ceph_assert(v.dtype > device_type_t::NONE); + ceph_assert(v.dtype < device_type_t::NUM_TYPES); + } + } +}; + +std::ostream& operator<<(std::ostream&, const block_shard_info_t&); +std::ostream& operator<<(std::ostream&, const block_sm_superblock_t&); + +class Segment : public boost::intrusive_ref_counter< + Segment, + boost::thread_unsafe_counter>{ +public: + + enum class segment_state_t : uint8_t { + EMPTY = 0, + OPEN = 1, + CLOSED = 2 + }; + + /** + * get_segment_id + */ + virtual segment_id_t get_segment_id() const = 0; + + /** + * min next write location + */ + virtual segment_off_t get_write_ptr() const = 0; + + /** + * max capacity + */ + virtual segment_off_t get_write_capacity() const = 0; + + /** + * close + * + * Closes segment for writes. Won't complete until + * outstanding writes to this segment are complete. 
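 *
 * Rough lifecycle sketch (editor's illustration, not code from this header;
 * `sm`, `seg_id` and `bl` are assumed to be a SegmentManager, a segment_id_t
 * and a block-aligned bufferlist):
 *
 *   sm.open(seg_id).safe_then([bl](SegmentRef seg) {
 *     return seg->write(seg->get_write_ptr(), bl
 *     ).safe_then([seg] { return seg->close(); });
 *   });
 *
 * (A real caller also has to reconcile the different errorators of open(),
 * write() and close(), e.g. via handle_error().)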
+ */ + using close_ertr = crimson::errorator< + crimson::ct_error::input_output_error, + crimson::ct_error::invarg, + crimson::ct_error::enoent>; + virtual close_ertr::future<> close() = 0; + + + /** + * write + * + * @param offset offset of write, must be aligned to <> and >= write pointer, advances + * write pointer + * @param bl buffer to write, will be padded if not aligned + */ + using write_ertr = crimson::errorator< + crimson::ct_error::input_output_error, // media error or corruption + crimson::ct_error::invarg, // if offset is < write pointer or misaligned + crimson::ct_error::ebadf, // segment closed + crimson::ct_error::enospc // write exceeds segment size + >; + virtual write_ertr::future<> write( + segment_off_t offset, ceph::bufferlist bl) = 0; + + /** + * advance_wp + * + * advance the segment write pointer, + * needed when writing at wp is strictly implemented. ex: ZBD backed segments + * @param offset: advance write pointer till the given offset + */ + virtual write_ertr::future<> advance_wp( + segment_off_t offset) = 0; + + virtual ~Segment() {} +}; +using SegmentRef = boost::intrusive_ptr<Segment>; + +std::ostream& operator<<(std::ostream& out, Segment::segment_state_t); + +constexpr size_t PADDR_SIZE = sizeof(paddr_t); +class SegmentManager; + +using SegmentManagerRef = std::unique_ptr<SegmentManager>; + +class SegmentManager : public Device { +public: + backend_type_t get_backend_type() const final { + return backend_type_t::SEGMENTED; + } + + using open_ertr = crimson::errorator< + crimson::ct_error::input_output_error, + crimson::ct_error::invarg, + crimson::ct_error::enoent>; + virtual open_ertr::future<SegmentRef> open(segment_id_t id) = 0; + + using release_ertr = crimson::errorator< + crimson::ct_error::input_output_error, + crimson::ct_error::invarg, + crimson::ct_error::enoent>; + virtual release_ertr::future<> release(segment_id_t id) = 0; + + /* Methods for discovering device geometry, segmentid set, etc */ + virtual segment_off_t get_segment_size() const = 0; + virtual device_segment_id_t get_num_segments() const { + ceph_assert(get_available_size() % get_segment_size() == 0); + return ((device_segment_id_t)(get_available_size() / get_segment_size())); + } + + virtual ~SegmentManager() {} + + static seastar::future<SegmentManagerRef> + get_segment_manager(const std::string &device, device_type_t dtype); +}; + +} + +WRITE_CLASS_DENC( + crimson::os::seastore::block_shard_info_t +) +WRITE_CLASS_DENC( + crimson::os::seastore::block_sm_superblock_t +) + +#if FMT_VERSION >= 90000 +template <> struct fmt::formatter<crimson::os::seastore::block_shard_info_t> : fmt::ostream_formatter {}; +template <> struct fmt::formatter<crimson::os::seastore::block_sm_superblock_t> : fmt::ostream_formatter {}; +#endif diff --git a/src/crimson/os/seastore/segment_manager/block.cc b/src/crimson/os/seastore/segment_manager/block.cc new file mode 100644 index 000000000..4eb8d60b2 --- /dev/null +++ b/src/crimson/os/seastore/segment_manager/block.cc @@ -0,0 +1,810 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <sys/mman.h> +#include <string.h> + +#include <fmt/format.h> + +#include <seastar/core/metrics.hh> + +#include "include/buffer.h" + +#include "crimson/common/config_proxy.h" +#include "crimson/common/errorator-loop.h" + +#include "crimson/os/seastore/logging.h" +#include "crimson/os/seastore/segment_manager/block.h" + +SET_SUBSYS(seastore_device); +/* + * format: + * - D<device-id> S<segment-id> offset=<off>~<len> 
poffset=<off> information + * - D<device-id> poffset=<off>~<len> information + * + * levels: + * - INFO: major initiation, closing and segment operations + * - DEBUG: INFO details, major read and write operations + * - TRACE: DEBUG details + */ + +using segment_state_t = crimson::os::seastore::Segment::segment_state_t; + +template <> struct fmt::formatter<segment_state_t>: fmt::formatter<std::string_view> { + // parse is inherited from formatter<string_view>. + template <typename FormatContext> + auto format(segment_state_t s, FormatContext& ctx) { + std::string_view name = "unknown"; + switch (s) { + case segment_state_t::EMPTY: + name = "empty"; + break; + case segment_state_t::OPEN: + name = "open"; + break; + case segment_state_t::CLOSED: + name = "closed"; + break; + } + return formatter<string_view>::format(name, ctx); + } +}; + +namespace crimson::os::seastore::segment_manager::block { + +static write_ertr::future<> do_write( + device_id_t device_id, + seastar::file &device, + uint64_t offset, + bufferptr &bptr) +{ + LOG_PREFIX(block_do_write); + auto len = bptr.length(); + TRACE("{} poffset={}~{} ...", + device_id_printer_t{device_id}, offset, len); + return device.dma_write( + offset, + bptr.c_str(), + len + ).handle_exception( + [FNAME, device_id, offset, len](auto e) -> write_ertr::future<size_t> { + ERROR("{} poffset={}~{} got error -- {}", + device_id_printer_t{device_id}, offset, len, e); + return crimson::ct_error::input_output_error::make(); + }).then([FNAME, device_id, offset, len](auto result) -> write_ertr::future<> { + if (result != len) { + ERROR("{} poffset={}~{} write len={} inconsistent", + device_id_printer_t{device_id}, offset, len, result); + return crimson::ct_error::input_output_error::make(); + } + TRACE("{} poffset={}~{} done", device_id_printer_t{device_id}, offset, len); + return write_ertr::now(); + }); +} + +static write_ertr::future<> do_writev( + device_id_t device_id, + seastar::file &device, + uint64_t offset, + bufferlist&& bl, + size_t block_size) +{ + LOG_PREFIX(block_do_writev); + TRACE("{} poffset={}~{}, {} buffers", + device_id_printer_t{device_id}, offset, bl.length(), bl.get_num_buffers()); + + // writev requires each buffer to be aligned to the disks' block + // size, we need to rebuild here + bl.rebuild_aligned(block_size); + + return seastar::do_with( + bl.prepare_iovs(), + std::move(bl), + [&device, device_id, offset, FNAME](auto& iovs, auto& bl) + { + return write_ertr::parallel_for_each( + iovs, + [&device, device_id, offset, FNAME](auto& p) mutable + { + auto off = offset + p.offset; + auto len = p.length; + auto& iov = p.iov; + TRACE("{} poffset={}~{} dma_write ...", + device_id_printer_t{device_id}, off, len); + return device.dma_write(off, std::move(iov) + ).handle_exception( + [FNAME, device_id, off, len](auto e) -> write_ertr::future<size_t> + { + ERROR("{} poffset={}~{} dma_write got error -- {}", + device_id_printer_t{device_id}, off, len, e); + return crimson::ct_error::input_output_error::make(); + }).then([FNAME, device_id, off, len](size_t written) -> write_ertr::future<> { + if (written != len) { + ERROR("{} poffset={}~{} dma_write len={} inconsistent", + device_id_printer_t{device_id}, off, len, written); + return crimson::ct_error::input_output_error::make(); + } + TRACE("{} poffset={}~{} dma_write done", + device_id_printer_t{device_id}, off, len); + return write_ertr::now(); + }); + }); + }); +} + +static read_ertr::future<> do_read( + device_id_t device_id, + seastar::file &device, + uint64_t offset, + size_t len, + 
bufferptr &bptr) +{ + LOG_PREFIX(block_do_read); + TRACE("{} poffset={}~{} ...", device_id_printer_t{device_id}, offset, len); + assert(len <= bptr.length()); + return device.dma_read( + offset, + bptr.c_str(), + len + ).handle_exception( + //FIXME: this is a little bit tricky, since seastar::future<T>::handle_exception + // returns seastar::future<T>, to return an crimson::ct_error, we have to create + // a seastar::future<T> holding that crimson::ct_error. This is not necessary + // once seastar::future<T>::handle_exception() returns seastar::futurize_t<T> + [FNAME, device_id, offset, len](auto e) -> read_ertr::future<size_t> + { + ERROR("{} poffset={}~{} got error -- {}", + device_id_printer_t{device_id}, offset, len, e); + return crimson::ct_error::input_output_error::make(); + }).then([FNAME, device_id, offset, len](auto result) -> read_ertr::future<> { + if (result != len) { + ERROR("{} poffset={}~{} read len={} inconsistent", + device_id_printer_t{device_id}, offset, len, result); + return crimson::ct_error::input_output_error::make(); + } + TRACE("{} poffset={}~{} done", device_id_printer_t{device_id}, offset, len); + return read_ertr::now(); + }); +} + +write_ertr::future<> +SegmentStateTracker::write_out( + device_id_t device_id, + seastar::file &device, + uint64_t offset) +{ + LOG_PREFIX(SegmentStateTracker::write_out); + DEBUG("{} poffset={}~{}", + device_id_printer_t{device_id}, offset, bptr.length()); + return do_write(device_id, device, offset, bptr); +} + +write_ertr::future<> +SegmentStateTracker::read_in( + device_id_t device_id, + seastar::file &device, + uint64_t offset) +{ + LOG_PREFIX(SegmentStateTracker::read_in); + DEBUG("{} poffset={}~{}", + device_id_printer_t{device_id}, offset, bptr.length()); + return do_read( + device_id, + device, + offset, + bptr.length(), + bptr); +} +using std::vector; +static +block_sm_superblock_t make_superblock( + device_id_t device_id, + device_config_t sm_config, + const seastar::stat_data &data) +{ + LOG_PREFIX(block_make_superblock); + using crimson::common::get_conf; + + auto config_size = get_conf<Option::size_t>( + "seastore_device_size"); + + size_t size = (data.size == 0) ? 
config_size : data.size;
+
+  auto config_segment_size = get_conf<Option::size_t>(
+    "seastore_segment_size");
+  size_t raw_segments = size / config_segment_size;
+  size_t shard_tracker_size = SegmentStateTracker::get_raw_size(
+    raw_segments / seastar::smp::count,
+    data.block_size);
+  size_t total_tracker_size = shard_tracker_size * seastar::smp::count;
+  size_t tracker_off = data.block_size; //superblock
+  size_t segments = (size - tracker_off - total_tracker_size) / config_segment_size;
+  size_t segments_per_shard = segments / seastar::smp::count;
+
+  vector<block_shard_info_t> shard_infos(seastar::smp::count);
+  for (unsigned int i = 0; i < seastar::smp::count; i++) {
+    shard_infos[i].size = segments_per_shard * config_segment_size;
+    shard_infos[i].segments = segments_per_shard;
+    shard_infos[i].tracker_offset = tracker_off + i * shard_tracker_size;
+    shard_infos[i].first_segment_offset = tracker_off + total_tracker_size
+      + i * segments_per_shard * config_segment_size;
+  }
+
+  INFO("{} disk_size={}, segment_size={}, block_size={}",
+       device_id_printer_t{device_id},
+       size,
+       uint64_t(config_segment_size),
+       data.block_size);
+  for (unsigned int i = 0; i < seastar::smp::count; i++) {
+    INFO("shard {} infos: {}", i, shard_infos[i]);
+  }
+
+  return block_sm_superblock_t{
+    seastar::smp::count,
+    config_segment_size,
+    data.block_size,
+    shard_infos,
+    std::move(sm_config)
+  };
+}
+
+using check_create_device_ertr = BlockSegmentManager::access_ertr;
+using check_create_device_ret = check_create_device_ertr::future<>;
+static check_create_device_ret check_create_device(
+  const std::string &path,
+  size_t size)
+{
+  LOG_PREFIX(block_check_create_device);
+  INFO("path={}, size={}", path, size);
+  return seastar::open_file_dma(
+    path,
+    seastar::open_flags::exclusive |
+    seastar::open_flags::rw |
+    seastar::open_flags::create
+  ).then([size, FNAME, &path](auto file) {
+    return seastar::do_with(
+      file,
+      [size, FNAME, &path](auto &f) -> seastar::future<>
+    {
+      DEBUG("path={} created, truncating to {}", path, size);
+      ceph_assert(f);
+      return f.truncate(
+        size
+      ).then([&f, size] {
+        return f.allocate(0, size);
+      }).finally([&f] {
+        return f.close();
+      });
+    });
+  }).then_wrapped([&path, FNAME](auto f) -> check_create_device_ret {
+    if (f.failed()) {
+      try {
+        f.get();
+        return seastar::now();
+      } catch (const std::system_error &e) {
+        if (e.code().value() == EEXIST) {
+          ERROR("path={} exists", path);
+          return seastar::now();
+        } else {
+          ERROR("path={} creation error -- {}", path, e);
+          return crimson::ct_error::input_output_error::make();
+        }
+      } catch (...) 
{ + ERROR("path={} creation error", path); + return crimson::ct_error::input_output_error::make(); + } + } + + DEBUG("path={} complete", path); + std::ignore = f.discard_result(); + return seastar::now(); + }); +} + +using open_device_ret = + BlockSegmentManager::access_ertr::future< + std::pair<seastar::file, seastar::stat_data> + >; +static +open_device_ret open_device( + const std::string &path) +{ + LOG_PREFIX(block_open_device); + return seastar::file_stat(path, seastar::follow_symlink::yes + ).then([&path, FNAME](auto stat) mutable { + return seastar::open_file_dma( + path, + seastar::open_flags::rw | seastar::open_flags::dsync + ).then([stat, &path, FNAME](auto file) mutable { + return file.size().then([stat, file, &path, FNAME](auto size) mutable { + stat.size = size; + INFO("path={} successful, size={}, block_size={}", + path, stat.size, stat.block_size); + return std::make_pair(file, stat); + }); + }); + }).handle_exception([FNAME, &path](auto e) -> open_device_ret { + ERROR("path={} got error -- {}", path, e); + return crimson::ct_error::input_output_error::make(); + }); +} + + +static +BlockSegmentManager::access_ertr::future<> +write_superblock( + device_id_t device_id, + seastar::file &device, + block_sm_superblock_t sb) +{ + LOG_PREFIX(block_write_superblock); + DEBUG("{} write {}", device_id_printer_t{device_id}, sb); + sb.validate(); + assert(ceph::encoded_sizeof<block_sm_superblock_t>(sb) < + sb.block_size); + return seastar::do_with( + bufferptr(ceph::buffer::create_page_aligned(sb.block_size)), + [=, &device](auto &bp) + { + bufferlist bl; + encode(sb, bl); + auto iter = bl.begin(); + assert(bl.length() < sb.block_size); + iter.copy(bl.length(), bp.c_str()); + return do_write(device_id, device, 0, bp); + }); +} + +static +BlockSegmentManager::access_ertr::future<block_sm_superblock_t> +read_superblock(seastar::file &device, seastar::stat_data sd) +{ + LOG_PREFIX(block_read_superblock); + DEBUG("reading superblock ..."); + return seastar::do_with( + bufferptr(ceph::buffer::create_page_aligned(sd.block_size)), + [=, &device](auto &bp) + { + return do_read( + DEVICE_ID_NULL, // unknown + device, + 0, + bp.length(), + bp + ).safe_then([=, &bp] { + bufferlist bl; + bl.push_back(bp); + block_sm_superblock_t ret; + auto bliter = bl.cbegin(); + try { + decode(ret, bliter); + } catch (...) 
{ + ERROR("got decode error!"); + ceph_assert(0 == "invalid superblock"); + } + assert(ceph::encoded_sizeof<block_sm_superblock_t>(ret) < + sd.block_size); + return BlockSegmentManager::access_ertr::future<block_sm_superblock_t>( + BlockSegmentManager::access_ertr::ready_future_marker{}, + ret); + }); + }); +} + +BlockSegment::BlockSegment( + BlockSegmentManager &manager, segment_id_t id) + : manager(manager), id(id) {} + +segment_off_t BlockSegment::get_write_capacity() const +{ + return manager.get_segment_size(); +} + +Segment::close_ertr::future<> BlockSegment::close() +{ + return manager.segment_close(id, write_pointer); +} + +Segment::write_ertr::future<> BlockSegment::write( + segment_off_t offset, ceph::bufferlist bl) +{ + LOG_PREFIX(BlockSegment::write); + auto paddr = paddr_t::make_seg_paddr(id, offset); + DEBUG("{} offset={}~{} poffset={} ...", + id, offset, bl.length(), manager.get_offset(paddr)); + + if (offset < write_pointer || + offset % manager.superblock.block_size != 0 || + bl.length() % manager.superblock.block_size != 0) { + ERROR("{} offset={}~{} poffset={} invalid write", + id, offset, bl.length(), manager.get_offset(paddr)); + return crimson::ct_error::invarg::make(); + } + + if (offset + bl.length() > manager.superblock.segment_size) { + ERROR("{} offset={}~{} poffset={} write out of the range {}", + id, offset, bl.length(), manager.get_offset(paddr), + manager.superblock.segment_size); + return crimson::ct_error::enospc::make(); + } + + write_pointer = offset + bl.length(); + return manager.segment_write(paddr, bl); +} + +Segment::write_ertr::future<> BlockSegment::advance_wp( + segment_off_t offset) { + return write_ertr::now(); +} + +Segment::close_ertr::future<> BlockSegmentManager::segment_close( + segment_id_t id, segment_off_t write_pointer) +{ + LOG_PREFIX(BlockSegmentManager::segment_close); + auto s_id = id.device_segment_id(); + int unused_bytes = get_segment_size() - write_pointer; + INFO("{} unused_bytes={} ...", id, unused_bytes); + + assert(unused_bytes >= 0); + assert(id.device_id() == get_device_id()); + assert(tracker); + + tracker->set(s_id, segment_state_t::CLOSED); + ++stats.closed_segments; + stats.closed_segments_unused_bytes += unused_bytes; + stats.metadata_write.increment(tracker->get_size()); + return tracker->write_out( + get_device_id(), device, + shard_info.tracker_offset); +} + +Segment::write_ertr::future<> BlockSegmentManager::segment_write( + paddr_t addr, + ceph::bufferlist bl, + bool ignore_check) +{ + assert(addr.get_device_id() == get_device_id()); + assert((bl.length() % superblock.block_size) == 0); + stats.data_write.increment(bl.length()); + return do_writev( + get_device_id(), + device, + get_offset(addr), + std::move(bl), + superblock.block_size); +} + +BlockSegmentManager::~BlockSegmentManager() +{ +} + +BlockSegmentManager::mount_ret BlockSegmentManager::mount() +{ + return shard_devices.invoke_on_all([](auto &local_device) { + return local_device.shard_mount( + ).handle_error( + crimson::ct_error::assert_all{ + "Invalid error in BlockSegmentManager::mount" + }); + }); +} + +BlockSegmentManager::mount_ret BlockSegmentManager::shard_mount() +{ + LOG_PREFIX(BlockSegmentManager::shard_mount); + return open_device( + device_path + ).safe_then([=, this](auto p) { + device = std::move(p.first); + auto sd = p.second; + return read_superblock(device, sd); + }).safe_then([=, this](auto sb) { + set_device_id(sb.config.spec.id); + shard_info = sb.shard_infos[seastar::this_shard_id()]; + INFO("{} read {}", 
device_id_printer_t{get_device_id()}, shard_info); + sb.validate(); + superblock = sb; + stats.data_read.increment( + ceph::encoded_sizeof<block_sm_superblock_t>(superblock)); + tracker = std::make_unique<SegmentStateTracker>( + shard_info.segments, + superblock.block_size); + stats.data_read.increment(tracker->get_size()); + return tracker->read_in( + get_device_id(), + device, + shard_info.tracker_offset + ).safe_then([this] { + for (device_segment_id_t i = 0; i < tracker->get_capacity(); ++i) { + if (tracker->get(i) == segment_state_t::OPEN) { + tracker->set(i, segment_state_t::CLOSED); + } + } + stats.metadata_write.increment(tracker->get_size()); + return tracker->write_out( + get_device_id(), device, + shard_info.tracker_offset); + }); + }).safe_then([this, FNAME] { + INFO("{} complete", device_id_printer_t{get_device_id()}); + register_metrics(); + }); +} + +BlockSegmentManager::mkfs_ret BlockSegmentManager::mkfs( + device_config_t sm_config) +{ + return shard_devices.local().primary_mkfs(sm_config + ).safe_then([this] { + return shard_devices.invoke_on_all([](auto &local_device) { + return local_device.shard_mkfs( + ).handle_error( + crimson::ct_error::assert_all{ + "Invalid error in BlockSegmentManager::mkfs" + }); + }); + }); +} + +BlockSegmentManager::mkfs_ret BlockSegmentManager::primary_mkfs( + device_config_t sm_config) +{ + LOG_PREFIX(BlockSegmentManager::primary_mkfs); + ceph_assert(sm_config.spec.dtype == superblock.config.spec.dtype); + set_device_id(sm_config.spec.id); + INFO("{} path={}, {}", + device_id_printer_t{get_device_id()}, device_path, sm_config); + return seastar::do_with( + seastar::file{}, + seastar::stat_data{}, + block_sm_superblock_t{}, + std::unique_ptr<SegmentStateTracker>(), + [=, this](auto &device, auto &stat, auto &sb, auto &tracker) + { + check_create_device_ret maybe_create = check_create_device_ertr::now(); + using crimson::common::get_conf; + if (get_conf<bool>("seastore_block_create")) { + auto size = get_conf<Option::size_t>("seastore_device_size"); + maybe_create = check_create_device(device_path, size); + } + + return maybe_create.safe_then([this] { + return open_device(device_path); + }).safe_then([&, sm_config](auto p) { + device = p.first; + stat = p.second; + sb = make_superblock(get_device_id(), sm_config, stat); + stats.metadata_write.increment( + ceph::encoded_sizeof<block_sm_superblock_t>(sb)); + return write_superblock(get_device_id(), device, sb); + }).finally([&] { + return device.close(); + }).safe_then([FNAME, this] { + INFO("{} complete", device_id_printer_t{get_device_id()}); + return mkfs_ertr::now(); + }); + }); +} + +BlockSegmentManager::mkfs_ret BlockSegmentManager::shard_mkfs() +{ + LOG_PREFIX(BlockSegmentManager::shard_mkfs); + return open_device( + device_path + ).safe_then([this](auto p) { + device = std::move(p.first); + auto sd = p.second; + return read_superblock(device, sd); + }).safe_then([this, FNAME](auto sb) { + set_device_id(sb.config.spec.id); + shard_info = sb.shard_infos[seastar::this_shard_id()]; + INFO("{} read {}", device_id_printer_t{get_device_id()}, shard_info); + sb.validate(); + tracker.reset(new SegmentStateTracker( + shard_info.segments, sb.block_size)); + stats.metadata_write.increment(tracker->get_size()); + return tracker->write_out( + get_device_id(), device, + shard_info.tracker_offset); + }).finally([this] { + return device.close(); + }).safe_then([FNAME, this] { + INFO("{} complete", device_id_printer_t{get_device_id()}); + return mkfs_ertr::now(); + }); +} + 
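// Editor's note (illustrative, not part of the upstream file): the entry
// points above are normally driven in this order by whichever component owns
// the sharded device; `sm` and `cfg` are assumed names:
//
//   sm.start();    // declared in block.h: instantiate the per-shard instances
//   sm.mkfs(cfg);  // format time only: primary_mkfs() on the calling shard,
//                  // then shard_mkfs() on every shard
//   sm.mount();    // every startup: shard_mount() on every shard reads the
//                  // superblock and that shard's SegmentStateTracker
//
// mkfs() and mount() return errorator futures, so a real caller chains them
// with safe_then()/handle_error() rather than calling them back to back.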
+BlockSegmentManager::close_ertr::future<> BlockSegmentManager::close() +{ + LOG_PREFIX(BlockSegmentManager::close); + INFO("{}", device_id_printer_t{get_device_id()}); + metrics.clear(); + return device.close(); +} + +SegmentManager::open_ertr::future<SegmentRef> BlockSegmentManager::open( + segment_id_t id) +{ + LOG_PREFIX(BlockSegmentManager::open); + auto s_id = id.device_segment_id(); + INFO("{} ...", id); + + assert(id.device_id() == get_device_id()); + + if (s_id >= get_num_segments()) { + ERROR("{} segment-id out of range {}", id, get_num_segments()); + return crimson::ct_error::invarg::make(); + } + + if (tracker->get(s_id) != segment_state_t::EMPTY) { + ERROR("{} invalid state {} != EMPTY", id, tracker->get(s_id)); + return crimson::ct_error::invarg::make(); + } + + tracker->set(s_id, segment_state_t::OPEN); + stats.metadata_write.increment(tracker->get_size()); + return tracker->write_out( + get_device_id(), device, + shard_info.tracker_offset + ).safe_then([this, id, FNAME] { + ++stats.opened_segments; + DEBUG("{} done", id); + return open_ertr::future<SegmentRef>( + open_ertr::ready_future_marker{}, + SegmentRef(new BlockSegment(*this, id))); + }); +} + +SegmentManager::release_ertr::future<> BlockSegmentManager::release( + segment_id_t id) +{ + LOG_PREFIX(BlockSegmentManager::release); + auto s_id = id.device_segment_id(); + INFO("{} ...", id); + + assert(id.device_id() == get_device_id()); + + if (s_id >= get_num_segments()) { + ERROR("{} segment-id out of range {}", id, get_num_segments()); + return crimson::ct_error::invarg::make(); + } + + if (tracker->get(s_id) != segment_state_t::CLOSED) { + ERROR("{} invalid state {} != CLOSED", id, tracker->get(s_id)); + return crimson::ct_error::invarg::make(); + } + + tracker->set(s_id, segment_state_t::EMPTY); + ++stats.released_segments; + stats.metadata_write.increment(tracker->get_size()); + return tracker->write_out( + get_device_id(), device, + shard_info.tracker_offset); +} + +SegmentManager::read_ertr::future<> BlockSegmentManager::read( + paddr_t addr, + size_t len, + ceph::bufferptr &out) +{ + LOG_PREFIX(BlockSegmentManager::read); + auto& seg_addr = addr.as_seg_paddr(); + auto id = seg_addr.get_segment_id(); + auto s_id = id.device_segment_id(); + auto s_off = seg_addr.get_segment_off(); + auto p_off = get_offset(addr); + DEBUG("{} offset={}~{} poffset={} ...", id, s_off, len, p_off); + + assert(addr.get_device_id() == get_device_id()); + + if (s_off % superblock.block_size != 0 || + len % superblock.block_size != 0) { + ERROR("{} offset={}~{} poffset={} invalid read", id, s_off, len, p_off); + return crimson::ct_error::invarg::make(); + } + + if (s_id >= get_num_segments()) { + ERROR("{} offset={}~{} poffset={} segment-id out of range {}", + id, s_off, len, p_off, get_num_segments()); + return crimson::ct_error::invarg::make(); + } + + if (s_off + len > superblock.segment_size) { + ERROR("{} offset={}~{} poffset={} read out of range {}", + id, s_off, len, p_off, superblock.segment_size); + return crimson::ct_error::invarg::make(); + } + + if (tracker->get(s_id) == segment_state_t::EMPTY) { + // XXX: not an error during scanning, + // might need refactor to increase the log level + DEBUG("{} offset={}~{} poffset={} invalid state {}", + id, s_off, len, p_off, tracker->get(s_id)); + return crimson::ct_error::enoent::make(); + } + + stats.data_read.increment(len); + return do_read( + get_device_id(), + device, + p_off, + len, + out); +} + +void BlockSegmentManager::register_metrics() +{ + 
LOG_PREFIX(BlockSegmentManager::register_metrics); + DEBUG("{}", device_id_printer_t{get_device_id()}); + namespace sm = seastar::metrics; + std::vector<sm::label_instance> label_instances; + label_instances.push_back(sm::label_instance("device_id", get_device_id())); + stats.reset(); + metrics.add_group( + "segment_manager", + { + sm::make_counter( + "data_read_num", + stats.data_read.num, + sm::description("total number of data read"), + label_instances + ), + sm::make_counter( + "data_read_bytes", + stats.data_read.bytes, + sm::description("total bytes of data read"), + label_instances + ), + sm::make_counter( + "data_write_num", + stats.data_write.num, + sm::description("total number of data write"), + label_instances + ), + sm::make_counter( + "data_write_bytes", + stats.data_write.bytes, + sm::description("total bytes of data write"), + label_instances + ), + sm::make_counter( + "metadata_write_num", + stats.metadata_write.num, + sm::description("total number of metadata write"), + label_instances + ), + sm::make_counter( + "metadata_write_bytes", + stats.metadata_write.bytes, + sm::description("total bytes of metadata write"), + label_instances + ), + sm::make_counter( + "opened_segments", + stats.opened_segments, + sm::description("total segments opened"), + label_instances + ), + sm::make_counter( + "closed_segments", + stats.closed_segments, + sm::description("total segments closed"), + label_instances + ), + sm::make_counter( + "closed_segments_unused_bytes", + stats.closed_segments_unused_bytes, + sm::description("total unused bytes of closed segments"), + label_instances + ), + sm::make_counter( + "released_segments", + stats.released_segments, + sm::description("total segments released"), + label_instances + ), + } + ); +} + +} diff --git a/src/crimson/os/seastore/segment_manager/block.h b/src/crimson/os/seastore/segment_manager/block.h new file mode 100644 index 000000000..495d0d104 --- /dev/null +++ b/src/crimson/os/seastore/segment_manager/block.h @@ -0,0 +1,262 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <boost/intrusive_ptr.hpp> +#include <boost/smart_ptr/intrusive_ref_counter.hpp> + +#include <seastar/core/file.hh> +#include <seastar/core/future.hh> +#include <seastar/core/reactor.hh> + +#include "crimson/common/layout.h" + +#include "crimson/os/seastore/segment_manager.h" + +namespace crimson::os::seastore::segment_manager::block { + +using write_ertr = crimson::errorator< + crimson::ct_error::input_output_error>; +using read_ertr = crimson::errorator< + crimson::ct_error::input_output_error>; + +/** + * SegmentStateTracker + * + * Tracks lifecycle state of each segment using space at the beginning + * of the drive. 
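 *
 * Editor's illustration (the numbers are hypothetical): one state byte is kept
 * per segment and the array is rounded up to whole device blocks, e.g.
 *
 *   SegmentStateTracker::get_raw_size(1000, 4096) == p2roundup(1000, 4096)
 *                                                 == 4096
 *
 * so a shard with 1000 segments and 4 KiB blocks persists all of its segment
 * states in a single 4 KiB block.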
+ */
+class SegmentStateTracker {
+  using segment_state_t = Segment::segment_state_t;
+
+  bufferptr bptr;
+
+  using L = absl::container_internal::Layout<uint8_t>;
+  const L layout;
+
+public:
+  static size_t get_raw_size(size_t segments, size_t block_size) {
+    return p2roundup(segments, block_size);
+  }
+
+  SegmentStateTracker(size_t segments, size_t block_size)
+    : bptr(ceph::buffer::create_page_aligned(
+        get_raw_size(segments, block_size))),
+      layout(bptr.length())
+  {
+    ::memset(
+      bptr.c_str(),
+      static_cast<char>(segment_state_t::EMPTY),
+      bptr.length());
+  }
+
+  size_t get_size() const {
+    return bptr.length();
+  }
+
+  size_t get_capacity() const {
+    return bptr.length();
+  }
+
+  segment_state_t get(device_segment_id_t offset) const {
+    assert(offset < get_capacity());
+    return static_cast<segment_state_t>(
+      layout.template Pointer<0>(
+        bptr.c_str())[offset]);
+  }
+
+  void set(device_segment_id_t offset, segment_state_t state) {
+    assert(offset < get_capacity());
+    layout.template Pointer<0>(bptr.c_str())[offset] =
+      static_cast<uint8_t>(state);
+  }
+
+  write_ertr::future<> write_out(
+    device_id_t device_id,
+    seastar::file &device,
+    uint64_t offset);
+
+  read_ertr::future<> read_in(
+    device_id_t device_id,
+    seastar::file &device,
+    uint64_t offset);
+};
+
+class BlockSegmentManager;
+class BlockSegment final : public Segment {
+  friend class BlockSegmentManager;
+  BlockSegmentManager &manager;
+  const segment_id_t id;
+  segment_off_t write_pointer = 0;
+public:
+  BlockSegment(BlockSegmentManager &manager, segment_id_t id);
+
+  segment_id_t get_segment_id() const final { return id; }
+  segment_off_t get_write_capacity() const final;
+  segment_off_t get_write_ptr() const final { return write_pointer; }
+  close_ertr::future<> close() final;
+  write_ertr::future<> write(segment_off_t offset, ceph::bufferlist bl) final;
+  write_ertr::future<> advance_wp(segment_off_t offset) final;
+
+  ~BlockSegment() {}
+};
+
+/**
+ * BlockSegmentManager
+ *
+ * Implements SegmentManager on a conventional block device.
+ * SegmentStateTracker uses space at the start of the device to store
+ * state analogous to that of the segments of a ZNS device. 
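 *
 * Editor's sketch of the resulting on-disk layout (as computed by
 * make_superblock() in block.cc; not to scale):
 *
 *   [ superblock: 1 block ]
 *   [ tracker, shard 0 ] ... [ tracker, shard N-1 ]
 *   [ segments, shard 0 ] ... [ segments, shard N-1 ]
 *
 * where N is seastar::smp::count and each shard records its own
 * tracker_offset and first_segment_offset in block_shard_info_t.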
+ */ +class BlockSegmentManager final : public SegmentManager { +// interfaces used by Device +public: + seastar::future<> start() { + return shard_devices.start(device_path, superblock.config.spec.dtype); + } + + seastar::future<> stop() { + return shard_devices.stop(); + } + + Device& get_sharded_device() final { + return shard_devices.local(); + } + mount_ret mount() final; + + mkfs_ret mkfs(device_config_t) final; +// interfaces used by each shard device +public: + close_ertr::future<> close(); + + BlockSegmentManager( + const std::string &path, + device_type_t dtype) + : device_path(path) { + ceph_assert(get_device_type() == device_type_t::NONE); + superblock.config.spec.dtype = dtype; + } + + ~BlockSegmentManager(); + + open_ertr::future<SegmentRef> open(segment_id_t id) final; + + release_ertr::future<> release(segment_id_t id) final; + + read_ertr::future<> read( + paddr_t addr, + size_t len, + ceph::bufferptr &out) final; + + device_type_t get_device_type() const final { + return superblock.config.spec.dtype; + } + size_t get_available_size() const final { + return shard_info.size; + } + extent_len_t get_block_size() const { + return superblock.block_size; + } + segment_off_t get_segment_size() const { + return superblock.segment_size; + } + + device_id_t get_device_id() const final { + assert(device_id <= DEVICE_ID_MAX_VALID); + return device_id; + } + secondary_device_set_t& get_secondary_devices() final { + return superblock.config.secondary_devices; + } + // public so tests can bypass segment interface when simpler + Segment::write_ertr::future<> segment_write( + paddr_t addr, + ceph::bufferlist bl, + bool ignore_check=false); + + magic_t get_magic() const final { + return superblock.config.spec.magic; + } + +private: + friend class BlockSegment; + using segment_state_t = Segment::segment_state_t; + + struct effort_t { + uint64_t num = 0; + uint64_t bytes = 0; + + void increment(uint64_t read_bytes) { + ++num; + bytes += read_bytes; + } + }; + + struct { + effort_t data_read; + effort_t data_write; + effort_t metadata_write; + uint64_t opened_segments; + uint64_t closed_segments; + uint64_t closed_segments_unused_bytes; + uint64_t released_segments; + + void reset() { + data_read = {}; + data_write = {}; + metadata_write = {}; + opened_segments = 0; + closed_segments = 0; + closed_segments_unused_bytes = 0; + released_segments = 0; + } + } stats; + + void register_metrics(); + seastar::metrics::metric_group metrics; + + std::string device_path; + std::unique_ptr<SegmentStateTracker> tracker; + block_shard_info_t shard_info; + block_sm_superblock_t superblock; + seastar::file device; + + void set_device_id(device_id_t id) { + assert(id <= DEVICE_ID_MAX_VALID); + assert(device_id == DEVICE_ID_NULL || + device_id == id); + device_id = id; + } + device_id_t device_id = DEVICE_ID_NULL; + + size_t get_offset(paddr_t addr) { + auto& seg_addr = addr.as_seg_paddr(); + return shard_info.first_segment_offset + + (seg_addr.get_segment_id().device_segment_id() * superblock.segment_size) + + seg_addr.get_segment_off(); + } + + const seastore_meta_t &get_meta() const { + return superblock.config.meta; + } + + std::vector<segment_state_t> segment_state; + + char *buffer = nullptr; + + Segment::close_ertr::future<> segment_close( + segment_id_t id, segment_off_t write_pointer); + +private: + // shard 0 mkfs + mkfs_ret primary_mkfs(device_config_t); + // all shards mkfs + mkfs_ret shard_mkfs(); + // all shards mount + mount_ret shard_mount(); + + seastar::sharded<BlockSegmentManager> 
shard_devices; +}; + +} diff --git a/src/crimson/os/seastore/segment_manager/ephemeral.cc b/src/crimson/os/seastore/segment_manager/ephemeral.cc new file mode 100644 index 000000000..4a4873afb --- /dev/null +++ b/src/crimson/os/seastore/segment_manager/ephemeral.cc @@ -0,0 +1,294 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <sys/mman.h> +#include <string.h> + +#include "seastar/core/sleep.hh" + +#include "crimson/common/log.h" + +#include "include/buffer.h" +#include "crimson/os/seastore/segment_manager/ephemeral.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_seastore_device); + } +} + +namespace crimson::os::seastore::segment_manager { + +std::ostream &operator<<(std::ostream &lhs, const ephemeral_config_t &c) { + return lhs << "ephemeral_config_t(size=" << c.size << ", block_size=" << c.block_size + << ", segment_size=" << c.segment_size << ")"; +} + +EphemeralSegmentManagerRef create_test_ephemeral() { + return EphemeralSegmentManagerRef( + new EphemeralSegmentManager(DEFAULT_TEST_EPHEMERAL)); +} + +device_config_t get_ephemeral_device_config( + std::size_t index, + std::size_t num_main_devices, + std::size_t num_cold_devices) +{ + auto num_devices = num_main_devices + num_cold_devices; + assert(num_devices > index); + auto get_sec_dtype = [num_main_devices](std::size_t idx) { + if (idx < num_main_devices) { + return device_type_t::EPHEMERAL_MAIN; + } else { + return device_type_t::EPHEMERAL_COLD; + } + }; + + magic_t magic = 0xabcd; + bool is_major_device; + secondary_device_set_t secondary_devices; + if (index == 0) { + is_major_device = true; + for (std::size_t secondary_index = index + 1; + secondary_index < num_devices; + ++secondary_index) { + device_id_t secondary_id = static_cast<device_id_t>(secondary_index); + secondary_devices.insert({ + secondary_index, + device_spec_t{ + magic, + get_sec_dtype(secondary_index), + secondary_id + } + }); + } + } else { // index > 0 + is_major_device = false; + } + + device_id_t id = static_cast<device_id_t>(index); + seastore_meta_t meta = {}; + return {is_major_device, + device_spec_t{ + magic, + get_sec_dtype(index), + id + }, + meta, + secondary_devices}; +} + +EphemeralSegment::EphemeralSegment( + EphemeralSegmentManager &manager, segment_id_t id) + : manager(manager), id(id) {} + +segment_off_t EphemeralSegment::get_write_capacity() const +{ + return manager.get_segment_size(); +} + +Segment::close_ertr::future<> EphemeralSegment::close() +{ + return manager.segment_close(id).safe_then([] { + return seastar::sleep(std::chrono::milliseconds(1)); + }); +} + +Segment::write_ertr::future<> EphemeralSegment::write( + segment_off_t offset, ceph::bufferlist bl) +{ + if (offset < write_pointer || offset % manager.config.block_size != 0) + return crimson::ct_error::invarg::make(); + + if (offset + bl.length() > (size_t)manager.get_segment_size()) + return crimson::ct_error::enospc::make(); + + return manager.segment_write(paddr_t::make_seg_paddr(id, offset), bl); +} + +Segment::write_ertr::future<> EphemeralSegment::advance_wp( + segment_off_t offset) +{ + return write_ertr::now(); +} + +Segment::close_ertr::future<> EphemeralSegmentManager::segment_close(segment_id_t id) +{ + auto s_id = id.device_segment_id(); + if (segment_state[s_id] != segment_state_t::OPEN) + return crimson::ct_error::invarg::make(); + + segment_state[s_id] = segment_state_t::CLOSED; + return Segment::close_ertr::now().safe_then([] { + return 
seastar::sleep(std::chrono::milliseconds(1)); + }); +} + +EphemeralSegmentManager::mkfs_ret +EphemeralSegmentManager::mkfs(device_config_t _config) +{ + logger().info( + "Mkfs ephemeral segment manager with {}", + _config); + device_config = _config; + return mkfs_ertr::now(); +} + +Segment::write_ertr::future<> EphemeralSegmentManager::segment_write( + paddr_t addr, + ceph::bufferlist bl, + bool ignore_check) +{ + auto& seg_addr = addr.as_seg_paddr(); + logger().debug( + "segment_write to segment {} at offset {}, physical offset {}, len {}, crc {}", + seg_addr.get_segment_id(), + seg_addr.get_segment_off(), + get_offset(addr), + bl.length(), + bl.crc32c(1)); + if (!ignore_check && segment_state[seg_addr.get_segment_id().device_segment_id()] + != segment_state_t::OPEN) + return crimson::ct_error::invarg::make(); + + bl.begin().copy(bl.length(), buffer + get_offset(addr)); + return Segment::write_ertr::now().safe_then([] { + return seastar::sleep(std::chrono::milliseconds(1)); + }); +} + +EphemeralSegmentManager::init_ertr::future<> EphemeralSegmentManager::init() +{ + logger().info( + "Initing ephemeral segment manager with config {}", + config); + + if (config.block_size % (4<<10) != 0) { + return crimson::ct_error::invarg::make(); + } + if (config.segment_size % config.block_size != 0) { + return crimson::ct_error::invarg::make(); + } + if (config.size % config.segment_size != 0) { + return crimson::ct_error::invarg::make(); + } + + void* addr = ::mmap( + nullptr, + config.size, + PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, + -1, + 0); + + segment_state.resize(config.size / config.segment_size, segment_state_t::EMPTY); + + if (addr == MAP_FAILED) + return crimson::ct_error::enospc::make(); + + buffer = (char*)addr; + + ::memset(buffer, 0, config.size); + return init_ertr::now().safe_then([] { + return seastar::sleep(std::chrono::milliseconds(1)); + }); +} + +EphemeralSegmentManager::~EphemeralSegmentManager() +{ + if (buffer) { + ::munmap(buffer, config.size); + } +} + +void EphemeralSegmentManager::remount() +{ + for (auto &i : segment_state) { + if (i == Segment::segment_state_t::OPEN) + i = Segment::segment_state_t::CLOSED; + } +} + +SegmentManager::open_ertr::future<SegmentRef> EphemeralSegmentManager::open( + segment_id_t id) +{ + auto s_id = id.device_segment_id(); + if (s_id >= get_num_segments()) { + logger().error("EphemeralSegmentManager::open: invalid segment {}", id); + return crimson::ct_error::invarg::make(); + } + + if (segment_state[s_id] != segment_state_t::EMPTY) { + logger().error("EphemeralSegmentManager::open: segment {} not empty", id); + return crimson::ct_error::invarg::make(); + } + + segment_state[s_id] = segment_state_t::OPEN; + return open_ertr::make_ready_future<SegmentRef>(new EphemeralSegment(*this, id)); +} + +SegmentManager::release_ertr::future<> EphemeralSegmentManager::release( + segment_id_t id) +{ + auto s_id = id.device_segment_id(); + logger().debug("EphemeralSegmentManager::release: {}", id); + + if (s_id >= get_num_segments()) { + logger().error( + "EphemeralSegmentManager::release: invalid segment {}", + id); + return crimson::ct_error::invarg::make(); + } + + if (segment_state[s_id] != segment_state_t::CLOSED) { + logger().error( + "EphemeralSegmentManager::release: segment id {} not closed", + id); + return crimson::ct_error::invarg::make(); + } + + ::memset(buffer + get_offset(paddr_t::make_seg_paddr(id, 0)), 0, config.segment_size); + segment_state[s_id] = segment_state_t::EMPTY; + return release_ertr::now().safe_then([] { + return 
seastar::sleep(std::chrono::milliseconds(1)); + }); +} + +SegmentManager::read_ertr::future<> EphemeralSegmentManager::read( + paddr_t addr, + size_t len, + ceph::bufferptr &out) +{ + auto& seg_addr = addr.as_seg_paddr(); + if (seg_addr.get_segment_id().device_segment_id() >= get_num_segments()) { + logger().error( + "EphemeralSegmentManager::read: invalid segment {}", + addr); + return crimson::ct_error::invarg::make(); + } + + if (seg_addr.get_segment_off() + len > config.segment_size) { + logger().error( + "EphemeralSegmentManager::read: invalid offset {}~{}!", + addr, + len); + return crimson::ct_error::invarg::make(); + } + + out.copy_in(0, len, buffer + get_offset(addr)); + + bufferlist bl; + bl.push_back(out); + logger().debug( + "segment_read to segment {} at offset {}, physical offset {}, length {}, crc {}", + seg_addr.get_segment_id().device_segment_id(), + seg_addr.get_segment_off(), + get_offset(addr), + len, + bl.begin().crc32c(len, 1)); + + return read_ertr::now().safe_then([] { + return seastar::sleep(std::chrono::milliseconds(1)); + }); +} + +} diff --git a/src/crimson/os/seastore/segment_manager/ephemeral.h b/src/crimson/os/seastore/segment_manager/ephemeral.h new file mode 100644 index 000000000..d7a3eb4a7 --- /dev/null +++ b/src/crimson/os/seastore/segment_manager/ephemeral.h @@ -0,0 +1,166 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <boost/intrusive_ptr.hpp> +#include <boost/smart_ptr/intrusive_ref_counter.hpp> +#include <seastar/core/future.hh> + +#include "crimson/os/seastore/segment_manager.h" + +#include "crimson/os/seastore/segment_manager/ephemeral.h" + +namespace crimson::os::seastore::segment_manager { + +class EphemeralSegmentManager; +using EphemeralSegmentManagerRef = std::unique_ptr<EphemeralSegmentManager>; + +struct ephemeral_config_t { + size_t size = 0; + size_t block_size = 0; + size_t segment_size = 0; + + void validate() const { + ceph_assert_always(size > 0); + ceph_assert_always(size <= DEVICE_OFF_MAX); + ceph_assert_always(segment_size > 0); + ceph_assert_always(segment_size <= SEGMENT_OFF_MAX); + ceph_assert_always(size / segment_size > 0); + ceph_assert_always(size / segment_size <= DEVICE_SEGMENT_ID_MAX); + } +}; + +constexpr ephemeral_config_t DEFAULT_TEST_EPHEMERAL = { + 1 << 30, + 4 << 10, + 8 << 20 +}; + +std::ostream &operator<<(std::ostream &, const ephemeral_config_t &); + +EphemeralSegmentManagerRef create_test_ephemeral(); + +device_config_t get_ephemeral_device_config( + std::size_t index, + std::size_t num_main_devices, + std::size_t num_cold_devices); + +class EphemeralSegment final : public Segment { + friend class EphemeralSegmentManager; + EphemeralSegmentManager &manager; + const segment_id_t id; + segment_off_t write_pointer = 0; +public: + EphemeralSegment(EphemeralSegmentManager &manager, segment_id_t id); + + segment_id_t get_segment_id() const final { return id; } + segment_off_t get_write_capacity() const final; + segment_off_t get_write_ptr() const final { return write_pointer; } + close_ertr::future<> close() final; + write_ertr::future<> write(segment_off_t offset, ceph::bufferlist bl) final; + write_ertr::future<> advance_wp(segment_off_t offset) final; + + ~EphemeralSegment() {} +}; + +class EphemeralSegmentManager final : public SegmentManager { + friend class EphemeralSegment; + using segment_state_t = Segment::segment_state_t; + + const ephemeral_config_t config; + std::optional<device_config_t> device_config; + + device_type_t 
get_device_type() const final { + assert(device_config); + return device_config->spec.dtype; + } + + size_t get_offset(paddr_t addr) { + auto& seg_addr = addr.as_seg_paddr(); + return (seg_addr.get_segment_id().device_segment_id() * config.segment_size) + + seg_addr.get_segment_off(); + } + + std::vector<segment_state_t> segment_state; + + char *buffer = nullptr; + + Segment::close_ertr::future<> segment_close(segment_id_t id); + +public: + EphemeralSegmentManager( + ephemeral_config_t config) + : config(config) { + config.validate(); + } + + ~EphemeralSegmentManager(); + + close_ertr::future<> close() final { + return close_ertr::now(); + } + + device_id_t get_device_id() const final { + assert(device_config); + return device_config->spec.id; + } + + mount_ret mount() final { + return mount_ertr::now(); + } + + mkfs_ret mkfs(device_config_t) final; + + open_ertr::future<SegmentRef> open(segment_id_t id) final; + + release_ertr::future<> release(segment_id_t id) final; + + read_ertr::future<> read( + paddr_t addr, + size_t len, + ceph::bufferptr &out) final; + + size_t get_available_size() const final { + return config.size; + } + extent_len_t get_block_size() const final { + return config.block_size; + } + segment_off_t get_segment_size() const final { + return config.segment_size; + } + + const seastore_meta_t &get_meta() const final { + assert(device_config); + return device_config->meta; + } + + secondary_device_set_t& get_secondary_devices() final { + assert(device_config); + return device_config->secondary_devices; + } + + magic_t get_magic() const final { + return device_config->spec.magic; + } + + using init_ertr = crimson::errorator< + crimson::ct_error::enospc, + crimson::ct_error::invarg>; + init_ertr::future<> init(); + + void remount(); + + // public so tests can bypass segment interface when simpler + Segment::write_ertr::future<> segment_write( + paddr_t addr, + ceph::bufferlist bl, + bool ignore_check=false); +}; + +} + +#if FMT_VERSION >= 90000 +template <> struct fmt::formatter<crimson::os::seastore::segment_manager::ephemeral_config_t> : fmt::ostream_formatter {}; +#endif diff --git a/src/crimson/os/seastore/segment_manager/zbd.cc b/src/crimson/os/seastore/segment_manager/zbd.cc new file mode 100644 index 000000000..88521a947 --- /dev/null +++ b/src/crimson/os/seastore/segment_manager/zbd.cc @@ -0,0 +1,823 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <sys/mman.h> +#include <string.h> +#include <linux/blkzoned.h> + +#include <fmt/format.h> +#include "crimson/os/seastore/segment_manager/zbd.h" +#include "crimson/common/config_proxy.h" +#include "crimson/os/seastore/logging.h" +#include "crimson/common/errorator-loop.h" +#include "include/buffer.h" + +SET_SUBSYS(seastore_device); + +#define SECT_SHIFT 9 +#define RESERVED_ZONES 1 +// limit the max padding buf size to 1MB +#define MAX_PADDING_SIZE 4194304 + +using z_op = crimson::os::seastore::segment_manager::zbd::zone_op; +template <> struct fmt::formatter<z_op>: fmt::formatter<std::string_view> { + template <typename FormatContext> + auto format(z_op s, FormatContext& ctx) { + std::string_view name = "Unknown"; + switch (s) { + using enum z_op; + case OPEN: + name = "BLKOPENZONE"; + break; + case FINISH: + name = "BLKFINISHZONE"; + break; + case CLOSE: + name = "BLKCLOSEZONE"; + break; + case RESET: + name = "BLKRESETZONE"; + break; + } + return formatter<string_view>::format(name, ctx); + } +}; + +namespace crimson::os::seastore::segment_manager::zbd { 
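+// Note: the zoned-device ioctls used below (BLKGETSIZE, BLKGETZONESZ,
+// BLKREPORTZONE, BLKOPENZONE/BLKFINISHZONE/BLKRESETZONE) express sizes and
+// offsets in 512-byte sectors, hence the SECT_SHIFT (9) conversions between
+// bytes and sectors throughout this file.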
+ +using open_device_ret = ZBDSegmentManager::access_ertr::future< + std::pair<seastar::file, seastar::stat_data>>; +static open_device_ret open_device( + const std::string &path, + seastar::open_flags mode) +{ + LOG_PREFIX(ZBDSegmentManager::open_device); + return seastar::file_stat( + path, seastar::follow_symlink::yes + ).then([FNAME, mode, &path](auto stat) mutable { + return seastar::open_file_dma(path, mode).then([=](auto file) { + DEBUG("open of device {} successful, size {}", + path, + stat.size); + return std::make_pair(file, stat); + }); + }).handle_exception( + [FNAME](auto e) -> open_device_ret { + ERROR("got error {}", + e); + return crimson::ct_error::input_output_error::make(); + } + ); +} + +static zbd_sm_metadata_t make_metadata( + uint64_t total_size, + seastore_meta_t meta, + const seastar::stat_data &data, + size_t zone_size_sectors, + size_t zone_capacity_sectors, + size_t nr_cnv_zones, + size_t num_zones) +{ + LOG_PREFIX(ZBDSegmentManager::make_metadata); + + // Using only SWR zones in a SMR drive, for now + auto skipped_zones = RESERVED_ZONES + nr_cnv_zones; + assert(num_zones > skipped_zones); + + // TODO: support Option::size_t seastore_segment_size + // to allow zones_per_segment > 1 with striping. + size_t zone_size = zone_size_sectors << SECT_SHIFT; + assert(total_size == num_zones * zone_size); + size_t zone_capacity = zone_capacity_sectors << SECT_SHIFT; + size_t segment_size = zone_size; + size_t zones_per_segment = segment_size / zone_size; + size_t segments = (num_zones - skipped_zones) / zones_per_segment; + size_t per_shard_segments = segments / seastar::smp::count; + size_t available_size = zone_capacity * segments; + size_t per_shard_available_size = zone_capacity * per_shard_segments; + + + WARN("Ignoring configuration values for device and segment size"); + INFO( + "device size: {}, available size: {}, block size: {}, allocated size: {}," + " total zones {}, zone size: {}, zone capacity: {}," + " total segments: {}, zones per segment: {}, segment size: {}" + " conv zones: {}, swr zones: {}, per shard segments: {}" + " per shard available size: {}", + total_size, + available_size, + data.block_size, + data.allocated_size, + num_zones, + zone_size, + zone_capacity, + segments, + zones_per_segment, + zone_capacity * zones_per_segment, + nr_cnv_zones, + num_zones - nr_cnv_zones, + per_shard_segments, + per_shard_available_size); + + std::vector<zbd_shard_info_t> shard_infos(seastar::smp::count); + for (unsigned int i = 0; i < seastar::smp::count; i++) { + shard_infos[i].size = per_shard_available_size; + shard_infos[i].segments = per_shard_segments; + shard_infos[i].first_segment_offset = zone_size * skipped_zones + + i * segment_size * per_shard_segments; + INFO("First segment offset for shard {} is: {}", + i, shard_infos[i].first_segment_offset); + } + + zbd_sm_metadata_t ret = zbd_sm_metadata_t{ + seastar::smp::count, + segment_size, + zone_capacity * zones_per_segment, + zones_per_segment, + zone_capacity, + data.block_size, + zone_size, + shard_infos, + meta}; + ret.validate(); + return ret; +} + +struct ZoneReport { + struct blk_zone_report *hdr; + ZoneReport(int nr_zones) + : hdr((blk_zone_report *)malloc( + sizeof(struct blk_zone_report) + nr_zones * sizeof(struct blk_zone))){;} + ~ZoneReport(){ + free(hdr); + } + ZoneReport(const ZoneReport &) = delete; + ZoneReport(ZoneReport &&rhs) : hdr(rhs.hdr) { + rhs.hdr = nullptr; + } +}; + +static seastar::future<size_t> get_blk_dev_size( + seastar::file &device) +{ + return seastar::do_with( + 
(uint64_t)0, + [&](auto& size_sects) { + return device.ioctl( + BLKGETSIZE, + (void *)&size_sects + ).then([&](int ret) { + ceph_assert(size_sects); + size_t size = size_sects << SECT_SHIFT; + return seastar::make_ready_future<size_t>(size); + }); + }); +} + +// zone_size should be in 512B sectors +static seastar::future<> reset_device( + seastar::file &device, + uint64_t zone_size_sects, + uint64_t nr_zones) +{ + return seastar::do_with( + blk_zone_range{}, + [&, nr_zones, zone_size_sects](auto &range) { + range.sector = 0; + range.nr_sectors = zone_size_sects * nr_zones; + return device.ioctl( + BLKRESETZONE, + &range + ).then([&](int ret){ + return seastar::now(); + }); + } + ); +} + +static seastar::future<size_t> get_zone_capacity( + seastar::file &device, + uint32_t nr_zones) +{ + return seastar::do_with( + ZoneReport(nr_zones), + [&](auto &zr) { + zr.hdr->sector = 0; + zr.hdr->nr_zones = nr_zones; + return device.ioctl( + BLKREPORTZONE, + zr.hdr + ).then([&](int ret) { + return seastar::make_ready_future<size_t>(zr.hdr->zones[0].capacity); + }); + } + ); +} + +// get the number of conventional zones of SMR HDD, +// they are randomly writable and don't respond to zone operations +static seastar::future<size_t> get_nr_cnv_zones( + seastar::file &device, + uint32_t nr_zones) +{ + return seastar::do_with( + ZoneReport(nr_zones), + [&](auto &zr) { + zr.hdr->sector = 0; + zr.hdr->nr_zones = nr_zones; + return device.ioctl( + BLKREPORTZONE, + zr.hdr + ).then([&, nr_zones](int ret) { + size_t cnv_zones = 0; + for (uint32_t i = 0; i < nr_zones; i++) { + if (zr.hdr->zones[i].type == BLK_ZONE_TYPE_CONVENTIONAL) + cnv_zones++; + } + return seastar::make_ready_future<size_t>(cnv_zones); + }); + } + ); +} + + +static write_ertr::future<> do_write( + seastar::file &device, + uint64_t offset, + bufferptr &bptr) +{ + LOG_PREFIX(ZBDSegmentManager::do_write); + DEBUG("offset {} len {}", + offset, + bptr.length()); + return device.dma_write( + offset, + bptr.c_str(), + bptr.length() + ).handle_exception( + [FNAME](auto e) -> write_ertr::future<size_t> { + ERROR("dma_write got error {}", + e); + return crimson::ct_error::input_output_error::make(); + } + ).then([length = bptr.length()](auto result) -> write_ertr::future<> { + if (result != length) { + return crimson::ct_error::input_output_error::make(); + } + return write_ertr::now(); + }); +} + +static write_ertr::future<> do_writev( + device_id_t device_id, + seastar::file &device, + uint64_t offset, + bufferlist&& bl, + size_t block_size) +{ + LOG_PREFIX(ZBDSegmentManager::do_writev); + DEBUG("{} offset {} len {}", + device_id_printer_t{device_id}, offset, bl.length()); + // writev requires each buffer to be aligned to the disks' block + // size, we need to rebuild here + bl.rebuild_aligned(block_size); + + return seastar::do_with( + bl.prepare_iovs(), + std::move(bl), + [&device, device_id, offset, FNAME](auto& iovs, auto& bl) + { + return write_ertr::parallel_for_each( + iovs, + [&device, device_id, offset, FNAME](auto& p) + { + auto off = offset + p.offset; + auto len = p.length; + auto& iov = p.iov; + DEBUG("{} poffset={}~{} dma_write ...", + device_id_printer_t{device_id}, + off, len); + return device.dma_write(off, std::move(iov) + ).handle_exception( + [FNAME, device_id, off, len](auto e) -> write_ertr::future<size_t> + { + ERROR("{} poffset={}~{} dma_write got error -- {}", + device_id_printer_t{device_id}, off, len, e); + return crimson::ct_error::input_output_error::make(); + }).then([FNAME, device_id, off, len](size_t written) -> 
write_ertr::future<> { + if (written != len) { + ERROR("{} poffset={}~{} dma_write len={} inconsistent", + device_id_printer_t{device_id}, off, len, written); + return crimson::ct_error::input_output_error::make(); + } + DEBUG("{} poffset={}~{} dma_write done", + device_id_printer_t{device_id}, + off, len); + return write_ertr::now(); + }); + }); + }); +} + +static ZBDSegmentManager::access_ertr::future<> +write_metadata(seastar::file &device, zbd_sm_metadata_t sb) +{ + assert(ceph::encoded_sizeof_bounded<zbd_sm_metadata_t>() < + sb.block_size); + return seastar::do_with( + bufferptr(ceph::buffer::create_page_aligned(sb.block_size)), + [=, &device](auto &bp) { + LOG_PREFIX(ZBDSegmentManager::write_metadata); + DEBUG("block_size {}", sb.block_size); + bufferlist bl; + encode(sb, bl); + auto iter = bl.begin(); + assert(bl.length() < sb.block_size); + DEBUG("buffer length {}", bl.length()); + iter.copy(bl.length(), bp.c_str()); + DEBUG("doing writeout"); + return do_write(device, 0, bp); + }); +} + +static read_ertr::future<> do_read( + seastar::file &device, + uint64_t offset, + size_t len, + bufferptr &bptr) +{ + LOG_PREFIX(ZBDSegmentManager::do_read); + assert(len <= bptr.length()); + DEBUG("offset {} len {}", + offset, + len); + return device.dma_read( + offset, + bptr.c_str(), + len + ).handle_exception( + [FNAME](auto e) -> read_ertr::future<size_t> { + ERROR("dma_read got error {}", + e); + return crimson::ct_error::input_output_error::make(); + } + ).then([len](auto result) -> read_ertr::future<> { + if (result != len) { + return crimson::ct_error::input_output_error::make(); + } + return read_ertr::now(); + }); +} + +static +ZBDSegmentManager::access_ertr::future<zbd_sm_metadata_t> +read_metadata(seastar::file &device, seastar::stat_data sd) +{ + assert(ceph::encoded_sizeof_bounded<zbd_sm_metadata_t>() < + sd.block_size); + return seastar::do_with( + bufferptr(ceph::buffer::create_page_aligned(sd.block_size)), + [=, &device](auto &bp) { + return do_read( + device, + 0, + bp.length(), + bp + ).safe_then([=, &bp] { + bufferlist bl; + bl.push_back(bp); + zbd_sm_metadata_t ret; + auto bliter = bl.cbegin(); + decode(ret, bliter); + ret.validate(); + return ZBDSegmentManager::access_ertr::future<zbd_sm_metadata_t>( + ZBDSegmentManager::access_ertr::ready_future_marker{}, + ret); + }); + }); +} + +ZBDSegmentManager::mount_ret ZBDSegmentManager::mount() +{ + return shard_devices.invoke_on_all([](auto &local_device) { + return local_device.shard_mount( + ).handle_error( + crimson::ct_error::assert_all{ + "Invalid error in ZBDSegmentManager::mount" + }); + }); +} + +ZBDSegmentManager::mount_ret ZBDSegmentManager::shard_mount() +{ + return open_device( + device_path, seastar::open_flags::rw + ).safe_then([=, this](auto p) { + device = std::move(p.first); + auto sd = p.second; + return read_metadata(device, sd); + }).safe_then([=, this](auto meta){ + shard_info = meta.shard_infos[seastar::this_shard_id()]; + metadata = meta; + return mount_ertr::now(); + }); +} + +ZBDSegmentManager::mkfs_ret ZBDSegmentManager::mkfs( + device_config_t config) +{ + return shard_devices.local().primary_mkfs(config + ).safe_then([this] { + return shard_devices.invoke_on_all([](auto &local_device) { + return local_device.shard_mkfs( + ).handle_error( + crimson::ct_error::assert_all{ + "Invalid error in ZBDSegmentManager::mkfs" + }); + }); + }); +} + +ZBDSegmentManager::mkfs_ret ZBDSegmentManager::primary_mkfs( + device_config_t config) +{ + LOG_PREFIX(ZBDSegmentManager::primary_mkfs); + INFO("starting, device_path 
{}", device_path); + return seastar::do_with( + seastar::file{}, + seastar::stat_data{}, + zbd_sm_metadata_t{}, + size_t(), + size_t(), + size_t(), + size_t(), + [=, this] + (auto &device, + auto &stat, + auto &sb, + auto &zone_size_sects, + auto &nr_zones, + auto &size, + auto &nr_cnv_zones) { + return open_device( + device_path, + seastar::open_flags::rw + ).safe_then([=, this, &device, &stat, &sb, &zone_size_sects, &nr_zones, &size, &nr_cnv_zones](auto p) { + device = p.first; + stat = p.second; + return device.ioctl( + BLKGETNRZONES, + (void *)&nr_zones + ).then([&](int ret) { + if (nr_zones == 0) { + return seastar::make_exception_future<int>( + std::system_error(std::make_error_code(std::errc::io_error))); + } + return device.ioctl(BLKGETZONESZ, (void *)&zone_size_sects); + }).then([&](int ret) { + ceph_assert(zone_size_sects); + return reset_device(device, zone_size_sects, nr_zones); + }).then([&] { + return get_blk_dev_size(device); + }).then([&](auto devsize) { + size = devsize; + return get_nr_cnv_zones(device, nr_zones); + }).then([&](auto cnv_zones) { + DEBUG("Found {} conventional zones", cnv_zones); + nr_cnv_zones = cnv_zones; + return get_zone_capacity(device, nr_zones); + }).then([&, FNAME, config](auto zone_capacity_sects) { + ceph_assert(zone_capacity_sects); + DEBUG("zone_size in sectors {}, zone_capacity in sectors {}", + zone_size_sects, zone_capacity_sects); + sb = make_metadata( + size, + config.meta, + stat, + zone_size_sects, + zone_capacity_sects, + nr_cnv_zones, + nr_zones); + metadata = sb; + stats.metadata_write.increment( + ceph::encoded_sizeof_bounded<zbd_sm_metadata_t>()); + DEBUG("Wrote to stats."); + return write_metadata(device, sb); + }).finally([&, FNAME] { + DEBUG("Closing device."); + return device.close(); + }).safe_then([FNAME] { + DEBUG("Returning from mkfs."); + return mkfs_ertr::now(); + }); + }); + }); +} + +ZBDSegmentManager::mkfs_ret ZBDSegmentManager::shard_mkfs() +{ + LOG_PREFIX(ZBDSegmentManager::shard_mkfs); + INFO("starting, device_path {}", device_path); + return open_device( + device_path, seastar::open_flags::rw + ).safe_then([=, this](auto p) { + device = std::move(p.first); + auto sd = p.second; + return read_metadata(device, sd); + }).safe_then([=, this](auto meta){ + shard_info = meta.shard_infos[seastar::this_shard_id()]; + metadata = meta; + return device.close(); + }).safe_then([FNAME] { + DEBUG("Returning from shard_mkfs."); + return mkfs_ertr::now(); + }); +} + +// Return range of sectors to operate on. 
+struct blk_zone_range make_range( + segment_id_t id, + size_t segment_size, + size_t first_segment_offset) +{ + return blk_zone_range{ + (id.device_segment_id() * (segment_size >> SECT_SHIFT) + + (first_segment_offset >> SECT_SHIFT)), + (segment_size >> SECT_SHIFT) + }; +} + +using blk_zone_op_ertr = crimson::errorator< + crimson::ct_error::input_output_error>; +using blk_zone_op_ret = blk_zone_op_ertr::future<>; +blk_zone_op_ret blk_zone_op(seastar::file &device, + blk_zone_range &range, + zone_op op) { + LOG_PREFIX(ZBDSegmentManager::blk_zone_op); + + unsigned long ioctl_op = 0; + switch (op) { + using enum zone_op; + case OPEN: + ioctl_op = BLKOPENZONE; + break; + case FINISH: + ioctl_op = BLKFINISHZONE; + break; + case RESET: + ioctl_op = BLKRESETZONE; + break; + case CLOSE: + ioctl_op = BLKCLOSEZONE; + break; + default: + ERROR("Invalid zone operation {}", op); + ceph_assert(ioctl_op); + } + + return device.ioctl( + ioctl_op, + &range + ).then_wrapped([=](auto f) -> blk_zone_op_ret { + if (f.failed()) { + ERROR("{} ioctl failed", op); + return crimson::ct_error::input_output_error::make(); + } else { + int ret = f.get(); + if (ret == 0) { + return seastar::now(); + } else { + ERROR("{} ioctl failed with return code {}", op, ret); + return crimson::ct_error::input_output_error::make(); + } + } + }); +} + +ZBDSegmentManager::open_ertr::future<SegmentRef> ZBDSegmentManager::open( + segment_id_t id) +{ + LOG_PREFIX(ZBDSegmentManager::open); + return seastar::do_with( + blk_zone_range{}, + [=, this](auto &range) { + range = make_range( + id, + metadata.segment_size, + shard_info.first_segment_offset); + return blk_zone_op( + device, + range, + zone_op::OPEN + ); + } + ).safe_then([=, this] { + DEBUG("segment {}, open successful", id); + return open_ertr::future<SegmentRef>( + open_ertr::ready_future_marker{}, + SegmentRef(new ZBDSegment(*this, id)) + ); + }); +} + +ZBDSegmentManager::release_ertr::future<> ZBDSegmentManager::release( + segment_id_t id) +{ + LOG_PREFIX(ZBDSegmentManager::release); + DEBUG("Resetting zone/segment {}", id); + return seastar::do_with( + blk_zone_range{}, + [=, this](auto &range) { + range = make_range( + id, + metadata.segment_size, + shard_info.first_segment_offset); + return blk_zone_op( + device, + range, + zone_op::RESET + ); + } + ).safe_then([=] { + DEBUG("segment release successful"); + return release_ertr::now(); + }); +} + +SegmentManager::read_ertr::future<> ZBDSegmentManager::read( + paddr_t addr, + size_t len, + ceph::bufferptr &out) +{ + LOG_PREFIX(ZBDSegmentManager::read); + auto& seg_addr = addr.as_seg_paddr(); + if (seg_addr.get_segment_id().device_segment_id() >= get_num_segments()) { + ERROR("invalid segment {}", + seg_addr.get_segment_id().device_segment_id()); + return crimson::ct_error::invarg::make(); + } + + if (seg_addr.get_segment_off() + len > metadata.segment_capacity) { + ERROR("invalid read offset {}, len {}", + addr, + len); + return crimson::ct_error::invarg::make(); + } + return do_read( + device, + get_offset(addr), + len, + out); +} + +Segment::close_ertr::future<> ZBDSegmentManager::segment_close( + segment_id_t id, segment_off_t write_pointer) +{ + LOG_PREFIX(ZBDSegmentManager::segment_close); + return seastar::do_with( + blk_zone_range{}, + [=, this](auto &range) { + range = make_range( + id, + metadata.segment_size, + shard_info.first_segment_offset); + return blk_zone_op( + device, + range, + zone_op::FINISH + ); + } + ).safe_then([=] { + DEBUG("zone finish successful"); + return Segment::close_ertr::now(); + }); +} + 
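+// Summary of how the segment operations above map onto zone ioctls via
+// blk_zone_op(): open() issues BLKOPENZONE, segment_close() issues
+// BLKFINISHZONE, and release() issues BLKRESETZONE; zone_op::CLOSE
+// (BLKCLOSEZONE) is defined but does not appear to be invoked in this file.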
+Segment::write_ertr::future<> ZBDSegmentManager::segment_write( + paddr_t addr, + ceph::bufferlist bl, + bool ignore_check) +{ + LOG_PREFIX(ZBDSegmentManager::segment_write); + assert(addr.get_device_id() == get_device_id()); + assert((bl.length() % metadata.block_size) == 0); + auto& seg_addr = addr.as_seg_paddr(); + DEBUG("write to segment {} at offset {}, physical offset {}, len {}", + seg_addr.get_segment_id(), + seg_addr.get_segment_off(), + get_offset(addr), + bl.length()); + stats.data_write.increment(bl.length()); + return do_writev( + get_device_id(), + device, + get_offset(addr), + std::move(bl), + metadata.block_size); +} + +device_id_t ZBDSegmentManager::get_device_id() const +{ + return metadata.device_id; +}; + +secondary_device_set_t& ZBDSegmentManager::get_secondary_devices() +{ + return metadata.secondary_devices; +}; + +magic_t ZBDSegmentManager::get_magic() const +{ + return metadata.magic; +}; + +segment_off_t ZBDSegment::get_write_capacity() const +{ + return manager.get_segment_size(); +} + +SegmentManager::close_ertr::future<> ZBDSegmentManager::close() +{ + if (device) { + return device.close(); + } + return seastar::now(); +} + +Segment::close_ertr::future<> ZBDSegment::close() +{ + return manager.segment_close(id, write_pointer); +} + +Segment::write_ertr::future<> ZBDSegment::write( + segment_off_t offset, ceph::bufferlist bl) +{ + LOG_PREFIX(ZBDSegment::write); + if (offset != write_pointer || offset % manager.metadata.block_size != 0) { + ERROR("Segment offset and zone write pointer mismatch. " + "segment {} segment-offset {} write pointer {}", + id, offset, write_pointer); + return crimson::ct_error::invarg::make(); + } + if (offset + bl.length() > manager.metadata.segment_capacity) { + return crimson::ct_error::enospc::make(); + } + + write_pointer = offset + bl.length(); + return manager.segment_write(paddr_t::make_seg_paddr(id, offset), bl); +} + +Segment::write_ertr::future<> ZBDSegment::write_padding_bytes( + size_t padding_bytes) +{ + LOG_PREFIX(ZBDSegment::write_padding_bytes); + DEBUG("Writing {} padding bytes to segment {} at wp {}", + padding_bytes, id, write_pointer); + + return crimson::repeat([FNAME, padding_bytes, this] () mutable { + size_t bufsize = 0; + if (padding_bytes >= MAX_PADDING_SIZE) { + bufsize = MAX_PADDING_SIZE; + } else { + bufsize = padding_bytes; + } + + padding_bytes -= bufsize; + bufferptr bp(ceph::buffer::create_page_aligned(bufsize)); + bp.zero(); + bufferlist padd_bl; + padd_bl.append(bp); + return write(write_pointer, padd_bl).safe_then([FNAME, padding_bytes, this]() { + if (padding_bytes == 0) { + return write_ertr::make_ready_future<seastar::stop_iteration>(seastar::stop_iteration::yes); + } else { + return write_ertr::make_ready_future<seastar::stop_iteration>(seastar::stop_iteration::no); + } + }); + }); +} + +// Advance write pointer, to given offset. 
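+// Worked example, assuming block_size = 4096 and write_pointer = 8192:
+// advance_wp(24576) writes 16384 zero-filled padding bytes (in chunks of at
+// most MAX_PADDING_SIZE) so the zone write pointer lands exactly at 24576;
+// advance_wp(8192) is a no-op, and advance_wp(4096) fails with invarg.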
+Segment::write_ertr::future<> ZBDSegment::advance_wp( + segment_off_t offset) +{ + LOG_PREFIX(ZBDSegment::advance_wp); + + DEBUG("Advancing write pointer from {} to {}", write_pointer, offset); + if (offset < write_pointer) { + return crimson::ct_error::invarg::make(); + } + + size_t padding_bytes = offset - write_pointer; + + if (padding_bytes == 0) { + return write_ertr::now(); + } + + assert(padding_bytes % manager.metadata.block_size == 0); + + return write_padding_bytes(padding_bytes); +} + +} diff --git a/src/crimson/os/seastore/segment_manager/zbd.h b/src/crimson/os/seastore/segment_manager/zbd.h new file mode 100644 index 000000000..c18f46336 --- /dev/null +++ b/src/crimson/os/seastore/segment_manager/zbd.h @@ -0,0 +1,246 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#pragma once + +#include <linux/blkzoned.h> + +#include <boost/intrusive_ptr.hpp> +#include <boost/smart_ptr/intrusive_ref_counter.hpp> + +#include <seastar/core/file.hh> +#include <seastar/core/future.hh> +#include <seastar/core/reactor.hh> + +#include "crimson/common/layout.h" + +#include "crimson/os/seastore/segment_manager.h" + +#include "include/uuid.h" + +namespace crimson::os::seastore::segment_manager::zbd { + + struct zbd_shard_info_t { + size_t size = 0; + size_t segments = 0; + size_t first_segment_offset = 0; + + DENC(zbd_shard_info_t, v, p) { + DENC_START(1, 1, p); + denc(v.size, p); + denc(v.segments, p); + denc(v.first_segment_offset, p); + DENC_FINISH(p); + } + }; + + struct zbd_sm_metadata_t { + unsigned int shard_num = 0; + size_t segment_size = 0; + size_t segment_capacity = 0; + size_t zones_per_segment = 0; + size_t zone_capacity = 0; + size_t block_size = 0; + size_t zone_size = 0; + + std::vector<zbd_shard_info_t> shard_infos; + + seastore_meta_t meta; + + bool major_dev = false; + magic_t magic = 0; + device_type_t dtype = device_type_t::NONE; + device_id_t device_id = 0; + secondary_device_set_t secondary_devices; + + DENC(zbd_sm_metadata_t, v, p) { + DENC_START(1, 1, p); + denc(v.shard_num, p); + denc(v.segment_size, p); + denc(v.segment_capacity, p); + denc(v.zones_per_segment, p); + denc(v.zone_capacity, p); + denc(v.block_size, p); + denc(v.zone_size, p); + denc(v.shard_infos, p); + denc(v.meta, p); + denc(v.magic, p); + denc(v.dtype, p); + denc(v.device_id, p); + if (v.major_dev) { + denc(v.secondary_devices, p); + } + DENC_FINISH(p); + } + + void validate() const { + ceph_assert_always(shard_num == seastar::smp::count); + for (unsigned int i = 0; i < seastar::smp::count; i++) { + ceph_assert_always(shard_infos[i].size > 0); + ceph_assert_always(shard_infos[i].size <= DEVICE_OFF_MAX); + ceph_assert_always(shard_infos[i].segments > 0); + ceph_assert_always(shard_infos[i].segments <= DEVICE_SEGMENT_ID_MAX); + } + ceph_assert_always(segment_capacity > 0); + ceph_assert_always(segment_capacity <= SEGMENT_OFF_MAX); + } + }; + + using write_ertr = crimson::errorator<crimson::ct_error::input_output_error>; + using read_ertr = crimson::errorator<crimson::ct_error::input_output_error>; + + enum class zone_op { + OPEN, + FINISH, + CLOSE, + RESET, + }; + + class ZBDSegmentManager; + + class ZBDSegment final : public Segment { + public: + ZBDSegment(ZBDSegmentManager &man, segment_id_t i) : manager(man), id(i){}; + + segment_id_t get_segment_id() const final { return id; } + segment_off_t get_write_capacity() const final; + segment_off_t get_write_ptr() const final { return write_pointer; } + close_ertr::future<> close() final; + 
write_ertr::future<> write(segment_off_t offset, ceph::bufferlist bl) final; + write_ertr::future<> advance_wp(segment_off_t offset) final; + + ~ZBDSegment() {} + private: + friend class ZBDSegmentManager; + ZBDSegmentManager &manager; + const segment_id_t id; + segment_off_t write_pointer = 0; + write_ertr::future<> write_padding_bytes(size_t padding_bytes); + }; + + class ZBDSegmentManager final : public SegmentManager{ + // interfaces used by Device + public: + seastar::future<> start() { + return shard_devices.start(device_path); + } + + seastar::future<> stop() { + return shard_devices.stop(); + } + + Device& get_sharded_device() final { + return shard_devices.local(); + } + + mount_ret mount() final; + mkfs_ret mkfs(device_config_t meta) final; + + ZBDSegmentManager(const std::string &path) : device_path(path) {} + + ~ZBDSegmentManager() final = default; + + //interfaces used by each shard device + public: + open_ertr::future<SegmentRef> open(segment_id_t id) final; + close_ertr::future<> close() final; + + release_ertr::future<> release(segment_id_t id) final; + + read_ertr::future<> read( + paddr_t addr, + size_t len, + ceph::bufferptr &out) final; + + device_type_t get_device_type() const final { + return device_type_t::ZBD; + } + + size_t get_available_size() const final { + return shard_info.size; + }; + + extent_len_t get_block_size() const final { + return metadata.block_size; + }; + + segment_off_t get_segment_size() const final { + return metadata.segment_capacity; + }; + + const seastore_meta_t &get_meta() const { + return metadata.meta; + }; + + device_id_t get_device_id() const final; + + secondary_device_set_t& get_secondary_devices() final; + + magic_t get_magic() const final; + + Segment::write_ertr::future<> segment_write( + paddr_t addr, + ceph::bufferlist bl, + bool ignore_check=false); + + private: + friend class ZBDSegment; + std::string device_path; + zbd_shard_info_t shard_info; + zbd_sm_metadata_t metadata; + seastar::file device; + uint32_t nr_zones; + struct effort_t { + uint64_t num = 0; + uint64_t bytes = 0; + + void increment(uint64_t read_bytes) { + ++num; + bytes += read_bytes; + } + }; + + struct zbd_sm_stats { + effort_t data_read = {}; + effort_t data_write = {}; + effort_t metadata_write = {}; + uint64_t opened_segments = 0; + uint64_t closed_segments = 0; + uint64_t closed_segments_unused_bytes = 0; + uint64_t released_segments = 0; + + void reset() { + *this = zbd_sm_stats{}; + } + } stats; + + void register_metrics(); + seastar::metrics::metric_group metrics; + + Segment::close_ertr::future<> segment_close( + segment_id_t id, segment_off_t write_pointer); + + uint64_t get_offset(paddr_t addr) { + auto& seg_addr = addr.as_seg_paddr(); + return (shard_info.first_segment_offset + + (seg_addr.get_segment_id().device_segment_id() * + metadata.segment_size)) + seg_addr.get_segment_off(); + } + private: + // shard 0 mkfs + mkfs_ret primary_mkfs(device_config_t meta); + // all shards mkfs + mkfs_ret shard_mkfs(); + + mount_ret shard_mount(); + + seastar::sharded<ZBDSegmentManager> shard_devices; + }; + +} + +WRITE_CLASS_DENC_BOUNDED( + crimson::os::seastore::segment_manager::zbd::zbd_shard_info_t +) +WRITE_CLASS_DENC_BOUNDED( + crimson::os::seastore::segment_manager::zbd::zbd_sm_metadata_t +) diff --git a/src/crimson/os/seastore/segment_manager_group.cc b/src/crimson/os/seastore/segment_manager_group.cc new file mode 100644 index 000000000..332b794b7 --- /dev/null +++ b/src/crimson/os/seastore/segment_manager_group.cc @@ -0,0 +1,171 @@ +// -*- mode:C++; 
tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab expandtab + +#include "crimson/os/seastore/segment_manager_group.h" + +#include "crimson/os/seastore/logging.h" + +SET_SUBSYS(seastore_journal); + +namespace crimson::os::seastore { + +SegmentManagerGroup::read_segment_tail_ret +SegmentManagerGroup::read_segment_tail(segment_id_t segment) +{ + assert(has_device(segment.device_id())); + auto& segment_manager = *segment_managers[segment.device_id()]; + return segment_manager.read( + paddr_t::make_seg_paddr( + segment, + segment_manager.get_segment_size() - get_rounded_tail_length()), + get_rounded_tail_length() + ).handle_error( + read_segment_header_ertr::pass_further{}, + crimson::ct_error::assert_all{ + "Invalid error in SegmentManagerGroup::read_segment_tail" + } + ).safe_then([=, &segment_manager](bufferptr bptr) -> read_segment_tail_ret { + LOG_PREFIX(SegmentManagerGroup::read_segment_tail); + DEBUG("segment {} bptr size {}", segment, bptr.length()); + + segment_tail_t tail; + bufferlist bl; + bl.push_back(bptr); + + DEBUG("segment {} block crc {}", + segment, + bl.begin().crc32c(segment_manager.get_block_size(), 0)); + + auto bp = bl.cbegin(); + try { + decode(tail, bp); + } catch (ceph::buffer::error &e) { + DEBUG("segment {} unable to decode tail, skipping -- {}", + segment, e.what()); + return crimson::ct_error::enodata::make(); + } + DEBUG("segment {} tail {}", segment, tail); + return read_segment_tail_ret( + read_segment_tail_ertr::ready_future_marker{}, + tail); + }); +} + +SegmentManagerGroup::read_segment_header_ret +SegmentManagerGroup::read_segment_header(segment_id_t segment) +{ + assert(has_device(segment.device_id())); + auto& segment_manager = *segment_managers[segment.device_id()]; + return segment_manager.read( + paddr_t::make_seg_paddr(segment, 0), + get_rounded_header_length() + ).handle_error( + read_segment_header_ertr::pass_further{}, + crimson::ct_error::assert_all{ + "Invalid error in SegmentManagerGroup::read_segment_header" + } + ).safe_then([=, &segment_manager](bufferptr bptr) -> read_segment_header_ret { + LOG_PREFIX(SegmentManagerGroup::read_segment_header); + DEBUG("segment {} bptr size {}", segment, bptr.length()); + + segment_header_t header; + bufferlist bl; + bl.push_back(bptr); + + DEBUG("segment {} block crc {}", + segment, + bl.begin().crc32c(segment_manager.get_block_size(), 0)); + + auto bp = bl.cbegin(); + try { + decode(header, bp); + } catch (ceph::buffer::error &e) { + DEBUG("segment {} unable to decode header, skipping -- {}", + segment, e.what()); + return crimson::ct_error::enodata::make(); + } + DEBUG("segment {} header {}", segment, header); + return read_segment_header_ret( + read_segment_header_ertr::ready_future_marker{}, + header); + }); +} + +void SegmentManagerGroup::initialize_cursor( + scan_valid_records_cursor &cursor) +{ + LOG_PREFIX(SegmentManagerGroup::initialize_cursor); + assert(has_device(cursor.get_segment_id().device_id())); + auto& segment_manager = + *segment_managers[cursor.get_segment_id().device_id()]; + if (cursor.get_segment_offset() == 0) { + INFO("start to scan segment {}", cursor.get_segment_id()); + cursor.increment_seq(segment_manager.get_block_size()); + } + cursor.block_size = segment_manager.get_block_size(); +} + +SegmentManagerGroup::read_ret +SegmentManagerGroup::read(paddr_t start, size_t len) +{ + LOG_PREFIX(SegmentManagerGroup::read); + assert(has_device(start.get_device_id())); + auto& segment_manager = *segment_managers[start.get_device_id()]; + TRACE("reading data 
{}~{}", start, len); + return segment_manager.read( + start, + len + ).safe_then([](auto bptr) { + return read_ret( + read_ertr::ready_future_marker{}, + std::move(bptr) + ); + }); +} + +SegmentManagerGroup::find_journal_segment_headers_ret +SegmentManagerGroup::find_journal_segment_headers() +{ + return seastar::do_with( + get_segment_managers(), + find_journal_segment_headers_ret_bare{}, + [this](auto &sms, auto& ret) -> find_journal_segment_headers_ret + { + return crimson::do_for_each(sms, + [this, &ret](SegmentManager *sm) + { + LOG_PREFIX(SegmentManagerGroup::find_journal_segment_headers); + auto device_id = sm->get_device_id(); + auto num_segments = sm->get_num_segments(); + DEBUG("processing {} with {} segments", + device_id_printer_t{device_id}, num_segments); + return crimson::do_for_each( + boost::counting_iterator<device_segment_id_t>(0), + boost::counting_iterator<device_segment_id_t>(num_segments), + [this, &ret, device_id](device_segment_id_t d_segment_id) + { + segment_id_t segment_id{device_id, d_segment_id}; + return read_segment_header(segment_id + ).safe_then([segment_id, &ret](auto &&header) { + if (header.get_type() == segment_type_t::JOURNAL) { + ret.emplace_back(std::make_pair(segment_id, std::move(header))); + } + }).handle_error( + crimson::ct_error::enoent::handle([](auto) { + return find_journal_segment_headers_ertr::now(); + }), + crimson::ct_error::enodata::handle([](auto) { + return find_journal_segment_headers_ertr::now(); + }), + crimson::ct_error::input_output_error::pass_further{} + ); + }); + }).safe_then([&ret]() mutable { + return find_journal_segment_headers_ret{ + find_journal_segment_headers_ertr::ready_future_marker{}, + std::move(ret)}; + }); + }); +} + +} // namespace crimson::os::seastore diff --git a/src/crimson/os/seastore/segment_manager_group.h b/src/crimson/os/seastore/segment_manager_group.h new file mode 100644 index 000000000..f193b5eed --- /dev/null +++ b/src/crimson/os/seastore/segment_manager_group.h @@ -0,0 +1,150 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab expandtab + +#pragma once + +#include <set> + +#include "crimson/common/errorator.h" +#include "crimson/os/seastore/seastore_types.h" +#include "crimson/os/seastore/segment_manager.h" +#include "crimson/os/seastore/record_scanner.h" + +namespace crimson::os::seastore { + +class SegmentManagerGroup : public RecordScanner { +public: + SegmentManagerGroup() { + segment_managers.resize(DEVICE_ID_MAX, nullptr); + } + + const std::set<device_id_t>& get_device_ids() const { + return device_ids; + } + + std::vector<SegmentManager*> get_segment_managers() const { + assert(device_ids.size()); + std::vector<SegmentManager*> ret; + for (auto& device_id : device_ids) { + auto segment_manager = segment_managers[device_id]; + assert(segment_manager->get_device_id() == device_id); + ret.emplace_back(segment_manager); + } + return ret; + } + + void add_segment_manager(SegmentManager* segment_manager) { + auto device_id = segment_manager->get_device_id(); + ceph_assert(!has_device(device_id)); + if (!device_ids.empty()) { + auto existing_id = *device_ids.begin(); + ceph_assert(segment_managers[existing_id]->get_device_type() + == segment_manager->get_device_type()); + } + segment_managers[device_id] = segment_manager; + device_ids.insert(device_id); + } + + void reset() { + segment_managers.clear(); + segment_managers.resize(DEVICE_ID_MAX, nullptr); + device_ids.clear(); + } + + /** + * get device info + * + * Assume all segment managers 
share the same following information. + */ + extent_len_t get_block_size() const { + assert(device_ids.size()); + return segment_managers[*device_ids.begin()]->get_block_size(); + } + + segment_off_t get_segment_size() const { + assert(device_ids.size()); + return segment_managers[*device_ids.begin()]->get_segment_size(); + } + + const seastore_meta_t &get_meta() const { + assert(device_ids.size()); + return segment_managers[*device_ids.begin()]->get_meta(); + } + + std::size_t get_rounded_header_length() const { + return p2roundup( + ceph::encoded_sizeof_bounded<segment_header_t>(), + (std::size_t)get_block_size()); + } + + std::size_t get_rounded_tail_length() const { + return p2roundup( + ceph::encoded_sizeof_bounded<segment_tail_t>(), + (std::size_t)get_block_size()); + } + + using read_segment_header_ertr = crimson::errorator< + crimson::ct_error::enoent, + crimson::ct_error::enodata, + crimson::ct_error::input_output_error + >; + using read_segment_header_ret = read_segment_header_ertr::future< + segment_header_t>; + read_segment_header_ret read_segment_header(segment_id_t segment); + + using read_segment_tail_ertr = read_segment_header_ertr; + using read_segment_tail_ret = read_segment_tail_ertr::future< + segment_tail_t>; + read_segment_tail_ret read_segment_tail(segment_id_t segment); + + /* + * read journal segment headers + */ + using find_journal_segment_headers_ertr = crimson::errorator< + crimson::ct_error::input_output_error>; + using find_journal_segment_headers_ret_bare = std::vector< + std::pair<segment_id_t, segment_header_t>>; + using find_journal_segment_headers_ret = find_journal_segment_headers_ertr::future< + find_journal_segment_headers_ret_bare>; + find_journal_segment_headers_ret find_journal_segment_headers(); + + using open_ertr = SegmentManager::open_ertr; + open_ertr::future<SegmentRef> open(segment_id_t id) { + assert(has_device(id.device_id())); + return segment_managers[id.device_id()]->open(id); + } + + using release_ertr = SegmentManager::release_ertr; + release_ertr::future<> release_segment(segment_id_t id) { + assert(has_device(id.device_id())); + return segment_managers[id.device_id()]->release(id); + } + +private: + bool has_device(device_id_t id) const { + assert(id <= DEVICE_ID_MAX_VALID); + return device_ids.count(id) >= 1; + } + + void initialize_cursor(scan_valid_records_cursor &cursor) final; + + read_ret read(paddr_t start, size_t len) final; + + bool is_record_segment_seq_invalid(scan_valid_records_cursor &cursor, + record_group_header_t &header) final { + return false; + } + + int64_t get_segment_end_offset(paddr_t addr) final { + auto& seg_addr = addr.as_seg_paddr(); + auto& segment_manager = *segment_managers[seg_addr.get_segment_id().device_id()]; + return static_cast<int64_t>(segment_manager.get_segment_size()); + } + + std::vector<SegmentManager*> segment_managers; + std::set<device_id_t> device_ids; +}; + +using SegmentManagerGroupRef = std::unique_ptr<SegmentManagerGroup>; + +} // namespace crimson::os::seastore diff --git a/src/crimson/os/seastore/segment_seq_allocator.h b/src/crimson/os/seastore/segment_seq_allocator.h new file mode 100644 index 000000000..28c81bf32 --- /dev/null +++ b/src/crimson/os/seastore/segment_seq_allocator.h @@ -0,0 +1,50 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "crimson/os/seastore/logging.h" +#include "crimson/os/seastore/seastore_types.h" + +namespace crimson::os::seastore { +class AsyncCleaner; +} + +namespace 
crimson::os::seastore::journal { +class SegmentedJournal; +} + +namespace crimson::os::seastore { + +class SegmentSeqAllocator { +public: + SegmentSeqAllocator(segment_type_t type) + : type(type) {} + segment_seq_t get_and_inc_next_segment_seq() { + return next_segment_seq++; + } +private: + void set_next_segment_seq(segment_seq_t seq) { + LOG_PREFIX(SegmentSeqAllocator::set_next_segment_seq); + SUBDEBUG( + seastore_journal, + "{}, next={}, cur={}", + type, + segment_seq_printer_t{seq}, + segment_seq_printer_t{next_segment_seq}); + assert(type == segment_type_t::JOURNAL + ? seq >= next_segment_seq + : true); + if (seq > next_segment_seq) + next_segment_seq = seq; + } + segment_seq_t next_segment_seq = 0; + segment_type_t type = segment_type_t::NULL_SEG; + friend class journal::SegmentedJournal; + friend class SegmentCleaner; +}; + +using SegmentSeqAllocatorRef = + std::unique_ptr<SegmentSeqAllocator>; + +}; diff --git a/src/crimson/os/seastore/transaction.cc b/src/crimson/os/seastore/transaction.cc new file mode 100644 index 000000000..4cab476c3 --- /dev/null +++ b/src/crimson/os/seastore/transaction.cc @@ -0,0 +1,8 @@ +#include "transaction.h" +#include "crimson/common/interruptible_future.h" + +namespace crimson::interruptible { +template +thread_local interrupt_cond_t<::crimson::os::seastore::TransactionConflictCondition> +interrupt_cond<::crimson::os::seastore::TransactionConflictCondition>; +} diff --git a/src/crimson/os/seastore/transaction.h b/src/crimson/os/seastore/transaction.h new file mode 100644 index 000000000..d423196fe --- /dev/null +++ b/src/crimson/os/seastore/transaction.h @@ -0,0 +1,653 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <iostream> + +#include <boost/intrusive/list.hpp> + +#include "crimson/common/log.h" +#include "crimson/os/seastore/logging.h" +#include "crimson/os/seastore/ordering_handle.h" +#include "crimson/os/seastore/seastore_types.h" +#include "crimson/os/seastore/cached_extent.h" +#include "crimson/os/seastore/root_block.h" + +namespace crimson::os::seastore { + +class SeaStore; +class Transaction; + +struct io_stat_t { + uint64_t num = 0; + uint64_t bytes = 0; + + bool is_clear() const { + return (num == 0 && bytes == 0); + } + + void increment(uint64_t _bytes) { + ++num; + bytes += _bytes; + } + + void increment_stat(const io_stat_t& stat) { + num += stat.num; + bytes += stat.bytes; + } +}; +inline std::ostream& operator<<(std::ostream& out, const io_stat_t& stat) { + return out << stat.num << "(" << stat.bytes << "B)"; +} + +struct version_stat_t { + uint64_t num = 0; + uint64_t version = 0; + + bool is_clear() const { + return (num == 0 && version == 0); + } + + void increment(extent_version_t v) { + ++num; + version += v; + } + + void increment_stat(const version_stat_t& stat) { + num += stat.num; + version += stat.version; + } +}; + +/** + * Transaction + * + * Representation of in-progress mutation. Used exclusively through Cache methods. 
+ * + * Transaction log levels: + * seastore_t + * - DEBUG: transaction create, conflict, commit events + * - TRACE: DEBUG details + * - seastore_cache logs + */ +class Transaction { +public: + using Ref = std::unique_ptr<Transaction>; + using on_destruct_func_t = std::function<void(Transaction&)>; + enum class get_extent_ret { + PRESENT, + ABSENT, + RETIRED + }; + get_extent_ret get_extent(paddr_t addr, CachedExtentRef *out) { + LOG_PREFIX(Transaction::get_extent); + // it's possible that both write_set and retired_set contain + // this addr at the same time when addr is absolute and the + // corresponding extent is used to map existing extent on disk. + // So search write_set first. + if (auto iter = write_set.find_offset(addr); + iter != write_set.end()) { + if (out) + *out = CachedExtentRef(&*iter); + SUBTRACET(seastore_cache, "{} is present in write_set -- {}", + *this, addr, *iter); + assert((*out)->is_valid()); + return get_extent_ret::PRESENT; + } else if (retired_set.count(addr)) { + return get_extent_ret::RETIRED; + } else if ( + auto iter = read_set.find(addr); + iter != read_set.end()) { + // placeholder in read-set should be in the retired-set + // at the same time. + assert(iter->ref->get_type() != extent_types_t::RETIRED_PLACEHOLDER); + if (out) + *out = iter->ref; + SUBTRACET(seastore_cache, "{} is present in read_set -- {}", + *this, addr, *(iter->ref)); + return get_extent_ret::PRESENT; + } else { + return get_extent_ret::ABSENT; + } + } + + void add_to_retired_set(CachedExtentRef ref) { + ceph_assert(!is_weak()); + if (ref->is_exist_clean() || + ref->is_exist_mutation_pending()) { + existing_block_stats.dec(ref); + ref->set_invalid(*this); + write_set.erase(*ref); + } else if (ref->is_initial_pending()) { + ref->set_invalid(*this); + write_set.erase(*ref); + } else if (ref->is_mutation_pending()) { + ref->set_invalid(*this); + write_set.erase(*ref); + assert(ref->prior_instance); + retired_set.insert(ref->prior_instance); + assert(read_set.count(ref->prior_instance->get_paddr())); + ref->prior_instance.reset(); + } else { + // && retired_set.count(ref->get_paddr()) == 0 + // If it's already in the set, insert here will be a noop, + // which is what we want. 
+ retired_set.insert(ref); + } + } + + void add_to_read_set(CachedExtentRef ref) { + if (is_weak()) return; + + assert(ref->is_valid()); + + auto it = ref->transactions.lower_bound( + this, read_set_item_t<Transaction>::trans_cmp_t()); + if (it != ref->transactions.end() && it->t == this) return; + + auto [iter, inserted] = read_set.emplace(this, ref); + ceph_assert(inserted); + ref->transactions.insert_before( + it, const_cast<read_set_item_t<Transaction>&>(*iter)); + } + + void add_fresh_extent( + CachedExtentRef ref) { + ceph_assert(!is_weak()); + if (ref->is_exist_clean()) { + existing_block_stats.inc(ref); + existing_block_list.push_back(ref); + } else if (ref->get_paddr().is_delayed()) { + assert(ref->get_paddr() == make_delayed_temp_paddr(0)); + assert(ref->is_logical()); + ref->set_paddr(make_delayed_temp_paddr(delayed_temp_offset)); + delayed_temp_offset += ref->get_length(); + delayed_alloc_list.emplace_back(ref->cast<LogicalCachedExtent>()); + fresh_block_stats.increment(ref->get_length()); + } else if (ref->get_paddr().is_absolute()) { + pre_alloc_list.emplace_back(ref->cast<LogicalCachedExtent>()); + fresh_block_stats.increment(ref->get_length()); + } else { + if (likely(ref->get_paddr() == make_record_relative_paddr(0))) { + ref->set_paddr(make_record_relative_paddr(offset)); + } else { + ceph_assert(ref->get_paddr().is_fake()); + } + offset += ref->get_length(); + inline_block_list.push_back(ref); + fresh_block_stats.increment(ref->get_length()); + } + write_set.insert(*ref); + if (is_backref_node(ref->get_type())) + fresh_backref_extents++; + } + + uint64_t get_num_fresh_backref() const { + return fresh_backref_extents; + } + + void mark_delayed_extent_inline(LogicalCachedExtentRef& ref) { + write_set.erase(*ref); + assert(ref->get_paddr().is_delayed()); + ref->set_paddr(make_record_relative_paddr(offset), + /* need_update_mapping: */ true); + offset += ref->get_length(); + inline_block_list.push_back(ref); + write_set.insert(*ref); + } + + void mark_delayed_extent_ool(LogicalCachedExtentRef& ref) { + written_ool_block_list.push_back(ref); + } + + void update_delayed_ool_extent_addr(LogicalCachedExtentRef& ref, + paddr_t final_addr) { + write_set.erase(*ref); + assert(ref->get_paddr().is_delayed()); + ref->set_paddr(final_addr, /* need_update_mapping: */ true); + assert(!ref->get_paddr().is_null()); + assert(!ref->is_inline()); + write_set.insert(*ref); + } + + void mark_allocated_extent_ool(LogicalCachedExtentRef& ref) { + assert(ref->get_paddr().is_absolute()); + assert(!ref->is_inline()); + written_ool_block_list.push_back(ref); + } + + void add_mutated_extent(CachedExtentRef ref) { + ceph_assert(!is_weak()); + assert(ref->is_exist_mutation_pending() || + read_set.count(ref->prior_instance->get_paddr())); + mutated_block_list.push_back(ref); + if (!ref->is_exist_mutation_pending()) { + write_set.insert(*ref); + } else { + assert(write_set.find_offset(ref->get_paddr()) != + write_set.end()); + } + } + + void replace_placeholder(CachedExtent& placeholder, CachedExtent& extent) { + ceph_assert(!is_weak()); + + assert(placeholder.get_type() == extent_types_t::RETIRED_PLACEHOLDER); + assert(extent.get_type() != extent_types_t::RETIRED_PLACEHOLDER); + assert(extent.get_type() != extent_types_t::ROOT); + assert(extent.get_paddr() == placeholder.get_paddr()); + { + auto where = read_set.find(placeholder.get_paddr()); + assert(where != read_set.end()); + assert(where->ref.get() == &placeholder); + where = read_set.erase(where); + auto it = read_set.emplace_hint(where, this, 
&extent); + extent.transactions.insert(const_cast<read_set_item_t<Transaction>&>(*it)); + } + { + auto where = retired_set.find(&placeholder); + assert(where != retired_set.end()); + assert(where->get() == &placeholder); + where = retired_set.erase(where); + retired_set.emplace_hint(where, &extent); + } + } + + auto get_delayed_alloc_list() { + std::list<LogicalCachedExtentRef> ret; + for (auto& extent : delayed_alloc_list) { + // delayed extents may be invalidated + if (extent->is_valid()) { + ret.push_back(std::move(extent)); + } else { + ++num_delayed_invalid_extents; + } + } + delayed_alloc_list.clear(); + return ret; + } + + auto get_valid_pre_alloc_list() { + std::list<LogicalCachedExtentRef> ret; + assert(num_allocated_invalid_extents == 0); + for (auto& extent : pre_alloc_list) { + if (extent->is_valid()) { + ret.push_back(extent); + } else { + ++num_allocated_invalid_extents; + } + } + return ret; + } + + const auto &get_inline_block_list() { + return inline_block_list; + } + + const auto &get_mutated_block_list() { + return mutated_block_list; + } + + const auto &get_existing_block_list() { + return existing_block_list; + } + + const auto &get_retired_set() { + return retired_set; + } + + bool is_retired(paddr_t paddr, extent_len_t len) { + if (retired_set.empty()) { + return false; + } + auto iter = retired_set.lower_bound(paddr); + if (iter == retired_set.end() || + (*iter)->get_paddr() > paddr) { + assert(iter != retired_set.begin()); + --iter; + } + auto retired_paddr = (*iter)->get_paddr(); + auto retired_length = (*iter)->get_length(); + return retired_paddr <= paddr && + retired_paddr.add_offset(retired_length) >= paddr.add_offset(len); + } + + template <typename F> + auto for_each_fresh_block(F &&f) const { + std::for_each(written_ool_block_list.begin(), written_ool_block_list.end(), f); + std::for_each(inline_block_list.begin(), inline_block_list.end(), f); + } + + const io_stat_t& get_fresh_block_stats() const { + return fresh_block_stats; + } + + using src_t = transaction_type_t; + src_t get_src() const { + return src; + } + + bool is_weak() const { + return weak; + } + + void test_set_conflict() { + conflicted = true; + } + + bool is_conflicted() const { + return conflicted; + } + + auto &get_handle() { + return handle; + } + + Transaction( + OrderingHandle &&handle, + bool weak, + src_t src, + journal_seq_t initiated_after, + on_destruct_func_t&& f, + transaction_id_t trans_id + ) : weak(weak), + handle(std::move(handle)), + on_destruct(std::move(f)), + src(src), + trans_id(trans_id) + {} + + void invalidate_clear_write_set() { + for (auto &&i: write_set) { + i.set_invalid(*this); + } + write_set.clear(); + } + + ~Transaction() { + on_destruct(*this); + invalidate_clear_write_set(); + } + + friend class crimson::os::seastore::SeaStore; + friend class TransactionConflictCondition; + + void reset_preserve_handle(journal_seq_t initiated_after) { + root.reset(); + offset = 0; + delayed_temp_offset = 0; + read_set.clear(); + fresh_backref_extents = 0; + invalidate_clear_write_set(); + mutated_block_list.clear(); + fresh_block_stats = {}; + num_delayed_invalid_extents = 0; + num_allocated_invalid_extents = 0; + delayed_alloc_list.clear(); + inline_block_list.clear(); + written_ool_block_list.clear(); + pre_alloc_list.clear(); + retired_set.clear(); + existing_block_list.clear(); + existing_block_stats = {}; + onode_tree_stats = {}; + omap_tree_stats = {}; + lba_tree_stats = {}; + backref_tree_stats = {}; + ool_write_stats = {}; + rewrite_version_stats = {}; + conflicted = 
false; + if (!has_reset) { + has_reset = true; + } + } + + bool did_reset() const { + return has_reset; + } + + struct tree_stats_t { + uint64_t depth = 0; + uint64_t num_inserts = 0; + uint64_t num_erases = 0; + uint64_t num_updates = 0; + int64_t extents_num_delta = 0; + + bool is_clear() const { + return (depth == 0 && + num_inserts == 0 && + num_erases == 0 && + num_updates == 0 && + extents_num_delta == 0); + } + }; + tree_stats_t& get_onode_tree_stats() { + return onode_tree_stats; + } + tree_stats_t& get_omap_tree_stats() { + return omap_tree_stats; + } + tree_stats_t& get_lba_tree_stats() { + return lba_tree_stats; + } + tree_stats_t& get_backref_tree_stats() { + return backref_tree_stats; + } + + struct ool_write_stats_t { + io_stat_t extents; + uint64_t md_bytes = 0; + uint64_t num_records = 0; + + uint64_t get_data_bytes() const { + return extents.bytes; + } + + bool is_clear() const { + return (extents.is_clear() && + md_bytes == 0 && + num_records == 0); + } + }; + ool_write_stats_t& get_ool_write_stats() { + return ool_write_stats; + } + version_stat_t& get_rewrite_version_stats() { + return rewrite_version_stats; + } + + struct existing_block_stats_t { + uint64_t valid_num = 0; + uint64_t clean_num = 0; + uint64_t mutated_num = 0; + void inc(const CachedExtentRef &ref) { + valid_num++; + if (ref->is_exist_clean()) { + clean_num++; + } else { + mutated_num++; + } + } + void dec(const CachedExtentRef &ref) { + valid_num--; + if (ref->is_exist_clean()) { + clean_num--; + } else { + mutated_num--; + } + } + }; + existing_block_stats_t& get_existing_block_stats() { + return existing_block_stats; + } + + transaction_id_t get_trans_id() const { + return trans_id; + } + +private: + friend class Cache; + friend Ref make_test_transaction(); + + /** + * If set, *this may not be used to perform writes and will not provide + * consistentency allowing operations using to avoid maintaining a read_set. + */ + const bool weak; + + RootBlockRef root; ///< ref to root if read or written by transaction + + device_off_t offset = 0; ///< relative offset of next block + device_off_t delayed_temp_offset = 0; + + /** + * read_set + * + * Holds a reference (with a refcount) to every extent read via *this. + * Submitting a transaction mutating any contained extent/addr will + * invalidate *this. + */ + read_set_t<Transaction> read_set; ///< set of extents read by paddr + + uint64_t fresh_backref_extents = 0; // counter of new backref extents + + /** + * write_set + * + * Contains a reference (without a refcount) to every extent mutated + * as part of *this. No contained extent may be referenced outside + * of *this. Every contained extent will be in one of inline_block_list, + * written_ool_block_list or/and pre_alloc_list, mutated_block_list, + * or delayed_alloc_list. 
+ */ + ExtentIndex write_set; + + /** + * lists of fresh blocks, holds refcounts, subset of write_set + */ + io_stat_t fresh_block_stats; + uint64_t num_delayed_invalid_extents = 0; + uint64_t num_allocated_invalid_extents = 0; + /// blocks that will be committed with journal record inline + std::list<CachedExtentRef> inline_block_list; + /// blocks that will be committed with out-of-line record + std::list<CachedExtentRef> written_ool_block_list; + /// blocks with delayed allocation, may become inline or ool above + std::list<LogicalCachedExtentRef> delayed_alloc_list; + + /// Extents with pre-allocated addresses, + /// will be added to written_ool_block_list after write + std::list<LogicalCachedExtentRef> pre_alloc_list; + + /// list of mutated blocks, holds refcounts, subset of write_set + std::list<CachedExtentRef> mutated_block_list; + + /// partial blocks of extents on disk, with data and refcounts + std::list<CachedExtentRef> existing_block_list; + existing_block_stats_t existing_block_stats; + + /** + * retire_set + * + * Set of extents retired by *this. + */ + pextent_set_t retired_set; + + /// stats to collect when commit or invalidate + tree_stats_t onode_tree_stats; + tree_stats_t omap_tree_stats; // exclude omap tree depth + tree_stats_t lba_tree_stats; + tree_stats_t backref_tree_stats; + ool_write_stats_t ool_write_stats; + version_stat_t rewrite_version_stats; + + bool conflicted = false; + + bool has_reset = false; + + OrderingHandle handle; + + on_destruct_func_t on_destruct; + + const src_t src; + + transaction_id_t trans_id = TRANS_ID_NULL; +}; +using TransactionRef = Transaction::Ref; + +/// Should only be used with dummy staged-fltree node extent manager +inline TransactionRef make_test_transaction() { + static transaction_id_t next_id = 0; + return std::make_unique<Transaction>( + get_dummy_ordering_handle(), + false, + Transaction::src_t::MUTATE, + JOURNAL_SEQ_NULL, + [](Transaction&) {}, + ++next_id + ); +} + +struct TransactionConflictCondition { + class transaction_conflict final : public std::exception { + public: + const char* what() const noexcept final { + return "transaction conflict detected"; + } + }; + +public: + TransactionConflictCondition(Transaction &t) : t(t) {} + + template <typename Fut> + std::optional<Fut> may_interrupt() { + if (t.conflicted) { + return seastar::futurize<Fut>::make_exception_future( + transaction_conflict()); + } else { + return std::optional<Fut>(); + } + } + + template <typename T> + static constexpr bool is_interruption_v = + std::is_same_v<T, transaction_conflict>; + + + static bool is_interruption(std::exception_ptr& eptr) { + return *eptr.__cxa_exception_type() == typeid(transaction_conflict); + } + +private: + Transaction &t; +}; + +using trans_intr = crimson::interruptible::interruptor< + TransactionConflictCondition + >; + +template <typename E> +using trans_iertr = + crimson::interruptible::interruptible_errorator< + TransactionConflictCondition, + E + >; + +template <typename F, typename... Args> +auto with_trans_intr(Transaction &t, F &&f, Args&&... 
args) { + return trans_intr::with_interruption_to_error<crimson::ct_error::eagain>( + std::move(f), + TransactionConflictCondition(t), + t, + std::forward<Args>(args)...); +} + +template <typename T> +using with_trans_ertr = typename T::base_ertr::template extend<crimson::ct_error::eagain>; + +} + +#if FMT_VERSION >= 90000 +template <> struct fmt::formatter<crimson::os::seastore::io_stat_t> : fmt::ostream_formatter {}; +#endif diff --git a/src/crimson/os/seastore/transaction_manager.cc b/src/crimson/os/seastore/transaction_manager.cc new file mode 100644 index 000000000..ad8e5f1a6 --- /dev/null +++ b/src/crimson/os/seastore/transaction_manager.cc @@ -0,0 +1,759 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab expandtab + +#include "include/denc.h" +#include "include/intarith.h" + +#include "crimson/os/seastore/logging.h" +#include "crimson/os/seastore/transaction_manager.h" +#include "crimson/os/seastore/journal.h" +#include "crimson/os/seastore/journal/circular_bounded_journal.h" +#include "crimson/os/seastore/lba_manager/btree/lba_btree_node.h" +#include "crimson/os/seastore/random_block_manager/rbm_device.h" + +/* + * TransactionManager logs + * + * levels: + * - INFO: major initiation, closing operations + * - DEBUG: major extent related operations, INFO details + * - TRACE: DEBUG details + * - seastore_t logs + */ +SET_SUBSYS(seastore_tm); + +namespace crimson::os::seastore { + +TransactionManager::TransactionManager( + JournalRef _journal, + CacheRef _cache, + LBAManagerRef _lba_manager, + ExtentPlacementManagerRef &&_epm, + BackrefManagerRef&& _backref_manager) + : cache(std::move(_cache)), + lba_manager(std::move(_lba_manager)), + journal(std::move(_journal)), + epm(std::move(_epm)), + backref_manager(std::move(_backref_manager)) +{ + epm->set_extent_callback(this); + journal->set_write_pipeline(&write_pipeline); +} + +TransactionManager::mkfs_ertr::future<> TransactionManager::mkfs() +{ + LOG_PREFIX(TransactionManager::mkfs); + INFO("enter"); + return epm->mount( + ).safe_then([this] { + return journal->open_for_mkfs(); + }).safe_then([this](auto start_seq) { + journal->get_trimmer().update_journal_tails(start_seq, start_seq); + journal->get_trimmer().set_journal_head(start_seq); + return epm->open_for_write(); + }).safe_then([this, FNAME]() { + return with_transaction_intr( + Transaction::src_t::MUTATE, + "mkfs_tm", + [this, FNAME](auto& t) + { + cache->init(); + return cache->mkfs(t + ).si_then([this, &t] { + return lba_manager->mkfs(t); + }).si_then([this, &t] { + return backref_manager->mkfs(t); + }).si_then([this, FNAME, &t] { + INFOT("submitting mkfs transaction", t); + return submit_transaction_direct(t); + }); + }).handle_error( + crimson::ct_error::eagain::handle([] { + ceph_assert(0 == "eagain impossible"); + return mkfs_ertr::now(); + }), + mkfs_ertr::pass_further{} + ); + }).safe_then([this] { + return close(); + }).safe_then([FNAME] { + INFO("completed"); + }); +} + +TransactionManager::mount_ertr::future<> TransactionManager::mount() +{ + LOG_PREFIX(TransactionManager::mount); + INFO("enter"); + cache->init(); + return epm->mount( + ).safe_then([this] { + return journal->replay( + [this]( + const auto &offsets, + const auto &e, + const journal_seq_t &dirty_tail, + const journal_seq_t &alloc_tail, + sea_time_point modify_time) + { + auto start_seq = offsets.write_result.start_seq; + return cache->replay_delta( + start_seq, + offsets.record_block_base, + e, + dirty_tail, + alloc_tail, + modify_time); + }); + 
}).safe_then([this] { + return journal->open_for_mount(); + }).safe_then([this](auto start_seq) { + journal->get_trimmer().set_journal_head(start_seq); + return with_transaction_weak( + "mount", + [this](auto &t) + { + return cache->init_cached_extents(t, [this](auto &t, auto &e) { + if (is_backref_node(e->get_type())) { + return backref_manager->init_cached_extent(t, e); + } else { + return lba_manager->init_cached_extent(t, e); + } + }).si_then([this, &t] { + epm->start_scan_space(); + return backref_manager->scan_mapped_space( + t, + [this]( + paddr_t paddr, + paddr_t backref_key, + extent_len_t len, + extent_types_t type, + laddr_t laddr) { + if (is_backref_node(type)) { + assert(laddr == L_ADDR_NULL); + assert(backref_key != P_ADDR_NULL); + backref_manager->cache_new_backref_extent(paddr, backref_key, type); + cache->update_tree_extents_num(type, 1); + epm->mark_space_used(paddr, len); + } else if (laddr == L_ADDR_NULL) { + assert(backref_key == P_ADDR_NULL); + cache->update_tree_extents_num(type, -1); + epm->mark_space_free(paddr, len); + } else { + assert(backref_key == P_ADDR_NULL); + cache->update_tree_extents_num(type, 1); + epm->mark_space_used(paddr, len); + } + }); + }); + }); + }).safe_then([this] { + return epm->open_for_write(); + }).safe_then([FNAME, this] { + epm->start_background(); + INFO("completed"); + }).handle_error( + mount_ertr::pass_further{}, + crimson::ct_error::all_same_way([] { + ceph_assert(0 == "unhandled error"); + return mount_ertr::now(); + }) + ); +} + +TransactionManager::close_ertr::future<> TransactionManager::close() { + LOG_PREFIX(TransactionManager::close); + INFO("enter"); + return epm->stop_background( + ).then([this] { + return cache->close(); + }).safe_then([this] { + cache->dump_contents(); + return journal->close(); + }).safe_then([this] { + return epm->close(); + }).safe_then([FNAME] { + INFO("completed"); + return seastar::now(); + }); +} + +TransactionManager::ref_ret TransactionManager::inc_ref( + Transaction &t, + LogicalCachedExtentRef &ref) +{ + LOG_PREFIX(TransactionManager::inc_ref); + TRACET("{}", t, *ref); + return lba_manager->incref_extent(t, ref->get_laddr() + ).si_then([FNAME, ref, &t](auto result) { + DEBUGT("extent refcount is incremented to {} -- {}", + t, result.refcount, *ref); + return result.refcount; + }).handle_error_interruptible( + ref_iertr::pass_further{}, + ct_error::all_same_way([](auto e) { + ceph_assert(0 == "unhandled error, TODO"); + })); +} + +TransactionManager::ref_ret TransactionManager::inc_ref( + Transaction &t, + laddr_t offset) +{ + LOG_PREFIX(TransactionManager::inc_ref); + TRACET("{}", t, offset); + return lba_manager->incref_extent(t, offset + ).si_then([FNAME, offset, &t](auto result) { + DEBUGT("extent refcount is incremented to {} -- {}~{}, {}", + t, result.refcount, offset, result.length, result.addr); + return result.refcount; + }); +} + +TransactionManager::ref_ret TransactionManager::dec_ref( + Transaction &t, + LogicalCachedExtentRef &ref) +{ + LOG_PREFIX(TransactionManager::dec_ref); + TRACET("{}", t, *ref); + return lba_manager->decref_extent(t, ref->get_laddr(), true + ).si_then([this, FNAME, &t, ref](auto result) { + DEBUGT("extent refcount is decremented to {} -- {}", + t, result.refcount, *ref); + if (result.refcount == 0) { + cache->retire_extent(t, ref); + } + return result.refcount; + }); +} + +TransactionManager::ref_ret TransactionManager::_dec_ref( + Transaction &t, + laddr_t offset, + bool cascade_remove) +{ + LOG_PREFIX(TransactionManager::_dec_ref); + TRACET("{}", t, 
offset); + return lba_manager->decref_extent(t, offset, cascade_remove + ).si_then([this, FNAME, offset, &t](auto result) -> ref_ret { + DEBUGT("extent refcount is decremented to {} -- {}~{}, {}", + t, result.refcount, offset, result.length, result.addr); + auto fut = ref_iertr::now(); + if (result.refcount == 0) { + if (result.addr.is_paddr() && + !result.addr.get_paddr().is_zero()) { + fut = cache->retire_extent_addr( + t, result.addr.get_paddr(), result.length); + } + } + + return fut.si_then([result=std::move(result)] { + return result.refcount; + }); + }); +} + +TransactionManager::refs_ret TransactionManager::dec_ref( + Transaction &t, + std::vector<laddr_t> offsets) +{ + LOG_PREFIX(TransactionManager::dec_ref); + DEBUG("{} offsets", offsets.size()); + return seastar::do_with(std::move(offsets), std::vector<unsigned>(), + [this, &t] (auto &&offsets, auto &refcnt) { + return trans_intr::do_for_each(offsets.begin(), offsets.end(), + [this, &t, &refcnt] (auto &laddr) { + return this->dec_ref(t, laddr).si_then([&refcnt] (auto ref) { + refcnt.push_back(ref); + return ref_iertr::now(); + }); + }).si_then([&refcnt] { + return ref_iertr::make_ready_future<std::vector<unsigned>>(std::move(refcnt)); + }); + }); +} + +TransactionManager::submit_transaction_iertr::future<> +TransactionManager::submit_transaction( + Transaction &t) +{ + LOG_PREFIX(TransactionManager::submit_transaction); + SUBTRACET(seastore_t, "start", t); + return trans_intr::make_interruptible( + t.get_handle().enter(write_pipeline.reserve_projected_usage) + ).then_interruptible([this, FNAME, &t] { + auto dispatch_result = epm->dispatch_delayed_extents(t); + auto projected_usage = dispatch_result.usage; + SUBTRACET(seastore_t, "waiting for projected_usage: {}", t, projected_usage); + return trans_intr::make_interruptible( + epm->reserve_projected_usage(projected_usage) + ).then_interruptible([this, &t, dispatch_result = std::move(dispatch_result)] { + return do_submit_transaction(t, std::move(dispatch_result)); + }).finally([this, FNAME, projected_usage, &t] { + SUBTRACET(seastore_t, "releasing projected_usage: {}", t, projected_usage); + epm->release_projected_usage(projected_usage); + }); + }); +} + +TransactionManager::submit_transaction_direct_ret +TransactionManager::submit_transaction_direct( + Transaction &tref, + std::optional<journal_seq_t> trim_alloc_to) +{ + return do_submit_transaction( + tref, + epm->dispatch_delayed_extents(tref), + trim_alloc_to); +} + +TransactionManager::submit_transaction_direct_ret +TransactionManager::do_submit_transaction( + Transaction &tref, + ExtentPlacementManager::dispatch_result_t dispatch_result, + std::optional<journal_seq_t> trim_alloc_to) +{ + LOG_PREFIX(TransactionManager::do_submit_transaction); + SUBTRACET(seastore_t, "start", tref); + return trans_intr::make_interruptible( + tref.get_handle().enter(write_pipeline.ool_writes) + ).then_interruptible([this, FNAME, &tref, + dispatch_result = std::move(dispatch_result)] { + return seastar::do_with(std::move(dispatch_result), + [this, FNAME, &tref](auto &dispatch_result) { + return epm->write_delayed_ool_extents(tref, dispatch_result.alloc_map + ).si_then([this, FNAME, &tref, &dispatch_result] { + SUBTRACET(seastore_t, "update delayed extent mappings", tref); + return lba_manager->update_mappings(tref, dispatch_result.delayed_extents); + }).handle_error_interruptible( + crimson::ct_error::input_output_error::pass_further(), + crimson::ct_error::assert_all("invalid error") + ); + }); + }).si_then([this, FNAME, &tref] { + auto 
allocated_extents = tref.get_valid_pre_alloc_list(); + auto num_extents = allocated_extents.size(); + SUBTRACET(seastore_t, "process {} allocated extents", tref, num_extents); + return epm->write_preallocated_ool_extents(tref, allocated_extents + ).handle_error_interruptible( + crimson::ct_error::input_output_error::pass_further(), + crimson::ct_error::assert_all("invalid error") + ); + }).si_then([this, FNAME, &tref] { + SUBTRACET(seastore_t, "about to prepare", tref); + return tref.get_handle().enter(write_pipeline.prepare); + }).si_then([this, FNAME, &tref, trim_alloc_to=std::move(trim_alloc_to)]() mutable + -> submit_transaction_iertr::future<> { + if (trim_alloc_to && *trim_alloc_to != JOURNAL_SEQ_NULL) { + cache->trim_backref_bufs(*trim_alloc_to); + } + + auto record = cache->prepare_record( + tref, + journal->get_trimmer().get_journal_head(), + journal->get_trimmer().get_dirty_tail()); + + tref.get_handle().maybe_release_collection_lock(); + + SUBTRACET(seastore_t, "about to submit to journal", tref); + return journal->submit_record(std::move(record), tref.get_handle() + ).safe_then([this, FNAME, &tref](auto submit_result) mutable { + SUBDEBUGT(seastore_t, "committed with {}", tref, submit_result); + auto start_seq = submit_result.write_result.start_seq; + journal->get_trimmer().set_journal_head(start_seq); + cache->complete_commit( + tref, + submit_result.record_block_base, + start_seq); + + std::vector<CachedExtentRef> lba_to_clear; + std::vector<CachedExtentRef> backref_to_clear; + lba_to_clear.reserve(tref.get_retired_set().size()); + backref_to_clear.reserve(tref.get_retired_set().size()); + for (auto &e: tref.get_retired_set()) { + if (e->is_logical() || is_lba_node(e->get_type())) + lba_to_clear.push_back(e); + else if (is_backref_node(e->get_type())) + backref_to_clear.push_back(e); + } + + journal->get_trimmer().update_journal_tails( + cache->get_oldest_dirty_from().value_or(start_seq), + cache->get_oldest_backref_dirty_from().value_or(start_seq)); + return journal->finish_commit(tref.get_src() + ).then([&tref] { + return tref.get_handle().complete(); + }); + }).handle_error( + submit_transaction_iertr::pass_further{}, + crimson::ct_error::all_same_way([](auto e) { + ceph_assert(0 == "Hit error submitting to journal"); + }) + ); + }).finally([&tref]() { + tref.get_handle().exit(); + }); +} + +seastar::future<> TransactionManager::flush(OrderingHandle &handle) +{ + LOG_PREFIX(TransactionManager::flush); + SUBDEBUG(seastore_t, "H{} start", (void*)&handle); + return handle.enter(write_pipeline.reserve_projected_usage + ).then([this, &handle] { + return handle.enter(write_pipeline.ool_writes); + }).then([this, &handle] { + return handle.enter(write_pipeline.prepare); + }).then([this, &handle] { + handle.maybe_release_collection_lock(); + return journal->flush(handle); + }).then([FNAME, &handle] { + SUBDEBUG(seastore_t, "H{} completed", (void*)&handle); + }); +} + +TransactionManager::get_next_dirty_extents_ret +TransactionManager::get_next_dirty_extents( + Transaction &t, + journal_seq_t seq, + size_t max_bytes) +{ + LOG_PREFIX(TransactionManager::get_next_dirty_extents); + DEBUGT("max_bytes={}B, seq={}", t, max_bytes, seq); + return cache->get_next_dirty_extents(t, seq, max_bytes); +} + +TransactionManager::rewrite_extent_ret +TransactionManager::rewrite_logical_extent( + Transaction& t, + LogicalCachedExtentRef extent) +{ + LOG_PREFIX(TransactionManager::rewrite_logical_extent); + if (extent->has_been_invalidated()) { + ERRORT("extent has been invalidated -- {}", t, *extent); 
+ ceph_abort(); + } + TRACET("rewriting extent -- {}", t, *extent); + + auto lextent = extent->cast<LogicalCachedExtent>(); + cache->retire_extent(t, extent); + auto nlextent = cache->alloc_new_extent_by_type( + t, + lextent->get_type(), + lextent->get_length(), + lextent->get_user_hint(), + // get target rewrite generation + lextent->get_rewrite_generation())->cast<LogicalCachedExtent>(); + lextent->get_bptr().copy_out( + 0, + lextent->get_length(), + nlextent->get_bptr().c_str()); + nlextent->set_laddr(lextent->get_laddr()); + nlextent->set_modify_time(lextent->get_modify_time()); + + DEBUGT("rewriting logical extent -- {} to {}", t, *lextent, *nlextent); + + /* This update_mapping is, strictly speaking, unnecessary for delayed_alloc + * extents since we're going to do it again once we either do the ool write + * or allocate a relative inline addr. TODO: refactor AsyncCleaner to + * avoid this complication. */ + return lba_manager->update_mapping( + t, + lextent->get_laddr(), + lextent->get_paddr(), + nlextent->get_paddr(), + nlextent.get()); +} + +TransactionManager::rewrite_extent_ret TransactionManager::rewrite_extent( + Transaction &t, + CachedExtentRef extent, + rewrite_gen_t target_generation, + sea_time_point modify_time) +{ + LOG_PREFIX(TransactionManager::rewrite_extent); + + { + auto updated = cache->update_extent_from_transaction(t, extent); + if (!updated) { + DEBUGT("extent is already retired, skipping -- {}", t, *extent); + return rewrite_extent_iertr::now(); + } + extent = updated; + ceph_assert(!extent->is_pending_io()); + } + + assert(extent->is_valid() && !extent->is_initial_pending()); + if (extent->is_dirty()) { + extent->set_target_rewrite_generation(INIT_GENERATION); + } else { + extent->set_target_rewrite_generation(target_generation); + ceph_assert(modify_time != NULL_TIME); + extent->set_modify_time(modify_time); + } + + t.get_rewrite_version_stats().increment(extent->get_version()); + + if (is_backref_node(extent->get_type())) { + DEBUGT("rewriting backref extent -- {}", t, *extent); + return backref_manager->rewrite_extent(t, extent); + } + + if (extent->get_type() == extent_types_t::ROOT) { + DEBUGT("rewriting root extent -- {}", t, *extent); + cache->duplicate_for_write(t, extent); + return rewrite_extent_iertr::now(); + } + + if (extent->is_logical()) { + return rewrite_logical_extent(t, extent->cast<LogicalCachedExtent>()); + } else { + DEBUGT("rewriting physical extent -- {}", t, *extent); + return lba_manager->rewrite_extent(t, extent); + } +} + +TransactionManager::get_extents_if_live_ret +TransactionManager::get_extents_if_live( + Transaction &t, + extent_types_t type, + paddr_t paddr, + laddr_t laddr, + extent_len_t len) +{ + LOG_PREFIX(TransactionManager::get_extent_if_live); + TRACET("{} {}~{} {}", t, type, laddr, len, paddr); + + // This only works with segments to check if alive, + // as parallel transactions may split the extent at the same time. 
+ ceph_assert(paddr.get_addr_type() == paddr_types_t::SEGMENT); + + return cache->get_extent_if_cached(t, paddr, type + ).si_then([=, this, &t](auto extent) + -> get_extents_if_live_ret { + if (extent && extent->get_length() == len) { + DEBUGT("{} {}~{} {} is live in cache -- {}", + t, type, laddr, len, paddr, *extent); + std::list<CachedExtentRef> res; + res.emplace_back(std::move(extent)); + return get_extents_if_live_ret( + interruptible::ready_future_marker{}, + res); + } + + if (is_logical_type(type)) { + return lba_manager->get_mappings( + t, + laddr, + len + ).si_then([=, this, &t](lba_pin_list_t pin_list) { + return seastar::do_with( + std::list<CachedExtentRef>(), + [=, this, &t, pin_list=std::move(pin_list)]( + std::list<CachedExtentRef> &list) mutable + { + auto paddr_seg_id = paddr.as_seg_paddr().get_segment_id(); + return trans_intr::parallel_for_each( + pin_list, + [=, this, &list, &t]( + LBAMappingRef &pin) -> Cache::get_extent_iertr::future<> + { + auto pin_paddr = pin->get_val(); + auto &pin_seg_paddr = pin_paddr.as_seg_paddr(); + auto pin_paddr_seg_id = pin_seg_paddr.get_segment_id(); + auto pin_len = pin->get_length(); + if (pin_paddr_seg_id != paddr_seg_id) { + return seastar::now(); + } + // Only extent split can happen during the lookup + ceph_assert(pin_seg_paddr >= paddr && + pin_seg_paddr.add_offset(pin_len) <= paddr.add_offset(len)); + return read_pin_by_type(t, std::move(pin), type + ).si_then([&list](auto ret) { + list.emplace_back(std::move(ret)); + return seastar::now(); + }); + }).si_then([&list] { + return get_extents_if_live_ret( + interruptible::ready_future_marker{}, + std::move(list)); + }); + }); + }).handle_error_interruptible(crimson::ct_error::enoent::handle([] { + return get_extents_if_live_ret( + interruptible::ready_future_marker{}, + std::list<CachedExtentRef>()); + }), crimson::ct_error::pass_further_all{}); + } else { + return lba_manager->get_physical_extent_if_live( + t, + type, + paddr, + laddr, + len + ).si_then([=, &t](auto ret) { + std::list<CachedExtentRef> res; + if (ret) { + DEBUGT("{} {}~{} {} is live as physical extent -- {}", + t, type, laddr, len, paddr, *ret); + res.emplace_back(std::move(ret)); + } else { + DEBUGT("{} {}~{} {} is not live as physical extent", + t, type, laddr, len, paddr); + } + return get_extents_if_live_ret( + interruptible::ready_future_marker{}, + std::move(res)); + }); + } + }); +} + +TransactionManager::~TransactionManager() {} + +TransactionManagerRef make_transaction_manager( + Device *primary_device, + const std::vector<Device*> &secondary_devices, + bool is_test) +{ + auto epm = std::make_unique<ExtentPlacementManager>(); + auto cache = std::make_unique<Cache>(*epm); + auto lba_manager = lba_manager::create_lba_manager(*cache); + auto sms = std::make_unique<SegmentManagerGroup>(); + auto rbs = std::make_unique<RBMDeviceGroup>(); + auto backref_manager = create_backref_manager(*cache); + SegmentManagerGroupRef cold_sms = nullptr; + std::vector<SegmentProvider*> segment_providers_by_id{DEVICE_ID_MAX, nullptr}; + + auto p_backend_type = primary_device->get_backend_type(); + + if (p_backend_type == backend_type_t::SEGMENTED) { + auto dtype = primary_device->get_device_type(); + ceph_assert(dtype != device_type_t::HDD && + dtype != device_type_t::EPHEMERAL_COLD); + sms->add_segment_manager(static_cast<SegmentManager*>(primary_device)); + } else { + auto rbm = std::make_unique<BlockRBManager>( + static_cast<RBMDevice*>(primary_device), "", is_test); + rbs->add_rb_manager(std::move(rbm)); + } + + for (auto 
&p_dev : secondary_devices) { + if (p_dev->get_backend_type() == backend_type_t::SEGMENTED) { + if (p_dev->get_device_type() == primary_device->get_device_type()) { + sms->add_segment_manager(static_cast<SegmentManager*>(p_dev)); + } else { + if (!cold_sms) { + cold_sms = std::make_unique<SegmentManagerGroup>(); + } + cold_sms->add_segment_manager(static_cast<SegmentManager*>(p_dev)); + } + } else { + auto rbm = std::make_unique<BlockRBManager>( + static_cast<RBMDevice*>(p_dev), "", is_test); + rbs->add_rb_manager(std::move(rbm)); + } + } + + auto journal_type = p_backend_type; + device_off_t roll_size; + device_off_t roll_start; + if (journal_type == journal_type_t::SEGMENTED) { + roll_size = static_cast<SegmentManager*>(primary_device)->get_segment_size(); + roll_start = 0; + } else { + roll_size = static_cast<random_block_device::RBMDevice*>(primary_device) + ->get_journal_size() - primary_device->get_block_size(); + // see CircularBoundedJournal::get_records_start() + roll_start = static_cast<random_block_device::RBMDevice*>(primary_device) + ->get_shard_journal_start() + primary_device->get_block_size(); + ceph_assert_always(roll_size <= DEVICE_OFF_MAX); + ceph_assert_always((std::size_t)roll_size + roll_start <= + primary_device->get_available_size()); + } + ceph_assert(roll_size % primary_device->get_block_size() == 0); + ceph_assert(roll_start % primary_device->get_block_size() == 0); + + bool cleaner_is_detailed; + SegmentCleaner::config_t cleaner_config; + JournalTrimmerImpl::config_t trimmer_config; + if (is_test) { + cleaner_is_detailed = true; + cleaner_config = SegmentCleaner::config_t::get_test(); + trimmer_config = JournalTrimmerImpl::config_t::get_test( + roll_size, journal_type); + } else { + cleaner_is_detailed = false; + cleaner_config = SegmentCleaner::config_t::get_default(); + trimmer_config = JournalTrimmerImpl::config_t::get_default( + roll_size, journal_type); + } + + auto journal_trimmer = JournalTrimmerImpl::create( + *backref_manager, trimmer_config, + journal_type, roll_start, roll_size); + + AsyncCleanerRef cleaner; + JournalRef journal; + + SegmentCleanerRef cold_segment_cleaner = nullptr; + + if (cold_sms) { + cold_segment_cleaner = SegmentCleaner::create( + cleaner_config, + std::move(cold_sms), + *backref_manager, + epm->get_ool_segment_seq_allocator(), + cleaner_is_detailed, + /* is_cold = */ true); + if (journal_type == journal_type_t::SEGMENTED) { + for (auto id : cold_segment_cleaner->get_device_ids()) { + segment_providers_by_id[id] = + static_cast<SegmentProvider*>(cold_segment_cleaner.get()); + } + } + } + + if (journal_type == journal_type_t::SEGMENTED) { + cleaner = SegmentCleaner::create( + cleaner_config, + std::move(sms), + *backref_manager, + epm->get_ool_segment_seq_allocator(), + cleaner_is_detailed); + auto segment_cleaner = static_cast<SegmentCleaner*>(cleaner.get()); + for (auto id : segment_cleaner->get_device_ids()) { + segment_providers_by_id[id] = + static_cast<SegmentProvider*>(segment_cleaner); + } + segment_cleaner->set_journal_trimmer(*journal_trimmer); + journal = journal::make_segmented( + *segment_cleaner, + *journal_trimmer); + } else { + cleaner = RBMCleaner::create( + std::move(rbs), + *backref_manager, + cleaner_is_detailed); + journal = journal::make_circularbounded( + *journal_trimmer, + static_cast<random_block_device::RBMDevice*>(primary_device), + ""); + } + + cache->set_segment_providers(std::move(segment_providers_by_id)); + + epm->init(std::move(journal_trimmer), + std::move(cleaner), + 
std::move(cold_segment_cleaner)); + epm->set_primary_device(primary_device); + + return std::make_unique<TransactionManager>( + std::move(journal), + std::move(cache), + std::move(lba_manager), + std::move(epm), + std::move(backref_manager)); +} + +} diff --git a/src/crimson/os/seastore/transaction_manager.h b/src/crimson/os/seastore/transaction_manager.h new file mode 100644 index 000000000..dd1898ba7 --- /dev/null +++ b/src/crimson/os/seastore/transaction_manager.h @@ -0,0 +1,928 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <iostream> +#include <optional> +#include <vector> +#include <utility> +#include <functional> + +#include <boost/intrusive_ptr.hpp> +#include <boost/iterator/counting_iterator.hpp> +#include <boost/smart_ptr/intrusive_ref_counter.hpp> + +#include <seastar/core/future.hh> + +#include "include/ceph_assert.h" +#include "include/buffer.h" + +#include "crimson/osd/exceptions.h" + +#include "crimson/os/seastore/logging.h" +#include "crimson/os/seastore/seastore_types.h" +#include "crimson/os/seastore/cache.h" +#include "crimson/os/seastore/lba_manager.h" +#include "crimson/os/seastore/backref_manager.h" +#include "crimson/os/seastore/journal.h" +#include "crimson/os/seastore/extent_placement_manager.h" +#include "crimson/os/seastore/device.h" + +namespace crimson::os::seastore { +class Journal; + +template <typename F> +auto repeat_eagain(F &&f) { + return seastar::do_with( + std::forward<F>(f), + [](auto &f) + { + return crimson::repeat([&f] { + return std::invoke(f + ).safe_then([] { + return seastar::stop_iteration::yes; + }).handle_error( + [](const crimson::ct_error::eagain &e) { + return seastar::stop_iteration::no; + }, + crimson::ct_error::pass_further_all{} + ); + }); + }); +} + +/** + * TransactionManager + * + * Abstraction hiding reading and writing to persistence. + * Exposes transaction based interface with read isolation. 
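+ *
+ * A typical call pattern, as an illustrative sketch only: the extent type
+ * example_block_t, laddr_hint and the 4096-byte length are placeholders.
+ * Conflicted transactions surface as crimson::ct_error::eagain through
+ * with_trans_intr(), so callers normally retry the whole attempt with
+ * repeat_eagain():
+ *
+ *   return repeat_eagain([&] {
+ *     return seastar::do_with(
+ *       tm.create_transaction(Transaction::src_t::MUTATE, "example"),
+ *       [&](auto &tref) {
+ *         return with_trans_intr(*tref, [&](auto &t) {
+ *           return tm.alloc_extent<example_block_t>(t, laddr_hint, 4096
+ *           ).si_then([&](auto extent) {
+ *             // fill extent->get_bptr() with payload, then commit
+ *             return tm.submit_transaction(t);
+ *           });
+ *         });
+ *       });
+ *   });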
+ */ +class TransactionManager : public ExtentCallbackInterface { +public: + TransactionManager( + JournalRef journal, + CacheRef cache, + LBAManagerRef lba_manager, + ExtentPlacementManagerRef &&epm, + BackrefManagerRef&& backref_manager); + + /// Writes initial metadata to disk + using mkfs_ertr = base_ertr; + mkfs_ertr::future<> mkfs(); + + /// Reads initial metadata from disk + using mount_ertr = base_ertr; + mount_ertr::future<> mount(); + + /// Closes transaction_manager + using close_ertr = base_ertr; + close_ertr::future<> close(); + + /// Resets transaction + void reset_transaction_preserve_handle(Transaction &t) { + return cache->reset_transaction_preserve_handle(t); + } + + /** + * get_pin + * + * Get the logical pin at offset + */ + using get_pin_iertr = LBAManager::get_mapping_iertr; + using get_pin_ret = LBAManager::get_mapping_iertr::future<LBAMappingRef>; + get_pin_ret get_pin( + Transaction &t, + laddr_t offset) { + LOG_PREFIX(TransactionManager::get_pin); + SUBTRACET(seastore_tm, "{}", t, offset); + return lba_manager->get_mapping(t, offset); + } + + /** + * get_pins + * + * Get logical pins overlapping offset~length + */ + using get_pins_iertr = LBAManager::get_mappings_iertr; + using get_pins_ret = get_pins_iertr::future<lba_pin_list_t>; + get_pins_ret get_pins( + Transaction &t, + laddr_t offset, + extent_len_t length) { + LOG_PREFIX(TransactionManager::get_pins); + SUBDEBUGT(seastore_tm, "{}~{}", t, offset, length); + return lba_manager->get_mappings( + t, offset, length); + } + + /** + * read_extent + * + * Read extent of type T at offset~length + */ + using read_extent_iertr = get_pin_iertr; + template <typename T> + using read_extent_ret = read_extent_iertr::future< + TCachedExtentRef<T>>; + template <typename T> + read_extent_ret<T> read_extent( + Transaction &t, + laddr_t offset, + extent_len_t length) { + LOG_PREFIX(TransactionManager::read_extent); + SUBTRACET(seastore_tm, "{}~{}", t, offset, length); + return get_pin( + t, offset + ).si_then([this, FNAME, &t, offset, length] (auto pin) + -> read_extent_ret<T> { + if (length != pin->get_length() || !pin->get_val().is_real()) { + SUBERRORT(seastore_tm, + "offset {} len {} got wrong pin {}", + t, offset, length, *pin); + ceph_assert(0 == "Should be impossible"); + } + return this->read_pin<T>(t, std::move(pin)); + }); + } + + /** + * read_extent + * + * Read extent of type T at offset + */ + template <typename T> + read_extent_ret<T> read_extent( + Transaction &t, + laddr_t offset) { + LOG_PREFIX(TransactionManager::read_extent); + SUBTRACET(seastore_tm, "{}", t, offset); + return get_pin( + t, offset + ).si_then([this, FNAME, &t, offset] (auto pin) + -> read_extent_ret<T> { + if (!pin->get_val().is_real()) { + SUBERRORT(seastore_tm, + "offset {} got wrong pin {}", + t, offset, *pin); + ceph_assert(0 == "Should be impossible"); + } + return this->read_pin<T>(t, std::move(pin)); + }); + } + + template <typename T> + base_iertr::future<TCachedExtentRef<T>> read_pin( + Transaction &t, + LBAMappingRef pin) + { + auto v = pin->get_logical_extent(t); + if (v.has_child()) { + return v.get_child_fut().safe_then([pin=std::move(pin)](auto extent) { +#ifndef NDEBUG + auto lextent = extent->template cast<LogicalCachedExtent>(); + auto pin_laddr = pin->get_key(); + if (pin->is_indirect()) { + pin_laddr = pin->get_intermediate_base(); + } + assert(lextent->get_laddr() == pin_laddr); +#endif + return extent->template cast<T>(); + }); + } else { + return pin_to_extent<T>(t, std::move(pin)); + } + } + + 
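+  // read_pin() above reuses the child extent already linked to the mapping
+  // when one exists, and otherwise falls back to pin_to_extent<T>() to load
+  // it through the cache. An illustrative sketch of a lookup by logical
+  // address (my_block_t is a placeholder extent type):
+  //
+  //   return tm.get_pin(t, offset
+  //   ).si_then([&tm, &t](LBAMappingRef pin) {
+  //     return tm.read_pin<my_block_t>(t, std::move(pin));
+  //   });
+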
base_iertr::future<LogicalCachedExtentRef> read_pin_by_type( + Transaction &t, + LBAMappingRef pin, + extent_types_t type) + { + auto v = pin->get_logical_extent(t); + if (v.has_child()) { + return std::move(v.get_child_fut()); + } else { + return pin_to_extent_by_type(t, std::move(pin), type); + } + } + + /// Obtain mutable copy of extent + LogicalCachedExtentRef get_mutable_extent(Transaction &t, LogicalCachedExtentRef ref) { + LOG_PREFIX(TransactionManager::get_mutable_extent); + auto ret = cache->duplicate_for_write( + t, + ref)->cast<LogicalCachedExtent>(); + if (!ret->has_laddr()) { + SUBDEBUGT(seastore_tm, + "duplicating extent for write -- {} -> {}", + t, + *ref, + *ret); + ret->set_laddr(ref->get_laddr()); + } else { + SUBTRACET(seastore_tm, + "extent is already duplicated -- {}", + t, + *ref); + assert(ref->is_mutable()); + assert(&*ref == &*ret); + } + return ret; + } + + + using ref_iertr = LBAManager::ref_iertr; + using ref_ret = ref_iertr::future<unsigned>; + + /// Add refcount for ref + ref_ret inc_ref( + Transaction &t, + LogicalCachedExtentRef &ref); + + /// Add refcount for offset + ref_ret inc_ref( + Transaction &t, + laddr_t offset); + + /// Remove refcount for ref + ref_ret dec_ref( + Transaction &t, + LogicalCachedExtentRef &ref); + + /// Remove refcount for offset + ref_ret dec_ref( + Transaction &t, + laddr_t offset) { + return _dec_ref(t, offset, true); + } + + /// remove refcount for list of offset + using refs_ret = ref_iertr::future<std::vector<unsigned>>; + refs_ret dec_ref( + Transaction &t, + std::vector<laddr_t> offsets); + + /** + * alloc_extent + * + * Allocates a new block of type T with the minimum lba range of size len + * greater than laddr_hint. + */ + using alloc_extent_iertr = LBAManager::alloc_extent_iertr; + template <typename T> + using alloc_extent_ret = alloc_extent_iertr::future<TCachedExtentRef<T>>; + template <typename T> + alloc_extent_ret<T> alloc_extent( + Transaction &t, + laddr_t laddr_hint, + extent_len_t len, + placement_hint_t placement_hint = placement_hint_t::HOT) { + LOG_PREFIX(TransactionManager::alloc_extent); + SUBTRACET(seastore_tm, "{} len={}, placement_hint={}, laddr_hint={}", + t, T::TYPE, len, placement_hint, laddr_hint); + ceph_assert(is_aligned(laddr_hint, epm->get_block_size())); + auto ext = cache->alloc_new_extent<T>( + t, + len, + placement_hint, + INIT_GENERATION); + return lba_manager->alloc_extent( + t, + laddr_hint, + len, + ext->get_paddr(), + *ext + ).si_then([ext=std::move(ext), laddr_hint, &t](auto &&) mutable { + LOG_PREFIX(TransactionManager::alloc_extent); + SUBDEBUGT(seastore_tm, "new extent: {}, laddr_hint: {}", t, *ext, laddr_hint); + return alloc_extent_iertr::make_ready_future<TCachedExtentRef<T>>( + std::move(ext)); + }); + } + + /** + * remap_pin + * + * Remap original extent to new extents. + * Return the pins of new extent. 
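+   *
+   * An illustrative sketch (the offsets assume a 4096-byte block size and
+   * example_block_t is a placeholder type): keep the first and last block
+   * of a 12288-byte mapping and drop the middle one. The remap entries must
+   * be non-overlapping, block-aligned and strictly smaller in total than
+   * the original mapping:
+   *
+   *   std::array<remap_entry, 2> remaps{
+   *     remap_entry(0, 4096),
+   *     remap_entry(8192, 4096)};
+   *   return tm.remap_pin<example_block_t, 2>(t, std::move(pin), remaps);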
+ */ + struct remap_entry { + extent_len_t offset; + extent_len_t len; + remap_entry(extent_len_t _offset, extent_len_t _len) { + offset = _offset; + len = _len; + } + }; + using remap_pin_iertr = base_iertr; + template <std::size_t N> + using remap_pin_ret = remap_pin_iertr::future<std::array<LBAMappingRef, N>>; + template <typename T, std::size_t N> + remap_pin_ret<N> remap_pin( + Transaction &t, + LBAMappingRef &&pin, + std::array<remap_entry, N> remaps) { + +#ifndef NDEBUG + std::sort(remaps.begin(), remaps.end(), + [](remap_entry x, remap_entry y) { + return x.offset < y.offset; + }); + auto original_len = pin->get_length(); + extent_len_t total_remap_len = 0; + extent_len_t last_offset = 0; + extent_len_t last_len = 0; + + for (auto &remap : remaps) { + auto remap_offset = remap.offset; + auto remap_len = remap.len; + total_remap_len += remap.len; + ceph_assert(remap_offset >= (last_offset + last_len)); + last_offset = remap_offset; + last_len = remap_len; + } + ceph_assert(total_remap_len < original_len); +#endif + + // FIXME: paddr can be absolute and pending + ceph_assert(pin->get_val().is_absolute()); + return cache->get_extent_if_cached( + t, pin->get_val(), T::TYPE + ).si_then([this, &t, remaps, + original_laddr = pin->get_key(), + intermediate_base = pin->is_indirect() + ? pin->get_intermediate_base() + : L_ADDR_NULL, + intermediate_key = pin->is_indirect() + ? pin->get_intermediate_key() + : L_ADDR_NULL, + original_paddr = pin->get_val(), + original_len = pin->get_length()](auto ext) mutable { + std::optional<ceph::bufferptr> original_bptr; + LOG_PREFIX(TransactionManager::remap_pin); + SUBDEBUGT(seastore_tm, + "original laddr: {}, original paddr: {}, original length: {}," + " intermediate_base: {}, intermediate_key: {}," + " remap to {} extents", + t, original_laddr, original_paddr, original_len, + intermediate_base, intermediate_key, remaps.size()); + ceph_assert( + (intermediate_base == L_ADDR_NULL) + == (intermediate_key == L_ADDR_NULL)); + if (ext) { + // FIXME: cannot and will not remap a dirty extent for now. 
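+        // The checks below enforce that restriction: a cached extent found at
+        // the original paddr may only donate its buffer (original_bptr) to the
+        // remapped extents if it is clean, not mutable, and covers the whole
+        // original mapping at the same paddr.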
+ ceph_assert(!ext->is_dirty()); + ceph_assert(!ext->is_mutable()); + ceph_assert(ext->get_length() >= original_len); + ceph_assert(ext->get_paddr() == original_paddr); + original_bptr = ext->get_bptr(); + } + return seastar::do_with( + std::array<LBAMappingRef, N>(), + 0, + std::move(original_bptr), + std::vector<remap_entry>(remaps.begin(), remaps.end()), + [this, &t, original_laddr, original_paddr, + original_len, intermediate_base, intermediate_key] + (auto &ret, auto &count, auto &original_bptr, auto &remaps) { + return _dec_ref(t, original_laddr, false + ).si_then([this, &t, &original_bptr, &ret, &count, + &remaps, intermediate_base, intermediate_key, + original_laddr, original_paddr, original_len](auto) { + return trans_intr::do_for_each( + remaps.begin(), + remaps.end(), + [this, &t, &original_bptr, &ret, + &count, intermediate_base, intermediate_key, + original_laddr, original_paddr, original_len](auto &remap) { + LOG_PREFIX(TransactionManager::remap_pin); + auto remap_offset = remap.offset; + auto remap_len = remap.len; + auto remap_laddr = original_laddr + remap_offset; + auto remap_paddr = original_paddr.add_offset(remap_offset); + ceph_assert(remap_len < original_len); + ceph_assert(remap_offset + remap_len <= original_len); + ceph_assert(remap_len != 0); + ceph_assert(remap_offset % cache->get_block_size() == 0); + ceph_assert(remap_len % cache->get_block_size() == 0); + SUBDEBUGT(seastore_tm, + "remap laddr: {}, remap paddr: {}, remap length: {}", t, + remap_laddr, remap_paddr, remap_len); + auto remapped_intermediate_key = intermediate_key; + if (remapped_intermediate_key != L_ADDR_NULL) { + assert(intermediate_base != L_ADDR_NULL); + remapped_intermediate_key += remap_offset; + } + return alloc_remapped_extent<T>( + t, + remap_laddr, + remap_paddr, + remap_len, + original_laddr, + intermediate_base, + remapped_intermediate_key, + std::move(original_bptr) + ).si_then([&ret, &count, remap_laddr](auto &&npin) { + ceph_assert(npin->get_key() == remap_laddr); + ret[count++] = std::move(npin); + }); + }); + }).si_then([this, &t, intermediate_base, intermediate_key] { + if (N > 1 && intermediate_key != L_ADDR_NULL) { + return lba_manager->incref_extent( + t, intermediate_base, N - 1 + ).si_then([](auto) { + return seastar::now(); + }); + } + return LBAManager::ref_iertr::now(); + }).handle_error_interruptible( + remap_pin_iertr::pass_further{}, + crimson::ct_error::assert_all{ + "TransactionManager::remap_pin hit invalid error" + } + ).si_then([&ret, &count] { + ceph_assert(count == N); + return remap_pin_iertr::make_ready_future< + std::array<LBAMappingRef, N>>(std::move(ret)); + }); + }); + }); + } + + using reserve_extent_iertr = alloc_extent_iertr; + using reserve_extent_ret = reserve_extent_iertr::future<LBAMappingRef>; + reserve_extent_ret reserve_region( + Transaction &t, + laddr_t hint, + extent_len_t len) { + LOG_PREFIX(TransactionManager::reserve_region); + SUBDEBUGT(seastore_tm, "len={}, laddr_hint={}", t, len, hint); + ceph_assert(is_aligned(hint, epm->get_block_size())); + return lba_manager->reserve_region( + t, + hint, + len); + } + + /* + * clone_pin + * + * create an indirect lba mapping pointing to the physical + * lba mapping whose key is intermediate_key. 
Resort to btree_lba_manager.h + * for the definition of "indirect lba mapping" and "physical lba mapping" + * + */ + using clone_extent_iertr = alloc_extent_iertr; + using clone_extent_ret = clone_extent_iertr::future<LBAMappingRef>; + clone_extent_ret clone_pin( + Transaction &t, + laddr_t hint, + const LBAMapping &mapping) { + auto intermediate_key = + mapping.is_indirect() + ? mapping.get_intermediate_key() + : mapping.get_key(); + auto intermediate_base = + mapping.is_indirect() + ? mapping.get_intermediate_base() + : mapping.get_key(); + + LOG_PREFIX(TransactionManager::clone_pin); + SUBDEBUGT(seastore_tm, "len={}, laddr_hint={}, clone_offset {}", + t, mapping.get_length(), hint, intermediate_key); + ceph_assert(is_aligned(hint, epm->get_block_size())); + return lba_manager->clone_extent( + t, + hint, + mapping.get_length(), + intermediate_key, + mapping.get_val(), + intermediate_key + ).si_then([this, &t, intermediate_base](auto pin) { + return inc_ref(t, intermediate_base + ).si_then([pin=std::move(pin)](auto) mutable { + return std::move(pin); + }).handle_error_interruptible( + crimson::ct_error::input_output_error::pass_further(), + crimson::ct_error::assert_all("not possible") + ); + }); + } + + /* alloc_extents + * + * allocates more than one new blocks of type T. + */ + using alloc_extents_iertr = alloc_extent_iertr; + template<class T> + alloc_extents_iertr::future<std::vector<TCachedExtentRef<T>>> + alloc_extents( + Transaction &t, + laddr_t hint, + extent_len_t len, + int num) { + LOG_PREFIX(TransactionManager::alloc_extents); + SUBDEBUGT(seastore_tm, "len={}, laddr_hint={}, num={}", + t, len, hint, num); + return seastar::do_with(std::vector<TCachedExtentRef<T>>(), + [this, &t, hint, len, num] (auto &extents) { + return trans_intr::do_for_each( + boost::make_counting_iterator(0), + boost::make_counting_iterator(num), + [this, &t, len, hint, &extents] (auto i) { + return alloc_extent<T>(t, hint, len).si_then( + [&extents](auto &&node) { + extents.push_back(node); + }); + }).si_then([&extents] { + return alloc_extents_iertr::make_ready_future + <std::vector<TCachedExtentRef<T>>>(std::move(extents)); + }); + }); + } + + /** + * submit_transaction + * + * Atomically submits transaction to persistence + */ + using submit_transaction_iertr = base_iertr; + submit_transaction_iertr::future<> submit_transaction(Transaction &); + + /** + * flush + * + * Block until all outstanding IOs on handle are committed. + * Note, flush() machinery must go through the same pipeline + * stages and locks as submit_transaction. 
+ */ + seastar::future<> flush(OrderingHandle &handle); + + /* + * ExtentCallbackInterface + */ + + /// weak transaction should be type READ + TransactionRef create_transaction( + Transaction::src_t src, + const char* name, + bool is_weak=false) final { + return cache->create_transaction(src, name, is_weak); + } + + using ExtentCallbackInterface::submit_transaction_direct_ret; + submit_transaction_direct_ret submit_transaction_direct( + Transaction &t, + std::optional<journal_seq_t> seq_to_trim = std::nullopt) final; + + using ExtentCallbackInterface::get_next_dirty_extents_ret; + get_next_dirty_extents_ret get_next_dirty_extents( + Transaction &t, + journal_seq_t seq, + size_t max_bytes) final; + + using ExtentCallbackInterface::rewrite_extent_ret; + rewrite_extent_ret rewrite_extent( + Transaction &t, + CachedExtentRef extent, + rewrite_gen_t target_generation, + sea_time_point modify_time) final; + + using ExtentCallbackInterface::get_extents_if_live_ret; + get_extents_if_live_ret get_extents_if_live( + Transaction &t, + extent_types_t type, + paddr_t paddr, + laddr_t laddr, + extent_len_t len) final; + + /** + * read_root_meta + * + * Read root block meta entry for key. + */ + using read_root_meta_iertr = base_iertr; + using read_root_meta_bare = std::optional<std::string>; + using read_root_meta_ret = read_root_meta_iertr::future< + read_root_meta_bare>; + read_root_meta_ret read_root_meta( + Transaction &t, + const std::string &key) { + return cache->get_root( + t + ).si_then([&key, &t](auto root) { + LOG_PREFIX(TransactionManager::read_root_meta); + auto meta = root->root.get_meta(); + auto iter = meta.find(key); + if (iter == meta.end()) { + SUBDEBUGT(seastore_tm, "{} -> nullopt", t, key); + return seastar::make_ready_future<read_root_meta_bare>(std::nullopt); + } else { + SUBDEBUGT(seastore_tm, "{} -> {}", t, key, iter->second); + return seastar::make_ready_future<read_root_meta_bare>(iter->second); + } + }); + } + + /** + * update_root_meta + * + * Update root block meta entry for key to value. + */ + using update_root_meta_iertr = base_iertr; + using update_root_meta_ret = update_root_meta_iertr::future<>; + update_root_meta_ret update_root_meta( + Transaction& t, + const std::string& key, + const std::string& value) { + LOG_PREFIX(TransactionManager::update_root_meta); + SUBDEBUGT(seastore_tm, "seastore_tm, {} -> {}", t, key, value); + return cache->get_root( + t + ).si_then([this, &t, &key, &value](RootBlockRef root) { + root = cache->duplicate_for_write(t, root)->cast<RootBlock>(); + + auto meta = root->root.get_meta(); + meta[key] = value; + + root->root.set_meta(meta); + return seastar::now(); + }); + } + + /** + * read_onode_root + * + * Get onode-tree root logical address + */ + using read_onode_root_iertr = base_iertr; + using read_onode_root_ret = read_onode_root_iertr::future<laddr_t>; + read_onode_root_ret read_onode_root(Transaction &t) { + return cache->get_root(t).si_then([&t](auto croot) { + LOG_PREFIX(TransactionManager::read_onode_root); + laddr_t ret = croot->get_root().onode_root; + SUBTRACET(seastore_tm, "{}", t, ret); + return ret; + }); + } + + /** + * write_onode_root + * + * Write onode-tree root logical address, must be called after read. 
+ */ + void write_onode_root(Transaction &t, laddr_t addr) { + LOG_PREFIX(TransactionManager::write_onode_root); + SUBDEBUGT(seastore_tm, "{}", t, addr); + auto croot = cache->get_root_fast(t); + croot = cache->duplicate_for_write(t, croot)->cast<RootBlock>(); + croot->get_root().onode_root = addr; + } + + /** + * read_collection_root + * + * Get collection root addr + */ + using read_collection_root_iertr = base_iertr; + using read_collection_root_ret = read_collection_root_iertr::future< + coll_root_t>; + read_collection_root_ret read_collection_root(Transaction &t) { + return cache->get_root(t).si_then([&t](auto croot) { + LOG_PREFIX(TransactionManager::read_collection_root); + auto ret = croot->get_root().collection_root.get(); + SUBTRACET(seastore_tm, "{}~{}", + t, ret.get_location(), ret.get_size()); + return ret; + }); + } + + /** + * write_collection_root + * + * Update collection root addr + */ + void write_collection_root(Transaction &t, coll_root_t cmroot) { + LOG_PREFIX(TransactionManager::write_collection_root); + SUBDEBUGT(seastore_tm, "{}~{}", + t, cmroot.get_location(), cmroot.get_size()); + auto croot = cache->get_root_fast(t); + croot = cache->duplicate_for_write(t, croot)->cast<RootBlock>(); + croot->get_root().collection_root.update(cmroot); + } + + extent_len_t get_block_size() const { + return epm->get_block_size(); + } + + store_statfs_t store_stat() const { + return epm->get_stat(); + } + + ~TransactionManager(); + +private: + friend class Transaction; + + CacheRef cache; + LBAManagerRef lba_manager; + JournalRef journal; + ExtentPlacementManagerRef epm; + BackrefManagerRef backref_manager; + + WritePipeline write_pipeline; + + rewrite_extent_ret rewrite_logical_extent( + Transaction& t, + LogicalCachedExtentRef extent); + + submit_transaction_direct_ret do_submit_transaction( + Transaction &t, + ExtentPlacementManager::dispatch_result_t dispatch_result, + std::optional<journal_seq_t> seq_to_trim = std::nullopt); + + /// Remove refcount for offset + ref_ret _dec_ref( + Transaction &t, + laddr_t offset, + bool cascade_remove); + + /** + * pin_to_extent + * + * Get extent mapped at pin. + */ + using pin_to_extent_iertr = base_iertr; + template <typename T> + using pin_to_extent_ret = pin_to_extent_iertr::future< + TCachedExtentRef<T>>; + template <typename T> + pin_to_extent_ret<T> pin_to_extent( + Transaction &t, + LBAMappingRef pin) { + LOG_PREFIX(TransactionManager::pin_to_extent); + SUBTRACET(seastore_tm, "getting extent {}", t, *pin); + static_assert(is_logical_type(T::TYPE)); + using ret = pin_to_extent_ret<T>; + auto &pref = *pin; + return cache->get_absent_extent<T>( + t, + pref.get_val(), + pref.is_indirect() ? + pref.get_intermediate_length() : + pref.get_length(), + [pin=std::move(pin)] + (T &extent) mutable { + assert(!extent.has_laddr()); + assert(!extent.has_been_invalidated()); + assert(!pin->has_been_invalidated()); + assert(pin->get_parent()); + pin->link_child(&extent); + extent.maybe_set_intermediate_laddr(*pin); + } + ).si_then([FNAME, &t](auto ref) mutable -> ret { + SUBTRACET(seastore_tm, "got extent -- {}", t, *ref); + assert(ref->is_fully_loaded()); + return pin_to_extent_ret<T>( + interruptible::ready_future_marker{}, + std::move(ref)); + }); + } + + /** + * pin_to_extent_by_type + * + * Get extent mapped at pin. 
+ */ + using pin_to_extent_by_type_ret = pin_to_extent_iertr::future< + LogicalCachedExtentRef>; + pin_to_extent_by_type_ret pin_to_extent_by_type( + Transaction &t, + LBAMappingRef pin, + extent_types_t type) + { + LOG_PREFIX(TransactionManager::pin_to_extent_by_type); + SUBTRACET(seastore_tm, "getting extent {} type {}", t, *pin, type); + assert(is_logical_type(type)); + auto &pref = *pin; + return cache->get_absent_extent_by_type( + t, + type, + pref.get_val(), + pref.get_key(), + pref.is_indirect() ? + pref.get_intermediate_length() : + pref.get_length(), + [pin=std::move(pin)](CachedExtent &extent) mutable { + auto &lextent = static_cast<LogicalCachedExtent&>(extent); + assert(!lextent.has_laddr()); + assert(!lextent.has_been_invalidated()); + assert(!pin->has_been_invalidated()); + assert(pin->get_parent()); + assert(!pin->get_parent()->is_pending()); + pin->link_child(&lextent); + lextent.maybe_set_intermediate_laddr(*pin); + } + ).si_then([FNAME, &t](auto ref) { + SUBTRACET(seastore_tm, "got extent -- {}", t, *ref); + assert(ref->is_fully_loaded()); + return pin_to_extent_by_type_ret( + interruptible::ready_future_marker{}, + std::move(ref->template cast<LogicalCachedExtent>())); + }); + } + + /** + * alloc_remapped_extent + * + * Allocates a new extent at given remap_paddr that must be absolute and + * use the buffer to fill the new extent if buffer exists. Otherwise, will + * not read disk to fill the new extent. + * Returns the new extent. + * + * Should make sure the end laddr of remap extent <= the end laddr of + * original extent when using this method. + */ + using alloc_remapped_extent_iertr = + alloc_extent_iertr::extend_ertr<Device::read_ertr>; + using alloc_remapped_extent_ret = + alloc_remapped_extent_iertr::future<LBAMappingRef>; + template <typename T> + alloc_remapped_extent_ret alloc_remapped_extent( + Transaction &t, + laddr_t remap_laddr, + paddr_t remap_paddr, + extent_len_t remap_length, + laddr_t original_laddr, + laddr_t intermediate_base, + laddr_t intermediate_key, + std::optional<ceph::bufferptr> &&original_bptr) { + LOG_PREFIX(TransactionManager::alloc_remapped_extent); + SUBDEBUG(seastore_tm, "alloc remapped extent: remap_laddr: {}, " + "remap_paddr: {}, remap_length: {}, has data in cache: {} ", + remap_laddr, remap_paddr, remap_length, + original_bptr.has_value() ? 
"true":"false"); + TCachedExtentRef<T> ext; + auto fut = LBAManager::alloc_extent_iertr::make_ready_future< + LBAMappingRef>(); + assert((intermediate_key == L_ADDR_NULL) + == (intermediate_base == L_ADDR_NULL)); + if (intermediate_key == L_ADDR_NULL) { + // remapping direct mapping + ext = cache->alloc_remapped_extent<T>( + t, + remap_laddr, + remap_paddr, + remap_length, + original_laddr, + std::move(original_bptr)); + fut = lba_manager->alloc_extent( + t, remap_laddr, remap_length, remap_paddr, *ext); + } else { + fut = lba_manager->clone_extent( + t, + remap_laddr, + remap_length, + intermediate_key, + remap_paddr, + intermediate_base); + } + return fut.si_then([remap_laddr, remap_length, remap_paddr](auto &&ref) { + assert(ref->get_key() == remap_laddr); + assert(ref->get_val() == remap_paddr); + assert(ref->get_length() == remap_length); + return alloc_remapped_extent_iertr::make_ready_future + <LBAMappingRef>(std::move(ref)); + }); + } + +public: + // Testing interfaces + auto get_epm() { + return epm.get(); + } + + auto get_lba_manager() { + return lba_manager.get(); + } + + auto get_backref_manager() { + return backref_manager.get(); + } + + auto get_cache() { + return cache.get(); + } + auto get_journal() { + return journal.get(); + } +}; +using TransactionManagerRef = std::unique_ptr<TransactionManager>; + +TransactionManagerRef make_transaction_manager( + Device *primary_device, + const std::vector<Device*> &secondary_devices, + bool is_test); +} diff --git a/src/crimson/osd/CMakeLists.txt b/src/crimson/osd/CMakeLists.txt new file mode 100644 index 000000000..f521e0244 --- /dev/null +++ b/src/crimson/osd/CMakeLists.txt @@ -0,0 +1,72 @@ +add_executable(crimson-osd + backfill_state.cc + ec_backend.cc + heartbeat.cc + lsan_suppressions.cc + main.cc + main_config_bootstrap_helpers.cc + osd.cc + osd_meta.cc + pg.cc + pg_backend.cc + pg_meta.cc + replicated_backend.cc + shard_services.cc + pg_shard_manager.cc + object_context.cc + object_context_loader.cc + ops_executer.cc + osd_operation.cc + osd_operations/client_request.cc + osd_operations/client_request_common.cc + osd_operations/internal_client_request.cc + osd_operations/peering_event.cc + osd_operations/pg_advance_map.cc + osd_operations/replicated_request.cc + osd_operations/logmissing_request.cc + osd_operations/logmissing_request_reply.cc + osd_operations/background_recovery.cc + osd_operations/recovery_subrequest.cc + osd_operations/snaptrim_event.cc + pg_recovery.cc + recovery_backend.cc + replicated_recovery_backend.cc + scheduler/scheduler.cc + scheduler/mclock_scheduler.cc + osdmap_gate.cc + pg_activation_blocker.cc + pg_map.cc + pg_interval_interrupt_condition.cc + objclass.cc + ${PROJECT_SOURCE_DIR}/src/objclass/class_api.cc + ${PROJECT_SOURCE_DIR}/src/osd/ClassHandler.cc + ${PROJECT_SOURCE_DIR}/src/osd/osd_op_util.cc + ${PROJECT_SOURCE_DIR}/src/osd/OSDCap.cc + ${PROJECT_SOURCE_DIR}/src/osd/PeeringState.cc + ${PROJECT_SOURCE_DIR}/src/osd/PGPeeringEvent.cc + ${PROJECT_SOURCE_DIR}/src/osd/PGStateUtils.cc + ${PROJECT_SOURCE_DIR}/src/osd/MissingLoc.cc + ${PROJECT_SOURCE_DIR}/src/osd/PGLog.cc + ${PROJECT_SOURCE_DIR}/src/osd/SnapMapper.cc + ${PROJECT_SOURCE_DIR}/src/osd/recovery_types.cc + ${PROJECT_SOURCE_DIR}/src/osd/osd_perf_counters.cc + watch.cc + ) +if(HAS_VTA) + set_source_files_properties(main.cc + PROPERTIES COMPILE_FLAGS -fno-var-tracking-assignments) +endif() +target_link_libraries(crimson-osd + crimson-admin + crimson-common + crimson-os + crimson + fmt::fmt + Boost::MPL + dmclock::dmclock) 
+set_target_properties(crimson-osd PROPERTIES + POSITION_INDEPENDENT_CODE ${EXE_LINKER_USE_PIE}) +install(TARGETS crimson-osd DESTINATION bin) +if(WITH_TESTS) + add_dependencies(tests crimson-osd) +endif() diff --git a/src/crimson/osd/acked_peers.h b/src/crimson/osd/acked_peers.h new file mode 100644 index 000000000..b2f2562c0 --- /dev/null +++ b/src/crimson/osd/acked_peers.h @@ -0,0 +1,14 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <vector> + +namespace crimson::osd { + struct peer_shard_t { + pg_shard_t shard; + eversion_t last_complete_ondisk; + }; + using acked_peers_t = std::vector<peer_shard_t>; +} diff --git a/src/crimson/osd/backfill_facades.h b/src/crimson/osd/backfill_facades.h new file mode 100644 index 000000000..683dc6ea6 --- /dev/null +++ b/src/crimson/osd/backfill_facades.h @@ -0,0 +1,73 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "crimson/osd/backfill_state.h" +#include "crimson/osd/pg.h" +#include "osd/PeeringState.h" + +namespace crimson::osd { + +// PeeringFacade -- main implementation of the BackfillState::PeeringFacade +// interface. We have the abstraction to decuple BackfillState from Peering +// State, and thus cut depedencies in unit testing. The second implemention +// is BackfillFixture::PeeringFacade and sits in test_backfill.cc. +struct PeeringFacade final : BackfillState::PeeringFacade { + PeeringState& peering_state; + + hobject_t earliest_backfill() const override { + return peering_state.earliest_backfill(); + } + + const std::set<pg_shard_t>& get_backfill_targets() const override { + return peering_state.get_backfill_targets(); + } + + const hobject_t& get_peer_last_backfill(pg_shard_t peer) const override { + return peering_state.get_peer_info(peer).last_backfill; + } + + const eversion_t& get_last_update() const override { + return peering_state.get_info().last_update; + } + + const eversion_t& get_log_tail() const override { + return peering_state.get_info().log_tail; + } + + void scan_log_after(eversion_t v, scan_log_func_t f) const override { + peering_state.get_pg_log().get_log().scan_log_after(v, std::move(f)); + } + + bool is_backfill_target(pg_shard_t peer) const override { + return peering_state.is_backfill_target(peer); + } + void update_complete_backfill_object_stats(const hobject_t &hoid, + const pg_stat_t &stats) override { + peering_state.update_complete_backfill_object_stats(hoid, stats); + } + + bool is_backfilling() const override { + return peering_state.is_backfilling(); + } + + PeeringFacade(PeeringState& peering_state) + : peering_state(peering_state) { + } +}; + +// PGFacade -- a facade (in the GoF-defined meaning) simplifying the huge +// interface of crimson's PG class. The motivation is to have an inventory +// of behaviour that must be provided by a unit test's mock. 
+struct PGFacade final : BackfillState::PGFacade { + PG& pg; + + const eversion_t& get_projected_last_update() const override { + return pg.projected_last_update; + } + + PGFacade(PG& pg) : pg(pg) {} +}; + +} // namespace crimson::osd diff --git a/src/crimson/osd/backfill_state.cc b/src/crimson/osd/backfill_state.cc new file mode 100644 index 000000000..46a270ffe --- /dev/null +++ b/src/crimson/osd/backfill_state.cc @@ -0,0 +1,558 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <algorithm> +#include <boost/type_index.hpp> +#include <fmt/ranges.h> +#include "common/hobject_fmt.h" +#include "crimson/osd/backfill_state.h" +#include "osd/osd_types_fmt.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_osd); + } +} + +namespace crimson::osd { + +BackfillState::BackfillState( + BackfillState::BackfillListener& backfill_listener, + std::unique_ptr<BackfillState::PeeringFacade> peering_state, + std::unique_ptr<BackfillState::PGFacade> pg) + : backfill_machine(*this, + backfill_listener, + std::move(peering_state), + std::move(pg)), + progress_tracker( + std::make_unique<BackfillState::ProgressTracker>(backfill_machine)) +{ + logger().debug("{}:{}", __func__, __LINE__); + backfill_machine.initiate(); +} + +template <class S> +BackfillState::StateHelper<S>::StateHelper() +{ + logger().debug("enter {}", + boost::typeindex::type_id<S>().pretty_name()); +} + +template <class S> +BackfillState::StateHelper<S>::~StateHelper() +{ + logger().debug("exit {}", + boost::typeindex::type_id<S>().pretty_name()); +} + +BackfillState::~BackfillState() = default; + +BackfillState::BackfillMachine::BackfillMachine( + BackfillState& backfill_state, + BackfillState::BackfillListener& backfill_listener, + std::unique_ptr<BackfillState::PeeringFacade> peering_state, + std::unique_ptr<BackfillState::PGFacade> pg) + : backfill_state(backfill_state), + backfill_listener(backfill_listener), + peering_state(std::move(peering_state)), + pg(std::move(pg)) +{} + +BackfillState::BackfillMachine::~BackfillMachine() = default; + +BackfillState::Initial::Initial(my_context ctx) + : my_base(ctx) +{ + backfill_state().last_backfill_started = peering_state().earliest_backfill(); + logger().debug("{}: bft={} from {}", + __func__, peering_state().get_backfill_targets(), + backfill_state().last_backfill_started); + for (const auto& bt : peering_state().get_backfill_targets()) { + logger().debug("{}: target shard {} from {}", + __func__, bt, peering_state().get_peer_last_backfill(bt)); + } + ceph_assert(peering_state().get_backfill_targets().size()); + ceph_assert(!backfill_state().last_backfill_started.is_max()); +} + +boost::statechart::result +BackfillState::Initial::react(const BackfillState::Triggered& evt) +{ + logger().debug("{}: backfill triggered", __func__); + ceph_assert(backfill_state().last_backfill_started == \ + peering_state().earliest_backfill()); + ceph_assert(peering_state().is_backfilling()); + // initialize BackfillIntervals + for (const auto& bt : peering_state().get_backfill_targets()) { + backfill_state().peer_backfill_info[bt].reset( + peering_state().get_peer_last_backfill(bt)); + } + backfill_state().backfill_info.reset(backfill_state().last_backfill_started); + if (Enqueuing::all_enqueued(peering_state(), + backfill_state().backfill_info, + backfill_state().peer_backfill_info)) { + logger().debug("{}: switching to Done state", __func__); + return transit<BackfillState::Done>(); + } else { + 
logger().debug("{}: switching to Enqueuing state", __func__); + return transit<BackfillState::Enqueuing>(); + } +} + + +// -- Enqueuing +void BackfillState::Enqueuing::maybe_update_range() +{ + if (auto& primary_bi = backfill_state().backfill_info; + primary_bi.version >= pg().get_projected_last_update()) { + logger().info("{}: bi is current", __func__); + ceph_assert(primary_bi.version == pg().get_projected_last_update()); + } else if (primary_bi.version >= peering_state().get_log_tail()) { +#if 0 + if (peering_state().get_pg_log().get_log().empty() && + pg().get_projected_log().empty()) { + /* Because we don't move log_tail on split, the log might be + * empty even if log_tail != last_update. However, the only + * way to get here with an empty log is if log_tail is actually + * eversion_t(), because otherwise the entry which changed + * last_update since the last scan would have to be present. + */ + ceph_assert(primary_bi.version == eversion_t()); + return; + } +#endif + logger().debug("{}: bi is old, ({}) can be updated with log to {}", + __func__, + primary_bi.version, + pg().get_projected_last_update()); + logger().debug("{}: scanning pg log first", __func__); + peering_state().scan_log_after(primary_bi.version, + [&](const pg_log_entry_t& e) { + logger().debug("maybe_update_range(lambda): updating from version {}", + e.version); + if (e.soid >= primary_bi.begin && e.soid < primary_bi.end) { + if (e.is_update()) { + logger().debug("maybe_update_range(lambda): {} updated to ver {}", + e.soid, e.version); + primary_bi.objects.erase(e.soid); + primary_bi.objects.insert(std::make_pair(e.soid, + e.version)); + } else if (e.is_delete()) { + logger().debug("maybe_update_range(lambda): {} removed", + e.soid); + primary_bi.objects.erase(e.soid); + } + } + }); + primary_bi.version = pg().get_projected_last_update(); + } else { + ceph_abort_msg( + "scan_range should have raised primary_bi.version past log_tail"); + } +} + +void BackfillState::Enqueuing::trim_backfill_infos() +{ + for (const auto& bt : peering_state().get_backfill_targets()) { + backfill_state().peer_backfill_info[bt].trim_to( + std::max(peering_state().get_peer_last_backfill(bt), + backfill_state().last_backfill_started)); + } + backfill_state().backfill_info.trim_to( + backfill_state().last_backfill_started); +} + +/* static */ bool BackfillState::Enqueuing::all_enqueued( + const PeeringFacade& peering_state, + const BackfillInterval& backfill_info, + const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info) +{ + const bool all_local_enqueued = \ + backfill_info.extends_to_end() && backfill_info.empty(); + const bool all_peer_enqueued = std::all_of( + std::begin(peer_backfill_info), + std::end(peer_backfill_info), + [] (const auto& kv) { + [[maybe_unused]] const auto& [ shard, peer_backfill_info ] = kv; + return peer_backfill_info.extends_to_end() && peer_backfill_info.empty(); + }); + return all_local_enqueued && all_peer_enqueued; +} + +hobject_t BackfillState::Enqueuing::earliest_peer_backfill( + const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info) const +{ + hobject_t e = hobject_t::get_max(); + for (const pg_shard_t& bt : peering_state().get_backfill_targets()) { + const auto iter = peer_backfill_info.find(bt); + ceph_assert(iter != peer_backfill_info.end()); + e = std::min(e, iter->second.begin); + } + return e; +} + +bool BackfillState::Enqueuing::should_rescan_replicas( + const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info, + const BackfillInterval& backfill_info) const +{ + const 
auto& targets = peering_state().get_backfill_targets(); + return std::any_of(std::begin(targets), std::end(targets), + [&] (const auto& bt) { + return ReplicasScanning::replica_needs_scan(peer_backfill_info.at(bt), + backfill_info); + }); +} + +bool BackfillState::Enqueuing::should_rescan_primary( + const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info, + const BackfillInterval& backfill_info) const +{ + return backfill_info.begin <= earliest_peer_backfill(peer_backfill_info) && + !backfill_info.extends_to_end(); +} + +void BackfillState::Enqueuing::trim_backfilled_object_from_intervals( + BackfillState::Enqueuing::result_t&& result, + hobject_t& last_backfill_started, + std::map<pg_shard_t, BackfillInterval>& peer_backfill_info) +{ + std::for_each(std::begin(result.pbi_targets), std::end(result.pbi_targets), + [&peer_backfill_info] (const auto& bt) { + peer_backfill_info.at(bt).pop_front(); + }); + last_backfill_started = std::move(result.new_last_backfill_started); +} + +BackfillState::Enqueuing::result_t +BackfillState::Enqueuing::remove_on_peers(const hobject_t& check) +{ + // set `new_last_backfill_started` to `check` + result_t result { {}, check }; + for (const auto& bt : peering_state().get_backfill_targets()) { + const auto& pbi = backfill_state().peer_backfill_info.at(bt); + if (pbi.begin == check) { + result.pbi_targets.insert(bt); + const auto& version = pbi.objects.begin()->second; + backfill_state().progress_tracker->enqueue_drop(pbi.begin); + backfill_listener().enqueue_drop(bt, pbi.begin, version); + } + } + logger().debug("{}: BACKFILL removing {} from peers {}", + __func__, check, result.pbi_targets); + ceph_assert(!result.pbi_targets.empty()); + return result; +} + +BackfillState::Enqueuing::result_t +BackfillState::Enqueuing::update_on_peers(const hobject_t& check) +{ + logger().debug("{}: check={}", __func__, check); + const auto& primary_bi = backfill_state().backfill_info; + result_t result { {}, primary_bi.begin }; + + for (const auto& bt : peering_state().get_backfill_targets()) { + const auto& peer_bi = backfill_state().peer_backfill_info.at(bt); + + // Find all check peers that have the wrong version + if (const eversion_t& obj_v = primary_bi.objects.begin()->second; + check == primary_bi.begin && check == peer_bi.begin) { + if(peer_bi.objects.begin()->second != obj_v && + backfill_state().progress_tracker->enqueue_push(primary_bi.begin)) { + backfill_listener().enqueue_push(primary_bi.begin, obj_v); + } else { + // it's fine, keep it! OR already recovering + } + result.pbi_targets.insert(bt); + } else { + // Only include peers that we've caught up to their backfill line + // otherwise, they only appear to be missing this object + // because their peer_bi.begin > backfill_info.begin. 
+ if (primary_bi.begin > peering_state().get_peer_last_backfill(bt) && + backfill_state().progress_tracker->enqueue_push(primary_bi.begin)) { + backfill_listener().enqueue_push(primary_bi.begin, obj_v); + } + } + } + return result; +} + +bool BackfillState::Enqueuing::Enqueuing::all_emptied( + const BackfillInterval& local_backfill_info, + const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info) const +{ + const auto& targets = peering_state().get_backfill_targets(); + const auto replicas_emptied = + std::all_of(std::begin(targets), std::end(targets), + [&] (const auto& bt) { + return peer_backfill_info.at(bt).empty(); + }); + return local_backfill_info.empty() && replicas_emptied; +} + +BackfillState::Enqueuing::Enqueuing(my_context ctx) + : my_base(ctx) +{ + auto& primary_bi = backfill_state().backfill_info; + + // update our local interval to cope with recent changes + primary_bi.begin = backfill_state().last_backfill_started; + if (primary_bi.version < peering_state().get_log_tail()) { + // it might be that the OSD is so flooded with modifying operations + // that backfill will be spinning here over and over. For the sake + // of performance and complexity we don't synchronize with entire PG. + // similar can happen in classical OSD. + logger().warn("{}: bi is old, rescanning of local backfill_info", + __func__); + post_event(RequestPrimaryScanning{}); + return; + } else { + maybe_update_range(); + } + trim_backfill_infos(); + + while (!all_emptied(primary_bi, backfill_state().peer_backfill_info)) { + if (!backfill_listener().budget_available()) { + post_event(RequestWaiting{}); + return; + } else if (should_rescan_replicas(backfill_state().peer_backfill_info, + primary_bi)) { + // Count simultaneous scans as a single op and let those complete + post_event(RequestReplicasScanning{}); + return; + } + // Get object within set of peers to operate on and the set of targets + // for which that object applies. + if (const hobject_t check = \ + earliest_peer_backfill(backfill_state().peer_backfill_info); + check < primary_bi.begin) { + // Don't increment ops here because deletions + // are cheap and not replied to unlike real recovery_ops, + // and we can't increment ops without requeueing ourself + // for recovery. + auto result = remove_on_peers(check); + trim_backfilled_object_from_intervals(std::move(result), + backfill_state().last_backfill_started, + backfill_state().peer_backfill_info); + } else { + auto result = update_on_peers(check); + trim_backfilled_object_from_intervals(std::move(result), + backfill_state().last_backfill_started, + backfill_state().peer_backfill_info); + primary_bi.pop_front(); + } + backfill_listener().maybe_flush(); + } + + if (should_rescan_primary(backfill_state().peer_backfill_info, + primary_bi)) { + // need to grab one another chunk of the object namespace and restart + // the queueing. 
+ logger().debug("{}: reached end for current local chunk", + __func__); + post_event(RequestPrimaryScanning{}); + } else if (backfill_state().progress_tracker->tracked_objects_completed()) { + post_event(RequestDone{}); + } else { + logger().debug("{}: reached end for both local and all peers " + "but still has in-flight operations", __func__); + post_event(RequestWaiting{}); + } +} + +// -- PrimaryScanning +BackfillState::PrimaryScanning::PrimaryScanning(my_context ctx) + : my_base(ctx) +{ + backfill_state().backfill_info.version = peering_state().get_last_update(); + backfill_listener().request_primary_scan( + backfill_state().backfill_info.begin); +} + +boost::statechart::result +BackfillState::PrimaryScanning::react(PrimaryScanned evt) +{ + logger().debug("{}", __func__); + backfill_state().backfill_info = std::move(evt.result); + return transit<Enqueuing>(); +} + +boost::statechart::result +BackfillState::PrimaryScanning::react(ObjectPushed evt) +{ + logger().debug("PrimaryScanning::react() on ObjectPushed; evt.object={}", + evt.object); + backfill_state().progress_tracker->complete_to(evt.object, evt.stat); + return discard_event(); +} + +// -- ReplicasScanning +bool BackfillState::ReplicasScanning::replica_needs_scan( + const BackfillInterval& replica_backfill_info, + const BackfillInterval& local_backfill_info) +{ + return replica_backfill_info.empty() && \ + replica_backfill_info.begin <= local_backfill_info.begin && \ + !replica_backfill_info.extends_to_end(); +} + +BackfillState::ReplicasScanning::ReplicasScanning(my_context ctx) + : my_base(ctx) +{ + for (const auto& bt : peering_state().get_backfill_targets()) { + if (const auto& pbi = backfill_state().peer_backfill_info.at(bt); + replica_needs_scan(pbi, backfill_state().backfill_info)) { + logger().debug("{}: scanning peer osd.{} from {}", + __func__, bt, pbi.end); + backfill_listener().request_replica_scan(bt, pbi.end, hobject_t{}); + + ceph_assert(waiting_on_backfill.find(bt) == \ + waiting_on_backfill.end()); + waiting_on_backfill.insert(bt); + } + } + ceph_assert(!waiting_on_backfill.empty()); + // TODO: start_recovery_op(hobject_t::get_max()); // XXX: was pbi.end +} + +#if 0 +BackfillState::ReplicasScanning::~ReplicasScanning() +{ + // TODO: finish_recovery_op(hobject_t::get_max()); +} +#endif + +boost::statechart::result +BackfillState::ReplicasScanning::react(ReplicaScanned evt) +{ + logger().debug("{}: got scan result from osd={}, result={}", + __func__, evt.from, evt.result); + // TODO: maybe we'll be able to move waiting_on_backfill from + // the machine to the state. 
+ ceph_assert(peering_state().is_backfill_target(evt.from)); + if (waiting_on_backfill.erase(evt.from)) { + backfill_state().peer_backfill_info[evt.from] = std::move(evt.result); + if (waiting_on_backfill.empty()) { + ceph_assert(backfill_state().peer_backfill_info.size() == \ + peering_state().get_backfill_targets().size()); + return transit<Enqueuing>(); + } + } else { + // we canceled backfill for a while due to a too full, and this + // is an extra response from a non-too-full peer + logger().debug("{}: canceled backfill (too full?)", __func__); + } + return discard_event(); +} + +boost::statechart::result +BackfillState::ReplicasScanning::react(ObjectPushed evt) +{ + logger().debug("ReplicasScanning::react() on ObjectPushed; evt.object={}", + evt.object); + backfill_state().progress_tracker->complete_to(evt.object, evt.stat); + return discard_event(); +} + + +// -- Waiting +BackfillState::Waiting::Waiting(my_context ctx) + : my_base(ctx) +{ +} + +boost::statechart::result +BackfillState::Waiting::react(ObjectPushed evt) +{ + logger().debug("Waiting::react() on ObjectPushed; evt.object={}", + evt.object); + backfill_state().progress_tracker->complete_to(evt.object, evt.stat); + if (!Enqueuing::all_enqueued(peering_state(), + backfill_state().backfill_info, + backfill_state().peer_backfill_info)) { + return transit<Enqueuing>(); + } else if (backfill_state().progress_tracker->tracked_objects_completed()) { + return transit<Done>(); + } else { + // we still have something to wait on + logger().debug("Waiting::react() on ObjectPushed; still waiting"); + return discard_event(); + } +} + +// -- Done +BackfillState::Done::Done(my_context ctx) + : my_base(ctx) +{ + logger().info("{}: backfill is done", __func__); + backfill_listener().backfilled(); +} + +// -- Crashed +BackfillState::Crashed::Crashed() +{ + ceph_abort_msg("{}: this should not happen"); +} + +// ProgressTracker is an intermediary between the BackfillListener and +// BackfillMachine + its states. All requests to push or drop an object +// are directed through it. The same happens with notifications about +// completing given operations which are generated by BackfillListener +// and dispatched as i.e. ObjectPushed events. +// This allows ProgressTacker to track the list of in-flight operations +// which is essential to make the decision whether the entire machine +// should switch from Waiting to Done keep in Waiting. +// ProgressTracker also coordinates .last_backfill_started and stats +// updates. 
+bool BackfillState::ProgressTracker::tracked_objects_completed() const +{ + return registry.empty(); +} + +bool BackfillState::ProgressTracker::enqueue_push(const hobject_t& obj) +{ + [[maybe_unused]] const auto [it, first_seen] = registry.try_emplace( + obj, registry_item_t{op_stage_t::enqueued_push, std::nullopt}); + return first_seen; +} + +void BackfillState::ProgressTracker::enqueue_drop(const hobject_t& obj) +{ + registry.try_emplace( + obj, registry_item_t{op_stage_t::enqueued_drop, pg_stat_t{}}); +} + +void BackfillState::ProgressTracker::complete_to( + const hobject_t& obj, + const pg_stat_t& stats) +{ + logger().debug("{}: obj={}", + __func__, obj); + if (auto completion_iter = registry.find(obj); + completion_iter != std::end(registry)) { + completion_iter->second = \ + registry_item_t{ op_stage_t::completed_push, stats }; + } else { + ceph_abort_msg("completing untracked object shall not happen"); + } + for (auto it = std::begin(registry); + it != std::end(registry) && + it->second.stage != op_stage_t::enqueued_push; + it = registry.erase(it)) { + auto& [soid, item] = *it; + assert(item.stats); + peering_state().update_complete_backfill_object_stats( + soid, + *item.stats); + } + if (Enqueuing::all_enqueued(peering_state(), + backfill_state().backfill_info, + backfill_state().peer_backfill_info) && + tracked_objects_completed()) { + backfill_state().last_backfill_started = hobject_t::get_max(); + backfill_listener().update_peers_last_backfill(hobject_t::get_max()); + } else { + backfill_listener().update_peers_last_backfill(obj); + } +} + +} // namespace crimson::osd diff --git a/src/crimson/osd/backfill_state.h b/src/crimson/osd/backfill_state.h new file mode 100644 index 000000000..4bd2991fb --- /dev/null +++ b/src/crimson/osd/backfill_state.h @@ -0,0 +1,382 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <optional> + +#include <boost/statechart/custom_reaction.hpp> +#include <boost/statechart/event.hpp> +#include <boost/statechart/event_base.hpp> +#include <boost/statechart/simple_state.hpp> +#include <boost/statechart/state.hpp> +#include <boost/statechart/state_machine.hpp> +#include <boost/statechart/transition.hpp> + +#include "osd/recovery_types.h" + +namespace crimson::osd { + +namespace sc = boost::statechart; + +struct BackfillState { + struct BackfillListener; + struct PeeringFacade; + struct PGFacade; + + // events comes first + struct PrimaryScanned : sc::event<PrimaryScanned> { + BackfillInterval result; + PrimaryScanned(BackfillInterval&& result) + : result(std::move(result)) { + } + }; + + struct ReplicaScanned : sc::event<ReplicaScanned> { + pg_shard_t from; + BackfillInterval result; + ReplicaScanned(pg_shard_t from, BackfillInterval&& result) + : from(std::move(from)), + result(std::move(result)) { + } + }; + + struct ObjectPushed : sc::event<ObjectPushed> { + // TODO: implement replica management; I don't want to follow + // current convention where the backend layer is responsible + // for tracking replicas. 
+ hobject_t object; + pg_stat_t stat; + ObjectPushed(hobject_t object) + : object(std::move(object)) { + } + }; + + struct Triggered : sc::event<Triggered> { + }; + +private: + // internal events + struct RequestPrimaryScanning : sc::event<RequestPrimaryScanning> { + }; + + struct RequestReplicasScanning : sc::event<RequestReplicasScanning> { + }; + + struct RequestWaiting : sc::event<RequestWaiting> { + }; + + struct RequestDone : sc::event<RequestDone> { + }; + + class ProgressTracker; + +public: + + struct Initial; + struct Enqueuing; + struct PrimaryScanning; + struct ReplicasScanning; + struct Waiting; + struct Done; + + struct BackfillMachine : sc::state_machine<BackfillMachine, Initial> { + BackfillMachine(BackfillState& backfill_state, + BackfillListener& backfill_listener, + std::unique_ptr<PeeringFacade> peering_state, + std::unique_ptr<PGFacade> pg); + ~BackfillMachine(); + BackfillState& backfill_state; + BackfillListener& backfill_listener; + std::unique_ptr<PeeringFacade> peering_state; + std::unique_ptr<PGFacade> pg; + }; + +private: + template <class S> + struct StateHelper { + StateHelper(); + ~StateHelper(); + + BackfillState& backfill_state() { + return static_cast<S*>(this) \ + ->template context<BackfillMachine>().backfill_state; + } + BackfillListener& backfill_listener() { + return static_cast<S*>(this) \ + ->template context<BackfillMachine>().backfill_listener; + } + PeeringFacade& peering_state() { + return *static_cast<S*>(this) \ + ->template context<BackfillMachine>().peering_state; + } + PGFacade& pg() { + return *static_cast<S*>(this)->template context<BackfillMachine>().pg; + } + + const PeeringFacade& peering_state() const { + return *static_cast<const S*>(this) \ + ->template context<BackfillMachine>().peering_state; + } + const BackfillState& backfill_state() const { + return static_cast<const S*>(this) \ + ->template context<BackfillMachine>().backfill_state; + } + }; + +public: + + // states + struct Crashed : sc::simple_state<Crashed, BackfillMachine>, + StateHelper<Crashed> { + explicit Crashed(); + }; + + struct Initial : sc::state<Initial, BackfillMachine>, + StateHelper<Initial> { + using reactions = boost::mpl::list< + sc::custom_reaction<Triggered>, + sc::transition<sc::event_base, Crashed>>; + explicit Initial(my_context); + // initialize after triggering backfill by on_activate_complete(). + // transit to Enqueuing. + sc::result react(const Triggered&); + }; + + struct Enqueuing : sc::state<Enqueuing, BackfillMachine>, + StateHelper<Enqueuing> { + using reactions = boost::mpl::list< + sc::transition<RequestPrimaryScanning, PrimaryScanning>, + sc::transition<RequestReplicasScanning, ReplicasScanning>, + sc::transition<RequestWaiting, Waiting>, + sc::transition<RequestDone, Done>, + sc::transition<sc::event_base, Crashed>>; + explicit Enqueuing(my_context); + + // indicate whether there is any remaining work to do when it comes + // to comparing the hobject_t namespace between primary and replicas. + // true doesn't necessarily mean backfill is done -- there could be + // in-flight pushes or drops which had been enqueued but aren't + // completed yet. + static bool all_enqueued( + const PeeringFacade& peering_state, + const BackfillInterval& backfill_info, + const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info); + + private: + void maybe_update_range(); + void trim_backfill_infos(); + + // these methods take BackfillIntervals instead of extracting them from + // the state to emphasize the relationships across the main loop. 
+ bool all_emptied( + const BackfillInterval& local_backfill_info, + const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info) const; + hobject_t earliest_peer_backfill( + const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info) const; + bool should_rescan_replicas( + const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info, + const BackfillInterval& backfill_info) const; + // indicate whether a particular acting primary needs to scanned again + // to process next piece of the hobject_t's namespace. + // the logic is per analogy to replica_needs_scan(). See comments there. + bool should_rescan_primary( + const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info, + const BackfillInterval& backfill_info) const; + + // the result_t is intermediary between {remove,update}_on_peers() and + // updating BackfillIntervals in trim_backfilled_object_from_intervals. + // This step is important because it affects the main loop's condition, + // and thus deserves to be exposed instead of being called deeply from + // {remove,update}_on_peers(). + struct [[nodiscard]] result_t { + std::set<pg_shard_t> pbi_targets; + hobject_t new_last_backfill_started; + }; + void trim_backfilled_object_from_intervals( + result_t&&, + hobject_t& last_backfill_started, + std::map<pg_shard_t, BackfillInterval>& peer_backfill_info); + result_t remove_on_peers(const hobject_t& check); + result_t update_on_peers(const hobject_t& check); + }; + + struct PrimaryScanning : sc::state<PrimaryScanning, BackfillMachine>, + StateHelper<PrimaryScanning> { + using reactions = boost::mpl::list< + sc::custom_reaction<ObjectPushed>, + sc::custom_reaction<PrimaryScanned>, + sc::transition<sc::event_base, Crashed>>; + explicit PrimaryScanning(my_context); + sc::result react(ObjectPushed); + // collect scanning result and transit to Enqueuing. + sc::result react(PrimaryScanned); + }; + + struct ReplicasScanning : sc::state<ReplicasScanning, BackfillMachine>, + StateHelper<ReplicasScanning> { + using reactions = boost::mpl::list< + sc::custom_reaction<ObjectPushed>, + sc::custom_reaction<ReplicaScanned>, + sc::transition<sc::event_base, Crashed>>; + explicit ReplicasScanning(my_context); + // collect scanning result; if all results are collected, transition + // to Enqueuing will happen. + sc::result react(ObjectPushed); + sc::result react(ReplicaScanned); + + // indicate whether a particular peer should be scanned to retrieve + // BackfillInterval for new range of hobject_t namespace. + // true when bi.objects is exhausted, replica bi's end is not MAX, + // and primary bi'begin is further than the replica's one. 
+ static bool replica_needs_scan( + const BackfillInterval& replica_backfill_info, + const BackfillInterval& local_backfill_info); + + private: + std::set<pg_shard_t> waiting_on_backfill; + }; + + struct Waiting : sc::state<Waiting, BackfillMachine>, + StateHelper<Waiting> { + using reactions = boost::mpl::list< + sc::custom_reaction<ObjectPushed>, + sc::transition<sc::event_base, Crashed>>; + explicit Waiting(my_context); + sc::result react(ObjectPushed); + }; + + struct Done : sc::state<Done, BackfillMachine>, + StateHelper<Done> { + using reactions = boost::mpl::list< + sc::transition<sc::event_base, Crashed>>; + explicit Done(my_context); + }; + + BackfillState(BackfillListener& backfill_listener, + std::unique_ptr<PeeringFacade> peering_state, + std::unique_ptr<PGFacade> pg); + ~BackfillState(); + + void process_event( + boost::intrusive_ptr<const sc::event_base> evt) { + backfill_machine.process_event(*std::move(evt)); + } + + hobject_t get_last_backfill_started() const { + return last_backfill_started; + } +private: + hobject_t last_backfill_started; + BackfillInterval backfill_info; + std::map<pg_shard_t, BackfillInterval> peer_backfill_info; + BackfillMachine backfill_machine; + std::unique_ptr<ProgressTracker> progress_tracker; +}; + +// BackfillListener -- an interface used by the backfill FSM to request +// low-level services like issueing `MOSDPGPush` or `MOSDPGBackfillRemove`. +// The goals behind the interface are: 1) unittestability; 2) possibility +// to retrofit classical OSD with BackfillState. For the second reason we +// never use `seastar::future` -- instead responses to the requests are +// conveyed as events; see ObjectPushed as an example. +struct BackfillState::BackfillListener { + virtual void request_replica_scan( + const pg_shard_t& target, + const hobject_t& begin, + const hobject_t& end) = 0; + + virtual void request_primary_scan( + const hobject_t& begin) = 0; + + virtual void enqueue_push( + const hobject_t& obj, + const eversion_t& v) = 0; + + virtual void enqueue_drop( + const pg_shard_t& target, + const hobject_t& obj, + const eversion_t& v) = 0; + + virtual void maybe_flush() = 0; + + virtual void update_peers_last_backfill( + const hobject_t& new_last_backfill) = 0; + + virtual bool budget_available() const = 0; + + virtual void backfilled() = 0; + + virtual ~BackfillListener() = default; +}; + +// PeeringFacade -- a facade (in the GoF-defined meaning) simplifying +// the interface of PeeringState. The motivation is to have an inventory +// of behaviour that must be provided by a unit test's mock. +struct BackfillState::PeeringFacade { + virtual hobject_t earliest_backfill() const = 0; + virtual const std::set<pg_shard_t>& get_backfill_targets() const = 0; + virtual const hobject_t& get_peer_last_backfill(pg_shard_t peer) const = 0; + virtual const eversion_t& get_last_update() const = 0; + virtual const eversion_t& get_log_tail() const = 0; + + // the performance impact of `std::function` has not been considered yet. + // If there is any proof (from e.g. profiling) about its significance, we + // can switch back to the template variant. 
+ using scan_log_func_t = std::function<void(const pg_log_entry_t&)>; + virtual void scan_log_after(eversion_t, scan_log_func_t) const = 0; + + virtual bool is_backfill_target(pg_shard_t peer) const = 0; + virtual void update_complete_backfill_object_stats(const hobject_t &hoid, + const pg_stat_t &stats) = 0; + virtual bool is_backfilling() const = 0; + virtual ~PeeringFacade() {} +}; + +// PGFacade -- a facade (in the GoF-defined meaning) simplifying the huge +// interface of crimson's PG class. The motivation is to have an inventory +// of behaviour that must be provided by a unit test's mock. +struct BackfillState::PGFacade { + virtual const eversion_t& get_projected_last_update() const = 0; + virtual ~PGFacade() {} +}; + +class BackfillState::ProgressTracker { + // TODO: apply_stat, + enum class op_stage_t { + enqueued_push, + enqueued_drop, + completed_push, + }; + + struct registry_item_t { + op_stage_t stage; + std::optional<pg_stat_t> stats; + }; + + BackfillMachine& backfill_machine; + std::map<hobject_t, registry_item_t> registry; + + BackfillState& backfill_state() { + return backfill_machine.backfill_state; + } + PeeringFacade& peering_state() { + return *backfill_machine.peering_state; + } + BackfillListener& backfill_listener() { + return backfill_machine.backfill_listener; + } + +public: + ProgressTracker(BackfillMachine& backfill_machine) + : backfill_machine(backfill_machine) { + } + + bool tracked_objects_completed() const; + + bool enqueue_push(const hobject_t&); + void enqueue_drop(const hobject_t&); + void complete_to(const hobject_t&, const pg_stat_t&); +}; + +} // namespace crimson::osd diff --git a/src/crimson/osd/ec_backend.cc b/src/crimson/osd/ec_backend.cc new file mode 100644 index 000000000..d555d6cdc --- /dev/null +++ b/src/crimson/osd/ec_backend.cc @@ -0,0 +1,37 @@ +#include "ec_backend.h" + +#include "crimson/osd/shard_services.h" + +ECBackend::ECBackend(shard_id_t shard, + ECBackend::CollectionRef coll, + crimson::osd::ShardServices& shard_services, + const ec_profile_t&, + uint64_t, + DoutPrefixProvider &dpp) + : PGBackend{shard, coll, shard_services, dpp} +{ + // todo +} + +ECBackend::ll_read_ierrorator::future<ceph::bufferlist> +ECBackend::_read(const hobject_t& hoid, + const uint64_t off, + const uint64_t len, + const uint32_t flags) +{ + // todo + return seastar::make_ready_future<bufferlist>(); +} + +ECBackend::rep_op_fut_t +ECBackend::_submit_transaction(std::set<pg_shard_t>&& pg_shards, + const hobject_t& hoid, + ceph::os::Transaction&& txn, + osd_op_params_t&& osd_op_p, + epoch_t min_epoch, epoch_t max_epoch, + std::vector<pg_log_entry_t>&& log_entries) +{ + // todo + return {seastar::now(), + seastar::make_ready_future<crimson::osd::acked_peers_t>()}; +} diff --git a/src/crimson/osd/ec_backend.h b/src/crimson/osd/ec_backend.h new file mode 100644 index 000000000..3dbcc4def --- /dev/null +++ b/src/crimson/osd/ec_backend.h @@ -0,0 +1,41 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <boost/intrusive_ptr.hpp> +#include <seastar/core/future.hh> +#include "include/buffer_fwd.h" +#include "osd/osd_types.h" +#include "pg_backend.h" + +class ECBackend : public PGBackend +{ +public: + ECBackend(shard_id_t shard, + CollectionRef coll, + crimson::osd::ShardServices& shard_services, + const ec_profile_t& ec_profile, + uint64_t stripe_width, + DoutPrefixProvider &dpp); + seastar::future<> stop() final { + return seastar::now(); + } + void on_actingset_changed(bool same_primary) 
final {} +private: + ll_read_ierrorator::future<ceph::bufferlist> + _read(const hobject_t& hoid, uint64_t off, uint64_t len, uint32_t flags) override; + rep_op_fut_t + _submit_transaction(std::set<pg_shard_t>&& pg_shards, + const hobject_t& hoid, + ceph::os::Transaction&& txn, + osd_op_params_t&& req, + epoch_t min_epoch, epoch_t max_epoch, + std::vector<pg_log_entry_t>&& log_entries) final; + CollectionRef coll; + crimson::os::FuturizedStore::Shard* store; + seastar::future<> request_committed(const osd_reqid_t& reqid, + const eversion_t& version) final { + return seastar::now(); + } +}; diff --git a/src/crimson/osd/exceptions.h b/src/crimson/osd/exceptions.h new file mode 100644 index 000000000..2783ed252 --- /dev/null +++ b/src/crimson/osd/exceptions.h @@ -0,0 +1,46 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <exception> +#include <system_error> + +#include "crimson/common/errorator.h" + +namespace crimson::osd { +class error : private std::system_error { +public: + error(const std::errc ec) + : system_error(std::make_error_code(ec)) { + } + + using system_error::code; + using system_error::what; + + friend error make_error(int ret); + +private: + error(const int ret) noexcept + : system_error(ret, std::system_category()) { + } +}; + +inline error make_error(const int ret) { + return error{ret}; +} + +struct object_not_found : public error { + object_not_found() : error(std::errc::no_such_file_or_directory) {} +}; + +struct invalid_argument : public error { + invalid_argument() : error(std::errc::invalid_argument) {} +}; + +// FIXME: error handling +struct permission_denied : public error { + permission_denied() : error(std::errc::operation_not_permitted) {} +}; + +} // namespace crimson::osd diff --git a/src/crimson/osd/heartbeat.cc b/src/crimson/osd/heartbeat.cc new file mode 100644 index 000000000..266e56533 --- /dev/null +++ b/src/crimson/osd/heartbeat.cc @@ -0,0 +1,819 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "heartbeat.h" + +#include <boost/range/join.hpp> +#include <fmt/chrono.h> +#include <fmt/os.h> + +#include "messages/MOSDPing.h" +#include "messages/MOSDFailure.h" + +#include "crimson/common/config_proxy.h" +#include "crimson/common/formatter.h" +#include "crimson/net/Connection.h" +#include "crimson/net/Messenger.h" +#include "crimson/osd/shard_services.h" +#include "crimson/mon/MonClient.h" + +#include "osd/OSDMap.h" + +using std::set; +using std::string; +using crimson::common::local_conf; + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_osd); + } +} + +Heartbeat::Heartbeat(osd_id_t whoami, + crimson::osd::ShardServices& service, + crimson::mon::Client& monc, + crimson::net::Messenger &front_msgr, + crimson::net::Messenger &back_msgr) + : whoami{whoami}, + service{service}, + monc{monc}, + front_msgr{front_msgr}, + back_msgr{back_msgr}, + // do this in background + timer{[this] { + heartbeat_check(); + (void)send_heartbeats(); + }}, + failing_peers{*this} +{} + +seastar::future<> Heartbeat::start(entity_addrvec_t front_addrs, + entity_addrvec_t back_addrs) +{ + logger().info("heartbeat: start front_addrs={}, back_addrs={}", + front_addrs, back_addrs); + // i only care about the address, so any unused port would work + for (auto& addr : boost::join(front_addrs.v, back_addrs.v)) { + addr.set_port(0); + } + + using crimson::net::SocketPolicy; + 
front_msgr.set_policy(entity_name_t::TYPE_OSD, + SocketPolicy::lossy_client(0)); + back_msgr.set_policy(entity_name_t::TYPE_OSD, + SocketPolicy::lossy_client(0)); + return seastar::when_all_succeed(start_messenger(front_msgr, + front_addrs), + start_messenger(back_msgr, + back_addrs)) + .then_unpack([this] { + timer.arm_periodic( + std::chrono::seconds(local_conf()->osd_heartbeat_interval)); + }); +} + +seastar::future<> +Heartbeat::start_messenger(crimson::net::Messenger& msgr, + const entity_addrvec_t& addrs) +{ + return msgr.bind(addrs).safe_then([this, &msgr]() mutable { + return msgr.start({this}); + }, crimson::net::Messenger::bind_ertr::all_same_way( + [addrs] (const std::error_code& e) { + logger().error("heartbeat messenger bind({}): {}", addrs, e); + ceph_abort(); + })); +} + +seastar::future<> Heartbeat::stop() +{ + logger().info("{}", __func__); + timer.cancel(); + front_msgr.stop(); + back_msgr.stop(); + return gate.close().then([this] { + return seastar::when_all_succeed(front_msgr.shutdown(), + back_msgr.shutdown()); + }).then_unpack([] { + return seastar::now(); + }); +} + +const entity_addrvec_t& Heartbeat::get_front_addrs() const +{ + return front_msgr.get_myaddrs(); +} + +const entity_addrvec_t& Heartbeat::get_back_addrs() const +{ + return back_msgr.get_myaddrs(); +} + +crimson::net::Messenger& Heartbeat::get_front_msgr() const +{ + return front_msgr; +} + +crimson::net::Messenger& Heartbeat::get_back_msgr() const +{ + return back_msgr; +} + +void Heartbeat::add_peer(osd_id_t _peer, epoch_t epoch) +{ + assert(whoami != _peer); + auto [iter, added] = peers.try_emplace(_peer, *this, _peer); + auto& peer = iter->second; + peer.set_epoch_added(epoch); +} + +Heartbeat::osds_t Heartbeat::remove_down_peers() +{ + osds_t old_osds; // osds not added in this epoch + for (auto i = peers.begin(); i != peers.end(); ) { + auto osdmap = service.get_map(); + const auto& [osd, peer] = *i; + if (!osdmap->is_up(osd)) { + i = peers.erase(i); + } else { + if (peer.get_epoch_added() < osdmap->get_epoch()) { + old_osds.push_back(osd); + } + ++i; + } + } + return old_osds; +} + +void Heartbeat::add_reporter_peers(int whoami) +{ + auto osdmap = service.get_map(); + // include next and previous up osds to ensure we have a fully-connected set + set<int> want; + if (auto next = osdmap->get_next_up_osd_after(whoami); next >= 0) { + want.insert(next); + } + if (auto prev = osdmap->get_previous_up_osd_before(whoami); prev >= 0) { + want.insert(prev); + } + // make sure we have at least **min_down** osds coming from different + // subtree level (e.g., hosts) for fast failure detection. + auto min_down = local_conf().get_val<uint64_t>("mon_osd_min_down_reporters"); + auto subtree = local_conf().get_val<string>("mon_osd_reporter_subtree_level"); + osdmap->get_random_up_osds_by_subtree( + whoami, subtree, min_down, want, &want); + auto epoch = osdmap->get_epoch(); + for (int osd : want) { + add_peer(osd, epoch); + }; +} + +void Heartbeat::update_peers(int whoami) +{ + const auto min_peers = static_cast<size_t>( + local_conf().get_val<int64_t>("osd_heartbeat_min_peers")); + add_reporter_peers(whoami); + auto extra = remove_down_peers(); + // too many? + for (auto& osd : extra) { + if (peers.size() <= min_peers) { + break; + } + remove_peer(osd); + } + // or too few? 
+ auto osdmap = service.get_map(); + auto epoch = osdmap->get_epoch(); + for (auto next = osdmap->get_next_up_osd_after(whoami); + peers.size() < min_peers && next >= 0 && next != whoami; + next = osdmap->get_next_up_osd_after(next)) { + add_peer(next, epoch); + } +} + +Heartbeat::osds_t Heartbeat::get_peers() const +{ + osds_t osds; + osds.reserve(peers.size()); + for (auto& peer : peers) { + osds.push_back(peer.first); + } + return osds; +} + +void Heartbeat::remove_peer(osd_id_t peer) +{ + assert(peers.count(peer) == 1); + peers.erase(peer); +} + +std::optional<seastar::future<>> +Heartbeat::ms_dispatch(crimson::net::ConnectionRef conn, MessageRef m) +{ + bool dispatched = true; + gate.dispatch_in_background(__func__, *this, [this, conn, &m, &dispatched] { + switch (m->get_type()) { + case MSG_OSD_PING: + return handle_osd_ping(conn, boost::static_pointer_cast<MOSDPing>(m)); + default: + dispatched = false; + return seastar::now(); + } + }); + return (dispatched ? std::make_optional(seastar::now()) : std::nullopt); +} + +void Heartbeat::ms_handle_reset(crimson::net::ConnectionRef conn, bool is_replace) +{ + auto peer = conn->get_peer_id(); + if (conn->get_peer_type() != entity_name_t::TYPE_OSD || + peer == entity_name_t::NEW) { + return; + } + if (auto found = peers.find(peer); + found != peers.end()) { + found->second.handle_reset(conn, is_replace); + } +} + +void Heartbeat::ms_handle_connect( + crimson::net::ConnectionRef conn, + seastar::shard_id prv_shard) +{ + ceph_assert_always(seastar::this_shard_id() == prv_shard); + auto peer = conn->get_peer_id(); + if (conn->get_peer_type() != entity_name_t::TYPE_OSD || + peer == entity_name_t::NEW) { + return; + } + if (auto found = peers.find(peer); + found != peers.end()) { + found->second.handle_connect(conn); + } +} + +void Heartbeat::ms_handle_accept( + crimson::net::ConnectionRef conn, + seastar::shard_id prv_shard, + bool is_replace) +{ + ceph_assert_always(seastar::this_shard_id() == prv_shard); + auto peer = conn->get_peer_id(); + if (conn->get_peer_type() != entity_name_t::TYPE_OSD || + peer == entity_name_t::NEW) { + return; + } + if (auto found = peers.find(peer); + found != peers.end()) { + found->second.handle_accept(conn, is_replace); + } +} + +seastar::future<> Heartbeat::handle_osd_ping(crimson::net::ConnectionRef conn, + Ref<MOSDPing> m) +{ + switch (m->op) { + case MOSDPing::PING: + return handle_ping(conn, m); + case MOSDPing::PING_REPLY: + return handle_reply(conn, m); + case MOSDPing::YOU_DIED: + return handle_you_died(); + default: + return seastar::now(); + } +} + +seastar::future<> Heartbeat::handle_ping(crimson::net::ConnectionRef conn, + Ref<MOSDPing> m) +{ + auto min_message = static_cast<uint32_t>( + local_conf()->osd_heartbeat_min_size); + auto reply = + crimson::make_message<MOSDPing>( + m->fsid, + service.get_map()->get_epoch(), + MOSDPing::PING_REPLY, + m->ping_stamp, + m->mono_ping_stamp, + service.get_mnow(), + service.get_up_epoch(), + min_message); + return conn->send(std::move(reply) + ).then([this, m, conn] { + return maybe_share_osdmap(conn, m); + }); +} + +seastar::future<> Heartbeat::maybe_share_osdmap( + crimson::net::ConnectionRef conn, + Ref<MOSDPing> m) +{ + const osd_id_t from = m->get_source().num(); + const epoch_t current_osdmap_epoch = service.get_map()->get_epoch(); + auto found = peers.find(from); + if (found == peers.end()) { + return seastar::now(); + } + auto& peer = found->second; + + if (m->map_epoch > peer.get_projected_epoch()) { + logger().debug("{} updating peer {} session's 
projected_epoch" + "from {} to ping map epoch of {}", + __func__, from, peer.get_projected_epoch(), + m->map_epoch); + peer.set_projected_epoch(m->map_epoch); + } + + if (current_osdmap_epoch <= peer.get_projected_epoch()) { + logger().debug("{} peer {} projected_epoch {} is already later " + "than our osdmap epoch of {}", + __func__ , from, peer.get_projected_epoch(), + current_osdmap_epoch); + return seastar::now(); + } + + const epoch_t send_from = peer.get_projected_epoch(); + logger().debug("{} sending peer {} peer maps from projected epoch {} through " + "local osdmap epoch {}", + __func__, + from, + send_from, + current_osdmap_epoch); + peer.set_projected_epoch(current_osdmap_epoch); + return service.send_incremental_map_to_osd(from, send_from); +} + +seastar::future<> Heartbeat::handle_reply(crimson::net::ConnectionRef conn, + Ref<MOSDPing> m) +{ + const osd_id_t from = m->get_source().num(); + auto found = peers.find(from); + if (found == peers.end()) { + // stale reply + return seastar::now(); + } + auto& peer = found->second; + return peer.handle_reply(conn, m + ).then([this, conn, m] { + return maybe_share_osdmap(conn, m); + }); +} + +seastar::future<> Heartbeat::handle_you_died() +{ + // TODO: ask for newer osdmap + return seastar::now(); +} + +void Heartbeat::heartbeat_check() +{ + failure_queue_t failure_queue; + const auto now = clock::now(); + for (const auto& [osd, peer] : peers) { + auto failed_since = peer.failed_since(now); + if (!clock::is_zero(failed_since)) { + failure_queue.emplace(osd, failed_since); + } + } + if (!failure_queue.empty()) { + // send_failures can run in background, because + // 1. After the execution of send_failures, no msg is actually + // sent, which means the sending operation is not done, + // which further seems to involve problems risks that when + // osd shuts down, the left part of the sending operation + // may reference OSD and Heartbeat instances that are already + // deleted. However, remaining work of that sending operation + // involves no reference back to OSD or Heartbeat instances, + // which means it wouldn't involve the above risks. + // 2. messages are sent in order, if later checks find out + // the previous "failed" peers to be healthy, that "still + // alive" messages would be sent after the previous "osd + // failure" messages which is totally safe. 
+ (void)send_failures(std::move(failure_queue)); + } +} + +seastar::future<> Heartbeat::send_heartbeats() +{ + const auto mnow = service.get_mnow(); + const auto now = clock::now(); + + std::vector<seastar::future<>> futures; + for (auto& [osd, peer] : peers) { + peer.send_heartbeat(now, mnow, futures); + } + return seastar::when_all_succeed(futures.begin(), futures.end()); +} + +seastar::future<> Heartbeat::send_failures(failure_queue_t&& failure_queue) +{ + std::vector<seastar::future<>> futures; + const auto now = clock::now(); + for (auto [osd, failed_since] : failure_queue) { + failing_peers.add_pending(osd, failed_since, now, futures); + } + + return seastar::when_all_succeed(futures.begin(), futures.end()); +} + +void Heartbeat::print(std::ostream& out) const +{ + out << "heartbeat"; +} + +Heartbeat::Connection::~Connection() +{ + if (conn) { + conn->mark_down(); + } +} + +bool Heartbeat::Connection::matches(crimson::net::ConnectionRef _conn) const +{ + return (conn && conn == _conn); +} + +bool Heartbeat::Connection::accepted( + crimson::net::ConnectionRef accepted_conn, + bool is_replace) +{ + ceph_assert(accepted_conn); + ceph_assert(accepted_conn != conn); + if (accepted_conn->get_peer_addr() != listener.get_peer_addr(type)) { + return false; + } + + if (is_replace) { + logger().info("Heartbeat::Connection::accepted(): " + "{} racing", *this); + racing_detected = true; + } + if (conn) { + // there is no assumption about the ordering of the reset and accept + // events for the 2 racing connections. + if (is_connected) { + logger().warn("Heartbeat::Connection::accepted(): " + "{} is accepted while connected, is_replace={}", + *this, is_replace); + conn->mark_down(); + set_unconnected(); + } + } + conn = accepted_conn; + set_connected(); + return true; +} + +void Heartbeat::Connection::reset(bool is_replace) +{ + if (is_replace) { + logger().info("Heartbeat::Connection::reset(): " + "{} racing, waiting for the replacing accept", + *this); + racing_detected = true; + } + + if (is_connected) { + set_unconnected(); + } else { + conn = nullptr; + } + + if (is_replace) { + // waiting for the replacing accept event + } else if (!racing_detected || is_winner_side) { + connect(); + } else { // racing_detected && !is_winner_side + logger().info("Heartbeat::Connection::reset(): " + "{} racing detected and lose, " + "waiting for peer connect me", *this); + } +} + +seastar::future<> Heartbeat::Connection::send(MessageURef msg) +{ + assert(is_connected); + return conn->send(std::move(msg)); +} + +void Heartbeat::Connection::validate() +{ + assert(is_connected); + auto peer_addr = listener.get_peer_addr(type); + if (conn->get_peer_addr() != peer_addr) { + logger().info("Heartbeat::Connection::validate(): " + "{} has new address {} over {}, reset", + *this, peer_addr, conn->get_peer_addr()); + conn->mark_down(); + racing_detected = false; + reset(); + } +} + +void Heartbeat::Connection::retry() +{ + racing_detected = false; + if (!is_connected) { + if (conn) { + conn->mark_down(); + reset(); + } else { + connect(); + } + } +} + +void Heartbeat::Connection::set_connected() +{ + assert(conn); + assert(!is_connected); + ceph_assert(conn->is_connected()); + is_connected = true; + listener.increase_connected(); +} + +void Heartbeat::Connection::set_unconnected() +{ + assert(conn); + assert(is_connected); + conn = nullptr; + is_connected = false; + listener.decrease_connected(); +} + +void Heartbeat::Connection::connect() +{ + assert(!conn); + auto addr = listener.get_peer_addr(type); + conn = 
msgr.connect(addr, entity_name_t(CEPH_ENTITY_TYPE_OSD, peer)); + if (conn->is_connected()) { + set_connected(); + } +} + +Heartbeat::clock::time_point +Heartbeat::Session::failed_since(Heartbeat::clock::time_point now) const +{ + if (do_health_screen(now) == health_state::UNHEALTHY) { + auto oldest_deadline = ping_history.begin()->second.deadline; + auto failed_since = std::min(last_rx_back, last_rx_front); + if (clock::is_zero(failed_since)) { + logger().error("Heartbeat::Session::failed_since(): no reply from osd.{} " + "ever on either front or back, first ping sent {} " + "(oldest deadline {})", + peer, first_tx, oldest_deadline); + failed_since = first_tx; + } else { + logger().error("Heartbeat::Session::failed_since(): no reply from osd.{} " + "since back {} front {} (oldest deadline {})", + peer, last_rx_back, last_rx_front, oldest_deadline); + } + return failed_since; + } else { + return clock::zero(); + } +} + +void Heartbeat::Session::set_inactive_history(clock::time_point now) +{ + assert(!connected); + if (ping_history.empty()) { + const utime_t sent_stamp{now}; + const auto deadline = + now + std::chrono::seconds(local_conf()->osd_heartbeat_grace); + ping_history.emplace(sent_stamp, reply_t{deadline, 0}); + } else { // the entry is already added + assert(ping_history.size() == 1); + } +} + +Heartbeat::Peer::Peer(Heartbeat& heartbeat, osd_id_t peer) + : ConnectionListener(2), heartbeat{heartbeat}, peer{peer}, session{peer}, + con_front(peer, heartbeat.whoami > peer, Connection::type_t::front, + heartbeat.front_msgr, *this), + con_back(peer, heartbeat.whoami > peer, Connection::type_t::back, + heartbeat.back_msgr, *this) +{ + logger().info("Heartbeat::Peer: osd.{} added", peer); +} + +Heartbeat::Peer::~Peer() +{ + logger().info("Heartbeat::Peer: osd.{} removed", peer); +} + +void Heartbeat::Peer::send_heartbeat( + clock::time_point now, ceph::signedspan mnow, + std::vector<seastar::future<>>& futures) +{ + session.set_tx(now); + if (session.is_started()) { + do_send_heartbeat(now, mnow, &futures); + for_each_conn([] (auto& conn) { + conn.validate(); + }); + } else { + // we should send MOSDPing but still cannot at this moment + if (pending_send) { + // we have already pending for a entire heartbeat interval + logger().warn("Heartbeat::Peer::send_heartbeat(): " + "heartbeat to osd.{} is still pending...", peer); + for_each_conn([] (auto& conn) { + conn.retry(); + }); + } else { + logger().info("Heartbeat::Peer::send_heartbeat(): " + "heartbeat to osd.{} is pending send...", peer); + session.set_inactive_history(now); + pending_send = true; + } + } +} + +void Heartbeat::Peer::handle_reset( + crimson::net::ConnectionRef conn, bool is_replace) +{ + int cnt = 0; + for_each_conn([&] (auto& _conn) { + if (_conn.matches(conn)) { + ++cnt; + _conn.reset(is_replace); + } + }); + + if (cnt == 0) { + logger().info("Heartbeat::Peer::handle_reset(): {} ignores conn, is_replace={} -- {}", + *this, is_replace, *conn); + } else if (cnt > 1) { + logger().error("Heartbeat::Peer::handle_reset(): {} handles conn {} times -- {}", + *this, cnt, *conn); + } +} + +void Heartbeat::Peer::handle_connect(crimson::net::ConnectionRef conn) +{ + int cnt = 0; + for_each_conn([&] (auto& _conn) { + if (_conn.matches(conn)) { + ++cnt; + _conn.connected(); + } + }); + + if (cnt == 0) { + logger().error("Heartbeat::Peer::handle_connect(): {} ignores conn -- {}", + *this, *conn); + conn->mark_down(); + } else if (cnt > 1) { + logger().error("Heartbeat::Peer::handle_connect(): {} handles conn {} times -- {}", + *this, 
cnt, *conn); + } +} + +void Heartbeat::Peer::handle_accept(crimson::net::ConnectionRef conn, bool is_replace) +{ + int cnt = 0; + for_each_conn([&] (auto& _conn) { + if (_conn.accepted(conn, is_replace)) { + ++cnt; + } + }); + + if (cnt == 0) { + logger().warn("Heartbeat::Peer::handle_accept(): {} ignores conn -- {}", + *this, *conn); + } else if (cnt > 1) { + logger().error("Heartbeat::Peer::handle_accept(): {} handles conn {} times -- {}", + *this, cnt, *conn); + } +} + +seastar::future<> Heartbeat::Peer::handle_reply( + crimson::net::ConnectionRef conn, Ref<MOSDPing> m) +{ + if (!session.is_started()) { + // we haven't sent any ping yet + return seastar::now(); + } + type_t type; + if (con_front.matches(conn)) { + type = type_t::front; + } else if (con_back.matches(conn)) { + type = type_t::back; + } else { + return seastar::now(); + } + const auto now = clock::now(); + if (session.on_pong(m->ping_stamp, type, now)) { + if (session.do_health_screen(now) == Session::health_state::HEALTHY) { + return heartbeat.failing_peers.cancel_one(peer); + } + } + return seastar::now(); +} + +entity_addr_t Heartbeat::Peer::get_peer_addr(type_t type) +{ + const auto osdmap = heartbeat.service.get_map(); + if (type == type_t::front) { + return osdmap->get_hb_front_addrs(peer).front(); + } else { + return osdmap->get_hb_back_addrs(peer).front(); + } +} + +void Heartbeat::Peer::on_connected() +{ + logger().info("Heartbeat::Peer: osd.{} connected (send={})", + peer, pending_send); + session.on_connected(); + if (pending_send) { + pending_send = false; + do_send_heartbeat(clock::now(), heartbeat.service.get_mnow(), nullptr); + } +} + +void Heartbeat::Peer::on_disconnected() +{ + logger().info("Heartbeat::Peer: osd.{} disconnected", peer); + session.on_disconnected(); +} + +void Heartbeat::Peer::do_send_heartbeat( + Heartbeat::clock::time_point now, + ceph::signedspan mnow, + std::vector<seastar::future<>>* futures) +{ + const utime_t sent_stamp{now}; + const auto deadline = + now + std::chrono::seconds(local_conf()->osd_heartbeat_grace); + session.on_ping(sent_stamp, deadline); + for_each_conn([&, this] (auto& conn) { + auto min_message = static_cast<uint32_t>( + local_conf()->osd_heartbeat_min_size); + auto ping = crimson::make_message<MOSDPing>( + heartbeat.monc.get_fsid(), + heartbeat.service.get_map()->get_epoch(), + MOSDPing::PING, + sent_stamp, + mnow, + mnow, + heartbeat.service.get_up_epoch(), + min_message); + if (futures) { + futures->push_back(conn.send(std::move(ping))); + } + }); +} + +bool Heartbeat::FailingPeers::add_pending( + osd_id_t peer, + clock::time_point failed_since, + clock::time_point now, + std::vector<seastar::future<>>& futures) +{ + if (failure_pending.count(peer)) { + return false; + } + auto failed_for = std::chrono::duration_cast<std::chrono::seconds>( + now - failed_since).count(); + auto osdmap = heartbeat.service.get_map(); + auto failure_report = + crimson::make_message<MOSDFailure>(heartbeat.monc.get_fsid(), + peer, + osdmap->get_addrs(peer), + static_cast<int>(failed_for), + osdmap->get_epoch()); + failure_pending.emplace(peer, failure_info_t{failed_since, + osdmap->get_addrs(peer)}); + futures.push_back(heartbeat.monc.send_message(std::move(failure_report))); + logger().info("{}: osd.{} failed for {}", __func__, peer, failed_for); + return true; +} + +seastar::future<> Heartbeat::FailingPeers::cancel_one(osd_id_t peer) +{ + if (auto pending = failure_pending.find(peer); + pending != failure_pending.end()) { + auto fut = send_still_alive(peer, pending->second.addrs); + 
failure_pending.erase(peer); + return fut; + } + return seastar::now(); +} + +seastar::future<> +Heartbeat::FailingPeers::send_still_alive( + osd_id_t osd, const entity_addrvec_t& addrs) +{ + auto still_alive = crimson::make_message<MOSDFailure>( + heartbeat.monc.get_fsid(), + osd, + addrs, + 0, + heartbeat.service.get_map()->get_epoch(), + MOSDFailure::FLAG_ALIVE); + logger().info("{}: osd.{}", __func__, osd); + return heartbeat.monc.send_message(std::move(still_alive)); +} diff --git a/src/crimson/osd/heartbeat.h b/src/crimson/osd/heartbeat.h new file mode 100644 index 000000000..f5da45118 --- /dev/null +++ b/src/crimson/osd/heartbeat.h @@ -0,0 +1,461 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <cstdint> +#include <seastar/core/future.hh> +#include "common/ceph_time.h" +#include "crimson/common/gated.h" +#include "crimson/net/Dispatcher.h" +#include "crimson/net/Fwd.h" + +class MOSDPing; + +namespace crimson::osd { + class ShardServices; +} + +namespace crimson::mon { + class Client; +} + +template<typename Message> using Ref = boost::intrusive_ptr<Message>; + +class Heartbeat : public crimson::net::Dispatcher { +public: + using osd_id_t = int; + + Heartbeat(osd_id_t whoami, + crimson::osd::ShardServices& service, + crimson::mon::Client& monc, + crimson::net::Messenger &front_msgr, + crimson::net::Messenger &back_msgr); + + seastar::future<> start(entity_addrvec_t front, + entity_addrvec_t back); + seastar::future<> stop(); + + using osds_t = std::vector<osd_id_t>; + void add_peer(osd_id_t peer, epoch_t epoch); + void update_peers(int whoami); + void remove_peer(osd_id_t peer); + osds_t get_peers() const; + + const entity_addrvec_t& get_front_addrs() const; + const entity_addrvec_t& get_back_addrs() const; + + crimson::net::Messenger &get_front_msgr() const; + crimson::net::Messenger &get_back_msgr() const; + + // Dispatcher methods + std::optional<seastar::future<>> ms_dispatch( + crimson::net::ConnectionRef conn, MessageRef m) override; + void ms_handle_reset(crimson::net::ConnectionRef conn, bool is_replace) override; + void ms_handle_connect(crimson::net::ConnectionRef conn, seastar::shard_id) override; + void ms_handle_accept(crimson::net::ConnectionRef conn, seastar::shard_id, bool is_replace) override; + + void print(std::ostream&) const; +private: + seastar::future<> handle_osd_ping(crimson::net::ConnectionRef conn, + Ref<MOSDPing> m); + seastar::future<> handle_ping(crimson::net::ConnectionRef conn, + Ref<MOSDPing> m); + seastar::future<> handle_reply(crimson::net::ConnectionRef conn, + Ref<MOSDPing> m); + seastar::future<> handle_you_died(); + + /// remove down OSDs + /// @return peers not added in this epoch + osds_t remove_down_peers(); + /// add enough reporters for fast failure detection + void add_reporter_peers(int whoami); + + seastar::future<> start_messenger(crimson::net::Messenger& msgr, + const entity_addrvec_t& addrs); + seastar::future<> maybe_share_osdmap(crimson::net::ConnectionRef, + Ref<MOSDPing> m); +private: + const osd_id_t whoami; + crimson::osd::ShardServices& service; + crimson::mon::Client& monc; + crimson::net::Messenger &front_msgr; + crimson::net::Messenger &back_msgr; + + seastar::timer<seastar::lowres_clock> timer; + // use real_clock so it can be converted to utime_t + using clock = ceph::coarse_real_clock; + + class ConnectionListener; + class Connection; + class Session; + class Peer; + using peers_map_t = std::map<osd_id_t, Peer>; + peers_map_t peers; + + // 
osds which are considered failed + // osd_id => when was the last time that both front and back pings were acked + // or sent. + // use for calculating how long the OSD has been unresponsive + using failure_queue_t = std::map<osd_id_t, clock::time_point>; + seastar::future<> send_failures(failure_queue_t&& failure_queue); + seastar::future<> send_heartbeats(); + void heartbeat_check(); + + // osds we've reported to monior as failed ones, but they are not marked down + // yet + crimson::common::Gated gate; + + class FailingPeers { + public: + FailingPeers(Heartbeat& heartbeat) : heartbeat(heartbeat) {} + bool add_pending(osd_id_t peer, + clock::time_point failed_since, + clock::time_point now, + std::vector<seastar::future<>>& futures); + seastar::future<> cancel_one(osd_id_t peer); + + private: + seastar::future<> send_still_alive(osd_id_t, const entity_addrvec_t&); + + Heartbeat& heartbeat; + + struct failure_info_t { + clock::time_point failed_since; + entity_addrvec_t addrs; + }; + std::map<osd_id_t, failure_info_t> failure_pending; + } failing_peers; +}; + +inline std::ostream& operator<<(std::ostream& out, const Heartbeat& hb) { + hb.print(out); + return out; +} + +/* + * Event driven interface for Heartbeat::Peer to be notified when both hb_front + * and hb_back are connected, or connection is lost. + */ +class Heartbeat::ConnectionListener { + public: + ConnectionListener(size_t connections) : connections{connections} {} + + void increase_connected() { + assert(connected < connections); + ++connected; + if (connected == connections) { + on_connected(); + } + } + void decrease_connected() { + assert(connected > 0); + if (connected == connections) { + on_disconnected(); + } + --connected; + } + enum class type_t { front, back }; + virtual entity_addr_t get_peer_addr(type_t) = 0; + + protected: + virtual void on_connected() = 0; + virtual void on_disconnected() = 0; + + private: + const size_t connections; + size_t connected = 0; +}; + +class Heartbeat::Connection { + public: + using type_t = ConnectionListener::type_t; + Connection(osd_id_t peer, bool is_winner_side, type_t type, + crimson::net::Messenger& msgr, + ConnectionListener& listener) + : peer{peer}, type{type}, + msgr{msgr}, listener{listener}, + is_winner_side{is_winner_side} { + connect(); + } + Connection(const Connection&) = delete; + Connection(Connection&&) = delete; + Connection& operator=(const Connection&) = delete; + Connection& operator=(Connection&&) = delete; + + ~Connection(); + + bool matches(crimson::net::ConnectionRef _conn) const; + void connected() { + set_connected(); + } + bool accepted(crimson::net::ConnectionRef, bool is_replace); + void reset(bool is_replace=false); + seastar::future<> send(MessageURef msg); + void validate(); + // retry connection if still pending + void retry(); + + private: + void set_connected(); + void set_unconnected(); + void connect(); + + const osd_id_t peer; + const type_t type; + crimson::net::Messenger& msgr; + ConnectionListener& listener; + +/* + * Resolve the following racing when both me and peer are trying to connect + * each other symmetrically, under SocketPolicy::lossy_client: + * + * OSD.A OSD.B + * - - + * |-[1]----> <----[2]-| + * \ / + * \ / + * delay.. X delay.. + * / \ + * |-[1]x> / \ <x[2]-| + * |<-[2]--- ---[1]->| + * |(reset#1) (reset#2)| + * |(reconnectB) (reconnectA)| + * |-[2]---> <---[1]-| + * delay.. delay.. + * (remote close populated) + * |-[2]x> <x[1]-| + * |(reset#2) (reset#1)| + * | ... ... | + * (dead loop!) 
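+ * (under SocketPolicy::lossy_client every reset triggers a fresh reconnect
+ * from the other side, so the two OSDs keep tearing down each other's newly
+ * established connections and neither heartbeat connection ever survives)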
+ * + * Our solution is to remember if such racing was happened recently, and + * establish connection asymmetrically only from the winner side whose osd-id + * is larger. + */ + const bool is_winner_side; + bool racing_detected = false; + + crimson::net::ConnectionRef conn; + bool is_connected = false; + + friend std::ostream& operator<<(std::ostream& os, const Connection& c) { + if (c.type == type_t::front) { + return os << "con_front(osd." << c.peer << ")"; + } else { + return os << "con_back(osd." << c.peer << ")"; + } + } +}; + +/* + * Track the ping history and ping reply (the pong) from the same session, clean up + * history once hb_front or hb_back loses connection and restart the session once + * both connections are connected again. + * + * We cannot simply remove the entire Heartbeat::Peer once hb_front or hb_back + * loses connection, because we would end up with the following deadloop: + * + * OSD.A OSD.B + * - - + * hb_front reset <--(network)--- hb_front close + * | ^ + * | | + * remove Peer B (dead loop!) remove Peer A + * | | + * V | + * hb_back close ----(network)---> hb_back reset + */ +class Heartbeat::Session { + public: + Session(osd_id_t peer) : peer{peer} {} + + void set_epoch_added(epoch_t epoch_) { epoch = epoch_; } + epoch_t get_epoch_added() const { return epoch; } + + void set_projected_epoch(epoch_t epoch_) { projected_epoch = epoch_; } + epoch_t get_projected_epoch() const { return projected_epoch; } + + bool is_started() const { return connected; } + bool pinged() const { + if (clock::is_zero(first_tx)) { + // i can never receive a pong without sending any ping message first. + assert(clock::is_zero(last_rx_front) && + clock::is_zero(last_rx_back)); + return false; + } else { + return true; + } + } + + enum class health_state { + UNKNOWN, + UNHEALTHY, + HEALTHY, + }; + health_state do_health_screen(clock::time_point now) const { + if (!pinged()) { + // we are not healty nor unhealty because we haven't sent anything yet + return health_state::UNKNOWN; + } else if (!ping_history.empty() && ping_history.begin()->second.deadline < now) { + return health_state::UNHEALTHY; + } else if (!clock::is_zero(last_rx_front) && + !clock::is_zero(last_rx_back)) { + // only declare to be healthy until we have received the first + // replies from both front/back connections + return health_state::HEALTHY; + } else { + return health_state::UNKNOWN; + } + } + + clock::time_point failed_since(clock::time_point now) const; + + void set_tx(clock::time_point now) { + if (!pinged()) { + first_tx = now; + } + last_tx = now; + } + + void on_connected() { + assert(!connected); + connected = true; + ping_history.clear(); + } + + void on_ping(const utime_t& sent_stamp, + const clock::time_point& deadline) { + assert(connected); + [[maybe_unused]] auto [reply, added] = + ping_history.emplace(sent_stamp, reply_t{deadline, 2}); + } + + bool on_pong(const utime_t& ping_stamp, + Connection::type_t type, + clock::time_point now) { + assert(connected); + auto ping = ping_history.find(ping_stamp); + if (ping == ping_history.end()) { + // old replies, deprecated by newly sent pings. 
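+ // (once an entry is fully acknowledged, everything up to and including it
+ // is erased below, so a pong carrying an older stamp can no longer be
+ // found and is simply dropped here)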
+ return false; + } + auto& unacked = ping->second.unacknowledged; + assert(unacked); + if (type == Connection::type_t::front) { + last_rx_front = now; + unacked--; + } else { + last_rx_back = now; + unacked--; + } + if (unacked == 0) { + ping_history.erase(ping_history.begin(), ++ping); + } + return true; + } + + void on_disconnected() { + assert(connected); + connected = false; + if (!ping_history.empty()) { + // we lost our ping_history of the last session, but still need to keep + // the oldest deadline for unhealthy check. + auto oldest = ping_history.begin(); + auto sent_stamp = oldest->first; + auto deadline = oldest->second.deadline; + ping_history.clear(); + ping_history.emplace(sent_stamp, reply_t{deadline, 0}); + } + } + + // maintain an entry in ping_history for unhealthy check + void set_inactive_history(clock::time_point); + + private: + const osd_id_t peer; + bool connected = false; + // time we sent our first ping request + clock::time_point first_tx; + // last time we sent a ping request + clock::time_point last_tx; + // last time we got a ping reply on the front side + clock::time_point last_rx_front; + // last time we got a ping reply on the back side + clock::time_point last_rx_back; + // most recent epoch we wanted this peer + epoch_t epoch; // rename me to epoch_added + // epoch we expect peer to be at once our sent incrementals are processed + epoch_t projected_epoch = 0; + + struct reply_t { + clock::time_point deadline; + // one sent over front conn, another sent over back conn + uint8_t unacknowledged = 0; + }; + // history of inflight pings, arranging by timestamp we sent + std::map<utime_t, reply_t> ping_history; +}; + +class Heartbeat::Peer final : private Heartbeat::ConnectionListener { + public: + Peer(Heartbeat&, osd_id_t); + ~Peer(); + Peer(Peer&&) = delete; + Peer(const Peer&) = delete; + Peer& operator=(Peer&&) = delete; + Peer& operator=(const Peer&) = delete; + + // set/get the epoch at which the peer was added + void set_epoch_added(epoch_t epoch) { session.set_epoch_added(epoch); } + epoch_t get_epoch_added() const { return session.get_epoch_added(); } + + void set_projected_epoch(epoch_t epoch) { session.set_projected_epoch(epoch); } + epoch_t get_projected_epoch() const { return session.get_projected_epoch(); } + + // if failure, return time_point since last active + // else, return clock::zero() + clock::time_point failed_since(clock::time_point now) const { + return session.failed_since(now); + } + void send_heartbeat( + clock::time_point, ceph::signedspan, std::vector<seastar::future<>>&); + seastar::future<> handle_reply(crimson::net::ConnectionRef, Ref<MOSDPing>); + + void handle_reset(crimson::net::ConnectionRef conn, bool is_replace); + + void handle_connect(crimson::net::ConnectionRef conn); + + void handle_accept(crimson::net::ConnectionRef conn, bool is_replace); + + private: + entity_addr_t get_peer_addr(type_t type) override; + void on_connected() override; + void on_disconnected() override; + void do_send_heartbeat( + clock::time_point, ceph::signedspan, std::vector<seastar::future<>>*); + + template <typename Func> + void for_each_conn(Func&& f) { + f(con_front); + f(con_back); + } + + Heartbeat& heartbeat; + const osd_id_t peer; + Session session; + // if need to send heartbeat when session connected + bool pending_send = false; + Connection con_front; + Connection con_back; + + friend std::ostream& operator<<(std::ostream& os, const Peer& p) { + return os << "peer(osd." 
<< p.peer << ")"; + } +}; + +#if FMT_VERSION >= 90000 +template <> struct fmt::formatter<Heartbeat> : fmt::ostream_formatter {}; +template <> struct fmt::formatter<Heartbeat::Connection> : fmt::ostream_formatter {}; +template <> struct fmt::formatter<Heartbeat::Peer> : fmt::ostream_formatter {}; +#endif diff --git a/src/crimson/osd/lsan_suppressions.cc b/src/crimson/osd/lsan_suppressions.cc new file mode 100644 index 000000000..53b7eb630 --- /dev/null +++ b/src/crimson/osd/lsan_suppressions.cc @@ -0,0 +1,20 @@ +#ifndef _NDEBUG +// The callbacks we define here will be called from the sanitizer runtime, but +// aren't referenced from the Chrome executable. We must ensure that those +// callbacks are not sanitizer-instrumented, and that they aren't stripped by +// the linker. +#define SANITIZER_HOOK_ATTRIBUTE \ + extern "C" \ + __attribute__((no_sanitize("address", "thread", "undefined"))) \ + __attribute__((visibility("default"))) \ + __attribute__((used)) + +static char kLSanDefaultSuppressions[] = + "leak:InitModule\n" + "leak:MallocExtension::Initialize\n" + "leak:MallocExtension::Register\n"; + +SANITIZER_HOOK_ATTRIBUTE const char *__lsan_default_suppressions() { + return kLSanDefaultSuppressions; +} +#endif // ! _NDEBUG diff --git a/src/crimson/osd/main.cc b/src/crimson/osd/main.cc new file mode 100644 index 000000000..1e817415d --- /dev/null +++ b/src/crimson/osd/main.cc @@ -0,0 +1,259 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#include <sys/types.h> +#include <unistd.h> + +#include <iostream> +#include <fstream> +#include <random> + +#include <seastar/core/app-template.hh> +#include <seastar/core/print.hh> +#include <seastar/core/prometheus.hh> +#include <seastar/core/thread.hh> +#include <seastar/http/httpd.hh> +#include <seastar/net/inet_address.hh> +#include <seastar/util/closeable.hh> +#include <seastar/util/defer.hh> +#include <seastar/util/std-compat.hh> + +#include "auth/KeyRing.h" +#include "common/ceph_argparse.h" +#include "common/config_tracker.h" +#include "crimson/common/buffer_io.h" +#include "crimson/common/config_proxy.h" +#include "crimson/common/fatal_signal.h" +#include "crimson/mon/MonClient.h" +#include "crimson/net/Messenger.h" +#include "crimson/osd/stop_signal.h" +#include "crimson/osd/main_config_bootstrap_helpers.h" +#include "global/pidfile.h" +#include "osd.h" + +using namespace std::literals; +namespace bpo = boost::program_options; +using crimson::common::local_conf; +using crimson::common::sharded_conf; +using crimson::common::sharded_perf_coll; + +static seastar::logger& logger() +{ + return crimson::get_logger(ceph_subsys_osd); +} + +seastar::future<> make_keyring() +{ + const auto path = local_conf().get_val<std::string>("keyring"); + return seastar::file_exists(path).then([path](bool exists) { + KeyRing keyring; + EntityName name{local_conf()->name}; + EntityAuth auth; + if (exists && + keyring.load(nullptr, path) == 0 && + keyring.get_auth(name, auth)) { + fmt::print(std::cerr, "already have key in keyring: {}\n", path); + return seastar::now(); + } else { + CephContext temp_cct{}; + auth.key.create(&temp_cct, CEPH_CRYPTO_AES); + keyring.add(name, auth); + bufferlist bl; + keyring.encode_plaintext(bl); + const auto permissions = (seastar::file_permissions::user_read | + seastar::file_permissions::user_write); + return crimson::write_file(std::move(bl), path, permissions); + } + }).handle_exception_type([path](const std::filesystem::filesystem_error& e) { + fmt::print(std::cerr, "FATAL: 
writing new keyring to {}: {}\n", path, e.what()); + throw e; + }); +} + +static std::ofstream maybe_set_logger() +{ + std::ofstream log_file_stream; + if (auto log_file = local_conf()->log_file; !log_file.empty()) { + log_file_stream.open(log_file, std::ios::app | std::ios::out); + try { + seastar::throw_system_error_on(log_file_stream.fail()); + } catch (const std::system_error& e) { + ceph_abort_msg(fmt::format("unable to open log file: {}", e.what())); + } + logger().set_ostream(log_file_stream); + } + return log_file_stream; +} + +int main(int argc, const char* argv[]) +{ + auto early_config_result = crimson::osd::get_early_config(argc, argv); + if (!early_config_result.has_value()) { + int r = early_config_result.error(); + std::cerr << "do_early_config returned error: " << r << std::endl; + return r; + } + auto &early_config = early_config_result.value(); + + auto seastar_n_early_args = early_config.get_early_args(); + auto config_proxy_args = early_config.get_ceph_args(); + + seastar::app_template::config app_cfg; + app_cfg.name = "Crimson"; + app_cfg.auto_handle_sigint_sigterm = false; + seastar::app_template app(std::move(app_cfg)); + app.add_options() + ("mkkey", "generate a new secret key. " + "This is normally used in combination with --mkfs") + ("mkfs", "create a [new] data directory") + ("debug", "enable debug output on all loggers") + ("trace", "enable trace output on all loggers") + ("osdspec-affinity", bpo::value<std::string>()->default_value(std::string{}), + "set affinity to an osdspec") + ("prometheus_port", bpo::value<uint16_t>()->default_value(0), + "Prometheus port. Set to zero to disable") + ("prometheus_address", bpo::value<std::string>()->default_value("0.0.0.0"), + "Prometheus listening address") + ("prometheus_prefix", bpo::value<std::string>()->default_value("osd"), + "Prometheus metrics prefix"); + + try { + return app.run( + seastar_n_early_args.size(), + const_cast<char**>(seastar_n_early_args.data()), + [&] { + auto& config = app.configuration(); + return seastar::async([&] { + try { + FatalSignal fatal_signal; + seastar_apps_lib::stop_signal should_stop; + if (config.count("debug")) { + seastar::global_logger_registry().set_all_loggers_level( + seastar::log_level::debug + ); + } + if (config.count("trace")) { + seastar::global_logger_registry().set_all_loggers_level( + seastar::log_level::trace + ); + } + sharded_conf().start( + early_config.init_params.name, early_config.cluster_name).get(); + local_conf().start().get(); + auto stop_conf = seastar::deferred_stop(sharded_conf()); + sharded_perf_coll().start().get(); + auto stop_perf_coll = seastar::deferred_stop(sharded_perf_coll()); + local_conf().parse_config_files(early_config.conf_file_list).get(); + local_conf().parse_env().get(); + local_conf().parse_argv(config_proxy_args).get(); + auto log_file_stream = maybe_set_logger(); + auto reset_logger = seastar::defer([] { + logger().set_ostream(std::cerr); + }); + if (const auto ret = pidfile_write(local_conf()->pid_file); + ret == -EACCES || ret == -EAGAIN) { + ceph_abort_msg( + "likely there is another crimson-osd instance with the same id"); + } else if (ret < 0) { + ceph_abort_msg(fmt::format("pidfile_write failed with {} {}", + ret, cpp_strerror(-ret))); + } + // just ignore SIGHUP, we don't reread settings. keep in mind signals + // handled by S* must be blocked for alien threads (see AlienStore). 
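+ // (registering the no-op handler below replaces the default SIGHUP
+ // disposition, which would otherwise terminate the process)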
+ seastar::engine().handle_signal(SIGHUP, [] {}); + + // start prometheus API server + seastar::httpd::http_server_control prom_server; + std::any stop_prometheus; + if (uint16_t prom_port = config["prometheus_port"].as<uint16_t>(); + prom_port != 0) { + prom_server.start("prometheus").get(); + stop_prometheus = seastar::make_shared(seastar::deferred_stop(prom_server)); + + seastar::prometheus::config prom_config; + prom_config.prefix = config["prometheus_prefix"].as<std::string>(); + seastar::prometheus::start(prom_server, prom_config).get(); + seastar::net::inet_address prom_addr(config["prometheus_address"].as<std::string>()); + prom_server.listen(seastar::socket_address{prom_addr, prom_port}) + .handle_exception([=] (auto ep) { + std::cerr << seastar::format("Could not start Prometheus API server on {}:{}: {}\n", + prom_addr, prom_port, ep); + return seastar::make_exception_future(ep); + }).get(); + } + + const int whoami = std::stoi(local_conf()->name.get_id()); + const auto nonce = crimson::osd::get_nonce(); + crimson::net::MessengerRef cluster_msgr, client_msgr; + crimson::net::MessengerRef hb_front_msgr, hb_back_msgr; + for (auto [msgr, name] : {make_pair(std::ref(cluster_msgr), "cluster"s), + make_pair(std::ref(client_msgr), "client"s), + make_pair(std::ref(hb_front_msgr), "hb_front"s), + make_pair(std::ref(hb_back_msgr), "hb_back"s)}) { + msgr = crimson::net::Messenger::create(entity_name_t::OSD(whoami), + name, + nonce, + true); + } + auto store = crimson::os::FuturizedStore::create( + local_conf().get_val<std::string>("osd_objectstore"), + local_conf().get_val<std::string>("osd_data"), + local_conf().get_config_values()); + + crimson::osd::OSD osd( + whoami, nonce, std::ref(should_stop.abort_source()), + std::ref(*store), cluster_msgr, client_msgr, + hb_front_msgr, hb_back_msgr); + + if (config.count("mkkey")) { + make_keyring().get(); + } + if (local_conf()->no_mon_config) { + logger().info("bypassing the config fetch due to --no-mon-config"); + } else { + crimson::osd::populate_config_from_mon().get(); + } + if (config.count("mkfs")) { + auto osd_uuid = local_conf().get_val<uuid_d>("osd_uuid"); + if (osd_uuid.is_zero()) { + // use a random osd uuid if not specified + osd_uuid.generate_random(); + } + osd.mkfs( + *store, + whoami, + osd_uuid, + local_conf().get_val<uuid_d>("fsid"), + config["osdspec-affinity"].as<std::string>()).get(); + } + if (config.count("mkkey") || config.count("mkfs")) { + return EXIT_SUCCESS; + } else { + osd.start().get(); + } + logger().info("crimson startup completed"); + should_stop.wait().get(); + logger().info("crimson shutting down"); + osd.stop().get(); + // stop()s registered using defer() are called here + } catch (...) { + logger().error("startup failed: {}", std::current_exception()); + return EXIT_FAILURE; + } + logger().info("crimson shutdown complete"); + return EXIT_SUCCESS; + }); + }); + } catch (...) 
{ + fmt::print(std::cerr, "FATAL: Exception during startup, aborting: {}\n", std::current_exception()); + return EXIT_FAILURE; + } +} + +/* + * Local Variables: + * compile-command: "make -j4 \ + * -C ../../../build \ + * crimson-osd" + * End: + */ diff --git a/src/crimson/osd/main_config_bootstrap_helpers.cc b/src/crimson/osd/main_config_bootstrap_helpers.cc new file mode 100644 index 000000000..807fd1591 --- /dev/null +++ b/src/crimson/osd/main_config_bootstrap_helpers.cc @@ -0,0 +1,265 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#include "crimson/osd/main_config_bootstrap_helpers.h" + +#include <seastar/core/print.hh> +#include <seastar/core/prometheus.hh> +#include <seastar/core/thread.hh> +#include <seastar/http/httpd.hh> +#include <seastar/net/inet_address.hh> +#include <seastar/util/closeable.hh> +#include <seastar/util/defer.hh> +#include <seastar/util/std-compat.hh> + +#include "common/ceph_argparse.h" +#include "common/config_tracker.h" +#include "crimson/common/buffer_io.h" +#include "crimson/common/config_proxy.h" +#include "crimson/common/fatal_signal.h" +#include "crimson/mon/MonClient.h" +#include "crimson/net/Messenger.h" +#include "crimson/osd/main_config_bootstrap_helpers.h" + +using namespace std::literals; +using crimson::common::local_conf; +using crimson::common::sharded_conf; +using crimson::common::sharded_perf_coll; + +static seastar::logger& logger() +{ + return crimson::get_logger(ceph_subsys_osd); +} + +namespace crimson::osd { + +void usage(const char* prog) +{ + std::cout << "usage: " << prog << std::endl; + generic_server_usage(); +} + + +seastar::future<> populate_config_from_mon() +{ + logger().info("populating config from monitor"); + // i don't have any client before joining the cluster, so no need to have + // a proper auth handler + class DummyAuthHandler : public crimson::common::AuthHandler { + public: + void handle_authentication(const EntityName& name, + const AuthCapsInfo& caps) + {} + }; + return seastar::async([] { + auto auth_handler = std::make_unique<DummyAuthHandler>(); + auto msgr = crimson::net::Messenger::create(entity_name_t::CLIENT(), + "temp_mon_client", + get_nonce(), + true); + crimson::mon::Client monc{*msgr, *auth_handler}; + msgr->set_auth_client(&monc); + msgr->start({&monc}).get(); + auto stop_msgr = seastar::defer([&] { + msgr->stop(); + msgr->shutdown().get(); + }); + monc.start().handle_exception([] (auto ep) { + fmt::print(std::cerr, "FATAL: unable to connect to cluster: {}\n", ep); + return seastar::make_exception_future<>(ep); + }).get(); + auto stop_monc = seastar::defer([&] { + monc.stop().get(); + }); + monc.sub_want("config", 0, 0); + monc.renew_subs().get(); + // wait for monmap and config + monc.wait_for_config().get(); + auto fsid = monc.get_fsid().to_string(); + local_conf().set_val("fsid", fsid).get(); + logger().debug("{}: got config from monitor, fsid {}", __func__, fsid); + }); +} + +static tl::expected<early_config_t, int> +_get_early_config(int argc, const char *argv[]) +{ + early_config_t ret; + + // pull off ceph configs the stuff from early_args + std::vector<const char *> early_args; + early_args.insert( + std::end(early_args), + argv, argv + argc); + + ret.init_params = ceph_argparse_early_args( + early_args, + CEPH_ENTITY_TYPE_OSD, + &ret.cluster_name, + &ret.conf_file_list); + + if (ceph_argparse_need_usage(early_args)) { + usage(argv[0]); + exit(0); + } + + seastar::app_template::config app_cfg; + app_cfg.name = "Crimson-startup"; + 
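+ // this short-lived app only needs to reach the monitors and parse the
+ // config, so it is launched on a single core ("--smp 1", see
+ // bootstrap_args below)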
app_cfg.auto_handle_sigint_sigterm = false; + seastar::app_template app(std::move(app_cfg)); + const char *bootstrap_args[] = { argv[0], "--smp", "1" }; + int r = app.run( + sizeof(bootstrap_args) / sizeof(bootstrap_args[0]), + const_cast<char**>(bootstrap_args), + [argc, argv, &ret, &early_args] { + return seastar::async([argc, argv, &ret, &early_args] { + seastar::global_logger_registry().set_all_loggers_level( + seastar::log_level::debug); + sharded_conf().start( + ret.init_params.name, ret.cluster_name).get(); + local_conf().start().get(); + auto stop_conf = seastar::deferred_stop(sharded_conf()); + + sharded_perf_coll().start().get(); + auto stop_perf_coll = seastar::deferred_stop(sharded_perf_coll()); + + local_conf().parse_env().get(); + local_conf().parse_argv(early_args).get(); + local_conf().parse_config_files(ret.conf_file_list).get(); + + if (local_conf()->no_mon_config) { + logger().info("bypassing the config fetch due to --no-mon-config"); + } else { + populate_config_from_mon().get(); + } + + // get ceph configs + std::set_difference( + argv, argv + argc, + std::begin(early_args), + std::end(early_args), + std::back_inserter(ret.ceph_args)); + + ret.early_args.insert( + std::end(ret.early_args), + std::begin(early_args), + std::end(early_args)); + + if (auto found = std::find_if( + std::begin(early_args), + std::end(early_args), + [](auto* arg) { return "--smp"sv == arg; }); + found == std::end(early_args)) { + + // Set --smp based on crimson_seastar_smp config option + ret.early_args.emplace_back("--smp"); + + auto smp_config = local_conf().get_val<uint64_t>( + "crimson_seastar_smp"); + + ret.early_args.emplace_back(fmt::format("{}", smp_config)); + logger().info("get_early_config: set --smp {}", smp_config); + } + return 0; + }); + }); + if (r < 0) { + return tl::unexpected(r); + } + return ret; +} + +/* get_early_config handles obtaining config parameters required prior + * to reactor startup. Most deployment mechanisms (cephadm for one) + * rely on pulling configs from the monitor rather than shipping around + * config files, so this process needs to support pulling config options + * from the monitors. + * + * Of particular interest are config params related to the seastar + * reactor itself which can't be modified after the reactor has been + * started -- like the number of cores to use (smp::count). Contacting + * the monitors, however, requires a MonClient, which in turn needs a + * running reactor. + * + * Unfortunately, seastar doesn't clean up thread local state + * associated with seastar::smp task queues etc, so we can't + * start a reactor, stop it, and restart it in the same thread + * without an impractical amount of cleanup in seastar. + * + * More unfortunately, starting a reactor in a seperate thread + * and then joining the thread still doesn't avoid all global state, + * I observed tasks from the previous reactor incarnation nevertheless + * continuing to run in the new one resulting in a crash as they access + * freed memory. + * + * The approach taken here, therefore, is to actually fork, start a + * reactor in the child process, encode the resulting early_config_t, + * and send it back to the parent process. 
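+ *
+ * In outline (see get_early_config() below): the parent creates a pipe and
+ * forks; the child runs _get_early_config() under a throwaway reactor,
+ * encodes the resulting early_config_t into a bufferlist and writes it to
+ * the pipe before exiting; the parent reads the pipe to EOF, reaps the
+ * child and decodes the early_config_t that seeds the real reactor.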
+ */ +tl::expected<early_config_t, int> +get_early_config(int argc, const char *argv[]) +{ + int pipes[2]; + int r = pipe2(pipes, 0); + if (r < 0) { + std::cerr << "get_early_config: failed to create pipes: " + << -errno << std::endl; + return tl::unexpected(-errno); + } + + pid_t worker = fork(); + if (worker < 0) { + close(pipes[0]); + close(pipes[1]); + std::cerr << "get_early_config: failed to fork: " + << -errno << std::endl; + return tl::unexpected(-errno); + } else if (worker == 0) { // child + close(pipes[0]); + auto ret = _get_early_config(argc, argv); + if (ret.has_value()) { + bufferlist bl; + ::encode(ret.value(), bl); + r = bl.write_fd(pipes[1]); + close(pipes[1]); + if (r < 0) { + std::cerr << "get_early_config: child failed to write_fd: " + << r << std::endl; + exit(-r); + } else { + exit(0); + } + } else { + std::cerr << "get_early_config: child failed: " + << -ret.error() << std::endl; + exit(-ret.error()); + } + return tl::unexpected(-1); + } else { // parent + close(pipes[1]); + + bufferlist bl; + early_config_t ret; + while ((r = bl.read_fd(pipes[0], 1024)) > 0); + close(pipes[0]); + + // ignore error, we'll propogate error based on read and decode + waitpid(worker, nullptr, 0); + + if (r < 0) { + std::cerr << "get_early_config: parent failed to read from pipe: " + << r << std::endl; + return tl::unexpected(r); + } + try { + auto bliter = bl.cbegin(); + ::decode(ret, bliter); + return ret; + } catch (...) { + std::cerr << "get_early_config: parent failed to decode" << std::endl; + return tl::unexpected(-EINVAL); + } + } +} + +} diff --git a/src/crimson/osd/main_config_bootstrap_helpers.h b/src/crimson/osd/main_config_bootstrap_helpers.h new file mode 100644 index 000000000..7c6131d17 --- /dev/null +++ b/src/crimson/osd/main_config_bootstrap_helpers.h @@ -0,0 +1,99 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <sys/types.h> +#include <unistd.h> + +#include <iostream> +#include <fstream> +#include <random> + +#include <seastar/core/future.hh> + +#include "common/ceph_argparse.h" +#include "include/expected.hpp" + +namespace crimson::osd { + +void usage(const char* prog); + +inline uint64_t get_nonce() +{ + if (auto pid = getpid(); pid == 1 || std::getenv("CEPH_USE_RANDOM_NONCE")) { + // we're running in a container; use a random number instead! 
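+ // (inside a container the daemon is always pid 1, so the pid would repeat
+ // across restarts; drawing a random 64-bit value keeps the messenger nonce
+ // unique per incarnation)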
+ std::random_device rd; + std::default_random_engine rng{rd()}; + return std::uniform_int_distribution<uint64_t>{}(rng); + } else { + return pid; + } +} + +seastar::future<> populate_config_from_mon(); + +struct early_config_t { + std::vector<std::string> early_args; + std::vector<std::string> ceph_args; + + std::string cluster_name{"ceph"}; + std::string conf_file_list; + CephInitParameters init_params{CEPH_ENTITY_TYPE_OSD}; + + /// Returned vector must not outlive in + auto to_ptr_vector(const std::vector<std::string> &in) { + std::vector<const char *> ret; + ret.reserve(in.size()); + std::transform( + std::begin(in), std::end(in), + std::back_inserter(ret), + [](const auto &str) { return str.c_str(); }); + return ret; + } + + std::vector<const char *> get_early_args() { + return to_ptr_vector(early_args); + } + + std::vector<const char *> get_ceph_args() { + return to_ptr_vector(ceph_args); + } + + void encode(ceph::buffer::list& bl) const { + ENCODE_START(1, 1, bl); + encode(early_args, bl); + encode(ceph_args, bl); + encode(cluster_name, bl); + encode(conf_file_list, bl); + encode(init_params, bl); + ENCODE_FINISH(bl); + } + + void decode(ceph::buffer::list::const_iterator& bl) { + DECODE_START(1, bl); + decode(early_args, bl); + decode(ceph_args, bl); + decode(cluster_name, bl); + decode(conf_file_list, bl); + decode(init_params, bl); + DECODE_FINISH(bl); + } +}; + +/** + * get_early_config + * + * Compile initial configuration information from command line arguments, + * config files, and monitors. + * + * This implementation forks off a worker process to do this work and must + * therefore be called very early in main(). (See implementation for an + * explanation). + */ +tl::expected<early_config_t, int> +get_early_config(int argc, const char *argv[]); + +} + +WRITE_CLASS_ENCODER(crimson::osd::early_config_t) diff --git a/src/crimson/osd/objclass.cc b/src/crimson/osd/objclass.cc new file mode 100644 index 000000000..4cc9d7336 --- /dev/null +++ b/src/crimson/osd/objclass.cc @@ -0,0 +1,584 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <cstdarg> +#include <cstring> +#include <boost/container/small_vector.hpp> +#include "common/ceph_context.h" +#include "common/ceph_releases.h" +#include "common/config.h" +#include "crimson/common/config_proxy.h" +#include "common/debug.h" + +#include "crimson/osd/exceptions.h" +#include "crimson/osd/ops_executer.h" +#include "crimson/osd/pg_backend.h" + +#include "objclass/objclass.h" +#include "osd/ClassHandler.h" + +#include "auth/Crypto.h" +#include "common/armor.h" + +using std::map; +using std::string; + +#define dout_context ClassHandler::get_instance().cct + +static constexpr int dout_subsys = ceph_subsys_objclass; + +static inline int execute_osd_op(cls_method_context_t hctx, OSDOp& op) +{ + // we can expect the memory under `ret` will be still fine after + // executing the osd op as we're running inside `seastar::thread` + // created for us by `seastar::async` in `::do_op_call()`. + int ret = 0; + using osd_op_errorator = crimson::osd::OpsExecuter::osd_op_errorator; + reinterpret_cast<crimson::osd::OpsExecuter*>(hctx)->execute_op(op) + .handle_error_interruptible( + osd_op_errorator::all_same_way([&ret] (const std::error_code& err) { + assert(err.value() > 0); + ret = -err.value(); + return seastar::now(); + })).get(); // we're blocking here which requires `seastar::thread`. 
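+ // the error handler above folds any errorator error into a negative errno,
+ // preserving the synchronous objclass contract (e.g. an enoent error
+ // reaches the calling plugin as -ENOENT)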
+ return ret; +} + +int cls_call(cls_method_context_t hctx, const char *cls, const char *method, + char *indata, int datalen, + char **outdata, int *outdatalen) +{ +// FIXME, HACK: this is for testing only. Let's use dynamic linker to verify +// our depedencies + return 0; +} + +int cls_getxattr(cls_method_context_t hctx, + const char *name, + char **outdata, + int *outdatalen) +{ + return 0; +} + +int cls_setxattr(cls_method_context_t hctx, + const char *name, + const char *value, + int val_len) +{ + return 0; +} + +int cls_read(cls_method_context_t hctx, + int ofs, int len, + char **outdata, + int *outdatalen) +{ + return 0; +} + +int cls_get_request_origin(cls_method_context_t hctx, entity_inst_t *origin) +{ + assert(origin); + + try { + const auto& message = \ + reinterpret_cast<crimson::osd::OpsExecuter*>(hctx)->get_message(); + *origin = message.get_orig_source_inst(); + return 0; + } catch (crimson::osd::error& e) { + return -e.code().value(); + } +} + +int cls_cxx_create(cls_method_context_t hctx, const bool exclusive) +{ + OSDOp op{CEPH_OSD_OP_CREATE}; + op.op.flags = (exclusive ? CEPH_OSD_OP_FLAG_EXCL : 0); + return execute_osd_op(hctx, op); +} + +int cls_cxx_remove(cls_method_context_t hctx) +{ + OSDOp op{CEPH_OSD_OP_DELETE}; + return execute_osd_op(hctx, op); +} + +int cls_cxx_stat(cls_method_context_t hctx, uint64_t *size, time_t *mtime) +{ + OSDOp op{CEPH_OSD_OP_STAT}; + if (const auto ret = execute_osd_op(hctx, op); ret < 0) { + return ret; + } + utime_t ut; + uint64_t s; + try { + auto iter = op.outdata.cbegin(); + decode(s, iter); + decode(ut, iter); + } catch (buffer::error& err) { + return -EIO; + } + if (size) { + *size = s; + } + if (mtime) { + *mtime = ut.sec(); + } + return 0; +} + +int cls_cxx_stat2(cls_method_context_t hctx, + uint64_t *size, + ceph::real_time *mtime) +{ + OSDOp op{CEPH_OSD_OP_STAT}; + if (const int ret = execute_osd_op(hctx, op); ret < 0) { + return ret; + } + uint64_t dummy_size; + real_time dummy_mtime; + uint64_t& out_size = size ? *size : dummy_size; + real_time& out_mtime = mtime ? 
*mtime : dummy_mtime; + try { + auto iter = op.outdata.cbegin(); + decode(out_size, iter); + decode(out_mtime, iter); + return 0; + } catch (buffer::error& err) { + return -EIO; + } +} + +int cls_cxx_read2(cls_method_context_t hctx, + int ofs, + int len, + bufferlist *outbl, + uint32_t op_flags) +{ + OSDOp op{CEPH_OSD_OP_SYNC_READ}; + op.op.extent.offset = ofs; + op.op.extent.length = len; + op.op.flags = op_flags; + if (const auto ret = execute_osd_op(hctx, op); ret < 0) { + return ret; + } + *outbl = std::move(op.outdata); + return outbl->length(); +} + +int cls_cxx_write2(cls_method_context_t hctx, + int ofs, + int len, + bufferlist *inbl, + uint32_t op_flags) +{ + OSDOp op{CEPH_OSD_OP_WRITE}; + op.op.extent.offset = ofs; + op.op.extent.length = len; + op.op.flags = op_flags; + op.indata = *inbl; + return execute_osd_op(hctx, op); +} + +int cls_cxx_write_full(cls_method_context_t hctx, bufferlist * const inbl) +{ + OSDOp op{CEPH_OSD_OP_WRITEFULL}; + op.op.extent.offset = 0; + op.op.extent.length = inbl->length(); + op.indata = *inbl; + return execute_osd_op(hctx, op); +} + +int cls_cxx_replace(cls_method_context_t hctx, + int ofs, + int len, + bufferlist *inbl) +{ + { + OSDOp top{CEPH_OSD_OP_TRUNCATE}; + top.op.extent.offset = 0; + top.op.extent.length = 0; + if (const auto ret = execute_osd_op(hctx, top); ret < 0) { + return ret; + } + } + + { + OSDOp wop{CEPH_OSD_OP_WRITE}; + wop.op.extent.offset = ofs; + wop.op.extent.length = len; + wop.indata = *inbl; + if (const auto ret = execute_osd_op(hctx, wop); ret < 0) { + return ret; + } + } + return 0; +} + +int cls_cxx_truncate(cls_method_context_t hctx, int ofs) +{ + OSDOp op{CEPH_OSD_OP_TRUNCATE}; + op.op.extent.offset = ofs; + op.op.extent.length = 0; + return execute_osd_op(hctx, op); +} + +int cls_cxx_write_zero(cls_method_context_t hctx, int offset, int len) +{ + OSDOp op{CEPH_OSD_OP_ZERO}; + op.op.extent.offset = offset; + op.op.extent.length = len; + return execute_osd_op(hctx, op); +} + +int cls_cxx_getxattr(cls_method_context_t hctx, + const char *name, + bufferlist *outbl) +{ + OSDOp op{CEPH_OSD_OP_GETXATTR}; + op.op.xattr.name_len = strlen(name); + op.indata.append(name, op.op.xattr.name_len); + if (const auto ret = execute_osd_op(hctx, op); ret < 0) { + return ret; + } + *outbl = std::move(op.outdata); + return outbl->length(); +} + +int cls_cxx_getxattrs(cls_method_context_t hctx, + map<string, bufferlist> *attrset) +{ + OSDOp op{CEPH_OSD_OP_GETXATTRS}; + if (const int ret = execute_osd_op(hctx, op); ret < 0) { + return ret; + } + try { + auto iter = op.outdata.cbegin(); + decode(*attrset, iter); + } catch (buffer::error& err) { + return -EIO; + } + return 0; +} + +int cls_cxx_setxattr(cls_method_context_t hctx, + const char *name, + bufferlist *inbl) +{ + OSDOp op{CEPH_OSD_OP_SETXATTR}; + op.op.xattr.name_len = std::strlen(name); + op.op.xattr.value_len = inbl->length(); + op.indata.append(name, op.op.xattr.name_len); + op.indata.append(*inbl); + return execute_osd_op(hctx, op); +} + +int cls_cxx_snap_revert(cls_method_context_t hctx, snapid_t snapid) +{ + OSDOp op{CEPH_OSD_OP_ROLLBACK}; + op.op.snap.snapid = snapid; + return execute_osd_op(hctx, op); +} + +int cls_cxx_map_get_all_vals(cls_method_context_t hctx, + map<string, bufferlist>* vals, + bool *more) +{ + return 0; +} + +int cls_cxx_map_get_keys(cls_method_context_t hctx, + const std::string& start_obj, + const uint64_t max_to_get, + std::set<std::string>* const keys, + bool* const more) +{ + OSDOp op{CEPH_OSD_OP_OMAPGETKEYS}; + encode(start_obj, op.indata); + 
encode(max_to_get, op.indata); + if (const auto ret = execute_osd_op(hctx, op); ret < 0) { + return ret; + } + try { + auto iter = op.outdata.cbegin(); + decode(*keys, iter); + decode(*more, iter); + } catch (buffer::error&) { + return -EIO; + } + return keys->size(); +} + +int cls_cxx_map_get_vals(cls_method_context_t hctx, + const std::string& start_obj, + const std::string& filter_prefix, + const uint64_t max_to_get, + std::map<std::string, ceph::bufferlist> *vals, + bool* const more) +{ + OSDOp op{CEPH_OSD_OP_OMAPGETVALS}; + encode(start_obj, op.indata); + encode(max_to_get, op.indata); + encode(filter_prefix, op.indata); + if (const auto ret = execute_osd_op(hctx, op); ret < 0) { + return ret; + } + try { + auto iter = op.outdata.cbegin(); + decode(*vals, iter); + decode(*more, iter); + } catch (buffer::error&) { + return -EIO; + } + return vals->size(); +} + +int cls_cxx_map_get_vals_by_keys(cls_method_context_t hctx, + const std::set<std::string> &keys, + std::map<std::string, ceph::bufferlist> *vals) +{ + OSDOp op{CEPH_OSD_OP_OMAPGETVALSBYKEYS}; + encode(keys, op.indata); + if (const auto ret = execute_osd_op(hctx, op); ret < 0) { + return ret; + } + try { + auto iter = op.outdata.cbegin(); + decode(*vals, iter); + } catch (buffer::error&) { + return -EIO; + } + return 0; +} + +int cls_cxx_map_read_header(cls_method_context_t hctx, bufferlist *outbl) +{ + OSDOp op{CEPH_OSD_OP_OMAPGETHEADER}; + if (const auto ret = execute_osd_op(hctx, op); ret < 0) { + return ret; + } + *outbl = std::move(op.outdata); + return 0; +} + +int cls_cxx_map_get_val(cls_method_context_t hctx, + const string &key, + bufferlist *outbl) +{ + OSDOp op{CEPH_OSD_OP_OMAPGETVALSBYKEYS}; + { + std::set<std::string> k{key}; + encode(k, op.indata); + } + if (const auto ret = execute_osd_op(hctx, op); ret < 0) { + return ret; + } + std::map<std::string, ceph::bufferlist> m; + try { + auto iter = op.outdata.cbegin(); + decode(m, iter); + } catch (buffer::error&) { + return -EIO; + } + if (auto iter = std::begin(m); iter != std::end(m)) { + *outbl = std::move(iter->second); + return 0; + } else { + return -ENOENT; + } +} + +int cls_cxx_map_set_val(cls_method_context_t hctx, + const string &key, + bufferlist *inbl) +{ + OSDOp op{CEPH_OSD_OP_OMAPSETVALS}; + { + std::map<std::string, ceph::bufferlist> m; + m[key] = *inbl; + encode(m, op.indata); + } + return execute_osd_op(hctx, op); +} + +int cls_cxx_map_set_vals(cls_method_context_t hctx, + const std::map<string, ceph::bufferlist> *map) +{ + OSDOp op{CEPH_OSD_OP_OMAPSETVALS}; + encode(*map, op.indata); + return execute_osd_op(hctx, op); +} + +int cls_cxx_map_clear(cls_method_context_t hctx) +{ + OSDOp op{CEPH_OSD_OP_OMAPCLEAR}; + return execute_osd_op(hctx, op); +} + +int cls_cxx_map_write_header(cls_method_context_t hctx, bufferlist *inbl) +{ + OSDOp op{CEPH_OSD_OP_OMAPSETHEADER}; + op.indata = std::move(*inbl); + return execute_osd_op(hctx, op); +} + +int cls_cxx_map_remove_range(cls_method_context_t hctx, + const std::string& key_begin, + const std::string& key_end) +{ + OSDOp op{CEPH_OSD_OP_OMAPRMKEYRANGE}; + encode(key_begin, op.indata); + encode(key_end, op.indata); + return execute_osd_op(hctx, op); +} + +int cls_cxx_map_remove_key(cls_method_context_t hctx, const string &key) +{ + OSDOp op{CEPH_OSD_OP_OMAPRMKEYS}; + std::vector<string> to_rm{key}; + encode(to_rm, op.indata); + return execute_osd_op(hctx, op); +} + +int cls_cxx_list_watchers(cls_method_context_t hctx, + obj_list_watch_response_t *watchers) +{ + OSDOp op{CEPH_OSD_OP_LIST_WATCHERS}; + if (const 
auto ret = execute_osd_op(hctx, op); ret < 0) { + return ret; + } + + try { + auto iter = op.outdata.cbegin(); + decode(*watchers, iter); + } catch (buffer::error&) { + return -EIO; + } + return 0; +} + +uint64_t cls_current_version(cls_method_context_t hctx) +{ + auto* ox = reinterpret_cast<crimson::osd::OpsExecuter*>(hctx); + return ox->get_last_user_version(); +} + + +int cls_current_subop_num(cls_method_context_t hctx) +{ + auto* ox = reinterpret_cast<crimson::osd::OpsExecuter*>(hctx); + // in contrast to classical OSD, crimson doesn't count OP_CALL and + // OP_STAT which seems fine regarding how the plugins we take care + // about use this part of API. + return ox->get_processed_rw_ops_num(); +} + +uint64_t cls_get_features(cls_method_context_t hctx) +{ + return 0; +} + +uint64_t cls_get_client_features(cls_method_context_t hctx) +{ + try { + const auto& message = \ + reinterpret_cast<crimson::osd::OpsExecuter*>(hctx)->get_message(); + return message.get_features(); + } catch (crimson::osd::error& e) { + return -e.code().value(); + } +} + +uint64_t cls_get_pool_stripe_width(cls_method_context_t hctx) +{ + auto* ox = reinterpret_cast<crimson::osd::OpsExecuter*>(hctx); + return ox->get_pool_stripe_width(); +} + +ceph_release_t cls_get_required_osd_release(cls_method_context_t hctx) +{ + // FIXME + return ceph_release_t::nautilus; +} + +ceph_release_t cls_get_min_compatible_client(cls_method_context_t hctx) +{ + // FIXME + return ceph_release_t::nautilus; +} + +const ConfigProxy& cls_get_config(cls_method_context_t hctx) +{ + return crimson::common::local_conf(); +} + +const object_info_t& cls_get_object_info(cls_method_context_t hctx) +{ + return reinterpret_cast<crimson::osd::OpsExecuter*>(hctx)->get_object_info(); +} + +int cls_get_snapset_seq(cls_method_context_t hctx, uint64_t *snap_seq) +{ + auto* ox = reinterpret_cast<crimson::osd::OpsExecuter*>(hctx); + auto obc = ox->get_obc(); + if (!obc->obs.exists || + (obc->obs.oi.is_whiteout() && + obc->ssc->snapset.clones.empty())) { + return -ENOENT; + } + *snap_seq = obc->ssc->snapset.seq; + return 0; +} + +int cls_cxx_chunk_write_and_set(cls_method_context_t hctx, + int ofs, + int len, + bufferlist *write_inbl, + uint32_t op_flags, + bufferlist *set_inbl, + int set_len) +{ + return 0; +} + +int cls_get_manifest_ref_count(cls_method_context_t hctx, string fp_oid) +{ + return 0; +} + +uint64_t cls_get_osd_min_alloc_size(cls_method_context_t hctx) { + // FIXME + return 4096; +} + +int cls_cxx_gather(cls_method_context_t hctx, const std::set<std::string> &src_objs, const std::string& pool, + const char *cls, const char *method, bufferlist& inbl) +{ + return 0; +} + +int cls_cxx_get_gathered_data(cls_method_context_t hctx, std::map<std::string, bufferlist> *results) +{ + return 0; +} + +// although at first glance the implementation looks the same as in +// the classical OSD, it's different b/c of how the dout macro expands. +int cls_log(int level, const char *format, ...) 
+{ + size_t size = 256; + va_list ap; + while (1) { + boost::container::small_vector<char, 256> buf(size); + va_start(ap, format); + int n = vsnprintf(buf.data(), size, format, ap); + va_end(ap); +#define MAX_SIZE 8196UL + if ((n > -1 && static_cast<size_t>(n) < size) || size > MAX_SIZE) { + dout(ceph::dout::need_dynamic(level)) << buf.data() << dendl; + return n; + } + size *= 2; + } +} diff --git a/src/crimson/osd/object_context.cc b/src/crimson/osd/object_context.cc new file mode 100644 index 000000000..1ea701c22 --- /dev/null +++ b/src/crimson/osd/object_context.cc @@ -0,0 +1,85 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "crimson/osd/object_context.h" + +#include <fmt/ranges.h> + +#include "common/Formatter.h" +#include "crimson/common/config_proxy.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_osd); + } +} + +namespace crimson::osd { + +ObjectContextRegistry::ObjectContextRegistry(crimson::common::ConfigProxy &conf) +{ + obc_lru.set_target_size(conf.get_val<uint64_t>("crimson_osd_obc_lru_size")); + conf.add_observer(this); +} + +ObjectContextRegistry::~ObjectContextRegistry() +{ + // purge the cache to avoid leaks and complains from LSan + obc_lru.set_target_size(0UL); +} + +const char** ObjectContextRegistry::get_tracked_conf_keys() const +{ + static const char* KEYS[] = { + "crimson_osd_obc_lru_size", + nullptr + }; + return KEYS; +} + +void ObjectContextRegistry::handle_conf_change( + const crimson::common::ConfigProxy& conf, + const std::set <std::string> &changed) +{ + obc_lru.set_target_size(conf.get_val<uint64_t>("crimson_osd_obc_lru_size")); +} + +std::optional<hobject_t> resolve_oid( + const SnapSet &ss, + const hobject_t &oid) +{ + logger().debug("{} oid.snap={},head snapset.seq={}", + __func__, oid.snap, ss.seq); + if (oid.snap > ss.seq) { + // Because oid.snap > ss.seq, we are trying to read from a snapshot + // taken after the most recent write to this object. Read from head. + return oid.get_head(); + } else { + // which clone would it be? + auto clone = std::lower_bound( + begin(ss.clones), end(ss.clones), + oid.snap); + if (clone == end(ss.clones)) { + // Doesn't exist, > last clone, < ss.seq + return std::nullopt; + } + auto citer = ss.clone_snaps.find(*clone); + // TODO: how do we want to handle this kind of logic error? 
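+ // (the assert below encodes the invariant that every clone listed in
+ // ss.clones has a matching clone_snaps entry)
+ // worked example with hypothetical values: given ss.seq=4, ss.clones=[2,4]
+ // and clone_snaps={2:{1,2}, 4:{4}}, a read at oid.snap=3 lower_bounds to
+ // clone 4, but 3 is not in {4}, so the object did not exist at snap 3 and
+ // std::nullopt is returned; a read at oid.snap=2 resolves to clone 2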
+ ceph_assert(citer != ss.clone_snaps.end()); + + if (std::find( + citer->second.begin(), + citer->second.end(), + oid.snap) == citer->second.end()) { + logger().debug("{} {} does not contain {} -- DNE", + __func__, ss.clone_snaps, oid.snap); + return std::nullopt; + } else { + auto soid = oid; + soid.snap = *clone; + return std::optional<hobject_t>(soid); + } + } +} + +} diff --git a/src/crimson/osd/object_context.h b/src/crimson/osd/object_context.h new file mode 100644 index 000000000..8abf6d3f7 --- /dev/null +++ b/src/crimson/osd/object_context.h @@ -0,0 +1,276 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <map> +#include <optional> +#include <utility> +#include <seastar/core/shared_future.hh> +#include <seastar/core/shared_ptr.hh> + +#include "common/intrusive_lru.h" +#include "osd/object_state.h" +#include "crimson/common/exception.h" +#include "crimson/common/tri_mutex.h" +#include "crimson/osd/osd_operation.h" + +namespace ceph { + class Formatter; +} + +namespace crimson::common { + class ConfigProxy; +} + +namespace crimson::osd { + +class Watch; +struct SnapSetContext; +using SnapSetContextRef = boost::intrusive_ptr<SnapSetContext>; + +template <typename OBC> +struct obc_to_hoid { + using type = hobject_t; + const type &operator()(const OBC &obc) { + return obc.obs.oi.soid; + } +}; + +struct SnapSetContext : + public boost::intrusive_ref_counter<SnapSetContext, + boost::thread_unsafe_counter> +{ + hobject_t oid; + SnapSet snapset; + bool exists = false; + /** + * exists + * + * Because ObjectContext's are cached, we need to be able to express the case + * where the object to which a cached ObjectContext refers does not exist. + * ObjectContext's for yet-to-be-created objects are initialized with exists=false. + * The ObjectContext for a deleted object will have exists set to false until it falls + * out of cache (or another write recreates the object). + */ + explicit SnapSetContext(const hobject_t& o) : + oid(o), exists(false) {} +}; + +class ObjectContext : public ceph::common::intrusive_lru_base< + ceph::common::intrusive_lru_config< + hobject_t, ObjectContext, obc_to_hoid<ObjectContext>>> +{ +public: + ObjectState obs; + SnapSetContextRef ssc; + // the watch / notify machinery rather stays away from the hot and + // frequented paths. std::map is used mostly because of developer's + // convenience. 
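+ // each watcher is keyed by its (cookie, client entity name) pair, so a
+ // single client can hold several independent watches on the same object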
+ using watch_key_t = std::pair<uint64_t, entity_name_t>; + std::map<watch_key_t, seastar::shared_ptr<crimson::osd::Watch>> watchers; + + ObjectContext(hobject_t hoid) : obs(std::move(hoid)) {} + + const hobject_t &get_oid() const { + return obs.oi.soid; + } + + bool is_head() const { + return get_oid().is_head(); + } + + hobject_t get_head_oid() const { + return get_oid().get_head(); + } + + const SnapSet &get_head_ss() const { + ceph_assert(is_head()); + ceph_assert(ssc); + return ssc->snapset; + } + + void set_head_state(ObjectState &&_obs, SnapSetContextRef &&_ssc) { + ceph_assert(is_head()); + obs = std::move(_obs); + ssc = std::move(_ssc); + } + + void set_clone_state(ObjectState &&_obs) { + ceph_assert(!is_head()); + obs = std::move(_obs); + } + + /// pass the provided exception to any waiting consumers of this ObjectContext + template<typename Exception> + void interrupt(Exception ex) { + lock.abort(std::move(ex)); + if (recovery_read_marker) { + drop_recovery_read(); + } + } + +private: + tri_mutex lock; + bool recovery_read_marker = false; + + template <typename Lock, typename Func> + auto _with_lock(Lock&& lock, Func&& func) { + Ref obc = this; + return lock.lock().then([&lock, func = std::forward<Func>(func), obc]() mutable { + return seastar::futurize_invoke(func).finally([&lock, obc] { + lock.unlock(); + }); + }); + } + + boost::intrusive::list_member_hook<> list_hook; + uint64_t list_link_cnt = 0; + +public: + + template <typename ListType> + void append_to(ListType& list) { + if (list_link_cnt++ == 0) { + list.push_back(*this); + } + } + + template <typename ListType> + void remove_from(ListType&& list) { + assert(list_link_cnt > 0); + if (--list_link_cnt == 0) { + list.erase(std::decay_t<ListType>::s_iterator_to(*this)); + } + } + + using obc_accessing_option_t = boost::intrusive::member_hook< + ObjectContext, + boost::intrusive::list_member_hook<>, + &ObjectContext::list_hook>; + + template<RWState::State Type, typename InterruptCond = void, typename Func> + auto with_lock(Func&& func) { + if constexpr (!std::is_void_v<InterruptCond>) { + auto wrapper = ::crimson::interruptible::interruptor<InterruptCond>::wrap_function(std::forward<Func>(func)); + switch (Type) { + case RWState::RWWRITE: + return _with_lock(lock.for_write(), std::move(wrapper)); + case RWState::RWREAD: + return _with_lock(lock.for_read(), std::move(wrapper)); + case RWState::RWEXCL: + return _with_lock(lock.for_excl(), std::move(wrapper)); + case RWState::RWNONE: + return seastar::futurize_invoke(std::move(wrapper)); + default: + assert(0 == "noop"); + } + } else { + switch (Type) { + case RWState::RWWRITE: + return _with_lock(lock.for_write(), std::forward<Func>(func)); + case RWState::RWREAD: + return _with_lock(lock.for_read(), std::forward<Func>(func)); + case RWState::RWEXCL: + return _with_lock(lock.for_excl(), std::forward<Func>(func)); + case RWState::RWNONE: + return seastar::futurize_invoke(std::forward<Func>(func)); + default: + assert(0 == "noop"); + } + } + } + template<RWState::State Type, typename InterruptCond = void, typename Func> + auto with_promoted_lock(Func&& func) { + if constexpr (!std::is_void_v<InterruptCond>) { + auto wrapper = ::crimson::interruptible::interruptor<InterruptCond>::wrap_function(std::forward<Func>(func)); + switch (Type) { + case RWState::RWWRITE: + return _with_lock(lock.excl_from_write(), std::move(wrapper)); + case RWState::RWREAD: + return _with_lock(lock.excl_from_read(), std::move(wrapper)); + case RWState::RWEXCL: + return 
_with_lock(lock.excl_from_excl(), std::move(wrapper)); + case RWState::RWNONE: + return _with_lock(lock.for_excl(), std::move(wrapper)); + default: + assert(0 == "noop"); + } + } else { + switch (Type) { + case RWState::RWWRITE: + return _with_lock(lock.excl_from_write(), std::forward<Func>(func)); + case RWState::RWREAD: + return _with_lock(lock.excl_from_read(), std::forward<Func>(func)); + case RWState::RWEXCL: + return _with_lock(lock.excl_from_excl(), std::forward<Func>(func)); + case RWState::RWNONE: + return _with_lock(lock.for_excl(), std::forward<Func>(func)); + default: + assert(0 == "noop"); + } + } + } + + bool empty() const { + return !lock.is_acquired(); + } + bool is_request_pending() const { + return lock.is_acquired(); + } + + bool get_recovery_read() { + if (lock.try_lock_for_read()) { + recovery_read_marker = true; + return true; + } else { + return false; + } + } + void wait_recovery_read() { + assert(lock.get_readers() > 0); + recovery_read_marker = true; + } + void drop_recovery_read() { + assert(recovery_read_marker); + recovery_read_marker = false; + } + bool maybe_get_excl() { + return lock.try_lock_for_excl(); + } +}; +using ObjectContextRef = ObjectContext::Ref; + +class ObjectContextRegistry : public md_config_obs_t { + ObjectContext::lru_t obc_lru; + +public: + ObjectContextRegistry(crimson::common::ConfigProxy &conf); + ~ObjectContextRegistry(); + + std::pair<ObjectContextRef, bool> get_cached_obc(const hobject_t &hoid) { + return obc_lru.get_or_create(hoid); + } + ObjectContextRef maybe_get_cached_obc(const hobject_t &hoid) { + return obc_lru.get(hoid); + } + + void clear_range(const hobject_t &from, + const hobject_t &to) { + obc_lru.clear_range(from, to); + } + + template <class F> + void for_each(F&& f) { + obc_lru.for_each(std::forward<F>(f)); + } + + const char** get_tracked_conf_keys() const final; + void handle_conf_change(const crimson::common::ConfigProxy& conf, + const std::set <std::string> &changed) final; +}; + +std::optional<hobject_t> resolve_oid(const SnapSet &ss, + const hobject_t &oid); + +} // namespace crimson::osd diff --git a/src/crimson/osd/object_context_loader.cc b/src/crimson/osd/object_context_loader.cc new file mode 100644 index 000000000..0a4d74c0d --- /dev/null +++ b/src/crimson/osd/object_context_loader.cc @@ -0,0 +1,232 @@ +#include "crimson/osd/object_context_loader.h" +#include "osd/osd_types_fmt.h" + +SET_SUBSYS(osd); + +namespace crimson::osd { + +using crimson::common::local_conf; + + template<RWState::State State> + ObjectContextLoader::load_obc_iertr::future<> + ObjectContextLoader::with_head_obc(ObjectContextRef obc, + bool existed, + with_obc_func_t&& func) + { + LOG_PREFIX(ObjectContextLoader::with_head_obc); + DEBUGDPP("object {}", dpp, obc->get_oid()); + assert(obc->is_head()); + obc->append_to(obc_set_accessing); + return obc->with_lock<State, IOInterruptCondition>( + [existed=existed, obc=obc, func=std::move(func), this] { + return get_or_load_obc<State>(obc, existed) + .safe_then_interruptible( + [func = std::move(func)](auto obc) { + return std::move(func)(std::move(obc)); + }); + }).finally([FNAME, this, obc=std::move(obc)] { + DEBUGDPP("released object {}", dpp, obc->get_oid()); + obc->remove_from(obc_set_accessing); + }); + } + + template<RWState::State State> + ObjectContextLoader::load_obc_iertr::future<> + ObjectContextLoader::with_clone_obc(hobject_t oid, + with_obc_func_t&& func) + { + LOG_PREFIX(ObjectContextLoader::with_clone_obc); + assert(!oid.is_head()); + return with_obc<RWState::RWREAD>( + 
oid.get_head(), + [FNAME, oid, func=std::move(func), this](auto head) mutable + -> load_obc_iertr::future<> { + if (!head->obs.exists) { + ERRORDPP("head doesn't exist for object {}", dpp, head->obs.oi.soid); + return load_obc_iertr::future<>{ + crimson::ct_error::enoent::make() + }; + } + return this->with_clone_obc_only<State>(std::move(head), + oid, + std::move(func)); + }); + } + + template<RWState::State State> + ObjectContextLoader::load_obc_iertr::future<> + ObjectContextLoader::with_clone_obc_only(ObjectContextRef head, + hobject_t oid, + with_obc_func_t&& func) + { + LOG_PREFIX(ObjectContextLoader::with_clone_obc_only); + auto coid = resolve_oid(head->get_head_ss(), oid); + if (!coid) { + ERRORDPP("clone {} not found", dpp, oid); + return load_obc_iertr::future<>{ + crimson::ct_error::enoent::make() + }; + } + auto [clone, existed] = obc_registry.get_cached_obc(*coid); + return clone->template with_lock<State, IOInterruptCondition>( + [existed=existed, clone=std::move(clone), + func=std::move(func), head=std::move(head), this]() + -> load_obc_iertr::future<> { + auto loaded = get_or_load_obc<State>(clone, existed); + return loaded.safe_then_interruptible( + [func = std::move(func)](auto clone) { + return std::move(func)(std::move(clone)); + }); + }); + } + + template<RWState::State State> + ObjectContextLoader::load_obc_iertr::future<> + ObjectContextLoader::with_clone_obc_direct( + hobject_t oid, + with_both_obc_func_t&& func) + { + LOG_PREFIX(ObjectContextLoader::with_clone_obc_direct); + assert(!oid.is_head()); + return with_obc<RWState::RWREAD>( + oid.get_head(), + [FNAME, oid, func=std::move(func), this](auto head) mutable + -> load_obc_iertr::future<> { + if (!head->obs.exists) { + ERRORDPP("head doesn't exist for object {}", dpp, head->obs.oi.soid); + return load_obc_iertr::future<>{ + crimson::ct_error::enoent::make() + }; + } +#ifndef NDEBUG + auto &ss = head->get_head_ss(); + auto cit = std::find( + std::begin(ss.clones), std::end(ss.clones), oid.snap); + assert(cit != std::end(ss.clones)); +#endif + auto [clone, existed] = obc_registry.get_cached_obc(oid); + return clone->template with_lock<State, IOInterruptCondition>( + [existed=existed, clone=std::move(clone), + func=std::move(func), head=std::move(head), this]() + -> load_obc_iertr::future<> { + auto loaded = get_or_load_obc<State>(clone, existed); + return loaded.safe_then_interruptible( + [func = std::move(func), head=std::move(head)](auto clone) { + return std::move(func)(std::move(head), std::move(clone)); + }); + }); + }); + } + + template<RWState::State State> + ObjectContextLoader::load_obc_iertr::future<> + ObjectContextLoader::with_obc(hobject_t oid, + with_obc_func_t&& func) + { + if (oid.is_head()) { + auto [obc, existed] = + obc_registry.get_cached_obc(std::move(oid)); + return with_head_obc<State>(std::move(obc), + existed, + std::move(func)); + } else { + return with_clone_obc<State>(oid, std::move(func)); + } + } + + ObjectContextLoader::load_obc_iertr::future<ObjectContextRef> + ObjectContextLoader::load_obc(ObjectContextRef obc) + { + LOG_PREFIX(ObjectContextLoader::load_obc); + return backend.load_metadata(obc->get_oid()) + .safe_then_interruptible( + [FNAME, this, obc=std::move(obc)](auto md) + -> load_obc_ertr::future<ObjectContextRef> { + const hobject_t& oid = md->os.oi.soid; + DEBUGDPP("loaded obs {} for {}", dpp, md->os.oi, oid); + if (oid.is_head()) { + if (!md->ssc) { + ERRORDPP("oid {} missing snapsetcontext", dpp, oid); + return crimson::ct_error::object_corrupted::make(); + } + 
obc->set_head_state(std::move(md->os), + std::move(md->ssc)); + } else { + obc->set_clone_state(std::move(md->os)); + } + DEBUGDPP("returning obc {} for {}", dpp, obc->obs.oi, obc->obs.oi.soid); + return load_obc_ertr::make_ready_future<ObjectContextRef>(obc); + }); + } + + template<RWState::State State> + ObjectContextLoader::load_obc_iertr::future<ObjectContextRef> + ObjectContextLoader::get_or_load_obc(ObjectContextRef obc, + bool existed) + { + LOG_PREFIX(ObjectContextLoader::get_or_load_obc); + auto loaded = + load_obc_iertr::make_ready_future<ObjectContextRef>(obc); + if (existed) { + DEBUGDPP("cache hit on {}", dpp, obc->get_oid()); + } else { + DEBUGDPP("cache miss on {}", dpp, obc->get_oid()); + loaded = + obc->template with_promoted_lock<State, IOInterruptCondition>( + [obc, this] { + return load_obc(obc); + }); + } + return loaded; + } + + ObjectContextLoader::load_obc_iertr::future<> + ObjectContextLoader::reload_obc(ObjectContext& obc) const + { + LOG_PREFIX(ObjectContextLoader::reload_obc); + assert(obc.is_head()); + return backend.load_metadata(obc.get_oid()) + .safe_then_interruptible<false>( + [FNAME, this, &obc](auto md)-> load_obc_ertr::future<> { + DEBUGDPP("reloaded obs {} for {}", dpp, md->os.oi, obc.get_oid()); + if (!md->ssc) { + ERRORDPP("oid {} missing snapsetcontext", dpp, obc.get_oid()); + return crimson::ct_error::object_corrupted::make(); + } + obc.set_head_state(std::move(md->os), std::move(md->ssc)); + return load_obc_ertr::now(); + }); + } + + void ObjectContextLoader::notify_on_change(bool is_primary) + { + LOG_PREFIX(ObjectContextLoader::notify_on_change); + DEBUGDPP("is_primary: {}", dpp, is_primary); + for (auto& obc : obc_set_accessing) { + DEBUGDPP("interrupting obc: {}", dpp, obc.get_oid()); + obc.interrupt(::crimson::common::actingset_changed(is_primary)); + } + } + + // explicitly instantiate the used instantiations + template ObjectContextLoader::load_obc_iertr::future<> + ObjectContextLoader::with_obc<RWState::RWNONE>(hobject_t, + with_obc_func_t&&); + + template ObjectContextLoader::load_obc_iertr::future<> + ObjectContextLoader::with_obc<RWState::RWREAD>(hobject_t, + with_obc_func_t&&); + + template ObjectContextLoader::load_obc_iertr::future<> + ObjectContextLoader::with_obc<RWState::RWWRITE>(hobject_t, + with_obc_func_t&&); + + template ObjectContextLoader::load_obc_iertr::future<> + ObjectContextLoader::with_obc<RWState::RWEXCL>(hobject_t, + with_obc_func_t&&); + + template ObjectContextLoader::load_obc_iertr::future<> + ObjectContextLoader::with_clone_obc_direct<RWState::RWWRITE>( + hobject_t, + with_both_obc_func_t&&); +} diff --git a/src/crimson/osd/object_context_loader.h b/src/crimson/osd/object_context_loader.h new file mode 100644 index 000000000..3ab7f6ad8 --- /dev/null +++ b/src/crimson/osd/object_context_loader.h @@ -0,0 +1,87 @@ +#pragma once + +#include <seastar/core/future.hh> +#include "crimson/common/errorator.h" +#include "crimson/osd/object_context.h" +#include "crimson/osd/pg_backend.h" + +namespace crimson::osd { +class ObjectContextLoader { +public: + using obc_accessing_list_t = boost::intrusive::list< + ObjectContext, + ObjectContext::obc_accessing_option_t>; + + ObjectContextLoader( + ObjectContextRegistry& _obc_services, + PGBackend& _backend, + DoutPrefixProvider& dpp) + : obc_registry{_obc_services}, + backend{_backend}, + dpp{dpp} + {} + + using load_obc_ertr = crimson::errorator< + crimson::ct_error::enoent, + crimson::ct_error::object_corrupted>; + using load_obc_iertr = + 
::crimson::interruptible::interruptible_errorator< + ::crimson::osd::IOInterruptCondition, + load_obc_ertr>; + + using with_obc_func_t = + std::function<load_obc_iertr::future<> (ObjectContextRef)>; + + using with_both_obc_func_t = + std::function<load_obc_iertr::future<> (ObjectContextRef, ObjectContextRef)>; + + // Use this variant by default + template<RWState::State State> + load_obc_iertr::future<> with_obc(hobject_t oid, + with_obc_func_t&& func); + + // Use this variant in the case where the head object + // obc is already locked and only the clone obc is needed. + // Avoid nesting with_head_obc() calls by using with_clone_obc() + // with an already locked head. + template<RWState::State State> + load_obc_iertr::future<> with_clone_obc_only(ObjectContextRef head, + hobject_t oid, + with_obc_func_t&& func); + + // Use this variant in the case where both the head + // object *and* the matching clone object are being used + // in func. + template<RWState::State State> + load_obc_iertr::future<> with_clone_obc_direct( + hobject_t oid, + with_both_obc_func_t&& func); + + load_obc_iertr::future<> reload_obc(ObjectContext& obc) const; + + void notify_on_change(bool is_primary); + +private: + ObjectContextRegistry& obc_registry; + PGBackend& backend; + DoutPrefixProvider& dpp; + obc_accessing_list_t obc_set_accessing; + + template<RWState::State State> + load_obc_iertr::future<> with_clone_obc(hobject_t oid, + with_obc_func_t&& func); + + template<RWState::State State> + load_obc_iertr::future<> with_head_obc(ObjectContextRef obc, + bool existed, + with_obc_func_t&& func); + + template<RWState::State State> + load_obc_iertr::future<ObjectContextRef> + get_or_load_obc(ObjectContextRef obc, + bool existed); + + load_obc_iertr::future<ObjectContextRef> + load_obc(ObjectContextRef obc); +}; +} diff --git a/src/crimson/osd/ops_executer.cc b/src/crimson/osd/ops_executer.cc new file mode 100644 index 000000000..040870203 --- /dev/null +++ b/src/crimson/osd/ops_executer.cc @@ -0,0 +1,1461 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "ops_executer.h" + +#include <boost/range/adaptor/filtered.hpp> +#include <boost/range/adaptor/map.hpp> +#include <boost/range/adaptor/transformed.hpp> +#include <boost/range/algorithm_ext/push_back.hpp> +#include <boost/range/algorithm/max_element.hpp> +#include <boost/range/numeric.hpp> + +#include <fmt/format.h> +#include <fmt/ostream.h> + +#include <seastar/core/thread.hh> + +#include "crimson/osd/exceptions.h" +#include "crimson/osd/pg.h" +#include "crimson/osd/watch.h" +#include "osd/ClassHandler.h" +#include "osd/SnapMapper.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_osd); + } +} + +namespace crimson::osd { + +OpsExecuter::call_ierrorator::future<> OpsExecuter::do_op_call(OSDOp& osd_op) +{ + std::string cname, mname; + ceph::bufferlist indata; + try { + auto bp = std::begin(osd_op.indata); + bp.copy(osd_op.op.cls.class_len, cname); + bp.copy(osd_op.op.cls.method_len, mname); + bp.copy(osd_op.op.cls.indata_len, indata); + } catch (buffer::error&) { + logger().warn("call unable to decode class + method + indata"); + return crimson::ct_error::invarg::make(); + } + + // NOTE: opening a class can actually result in dlopen(), and thus + // blocking the entire reactor. Thankfully to ClassHandler's cache + // this is supposed to be extremely infrequent. 
+ ClassHandler::ClassData* cls; + int r = ClassHandler::get_instance().open_class(cname, &cls); + if (r) { + logger().warn("class {} open got {}", cname, cpp_strerror(r)); + if (r == -ENOENT) { + return crimson::ct_error::operation_not_supported::make(); + } else if (r == -EPERM) { + // propagate permission errors + return crimson::ct_error::permission_denied::make(); + } + return crimson::ct_error::input_output_error::make(); + } + + ClassHandler::ClassMethod* method = cls->get_method(mname); + if (!method) { + logger().warn("call method {}.{} does not exist", cname, mname); + return crimson::ct_error::operation_not_supported::make(); + } + + const auto flags = method->get_flags(); + if (!obc->obs.exists && (flags & CLS_METHOD_WR) == 0) { + return crimson::ct_error::enoent::make(); + } + +#if 0 + if (flags & CLS_METHOD_WR) { + ctx->user_modify = true; + } +#endif + + logger().debug("calling method {}.{}, num_read={}, num_write={}", + cname, mname, num_read, num_write); + const auto prev_rd = num_read; + const auto prev_wr = num_write; + return interruptor::async( + [this, method, indata=std::move(indata)]() mutable { + ceph::bufferlist outdata; + auto cls_context = reinterpret_cast<cls_method_context_t>(this); + const auto ret = method->exec(cls_context, indata, outdata); + return std::make_pair(ret, std::move(outdata)); + } + ).then_interruptible( + [this, prev_rd, prev_wr, &osd_op, flags] + (auto outcome) -> call_errorator::future<> { + auto& [ret, outdata] = outcome; + osd_op.rval = ret; + + logger().debug("do_op_call: method returned ret={}, outdata.length()={}" + " while num_read={}, num_write={}", + ret, outdata.length(), num_read, num_write); + if (num_read > prev_rd && !(flags & CLS_METHOD_RD)) { + logger().error("method tried to read object but is not marked RD"); + osd_op.rval = -EIO; + return crimson::ct_error::input_output_error::make(); + } + if (num_write > prev_wr && !(flags & CLS_METHOD_WR)) { + logger().error("method tried to update object but is not marked WR"); + osd_op.rval = -EIO; + return crimson::ct_error::input_output_error::make(); + } + // ceph-osd has this implemented in `PrimaryLogPG::execute_ctx`, + // grep for `ignore_out_data`. + using crimson::common::local_conf; + if (op_info.allows_returnvec() && + op_info.may_write() && + ret >= 0 && + outdata.length() > local_conf()->osd_max_write_op_reply_len) { + // the justification of this limit is to not inflate the pg log. + // that's the reason why we don't worry about pure reads. + logger().error("outdata overflow due to .length()={}, limit={}", + outdata.length(), + local_conf()->osd_max_write_op_reply_len); + osd_op.rval = -EOVERFLOW; + return crimson::ct_error::value_too_large::make(); + } + // for write calls we never return data except errors or RETURNVEC. + // please refer to cls/cls_hello.cc for details. + if (!op_info.may_write() || op_info.allows_returnvec() || ret < 0) { + osd_op.op.extent.length = outdata.length(); + osd_op.outdata.claim_append(outdata); + } + if (ret < 0) { + return crimson::stateful_ec{ + std::error_code(-ret, std::generic_category()) }; + } else { + return seastar::now(); + } + } + ); +} + +static watch_info_t create_watch_info(const OSDOp& osd_op, + const OpsExecuter::ExecutableMessage& msg, + entity_addr_t peer_addr) +{ + using crimson::common::local_conf; + const uint32_t timeout = + osd_op.op.watch.timeout == 0 ?
local_conf()->osd_client_watch_timeout + : osd_op.op.watch.timeout; + return { + osd_op.op.watch.cookie, + timeout, + peer_addr + }; +} + +OpsExecuter::watch_ierrorator::future<> OpsExecuter::do_op_watch_subop_watch( + OSDOp& osd_op, + ObjectState& os, + ceph::os::Transaction& txn) +{ + logger().debug("{}", __func__); + struct connect_ctx_t { + ObjectContext::watch_key_t key; + crimson::net::ConnectionRef conn; + watch_info_t info; + + connect_ctx_t( + const OSDOp& osd_op, + const ExecutableMessage& msg, + crimson::net::ConnectionRef conn) + : key(osd_op.op.watch.cookie, msg.get_reqid().name), + conn(conn), + info(create_watch_info(osd_op, msg, conn->get_peer_addr())) { + } + }; + + return with_effect_on_obc( + connect_ctx_t{ osd_op, get_message(), conn }, + [&](auto& ctx) { + const auto& entity = ctx.key.second; + auto [it, emplaced] = + os.oi.watchers.try_emplace(ctx.key, std::move(ctx.info)); + if (emplaced) { + logger().info("registered new watch {} by {}", it->second, entity); + txn.nop(); + } else { + logger().info("found existing watch {} by {}", it->second, entity); + } + return seastar::now(); + }, + [](auto&& ctx, ObjectContextRef obc, Ref<PG> pg) { + assert(pg); + auto [it, emplaced] = obc->watchers.try_emplace(ctx.key, nullptr); + if (emplaced) { + const auto& [cookie, entity] = ctx.key; + it->second = crimson::osd::Watch::create( + obc, ctx.info, entity, std::move(pg)); + logger().info("op_effect: added new watcher: {}", ctx.key); + } else { + logger().info("op_effect: found existing watcher: {}", ctx.key); + } + return it->second->connect(std::move(ctx.conn), true /* will_ping */); + } + ); +} + +OpsExecuter::watch_ierrorator::future<> OpsExecuter::do_op_watch_subop_reconnect( + OSDOp& osd_op, + ObjectState& os, + ceph::os::Transaction& txn) +{ + const entity_name_t& entity = get_message().get_reqid().name; + const auto& cookie = osd_op.op.watch.cookie; + if (!os.oi.watchers.count(std::make_pair(cookie, entity))) { + return crimson::ct_error::not_connected::make(); + } else { + logger().info("found existing watch by {}", entity); + return do_op_watch_subop_watch(osd_op, os, txn); + } +} + +OpsExecuter::watch_ierrorator::future<> OpsExecuter::do_op_watch_subop_unwatch( + OSDOp& osd_op, + ObjectState& os, + ceph::os::Transaction& txn) +{ + logger().info("{}", __func__); + + struct disconnect_ctx_t { + ObjectContext::watch_key_t key; + disconnect_ctx_t(const OSDOp& osd_op, const ExecutableMessage& msg) + : key(osd_op.op.watch.cookie, msg.get_reqid().name) { + } + }; + return with_effect_on_obc(disconnect_ctx_t{ osd_op, get_message() }, + [&] (auto& ctx) { + const auto& entity = ctx.key.second; + if (auto nh = os.oi.watchers.extract(ctx.key); !nh.empty()) { + logger().info("removed watch {} by {}", nh.mapped(), entity); + txn.nop(); + } else { + logger().info("can't remove: no watch by {}", entity); + } + return seastar::now(); + }, + [] (auto&& ctx, ObjectContextRef obc, Ref<PG>) { + if (auto nh = obc->watchers.extract(ctx.key); !nh.empty()) { + return seastar::do_with(std::move(nh.mapped()), + [ctx](auto&& watcher) { + logger().info("op_effect: disconnect watcher {}", ctx.key); + return watcher->remove(); + }); + } else { + logger().info("op_effect: disconnect failed to find watcher {}", ctx.key); + return seastar::now(); + } + }); +} + +OpsExecuter::watch_ierrorator::future<> OpsExecuter::do_op_watch_subop_ping( + OSDOp& osd_op, + ObjectState& os, + ceph::os::Transaction& txn) +{ + const entity_name_t& entity = get_message().get_reqid().name; + const auto& cookie = 
osd_op.op.watch.cookie; + const auto key = std::make_pair(cookie, entity); + + // Note: WATCH with PING doesn't cause may_write() to return true, + // so if there is nothing else in the transaction, this is going + // to run do_osd_op_effects, but not write out a log entry */ + if (!os.oi.watchers.count(key)) { + return crimson::ct_error::not_connected::make(); + } + auto it = obc->watchers.find(key); + if (it == std::end(obc->watchers) || !it->second->is_connected()) { + return crimson::ct_error::timed_out::make(); + } + logger().info("found existing watch by {}", entity); + it->second->got_ping(ceph_clock_now()); + return seastar::now(); +} + +OpsExecuter::watch_ierrorator::future<> OpsExecuter::do_op_watch( + OSDOp& osd_op, + ObjectState& os, + ceph::os::Transaction& txn) +{ + logger().debug("{}", __func__); + if (!os.exists) { + return crimson::ct_error::enoent::make(); + } + switch (osd_op.op.watch.op) { + case CEPH_OSD_WATCH_OP_WATCH: + return do_op_watch_subop_watch(osd_op, os, txn); + case CEPH_OSD_WATCH_OP_RECONNECT: + return do_op_watch_subop_reconnect(osd_op, os, txn); + case CEPH_OSD_WATCH_OP_PING: + return do_op_watch_subop_ping(osd_op, os, txn); + case CEPH_OSD_WATCH_OP_UNWATCH: + return do_op_watch_subop_unwatch(osd_op, os, txn); + case CEPH_OSD_WATCH_OP_LEGACY_WATCH: + logger().warn("ignoring CEPH_OSD_WATCH_OP_LEGACY_WATCH"); + return crimson::ct_error::invarg::make(); + } + logger().warn("unrecognized WATCH subop: {}", osd_op.op.watch.op); + return crimson::ct_error::invarg::make(); +} + +static uint64_t get_next_notify_id(epoch_t e) +{ + // FIXME + static std::uint64_t next_notify_id = 0; + return (((uint64_t)e) << 32) | ((uint64_t)(next_notify_id++)); +} + +OpsExecuter::watch_ierrorator::future<> OpsExecuter::do_op_notify( + OSDOp& osd_op, + const ObjectState& os) +{ + logger().debug("{}, msg epoch: {}", __func__, get_message().get_map_epoch()); + + if (!os.exists) { + return crimson::ct_error::enoent::make(); + } + struct notify_ctx_t { + crimson::net::ConnectionRef conn; + notify_info_t ninfo; + const uint64_t client_gid; + const epoch_t epoch; + + notify_ctx_t(const ExecutableMessage& msg, + crimson::net::ConnectionRef conn) + : conn(conn), + client_gid(msg.get_reqid().name.num()), + epoch(msg.get_map_epoch()) { + } + }; + return with_effect_on_obc( + notify_ctx_t{ get_message(), conn }, + [&](auto& ctx) { + try { + auto bp = osd_op.indata.cbegin(); + uint32_t ver; // obsolete + ceph::decode(ver, bp); + ceph::decode(ctx.ninfo.timeout, bp); + ceph::decode(ctx.ninfo.bl, bp); + } catch (const buffer::error&) { + ctx.ninfo.timeout = 0; + } + if (!ctx.ninfo.timeout) { + using crimson::common::local_conf; + ctx.ninfo.timeout = local_conf()->osd_default_notify_timeout; + } + ctx.ninfo.notify_id = get_next_notify_id(ctx.epoch); + ctx.ninfo.cookie = osd_op.op.notify.cookie; + // return our unique notify id to the client + ceph::encode(ctx.ninfo.notify_id, osd_op.outdata); + return seastar::now(); + }, + [](auto&& ctx, ObjectContextRef obc, Ref<PG>) { + auto alive_watchers = obc->watchers | boost::adaptors::map_values + | boost::adaptors::filtered( + [] (const auto& w) { + // FIXME: filter as for the `is_ping` in `Watch::start_notify` + return w->is_alive(); + }); + return crimson::osd::Notify::create_n_propagate( + std::begin(alive_watchers), + std::end(alive_watchers), + std::move(ctx.conn), + ctx.ninfo, + ctx.client_gid, + obc->obs.oi.user_version); + } + ); +} + +OpsExecuter::watch_ierrorator::future<> OpsExecuter::do_op_list_watchers( + OSDOp& osd_op, + const ObjectState& 
os) +{ + logger().debug("{}", __func__); + + obj_list_watch_response_t response; + for (const auto& [key, info] : os.oi.watchers) { + logger().debug("{}: key cookie={}, entity={}", + __func__, key.first, key.second); + assert(key.first == info.cookie); + assert(key.second.is_client()); + response.entries.emplace_back(watch_item_t{ + key.second, info.cookie, info.timeout_seconds, info.addr}); + } + response.encode(osd_op.outdata, get_message().get_features()); + return watch_ierrorator::now(); +} + +OpsExecuter::watch_ierrorator::future<> OpsExecuter::do_op_notify_ack( + OSDOp& osd_op, + const ObjectState& os) +{ + logger().debug("{}", __func__); + + struct notifyack_ctx_t { + const entity_name_t entity; + uint64_t watch_cookie; + uint64_t notify_id; + ceph::bufferlist reply_bl; + + notifyack_ctx_t(const ExecutableMessage& msg) + : entity(msg.get_reqid().name) { + } + }; + return with_effect_on_obc(notifyack_ctx_t{ get_message() }, + [&] (auto& ctx) -> watch_errorator::future<> { + try { + auto bp = osd_op.indata.cbegin(); + ceph::decode(ctx.notify_id, bp); + ceph::decode(ctx.watch_cookie, bp); + if (!bp.end()) { + ceph::decode(ctx.reply_bl, bp); + } + } catch (const buffer::error&) { + // here we behave differently than ceph-osd. For historical reasons, + // it falls back to using `osd_op.op.watch.cookie` as `ctx.notify_id`. + // crimson just returns EINVAL if the data cannot be decoded. + return crimson::ct_error::invarg::make(); + } + return watch_errorator::now(); + }, + [] (auto&& ctx, ObjectContextRef obc, Ref<PG>) { + logger().info("notify_ack watch_cookie={}, notify_id={}", + ctx.watch_cookie, ctx.notify_id); + return seastar::do_for_each(obc->watchers, + [ctx=std::move(ctx)] (auto& kv) { + const auto& [key, watchp] = kv; + static_assert( + std::is_same_v<std::decay_t<decltype(watchp)>, + seastar::shared_ptr<crimson::osd::Watch>>); + auto& [cookie, entity] = key; + if (ctx.entity != entity) { + logger().debug("skipping watch {}; entity name {} != {}", + key, entity, ctx.entity); + return seastar::now(); + } + if (ctx.watch_cookie != cookie) { + logger().debug("skipping watch {}; cookie {} != {}", + key, ctx.watch_cookie, cookie); + return seastar::now(); + } + logger().info("acking notify on watch {}", key); + return watchp->notify_ack(ctx.notify_id, ctx.reply_bl); + }); + }); +} + +// Defined here because there is a circular dependency between OpsExecuter and PG +template <class Func> +auto OpsExecuter::do_const_op(Func&& f) { + // TODO: pass backend as read-only + return std::forward<Func>(f)(pg->get_backend(), std::as_const(obc->obs)); +} + +// Defined here because there is a circular dependency between OpsExecuter and PG +template <class Func> +auto OpsExecuter::do_write_op(Func&& f, OpsExecuter::modified_by m) { + ++num_write; + if (!osd_op_params) { + osd_op_params.emplace(); + fill_op_params_bump_pg_version(); + } + user_modify = (m == modified_by::user); + return std::forward<Func>(f)(pg->get_backend(), obc->obs, txn); +} +OpsExecuter::call_errorator::future<> OpsExecuter::do_assert_ver( + OSDOp& osd_op, + const ObjectState& os) +{ + if (!osd_op.op.assert_ver.ver) { + return crimson::ct_error::invarg::make(); + } else if (osd_op.op.assert_ver.ver < os.oi.user_version) { + return crimson::ct_error::erange::make(); + } else if (osd_op.op.assert_ver.ver > os.oi.user_version) { + return crimson::ct_error::value_too_large::make(); + } + return seastar::now(); +} + +OpsExecuter::list_snaps_iertr::future<> OpsExecuter::do_list_snaps( + OSDOp& osd_op, + const ObjectState& os, + 
const SnapSet& ss) +{ + obj_list_snap_response_t resp; + resp.clones.reserve(ss.clones.size() + 1); + for (auto &clone: ss.clones) { + clone_info ci; + ci.cloneid = clone; + + { + auto p = ss.clone_snaps.find(clone); + if (p == ss.clone_snaps.end()) { + logger().error( + "OpsExecutor::do_list_snaps: {} has inconsistent " + "clone_snaps, missing clone {}", + os.oi.soid, + clone); + return crimson::ct_error::invarg::make(); + } + ci.snaps.reserve(p->second.size()); + ci.snaps.insert(ci.snaps.end(), p->second.rbegin(), p->second.rend()); + } + + { + auto p = ss.clone_overlap.find(clone); + if (p == ss.clone_overlap.end()) { + logger().error( + "OpsExecutor::do_list_snaps: {} has inconsistent " + "clone_overlap, missing clone {}", + os.oi.soid, + clone); + return crimson::ct_error::invarg::make(); + } + ci.overlap.reserve(p->second.num_intervals()); + ci.overlap.insert(ci.overlap.end(), p->second.begin(), p->second.end()); + } + + { + auto p = ss.clone_size.find(clone); + if (p == ss.clone_size.end()) { + logger().error( + "OpsExecutor::do_list_snaps: {} has inconsistent " + "clone_size, missing clone {}", + os.oi.soid, + clone); + return crimson::ct_error::invarg::make(); + } + ci.size = p->second; + } + resp.clones.push_back(std::move(ci)); + } + + if (!os.oi.is_whiteout()) { + clone_info ci; + ci.cloneid = CEPH_NOSNAP; + ci.size = os.oi.size; + resp.clones.push_back(std::move(ci)); + } + resp.seq = ss.seq; + logger().error( + "OpsExecutor::do_list_snaps: {}, resp.clones.size(): {}", + os.oi.soid, + resp.clones.size()); + resp.encode(osd_op.outdata); + return read_ierrorator::now(); +} + +OpsExecuter::interruptible_errorated_future<OpsExecuter::osd_op_errorator> +OpsExecuter::execute_op(OSDOp& osd_op) +{ + return do_execute_op(osd_op).handle_error_interruptible( + osd_op_errorator::all_same_way([&osd_op](auto e, auto&& e_raw) + -> OpsExecuter::osd_op_errorator::future<> { + // All ops except for CMPEXT should have rval set to -e.value(), + // CMPEXT sets rval itself and shouldn't be overridden. + if (e.value() != ct_error::cmp_fail_error_value) { + osd_op.rval = -e.value(); + } + if ((osd_op.op.flags & CEPH_OSD_OP_FLAG_FAILOK) && + e.value() != EAGAIN && e.value() != EINPROGRESS) { + return osd_op_errorator::now(); + } else { + return std::move(e_raw); + } + })); +} + +OpsExecuter::interruptible_errorated_future<OpsExecuter::osd_op_errorator> +OpsExecuter::do_execute_op(OSDOp& osd_op) +{ + // TODO: dispatch via call table? + // TODO: we might want to find a way to unify both input and output + // of each op. 
+ logger().debug( + "handling op {} on object {}", + ceph_osd_op_name(osd_op.op.op), + get_target()); + switch (const ceph_osd_op& op = osd_op.op; op.op) { + case CEPH_OSD_OP_SYNC_READ: + [[fallthrough]]; + case CEPH_OSD_OP_READ: + return do_read_op([this, &osd_op](auto& backend, const auto& os) { + return backend.read(os, osd_op, delta_stats); + }); + case CEPH_OSD_OP_SPARSE_READ: + return do_read_op([this, &osd_op](auto& backend, const auto& os) { + return backend.sparse_read(os, osd_op, delta_stats); + }); + case CEPH_OSD_OP_CHECKSUM: + return do_read_op([&osd_op](auto& backend, const auto& os) { + return backend.checksum(os, osd_op); + }); + case CEPH_OSD_OP_CMPEXT: + return do_read_op([&osd_op](auto& backend, const auto& os) { + return backend.cmp_ext(os, osd_op); + }); + case CEPH_OSD_OP_GETXATTR: + return do_read_op([this, &osd_op](auto& backend, const auto& os) { + return backend.getxattr(os, osd_op, delta_stats); + }); + case CEPH_OSD_OP_GETXATTRS: + return do_read_op([this, &osd_op](auto& backend, const auto& os) { + return backend.get_xattrs(os, osd_op, delta_stats); + }); + case CEPH_OSD_OP_CMPXATTR: + return do_read_op([this, &osd_op](auto& backend, const auto& os) { + return backend.cmp_xattr(os, osd_op, delta_stats); + }); + case CEPH_OSD_OP_RMXATTR: + return do_write_op([&osd_op](auto& backend, auto& os, auto& txn) { + return backend.rm_xattr(os, osd_op, txn); + }); + case CEPH_OSD_OP_CREATE: + return do_write_op([this, &osd_op](auto& backend, auto& os, auto& txn) { + return backend.create(os, osd_op, txn, delta_stats); + }); + case CEPH_OSD_OP_WRITE: + return do_write_op([this, &osd_op](auto& backend, auto& os, auto& txn) { + return backend.write(os, osd_op, txn, *osd_op_params, delta_stats); + }); + case CEPH_OSD_OP_WRITESAME: + return do_write_op([this, &osd_op](auto& backend, auto& os, auto& txn) { + return backend.write_same(os, osd_op, txn, *osd_op_params, delta_stats); + }); + case CEPH_OSD_OP_WRITEFULL: + return do_write_op([this, &osd_op](auto& backend, auto& os, auto& txn) { + return backend.writefull(os, osd_op, txn, *osd_op_params, delta_stats); + }); + case CEPH_OSD_OP_ROLLBACK: + return do_write_op([this, &head=obc, + &osd_op](auto& backend, auto& os, auto& txn) { + return backend.rollback(os, osd_op, txn, *osd_op_params, delta_stats, + head, pg->obc_loader); + }); + case CEPH_OSD_OP_APPEND: + return do_write_op([this, &osd_op](auto& backend, auto& os, auto& txn) { + return backend.append(os, osd_op, txn, *osd_op_params, delta_stats); + }); + case CEPH_OSD_OP_TRUNCATE: + return do_write_op([this, &osd_op](auto& backend, auto& os, auto& txn) { + // FIXME: rework needed. Move this out to do_write_op(), introduce + // do_write_op_no_user_modify()... 
+ return backend.truncate(os, osd_op, txn, *osd_op_params, delta_stats); + }); + case CEPH_OSD_OP_ZERO: + return do_write_op([this, &osd_op](auto& backend, auto& os, auto& txn) { + return backend.zero(os, osd_op, txn, *osd_op_params, delta_stats); + }); + case CEPH_OSD_OP_SETALLOCHINT: + return do_write_op([this, &osd_op](auto& backend, auto& os, auto& txn) { + return backend.set_allochint(os, osd_op, txn, delta_stats); + }); + case CEPH_OSD_OP_SETXATTR: + return do_write_op([this, &osd_op](auto& backend, auto& os, auto& txn) { + return backend.setxattr(os, osd_op, txn, delta_stats); + }); + case CEPH_OSD_OP_DELETE: + { + bool whiteout = false; + if (!obc->ssc->snapset.clones.empty() || + (snapc.snaps.size() && // there are snaps + snapc.snaps[0] > obc->ssc->snapset.seq)) { // existing obj is old + logger().debug("{} has or will have clones, will whiteout {}", + __func__, obc->obs.oi.soid); + whiteout = true; + } + return do_write_op([this, whiteout](auto& backend, auto& os, auto& txn) { + return backend.remove(os, txn, delta_stats, whiteout); + }); + } + case CEPH_OSD_OP_CALL: + return this->do_op_call(osd_op); + case CEPH_OSD_OP_STAT: + // note: stat does not require RD + return do_const_op([this, &osd_op] (/* const */auto& backend, const auto& os) { + return backend.stat(os, osd_op, delta_stats); + }); + + case CEPH_OSD_OP_TMAPPUT: + return do_write_op([this, &osd_op](auto& backend, auto& os, auto& txn) { + return backend.tmapput(os, osd_op, txn, delta_stats, *osd_op_params); + }); + case CEPH_OSD_OP_TMAPUP: + return do_write_op([this, &osd_op](auto& backend, auto& os, auto &txn) { + return backend.tmapup(os, osd_op, txn, delta_stats, *osd_op_params); + }); + case CEPH_OSD_OP_TMAPGET: + return do_read_op([this, &osd_op](auto& backend, const auto& os) { + return backend.tmapget(os, osd_op, delta_stats); + }); + + // OMAP + case CEPH_OSD_OP_OMAPGETKEYS: + return do_read_op([this, &osd_op](auto& backend, const auto& os) { + return backend.omap_get_keys(os, osd_op, delta_stats); + }); + case CEPH_OSD_OP_OMAPGETVALS: + return do_read_op([this, &osd_op](auto& backend, const auto& os) { + return backend.omap_get_vals(os, osd_op, delta_stats); + }); + case CEPH_OSD_OP_OMAP_CMP: + return do_read_op([this, &osd_op](auto& backend, const auto& os) { + return backend.omap_cmp(os, osd_op, delta_stats); + }); + case CEPH_OSD_OP_OMAPGETHEADER: + return do_read_op([this, &osd_op](auto& backend, const auto& os) { + return backend.omap_get_header(os, osd_op, delta_stats); + }); + case CEPH_OSD_OP_OMAPGETVALSBYKEYS: + return do_read_op([this, &osd_op](auto& backend, const auto& os) { + return backend.omap_get_vals_by_keys(os, osd_op, delta_stats); + }); + case CEPH_OSD_OP_OMAPSETVALS: +#if 0 + if (!pg.get_pgpool().info.supports_omap()) { + return crimson::ct_error::operation_not_supported::make(); + } +#endif + return do_write_op([this, &osd_op](auto& backend, auto& os, auto& txn) { + return backend.omap_set_vals(os, osd_op, txn, *osd_op_params, delta_stats); + }); + case CEPH_OSD_OP_OMAPSETHEADER: +#if 0 + if (!pg.get_pgpool().info.supports_omap()) { + return crimson::ct_error::operation_not_supported::make(); + } +#endif + return do_write_op([this, &osd_op](auto& backend, auto& os, auto& txn) { + return backend.omap_set_header(os, osd_op, txn, *osd_op_params, + delta_stats); + }); + case CEPH_OSD_OP_OMAPRMKEYRANGE: +#if 0 + if (!pg.get_pgpool().info.supports_omap()) { + return crimson::ct_error::operation_not_supported::make(); + } +#endif + return do_write_op([this, &osd_op](auto& backend, auto& os, 
auto& txn) { + return backend.omap_remove_range(os, osd_op, txn, delta_stats); + }); + case CEPH_OSD_OP_OMAPRMKEYS: + /** TODO: Implement supports_omap() + if (!pg.get_pgpool().info.supports_omap()) { + return crimson::ct_error::operation_not_supported::make(); + }*/ + return do_write_op([&osd_op](auto& backend, auto& os, auto& txn) { + return backend.omap_remove_key(os, osd_op, txn); + }); + case CEPH_OSD_OP_OMAPCLEAR: + return do_write_op([this, &osd_op](auto& backend, auto& os, auto& txn) { + return backend.omap_clear(os, osd_op, txn, *osd_op_params, delta_stats); + }); + + // watch/notify + case CEPH_OSD_OP_WATCH: + return do_write_op([this, &osd_op](auto& backend, auto& os, auto& txn) { + return do_op_watch(osd_op, os, txn); + }, modified_by::sys); + case CEPH_OSD_OP_LIST_WATCHERS: + return do_read_op([this, &osd_op](auto&, const auto& os) { + return do_op_list_watchers(osd_op, os); + }); + case CEPH_OSD_OP_NOTIFY: + return do_read_op([this, &osd_op](auto&, const auto& os) { + return do_op_notify(osd_op, os); + }); + case CEPH_OSD_OP_NOTIFY_ACK: + return do_read_op([this, &osd_op](auto&, const auto& os) { + return do_op_notify_ack(osd_op, os); + }); + case CEPH_OSD_OP_ASSERT_VER: + return do_read_op([this, &osd_op](auto&, const auto& os) { + return do_assert_ver(osd_op, os); + }); + case CEPH_OSD_OP_LIST_SNAPS: + return do_snapset_op([this, &osd_op](const auto &os, const auto &ss) { + return do_list_snaps(osd_op, os, ss); + }); + + default: + logger().warn("unknown op {}", ceph_osd_op_name(op.op)); + throw std::runtime_error( + fmt::format("op '{}' not supported", ceph_osd_op_name(op.op))); + } +} + +void OpsExecuter::fill_op_params_bump_pg_version() +{ + osd_op_params->req_id = msg->get_reqid(); + osd_op_params->mtime = msg->get_mtime(); + osd_op_params->at_version = pg->next_version(); + osd_op_params->pg_trim_to = pg->get_pg_trim_to(); + osd_op_params->min_last_complete_ondisk = pg->get_min_last_complete_ondisk(); + osd_op_params->last_complete = pg->get_info().last_complete; +} + +std::vector<pg_log_entry_t> OpsExecuter::prepare_transaction( + const std::vector<OSDOp>& ops) +{ + // let's ensure we don't need to inform SnapMapper about this particular + // entry. + assert(obc->obs.oi.soid.snap >= CEPH_MAXSNAP); + std::vector<pg_log_entry_t> log_entries; + log_entries.emplace_back( + obc->obs.exists ? + pg_log_entry_t::MODIFY : pg_log_entry_t::DELETE, + obc->obs.oi.soid, + osd_op_params->at_version, + obc->obs.oi.version, + osd_op_params->user_modify ? osd_op_params->at_version.version : 0, + osd_op_params->req_id, + osd_op_params->mtime, + op_info.allows_returnvec() && !ops.empty() ? 
ops.back().rval.code : 0); + if (op_info.allows_returnvec()) { + // also the per-op values are recorded in the pg log + log_entries.back().set_op_returns(ops); + logger().debug("{} op_returns: {}", + __func__, log_entries.back().op_returns); + } + log_entries.back().clean_regions = std::move(osd_op_params->clean_regions); + return log_entries; +} + +OpsExecuter::interruptible_future<> OpsExecuter::snap_map_remove( + const hobject_t& soid, + SnapMapper& snap_mapper, + OSDriver& osdriver, + ceph::os::Transaction& txn) +{ + logger().debug("{}: soid {}", __func__, soid); + return interruptor::async([soid, &snap_mapper, + _t=osdriver.get_transaction(&txn)]() mutable { + const auto r = snap_mapper.remove_oid(soid, &_t); + if (r) { + logger().error("{}: remove_oid {} failed with {}", + __func__, soid, r); + } + // On removal tolerate missing key corruption + assert(r == 0 || r == -ENOENT); + }); +} + +OpsExecuter::interruptible_future<> OpsExecuter::snap_map_modify( + const hobject_t& soid, + const std::set<snapid_t>& snaps, + SnapMapper& snap_mapper, + OSDriver& osdriver, + ceph::os::Transaction& txn) +{ + logger().debug("{}: soid {}, snaps {}", __func__, soid, snaps); + return interruptor::async([soid, snaps, &snap_mapper, + _t=osdriver.get_transaction(&txn)]() mutable { + assert(std::size(snaps) > 0); + [[maybe_unused]] const auto r = snap_mapper.update_snaps( + soid, snaps, 0, &_t); + assert(r == 0); + }); +} + +OpsExecuter::interruptible_future<> OpsExecuter::snap_map_clone( + const hobject_t& soid, + const std::set<snapid_t>& snaps, + SnapMapper& snap_mapper, + OSDriver& osdriver, + ceph::os::Transaction& txn) +{ + logger().debug("{}: soid {}, snaps {}", __func__, soid, snaps); + return interruptor::async([soid, snaps, &snap_mapper, + _t=osdriver.get_transaction(&txn)]() mutable { + assert(std::size(snaps) > 0); + snap_mapper.add_oid(soid, snaps, &_t); + }); +} + +// Defined here because there is a circular dependency between OpsExecuter and PG +uint32_t OpsExecuter::get_pool_stripe_width() const { + return pg->get_pgpool().info.get_stripe_width(); +} + +// Defined here because there is a circular dependency between OpsExecuter and PG +version_t OpsExecuter::get_last_user_version() const +{ + return pg->get_last_user_version(); +} + +std::unique_ptr<OpsExecuter::CloningContext> OpsExecuter::execute_clone( + const SnapContext& snapc, + const ObjectState& initial_obs, + const SnapSet& initial_snapset, + PGBackend& backend, + ceph::os::Transaction& txn) +{ + const hobject_t& soid = initial_obs.oi.soid; + logger().debug("{} {} snapset={} snapc={}", + __func__, soid, + initial_snapset, snapc); + + auto cloning_ctx = std::make_unique<CloningContext>(); + cloning_ctx->new_snapset = initial_snapset; + + // clone object, the snap field is set to the seq of the SnapContext + // at its creation. 
+ hobject_t coid = soid; + coid.snap = snapc.seq; + + // existing snaps are stored in descending order in snapc, + // cloned_snaps vector will hold all the snaps stored until snapset.seq + const std::vector<snapid_t> cloned_snaps = [&] { + auto last = std::find_if( + std::begin(snapc.snaps), std::end(snapc.snaps), + [&](snapid_t snap_id) { return snap_id <= initial_snapset.seq; }); + return std::vector<snapid_t>{std::begin(snapc.snaps), last}; + }(); + + auto [snap_oi, clone_obc] = prepare_clone(coid); + // make clone + backend.clone(snap_oi, initial_obs, clone_obc->obs, txn); + + delta_stats.num_objects++; + if (snap_oi.is_omap()) { + delta_stats.num_objects_omap++; + } + delta_stats.num_object_clones++; + // newsnapset is obc's ssc + cloning_ctx->new_snapset.clones.push_back(coid.snap); + cloning_ctx->new_snapset.clone_size[coid.snap] = initial_obs.oi.size; + cloning_ctx->new_snapset.clone_snaps[coid.snap] = cloned_snaps; + + // clone_overlap should contain an entry for each clone + // (an empty interval_set if there is no overlap) + auto &overlap = cloning_ctx->new_snapset.clone_overlap[coid.snap]; + if (initial_obs.oi.size) { + overlap.insert(0, initial_obs.oi.size); + } + + // log clone + logger().debug("cloning v {} to {} v {} snaps={} snapset={}", + initial_obs.oi.version, coid, + osd_op_params->at_version, cloned_snaps, cloning_ctx->new_snapset); + + cloning_ctx->log_entry = { + pg_log_entry_t::CLONE, + coid, + snap_oi.version, + initial_obs.oi.version, + initial_obs.oi.user_version, + osd_reqid_t(), + initial_obs.oi.mtime, // will be replaced in `apply_to()` + 0 + }; + encode(cloned_snaps, cloning_ctx->log_entry.snaps); + + // TODO: update most recent clone_overlap and usage stats + return cloning_ctx; +} + +void OpsExecuter::CloningContext::apply_to( + std::vector<pg_log_entry_t>& log_entries, + ObjectContext& processed_obc) && +{ + log_entry.mtime = processed_obc.obs.oi.mtime; + log_entries.emplace_back(std::move(log_entry)); + processed_obc.ssc->snapset = std::move(new_snapset); +} + +OpsExecuter::interruptible_future<std::vector<pg_log_entry_t>> +OpsExecuter::flush_clone_metadata( + std::vector<pg_log_entry_t>&& log_entries, + SnapMapper& snap_mapper, + OSDriver& osdriver, + ceph::os::Transaction& txn) +{ + assert(!txn.empty()); + auto maybe_snap_mapped = interruptor::now(); + if (cloning_ctx) { + std::move(*cloning_ctx).apply_to(log_entries, *obc); + const auto& coid = log_entries.back().soid; + const auto& cloned_snaps = obc->ssc->snapset.clone_snaps[coid.snap]; + maybe_snap_mapped = snap_map_clone( + coid, + std::set<snapid_t>{std::begin(cloned_snaps), std::end(cloned_snaps)}, + snap_mapper, + osdriver, + txn); + } + if (snapc.seq > obc->ssc->snapset.seq) { + // update snapset with latest snap context + obc->ssc->snapset.seq = snapc.seq; + obc->ssc->snapset.snaps.clear(); + } + logger().debug("{} done, initial snapset={}, new snapset={}", + __func__, obc->obs.oi.soid, obc->ssc->snapset); + return std::move( + maybe_snap_mapped + ).then_interruptible([log_entries=std::move(log_entries)]() mutable { + return interruptor::make_ready_future<std::vector<pg_log_entry_t>>( + std::move(log_entries)); + }); +} + +// TODO: make this static +std::pair<object_info_t, ObjectContextRef> OpsExecuter::prepare_clone( + const hobject_t& coid) +{ + object_info_t static_snap_oi(coid); + static_snap_oi.version = pg->next_version(); + static_snap_oi.prior_version = obc->obs.oi.version; + static_snap_oi.copy_user_bits(obc->obs.oi); + if (static_snap_oi.is_whiteout()) { + // clone shouldn't be 
marked as whiteout + static_snap_oi.clear_flag(object_info_t::FLAG_WHITEOUT); + } + + ObjectContextRef clone_obc; + if (pg->is_primary()) { + // lookup_or_create + auto [c_obc, existed] = + pg->obc_registry.get_cached_obc(std::move(coid)); + assert(!existed); + c_obc->obs.oi = static_snap_oi; + c_obc->obs.exists = true; + c_obc->ssc = obc->ssc; + logger().debug("clone_obc: {}", c_obc->obs.oi); + clone_obc = std::move(c_obc); + } + return std::make_pair(std::move(static_snap_oi), std::move(clone_obc)); +} + +void OpsExecuter::apply_stats() +{ + pg->get_peering_state().apply_op_stats(get_target(), delta_stats); + pg->publish_stats_to_osd(); +} + +OpsExecuter::OpsExecuter(Ref<PG> pg, + ObjectContextRef _obc, + const OpInfo& op_info, + abstracted_msg_t&& msg, + crimson::net::ConnectionRef conn, + const SnapContext& _snapc) + : pg(std::move(pg)), + obc(std::move(_obc)), + op_info(op_info), + msg(std::move(msg)), + conn(conn), + snapc(_snapc) +{ + if (op_info.may_write() && should_clone(*obc, snapc)) { + do_write_op([this](auto& backend, auto& os, auto& txn) { + cloning_ctx = execute_clone(std::as_const(snapc), + std::as_const(obc->obs), + std::as_const(obc->ssc->snapset), + backend, + txn); + }); + } +} + +static inline std::unique_ptr<const PGLSFilter> get_pgls_filter( + const std::string& type, + bufferlist::const_iterator& iter) +{ + // storing non-const PGLSFilter for the sake of ::init() + std::unique_ptr<PGLSFilter> filter; + if (type.compare("plain") == 0) { + filter = std::make_unique<PGLSPlainFilter>(); + } else { + std::size_t dot = type.find("."); + if (dot == type.npos || dot == 0 || dot == type.size() - 1) { + throw crimson::osd::invalid_argument{}; + } + + const std::string class_name = type.substr(0, dot); + const std::string filter_name = type.substr(dot + 1); + ClassHandler::ClassData *cls = nullptr; + int r = ClassHandler::get_instance().open_class(class_name, &cls); + if (r != 0) { + logger().warn("can't open class {}: {}", class_name, cpp_strerror(r)); + if (r == -EPERM) { + // propogate permission error + throw crimson::osd::permission_denied{}; + } else { + throw crimson::osd::invalid_argument{}; + } + } else { + ceph_assert(cls); + } + + ClassHandler::ClassFilter * const class_filter = cls->get_filter(filter_name); + if (class_filter == nullptr) { + logger().warn("can't find filter {} in class {}", filter_name, class_name); + throw crimson::osd::invalid_argument{}; + } + + filter.reset(class_filter->fn()); + if (!filter) { + // Object classes are obliged to return us something, but let's + // give an error rather than asserting out. + logger().warn("buggy class {} failed to construct filter {}", + class_name, filter_name); + throw crimson::osd::invalid_argument{}; + } + } + + ceph_assert(filter); + int r = filter->init(iter); + if (r < 0) { + logger().warn("error initializing filter {}: {}", type, cpp_strerror(r)); + throw crimson::osd::invalid_argument{}; + } + + // successfully constructed and initialized, return it. 
+ return filter; +} + +static PG::interruptible_future<hobject_t> pgls_filter( + const PGLSFilter& filter, + const PGBackend& backend, + const hobject_t& sobj) +{ + if (const auto xattr = filter.get_xattr(); !xattr.empty()) { + logger().debug("pgls_filter: filter is interested in xattr={} for obj={}", + xattr, sobj); + return backend.getxattr(sobj, std::move(xattr)).safe_then_interruptible( + [&filter, sobj] (ceph::bufferlist val) { + logger().debug("pgls_filter: got xvalue for obj={}", sobj); + + const bool filtered = filter.filter(sobj, val); + return seastar::make_ready_future<hobject_t>(filtered ? sobj : hobject_t{}); + }, PGBackend::get_attr_errorator::all_same_way([&filter, sobj] { + logger().debug("pgls_filter: got error for obj={}", sobj); + + if (filter.reject_empty_xattr()) { + return seastar::make_ready_future<hobject_t>(); + } + ceph::bufferlist val; + const bool filtered = filter.filter(sobj, val); + return seastar::make_ready_future<hobject_t>(filtered ? sobj : hobject_t{}); + })); + } else { + ceph::bufferlist empty_lvalue_bl; + const bool filtered = filter.filter(sobj, empty_lvalue_bl); + return seastar::make_ready_future<hobject_t>(filtered ? sobj : hobject_t{}); + } +} + +static PG::interruptible_future<ceph::bufferlist> do_pgnls_common( + const hobject_t& pg_start, + const hobject_t& pg_end, + const PGBackend& backend, + const hobject_t& lower_bound, + const std::string& nspace, + const uint64_t limit, + const PGLSFilter* const filter) +{ + if (!(lower_bound.is_min() || + lower_bound.is_max() || + (lower_bound >= pg_start && lower_bound < pg_end))) { + // this should only happen with a buggy client. + throw std::invalid_argument("outside of PG bounds"); + } + + return backend.list_objects(lower_bound, limit).then_interruptible( + [&backend, filter, nspace](auto&& ret) + -> PG::interruptible_future<std::tuple<std::vector<hobject_t>, hobject_t>> { + auto& [objects, next] = ret; + auto in_my_namespace = [&nspace](const hobject_t& obj) { + using crimson::common::local_conf; + if (obj.get_namespace() == local_conf()->osd_hit_set_namespace) { + return false; + } else if (nspace == librados::all_nspaces) { + return true; + } else { + return obj.get_namespace() == nspace; + } + }; + auto to_pglsed = [&backend, filter] (const hobject_t& obj) + -> PG::interruptible_future<hobject_t> { + // this transformation looks costly. However, I don't have any + // reason to think PGLS* operations are critical for, let's say, + // general performance. + // + // from tchaikov: "another way is to use seastar::map_reduce(), + // to 1) save the effort to filter the already filtered objects + // 2) avoid the space to keep the tuple<bool, object> even if + // the object is filtered out". 
+ if (filter) { + return pgls_filter(*filter, backend, obj); + } else { + return seastar::make_ready_future<hobject_t>(obj); + } + }; + + auto range = objects | boost::adaptors::filtered(in_my_namespace) + | boost::adaptors::transformed(to_pglsed); + logger().debug("do_pgnls_common: finishing the 1st stage of pgls"); + return seastar::when_all_succeed(std::begin(range), + std::end(range)).then( + [next=std::move(next)] (auto items) mutable { + // the sole purpose of this chaining is to pass `next` to 2nd + // stage altogether with items + logger().debug("do_pgnls_common: 1st done"); + return seastar::make_ready_future< + std::tuple<std::vector<hobject_t>, hobject_t>>( + std::move(items), std::move(next)); + }); + }).then_interruptible( + [pg_end] (auto&& ret) { + auto& [items, next] = ret; + auto is_matched = [] (const auto& obj) { + return !obj.is_min(); + }; + auto to_entry = [] (const auto& obj) { + return librados::ListObjectImpl{ + obj.get_namespace(), obj.oid.name, obj.get_key() + }; + }; + + pg_nls_response_t response; + boost::push_back(response.entries, items | boost::adaptors::filtered(is_matched) + | boost::adaptors::transformed(to_entry)); + response.handle = next.is_max() ? pg_end : next; + ceph::bufferlist out; + encode(response, out); + logger().debug("do_pgnls_common: response.entries.size()= {}", + response.entries.size()); + return seastar::make_ready_future<ceph::bufferlist>(std::move(out)); + }); +} + +static PG::interruptible_future<> do_pgnls( + const PG& pg, + const std::string& nspace, + OSDOp& osd_op) +{ + hobject_t lower_bound; + try { + ceph::decode(lower_bound, osd_op.indata); + } catch (const buffer::error&) { + throw std::invalid_argument("unable to decode PGNLS handle"); + } + const auto pg_start = pg.get_pgid().pgid.get_hobj_start(); + const auto pg_end = \ + pg.get_pgid().pgid.get_hobj_end(pg.get_pgpool().info.get_pg_num()); + return do_pgnls_common(pg_start, + pg_end, + pg.get_backend(), + lower_bound, + nspace, + osd_op.op.pgls.count, + nullptr /* no filter */) + .then_interruptible([&osd_op](bufferlist bl) { + osd_op.outdata = std::move(bl); + return seastar::now(); + }); +} + +static PG::interruptible_future<> do_pgnls_filtered( + const PG& pg, + const std::string& nspace, + OSDOp& osd_op) +{ + std::string cname, mname, type; + auto bp = osd_op.indata.cbegin(); + try { + ceph::decode(cname, bp); + ceph::decode(mname, bp); + ceph::decode(type, bp); + } catch (const buffer::error&) { + throw crimson::osd::invalid_argument{}; + } + + auto filter = get_pgls_filter(type, bp); + + hobject_t lower_bound; + try { + lower_bound.decode(bp); + } catch (const buffer::error&) { + throw std::invalid_argument("unable to decode PGNLS_FILTER description"); + } + + logger().debug("{}: cname={}, mname={}, type={}, lower_bound={}, filter={}", + __func__, cname, mname, type, lower_bound, + static_cast<const void*>(filter.get())); + return seastar::do_with(std::move(filter), + [&, lower_bound=std::move(lower_bound)](auto&& filter) { + const auto pg_start = pg.get_pgid().pgid.get_hobj_start(); + const auto pg_end = pg.get_pgid().pgid.get_hobj_end(pg.get_pgpool().info.get_pg_num()); + return do_pgnls_common(pg_start, + pg_end, + pg.get_backend(), + lower_bound, + nspace, + osd_op.op.pgls.count, + filter.get()) + .then_interruptible([&osd_op](bufferlist bl) { + osd_op.outdata = std::move(bl); + return seastar::now(); + }); + }); +} + +static PG::interruptible_future<ceph::bufferlist> do_pgls_common( + const hobject_t& pg_start, + const hobject_t& pg_end, + const PGBackend& 
backend, + const hobject_t& lower_bound, + const std::string& nspace, + const uint64_t limit, + const PGLSFilter* const filter) +{ + if (!(lower_bound.is_min() || + lower_bound.is_max() || + (lower_bound >= pg_start && lower_bound < pg_end))) { + // this should only happen with a buggy client. + throw std::invalid_argument("outside of PG bounds"); + } + + using entries_t = decltype(pg_ls_response_t::entries); + return backend.list_objects(lower_bound, limit).then_interruptible( + [&backend, filter, nspace](auto&& ret) { + auto& [objects, next] = ret; + return PG::interruptor::when_all( + PG::interruptor::map_reduce(std::move(objects), + [&backend, filter, nspace](const hobject_t& obj) + -> PG::interruptible_future<hobject_t>{ + if (obj.get_namespace() == nspace) { + if (filter) { + return pgls_filter(*filter, backend, obj); + } else { + return seastar::make_ready_future<hobject_t>(obj); + } + } else { + return seastar::make_ready_future<hobject_t>(); + } + }, + entries_t{}, + [](entries_t entries, hobject_t obj) { + if (!obj.is_min()) { + entries.emplace_back(obj.oid, obj.get_key()); + } + return entries; + }), + seastar::make_ready_future<hobject_t>(next)); + }).then_interruptible([pg_end](auto&& ret) { + auto entries = std::move(std::get<0>(ret).get0()); + auto next = std::move(std::get<1>(ret).get0()); + pg_ls_response_t response; + response.handle = next.is_max() ? pg_end : next; + response.entries = std::move(entries); + ceph::bufferlist out; + encode(response, out); + logger().debug("{}: response.entries.size()=", + __func__, response.entries.size()); + return seastar::make_ready_future<ceph::bufferlist>(std::move(out)); + }); +} + +static PG::interruptible_future<> do_pgls( + const PG& pg, + const std::string& nspace, + OSDOp& osd_op) +{ + hobject_t lower_bound; + auto bp = osd_op.indata.cbegin(); + try { + lower_bound.decode(bp); + } catch (const buffer::error&) { + throw std::invalid_argument{"unable to decode PGLS handle"}; + } + const auto pg_start = pg.get_pgid().pgid.get_hobj_start(); + const auto pg_end = + pg.get_pgid().pgid.get_hobj_end(pg.get_pgpool().info.get_pg_num()); + return do_pgls_common(pg_start, + pg_end, + pg.get_backend(), + lower_bound, + nspace, + osd_op.op.pgls.count, + nullptr /* no filter */) + .then_interruptible([&osd_op](bufferlist bl) { + osd_op.outdata = std::move(bl); + return seastar::now(); + }); +} + +static PG::interruptible_future<> do_pgls_filtered( + const PG& pg, + const std::string& nspace, + OSDOp& osd_op) +{ + std::string cname, mname, type; + auto bp = osd_op.indata.cbegin(); + try { + ceph::decode(cname, bp); + ceph::decode(mname, bp); + ceph::decode(type, bp); + } catch (const buffer::error&) { + throw crimson::osd::invalid_argument{}; + } + + auto filter = get_pgls_filter(type, bp); + + hobject_t lower_bound; + try { + lower_bound.decode(bp); + } catch (const buffer::error&) { + throw std::invalid_argument("unable to decode PGLS_FILTER description"); + } + + logger().debug("{}: cname={}, mname={}, type={}, lower_bound={}, filter={}", + __func__, cname, mname, type, lower_bound, + static_cast<const void*>(filter.get())); + return seastar::do_with(std::move(filter), + [&, lower_bound=std::move(lower_bound)](auto&& filter) { + const auto pg_start = pg.get_pgid().pgid.get_hobj_start(); + const auto pg_end = pg.get_pgid().pgid.get_hobj_end(pg.get_pgpool().info.get_pg_num()); + return do_pgls_common(pg_start, + pg_end, + pg.get_backend(), + lower_bound, + nspace, + osd_op.op.pgls.count, + filter.get()) + 
.then_interruptible([&osd_op](bufferlist bl) { + osd_op.outdata = std::move(bl); + return seastar::now(); + }); + }); +} + +PgOpsExecuter::interruptible_future<> +PgOpsExecuter::execute_op(OSDOp& osd_op) +{ + logger().warn("handling op {}", ceph_osd_op_name(osd_op.op.op)); + switch (const ceph_osd_op& op = osd_op.op; op.op) { + case CEPH_OSD_OP_PGLS: + return do_pgls(pg, nspace, osd_op); + case CEPH_OSD_OP_PGLS_FILTER: + return do_pgls_filtered(pg, nspace, osd_op); + case CEPH_OSD_OP_PGNLS: + return do_pgnls(pg, nspace, osd_op); + case CEPH_OSD_OP_PGNLS_FILTER: + return do_pgnls_filtered(pg, nspace, osd_op); + default: + logger().warn("unknown op {}", ceph_osd_op_name(op.op)); + throw std::runtime_error( + fmt::format("op '{}' not supported", ceph_osd_op_name(op.op))); + } +} + +} // namespace crimson::osd diff --git a/src/crimson/osd/ops_executer.h b/src/crimson/osd/ops_executer.h new file mode 100644 index 000000000..1230b1c5a --- /dev/null +++ b/src/crimson/osd/ops_executer.h @@ -0,0 +1,629 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <memory> +#include <type_traits> +#include <utility> +#include <boost/intrusive_ptr.hpp> +#include <boost/smart_ptr/intrusive_ref_counter.hpp> +#include <fmt/os.h> +#include <seastar/core/chunked_fifo.hh> +#include <seastar/core/future.hh> +#include <seastar/core/shared_future.hh> +#include <seastar/core/shared_ptr.hh> + +#include "common/dout.h" +#include "common/map_cacher.hpp" +#include "common/static_ptr.h" +#include "messages/MOSDOp.h" +#include "os/Transaction.h" +#include "osd/osd_types.h" + +#include "crimson/common/errorator.h" +#include "crimson/common/interruptible_future.h" +#include "crimson/common/type_helpers.h" +#include "crimson/osd/osd_operations/client_request.h" +#include "crimson/osd/osd_operations/peering_event.h" +#include "crimson/osd/pg_backend.h" +#include "crimson/osd/pg_interval_interrupt_condition.h" +#include "crimson/osd/shard_services.h" + +struct ObjectState; +struct OSDOp; +class OSDriver; +class SnapMapper; + +namespace crimson::osd { +class PG; + +// OpsExecuter -- a class for executing ops targeting a certain object. 
+class OpsExecuter : public seastar::enable_lw_shared_from_this<OpsExecuter> { + friend class SnapTrimObjSubEvent; + + using call_errorator = crimson::errorator< + crimson::stateful_ec, + crimson::ct_error::enoent, + crimson::ct_error::eexist, + crimson::ct_error::enospc, + crimson::ct_error::edquot, + crimson::ct_error::cmp_fail, + crimson::ct_error::eagain, + crimson::ct_error::invarg, + crimson::ct_error::erange, + crimson::ct_error::ecanceled, + crimson::ct_error::enametoolong, + crimson::ct_error::permission_denied, + crimson::ct_error::operation_not_supported, + crimson::ct_error::input_output_error, + crimson::ct_error::value_too_large, + crimson::ct_error::file_too_large>; + using read_errorator = PGBackend::read_errorator; + using write_ertr = PGBackend::write_ertr; + using get_attr_errorator = PGBackend::get_attr_errorator; + using watch_errorator = crimson::errorator< + crimson::ct_error::enoent, + crimson::ct_error::invarg, + crimson::ct_error::not_connected, + crimson::ct_error::timed_out>; + + using call_ierrorator = + ::crimson::interruptible::interruptible_errorator< + IOInterruptCondition, call_errorator>; + using read_ierrorator = + ::crimson::interruptible::interruptible_errorator< + IOInterruptCondition, read_errorator>; + using write_iertr = + ::crimson::interruptible::interruptible_errorator< + IOInterruptCondition, write_ertr>; + using get_attr_ierrorator = + ::crimson::interruptible::interruptible_errorator< + IOInterruptCondition, get_attr_errorator>; + using watch_ierrorator = + ::crimson::interruptible::interruptible_errorator< + IOInterruptCondition, watch_errorator>; + + template <typename Errorator, typename T = void> + using interruptible_errorated_future = + ::crimson::interruptible::interruptible_errorated_future< + IOInterruptCondition, Errorator, T>; + using interruptor = + ::crimson::interruptible::interruptor<IOInterruptCondition>; + template <typename T = void> + using interruptible_future = + ::crimson::interruptible::interruptible_future< + IOInterruptCondition, T>; + +public: + // ExecutableMessage -- an interface class to allow using OpsExecuter + // with other message types than just the `MOSDOp`. The type erasure + // happens in the ctor of `OpsExecuter`. + struct ExecutableMessage { + virtual osd_reqid_t get_reqid() const = 0; + virtual utime_t get_mtime() const = 0; + virtual epoch_t get_map_epoch() const = 0; + virtual entity_inst_t get_orig_source_inst() const = 0; + virtual uint64_t get_features() const = 0; + virtual bool has_flag(uint32_t flag) const = 0; + virtual entity_name_t get_source() const = 0; + }; + + template <class ImplT> + class ExecutableMessagePimpl final : ExecutableMessage { + const ImplT* pimpl; + // In crimson, conn is independently maintained outside Message. + const crimson::net::ConnectionRef conn; + public: + ExecutableMessagePimpl(const ImplT* pimpl, + const crimson::net::ConnectionRef conn) + : pimpl(pimpl), conn(conn) { + } + + osd_reqid_t get_reqid() const final { + return pimpl->get_reqid(); + } + bool has_flag(uint32_t flag) const final { + return pimpl->has_flag(flag); + } + utime_t get_mtime() const final { + return pimpl->get_mtime(); + }; + epoch_t get_map_epoch() const final { + return pimpl->get_map_epoch(); + } + entity_inst_t get_orig_source_inst() const final { + // We can't get the origin source address from the message + // since (In Crimson) the connection is maintained + // outside of the Message. 
+ return entity_inst_t(get_source(), conn->get_peer_addr()); + } + entity_name_t get_source() const final { + return pimpl->get_source(); + } + uint64_t get_features() const final { + return pimpl->get_features(); + } + }; + + // because OpsExecuter is pretty heavy-weight object we want to ensure + // it's not copied nor even moved by accident. Performance is the sole + // reason for prohibiting that. + OpsExecuter(OpsExecuter&&) = delete; + OpsExecuter(const OpsExecuter&) = delete; + + using osd_op_errorator = crimson::compound_errorator_t< + call_errorator, + read_errorator, + write_ertr, + get_attr_errorator, + watch_errorator, + PGBackend::stat_errorator>; + using osd_op_ierrorator = + ::crimson::interruptible::interruptible_errorator< + IOInterruptCondition, osd_op_errorator>; + + object_stat_sum_t delta_stats; +private: + // an operation can be divided into two stages: main and effect-exposing + // one. The former is performed immediately on call to `do_osd_op()` while + // the later on `submit_changes()` – after successfully processing main + // stages of all involved operations. When any stage fails, none of all + // scheduled effect-exposing stages will be executed. + // when operation requires this division, some variant of `with_effect()` + // should be used. + struct effect_t { + // an effect can affect PG, i.e. create a watch timeout + virtual osd_op_errorator::future<> execute(Ref<PG> pg) = 0; + virtual ~effect_t() = default; + }; + + Ref<PG> pg; // for the sake of object class + ObjectContextRef obc; + const OpInfo& op_info; + using abstracted_msg_t = + ceph::static_ptr<ExecutableMessage, + sizeof(ExecutableMessagePimpl<void>)>; + abstracted_msg_t msg; + crimson::net::ConnectionRef conn; + std::optional<osd_op_params_t> osd_op_params; + bool user_modify = false; + ceph::os::Transaction txn; + + size_t num_read = 0; ///< count read ops + size_t num_write = 0; ///< count update ops + + SnapContext snapc; // writer snap context + struct CloningContext { + SnapSet new_snapset; + pg_log_entry_t log_entry; + + void apply_to( + std::vector<pg_log_entry_t>& log_entries, + ObjectContext& processed_obc) &&; + }; + std::unique_ptr<CloningContext> cloning_ctx; + + + /** + * execute_clone + * + * If snapc contains a snap which occurred logically after the last write + * seen by this object (see OpsExecutor::should_clone()), we first need + * make a clone of the object at its current state. execute_clone primes + * txn with that clone operation and returns an + * OpsExecutor::CloningContext which will allow us to fill in the corresponding + * metadata and log_entries once the operations have been processed. + * + * Note that this strategy differs from classic, which instead performs this + * work at the end and reorders the transaction. 
See + * PrimaryLogPG::make_writeable + * + * @param snapc [in] snapc for this operation (from the client if from the + * client, from the pool otherwise) + * @param initial_obs [in] objectstate for the object at operation start + * @param initial_snapset [in] snapset for the object at operation start + * @param backend [in,out] interface for generating mutations + * @param txn [out] transaction for the operation + */ + std::unique_ptr<CloningContext> execute_clone( + const SnapContext& snapc, + const ObjectState& initial_obs, + const SnapSet& initial_snapset, + PGBackend& backend, + ceph::os::Transaction& txn); + + + /** + * should_clone + * + * Predicate returning whether a user write with snap context snapc + * contains a snap which occurred prior to the most recent write + * on the object reflected in initial_obc. + * + * @param initial_obc [in] obc for object to be mutated + * @param snapc [in] snapc for this operation (from the client if from the + * client, from the pool otherwise) + */ + static bool should_clone( + const ObjectContext& initial_obc, + const SnapContext& snapc) { + // clone? + return initial_obc.obs.exists // both nominally and... + && !initial_obc.obs.oi.is_whiteout() // ... logically exists + && snapc.snaps.size() // there are snaps + && snapc.snaps[0] > initial_obc.ssc->snapset.seq; // existing obj is old + } + + interruptible_future<std::vector<pg_log_entry_t>> flush_clone_metadata( + std::vector<pg_log_entry_t>&& log_entries, + SnapMapper& snap_mapper, + OSDriver& osdriver, + ceph::os::Transaction& txn); + + static interruptible_future<> snap_map_remove( + const hobject_t& soid, + SnapMapper& snap_mapper, + OSDriver& osdriver, + ceph::os::Transaction& txn); + static interruptible_future<> snap_map_modify( + const hobject_t& soid, + const std::set<snapid_t>& snaps, + SnapMapper& snap_mapper, + OSDriver& osdriver, + ceph::os::Transaction& txn); + static interruptible_future<> snap_map_clone( + const hobject_t& soid, + const std::set<snapid_t>& snaps, + SnapMapper& snap_mapper, + OSDriver& osdriver, + ceph::os::Transaction& txn); + + // this gizmo could be wrapped in std::optional for the sake of lazy + // initialization. 
we don't need it for ops that doesn't have effect + // TODO: verify the init overhead of chunked_fifo + seastar::chunked_fifo<std::unique_ptr<effect_t>> op_effects; + + template <class Context, class MainFunc, class EffectFunc> + auto with_effect_on_obc( + Context&& ctx, + MainFunc&& main_func, + EffectFunc&& effect_func); + + call_ierrorator::future<> do_op_call(OSDOp& osd_op); + watch_ierrorator::future<> do_op_watch( + OSDOp& osd_op, + ObjectState& os, + ceph::os::Transaction& txn); + watch_ierrorator::future<> do_op_watch_subop_watch( + OSDOp& osd_op, + ObjectState& os, + ceph::os::Transaction& txn); + watch_ierrorator::future<> do_op_watch_subop_reconnect( + OSDOp& osd_op, + ObjectState& os, + ceph::os::Transaction& txn); + watch_ierrorator::future<> do_op_watch_subop_unwatch( + OSDOp& osd_op, + ObjectState& os, + ceph::os::Transaction& txn); + watch_ierrorator::future<> do_op_watch_subop_ping( + OSDOp& osd_op, + ObjectState& os, + ceph::os::Transaction& txn); + watch_ierrorator::future<> do_op_list_watchers( + OSDOp& osd_op, + const ObjectState& os); + watch_ierrorator::future<> do_op_notify( + OSDOp& osd_op, + const ObjectState& os); + watch_ierrorator::future<> do_op_notify_ack( + OSDOp& osd_op, + const ObjectState& os); + call_errorator::future<> do_assert_ver( + OSDOp& osd_op, + const ObjectState& os); + + using list_snaps_ertr = read_errorator::extend< + crimson::ct_error::invarg>; + using list_snaps_iertr = ::crimson::interruptible::interruptible_errorator< + ::crimson::osd::IOInterruptCondition, + list_snaps_ertr>; + list_snaps_iertr::future<> do_list_snaps( + OSDOp& osd_op, + const ObjectState& os, + const SnapSet& ss); + + template <class Func> + auto do_const_op(Func&& f); + + template <class Func> + auto do_read_op(Func&& f) { + ++num_read; + // TODO: pass backend as read-only + return do_const_op(std::forward<Func>(f)); + } + + template <class Func> + auto do_snapset_op(Func&& f) { + ++num_read; + return std::invoke( + std::forward<Func>(f), + std::as_const(obc->obs), + std::as_const(obc->ssc->snapset)); + } + + enum class modified_by { + user, + sys, + }; + + template <class Func> + auto do_write_op(Func&& f, modified_by m = modified_by::user); + + decltype(auto) dont_do_legacy_op() { + return crimson::ct_error::operation_not_supported::make(); + } + + interruptible_errorated_future<osd_op_errorator> + do_execute_op(OSDOp& osd_op); + + OpsExecuter(Ref<PG> pg, + ObjectContextRef obc, + const OpInfo& op_info, + abstracted_msg_t&& msg, + crimson::net::ConnectionRef conn, + const SnapContext& snapc); + +public: + template <class MsgT> + OpsExecuter(Ref<PG> pg, + ObjectContextRef obc, + const OpInfo& op_info, + const MsgT& msg, + crimson::net::ConnectionRef conn, + const SnapContext& snapc) + : OpsExecuter( + std::move(pg), + std::move(obc), + op_info, + abstracted_msg_t{ + std::in_place_type_t<ExecutableMessagePimpl<MsgT>>{}, + &msg, + conn}, + conn, + snapc) { + } + + template <class Func> + struct RollbackHelper; + + template <class Func> + RollbackHelper<Func> create_rollbacker(Func&& func); + + interruptible_errorated_future<osd_op_errorator> + execute_op(OSDOp& osd_op); + + using rep_op_fut_tuple = + std::tuple<interruptible_future<>, osd_op_ierrorator::future<>>; + using rep_op_fut_t = + interruptible_future<rep_op_fut_tuple>; + template <typename MutFunc> + rep_op_fut_t flush_changes_n_do_ops_effects( + const std::vector<OSDOp>& ops, + SnapMapper& snap_mapper, + OSDriver& osdriver, + MutFunc&& mut_func) &&; + std::vector<pg_log_entry_t> prepare_transaction( + const 
std::vector<OSDOp>& ops); + void fill_op_params_bump_pg_version(); + + ObjectContextRef get_obc() const { + return obc; + } + + const object_info_t &get_object_info() const { + return obc->obs.oi; + } + const hobject_t &get_target() const { + return get_object_info().soid; + } + + const auto& get_message() const { + return *msg; + } + + size_t get_processed_rw_ops_num() const { + return num_read + num_write; + } + + uint32_t get_pool_stripe_width() const; + + bool has_seen_write() const { + return num_write > 0; + } + + object_stat_sum_t& get_stats(){ + return delta_stats; + } + + version_t get_last_user_version() const; + + std::pair<object_info_t, ObjectContextRef> prepare_clone( + const hobject_t& coid); + + void apply_stats(); +}; + +template <class Context, class MainFunc, class EffectFunc> +auto OpsExecuter::with_effect_on_obc( + Context&& ctx, + MainFunc&& main_func, + EffectFunc&& effect_func) +{ + using context_t = std::decay_t<Context>; + // the language offers implicit conversion to pointer-to-function for + // lambda only when it's closureless. We enforce this restriction due + // the fact that `flush_changes()` std::moves many executer's parts. + using allowed_effect_func_t = + seastar::future<> (*)(context_t&&, ObjectContextRef, Ref<PG>); + static_assert(std::is_convertible_v<EffectFunc, allowed_effect_func_t>, + "with_effect function is not allowed to capture"); + struct task_t final : effect_t { + context_t ctx; + EffectFunc effect_func; + ObjectContextRef obc; + + task_t(Context&& ctx, EffectFunc&& effect_func, ObjectContextRef obc) + : ctx(std::move(ctx)), + effect_func(std::move(effect_func)), + obc(std::move(obc)) { + } + osd_op_errorator::future<> execute(Ref<PG> pg) final { + return std::move(effect_func)(std::move(ctx), + std::move(obc), + std::move(pg)); + } + }; + auto task = + std::make_unique<task_t>(std::move(ctx), std::move(effect_func), obc); + auto& ctx_ref = task->ctx; + op_effects.emplace_back(std::move(task)); + return std::forward<MainFunc>(main_func)(ctx_ref); +} + +template <typename MutFunc> +OpsExecuter::rep_op_fut_t +OpsExecuter::flush_changes_n_do_ops_effects( + const std::vector<OSDOp>& ops, + SnapMapper& snap_mapper, + OSDriver& osdriver, + MutFunc&& mut_func) && +{ + const bool want_mutate = !txn.empty(); + // osd_op_params are instantiated by every wr-like operation. 
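  // Editor's note (review annotation, not part of the upstream change): the
  // assert below encodes the implication "want_mutate implies osd_op_params
  // is engaged" using the identity (A implies B) == (!A || B). Kept as a
  // comment so the function body stays unchanged; with hypothetical values:
  //   bool a = true;                 // want_mutate
  //   std::optional<int> b = 42;     // osd_op_params engaged
  //   assert(b || !a);               // fails only when a && !b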
+ assert(osd_op_params || !want_mutate); + assert(obc); + rep_op_fut_t maybe_mutated = + interruptor::make_ready_future<rep_op_fut_tuple>( + seastar::now(), + interruptor::make_interruptible(osd_op_errorator::now())); + if (cloning_ctx) { + ceph_assert(want_mutate); + } + if (want_mutate) { + if (user_modify) { + osd_op_params->user_at_version = osd_op_params->at_version.version; + } + maybe_mutated = flush_clone_metadata( + prepare_transaction(ops), + snap_mapper, + osdriver, + txn + ).then_interruptible([mut_func=std::move(mut_func), + this](auto&& log_entries) mutable { + auto [submitted, all_completed] = + std::forward<MutFunc>(mut_func)(std::move(txn), + std::move(obc), + std::move(*osd_op_params), + std::move(log_entries)); + return interruptor::make_ready_future<rep_op_fut_tuple>( + std::move(submitted), + osd_op_ierrorator::future<>(std::move(all_completed))); + }); + } + apply_stats(); + + if (__builtin_expect(op_effects.empty(), true)) { + return maybe_mutated; + } else { + return maybe_mutated.then_unpack_interruptible( + // need extra ref pg due to apply_stats() which can be executed after + // informing snap mapper + [this, pg=this->pg](auto&& submitted, auto&& all_completed) mutable { + return interruptor::make_ready_future<rep_op_fut_tuple>( + std::move(submitted), + all_completed.safe_then_interruptible([this, pg=std::move(pg)] { + // let's do the cleaning of `op_effects` in destructor + return interruptor::do_for_each(op_effects, + [pg=std::move(pg)](auto& op_effect) { + return op_effect->execute(pg); + }); + })); + }); + } +} + +template <class Func> +struct OpsExecuter::RollbackHelper { + interruptible_future<> rollback_obc_if_modified(const std::error_code& e); + ObjectContextRef get_obc() const { + assert(ox); + return ox->obc; + } + seastar::lw_shared_ptr<OpsExecuter> ox; + Func func; +}; + +template <class Func> +inline OpsExecuter::RollbackHelper<Func> +OpsExecuter::create_rollbacker(Func&& func) { + return {shared_from_this(), std::forward<Func>(func)}; +} + + +template <class Func> +OpsExecuter::interruptible_future<> +OpsExecuter::RollbackHelper<Func>::rollback_obc_if_modified( + const std::error_code& e) +{ + // Oops, an operation had failed. do_osd_ops() altogether with + // OpsExecuter already dropped the ObjectStore::Transaction if + // there was any. However, this is not enough to completely + // rollback as we gave OpsExecuter the very single copy of `obc` + // we maintain and we did it for both reading and writing. + // Now all modifications must be reverted. + // + // Let's just reload from the store. Evicting from the shared + // LRU would be tricky as next MOSDOp (the one at `get_obc` + // phase) could actually already finished the lookup. Fortunately, + // this is supposed to live on cold paths, so performance is not + // a concern -- simplicity wins. + // + // The conditional's purpose is to efficiently handle hot errors + // which may appear as a result of e.g. CEPH_OSD_OP_CMPXATTR or + // CEPH_OSD_OP_OMAP_CMP. These are read-like ops and clients + // typically append them before any write. If OpsExecuter hasn't + // seen any modifying operation, `obc` is supposed to be kept + // unchanged. + assert(ox); + const auto need_rollback = ox->has_seen_write(); + crimson::get_logger(ceph_subsys_osd).debug( + "{}: object {} got error {}, need_rollback={}", + __func__, + ox->obc->get_oid(), + e, + need_rollback); + return need_rollback ? func(*ox->obc) : interruptor::now(); +} + +// PgOpsExecuter -- a class for executing ops targeting a certain PG. 
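// Editor's note: illustrative sketch added in review, not part of the
// upstream change. The static_assert in with_effect_on_obc() earlier in this
// header relies on a core language rule: only a capture-less lambda converts
// to a plain function pointer, which is how the "effect function must not
// capture" restriction is enforced at compile time. A minimal self-contained
// demonstration (hypothetical names; <type_traits> is already included):
using example_effect_fn_t = int (*)(int);
inline constexpr auto example_capture_free = [](int x) { return x + 1; };
static_assert(
  std::is_convertible_v<decltype(example_capture_free), example_effect_fn_t>,
  "a capture-less lambda converts to a function pointer");
// By contrast, a lambda such as `[y](int x) { return x + y; }` captures `y`
// and does not convert to example_effect_fn_t, so the same check would fail.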
+class PgOpsExecuter { + template <typename T = void> + using interruptible_future = + ::crimson::interruptible::interruptible_future< + IOInterruptCondition, T>; + +public: + PgOpsExecuter(const PG& pg, const MOSDOp& msg) + : pg(pg), nspace(msg.get_hobj().nspace) { + } + + interruptible_future<> execute_op(OSDOp& osd_op); + +private: + const PG& pg; + const std::string& nspace; +}; + +} // namespace crimson::osd diff --git a/src/crimson/osd/osd.cc b/src/crimson/osd/osd.cc new file mode 100644 index 000000000..cfe4f54ab --- /dev/null +++ b/src/crimson/osd/osd.cc @@ -0,0 +1,1357 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "osd.h" + +#include <sys/utsname.h> + +#include <boost/iterator/counting_iterator.hpp> +#include <boost/range/join.hpp> +#include <fmt/format.h> +#include <fmt/os.h> +#include <fmt/ostream.h> +#include <seastar/core/timer.hh> + +#include "common/pick_address.h" +#include "include/util.h" + +#include "messages/MCommand.h" +#include "messages/MOSDBeacon.h" +#include "messages/MOSDBoot.h" +#include "messages/MOSDMap.h" +#include "messages/MOSDMarkMeDown.h" +#include "messages/MOSDOp.h" +#include "messages/MOSDPeeringOp.h" +#include "messages/MOSDPGCreate2.h" +#include "messages/MOSDPGUpdateLogMissing.h" +#include "messages/MOSDPGUpdateLogMissingReply.h" +#include "messages/MOSDRepOpReply.h" +#include "messages/MOSDScrub2.h" +#include "messages/MPGStats.h" + +#include "os/Transaction.h" +#include "osd/ClassHandler.h" +#include "osd/OSDCap.h" +#include "osd/PGPeeringEvent.h" +#include "osd/PeeringState.h" + +#include "crimson/admin/osd_admin.h" +#include "crimson/admin/pg_commands.h" +#include "crimson/common/buffer_io.h" +#include "crimson/common/exception.h" +#include "crimson/mon/MonClient.h" +#include "crimson/net/Connection.h" +#include "crimson/net/Messenger.h" +#include "crimson/os/futurized_collection.h" +#include "crimson/os/futurized_store.h" +#include "crimson/osd/heartbeat.h" +#include "crimson/osd/osd_meta.h" +#include "crimson/osd/pg.h" +#include "crimson/osd/pg_backend.h" +#include "crimson/osd/pg_meta.h" +#include "crimson/osd/osd_operations/client_request.h" +#include "crimson/osd/osd_operations/peering_event.h" +#include "crimson/osd/osd_operations/pg_advance_map.h" +#include "crimson/osd/osd_operations/recovery_subrequest.h" +#include "crimson/osd/osd_operations/replicated_request.h" +#include "crimson/osd/osd_operation_external_tracking.h" +#include "crimson/crush/CrushLocation.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_osd); + } + static constexpr int TICK_INTERVAL = 1; +} + +using std::make_unique; +using std::map; +using std::pair; +using std::string; +using std::unique_ptr; +using std::vector; + +using crimson::common::local_conf; +using crimson::os::FuturizedStore; + +namespace crimson::osd { + +OSD::OSD(int id, uint32_t nonce, + seastar::abort_source& abort_source, + crimson::os::FuturizedStore& store, + crimson::net::MessengerRef cluster_msgr, + crimson::net::MessengerRef public_msgr, + crimson::net::MessengerRef hb_front_msgr, + crimson::net::MessengerRef hb_back_msgr) + : whoami{id}, + nonce{nonce}, + abort_source{abort_source}, + // do this in background + beacon_timer{[this] { (void)send_beacon(); }}, + cluster_msgr{cluster_msgr}, + public_msgr{public_msgr}, + hb_front_msgr{hb_front_msgr}, + hb_back_msgr{hb_back_msgr}, + monc{new crimson::mon::Client{*public_msgr, *this}}, + mgrc{new crimson::mgr::Client{*public_msgr, *this}}, + 
store{store}, + pg_shard_manager{osd_singleton_state, + shard_services, + pg_to_shard_mappings}, + // do this in background -- continuation rearms timer when complete + tick_timer{[this] { + std::ignore = update_heartbeat_peers( + ).then([this] { + update_stats(); + tick_timer.arm( + std::chrono::seconds(TICK_INTERVAL)); + }); + }}, + asok{seastar::make_lw_shared<crimson::admin::AdminSocket>()}, + log_client(cluster_msgr.get(), LogClient::NO_FLAGS), + clog(log_client.create_channel()) +{ + ceph_assert(seastar::this_shard_id() == PRIMARY_CORE); + for (auto msgr : {std::ref(cluster_msgr), std::ref(public_msgr), + std::ref(hb_front_msgr), std::ref(hb_back_msgr)}) { + msgr.get()->set_auth_server(monc.get()); + msgr.get()->set_auth_client(monc.get()); + } + + if (local_conf()->osd_open_classes_on_start) { + const int r = ClassHandler::get_instance().open_all_classes(); + if (r) { + logger().warn("{} warning: got an error loading one or more classes: {}", + __func__, cpp_strerror(r)); + } + } + logger().info("{}: nonce is {}", __func__, nonce); + monc->set_log_client(&log_client); + clog->set_log_to_monitors(true); +} + +OSD::~OSD() = default; + +namespace { +// Initial features in new superblock. +// Features here are also automatically upgraded +CompatSet get_osd_initial_compat_set() +{ + CompatSet::FeatureSet ceph_osd_feature_compat; + CompatSet::FeatureSet ceph_osd_feature_ro_compat; + CompatSet::FeatureSet ceph_osd_feature_incompat; + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGINFO); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_OLOC); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEC); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BIGINFO); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HINTS); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGMETA); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_MISSING); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_FASTINFO); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_RECOVERY_DELETES); + return CompatSet(ceph_osd_feature_compat, + ceph_osd_feature_ro_compat, + ceph_osd_feature_incompat); +} +} + +seastar::future<> OSD::open_meta_coll() +{ + ceph_assert(seastar::this_shard_id() == PRIMARY_CORE); + return store.get_sharded_store().open_collection( + coll_t::meta() + ).then([this](auto ch) { + pg_shard_manager.init_meta_coll(ch, store.get_sharded_store()); + return seastar::now(); + }); +} + +seastar::future<OSDMeta> OSD::open_or_create_meta_coll(FuturizedStore &store) +{ + return store.get_sharded_store().open_collection(coll_t::meta()).then([&store](auto ch) { + if (!ch) { + return store.get_sharded_store().create_new_collection( + coll_t::meta() + ).then([&store](auto ch) { + return OSDMeta(ch, store.get_sharded_store()); + }); + } else { + return seastar::make_ready_future<OSDMeta>(ch, store.get_sharded_store()); + } + }); +} + +seastar::future<> OSD::mkfs( + FuturizedStore &store, + unsigned whoami, + uuid_d osd_uuid, + uuid_d cluster_fsid, + std::string osdspec_affinity) +{ + return 
store.start().then([&store, osd_uuid] {
+    return store.mkfs(osd_uuid).handle_error(
+      crimson::stateful_ec::handle([] (const auto& ec) {
+        logger().error("error creating empty object store in {}: ({}) {}",
+                       local_conf().get_val<std::string>("osd_data"),
+                       ec.value(), ec.message());
+        std::exit(EXIT_FAILURE);
+      }));
+  }).then([&store] {
+    return store.mount().handle_error(
+      crimson::stateful_ec::handle([](const auto& ec) {
+        logger().error("error mounting object store in {}: ({}) {}",
+                       local_conf().get_val<std::string>("osd_data"),
+                       ec.value(), ec.message());
+        std::exit(EXIT_FAILURE);
+      }));
+  }).then([&store] {
+    return open_or_create_meta_coll(store);
+  }).then([&store, whoami, cluster_fsid](auto meta_coll) {
+    OSDSuperblock superblock;
+    superblock.cluster_fsid = cluster_fsid;
+    superblock.osd_fsid = store.get_fsid();
+    superblock.whoami = whoami;
+    superblock.compat_features = get_osd_initial_compat_set();
+    return _write_superblock(
+      store, std::move(meta_coll), std::move(superblock));
+  }).then([&store, cluster_fsid] {
+    return store.write_meta("ceph_fsid", cluster_fsid.to_string());
+  }).then([&store] {
+    return store.write_meta("magic", CEPH_OSD_ONDISK_MAGIC);
+  }).then([&store, whoami] {
+    return store.write_meta("whoami", std::to_string(whoami));
+  }).then([&store] {
+    return _write_key_meta(store);
+  }).then([&store, osdspec_affinity=std::move(osdspec_affinity)] {
+    return store.write_meta("osdspec_affinity", osdspec_affinity);
+  }).then([&store] {
+    return store.write_meta("ready", "ready");
+  }).then([&store, whoami, cluster_fsid] {
+    fmt::print("created object store {} for osd.{} fsid {}\n",
+               local_conf().get_val<std::string>("osd_data"),
+               whoami, cluster_fsid);
+    return store.umount();
+  }).then([&store] {
+    return store.stop();
+  });
+}
+
+seastar::future<> OSD::_write_superblock(
+  FuturizedStore &store,
+  OSDMeta meta_coll,
+  OSDSuperblock superblock)
+{
+  return seastar::do_with(
+    std::move(meta_coll),
+    std::move(superblock),
+    [&store](auto &meta_coll, auto &superblock) {
+      return meta_coll.load_superblock(
+      ).safe_then([&superblock](OSDSuperblock&& sb) {
+        if (sb.cluster_fsid != superblock.cluster_fsid) {
+          logger().error("provided cluster fsid {} != superblock's {}",
+                         sb.cluster_fsid, superblock.cluster_fsid);
+          throw std::invalid_argument("mismatched fsid");
+        }
+        if (sb.whoami != superblock.whoami) {
+          logger().error("provided osd id {} != superblock's {}",
+                         sb.whoami, superblock.whoami);
+          throw std::invalid_argument("mismatched osd id");
+        }
+      }).handle_error(
+        crimson::ct_error::enoent::handle([&store, &meta_coll, &superblock] {
+          // meta collection does not exist yet; create the superblock
+          logger().info(
+            "{} writing superblock cluster_fsid {} osd_fsid {}",
+            "_write_superblock",
+            superblock.cluster_fsid,
+            superblock.osd_fsid);
+          ceph::os::Transaction t;
+          meta_coll.create(t);
+          meta_coll.store_superblock(t, superblock);
+          logger().debug("OSD::_write_superblock: do_transaction...");
+          return store.get_sharded_store().do_transaction(
+            meta_coll.collection(),
+            std::move(t));
+        }),
+        crimson::ct_error::assert_all("_write_superblock error")
+      );
+    });
+}
+
+// this `to_string` sits in the `crimson::osd` namespace, so we don't break
+// the language rule on not overloading in `std::`.
+static std::string to_string(const seastar::temporary_buffer<char>& temp_buf) +{ + return {temp_buf.get(), temp_buf.size()}; +} + +seastar::future<> OSD::_write_key_meta(FuturizedStore &store) +{ + + if (auto key = local_conf().get_val<std::string>("key"); !std::empty(key)) { + return store.write_meta("osd_key", key); + } else if (auto keyfile = local_conf().get_val<std::string>("keyfile"); + !std::empty(keyfile)) { + return read_file(keyfile).then([&store](const auto& temp_buf) { + // it's on a truly cold path, so don't worry about memcpy. + return store.write_meta("osd_key", to_string(temp_buf)); + }).handle_exception([keyfile] (auto ep) { + logger().error("_write_key_meta: failed to handle keyfile {}: {}", + keyfile, ep); + ceph_abort(); + }); + } else { + return seastar::now(); + } +} + +namespace { + entity_addrvec_t pick_addresses(int what) { + entity_addrvec_t addrs; + crimson::common::CephContext cct; + // we're interested solely in v2; crimson doesn't do v1 + const auto flags = what | CEPH_PICK_ADDRESS_MSGR2; + if (int r = ::pick_addresses(&cct, flags, &addrs, -1); r < 0) { + throw std::runtime_error("failed to pick address"); + } + for (auto addr : addrs.v) { + logger().info("picked address {}", addr); + } + return addrs; + } + std::pair<entity_addrvec_t, bool> + replace_unknown_addrs(entity_addrvec_t maybe_unknowns, + const entity_addrvec_t& knowns) { + bool changed = false; + auto maybe_replace = [&](entity_addr_t addr) { + if (!addr.is_blank_ip()) { + return addr; + } + for (auto& b : knowns.v) { + if (addr.get_family() == b.get_family()) { + auto a = b; + a.set_nonce(addr.get_nonce()); + a.set_type(addr.get_type()); + a.set_port(addr.get_port()); + changed = true; + return a; + } + } + throw std::runtime_error("failed to replace unknown address"); + }; + entity_addrvec_t replaced; + std::transform(maybe_unknowns.v.begin(), + maybe_unknowns.v.end(), + std::back_inserter(replaced.v), + maybe_replace); + return {replaced, changed}; + } +} + +seastar::future<> OSD::start() +{ + logger().info("start"); + + startup_time = ceph::mono_clock::now(); + ceph_assert(seastar::this_shard_id() == PRIMARY_CORE); + return store.start().then([this] { + return pg_to_shard_mappings.start(0, seastar::smp::count + ).then([this] { + return osd_singleton_state.start_single( + whoami, std::ref(*cluster_msgr), std::ref(*public_msgr), + std::ref(*monc), std::ref(*mgrc)); + }).then([this] { + return osd_states.start(); + }).then([this] { + ceph::mono_time startup_time = ceph::mono_clock::now(); + return shard_services.start( + std::ref(osd_singleton_state), + std::ref(pg_to_shard_mappings), + whoami, + startup_time, + osd_singleton_state.local().perf, + osd_singleton_state.local().recoverystate_perf, + std::ref(store), + std::ref(osd_states)); + }); + }).then([this] { + heartbeat.reset(new Heartbeat{ + whoami, get_shard_services(), + *monc, *hb_front_msgr, *hb_back_msgr}); + return store.mount().handle_error( + crimson::stateful_ec::handle([] (const auto& ec) { + logger().error("error mounting object store in {}: ({}) {}", + local_conf().get_val<std::string>("osd_data"), + ec.value(), ec.message()); + std::exit(EXIT_FAILURE); + })); + }).then([this] { + return open_meta_coll(); + }).then([this] { + return pg_shard_manager.get_meta_coll().load_superblock( + ).handle_error( + crimson::ct_error::assert_all("open_meta_coll error") + ); + }).then([this](OSDSuperblock&& sb) { + superblock = std::move(sb); + pg_shard_manager.set_superblock(superblock); + return 
pg_shard_manager.get_local_map(superblock.current_epoch); + }).then([this](OSDMapService::local_cached_map_t&& map) { + osdmap = make_local_shared_foreign(OSDMapService::local_cached_map_t(map)); + return pg_shard_manager.update_map(std::move(map)); + }).then([this] { + return shard_services.invoke_on_all([this](auto &local_service) { + local_service.local_state.osdmap_gate.got_map(osdmap->get_epoch()); + }); + }).then([this] { + bind_epoch = osdmap->get_epoch(); + return pg_shard_manager.load_pgs(store); + }).then([this] { + uint64_t osd_required = + CEPH_FEATURE_UID | + CEPH_FEATURE_PGID64 | + CEPH_FEATURE_OSDENC; + using crimson::net::SocketPolicy; + + public_msgr->set_default_policy(SocketPolicy::stateless_server(0)); + public_msgr->set_policy(entity_name_t::TYPE_MON, + SocketPolicy::lossy_client(osd_required)); + public_msgr->set_policy(entity_name_t::TYPE_MGR, + SocketPolicy::lossy_client(osd_required)); + public_msgr->set_policy(entity_name_t::TYPE_OSD, + SocketPolicy::stateless_server(0)); + + cluster_msgr->set_default_policy(SocketPolicy::stateless_server(0)); + cluster_msgr->set_policy(entity_name_t::TYPE_MON, + SocketPolicy::lossy_client(0)); + cluster_msgr->set_policy(entity_name_t::TYPE_OSD, + SocketPolicy::lossless_peer(osd_required)); + cluster_msgr->set_policy(entity_name_t::TYPE_CLIENT, + SocketPolicy::stateless_server(0)); + + crimson::net::dispatchers_t dispatchers{this, monc.get(), mgrc.get()}; + return seastar::when_all_succeed( + cluster_msgr->bind(pick_addresses(CEPH_PICK_ADDRESS_CLUSTER)) + .safe_then([this, dispatchers]() mutable { + return cluster_msgr->start(dispatchers); + }, crimson::net::Messenger::bind_ertr::all_same_way( + [] (const std::error_code& e) { + logger().error("cluster messenger bind(): {}", e); + ceph_abort(); + })), + public_msgr->bind(pick_addresses(CEPH_PICK_ADDRESS_PUBLIC)) + .safe_then([this, dispatchers]() mutable { + return public_msgr->start(dispatchers); + }, crimson::net::Messenger::bind_ertr::all_same_way( + [] (const std::error_code& e) { + logger().error("public messenger bind(): {}", e); + ceph_abort(); + }))); + }).then_unpack([this] { + return seastar::when_all_succeed(monc->start(), + mgrc->start()); + }).then_unpack([this] { + return _add_me_to_crush(); + }).then([this] { + monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0); + monc->sub_want("mgrmap", 0, 0); + monc->sub_want("osdmap", 0, 0); + return monc->renew_subs(); + }).then([this] { + if (auto [addrs, changed] = + replace_unknown_addrs(cluster_msgr->get_myaddrs(), + public_msgr->get_myaddrs()); changed) { + logger().debug("replacing unkwnown addrs of cluster messenger"); + cluster_msgr->set_myaddrs(addrs); + } + return heartbeat->start(pick_addresses(CEPH_PICK_ADDRESS_PUBLIC), + pick_addresses(CEPH_PICK_ADDRESS_CLUSTER)); + }).then([this] { + // create the admin-socket server, and the objects that register + // to handle incoming commands + return start_asok_admin(); + }).then([this] { + return log_client.set_fsid(monc->get_fsid()); + }).then([this] { + return start_boot(); + }); +} + +seastar::future<> OSD::start_boot() +{ + pg_shard_manager.set_preboot(); + return monc->get_version("osdmap").then([this](auto&& ret) { + auto [newest, oldest] = ret; + return _preboot(oldest, newest); + }); +} + +seastar::future<> OSD::_preboot(version_t oldest, version_t newest) +{ + logger().info("osd.{}: _preboot", whoami); + if (osdmap->get_epoch() == 0) { + logger().info("waiting for initial osdmap"); + } else if (osdmap->is_destroyed(whoami)) { + logger().warn("osdmap says I am 
destroyed"); + // provide a small margin so we don't livelock seeing if we + // un-destroyed ourselves. + if (osdmap->get_epoch() > newest - 1) { + throw std::runtime_error("i am destroyed"); + } + } else if (osdmap->is_noup(whoami)) { + logger().warn("osdmap NOUP flag is set, waiting for it to clear"); + } else if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE)) { + logger().error("osdmap SORTBITWISE OSDMap flag is NOT set; please set it"); + } else if (osdmap->require_osd_release < ceph_release_t::octopus) { + logger().error("osdmap require_osd_release < octopus; please upgrade to octopus"); + } else if (false) { + // TODO: update mon if current fullness state is different from osdmap + } else if (version_t n = local_conf()->osd_map_message_max; + osdmap->get_epoch() >= oldest - 1 && + osdmap->get_epoch() + n > newest) { + return _send_boot(); + } + // get all the latest maps + if (osdmap->get_epoch() + 1 >= oldest) { + return get_shard_services().osdmap_subscribe(osdmap->get_epoch() + 1, false); + } else { + return get_shard_services().osdmap_subscribe(oldest - 1, true); + } +} + +seastar::future<> OSD::_send_boot() +{ + pg_shard_manager.set_booting(); + + entity_addrvec_t public_addrs = public_msgr->get_myaddrs(); + entity_addrvec_t cluster_addrs = cluster_msgr->get_myaddrs(); + entity_addrvec_t hb_back_addrs = heartbeat->get_back_addrs(); + entity_addrvec_t hb_front_addrs = heartbeat->get_front_addrs(); + if (cluster_msgr->set_addr_unknowns(public_addrs)) { + cluster_addrs = cluster_msgr->get_myaddrs(); + } + if (heartbeat->get_back_msgr().set_addr_unknowns(cluster_addrs)) { + hb_back_addrs = heartbeat->get_back_addrs(); + } + if (heartbeat->get_front_msgr().set_addr_unknowns(public_addrs)) { + hb_front_addrs = heartbeat->get_front_addrs(); + } + logger().info("hb_back_msgr: {}", hb_back_addrs); + logger().info("hb_front_msgr: {}", hb_front_addrs); + logger().info("cluster_msgr: {}", cluster_addrs); + + auto m = crimson::make_message<MOSDBoot>(superblock, + osdmap->get_epoch(), + boot_epoch, + hb_back_addrs, + hb_front_addrs, + cluster_addrs, + CEPH_FEATURES_ALL); + collect_sys_info(&m->metadata, NULL); + + // See OSDMonitor::preprocess_boot, prevents boot without allow_crimson + // OSDMap flag + m->metadata["osd_type"] = "crimson"; + return monc->send_message(std::move(m)); +} + +seastar::future<> OSD::_add_me_to_crush() +{ + if (!local_conf().get_val<bool>("osd_crush_update_on_start")) { + return seastar::now(); + } + auto get_weight = [this] { + if (auto w = local_conf().get_val<double>("osd_crush_initial_weight"); + w >= 0) { + return seastar::make_ready_future<double>(w); + } else { + return store.stat().then([](auto st) { + auto total = st.total; + return seastar::make_ready_future<double>( + std::max(.00001, + double(total) / double(1ull << 40))); // TB + }); + } + }; + return get_weight().then([this](auto weight) { + const crimson::crush::CrushLocation loc; + return seastar::do_with( + std::move(loc), + [this, weight] (crimson::crush::CrushLocation& loc) { + return loc.init_on_startup().then([this, weight, &loc]() { + logger().info("crush location is {}", loc); + string cmd = fmt::format(R"({{ + "prefix": "osd crush create-or-move", + "id": {}, + "weight": {:.4f}, + "args": [{}] + }})", whoami, weight, loc); + return monc->run_command(std::move(cmd), {}); + }); + }); + }).then([](auto&& command_result) { + [[maybe_unused]] auto [code, message, out] = std::move(command_result); + if (code) { + logger().warn("fail to add to crush: {} ({})", message, code); + throw 
std::runtime_error("fail to add to crush"); + } else { + logger().info("added to crush: {}", message); + } + return seastar::now(); + }); +} + +seastar::future<> OSD::handle_command( + crimson::net::ConnectionRef conn, + Ref<MCommand> m) +{ + ceph_assert(seastar::this_shard_id() == PRIMARY_CORE); + return asok->handle_command(conn, std::move(m)); +} + +/* + The OSD's Admin Socket object created here has two servers (i.e. - blocks of commands + to handle) registered to it: + - OSD's specific commands are handled by the OSD object; + - there are some common commands registered to be directly handled by the AdminSocket object + itself. +*/ +seastar::future<> OSD::start_asok_admin() +{ + auto asok_path = local_conf().get_val<std::string>("admin_socket"); + using namespace crimson::admin; + return asok->start(asok_path).then([this] { + asok->register_admin_commands(); + asok->register_command(make_asok_hook<OsdStatusHook>(std::as_const(*this))); + asok->register_command(make_asok_hook<SendBeaconHook>(*this)); + asok->register_command(make_asok_hook<FlushPgStatsHook>(*this)); + asok->register_command( + make_asok_hook<DumpPGStateHistory>(std::as_const(pg_shard_manager))); + asok->register_command(make_asok_hook<DumpMetricsHook>()); + asok->register_command(make_asok_hook<DumpPerfCountersHook>()); + asok->register_command(make_asok_hook<InjectDataErrorHook>(get_shard_services())); + asok->register_command(make_asok_hook<InjectMDataErrorHook>(get_shard_services())); + // PG commands + asok->register_command(make_asok_hook<pg::QueryCommand>(*this)); + asok->register_command(make_asok_hook<pg::MarkUnfoundLostCommand>(*this)); + // ops commands + asok->register_command( + make_asok_hook<DumpInFlightOpsHook>( + std::as_const(pg_shard_manager))); + asok->register_command( + make_asok_hook<DumpHistoricOpsHook>( + std::as_const(get_shard_services().get_registry()))); + asok->register_command( + make_asok_hook<DumpSlowestHistoricOpsHook>( + std::as_const(get_shard_services().get_registry()))); + asok->register_command( + make_asok_hook<DumpRecoveryReservationsHook>(get_shard_services())); + }); +} + +seastar::future<> OSD::stop() +{ + logger().info("stop"); + beacon_timer.cancel(); + tick_timer.cancel(); + // see also OSD::shutdown() + return prepare_to_stop().then([this] { + return pg_shard_manager.set_stopping(); + }).then([this] { + logger().debug("prepared to stop"); + public_msgr->stop(); + cluster_msgr->stop(); + auto gate_close_fut = gate.close(); + return asok->stop().then([this] { + return heartbeat->stop(); + }).then([this] { + return pg_shard_manager.stop_registries(); + }).then([this] { + return store.umount(); + }).then([this] { + return store.stop(); + }).then([this] { + return pg_shard_manager.stop_pgs(); + }).then([this] { + return monc->stop(); + }).then([this] { + return mgrc->stop(); + }).then([this] { + return shard_services.stop(); + }).then([this] { + return osd_states.stop(); + }).then([this] { + return osd_singleton_state.stop(); + }).then([this] { + return pg_to_shard_mappings.stop(); + }).then([fut=std::move(gate_close_fut)]() mutable { + return std::move(fut); + }).then([this] { + return when_all_succeed( + public_msgr->shutdown(), + cluster_msgr->shutdown()).discard_result(); + }).handle_exception([](auto ep) { + logger().error("error while stopping osd: {}", ep); + }); + }); +} + +void OSD::dump_status(Formatter* f) const +{ + f->dump_stream("cluster_fsid") << superblock.cluster_fsid; + f->dump_stream("osd_fsid") << superblock.osd_fsid; + f->dump_unsigned("whoami", 
superblock.whoami); + f->dump_string("state", pg_shard_manager.get_osd_state_string()); + f->dump_unsigned("oldest_map", superblock.oldest_map); + f->dump_unsigned("cluster_osdmap_trim_lower_bound", + superblock.cluster_osdmap_trim_lower_bound); + f->dump_unsigned("newest_map", superblock.newest_map); + f->dump_unsigned("num_pgs", pg_shard_manager.get_num_pgs()); +} + +void OSD::print(std::ostream& out) const +{ + out << "{osd." << superblock.whoami << " " + << superblock.osd_fsid << " [" << superblock.oldest_map + << "," << superblock.newest_map << "] " + << "tlb:" << superblock.cluster_osdmap_trim_lower_bound + << " pgs:" << pg_shard_manager.get_num_pgs() + << "}"; +} + +std::optional<seastar::future<>> +OSD::ms_dispatch(crimson::net::ConnectionRef conn, MessageRef m) +{ + if (pg_shard_manager.is_stopping()) { + return seastar::now(); + } + auto maybe_ret = do_ms_dispatch(conn, std::move(m)); + if (!maybe_ret.has_value()) { + return std::nullopt; + } + + gate.dispatch_in_background( + __func__, *this, [ret=std::move(maybe_ret.value())]() mutable { + return std::move(ret); + }); + return seastar::now(); +} + +std::optional<seastar::future<>> +OSD::do_ms_dispatch( + crimson::net::ConnectionRef conn, + MessageRef m) +{ + if (seastar::this_shard_id() != PRIMARY_CORE) { + switch (m->get_type()) { + case CEPH_MSG_OSD_MAP: + case MSG_COMMAND: + case MSG_OSD_MARK_ME_DOWN: + // FIXME: order is not guaranteed in this path + return conn.get_foreign( + ).then([this, m=std::move(m)](auto f_conn) { + return seastar::smp::submit_to(PRIMARY_CORE, + [f_conn=std::move(f_conn), m=std::move(m), this]() mutable { + auto conn = make_local_shared_foreign(std::move(f_conn)); + auto ret = do_ms_dispatch(conn, std::move(m)); + assert(ret.has_value()); + return std::move(ret.value()); + }); + }); + } + } + + switch (m->get_type()) { + case CEPH_MSG_OSD_MAP: + return handle_osd_map(boost::static_pointer_cast<MOSDMap>(m)); + case CEPH_MSG_OSD_OP: + return handle_osd_op(conn, boost::static_pointer_cast<MOSDOp>(m)); + case MSG_OSD_PG_CREATE2: + return handle_pg_create( + conn, boost::static_pointer_cast<MOSDPGCreate2>(m)); + return seastar::now(); + case MSG_COMMAND: + return handle_command(conn, boost::static_pointer_cast<MCommand>(m)); + case MSG_OSD_MARK_ME_DOWN: + return handle_mark_me_down(conn, boost::static_pointer_cast<MOSDMarkMeDown>(m)); + case MSG_OSD_PG_PULL: + [[fallthrough]]; + case MSG_OSD_PG_PUSH: + [[fallthrough]]; + case MSG_OSD_PG_PUSH_REPLY: + [[fallthrough]]; + case MSG_OSD_PG_RECOVERY_DELETE: + [[fallthrough]]; + case MSG_OSD_PG_RECOVERY_DELETE_REPLY: + [[fallthrough]]; + case MSG_OSD_PG_SCAN: + [[fallthrough]]; + case MSG_OSD_PG_BACKFILL: + [[fallthrough]]; + case MSG_OSD_PG_BACKFILL_REMOVE: + return handle_recovery_subreq(conn, boost::static_pointer_cast<MOSDFastDispatchOp>(m)); + case MSG_OSD_PG_LEASE: + [[fallthrough]]; + case MSG_OSD_PG_LEASE_ACK: + [[fallthrough]]; + case MSG_OSD_PG_NOTIFY2: + [[fallthrough]]; + case MSG_OSD_PG_INFO2: + [[fallthrough]]; + case MSG_OSD_PG_QUERY2: + [[fallthrough]]; + case MSG_OSD_BACKFILL_RESERVE: + [[fallthrough]]; + case MSG_OSD_RECOVERY_RESERVE: + [[fallthrough]]; + case MSG_OSD_PG_LOG: + return handle_peering_op(conn, boost::static_pointer_cast<MOSDPeeringOp>(m)); + case MSG_OSD_REPOP: + return handle_rep_op(conn, boost::static_pointer_cast<MOSDRepOp>(m)); + case MSG_OSD_REPOPREPLY: + return handle_rep_op_reply(conn, boost::static_pointer_cast<MOSDRepOpReply>(m)); + case MSG_OSD_SCRUB2: + return handle_scrub(conn, 
boost::static_pointer_cast<MOSDScrub2>(m)); + case MSG_OSD_PG_UPDATE_LOG_MISSING: + return handle_update_log_missing(conn, boost::static_pointer_cast< + MOSDPGUpdateLogMissing>(m)); + case MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY: + return handle_update_log_missing_reply(conn, boost::static_pointer_cast< + MOSDPGUpdateLogMissingReply>(m)); + default: + return std::nullopt; + } +} + +void OSD::ms_handle_reset(crimson::net::ConnectionRef conn, bool is_replace) +{ + // TODO: cleanup the session attached to this connection + logger().warn("ms_handle_reset"); +} + +void OSD::ms_handle_remote_reset(crimson::net::ConnectionRef conn) +{ + logger().warn("ms_handle_remote_reset"); +} + +void OSD::handle_authentication(const EntityName& name, + const AuthCapsInfo& caps_info) +{ + // TODO: store the parsed cap and associate it with the connection + if (caps_info.allow_all) { + logger().debug("{} {} has all caps", __func__, name); + return; + } + if (caps_info.caps.length() > 0) { + auto p = caps_info.caps.cbegin(); + string str; + try { + decode(str, p); + } catch (ceph::buffer::error& e) { + logger().warn("{} {} failed to decode caps string", __func__, name); + return; + } + OSDCap caps; + if (caps.parse(str)) { + logger().debug("{} {} has caps {}", __func__, name, str); + } else { + logger().warn("{} {} failed to parse caps {}", __func__, name, str); + } + } +} + +void OSD::update_stats() +{ + osd_stat_seq++; + osd_stat.up_from = get_shard_services().get_up_epoch(); + osd_stat.hb_peers = heartbeat->get_peers(); + osd_stat.seq = ( + static_cast<uint64_t>(get_shard_services().get_up_epoch()) << 32 + ) | osd_stat_seq; + gate.dispatch_in_background("statfs", *this, [this] { + (void) store.stat().then([this](store_statfs_t&& st) { + osd_stat.statfs = st; + }); + }); +} + +seastar::future<MessageURef> OSD::get_stats() const +{ + // MPGStats::had_map_for is not used since PGMonitor was removed + auto m = crimson::make_message<MPGStats>(monc->get_fsid(), osdmap->get_epoch()); + m->osd_stat = osd_stat; + return pg_shard_manager.get_pg_stats( + ).then([m=std::move(m)](auto &&stats) mutable { + m->pg_stat = std::move(stats); + return seastar::make_ready_future<MessageURef>(std::move(m)); + }); +} + +uint64_t OSD::send_pg_stats() +{ + // mgr client sends the report message in background + mgrc->report(); + return osd_stat.seq; +} + +seastar::future<> OSD::handle_osd_map(Ref<MOSDMap> m) +{ + /* Ensure that only one MOSDMap is processed at a time. Allowing concurrent + * processing may eventually be worthwhile, but such an implementation would + * need to ensure (among other things) + * 1. any particular map is only processed once + * 2. PGAdvanceMap operations are processed in order for each PG + * As map handling is not presently a bottleneck, we stick to this + * simpler invariant for now. 
+ * See https://tracker.ceph.com/issues/59165 + */ + ceph_assert(seastar::this_shard_id() == PRIMARY_CORE); + return handle_osd_map_lock.lock().then([this, m] { + return _handle_osd_map(m); + }).finally([this] { + return handle_osd_map_lock.unlock(); + }); +} + +seastar::future<> OSD::_handle_osd_map(Ref<MOSDMap> m) +{ + logger().info("handle_osd_map {}", *m); + if (m->fsid != superblock.cluster_fsid) { + logger().warn("fsid mismatched"); + return seastar::now(); + } + if (pg_shard_manager.is_initializing()) { + logger().warn("i am still initializing"); + return seastar::now(); + } + + const auto first = m->get_first(); + const auto last = m->get_last(); + logger().info("handle_osd_map epochs [{}..{}], i have {}, src has [{}..{}]", + first, last, superblock.newest_map, + m->cluster_osdmap_trim_lower_bound, m->newest_map); + // make sure there is something new, here, before we bother flushing + // the queues and such + if (last <= superblock.newest_map) { + return seastar::now(); + } + // missing some? + bool skip_maps = false; + epoch_t start = superblock.newest_map + 1; + if (first > start) { + logger().info("handle_osd_map message skips epochs {}..{}", + start, first - 1); + if (m->cluster_osdmap_trim_lower_bound <= start) { + return get_shard_services().osdmap_subscribe(start, false); + } + // always try to get the full range of maps--as many as we can. this + // 1- is good to have + // 2- is at present the only way to ensure that we get a *full* map as + // the first map! + if (m->cluster_osdmap_trim_lower_bound < first) { + return get_shard_services().osdmap_subscribe( + m->cluster_osdmap_trim_lower_bound - 1, true); + } + skip_maps = true; + start = first; + } + + return seastar::do_with(ceph::os::Transaction{}, + [=, this](auto& t) { + return pg_shard_manager.store_maps(t, start, m).then([=, this, &t] { + // even if this map isn't from a mon, we may have satisfied our subscription + monc->sub_got("osdmap", last); + if (!superblock.oldest_map || skip_maps) { + superblock.oldest_map = first; + } + superblock.newest_map = last; + superblock.current_epoch = last; + + // note in the superblock that we were clean thru the prior epoch + if (boot_epoch && boot_epoch >= superblock.mounted) { + superblock.mounted = boot_epoch; + superblock.clean_thru = last; + } + pg_shard_manager.get_meta_coll().store_superblock(t, superblock); + pg_shard_manager.set_superblock(superblock); + logger().debug("OSD::handle_osd_map: do_transaction..."); + return store.get_sharded_store().do_transaction( + pg_shard_manager.get_meta_coll().collection(), + std::move(t)); + }); + }).then([=, this] { + // TODO: write to superblock and commit the transaction + return committed_osd_maps(start, last, m); + }); +} + +seastar::future<> OSD::committed_osd_maps( + version_t first, + version_t last, + Ref<MOSDMap> m) +{ + ceph_assert(seastar::this_shard_id() == PRIMARY_CORE); + logger().info("osd.{}: committed_osd_maps({}, {})", whoami, first, last); + // advance through the new maps + return seastar::do_for_each(boost::make_counting_iterator(first), + boost::make_counting_iterator(last + 1), + [this](epoch_t cur) { + return pg_shard_manager.get_local_map( + cur + ).then([this](OSDMapService::local_cached_map_t&& o) { + osdmap = make_local_shared_foreign(OSDMapService::local_cached_map_t(o)); + return pg_shard_manager.update_map(std::move(o)); + }).then([this] { + if (get_shard_services().get_up_epoch() == 0 && + osdmap->is_up(whoami) && + osdmap->get_addrs(whoami) == public_msgr->get_myaddrs()) { + return 
pg_shard_manager.set_up_epoch( + osdmap->get_epoch() + ).then([this] { + if (!boot_epoch) { + boot_epoch = osdmap->get_epoch(); + } + }); + } else { + return seastar::now(); + } + }); + }).then([m, this] { + auto fut = seastar::now(); + if (osdmap->is_up(whoami)) { + const auto up_from = osdmap->get_up_from(whoami); + logger().info("osd.{}: map e {} marked me up: up_from {}, bind_epoch {}, state {}", + whoami, osdmap->get_epoch(), up_from, bind_epoch, + pg_shard_manager.get_osd_state_string()); + if (bind_epoch < up_from && + osdmap->get_addrs(whoami) == public_msgr->get_myaddrs() && + pg_shard_manager.is_booting()) { + logger().info("osd.{}: activating...", whoami); + fut = pg_shard_manager.set_active().then([this] { + beacon_timer.arm_periodic( + std::chrono::seconds(local_conf()->osd_beacon_report_interval)); + // timer continuation rearms when complete + tick_timer.arm( + std::chrono::seconds(TICK_INTERVAL)); + }); + } + } else { + if (pg_shard_manager.is_prestop()) { + got_stop_ack(); + return seastar::now(); + } + } + return fut.then([this] { + return check_osdmap_features().then([this] { + // yay! + logger().info("osd.{}: committed_osd_maps: broadcasting osdmaps up" + " to {} epoch to pgs", whoami, osdmap->get_epoch()); + return pg_shard_manager.broadcast_map_to_pgs(osdmap->get_epoch()); + }); + }); + }).then([m, this] { + if (pg_shard_manager.is_active()) { + logger().info("osd.{}: now active", whoami); + if (!osdmap->exists(whoami) || + osdmap->is_stop(whoami)) { + return shutdown(); + } + if (should_restart()) { + return restart(); + } else { + return seastar::now(); + } + } else if (pg_shard_manager.is_preboot()) { + logger().info("osd.{}: now preboot", whoami); + + if (m->get_source().is_mon()) { + return _preboot( + m->cluster_osdmap_trim_lower_bound, m->newest_map); + } else { + logger().info("osd.{}: start_boot", whoami); + return start_boot(); + } + } else { + logger().info("osd.{}: now {}", whoami, + pg_shard_manager.get_osd_state_string()); + // XXX + return seastar::now(); + } + }); +} + +seastar::future<> OSD::handle_osd_op( + crimson::net::ConnectionRef conn, + Ref<MOSDOp> m) +{ + return pg_shard_manager.start_pg_operation<ClientRequest>( + get_shard_services(), + conn, + std::move(m)).second; +} + +seastar::future<> OSD::handle_pg_create( + crimson::net::ConnectionRef conn, + Ref<MOSDPGCreate2> m) +{ + return seastar::do_for_each(m->pgs, [this, conn, m](auto& pg) { + auto& [pgid, when] = pg; + const auto &[created, created_stamp] = when; + auto q = m->pg_extra.find(pgid); + ceph_assert(q != m->pg_extra.end()); + auto& [history, pi] = q->second; + logger().debug( + "{}: {} e{} @{} " + "history {} pi {}", + __func__, pgid, created, created_stamp, + history, pi); + if (!pi.empty() && + m->epoch < pi.get_bounds().second) { + logger().error( + "got pg_create on {} epoch {} " + "unmatched past_intervals {} (history {})", + pgid, m->epoch, + pi, history); + return seastar::now(); + } else { + return pg_shard_manager.start_pg_operation<RemotePeeringEvent>( + conn, + pg_shard_t(), + pgid, + m->epoch, + m->epoch, + NullEvt(), + true, + new PGCreateInfo(pgid, m->epoch, history, pi, true)).second; + } + }); +} + +seastar::future<> OSD::handle_update_log_missing( + crimson::net::ConnectionRef conn, + Ref<MOSDPGUpdateLogMissing> m) +{ + m->decode_payload(); + return pg_shard_manager.start_pg_operation<LogMissingRequest>( + std::move(conn), + std::move(m)).second; +} + +seastar::future<> OSD::handle_update_log_missing_reply( + crimson::net::ConnectionRef conn, + 
Ref<MOSDPGUpdateLogMissingReply> m) +{ + m->decode_payload(); + return pg_shard_manager.start_pg_operation<LogMissingRequestReply>( + std::move(conn), + std::move(m)).second; +} + +seastar::future<> OSD::handle_rep_op( + crimson::net::ConnectionRef conn, + Ref<MOSDRepOp> m) +{ + m->finish_decode(); + return pg_shard_manager.start_pg_operation<RepRequest>( + std::move(conn), + std::move(m)).second; +} + +seastar::future<> OSD::handle_rep_op_reply( + crimson::net::ConnectionRef conn, + Ref<MOSDRepOpReply> m) +{ + spg_t pgid = m->get_spg(); + return pg_shard_manager.with_pg( + pgid, + [m=std::move(m)](auto &&pg) { + if (pg) { + m->finish_decode(); + pg->handle_rep_op_reply(*m); + } else { + logger().warn("stale reply: {}", *m); + } + return seastar::now(); + }); +} + +seastar::future<> OSD::handle_scrub( + crimson::net::ConnectionRef conn, + Ref<MOSDScrub2> m) +{ + if (m->fsid != superblock.cluster_fsid) { + logger().warn("fsid mismatched"); + return seastar::now(); + } + return seastar::parallel_for_each(std::move(m->scrub_pgs), + [m, conn, this](spg_t pgid) { + pg_shard_t from_shard{static_cast<int>(m->get_source().num()), + pgid.shard}; + PeeringState::RequestScrub scrub_request{m->deep, m->repair}; + return pg_shard_manager.start_pg_operation<RemotePeeringEvent>( + conn, + from_shard, + pgid, + PGPeeringEvent{m->epoch, m->epoch, scrub_request}).second; + }); +} + +seastar::future<> OSD::handle_mark_me_down( + crimson::net::ConnectionRef conn, + Ref<MOSDMarkMeDown> m) +{ + ceph_assert(seastar::this_shard_id() == PRIMARY_CORE); + if (pg_shard_manager.is_prestop()) { + got_stop_ack(); + } + return seastar::now(); +} + +seastar::future<> OSD::handle_recovery_subreq( + crimson::net::ConnectionRef conn, + Ref<MOSDFastDispatchOp> m) +{ + return pg_shard_manager.start_pg_operation<RecoverySubRequest>( + conn, std::move(m)).second; +} + +bool OSD::should_restart() const +{ + if (!osdmap->is_up(whoami)) { + logger().info("map e {} marked osd.{} down", + osdmap->get_epoch(), whoami); + return true; + } else if (osdmap->get_addrs(whoami) != public_msgr->get_myaddrs()) { + logger().error("map e {} had wrong client addr ({} != my {})", + osdmap->get_epoch(), + osdmap->get_addrs(whoami), + public_msgr->get_myaddrs()); + return true; + } else if (osdmap->get_cluster_addrs(whoami) != cluster_msgr->get_myaddrs()) { + logger().error("map e {} had wrong cluster addr ({} != my {})", + osdmap->get_epoch(), + osdmap->get_cluster_addrs(whoami), + cluster_msgr->get_myaddrs()); + return true; + } else { + return false; + } +} + +seastar::future<> OSD::restart() +{ + beacon_timer.cancel(); + tick_timer.cancel(); + return pg_shard_manager.set_up_epoch( + 0 + ).then([this] { + bind_epoch = osdmap->get_epoch(); + // TODO: promote to shutdown if being marked down for multiple times + // rebind messengers + return start_boot(); + }); +} + +seastar::future<> OSD::shutdown() +{ + logger().info("shutting down per osdmap"); + abort_source.request_abort(); + return seastar::now(); +} + +seastar::future<> OSD::send_beacon() +{ + if (!pg_shard_manager.is_active()) { + return seastar::now(); + } + // FIXME: min lec should be calculated from pg_stat + // and should set m->pgs + epoch_t min_last_epoch_clean = osdmap->get_epoch(); + auto m = crimson::make_message<MOSDBeacon>(osdmap->get_epoch(), + min_last_epoch_clean, + superblock.last_purged_snaps_scrub, + local_conf()->osd_beacon_report_interval); + return monc->send_message(std::move(m)); +} + +seastar::future<> OSD::update_heartbeat_peers() +{ + if 
(!pg_shard_manager.is_active()) { + return seastar::now();; + } + + pg_shard_manager.for_each_pgid([this](auto &pgid) { + vector<int> up, acting; + osdmap->pg_to_up_acting_osds(pgid.pgid, + &up, nullptr, + &acting, nullptr); + for (int osd : boost::join(up, acting)) { + if (osd == CRUSH_ITEM_NONE || osd == whoami) { + continue; + } else { + heartbeat->add_peer(osd, osdmap->get_epoch()); + } + } + }); + heartbeat->update_peers(whoami); + return seastar::now(); +} + +seastar::future<> OSD::handle_peering_op( + crimson::net::ConnectionRef conn, + Ref<MOSDPeeringOp> m) +{ + const int from = m->get_source().num(); + logger().debug("handle_peering_op on {} from {}", m->get_spg(), from); + m->set_features(conn->get_features()); + std::unique_ptr<PGPeeringEvent> evt(m->get_event()); + return pg_shard_manager.start_pg_operation<RemotePeeringEvent>( + conn, + pg_shard_t{from, m->get_spg().shard}, + m->get_spg(), + std::move(*evt)).second; +} + +seastar::future<> OSD::check_osdmap_features() +{ + assert(seastar::this_shard_id() == PRIMARY_CORE); + return store.write_meta( + "require_osd_release", + stringify((int)osdmap->require_osd_release)); +} + +seastar::future<> OSD::prepare_to_stop() +{ + if (osdmap && osdmap->is_up(whoami)) { + pg_shard_manager.set_prestop(); + const auto timeout = + std::chrono::duration_cast<std::chrono::milliseconds>( + std::chrono::duration<double>( + local_conf().get_val<double>("osd_mon_shutdown_timeout"))); + + return seastar::with_timeout( + seastar::timer<>::clock::now() + timeout, + monc->send_message( + crimson::make_message<MOSDMarkMeDown>( + monc->get_fsid(), + whoami, + osdmap->get_addrs(whoami), + osdmap->get_epoch(), + true)).then([this] { + return stop_acked.get_future(); + }) + ).handle_exception_type( + [](seastar::timed_out_error&) { + return seastar::now(); + }); + } + return seastar::now(); +} + +} diff --git a/src/crimson/osd/osd.h b/src/crimson/osd/osd.h new file mode 100644 index 000000000..10ff60d47 --- /dev/null +++ b/src/crimson/osd/osd.h @@ -0,0 +1,251 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <seastar/core/abort_source.hh> +#include <seastar/core/future.hh> +#include <seastar/core/shared_future.hh> +#include <seastar/core/gate.hh> +#include <seastar/core/shared_ptr.hh> +#include <seastar/core/shared_future.hh> +#include <seastar/core/timer.hh> + +#include "crimson/common/logclient.h" +#include "crimson/common/type_helpers.h" +#include "crimson/common/auth_handler.h" +#include "crimson/common/gated.h" +#include "crimson/admin/admin_socket.h" +#include "crimson/common/simple_lru.h" +#include "crimson/mgr/client.h" +#include "crimson/net/Dispatcher.h" +#include "crimson/osd/osdmap_service.h" +#include "crimson/osd/pg_shard_manager.h" +#include "crimson/osd/osdmap_gate.h" +#include "crimson/osd/pg_map.h" +#include "crimson/osd/osd_operations/peering_event.h" +#include "crimson/osd/state.h" + +#include "messages/MOSDOp.h" +#include "osd/PeeringState.h" +#include "osd/osd_types.h" +#include "osd/osd_perf_counters.h" +#include "osd/PGPeeringEvent.h" + +class MCommand; +class MOSDMap; +class MOSDRepOpReply; +class MOSDRepOp; +class MOSDScrub2; +class OSDMeta; +class Heartbeat; + +namespace ceph::os { + class Transaction; +} + +namespace crimson::mon { + class Client; +} + +namespace crimson::net { + class Messenger; +} + +namespace crimson::os { + class FuturizedStore; +} + +namespace crimson::osd { +class PG; + +class OSD final : public crimson::net::Dispatcher, + 
private crimson::common::AuthHandler, + private crimson::mgr::WithStats { + const int whoami; + const uint32_t nonce; + seastar::abort_source& abort_source; + seastar::timer<seastar::lowres_clock> beacon_timer; + // talk with osd + crimson::net::MessengerRef cluster_msgr; + // talk with client/mon/mgr + crimson::net::MessengerRef public_msgr; + + // HB Messengers + crimson::net::MessengerRef hb_front_msgr; + crimson::net::MessengerRef hb_back_msgr; + + std::unique_ptr<crimson::mon::Client> monc; + std::unique_ptr<crimson::mgr::Client> mgrc; + + // TODO: use a wrapper for ObjectStore + OSDMapService::cached_map_t osdmap; + crimson::os::FuturizedStore& store; + + /// _first_ epoch we were marked up (after this process started) + epoch_t boot_epoch = 0; + //< epoch we last did a bind to new ip:ports + epoch_t bind_epoch = 0; + //< since when there is no more pending pg creates from mon + epoch_t last_pg_create_epoch = 0; + + ceph::mono_time startup_time; + + seastar::shared_mutex handle_osd_map_lock; + + OSDSuperblock superblock; + + // Dispatcher methods + std::optional<seastar::future<>> ms_dispatch(crimson::net::ConnectionRef, MessageRef) final; + void ms_handle_reset(crimson::net::ConnectionRef conn, bool is_replace) final; + void ms_handle_remote_reset(crimson::net::ConnectionRef conn) final; + + std::optional<seastar::future<>> do_ms_dispatch(crimson::net::ConnectionRef, MessageRef); + + // mgr::WithStats methods + // pg statistics including osd ones + osd_stat_t osd_stat; + uint32_t osd_stat_seq = 0; + void update_stats(); + seastar::future<MessageURef> get_stats() const final; + + // AuthHandler methods + void handle_authentication(const EntityName& name, + const AuthCapsInfo& caps) final; + + seastar::sharded<PGShardMapping> pg_to_shard_mappings; + seastar::sharded<OSDSingletonState> osd_singleton_state; + seastar::sharded<OSDState> osd_states; + seastar::sharded<ShardServices> shard_services; + + crimson::osd::PGShardManager pg_shard_manager; + + std::unique_ptr<Heartbeat> heartbeat; + seastar::timer<seastar::lowres_clock> tick_timer; + + // admin-socket + seastar::lw_shared_ptr<crimson::admin::AdminSocket> asok; + +public: + OSD(int id, uint32_t nonce, + seastar::abort_source& abort_source, + crimson::os::FuturizedStore& store, + crimson::net::MessengerRef cluster_msgr, + crimson::net::MessengerRef client_msgr, + crimson::net::MessengerRef hb_front_msgr, + crimson::net::MessengerRef hb_back_msgr); + ~OSD() final; + + auto &get_pg_shard_manager() { + return pg_shard_manager; + } + + seastar::future<> open_meta_coll(); + static seastar::future<OSDMeta> open_or_create_meta_coll( + crimson::os::FuturizedStore &store + ); + static seastar::future<> mkfs( + crimson::os::FuturizedStore &store, + unsigned whoami, + uuid_d osd_uuid, + uuid_d cluster_fsid, + std::string osdspec_affinity); + + seastar::future<> start(); + seastar::future<> stop(); + + void dump_status(Formatter*) const; + void print(std::ostream&) const; + + /// @return the seq id of the pg stats being sent + uint64_t send_pg_stats(); + + auto &get_shard_services() { + return shard_services.local(); + } + +private: + static seastar::future<> _write_superblock( + crimson::os::FuturizedStore &store, + OSDMeta meta, + OSDSuperblock superblock); + static seastar::future<> _write_key_meta( + crimson::os::FuturizedStore &store + ); + seastar::future<> start_boot(); + seastar::future<> _preboot(version_t oldest_osdmap, version_t newest_osdmap); + seastar::future<> _send_boot(); + seastar::future<> _add_me_to_crush(); + + 
seastar::future<> osdmap_subscribe(version_t epoch, bool force_request); + + seastar::future<> start_asok_admin(); + + void write_superblock(ceph::os::Transaction& t); + seastar::future<> read_superblock(); + + seastar::future<> handle_osd_map(Ref<MOSDMap> m); + seastar::future<> _handle_osd_map(Ref<MOSDMap> m); + seastar::future<> handle_pg_create(crimson::net::ConnectionRef conn, + Ref<MOSDPGCreate2> m); + seastar::future<> handle_osd_op(crimson::net::ConnectionRef conn, + Ref<MOSDOp> m); + seastar::future<> handle_rep_op(crimson::net::ConnectionRef conn, + Ref<MOSDRepOp> m); + seastar::future<> handle_rep_op_reply(crimson::net::ConnectionRef conn, + Ref<MOSDRepOpReply> m); + seastar::future<> handle_peering_op(crimson::net::ConnectionRef conn, + Ref<MOSDPeeringOp> m); + seastar::future<> handle_recovery_subreq(crimson::net::ConnectionRef conn, + Ref<MOSDFastDispatchOp> m); + seastar::future<> handle_scrub(crimson::net::ConnectionRef conn, + Ref<MOSDScrub2> m); + seastar::future<> handle_mark_me_down(crimson::net::ConnectionRef conn, + Ref<MOSDMarkMeDown> m); + + seastar::future<> committed_osd_maps(version_t first, + version_t last, + Ref<MOSDMap> m); + + seastar::future<> check_osdmap_features(); + + seastar::future<> handle_command(crimson::net::ConnectionRef conn, + Ref<MCommand> m); + seastar::future<> handle_update_log_missing(crimson::net::ConnectionRef conn, + Ref<MOSDPGUpdateLogMissing> m); + seastar::future<> handle_update_log_missing_reply( + crimson::net::ConnectionRef conn, + Ref<MOSDPGUpdateLogMissingReply> m); + +private: + crimson::common::Gated gate; + + seastar::promise<> stop_acked; + void got_stop_ack() { + stop_acked.set_value(); + } + seastar::future<> prepare_to_stop(); + bool should_restart() const; + seastar::future<> restart(); + seastar::future<> shutdown(); + seastar::future<> update_heartbeat_peers(); + friend class PGAdvanceMap; + +public: + seastar::future<> send_beacon(); + +private: + LogClient log_client; + LogChannelRef clog; +}; + +inline std::ostream& operator<<(std::ostream& out, const OSD& osd) { + osd.print(out); + return out; +} + +} + +#if FMT_VERSION >= 90000 +template <> struct fmt::formatter<crimson::osd::OSD> : fmt::ostream_formatter {}; +#endif diff --git a/src/crimson/osd/osd_connection_priv.h b/src/crimson/osd/osd_connection_priv.h new file mode 100644 index 000000000..69edf94b8 --- /dev/null +++ b/src/crimson/osd/osd_connection_priv.h @@ -0,0 +1,27 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "crimson/net/Connection.h" +#include "crimson/osd/osd_operation.h" +#include "crimson/osd/osd_operations/client_request.h" +#include "crimson/osd/osd_operations/peering_event.h" +#include "crimson/osd/osd_operations/replicated_request.h" + +namespace crimson::osd { + +struct OSDConnectionPriv : public crimson::net::Connection::user_private_t { + ConnectionPipeline client_request_conn_pipeline; + ConnectionPipeline peering_request_conn_pipeline; + ConnectionPipeline replicated_request_conn_pipeline; +}; + +static OSDConnectionPriv &get_osd_priv(crimson::net::Connection *conn) { + if (!conn->has_user_private()) { + conn->set_user_private(std::make_unique<OSDConnectionPriv>()); + } + return static_cast<OSDConnectionPriv&>(conn->get_user_private()); +} + +} diff --git a/src/crimson/osd/osd_meta.cc b/src/crimson/osd/osd_meta.cc new file mode 100644 index 000000000..e40b2b246 --- /dev/null +++ b/src/crimson/osd/osd_meta.cc @@ -0,0 +1,98 @@ +// -*- mode:C++; tab-width:8; 
c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "osd_meta.h" + +#include <fmt/format.h> +#include <fmt/ostream.h> + +#include "crimson/os/futurized_collection.h" +#include "crimson/os/futurized_store.h" +#include "os/Transaction.h" + +using std::string; +using read_errorator = crimson::os::FuturizedStore::Shard::read_errorator; + +void OSDMeta::create(ceph::os::Transaction& t) +{ + t.create_collection(coll->get_cid(), 0); +} + +void OSDMeta::store_map(ceph::os::Transaction& t, + epoch_t e, const bufferlist& m) +{ + t.write(coll->get_cid(), osdmap_oid(e), 0, m.length(), m); +} + +seastar::future<bufferlist> OSDMeta::load_map(epoch_t e) +{ + return store.read(coll, + osdmap_oid(e), 0, 0, + CEPH_OSD_OP_FLAG_FADVISE_WILLNEED).handle_error( + read_errorator::all_same_way([e] { + ceph_abort_msg(fmt::format("{} read gave enoent on {}", + __func__, osdmap_oid(e))); + })); +} + +void OSDMeta::store_superblock(ceph::os::Transaction& t, + const OSDSuperblock& superblock) +{ + bufferlist bl; + encode(superblock, bl); + t.write(coll->get_cid(), superblock_oid(), 0, bl.length(), bl); +} + +OSDMeta::load_superblock_ret OSDMeta::load_superblock() +{ + return store.read( + coll, superblock_oid(), 0, 0 + ).safe_then([] (bufferlist&& bl) { + auto p = bl.cbegin(); + OSDSuperblock superblock; + decode(superblock, p); + return seastar::make_ready_future<OSDSuperblock>(std::move(superblock)); + }); +} + +seastar::future<std::tuple<pg_pool_t, + std::string, + OSDMeta::ec_profile_t>> +OSDMeta::load_final_pool_info(int64_t pool) { + return store.read(coll, final_pool_info_oid(pool), + 0, 0).safe_then([] (bufferlist&& bl) { + auto p = bl.cbegin(); + pg_pool_t pi; + string name; + ec_profile_t ec_profile; + decode(pi, p); + decode(name, p); + decode(ec_profile, p); + return seastar::make_ready_future<std::tuple<pg_pool_t, + string, + ec_profile_t>>( + std::make_tuple(std::move(pi), + std::move(name), + std::move(ec_profile))); + },read_errorator::all_same_way([pool] { + throw std::runtime_error(fmt::format("read gave enoent on {}", + final_pool_info_oid(pool))); + })); +} + +ghobject_t OSDMeta::osdmap_oid(epoch_t epoch) +{ + string name = fmt::format("osdmap.{}", epoch); + return ghobject_t(hobject_t(sobject_t(object_t(name), 0))); +} + +ghobject_t OSDMeta::final_pool_info_oid(int64_t pool) +{ + string name = fmt::format("final_pool_{}", pool); + return ghobject_t(hobject_t(sobject_t(object_t(name), CEPH_NOSNAP))); +} + +ghobject_t OSDMeta::superblock_oid() +{ + return ghobject_t(hobject_t(sobject_t(object_t("osd_superblock"), 0))); +} diff --git a/src/crimson/osd/osd_meta.h b/src/crimson/osd/osd_meta.h new file mode 100644 index 000000000..652266d9e --- /dev/null +++ b/src/crimson/osd/osd_meta.h @@ -0,0 +1,60 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <map> +#include <string> +#include <seastar/core/future.hh> +#include "osd/osd_types.h" +#include "crimson/os/futurized_collection.h" +#include "crimson/os/futurized_store.h" + +namespace ceph::os { + class Transaction; +} + +namespace crimson::os { + class FuturizedCollection; + class FuturizedStore; +} + +/// metadata shared across PGs, or put in another way, +/// metadata not specific to certain PGs. 
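+/// Concretely this covers the encoded osdmaps (one "osdmap.<epoch>" object
+/// per epoch), the OSD superblock ("osd_superblock") and the
+/// "final_pool_<id>" objects that keep a pool's last pg_pool_t, name and
+/// erasure-code profile.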
+class OSDMeta { + template<typename T> using Ref = boost::intrusive_ptr<T>; + + crimson::os::FuturizedStore::Shard& store; + Ref<crimson::os::FuturizedCollection> coll; + +public: + OSDMeta(Ref<crimson::os::FuturizedCollection> coll, + crimson::os::FuturizedStore::Shard& store) + : store{store}, coll{coll} + {} + + auto collection() { + return coll; + } + void create(ceph::os::Transaction& t); + + void store_map(ceph::os::Transaction& t, + epoch_t e, const bufferlist& m); + seastar::future<bufferlist> load_map(epoch_t e); + + void store_superblock(ceph::os::Transaction& t, + const OSDSuperblock& sb); + + using load_superblock_ertr = crimson::os::FuturizedStore::Shard::read_errorator; + using load_superblock_ret = load_superblock_ertr::future<OSDSuperblock>; + load_superblock_ret load_superblock(); + + using ec_profile_t = std::map<std::string, std::string>; + seastar::future<std::tuple<pg_pool_t, + std::string, + ec_profile_t>> load_final_pool_info(int64_t pool); +private: + static ghobject_t osdmap_oid(epoch_t epoch); + static ghobject_t final_pool_info_oid(int64_t pool); + static ghobject_t superblock_oid(); +}; diff --git a/src/crimson/osd/osd_operation.cc b/src/crimson/osd/osd_operation.cc new file mode 100644 index 000000000..920fdc114 --- /dev/null +++ b/src/crimson/osd/osd_operation.cc @@ -0,0 +1,227 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "osd_operation.h" +#include "common/Formatter.h" +#include "crimson/common/log.h" +#include "crimson/osd/osd_operations/client_request.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_osd); + } +} + +namespace crimson::osd { + +void OSDOperationRegistry::do_stop() +{ + logger().info("OSDOperationRegistry::{}", __func__); + // we need to decouple visiting the registry from destructing + // ops because of the auto-unlink feature of boost::intrusive. + // the list shouldn't change while iterating due to constrains + // on iterator's validity. + constexpr auto historic_reg_index = + static_cast<size_t>(OperationTypeCode::historic_client_request); + auto& historic_registry = get_registry<historic_reg_index>(); + std::vector<ClientRequest::ICRef> to_ref_down; + std::transform(std::begin(historic_registry), std::end(historic_registry), + std::back_inserter(to_ref_down), + [] (const Operation& op) { + return ClientRequest::ICRef{ + static_cast<const ClientRequest*>(&op), + /* add_ref= */ false + }; + }); + last_of_recents = std::end(historic_registry); + // to_ref_down is going off +} + +OSDOperationRegistry::OSDOperationRegistry() + : OperationRegistryT(seastar::this_shard_id()) +{ + constexpr auto historic_reg_index = + static_cast<size_t>(OperationTypeCode::historic_client_request); + auto& historic_registry = get_registry<historic_reg_index>(); + last_of_recents = std::begin(historic_registry); +} + +static auto get_duration(const ClientRequest& client_request) +{ + // TODO: consider enhancing `CompletionEvent` with computing duration + // once -- when it's enetered. + return client_request.get_completed() - client_request.get_started(); +} + +void OSDOperationRegistry::put_historic(const ClientRequest& op) +{ + // unlink the op from the client request registry. this is a part of + // the re-link procedure. finally it will be in historic registry. 
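+  // the op is kept alive by an extra ICRef that is intentionally "leaked"
+  // below; that reference is dropped again either when the op is purged
+  // from the history further down or when do_stop() drains the registry.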
+ constexpr auto client_reg_index = + static_cast<size_t>(OperationTypeCode::client_request); + constexpr auto historic_reg_index = + static_cast<size_t>(OperationTypeCode::historic_client_request); + auto& client_registry = get_registry<client_reg_index>(); + auto& historic_registry = get_registry<historic_reg_index>(); + historic_registry.splice(std::end(historic_registry), + client_registry, + client_registry.iterator_to(op)); + ClientRequest::ICRef( + &op, /* add_ref= */true + ).detach(); // yes, "leak" it for now! + + // check whether the history size limit is not exceeded; if so, then + // purge the oldest op. + // NOTE: Operation uses the auto-unlink feature of boost::intrusive. + // NOTE: the cleaning happens in OSDOperationRegistry::do_stop() + using crimson::common::local_conf; + if (num_recent_ops >= local_conf()->osd_op_history_size) { + ++last_of_recents; + ++num_slow_ops; + } else { + ++num_recent_ops; + } + if (num_slow_ops > local_conf()->osd_op_history_slow_op_size) { + // we're interested in keeping slowest ops. if the slow op history + // is disabled, the list will have only one element, so the full-blown + // search will boil down into `.front()`. + const auto fastest_historic_iter = std::min_element( + std::cbegin(historic_registry), last_of_recents, + [] (const auto& lop, const auto& rop) { + const auto& lclient_request = static_cast<const ClientRequest&>(lop); + const auto& rclient_request = static_cast<const ClientRequest&>(rop); + return get_duration(lclient_request) < get_duration(rclient_request); + }); + assert(fastest_historic_iter != std::end(historic_registry)); + const auto& fastest_historic_op = + static_cast<const ClientRequest&>(*fastest_historic_iter); + historic_registry.erase(fastest_historic_iter); + // clear a previously "leaked" op + ClientRequest::ICRef(&fastest_historic_op, /* add_ref= */false); + --num_slow_ops; + } +} + +size_t OSDOperationRegistry::dump_historic_client_requests(ceph::Formatter* f) const +{ + const auto& historic_client_registry = + get_registry<static_cast<size_t>(OperationTypeCode::historic_client_request)>(); //ClientRequest::type)>(); + f->open_object_section("op_history"); + f->dump_int("size", historic_client_registry.size()); + // TODO: f->dump_int("duration", history_duration.load()); + // the intrusive list is configured to not store the size + size_t ops_count = 0; + { + f->open_array_section("ops"); + for (const auto& op : historic_client_registry) { + op.dump(f); + ++ops_count; + } + f->close_section(); + } + f->close_section(); + return ops_count; +} + +size_t OSDOperationRegistry::dump_slowest_historic_client_requests(ceph::Formatter* f) const +{ + const auto& historic_client_registry = + get_registry<static_cast<size_t>(OperationTypeCode::historic_client_request)>(); //ClientRequest::type)>(); + f->open_object_section("op_history"); + f->dump_int("size", historic_client_registry.size()); + // TODO: f->dump_int("duration", history_duration.load()); + // the intrusive list is configured to not store the size + std::multimap<utime_t, + const ClientRequest*, + std::greater<utime_t>> sorted_slowest_ops; + // iterating over the entire registry as a slow op could be also + // in the "recently added" part. 
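+  // sorted_slowest_ops orders ops by duration, longest first, so the dump
+  // below stops after the first osd_op_history_slow_op_size entries.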
+ std::transform(std::begin(historic_client_registry), + std::end(historic_client_registry), + std::inserter(sorted_slowest_ops, std::end(sorted_slowest_ops)), + [] (const Operation& op) { + const auto& cop = static_cast<const ClientRequest&>(op); + return std::make_pair(get_duration(cop), &cop); + }); + f->open_array_section("ops"); + using crimson::common::local_conf; + size_t ops_count = 0; + for (auto it = std::begin(sorted_slowest_ops); + ops_count < local_conf()->osd_op_history_slow_op_size + && it != std::end(sorted_slowest_ops); + ++it, ++ops_count) + { + it->second->dump(f); + } + f->close_section(); + return ops_count; +} + +OperationThrottler::OperationThrottler(ConfigProxy &conf) + : scheduler(crimson::osd::scheduler::make_scheduler(conf)) +{ + conf.add_observer(this); + update_from_config(conf); +} + +void OperationThrottler::wake() +{ + while ((!max_in_progress || in_progress < max_in_progress) && + !scheduler->empty()) { + auto item = scheduler->dequeue(); + item.wake.set_value(); + ++in_progress; + --pending; + } +} + +void OperationThrottler::release_throttle() +{ + ceph_assert(in_progress > 0); + --in_progress; + wake(); +} + +seastar::future<> OperationThrottler::acquire_throttle( + crimson::osd::scheduler::params_t params) +{ + crimson::osd::scheduler::item_t item{params, seastar::promise<>()}; + auto fut = item.wake.get_future(); + scheduler->enqueue(std::move(item)); + return fut; +} + +void OperationThrottler::dump_detail(Formatter *f) const +{ + f->dump_unsigned("max_in_progress", max_in_progress); + f->dump_unsigned("in_progress", in_progress); + f->open_object_section("scheduler"); + { + scheduler->dump(*f); + } + f->close_section(); +} + +void OperationThrottler::update_from_config(const ConfigProxy &conf) +{ + max_in_progress = conf.get_val<uint64_t>("crimson_osd_scheduler_concurrency"); + wake(); +} + +const char** OperationThrottler::get_tracked_conf_keys() const +{ + static const char* KEYS[] = { + "crimson_osd_scheduler_concurrency", + NULL + }; + return KEYS; +} + +void OperationThrottler::handle_conf_change( + const ConfigProxy& conf, + const std::set<std::string> &changed) +{ + update_from_config(conf); +} + +} diff --git a/src/crimson/osd/osd_operation.h b/src/crimson/osd/osd_operation.h new file mode 100644 index 000000000..8ef44ee9e --- /dev/null +++ b/src/crimson/osd/osd_operation.h @@ -0,0 +1,281 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "crimson/common/operation.h" +#include "crimson/osd/pg_interval_interrupt_condition.h" +#include "crimson/osd/scheduler/scheduler.h" +#include "osd/osd_types.h" + +namespace crimson::os::seastore { + template<class OpT> + class OperationProxyT; +} + +namespace crimson::osd { + +/// Ordering stages for a class of operations ordered by PG. 
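+/// ConnectionPipeline below covers the per-connection part of that ordering:
+/// requests from one connection pass through await_active, await_map and
+/// get_pg one at a time, in the order they arrived.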
+struct ConnectionPipeline { + struct AwaitActive : OrderedExclusivePhaseT<AwaitActive> { + static constexpr auto type_name = + "ConnectionPipeline::await_active"; + } await_active; + + struct AwaitMap : OrderedExclusivePhaseT<AwaitMap> { + static constexpr auto type_name = + "ConnectionPipeline::await_map"; + } await_map; + + struct GetPG : OrderedExclusivePhaseT<GetPG> { + static constexpr auto type_name = + "ConnectionPipeline::get_pg"; + } get_pg; +}; + +enum class OperationTypeCode { + client_request = 0, + peering_event, + pg_advance_map, + pg_creation, + replicated_request, + background_recovery, + background_recovery_sub, + internal_client_request, + historic_client_request, + logmissing_request, + logmissing_request_reply, + snaptrim_event, + snaptrimobj_subevent, + last_op +}; + +static constexpr const char* const OP_NAMES[] = { + "client_request", + "peering_event", + "pg_advance_map", + "pg_creation", + "replicated_request", + "background_recovery", + "background_recovery_sub", + "internal_client_request", + "historic_client_request", + "logmissing_request", + "logmissing_request_reply", + "snaptrim_event", + "snaptrimobj_subevent", +}; + +// prevent the addition of OperationTypeCode-s with no matching OP_NAMES entry: +static_assert( + (sizeof(OP_NAMES)/sizeof(OP_NAMES[0])) == + static_cast<int>(OperationTypeCode::last_op)); + +struct InterruptibleOperation : Operation { + template <typename ValuesT = void> + using interruptible_future = + ::crimson::interruptible::interruptible_future< + ::crimson::osd::IOInterruptCondition, ValuesT>; + using interruptor = + ::crimson::interruptible::interruptor< + ::crimson::osd::IOInterruptCondition>; +}; + +template <typename T> +struct OperationT : InterruptibleOperation { + static constexpr const char *type_name = OP_NAMES[static_cast<int>(T::type)]; + using IRef = boost::intrusive_ptr<T>; + using ICRef = boost::intrusive_ptr<const T>; + + unsigned get_type() const final { + return static_cast<unsigned>(T::type); + } + + const char *get_type_name() const final { + return T::type_name; + } + + virtual ~OperationT() = default; + +private: + virtual void dump_detail(ceph::Formatter *f) const = 0; +}; + +template <class T> +class TrackableOperationT : public OperationT<T> { + T* that() { + return static_cast<T*>(this); + } + const T* that() const { + return static_cast<const T*>(this); + } + +protected: + template<class EventT> + decltype(auto) get_event() { + // all out derivates are supposed to define the list of tracking + // events accessible via `std::get`. This will usually boil down + // into an instance of `std::tuple`. + return std::get<EventT>(that()->tracking_events); + } + + template<class EventT> + decltype(auto) get_event() const { + return std::get<EventT>(that()->tracking_events); + } + + using OperationT<T>::OperationT; + + struct StartEvent : TimeEvent<StartEvent> {}; + struct CompletionEvent : TimeEvent<CompletionEvent> {}; + + template <class EventT, class... Args> + void track_event(Args&&... 
args) { + // the idea is to have a visitor-like interface that allows to double + // dispatch (backend, blocker type) + get_event<EventT>().trigger(*that(), std::forward<Args>(args)...); + } + + template <class BlockingEventT, class InterruptorT=void, class F> + auto with_blocking_event(F&& f) { + auto ret = std::forward<F>(f)(typename BlockingEventT::template Trigger<T>{ + get_event<BlockingEventT>(), *that() + }); + if constexpr (std::is_same_v<InterruptorT, void>) { + return ret; + } else { + using ret_t = decltype(ret); + return typename InterruptorT::template futurize_t<ret_t>{std::move(ret)}; + } + } + +public: + static constexpr bool is_trackable = true; +}; + +template <class T> +class PhasedOperationT : public TrackableOperationT<T> { + using base_t = TrackableOperationT<T>; + + T* that() { + return static_cast<T*>(this); + } + const T* that() const { + return static_cast<const T*>(this); + } + +protected: + using TrackableOperationT<T>::TrackableOperationT; + + template <class InterruptorT=void, class StageT> + auto enter_stage(StageT& stage) { + return this->template with_blocking_event<typename StageT::BlockingEvent, + InterruptorT>( + [&stage, this] (auto&& trigger) { + // delegated storing the pipeline handle to let childs to match + // the lifetime of pipeline with e.g. ConnectedSocket (important + // for ConnectionPipeline). + return that()->get_handle().template enter<T>(stage, std::move(trigger)); + }); + } + + template <class OpT> + friend class crimson::os::seastore::OperationProxyT; + + // PGShardManager::start_pg_operation needs access to enter_stage, we can make this + // more sophisticated later on + friend class PGShardManager; +}; + +/** + * Maintains a set of lists of all active ops. + */ +struct OSDOperationRegistry : OperationRegistryT< + static_cast<size_t>(OperationTypeCode::last_op) +> { + OSDOperationRegistry(); + + void do_stop() override; + + void put_historic(const class ClientRequest& op); + + size_t dump_historic_client_requests(ceph::Formatter* f) const; + size_t dump_slowest_historic_client_requests(ceph::Formatter* f) const; + +private: + op_list::const_iterator last_of_recents; + size_t num_recent_ops = 0; + size_t num_slow_ops = 0; +}; +/** + * Throttles set of currently running operations + * + * Very primitive currently, assumes all ops are equally + * expensive and simply limits the number that can be + * concurrently active. + */ +class OperationThrottler : public BlockerT<OperationThrottler>, + private md_config_obs_t { + friend BlockerT<OperationThrottler>; + static constexpr const char* type_name = "OperationThrottler"; + + template <typename OperationT, typename F> + auto with_throttle( + OperationT* op, + crimson::osd::scheduler::params_t params, + F &&f) { + if (!max_in_progress) return f(); + return acquire_throttle(params) + .then(std::forward<F>(f)) + .then([this](auto x) { + release_throttle(); + return x; + }); + } + + template <typename OperationT, typename F> + seastar::future<> with_throttle_while( + OperationT* op, + crimson::osd::scheduler::params_t params, + F &&f) { + return with_throttle(op, params, f).then([this, params, op, f](bool cont) { + return cont ? with_throttle_while(op, params, f) : seastar::now(); + }); + } + + +public: + OperationThrottler(ConfigProxy &conf); + + const char** get_tracked_conf_keys() const final; + void handle_conf_change(const ConfigProxy& conf, + const std::set<std::string> &changed) final; + void update_from_config(const ConfigProxy &conf); + + template <class OpT, class... 
Args> + seastar::future<> with_throttle_while( + BlockingEvent::Trigger<OpT>&& trigger, + Args&&... args) { + return trigger.maybe_record_blocking( + with_throttle_while(std::forward<Args>(args)...), *this); + } + +private: + void dump_detail(Formatter *f) const final; + + crimson::osd::scheduler::SchedulerRef scheduler; + + uint64_t max_in_progress = 0; + uint64_t in_progress = 0; + + uint64_t pending = 0; + + void wake(); + + seastar::future<> acquire_throttle( + crimson::osd::scheduler::params_t params); + + void release_throttle(); +}; + +} diff --git a/src/crimson/osd/osd_operation_external_tracking.h b/src/crimson/osd/osd_operation_external_tracking.h new file mode 100644 index 000000000..4b6dbf4b7 --- /dev/null +++ b/src/crimson/osd/osd_operation_external_tracking.h @@ -0,0 +1,307 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "crimson/osd/osd.h" +#include "crimson/osd/osdmap_gate.h" +#include "crimson/osd/osd_operations/background_recovery.h" +#include "crimson/osd/osd_operations/client_request.h" +#include "crimson/osd/osd_operations/peering_event.h" +#include "crimson/osd/osd_operations/pg_advance_map.h" +#include "crimson/osd/osd_operations/recovery_subrequest.h" +#include "crimson/osd/osd_operations/replicated_request.h" +#include "crimson/osd/osd_operations/snaptrim_event.h" +#include "crimson/osd/pg_activation_blocker.h" +#include "crimson/osd/pg_map.h" + +namespace crimson::osd { + +// Just the boilerplate currently. Implementing +struct LttngBackend + : ClientRequest::StartEvent::Backend, + ConnectionPipeline::AwaitActive::BlockingEvent::Backend, + ConnectionPipeline::AwaitMap::BlockingEvent::Backend, + ConnectionPipeline::GetPG::BlockingEvent::Backend, + OSD_OSDMapGate::OSDMapBlocker::BlockingEvent::Backend, + PGMap::PGCreationBlockingEvent::Backend, + ClientRequest::PGPipeline::AwaitMap::BlockingEvent::Backend, + PG_OSDMapGate::OSDMapBlocker::BlockingEvent::Backend, + ClientRequest::PGPipeline::WaitForActive::BlockingEvent::Backend, + PGActivationBlocker::BlockingEvent::Backend, + ClientRequest::PGPipeline::RecoverMissing::BlockingEvent::Backend, + ClientRequest::PGPipeline::GetOBC::BlockingEvent::Backend, + ClientRequest::PGPipeline::Process::BlockingEvent::Backend, + ClientRequest::PGPipeline::WaitRepop::BlockingEvent::Backend, + ClientRequest::PGPipeline::WaitRepop::BlockingEvent::ExitBarrierEvent::Backend, + ClientRequest::PGPipeline::SendReply::BlockingEvent::Backend, + ClientRequest::CompletionEvent::Backend +{ + void handle(ClientRequest::StartEvent&, + const Operation&) override {} + + void handle(ConnectionPipeline::AwaitActive::BlockingEvent& ev, + const Operation& op, + const ConnectionPipeline::AwaitActive& blocker) override { + } + + void handle(ConnectionPipeline::AwaitMap::BlockingEvent& ev, + const Operation& op, + const ConnectionPipeline::AwaitMap& blocker) override { + } + + void handle(OSD_OSDMapGate::OSDMapBlocker::BlockingEvent&, + const Operation&, + const OSD_OSDMapGate::OSDMapBlocker&) override { + } + + void handle(ConnectionPipeline::GetPG::BlockingEvent& ev, + const Operation& op, + const ConnectionPipeline::GetPG& blocker) override { + } + + void handle(PGMap::PGCreationBlockingEvent&, + const Operation&, + const PGMap::PGCreationBlocker&) override { + } + + void handle(ClientRequest::PGPipeline::AwaitMap::BlockingEvent& ev, + const Operation& op, + const ClientRequest::PGPipeline::AwaitMap& blocker) override { + } + + void 
handle(PG_OSDMapGate::OSDMapBlocker::BlockingEvent&, + const Operation&, + const PG_OSDMapGate::OSDMapBlocker&) override { + } + + void handle(ClientRequest::PGPipeline::WaitForActive::BlockingEvent& ev, + const Operation& op, + const ClientRequest::PGPipeline::WaitForActive& blocker) override { + } + + void handle(PGActivationBlocker::BlockingEvent& ev, + const Operation& op, + const PGActivationBlocker& blocker) override { + } + + void handle(ClientRequest::PGPipeline::RecoverMissing::BlockingEvent& ev, + const Operation& op, + const ClientRequest::PGPipeline::RecoverMissing& blocker) override { + } + + void handle(ClientRequest::PGPipeline::GetOBC::BlockingEvent& ev, + const Operation& op, + const ClientRequest::PGPipeline::GetOBC& blocker) override { + } + + void handle(ClientRequest::PGPipeline::Process::BlockingEvent& ev, + const Operation& op, + const ClientRequest::PGPipeline::Process& blocker) override { + } + + void handle(ClientRequest::PGPipeline::WaitRepop::BlockingEvent& ev, + const Operation& op, + const ClientRequest::PGPipeline::WaitRepop& blocker) override { + } + + void handle(ClientRequest::PGPipeline::WaitRepop::BlockingEvent::ExitBarrierEvent& ev, + const Operation& op) override { + } + + void handle(ClientRequest::PGPipeline::SendReply::BlockingEvent& ev, + const Operation& op, + const ClientRequest::PGPipeline::SendReply& blocker) override { + } + + void handle(ClientRequest::CompletionEvent&, + const Operation&) override {} +}; + +struct HistoricBackend + : ClientRequest::StartEvent::Backend, + ConnectionPipeline::AwaitActive::BlockingEvent::Backend, + ConnectionPipeline::AwaitMap::BlockingEvent::Backend, + ConnectionPipeline::GetPG::BlockingEvent::Backend, + OSD_OSDMapGate::OSDMapBlocker::BlockingEvent::Backend, + PGMap::PGCreationBlockingEvent::Backend, + ClientRequest::PGPipeline::AwaitMap::BlockingEvent::Backend, + PG_OSDMapGate::OSDMapBlocker::BlockingEvent::Backend, + ClientRequest::PGPipeline::WaitForActive::BlockingEvent::Backend, + PGActivationBlocker::BlockingEvent::Backend, + ClientRequest::PGPipeline::RecoverMissing::BlockingEvent::Backend, + ClientRequest::PGPipeline::GetOBC::BlockingEvent::Backend, + ClientRequest::PGPipeline::Process::BlockingEvent::Backend, + ClientRequest::PGPipeline::WaitRepop::BlockingEvent::Backend, + ClientRequest::PGPipeline::WaitRepop::BlockingEvent::ExitBarrierEvent::Backend, + ClientRequest::PGPipeline::SendReply::BlockingEvent::Backend, + ClientRequest::CompletionEvent::Backend +{ + void handle(ClientRequest::StartEvent&, + const Operation&) override {} + + void handle(ConnectionPipeline::AwaitActive::BlockingEvent& ev, + const Operation& op, + const ConnectionPipeline::AwaitActive& blocker) override { + } + + void handle(ConnectionPipeline::AwaitMap::BlockingEvent& ev, + const Operation& op, + const ConnectionPipeline::AwaitMap& blocker) override { + } + + void handle(OSD_OSDMapGate::OSDMapBlocker::BlockingEvent&, + const Operation&, + const OSD_OSDMapGate::OSDMapBlocker&) override { + } + + void handle(ConnectionPipeline::GetPG::BlockingEvent& ev, + const Operation& op, + const ConnectionPipeline::GetPG& blocker) override { + } + + void handle(PGMap::PGCreationBlockingEvent&, + const Operation&, + const PGMap::PGCreationBlocker&) override { + } + + void handle(ClientRequest::PGPipeline::AwaitMap::BlockingEvent& ev, + const Operation& op, + const ClientRequest::PGPipeline::AwaitMap& blocker) override { + } + + void handle(PG_OSDMapGate::OSDMapBlocker::BlockingEvent&, + const Operation&, + const 
PG_OSDMapGate::OSDMapBlocker&) override { + } + + void handle(ClientRequest::PGPipeline::WaitForActive::BlockingEvent& ev, + const Operation& op, + const ClientRequest::PGPipeline::WaitForActive& blocker) override { + } + + void handle(PGActivationBlocker::BlockingEvent& ev, + const Operation& op, + const PGActivationBlocker& blocker) override { + } + + void handle(ClientRequest::PGPipeline::RecoverMissing::BlockingEvent& ev, + const Operation& op, + const ClientRequest::PGPipeline::RecoverMissing& blocker) override { + } + + void handle(ClientRequest::PGPipeline::GetOBC::BlockingEvent& ev, + const Operation& op, + const ClientRequest::PGPipeline::GetOBC& blocker) override { + } + + void handle(ClientRequest::PGPipeline::Process::BlockingEvent& ev, + const Operation& op, + const ClientRequest::PGPipeline::Process& blocker) override { + } + + void handle(ClientRequest::PGPipeline::WaitRepop::BlockingEvent& ev, + const Operation& op, + const ClientRequest::PGPipeline::WaitRepop& blocker) override { + } + + void handle(ClientRequest::PGPipeline::WaitRepop::BlockingEvent::ExitBarrierEvent& ev, + const Operation& op) override { + } + + void handle(ClientRequest::PGPipeline::SendReply::BlockingEvent& ev, + const Operation& op, + const ClientRequest::PGPipeline::SendReply& blocker) override { + } + + static const ClientRequest& to_client_request(const Operation& op) { +#ifdef NDEBUG + return static_cast<const ClientRequest&>(op); +#else + return dynamic_cast<const ClientRequest&>(op); +#endif + } + + void handle(ClientRequest::CompletionEvent&, const Operation& op) override { + if (crimson::common::local_conf()->osd_op_history_size) { + to_client_request(op).put_historic(); + } + } +}; + +} // namespace crimson::osd + +namespace crimson { + +template <> +struct EventBackendRegistry<osd::ClientRequest> { + static std::tuple<osd::LttngBackend, osd::HistoricBackend> get_backends() { + return { {}, {} }; + } +}; + +template <> +struct EventBackendRegistry<osd::RemotePeeringEvent> { + static std::tuple<> get_backends() { + return {/* no extenral backends */}; + } +}; + +template <> +struct EventBackendRegistry<osd::LocalPeeringEvent> { + static std::tuple<> get_backends() { + return {/* no extenral backends */}; + } +}; + +template <> +struct EventBackendRegistry<osd::RepRequest> { + static std::tuple<> get_backends() { + return {/* no extenral backends */}; + } +}; + + +template <> +struct EventBackendRegistry<osd::LogMissingRequest> { + static std::tuple<> get_backends() { + return {/* no extenral backends */}; + } +}; + +template <> +struct EventBackendRegistry<osd::LogMissingRequestReply> { + static std::tuple<> get_backends() { + return {/* no extenral backends */}; + } +}; + +template <> +struct EventBackendRegistry<osd::RecoverySubRequest> { + static std::tuple<> get_backends() { + return {/* no extenral backends */}; + } +}; + +template <> +struct EventBackendRegistry<osd::BackfillRecovery> { + static std::tuple<> get_backends() { + return {}; + } +}; + +template <> +struct EventBackendRegistry<osd::PGAdvanceMap> { + static std::tuple<> get_backends() { + return {}; + } +}; + +template <> +struct EventBackendRegistry<osd::SnapTrimObjSubEvent> { + static std::tuple<> get_backends() { + return {}; + } +}; + +} // namespace crimson diff --git a/src/crimson/osd/osd_operations/background_recovery.cc b/src/crimson/osd/osd_operations/background_recovery.cc new file mode 100644 index 000000000..953ec9595 --- /dev/null +++ b/src/crimson/osd/osd_operations/background_recovery.cc @@ -0,0 +1,207 @@ +// 
-*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <seastar/core/future.hh> +#include <seastar/core/sleep.hh> + +#include "messages/MOSDOp.h" + +#include "crimson/osd/pg.h" +#include "crimson/osd/shard_services.h" +#include "common/Formatter.h" +#include "crimson/osd/osd_operation_external_tracking.h" +#include "crimson/osd/osd_operations/background_recovery.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_osd); + } +} + +namespace crimson { + template <> + struct EventBackendRegistry<osd::UrgentRecovery> { + static std::tuple<> get_backends() { + return {}; + } + }; + + template <> + struct EventBackendRegistry<osd::PglogBasedRecovery> { + static std::tuple<> get_backends() { + return {}; + } + }; +} + +namespace crimson::osd { + +template <class T> +BackgroundRecoveryT<T>::BackgroundRecoveryT( + Ref<PG> pg, + ShardServices &ss, + epoch_t epoch_started, + crimson::osd::scheduler::scheduler_class_t scheduler_class, + float delay) + : pg(pg), + epoch_started(epoch_started), + delay(delay), + ss(ss), + scheduler_class(scheduler_class) +{} + +template <class T> +void BackgroundRecoveryT<T>::print(std::ostream &lhs) const +{ + lhs << "BackgroundRecovery(" << pg->get_pgid() << ")"; +} + +template <class T> +void BackgroundRecoveryT<T>::dump_detail(Formatter *f) const +{ + f->dump_stream("pgid") << pg->get_pgid(); + f->open_object_section("recovery_detail"); + { + // TODO pg->dump_recovery_state(f); + } + f->close_section(); +} + +template <class T> +seastar::future<> BackgroundRecoveryT<T>::start() +{ + logger().debug("{}: start", *this); + + typename T::IRef ref = static_cast<T*>(this); + auto maybe_delay = seastar::now(); + if (delay) { + maybe_delay = seastar::sleep( + std::chrono::milliseconds(std::lround(delay * 1000))); + } + return maybe_delay.then([ref, this] { + return this->template with_blocking_event<OperationThrottler::BlockingEvent>( + [ref, this] (auto&& trigger) { + return ss.with_throttle_while( + std::move(trigger), + this, get_scheduler_params(), [this] { + return T::interruptor::with_interruption([this] { + return do_recovery(); + }, [](std::exception_ptr) { + return seastar::make_ready_future<bool>(false); + }, pg); + }).handle_exception_type([ref, this](const std::system_error& err) { + if (err.code() == std::make_error_code(std::errc::interrupted)) { + logger().debug("{} recovery interruped: {}", *pg, err.what()); + return seastar::now(); + } + return seastar::make_exception_future<>(err); + }); + }); + }); +} + +UrgentRecovery::UrgentRecovery( + const hobject_t& soid, + const eversion_t& need, + Ref<PG> pg, + ShardServices& ss, + epoch_t epoch_started) + : BackgroundRecoveryT{pg, ss, epoch_started, + crimson::osd::scheduler::scheduler_class_t::immediate}, + soid{soid}, need(need) +{ +} + +UrgentRecovery::interruptible_future<bool> +UrgentRecovery::do_recovery() +{ + logger().debug("{}: {}", __func__, *this); + if (!pg->has_reset_since(epoch_started)) { + return with_blocking_event<RecoveryBackend::RecoveryBlockingEvent, + interruptor>([this] (auto&& trigger) { + return pg->get_recovery_handler()->recover_missing(trigger, soid, need); + }).then_interruptible([] { + return seastar::make_ready_future<bool>(false); + }); + } + return seastar::make_ready_future<bool>(false); +} + +void UrgentRecovery::print(std::ostream &lhs) const +{ + lhs << "UrgentRecovery(" << pg->get_pgid() << ", " + << soid << ", v" << need << ", epoch_started: " + << epoch_started << ")"; +} + +void 
UrgentRecovery::dump_detail(Formatter *f) const +{ + f->dump_stream("pgid") << pg->get_pgid(); + f->open_object_section("recovery_detail"); + { + f->dump_stream("oid") << soid; + f->dump_stream("version") << need; + } + f->close_section(); +} + +PglogBasedRecovery::PglogBasedRecovery( + Ref<PG> pg, + ShardServices &ss, + const epoch_t epoch_started, + float delay) + : BackgroundRecoveryT( + std::move(pg), + ss, + epoch_started, + crimson::osd::scheduler::scheduler_class_t::background_recovery, + delay) +{} + +PglogBasedRecovery::interruptible_future<bool> +PglogBasedRecovery::do_recovery() +{ + if (pg->has_reset_since(epoch_started)) { + return seastar::make_ready_future<bool>(false); + } + return with_blocking_event<RecoveryBackend::RecoveryBlockingEvent, + interruptor>([this] (auto&& trigger) { + return pg->get_recovery_handler()->start_recovery_ops( + trigger, + crimson::common::local_conf()->osd_recovery_max_single_start); + }); +} + +PGPeeringPipeline &BackfillRecovery::peering_pp(PG &pg) +{ + return pg.peering_request_pg_pipeline; +} + +BackfillRecovery::interruptible_future<bool> +BackfillRecovery::do_recovery() +{ + logger().debug("{}", __func__); + + if (pg->has_reset_since(epoch_started)) { + logger().debug("{}: pg got reset since epoch_started={}", + __func__, epoch_started); + return seastar::make_ready_future<bool>(false); + } + // TODO: limits + return enter_stage<interruptor>( + // process_event() of our boost::statechart machine is non-reentrant. + // with the backfill_pipeline we protect it from a second entry from + // the implementation of BackfillListener. + // additionally, this stage serves to synchronize with PeeringEvent. + peering_pp(*pg).process + ).then_interruptible([this] { + pg->get_recovery_handler()->dispatch_backfill_event(std::move(evt)); + return seastar::make_ready_future<bool>(false); + }); +} + +template class BackgroundRecoveryT<UrgentRecovery>; +template class BackgroundRecoveryT<PglogBasedRecovery>; +template class BackgroundRecoveryT<BackfillRecovery>; + +} // namespace crimson::osd diff --git a/src/crimson/osd/osd_operations/background_recovery.h b/src/crimson/osd/osd_operations/background_recovery.h new file mode 100644 index 000000000..17f2cd57a --- /dev/null +++ b/src/crimson/osd/osd_operations/background_recovery.h @@ -0,0 +1,144 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <boost/statechart/event_base.hpp> + +#include "crimson/net/Connection.h" +#include "crimson/osd/osd_operation.h" +#include "crimson/osd/recovery_backend.h" +#include "crimson/common/type_helpers.h" +#include "crimson/osd/osd_operations/peering_event.h" +#include "crimson/osd/pg.h" + +namespace crimson::osd { +class PG; +class ShardServices; + +template <class T> +class BackgroundRecoveryT : public PhasedOperationT<T> { +public: + static constexpr OperationTypeCode type = OperationTypeCode::background_recovery; + + BackgroundRecoveryT( + Ref<PG> pg, + ShardServices &ss, + epoch_t epoch_started, + crimson::osd::scheduler::scheduler_class_t scheduler_class, float delay = 0); + + virtual void print(std::ostream &) const; + seastar::future<> start(); + +protected: + Ref<PG> pg; + const epoch_t epoch_started; + float delay = 0; + +private: + virtual void dump_detail(Formatter *f) const; + crimson::osd::scheduler::params_t get_scheduler_params() const { + return { + 1, // cost + 0, // owner + scheduler_class + }; + } + using do_recovery_ret_t = typename PhasedOperationT<T>::template 
interruptible_future<bool>; + virtual do_recovery_ret_t do_recovery() = 0; + ShardServices &ss; + const crimson::osd::scheduler::scheduler_class_t scheduler_class; +}; + +/// represent a recovery initiated for serving a client request +/// +/// unlike @c PglogBasedRecovery and @c BackfillRecovery, +/// @c UrgentRecovery is not throttled by the scheduler. and it +/// utilizes @c RecoveryBackend directly to recover the unreadable +/// object. +class UrgentRecovery final : public BackgroundRecoveryT<UrgentRecovery> { +public: + UrgentRecovery( + const hobject_t& soid, + const eversion_t& need, + Ref<PG> pg, + ShardServices& ss, + epoch_t epoch_started); + void print(std::ostream&) const final; + + std::tuple< + OperationThrottler::BlockingEvent, + RecoveryBackend::RecoveryBlockingEvent + > tracking_events; + +private: + void dump_detail(Formatter* f) const final; + interruptible_future<bool> do_recovery() override; + const hobject_t soid; + const eversion_t need; +}; + +class PglogBasedRecovery final : public BackgroundRecoveryT<PglogBasedRecovery> { +public: + PglogBasedRecovery( + Ref<PG> pg, + ShardServices &ss, + epoch_t epoch_started, + float delay = 0); + + std::tuple< + OperationThrottler::BlockingEvent, + RecoveryBackend::RecoveryBlockingEvent + > tracking_events; + +private: + interruptible_future<bool> do_recovery() override; +}; + +class BackfillRecovery final : public BackgroundRecoveryT<BackfillRecovery> { +public: + + template <class EventT> + BackfillRecovery( + Ref<PG> pg, + ShardServices &ss, + epoch_t epoch_started, + const EventT& evt); + + PipelineHandle& get_handle() { return handle; } + + std::tuple< + OperationThrottler::BlockingEvent, + PGPeeringPipeline::Process::BlockingEvent + > tracking_events; + +private: + boost::intrusive_ptr<const boost::statechart::event_base> evt; + PipelineHandle handle; + + static PGPeeringPipeline &peering_pp(PG &pg); + interruptible_future<bool> do_recovery() override; +}; + +template <class EventT> +BackfillRecovery::BackfillRecovery( + Ref<PG> pg, + ShardServices &ss, + const epoch_t epoch_started, + const EventT& evt) + : BackgroundRecoveryT( + std::move(pg), + ss, + epoch_started, + crimson::osd::scheduler::scheduler_class_t::background_best_effort), + evt(evt.intrusive_from_this()) +{} + +} + +#if FMT_VERSION >= 90000 +template <> struct fmt::formatter<crimson::osd::BackfillRecovery> : fmt::ostream_formatter {}; +template <> struct fmt::formatter<crimson::osd::PglogBasedRecovery> : fmt::ostream_formatter {}; +template <> struct fmt::formatter<crimson::osd::UrgentRecovery> : fmt::ostream_formatter {}; +template <class T> struct fmt::formatter<crimson::osd::BackgroundRecoveryT<T>> : fmt::ostream_formatter {}; +#endif diff --git a/src/crimson/osd/osd_operations/client_request.cc b/src/crimson/osd/osd_operations/client_request.cc new file mode 100644 index 000000000..9374fbde2 --- /dev/null +++ b/src/crimson/osd/osd_operations/client_request.cc @@ -0,0 +1,388 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab expandtab + +#include "messages/MOSDOp.h" +#include "messages/MOSDOpReply.h" + +#include "crimson/common/exception.h" +#include "crimson/osd/pg.h" +#include "crimson/osd/osd.h" +#include "common/Formatter.h" +#include "crimson/osd/osd_operation_external_tracking.h" +#include "crimson/osd/osd_operations/client_request.h" +#include "crimson/osd/osd_connection_priv.h" +#include "osd/object_state_fmt.h" + +namespace { + seastar::logger& logger() { + return 
crimson::get_logger(ceph_subsys_osd); + } +} + +namespace crimson::osd { + + +void ClientRequest::Orderer::requeue( + ShardServices &shard_services, Ref<PG> pg) +{ + for (auto &req: list) { + logger().debug("{}: {} requeueing {}", __func__, *pg, req); + req.reset_instance_handle(); + std::ignore = req.with_pg_int(shard_services, pg); + } +} + +void ClientRequest::Orderer::clear_and_cancel() +{ + for (auto i = list.begin(); i != list.end(); ) { + logger().debug( + "ClientRequest::Orderer::clear_and_cancel: {}", + *i); + i->complete_request(); + remove_request(*(i++)); + } +} + +void ClientRequest::complete_request() +{ + track_event<CompletionEvent>(); + on_complete.set_value(); +} + +ClientRequest::ClientRequest( + ShardServices &shard_services, crimson::net::ConnectionRef conn, + Ref<MOSDOp> &&m) + : put_historic_shard_services(&shard_services), + conn(std::move(conn)), + m(std::move(m)), + instance_handle(new instance_handle_t) +{} + +ClientRequest::~ClientRequest() +{ + logger().debug("{}: destroying", *this); +} + +void ClientRequest::print(std::ostream &lhs) const +{ + lhs << "m=[" << *m << "]"; +} + +void ClientRequest::dump_detail(Formatter *f) const +{ + logger().debug("{}: dumping", *this); + std::apply([f] (auto... event) { + (..., event.dump(f)); + }, tracking_events); +} + +ConnectionPipeline &ClientRequest::get_connection_pipeline() +{ + return get_osd_priv(conn.get()).client_request_conn_pipeline; +} + +ClientRequest::PGPipeline &ClientRequest::client_pp(PG &pg) +{ + return pg.request_pg_pipeline; +} + +bool ClientRequest::is_pg_op() const +{ + return std::any_of( + begin(m->ops), end(m->ops), + [](auto& op) { return ceph_osd_op_type_pg(op.op.op); }); +} + +seastar::future<> ClientRequest::with_pg_int( + ShardServices &shard_services, Ref<PG> pgref) +{ + epoch_t same_interval_since = pgref->get_interval_start_epoch(); + logger().debug("{} same_interval_since: {}", *this, same_interval_since); + if (m->finish_decode()) { + m->clear_payload(); + } + const auto this_instance_id = instance_id++; + OperationRef opref{this}; + auto instance_handle = get_instance_handle(); + auto &ihref = *instance_handle; + return interruptor::with_interruption( + [this, pgref, this_instance_id, &ihref, &shard_services]() mutable { + PG &pg = *pgref; + if (pg.can_discard_op(*m)) { + return shard_services.send_incremental_map( + std::ref(*conn), m->get_map_epoch() + ).then([this, this_instance_id, pgref] { + logger().debug("{}.{}: discarding", *this, this_instance_id); + pgref->client_request_orderer.remove_request(*this); + complete_request(); + return interruptor::now(); + }); + } + return ihref.enter_stage<interruptor>(client_pp(pg).await_map, *this + ).then_interruptible([this, this_instance_id, &pg, &ihref] { + logger().debug("{}.{}: after await_map stage", *this, this_instance_id); + return ihref.enter_blocker( + *this, pg.osdmap_gate, &decltype(pg.osdmap_gate)::wait_for_map, + m->get_min_epoch(), nullptr); + }).then_interruptible([this, this_instance_id, &pg, &ihref](auto map) { + logger().debug("{}.{}: after wait_for_map", *this, this_instance_id); + return ihref.enter_stage<interruptor>(client_pp(pg).wait_for_active, *this); + }).then_interruptible([this, this_instance_id, &pg, &ihref]() { + logger().debug( + "{}.{}: after wait_for_active stage", *this, this_instance_id); + return ihref.enter_blocker( + *this, + pg.wait_for_active_blocker, + &decltype(pg.wait_for_active_blocker)::wait); + }).then_interruptible([this, pgref, this_instance_id, &ihref]() mutable + -> interruptible_future<> { + 
logger().debug( + "{}.{}: after wait_for_active", *this, this_instance_id); + if (is_pg_op()) { + return process_pg_op(pgref); + } else { + return process_op(ihref, pgref); + } + }).then_interruptible([this, this_instance_id, pgref] { + logger().debug("{}.{}: after process*", *this, this_instance_id); + pgref->client_request_orderer.remove_request(*this); + complete_request(); + }); + }, [this, this_instance_id, pgref](std::exception_ptr eptr) { + // TODO: better debug output + logger().debug("{}.{}: interrupted {}", *this, this_instance_id, eptr); + }, pgref).finally( + [opref=std::move(opref), pgref=std::move(pgref), + instance_handle=std::move(instance_handle), &ihref] { + ihref.handle.exit(); + }); +} + +seastar::future<> ClientRequest::with_pg( + ShardServices &shard_services, Ref<PG> pgref) +{ + put_historic_shard_services = &shard_services; + pgref->client_request_orderer.add_request(*this); + auto ret = on_complete.get_future(); + std::ignore = with_pg_int( + shard_services, std::move(pgref) + ); + return ret; +} + +ClientRequest::interruptible_future<> +ClientRequest::process_pg_op( + Ref<PG> &pg) +{ + return pg->do_pg_ops( + m + ).then_interruptible([this, pg=std::move(pg)](MURef<MOSDOpReply> reply) { + return conn->send(std::move(reply)); + }); +} + +auto ClientRequest::reply_op_error(const Ref<PG>& pg, int err) +{ + logger().debug("{}: replying with error {}", *this, err); + auto reply = crimson::make_message<MOSDOpReply>( + m.get(), err, pg->get_osdmap_epoch(), + m->get_flags() & (CEPH_OSD_FLAG_ACK|CEPH_OSD_FLAG_ONDISK), + !m->has_flag(CEPH_OSD_FLAG_RETURNVEC)); + reply->set_reply_versions(eversion_t(), 0); + reply->set_op_returns(std::vector<pg_log_op_return_item_t>{}); + return conn->send(std::move(reply)); +} + +ClientRequest::interruptible_future<> +ClientRequest::process_op(instance_handle_t &ihref, Ref<PG> &pg) +{ + return ihref.enter_stage<interruptor>( + client_pp(*pg).recover_missing, + *this + ).then_interruptible( + [this, pg]() mutable { + if (pg->is_primary()) { + return do_recover_missing(pg, m->get_hobj()); + } else { + logger().debug("process_op: Skipping do_recover_missing" + "on non primary pg"); + return interruptor::now(); + } + }).then_interruptible([this, pg, &ihref]() mutable { + return pg->already_complete(m->get_reqid()).then_interruptible( + [this, pg, &ihref](auto completed) mutable + -> PG::load_obc_iertr::future<> { + if (completed) { + auto reply = crimson::make_message<MOSDOpReply>( + m.get(), completed->err, pg->get_osdmap_epoch(), + CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK, false); + reply->set_reply_versions(completed->version, completed->user_version); + return conn->send(std::move(reply)); + } else { + return ihref.enter_stage<interruptor>(client_pp(*pg).get_obc, *this + ).then_interruptible( + [this, pg, &ihref]() mutable -> PG::load_obc_iertr::future<> { + logger().debug("{}: in get_obc stage", *this); + op_info.set_from_op(&*m, *pg->get_osdmap()); + return pg->with_locked_obc( + m->get_hobj(), op_info, + [this, pg, &ihref](auto obc) mutable { + logger().debug("{}: got obc {}", *this, obc->obs); + return ihref.enter_stage<interruptor>( + client_pp(*pg).process, *this + ).then_interruptible([this, pg, obc, &ihref]() mutable { + return do_process(ihref, pg, obc); + }); + }); + }); + } + }); + }).handle_error_interruptible( + PG::load_obc_ertr::all_same_way([this, pg=std::move(pg)](const auto &code) { + logger().error("ClientRequest saw error code {}", code); + assert(code.value() > 0); + return reply_op_error(pg, -code.value()); + })); +} + 
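+// do_process() runs with the object context locked: it rejects requests the
+// PG cannot serve (pool EIO flag, over-long names, blocklisted clients,
+// stale ORDERSNAP snap contexts, misdirected or unservable replica reads)
+// and otherwise submits the ops via PG::do_osd_ops(); an eagain coming back
+// from the repop path re-enters process_op() above.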
+ClientRequest::interruptible_future<> +ClientRequest::do_process( + instance_handle_t &ihref, + Ref<PG>& pg, crimson::osd::ObjectContextRef obc) +{ + if (m->has_flag(CEPH_OSD_FLAG_PARALLELEXEC)) { + return reply_op_error(pg, -EINVAL); + } + const pg_pool_t pool = pg->get_pgpool().info; + if (pool.has_flag(pg_pool_t::FLAG_EIO)) { + // drop op on the floor; the client will handle returning EIO + if (m->has_flag(CEPH_OSD_FLAG_SUPPORTSPOOLEIO)) { + logger().debug("discarding op due to pool EIO flag"); + return seastar::now(); + } else { + logger().debug("replying EIO due to pool EIO flag"); + return reply_op_error(pg, -EIO); + } + } + if (m->get_oid().name.size() + > crimson::common::local_conf()->osd_max_object_name_len) { + return reply_op_error(pg, -ENAMETOOLONG); + } else if (m->get_hobj().get_key().size() + > crimson::common::local_conf()->osd_max_object_name_len) { + return reply_op_error(pg, -ENAMETOOLONG); + } else if (m->get_hobj().nspace.size() + > crimson::common::local_conf()->osd_max_object_namespace_len) { + return reply_op_error(pg, -ENAMETOOLONG); + } else if (m->get_hobj().oid.name.empty()) { + return reply_op_error(pg, -EINVAL); + } else if (pg->get_osdmap()->is_blocklisted(conn->get_peer_addr())) { + logger().info("{} is blocklisted", conn->get_peer_addr()); + return reply_op_error(pg, -EBLOCKLISTED); + } + + if (!obc->obs.exists && !op_info.may_write()) { + return reply_op_error(pg, -ENOENT); + } + + SnapContext snapc = get_snapc(pg,obc); + + if ((m->has_flag(CEPH_OSD_FLAG_ORDERSNAP)) && + snapc.seq < obc->ssc->snapset.seq) { + logger().debug("{} ORDERSNAP flag set and snapc seq {}", + " < snapset seq {} on {}", + __func__, snapc.seq, obc->ssc->snapset.seq, + obc->obs.oi.soid); + return reply_op_error(pg, -EOLDSNAPC); + } + + if (!pg->is_primary()) { + // primary can handle both normal ops and balanced reads + if (is_misdirected(*pg)) { + logger().trace("do_process: dropping misdirected op"); + return seastar::now(); + } else if (const hobject_t& hoid = m->get_hobj(); + !pg->get_peering_state().can_serve_replica_read(hoid)) { + logger().debug("{}: unstable write on replica, " + "bouncing to primary", + __func__); + return reply_op_error(pg, -EAGAIN); + } else { + logger().debug("{}: serving replica read on oid {}", + __func__, m->get_hobj()); + } + } + return pg->do_osd_ops(m, conn, obc, op_info, snapc).safe_then_unpack_interruptible( + [this, pg, &ihref](auto submitted, auto all_completed) mutable { + return submitted.then_interruptible([this, pg, &ihref] { + return ihref.enter_stage<interruptor>(client_pp(*pg).wait_repop, *this); + }).then_interruptible( + [this, pg, all_completed=std::move(all_completed), &ihref]() mutable { + return all_completed.safe_then_interruptible( + [this, pg, &ihref](MURef<MOSDOpReply> reply) { + return ihref.enter_stage<interruptor>(client_pp(*pg).send_reply, *this + ).then_interruptible( + [this, reply=std::move(reply)]() mutable { + logger().debug("{}: sending response", *this); + return conn->send(std::move(reply)); + }); + }, crimson::ct_error::eagain::handle([this, pg, &ihref]() mutable { + return process_op(ihref, pg); + })); + }); + }, crimson::ct_error::eagain::handle([this, pg, &ihref]() mutable { + return process_op(ihref, pg); + })); +} + +bool ClientRequest::is_misdirected(const PG& pg) const +{ + // otherwise take a closer look + if (const int flags = m->get_flags(); + flags & CEPH_OSD_FLAG_BALANCE_READS || + flags & CEPH_OSD_FLAG_LOCALIZE_READS) { + if (!op_info.may_read()) { + // no read found, so it can't be balanced read + 
return true; + } + if (op_info.may_write() || op_info.may_cache()) { + // write op, but i am not primary + return true; + } + // balanced reads; any replica will do + return false; + } + // neither balanced nor localize reads + return true; +} + +void ClientRequest::put_historic() const +{ + ceph_assert_always(put_historic_shard_services); + put_historic_shard_services->get_registry().put_historic(*this); +} + +const SnapContext ClientRequest::get_snapc( + Ref<PG>& pg, + crimson::osd::ObjectContextRef obc) const +{ + SnapContext snapc; + if (op_info.may_write() || op_info.may_cache()) { + // snap + if (pg->get_pgpool().info.is_pool_snaps_mode()) { + // use pool's snapc + snapc = pg->get_pgpool().snapc; + logger().debug("{} using pool's snapc snaps={}", + __func__, snapc.snaps); + + } else { + // client specified snapc + snapc.seq = m->get_snap_seq(); + snapc.snaps = m->get_snaps(); + logger().debug("{} client specified snapc seq={} snaps={}", + __func__, snapc.seq, snapc.snaps); + } + } + return snapc; +} + +} diff --git a/src/crimson/osd/osd_operations/client_request.h b/src/crimson/osd/osd_operations/client_request.h new file mode 100644 index 000000000..b2dce1e87 --- /dev/null +++ b/src/crimson/osd/osd_operations/client_request.h @@ -0,0 +1,281 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <seastar/core/future.hh> + +#include <boost/intrusive/list.hpp> +#include <boost/intrusive_ptr.hpp> + +#include "osd/osd_op_util.h" +#include "crimson/net/Connection.h" +#include "crimson/osd/object_context.h" +#include "crimson/osd/osdmap_gate.h" +#include "crimson/osd/osd_operation.h" +#include "crimson/osd/osd_operations/client_request_common.h" +#include "crimson/osd/osd_operations/common/pg_pipeline.h" +#include "crimson/osd/pg_activation_blocker.h" +#include "crimson/osd/pg_map.h" +#include "crimson/common/type_helpers.h" +#include "crimson/common/utility.h" +#include "messages/MOSDOp.h" + +namespace crimson::osd { +class PG; +class OSD; +class ShardServices; + +class ClientRequest final : public PhasedOperationT<ClientRequest>, + private CommonClientRequest { + // Initially set to primary core, updated to pg core after move, + // used by put_historic + ShardServices *put_historic_shard_services = nullptr; + + crimson::net::ConnectionRef conn; + // must be after conn due to ConnectionPipeline's life-time + Ref<MOSDOp> m; + OpInfo op_info; + seastar::promise<> on_complete; + unsigned instance_id = 0; + +public: + class PGPipeline : public CommonPGPipeline { + public: + struct AwaitMap : OrderedExclusivePhaseT<AwaitMap> { + static constexpr auto type_name = "ClientRequest::PGPipeline::await_map"; + } await_map; + struct WaitRepop : OrderedConcurrentPhaseT<WaitRepop> { + static constexpr auto type_name = "ClientRequest::PGPipeline::wait_repop"; + } wait_repop; + struct SendReply : OrderedExclusivePhaseT<SendReply> { + static constexpr auto type_name = "ClientRequest::PGPipeline::send_reply"; + } send_reply; + friend class ClientRequest; + friend class LttngBackend; + friend class HistoricBackend; + friend class ReqRequest; + friend class LogMissingRequest; + friend class LogMissingRequestReply; + }; + + /** + * instance_handle_t + * + * Client request is, at present, the only Operation which can be requeued. + * This is, mostly, fine. 
However, reusing the PipelineHandle or + * BlockingEvent structures before proving that the prior instance has stopped + * can create hangs or crashes due to violations of the BlockerT and + * PipelineHandle invariants. + * + * To solve this, we create an instance_handle_t which contains the events + * for the portion of execution that can be rerun as well as the + * PipelineHandle. ClientRequest::with_pg_int grabs a reference to the current + * instance_handle_t and releases its PipelineHandle in the finally block. + * On requeue, we create a new instance_handle_t with a fresh PipelineHandle + * and events tuple and use it and use it for the next invocation of + * with_pg_int. + */ + std::tuple< + StartEvent, + ConnectionPipeline::AwaitActive::BlockingEvent, + ConnectionPipeline::AwaitMap::BlockingEvent, + OSD_OSDMapGate::OSDMapBlocker::BlockingEvent, + ConnectionPipeline::GetPG::BlockingEvent, + PGMap::PGCreationBlockingEvent, + CompletionEvent + > tracking_events; + + class instance_handle_t : public boost::intrusive_ref_counter< + instance_handle_t, boost::thread_unsafe_counter> { + public: + // intrusive_ptr because seastar::lw_shared_ptr includes a cpu debug check + // that we will fail since the core on which we allocate the request may not + // be the core on which we perform with_pg_int. This is harmless, since we + // don't leave any references on the source core, so we just bypass it by using + // intrusive_ptr instead. + using ref_t = boost::intrusive_ptr<instance_handle_t>; + PipelineHandle handle; + + std::tuple< + PGPipeline::AwaitMap::BlockingEvent, + PG_OSDMapGate::OSDMapBlocker::BlockingEvent, + PGPipeline::WaitForActive::BlockingEvent, + PGActivationBlocker::BlockingEvent, + PGPipeline::RecoverMissing::BlockingEvent, + PGPipeline::GetOBC::BlockingEvent, + PGPipeline::Process::BlockingEvent, + PGPipeline::WaitRepop::BlockingEvent, + PGPipeline::SendReply::BlockingEvent, + CompletionEvent + > pg_tracking_events; + + template <typename BlockingEventT, typename InterruptorT=void, typename F> + auto with_blocking_event(F &&f, ClientRequest &op) { + auto ret = std::forward<F>(f)( + typename BlockingEventT::template Trigger<ClientRequest>{ + std::get<BlockingEventT>(pg_tracking_events), op + }); + if constexpr (std::is_same_v<InterruptorT, void>) { + return ret; + } else { + using ret_t = decltype(ret); + return typename InterruptorT::template futurize_t<ret_t>{std::move(ret)}; + } + } + + template <typename InterruptorT=void, typename StageT> + auto enter_stage(StageT &stage, ClientRequest &op) { + return this->template with_blocking_event< + typename StageT::BlockingEvent, + InterruptorT>( + [&stage, this](auto &&trigger) { + return handle.template enter<ClientRequest>( + stage, std::move(trigger)); + }, op); + } + + template < + typename InterruptorT=void, typename BlockingObj, typename Method, + typename... Args> + auto enter_blocker( + ClientRequest &op, BlockingObj &obj, Method method, Args&&... 
args) { + return this->template with_blocking_event< + typename BlockingObj::Blocker::BlockingEvent, + InterruptorT>( + [&obj, method, + args=std::forward_as_tuple(std::move(args)...)](auto &&trigger) mutable { + return apply_method_to_tuple( + obj, method, + std::tuple_cat( + std::forward_as_tuple(std::move(trigger)), + std::move(args)) + ); + }, op); + } + }; + instance_handle_t::ref_t instance_handle; + void reset_instance_handle() { + instance_handle = new instance_handle_t; + } + auto get_instance_handle() { return instance_handle; } + + using ordering_hook_t = boost::intrusive::list_member_hook<>; + ordering_hook_t ordering_hook; + class Orderer { + using list_t = boost::intrusive::list< + ClientRequest, + boost::intrusive::member_hook< + ClientRequest, + typename ClientRequest::ordering_hook_t, + &ClientRequest::ordering_hook> + >; + list_t list; + + public: + void add_request(ClientRequest &request) { + assert(!request.ordering_hook.is_linked()); + intrusive_ptr_add_ref(&request); + list.push_back(request); + } + void remove_request(ClientRequest &request) { + assert(request.ordering_hook.is_linked()); + list.erase(list_t::s_iterator_to(request)); + intrusive_ptr_release(&request); + } + void requeue(ShardServices &shard_services, Ref<PG> pg); + void clear_and_cancel(); + }; + void complete_request(); + + static constexpr OperationTypeCode type = OperationTypeCode::client_request; + + ClientRequest( + ShardServices &shard_services, + crimson::net::ConnectionRef, Ref<MOSDOp> &&m); + ~ClientRequest(); + + void print(std::ostream &) const final; + void dump_detail(Formatter *f) const final; + + static constexpr bool can_create() { return false; } + spg_t get_pgid() const { + return m->get_spg(); + } + PipelineHandle &get_handle() { return instance_handle->handle; } + epoch_t get_epoch() const { return m->get_min_epoch(); } + + ConnectionPipeline &get_connection_pipeline(); + seastar::future<crimson::net::ConnectionFRef> prepare_remote_submission() { + assert(conn); + return conn.get_foreign( + ).then([this](auto f_conn) { + conn.reset(); + return f_conn; + }); + } + void finish_remote_submission(crimson::net::ConnectionFRef _conn) { + assert(!conn); + conn = make_local_shared_foreign(std::move(_conn)); + } + + seastar::future<> with_pg_int( + ShardServices &shard_services, Ref<PG> pg); + +public: + seastar::future<> with_pg( + ShardServices &shard_services, Ref<PG> pgref); + +private: + template <typename FuncT> + interruptible_future<> with_sequencer(FuncT&& func); + auto reply_op_error(const Ref<PG>& pg, int err); + + interruptible_future<> do_process( + instance_handle_t &ihref, + Ref<PG>& pg, + crimson::osd::ObjectContextRef obc); + ::crimson::interruptible::interruptible_future< + ::crimson::osd::IOInterruptCondition> process_pg_op( + Ref<PG> &pg); + ::crimson::interruptible::interruptible_future< + ::crimson::osd::IOInterruptCondition> process_op( + instance_handle_t &ihref, + Ref<PG> &pg); + bool is_pg_op() const; + + PGPipeline &client_pp(PG &pg); + + template <typename Errorator> + using interruptible_errorator = + ::crimson::interruptible::interruptible_errorator< + ::crimson::osd::IOInterruptCondition, + Errorator>; + + bool is_misdirected(const PG& pg) const; + + const SnapContext get_snapc( + Ref<PG>& pg, + crimson::osd::ObjectContextRef obc) const; + +public: + + friend class LttngBackend; + friend class HistoricBackend; + + auto get_started() const { + return get_event<StartEvent>().get_timestamp(); + }; + + auto get_completed() const { + return 
get_event<CompletionEvent>().get_timestamp(); + }; + + void put_historic() const; +}; + +} + +#if FMT_VERSION >= 90000 +template <> struct fmt::formatter<crimson::osd::ClientRequest> : fmt::ostream_formatter {}; +#endif diff --git a/src/crimson/osd/osd_operations/client_request_common.cc b/src/crimson/osd/osd_operations/client_request_common.cc new file mode 100644 index 000000000..cfd22c774 --- /dev/null +++ b/src/crimson/osd/osd_operations/client_request_common.cc @@ -0,0 +1,64 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab expandtab + +#include "crimson/osd/osd_operations/client_request_common.h" +#include "crimson/osd/pg.h" +#include "crimson/osd/osd_operations/background_recovery.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_osd); + } +} + +namespace crimson::osd { + +typename InterruptibleOperation::template interruptible_future<> +CommonClientRequest::do_recover_missing( + Ref<PG>& pg, const hobject_t& soid) +{ + eversion_t ver; + assert(pg->is_primary()); + logger().debug("{} check for recovery, {}", __func__, soid); + if (!pg->is_unreadable_object(soid, &ver) && + !pg->is_degraded_or_backfilling_object(soid)) { + return seastar::now(); + } + logger().debug("{} need to wait for recovery, {}", __func__, soid); + if (pg->get_recovery_backend()->is_recovering(soid)) { + return pg->get_recovery_backend()->get_recovering(soid).wait_for_recovered(); + } else { + auto [op, fut] = + pg->get_shard_services().start_operation<UrgentRecovery>( + soid, ver, pg, pg->get_shard_services(), pg->get_osdmap_epoch()); + return std::move(fut); + } +} + +bool CommonClientRequest::should_abort_request( + const Operation& op, + std::exception_ptr eptr) +{ + if (*eptr.__cxa_exception_type() == + typeid(::crimson::common::actingset_changed)) { + try { + std::rethrow_exception(eptr); + } catch(::crimson::common::actingset_changed& e) { + if (e.is_primary()) { + logger().debug("{} {} operation restart, acting set changed", __func__, op); + return false; + } else { + logger().debug("{} {} operation abort, up primary changed", __func__, op); + return true; + } + } + } else { + assert(*eptr.__cxa_exception_type() == + typeid(crimson::common::system_shutdown_exception)); + crimson::get_logger(ceph_subsys_osd).debug( + "{} {} operation skipped, system shutdown", __func__, op); + return true; + } +} + +} // namespace crimson::osd diff --git a/src/crimson/osd/osd_operations/client_request_common.h b/src/crimson/osd/osd_operations/client_request_common.h new file mode 100644 index 000000000..6a8a78966 --- /dev/null +++ b/src/crimson/osd/osd_operations/client_request_common.h @@ -0,0 +1,20 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "crimson/common/operation.h" +#include "crimson/common/type_helpers.h" +#include "crimson/osd/osd_operation.h" + +namespace crimson::osd { + +struct CommonClientRequest { + static InterruptibleOperation::template interruptible_future<> + do_recover_missing(Ref<PG>& pg, const hobject_t& soid); + + static bool should_abort_request( + const crimson::Operation& op, std::exception_ptr eptr); +}; + +} // namespace crimson::osd diff --git a/src/crimson/osd/osd_operations/common/pg_pipeline.h b/src/crimson/osd/osd_operations/common/pg_pipeline.h new file mode 100644 index 000000000..58fa07b8b --- /dev/null +++ b/src/crimson/osd/osd_operations/common/pg_pipeline.h @@ -0,0 +1,31 @@ +// -*- mode:C++; tab-width:8; 
c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "osd/osd_op_util.h" +#include "crimson/osd/osd_operation.h" + +namespace crimson::osd { + +class CommonPGPipeline { +protected: + friend class InternalClientRequest; + friend class SnapTrimEvent; + friend class SnapTrimObjSubEvent; + + struct WaitForActive : OrderedExclusivePhaseT<WaitForActive> { + static constexpr auto type_name = "CommonPGPipeline:::wait_for_active"; + } wait_for_active; + struct RecoverMissing : OrderedExclusivePhaseT<RecoverMissing> { + static constexpr auto type_name = "CommonPGPipeline::recover_missing"; + } recover_missing; + struct GetOBC : OrderedExclusivePhaseT<GetOBC> { + static constexpr auto type_name = "CommonPGPipeline::get_obc"; + } get_obc; + struct Process : OrderedExclusivePhaseT<Process> { + static constexpr auto type_name = "CommonPGPipeline::process"; + } process; +}; + +} // namespace crimson::osd diff --git a/src/crimson/osd/osd_operations/internal_client_request.cc b/src/crimson/osd/osd_operations/internal_client_request.cc new file mode 100644 index 000000000..1e9b842b2 --- /dev/null +++ b/src/crimson/osd/osd_operations/internal_client_request.cc @@ -0,0 +1,130 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab expandtab + +#include <seastar/core/future.hh> + +#include "crimson/osd/osd_operations/internal_client_request.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_osd); + } +} + +namespace crimson { + template <> + struct EventBackendRegistry<osd::InternalClientRequest> { + static std::tuple<> get_backends() { + return {}; + } + }; +} + + +namespace crimson::osd { + +InternalClientRequest::InternalClientRequest(Ref<PG> pg) + : pg(std::move(pg)) +{ + assert(bool(this->pg)); + assert(this->pg->is_primary()); +} + +InternalClientRequest::~InternalClientRequest() +{ + logger().debug("{}: destroying", *this); +} + +void InternalClientRequest::print(std::ostream &) const +{ +} + +void InternalClientRequest::dump_detail(Formatter *f) const +{ +} + +CommonPGPipeline& InternalClientRequest::client_pp() +{ + return pg->request_pg_pipeline; +} + +seastar::future<> InternalClientRequest::start() +{ + track_event<StartEvent>(); + return crimson::common::handle_system_shutdown([this] { + return seastar::repeat([this] { + logger().debug("{}: in repeat", *this); + return interruptor::with_interruption([this]() mutable { + return enter_stage<interruptor>( + client_pp().wait_for_active + ).then_interruptible([this] { + return with_blocking_event<PGActivationBlocker::BlockingEvent, + interruptor>([this] (auto&& trigger) { + return pg->wait_for_active_blocker.wait(std::move(trigger)); + }); + }).then_interruptible([this] { + return enter_stage<interruptor>( + client_pp().recover_missing); + }).then_interruptible([this] { + return do_recover_missing(pg, get_target_oid()); + }).then_interruptible([this] { + return enter_stage<interruptor>( + client_pp().get_obc); + }).then_interruptible([this] () -> PG::load_obc_iertr::future<> { + logger().debug("{}: getting obc lock", *this); + return seastar::do_with(create_osd_ops(), + [this](auto& osd_ops) mutable { + logger().debug("InternalClientRequest: got {} OSDOps to execute", + std::size(osd_ops)); + [[maybe_unused]] const int ret = op_info.set_from_op( + std::as_const(osd_ops), pg->get_pgid().pgid, *pg->get_osdmap()); + assert(ret == 0); + return pg->with_locked_obc(get_target_oid(), op_info, + [&osd_ops, this](auto obc) { + 
return enter_stage<interruptor>(client_pp().process + ).then_interruptible( + [obc=std::move(obc), &osd_ops, this] { + return pg->do_osd_ops( + std::move(obc), + osd_ops, + std::as_const(op_info), + get_do_osd_ops_params(), + [] { + return PG::do_osd_ops_iertr::now(); + }, + [] (const std::error_code& e) { + return PG::do_osd_ops_iertr::now(); + } + ).safe_then_unpack_interruptible( + [](auto submitted, auto all_completed) { + return all_completed.handle_error_interruptible( + crimson::ct_error::eagain::handle([] { + return seastar::now(); + })); + }, crimson::ct_error::eagain::handle([] { + return interruptor::now(); + }) + ); + }); + }); + }); + }).handle_error_interruptible(PG::load_obc_ertr::all_same_way([] { + return seastar::now(); + })).then_interruptible([] { + return seastar::stop_iteration::yes; + }); + }, [this](std::exception_ptr eptr) { + if (should_abort_request(*this, std::move(eptr))) { + return seastar::stop_iteration::yes; + } else { + return seastar::stop_iteration::no; + } + }, pg); + }).then([this] { + track_event<CompletionEvent>(); + }); + }); +} + +} // namespace crimson::osd + diff --git a/src/crimson/osd/osd_operations/internal_client_request.h b/src/crimson/osd/osd_operations/internal_client_request.h new file mode 100644 index 000000000..8eed12e05 --- /dev/null +++ b/src/crimson/osd/osd_operations/internal_client_request.h @@ -0,0 +1,68 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "crimson/common/type_helpers.h" +#include "crimson/osd/osd_operation.h" +#include "crimson/osd/osd_operations/client_request_common.h" +#include "crimson/osd/osd_operations/common/pg_pipeline.h" +#include "crimson/osd/pg.h" +#include "crimson/osd/pg_activation_blocker.h" + +namespace crimson::osd { + +class InternalClientRequest : public PhasedOperationT<InternalClientRequest>, + private CommonClientRequest { +public: + explicit InternalClientRequest(Ref<PG> pg); + ~InternalClientRequest(); + + // imposed by `ShardService::start_operation<T>(...)`. 
+ seastar::future<> start(); + +protected: + virtual const hobject_t& get_target_oid() const = 0; + virtual PG::do_osd_ops_params_t get_do_osd_ops_params() const = 0; + virtual std::vector<OSDOp> create_osd_ops() = 0; + + const PG& get_pg() const { + return *pg; + } + +private: + friend OperationT<InternalClientRequest>; + + static constexpr OperationTypeCode type = + OperationTypeCode::internal_client_request; + + void print(std::ostream &) const final; + void dump_detail(Formatter *f) const final; + + CommonPGPipeline& client_pp(); + + seastar::future<> do_process(); + + Ref<PG> pg; + OpInfo op_info; + PipelineHandle handle; + +public: + PipelineHandle& get_handle() { return handle; } + + std::tuple< + StartEvent, + CommonPGPipeline::WaitForActive::BlockingEvent, + PGActivationBlocker::BlockingEvent, + CommonPGPipeline::RecoverMissing::BlockingEvent, + CommonPGPipeline::GetOBC::BlockingEvent, + CommonPGPipeline::Process::BlockingEvent, + CompletionEvent + > tracking_events; +}; + +} // namespace crimson::osd + +#if FMT_VERSION >= 90000 +template <> struct fmt::formatter<crimson::osd::InternalClientRequest> : fmt::ostream_formatter {}; +#endif diff --git a/src/crimson/osd/osd_operations/logmissing_request.cc b/src/crimson/osd/osd_operations/logmissing_request.cc new file mode 100644 index 000000000..739b46406 --- /dev/null +++ b/src/crimson/osd/osd_operations/logmissing_request.cc @@ -0,0 +1,79 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "logmissing_request.h" + +#include "common/Formatter.h" + +#include "crimson/osd/osd.h" +#include "crimson/osd/osd_connection_priv.h" +#include "crimson/osd/osd_operation_external_tracking.h" +#include "crimson/osd/pg.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_osd); + } +} + +namespace crimson::osd { + +LogMissingRequest::LogMissingRequest(crimson::net::ConnectionRef&& conn, + Ref<MOSDPGUpdateLogMissing> &&req) + : conn{std::move(conn)}, + req{std::move(req)} +{} + +void LogMissingRequest::print(std::ostream& os) const +{ + os << "LogMissingRequest(" + << "from=" << req->from + << " req=" << *req + << ")"; +} + +void LogMissingRequest::dump_detail(Formatter *f) const +{ + f->open_object_section("LogMissingRequest"); + f->dump_stream("req_tid") << req->get_tid(); + f->dump_stream("pgid") << req->get_spg(); + f->dump_unsigned("map_epoch", req->get_map_epoch()); + f->dump_unsigned("min_epoch", req->get_min_epoch()); + f->dump_stream("entries") << req->entries; + f->dump_stream("from") << req->from; + f->close_section(); +} + +ConnectionPipeline &LogMissingRequest::get_connection_pipeline() +{ + return get_osd_priv(conn.get()).replicated_request_conn_pipeline; +} + +ClientRequest::PGPipeline &LogMissingRequest::client_pp(PG &pg) +{ + return pg.request_pg_pipeline; +} + +seastar::future<> LogMissingRequest::with_pg( + ShardServices &shard_services, Ref<PG> pg) +{ + logger().debug("{}: LogMissingRequest::with_pg", *this); + + IRef ref = this; + return interruptor::with_interruption([this, pg] { + logger().debug("{}: pg present", *this); + return this->template enter_stage<interruptor>(client_pp(*pg).await_map + ).then_interruptible([this, pg] { + return this->template with_blocking_event< + PG_OSDMapGate::OSDMapBlocker::BlockingEvent + >([this, pg](auto &&trigger) { + return pg->osdmap_gate.wait_for_map( + std::move(trigger), req->min_epoch); + }); + }).then_interruptible([this, pg](auto) { + return pg->do_update_log_missing(req, conn); + }); + 
}, [ref](std::exception_ptr) { return seastar::now(); }, pg); +} + +} diff --git a/src/crimson/osd/osd_operations/logmissing_request.h b/src/crimson/osd/osd_operations/logmissing_request.h new file mode 100644 index 000000000..71d0816fd --- /dev/null +++ b/src/crimson/osd/osd_operations/logmissing_request.h @@ -0,0 +1,81 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "crimson/net/Connection.h" +#include "crimson/osd/osdmap_gate.h" +#include "crimson/osd/osd_operation.h" +#include "crimson/osd/osd_operations/client_request.h" +#include "crimson/osd/pg_map.h" +#include "crimson/common/type_helpers.h" +#include "messages/MOSDPGUpdateLogMissing.h" + +namespace ceph { + class Formatter; +} + +namespace crimson::osd { + +class ShardServices; + +class OSD; +class PG; + +class LogMissingRequest final : public PhasedOperationT<LogMissingRequest> { +public: + static constexpr OperationTypeCode type = OperationTypeCode::logmissing_request; + LogMissingRequest(crimson::net::ConnectionRef&&, Ref<MOSDPGUpdateLogMissing>&&); + + void print(std::ostream &) const final; + void dump_detail(ceph::Formatter* f) const final; + + static constexpr bool can_create() { return false; } + spg_t get_pgid() const { + return req->get_spg(); + } + PipelineHandle &get_handle() { return handle; } + epoch_t get_epoch() const { return req->get_min_epoch(); } + + ConnectionPipeline &get_connection_pipeline(); + seastar::future<crimson::net::ConnectionFRef> prepare_remote_submission() { + assert(conn); + return conn.get_foreign( + ).then([this](auto f_conn) { + conn.reset(); + return f_conn; + }); + } + void finish_remote_submission(crimson::net::ConnectionFRef _conn) { + assert(!conn); + conn = make_local_shared_foreign(std::move(_conn)); + } + + seastar::future<> with_pg( + ShardServices &shard_services, Ref<PG> pg); + + std::tuple< + StartEvent, + ConnectionPipeline::AwaitActive::BlockingEvent, + ConnectionPipeline::AwaitMap::BlockingEvent, + ConnectionPipeline::GetPG::BlockingEvent, + ClientRequest::PGPipeline::AwaitMap::BlockingEvent, + PG_OSDMapGate::OSDMapBlocker::BlockingEvent, + PGMap::PGCreationBlockingEvent, + OSD_OSDMapGate::OSDMapBlocker::BlockingEvent + > tracking_events; + +private: + ClientRequest::PGPipeline &client_pp(PG &pg); + + crimson::net::ConnectionRef conn; + // must be after `conn` to ensure the ConnectionPipeline's is alive + PipelineHandle handle; + Ref<MOSDPGUpdateLogMissing> req; +}; + +} + +#if FMT_VERSION >= 90000 +template <> struct fmt::formatter<crimson::osd::LogMissingRequest> : fmt::ostream_formatter {}; +#endif diff --git a/src/crimson/osd/osd_operations/logmissing_request_reply.cc b/src/crimson/osd/osd_operations/logmissing_request_reply.cc new file mode 100644 index 000000000..b4bf2938e --- /dev/null +++ b/src/crimson/osd/osd_operations/logmissing_request_reply.cc @@ -0,0 +1,68 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "logmissing_request_reply.h" + +#include "common/Formatter.h" + +#include "crimson/osd/osd.h" +#include "crimson/osd/osd_connection_priv.h" +#include "crimson/osd/osd_operation_external_tracking.h" +#include "crimson/osd/pg.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_osd); + } +} + +namespace crimson::osd { + +LogMissingRequestReply::LogMissingRequestReply( + crimson::net::ConnectionRef&& conn, + Ref<MOSDPGUpdateLogMissingReply> &&req) + : conn{std::move(conn)}, + 
req{std::move(req)} +{} + +void LogMissingRequestReply::print(std::ostream& os) const +{ + os << "LogMissingRequestReply(" + << "from=" << req->from + << " req=" << *req + << ")"; +} + +void LogMissingRequestReply::dump_detail(Formatter *f) const +{ + f->open_object_section("LogMissingRequestReply"); + f->dump_stream("rep_tid") << req->get_tid(); + f->dump_stream("pgid") << req->get_spg(); + f->dump_unsigned("map_epoch", req->get_map_epoch()); + f->dump_unsigned("min_epoch", req->get_min_epoch()); + f->dump_stream("from") << req->from; + f->close_section(); +} + +ConnectionPipeline &LogMissingRequestReply::get_connection_pipeline() +{ + return get_osd_priv(conn.get()).replicated_request_conn_pipeline; +} + +ClientRequest::PGPipeline &LogMissingRequestReply::client_pp(PG &pg) +{ + return pg.request_pg_pipeline; +} + +seastar::future<> LogMissingRequestReply::with_pg( + ShardServices &shard_services, Ref<PG> pg) +{ + logger().debug("{}: LogMissingRequestReply::with_pg", *this); + + IRef ref = this; + return interruptor::with_interruption([this, pg] { + return pg->do_update_log_missing_reply(std::move(req)); + }, [ref](std::exception_ptr) { return seastar::now(); }, pg); +} + +} diff --git a/src/crimson/osd/osd_operations/logmissing_request_reply.h b/src/crimson/osd/osd_operations/logmissing_request_reply.h new file mode 100644 index 000000000..c89131fec --- /dev/null +++ b/src/crimson/osd/osd_operations/logmissing_request_reply.h @@ -0,0 +1,79 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "crimson/net/Connection.h" +#include "crimson/osd/osdmap_gate.h" +#include "crimson/osd/osd_operation.h" +#include "crimson/osd/osd_operations/client_request.h" +#include "crimson/osd/pg_map.h" +#include "crimson/common/type_helpers.h" +#include "messages/MOSDPGUpdateLogMissingReply.h" + +namespace ceph { + class Formatter; +} + +namespace crimson::osd { + +class ShardServices; + +class OSD; +class PG; + +class LogMissingRequestReply final : public PhasedOperationT<LogMissingRequestReply> { +public: + static constexpr OperationTypeCode type = OperationTypeCode::logmissing_request_reply; + LogMissingRequestReply(crimson::net::ConnectionRef&&, Ref<MOSDPGUpdateLogMissingReply>&&); + + void print(std::ostream &) const final; + void dump_detail(ceph::Formatter* f) const final; + + static constexpr bool can_create() { return false; } + spg_t get_pgid() const { + return req->get_spg(); + } + PipelineHandle &get_handle() { return handle; } + epoch_t get_epoch() const { return req->get_min_epoch(); } + + ConnectionPipeline &get_connection_pipeline(); + seastar::future<crimson::net::ConnectionFRef> prepare_remote_submission() { + assert(conn); + return conn.get_foreign( + ).then([this](auto f_conn) { + conn.reset(); + return f_conn; + }); + } + void finish_remote_submission(crimson::net::ConnectionFRef _conn) { + assert(!conn); + conn = make_local_shared_foreign(std::move(_conn)); + } + + seastar::future<> with_pg( + ShardServices &shard_services, Ref<PG> pg); + + std::tuple< + StartEvent, + ConnectionPipeline::AwaitActive::BlockingEvent, + ConnectionPipeline::AwaitMap::BlockingEvent, + ConnectionPipeline::GetPG::BlockingEvent, + PGMap::PGCreationBlockingEvent, + OSD_OSDMapGate::OSDMapBlocker::BlockingEvent + > tracking_events; + +private: + ClientRequest::PGPipeline &client_pp(PG &pg); + + crimson::net::ConnectionRef conn; + // must be after `conn` to ensure the ConnectionPipeline's is alive + PipelineHandle handle; + 
Ref<MOSDPGUpdateLogMissingReply> req; +}; + +} + +#if FMT_VERSION >= 90000 +template <> struct fmt::formatter<crimson::osd::LogMissingRequestReply> : fmt::ostream_formatter {}; +#endif diff --git a/src/crimson/osd/osd_operations/osdop_params.h b/src/crimson/osd/osd_operations/osdop_params.h new file mode 100644 index 000000000..c7b81e765 --- /dev/null +++ b/src/crimson/osd/osd_operations/osdop_params.h @@ -0,0 +1,22 @@ +#pragma once + +#include "messages/MOSDOp.h" +#include "osd/osd_types.h" +#include "crimson/common/type_helpers.h" + +// The fields in this struct are parameters that may be needed in multiple +// level of processing. I inclosed all those parameters in this struct to +// avoid passing each of them as a method parameter. +struct osd_op_params_t { + osd_reqid_t req_id; + utime_t mtime; + eversion_t at_version; + eversion_t pg_trim_to; + eversion_t min_last_complete_ondisk; + eversion_t last_complete; + version_t user_at_version = 0; + bool user_modify = false; + ObjectCleanRegions clean_regions; + + osd_op_params_t() = default; +}; diff --git a/src/crimson/osd/osd_operations/peering_event.cc b/src/crimson/osd/osd_operations/peering_event.cc new file mode 100644 index 000000000..ea4662bd0 --- /dev/null +++ b/src/crimson/osd/osd_operations/peering_event.cc @@ -0,0 +1,190 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <seastar/core/future.hh> +#include <seastar/core/sleep.hh> + +#include "messages/MOSDPGLog.h" + +#include "common/Formatter.h" +#include "crimson/osd/pg.h" +#include "crimson/osd/osd.h" +#include "crimson/osd/osd_operation_external_tracking.h" +#include "crimson/osd/osd_operations/peering_event.h" +#include "crimson/osd/osd_connection_priv.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_osd); + } +} + +namespace crimson::osd { + +template <class T> +void PeeringEvent<T>::print(std::ostream &lhs) const +{ + lhs << "PeeringEvent(" + << "from=" << from + << " pgid=" << pgid + << " sent=" << evt.get_epoch_sent() + << " requested=" << evt.get_epoch_requested() + << " evt=" << evt.get_desc() + << ")"; +} + +template <class T> +void PeeringEvent<T>::dump_detail(Formatter *f) const +{ + f->open_object_section("PeeringEvent"); + f->dump_stream("from") << from; + f->dump_stream("pgid") << pgid; + f->dump_int("sent", evt.get_epoch_sent()); + f->dump_int("requested", evt.get_epoch_requested()); + f->dump_string("evt", evt.get_desc()); + f->open_array_section("events"); + { + std::apply([f](auto&... 
events) { + (..., events.dump(f)); + }, static_cast<const T*>(this)->tracking_events); + } + f->close_section(); + f->close_section(); +} + + +template <class T> +PGPeeringPipeline &PeeringEvent<T>::peering_pp(PG &pg) +{ + return pg.peering_request_pg_pipeline; +} + +template <class T> +seastar::future<> PeeringEvent<T>::with_pg( + ShardServices &shard_services, Ref<PG> pg) +{ + if (!pg) { + logger().warn("{}: pg absent, did not create", *this); + on_pg_absent(shard_services); + that()->get_handle().exit(); + return complete_rctx_no_pg(shard_services); + } + + using interruptor = typename T::interruptor; + return interruptor::with_interruption([this, pg, &shard_services] { + logger().debug("{}: pg present", *this); + return this->template enter_stage<interruptor>(peering_pp(*pg).await_map + ).then_interruptible([this, pg] { + return this->template with_blocking_event< + PG_OSDMapGate::OSDMapBlocker::BlockingEvent + >([this, pg](auto &&trigger) { + return pg->osdmap_gate.wait_for_map( + std::move(trigger), evt.get_epoch_sent()); + }); + }).then_interruptible([this, pg](auto) { + return this->template enter_stage<interruptor>(peering_pp(*pg).process); + }).then_interruptible([this, pg, &shard_services] { + return pg->do_peering_event(evt, ctx + ).then_interruptible([this, pg, &shard_services] { + that()->get_handle().exit(); + return complete_rctx(shard_services, pg); + }); + }).then_interruptible([pg, &shard_services]() + -> typename T::template interruptible_future<> { + if (!pg->get_need_up_thru()) { + return seastar::now(); + } + return shard_services.send_alive(pg->get_same_interval_since()); + }).then_interruptible([&shard_services] { + return shard_services.send_pg_temp(); + }); + }, [this](std::exception_ptr ep) { + logger().debug("{}: interrupted with {}", *this, ep); + }, pg); +} + +template <class T> +void PeeringEvent<T>::on_pg_absent(ShardServices &) +{ + logger().debug("{}: pg absent, dropping", *this); +} + +template <class T> +typename PeeringEvent<T>::template interruptible_future<> +PeeringEvent<T>::complete_rctx(ShardServices &shard_services, Ref<PG> pg) +{ + logger().debug("{}: submitting ctx", *this); + return shard_services.dispatch_context( + pg->get_collection_ref(), + std::move(ctx)); +} + +ConnectionPipeline &RemotePeeringEvent::get_connection_pipeline() +{ + return get_osd_priv(conn.get()).peering_request_conn_pipeline; +} + +void RemotePeeringEvent::on_pg_absent(ShardServices &shard_services) +{ + if (auto& e = get_event().get_event(); + e.dynamic_type() == MQuery::static_type()) { + const auto map_epoch = + shard_services.get_map()->get_epoch(); + const auto& q = static_cast<const MQuery&>(e); + const pg_info_t empty{spg_t{pgid.pgid, q.query.to}}; + if (q.query.type == q.query.LOG || + q.query.type == q.query.FULLLOG) { + auto m = crimson::make_message<MOSDPGLog>(q.query.from, q.query.to, + map_epoch, empty, + q.query.epoch_sent); + ctx.send_osd_message(q.from.osd, std::move(m)); + } else { + ctx.send_notify(q.from.osd, {q.query.from, q.query.to, + q.query.epoch_sent, + map_epoch, empty, + PastIntervals{}}); + } + } +} + +RemotePeeringEvent::interruptible_future<> RemotePeeringEvent::complete_rctx( + ShardServices &shard_services, + Ref<PG> pg) +{ + if (pg) { + return PeeringEvent::complete_rctx(shard_services, pg); + } else { + return shard_services.dispatch_context_messages(std::move(ctx)); + } +} + +seastar::future<> RemotePeeringEvent::complete_rctx_no_pg( + ShardServices &shard_services) +{ + return shard_services.dispatch_context_messages(std::move(ctx)); 
+} + +seastar::future<> LocalPeeringEvent::start() +{ + logger().debug("{}: start", *this); + + IRef ref = this; + auto maybe_delay = seastar::now(); + if (delay) { + maybe_delay = seastar::sleep( + std::chrono::milliseconds(std::lround(delay * 1000))); + } + return maybe_delay.then([this] { + return with_pg(pg->get_shard_services(), pg); + }).finally([ref=std::move(ref)] { + logger().debug("{}: complete", *ref); + }); +} + + +LocalPeeringEvent::~LocalPeeringEvent() {} + +template class PeeringEvent<RemotePeeringEvent>; +template class PeeringEvent<LocalPeeringEvent>; + +} diff --git a/src/crimson/osd/osd_operations/peering_event.h b/src/crimson/osd/osd_operations/peering_event.h new file mode 100644 index 000000000..e94caead1 --- /dev/null +++ b/src/crimson/osd/osd_operations/peering_event.h @@ -0,0 +1,207 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <iostream> +#include <seastar/core/future.hh> + +#include "crimson/osd/osdmap_gate.h" +#include "crimson/osd/osd_operation.h" +#include "osd/osd_types.h" +#include "osd/PGPeeringEvent.h" +#include "osd/PeeringState.h" + +namespace ceph { + class Formatter; +} + +namespace crimson::osd { + +class OSD; +class ShardServices; +class PG; +class BackfillRecovery; + + class PGPeeringPipeline { + struct AwaitMap : OrderedExclusivePhaseT<AwaitMap> { + static constexpr auto type_name = "PeeringEvent::PGPipeline::await_map"; + } await_map; + struct Process : OrderedExclusivePhaseT<Process> { + static constexpr auto type_name = "PeeringEvent::PGPipeline::process"; + } process; + template <class T> + friend class PeeringEvent; + friend class LocalPeeringEvent; + friend class RemotePeeringEvent; + friend class PGAdvanceMap; + friend class BackfillRecovery; + }; + +template <class T> +class PeeringEvent : public PhasedOperationT<T> { + T* that() { + return static_cast<T*>(this); + } + const T* that() const { + return static_cast<const T*>(this); + } + +public: + static constexpr OperationTypeCode type = OperationTypeCode::peering_event; + +protected: + PGPeeringPipeline &peering_pp(PG &pg); + + PeeringCtx ctx; + pg_shard_t from; + spg_t pgid; + float delay = 0; + PGPeeringEvent evt; + + const pg_shard_t get_from() const { + return from; + } + + const spg_t get_pgid() const { + return pgid; + } + + const PGPeeringEvent &get_event() const { + return evt; + } + + virtual void on_pg_absent(ShardServices &); + + virtual typename PeeringEvent::template interruptible_future<> + complete_rctx(ShardServices &, Ref<PG>); + + virtual seastar::future<> complete_rctx_no_pg( + ShardServices &shard_services + ) { return seastar::now();} + +public: + template <typename... Args> + PeeringEvent( + const pg_shard_t &from, const spg_t &pgid, + Args&&... args) : + from(from), + pgid(pgid), + evt(std::forward<Args>(args)...) + {} + template <typename... Args> + PeeringEvent( + const pg_shard_t &from, const spg_t &pgid, + float delay, Args&&... args) : + from(from), + pgid(pgid), + delay(delay), + evt(std::forward<Args>(args)...) 
+ {} + + void print(std::ostream &) const final; + void dump_detail(ceph::Formatter* f) const final; + seastar::future<> with_pg( + ShardServices &shard_services, Ref<PG> pg); +}; + +class RemotePeeringEvent : public PeeringEvent<RemotePeeringEvent> { +protected: + crimson::net::ConnectionRef conn; + // must be after conn due to ConnectionPipeline's life-time + PipelineHandle handle; + + void on_pg_absent(ShardServices &) final; + PeeringEvent::interruptible_future<> complete_rctx( + ShardServices &shard_services, + Ref<PG> pg) override; + seastar::future<> complete_rctx_no_pg( + ShardServices &shard_services + ) override; + +public: + class OSDPipeline { + struct AwaitActive : OrderedExclusivePhaseT<AwaitActive> { + static constexpr auto type_name = + "PeeringRequest::OSDPipeline::await_active"; + } await_active; + friend class RemotePeeringEvent; + }; + + template <typename... Args> + RemotePeeringEvent(crimson::net::ConnectionRef conn, Args&&... args) : + PeeringEvent(std::forward<Args>(args)...), + conn(conn) + {} + + std::tuple< + StartEvent, + ConnectionPipeline::AwaitActive::BlockingEvent, + ConnectionPipeline::AwaitMap::BlockingEvent, + OSD_OSDMapGate::OSDMapBlocker::BlockingEvent, + ConnectionPipeline::GetPG::BlockingEvent, + PGMap::PGCreationBlockingEvent, + PGPeeringPipeline::AwaitMap::BlockingEvent, + PG_OSDMapGate::OSDMapBlocker::BlockingEvent, + PGPeeringPipeline::Process::BlockingEvent, + OSDPipeline::AwaitActive::BlockingEvent, + CompletionEvent + > tracking_events; + + static constexpr bool can_create() { return true; } + auto get_create_info() { return std::move(evt.create_info); } + spg_t get_pgid() const { + return pgid; + } + PipelineHandle &get_handle() { return handle; } + epoch_t get_epoch() const { return evt.get_epoch_sent(); } + + ConnectionPipeline &get_connection_pipeline(); + seastar::future<crimson::net::ConnectionFRef> prepare_remote_submission() { + assert(conn); + return conn.get_foreign( + ).then([this](auto f_conn) { + conn.reset(); + return f_conn; + }); + } + void finish_remote_submission(crimson::net::ConnectionFRef _conn) { + assert(!conn); + conn = make_local_shared_foreign(std::move(_conn)); + } +}; + +class LocalPeeringEvent final : public PeeringEvent<LocalPeeringEvent> { +protected: + Ref<PG> pg; + PipelineHandle handle; + +public: + template <typename... Args> + LocalPeeringEvent(Ref<PG> pg, Args&&... 
args) : + PeeringEvent(std::forward<Args>(args)...), + pg(pg) + {} + + seastar::future<> start(); + virtual ~LocalPeeringEvent(); + + PipelineHandle &get_handle() { return handle; } + + std::tuple< + StartEvent, + PGPeeringPipeline::AwaitMap::BlockingEvent, + PG_OSDMapGate::OSDMapBlocker::BlockingEvent, + PGPeeringPipeline::Process::BlockingEvent, + CompletionEvent + > tracking_events; +}; + + +} + +#if FMT_VERSION >= 90000 +template <> struct fmt::formatter<crimson::osd::LocalPeeringEvent> : fmt::ostream_formatter {}; +template <> struct fmt::formatter<crimson::osd::RemotePeeringEvent> : fmt::ostream_formatter {}; +template <class T> struct fmt::formatter<crimson::osd::PeeringEvent<T>> : fmt::ostream_formatter {}; +#endif diff --git a/src/crimson/osd/osd_operations/pg_advance_map.cc b/src/crimson/osd/osd_operations/pg_advance_map.cc new file mode 100644 index 000000000..3706af810 --- /dev/null +++ b/src/crimson/osd/osd_operations/pg_advance_map.cc @@ -0,0 +1,130 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <seastar/core/future.hh> + +#include "include/types.h" +#include "common/Formatter.h" +#include "crimson/osd/pg.h" +#include "crimson/osd/osdmap_service.h" +#include "crimson/osd/shard_services.h" +#include "crimson/osd/osd_operations/pg_advance_map.h" +#include "crimson/osd/osd_operation_external_tracking.h" +#include "osd/PeeringState.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_osd); + } +} + +namespace crimson::osd { + +PGAdvanceMap::PGAdvanceMap( + ShardServices &shard_services, Ref<PG> pg, epoch_t to, + PeeringCtx &&rctx, bool do_init) + : shard_services(shard_services), pg(pg), to(to), + rctx(std::move(rctx)), do_init(do_init) +{ + logger().debug("{}: created", *this); +} + +PGAdvanceMap::~PGAdvanceMap() {} + +void PGAdvanceMap::print(std::ostream &lhs) const +{ + lhs << "PGAdvanceMap(" + << "pg=" << pg->get_pgid() + << " from=" << (from ? *from : -1) + << " to=" << to; + if (do_init) { + lhs << " do_init"; + } + lhs << ")"; +} + +void PGAdvanceMap::dump_detail(Formatter *f) const +{ + f->open_object_section("PGAdvanceMap"); + f->dump_stream("pgid") << pg->get_pgid(); + if (from) { + f->dump_int("from", *from); + } + f->dump_int("to", to); + f->dump_bool("do_init", do_init); + f->close_section(); +} + +PGPeeringPipeline &PGAdvanceMap::peering_pp(PG &pg) +{ + return pg.peering_request_pg_pipeline; +} + +seastar::future<> PGAdvanceMap::start() +{ + using cached_map_t = OSDMapService::cached_map_t; + + logger().debug("{}: start", *this); + + IRef ref = this; + return enter_stage<>( + peering_pp(*pg).process + ).then([this] { + /* + * PGAdvanceMap is scheduled at pg creation and when + * broadcasting new osdmaps to pgs. We are not able to serialize + * between the two different PGAdvanceMap callers since a new pg + * will get advanced to the latest osdmap at it's creation. + * As a result, we may need to adjust the PGAdvance operation + * 'from' epoch. 
+ * See: https://tracker.ceph.com/issues/61744 + */ + from = pg->get_osdmap_epoch(); + auto fut = seastar::now(); + if (do_init) { + fut = pg->handle_initialize(rctx + ).then([this] { + return pg->handle_activate_map(rctx); + }); + } + return fut.then([this] { + ceph_assert(std::cmp_less_equal(*from, to)); + return seastar::do_for_each( + boost::make_counting_iterator(*from + 1), + boost::make_counting_iterator(to + 1), + [this](epoch_t next_epoch) { + logger().debug("{}: start: getting map {}", + *this, next_epoch); + return shard_services.get_map(next_epoch).then( + [this] (cached_map_t&& next_map) { + logger().debug("{}: advancing map to {}", + *this, next_map->get_epoch()); + return pg->handle_advance_map(next_map, rctx); + }); + }).then([this] { + return pg->handle_activate_map(rctx).then([this] { + logger().debug("{}: map activated", *this); + if (do_init) { + shard_services.pg_created(pg->get_pgid(), pg); + logger().info("PGAdvanceMap::start new pg {}", *pg); + } + return seastar::when_all_succeed( + pg->get_need_up_thru() + ? shard_services.send_alive( + pg->get_same_interval_since()) + : seastar::now(), + shard_services.dispatch_context( + pg->get_collection_ref(), + std::move(rctx))); + }); + }).then_unpack([this] { + logger().debug("{}: sending pg temp", *this); + return shard_services.send_pg_temp(); + }); + }); + }).then([this, ref=std::move(ref)] { + logger().debug("{}: complete", *this); + }); +} + +} diff --git a/src/crimson/osd/osd_operations/pg_advance_map.h b/src/crimson/osd/osd_operations/pg_advance_map.h new file mode 100644 index 000000000..b712cc12e --- /dev/null +++ b/src/crimson/osd/osd_operations/pg_advance_map.h @@ -0,0 +1,61 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <iostream> +#include <seastar/core/future.hh> + +#include "crimson/osd/osd_operation.h" +#include "crimson/osd/osd_operations/peering_event.h" +#include "osd/osd_types.h" +#include "crimson/common/type_helpers.h" + +namespace ceph { + class Formatter; +} + +namespace crimson::osd { + +class ShardServices; +class PG; + +class PGAdvanceMap : public PhasedOperationT<PGAdvanceMap> { +public: + static constexpr OperationTypeCode type = OperationTypeCode::pg_advance_map; + +protected: + ShardServices &shard_services; + Ref<PG> pg; + PipelineHandle handle; + + std::optional<epoch_t> from; + epoch_t to; + + PeeringCtx rctx; + const bool do_init; + +public: + PGAdvanceMap( + ShardServices &shard_services, Ref<PG> pg, epoch_t to, + PeeringCtx &&rctx, bool do_init); + ~PGAdvanceMap(); + + void print(std::ostream &) const final; + void dump_detail(ceph::Formatter *f) const final; + seastar::future<> start(); + PipelineHandle &get_handle() { return handle; } + + std::tuple< + PGPeeringPipeline::Process::BlockingEvent + > tracking_events; + +private: + PGPeeringPipeline &peering_pp(PG &pg); +}; + +} + +#if FMT_VERSION >= 90000 +template <> struct fmt::formatter<crimson::osd::PGAdvanceMap> : fmt::ostream_formatter {}; +#endif diff --git a/src/crimson/osd/osd_operations/recovery_subrequest.cc b/src/crimson/osd/osd_operations/recovery_subrequest.cc new file mode 100644 index 000000000..68655b8da --- /dev/null +++ b/src/crimson/osd/osd_operations/recovery_subrequest.cc @@ -0,0 +1,46 @@ +#include <fmt/format.h> +#include <fmt/ostream.h> + +#include "crimson/osd/osd_operations/recovery_subrequest.h" +#include "crimson/osd/pg.h" +#include "crimson/osd/osd_connection_priv.h" + +namespace { + seastar::logger& logger() { + return 
crimson::get_logger(ceph_subsys_osd); + } +} + +namespace crimson { + template <> + struct EventBackendRegistry<osd::RecoverySubRequest> { + static std::tuple<> get_backends() { + return {}; + } + }; +} + +namespace crimson::osd { + +seastar::future<> RecoverySubRequest::with_pg( + ShardServices &shard_services, Ref<PG> pgref) +{ + logger().debug("{}: {}", "RecoverySubRequest::with_pg", *this); + + track_event<StartEvent>(); + IRef opref = this; + return interruptor::with_interruption([this, pgref] { + return pgref->get_recovery_backend()->handle_recovery_op(m, conn); + }, [](std::exception_ptr) { + return seastar::now(); + }, pgref).finally([this, opref, pgref] { + track_event<CompletionEvent>(); + }); +} + +ConnectionPipeline &RecoverySubRequest::get_connection_pipeline() +{ + return get_osd_priv(conn.get()).peering_request_conn_pipeline; +} + +} diff --git a/src/crimson/osd/osd_operations/recovery_subrequest.h b/src/crimson/osd/osd_operations/recovery_subrequest.h new file mode 100644 index 000000000..07c7c95b5 --- /dev/null +++ b/src/crimson/osd/osd_operations/recovery_subrequest.h @@ -0,0 +1,81 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "osd/osd_op_util.h" +#include "crimson/net/Connection.h" +#include "crimson/osd/osd_operation.h" +#include "crimson/osd/pg.h" +#include "crimson/common/type_helpers.h" +#include "messages/MOSDFastDispatchOp.h" + +namespace crimson::osd { + +class PG; + +class RecoverySubRequest final : public PhasedOperationT<RecoverySubRequest> { +public: + static constexpr OperationTypeCode type = + OperationTypeCode::background_recovery_sub; + + RecoverySubRequest( + crimson::net::ConnectionRef conn, + Ref<MOSDFastDispatchOp>&& m) + : conn(conn), m(m) {} + + void print(std::ostream& out) const final + { + out << *m; + } + + void dump_detail(Formatter *f) const final + { + } + + static constexpr bool can_create() { return false; } + spg_t get_pgid() const { + return m->get_spg(); + } + PipelineHandle &get_handle() { return handle; } + epoch_t get_epoch() const { return m->get_min_epoch(); } + + ConnectionPipeline &get_connection_pipeline(); + seastar::future<crimson::net::ConnectionFRef> prepare_remote_submission() { + assert(conn); + return conn.get_foreign( + ).then([this](auto f_conn) { + conn.reset(); + return f_conn; + }); + } + void finish_remote_submission(crimson::net::ConnectionFRef _conn) { + assert(!conn); + conn = make_local_shared_foreign(std::move(_conn)); + } + + seastar::future<> with_pg( + ShardServices &shard_services, Ref<PG> pg); + + std::tuple< + StartEvent, + ConnectionPipeline::AwaitActive::BlockingEvent, + ConnectionPipeline::AwaitMap::BlockingEvent, + ConnectionPipeline::GetPG::BlockingEvent, + PGMap::PGCreationBlockingEvent, + OSD_OSDMapGate::OSDMapBlocker::BlockingEvent, + CompletionEvent + > tracking_events; + +private: + crimson::net::ConnectionRef conn; + // must be after `conn` to ensure the ConnectionPipeline's is alive + PipelineHandle handle; + Ref<MOSDFastDispatchOp> m; +}; + +} + +#if FMT_VERSION >= 90000 +template <> struct fmt::formatter<crimson::osd::RecoverySubRequest> : fmt::ostream_formatter {}; +#endif diff --git a/src/crimson/osd/osd_operations/replicated_request.cc b/src/crimson/osd/osd_operations/replicated_request.cc new file mode 100644 index 000000000..09217575c --- /dev/null +++ b/src/crimson/osd/osd_operations/replicated_request.cc @@ -0,0 +1,80 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: 
ts=8 sw=2 smarttab + +#include "replicated_request.h" + +#include "common/Formatter.h" + +#include "crimson/osd/osd.h" +#include "crimson/osd/osd_connection_priv.h" +#include "crimson/osd/osd_operation_external_tracking.h" +#include "crimson/osd/pg.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_osd); + } +} + +namespace crimson::osd { + +RepRequest::RepRequest(crimson::net::ConnectionRef&& conn, + Ref<MOSDRepOp> &&req) + : conn{std::move(conn)}, + req{std::move(req)} +{} + +void RepRequest::print(std::ostream& os) const +{ + os << "RepRequest(" + << "from=" << req->from + << " req=" << *req + << ")"; +} + +void RepRequest::dump_detail(Formatter *f) const +{ + f->open_object_section("RepRequest"); + f->dump_stream("reqid") << req->reqid; + f->dump_stream("pgid") << req->get_spg(); + f->dump_unsigned("map_epoch", req->get_map_epoch()); + f->dump_unsigned("min_epoch", req->get_min_epoch()); + f->dump_stream("oid") << req->poid; + f->dump_stream("from") << req->from; + f->close_section(); +} + +ConnectionPipeline &RepRequest::get_connection_pipeline() +{ + return get_osd_priv(conn.get()).replicated_request_conn_pipeline; +} + +ClientRequest::PGPipeline &RepRequest::client_pp(PG &pg) +{ + return pg.request_pg_pipeline; +} + +seastar::future<> RepRequest::with_pg( + ShardServices &shard_services, Ref<PG> pg) +{ + logger().debug("{}: RepRequest::with_pg", *this); + IRef ref = this; + return interruptor::with_interruption([this, pg] { + logger().debug("{}: pg present", *this); + return this->template enter_stage<interruptor>(client_pp(*pg).await_map + ).then_interruptible([this, pg] { + return this->template with_blocking_event< + PG_OSDMapGate::OSDMapBlocker::BlockingEvent + >([this, pg](auto &&trigger) { + return pg->osdmap_gate.wait_for_map( + std::move(trigger), req->min_epoch); + }); + }).then_interruptible([this, pg] (auto) { + return pg->handle_rep_op(req); + }); + }, [ref](std::exception_ptr) { + return seastar::now(); + }, pg); +} + +} diff --git a/src/crimson/osd/osd_operations/replicated_request.h b/src/crimson/osd/osd_operations/replicated_request.h new file mode 100644 index 000000000..c742888d9 --- /dev/null +++ b/src/crimson/osd/osd_operations/replicated_request.h @@ -0,0 +1,80 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "crimson/net/Connection.h" +#include "crimson/osd/osdmap_gate.h" +#include "crimson/osd/osd_operation.h" +#include "crimson/osd/pg_map.h" +#include "crimson/osd/osd_operations/client_request.h" +#include "crimson/common/type_helpers.h" +#include "messages/MOSDRepOp.h" + +namespace ceph { + class Formatter; +} + +namespace crimson::osd { + +class ShardServices; + +class OSD; +class PG; + +class RepRequest final : public PhasedOperationT<RepRequest> { +public: + static constexpr OperationTypeCode type = OperationTypeCode::replicated_request; + RepRequest(crimson::net::ConnectionRef&&, Ref<MOSDRepOp>&&); + + void print(std::ostream &) const final; + void dump_detail(ceph::Formatter* f) const final; + + static constexpr bool can_create() { return false; } + spg_t get_pgid() const { + return req->get_spg(); + } + PipelineHandle &get_handle() { return handle; } + epoch_t get_epoch() const { return req->get_min_epoch(); } + + ConnectionPipeline &get_connection_pipeline(); + seastar::future<crimson::net::ConnectionFRef> prepare_remote_submission() { + assert(conn); + return conn.get_foreign( + ).then([this](auto f_conn) { + conn.reset(); + return 
f_conn; + }); + } + void finish_remote_submission(crimson::net::ConnectionFRef _conn) { + assert(!conn); + conn = make_local_shared_foreign(std::move(_conn)); + } + + seastar::future<> with_pg( + ShardServices &shard_services, Ref<PG> pg); + + std::tuple< + StartEvent, + ConnectionPipeline::AwaitActive::BlockingEvent, + ConnectionPipeline::AwaitMap::BlockingEvent, + ConnectionPipeline::GetPG::BlockingEvent, + ClientRequest::PGPipeline::AwaitMap::BlockingEvent, + PG_OSDMapGate::OSDMapBlocker::BlockingEvent, + PGMap::PGCreationBlockingEvent, + OSD_OSDMapGate::OSDMapBlocker::BlockingEvent + > tracking_events; + +private: + ClientRequest::PGPipeline &client_pp(PG &pg); + + crimson::net::ConnectionRef conn; + PipelineHandle handle; + Ref<MOSDRepOp> req; +}; + +} + +#if FMT_VERSION >= 90000 +template <> struct fmt::formatter<crimson::osd::RepRequest> : fmt::ostream_formatter {}; +#endif diff --git a/src/crimson/osd/osd_operations/snaptrim_event.cc b/src/crimson/osd/osd_operations/snaptrim_event.cc new file mode 100644 index 000000000..e4a1b04df --- /dev/null +++ b/src/crimson/osd/osd_operations/snaptrim_event.cc @@ -0,0 +1,569 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "crimson/osd/osd_operations/snaptrim_event.h" +#include "crimson/osd/ops_executer.h" +#include "crimson/osd/pg.h" +#include <seastar/core/sleep.hh> + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_osd); + } +} + +namespace crimson { + template <> + struct EventBackendRegistry<osd::SnapTrimEvent> { + static std::tuple<> get_backends() { + return {}; + } + }; + + template <> + struct EventBackendRegistry<osd::SnapTrimObjSubEvent> { + static std::tuple<> get_backends() { + return {}; + } + }; +} + +namespace crimson::osd { + +PG::interruptible_future<> +PG::SnapTrimMutex::lock(SnapTrimEvent &st_event) noexcept +{ + return st_event.enter_stage<interruptor>(wait_pg + ).then_interruptible([this] { + return mutex.lock(); + }); +} + +void SnapTrimEvent::SubOpBlocker::dump_detail(Formatter *f) const +{ + f->open_array_section("dependent_operations"); + { + for (const auto &kv : subops) { + f->dump_unsigned("op_id", kv.first); + } + } + f->close_section(); +} + +template <class... Args> +void SnapTrimEvent::SubOpBlocker::emplace_back(Args&&... 
args) +{ + subops.emplace_back(std::forward<Args>(args)...); +}; + +SnapTrimEvent::remove_or_update_iertr::future<> +SnapTrimEvent::SubOpBlocker::wait_completion() +{ + return interruptor::do_for_each(subops, [](auto&& kv) { + return std::move(kv.second); + }); +} + +void SnapTrimEvent::print(std::ostream &lhs) const +{ + lhs << "SnapTrimEvent(" + << "pgid=" << pg->get_pgid() + << " snapid=" << snapid + << " needs_pause=" << needs_pause + << ")"; +} + +void SnapTrimEvent::dump_detail(Formatter *f) const +{ + f->open_object_section("SnapTrimEvent"); + f->dump_stream("pgid") << pg->get_pgid(); + f->close_section(); +} + +SnapTrimEvent::snap_trim_ertr::future<seastar::stop_iteration> +SnapTrimEvent::start() +{ + logger().debug("{}: {}", *this, __func__); + return with_pg( + pg->get_shard_services(), pg + ).finally([ref=IRef{this}, this] { + logger().debug("{}: complete", *ref); + return handle.complete(); + }); +} + +CommonPGPipeline& SnapTrimEvent::client_pp() +{ + return pg->request_pg_pipeline; +} + +SnapTrimEvent::snap_trim_ertr::future<seastar::stop_iteration> +SnapTrimEvent::with_pg( + ShardServices &shard_services, Ref<PG> _pg) +{ + return interruptor::with_interruption([&shard_services, this] { + return enter_stage<interruptor>( + client_pp().wait_for_active + ).then_interruptible([this] { + return with_blocking_event<PGActivationBlocker::BlockingEvent, + interruptor>([this] (auto&& trigger) { + return pg->wait_for_active_blocker.wait(std::move(trigger)); + }); + }).then_interruptible([this] { + return enter_stage<interruptor>( + client_pp().recover_missing); + }).then_interruptible([] { + //return do_recover_missing(pg, get_target_oid()); + return seastar::now(); + }).then_interruptible([this] { + return enter_stage<interruptor>( + client_pp().get_obc); + }).then_interruptible([this] { + return pg->snaptrim_mutex.lock(*this); + }).then_interruptible([this] { + return enter_stage<interruptor>( + client_pp().process); + }).then_interruptible([&shard_services, this] { + return interruptor::async([this] { + std::vector<hobject_t> to_trim; + using crimson::common::local_conf; + const auto max = + local_conf().get_val<uint64_t>("osd_pg_max_concurrent_snap_trims"); + // we need to look for at least 1 snaptrim, otherwise we'll misinterpret + // the ENOENT below and erase snapid. + int r = snap_mapper.get_next_objects_to_trim( + snapid, + max, + &to_trim); + if (r == -ENOENT) { + to_trim.clear(); // paranoia + return to_trim; + } else if (r != 0) { + logger().error("{}: get_next_objects_to_trim returned {}", + *this, cpp_strerror(r)); + ceph_abort_msg("get_next_objects_to_trim returned an invalid code"); + } else { + assert(!to_trim.empty()); + } + logger().debug("{}: async almost done line {}", *this, __LINE__); + return to_trim; + }).then_interruptible([&shard_services, this] (const auto& to_trim) { + if (to_trim.empty()) { + // the legit ENOENT -> done + logger().debug("{}: to_trim is empty! 
Stopping iteration", *this); + pg->snaptrim_mutex.unlock(); + return snap_trim_iertr::make_ready_future<seastar::stop_iteration>( + seastar::stop_iteration::yes); + } + return [&shard_services, this](const auto &to_trim) { + for (const auto& object : to_trim) { + logger().debug("{}: trimming {}", *this, object); + auto [op, fut] = shard_services.start_operation_may_interrupt< + interruptor, SnapTrimObjSubEvent>( + pg, + object, + snapid); + subop_blocker.emplace_back( + op->get_id(), + std::move(fut) + ); + } + return interruptor::now(); + }(to_trim).then_interruptible([this] { + return enter_stage<interruptor>(wait_subop); + }).then_interruptible([this] { + logger().debug("{}: awaiting completion", *this); + return subop_blocker.wait_completion(); + }).finally([this] { + pg->snaptrim_mutex.unlock(); + }).safe_then_interruptible([this] { + if (!needs_pause) { + return interruptor::now(); + } + // let's know operators we're waiting + return enter_stage<interruptor>( + wait_trim_timer + ).then_interruptible([this] { + using crimson::common::local_conf; + const auto time_to_sleep = + local_conf().template get_val<double>("osd_snap_trim_sleep"); + logger().debug("{}: time_to_sleep {}", *this, time_to_sleep); + // TODO: this logic should be more sophisticated and distinguish + // between SSDs, HDDs and the hybrid case + return seastar::sleep( + std::chrono::milliseconds(std::lround(time_to_sleep * 1000))); + }); + }).safe_then_interruptible([this] { + logger().debug("{}: all completed", *this); + return snap_trim_iertr::make_ready_future<seastar::stop_iteration>( + seastar::stop_iteration::no); + }); + }); + }); + }, [this](std::exception_ptr eptr) -> snap_trim_ertr::future<seastar::stop_iteration> { + logger().debug("{}: interrupted {}", *this, eptr); + return crimson::ct_error::eagain::make(); + }, pg); +} + + +CommonPGPipeline& SnapTrimObjSubEvent::client_pp() +{ + return pg->request_pg_pipeline; +} + +SnapTrimObjSubEvent::remove_or_update_iertr::future<> +SnapTrimObjSubEvent::start() +{ + logger().debug("{}: start", *this); + return with_pg( + pg->get_shard_services(), pg + ).finally([ref=IRef{this}, this] { + logger().debug("{}: complete", *ref); + return handle.complete(); + }); +} + +SnapTrimObjSubEvent::remove_or_update_iertr::future<> +SnapTrimObjSubEvent::remove_clone( + ObjectContextRef obc, + ObjectContextRef head_obc, + ceph::os::Transaction& txn, + std::vector<pg_log_entry_t>& log_entries +) { + const auto p = std::find( + head_obc->ssc->snapset.clones.begin(), + head_obc->ssc->snapset.clones.end(), + coid.snap); + if (p == head_obc->ssc->snapset.clones.end()) { + logger().error("{}: Snap {} not in clones", + *this, coid.snap); + return crimson::ct_error::enoent::make(); + } + assert(p != head_obc->ssc->snapset.clones.end()); + snapid_t last = coid.snap; + delta_stats.num_bytes -= head_obc->ssc->snapset.get_clone_bytes(last); + + if (p != head_obc->ssc->snapset.clones.begin()) { + // not the oldest... merge overlap into next older clone + std::vector<snapid_t>::iterator n = p - 1; + hobject_t prev_coid = coid; + prev_coid.snap = *n; + + // does the classical OSD really need is_present_clone(prev_coid)? 
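+    // the byte count charged to the next older clone depends on its recorded
+    // overlap, so drop its contribution here and re-add the recomputed value
+    // once the overlap has been narrowed below.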
+ delta_stats.num_bytes -= head_obc->ssc->snapset.get_clone_bytes(*n); + head_obc->ssc->snapset.clone_overlap[*n].intersection_of( + head_obc->ssc->snapset.clone_overlap[*p]); + delta_stats.num_bytes += head_obc->ssc->snapset.get_clone_bytes(*n); + } + delta_stats.num_objects--; + if (obc->obs.oi.is_dirty()) { + delta_stats.num_objects_dirty--; + } + if (obc->obs.oi.is_omap()) { + delta_stats.num_objects_omap--; + } + if (obc->obs.oi.is_whiteout()) { + logger().debug("{}: trimming whiteout on {}", + *this, coid); + delta_stats.num_whiteouts--; + } + delta_stats.num_object_clones--; + + obc->obs.exists = false; + head_obc->ssc->snapset.clones.erase(p); + head_obc->ssc->snapset.clone_overlap.erase(last); + head_obc->ssc->snapset.clone_size.erase(last); + head_obc->ssc->snapset.clone_snaps.erase(last); + + log_entries.emplace_back( + pg_log_entry_t{ + pg_log_entry_t::DELETE, + coid, + osd_op_p.at_version, + obc->obs.oi.version, + 0, + osd_reqid_t(), + obc->obs.oi.mtime, // will be replaced in `apply_to()` + 0} + ); + txn.remove( + pg->get_collection_ref()->get_cid(), + ghobject_t{coid, ghobject_t::NO_GEN, shard_id_t::NO_SHARD}); + obc->obs.oi = object_info_t(coid); + return OpsExecuter::snap_map_remove(coid, pg->snap_mapper, pg->osdriver, txn); +} + +void SnapTrimObjSubEvent::remove_head_whiteout( + ObjectContextRef obc, + ObjectContextRef head_obc, + ceph::os::Transaction& txn, + std::vector<pg_log_entry_t>& log_entries +) { + // NOTE: this arguably constitutes minor interference with the + // tiering agent if this is a cache tier since a snap trim event + // is effectively evicting a whiteout we might otherwise want to + // keep around. + const auto head_oid = coid.get_head(); + logger().info("{}: {} removing {}", + *this, coid, head_oid); + log_entries.emplace_back( + pg_log_entry_t{ + pg_log_entry_t::DELETE, + head_oid, + osd_op_p.at_version, + head_obc->obs.oi.version, + 0, + osd_reqid_t(), + obc->obs.oi.mtime, // will be replaced in `apply_to()` + 0} + ); + logger().info("{}: remove snap head", *this); + object_info_t& oi = head_obc->obs.oi; + delta_stats.num_objects--; + if (oi.is_dirty()) { + delta_stats.num_objects_dirty--; + } + if (oi.is_omap()) { + delta_stats.num_objects_omap--; + } + if (oi.is_whiteout()) { + logger().debug("{}: trimming whiteout on {}", + *this, oi.soid); + delta_stats.num_whiteouts--; + } + head_obc->obs.exists = false; + head_obc->obs.oi = object_info_t(head_oid); + txn.remove(pg->get_collection_ref()->get_cid(), + ghobject_t{head_oid, ghobject_t::NO_GEN, shard_id_t::NO_SHARD}); +} + +SnapTrimObjSubEvent::interruptible_future<> +SnapTrimObjSubEvent::adjust_snaps( + ObjectContextRef obc, + ObjectContextRef head_obc, + const std::set<snapid_t>& new_snaps, + ceph::os::Transaction& txn, + std::vector<pg_log_entry_t>& log_entries +) { + head_obc->ssc->snapset.clone_snaps[coid.snap] = + std::vector<snapid_t>(new_snaps.rbegin(), new_snaps.rend()); + + // we still do a 'modify' event on this object just to trigger a + // snapmapper.update ... 
:( + obc->obs.oi.prior_version = obc->obs.oi.version; + obc->obs.oi.version = osd_op_p.at_version; + ceph::bufferlist bl; + encode(obc->obs.oi, + bl, + pg->get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr)); + txn.setattr( + pg->get_collection_ref()->get_cid(), + ghobject_t{coid, ghobject_t::NO_GEN, shard_id_t::NO_SHARD}, + OI_ATTR, + bl); + log_entries.emplace_back( + pg_log_entry_t{ + pg_log_entry_t::MODIFY, + coid, + obc->obs.oi.version, + obc->obs.oi.prior_version, + 0, + osd_reqid_t(), + obc->obs.oi.mtime, + 0} + ); + return OpsExecuter::snap_map_modify( + coid, new_snaps, pg->snap_mapper, pg->osdriver, txn); +} + +void SnapTrimObjSubEvent::update_head( + ObjectContextRef obc, + ObjectContextRef head_obc, + ceph::os::Transaction& txn, + std::vector<pg_log_entry_t>& log_entries +) { + const auto head_oid = coid.get_head(); + logger().info("{}: writing updated snapset on {}, snapset is {}", + *this, head_oid, head_obc->ssc->snapset); + log_entries.emplace_back( + pg_log_entry_t{ + pg_log_entry_t::MODIFY, + head_oid, + osd_op_p.at_version, + head_obc->obs.oi.version, + 0, + osd_reqid_t(), + obc->obs.oi.mtime, + 0} + ); + + head_obc->obs.oi.prior_version = head_obc->obs.oi.version; + head_obc->obs.oi.version = osd_op_p.at_version; + + std::map<std::string, ceph::bufferlist, std::less<>> attrs; + ceph::bufferlist bl; + encode(head_obc->ssc->snapset, bl); + attrs[SS_ATTR] = std::move(bl); + + bl.clear(); + head_obc->obs.oi.encode_no_oid(bl, + pg->get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr)); + attrs[OI_ATTR] = std::move(bl); + txn.setattrs( + pg->get_collection_ref()->get_cid(), + ghobject_t{head_oid, ghobject_t::NO_GEN, shard_id_t::NO_SHARD}, + attrs); +} + +SnapTrimObjSubEvent::remove_or_update_iertr::future< + SnapTrimObjSubEvent::remove_or_update_ret_t> +SnapTrimObjSubEvent::remove_or_update( + ObjectContextRef obc, + ObjectContextRef head_obc) +{ + auto citer = head_obc->ssc->snapset.clone_snaps.find(coid.snap); + if (citer == head_obc->ssc->snapset.clone_snaps.end()) { + logger().error("{}: No clone_snaps in snapset {} for object {}", + *this, head_obc->ssc->snapset, coid); + return crimson::ct_error::enoent::make(); + } + const auto& old_snaps = citer->second; + if (old_snaps.empty()) { + logger().error("{}: no object info snaps for object {}", + *this, coid); + return crimson::ct_error::enoent::make(); + } + if (head_obc->ssc->snapset.seq == 0) { + logger().error("{}: no snapset.seq for object {}", + *this, coid); + return crimson::ct_error::enoent::make(); + } + const OSDMapRef& osdmap = pg->get_osdmap(); + std::set<snapid_t> new_snaps; + for (const auto& old_snap : old_snaps) { + if (!osdmap->in_removed_snaps_queue(pg->get_info().pgid.pgid.pool(), + old_snap) + && old_snap != snap_to_trim) { + new_snaps.insert(old_snap); + } + } + + return seastar::do_with(ceph::os::Transaction{}, [=, this](auto &txn) { + std::vector<pg_log_entry_t> log_entries{}; + + int64_t num_objects_before_trim = delta_stats.num_objects; + osd_op_p.at_version = pg->next_version(); + auto ret = remove_or_update_iertr::now(); + if (new_snaps.empty()) { + // remove clone from snapset + logger().info("{}: {} snaps {} -> {} ... 
deleting", + *this, coid, old_snaps, new_snaps); + ret = remove_clone(obc, head_obc, txn, log_entries); + } else { + // save adjusted snaps for this object + logger().info("{}: {} snaps {} -> {}", + *this, coid, old_snaps, new_snaps); + ret = adjust_snaps(obc, head_obc, new_snaps, txn, log_entries); + } + return std::move(ret).safe_then_interruptible( + [&txn, obc, num_objects_before_trim, log_entries=std::move(log_entries), head_obc=std::move(head_obc), this]() mutable { + osd_op_p.at_version = pg->next_version(); + + // save head snapset + logger().debug("{}: {} new snapset {} on {}", + *this, coid, head_obc->ssc->snapset, head_obc->obs.oi); + if (head_obc->ssc->snapset.clones.empty() && head_obc->obs.oi.is_whiteout()) { + remove_head_whiteout(obc, head_obc, txn, log_entries); + } else { + update_head(obc, head_obc, txn, log_entries); + } + // Stats reporting - Set number of objects trimmed + if (num_objects_before_trim > delta_stats.num_objects) { + //int64_t num_objects_trimmed = + // num_objects_before_trim - delta_stats.num_objects; + //add_objects_trimmed_count(num_objects_trimmed); + } + }).safe_then_interruptible( + [&txn, log_entries=std::move(log_entries)] () mutable { + return remove_or_update_iertr::make_ready_future<remove_or_update_ret_t>( + std::make_pair(std::move(txn), std::move(log_entries))); + }); + }); +} + +SnapTrimObjSubEvent::remove_or_update_iertr::future<> +SnapTrimObjSubEvent::with_pg( + ShardServices &shard_services, Ref<PG> _pg) +{ + return enter_stage<interruptor>( + client_pp().wait_for_active + ).then_interruptible([this] { + return with_blocking_event<PGActivationBlocker::BlockingEvent, + interruptor>([this] (auto&& trigger) { + return pg->wait_for_active_blocker.wait(std::move(trigger)); + }); + }).then_interruptible([this] { + return enter_stage<interruptor>( + client_pp().recover_missing); + }).then_interruptible([] { + //return do_recover_missing(pg, get_target_oid()); + return seastar::now(); + }).then_interruptible([this] { + return enter_stage<interruptor>( + client_pp().get_obc); + }).then_interruptible([this] { + logger().debug("{}: getting obc for {}", *this, coid); + // end of commonality + // with_clone_obc_direct lock both clone's and head's obcs + return pg->obc_loader.with_clone_obc_direct<RWState::RWWRITE>( + coid, + [this](auto head_obc, auto clone_obc) { + logger().debug("{}: got clone_obc={}", *this, clone_obc->get_oid()); + return enter_stage<interruptor>( + client_pp().process + ).then_interruptible( + [this,clone_obc=std::move(clone_obc), head_obc=std::move(head_obc)]() mutable { + logger().debug("{}: processing clone_obc={}", *this, clone_obc->get_oid()); + return remove_or_update( + clone_obc, head_obc + ).safe_then_unpack_interruptible([clone_obc, this] + (auto&& txn, auto&& log_entries) mutable { + auto [submitted, all_completed] = pg->submit_transaction( + std::move(clone_obc), + std::move(txn), + std::move(osd_op_p), + std::move(log_entries)); + return submitted.then_interruptible( + [all_completed=std::move(all_completed), this] () mutable { + return enter_stage<interruptor>( + wait_repop + ).then_interruptible([all_completed=std::move(all_completed)] () mutable { + return std::move(all_completed); + }); + }); + }); + }); + }).handle_error_interruptible( + remove_or_update_iertr::pass_further{}, + crimson::ct_error::assert_all{"unexpected error in SnapTrimObjSubEvent"} + ); + }); +} + +void SnapTrimObjSubEvent::print(std::ostream &lhs) const +{ + lhs << "SnapTrimObjSubEvent(" + << "coid=" << coid + << " snapid=" << snap_to_trim 
+ << ")"; +} + +void SnapTrimObjSubEvent::dump_detail(Formatter *f) const +{ + f->open_object_section("SnapTrimObjSubEvent"); + f->dump_stream("coid") << coid; + f->close_section(); +} + +} // namespace crimson::osd diff --git a/src/crimson/osd/osd_operations/snaptrim_event.h b/src/crimson/osd/osd_operations/snaptrim_event.h new file mode 100644 index 000000000..a3a970a04 --- /dev/null +++ b/src/crimson/osd/osd_operations/snaptrim_event.h @@ -0,0 +1,210 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <iostream> +#include <seastar/core/future.hh> + +#include "crimson/osd/osdmap_gate.h" +#include "crimson/osd/osd_operation.h" +#include "crimson/osd/osd_operations/common/pg_pipeline.h" +#include "crimson/osd/pg.h" +#include "crimson/osd/pg_activation_blocker.h" +#include "osd/osd_types.h" +#include "osd/PGPeeringEvent.h" +#include "osd/PeeringState.h" + +namespace ceph { + class Formatter; +} + +class SnapMapper; + +namespace crimson::osd { + +class OSD; +class ShardServices; + +// trim up to `max` objects for snapshot `snapid +class SnapTrimEvent final : public PhasedOperationT<SnapTrimEvent> { +public: + using remove_or_update_ertr = + crimson::errorator<crimson::ct_error::enoent>; + using remove_or_update_iertr = + crimson::interruptible::interruptible_errorator< + IOInterruptCondition, remove_or_update_ertr>; + using snap_trim_ertr = remove_or_update_ertr::extend< + crimson::ct_error::eagain>; + using snap_trim_iertr = remove_or_update_iertr::extend< + crimson::ct_error::eagain>; + + static constexpr OperationTypeCode type = OperationTypeCode::snaptrim_event; + + SnapTrimEvent(Ref<PG> pg, + SnapMapper& snap_mapper, + const snapid_t snapid, + const bool needs_pause) + : pg(std::move(pg)), + snap_mapper(snap_mapper), + snapid(snapid), + needs_pause(needs_pause) {} + + void print(std::ostream &) const final; + void dump_detail(ceph::Formatter* f) const final; + snap_trim_ertr::future<seastar::stop_iteration> start(); + snap_trim_ertr::future<seastar::stop_iteration> with_pg( + ShardServices &shard_services, Ref<PG> pg); + +private: + CommonPGPipeline& client_pp(); + + // bases on 998cb8c141bb89aafae298a9d5e130fbd78fe5f2 + struct SubOpBlocker : crimson::BlockerT<SubOpBlocker> { + static constexpr const char* type_name = "CompoundOpBlocker"; + + using id_done_t = std::pair<crimson::Operation::id_t, + remove_or_update_iertr::future<>>; + + void dump_detail(Formatter *f) const final; + + template <class... Args> + void emplace_back(Args&&... args); + + remove_or_update_iertr::future<> wait_completion(); + private: + std::vector<id_done_t> subops; + } subop_blocker; + + // we don't need to synchronize with other instances of SnapTrimEvent; + // it's here for the sake of op tracking. + struct WaitSubop : OrderedConcurrentPhaseT<WaitSubop> { + static constexpr auto type_name = "SnapTrimEvent::wait_subop"; + } wait_subop; + + // an instantiator can instruct us to go over this stage and then + // wait for the future to implement throttling. It is implemented + // that way to for the sake of tracking ops. 
+ struct WaitTrimTimer : OrderedExclusivePhaseT<WaitTrimTimer> { + static constexpr auto type_name = "SnapTrimEvent::wait_trim_timer"; + } wait_trim_timer; + + PipelineHandle handle; + Ref<PG> pg; + SnapMapper& snap_mapper; + const snapid_t snapid; + const bool needs_pause; + +public: + PipelineHandle& get_handle() { return handle; } + + std::tuple< + StartEvent, + CommonPGPipeline::WaitForActive::BlockingEvent, + PGActivationBlocker::BlockingEvent, + CommonPGPipeline::RecoverMissing::BlockingEvent, + CommonPGPipeline::GetOBC::BlockingEvent, + CommonPGPipeline::Process::BlockingEvent, + WaitSubop::BlockingEvent, + PG::SnapTrimMutex::WaitPG::BlockingEvent, + WaitTrimTimer::BlockingEvent, + CompletionEvent + > tracking_events; + + friend class PG::SnapTrimMutex; +}; + +// remove single object. a SnapTrimEvent can create multiple subrequests. +// the division of labour is needed because of the restriction that an Op +// cannot revisite a pipeline's stage it already saw. +class SnapTrimObjSubEvent : public PhasedOperationT<SnapTrimObjSubEvent> { +public: + using remove_or_update_ertr = + crimson::errorator<crimson::ct_error::enoent>; + using remove_or_update_iertr = + crimson::interruptible::interruptible_errorator< + IOInterruptCondition, remove_or_update_ertr>; + + static constexpr OperationTypeCode type = + OperationTypeCode::snaptrimobj_subevent; + + SnapTrimObjSubEvent( + Ref<PG> pg, + const hobject_t& coid, + snapid_t snap_to_trim) + : pg(std::move(pg)), + coid(coid), + snap_to_trim(snap_to_trim) { + } + + void print(std::ostream &) const final; + void dump_detail(ceph::Formatter* f) const final; + remove_or_update_iertr::future<> start(); + remove_or_update_iertr::future<> with_pg( + ShardServices &shard_services, Ref<PG> pg); + + CommonPGPipeline& client_pp(); + +private: + object_stat_sum_t delta_stats; + + remove_or_update_iertr::future<> remove_clone( + ObjectContextRef obc, + ObjectContextRef head_obc, + ceph::os::Transaction& txn, + std::vector<pg_log_entry_t>& log_entries); + void remove_head_whiteout( + ObjectContextRef obc, + ObjectContextRef head_obc, + ceph::os::Transaction& txn, + std::vector<pg_log_entry_t>& log_entries); + interruptible_future<> adjust_snaps( + ObjectContextRef obc, + ObjectContextRef head_obc, + const std::set<snapid_t>& new_snaps, + ceph::os::Transaction& txn, + std::vector<pg_log_entry_t>& log_entries); + void update_head( + ObjectContextRef obc, + ObjectContextRef head_obc, + ceph::os::Transaction& txn, + std::vector<pg_log_entry_t>& log_entries); + + using remove_or_update_ret_t = + std::pair<ceph::os::Transaction, std::vector<pg_log_entry_t>>; + remove_or_update_iertr::future<remove_or_update_ret_t> + remove_or_update(ObjectContextRef obc, ObjectContextRef head_obc); + + // we don't need to synchronize with other instances started by + // SnapTrimEvent; it's here for the sake of op tracking. 
+ struct WaitRepop : OrderedConcurrentPhaseT<WaitRepop> { + static constexpr auto type_name = "SnapTrimObjSubEvent::wait_repop"; + } wait_repop; + + Ref<PG> pg; + PipelineHandle handle; + osd_op_params_t osd_op_p; + const hobject_t coid; + const snapid_t snap_to_trim; + +public: + PipelineHandle& get_handle() { return handle; } + + std::tuple< + StartEvent, + CommonPGPipeline::WaitForActive::BlockingEvent, + PGActivationBlocker::BlockingEvent, + CommonPGPipeline::RecoverMissing::BlockingEvent, + CommonPGPipeline::GetOBC::BlockingEvent, + CommonPGPipeline::Process::BlockingEvent, + WaitRepop::BlockingEvent, + CompletionEvent + > tracking_events; +}; + +} // namespace crimson::osd + +#if FMT_VERSION >= 90000 +template <> struct fmt::formatter<crimson::osd::SnapTrimEvent> : fmt::ostream_formatter {}; +template <> struct fmt::formatter<crimson::osd::SnapTrimObjSubEvent> : fmt::ostream_formatter {}; +#endif diff --git a/src/crimson/osd/osdmap_gate.cc b/src/crimson/osd/osdmap_gate.cc new file mode 100644 index 000000000..171ec436d --- /dev/null +++ b/src/crimson/osd/osdmap_gate.cc @@ -0,0 +1,86 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "crimson/common/exception.h" +#include "crimson/osd/osdmap_gate.h" +#include "crimson/osd/shard_services.h" +#include "common/Formatter.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_osd); + } +} + +namespace crimson::osd { + +template <OSDMapGateType OSDMapGateTypeV> +void OSDMapGate<OSDMapGateTypeV>::OSDMapBlocker::dump_detail(Formatter *f) const +{ + f->open_object_section("OSDMapGate"); + f->dump_int("epoch", epoch); + f->close_section(); +} + +template <OSDMapGateType OSDMapGateTypeV> +seastar::future<epoch_t> OSDMapGate<OSDMapGateTypeV>::wait_for_map( + typename OSDMapBlocker::BlockingEvent::TriggerI&& trigger, + epoch_t epoch, + ShardServices *shard_services) +{ + if (__builtin_expect(stopping, false)) { + return seastar::make_exception_future<epoch_t>( + crimson::common::system_shutdown_exception()); + } + if (current >= epoch) { + return seastar::make_ready_future<epoch_t>(current); + } else { + logger().info("evt epoch is {}, i have {}, will wait", epoch, current); + auto &blocker = waiting_peering.emplace( + epoch, std::make_pair(blocker_type, epoch)).first->second; + auto fut = blocker.promise.get_shared_future(); + if (shard_services) { + return trigger.maybe_record_blocking( + shard_services->osdmap_subscribe(current, true).then( + [fut=std::move(fut)]() mutable { + return std::move(fut); + }), + blocker); + } else { + return trigger.maybe_record_blocking(std::move(fut), blocker); + } + } +} + +template <OSDMapGateType OSDMapGateTypeV> +void OSDMapGate<OSDMapGateTypeV>::got_map(epoch_t epoch) { + if (epoch == 0) { + return; + } + ceph_assert(epoch > current); + current = epoch; + auto first = waiting_peering.begin(); + auto last = waiting_peering.upper_bound(epoch); + std::for_each(first, last, [epoch](auto& blocked_requests) { + blocked_requests.second.promise.set_value(epoch); + }); + waiting_peering.erase(first, last); +} + +template <OSDMapGateType OSDMapGateTypeV> +seastar::future<> OSDMapGate<OSDMapGateTypeV>::stop() { + logger().info("osdmap::stop"); + stopping = true; + auto first = waiting_peering.begin(); + auto last = waiting_peering.end(); + std::for_each(first, last, [](auto& blocked_requests) { + blocked_requests.second.promise.set_exception( + crimson::common::system_shutdown_exception()); + }); + return seastar::now(); +} 
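+
+// wait_for_map/got_map/stop are defined in this translation unit, so the
+// gate template has to be instantiated explicitly for both of its users.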
+ +template class OSDMapGate<OSDMapGateType::PG>; +template class OSDMapGate<OSDMapGateType::OSD>; + +} // namespace crimson::osd diff --git a/src/crimson/osd/osdmap_gate.h b/src/crimson/osd/osdmap_gate.h new file mode 100644 index 000000000..d76c4b82f --- /dev/null +++ b/src/crimson/osd/osdmap_gate.h @@ -0,0 +1,83 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <functional> +#include <map> +#include <optional> + +#include <seastar/core/future.hh> +#include <seastar/core/shared_future.hh> + +#include "include/types.h" +#include "crimson/osd/osd_operation.h" + +namespace ceph { + class Formatter; +} + +namespace crimson::osd { + +class ShardServices; + +enum class OSDMapGateType { + OSD, + PG, +}; + +template <OSDMapGateType OSDMapGateTypeV> +class OSDMapGate { +public: + struct OSDMapBlocker : BlockerT<OSDMapBlocker> { + const char * type_name; + epoch_t epoch; + + OSDMapBlocker(std::pair<const char *, epoch_t> args) + : type_name(args.first), epoch(args.second) {} + + OSDMapBlocker(const OSDMapBlocker &) = delete; + OSDMapBlocker(OSDMapBlocker &&) = delete; + OSDMapBlocker &operator=(const OSDMapBlocker &) = delete; + OSDMapBlocker &operator=(OSDMapBlocker &&) = delete; + + seastar::shared_promise<epoch_t> promise; + + void dump_detail(Formatter *f) const final; + }; + using Blocker = OSDMapBlocker; + +private: + // order the promises in ascending order of the waited osdmap epoch, + // so we can access all the waiters expecting a map whose epoch is less + // than or equal to a given epoch + using waiting_peering_t = std::map<epoch_t, + OSDMapBlocker>; + const char *blocker_type; + waiting_peering_t waiting_peering; + epoch_t current = 0; + bool stopping = false; +public: + OSDMapGate(const char *blocker_type) + : blocker_type(blocker_type) {} + + /** + * wait_for_map + * + * Wait for an osdmap whose epoch is greater or equal to given epoch. + * If shard_services is non-null, request map if not present. 
+ */ + seastar::future<epoch_t> + wait_for_map( + typename OSDMapBlocker::BlockingEvent::TriggerI&& trigger, + epoch_t epoch, + ShardServices *shard_services=nullptr + ); + void got_map(epoch_t epoch); + seastar::future<> stop(); +}; + +using OSD_OSDMapGate = OSDMapGate<OSDMapGateType::OSD>; +using PG_OSDMapGate = OSDMapGate<OSDMapGateType::PG>; + +} diff --git a/src/crimson/osd/osdmap_service.h b/src/crimson/osd/osdmap_service.h new file mode 100644 index 000000000..017303536 --- /dev/null +++ b/src/crimson/osd/osdmap_service.h @@ -0,0 +1,21 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "include/types.h" +#include "osd/OSDMap.h" + +class OSDMap; + +class OSDMapService { +public: + using cached_map_t = OSDMapRef; + using local_cached_map_t = LocalOSDMapRef; + + virtual ~OSDMapService() = default; + virtual seastar::future<cached_map_t> get_map(epoch_t e) = 0; + /// get the latest map + virtual cached_map_t get_map() const = 0; + virtual epoch_t get_up_epoch() const = 0; +}; diff --git a/src/crimson/osd/pg.cc b/src/crimson/osd/pg.cc new file mode 100644 index 000000000..7cf3b158c --- /dev/null +++ b/src/crimson/osd/pg.cc @@ -0,0 +1,1544 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab expandtab + +#include "pg.h" + +#include <functional> + +#include <boost/range/adaptor/filtered.hpp> +#include <boost/range/adaptor/map.hpp> +#include <boost/range/adaptor/transformed.hpp> +#include <boost/range/algorithm/copy.hpp> +#include <boost/range/algorithm/max_element.hpp> +#include <boost/range/numeric.hpp> +#include <fmt/format.h> +#include <fmt/ostream.h> + +#include "common/hobject_fmt.h" + +#include "messages/MOSDOp.h" +#include "messages/MOSDOpReply.h" +#include "messages/MOSDRepOp.h" +#include "messages/MOSDRepOpReply.h" + +#include "osd/OSDMap.h" +#include "osd/osd_types_fmt.h" + +#include "os/Transaction.h" + +#include "crimson/common/exception.h" +#include "crimson/net/Connection.h" +#include "crimson/net/Messenger.h" +#include "crimson/os/cyanstore/cyan_store.h" +#include "crimson/os/futurized_collection.h" +#include "crimson/osd/exceptions.h" +#include "crimson/osd/pg_meta.h" +#include "crimson/osd/pg_backend.h" +#include "crimson/osd/ops_executer.h" +#include "crimson/osd/osd_operations/osdop_params.h" +#include "crimson/osd/osd_operations/peering_event.h" +#include "crimson/osd/osd_operations/background_recovery.h" +#include "crimson/osd/osd_operations/snaptrim_event.h" +#include "crimson/osd/pg_recovery.h" +#include "crimson/osd/replicated_recovery_backend.h" +#include "crimson/osd/watch.h" + +using std::ostream; +using std::set; +using std::string; +using std::vector; + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_osd); + } +} + +namespace std::chrono { +std::ostream& operator<<(std::ostream& out, const signedspan& d) +{ + auto s = std::chrono::duration_cast<std::chrono::seconds>(d).count(); + auto ns = std::abs((d % 1s).count()); + fmt::print(out, "{}{}s", s, ns ? 
fmt::format(".{:0>9}", ns) : ""); + return out; +} +} + +template <typename T> +struct fmt::formatter<std::optional<T>> : fmt::formatter<T> { + template <typename FormatContext> + auto format(const std::optional<T>& v, FormatContext& ctx) const { + if (v.has_value()) { + return fmt::formatter<T>::format(*v, ctx); + } + return fmt::format_to(ctx.out(), "<null>"); + } +}; + +namespace crimson::osd { + +using crimson::common::local_conf; + +class RecoverablePredicate : public IsPGRecoverablePredicate { +public: + bool operator()(const set<pg_shard_t> &have) const override { + return !have.empty(); + } +}; + +class ReadablePredicate: public IsPGReadablePredicate { + pg_shard_t whoami; +public: + explicit ReadablePredicate(pg_shard_t whoami) : whoami(whoami) {} + bool operator()(const set<pg_shard_t> &have) const override { + return have.count(whoami); + } +}; + +PG::PG( + spg_t pgid, + pg_shard_t pg_shard, + crimson::os::CollectionRef coll_ref, + pg_pool_t&& pool, + std::string&& name, + cached_map_t osdmap, + ShardServices &shard_services, + ec_profile_t profile) + : pgid{pgid}, + pg_whoami{pg_shard}, + coll_ref{coll_ref}, + pgmeta_oid{pgid.make_pgmeta_oid()}, + osdmap_gate("PG::osdmap_gate"), + shard_services{shard_services}, + backend( + PGBackend::create( + pgid.pgid, + pg_shard, + pool, + coll_ref, + shard_services, + profile, + *this)), + recovery_backend( + std::make_unique<ReplicatedRecoveryBackend>( + *this, shard_services, coll_ref, backend.get())), + recovery_handler( + std::make_unique<PGRecovery>(this)), + peering_state( + shard_services.get_cct(), + pg_shard, + pgid, + PGPool( + osdmap, + pgid.pool(), + pool, + name), + osdmap, + this, + this), + obc_registry{ + local_conf()}, + obc_loader{ + obc_registry, + *backend.get(), + *this}, + osdriver( + &shard_services.get_store(), + coll_ref, + pgid.make_pgmeta_oid()), + snap_mapper( + this->shard_services.get_cct(), + &osdriver, + pgid.ps(), + pgid.get_split_bits(pool.get_pg_num()), + pgid.pool(), + pgid.shard), + wait_for_active_blocker(this) +{ + peering_state.set_backend_predicates( + new ReadablePredicate(pg_whoami), + new RecoverablePredicate()); + osdmap_gate.got_map(osdmap->get_epoch()); +} + +PG::~PG() {} + +void PG::check_blocklisted_watchers() +{ + logger().debug("{}", __func__); + obc_registry.for_each([this](ObjectContextRef obc) { + assert(obc); + for (const auto& [key, watch] : obc->watchers) { + assert(watch->get_pg() == this); + const auto& ea = watch->get_peer_addr(); + logger().debug("watch: Found {} cookie {}. 
Checking entity_add_t {}", + watch->get_entity(), watch->get_cookie(), ea); + if (get_osdmap()->is_blocklisted(ea)) { + logger().info("watch: Found blocklisted watcher for {}", ea); + watch->do_watch_timeout(); + } + } + }); +} + +bool PG::try_flush_or_schedule_async() { + logger().debug("PG::try_flush_or_schedule_async: flush ..."); + (void)shard_services.get_store().flush( + coll_ref + ).then( + [this, epoch=get_osdmap_epoch()]() { + return shard_services.start_operation<LocalPeeringEvent>( + this, + pg_whoami, + pgid, + epoch, + epoch, + PeeringState::IntervalFlush()); + }); + return false; +} + +void PG::publish_stats_to_osd() +{ + if (!is_primary()) + return; + if (auto new_pg_stats = peering_state.prepare_stats_for_publish( + pg_stats, + object_stat_collection_t()); + new_pg_stats.has_value()) { + pg_stats = std::move(new_pg_stats); + } +} + +void PG::clear_publish_stats() +{ + pg_stats.reset(); +} + +pg_stat_t PG::get_stats() const +{ + return pg_stats.value_or(pg_stat_t{}); +} + +void PG::queue_check_readable(epoch_t last_peering_reset, ceph::timespan delay) +{ + // handle the peering event in the background + logger().debug( + "{}: PG::queue_check_readable lpr: {}, delay: {}", + *this, last_peering_reset, delay); + check_readable_timer.cancel(); + check_readable_timer.set_callback([last_peering_reset, this] { + logger().debug( + "{}: PG::queue_check_readable callback lpr: {}", + *this, last_peering_reset); + (void) shard_services.start_operation<LocalPeeringEvent>( + this, + pg_whoami, + pgid, + last_peering_reset, + last_peering_reset, + PeeringState::CheckReadable{}); + }); + check_readable_timer.arm( + std::chrono::duration_cast<seastar::lowres_clock::duration>(delay)); +} + +void PG::recheck_readable() +{ + bool changed = false; + const auto mnow = shard_services.get_mnow(); + if (peering_state.state_test(PG_STATE_WAIT)) { + auto prior_readable_until_ub = peering_state.get_prior_readable_until_ub(); + if (mnow < prior_readable_until_ub) { + logger().info( + "{}: {} will wait (mnow {} < prior_readable_until_ub {})", + *this, __func__, mnow, prior_readable_until_ub); + queue_check_readable( + peering_state.get_last_peering_reset(), + prior_readable_until_ub - mnow); + } else { + logger().info( + "{}:{} no longer wait (mnow {} >= prior_readable_until_ub {})", + *this, __func__, mnow, prior_readable_until_ub); + peering_state.state_clear(PG_STATE_WAIT); + peering_state.clear_prior_readable_until_ub(); + changed = true; + } + } + if (peering_state.state_test(PG_STATE_LAGGY)) { + auto readable_until = peering_state.get_readable_until(); + if (readable_until == readable_until.zero()) { + logger().info( + "{}:{} still laggy (mnow {}, readable_until zero)", + *this, __func__, mnow); + } else if (mnow >= readable_until) { + logger().info( + "{}:{} still laggy (mnow {} >= readable_until {})", + *this, __func__, mnow, readable_until); + } else { + logger().info( + "{}:{} no longer laggy (mnow {} < readable_until {})", + *this, __func__, mnow, readable_until); + peering_state.state_clear(PG_STATE_LAGGY); + changed = true; + } + } + if (changed) { + publish_stats_to_osd(); + if (!peering_state.state_test(PG_STATE_WAIT) && + !peering_state.state_test(PG_STATE_LAGGY)) { + // TODO: requeue ops waiting for readable + } + } +} + +unsigned PG::get_target_pg_log_entries() const +{ + const unsigned local_num_pgs = shard_services.get_num_local_pgs(); + const unsigned local_target = + local_conf().get_val<uint64_t>("osd_target_pg_log_entries_per_osd") / + seastar::smp::count; + const unsigned 
min_pg_log_entries = + local_conf().get_val<uint64_t>("osd_min_pg_log_entries"); + if (local_num_pgs > 0 && local_target > 0) { + // target an even spread of our budgeted log entries across all + // PGs. note that while we only get to control the entry count + // for primary PGs, we'll normally be responsible for a mix of + // primary and replica PGs (for the same pool(s) even), so this + // will work out. + const unsigned max_pg_log_entries = + local_conf().get_val<uint64_t>("osd_max_pg_log_entries"); + return std::clamp(local_target / local_num_pgs, + min_pg_log_entries, + max_pg_log_entries); + } else { + // fall back to a per-pg value. + return min_pg_log_entries; + } +} + +void PG::on_removal(ceph::os::Transaction &t) { + t.register_on_commit( + new LambdaContext( + [this](int r) { + ceph_assert(r == 0); + (void)shard_services.start_operation<LocalPeeringEvent>( + this, pg_whoami, pgid, float(0.001), get_osdmap_epoch(), + get_osdmap_epoch(), PeeringState::DeleteSome()); + })); +} + +void PG::on_activate(interval_set<snapid_t> snaps) +{ + logger().debug("{}: {} snaps={}", *this, __func__, snaps); + snap_trimq = std::move(snaps); + projected_last_update = peering_state.get_info().last_update; +} + +void PG::on_activate_complete() +{ + wait_for_active_blocker.unblock(); + + if (peering_state.needs_recovery()) { + logger().info("{}: requesting recovery", + __func__); + (void) shard_services.start_operation<LocalPeeringEvent>( + this, + pg_whoami, + pgid, + float(0.001), + get_osdmap_epoch(), + get_osdmap_epoch(), + PeeringState::DoRecovery{}); + } else if (peering_state.needs_backfill()) { + logger().info("{}: requesting backfill", + __func__); + (void) shard_services.start_operation<LocalPeeringEvent>( + this, + pg_whoami, + pgid, + float(0.001), + get_osdmap_epoch(), + get_osdmap_epoch(), + PeeringState::RequestBackfill{}); + } else { + logger().debug("{}: no need to recover or backfill, AllReplicasRecovered", + " for pg: {}", __func__, pgid); + (void) shard_services.start_operation<LocalPeeringEvent>( + this, + pg_whoami, + pgid, + float(0.001), + get_osdmap_epoch(), + get_osdmap_epoch(), + PeeringState::AllReplicasRecovered{}); + } + publish_stats_to_osd(); +} + +void PG::prepare_write(pg_info_t &info, + pg_info_t &last_written_info, + PastIntervals &past_intervals, + PGLog &pglog, + bool dirty_info, + bool dirty_big_info, + bool need_write_epoch, + ceph::os::Transaction &t) +{ + std::map<string,bufferlist> km; + std::string key_to_remove; + if (dirty_big_info || dirty_info) { + int ret = prepare_info_keymap( + shard_services.get_cct(), + &km, + &key_to_remove, + get_osdmap_epoch(), + info, + last_written_info, + past_intervals, + dirty_big_info, + need_write_epoch, + true, + nullptr, + this); + ceph_assert(ret == 0); + } + pglog.write_log_and_missing( + t, &km, coll_ref->get_cid(), pgmeta_oid, + peering_state.get_pgpool().info.require_rollback()); + if (!km.empty()) { + t.omap_setkeys(coll_ref->get_cid(), pgmeta_oid, km); + } + if (!key_to_remove.empty()) { + t.omap_rmkey(coll_ref->get_cid(), pgmeta_oid, key_to_remove); + } +} + +std::pair<ghobject_t, bool> +PG::do_delete_work(ceph::os::Transaction &t, ghobject_t _next) +{ + logger().info("removing pg {}", pgid); + auto fut = interruptor::make_interruptible( + shard_services.get_store().list_objects( + coll_ref, + _next, + ghobject_t::get_max(), + local_conf()->osd_target_transaction_size)); + + auto [objs_to_rm, next] = fut.get(); + if (objs_to_rm.empty()) { + logger().info("all objs removed, removing coll for {}", pgid); + 
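+    // nothing but the pgmeta object is left: drop it along with the
+    // collection itself, then deregister the PG once the transaction applies.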
t.remove(coll_ref->get_cid(), pgmeta_oid); + t.remove_collection(coll_ref->get_cid()); + (void) shard_services.get_store().do_transaction( + coll_ref, std::move(t)).then([this] { + return shard_services.remove_pg(pgid); + }); + return {next, false}; + } else { + for (auto &obj : objs_to_rm) { + if (obj == pgmeta_oid) { + continue; + } + logger().trace("pg {}, removing obj {}", pgid, obj); + t.remove(coll_ref->get_cid(), obj); + } + t.register_on_commit( + new LambdaContext([this](int r) { + ceph_assert(r == 0); + logger().trace("triggering more pg delete {}", pgid); + (void) shard_services.start_operation<LocalPeeringEvent>( + this, + pg_whoami, + pgid, + float(0.001), + get_osdmap_epoch(), + get_osdmap_epoch(), + PeeringState::DeleteSome{}); + })); + return {next, true}; + } +} + +Context *PG::on_clean() +{ + // Not needed yet (will be needed for IO unblocking) + return nullptr; +} + +void PG::on_active_actmap() +{ + logger().debug("{}: {} snap_trimq={}", *this, __func__, snap_trimq); + peering_state.state_clear(PG_STATE_SNAPTRIM_ERROR); + // loops until snap_trimq is empty or SNAPTRIM_ERROR. + std::ignore = seastar::do_until( + [this] { return snap_trimq.empty() + || peering_state.state_test(PG_STATE_SNAPTRIM_ERROR); + }, + [this] { + peering_state.state_set(PG_STATE_SNAPTRIM); + publish_stats_to_osd(); + const auto to_trim = snap_trimq.range_start(); + snap_trimq.erase(to_trim); + const auto needs_pause = !snap_trimq.empty(); + return seastar::repeat([to_trim, needs_pause, this] { + logger().debug("{}: going to start SnapTrimEvent, to_trim={}", + *this, to_trim); + return shard_services.start_operation<SnapTrimEvent>( + this, + snap_mapper, + to_trim, + needs_pause + ).second.handle_error( + crimson::ct_error::enoent::handle([this] { + logger().error("{}: ENOENT saw, trimming stopped", *this); + peering_state.state_set(PG_STATE_SNAPTRIM_ERROR); + publish_stats_to_osd(); + return seastar::make_ready_future<seastar::stop_iteration>( + seastar::stop_iteration::yes); + }), crimson::ct_error::eagain::handle([this] { + logger().info("{}: EAGAIN saw, trimming restarted", *this); + return seastar::make_ready_future<seastar::stop_iteration>( + seastar::stop_iteration::no); + }) + ); + }).then([this, trimmed=to_trim] { + logger().debug("{}: trimmed snap={}", *this, trimmed); + }); + }).finally([this] { + logger().debug("{}: PG::on_active_actmap() finished trimming", + *this); + peering_state.state_clear(PG_STATE_SNAPTRIM); + peering_state.state_clear(PG_STATE_SNAPTRIM_ERROR); + publish_stats_to_osd(); + }); +} + +void PG::on_active_advmap(const OSDMapRef &osdmap) +{ + const auto new_removed_snaps = osdmap->get_new_removed_snaps(); + if (auto it = new_removed_snaps.find(get_pgid().pool()); + it != new_removed_snaps.end()) { + bool bad = false; + for (auto j : it->second) { + if (snap_trimq.intersects(j.first, j.second)) { + decltype(snap_trimq) added, overlap; + added.insert(j.first, j.second); + overlap.intersection_of(snap_trimq, added); + logger().error("{}: {} removed_snaps already contains {}", + *this, __func__, overlap); + bad = true; + snap_trimq.union_of(added); + } else { + snap_trimq.insert(j.first, j.second); + } + } + logger().info("{}: {} new removed snaps {}, snap_trimq now{}", + *this, __func__, it->second, snap_trimq); + assert(!bad || !local_conf().get_val<bool>("osd_debug_verify_cached_snaps")); + } +} + +void PG::scrub_requested(scrub_level_t scrub_level, scrub_type_t scrub_type) +{ + // TODO: should update the stats upon finishing the scrub + 
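+  // for now only refresh the scrub stamps in the PG history and ask for the
+  // updated stats to be published.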
peering_state.update_stats([scrub_level, this](auto& history, auto& stats) { + const utime_t now = ceph_clock_now(); + history.last_scrub = peering_state.get_info().last_update; + history.last_scrub_stamp = now; + history.last_clean_scrub_stamp = now; + if (scrub_level == scrub_level_t::deep) { + history.last_deep_scrub = history.last_scrub; + history.last_deep_scrub_stamp = now; + } + // yes, please publish the stats + return true; + }); +} + +void PG::log_state_enter(const char *state) { + logger().info("Entering state: {}", state); +} + +void PG::log_state_exit( + const char *state_name, utime_t enter_time, + uint64_t events, utime_t event_dur) { + logger().info( + "Exiting state: {}, entered at {}, {} spent on {} events", + state_name, + enter_time, + event_dur, + events); +} + +ceph::signedspan PG::get_mnow() const +{ + return shard_services.get_mnow(); +} + +HeartbeatStampsRef PG::get_hb_stamps(int peer) +{ + return shard_services.get_hb_stamps(peer); +} + +void PG::schedule_renew_lease(epoch_t last_peering_reset, ceph::timespan delay) +{ + // handle the peering event in the background + renew_lease_timer.cancel(); + renew_lease_timer.set_callback([last_peering_reset, this] { + (void) shard_services.start_operation<LocalPeeringEvent>( + this, + pg_whoami, + pgid, + last_peering_reset, + last_peering_reset, + RenewLease{}); + }); + renew_lease_timer.arm( + std::chrono::duration_cast<seastar::lowres_clock::duration>(delay)); +} + + +void PG::init( + int role, + const vector<int>& newup, int new_up_primary, + const vector<int>& newacting, int new_acting_primary, + const pg_history_t& history, + const PastIntervals& pi, + ObjectStore::Transaction &t) +{ + peering_state.init( + role, newup, new_up_primary, newacting, + new_acting_primary, history, pi, t); +} + +seastar::future<> PG::read_state(crimson::os::FuturizedStore::Shard* store) +{ + if (__builtin_expect(stopping, false)) { + return seastar::make_exception_future<>( + crimson::common::system_shutdown_exception()); + } + + return seastar::do_with(PGMeta(*store, pgid), [] (auto& pg_meta) { + return pg_meta.load(); + }).then([this, store](auto&& ret) { + auto [pg_info, past_intervals] = std::move(ret); + return peering_state.init_from_disk_state( + std::move(pg_info), + std::move(past_intervals), + [this, store] (PGLog &pglog) { + return pglog.read_log_and_missing_crimson( + *store, + coll_ref, + peering_state.get_info(), + pgmeta_oid); + }); + }).then([this]() { + int primary, up_primary; + vector<int> acting, up; + peering_state.get_osdmap()->pg_to_up_acting_osds( + pgid.pgid, &up, &up_primary, &acting, &primary); + peering_state.init_primary_up_acting( + up, + acting, + up_primary, + primary); + int rr = OSDMap::calc_pg_role(pg_whoami, acting); + peering_state.set_role(rr); + + epoch_t epoch = get_osdmap_epoch(); + (void) shard_services.start_operation<LocalPeeringEvent>( + this, + pg_whoami, + pgid, + epoch, + epoch, + PeeringState::Initialize()); + + return seastar::now(); + }); +} + +PG::interruptible_future<> PG::do_peering_event( + PGPeeringEvent& evt, PeeringCtx &rctx) +{ + if (peering_state.pg_has_reset_since(evt.get_epoch_requested()) || + peering_state.pg_has_reset_since(evt.get_epoch_sent())) { + logger().debug("{} ignoring {} -- pg has reset", __func__, evt.get_desc()); + return interruptor::now(); + } else { + logger().debug("{} handling {} for pg: {}", __func__, evt.get_desc(), pgid); + // all peering event handling needs to be run in a dedicated seastar::thread, + // so that event processing can involve I/O reqs 
freely, for example: PG::on_removal, + // PG::on_new_interval + return interruptor::async([this, &evt, &rctx] { + peering_state.handle_event( + evt.get_event(), + &rctx); + peering_state.write_if_dirty(rctx.transaction); + }); + } +} + +seastar::future<> PG::handle_advance_map( + cached_map_t next_map, PeeringCtx &rctx) +{ + return seastar::async([this, next_map=std::move(next_map), &rctx] { + vector<int> newup, newacting; + int up_primary, acting_primary; + next_map->pg_to_up_acting_osds( + pgid.pgid, + &newup, &up_primary, + &newacting, &acting_primary); + peering_state.advance_map( + next_map, + peering_state.get_osdmap(), + newup, + up_primary, + newacting, + acting_primary, + rctx); + osdmap_gate.got_map(next_map->get_epoch()); + }); +} + +seastar::future<> PG::handle_activate_map(PeeringCtx &rctx) +{ + return seastar::async([this, &rctx] { + peering_state.activate_map(rctx); + }); +} + +seastar::future<> PG::handle_initialize(PeeringCtx &rctx) +{ + return seastar::async([this, &rctx] { + peering_state.handle_event(PeeringState::Initialize{}, &rctx); + }); +} + + +void PG::print(ostream& out) const +{ + out << peering_state << " "; +} + +void PG::dump_primary(Formatter* f) +{ + peering_state.dump_peering_state(f); + + f->open_array_section("recovery_state"); + PeeringState::QueryState q(f); + peering_state.handle_event(q, 0); + f->close_section(); + + // TODO: snap_trimq + // TODO: scrubber state + // TODO: agent state +} + +std::ostream& operator<<(std::ostream& os, const PG& pg) +{ + os << " pg_epoch " << pg.get_osdmap_epoch() << " "; + pg.print(os); + return os; +} + +std::tuple<PG::interruptible_future<>, + PG::interruptible_future<>> +PG::submit_transaction( + ObjectContextRef&& obc, + ceph::os::Transaction&& txn, + osd_op_params_t&& osd_op_p, + std::vector<pg_log_entry_t>&& log_entries) +{ + if (__builtin_expect(stopping, false)) { + return {seastar::make_exception_future<>( + crimson::common::system_shutdown_exception()), + seastar::now()}; + } + + epoch_t map_epoch = get_osdmap_epoch(); + ceph_assert(!has_reset_since(osd_op_p.at_version.epoch)); + + peering_state.pre_submit_op(obc->obs.oi.soid, log_entries, osd_op_p.at_version); + peering_state.append_log_with_trim_to_updated(std::move(log_entries), osd_op_p.at_version, + txn, true, false); + + auto [submitted, all_completed] = backend->mutate_object( + peering_state.get_acting_recovery_backfill(), + std::move(obc), + std::move(txn), + std::move(osd_op_p), + peering_state.get_last_peering_reset(), + map_epoch, + std::move(log_entries)); + return std::make_tuple(std::move(submitted), all_completed.then_interruptible( + [this, last_complete=peering_state.get_info().last_complete, + at_version=osd_op_p.at_version](auto acked) { + for (const auto& peer : acked) { + peering_state.update_peer_last_complete_ondisk( + peer.shard, peer.last_complete_ondisk); + } + peering_state.complete_write(at_version, last_complete); + return seastar::now(); + })); +} + +PG::interruptible_future<> PG::repair_object( + const hobject_t& oid, + eversion_t& v) +{ + // see also PrimaryLogPG::rep_repair_primary_object() + assert(is_primary()); + logger().debug("{}: {} peers osd.{}", __func__, oid, get_acting_recovery_backfill()); + // Add object to PG's missing set if it isn't there already + assert(!get_local_missing().is_missing(oid)); + peering_state.force_object_missing(pg_whoami, oid, v); + auto [op, fut] = get_shard_services().start_operation<UrgentRecovery>( + oid, v, this, get_shard_services(), get_osdmap_epoch()); + return std::move(fut); +} + 
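+// runs every op in `ops` against the prepared OpsExecuter and flushes the
+// resulting transaction; yields a pair of futures: the first resolves when
+// the repop has been submitted, the second when all replicas have acked and
+// the success/failure handler has produced its result.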
+template <class Ret, class SuccessFunc, class FailureFunc> +PG::do_osd_ops_iertr::future<PG::pg_rep_op_fut_t<Ret>> +PG::do_osd_ops_execute( + seastar::lw_shared_ptr<OpsExecuter> ox, + std::vector<OSDOp>& ops, + SuccessFunc&& success_func, + FailureFunc&& failure_func) +{ + assert(ox); + auto rollbacker = ox->create_rollbacker([this] (auto& obc) { + return obc_loader.reload_obc(obc).handle_error_interruptible( + load_obc_ertr::assert_all{"can't live with object state messed up"}); + }); + auto failure_func_ptr = seastar::make_lw_shared(std::move(failure_func)); + return interruptor::do_for_each(ops, [ox](OSDOp& osd_op) { + logger().debug( + "do_osd_ops_execute: object {} - handling op {}", + ox->get_target(), + ceph_osd_op_name(osd_op.op.op)); + return ox->execute_op(osd_op); + }).safe_then_interruptible([this, ox, &ops] { + logger().debug( + "do_osd_ops_execute: object {} all operations successful", + ox->get_target()); + // check for full + if ((ox->delta_stats.num_bytes > 0 || + ox->delta_stats.num_objects > 0) && + get_pgpool().info.has_flag(pg_pool_t::FLAG_FULL)) { + const auto& m = ox->get_message(); + if (m.get_reqid().name.is_mds() || // FIXME: ignore MDS for now + m.has_flag(CEPH_OSD_FLAG_FULL_FORCE)) { + logger().info(" full, but proceeding due to FULL_FORCE or MDS"); + } else if (m.has_flag(CEPH_OSD_FLAG_FULL_TRY)) { + // they tried, they failed. + logger().info(" full, replying to FULL_TRY op"); + if (get_pgpool().info.has_flag(pg_pool_t::FLAG_FULL_QUOTA)) + return interruptor::make_ready_future<OpsExecuter::rep_op_fut_tuple>( + seastar::now(), + OpsExecuter::osd_op_ierrorator::future<>( + crimson::ct_error::edquot::make())); + else + return interruptor::make_ready_future<OpsExecuter::rep_op_fut_tuple>( + seastar::now(), + OpsExecuter::osd_op_ierrorator::future<>( + crimson::ct_error::enospc::make())); + } else { + // drop request + logger().info(" full, dropping request (bad client)"); + return interruptor::make_ready_future<OpsExecuter::rep_op_fut_tuple>( + seastar::now(), + OpsExecuter::osd_op_ierrorator::future<>( + crimson::ct_error::eagain::make())); + } + } + return std::move(*ox).flush_changes_n_do_ops_effects( + ops, + snap_mapper, + osdriver, + [this] (auto&& txn, + auto&& obc, + auto&& osd_op_p, + auto&& log_entries) { + logger().debug( + "do_osd_ops_execute: object {} submitting txn", + obc->get_oid()); + return submit_transaction( + std::move(obc), + std::move(txn), + std::move(osd_op_p), + std::move(log_entries)); + }); + }).safe_then_unpack_interruptible( + [success_func=std::move(success_func), rollbacker, this, failure_func_ptr] + (auto submitted_fut, auto all_completed_fut) mutable { + return PG::do_osd_ops_iertr::make_ready_future<pg_rep_op_fut_t<Ret>>( + std::move(submitted_fut), + all_completed_fut.safe_then_interruptible_tuple( + std::move(success_func), + crimson::ct_error::object_corrupted::handle( + [rollbacker, this] (const std::error_code& e) mutable { + // this is a path for EIO. it's special because we want to fix the obejct + // and try again. that is, the layer above `PG::do_osd_ops` is supposed to + // restart the execution. 
+ return rollbacker.rollback_obc_if_modified(e).then_interruptible( + [obc=rollbacker.get_obc(), this] { + return repair_object(obc->obs.oi.soid, + obc->obs.oi.version).then_interruptible([] { + return do_osd_ops_iertr::future<Ret>{crimson::ct_error::eagain::make()}; + }); + }); + }), OpsExecuter::osd_op_errorator::all_same_way( + [rollbacker, failure_func_ptr] + (const std::error_code& e) mutable { + return rollbacker.rollback_obc_if_modified(e).then_interruptible( + [e, failure_func_ptr] { + return (*failure_func_ptr)(e); + }); + }) + ) + ); + }, OpsExecuter::osd_op_errorator::all_same_way( + [rollbacker, failure_func_ptr] + (const std::error_code& e) mutable { + return PG::do_osd_ops_iertr::make_ready_future<pg_rep_op_fut_t<Ret>>( + seastar::now(), + e.value() == ENOENT ? (*failure_func_ptr)(e) : + rollbacker.rollback_obc_if_modified(e).then_interruptible( + [e, failure_func_ptr] { + return (*failure_func_ptr)(e); + })); + })); +} +seastar::future<> PG::submit_error_log( + Ref<MOSDOp> m, + const OpInfo &op_info, + ObjectContextRef obc, + const std::error_code e, + ceph_tid_t rep_tid, + eversion_t &version) +{ + const osd_reqid_t &reqid = m->get_reqid(); + mempool::osd_pglog::list<pg_log_entry_t> log_entries; + log_entries.push_back(pg_log_entry_t(pg_log_entry_t::ERROR, + obc->obs.oi.soid, + next_version(), + eversion_t(), 0, + reqid, utime_t(), + -e.value())); + if (op_info.allows_returnvec()) { + log_entries.back().set_op_returns(m->ops); + } + ceph_assert(is_primary()); + if (!log_entries.empty()) { + ceph_assert(log_entries.rbegin()->version >= projected_last_update); + version = projected_last_update = log_entries.rbegin()->version; + } + ceph::os::Transaction t; + peering_state.merge_new_log_entries( + log_entries, t, peering_state.get_pg_trim_to(), + peering_state.get_min_last_complete_ondisk()); + + set<pg_shard_t> waiting_on; + for (auto &i : get_acting_recovery_backfill()) { + pg_shard_t peer(i); + if (peer == pg_whoami) continue; + ceph_assert(peering_state.get_peer_missing().count(peer)); + ceph_assert(peering_state.has_peer_info(peer)); + auto log_m = crimson::make_message<MOSDPGUpdateLogMissing>( + log_entries, + spg_t(peering_state.get_info().pgid.pgid, i.shard), + pg_whoami.shard, + get_osdmap_epoch(), + get_last_peering_reset(), + rep_tid, + peering_state.get_pg_trim_to(), + peering_state.get_min_last_complete_ondisk()); + send_cluster_message(peer.osd, std::move(log_m), get_osdmap_epoch()); + waiting_on.insert(peer); + } + waiting_on.insert(pg_whoami); + log_entry_update_waiting_on.insert( + std::make_pair(rep_tid, log_update_t{std::move(waiting_on)})); + return shard_services.get_store().do_transaction( + get_collection_ref(), std::move(t)) + .then([this] { + peering_state.update_trim_to(); + return seastar::now(); + }); +} + +PG::do_osd_ops_iertr::future<PG::pg_rep_op_fut_t<MURef<MOSDOpReply>>> +PG::do_osd_ops( + Ref<MOSDOp> m, + crimson::net::ConnectionRef conn, + ObjectContextRef obc, + const OpInfo &op_info, + const SnapContext& snapc) +{ + if (__builtin_expect(stopping, false)) { + throw crimson::common::system_shutdown_exception(); + } + return do_osd_ops_execute<MURef<MOSDOpReply>>( + seastar::make_lw_shared<OpsExecuter>( + Ref<PG>{this}, obc, op_info, *m, conn, snapc), + m->ops, + [this, m, obc, may_write = op_info.may_write(), + may_read = op_info.may_read(), rvec = op_info.allows_returnvec()] { + // TODO: should stop at the first op which returns a negative retval, + // cmpext uses it for returning the index of first unmatched byte + int result = 
m->ops.empty() ? 0 : m->ops.back().rval.code; + if (may_read && result >= 0) { + for (auto &osdop : m->ops) { + if (osdop.rval < 0 && !(osdop.op.flags & CEPH_OSD_OP_FLAG_FAILOK)) { + result = osdop.rval.code; + break; + } + } + } else if (result > 0 && may_write && !rvec) { + result = 0; + } else if (result < 0 && (m->ops.empty() ? + 0 : m->ops.back().op.flags & CEPH_OSD_OP_FLAG_FAILOK)) { + result = 0; + } + auto reply = crimson::make_message<MOSDOpReply>(m.get(), + result, + get_osdmap_epoch(), + 0, + false); + reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK); + logger().debug( + "do_osd_ops: {} - object {} sending reply", + *m, + m->get_hobj()); + if (obc->obs.exists) { + reply->set_reply_versions(peering_state.get_info().last_update, + obc->obs.oi.user_version); + } else { + reply->set_reply_versions(peering_state.get_info().last_update, + peering_state.get_info().last_user_version); + } + return do_osd_ops_iertr::make_ready_future<MURef<MOSDOpReply>>( + std::move(reply)); + }, + [m, &op_info, obc, this] (const std::error_code& e) { + return seastar::do_with(eversion_t(), [m, &op_info, obc, e, this](auto &version) { + auto fut = seastar::now(); + epoch_t epoch = get_osdmap_epoch(); + ceph_tid_t rep_tid = shard_services.get_tid(); + auto last_complete = peering_state.get_info().last_complete; + if (op_info.may_write()) { + fut = submit_error_log(m, op_info, obc, e, rep_tid, version); + } + return fut.then([m, e, epoch, &op_info, rep_tid, &version, last_complete, this] { + auto log_reply = [m, e, this] { + auto reply = crimson::make_message<MOSDOpReply>( + m.get(), -e.value(), get_osdmap_epoch(), 0, false); + if (m->ops.empty() ? 0 : + m->ops.back().op.flags & CEPH_OSD_OP_FLAG_FAILOK) { + reply->set_result(0); + } + // For all ops except for CMPEXT, the correct error value is encoded + // in e.value(). For CMPEXT, osdop.rval has the actual error value. 
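+          // (cmp_ext records the first mismatching byte as -MAX_ERRNO - offset in
+          // osdop.rval, which is why the scan below returns the first negative rval.)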
+ if (e.value() == ct_error::cmp_fail_error_value) { + assert(!m->ops.empty()); + for (auto &osdop : m->ops) { + if (osdop.rval < 0) { + reply->set_result(osdop.rval); + break; + } + } + } + reply->set_enoent_reply_versions( + peering_state.get_info().last_update, + peering_state.get_info().last_user_version); + reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK); + return do_osd_ops_iertr::make_ready_future<MURef<MOSDOpReply>>( + std::move(reply)); + }; + + if (!peering_state.pg_has_reset_since(epoch) && op_info.may_write()) { + auto it = log_entry_update_waiting_on.find(rep_tid); + ceph_assert(it != log_entry_update_waiting_on.end()); + auto it2 = it->second.waiting_on.find(pg_whoami); + ceph_assert(it2 != it->second.waiting_on.end()); + it->second.waiting_on.erase(it2); + + if (it->second.waiting_on.empty()) { + log_entry_update_waiting_on.erase(it); + if (version != eversion_t()) { + peering_state.complete_write(version, last_complete); + } + return log_reply(); + } else { + return it->second.all_committed.get_shared_future() + .then([this, &version, last_complete, log_reply = std::move(log_reply)] { + if (version != eversion_t()) { + peering_state.complete_write(version, last_complete); + } + return log_reply(); + }); + } + } else { + return log_reply(); + } + }); + }); + }); +} + +PG::do_osd_ops_iertr::future<PG::pg_rep_op_fut_t<>> +PG::do_osd_ops( + ObjectContextRef obc, + std::vector<OSDOp>& ops, + const OpInfo &op_info, + const do_osd_ops_params_t &&msg_params, + do_osd_ops_success_func_t success_func, + do_osd_ops_failure_func_t failure_func) +{ + // This overload is generally used for internal client requests, + // use an empty SnapContext. + return seastar::do_with( + std::move(msg_params), + [=, this, &ops, &op_info](auto &msg_params) { + return do_osd_ops_execute<void>( + seastar::make_lw_shared<OpsExecuter>( + Ref<PG>{this}, + std::move(obc), + op_info, + msg_params, + msg_params.get_connection(), + SnapContext{} + ), + ops, + std::move(success_func), + std::move(failure_func)); + }); +} + +PG::interruptible_future<MURef<MOSDOpReply>> PG::do_pg_ops(Ref<MOSDOp> m) +{ + if (__builtin_expect(stopping, false)) { + throw crimson::common::system_shutdown_exception(); + } + + auto ox = std::make_unique<PgOpsExecuter>(std::as_const(*this), + std::as_const(*m)); + return interruptor::do_for_each(m->ops, [ox = ox.get()](OSDOp& osd_op) { + logger().debug("will be handling pg op {}", ceph_osd_op_name(osd_op.op.op)); + return ox->execute_op(osd_op); + }).then_interruptible([m, this, ox = std::move(ox)] { + auto reply = crimson::make_message<MOSDOpReply>(m.get(), 0, get_osdmap_epoch(), + CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK, + false); + reply->claim_op_out_data(m->ops); + reply->set_reply_versions(peering_state.get_info().last_update, + peering_state.get_info().last_user_version); + return seastar::make_ready_future<MURef<MOSDOpReply>>(std::move(reply)); + }).handle_exception_type_interruptible([=, this](const crimson::osd::error& e) { + auto reply = crimson::make_message<MOSDOpReply>( + m.get(), -e.code().value(), get_osdmap_epoch(), 0, false); + reply->set_enoent_reply_versions(peering_state.get_info().last_update, + peering_state.get_info().last_user_version); + return seastar::make_ready_future<MURef<MOSDOpReply>>(std::move(reply)); + }); +} + +hobject_t PG::get_oid(const hobject_t& hobj) +{ + return hobj.snap == CEPH_SNAPDIR ? 
hobj.get_head() : hobj; +} + +RWState::State PG::get_lock_type(const OpInfo &op_info) +{ + + if (op_info.rwordered() && op_info.may_read()) { + return RWState::RWEXCL; + } else if (op_info.rwordered()) { + return RWState::RWWRITE; + } else { + ceph_assert(op_info.may_read()); + return RWState::RWREAD; + } +} + +void PG::check_blocklisted_obc_watchers( + ObjectContextRef &obc) +{ + if (obc->watchers.empty()) { + for (auto &[src, winfo] : obc->obs.oi.watchers) { + auto watch = crimson::osd::Watch::create( + obc, winfo, src.second, this); + watch->disconnect(); + auto [it, emplaced] = obc->watchers.emplace(src, std::move(watch)); + assert(emplaced); + logger().debug("added watch for obj {}, client {}", + obc->get_oid(), src.second); + } + } +} + +PG::load_obc_iertr::future<> +PG::with_locked_obc(const hobject_t &hobj, + const OpInfo &op_info, + with_obc_func_t &&f) +{ + if (__builtin_expect(stopping, false)) { + throw crimson::common::system_shutdown_exception(); + } + const hobject_t oid = get_oid(hobj); + auto wrapper = [f=std::move(f), this](auto obc) { + check_blocklisted_obc_watchers(obc); + return f(obc); + }; + switch (get_lock_type(op_info)) { + case RWState::RWREAD: + return obc_loader.with_obc<RWState::RWREAD>(oid, std::move(wrapper)); + case RWState::RWWRITE: + return obc_loader.with_obc<RWState::RWWRITE>(oid, std::move(wrapper)); + case RWState::RWEXCL: + return obc_loader.with_obc<RWState::RWEXCL>(oid, std::move(wrapper)); + default: + ceph_abort(); + }; +} + +PG::interruptible_future<> PG::handle_rep_op(Ref<MOSDRepOp> req) +{ + if (__builtin_expect(stopping, false)) { + return seastar::make_exception_future<>( + crimson::common::system_shutdown_exception()); + } + + logger().debug("{}: {}", __func__, *req); + if (can_discard_replica_op(*req)) { + return seastar::now(); + } + + ceph::os::Transaction txn; + auto encoded_txn = req->get_data().cbegin(); + decode(txn, encoded_txn); + auto p = req->logbl.cbegin(); + std::vector<pg_log_entry_t> log_entries; + decode(log_entries, p); + log_operation(std::move(log_entries), + req->pg_trim_to, + req->version, + req->min_last_complete_ondisk, + !txn.empty(), + txn, + false); + logger().debug("PG::handle_rep_op: do_transaction..."); + return interruptor::make_interruptible(shard_services.get_store().do_transaction( + coll_ref, std::move(txn))).then_interruptible( + [req, lcod=peering_state.get_info().last_complete, this] { + peering_state.update_last_complete_ondisk(lcod); + const auto map_epoch = get_osdmap_epoch(); + auto reply = crimson::make_message<MOSDRepOpReply>( + req.get(), pg_whoami, 0, + map_epoch, req->get_min_epoch(), CEPH_OSD_FLAG_ONDISK); + reply->set_last_complete_ondisk(lcod); + return shard_services.send_to_osd(req->from.osd, std::move(reply), map_epoch); + }); +} + +void PG::log_operation( + std::vector<pg_log_entry_t>&& logv, + const eversion_t &trim_to, + const eversion_t &roll_forward_to, + const eversion_t &min_last_complete_ondisk, + bool transaction_applied, + ObjectStore::Transaction &txn, + bool async) { + logger().debug("{}", __func__); + if (is_primary()) { + ceph_assert(trim_to <= peering_state.get_last_update_ondisk()); + } + /* TODO: when we add snap mapper and projected log support, + * we'll likely want to update them here. + * + * See src/osd/PrimaryLogPG.h:log_operation for how classic + * handles these cases. 
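+   * The classic-path handling is kept just below under "#if 0" as a
+   * reference for what will eventually be needed here.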
+ */ +#if 0 + if (transaction_applied) { + //TODO: + //update_snap_map(logv, t); + } + auto last = logv.rbegin(); + if (is_primary() && last != logv.rend()) { + projected_log.skip_can_rollback_to_to_head(); + projected_log.trim(cct, last->version, nullptr, nullptr, nullptr); + } +#endif + if (!is_primary()) { // && !is_ec_pg() + replica_clear_repop_obc(logv); + } + peering_state.append_log(std::move(logv), + trim_to, + roll_forward_to, + min_last_complete_ondisk, + txn, + !txn.empty(), + false); +} + +void PG::replica_clear_repop_obc( + const std::vector<pg_log_entry_t> &logv) { + logger().debug("{} clearing {} entries", __func__, logv.size()); + for (auto &&e: logv) { + logger().debug(" {} get_object_boundary(from): {} " + " head version(to): {}", + e.soid, + e.soid.get_object_boundary(), + e.soid.get_head()); + /* Have to blast all clones, they share a snapset */ + obc_registry.clear_range( + e.soid.get_object_boundary(), e.soid.get_head()); + } +} + +void PG::handle_rep_op_reply(const MOSDRepOpReply& m) +{ + if (!can_discard_replica_op(m)) { + backend->got_rep_op_reply(m); + } +} + +PG::interruptible_future<> PG::do_update_log_missing( + Ref<MOSDPGUpdateLogMissing> m, + crimson::net::ConnectionRef conn) +{ + if (__builtin_expect(stopping, false)) { + return seastar::make_exception_future<>( + crimson::common::system_shutdown_exception()); + } + + ceph_assert(m->get_type() == MSG_OSD_PG_UPDATE_LOG_MISSING); + ObjectStore::Transaction t; + std::optional<eversion_t> op_trim_to, op_roll_forward_to; + if (m->pg_trim_to != eversion_t()) + op_trim_to = m->pg_trim_to; + if (m->pg_roll_forward_to != eversion_t()) + op_roll_forward_to = m->pg_roll_forward_to; + logger().debug("op_trim_to = {}, op_roll_forward_to = {}", + op_trim_to, op_roll_forward_to); + + peering_state.append_log_entries_update_missing( + m->entries, t, op_trim_to, op_roll_forward_to); + + return interruptor::make_interruptible(shard_services.get_store().do_transaction( + coll_ref, std::move(t))).then_interruptible( + [m, conn, lcod=peering_state.get_info().last_complete, this] { + if (!peering_state.pg_has_reset_since(m->get_epoch())) { + peering_state.update_last_complete_ondisk(lcod); + auto reply = + crimson::make_message<MOSDPGUpdateLogMissingReply>( + spg_t(peering_state.get_info().pgid.pgid, get_primary().shard), + pg_whoami.shard, + m->get_epoch(), + m->min_epoch, + m->get_tid(), + lcod); + reply->set_priority(CEPH_MSG_PRIO_HIGH); + return conn->send(std::move(reply)); + } + return seastar::now(); + }); +} + + +PG::interruptible_future<> PG::do_update_log_missing_reply( + Ref<MOSDPGUpdateLogMissingReply> m) +{ + logger().debug("{}: got reply from {}", __func__, m->get_from()); + + auto it = log_entry_update_waiting_on.find(m->get_tid()); + if (it != log_entry_update_waiting_on.end()) { + if (it->second.waiting_on.count(m->get_from())) { + it->second.waiting_on.erase(m->get_from()); + if (m->last_complete_ondisk != eversion_t()) { + peering_state.update_peer_last_complete_ondisk( + m->get_from(), m->last_complete_ondisk); + } + } else { + logger().error("{} : {} got reply {} from shard we are not waiting for ", + __func__, peering_state.get_info().pgid, *m, m->get_from()); + } + + if (it->second.waiting_on.empty()) { + it->second.all_committed.set_value(); + it->second.all_committed = {}; + log_entry_update_waiting_on.erase(it); + } + } else { + logger().error("{} : {} got reply {} on unknown tid {}", + __func__, peering_state.get_info().pgid, *m, m->get_tid()); + } + return seastar::now(); +} + +bool 
PG::old_peering_msg( + const epoch_t reply_epoch, + const epoch_t query_epoch) const +{ + if (const epoch_t lpr = peering_state.get_last_peering_reset(); + lpr > reply_epoch || lpr > query_epoch) { + logger().debug("{}: pg changed {} lpr {}, reply_epoch {}, query_epoch {}", + __func__, get_info().history, lpr, reply_epoch, query_epoch); + return true; + } + return false; +} + +bool PG::can_discard_replica_op(const Message& m, epoch_t m_map_epoch) const +{ + // if a repop is replied after a replica goes down in a new osdmap, and + // before the pg advances to this new osdmap, the repop replies before this + // repop can be discarded by that replica OSD, because the primary resets the + // connection to it when handling the new osdmap marking it down, and also + // resets the messenger sesssion when the replica reconnects. to avoid the + // out-of-order replies, the messages from that replica should be discarded. + const auto osdmap = peering_state.get_osdmap(); + const int from_osd = m.get_source().num(); + if (osdmap->is_down(from_osd)) { + return true; + } + // Mostly, this overlaps with the old_peering_msg + // condition. An important exception is pushes + // sent by replicas not in the acting set, since + // if such a replica goes down it does not cause + // a new interval. + if (osdmap->get_down_at(from_osd) >= m_map_epoch) { + return true; + } + // same pg? + // if pg changes *at all*, we reset and repeer! + return old_peering_msg(m_map_epoch, m_map_epoch); +} + +seastar::future<> PG::stop() +{ + logger().info("PG {} {}", pgid, __func__); + stopping = true; + cancel_local_background_io_reservation(); + cancel_remote_recovery_reservation(); + check_readable_timer.cancel(); + renew_lease_timer.cancel(); + return osdmap_gate.stop().then([this] { + return wait_for_active_blocker.stop(); + }).then([this] { + return recovery_handler->stop(); + }).then([this] { + return recovery_backend->stop(); + }).then([this] { + return backend->stop(); + }); +} + +void PG::on_change(ceph::os::Transaction &t) { + logger().debug("{} {}:", *this, __func__); + context_registry_on_change(); + obc_loader.notify_on_change(is_primary()); + recovery_backend->on_peering_interval_change(t); + backend->on_actingset_changed(is_primary()); + wait_for_active_blocker.unblock(); + if (is_primary()) { + logger().debug("{} {}: requeueing", *this, __func__); + client_request_orderer.requeue(shard_services, this); + } else { + logger().debug("{} {}: dropping requests", *this, __func__); + client_request_orderer.clear_and_cancel(); + } +} + +void PG::context_registry_on_change() { + obc_registry.for_each([](ObjectContextRef obc) { + assert(obc); + for (auto j = obc->watchers.begin(); + j != obc->watchers.end(); + j = obc->watchers.erase(j)) { + j->second->discard_state(); + } + }); +} + +bool PG::can_discard_op(const MOSDOp& m) const { + if (m.get_map_epoch() < + peering_state.get_info().history.same_primary_since) { + logger().debug("{} changed after {} dropping {} ", + __func__ , m.get_map_epoch(), m); + return true; + } + + if ((m.get_flags() & (CEPH_OSD_FLAG_BALANCE_READS | + CEPH_OSD_FLAG_LOCALIZE_READS)) + && !is_primary() + && (m.get_map_epoch() < + peering_state.get_info().history.same_interval_since)) + { + // Note: the Objecter will resend on interval change without the primary + // changing if it actually sent to a replica. If the primary hasn't + // changed since the send epoch, we got it, and we're primary, it won't + // have resent even if the interval did change as it sent it to the primary + // (us). 
+ return true; + } + return __builtin_expect(m.get_map_epoch() + < peering_state.get_info().history.same_primary_since, false); +} + +bool PG::is_degraded_or_backfilling_object(const hobject_t& soid) const { + /* The conditions below may clear (on_local_recover, before we queue + * the transaction) before we actually requeue the degraded waiters + * in on_global_recover after the transaction completes. + */ + if (peering_state.get_pg_log().get_missing().get_items().count(soid)) + return true; + ceph_assert(!get_acting_recovery_backfill().empty()); + for (auto& peer : get_acting_recovery_backfill()) { + if (peer == get_primary()) continue; + auto peer_missing_entry = peering_state.get_peer_missing().find(peer); + // If an object is missing on an async_recovery_target, return false. + // This will not block the op and the object is async recovered later. + if (peer_missing_entry != peering_state.get_peer_missing().end() && + peer_missing_entry->second.get_items().count(soid)) { + return true; + } + // Object is degraded if after last_backfill AND + // we are backfilling it + if (is_backfill_target(peer) && + peering_state.get_peer_info(peer).last_backfill <= soid && + recovery_handler->backfill_state && + recovery_handler->backfill_state->get_last_backfill_started() >= soid && + recovery_backend->is_recovering(soid)) { + return true; + } + } + return false; +} + +PG::interruptible_future<std::optional<PG::complete_op_t>> +PG::already_complete(const osd_reqid_t& reqid) +{ + eversion_t version; + version_t user_version; + int ret; + std::vector<pg_log_op_return_item_t> op_returns; + + if (peering_state.get_pg_log().get_log().get_request( + reqid, &version, &user_version, &ret, &op_returns)) { + complete_op_t dupinfo{ + user_version, + version, + ret}; + return backend->request_committed(reqid, version).then([dupinfo] { + return seastar::make_ready_future<std::optional<complete_op_t>>(dupinfo); + }); + } else { + return seastar::make_ready_future<std::optional<complete_op_t>>(std::nullopt); + } +} + +} diff --git a/src/crimson/osd/pg.h b/src/crimson/osd/pg.h new file mode 100644 index 000000000..d96db2e20 --- /dev/null +++ b/src/crimson/osd/pg.h @@ -0,0 +1,833 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab expandtab + +#pragma once + +#include <memory> +#include <optional> +#include <boost/smart_ptr/intrusive_ref_counter.hpp> +#include <seastar/core/future.hh> +#include <seastar/core/shared_future.hh> + +#include "common/dout.h" +#include "include/interval_set.h" +#include "crimson/net/Fwd.h" +#include "messages/MOSDRepOpReply.h" +#include "messages/MOSDOpReply.h" +#include "os/Transaction.h" +#include "osd/osd_types.h" +#include "osd/osd_types_fmt.h" +#include "crimson/osd/object_context.h" +#include "osd/PeeringState.h" +#include "osd/SnapMapper.h" + +#include "crimson/common/interruptible_future.h" +#include "crimson/common/type_helpers.h" +#include "crimson/os/futurized_collection.h" +#include "crimson/osd/backfill_state.h" +#include "crimson/osd/pg_interval_interrupt_condition.h" +#include "crimson/osd/ops_executer.h" +#include "crimson/osd/osd_operations/client_request.h" +#include "crimson/osd/osd_operations/logmissing_request.h" +#include "crimson/osd/osd_operations/logmissing_request_reply.h" +#include "crimson/osd/osd_operations/peering_event.h" +#include "crimson/osd/osd_operations/replicated_request.h" +#include "crimson/osd/shard_services.h" +#include "crimson/osd/osdmap_gate.h" +#include 
"crimson/osd/pg_activation_blocker.h" +#include "crimson/osd/pg_recovery.h" +#include "crimson/osd/pg_recovery_listener.h" +#include "crimson/osd/recovery_backend.h" +#include "crimson/osd/object_context_loader.h" + +class MQuery; +class OSDMap; +class PGBackend; +class PGPeeringEvent; +class osd_op_params_t; + +namespace recovery { + class Context; +} + +namespace crimson::net { + class Messenger; +} + +namespace crimson::os { + class FuturizedStore; +} + +namespace crimson::osd { +class OpsExecuter; +class BackfillRecovery; +class SnapTrimEvent; + +class PG : public boost::intrusive_ref_counter< + PG, + boost::thread_unsafe_counter>, + public PGRecoveryListener, + PeeringState::PeeringListener, + DoutPrefixProvider +{ + using ec_profile_t = std::map<std::string,std::string>; + using cached_map_t = OSDMapService::cached_map_t; + + ClientRequest::PGPipeline request_pg_pipeline; + PGPeeringPipeline peering_request_pg_pipeline; + + ClientRequest::Orderer client_request_orderer; + + spg_t pgid; + pg_shard_t pg_whoami; + crimson::os::CollectionRef coll_ref; + ghobject_t pgmeta_oid; + + seastar::timer<seastar::lowres_clock> check_readable_timer; + seastar::timer<seastar::lowres_clock> renew_lease_timer; + +public: + template <typename T = void> + using interruptible_future = + ::crimson::interruptible::interruptible_future< + ::crimson::osd::IOInterruptCondition, T>; + + PG(spg_t pgid, + pg_shard_t pg_shard, + crimson::os::CollectionRef coll_ref, + pg_pool_t&& pool, + std::string&& name, + cached_map_t osdmap, + ShardServices &shard_services, + ec_profile_t profile); + + ~PG(); + + const pg_shard_t& get_pg_whoami() const final { + return pg_whoami; + } + + const spg_t& get_pgid() const final { + return pgid; + } + + PGBackend& get_backend() { + return *backend; + } + const PGBackend& get_backend() const { + return *backend; + } + // EpochSource + epoch_t get_osdmap_epoch() const final { + return peering_state.get_osdmap_epoch(); + } + + eversion_t get_pg_trim_to() const { + return peering_state.get_pg_trim_to(); + } + + eversion_t get_min_last_complete_ondisk() const { + return peering_state.get_min_last_complete_ondisk(); + } + + const pg_info_t& get_info() const final { + return peering_state.get_info(); + } + + // DoutPrefixProvider + std::ostream& gen_prefix(std::ostream& out) const final { + return out << *this; + } + crimson::common::CephContext *get_cct() const final { + return shard_services.get_cct(); + } + unsigned get_subsys() const final { + return ceph_subsys_osd; + } + + crimson::os::CollectionRef get_collection_ref() { + return coll_ref; + } + + // PeeringListener + void prepare_write( + pg_info_t &info, + pg_info_t &last_written_info, + PastIntervals &past_intervals, + PGLog &pglog, + bool dirty_info, + bool dirty_big_info, + bool need_write_epoch, + ceph::os::Transaction &t) final; + + void on_info_history_change() final { + // Not needed yet -- mainly for scrub scheduling + } + + /// Notify PG that Primary/Replica status has changed (to update scrub registration) + void on_primary_status_change(bool was_primary, bool now_primary) final { + } + + /// Need to reschedule next scrub. 
Assuming no change in role + void reschedule_scrub() final { + } + + void scrub_requested(scrub_level_t scrub_level, scrub_type_t scrub_type) final; + + uint64_t get_snap_trimq_size() const final { + return std::size(snap_trimq); + } + + void send_cluster_message( + int osd, MessageURef m, + epoch_t epoch, bool share_map_update=false) final { + (void)shard_services.send_to_osd(osd, std::move(m), epoch); + } + + void send_pg_created(pg_t pgid) final { + (void)shard_services.send_pg_created(pgid); + } + + bool try_flush_or_schedule_async() final; + + void start_flush_on_transaction( + ceph::os::Transaction &t) final { + t.register_on_commit( + new LambdaContext([this](int r){ + peering_state.complete_flush(); + })); + } + + void on_flushed() final { + // will be needed for unblocking IO operations/peering + } + + template <typename T> + void start_peering_event_operation(T &&evt, float delay = 0) { + (void) shard_services.start_operation<LocalPeeringEvent>( + this, + pg_whoami, + pgid, + delay, + std::forward<T>(evt)); + } + + void schedule_event_after( + PGPeeringEventRef event, + float delay) final { + start_peering_event_operation(std::move(*event), delay); + } + std::vector<pg_shard_t> get_replica_recovery_order() const final { + return peering_state.get_replica_recovery_order(); + } + void request_local_background_io_reservation( + unsigned priority, + PGPeeringEventURef on_grant, + PGPeeringEventURef on_preempt) final { + // TODO -- we probably want to add a mechanism for blocking on this + // after handling the peering event + std::ignore = shard_services.local_request_reservation( + pgid, + on_grant ? make_lambda_context([this, on_grant=std::move(on_grant)] (int) { + start_peering_event_operation(std::move(*on_grant)); + }) : nullptr, + priority, + on_preempt ? make_lambda_context( + [this, on_preempt=std::move(on_preempt)] (int) { + start_peering_event_operation(std::move(*on_preempt)); + }) : nullptr); + } + + void update_local_background_io_priority( + unsigned priority) final { + // TODO -- we probably want to add a mechanism for blocking on this + // after handling the peering event + std::ignore = shard_services.local_update_priority( + pgid, + priority); + } + + void cancel_local_background_io_reservation() final { + // TODO -- we probably want to add a mechanism for blocking on this + // after handling the peering event + std::ignore = shard_services.local_cancel_reservation( + pgid); + } + + void request_remote_recovery_reservation( + unsigned priority, + PGPeeringEventURef on_grant, + PGPeeringEventURef on_preempt) final { + // TODO -- we probably want to add a mechanism for blocking on this + // after handling the peering event + std::ignore = shard_services.remote_request_reservation( + pgid, + on_grant ? make_lambda_context([this, on_grant=std::move(on_grant)] (int) { + start_peering_event_operation(std::move(*on_grant)); + }) : nullptr, + priority, + on_preempt ? 
make_lambda_context( + [this, on_preempt=std::move(on_preempt)] (int) { + start_peering_event_operation(std::move(*on_preempt)); + }) : nullptr); + } + + void cancel_remote_recovery_reservation() final { + // TODO -- we probably want to add a mechanism for blocking on this + // after handling the peering event + std::ignore = shard_services.remote_cancel_reservation( + pgid); + } + + void schedule_event_on_commit( + ceph::os::Transaction &t, + PGPeeringEventRef on_commit) final { + t.register_on_commit( + make_lambda_context( + [this, on_commit=std::move(on_commit)](int) { + start_peering_event_operation(std::move(*on_commit)); + })); + } + + void update_heartbeat_peers(std::set<int> peers) final { + // Not needed yet + } + void set_probe_targets(const std::set<pg_shard_t> &probe_set) final { + // Not needed yet + } + void clear_probe_targets() final { + // Not needed yet + } + void queue_want_pg_temp(const std::vector<int> &wanted) final { + // TODO -- we probably want to add a mechanism for blocking on this + // after handling the peering event + std::ignore = shard_services.queue_want_pg_temp(pgid.pgid, wanted); + } + void clear_want_pg_temp() final { + // TODO -- we probably want to add a mechanism for blocking on this + // after handling the peering event + std::ignore = shard_services.remove_want_pg_temp(pgid.pgid); + } + void check_recovery_sources(const OSDMapRef& newmap) final { + // Not needed yet + } + void check_blocklisted_watchers() final; + void clear_primary_state() final { + // Not needed yet + } + + void queue_check_readable(epoch_t last_peering_reset, + ceph::timespan delay) final; + void recheck_readable() final; + + unsigned get_target_pg_log_entries() const final; + + void on_pool_change() final { + // Not needed yet + } + void on_role_change() final { + // Not needed yet + } + void on_change(ceph::os::Transaction &t) final; + void on_activate(interval_set<snapid_t> to_trim) final; + void on_activate_complete() final; + void on_new_interval() final { + // Not needed yet + } + Context *on_clean() final; + void on_activate_committed() final { + // Not needed yet (will be needed for IO unblocking) + } + void on_active_exit() final { + // Not needed yet + } + + void on_removal(ceph::os::Transaction &t) final; + + std::pair<ghobject_t, bool> + do_delete_work(ceph::os::Transaction &t, ghobject_t _next) final; + + // merge/split not ready + void clear_ready_to_merge() final {} + void set_not_ready_to_merge_target(pg_t pgid, pg_t src) final {} + void set_not_ready_to_merge_source(pg_t pgid) final {} + void set_ready_to_merge_target(eversion_t lu, epoch_t les, epoch_t lec) final {} + void set_ready_to_merge_source(eversion_t lu) final {} + + void on_active_actmap() final; + void on_active_advmap(const OSDMapRef &osdmap) final; + + epoch_t cluster_osdmap_trim_lower_bound() final { + // TODO + return 0; + } + + void on_backfill_reserved() final { + recovery_handler->on_backfill_reserved(); + } + void on_backfill_canceled() final { + ceph_assert(0 == "Not implemented"); + } + + void on_recovery_reserved() final { + recovery_handler->start_pglogbased_recovery(); + } + + + bool try_reserve_recovery_space( + int64_t primary_num_bytes, int64_t local_num_bytes) final { + // TODO + return true; + } + void unreserve_recovery_space() final {} + + struct PGLogEntryHandler : public PGLog::LogEntryHandler { + PG *pg; + ceph::os::Transaction *t; + PGLogEntryHandler(PG *pg, ceph::os::Transaction *t) : pg(pg), t(t) {} + + // LogEntryHandler + void remove(const hobject_t &hoid) override { + // 
TODO + } + void try_stash(const hobject_t &hoid, version_t v) override { + // TODO + } + void rollback(const pg_log_entry_t &entry) override { + // TODO + } + void rollforward(const pg_log_entry_t &entry) override { + // TODO + } + void trim(const pg_log_entry_t &entry) override { + // TODO + } + }; + PGLog::LogEntryHandlerRef get_log_handler( + ceph::os::Transaction &t) final { + return std::make_unique<PG::PGLogEntryHandler>(this, &t); + } + + void rebuild_missing_set_with_deletes(PGLog &pglog) final { + pglog.rebuild_missing_set_with_deletes_crimson( + shard_services.get_store(), + coll_ref, + peering_state.get_info()).get(); + } + + PerfCounters &get_peering_perf() final { + return shard_services.get_recoverystate_perf_logger(); + } + PerfCounters &get_perf_logger() final { + return shard_services.get_perf_logger(); + } + + void log_state_enter(const char *state) final; + void log_state_exit( + const char *state_name, utime_t enter_time, + uint64_t events, utime_t event_dur) final; + + void dump_recovery_info(Formatter *f) const final { + } + + OstreamTemp get_clog_info() final { + // not needed yet: replace with not a stub (needs to be wired up to monc) + return OstreamTemp(CLOG_INFO, nullptr); + } + OstreamTemp get_clog_debug() final { + // not needed yet: replace with not a stub (needs to be wired up to monc) + return OstreamTemp(CLOG_DEBUG, nullptr); + } + OstreamTemp get_clog_error() final { + // not needed yet: replace with not a stub (needs to be wired up to monc) + return OstreamTemp(CLOG_ERROR, nullptr); + } + + ceph::signedspan get_mnow() const final; + HeartbeatStampsRef get_hb_stamps(int peer) final; + void schedule_renew_lease(epoch_t plr, ceph::timespan delay) final; + + + // Utility + bool is_primary() const final { + return peering_state.is_primary(); + } + bool is_nonprimary() const { + return peering_state.is_nonprimary(); + } + bool is_peered() const final { + return peering_state.is_peered(); + } + bool is_recovering() const final { + return peering_state.is_recovering(); + } + bool is_backfilling() const final { + return peering_state.is_backfilling(); + } + uint64_t get_last_user_version() const { + return get_info().last_user_version; + } + bool get_need_up_thru() const { + return peering_state.get_need_up_thru(); + } + epoch_t get_same_interval_since() const { + return get_info().history.same_interval_since; + } + + const auto& get_pgpool() const { + return peering_state.get_pgpool(); + } + pg_shard_t get_primary() const { + return peering_state.get_primary(); + } + + /// initialize created PG + void init( + int role, + const std::vector<int>& up, + int up_primary, + const std::vector<int>& acting, + int acting_primary, + const pg_history_t& history, + const PastIntervals& pim, + ceph::os::Transaction &t); + + seastar::future<> read_state(crimson::os::FuturizedStore::Shard* store); + + interruptible_future<> do_peering_event( + PGPeeringEvent& evt, PeeringCtx &rctx); + + seastar::future<> handle_advance_map(cached_map_t next_map, PeeringCtx &rctx); + seastar::future<> handle_activate_map(PeeringCtx &rctx); + seastar::future<> handle_initialize(PeeringCtx &rctx); + + static hobject_t get_oid(const hobject_t& hobj); + static RWState::State get_lock_type(const OpInfo &op_info); + + using load_obc_ertr = crimson::errorator< + crimson::ct_error::enoent, + crimson::ct_error::object_corrupted>; + using load_obc_iertr = + ::crimson::interruptible::interruptible_errorator< + ::crimson::osd::IOInterruptCondition, + load_obc_ertr>; + using interruptor = 
::crimson::interruptible::interruptor< + ::crimson::osd::IOInterruptCondition>; + +public: + using with_obc_func_t = + std::function<load_obc_iertr::future<> (ObjectContextRef)>; + + load_obc_iertr::future<> with_locked_obc( + const hobject_t &hobj, + const OpInfo &op_info, + with_obc_func_t&& f); + + interruptible_future<> handle_rep_op(Ref<MOSDRepOp> m); + void log_operation( + std::vector<pg_log_entry_t>&& logv, + const eversion_t &trim_to, + const eversion_t &roll_forward_to, + const eversion_t &min_last_complete_ondisk, + bool transaction_applied, + ObjectStore::Transaction &txn, + bool async = false); + void replica_clear_repop_obc( + const std::vector<pg_log_entry_t> &logv); + void handle_rep_op_reply(const MOSDRepOpReply& m); + interruptible_future<> do_update_log_missing( + Ref<MOSDPGUpdateLogMissing> m, + crimson::net::ConnectionRef conn); + interruptible_future<> do_update_log_missing_reply( + Ref<MOSDPGUpdateLogMissingReply> m); + + + void print(std::ostream& os) const; + void dump_primary(Formatter*); + seastar::future<> submit_error_log( + Ref<MOSDOp> m, + const OpInfo &op_info, + ObjectContextRef obc, + const std::error_code e, + ceph_tid_t rep_tid, + eversion_t &version); + +private: + + struct SnapTrimMutex { + struct WaitPG : OrderedConcurrentPhaseT<WaitPG> { + static constexpr auto type_name = "SnapTrimEvent::wait_pg"; + } wait_pg; + seastar::shared_mutex mutex; + + interruptible_future<> lock(SnapTrimEvent &st_event) noexcept; + + void unlock() noexcept { + mutex.unlock(); + } + } snaptrim_mutex; + + using do_osd_ops_ertr = crimson::errorator< + crimson::ct_error::eagain>; + using do_osd_ops_iertr = + ::crimson::interruptible::interruptible_errorator< + ::crimson::osd::IOInterruptCondition, + ::crimson::errorator<crimson::ct_error::eagain>>; + template <typename Ret = void> + using pg_rep_op_fut_t = + std::tuple<interruptible_future<>, + do_osd_ops_iertr::future<Ret>>; + do_osd_ops_iertr::future<pg_rep_op_fut_t<MURef<MOSDOpReply>>> do_osd_ops( + Ref<MOSDOp> m, + crimson::net::ConnectionRef conn, + ObjectContextRef obc, + const OpInfo &op_info, + const SnapContext& snapc); + using do_osd_ops_success_func_t = + std::function<do_osd_ops_iertr::future<>()>; + using do_osd_ops_failure_func_t = + std::function<do_osd_ops_iertr::future<>(const std::error_code&)>; + struct do_osd_ops_params_t; + do_osd_ops_iertr::future<pg_rep_op_fut_t<>> do_osd_ops( + ObjectContextRef obc, + std::vector<OSDOp>& ops, + const OpInfo &op_info, + const do_osd_ops_params_t &¶ms, + do_osd_ops_success_func_t success_func, + do_osd_ops_failure_func_t failure_func); + template <class Ret, class SuccessFunc, class FailureFunc> + do_osd_ops_iertr::future<pg_rep_op_fut_t<Ret>> do_osd_ops_execute( + seastar::lw_shared_ptr<OpsExecuter> ox, + std::vector<OSDOp>& ops, + SuccessFunc&& success_func, + FailureFunc&& failure_func); + interruptible_future<MURef<MOSDOpReply>> do_pg_ops(Ref<MOSDOp> m); + std::tuple<interruptible_future<>, interruptible_future<>> + submit_transaction( + ObjectContextRef&& obc, + ceph::os::Transaction&& txn, + osd_op_params_t&& oop, + std::vector<pg_log_entry_t>&& log_entries); + interruptible_future<> repair_object( + const hobject_t& oid, + eversion_t& v); + void check_blocklisted_obc_watchers(ObjectContextRef &obc); + +private: + PG_OSDMapGate osdmap_gate; + ShardServices &shard_services; + + +public: + cached_map_t get_osdmap() { return peering_state.get_osdmap(); } + eversion_t next_version() { + return eversion_t(get_osdmap_epoch(), + ++projected_last_update.version); + } + 
ShardServices& get_shard_services() final { + return shard_services; + } + seastar::future<> stop(); +private: + std::unique_ptr<PGBackend> backend; + std::unique_ptr<RecoveryBackend> recovery_backend; + std::unique_ptr<PGRecovery> recovery_handler; + + PeeringState peering_state; + eversion_t projected_last_update; + +public: + ObjectContextRegistry obc_registry; + ObjectContextLoader obc_loader; + +private: + OSDriver osdriver; + SnapMapper snap_mapper; + +public: + // PeeringListener + void publish_stats_to_osd() final; + void clear_publish_stats() final; + pg_stat_t get_stats() const; +private: + std::optional<pg_stat_t> pg_stats; + +public: + RecoveryBackend* get_recovery_backend() final { + return recovery_backend.get(); + } + PGRecovery* get_recovery_handler() final { + return recovery_handler.get(); + } + PeeringState& get_peering_state() final { + return peering_state; + } + bool has_reset_since(epoch_t epoch) const final { + return peering_state.pg_has_reset_since(epoch); + } + + const pg_missing_tracker_t& get_local_missing() const { + return peering_state.get_pg_log().get_missing(); + } + epoch_t get_last_peering_reset() const final { + return peering_state.get_last_peering_reset(); + } + const std::set<pg_shard_t> &get_acting_recovery_backfill() const { + return peering_state.get_acting_recovery_backfill(); + } + bool is_backfill_target(pg_shard_t osd) const { + return peering_state.is_backfill_target(osd); + } + void begin_peer_recover(pg_shard_t peer, const hobject_t oid) { + peering_state.begin_peer_recover(peer, oid); + } + uint64_t min_peer_features() const { + return peering_state.get_min_peer_features(); + } + const std::map<hobject_t, std::set<pg_shard_t>>& + get_missing_loc_shards() const { + return peering_state.get_missing_loc().get_missing_locs(); + } + const std::map<pg_shard_t, pg_missing_t> &get_shard_missing() const { + return peering_state.get_peer_missing(); + } + epoch_t get_interval_start_epoch() const { + return get_info().history.same_interval_since; + } + const pg_missing_const_i* get_shard_missing(pg_shard_t shard) const { + if (shard == pg_whoami) + return &get_local_missing(); + else { + auto it = peering_state.get_peer_missing().find(shard); + if (it == peering_state.get_peer_missing().end()) + return nullptr; + else + return &it->second; + } + } + + struct complete_op_t { + const version_t user_version; + const eversion_t version; + const int err; + }; + interruptible_future<std::optional<complete_op_t>> + already_complete(const osd_reqid_t& reqid); + int get_recovery_op_priority() const { + int64_t pri = 0; + get_pgpool().info.opts.get(pool_opts_t::RECOVERY_OP_PRIORITY, &pri); + return pri > 0 ? pri : crimson::common::local_conf()->osd_recovery_op_priority; + } + seastar::future<> mark_unfound_lost(int) { + // TODO: see PrimaryLogPG::mark_all_unfound_lost() + return seastar::now(); + } + + bool old_peering_msg(epoch_t reply_epoch, epoch_t query_epoch) const; + + template <typename MsgType> + bool can_discard_replica_op(const MsgType& m) const { + return can_discard_replica_op(m, m.map_epoch); + } + +private: + // instead of seastar::gate, we use a boolean flag to indicate + // whether the system is shutting down, as we don't need to track + // continuations here. 
+ bool stopping = false; + + PGActivationBlocker wait_for_active_blocker; + + friend std::ostream& operator<<(std::ostream&, const PG& pg); + friend class ClientRequest; + friend struct CommonClientRequest; + friend class PGAdvanceMap; + template <class T> + friend class PeeringEvent; + friend class RepRequest; + friend class LogMissingRequest; + friend class LogMissingRequestReply; + friend class BackfillRecovery; + friend struct PGFacade; + friend class InternalClientRequest; + friend class WatchTimeoutRequest; + friend class SnapTrimEvent; + friend class SnapTrimObjSubEvent; +private: + seastar::future<bool> find_unfound() { + return seastar::make_ready_future<bool>(true); + } + + bool can_discard_replica_op(const Message& m, epoch_t m_map_epoch) const; + bool can_discard_op(const MOSDOp& m) const; + void context_registry_on_change(); + bool is_missing_object(const hobject_t& soid) const { + return peering_state.get_pg_log().get_missing().get_items().count(soid); + } + bool is_unreadable_object(const hobject_t &oid, + eversion_t* v = 0) const final { + return is_missing_object(oid) || + !peering_state.get_missing_loc().readable_with_acting( + oid, get_actingset(), v); + } + bool is_degraded_or_backfilling_object(const hobject_t& soid) const; + const std::set<pg_shard_t> &get_actingset() const { + return peering_state.get_actingset(); + } + +private: + friend class IOInterruptCondition; + struct log_update_t { + std::set<pg_shard_t> waiting_on; + seastar::shared_promise<> all_committed; + }; + + std::map<ceph_tid_t, log_update_t> log_entry_update_waiting_on; + // snap trimming + interval_set<snapid_t> snap_trimq; +}; + +struct PG::do_osd_ops_params_t { + crimson::net::ConnectionRef &get_connection() const { + return conn; + } + osd_reqid_t get_reqid() const { + return reqid; + } + utime_t get_mtime() const { + return mtime; + }; + epoch_t get_map_epoch() const { + return map_epoch; + } + entity_inst_t get_orig_source_inst() const { + return orig_source_inst; + } + uint64_t get_features() const { + return features; + } + // Only used by InternalClientRequest, no op flags + bool has_flag(uint32_t flag) const { + return false; + } + + // Only used by ExecutableMessagePimpl + entity_name_t get_source() const { + return orig_source_inst.name; + } + + crimson::net::ConnectionRef &conn; + osd_reqid_t reqid; + utime_t mtime; + epoch_t map_epoch; + entity_inst_t orig_source_inst; + uint64_t features; +}; + +std::ostream& operator<<(std::ostream&, const PG& pg); + +} + +#if FMT_VERSION >= 90000 +template <> struct fmt::formatter<crimson::osd::PG> : fmt::ostream_formatter {}; +#endif diff --git a/src/crimson/osd/pg_activation_blocker.cc b/src/crimson/osd/pg_activation_blocker.cc new file mode 100644 index 000000000..48ffe3f84 --- /dev/null +++ b/src/crimson/osd/pg_activation_blocker.cc @@ -0,0 +1,36 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab expandtab + +#include "crimson/osd/pg.h" +#include "crimson/osd/pg_activation_blocker.h" + +namespace crimson::osd { + +void PGActivationBlocker::dump_detail(Formatter *f) const +{ + f->dump_stream("pgid") << pg->get_pgid(); +} + +void PGActivationBlocker::unblock() +{ + p.set_value(); + p = {}; +} + +seastar::future<> +PGActivationBlocker::wait(PGActivationBlocker::BlockingEvent::TriggerI&& trigger) +{ + if (pg->get_peering_state().is_active()) { + return seastar::now(); + } else { + return trigger.maybe_record_blocking(p.get_shared_future(), *this); + } +} + +seastar::future<> 
PGActivationBlocker::stop() +{ + p.set_exception(crimson::common::system_shutdown_exception()); + return seastar::now(); +} + +} // namespace crimson::osd diff --git a/src/crimson/osd/pg_activation_blocker.h b/src/crimson/osd/pg_activation_blocker.h new file mode 100644 index 000000000..fff8219d1 --- /dev/null +++ b/src/crimson/osd/pg_activation_blocker.h @@ -0,0 +1,35 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab expandtab + +#pragma once + +#include <seastar/core/future.hh> +#include <seastar/core/shared_future.hh> + +#include "crimson/common/operation.h" +#include "crimson/osd/osd_operation.h" + +namespace crimson::osd { + +class PG; + +class PGActivationBlocker : public crimson::BlockerT<PGActivationBlocker> { + PG *pg; + + const spg_t pgid; + seastar::shared_promise<> p; + +protected: + void dump_detail(Formatter *f) const; + +public: + static constexpr const char *type_name = "PGActivationBlocker"; + using Blocker = PGActivationBlocker; + + PGActivationBlocker(PG *pg) : pg(pg) {} + void unblock(); + seastar::future<> wait(PGActivationBlocker::BlockingEvent::TriggerI&&); + seastar::future<> stop(); +}; + +} // namespace crimson::osd diff --git a/src/crimson/osd/pg_backend.cc b/src/crimson/osd/pg_backend.cc new file mode 100644 index 000000000..02acb9a55 --- /dev/null +++ b/src/crimson/osd/pg_backend.cc @@ -0,0 +1,1811 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "pg_backend.h" + +#include <charconv> +#include <optional> +#include <boost/range/adaptor/filtered.hpp> +#include <boost/range/adaptor/transformed.hpp> +#include <boost/range/algorithm/copy.hpp> +#include <fmt/format.h> +#include <fmt/ostream.h> +#include <seastar/core/print.hh> + +#include "messages/MOSDOp.h" +#include "os/Transaction.h" +#include "common/Checksummer.h" +#include "common/Clock.h" + +#include "crimson/common/exception.h" +#include "crimson/common/tmap_helpers.h" +#include "crimson/os/futurized_collection.h" +#include "crimson/os/futurized_store.h" +#include "crimson/osd/osd_operation.h" +#include "crimson/osd/object_context_loader.h" +#include "replicated_backend.h" +#include "replicated_recovery_backend.h" +#include "ec_backend.h" +#include "exceptions.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_osd); + } +} + +using std::runtime_error; +using std::string; +using std::string_view; +using crimson::common::local_conf; + +std::unique_ptr<PGBackend> +PGBackend::create(pg_t pgid, + const pg_shard_t pg_shard, + const pg_pool_t& pool, + crimson::os::CollectionRef coll, + crimson::osd::ShardServices& shard_services, + const ec_profile_t& ec_profile, + DoutPrefixProvider &dpp) +{ + switch (pool.type) { + case pg_pool_t::TYPE_REPLICATED: + return std::make_unique<ReplicatedBackend>(pgid, pg_shard, + coll, shard_services, + dpp); + case pg_pool_t::TYPE_ERASURE: + return std::make_unique<ECBackend>(pg_shard.shard, coll, shard_services, + std::move(ec_profile), + pool.stripe_width, + dpp); + default: + throw runtime_error(seastar::format("unsupported pool type '{}'", + pool.type)); + } +} + +PGBackend::PGBackend(shard_id_t shard, + CollectionRef coll, + crimson::osd::ShardServices &shard_services, + DoutPrefixProvider &dpp) + : shard{shard}, + coll{coll}, + shard_services{shard_services}, + dpp{dpp}, + store{&shard_services.get_store()} +{} + +PGBackend::load_metadata_iertr::future + <PGBackend::loaded_object_md_t::ref> +PGBackend::load_metadata(const 
hobject_t& oid) +{ + return interruptor::make_interruptible(store->get_attrs( + coll, + ghobject_t{oid, ghobject_t::NO_GEN, shard})).safe_then_interruptible( + [oid](auto &&attrs) -> load_metadata_ertr::future<loaded_object_md_t::ref>{ + loaded_object_md_t::ref ret(new loaded_object_md_t()); + if (auto oiiter = attrs.find(OI_ATTR); oiiter != attrs.end()) { + bufferlist bl = std::move(oiiter->second); + try { + ret->os = ObjectState( + object_info_t(bl, oid), + true); + } catch (const buffer::error&) { + logger().warn("unable to decode ObjectState"); + throw crimson::osd::invalid_argument(); + } + } else { + logger().error( + "load_metadata: object {} present but missing object info", + oid); + return crimson::ct_error::object_corrupted::make(); + } + + if (oid.is_head()) { + // Returning object_corrupted when the object exsits and the + // Snapset is either not found or empty. + bool object_corrupted = true; + if (auto ssiter = attrs.find(SS_ATTR); ssiter != attrs.end()) { + object_corrupted = false; + bufferlist bl = std::move(ssiter->second); + if (bl.length()) { + ret->ssc = new crimson::osd::SnapSetContext(oid.get_snapdir()); + try { + ret->ssc->snapset = SnapSet(bl); + ret->ssc->exists = true; + logger().debug( + "load_metadata: object {} and snapset {} present", + oid, ret->ssc->snapset); + } catch (const buffer::error&) { + logger().warn("unable to decode SnapSet"); + throw crimson::osd::invalid_argument(); + } + } else { + object_corrupted = true; + } + } + if (object_corrupted) { + logger().error( + "load_metadata: object {} present but missing snapset", + oid); + return crimson::ct_error::object_corrupted::make(); + } + } + + return load_metadata_ertr::make_ready_future<loaded_object_md_t::ref>( + std::move(ret)); + }, crimson::ct_error::enoent::handle([oid] { + logger().debug( + "load_metadata: object {} doesn't exist, returning empty metadata", + oid); + return load_metadata_ertr::make_ready_future<loaded_object_md_t::ref>( + new loaded_object_md_t{ + ObjectState( + object_info_t(oid), + false), + oid.is_head() ? 
(new crimson::osd::SnapSetContext(oid)) : nullptr + }); + })); +} + +PGBackend::rep_op_fut_t +PGBackend::mutate_object( + std::set<pg_shard_t> pg_shards, + crimson::osd::ObjectContextRef &&obc, + ceph::os::Transaction&& txn, + osd_op_params_t&& osd_op_p, + epoch_t min_epoch, + epoch_t map_epoch, + std::vector<pg_log_entry_t>&& log_entries) +{ + logger().trace("mutate_object: num_ops={}", txn.get_num_ops()); + if (obc->obs.exists) { +#if 0 + obc->obs.oi.version = ctx->at_version; + obc->obs.oi.prior_version = ctx->obs->oi.version; +#endif + + obc->obs.oi.prior_version = obc->obs.oi.version; + obc->obs.oi.version = osd_op_p.at_version; + if (osd_op_p.user_at_version > obc->obs.oi.user_version) + obc->obs.oi.user_version = osd_op_p.user_at_version; + obc->obs.oi.last_reqid = osd_op_p.req_id; + obc->obs.oi.mtime = osd_op_p.mtime; + obc->obs.oi.local_mtime = ceph_clock_now(); + + // object_info_t + { + ceph::bufferlist osv; + obc->obs.oi.encode_no_oid(osv, CEPH_FEATURES_ALL); + // TODO: get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr)); + txn.setattr(coll->get_cid(), ghobject_t{obc->obs.oi.soid}, OI_ATTR, osv); + } + + // snapset + if (obc->obs.oi.soid.snap == CEPH_NOSNAP) { + logger().debug("final snapset {} in {}", + obc->ssc->snapset, obc->obs.oi.soid); + ceph::bufferlist bss; + encode(obc->ssc->snapset, bss); + txn.setattr(coll->get_cid(), ghobject_t{obc->obs.oi.soid}, SS_ATTR, bss); + obc->ssc->exists = true; + } else { + logger().debug("no snapset (this is a clone)"); + } + } else { + // reset cached ObjectState without enforcing eviction + obc->obs.oi = object_info_t(obc->obs.oi.soid); + } + return _submit_transaction( + std::move(pg_shards), obc->obs.oi.soid, std::move(txn), + std::move(osd_op_p), min_epoch, map_epoch, std::move(log_entries)); +} + +static inline bool _read_verify_data( + const object_info_t& oi, + const ceph::bufferlist& data) +{ + if (oi.is_data_digest() && oi.size == data.length()) { + // whole object? can we verify the checksum? + if (auto crc = data.crc32c(-1); crc != oi.data_digest) { + logger().error("full-object read crc {} != expected {} on {}", + crc, oi.data_digest, oi.soid); + // todo: mark soid missing, perform recovery, and retry + return false; + } + } + return true; +} + +PGBackend::read_ierrorator::future<> +PGBackend::read(const ObjectState& os, OSDOp& osd_op, + object_stat_sum_t& delta_stats) +{ + const auto& oi = os.oi; + const ceph_osd_op& op = osd_op.op; + const uint64_t offset = op.extent.offset; + uint64_t length = op.extent.length; + logger().trace("read: {} {}~{}", oi.soid, offset, length); + + if (!os.exists || os.oi.is_whiteout()) { + logger().debug("{}: {} DNE", __func__, os.oi.soid); + return crimson::ct_error::enoent::make(); + } + // are we beyond truncate_size? 
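+  // (if the client has seen a newer truncate than we have applied, clamp the
+  // effective size to its truncate_size before reading)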
+ size_t size = oi.size; + if ((op.extent.truncate_seq > oi.truncate_seq) && + (op.extent.truncate_size < offset + length) && + (op.extent.truncate_size < size)) { + size = op.extent.truncate_size; + } + if (offset >= size) { + // read size was trimmed to zero and it is expected to do nothing, + return read_errorator::now(); + } + if (!length) { + // read the whole object if length is 0 + length = size; + } + return _read(oi.soid, offset, length, op.flags).safe_then_interruptible_tuple( + [&delta_stats, &oi, &osd_op](auto&& bl) -> read_errorator::future<> { + if (!_read_verify_data(oi, bl)) { + // crc mismatches + return crimson::ct_error::object_corrupted::make(); + } + logger().debug("read: data length: {}", bl.length()); + osd_op.op.extent.length = bl.length(); + osd_op.rval = 0; + delta_stats.num_rd++; + delta_stats.num_rd_kb += shift_round_up(bl.length(), 10); + osd_op.outdata = std::move(bl); + return read_errorator::now(); + }, crimson::ct_error::input_output_error::handle([] { + return read_errorator::future<>{crimson::ct_error::object_corrupted::make()}; + }), + read_errorator::pass_further{}); +} + +PGBackend::read_ierrorator::future<> +PGBackend::sparse_read(const ObjectState& os, OSDOp& osd_op, + object_stat_sum_t& delta_stats) +{ + if (!os.exists || os.oi.is_whiteout()) { + logger().debug("{}: {} DNE", __func__, os.oi.soid); + return crimson::ct_error::enoent::make(); + } + + const auto& op = osd_op.op; + /* clients (particularly cephfs) may send truncate operations out of order + * w.r.t. reads. op.extent.truncate_seq and op.extent.truncate_size allow + * the OSD to determine whether the client submitted read needs to be + * adjusted to compensate for a truncate the OSD hasn't seen yet. + */ + uint64_t adjusted_size = os.oi.size; + const uint64_t offset = op.extent.offset; + uint64_t adjusted_length = op.extent.length; + if ((os.oi.truncate_seq < op.extent.truncate_seq) && + (op.extent.offset + op.extent.length > op.extent.truncate_size) && + (adjusted_size > op.extent.truncate_size)) { + adjusted_size = op.extent.truncate_size; + } + if (offset > adjusted_size) { + adjusted_length = 0; + } else if (offset + adjusted_length > adjusted_size) { + adjusted_length = adjusted_size - offset; + } + logger().trace("sparse_read: {} {}~{}", + os.oi.soid, op.extent.offset, op.extent.length); + return interruptor::make_interruptible(store->fiemap(coll, ghobject_t{os.oi.soid}, + offset, adjusted_length)).safe_then_interruptible( + [&delta_stats, &os, &osd_op, this](auto&& m) { + return seastar::do_with(interval_set<uint64_t>{std::move(m)}, + [&delta_stats, &os, &osd_op, this](auto&& extents) { + return interruptor::make_interruptible(store->readv(coll, ghobject_t{os.oi.soid}, + extents, osd_op.op.flags)).safe_then_interruptible_tuple( + [&delta_stats, &os, &osd_op, &extents](auto&& bl) -> read_errorator::future<> { + if (_read_verify_data(os.oi, bl)) { + osd_op.op.extent.length = bl.length(); + // re-encode since it might be modified + ceph::encode(extents, osd_op.outdata); + encode_destructively(bl, osd_op.outdata); + logger().trace("sparse_read got {} bytes from object {}", + osd_op.op.extent.length, os.oi.soid); + delta_stats.num_rd++; + delta_stats.num_rd_kb += shift_round_up(osd_op.op.extent.length, 10); + return read_errorator::make_ready_future<>(); + } else { + // crc mismatches + return crimson::ct_error::object_corrupted::make(); + } + }, crimson::ct_error::input_output_error::handle([] { + return read_errorator::future<>{crimson::ct_error::object_corrupted::make()}; + }), + 
read_errorator::pass_further{}); + }); + }); +} + +namespace { + + template<class CSum> + PGBackend::checksum_errorator::future<> + do_checksum(ceph::bufferlist& init_value_bl, + size_t chunk_size, + const ceph::bufferlist& buf, + ceph::bufferlist& result) + { + typename CSum::init_value_t init_value; + auto init_value_p = init_value_bl.cbegin(); + try { + decode(init_value, init_value_p); + // chop off the consumed part + init_value_bl.splice(0, init_value_p.get_off()); + } catch (const ceph::buffer::end_of_buffer&) { + logger().warn("{}: init value not provided", __func__); + return crimson::ct_error::invarg::make(); + } + const uint32_t chunk_count = buf.length() / chunk_size; + ceph::bufferptr csum_data{ + ceph::buffer::create(sizeof(typename CSum::value_t) * chunk_count)}; + Checksummer::calculate<CSum>( + init_value, chunk_size, 0, buf.length(), buf, &csum_data); + encode(chunk_count, result); + result.append(std::move(csum_data)); + return PGBackend::checksum_errorator::now(); + } +} + +PGBackend::checksum_ierrorator::future<> +PGBackend::checksum(const ObjectState& os, OSDOp& osd_op) +{ + // sanity tests and normalize the argments + auto& checksum = osd_op.op.checksum; + if (checksum.offset == 0 && checksum.length == 0) { + // zeroed offset+length implies checksum whole object + checksum.length = os.oi.size; + } else if (checksum.offset >= os.oi.size) { + // read size was trimmed to zero, do nothing, + // see PGBackend::read() + return checksum_errorator::now(); + } + if (checksum.chunk_size > 0) { + if (checksum.length == 0) { + logger().warn("{}: length required when chunk size provided", __func__); + return crimson::ct_error::invarg::make(); + } + if (checksum.length % checksum.chunk_size != 0) { + logger().warn("{}: length not aligned to chunk size", __func__); + return crimson::ct_error::invarg::make(); + } + } else { + checksum.chunk_size = checksum.length; + } + if (checksum.length == 0) { + uint32_t count = 0; + encode(count, osd_op.outdata); + return checksum_errorator::now(); + } + + // read the chunk to be checksum'ed + return _read(os.oi.soid, checksum.offset, checksum.length, osd_op.op.flags) + .safe_then_interruptible( + [&osd_op](auto&& read_bl) mutable -> checksum_errorator::future<> { + auto& checksum = osd_op.op.checksum; + if (read_bl.length() != checksum.length) { + logger().warn("checksum: bytes read {} != {}", + read_bl.length(), checksum.length); + return crimson::ct_error::invarg::make(); + } + // calculate its checksum and put the result in outdata + switch (checksum.type) { + case CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH32: + return do_checksum<Checksummer::xxhash32>(osd_op.indata, + checksum.chunk_size, + read_bl, + osd_op.outdata); + case CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH64: + return do_checksum<Checksummer::xxhash64>(osd_op.indata, + checksum.chunk_size, + read_bl, + osd_op.outdata); + case CEPH_OSD_CHECKSUM_OP_TYPE_CRC32C: + return do_checksum<Checksummer::crc32c>(osd_op.indata, + checksum.chunk_size, + read_bl, + osd_op.outdata); + default: + logger().warn("checksum: unknown crc type ({})", + static_cast<uint32_t>(checksum.type)); + return crimson::ct_error::invarg::make(); + } + }); +} + +PGBackend::cmp_ext_ierrorator::future<> +PGBackend::cmp_ext(const ObjectState& os, OSDOp& osd_op) +{ + const ceph_osd_op& op = osd_op.op; + uint64_t obj_size = os.oi.size; + if (os.oi.truncate_seq < op.extent.truncate_seq && + op.extent.offset + op.extent.length > op.extent.truncate_size) { + obj_size = op.extent.truncate_size; + } + uint64_t ext_len; + if (op.extent.offset 
>= obj_size) { + ext_len = 0; + } else if (op.extent.offset + op.extent.length > obj_size) { + ext_len = obj_size - op.extent.offset; + } else { + ext_len = op.extent.length; + } + auto read_ext = ll_read_ierrorator::make_ready_future<ceph::bufferlist>(); + if (ext_len == 0) { + logger().debug("{}: zero length extent", __func__); + } else if (!os.exists || os.oi.is_whiteout()) { + logger().debug("{}: {} DNE", __func__, os.oi.soid); + } else { + read_ext = _read(os.oi.soid, op.extent.offset, ext_len, 0); + } + return read_ext.safe_then_interruptible([&osd_op](auto&& read_bl) + -> cmp_ext_errorator::future<> { + for (unsigned index = 0; index < osd_op.indata.length(); index++) { + char byte_in_op = osd_op.indata[index]; + char byte_from_disk = (index < read_bl.length() ? read_bl[index] : 0); + if (byte_in_op != byte_from_disk) { + logger().debug("cmp_ext: mismatch at {}", index); + // Unlike other ops, we set osd_op.rval here and return a different + // error code via ct_error::cmp_fail. + osd_op.rval = -MAX_ERRNO - index; + return crimson::ct_error::cmp_fail::make(); + } + } + osd_op.rval = 0; + return cmp_ext_errorator::make_ready_future<>(); + }); +} + +PGBackend::stat_ierrorator::future<> +PGBackend::stat( + const ObjectState& os, + OSDOp& osd_op, + object_stat_sum_t& delta_stats) +{ + if (os.exists/* TODO: && !os.is_whiteout() */) { + logger().debug("stat os.oi.size={}, os.oi.mtime={}", os.oi.size, os.oi.mtime); + encode(os.oi.size, osd_op.outdata); + encode(os.oi.mtime, osd_op.outdata); + } else { + logger().debug("stat object does not exist"); + return crimson::ct_error::enoent::make(); + } + delta_stats.num_rd++; + return stat_errorator::now(); +} + +PGBackend::write_iertr::future<> PGBackend::_writefull( + ObjectState& os, + off_t truncate_size, + const bufferlist& bl, + ceph::os::Transaction& txn, + osd_op_params_t& osd_op_params, + object_stat_sum_t& delta_stats, + unsigned flags) +{ + const bool existing = maybe_create_new_object(os, txn, delta_stats); + if (existing && bl.length() < os.oi.size) { + + txn.truncate(coll->get_cid(), ghobject_t{os.oi.soid}, bl.length()); + truncate_update_size_and_usage(delta_stats, os.oi, truncate_size); + + osd_op_params.clean_regions.mark_data_region_dirty( + bl.length(), + os.oi.size - bl.length()); + } + if (bl.length()) { + txn.write( + coll->get_cid(), ghobject_t{os.oi.soid}, 0, bl.length(), + bl, flags); + update_size_and_usage( + delta_stats, os.oi, 0, + bl.length(), true); + osd_op_params.clean_regions.mark_data_region_dirty( + 0, + std::max((uint64_t)bl.length(), os.oi.size)); + } + return seastar::now(); +} + +PGBackend::write_iertr::future<> PGBackend::_truncate( + ObjectState& os, + ceph::os::Transaction& txn, + osd_op_params_t& osd_op_params, + object_stat_sum_t& delta_stats, + size_t offset, + size_t truncate_size, + uint32_t truncate_seq) +{ + if (truncate_seq) { + assert(offset == truncate_size); + if (truncate_seq <= os.oi.truncate_seq) { + logger().debug("{} truncate seq {} <= current {}, no-op", + __func__, truncate_seq, os.oi.truncate_seq); + return write_ertr::make_ready_future<>(); + } else { + logger().debug("{} truncate seq {} > current {}, truncating", + __func__, truncate_seq, os.oi.truncate_seq); + os.oi.truncate_seq = truncate_seq; + os.oi.truncate_size = truncate_size; + } + } + maybe_create_new_object(os, txn, delta_stats); + if (os.oi.size != offset) { + txn.truncate( + coll->get_cid(), + ghobject_t{os.oi.soid}, offset); + if (os.oi.size > offset) { + // TODO: modified_ranges.union_of(trim); + 
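+        // (sketch, illustrative numbers only) Shrinking the object means the
+        // bytes from the new end to the old end disappear, so replicas must
+        // treat [offset, old_size) as dirty: e.g. old size 4096 truncated to
+        // offset 1024 yields mark_data_region_dirty(1024, 3072).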
osd_op_params.clean_regions.mark_data_region_dirty( + offset, + os.oi.size - offset); + } else { + // os.oi.size < offset + osd_op_params.clean_regions.mark_data_region_dirty( + os.oi.size, + offset - os.oi.size); + } + truncate_update_size_and_usage(delta_stats, os.oi, offset); + os.oi.clear_data_digest(); + } + delta_stats.num_wr++; + return write_ertr::now(); +} + +bool PGBackend::maybe_create_new_object( + ObjectState& os, + ceph::os::Transaction& txn, + object_stat_sum_t& delta_stats) +{ + if (!os.exists) { + ceph_assert(!os.oi.is_whiteout()); + os.exists = true; + os.oi.new_object(); + + txn.touch(coll->get_cid(), ghobject_t{os.oi.soid}); + delta_stats.num_objects++; + return false; + } else if (os.oi.is_whiteout()) { + os.oi.clear_flag(object_info_t::FLAG_WHITEOUT); + delta_stats.num_whiteouts--; + } + return true; +} + +void PGBackend::update_size_and_usage(object_stat_sum_t& delta_stats, + object_info_t& oi, uint64_t offset, + uint64_t length, bool write_full) +{ + if (write_full || + (offset + length > oi.size && length)) { + uint64_t new_size = offset + length; + delta_stats.num_bytes -= oi.size; + delta_stats.num_bytes += new_size; + oi.size = new_size; + } + delta_stats.num_wr++; + delta_stats.num_wr_kb += shift_round_up(length, 10); +} + +void PGBackend::truncate_update_size_and_usage(object_stat_sum_t& delta_stats, + object_info_t& oi, + uint64_t truncate_size) +{ + if (oi.size != truncate_size) { + delta_stats.num_bytes -= oi.size; + delta_stats.num_bytes += truncate_size; + oi.size = truncate_size; + } +} + +static bool is_offset_and_length_valid( + const std::uint64_t offset, + const std::uint64_t length) +{ + if (const std::uint64_t max = local_conf()->osd_max_object_size; + offset >= max || length > max || offset + length > max) { + logger().debug("{} osd_max_object_size: {}, offset: {}, len: {}; " + "Hard limit of object size is 4GB", + __func__, max, offset, length); + return false; + } else { + return true; + } +} + +PGBackend::interruptible_future<> PGBackend::set_allochint( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& txn, + object_stat_sum_t& delta_stats) +{ + maybe_create_new_object(os, txn, delta_stats); + + os.oi.expected_object_size = osd_op.op.alloc_hint.expected_object_size; + os.oi.expected_write_size = osd_op.op.alloc_hint.expected_write_size; + os.oi.alloc_hint_flags = osd_op.op.alloc_hint.flags; + txn.set_alloc_hint(coll->get_cid(), + ghobject_t{os.oi.soid}, + os.oi.expected_object_size, + os.oi.expected_write_size, + os.oi.alloc_hint_flags); + return seastar::now(); +} + +PGBackend::write_iertr::future<> PGBackend::write( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& txn, + osd_op_params_t& osd_op_params, + object_stat_sum_t& delta_stats) +{ + const ceph_osd_op& op = osd_op.op; + uint64_t offset = op.extent.offset; + uint64_t length = op.extent.length; + bufferlist buf = osd_op.indata; + if (op.extent.length != osd_op.indata.length()) { + return crimson::ct_error::invarg::make(); + } + + if (!is_offset_and_length_valid(op.extent.offset, op.extent.length)) { + return crimson::ct_error::file_too_large::make(); + } + + if (auto seq = os.oi.truncate_seq; + seq != 0 && op.extent.truncate_seq < seq) { + // old write, arrived after trimtrunc + if (offset + length > os.oi.size) { + // no-op + if (offset > os.oi.size) { + length = 0; + buf.clear(); + } else { + // truncate + auto len = os.oi.size - offset; + buf.splice(len, length); + length = len; + } + } + } else if (op.extent.truncate_seq > seq) { + // write arrives 
before trimtrunc + if (os.exists && !os.oi.is_whiteout()) { + txn.truncate(coll->get_cid(), + ghobject_t{os.oi.soid}, op.extent.truncate_size); + if (op.extent.truncate_size != os.oi.size) { + os.oi.size = length; + if (op.extent.truncate_size > os.oi.size) { + osd_op_params.clean_regions.mark_data_region_dirty(os.oi.size, + op.extent.truncate_size - os.oi.size); + } else { + osd_op_params.clean_regions.mark_data_region_dirty(op.extent.truncate_size, + os.oi.size - op.extent.truncate_size); + } + } + truncate_update_size_and_usage(delta_stats, os.oi, op.extent.truncate_size); + } + os.oi.truncate_seq = op.extent.truncate_seq; + os.oi.truncate_size = op.extent.truncate_size; + } + maybe_create_new_object(os, txn, delta_stats); + if (length == 0) { + if (offset > os.oi.size) { + txn.truncate(coll->get_cid(), ghobject_t{os.oi.soid}, op.extent.offset); + truncate_update_size_and_usage(delta_stats, os.oi, op.extent.offset); + } else { + txn.nop(); + } + } else { + txn.write(coll->get_cid(), ghobject_t{os.oi.soid}, + offset, length, std::move(buf), op.flags); + update_size_and_usage(delta_stats, os.oi, offset, length); + } + osd_op_params.clean_regions.mark_data_region_dirty(op.extent.offset, + op.extent.length); + + return seastar::now(); +} + +PGBackend::interruptible_future<> PGBackend::write_same( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& txn, + osd_op_params_t& osd_op_params, + object_stat_sum_t& delta_stats) +{ + const ceph_osd_op& op = osd_op.op; + const uint64_t len = op.writesame.length; + if (len == 0) { + return seastar::now(); + } + if (op.writesame.data_length == 0 || + len % op.writesame.data_length != 0 || + op.writesame.data_length != osd_op.indata.length()) { + throw crimson::osd::invalid_argument(); + } + ceph::bufferlist repeated_indata; + for (uint64_t size = 0; size < len; size += op.writesame.data_length) { + repeated_indata.append(osd_op.indata); + } + maybe_create_new_object(os, txn, delta_stats); + txn.write(coll->get_cid(), ghobject_t{os.oi.soid}, + op.writesame.offset, len, + std::move(repeated_indata), op.flags); + update_size_and_usage(delta_stats, os.oi, op.writesame.offset, len); + osd_op_params.clean_regions.mark_data_region_dirty(op.writesame.offset, len); + return seastar::now(); +} + +PGBackend::write_iertr::future<> PGBackend::writefull( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& txn, + osd_op_params_t& osd_op_params, + object_stat_sum_t& delta_stats) +{ + const ceph_osd_op& op = osd_op.op; + if (op.extent.length != osd_op.indata.length()) { + return crimson::ct_error::invarg::make(); + } + if (!is_offset_and_length_valid(op.extent.offset, op.extent.length)) { + return crimson::ct_error::file_too_large::make(); + } + + return _writefull( + os, + op.extent.truncate_size, + osd_op.indata, + txn, + osd_op_params, + delta_stats, + op.flags); +} + +PGBackend::rollback_iertr::future<> PGBackend::rollback( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& txn, + osd_op_params_t& osd_op_params, + object_stat_sum_t& delta_stats, + crimson::osd::ObjectContextRef head, + crimson::osd::ObjectContextLoader& obc_loader) +{ + const ceph_osd_op& op = osd_op.op; + snapid_t snapid = (uint64_t)op.snap.snapid; + assert(os.oi.soid.is_head()); + logger().debug("{} deleting {} and rolling back to old snap {}", + __func__, os.oi.soid ,snapid); + hobject_t target_coid = os.oi.soid; + target_coid.snap = snapid; + return obc_loader.with_clone_obc_only<RWState::RWWRITE>( + head, target_coid, + [this, &os, &txn, 
&delta_stats, &osd_op_params] + (auto resolved_obc) { + if (resolved_obc->obs.oi.soid.is_head()) { + // no-op: The resolved oid returned the head object + logger().debug("PGBackend::rollback: loaded head_obc: {}" + " do nothing", + resolved_obc->obs.oi.soid); + return rollback_iertr::now(); + } + /* TODO: https://tracker.ceph.com/issues/59114 This implementation will not + * behave correctly for a rados operation consisting of a mutation followed + * by a rollback to a snapshot since the last mutation of the object. + * The correct behavior would be for the rollback to undo the mutation + * earlier in the operation by resolving to the clone created at the start + * of the operation (see resolve_oid). + * Instead, it will select HEAD leaving that mutation intact since the SnapSet won't + * yet contain that clone. This behavior exists in classic as well. + */ + logger().debug("PGBackend::rollback: loaded clone_obc: {}", + resolved_obc->obs.oi.soid); + // 1) Delete current head + if (os.exists) { + txn.remove(coll->get_cid(), ghobject_t{os.oi.soid, + ghobject_t::NO_GEN, shard}); + } + // 2) Clone correct snapshot into head + txn.clone(coll->get_cid(), ghobject_t{resolved_obc->obs.oi.soid}, + ghobject_t{os.oi.soid}); + // Copy clone obc.os.oi to os.oi + os.oi.clear_flag(object_info_t::FLAG_WHITEOUT); + os.oi.copy_user_bits(resolved_obc->obs.oi); + delta_stats.num_bytes -= os.oi.size; + delta_stats.num_bytes += resolved_obc->obs.oi.size; + osd_op_params.clean_regions.mark_data_region_dirty(0, + std::max(os.oi.size, resolved_obc->obs.oi.size)); + osd_op_params.clean_regions.mark_omap_dirty(); + // TODO: 3) Calculate clone_overlaps by following overlaps + // forward from rollback snapshot + // https://tracker.ceph.com/issues/58263 + return rollback_iertr::now(); + }).safe_then_interruptible([] { + logger().debug("PGBackend::rollback succefully"); + return rollback_iertr::now(); + },// there's no snapshot here, or there's no object. + // if there's no snapshot, we delete the object; + // otherwise, do nothing. 
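+  // Error-dispatch pattern used here (sketch of the shape only):
+  //   .safe_then_interruptible(on_success,
+  //     crimson::ct_error::enoent::handle(on_missing_target),
+  //     rollback_ertr::pass_further{},        // re-raise declared errors
+  //     crimson::ct_error::assert_all{...});  // anything else is a bug
+  // on_missing_target below simply removes the head, per the comment above.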
+ crimson::ct_error::enoent::handle( + [this, &os, &snapid, &txn, &delta_stats] { + logger().debug("PGBackend::rollback: deleting head on {}" + " with snap_id of {}" + " because got ENOENT|whiteout on obc lookup", + os.oi.soid, snapid); + return remove(os, txn, delta_stats, false); + }), + rollback_ertr::pass_further{}, + crimson::ct_error::assert_all{"unexpected error in rollback"} + ); +} + +PGBackend::append_ierrorator::future<> PGBackend::append( + ObjectState& os, + OSDOp& osd_op, + ceph::os::Transaction& txn, + osd_op_params_t& osd_op_params, + object_stat_sum_t& delta_stats) +{ + const ceph_osd_op& op = osd_op.op; + if (op.extent.length != osd_op.indata.length()) { + return crimson::ct_error::invarg::make(); + } + maybe_create_new_object(os, txn, delta_stats); + if (op.extent.length) { + txn.write(coll->get_cid(), ghobject_t{os.oi.soid}, + os.oi.size /* offset */, op.extent.length, + std::move(osd_op.indata), op.flags); + update_size_and_usage(delta_stats, os.oi, os.oi.size, + op.extent.length); + osd_op_params.clean_regions.mark_data_region_dirty(os.oi.size, + op.extent.length); + } + return seastar::now(); +} + +PGBackend::write_iertr::future<> PGBackend::truncate( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& txn, + osd_op_params_t& osd_op_params, + object_stat_sum_t& delta_stats) +{ + if (!os.exists || os.oi.is_whiteout()) { + logger().debug("{} object dne, truncate is a no-op", __func__); + return write_ertr::now(); + } + const ceph_osd_op& op = osd_op.op; + if (!is_offset_and_length_valid(op.extent.offset, op.extent.length)) { + return crimson::ct_error::file_too_large::make(); + } + return _truncate( + os, txn, osd_op_params, delta_stats, + op.extent.offset, op.extent.truncate_size, op.extent.truncate_seq); +} + +PGBackend::write_iertr::future<> PGBackend::zero( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& txn, + osd_op_params_t& osd_op_params, + object_stat_sum_t& delta_stats) +{ + if (!os.exists || os.oi.is_whiteout()) { + logger().debug("{} object dne, zero is a no-op", __func__); + return write_ertr::now(); + } + const ceph_osd_op& op = osd_op.op; + if (!is_offset_and_length_valid(op.extent.offset, op.extent.length)) { + return crimson::ct_error::file_too_large::make(); + } + + if (op.extent.offset >= os.oi.size || op.extent.length == 0) { + return write_iertr::now(); // noop + } + + if (op.extent.offset + op.extent.length >= os.oi.size) { + return _truncate( + os, txn, osd_op_params, delta_stats, + op.extent.offset, op.extent.truncate_size, op.extent.truncate_seq); + } + + txn.zero(coll->get_cid(), + ghobject_t{os.oi.soid}, + op.extent.offset, + op.extent.length); + // TODO: modified_ranges.union_of(zeroed); + osd_op_params.clean_regions.mark_data_region_dirty(op.extent.offset, + op.extent.length); + delta_stats.num_wr++; + os.oi.clear_data_digest(); + return write_ertr::now(); +} + +PGBackend::create_iertr::future<> PGBackend::create( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& txn, + object_stat_sum_t& delta_stats) +{ + if (os.exists && !os.oi.is_whiteout() && + (osd_op.op.flags & CEPH_OSD_OP_FLAG_EXCL)) { + // this is an exclusive create + return crimson::ct_error::eexist::make(); + } + + if (osd_op.indata.length()) { + // handle the legacy. `category` is no longer implemented. 
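+    // Older clients could encode a "category" string into the create op's
+    // indata; it is decoded below only to validate the payload, then dropped.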
+ try { + auto p = osd_op.indata.cbegin(); + std::string category; + decode(category, p); + } catch (buffer::error&) { + return crimson::ct_error::invarg::make(); + } + } + maybe_create_new_object(os, txn, delta_stats); + txn.create(coll->get_cid(), + ghobject_t{os.oi.soid, ghobject_t::NO_GEN, shard}); + return seastar::now(); +} + +PGBackend::interruptible_future<> +PGBackend::remove(ObjectState& os, ceph::os::Transaction& txn) +{ + // todo: snapset + txn.remove(coll->get_cid(), + ghobject_t{os.oi.soid, ghobject_t::NO_GEN, shard}); + os.oi.size = 0; + os.oi.new_object(); + os.exists = false; + // todo: update watchers + if (os.oi.is_whiteout()) { + os.oi.clear_flag(object_info_t::FLAG_WHITEOUT); + } + return seastar::now(); +} + +PGBackend::remove_iertr::future<> +PGBackend::remove(ObjectState& os, ceph::os::Transaction& txn, + object_stat_sum_t& delta_stats, bool whiteout) +{ + if (!os.exists) { + return crimson::ct_error::enoent::make(); + } + + if (!os.exists) { + logger().debug("{} {} does not exist",__func__, os.oi.soid); + return seastar::now(); + } + if (whiteout && os.oi.is_whiteout()) { + logger().debug("{} whiteout set on {} ",__func__, os.oi.soid); + return seastar::now(); + } + txn.remove(coll->get_cid(), + ghobject_t{os.oi.soid, ghobject_t::NO_GEN, shard}); + delta_stats.num_bytes -= os.oi.size; + os.oi.size = 0; + os.oi.new_object(); + + // todo: clone_overlap + if (whiteout) { + logger().debug("{} setting whiteout on {} ",__func__, os.oi.soid); + os.oi.set_flag(object_info_t::FLAG_WHITEOUT); + delta_stats.num_whiteouts++; + txn.create(coll->get_cid(), + ghobject_t{os.oi.soid, ghobject_t::NO_GEN, shard}); + return seastar::now(); + } + // todo: update watchers + if (os.oi.is_whiteout()) { + os.oi.clear_flag(object_info_t::FLAG_WHITEOUT); + delta_stats.num_whiteouts--; + } + delta_stats.num_objects--; + os.exists = false; + return seastar::now(); +} + +PGBackend::interruptible_future<std::tuple<std::vector<hobject_t>, hobject_t>> +PGBackend::list_objects(const hobject_t& start, uint64_t limit) const +{ + auto gstart = start.is_min() ? 
ghobject_t{} : ghobject_t{start, 0, shard}; + return interruptor::make_interruptible(store->list_objects(coll, + gstart, + ghobject_t::get_max(), + limit)) + .then_interruptible([](auto ret) { + auto& [gobjects, next] = ret; + std::vector<hobject_t> objects; + boost::copy(gobjects | + boost::adaptors::filtered([](const ghobject_t& o) { + if (o.is_pgmeta()) { + return false; + } else if (o.hobj.is_temp()) { + return false; + } else { + return o.is_no_gen(); + } + }) | + boost::adaptors::transformed([](const ghobject_t& o) { + return o.hobj; + }), + std::back_inserter(objects)); + return seastar::make_ready_future<std::tuple<std::vector<hobject_t>, hobject_t>>( + std::make_tuple(objects, next.hobj)); + }); +} + +PGBackend::setxattr_ierrorator::future<> PGBackend::setxattr( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& txn, + object_stat_sum_t& delta_stats) +{ + if (local_conf()->osd_max_attr_size > 0 && + osd_op.op.xattr.value_len > local_conf()->osd_max_attr_size) { + return crimson::ct_error::file_too_large::make(); + } + + const auto max_name_len = std::min<uint64_t>( + store->get_max_attr_name_length(), local_conf()->osd_max_attr_name_len); + if (osd_op.op.xattr.name_len > max_name_len) { + return crimson::ct_error::enametoolong::make(); + } + + maybe_create_new_object(os, txn, delta_stats); + + std::string name{"_"}; + ceph::bufferlist val; + { + auto bp = osd_op.indata.cbegin(); + bp.copy(osd_op.op.xattr.name_len, name); + bp.copy(osd_op.op.xattr.value_len, val); + } + logger().debug("setxattr on obj={} for attr={}", os.oi.soid, name); + txn.setattr(coll->get_cid(), ghobject_t{os.oi.soid}, name, val); + delta_stats.num_wr++; + return seastar::now(); +} + +PGBackend::get_attr_ierrorator::future<> PGBackend::getxattr( + const ObjectState& os, + OSDOp& osd_op, + object_stat_sum_t& delta_stats) const +{ + std::string name; + ceph::bufferlist val; + { + auto bp = osd_op.indata.cbegin(); + std::string aname; + bp.copy(osd_op.op.xattr.name_len, aname); + name = "_" + aname; + } + logger().debug("getxattr on obj={} for attr={}", os.oi.soid, name); + return getxattr(os.oi.soid, std::move(name)).safe_then_interruptible( + [&delta_stats, &osd_op] (ceph::bufferlist&& val) { + osd_op.outdata = std::move(val); + osd_op.op.xattr.value_len = osd_op.outdata.length(); + delta_stats.num_rd++; + delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10); + return get_attr_errorator::now(); + }); +} + +PGBackend::get_attr_ierrorator::future<ceph::bufferlist> +PGBackend::getxattr( + const hobject_t& soid, + std::string_view key) const +{ + return store->get_attr(coll, ghobject_t{soid}, key); +} + +PGBackend::get_attr_ierrorator::future<ceph::bufferlist> +PGBackend::getxattr( + const hobject_t& soid, + std::string&& key) const +{ + return seastar::do_with(key, [this, &soid](auto &key) { + return store->get_attr(coll, ghobject_t{soid}, key); + }); +} + +PGBackend::get_attr_ierrorator::future<> PGBackend::get_xattrs( + const ObjectState& os, + OSDOp& osd_op, + object_stat_sum_t& delta_stats) const +{ + return store->get_attrs(coll, ghobject_t{os.oi.soid}).safe_then( + [&delta_stats, &osd_op](auto&& attrs) { + std::vector<std::pair<std::string, bufferlist>> user_xattrs; + ceph::bufferlist bl; + for (auto& [key, val] : attrs) { + if (key.size() > 1 && key[0] == '_') { + bl.append(std::move(val)); + user_xattrs.emplace_back(key.substr(1), std::move(bl)); + } + } + ceph::encode(user_xattrs, osd_op.outdata); + delta_stats.num_rd++; + delta_stats.num_rd_kb += 
shift_round_up(bl.length(), 10); + return get_attr_errorator::now(); + }); +} + +namespace { + +template<typename U, typename V> +int do_cmp_xattr(int op, const U& lhs, const V& rhs) +{ + switch (op) { + case CEPH_OSD_CMPXATTR_OP_EQ: + return lhs == rhs; + case CEPH_OSD_CMPXATTR_OP_NE: + return lhs != rhs; + case CEPH_OSD_CMPXATTR_OP_GT: + return lhs > rhs; + case CEPH_OSD_CMPXATTR_OP_GTE: + return lhs >= rhs; + case CEPH_OSD_CMPXATTR_OP_LT: + return lhs < rhs; + case CEPH_OSD_CMPXATTR_OP_LTE: + return lhs <= rhs; + default: + return -EINVAL; + } +} + +} // anonymous namespace + +static int do_xattr_cmp_u64(int op, uint64_t lhs, bufferlist& rhs_xattr) +{ + uint64_t rhs; + + if (rhs_xattr.length() > 0) { + const char* first = rhs_xattr.c_str(); + if (auto [p, ec] = std::from_chars(first, first + rhs_xattr.length(), rhs); + ec != std::errc()) { + return -EINVAL; + } + } else { + rhs = 0; + } + logger().debug("do_xattr_cmp_u64 '{}' vs '{}' op {}", lhs, rhs, op); + return do_cmp_xattr(op, lhs, rhs); +} + +PGBackend::cmp_xattr_ierrorator::future<> PGBackend::cmp_xattr( + const ObjectState& os, + OSDOp& osd_op, + object_stat_sum_t& delta_stats) const +{ + std::string name{"_"}; + auto bp = osd_op.indata.cbegin(); + bp.copy(osd_op.op.xattr.name_len, name); + + logger().debug("cmpxattr on obj={} for attr={}", os.oi.soid, name); + return getxattr(os.oi.soid, std::move(name)).safe_then_interruptible( + [&delta_stats, &osd_op] (auto &&xattr) -> cmp_xattr_ierrorator::future<> { + delta_stats.num_rd++; + delta_stats.num_rd_kb += shift_round_up(osd_op.op.xattr.value_len, 10); + + int result = 0; + auto bp = osd_op.indata.cbegin(); + bp += osd_op.op.xattr.name_len; + + switch (osd_op.op.xattr.cmp_mode) { + case CEPH_OSD_CMPXATTR_MODE_STRING: + { + string lhs; + bp.copy(osd_op.op.xattr.value_len, lhs); + string_view rhs(xattr.c_str(), xattr.length()); + result = do_cmp_xattr(osd_op.op.xattr.cmp_op, lhs, rhs); + logger().debug("cmpxattr lhs={}, rhs={}", lhs, rhs); + } + break; + case CEPH_OSD_CMPXATTR_MODE_U64: + { + uint64_t lhs; + try { + decode(lhs, bp); + } catch (ceph::buffer::error& e) { + logger().info("cmp_xattr: buffer error expection"); + result = -EINVAL; + break; + } + result = do_xattr_cmp_u64(osd_op.op.xattr.cmp_op, lhs, xattr); + } + break; + default: + logger().info("bad cmp mode {}", osd_op.op.xattr.cmp_mode); + result = -EINVAL; + } + if (result == 0) { + logger().info("cmp_xattr: comparison returned false"); + return crimson::ct_error::ecanceled::make(); + } else if (result == -EINVAL) { + return crimson::ct_error::invarg::make(); + } else { + osd_op.rval = 1; + return cmp_xattr_ierrorator::now(); + } + }).handle_error_interruptible( + crimson::ct_error::enodata::handle([&delta_stats, &osd_op] () + ->cmp_xattr_errorator::future<> { + delta_stats.num_rd++; + delta_stats.num_rd_kb += shift_round_up(osd_op.op.xattr.value_len, 10); + return crimson::ct_error::ecanceled::make(); + }), + cmp_xattr_errorator::pass_further{} + ); +} + +PGBackend::rm_xattr_iertr::future<> +PGBackend::rm_xattr( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& txn) +{ + if (!os.exists || os.oi.is_whiteout()) { + logger().debug("{}: {} DNE", __func__, os.oi.soid); + return crimson::ct_error::enoent::make(); + } + auto bp = osd_op.indata.cbegin(); + string attr_name{"_"}; + bp.copy(osd_op.op.xattr.name_len, attr_name); + txn.rmattr(coll->get_cid(), ghobject_t{os.oi.soid}, attr_name); + return rm_xattr_iertr::now(); +} + +void PGBackend::clone( + /* const */object_info_t& snap_oi, + const 
ObjectState& os, + const ObjectState& d_os, + ceph::os::Transaction& txn) +{ + // See OpsExecutor::execute_clone documentation + txn.clone(coll->get_cid(), ghobject_t{os.oi.soid}, ghobject_t{d_os.oi.soid}); + { + ceph::bufferlist bv; + snap_oi.encode_no_oid(bv, CEPH_FEATURES_ALL); + txn.setattr(coll->get_cid(), ghobject_t{d_os.oi.soid}, OI_ATTR, bv); + } + txn.rmattr(coll->get_cid(), ghobject_t{d_os.oi.soid}, SS_ATTR); +} + +using get_omap_ertr = + crimson::os::FuturizedStore::Shard::read_errorator::extend< + crimson::ct_error::enodata>; +using get_omap_iertr = + ::crimson::interruptible::interruptible_errorator< + ::crimson::osd::IOInterruptCondition, + get_omap_ertr>; +static +get_omap_iertr::future< + crimson::os::FuturizedStore::Shard::omap_values_t> +maybe_get_omap_vals_by_keys( + crimson::os::FuturizedStore::Shard* store, + const crimson::os::CollectionRef& coll, + const object_info_t& oi, + const std::set<std::string>& keys_to_get) +{ + if (oi.is_omap()) { + return store->omap_get_values(coll, ghobject_t{oi.soid}, keys_to_get); + } else { + return crimson::ct_error::enodata::make(); + } +} + +static +get_omap_iertr::future< + std::tuple<bool, crimson::os::FuturizedStore::Shard::omap_values_t>> +maybe_get_omap_vals( + crimson::os::FuturizedStore::Shard* store, + const crimson::os::CollectionRef& coll, + const object_info_t& oi, + const std::string& start_after) +{ + if (oi.is_omap()) { + return store->omap_get_values(coll, ghobject_t{oi.soid}, start_after); + } else { + return crimson::ct_error::enodata::make(); + } +} + +PGBackend::ll_read_ierrorator::future<ceph::bufferlist> +PGBackend::omap_get_header( + const crimson::os::CollectionRef& c, + const ghobject_t& oid) const +{ + return store->omap_get_header(c, oid) + .handle_error( + crimson::ct_error::enodata::handle([] { + return seastar::make_ready_future<bufferlist>(); + }), + ll_read_errorator::pass_further{} + ); +} + +PGBackend::ll_read_ierrorator::future<> +PGBackend::omap_get_header( + const ObjectState& os, + OSDOp& osd_op, + object_stat_sum_t& delta_stats) const +{ + if (os.oi.is_omap()) { + return omap_get_header(coll, ghobject_t{os.oi.soid}).safe_then_interruptible( + [&delta_stats, &osd_op] (ceph::bufferlist&& header) { + osd_op.outdata = std::move(header); + delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10); + delta_stats.num_rd++; + return seastar::now(); + }); + } else { + // no omap? return empty data but not ENOENT. This is imporant for + // the case when the object is being creating due to to may_write(). 
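+    // From the client's perspective the op still succeeds: outdata stays
+    // empty and rval keeps its default of 0, so e.g. an omap_get_header()
+    // on a freshly created object sees a zero-length header, not -ENOENT.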
+ return seastar::now(); + } +} + +PGBackend::ll_read_ierrorator::future<> +PGBackend::omap_get_keys( + const ObjectState& os, + OSDOp& osd_op, + object_stat_sum_t& delta_stats) const +{ + if (!os.exists || os.oi.is_whiteout()) { + logger().debug("{}: object does not exist: {}", os.oi.soid); + return crimson::ct_error::enoent::make(); + } + std::string start_after; + uint64_t max_return; + try { + auto p = osd_op.indata.cbegin(); + decode(start_after, p); + decode(max_return, p); + } catch (buffer::error&) { + throw crimson::osd::invalid_argument{}; + } + max_return = + std::min(max_return, local_conf()->osd_max_omap_entries_per_request); + + + // TODO: truly chunk the reading + return maybe_get_omap_vals(store, coll, os.oi, start_after).safe_then_interruptible( + [=,&delta_stats, &osd_op](auto ret) { + ceph::bufferlist result; + bool truncated = false; + uint32_t num = 0; + for (auto &[key, val] : std::get<1>(ret)) { + if (num >= max_return || + result.length() >= local_conf()->osd_max_omap_bytes_per_request) { + truncated = true; + break; + } + encode(key, result); + ++num; + } + encode(num, osd_op.outdata); + osd_op.outdata.claim_append(result); + encode(truncated, osd_op.outdata); + delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10); + delta_stats.num_rd++; + return seastar::now(); + }).handle_error_interruptible( + crimson::ct_error::enodata::handle([&osd_op] { + uint32_t num = 0; + bool truncated = false; + encode(num, osd_op.outdata); + encode(truncated, osd_op.outdata); + osd_op.rval = 0; + return seastar::now(); + }), + ll_read_errorator::pass_further{} + ); +} +static +PGBackend::omap_cmp_ertr::future<> do_omap_val_cmp( + std::map<std::string, bufferlist, std::less<>> out, + std::map<std::string, std::pair<bufferlist, int>> assertions) +{ + bufferlist empty; + for (const auto &[akey, avalue] : assertions) { + const auto [abl, aflag] = avalue; + auto out_entry = out.find(akey); + bufferlist &bl = (out_entry != out.end()) ? 
out_entry->second : empty; + switch (aflag) { + case CEPH_OSD_CMPXATTR_OP_EQ: + if (!(bl == abl)) { + return crimson::ct_error::ecanceled::make(); + } + break; + case CEPH_OSD_CMPXATTR_OP_LT: + if (!(bl < abl)) { + return crimson::ct_error::ecanceled::make(); + } + break; + case CEPH_OSD_CMPXATTR_OP_GT: + if (!(bl > abl)) { + return crimson::ct_error::ecanceled::make(); + } + break; + default: + return crimson::ct_error::invarg::make(); + } + } + return PGBackend::omap_cmp_ertr::now(); +} +PGBackend::omap_cmp_iertr::future<> +PGBackend::omap_cmp( + const ObjectState& os, + OSDOp& osd_op, + object_stat_sum_t& delta_stats) const +{ + if (!os.exists || os.oi.is_whiteout()) { + logger().debug("{}: object does not exist: {}", os.oi.soid); + return crimson::ct_error::enoent::make(); + } + + auto bp = osd_op.indata.cbegin(); + std::map<std::string, std::pair<bufferlist, int> > assertions; + try { + decode(assertions, bp); + } catch (buffer::error&) { + return crimson::ct_error::invarg::make(); + } + + delta_stats.num_rd++; + if (os.oi.is_omap()) { + std::set<std::string> to_get; + for (auto &i: assertions) { + to_get.insert(i.first); + } + return store->omap_get_values(coll, ghobject_t{os.oi.soid}, to_get) + .safe_then([=, &osd_op] (auto&& out) -> omap_cmp_iertr::future<> { + osd_op.rval = 0; + return do_omap_val_cmp(out, assertions); + }); + } else { + return crimson::ct_error::ecanceled::make(); + } +} +PGBackend::ll_read_ierrorator::future<> +PGBackend::omap_get_vals( + const ObjectState& os, + OSDOp& osd_op, + object_stat_sum_t& delta_stats) const +{ + if (!os.exists || os.oi.is_whiteout()) { + logger().debug("{}: object does not exist: {}", os.oi.soid); + return crimson::ct_error::enoent::make(); + } + std::string start_after; + uint64_t max_return; + std::string filter_prefix; + try { + auto p = osd_op.indata.cbegin(); + decode(start_after, p); + decode(max_return, p); + decode(filter_prefix, p); + } catch (buffer::error&) { + throw crimson::osd::invalid_argument{}; + } + + max_return = \ + std::min(max_return, local_conf()->osd_max_omap_entries_per_request); + delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10); + delta_stats.num_rd++; + + // TODO: truly chunk the reading + return maybe_get_omap_vals(store, coll, os.oi, start_after) + .safe_then_interruptible( + [=, &osd_op] (auto&& ret) { + auto [done, vals] = std::move(ret); + assert(done); + ceph::bufferlist result; + bool truncated = false; + uint32_t num = 0; + auto iter = filter_prefix > start_after ? 
vals.lower_bound(filter_prefix) + : std::begin(vals); + for (; iter != std::end(vals); ++iter) { + const auto& [key, value] = *iter; + if (key.substr(0, filter_prefix.size()) != filter_prefix) { + break; + } else if (num >= max_return || + result.length() >= local_conf()->osd_max_omap_bytes_per_request) { + truncated = true; + break; + } + encode(key, result); + encode(value, result); + ++num; + } + encode(num, osd_op.outdata); + osd_op.outdata.claim_append(result); + encode(truncated, osd_op.outdata); + return ll_read_errorator::now(); + }).handle_error_interruptible( + crimson::ct_error::enodata::handle([&osd_op] { + encode(uint32_t{0} /* num */, osd_op.outdata); + encode(bool{false} /* truncated */, osd_op.outdata); + osd_op.rval = 0; + return ll_read_errorator::now(); + }), + ll_read_errorator::pass_further{} + ); +} + +PGBackend::ll_read_ierrorator::future<> +PGBackend::omap_get_vals_by_keys( + const ObjectState& os, + OSDOp& osd_op, + object_stat_sum_t& delta_stats) const +{ + if (!os.exists || os.oi.is_whiteout()) { + logger().debug("{}: object does not exist: {}", __func__, os.oi.soid); + return crimson::ct_error::enoent::make(); + } + + std::set<std::string> keys_to_get; + try { + auto p = osd_op.indata.cbegin(); + decode(keys_to_get, p); + } catch (buffer::error&) { + throw crimson::osd::invalid_argument(); + } + delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10); + delta_stats.num_rd++; + return maybe_get_omap_vals_by_keys(store, coll, os.oi, keys_to_get) + .safe_then_interruptible( + [&osd_op] (crimson::os::FuturizedStore::Shard::omap_values_t&& vals) { + encode(vals, osd_op.outdata); + return ll_read_errorator::now(); + }).handle_error_interruptible( + crimson::ct_error::enodata::handle([&osd_op] { + uint32_t num = 0; + encode(num, osd_op.outdata); + osd_op.rval = 0; + return ll_read_errorator::now(); + }), + ll_read_errorator::pass_further{} + ); +} + +PGBackend::interruptible_future<> +PGBackend::omap_set_vals( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& txn, + osd_op_params_t& osd_op_params, + object_stat_sum_t& delta_stats) +{ + maybe_create_new_object(os, txn, delta_stats); + + ceph::bufferlist to_set_bl; + try { + auto p = osd_op.indata.cbegin(); + decode_str_str_map_to_bl(p, &to_set_bl); + } catch (buffer::error&) { + throw crimson::osd::invalid_argument{}; + } + + txn.omap_setkeys(coll->get_cid(), ghobject_t{os.oi.soid}, to_set_bl); + osd_op_params.clean_regions.mark_omap_dirty(); + delta_stats.num_wr++; + delta_stats.num_wr_kb += shift_round_up(to_set_bl.length(), 10); + os.oi.set_flag(object_info_t::FLAG_OMAP); + os.oi.clear_omap_digest(); + return seastar::now(); +} + +PGBackend::interruptible_future<> +PGBackend::omap_set_header( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& txn, + osd_op_params_t& osd_op_params, + object_stat_sum_t& delta_stats) +{ + maybe_create_new_object(os, txn, delta_stats); + txn.omap_setheader(coll->get_cid(), ghobject_t{os.oi.soid}, osd_op.indata); + osd_op_params.clean_regions.mark_omap_dirty(); + delta_stats.num_wr++; + os.oi.set_flag(object_info_t::FLAG_OMAP); + os.oi.clear_omap_digest(); + return seastar::now(); +} + +PGBackend::interruptible_future<> PGBackend::omap_remove_range( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& txn, + object_stat_sum_t& delta_stats) +{ + std::string key_begin, key_end; + try { + auto p = osd_op.indata.cbegin(); + decode(key_begin, p); + decode(key_end, p); + } catch (buffer::error& e) { + throw 
crimson::osd::invalid_argument{}; + } + txn.omap_rmkeyrange(coll->get_cid(), ghobject_t{os.oi.soid}, key_begin, key_end); + delta_stats.num_wr++; + os.oi.clear_omap_digest(); + return seastar::now(); +} + +PGBackend::interruptible_future<> PGBackend::omap_remove_key( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& txn) +{ + ceph::bufferlist to_rm_bl; + try { + auto p = osd_op.indata.cbegin(); + decode_str_set_to_bl(p, &to_rm_bl); + } catch (buffer::error& e) { + throw crimson::osd::invalid_argument{}; + } + txn.omap_rmkeys(coll->get_cid(), ghobject_t{os.oi.soid}, to_rm_bl); + // TODO: + // ctx->clean_regions.mark_omap_dirty(); + // ctx->delta_stats.num_wr++; + os.oi.clear_omap_digest(); + return seastar::now(); +} + +PGBackend::omap_clear_iertr::future<> +PGBackend::omap_clear( + ObjectState& os, + OSDOp& osd_op, + ceph::os::Transaction& txn, + osd_op_params_t& osd_op_params, + object_stat_sum_t& delta_stats) +{ + if (!os.exists || os.oi.is_whiteout()) { + logger().debug("{}: object does not exist: {}", os.oi.soid); + return crimson::ct_error::enoent::make(); + } + if (!os.oi.is_omap()) { + return omap_clear_ertr::now(); + } + txn.omap_clear(coll->get_cid(), ghobject_t{os.oi.soid}); + osd_op_params.clean_regions.mark_omap_dirty(); + delta_stats.num_wr++; + os.oi.clear_omap_digest(); + os.oi.clear_flag(object_info_t::FLAG_OMAP); + return omap_clear_ertr::now(); +} + +PGBackend::interruptible_future<struct stat> +PGBackend::stat( + CollectionRef c, + const ghobject_t& oid) const +{ + return store->stat(c, oid); +} + +PGBackend::read_errorator::future<std::map<uint64_t, uint64_t>> +PGBackend::fiemap( + CollectionRef c, + const ghobject_t& oid, + uint64_t off, + uint64_t len) +{ + return store->fiemap(c, oid, off, len); +} + +PGBackend::write_iertr::future<> PGBackend::tmapput( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& txn, + object_stat_sum_t& delta_stats, + osd_op_params_t& osd_op_params) +{ + logger().debug("PGBackend::tmapput: {}", os.oi.soid); + auto ret = crimson::common::do_tmap_put(osd_op.indata.cbegin()); + if (!ret.has_value()) { + logger().debug("PGBackend::tmapup: {}, ret={}", os.oi.soid, ret.error()); + ceph_assert(ret.error() == -EINVAL); + return crimson::ct_error::invarg::make(); + } else { + auto bl = std::move(ret.value()); + return _writefull( + os, + bl.length(), + std::move(bl), + txn, + osd_op_params, + delta_stats, + 0); + } +} + +PGBackend::tmapup_iertr::future<> PGBackend::tmapup( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& txn, + object_stat_sum_t& delta_stats, + osd_op_params_t& osd_op_params) +{ + logger().debug("PGBackend::tmapup: {}", os.oi.soid); + return PGBackend::write_iertr::now( + ).si_then([this, &os] { + return _read(os.oi.soid, 0, os.oi.size, 0); + }).handle_error_interruptible( + crimson::ct_error::enoent::handle([](auto &) { + return seastar::make_ready_future<bufferlist>(); + }), + PGBackend::write_iertr::pass_further{}, + crimson::ct_error::assert_all{"read error in mutate_object_contents"} + ).si_then([this, &os, &osd_op, &txn, + &delta_stats, &osd_op_params] + (auto &&bl) mutable -> PGBackend::tmapup_iertr::future<> { + auto result = crimson::common::do_tmap_up( + osd_op.indata.cbegin(), + std::move(bl)); + if (!result.has_value()) { + int ret = result.error(); + logger().debug("PGBackend::tmapup: {}, ret={}", os.oi.soid, ret); + switch (ret) { + case -EEXIST: + return crimson::ct_error::eexist::make(); + case -ENOENT: + return crimson::ct_error::enoent::make(); + case -EINVAL: + 
return crimson::ct_error::invarg::make(); + default: + ceph_assert(0 == "impossible error"); + return crimson::ct_error::invarg::make(); + } + } + + logger().debug( + "PGBackend::tmapup: {}, result.value.length()={}, ret=0", + os.oi.soid, result.value().length()); + return _writefull( + os, + result.value().length(), + result.value(), + txn, + osd_op_params, + delta_stats, + 0); + }); +} + +PGBackend::read_ierrorator::future<> PGBackend::tmapget( + const ObjectState& os, + OSDOp& osd_op, + object_stat_sum_t& delta_stats) +{ + logger().debug("PGBackend::tmapget: {}", os.oi.soid); + const auto& oi = os.oi; + logger().debug("PGBackend::tmapget: read {} 0~{}", oi.soid, oi.size); + if (!os.exists || os.oi.is_whiteout()) { + logger().debug("PGBackend::tmapget: {} DNE", os.oi.soid); + return crimson::ct_error::enoent::make(); + } + + return _read(oi.soid, 0, oi.size, 0).safe_then_interruptible_tuple( + [&delta_stats, &osd_op](auto&& bl) -> read_errorator::future<> { + logger().debug("PGBackend::tmapget: data length: {}", bl.length()); + osd_op.op.extent.length = bl.length(); + osd_op.rval = 0; + delta_stats.num_rd++; + delta_stats.num_rd_kb += shift_round_up(bl.length(), 10); + osd_op.outdata = std::move(bl); + return read_errorator::now(); + }, crimson::ct_error::input_output_error::handle([] { + return read_errorator::future<>{crimson::ct_error::object_corrupted::make()}; + }), + read_errorator::pass_further{}); +} + diff --git a/src/crimson/osd/pg_backend.h b/src/crimson/osd/pg_backend.h new file mode 100644 index 000000000..fbad37d4c --- /dev/null +++ b/src/crimson/osd/pg_backend.h @@ -0,0 +1,448 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <map> +#include <memory> +#include <string> +#include <boost/container/flat_set.hpp> + +#include "include/rados.h" + +#include "crimson/os/futurized_store.h" +#include "crimson/os/futurized_collection.h" +#include "crimson/osd/acked_peers.h" +#include "crimson/common/shared_lru.h" +#include "messages/MOSDOp.h" +#include "messages/MOSDOpReply.h" +#include "os/Transaction.h" +#include "osd/osd_types.h" +#include "crimson/osd/object_context.h" +#include "crimson/osd/osd_operation.h" +#include "crimson/osd/osd_operations/osdop_params.h" + +struct hobject_t; + +namespace ceph::os { + class Transaction; +} + +namespace crimson::osd { + class ShardServices; + class PG; + class ObjectContextLoader; +} + +class PGBackend +{ +protected: + using CollectionRef = crimson::os::CollectionRef; + using ec_profile_t = std::map<std::string, std::string>; + // low-level read errorator + using ll_read_errorator = crimson::os::FuturizedStore::Shard::read_errorator; + using ll_read_ierrorator = + ::crimson::interruptible::interruptible_errorator< + ::crimson::osd::IOInterruptCondition, + ll_read_errorator>; + +public: + using load_metadata_ertr = crimson::errorator< + crimson::ct_error::object_corrupted>; + using load_metadata_iertr = + ::crimson::interruptible::interruptible_errorator< + ::crimson::osd::IOInterruptCondition, + load_metadata_ertr>; + using interruptor = + ::crimson::interruptible::interruptor< + ::crimson::osd::IOInterruptCondition>; + template <typename T = void> + using interruptible_future = + ::crimson::interruptible::interruptible_future< + ::crimson::osd::IOInterruptCondition, T>; + using rep_op_fut_t = + std::tuple<interruptible_future<>, + interruptible_future<crimson::osd::acked_peers_t>>; + PGBackend(shard_id_t shard, CollectionRef coll, + crimson::osd::ShardServices 
&shard_services, + DoutPrefixProvider &dpp); + virtual ~PGBackend() = default; + static std::unique_ptr<PGBackend> create(pg_t pgid, + const pg_shard_t pg_shard, + const pg_pool_t& pool, + crimson::os::CollectionRef coll, + crimson::osd::ShardServices& shard_services, + const ec_profile_t& ec_profile, + DoutPrefixProvider &dpp); + using attrs_t = + std::map<std::string, ceph::bufferptr, std::less<>>; + using read_errorator = ll_read_errorator::extend< + crimson::ct_error::object_corrupted>; + using read_ierrorator = + ::crimson::interruptible::interruptible_errorator< + ::crimson::osd::IOInterruptCondition, + read_errorator>; + read_ierrorator::future<> read( + const ObjectState& os, + OSDOp& osd_op, + object_stat_sum_t& delta_stats); + read_ierrorator::future<> sparse_read( + const ObjectState& os, + OSDOp& osd_op, + object_stat_sum_t& delta_stats); + using checksum_errorator = ll_read_errorator::extend< + crimson::ct_error::object_corrupted, + crimson::ct_error::invarg>; + using checksum_ierrorator = + ::crimson::interruptible::interruptible_errorator< + ::crimson::osd::IOInterruptCondition, + checksum_errorator>; + checksum_ierrorator::future<> checksum( + const ObjectState& os, + OSDOp& osd_op); + using cmp_ext_errorator = ll_read_errorator::extend< + crimson::ct_error::invarg, + crimson::ct_error::cmp_fail>; + using cmp_ext_ierrorator = + ::crimson::interruptible::interruptible_errorator< + ::crimson::osd::IOInterruptCondition, + cmp_ext_errorator>; + cmp_ext_ierrorator::future<> cmp_ext( + const ObjectState& os, + OSDOp& osd_op); + using stat_errorator = crimson::errorator<crimson::ct_error::enoent>; + using stat_ierrorator = + ::crimson::interruptible::interruptible_errorator< + ::crimson::osd::IOInterruptCondition, + stat_errorator>; + stat_ierrorator::future<> stat( + const ObjectState& os, + OSDOp& osd_op, + object_stat_sum_t& delta_stats); + + // TODO: switch the entire write family to errorator. 
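+  // The write-family errorators below enumerate every error such an op may
+  // legitimately return; a caller has to consume each of them or pass it
+  // further, roughly (sketch only, not a prescribed call site):
+  //   backend.write(os, osd_op, txn, params, stats)
+  //     .handle_error_interruptible(
+  //       crimson::ct_error::file_too_large::handle([] { /* reject op */ }),
+  //       PGBackend::write_ertr::pass_further{});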
+ using write_ertr = crimson::errorator< + crimson::ct_error::file_too_large, + crimson::ct_error::invarg>; + using write_iertr = + ::crimson::interruptible::interruptible_errorator< + ::crimson::osd::IOInterruptCondition, + write_ertr>; + using create_ertr = crimson::errorator< + crimson::ct_error::invarg, + crimson::ct_error::eexist>; + using create_iertr = + ::crimson::interruptible::interruptible_errorator< + ::crimson::osd::IOInterruptCondition, + create_ertr>; + create_iertr::future<> create( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& trans, + object_stat_sum_t& delta_stats); + using remove_ertr = crimson::errorator< + crimson::ct_error::enoent>; + using remove_iertr = + ::crimson::interruptible::interruptible_errorator< + ::crimson::osd::IOInterruptCondition, + remove_ertr>; + remove_iertr::future<> remove( + ObjectState& os, + ceph::os::Transaction& txn, + object_stat_sum_t& delta_stats, + bool whiteout); + interruptible_future<> remove( + ObjectState& os, + ceph::os::Transaction& txn); + interruptible_future<> set_allochint( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& trans, + object_stat_sum_t& delta_stats); + write_iertr::future<> write( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& trans, + osd_op_params_t& osd_op_params, + object_stat_sum_t& delta_stats); + interruptible_future<> write_same( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& trans, + osd_op_params_t& osd_op_params, + object_stat_sum_t& delta_stats); + write_iertr::future<> writefull( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& trans, + osd_op_params_t& osd_op_params, + object_stat_sum_t& delta_stats); + using append_errorator = crimson::errorator< + crimson::ct_error::invarg>; + using append_ierrorator = + ::crimson::interruptible::interruptible_errorator< + ::crimson::osd::IOInterruptCondition, + append_errorator>; + append_ierrorator::future<> append( + ObjectState& os, + OSDOp& osd_op, + ceph::os::Transaction& trans, + osd_op_params_t& osd_op_params, + object_stat_sum_t& delta_stats); + using rollback_ertr = crimson::errorator< + crimson::ct_error::enoent>; + using rollback_iertr = + ::crimson::interruptible::interruptible_errorator< + ::crimson::osd::IOInterruptCondition, + rollback_ertr>; + rollback_iertr::future<> rollback( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& txn, + osd_op_params_t& osd_op_params, + object_stat_sum_t& delta_stats, + crimson::osd::ObjectContextRef head, + crimson::osd::ObjectContextLoader& obc_loader); + write_iertr::future<> truncate( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& trans, + osd_op_params_t& osd_op_params, + object_stat_sum_t& delta_stats); + write_iertr::future<> zero( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& trans, + osd_op_params_t& osd_op_params, + object_stat_sum_t& delta_stats); + rep_op_fut_t mutate_object( + std::set<pg_shard_t> pg_shards, + crimson::osd::ObjectContextRef &&obc, + ceph::os::Transaction&& txn, + osd_op_params_t&& osd_op_p, + epoch_t min_epoch, + epoch_t map_epoch, + std::vector<pg_log_entry_t>&& log_entries); + interruptible_future<std::tuple<std::vector<hobject_t>, hobject_t>> list_objects( + const hobject_t& start, + uint64_t limit) const; + using setxattr_errorator = crimson::errorator< + crimson::ct_error::file_too_large, + crimson::ct_error::enametoolong>; + using setxattr_ierrorator = + ::crimson::interruptible::interruptible_errorator< + 
::crimson::osd::IOInterruptCondition, + setxattr_errorator>; + setxattr_ierrorator::future<> setxattr( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& trans, + object_stat_sum_t& delta_stats); + using get_attr_errorator = crimson::os::FuturizedStore::Shard::get_attr_errorator; + using get_attr_ierrorator = + ::crimson::interruptible::interruptible_errorator< + ::crimson::osd::IOInterruptCondition, + get_attr_errorator>; + get_attr_ierrorator::future<> getxattr( + const ObjectState& os, + OSDOp& osd_op, + object_stat_sum_t& delta_stats) const; + get_attr_ierrorator::future<ceph::bufferlist> getxattr( + const hobject_t& soid, + std::string_view key) const; + get_attr_ierrorator::future<ceph::bufferlist> getxattr( + const hobject_t& soid, + std::string&& key) const; + get_attr_ierrorator::future<> get_xattrs( + const ObjectState& os, + OSDOp& osd_op, + object_stat_sum_t& delta_stats) const; + using cmp_xattr_errorator = get_attr_errorator::extend< + crimson::ct_error::ecanceled, + crimson::ct_error::invarg>; + using cmp_xattr_ierrorator = + ::crimson::interruptible::interruptible_errorator< + ::crimson::osd::IOInterruptCondition, + cmp_xattr_errorator>; + cmp_xattr_ierrorator::future<> cmp_xattr( + const ObjectState& os, + OSDOp& osd_op, + object_stat_sum_t& delta_stats) const; + using rm_xattr_ertr = crimson::errorator<crimson::ct_error::enoent>; + using rm_xattr_iertr = + ::crimson::interruptible::interruptible_errorator< + ::crimson::osd::IOInterruptCondition, + rm_xattr_ertr>; + rm_xattr_iertr::future<> rm_xattr( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& trans); + void clone( + /* const */object_info_t& snap_oi, + const ObjectState& os, + const ObjectState& d_os, + ceph::os::Transaction& trans); + interruptible_future<struct stat> stat( + CollectionRef c, + const ghobject_t& oid) const; + read_errorator::future<std::map<uint64_t, uint64_t>> fiemap( + CollectionRef c, + const ghobject_t& oid, + uint64_t off, + uint64_t len); + + write_iertr::future<> tmapput( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& trans, + object_stat_sum_t& delta_stats, + osd_op_params_t& osd_op_params); + + using tmapup_ertr = write_ertr::extend< + crimson::ct_error::enoent, + crimson::ct_error::eexist>; + using tmapup_iertr = ::crimson::interruptible::interruptible_errorator< + ::crimson::osd::IOInterruptCondition, + tmapup_ertr>; + tmapup_iertr::future<> tmapup( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& trans, + object_stat_sum_t& delta_stats, + osd_op_params_t& osd_op_params); + + read_ierrorator::future<> tmapget( + const ObjectState& os, + OSDOp& osd_op, + object_stat_sum_t& delta_stats); + + // OMAP + ll_read_ierrorator::future<> omap_get_keys( + const ObjectState& os, + OSDOp& osd_op, + object_stat_sum_t& delta_stats) const; + using omap_cmp_ertr = + crimson::os::FuturizedStore::Shard::read_errorator::extend< + crimson::ct_error::ecanceled, + crimson::ct_error::invarg>; + using omap_cmp_iertr = + ::crimson::interruptible::interruptible_errorator< + ::crimson::osd::IOInterruptCondition, + omap_cmp_ertr>; + omap_cmp_iertr::future<> omap_cmp( + const ObjectState& os, + OSDOp& osd_op, + object_stat_sum_t& delta_stats) const; + ll_read_ierrorator::future<> omap_get_vals( + const ObjectState& os, + OSDOp& osd_op, + object_stat_sum_t& delta_stats) const; + ll_read_ierrorator::future<> omap_get_vals_by_keys( + const ObjectState& os, + OSDOp& osd_op, + object_stat_sum_t& delta_stats) const; + interruptible_future<> 
omap_set_vals( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& trans, + osd_op_params_t& osd_op_params, + object_stat_sum_t& delta_stats); + ll_read_ierrorator::future<ceph::bufferlist> omap_get_header( + const crimson::os::CollectionRef& c, + const ghobject_t& oid) const; + ll_read_ierrorator::future<> omap_get_header( + const ObjectState& os, + OSDOp& osd_op, + object_stat_sum_t& delta_stats) const; + interruptible_future<> omap_set_header( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& trans, + osd_op_params_t& osd_op_params, + object_stat_sum_t& delta_stats); + interruptible_future<> omap_remove_range( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& trans, + object_stat_sum_t& delta_stats); + interruptible_future<> omap_remove_key( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& trans); + using omap_clear_ertr = crimson::errorator<crimson::ct_error::enoent>; + using omap_clear_iertr = + ::crimson::interruptible::interruptible_errorator< + ::crimson::osd::IOInterruptCondition, + omap_clear_ertr>; + omap_clear_iertr::future<> omap_clear( + ObjectState& os, + OSDOp& osd_op, + ceph::os::Transaction& trans, + osd_op_params_t& osd_op_params, + object_stat_sum_t& delta_stats); + + virtual void got_rep_op_reply(const MOSDRepOpReply&) {} + virtual seastar::future<> stop() = 0; + virtual void on_actingset_changed(bool same_primary) = 0; +protected: + const shard_id_t shard; + CollectionRef coll; + crimson::osd::ShardServices &shard_services; + DoutPrefixProvider &dpp; ///< provides log prefix context + crimson::os::FuturizedStore::Shard* store; + virtual seastar::future<> request_committed( + const osd_reqid_t& reqid, + const eversion_t& at_version) = 0; +public: + struct loaded_object_md_t { + ObjectState os; + crimson::osd::SnapSetContextRef ssc; + using ref = std::unique_ptr<loaded_object_md_t>; + }; + load_metadata_iertr::future<loaded_object_md_t::ref> + load_metadata( + const hobject_t &oid); + +private: + virtual ll_read_ierrorator::future<ceph::bufferlist> _read( + const hobject_t& hoid, + size_t offset, + size_t length, + uint32_t flags) = 0; + write_iertr::future<> _writefull( + ObjectState& os, + off_t truncate_size, + const bufferlist& bl, + ceph::os::Transaction& txn, + osd_op_params_t& osd_op_params, + object_stat_sum_t& delta_stats, + unsigned flags); + write_iertr::future<> _truncate( + ObjectState& os, + ceph::os::Transaction& txn, + osd_op_params_t& osd_op_params, + object_stat_sum_t& delta_stats, + size_t offset, + size_t truncate_size, + uint32_t truncate_seq); + + bool maybe_create_new_object(ObjectState& os, + ceph::os::Transaction& txn, + object_stat_sum_t& delta_stats); + void update_size_and_usage(object_stat_sum_t& delta_stats, + object_info_t& oi, uint64_t offset, + uint64_t length, bool write_full = false); + void truncate_update_size_and_usage( + object_stat_sum_t& delta_stats, + object_info_t& oi, + uint64_t truncate_size); + virtual rep_op_fut_t + _submit_transaction(std::set<pg_shard_t>&& pg_shards, + const hobject_t& hoid, + ceph::os::Transaction&& txn, + osd_op_params_t&& osd_op_p, + epoch_t min_epoch, epoch_t max_epoch, + std::vector<pg_log_entry_t>&& log_entries) = 0; + friend class ReplicatedRecoveryBackend; + friend class ::crimson::osd::PG; +}; diff --git a/src/crimson/osd/pg_interval_interrupt_condition.cc b/src/crimson/osd/pg_interval_interrupt_condition.cc new file mode 100644 index 000000000..36243b825 --- /dev/null +++ b/src/crimson/osd/pg_interval_interrupt_condition.cc @@ 
-0,0 +1,43 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "pg_interval_interrupt_condition.h" +#include "pg.h" + +#include "crimson/common/log.h" + +SET_SUBSYS(osd); + +namespace crimson::osd { + +IOInterruptCondition::IOInterruptCondition(Ref<PG>& pg) + : pg(pg), e(pg->get_osdmap_epoch()) {} + +IOInterruptCondition::~IOInterruptCondition() { + // for the sake of forward declaring PG (which is a detivate of + // intrusive_ref_counter<...>) +} + +bool IOInterruptCondition::new_interval_created() { + LOG_PREFIX(IOInterruptCondition::new_interval_created); + const epoch_t interval_start = pg->get_interval_start_epoch(); + bool ret = e < interval_start; + if (ret) { + DEBUGDPP("stored interval e{} < interval_start e{}", *pg, e, interval_start); + } + return ret; +} + +bool IOInterruptCondition::is_stopping() { + LOG_PREFIX(IOInterruptCondition::is_stopping); + if (pg->stopping) { + DEBUGDPP("pg stopping", *pg); + } + return pg->stopping; +} + +bool IOInterruptCondition::is_primary() { + return pg->is_primary(); +} + +} // namespace crimson::osd diff --git a/src/crimson/osd/pg_interval_interrupt_condition.h b/src/crimson/osd/pg_interval_interrupt_condition.h new file mode 100644 index 000000000..a3a0a1edb --- /dev/null +++ b/src/crimson/osd/pg_interval_interrupt_condition.h @@ -0,0 +1,56 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab expandtab + +#pragma once + +#include "include/types.h" +#include "crimson/common/errorator.h" +#include "crimson/common/exception.h" +#include "crimson/common/type_helpers.h" + +namespace crimson::osd { + +class PG; + +class IOInterruptCondition { +public: + IOInterruptCondition(Ref<PG>& pg); + ~IOInterruptCondition(); + + bool new_interval_created(); + + bool is_stopping(); + + bool is_primary(); + + template <typename Fut> + std::optional<Fut> may_interrupt() { + if (new_interval_created()) { + return seastar::futurize<Fut>::make_exception_future( + ::crimson::common::actingset_changed(is_primary())); + } + if (is_stopping()) { + return seastar::futurize<Fut>::make_exception_future( + ::crimson::common::system_shutdown_exception()); + } + return std::optional<Fut>(); + } + + template <typename T> + static constexpr bool is_interruption_v = + std::is_same_v<T, ::crimson::common::actingset_changed> + || std::is_same_v<T, ::crimson::common::system_shutdown_exception>; + + static bool is_interruption(std::exception_ptr& eptr) { + return (*eptr.__cxa_exception_type() == + typeid(::crimson::common::actingset_changed) || + *eptr.__cxa_exception_type() == + typeid(::crimson::common::system_shutdown_exception)); + } + +private: + Ref<PG> pg; + epoch_t e; +}; + +} // namespace crimson::osd diff --git a/src/crimson/osd/pg_map.cc b/src/crimson/osd/pg_map.cc new file mode 100644 index 000000000..193781250 --- /dev/null +++ b/src/crimson/osd/pg_map.cc @@ -0,0 +1,102 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "crimson/osd/pg_map.h" + +#include "crimson/osd/pg.h" +#include "common/Formatter.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_osd); + } +} + +using std::make_pair; + +namespace crimson::osd { + +PGMap::PGCreationState::PGCreationState(spg_t pgid) : pgid(pgid) {} +PGMap::PGCreationState::~PGCreationState() {} + +void PGMap::PGCreationState::dump_detail(Formatter *f) const +{ + f->dump_stream("pgid") << pgid; + f->dump_bool("creating", 
creating); +} + +PGMap::wait_for_pg_ret +PGMap::wait_for_pg(PGCreationBlockingEvent::TriggerI&& trigger, spg_t pgid) +{ + if (auto pg = get_pg(pgid)) { + return make_pair( + wait_for_pg_fut(wait_for_pg_ertr::ready_future_marker{}, pg), + true); + } else { + auto &state = pgs_creating.emplace(pgid, pgid).first->second; + return make_pair( + wait_for_pg_fut( + trigger.maybe_record_blocking(state.promise.get_shared_future(), state) + ), state.creating); + } +} + +void PGMap::remove_pg(spg_t pgid) { + ceph_assert(pgs.erase(pgid) == 1); +} + +Ref<PG> PGMap::get_pg(spg_t pgid) +{ + if (auto pg = pgs.find(pgid); pg != pgs.end()) { + return pg->second; + } else { + return nullptr; + } +} + +void PGMap::set_creating(spg_t pgid) +{ + logger().debug("Creating {}", pgid); + ceph_assert(pgs.count(pgid) == 0); + auto pg = pgs_creating.find(pgid); + ceph_assert(pg != pgs_creating.end()); + ceph_assert(pg->second.creating == false); + pg->second.creating = true; +} + +void PGMap::pg_created(spg_t pgid, Ref<PG> pg) +{ + logger().debug("Created {}", pgid); + ceph_assert(!pgs.count(pgid)); + pgs.emplace(pgid, pg); + + auto creating_iter = pgs_creating.find(pgid); + ceph_assert(creating_iter != pgs_creating.end()); + auto promise = std::move(creating_iter->second.promise); + pgs_creating.erase(creating_iter); + promise.set_value(pg); +} + +void PGMap::pg_loaded(spg_t pgid, Ref<PG> pg) +{ + ceph_assert(!pgs.count(pgid)); + pgs.emplace(pgid, pg); +} + +void PGMap::pg_creation_canceled(spg_t pgid) +{ + logger().debug("PGMap::pg_creation_canceled: {}", pgid); + ceph_assert(!pgs.count(pgid)); + + auto creating_iter = pgs_creating.find(pgid); + ceph_assert(creating_iter != pgs_creating.end()); + auto promise = std::move(creating_iter->second.promise); + pgs_creating.erase(creating_iter); + promise.set_exception( + crimson::ct_error::ecanceled::exception_ptr() + ); +} + +PGMap::~PGMap() {} + +} diff --git a/src/crimson/osd/pg_map.h b/src/crimson/osd/pg_map.h new file mode 100644 index 000000000..3269de434 --- /dev/null +++ b/src/crimson/osd/pg_map.h @@ -0,0 +1,201 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <map> +#include <algorithm> + +#include <seastar/core/future.hh> +#include <seastar/core/shared_future.hh> + +#include "include/types.h" +#include "crimson/common/type_helpers.h" +#include "crimson/common/smp_helpers.h" +#include "crimson/osd/osd_operation.h" +#include "osd/osd_types.h" + +namespace crimson::osd { +class PG; + +/** + * PGShardMapping + * + * Maintains a mapping from spg_t to the core containing that PG. Internally, each + * core has a local copy of the mapping to enable core-local lookups. Updates + * are proxied to core 0, and the back out to all other cores -- see maybe_create_pg. + */ +class PGShardMapping : public seastar::peering_sharded_service<PGShardMapping> { +public: + /// Returns mapping if present, NULL_CORE otherwise + core_id_t get_pg_mapping(spg_t pgid) { + auto iter = pg_to_core.find(pgid); + ceph_assert_always(iter == pg_to_core.end() || iter->second != NULL_CORE); + return iter == pg_to_core.end() ? 
NULL_CORE : iter->second; + } + + /// Returns mapping for pgid, creates new one if it doesn't already exist + seastar::future<core_id_t> maybe_create_pg( + spg_t pgid, + core_id_t core = NULL_CORE) { + auto find_iter = pg_to_core.find(pgid); + if (find_iter != pg_to_core.end()) { + ceph_assert_always(find_iter->second != NULL_CORE); + if (core != NULL_CORE) { + ceph_assert_always(find_iter->second == core); + } + return seastar::make_ready_future<core_id_t>(find_iter->second); + } else { + return container().invoke_on(0,[pgid, core] + (auto &primary_mapping) { + auto [insert_iter, inserted] = primary_mapping.pg_to_core.emplace(pgid, core); + ceph_assert_always(inserted); + ceph_assert_always(primary_mapping.core_to_num_pgs.size() > 0); + std::map<core_id_t, unsigned>::iterator core_iter; + if (core == NULL_CORE) { + core_iter = std::min_element( + primary_mapping.core_to_num_pgs.begin(), + primary_mapping.core_to_num_pgs.end(), + [](const auto &left, const auto &right) { + return left.second < right.second; + }); + } else { + core_iter = primary_mapping.core_to_num_pgs.find(core); + } + ceph_assert_always(primary_mapping.core_to_num_pgs.end() != core_iter); + insert_iter->second = core_iter->first; + core_iter->second++; + return primary_mapping.container().invoke_on_others( + [pgid = insert_iter->first, core = insert_iter->second] + (auto &other_mapping) { + ceph_assert_always(core != NULL_CORE); + auto [insert_iter, inserted] = other_mapping.pg_to_core.emplace(pgid, core); + ceph_assert_always(inserted); + }); + }).then([this, pgid] { + auto find_iter = pg_to_core.find(pgid); + return seastar::make_ready_future<core_id_t>(find_iter->second); + }); + } + } + + /// Remove pgid + seastar::future<> remove_pg(spg_t pgid) { + return container().invoke_on(0, [pgid](auto &primary_mapping) { + auto iter = primary_mapping.pg_to_core.find(pgid); + ceph_assert_always(iter != primary_mapping.pg_to_core.end()); + ceph_assert_always(iter->second != NULL_CORE); + auto count_iter = primary_mapping.core_to_num_pgs.find(iter->second); + ceph_assert_always(count_iter != primary_mapping.core_to_num_pgs.end()); + ceph_assert_always(count_iter->second > 0); + --(count_iter->second); + primary_mapping.pg_to_core.erase(iter); + return primary_mapping.container().invoke_on_others( + [pgid](auto &other_mapping) { + auto iter = other_mapping.pg_to_core.find(pgid); + ceph_assert_always(iter != other_mapping.pg_to_core.end()); + ceph_assert_always(iter->second != NULL_CORE); + other_mapping.pg_to_core.erase(iter); + }); + }); + } + + size_t get_num_pgs() const { return pg_to_core.size(); } + + /// Map to cores in [min_core_mapping, core_mapping_limit) + PGShardMapping(core_id_t min_core_mapping, core_id_t core_mapping_limit) { + ceph_assert_always(min_core_mapping < core_mapping_limit); + for (auto i = min_core_mapping; i != core_mapping_limit; ++i) { + core_to_num_pgs.emplace(i, 0); + } + } + + template <typename F> + void for_each_pgid(F &&f) const { + for (const auto &i: pg_to_core) { + std::invoke(f, i.first); + } + } + +private: + std::map<core_id_t, unsigned> core_to_num_pgs; + std::map<spg_t, core_id_t> pg_to_core; +}; + +/** + * PGMap + * + * Maps spg_t to PG instance within a shard. Handles dealing with waiting + * on pg creation. 
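+ *
+ * Rough sketch of the intended flow, based on the interface declared below
+ * (illustrative only): a consumer calls wait_for_pg(), which yields a future
+ * paired with a bool saying whether the PG already exists or its creation is
+ * already underway; the creation path later resolves every waiter through
+ * pg_created(), or fails them with ecanceled via pg_creation_canceled().
+ *
+ *   auto [fut, creating] = pg_map.wait_for_pg(std::move(trigger), pgid);
+ *   // ... later, once the PG object has been constructed on this shard ...
+ *   pg_map.pg_created(pgid, pg);  // readies the shared future for every waiter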
+ */ +class PGMap { + struct PGCreationState : BlockerT<PGCreationState> { + static constexpr const char * type_name = "PGCreation"; + + void dump_detail(Formatter *f) const final; + + spg_t pgid; + seastar::shared_promise<Ref<PG>> promise; + bool creating = false; + PGCreationState(spg_t pgid); + + PGCreationState(const PGCreationState &) = delete; + PGCreationState(PGCreationState &&) = delete; + PGCreationState &operator=(const PGCreationState &) = delete; + PGCreationState &operator=(PGCreationState &&) = delete; + + ~PGCreationState(); + }; + + std::map<spg_t, PGCreationState> pgs_creating; + using pgs_t = std::map<spg_t, Ref<PG>>; + pgs_t pgs; + +public: + using PGCreationBlocker = PGCreationState; + using PGCreationBlockingEvent = PGCreationBlocker::BlockingEvent; + /** + * Get future for pg with a bool indicating whether it's already being + * created. + */ + using wait_for_pg_ertr = crimson::errorator< + crimson::ct_error::ecanceled>; + using wait_for_pg_fut = wait_for_pg_ertr::future<Ref<PG>>; + using wait_for_pg_ret = std::pair<wait_for_pg_fut, bool>; + wait_for_pg_ret wait_for_pg(PGCreationBlockingEvent::TriggerI&&, spg_t pgid); + + /** + * get PG in non-blocking manner + */ + Ref<PG> get_pg(spg_t pgid); + + /** + * Set creating + */ + void set_creating(spg_t pgid); + + /** + * Set newly created pg + */ + void pg_created(spg_t pgid, Ref<PG> pg); + + /** + * Add newly loaded pg + */ + void pg_loaded(spg_t pgid, Ref<PG> pg); + + /** + * Cancel pending creation of pgid. + */ + void pg_creation_canceled(spg_t pgid); + + void remove_pg(spg_t pgid); + + pgs_t& get_pgs() { return pgs; } + const pgs_t& get_pgs() const { return pgs; } + auto get_pg_count() const { return pgs.size(); } + PGMap() = default; + ~PGMap(); +}; + +} diff --git a/src/crimson/osd/pg_meta.cc b/src/crimson/osd/pg_meta.cc new file mode 100644 index 000000000..288ee52a0 --- /dev/null +++ b/src/crimson/osd/pg_meta.cc @@ -0,0 +1,110 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "pg_meta.h" + +#include <string_view> + +#include "crimson/os/futurized_collection.h" +#include "crimson/os/futurized_store.h" + +using std::string; +using std::string_view; +// prefix pgmeta_oid keys with _ so that PGLog::read_log_and_missing() can +// easily skip them +using crimson::os::FuturizedStore; + +PGMeta::PGMeta(FuturizedStore::Shard& store, spg_t pgid) + : store{store}, + pgid{pgid} +{} + +namespace { + template<typename T> + std::optional<T> find_value(const FuturizedStore::Shard::omap_values_t& values, + string_view key) + { + auto found = values.find(key); + if (found == values.end()) { + return {}; + } + auto p = found->second.cbegin(); + T value; + decode(value, p); + return std::make_optional(std::move(value)); + } +} + +seastar::future<epoch_t> PGMeta::get_epoch() +{ + return store.open_collection(coll_t{pgid}).then([this](auto ch) { + return store.omap_get_values(ch, + pgid.make_pgmeta_oid(), + {string{infover_key}, + string{epoch_key}}).safe_then( + [](auto&& values) { + { + // sanity check + auto infover = find_value<__u8>(values, infover_key); + assert(infover); + if (*infover < 10) { + throw std::runtime_error("incompatible pg meta"); + } + } + { + auto epoch = find_value<epoch_t>(values, epoch_key); + assert(epoch); + return seastar::make_ready_future<epoch_t>(*epoch); + } + }, + FuturizedStore::Shard::read_errorator::assert_all{ + "PGMeta::get_epoch: unable to read pgmeta" + }); + }); +} + +seastar::future<std::tuple<pg_info_t, PastIntervals>> 
PGMeta::load() +{ + return store.open_collection(coll_t{pgid}).then([this](auto ch) { + return store.omap_get_values(ch, + pgid.make_pgmeta_oid(), + {string{infover_key}, + string{info_key}, + string{biginfo_key}, + string{fastinfo_key}}); + }).safe_then([](auto&& values) { + { + // sanity check + auto infover = find_value<__u8>(values, infover_key); + assert(infover); + if (infover < 10) { + throw std::runtime_error("incompatible pg meta"); + } + } + pg_info_t info; + { + auto found = find_value<pg_info_t>(values, info_key); + assert(found); + info = *std::move(found); + } + PastIntervals past_intervals; + { + using biginfo_t = std::pair<PastIntervals, decltype(info.purged_snaps)>; + auto big_info = find_value<biginfo_t>(values, biginfo_key); + assert(big_info); + past_intervals = std::move(big_info->first); + info.purged_snaps = std::move(big_info->second); + } + { + auto fast_info = find_value<pg_fast_info_t>(values, fastinfo_key); + if (fast_info) { + fast_info->try_apply_to(&info); + } + } + return seastar::make_ready_future<std::tuple<pg_info_t, PastIntervals>>( + std::make_tuple(std::move(info), std::move(past_intervals))); + }, + FuturizedStore::Shard::read_errorator::assert_all{ + "PGMeta::load: unable to read pgmeta" + }); +} diff --git a/src/crimson/osd/pg_meta.h b/src/crimson/osd/pg_meta.h new file mode 100644 index 000000000..21c2bb373 --- /dev/null +++ b/src/crimson/osd/pg_meta.h @@ -0,0 +1,20 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <tuple> +#include <seastar/core/future.hh> +#include "osd/osd_types.h" +#include "crimson/os/futurized_store.h" + +/// PG related metadata +class PGMeta +{ + crimson::os::FuturizedStore::Shard& store; + const spg_t pgid; +public: + PGMeta(crimson::os::FuturizedStore::Shard& store, spg_t pgid); + seastar::future<epoch_t> get_epoch(); + seastar::future<std::tuple<pg_info_t, PastIntervals>> load(); +}; diff --git a/src/crimson/osd/pg_recovery.cc b/src/crimson/osd/pg_recovery.cc new file mode 100644 index 000000000..09b45779e --- /dev/null +++ b/src/crimson/osd/pg_recovery.cc @@ -0,0 +1,569 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <fmt/format.h> +#include <fmt/ostream.h> +#include <fmt/ranges.h> + +#include "crimson/common/type_helpers.h" +#include "crimson/osd/backfill_facades.h" +#include "crimson/osd/osd_operations/background_recovery.h" +#include "crimson/osd/osd_operations/peering_event.h" +#include "crimson/osd/pg.h" +#include "crimson/osd/pg_backend.h" +#include "crimson/osd/pg_recovery.h" + +#include "osd/osd_types.h" +#include "osd/PeeringState.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_osd); + } +} + +using std::map; +using std::set; + +void PGRecovery::start_pglogbased_recovery() +{ + using PglogBasedRecovery = crimson::osd::PglogBasedRecovery; + (void) pg->get_shard_services().start_operation<PglogBasedRecovery>( + static_cast<crimson::osd::PG*>(pg), + pg->get_shard_services(), + pg->get_osdmap_epoch(), + float(0.001)); +} + +PGRecovery::interruptible_future<bool> +PGRecovery::start_recovery_ops( + RecoveryBackend::RecoveryBlockingEvent::TriggerI& trigger, + size_t max_to_start) +{ + assert(pg->is_primary()); + assert(pg->is_peered()); + assert(pg->is_recovering()); + // in ceph-osd the do_recovery() path handles both the pg log-based + // recovery and the backfill, albeit they are separated at the layer + // of PeeringState. 
In crimson-osd backfill has been cut from it, so + // and do_recovery() is actually solely for pg log-based recovery. + // At the time of writing it's considered to move it to FSM and fix + // the naming as well. + assert(!pg->is_backfilling()); + assert(!pg->get_peering_state().is_deleting()); + + std::vector<interruptible_future<>> started; + started.reserve(max_to_start); + max_to_start -= start_primary_recovery_ops(trigger, max_to_start, &started); + if (max_to_start > 0) { + max_to_start -= start_replica_recovery_ops(trigger, max_to_start, &started); + } + using interruptor = + crimson::interruptible::interruptor<crimson::osd::IOInterruptCondition>; + return interruptor::parallel_for_each(started, + [] (auto&& ifut) { + return std::move(ifut); + }).then_interruptible([this] { + bool done = !pg->get_peering_state().needs_recovery(); + if (done) { + logger().debug("start_recovery_ops: AllReplicasRecovered for pg: {}", + pg->get_pgid()); + using LocalPeeringEvent = crimson::osd::LocalPeeringEvent; + if (!pg->get_peering_state().needs_backfill()) { + logger().debug("start_recovery_ops: AllReplicasRecovered for pg: {}", + pg->get_pgid()); + (void) pg->get_shard_services().start_operation<LocalPeeringEvent>( + static_cast<crimson::osd::PG*>(pg), + pg->get_pg_whoami(), + pg->get_pgid(), + pg->get_osdmap_epoch(), + pg->get_osdmap_epoch(), + PeeringState::AllReplicasRecovered{}); + } else { + logger().debug("start_recovery_ops: RequestBackfill for pg: {}", + pg->get_pgid()); + (void) pg->get_shard_services().start_operation<LocalPeeringEvent>( + static_cast<crimson::osd::PG*>(pg), + pg->get_pg_whoami(), + pg->get_pgid(), + pg->get_osdmap_epoch(), + pg->get_osdmap_epoch(), + PeeringState::RequestBackfill{}); + } + } + return seastar::make_ready_future<bool>(!done); + }); +} + +size_t PGRecovery::start_primary_recovery_ops( + RecoveryBackend::RecoveryBlockingEvent::TriggerI& trigger, + size_t max_to_start, + std::vector<PGRecovery::interruptible_future<>> *out) +{ + if (!pg->is_recovering()) { + return 0; + } + + if (!pg->get_peering_state().have_missing()) { + pg->get_peering_state().local_recovery_complete(); + return 0; + } + + const auto &missing = pg->get_peering_state().get_pg_log().get_missing(); + + logger().info("{} recovering {} in pg {}, missing {}", __func__, + pg->get_recovery_backend()->total_recovering(), + *static_cast<crimson::osd::PG*>(pg), + missing); + + unsigned started = 0; + int skipped = 0; + + map<version_t, hobject_t>::const_iterator p = + missing.get_rmissing().lower_bound(pg->get_peering_state().get_pg_log().get_log().last_requested); + while (started < max_to_start && p != missing.get_rmissing().end()) { + // TODO: chain futures here to enable yielding to scheduler? + hobject_t soid; + version_t v = p->first; + + auto it_objects = pg->get_peering_state().get_pg_log().get_log().objects.find(p->second); + if (it_objects != pg->get_peering_state().get_pg_log().get_log().objects.end()) { + // look at log! + pg_log_entry_t *latest = it_objects->second; + assert(latest->is_update() || latest->is_delete()); + soid = latest->soid; + } else { + soid = p->second; + } + const pg_missing_item& item = missing.get_items().find(p->second)->second; + ++p; + + hobject_t head = soid.get_head(); + + logger().info( + "{} {} item.need {} {} {} {} {}", + __func__, + soid, + item.need, + missing.is_missing(soid) ? " (missing)":"", + missing.is_missing(head) ? " (missing head)":"", + pg->get_recovery_backend()->is_recovering(soid) ? 
" (recovering)":"", + pg->get_recovery_backend()->is_recovering(head) ? " (recovering head)":""); + + // TODO: handle lost/unfound + if (pg->get_recovery_backend()->is_recovering(soid)) { + auto& recovery_waiter = pg->get_recovery_backend()->get_recovering(soid); + out->emplace_back(recovery_waiter.wait_for_recovered(trigger)); + ++started; + } else if (pg->get_recovery_backend()->is_recovering(head)) { + ++skipped; + } else { + out->emplace_back(recover_missing(trigger, soid, item.need)); + ++started; + } + + if (!skipped) + pg->get_peering_state().set_last_requested(v); + } + + logger().info("{} started {} skipped {}", __func__, started, skipped); + + return started; +} + +size_t PGRecovery::start_replica_recovery_ops( + RecoveryBackend::RecoveryBlockingEvent::TriggerI& trigger, + size_t max_to_start, + std::vector<PGRecovery::interruptible_future<>> *out) +{ + if (!pg->is_recovering()) { + return 0; + } + uint64_t started = 0; + + assert(!pg->get_peering_state().get_acting_recovery_backfill().empty()); + + auto recovery_order = get_replica_recovery_order(); + for (auto &peer : recovery_order) { + assert(peer != pg->get_peering_state().get_primary()); + const auto& pm = pg->get_peering_state().get_peer_missing(peer); + + logger().debug("{}: peer osd.{} missing {} objects", __func__, + peer, pm.num_missing()); + logger().trace("{}: peer osd.{} missing {}", __func__, + peer, pm.get_items()); + + // recover oldest first + for (auto p = pm.get_rmissing().begin(); + p != pm.get_rmissing().end() && started < max_to_start; + ++p) { + const auto &soid = p->second; + + if (pg->get_peering_state().get_missing_loc().is_unfound(soid)) { + logger().debug("{}: object {} still unfound", __func__, soid); + continue; + } + + const pg_info_t &pi = pg->get_peering_state().get_peer_info(peer); + if (soid > pi.last_backfill) { + if (!pg->get_recovery_backend()->is_recovering(soid)) { + logger().error( + "{}: object {} in missing set for backfill (last_backfill {})" + " but not in recovering", + __func__, + soid, + pi.last_backfill); + ceph_abort(); + } + continue; + } + + if (pg->get_recovery_backend()->is_recovering(soid)) { + logger().debug("{}: already recovering object {}", __func__, soid); + auto& recovery_waiter = pg->get_recovery_backend()->get_recovering(soid); + out->emplace_back(recovery_waiter.wait_for_recovered(trigger)); + started++; + continue; + } + + if (pg->get_peering_state().get_missing_loc().is_deleted(soid)) { + logger().debug("{}: soid {} is a delete, removing", __func__, soid); + map<hobject_t,pg_missing_item>::const_iterator r = + pm.get_items().find(soid); + started++; + out->emplace_back( + prep_object_replica_deletes(trigger, soid, r->second.need)); + continue; + } + + if (soid.is_snap() && + pg->get_peering_state().get_pg_log().get_missing().is_missing( + soid.get_head())) { + logger().debug("{}: head {} still missing on primary", __func__, + soid.get_head()); + continue; + } + + if (pg->get_peering_state().get_pg_log().get_missing().is_missing(soid)) { + logger().debug("{}: soid {} still missing on primary", __func__, soid); + continue; + } + + logger().debug("{}: recover_object_replicas({})", __func__,soid); + map<hobject_t,pg_missing_item>::const_iterator r = pm.get_items().find( + soid); + started++; + out->emplace_back( + prep_object_replica_pushes(trigger, soid, r->second.need)); + } + } + + return started; +} + +PGRecovery::interruptible_future<> +PGRecovery::recover_missing( + RecoveryBackend::RecoveryBlockingEvent::TriggerI& trigger, + const hobject_t &soid, eversion_t 
need) +{ + if (pg->get_peering_state().get_missing_loc().is_deleted(soid)) { + return pg->get_recovery_backend()->add_recovering(soid).wait_track_blocking( + trigger, + pg->get_recovery_backend()->recover_delete(soid, need)); + } else { + return pg->get_recovery_backend()->add_recovering(soid).wait_track_blocking( + trigger, + pg->get_recovery_backend()->recover_object(soid, need) + .handle_exception_interruptible( + [=, this, soid = std::move(soid)] (auto e) { + on_failed_recover({ pg->get_pg_whoami() }, soid, need); + return seastar::make_ready_future<>(); + }) + ); + } +} + +RecoveryBackend::interruptible_future<> PGRecovery::prep_object_replica_deletes( + RecoveryBackend::RecoveryBlockingEvent::TriggerI& trigger, + const hobject_t& soid, + eversion_t need) +{ + return pg->get_recovery_backend()->add_recovering(soid).wait_track_blocking( + trigger, + pg->get_recovery_backend()->push_delete(soid, need).then_interruptible( + [=, this] { + object_stat_sum_t stat_diff; + stat_diff.num_objects_recovered = 1; + on_global_recover(soid, stat_diff, true); + return seastar::make_ready_future<>(); + }) + ); +} + +RecoveryBackend::interruptible_future<> PGRecovery::prep_object_replica_pushes( + RecoveryBackend::RecoveryBlockingEvent::TriggerI& trigger, + const hobject_t& soid, + eversion_t need) +{ + return pg->get_recovery_backend()->add_recovering(soid).wait_track_blocking( + trigger, + pg->get_recovery_backend()->recover_object(soid, need) + .handle_exception_interruptible( + [=, this, soid = std::move(soid)] (auto e) { + on_failed_recover({ pg->get_pg_whoami() }, soid, need); + return seastar::make_ready_future<>(); + }) + ); +} + +void PGRecovery::on_local_recover( + const hobject_t& soid, + const ObjectRecoveryInfo& recovery_info, + const bool is_delete, + ceph::os::Transaction& t) +{ + if (const auto &log = pg->get_peering_state().get_pg_log(); + !is_delete && + log.get_missing().is_missing(recovery_info.soid) && + log.get_missing().get_items().find(recovery_info.soid)->second.need > recovery_info.version) { + assert(pg->is_primary()); + if (const auto* latest = log.get_log().objects.find(recovery_info.soid)->second; + latest->op == pg_log_entry_t::LOST_REVERT) { + ceph_abort("mark_unfound_lost (LOST_REVERT) is not implemented yet"); + } + } + pg->get_peering_state().recover_got(soid, + recovery_info.version, is_delete, t); + + if (pg->is_primary()) { + if (!is_delete) { + auto& obc = pg->get_recovery_backend()->get_recovering(soid).obc; //TODO: move to pg backend? 
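+      // For a non-delete recovery on the primary, reflect the result in the
+      // cached object context: the object now exists locally and carries the
+      // object_info delivered with the recovery payload (recovery_info.oi).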
+ obc->obs.exists = true; + obc->obs.oi = recovery_info.oi; + } + if (!pg->is_unreadable_object(soid)) { + pg->get_recovery_backend()->get_recovering(soid).set_readable(); + } + pg->publish_stats_to_osd(); + } +} + +void PGRecovery::on_global_recover ( + const hobject_t& soid, + const object_stat_sum_t& stat_diff, + const bool is_delete) +{ + logger().info("{} {}", __func__, soid); + pg->get_peering_state().object_recovered(soid, stat_diff); + pg->publish_stats_to_osd(); + auto& recovery_waiter = pg->get_recovery_backend()->get_recovering(soid); + if (!is_delete) + recovery_waiter.obc->drop_recovery_read(); + recovery_waiter.set_recovered(); + pg->get_recovery_backend()->remove_recovering(soid); +} + +void PGRecovery::on_failed_recover( + const set<pg_shard_t>& from, + const hobject_t& soid, + const eversion_t& v) +{ + for (auto pg_shard : from) { + if (pg_shard != pg->get_pg_whoami()) { + pg->get_peering_state().force_object_missing(pg_shard, soid, v); + } + } +} + +void PGRecovery::on_peer_recover( + pg_shard_t peer, + const hobject_t &oid, + const ObjectRecoveryInfo &recovery_info) +{ + crimson::get_logger(ceph_subsys_osd).debug( + "{}: {}, {} on {}", __func__, oid, + recovery_info.version, peer); + pg->get_peering_state().on_peer_recover(peer, oid, recovery_info.version); +} + +void PGRecovery::_committed_pushed_object(epoch_t epoch, + eversion_t last_complete) +{ + if (!pg->has_reset_since(epoch)) { + pg->get_peering_state().recovery_committed_to(last_complete); + } else { + crimson::get_logger(ceph_subsys_osd).debug( + "{} pg has changed, not touching last_complete_ondisk", + __func__); + } +} + +template <class EventT> +void PGRecovery::start_backfill_recovery(const EventT& evt) +{ + using BackfillRecovery = crimson::osd::BackfillRecovery; + std::ignore = pg->get_shard_services().start_operation<BackfillRecovery>( + static_cast<crimson::osd::PG*>(pg), + pg->get_shard_services(), + pg->get_osdmap_epoch(), + evt); +} + +void PGRecovery::request_replica_scan( + const pg_shard_t& target, + const hobject_t& begin, + const hobject_t& end) +{ + logger().debug("{}: target.osd={}", __func__, target.osd); + auto msg = crimson::make_message<MOSDPGScan>( + MOSDPGScan::OP_SCAN_GET_DIGEST, + pg->get_pg_whoami(), + pg->get_osdmap_epoch(), + pg->get_last_peering_reset(), + spg_t(pg->get_pgid().pgid, target.shard), + begin, + end); + std::ignore = pg->get_shard_services().send_to_osd( + target.osd, + std::move(msg), + pg->get_osdmap_epoch()); +} + +void PGRecovery::request_primary_scan( + const hobject_t& begin) +{ + logger().debug("{}", __func__); + using crimson::common::local_conf; + std::ignore = pg->get_recovery_backend()->scan_for_backfill( + begin, + local_conf()->osd_backfill_scan_min, + local_conf()->osd_backfill_scan_max + ).then_interruptible([this] (BackfillInterval bi) { + logger().debug("request_primary_scan:{}", __func__); + using BackfillState = crimson::osd::BackfillState; + start_backfill_recovery(BackfillState::PrimaryScanned{ std::move(bi) }); + }); +} + +void PGRecovery::enqueue_push( + const hobject_t& obj, + const eversion_t& v) +{ + logger().debug("{}: obj={} v={}", + __func__, obj, v); + pg->get_recovery_backend()->add_recovering(obj); + std::ignore = pg->get_recovery_backend()->recover_object(obj, v).\ + handle_exception_interruptible([] (auto) { + ceph_abort_msg("got exception on backfill's push"); + return seastar::make_ready_future<>(); + }).then_interruptible([this, obj] { + logger().debug("enqueue_push:{}", __func__); + using BackfillState = 
crimson::osd::BackfillState; + start_backfill_recovery(BackfillState::ObjectPushed(std::move(obj))); + }); +} + +void PGRecovery::enqueue_drop( + const pg_shard_t& target, + const hobject_t& obj, + const eversion_t& v) +{ + // allocate a pair if target is seen for the first time + auto& req = backfill_drop_requests[target]; + if (!req) { + req = crimson::make_message<MOSDPGBackfillRemove>( + spg_t(pg->get_pgid().pgid, target.shard), pg->get_osdmap_epoch()); + } + req->ls.emplace_back(obj, v); +} + +void PGRecovery::maybe_flush() +{ + for (auto& [target, req] : backfill_drop_requests) { + std::ignore = pg->get_shard_services().send_to_osd( + target.osd, + std::move(req), + pg->get_osdmap_epoch()); + } + backfill_drop_requests.clear(); +} + +void PGRecovery::update_peers_last_backfill( + const hobject_t& new_last_backfill) +{ + logger().debug("{}: new_last_backfill={}", + __func__, new_last_backfill); + // If new_last_backfill == MAX, then we will send OP_BACKFILL_FINISH to + // all the backfill targets. Otherwise, we will move last_backfill up on + // those targets need it and send OP_BACKFILL_PROGRESS to them. + for (const auto& bt : pg->get_peering_state().get_backfill_targets()) { + if (const pg_info_t& pinfo = pg->get_peering_state().get_peer_info(bt); + new_last_backfill > pinfo.last_backfill) { + pg->get_peering_state().update_peer_last_backfill(bt, new_last_backfill); + auto m = crimson::make_message<MOSDPGBackfill>( + pinfo.last_backfill.is_max() ? MOSDPGBackfill::OP_BACKFILL_FINISH + : MOSDPGBackfill::OP_BACKFILL_PROGRESS, + pg->get_osdmap_epoch(), + pg->get_last_peering_reset(), + spg_t(pg->get_pgid().pgid, bt.shard)); + // Use default priority here, must match sub_op priority + // TODO: if pinfo.last_backfill.is_max(), then + // start_recovery_op(hobject_t::get_max()); + m->last_backfill = pinfo.last_backfill; + m->stats = pinfo.stats; + std::ignore = pg->get_shard_services().send_to_osd( + bt.osd, std::move(m), pg->get_osdmap_epoch()); + logger().info("{}: peer {} num_objects now {} / {}", + __func__, + bt, + pinfo.stats.stats.sum.num_objects, + pg->get_info().stats.stats.sum.num_objects); + } + } +} + +bool PGRecovery::budget_available() const +{ + // TODO: the limits! + return true; +} + +void PGRecovery::backfilled() +{ + using LocalPeeringEvent = crimson::osd::LocalPeeringEvent; + std::ignore = pg->get_shard_services().start_operation<LocalPeeringEvent>( + static_cast<crimson::osd::PG*>(pg), + pg->get_pg_whoami(), + pg->get_pgid(), + pg->get_osdmap_epoch(), + pg->get_osdmap_epoch(), + PeeringState::Backfilled{}); +} + +void PGRecovery::dispatch_backfill_event( + boost::intrusive_ptr<const boost::statechart::event_base> evt) +{ + logger().debug("{}", __func__); + backfill_state->process_event(evt); +} + +void PGRecovery::on_backfill_reserved() +{ + logger().debug("{}", __func__); + // PIMP and depedency injection for the sake unittestability. + // I'm not afraid about the performance here. + using BackfillState = crimson::osd::BackfillState; + backfill_state = std::make_unique<BackfillState>( + *this, + std::make_unique<crimson::osd::PeeringFacade>(pg->get_peering_state()), + std::make_unique<crimson::osd::PGFacade>( + *static_cast<crimson::osd::PG*>(pg))); + // yes, it's **not** backfilling yet. The PG_STATE_BACKFILLING + // will be set after on_backfill_reserved() returns. + // Backfill needs to take this into consideration when scheduling + // events -- they must be mutually exclusive with PeeringEvent + // instances. 
Otherwise the execution might begin without having + // the state updated. + ceph_assert(!pg->get_peering_state().is_backfilling()); + start_backfill_recovery(BackfillState::Triggered{}); +} diff --git a/src/crimson/osd/pg_recovery.h b/src/crimson/osd/pg_recovery.h new file mode 100644 index 000000000..719d0ad2d --- /dev/null +++ b/src/crimson/osd/pg_recovery.h @@ -0,0 +1,118 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <seastar/core/future.hh> + +#include "crimson/osd/backfill_state.h" +#include "crimson/osd/pg_interval_interrupt_condition.h" +#include "crimson/osd/osd_operation.h" +#include "crimson/osd/pg_recovery_listener.h" +#include "crimson/osd/scheduler/scheduler.h" +#include "crimson/osd/shard_services.h" +#include "crimson/osd/recovery_backend.h" + +#include "osd/object_state.h" + +namespace crimson::osd { +class UrgentRecovery; +} + +class MOSDPGBackfillRemove; +class PGBackend; + +class PGRecovery : public crimson::osd::BackfillState::BackfillListener { +public: + template <typename T = void> + using interruptible_future = RecoveryBackend::interruptible_future<T>; + PGRecovery(PGRecoveryListener* pg) : pg(pg) {} + virtual ~PGRecovery() {} + void start_pglogbased_recovery(); + + interruptible_future<bool> start_recovery_ops( + RecoveryBackend::RecoveryBlockingEvent::TriggerI&, + size_t max_to_start); + void on_backfill_reserved(); + void dispatch_backfill_event( + boost::intrusive_ptr<const boost::statechart::event_base> evt); + + seastar::future<> stop() { return seastar::now(); } +private: + PGRecoveryListener* pg; + size_t start_primary_recovery_ops( + RecoveryBackend::RecoveryBlockingEvent::TriggerI&, + size_t max_to_start, + std::vector<interruptible_future<>> *out); + size_t start_replica_recovery_ops( + RecoveryBackend::RecoveryBlockingEvent::TriggerI&, + size_t max_to_start, + std::vector<interruptible_future<>> *out); + + std::vector<pg_shard_t> get_replica_recovery_order() const { + return pg->get_replica_recovery_order(); + } + RecoveryBackend::interruptible_future<> recover_missing( + RecoveryBackend::RecoveryBlockingEvent::TriggerI&, + const hobject_t &soid, eversion_t need); + RecoveryBackend::interruptible_future<> prep_object_replica_deletes( + RecoveryBackend::RecoveryBlockingEvent::TriggerI& trigger, + const hobject_t& soid, + eversion_t need); + RecoveryBackend::interruptible_future<> prep_object_replica_pushes( + RecoveryBackend::RecoveryBlockingEvent::TriggerI& trigger, + const hobject_t& soid, + eversion_t need); + + void on_local_recover( + const hobject_t& soid, + const ObjectRecoveryInfo& recovery_info, + bool is_delete, + ceph::os::Transaction& t); + void on_global_recover ( + const hobject_t& soid, + const object_stat_sum_t& stat_diff, + bool is_delete); + void on_failed_recover( + const std::set<pg_shard_t>& from, + const hobject_t& soid, + const eversion_t& v); + void on_peer_recover( + pg_shard_t peer, + const hobject_t &oid, + const ObjectRecoveryInfo &recovery_info); + void _committed_pushed_object(epoch_t epoch, + eversion_t last_complete); + friend class ReplicatedRecoveryBackend; + friend class crimson::osd::UrgentRecovery; + + // backfill begin + std::unique_ptr<crimson::osd::BackfillState> backfill_state; + std::map<pg_shard_t, + MURef<MOSDPGBackfillRemove>> backfill_drop_requests; + + template <class EventT> + void start_backfill_recovery( + const EventT& evt); + void request_replica_scan( + const pg_shard_t& target, + const hobject_t& begin, + const 
hobject_t& end) final; + void request_primary_scan( + const hobject_t& begin) final; + void enqueue_push( + const hobject_t& obj, + const eversion_t& v) final; + void enqueue_drop( + const pg_shard_t& target, + const hobject_t& obj, + const eversion_t& v) final; + void maybe_flush() final; + void update_peers_last_backfill( + const hobject_t& new_last_backfill) final; + bool budget_available() const final; + void backfilled() final; + friend crimson::osd::BackfillState::PGFacade; + friend crimson::osd::PG; + // backfill end +}; diff --git a/src/crimson/osd/pg_recovery_listener.h b/src/crimson/osd/pg_recovery_listener.h new file mode 100644 index 000000000..c922b9956 --- /dev/null +++ b/src/crimson/osd/pg_recovery_listener.h @@ -0,0 +1,39 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <seastar/core/future.hh> + +#include "common/hobject.h" +#include "include/types.h" +#include "osd/osd_types.h" + +namespace crimson::osd { + class ShardServices; +}; + +class RecoveryBackend; +class PGRecovery; + +class PGRecoveryListener { +public: + virtual crimson::osd::ShardServices& get_shard_services() = 0; + virtual PGRecovery* get_recovery_handler() = 0; + virtual epoch_t get_osdmap_epoch() const = 0; + virtual bool is_primary() const = 0; + virtual bool is_peered() const = 0; + virtual bool is_recovering() const = 0; + virtual bool is_backfilling() const = 0; + virtual PeeringState& get_peering_state() = 0; + virtual const pg_shard_t& get_pg_whoami() const = 0; + virtual const spg_t& get_pgid() const = 0; + virtual RecoveryBackend* get_recovery_backend() = 0; + virtual bool is_unreadable_object(const hobject_t&, eversion_t* v = 0) const = 0; + virtual bool has_reset_since(epoch_t) const = 0; + virtual std::vector<pg_shard_t> get_replica_recovery_order() const = 0; + virtual epoch_t get_last_peering_reset() const = 0; + virtual const pg_info_t& get_info() const= 0; + virtual seastar::future<> stop() = 0; + virtual void publish_stats_to_osd() = 0; +}; diff --git a/src/crimson/osd/pg_shard_manager.cc b/src/crimson/osd/pg_shard_manager.cc new file mode 100644 index 000000000..6061c856b --- /dev/null +++ b/src/crimson/osd/pg_shard_manager.cc @@ -0,0 +1,108 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "crimson/osd/pg_shard_manager.h" +#include "crimson/osd/pg.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_osd); + } +} + +namespace crimson::osd { + +seastar::future<> PGShardManager::load_pgs(crimson::os::FuturizedStore& store) +{ + ceph_assert(seastar::this_shard_id() == PRIMARY_CORE); + return store.list_collections( + ).then([this](auto colls_cores) { + return seastar::parallel_for_each( + colls_cores, + [this](auto coll_core) { + auto[coll, shard_core] = coll_core; + spg_t pgid; + if (coll.is_pg(&pgid)) { + return get_pg_to_shard_mapping().maybe_create_pg( + pgid, shard_core + ).then([this, pgid] (auto core) { + return this->template with_remote_shard_state( + core, + [pgid]( + PerShardState &per_shard_state, + ShardServices &shard_services) { + return shard_services.load_pg( + pgid + ).then([pgid, &per_shard_state](auto &&pg) { + logger().info("load_pgs: loaded {}", pgid); + per_shard_state.pg_map.pg_loaded(pgid, std::move(pg)); + return seastar::now(); + }); + }); + }); + } else if (coll.is_temp(&pgid)) { + logger().warn( + "found temp collection on crimson osd, should be impossible: {}", + coll); + 
ceph_assert(0 == "temp collection on crimson osd, should be impossible"); + return seastar::now(); + } else { + logger().warn("ignoring unrecognized collection: {}", coll); + return seastar::now(); + } + }); + }); +} + +seastar::future<> PGShardManager::stop_pgs() +{ + ceph_assert(seastar::this_shard_id() == PRIMARY_CORE); + return shard_services.invoke_on_all([](auto &local_service) { + return local_service.local_state.stop_pgs(); + }); +} + +seastar::future<std::map<pg_t, pg_stat_t>> +PGShardManager::get_pg_stats() const +{ + ceph_assert(seastar::this_shard_id() == PRIMARY_CORE); + return shard_services.map_reduce0( + [](auto &local) { + return local.local_state.get_pg_stats(); + }, + std::map<pg_t, pg_stat_t>(), + [](auto &&left, auto &&right) { + left.merge(std::move(right)); + return std::move(left); + }); +} + +seastar::future<> PGShardManager::broadcast_map_to_pgs(epoch_t epoch) +{ + ceph_assert(seastar::this_shard_id() == PRIMARY_CORE); + return shard_services.invoke_on_all([epoch](auto &local_service) { + return local_service.local_state.broadcast_map_to_pgs( + local_service, epoch + ); + }).then([this, epoch] { + logger().debug("PGShardManager::broadcast_map_to_pgs " + "broadcasted up to {}", + epoch); + return shard_services.invoke_on_all([epoch](auto &local_service) { + local_service.local_state.osdmap_gate.got_map(epoch); + return seastar::now(); + }); + }); +} + +seastar::future<> PGShardManager::set_up_epoch(epoch_t e) { + ceph_assert(seastar::this_shard_id() == PRIMARY_CORE); + return shard_services.invoke_on_all( + seastar::smp_submit_to_options{}, + [e](auto &local_service) { + local_service.local_state.set_up_epoch(e); + return seastar::now(); + }); +} + +} diff --git a/src/crimson/osd/pg_shard_manager.h b/src/crimson/osd/pg_shard_manager.h new file mode 100644 index 000000000..2f3a3015d --- /dev/null +++ b/src/crimson/osd/pg_shard_manager.h @@ -0,0 +1,390 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <seastar/core/future.hh> +#include <seastar/core/shared_future.hh> +#include <seastar/core/sharded.hh> + +#include "crimson/osd/shard_services.h" +#include "crimson/osd/pg_map.h" + +namespace crimson::os { + class FuturizedStore; +} + +namespace crimson::osd { +/** + * PGShardManager + * + * Manages all state required to partition PGs over seastar reactors + * as well as state required to route messages to pgs. Mediates access to + * shared resources required by PGs (objectstore, messenger, monclient, + * etc) + */ +class PGShardManager { + seastar::sharded<OSDSingletonState> &osd_singleton_state; + seastar::sharded<ShardServices> &shard_services; + seastar::sharded<PGShardMapping> &pg_to_shard_mapping; + +#define FORWARD_CONST(FROM_METHOD, TO_METHOD, TARGET) \ + template <typename... Args> \ + auto FROM_METHOD(Args&&... args) const { \ + return TARGET.TO_METHOD(std::forward<Args>(args)...); \ + } + +#define FORWARD(FROM_METHOD, TO_METHOD, TARGET) \ + template <typename... Args> \ + auto FROM_METHOD(Args&&... 
args) { \ + return TARGET.TO_METHOD(std::forward<Args>(args)...); \ + } + +#define FORWARD_TO_OSD_SINGLETON(METHOD) \ + FORWARD(METHOD, METHOD, get_osd_singleton_state()) + +public: + using cached_map_t = OSDMapService::cached_map_t; + using local_cached_map_t = OSDMapService::local_cached_map_t; + + PGShardManager( + seastar::sharded<OSDSingletonState> &osd_singleton_state, + seastar::sharded<ShardServices> &shard_services, + seastar::sharded<PGShardMapping> &pg_to_shard_mapping) + : osd_singleton_state(osd_singleton_state), + shard_services(shard_services), + pg_to_shard_mapping(pg_to_shard_mapping) {} + + auto &get_osd_singleton_state() { + ceph_assert(seastar::this_shard_id() == PRIMARY_CORE); + return osd_singleton_state.local(); + } + auto &get_osd_singleton_state() const { + ceph_assert(seastar::this_shard_id() == PRIMARY_CORE); + return osd_singleton_state.local(); + } + auto &get_shard_services() { + return shard_services.local(); + } + auto &get_shard_services() const { + return shard_services.local(); + } + auto &get_local_state() { return get_shard_services().local_state; } + auto &get_local_state() const { return get_shard_services().local_state; } + auto &get_pg_to_shard_mapping() { return pg_to_shard_mapping.local(); } + auto &get_pg_to_shard_mapping() const { return pg_to_shard_mapping.local(); } + + seastar::future<> update_map(local_cached_map_t &&map) { + get_osd_singleton_state().update_map( + make_local_shared_foreign(local_cached_map_t(map)) + ); + /* We need each core to get its own foreign_ptr<local_cached_map_t>. + * foreign_ptr can't be cheaply copied, so we make one for each core + * up front. */ + return seastar::do_with( + std::vector<seastar::foreign_ptr<local_cached_map_t>>(), + [this, map](auto &fmaps) { + fmaps.resize(seastar::smp::count); + for (auto &i: fmaps) { + i = seastar::foreign_ptr(map); + } + return shard_services.invoke_on_all( + [&fmaps](auto &local) mutable { + local.local_state.update_map( + make_local_shared_foreign( + std::move(fmaps[seastar::this_shard_id()]) + )); + }); + }); + } + + seastar::future<> stop_registries() { + return shard_services.invoke_on_all([](auto &local) { + return local.local_state.stop_registry(); + }); + } + + FORWARD_TO_OSD_SINGLETON(send_pg_created) + + // osd state forwards + FORWARD(is_active, is_active, get_shard_services().local_state.osd_state) + FORWARD(is_preboot, is_preboot, get_shard_services().local_state.osd_state) + FORWARD(is_booting, is_booting, get_shard_services().local_state.osd_state) + FORWARD(is_stopping, is_stopping, get_shard_services().local_state.osd_state) + FORWARD(is_prestop, is_prestop, get_shard_services().local_state.osd_state) + FORWARD(is_initializing, is_initializing, get_shard_services().local_state.osd_state) + FORWARD(set_prestop, set_prestop, get_shard_services().local_state.osd_state) + FORWARD(set_preboot, set_preboot, get_shard_services().local_state.osd_state) + FORWARD(set_booting, set_booting, get_shard_services().local_state.osd_state) + FORWARD(set_stopping, set_stopping, get_shard_services().local_state.osd_state) + FORWARD(set_active, set_active, get_shard_services().local_state.osd_state) + FORWARD(when_active, when_active, get_shard_services().local_state.osd_state) + FORWARD_CONST(get_osd_state_string, to_string, get_shard_services().local_state.osd_state) + + FORWARD(got_map, got_map, get_shard_services().local_state.osdmap_gate) + FORWARD(wait_for_map, wait_for_map, get_shard_services().local_state.osdmap_gate) + + // Metacoll + 
FORWARD_TO_OSD_SINGLETON(init_meta_coll) + FORWARD_TO_OSD_SINGLETON(get_meta_coll) + + FORWARD_TO_OSD_SINGLETON(set_superblock) + + // Core OSDMap methods + FORWARD_TO_OSD_SINGLETON(get_local_map) + FORWARD_TO_OSD_SINGLETON(load_map_bl) + FORWARD_TO_OSD_SINGLETON(load_map_bls) + FORWARD_TO_OSD_SINGLETON(store_maps) + + seastar::future<> set_up_epoch(epoch_t e); + + template <typename F> + auto with_remote_shard_state(core_id_t core, F &&f) { + return shard_services.invoke_on( + core, [f=std::move(f)](auto &target_shard_services) mutable { + return std::invoke( + std::move(f), target_shard_services.local_state, + target_shard_services); + }); + } + + template <typename T, typename F> + auto with_remote_shard_state_and_op( + core_id_t core, + typename T::IRef &&op, + F &&f) { + if (seastar::this_shard_id() == core) { + auto &target_shard_services = shard_services.local(); + return std::invoke( + std::move(f), + target_shard_services.local_state, + target_shard_services, + std::move(op)); + } + return op->prepare_remote_submission( + ).then([op=std::move(op), f=std::move(f), this, core + ](auto f_conn) mutable { + return shard_services.invoke_on( + core, + [f=std::move(f), op=std::move(op), f_conn=std::move(f_conn) + ](auto &target_shard_services) mutable { + op->finish_remote_submission(std::move(f_conn)); + return std::invoke( + std::move(f), + target_shard_services.local_state, + target_shard_services, + std::move(op)); + }); + }); + } + + /// Runs opref on the appropriate core, creating the pg as necessary. + template <typename T> + seastar::future<> run_with_pg_maybe_create( + typename T::IRef op + ) { + ceph_assert(op->use_count() == 1); + auto &logger = crimson::get_logger(ceph_subsys_osd); + static_assert(T::can_create()); + logger.debug("{}: can_create", *op); + + get_local_state().registry.remove_from_registry(*op); + return get_pg_to_shard_mapping().maybe_create_pg( + op->get_pgid() + ).then([this, op = std::move(op)](auto core) mutable { + return this->template with_remote_shard_state_and_op<T>( + core, std::move(op), + [](PerShardState &per_shard_state, + ShardServices &shard_services, + typename T::IRef op) { + per_shard_state.registry.add_to_registry(*op); + auto &logger = crimson::get_logger(ceph_subsys_osd); + auto &opref = *op; + return opref.template with_blocking_event< + PGMap::PGCreationBlockingEvent + >([&shard_services, &opref]( + auto &&trigger) { + return shard_services.get_or_create_pg( + std::move(trigger), + opref.get_pgid(), + std::move(opref.get_create_info()) + ); + }).safe_then([&logger, &shard_services, &opref](Ref<PG> pgref) { + logger.debug("{}: have_pg", opref); + return opref.with_pg(shard_services, pgref); + }).handle_error( + crimson::ct_error::ecanceled::handle([&logger, &opref](auto) { + logger.debug("{}: pg creation canceled, dropping", opref); + return seastar::now(); + }) + ).then([op=std::move(op)] {}); + }); + }); + } + + /// Runs opref on the appropriate core, waiting for pg as necessary + template <typename T> + seastar::future<> run_with_pg_maybe_wait( + typename T::IRef op + ) { + ceph_assert(op->use_count() == 1); + auto &logger = crimson::get_logger(ceph_subsys_osd); + static_assert(!T::can_create()); + logger.debug("{}: !can_create", *op); + + get_local_state().registry.remove_from_registry(*op); + return get_pg_to_shard_mapping().maybe_create_pg( + op->get_pgid() + ).then([this, op = std::move(op)](auto core) mutable { + return this->template with_remote_shard_state_and_op<T>( + core, std::move(op), + [](PerShardState &per_shard_state, + 
ShardServices &shard_services, + typename T::IRef op) { + per_shard_state.registry.add_to_registry(*op); + auto &logger = crimson::get_logger(ceph_subsys_osd); + auto &opref = *op; + return opref.template with_blocking_event< + PGMap::PGCreationBlockingEvent + >([&shard_services, &opref]( + auto &&trigger) { + return shard_services.wait_for_pg( + std::move(trigger), opref.get_pgid()); + }).safe_then([&logger, &shard_services, &opref](Ref<PG> pgref) { + logger.debug("{}: have_pg", opref); + return opref.with_pg(shard_services, pgref); + }).handle_error( + crimson::ct_error::ecanceled::handle([&logger, &opref](auto) { + logger.debug("{}: pg creation canceled, dropping", opref); + return seastar::now(); + }) + ).then([op=std::move(op)] {}); + }); + }); + } + + seastar::future<> load_pgs(crimson::os::FuturizedStore& store); + seastar::future<> stop_pgs(); + + seastar::future<std::map<pg_t, pg_stat_t>> get_pg_stats() const; + + /** + * invoke_method_on_each_shard_seq + * + * Invokes shard_services method on each shard sequentially. + */ + template <typename F, typename... Args> + seastar::future<> invoke_on_each_shard_seq( + F &&f) const { + return sharded_map_seq( + shard_services, + [f=std::forward<F>(f)](const ShardServices &shard_services) mutable { + return std::invoke( + f, + shard_services); + }); + } + + /** + * for_each_pg + * + * Invokes f on each pg sequentially. Caller may rely on f not being + * invoked concurrently on multiple cores. + */ + template <typename F> + seastar::future<> for_each_pg(F &&f) const { + return invoke_on_each_shard_seq( + [f=std::move(f)](const auto &local_service) mutable { + for (auto &pg: local_service.local_state.pg_map.get_pgs()) { + std::apply(f, pg); + } + return seastar::now(); + }); + } + + /** + * for_each_pgid + * + * Syncronously invokes f on each pgid + */ + template <typename F> + void for_each_pgid(F &&f) const { + return get_pg_to_shard_mapping().for_each_pgid( + std::forward<F>(f)); + } + + auto get_num_pgs() const { + return get_pg_to_shard_mapping().get_num_pgs(); + } + + seastar::future<> broadcast_map_to_pgs(epoch_t epoch); + + template <typename F> + auto with_pg(spg_t pgid, F &&f) { + core_id_t core = get_pg_to_shard_mapping().get_pg_mapping(pgid); + return with_remote_shard_state( + core, + [pgid, f=std::move(f)](auto &local_state, auto &local_service) mutable { + return std::invoke( + std::move(f), + local_state.pg_map.get_pg(pgid)); + }); + } + + template <typename T, typename... Args> + auto start_pg_operation(Args&&... 
args) { + auto op = get_local_state().registry.create_operation<T>( + std::forward<Args>(args)...); + auto &logger = crimson::get_logger(ceph_subsys_osd); + logger.debug("{}: starting {}", *op, __func__); + + auto &opref = *op; + auto id = op->get_id(); + if constexpr (T::is_trackable) { + op->template track_event<typename T::StartEvent>(); + } + auto fut = opref.template enter_stage<>( + opref.get_connection_pipeline().await_active + ).then([this, &opref, &logger] { + logger.debug("{}: start_pg_operation in await_active stage", opref); + return get_shard_services().local_state.osd_state.when_active(); + }).then([&logger, &opref] { + logger.debug("{}: start_pg_operation active, entering await_map", opref); + return opref.template enter_stage<>( + opref.get_connection_pipeline().await_map); + }).then([this, &logger, &opref] { + logger.debug("{}: start_pg_operation await_map stage", opref); + using OSDMapBlockingEvent = + OSD_OSDMapGate::OSDMapBlocker::BlockingEvent; + return opref.template with_blocking_event<OSDMapBlockingEvent>( + [this, &opref](auto &&trigger) { + std::ignore = this; + return get_shard_services().local_state.osdmap_gate.wait_for_map( + std::move(trigger), + opref.get_epoch(), + &get_shard_services()); + }); + }).then([&logger, &opref](auto epoch) { + logger.debug("{}: got map {}, entering get_pg", opref, epoch); + return opref.template enter_stage<>( + opref.get_connection_pipeline().get_pg); + }).then([this, &logger, &opref, op=std::move(op)]() mutable { + logger.debug("{}: in get_pg core {}", opref, seastar::this_shard_id()); + logger.debug("{}: in get_pg", opref); + if constexpr (T::can_create()) { + logger.debug("{}: can_create", opref); + return run_with_pg_maybe_create<T>(std::move(op)); + } else { + logger.debug("{}: !can_create", opref); + return run_with_pg_maybe_wait<T>(std::move(op)); + } + }); + return std::make_pair(id, std::move(fut)); + } + +#undef FORWARD +#undef FORWARD_CONST +#undef FORWARD_TO_OSD_SINGLETON +}; + +} diff --git a/src/crimson/osd/recovery_backend.cc b/src/crimson/osd/recovery_backend.cc new file mode 100644 index 000000000..b5394bfdc --- /dev/null +++ b/src/crimson/osd/recovery_backend.cc @@ -0,0 +1,328 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <fmt/format.h> + +#include "crimson/common/exception.h" +#include "crimson/osd/recovery_backend.h" +#include "crimson/osd/pg.h" +#include "crimson/osd/pg_backend.h" +#include "crimson/osd/osd_operations/background_recovery.h" + +#include "messages/MOSDFastDispatchOp.h" +#include "osd/osd_types.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_osd); + } +} + +hobject_t RecoveryBackend::get_temp_recovery_object( + const hobject_t& target, + eversion_t version) const +{ + hobject_t hoid = + target.make_temp_hobject(fmt::format("temp_recovering_{}_{}_{}_{}", + pg.get_info().pgid, + version, + pg.get_info().history.same_interval_since, + target.snap)); + logger().debug("{} {}", __func__, hoid); + return hoid; +} + +void RecoveryBackend::clean_up(ceph::os::Transaction& t, + std::string_view why) +{ + for (auto& soid : temp_contents) { + t.remove(pg.get_collection_ref()->get_cid(), + ghobject_t(soid, ghobject_t::NO_GEN, pg.get_pg_whoami().shard)); + } + temp_contents.clear(); + + for (auto& [soid, recovery_waiter] : recovering) { + if ((recovery_waiter->pull_info + && recovery_waiter->pull_info->is_complete()) + || (!recovery_waiter->pull_info + && recovery_waiter->obc && 
recovery_waiter->obc->obs.exists)) { + recovery_waiter->obc->interrupt( + ::crimson::common::actingset_changed( + pg.is_primary())); + recovery_waiter->interrupt(why); + } + } + recovering.clear(); +} + +void RecoveryBackend::WaitForObjectRecovery::stop() { + readable.set_exception( + crimson::common::system_shutdown_exception()); + recovered.set_exception( + crimson::common::system_shutdown_exception()); + pulled.set_exception( + crimson::common::system_shutdown_exception()); + for (auto& [pg_shard, pr] : pushes) { + pr.set_exception( + crimson::common::system_shutdown_exception()); + } +} + +void RecoveryBackend::handle_backfill_finish( + MOSDPGBackfill& m, + crimson::net::ConnectionRef conn) +{ + logger().debug("{}", __func__); + ceph_assert(!pg.is_primary()); + ceph_assert(crimson::common::local_conf()->osd_kill_backfill_at != 1); + auto reply = crimson::make_message<MOSDPGBackfill>( + MOSDPGBackfill::OP_BACKFILL_FINISH_ACK, + pg.get_osdmap_epoch(), + m.query_epoch, + spg_t(pg.get_pgid().pgid, pg.get_primary().shard)); + reply->set_priority(pg.get_recovery_op_priority()); + std::ignore = conn->send(std::move(reply)); + shard_services.start_operation<crimson::osd::LocalPeeringEvent>( + static_cast<crimson::osd::PG*>(&pg), + pg.get_pg_whoami(), + pg.get_pgid(), + pg.get_osdmap_epoch(), + pg.get_osdmap_epoch(), + RecoveryDone{}); +} + +RecoveryBackend::interruptible_future<> +RecoveryBackend::handle_backfill_progress( + MOSDPGBackfill& m) +{ + logger().debug("{}", __func__); + ceph_assert(!pg.is_primary()); + ceph_assert(crimson::common::local_conf()->osd_kill_backfill_at != 2); + + ObjectStore::Transaction t; + pg.get_peering_state().update_backfill_progress( + m.last_backfill, + m.stats, + m.op == MOSDPGBackfill::OP_BACKFILL_PROGRESS, + t); + logger().debug("RecoveryBackend::handle_backfill_progress: do_transaction..."); + return shard_services.get_store().do_transaction( + pg.get_collection_ref(), std::move(t)).or_terminate(); +} + +RecoveryBackend::interruptible_future<> +RecoveryBackend::handle_backfill_finish_ack( + MOSDPGBackfill& m) +{ + logger().debug("{}", __func__); + ceph_assert(pg.is_primary()); + ceph_assert(crimson::common::local_conf()->osd_kill_backfill_at != 3); + // TODO: + // finish_recovery_op(hobject_t::get_max()); + return seastar::now(); +} + +RecoveryBackend::interruptible_future<> +RecoveryBackend::handle_backfill( + MOSDPGBackfill& m, + crimson::net::ConnectionRef conn) +{ + logger().debug("{}", __func__); + if (pg.old_peering_msg(m.map_epoch, m.query_epoch)) { + logger().debug("{}: discarding {}", __func__, m); + return seastar::now(); + } + switch (m.op) { + case MOSDPGBackfill::OP_BACKFILL_FINISH: + handle_backfill_finish(m, conn); + [[fallthrough]]; + case MOSDPGBackfill::OP_BACKFILL_PROGRESS: + return handle_backfill_progress(m); + case MOSDPGBackfill::OP_BACKFILL_FINISH_ACK: + return handle_backfill_finish_ack(m); + default: + ceph_assert("unknown op type for pg backfill"); + return seastar::now(); + } +} + +RecoveryBackend::interruptible_future<> +RecoveryBackend::handle_backfill_remove( + MOSDPGBackfillRemove& m) +{ + logger().debug("{} m.ls={}", __func__, m.ls); + assert(m.get_type() == MSG_OSD_PG_BACKFILL_REMOVE); + if (pg.can_discard_replica_op(m)) { + logger().debug("{}: discarding {}", __func__, m); + return seastar::now(); + } + ObjectStore::Transaction t; + for ([[maybe_unused]] const auto& [soid, ver] : m.ls) { + // TODO: the reserved space management. PG::try_reserve_recovery_space(). 
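+    // Stage the removal of every object listed in the message; the whole
+    // batch is applied in a single transaction by the do_transaction() call
+    // below, addressing each on-disk object with NO_GEN and this replica's
+    // shard id.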
+ t.remove(pg.get_collection_ref()->get_cid(), + ghobject_t(soid, ghobject_t::NO_GEN, pg.get_pg_whoami().shard)); + } + logger().debug("RecoveryBackend::handle_backfill_remove: do_transaction..."); + return shard_services.get_store().do_transaction( + pg.get_collection_ref(), std::move(t)).or_terminate(); +} + +RecoveryBackend::interruptible_future<BackfillInterval> +RecoveryBackend::scan_for_backfill( + const hobject_t& start, + [[maybe_unused]] const std::int64_t min, + const std::int64_t max) +{ + logger().debug("{} starting from {}", __func__, start); + auto version_map = seastar::make_lw_shared<std::map<hobject_t, eversion_t>>(); + return backend->list_objects(start, max).then_interruptible( + [this, start, version_map] (auto&& ret) { + auto&& [objects, next] = std::move(ret); + return seastar::do_with( + std::move(objects), + [this, version_map](auto &objects) { + return interruptor::parallel_for_each(objects, + [this, version_map] (const hobject_t& object) + -> interruptible_future<> { + crimson::osd::ObjectContextRef obc; + if (pg.is_primary()) { + obc = pg.obc_registry.maybe_get_cached_obc(object); + } + if (obc) { + if (obc->obs.exists) { + logger().debug("scan_for_backfill found (primary): {} {}", + object, obc->obs.oi.version); + version_map->emplace(object, obc->obs.oi.version); + } else { + // if the object does not exist here, it must have been removed + // between the collection_list_partial and here. This can happen + // for the first item in the range, which is usually last_backfill. + } + return seastar::now(); + } else { + return backend->load_metadata(object).safe_then_interruptible( + [version_map, object] (auto md) { + if (md->os.exists) { + logger().debug("scan_for_backfill found: {} {}", + object, md->os.oi.version); + version_map->emplace(object, md->os.oi.version); + } + return seastar::now(); + }, PGBackend::load_metadata_ertr::assert_all{}); + } + }); + }).then_interruptible([version_map, start=std::move(start), next=std::move(next), this] { + BackfillInterval bi; + bi.begin = std::move(start); + bi.end = std::move(next); + bi.version = pg.get_info().last_update; + bi.objects = std::move(*version_map); + logger().debug("{} BackfillInterval filled, leaving", + "scan_for_backfill"); + return seastar::make_ready_future<BackfillInterval>(std::move(bi)); + }); + }); +} + +RecoveryBackend::interruptible_future<> +RecoveryBackend::handle_scan_get_digest( + MOSDPGScan& m, + crimson::net::ConnectionRef conn) +{ + logger().debug("{}", __func__); + if (false /* FIXME: check for backfill too full */) { + std::ignore = shard_services.start_operation<crimson::osd::LocalPeeringEvent>( + // TODO: abstract start_background_recovery + static_cast<crimson::osd::PG*>(&pg), + pg.get_pg_whoami(), + pg.get_pgid(), + pg.get_osdmap_epoch(), + pg.get_osdmap_epoch(), + PeeringState::BackfillTooFull()); + return seastar::now(); + } + return scan_for_backfill( + std::move(m.begin), + crimson::common::local_conf().get_val<std::int64_t>("osd_backfill_scan_min"), + crimson::common::local_conf().get_val<std::int64_t>("osd_backfill_scan_max") + ).then_interruptible( + [this, query_epoch=m.query_epoch, conn + ](auto backfill_interval) { + auto reply = crimson::make_message<MOSDPGScan>( + MOSDPGScan::OP_SCAN_DIGEST, + pg.get_pg_whoami(), + pg.get_osdmap_epoch(), + query_epoch, + spg_t(pg.get_info().pgid.pgid, pg.get_primary().shard), + backfill_interval.begin, + backfill_interval.end); + encode(backfill_interval.objects, reply->get_data()); + return conn->send(std::move(reply)); + }); +} + 
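Editor's note: the scan digest exchanged by the two handlers above and below is simply the scanned object-to-version map serialized into the message payload. The sketch below is illustrative only and not part of the patch; the helper names and exact include paths are assumptions. It mirrors the encode() call in handle_scan_get_digest and the decode_noclear() call in handle_scan_digest:

  // Illustrative round-trip of the backfill scan digest payload.
  #include <map>
  #include "include/buffer.h"      // ceph::bufferlist
  #include "include/encoding.h"    // encode(), decode_noclear()
  #include "osd/osd_types.h"       // hobject_t, eversion_t
  #include "osd/recovery_types.h"  // BackfillInterval

  // Replica side: what handle_scan_get_digest() stores in reply->get_data().
  ceph::bufferlist encode_scan_digest(const std::map<hobject_t, eversion_t>& objects)
  {
    ceph::bufferlist bl;
    encode(objects, bl);
    return bl;
  }

  // Primary side: handle_scan_digest() decodes without clearing bi.objects,
  // so previously accumulated entries (and their ordering) are preserved.
  void decode_scan_digest(const ceph::bufferlist& bl, BackfillInterval& bi)
  {
    auto p = bl.cbegin();
    ::decode_noclear(bi.objects, p);
  }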
+RecoveryBackend::interruptible_future<> +RecoveryBackend::handle_scan_digest( + MOSDPGScan& m) +{ + logger().debug("{}", __func__); + // Check that from is in backfill_targets vector + ceph_assert(pg.is_backfill_target(m.from)); + + BackfillInterval bi; + bi.begin = m.begin; + bi.end = m.end; + { + auto p = m.get_data().cbegin(); + // take care to preserve ordering! + bi.clear_objects(); + ::decode_noclear(bi.objects, p); + } + shard_services.start_operation<crimson::osd::BackfillRecovery>( + static_cast<crimson::osd::PG*>(&pg), + shard_services, + pg.get_osdmap_epoch(), + crimson::osd::BackfillState::ReplicaScanned{ m.from, std::move(bi) }); + return seastar::now(); +} + +RecoveryBackend::interruptible_future<> +RecoveryBackend::handle_scan( + MOSDPGScan& m, + crimson::net::ConnectionRef conn) +{ + logger().debug("{}", __func__); + if (pg.old_peering_msg(m.map_epoch, m.query_epoch)) { + logger().debug("{}: discarding {}", __func__, m); + return seastar::now(); + } + switch (m.op) { + case MOSDPGScan::OP_SCAN_GET_DIGEST: + return handle_scan_get_digest(m, conn); + case MOSDPGScan::OP_SCAN_DIGEST: + return handle_scan_digest(m); + default: + // FIXME: move to errorator + ceph_assert("unknown op type for pg scan"); + return seastar::now(); + } +} + +RecoveryBackend::interruptible_future<> +RecoveryBackend::handle_recovery_op( + Ref<MOSDFastDispatchOp> m, + crimson::net::ConnectionRef conn) +{ + switch (m->get_header().type) { + case MSG_OSD_PG_BACKFILL: + return handle_backfill(*boost::static_pointer_cast<MOSDPGBackfill>(m), conn); + case MSG_OSD_PG_BACKFILL_REMOVE: + return handle_backfill_remove(*boost::static_pointer_cast<MOSDPGBackfillRemove>(m)); + case MSG_OSD_PG_SCAN: + return handle_scan(*boost::static_pointer_cast<MOSDPGScan>(m), conn); + default: + return seastar::make_exception_future<>( + std::invalid_argument(fmt::format("invalid request type: {}", + m->get_header().type))); + } +} diff --git a/src/crimson/osd/recovery_backend.h b/src/crimson/osd/recovery_backend.h new file mode 100644 index 000000000..65e9bb01f --- /dev/null +++ b/src/crimson/osd/recovery_backend.h @@ -0,0 +1,233 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <seastar/core/future.hh> + +#include "crimson/common/type_helpers.h" +#include "crimson/os/futurized_store.h" +#include "crimson/os/futurized_collection.h" +#include "crimson/osd/pg_interval_interrupt_condition.h" +#include "crimson/osd/object_context.h" +#include "crimson/osd/shard_services.h" + +#include "messages/MOSDPGBackfill.h" +#include "messages/MOSDPGBackfillRemove.h" +#include "messages/MOSDPGScan.h" +#include "osd/recovery_types.h" +#include "osd/osd_types.h" + +namespace crimson::osd{ + class PG; +} + +class PGBackend; + +class RecoveryBackend { +public: + class WaitForObjectRecovery; +public: + template <typename T = void> + using interruptible_future = + ::crimson::interruptible::interruptible_future< + ::crimson::osd::IOInterruptCondition, T>; + using interruptor = + ::crimson::interruptible::interruptor< + ::crimson::osd::IOInterruptCondition>; + RecoveryBackend(crimson::osd::PG& pg, + crimson::osd::ShardServices& shard_services, + crimson::os::CollectionRef coll, + PGBackend* backend) + : pg{pg}, + shard_services{shard_services}, + store{&shard_services.get_store()}, + coll{coll}, + backend{backend} {} + virtual ~RecoveryBackend() {} + WaitForObjectRecovery& add_recovering(const hobject_t& soid) { + auto [it, added] = recovering.emplace(soid, new 
WaitForObjectRecovery{}); + assert(added); + return *(it->second); + } + WaitForObjectRecovery& get_recovering(const hobject_t& soid) { + assert(is_recovering(soid)); + return *(recovering.at(soid)); + } + void remove_recovering(const hobject_t& soid) { + recovering.erase(soid); + } + bool is_recovering(const hobject_t& soid) const { + return recovering.count(soid) != 0; + } + uint64_t total_recovering() const { + return recovering.size(); + } + + virtual interruptible_future<> handle_recovery_op( + Ref<MOSDFastDispatchOp> m, + crimson::net::ConnectionRef conn); + + virtual interruptible_future<> recover_object( + const hobject_t& soid, + eversion_t need) = 0; + virtual interruptible_future<> recover_delete( + const hobject_t& soid, + eversion_t need) = 0; + virtual interruptible_future<> push_delete( + const hobject_t& soid, + eversion_t need) = 0; + + interruptible_future<BackfillInterval> scan_for_backfill( + const hobject_t& from, + std::int64_t min, + std::int64_t max); + + void on_peering_interval_change(ceph::os::Transaction& t) { + clean_up(t, "new peering interval"); + } + + seastar::future<> stop() { + for (auto& [soid, recovery_waiter] : recovering) { + recovery_waiter->stop(); + } + return on_stop(); + } +protected: + crimson::osd::PG& pg; + crimson::osd::ShardServices& shard_services; + crimson::os::FuturizedStore::Shard* store; + crimson::os::CollectionRef coll; + PGBackend* backend; + + struct pull_info_t { + pg_shard_t from; + hobject_t soid; + ObjectRecoveryProgress recovery_progress; + ObjectRecoveryInfo recovery_info; + crimson::osd::ObjectContextRef head_ctx; + crimson::osd::ObjectContextRef obc; + object_stat_sum_t stat; + bool is_complete() const { + return recovery_progress.is_complete(recovery_info); + } + }; + + struct push_info_t { + ObjectRecoveryProgress recovery_progress; + ObjectRecoveryInfo recovery_info; + crimson::osd::ObjectContextRef obc; + object_stat_sum_t stat; + }; + +public: + class WaitForObjectRecovery : + public boost::intrusive_ref_counter< + WaitForObjectRecovery, boost::thread_unsafe_counter>, + public crimson::BlockerT<WaitForObjectRecovery> { + seastar::shared_promise<> readable, recovered, pulled; + std::map<pg_shard_t, seastar::shared_promise<>> pushes; + public: + static constexpr const char* type_name = "WaitForObjectRecovery"; + + crimson::osd::ObjectContextRef obc; + std::optional<pull_info_t> pull_info; + std::map<pg_shard_t, push_info_t> pushing; + + seastar::future<> wait_for_readable() { + return readable.get_shared_future(); + } + seastar::future<> wait_for_pushes(pg_shard_t shard) { + return pushes[shard].get_shared_future(); + } + seastar::future<> wait_for_recovered() { + return recovered.get_shared_future(); + } + template <typename T, typename F> + auto wait_track_blocking(T &trigger, F &&fut) { + WaitForObjectRecoveryRef ref = this; + return track_blocking( + trigger, + std::forward<F>(fut) + ).finally([ref] {}); + } + template <typename T> + seastar::future<> wait_for_recovered(T &trigger) { + WaitForObjectRecoveryRef ref = this; + return wait_track_blocking(trigger, recovered.get_shared_future()); + } + seastar::future<> wait_for_pull() { + return pulled.get_shared_future(); + } + void set_readable() { + readable.set_value(); + } + void set_recovered() { + recovered.set_value(); + } + void set_pushed(pg_shard_t shard) { + pushes[shard].set_value(); + } + void set_pulled() { + pulled.set_value(); + } + void set_push_failed(pg_shard_t shard, std::exception_ptr e) { + pushes.at(shard).set_exception(e); + } + void 
interrupt(std::string_view why) { + readable.set_exception(std::system_error( + std::make_error_code(std::errc::interrupted), why.data())); + recovered.set_exception(std::system_error( + std::make_error_code(std::errc::interrupted), why.data())); + pulled.set_exception(std::system_error( + std::make_error_code(std::errc::interrupted), why.data())); + for (auto& [pg_shard, pr] : pushes) { + pr.set_exception(std::system_error( + std::make_error_code(std::errc::interrupted), why.data())); + } + } + void stop(); + void dump_detail(Formatter* f) const { + } + }; + using RecoveryBlockingEvent = + crimson::AggregateBlockingEvent<WaitForObjectRecovery::BlockingEvent>; + using WaitForObjectRecoveryRef = boost::intrusive_ptr<WaitForObjectRecovery>; +protected: + std::map<hobject_t, WaitForObjectRecoveryRef> recovering; + hobject_t get_temp_recovery_object( + const hobject_t& target, + eversion_t version) const; + + boost::container::flat_set<hobject_t> temp_contents; + + void add_temp_obj(const hobject_t &oid) { + temp_contents.insert(oid); + } + void clear_temp_obj(const hobject_t &oid) { + temp_contents.erase(oid); + } + void clean_up(ceph::os::Transaction& t, std::string_view why); + virtual seastar::future<> on_stop() = 0; +private: + void handle_backfill_finish( + MOSDPGBackfill& m, + crimson::net::ConnectionRef conn); + interruptible_future<> handle_backfill_progress( + MOSDPGBackfill& m); + interruptible_future<> handle_backfill_finish_ack( + MOSDPGBackfill& m); + interruptible_future<> handle_backfill( + MOSDPGBackfill& m, + crimson::net::ConnectionRef conn); + + interruptible_future<> handle_scan_get_digest( + MOSDPGScan& m, + crimson::net::ConnectionRef conn); + interruptible_future<> handle_scan_digest( + MOSDPGScan& m); + interruptible_future<> handle_scan( + MOSDPGScan& m, + crimson::net::ConnectionRef conn); + interruptible_future<> handle_backfill_remove(MOSDPGBackfillRemove& m); +}; diff --git a/src/crimson/osd/replicated_backend.cc b/src/crimson/osd/replicated_backend.cc new file mode 100644 index 000000000..0ff4ad573 --- /dev/null +++ b/src/crimson/osd/replicated_backend.cc @@ -0,0 +1,174 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "replicated_backend.h" + +#include "messages/MOSDRepOpReply.h" + +#include "crimson/common/exception.h" +#include "crimson/common/log.h" +#include "crimson/os/futurized_store.h" +#include "crimson/osd/shard_services.h" +#include "osd/PeeringState.h" + +SET_SUBSYS(osd); + +ReplicatedBackend::ReplicatedBackend(pg_t pgid, + pg_shard_t whoami, + ReplicatedBackend::CollectionRef coll, + crimson::osd::ShardServices& shard_services, + DoutPrefixProvider &dpp) + : PGBackend{whoami.shard, coll, shard_services, dpp}, + pgid{pgid}, + whoami{whoami} +{} + +ReplicatedBackend::ll_read_ierrorator::future<ceph::bufferlist> +ReplicatedBackend::_read(const hobject_t& hoid, + const uint64_t off, + const uint64_t len, + const uint32_t flags) +{ + return store->read(coll, ghobject_t{hoid}, off, len, flags); +} + +ReplicatedBackend::rep_op_fut_t +ReplicatedBackend::_submit_transaction(std::set<pg_shard_t>&& pg_shards, + const hobject_t& hoid, + ceph::os::Transaction&& txn, + osd_op_params_t&& osd_op_p, + epoch_t min_epoch, epoch_t map_epoch, + std::vector<pg_log_entry_t>&& log_entries) +{ + LOG_PREFIX(ReplicatedBackend::_submit_transaction); + + const ceph_tid_t tid = shard_services.get_tid(); + auto pending_txn = + pending_trans.try_emplace(tid, pg_shards.size(), osd_op_p.at_version).first; + bufferlist 
encoded_txn; + encode(txn, encoded_txn); + + DEBUGDPP("object {}", dpp, hoid); + auto all_completed = interruptor::make_interruptible( + shard_services.get_store().do_transaction(coll, std::move(txn)) + ).then_interruptible([FNAME, this, + peers=pending_txn->second.weak_from_this()] { + if (!peers) { + // for now, only actingset_changed can cause peers + // to be nullptr + ERRORDPP("peers is null, this should be impossible", dpp); + assert(0 == "impossible"); + } + if (--peers->pending == 0) { + peers->all_committed.set_value(); + peers->all_committed = {}; + return seastar::now(); + } + return peers->all_committed.get_shared_future(); + }).then_interruptible([pending_txn, this] { + auto acked_peers = std::move(pending_txn->second.acked_peers); + pending_trans.erase(pending_txn); + return seastar::make_ready_future<crimson::osd::acked_peers_t>(std::move(acked_peers)); + }); + + auto sends = std::make_unique<std::vector<seastar::future<>>>(); + for (auto pg_shard : pg_shards) { + if (pg_shard != whoami) { + auto m = crimson::make_message<MOSDRepOp>( + osd_op_p.req_id, + whoami, + spg_t{pgid, pg_shard.shard}, + hoid, + CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK, + map_epoch, + min_epoch, + tid, + osd_op_p.at_version); + m->set_data(encoded_txn); + pending_txn->second.acked_peers.push_back({pg_shard, eversion_t{}}); + encode(log_entries, m->logbl); + m->pg_trim_to = osd_op_p.pg_trim_to; + m->min_last_complete_ondisk = osd_op_p.min_last_complete_ondisk; + m->set_rollback_to(osd_op_p.at_version); + // TODO: set more stuff. e.g., pg_states + sends->emplace_back(shard_services.send_to_osd(pg_shard.osd, std::move(m), map_epoch)); + } + } + auto sends_complete = seastar::when_all_succeed( + sends->begin(), sends->end() + ).finally([sends=std::move(sends)] {}); + return {std::move(sends_complete), std::move(all_completed)}; +} + +void ReplicatedBackend::on_actingset_changed(bool same_primary) +{ + crimson::common::actingset_changed e_actingset_changed{same_primary}; + for (auto& [tid, pending_txn] : pending_trans) { + pending_txn.all_committed.set_exception(e_actingset_changed); + } + pending_trans.clear(); +} + +void ReplicatedBackend::got_rep_op_reply(const MOSDRepOpReply& reply) +{ + LOG_PREFIX(ReplicatedBackend::got_rep_op_reply); + auto found = pending_trans.find(reply.get_tid()); + if (found == pending_trans.end()) { + WARNDPP("cannot find rep op for message {}", dpp, reply); + return; + } + auto& peers = found->second; + for (auto& peer : peers.acked_peers) { + if (peer.shard == reply.from) { + peer.last_complete_ondisk = reply.get_last_complete_ondisk(); + if (--peers.pending == 0) { + peers.all_committed.set_value(); + peers.all_committed = {}; + } + return; + } + } +} + +seastar::future<> ReplicatedBackend::stop() +{ + LOG_PREFIX(ReplicatedBackend::stop); + INFODPP("cid {}", coll->get_cid()); + for (auto& [tid, pending_on] : pending_trans) { + pending_on.all_committed.set_exception( + crimson::common::system_shutdown_exception()); + } + pending_trans.clear(); + return seastar::now(); +} + +seastar::future<> +ReplicatedBackend::request_committed(const osd_reqid_t& reqid, + const eversion_t& at_version) +{ + if (std::empty(pending_trans)) { + return seastar::now(); + } + auto iter = pending_trans.begin(); + auto& pending_txn = iter->second; + if (pending_txn.at_version > at_version) { + return seastar::now(); + } + for (; iter->second.at_version < at_version; ++iter); + // As for now, the previous client_request with the same reqid + // mustn't have finished, as that would mean later 
client_requests + // has finished before earlier ones. + // + // The following line of code should be "assert(pending_txn.at_version == at_version)", + // as there can be only one transaction at any time in pending_trans due to + // PG::request_pg_pipeline. But there's a high possibility that we will + // improve the parallelism here in the future, which means there may be multiple + // client requests in flight, so we loosed the restriction to as follows. Correct + // me if I'm wrong:-) + assert(iter != pending_trans.end() && iter->second.at_version == at_version); + if (iter->second.pending) { + return iter->second.all_committed.get_shared_future(); + } else { + return seastar::now(); + } +} diff --git a/src/crimson/osd/replicated_backend.h b/src/crimson/osd/replicated_backend.h new file mode 100644 index 000000000..f789a35ea --- /dev/null +++ b/src/crimson/osd/replicated_backend.h @@ -0,0 +1,61 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <boost/intrusive_ptr.hpp> +#include <seastar/core/future.hh> +#include <seastar/core/weak_ptr.hh> +#include "include/buffer_fwd.h" +#include "osd/osd_types.h" + +#include "acked_peers.h" +#include "pg_backend.h" + +namespace crimson::osd { + class ShardServices; +} + +class ReplicatedBackend : public PGBackend +{ +public: + ReplicatedBackend(pg_t pgid, pg_shard_t whoami, + CollectionRef coll, + crimson::osd::ShardServices& shard_services, + DoutPrefixProvider &dpp); + void got_rep_op_reply(const MOSDRepOpReply& reply) final; + seastar::future<> stop() final; + void on_actingset_changed(bool same_primary) final; +private: + ll_read_ierrorator::future<ceph::bufferlist> + _read(const hobject_t& hoid, uint64_t off, + uint64_t len, uint32_t flags) override; + rep_op_fut_t _submit_transaction(std::set<pg_shard_t>&& pg_shards, + const hobject_t& hoid, + ceph::os::Transaction&& txn, + osd_op_params_t&& osd_op_p, + epoch_t min_epoch, epoch_t max_epoch, + std::vector<pg_log_entry_t>&& log_entries) final; + const pg_t pgid; + const pg_shard_t whoami; + class pending_on_t : public seastar::weakly_referencable<pending_on_t> { + public: + pending_on_t(size_t pending, const eversion_t& at_version) + : pending{static_cast<unsigned>(pending)}, at_version(at_version) + {} + unsigned pending; + // The order of pending_txns' at_version must be the same as their + // corresponding ceph_tid_t, as we rely on this condition for checking + // whether a client request is already completed. To put it another + // way, client requests at_version must be updated synchorously/simultaneously + // with ceph_tid_t. 
+ const eversion_t at_version; + crimson::osd::acked_peers_t acked_peers; + seastar::shared_promise<> all_committed; + }; + using pending_transactions_t = std::map<ceph_tid_t, pending_on_t>; + pending_transactions_t pending_trans; + + seastar::future<> request_committed( + const osd_reqid_t& reqid, const eversion_t& at_version) final; +}; diff --git a/src/crimson/osd/replicated_recovery_backend.cc b/src/crimson/osd/replicated_recovery_backend.cc new file mode 100644 index 000000000..bd301cc2b --- /dev/null +++ b/src/crimson/osd/replicated_recovery_backend.cc @@ -0,0 +1,1182 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab expandtab + +#include <fmt/format.h> +#include <fmt/ostream.h> +#include <seastar/core/future.hh> +#include <seastar/core/do_with.hh> + +#include "crimson/osd/pg.h" +#include "crimson/osd/pg_backend.h" +#include "osd/osd_types_fmt.h" +#include "replicated_recovery_backend.h" +#include "msg/Message.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_osd); + } +} + +using std::less; +using std::map; +using std::string; + +RecoveryBackend::interruptible_future<> +ReplicatedRecoveryBackend::recover_object( + const hobject_t& soid, + eversion_t need) +{ + logger().debug("{}: {}, {}", __func__, soid, need); + // always add_recovering(soid) before recover_object(soid) + assert(is_recovering(soid)); + // start tracking the recovery of soid + return maybe_pull_missing_obj(soid, need).then_interruptible([this, soid, need] { + logger().debug("recover_object: loading obc: {}", soid); + return pg.obc_loader.with_obc<RWState::RWREAD>(soid, + [this, soid, need](auto obc) { + logger().debug("recover_object: loaded obc: {}", obc->obs.oi.soid); + auto& recovery_waiter = get_recovering(soid); + recovery_waiter.obc = obc; + recovery_waiter.obc->wait_recovery_read(); + return maybe_push_shards(soid, need); + }).handle_error_interruptible( + crimson::osd::PG::load_obc_ertr::all_same_way([soid](auto& code) { + // TODO: may need eio handling? 
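+ // i.e. any error from loading the object context is logged and swallowed
+ // here, so it does not propagate out of recover_object().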
+ logger().error("recover_object saw error code {}, ignoring object {}", + code, soid); + })); + }); +} + +RecoveryBackend::interruptible_future<> +ReplicatedRecoveryBackend::maybe_push_shards( + const hobject_t& soid, + eversion_t need) +{ + return seastar::do_with( + get_shards_to_push(soid), + [this, need, soid](auto &shards) { + return interruptor::parallel_for_each( + shards, + [this, need, soid](auto shard) { + return prep_push(soid, need, shard).then_interruptible([this, soid, shard](auto push) { + auto msg = crimson::make_message<MOSDPGPush>(); + msg->from = pg.get_pg_whoami(); + msg->pgid = pg.get_pgid(); + msg->map_epoch = pg.get_osdmap_epoch(); + msg->min_epoch = pg.get_last_peering_reset(); + msg->pushes.push_back(std::move(push)); + msg->set_priority(pg.get_recovery_op_priority()); + return interruptor::make_interruptible( + shard_services.send_to_osd(shard.osd, + std::move(msg), + pg.get_osdmap_epoch())) + .then_interruptible( + [this, soid, shard] { + return get_recovering(soid).wait_for_pushes(shard); + }); + }); + }); + }).then_interruptible([this, soid] { + auto &recovery = get_recovering(soid); + if (auto push_info = recovery.pushing.begin(); + push_info != recovery.pushing.end()) { + pg.get_recovery_handler()->on_global_recover(soid, + push_info->second.stat, + false); + } else if (recovery.pull_info) { + // no push happened (empty get_shards_to_push()) but pull actually did + pg.get_recovery_handler()->on_global_recover(soid, + recovery.pull_info->stat, + false); + } else { + // no pulls, no pushes + } + return seastar::make_ready_future<>(); + }).handle_exception_interruptible([this, soid](auto e) { + auto &recovery = get_recovering(soid); + if (recovery.obc) { + recovery.obc->drop_recovery_read(); + } + recovering.erase(soid); + return seastar::make_exception_future<>(e); + }); +} + +RecoveryBackend::interruptible_future<> +ReplicatedRecoveryBackend::maybe_pull_missing_obj( + const hobject_t& soid, + eversion_t need) +{ + pg_missing_tracker_t local_missing = pg.get_local_missing(); + if (!local_missing.is_missing(soid)) { + return seastar::make_ready_future<>(); + } + PullOp pull_op; + auto& recovery_waiter = get_recovering(soid); + recovery_waiter.pull_info = + std::make_optional<RecoveryBackend::pull_info_t>(); + auto& pull_info = *recovery_waiter.pull_info; + prepare_pull(pull_op, pull_info, soid, need); + auto msg = crimson::make_message<MOSDPGPull>(); + msg->from = pg.get_pg_whoami(); + msg->set_priority(pg.get_recovery_op_priority()); + msg->pgid = pg.get_pgid(); + msg->map_epoch = pg.get_osdmap_epoch(); + msg->min_epoch = pg.get_last_peering_reset(); + msg->set_pulls({std::move(pull_op)}); + return interruptor::make_interruptible( + shard_services.send_to_osd( + pull_info.from.osd, + std::move(msg), + pg.get_osdmap_epoch() + )).then_interruptible([&recovery_waiter] { + return recovery_waiter.wait_for_pull(); + }); +} + +RecoveryBackend::interruptible_future<> +ReplicatedRecoveryBackend::push_delete( + const hobject_t& soid, + eversion_t need) +{ + logger().debug("{}: {}, {}", __func__, soid, need); + epoch_t min_epoch = pg.get_last_peering_reset(); + + assert(pg.get_acting_recovery_backfill().size() > 0); + return interruptor::parallel_for_each(pg.get_acting_recovery_backfill(), + [this, soid, need, min_epoch](pg_shard_t shard) + -> interruptible_future<> { + if (shard == pg.get_pg_whoami()) + return seastar::make_ready_future<>(); + auto iter = pg.get_shard_missing().find(shard); + if (iter == pg.get_shard_missing().end()) + return 
seastar::make_ready_future<>(); + if (iter->second.is_missing(soid)) { + logger().debug("push_delete: will remove {} from {}", soid, shard); + pg.begin_peer_recover(shard, soid); + spg_t target_pg(pg.get_info().pgid.pgid, shard.shard); + auto msg = crimson::make_message<MOSDPGRecoveryDelete>( + pg.get_pg_whoami(), target_pg, pg.get_osdmap_epoch(), min_epoch); + msg->set_priority(pg.get_recovery_op_priority()); + msg->objects.push_back(std::make_pair(soid, need)); + return interruptor::make_interruptible( + shard_services.send_to_osd(shard.osd, std::move(msg), + pg.get_osdmap_epoch())).then_interruptible( + [this, soid, shard] { + return get_recovering(soid).wait_for_pushes(shard); + }); + } + return seastar::make_ready_future<>(); + }); +} + +RecoveryBackend::interruptible_future<> +ReplicatedRecoveryBackend::handle_recovery_delete( + Ref<MOSDPGRecoveryDelete> m) +{ + logger().debug("{}: {}", __func__, *m); + + auto& p = m->objects.front(); //TODO: only one delete per message for now. + return local_recover_delete(p.first, p.second, pg.get_osdmap_epoch()) + .then_interruptible( + [this, m] { + auto reply = crimson::make_message<MOSDPGRecoveryDeleteReply>(); + reply->from = pg.get_pg_whoami(); + reply->set_priority(m->get_priority()); + reply->pgid = spg_t(pg.get_info().pgid.pgid, m->from.shard); + reply->map_epoch = m->map_epoch; + reply->min_epoch = m->min_epoch; + reply->objects = m->objects; + return shard_services.send_to_osd(m->from.osd, std::move(reply), pg.get_osdmap_epoch()); + }); +} + +RecoveryBackend::interruptible_future<> +ReplicatedRecoveryBackend::on_local_recover_persist( + const hobject_t& soid, + const ObjectRecoveryInfo& _recovery_info, + bool is_delete, + epoch_t epoch_frozen) +{ + logger().debug("{}", __func__); + ceph::os::Transaction t; + pg.get_recovery_handler()->on_local_recover(soid, _recovery_info, is_delete, t); + logger().debug("ReplicatedRecoveryBackend::on_local_recover_persist: do_transaction..."); + return interruptor::make_interruptible( + shard_services.get_store().do_transaction(coll, std::move(t))) + .then_interruptible( + [this, epoch_frozen, last_complete = pg.get_info().last_complete] { + pg.get_recovery_handler()->_committed_pushed_object(epoch_frozen, last_complete); + return seastar::make_ready_future<>(); + }); +} + +RecoveryBackend::interruptible_future<> +ReplicatedRecoveryBackend::local_recover_delete( + const hobject_t& soid, + eversion_t need, + epoch_t epoch_to_freeze) +{ + logger().debug("{}: {}, {}", __func__, soid, need); + return backend->load_metadata(soid).safe_then_interruptible([this] + (auto lomt) -> interruptible_future<> { + if (lomt->os.exists) { + return seastar::do_with(ceph::os::Transaction(), + [this, lomt = std::move(lomt)](auto& txn) { + return backend->remove(lomt->os, txn).then_interruptible( + [this, &txn]() mutable { + logger().debug("ReplicatedRecoveryBackend::local_recover_delete: do_transaction..."); + return shard_services.get_store().do_transaction(coll, + std::move(txn)); + }); + }); + } + return seastar::make_ready_future<>(); + }).safe_then_interruptible([this, soid, epoch_to_freeze, need] { + ObjectRecoveryInfo recovery_info; + recovery_info.soid = soid; + recovery_info.version = need; + return on_local_recover_persist(soid, recovery_info, + true, epoch_to_freeze); + }, PGBackend::load_metadata_ertr::all_same_way( + [this, soid, epoch_to_freeze, need] (auto e) { + ObjectRecoveryInfo recovery_info; + recovery_info.soid = soid; + recovery_info.version = need; + return on_local_recover_persist(soid, 
recovery_info, + true, epoch_to_freeze); + }) + ); +} + +RecoveryBackend::interruptible_future<> +ReplicatedRecoveryBackend::recover_delete( + const hobject_t &soid, eversion_t need) +{ + logger().debug("{}: {}, {}", __func__, soid, need); + + epoch_t cur_epoch = pg.get_osdmap_epoch(); + return seastar::do_with(object_stat_sum_t(), + [this, soid, need, cur_epoch](auto& stat_diff) { + return local_recover_delete(soid, need, cur_epoch).then_interruptible( + [this, &stat_diff, cur_epoch, soid, need]() + -> interruptible_future<> { + if (!pg.has_reset_since(cur_epoch)) { + bool object_missing = false; + for (const auto& shard : pg.get_acting_recovery_backfill()) { + if (shard == pg.get_pg_whoami()) + continue; + if (pg.get_shard_missing(shard)->is_missing(soid)) { + logger().debug("recover_delete: soid {} needs to deleted from replca {}", + soid, shard); + object_missing = true; + break; + } + } + + if (!object_missing) { + stat_diff.num_objects_recovered = 1; + return seastar::make_ready_future<>(); + } else { + return push_delete(soid, need); + } + } + return seastar::make_ready_future<>(); + }).then_interruptible([this, soid, &stat_diff] { + pg.get_recovery_handler()->on_global_recover(soid, stat_diff, true); + return seastar::make_ready_future<>(); + }); + }); +} + +RecoveryBackend::interruptible_future<PushOp> +ReplicatedRecoveryBackend::prep_push( + const hobject_t& soid, + eversion_t need, + pg_shard_t pg_shard) +{ + logger().debug("{}: {}, {}", __func__, soid, need); + + auto& recovery_waiter = get_recovering(soid); + auto& obc = recovery_waiter.obc; + interval_set<uint64_t> data_subset; + if (obc->obs.oi.size) { + data_subset.insert(0, obc->obs.oi.size); + } + const auto& missing = pg.get_shard_missing().find(pg_shard)->second; + const auto it = missing.get_items().find(soid); + assert(it != missing.get_items().end()); + data_subset.intersection_of(it->second.clean_regions.get_dirty_regions()); + logger().debug("prep_push: {} data_subset {} to {}", + soid, data_subset, pg_shard); + + auto& push_info = recovery_waiter.pushing[pg_shard]; + pg.begin_peer_recover(pg_shard, soid); + const auto pmissing_iter = pg.get_shard_missing().find(pg_shard); + const auto missing_iter = pmissing_iter->second.get_items().find(soid); + assert(missing_iter != pmissing_iter->second.get_items().end()); + + push_info.obc = obc; + push_info.recovery_info.size = obc->obs.oi.size; + push_info.recovery_info.copy_subset = data_subset; + push_info.recovery_info.soid = soid; + push_info.recovery_info.oi = obc->obs.oi; + push_info.recovery_info.version = obc->obs.oi.version; + push_info.recovery_info.object_exist = + missing_iter->second.clean_regions.object_is_exist(); + push_info.recovery_progress.omap_complete = + !missing_iter->second.clean_regions.omap_is_dirty(); + + return build_push_op(push_info.recovery_info, + push_info.recovery_progress, + &push_info.stat).then_interruptible( + [this, soid, pg_shard](auto push_op) { + auto& recovery_waiter = get_recovering(soid); + auto& push_info = recovery_waiter.pushing[pg_shard]; + push_info.recovery_progress = push_op.after_progress; + return push_op; + }); +} + +void ReplicatedRecoveryBackend::prepare_pull(PullOp& pull_op, + pull_info_t& pull_info, + const hobject_t& soid, + eversion_t need) { + logger().debug("{}: {}, {}", __func__, soid, need); + + pg_missing_tracker_t local_missing = pg.get_local_missing(); + const auto missing_iter = local_missing.get_items().find(soid); + auto m = pg.get_missing_loc_shards(); + pg_shard_t fromshard = *(m[soid].begin()); + + 
//TODO: skipped snap objects case for now + pull_op.recovery_info.copy_subset.insert(0, (uint64_t) -1); + pull_op.recovery_info.copy_subset.intersection_of( + missing_iter->second.clean_regions.get_dirty_regions()); + pull_op.recovery_info.size = ((uint64_t) -1); + pull_op.recovery_info.object_exist = + missing_iter->second.clean_regions.object_is_exist(); + pull_op.recovery_info.soid = soid; + pull_op.soid = soid; + pull_op.recovery_progress.data_complete = false; + pull_op.recovery_progress.omap_complete = + !missing_iter->second.clean_regions.omap_is_dirty(); + pull_op.recovery_progress.data_recovered_to = 0; + pull_op.recovery_progress.first = true; + + pull_info.from = fromshard; + pull_info.soid = soid; + pull_info.recovery_info = pull_op.recovery_info; + pull_info.recovery_progress = pull_op.recovery_progress; +} + +RecoveryBackend::interruptible_future<PushOp> +ReplicatedRecoveryBackend::build_push_op( + const ObjectRecoveryInfo& recovery_info, + const ObjectRecoveryProgress& progress, + object_stat_sum_t* stat) +{ + logger().debug("{} {} @{}", + __func__, recovery_info.soid, recovery_info.version); + return seastar::do_with(ObjectRecoveryProgress(progress), + uint64_t(crimson::common::local_conf() + ->osd_recovery_max_chunk), + recovery_info.version, + PushOp(), + [this, &recovery_info, &progress, stat] + (auto& new_progress, auto& available, auto& v, auto& push_op) { + return read_metadata_for_push_op(recovery_info.soid, + progress, new_progress, + v, &push_op + ).then_interruptible([&](eversion_t local_ver) mutable { + // If requestor didn't know the version, use ours + if (v == eversion_t()) { + v = local_ver; + } else if (v != local_ver) { + logger().error("build_push_op: {} push {} v{} failed because local copy is {}", + pg.get_pgid(), recovery_info.soid, recovery_info.version, local_ver); + // TODO: bail out + } + return read_omap_for_push_op(recovery_info.soid, + progress, + new_progress, + available, &push_op); + }).then_interruptible([this, &recovery_info, &progress, + &available, &push_op]() mutable { + logger().debug("build_push_op: available: {}, copy_subset: {}", + available, recovery_info.copy_subset); + return read_object_for_push_op(recovery_info.soid, + recovery_info.copy_subset, + progress.data_recovered_to, + available, &push_op); + }).then_interruptible([&recovery_info, &v, &progress, + &new_progress, stat, &push_op] + (uint64_t recovered_to) mutable { + new_progress.data_recovered_to = recovered_to; + if (new_progress.is_complete(recovery_info)) { + new_progress.data_complete = true; + if (stat) + stat->num_objects_recovered++; + } else if (progress.first && progress.omap_complete) { + // If omap is not changed, we need recovery omap + // when recovery cannot be completed once + new_progress.omap_complete = false; + } + if (stat) { + stat->num_keys_recovered += push_op.omap_entries.size(); + stat->num_bytes_recovered += push_op.data.length(); + } + push_op.version = v; + push_op.soid = recovery_info.soid; + push_op.recovery_info = recovery_info; + push_op.after_progress = new_progress; + push_op.before_progress = progress; + logger().debug("build_push_op: push_op version:" + " {}, push_op data length: {}", + push_op.version, push_op.data.length()); + return seastar::make_ready_future<PushOp>(std::move(push_op)); + }); + }); +} + +RecoveryBackend::interruptible_future<eversion_t> +ReplicatedRecoveryBackend::read_metadata_for_push_op( + const hobject_t& oid, + const ObjectRecoveryProgress& progress, + ObjectRecoveryProgress& new_progress, + eversion_t ver, + 
PushOp* push_op) +{ + logger().debug("{}, {}", __func__, oid); + if (!progress.first) { + return seastar::make_ready_future<eversion_t>(ver); + } + return interruptor::make_interruptible(interruptor::when_all_succeed( + backend->omap_get_header(coll, ghobject_t(oid)).handle_error_interruptible<false>( + crimson::os::FuturizedStore::Shard::read_errorator::all_same_way( + [oid] (const std::error_code& e) { + logger().debug("read_metadata_for_push_op, error {} when getting omap header: {}", e, oid); + return seastar::make_ready_future<bufferlist>(); + })), + interruptor::make_interruptible(store->get_attrs(coll, ghobject_t(oid))) + .handle_error_interruptible<false>( + crimson::os::FuturizedStore::Shard::get_attrs_ertr::all_same_way( + [oid] (const std::error_code& e) { + logger().debug("read_metadata_for_push_op, error {} when getting attrs: {}", e, oid); + return seastar::make_ready_future<crimson::os::FuturizedStore::Shard::attrs_t>(); + })) + )).then_unpack_interruptible([&new_progress, push_op](auto bl, auto attrs) { + if (bl.length() == 0) { + logger().warn("read_metadata_for_push_op: fail to read omap header"); + } else if (attrs.empty()) { + logger().error("read_metadata_for_push_op: fail to read attrs"); + return eversion_t{}; + } + push_op->omap_header.claim_append(std::move(bl)); + for (auto&& [key, val] : attrs) { + push_op->attrset.emplace(std::move(key), std::move(val)); + } + logger().debug("read_metadata_for_push_op: {}", push_op->attrset[OI_ATTR]); + object_info_t oi; + oi.decode_no_oid(push_op->attrset[OI_ATTR]); + new_progress.first = false; + return oi.version; + }); +} + +RecoveryBackend::interruptible_future<uint64_t> +ReplicatedRecoveryBackend::read_object_for_push_op( + const hobject_t& oid, + const interval_set<uint64_t>& copy_subset, + uint64_t offset, + uint64_t max_len, + PushOp* push_op) +{ + if (max_len == 0 || copy_subset.empty()) { + push_op->data_included.clear(); + return seastar::make_ready_future<uint64_t>(offset); + } + // 1. get the extents in the interested range + return interruptor::make_interruptible(backend->fiemap(coll, ghobject_t{oid}, + 0, copy_subset.range_end())).safe_then_interruptible( + [=, this](auto&& fiemap_included) mutable { + interval_set<uint64_t> extents; + try { + extents.intersection_of(copy_subset, std::move(fiemap_included)); + } catch (std::exception &) { + // if fiemap() fails, we will read nothing, as the intersection of + // copy_subset and an empty interval_set would be empty anyway + extents.clear(); + } + // 2. we can read up to "max_len" bytes from "offset", so truncate the + // extents down to this quota. no need to return the number of consumed + // bytes, as this is the last consumer of this quota + push_op->data_included.span_of(extents, offset, max_len); + // 3. read the truncated extents + // TODO: check if the returned extents are pruned + return interruptor::make_interruptible(store->readv(coll, ghobject_t{oid}, + push_op->data_included, 0)); + }).safe_then_interruptible([push_op, range_end=copy_subset.range_end()](auto &&bl) { + push_op->data.claim_append(std::move(bl)); + uint64_t recovered_to = 0; + if (push_op->data_included.empty()) { + // zero filled section, skip to end! 
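+ // i.e. no physical extents fell inside the requested window, so the range
+ // is implicitly all zeros and progress can jump straight to its end.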
+ recovered_to = range_end; + } else { + // note down the progress, we will start from there next time + recovered_to = push_op->data_included.range_end(); + } + return seastar::make_ready_future<uint64_t>(recovered_to); + }, PGBackend::read_errorator::all_same_way([](auto e) { + logger().debug("build_push_op: read exception"); + return seastar::make_exception_future<uint64_t>(e); + })); +} + +static std::optional<std::string> nullopt_if_empty(const std::string& s) +{ + return s.empty() ? std::nullopt : std::make_optional(s); +} + +static bool is_too_many_entries_per_chunk(const PushOp* push_op) +{ + const uint64_t entries_per_chunk = + crimson::common::local_conf()->osd_recovery_max_omap_entries_per_chunk; + if (!entries_per_chunk) { + // the limit is disabled + return false; + } + return push_op->omap_entries.size() >= entries_per_chunk; +} + +RecoveryBackend::interruptible_future<> +ReplicatedRecoveryBackend::read_omap_for_push_op( + const hobject_t& oid, + const ObjectRecoveryProgress& progress, + ObjectRecoveryProgress& new_progress, + uint64_t& max_len, + PushOp* push_op) +{ + if (progress.omap_complete) { + return seastar::make_ready_future<>(); + } + return seastar::repeat([&new_progress, &max_len, push_op, &oid, this] { + return shard_services.get_store().omap_get_values( + coll, ghobject_t{oid}, nullopt_if_empty(new_progress.omap_recovered_to) + ).safe_then([&new_progress, &max_len, push_op](const auto& ret) { + const auto& [done, kvs] = ret; + bool stop = done; + // assuming "values.empty() only if done" holds here! + for (const auto& [key, value] : kvs) { + if (is_too_many_entries_per_chunk(push_op)) { + stop = true; + break; + } + if (const uint64_t entry_size = key.size() + value.length(); + entry_size > max_len) { + stop = true; + break; + } else { + max_len -= std::min(max_len, entry_size); + } + push_op->omap_entries.emplace(key, value); + } + if (!push_op->omap_entries.empty()) { + // we iterate in order + new_progress.omap_recovered_to = std::rbegin(push_op->omap_entries)->first; + } + if (done) { + new_progress.omap_complete = true; + } + return seastar::make_ready_future<seastar::stop_iteration>( + stop ? 
seastar::stop_iteration::yes : seastar::stop_iteration::no + ); + }, crimson::os::FuturizedStore::Shard::read_errorator::assert_all{}); + }); +} + +std::vector<pg_shard_t> +ReplicatedRecoveryBackend::get_shards_to_push(const hobject_t& soid) const +{ + std::vector<pg_shard_t> shards; + assert(pg.get_acting_recovery_backfill().size() > 0); + for (const auto& peer : pg.get_acting_recovery_backfill()) { + if (peer == pg.get_pg_whoami()) + continue; + auto shard_missing = + pg.get_shard_missing().find(peer); + assert(shard_missing != pg.get_shard_missing().end()); + if (shard_missing->second.is_missing(soid)) { + shards.push_back(shard_missing->first); + } + } + return shards; +} + +RecoveryBackend::interruptible_future<> +ReplicatedRecoveryBackend::handle_pull(Ref<MOSDPGPull> m) +{ + logger().debug("{}: {}", __func__, *m); + if (pg.can_discard_replica_op(*m)) { + logger().debug("{}: discarding {}", __func__, *m); + return seastar::now(); + } + return seastar::do_with(m->take_pulls(), [this, from=m->from](auto& pulls) { + return interruptor::parallel_for_each(pulls, + [this, from](auto& pull_op) { + const hobject_t& soid = pull_op.soid; + logger().debug("handle_pull: {}", soid); + return backend->stat(coll, ghobject_t(soid)).then_interruptible( + [this, &pull_op](auto st) { + ObjectRecoveryInfo &recovery_info = pull_op.recovery_info; + ObjectRecoveryProgress &progress = pull_op.recovery_progress; + if (progress.first && recovery_info.size == ((uint64_t) -1)) { + // Adjust size and copy_subset + recovery_info.size = st.st_size; + if (st.st_size) { + interval_set<uint64_t> object_range; + object_range.insert(0, st.st_size); + recovery_info.copy_subset.intersection_of(object_range); + } else { + recovery_info.copy_subset.clear(); + } + assert(recovery_info.clone_subset.empty()); + } + return build_push_op(recovery_info, progress, 0); + }).then_interruptible([this, from](auto push_op) { + auto msg = crimson::make_message<MOSDPGPush>(); + msg->from = pg.get_pg_whoami(); + msg->pgid = pg.get_pgid(); + msg->map_epoch = pg.get_osdmap_epoch(); + msg->min_epoch = pg.get_last_peering_reset(); + msg->set_priority(pg.get_recovery_op_priority()); + msg->pushes.push_back(std::move(push_op)); + return shard_services.send_to_osd(from.osd, std::move(msg), + pg.get_osdmap_epoch()); + }); + }); + }); +} + +RecoveryBackend::interruptible_future<bool> +ReplicatedRecoveryBackend::_handle_pull_response( + pg_shard_t from, + PushOp& push_op, + PullOp* response, + ceph::os::Transaction* t) +{ + logger().debug("handle_pull_response {} {} data.size() is {} data_included: {}", + push_op.recovery_info, push_op.after_progress, + push_op.data.length(), push_op.data_included); + + const hobject_t &hoid = push_op.soid; + auto& recovery_waiter = get_recovering(hoid); + auto& pull_info = *recovery_waiter.pull_info; + if (pull_info.recovery_info.size == (uint64_t(-1))) { + pull_info.recovery_info.size = push_op.recovery_info.size; + pull_info.recovery_info.copy_subset.intersection_of( + push_op.recovery_info.copy_subset); + } + + // If primary doesn't have object info and didn't know version + if (pull_info.recovery_info.version == eversion_t()) + pull_info.recovery_info.version = push_op.version; + + auto prepare_waiter = interruptor::make_interruptible( + seastar::make_ready_future<>()); + if (pull_info.recovery_progress.first) { + prepare_waiter = pg.obc_loader.with_obc<RWState::RWNONE>( + pull_info.recovery_info.soid, + [&pull_info, &recovery_waiter, &push_op](auto obc) { + pull_info.obc = obc; + recovery_waiter.obc = obc; + 
obc->obs.oi.decode_no_oid(push_op.attrset.at(OI_ATTR), push_op.soid); + pull_info.recovery_info.oi = obc->obs.oi; + return crimson::osd::PG::load_obc_ertr::now(); + }).handle_error_interruptible(crimson::ct_error::assert_all{}); + }; + return prepare_waiter.then_interruptible( + [this, &pull_info, &push_op, t, response]() mutable { + const bool first = pull_info.recovery_progress.first; + pull_info.recovery_progress = push_op.after_progress; + logger().debug("new recovery_info {}, new progress {}", + pull_info.recovery_info, pull_info.recovery_progress); + interval_set<uint64_t> data_zeros; + { + uint64_t offset = push_op.before_progress.data_recovered_to; + uint64_t length = (push_op.after_progress.data_recovered_to - + push_op.before_progress.data_recovered_to); + if (length) { + data_zeros.insert(offset, length); + } + } + auto [usable_intervals, data] = + trim_pushed_data(pull_info.recovery_info.copy_subset, + push_op.data_included, push_op.data); + bool complete = pull_info.is_complete(); + bool clear_omap = !push_op.before_progress.omap_complete; + return submit_push_data(pull_info.recovery_info, + first, complete, clear_omap, + std::move(data_zeros), std::move(usable_intervals), + std::move(data), std::move(push_op.omap_header), + push_op.attrset, std::move(push_op.omap_entries), t) + .then_interruptible( + [this, response, &pull_info, &push_op, complete, + t, bytes_recovered=data.length()] { + pull_info.stat.num_keys_recovered += push_op.omap_entries.size(); + pull_info.stat.num_bytes_recovered += bytes_recovered; + + if (complete) { + pull_info.stat.num_objects_recovered++; + pg.get_recovery_handler()->on_local_recover( + push_op.soid, get_recovering(push_op.soid).pull_info->recovery_info, + false, *t); + return true; + } else { + response->soid = push_op.soid; + response->recovery_info = pull_info.recovery_info; + response->recovery_progress = pull_info.recovery_progress; + return false; + } + }); + }); +} + +RecoveryBackend::interruptible_future<> +ReplicatedRecoveryBackend::handle_pull_response( + Ref<MOSDPGPush> m) +{ + if (pg.can_discard_replica_op(*m)) { + logger().debug("{}: discarding {}", __func__, *m); + return seastar::now(); + } + const PushOp& push_op = m->pushes[0]; //TODO: only one push per message for now. + if (push_op.version == eversion_t()) { + // replica doesn't have it! 
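+ // A default-constructed (zero) version in the PushOp means the source shard
+ // could not supply the object: record the failed pull and surface an error.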
+ pg.get_recovery_handler()->on_failed_recover({ m->from }, push_op.soid, + get_recovering(push_op.soid).pull_info->recovery_info.version); + return seastar::make_exception_future<>( + std::runtime_error(fmt::format( + "Error on pushing side {} when pulling obj {}", + m->from, push_op.soid))); + } + + logger().debug("{}: {}", __func__, *m); + return seastar::do_with(PullOp(), [this, m](auto& response) { + return seastar::do_with(ceph::os::Transaction(), m.get(), + [this, &response](auto& t, auto& m) { + pg_shard_t from = m->from; + PushOp& push_op = m->pushes[0]; // only one push per message for now + return _handle_pull_response(from, push_op, &response, &t + ).then_interruptible( + [this, &t](bool complete) { + epoch_t epoch_frozen = pg.get_osdmap_epoch(); + logger().debug("ReplicatedRecoveryBackend::handle_pull_response: do_transaction..."); + return shard_services.get_store().do_transaction(coll, std::move(t)) + .then([this, epoch_frozen, complete, + last_complete = pg.get_info().last_complete] { + pg.get_recovery_handler()->_committed_pushed_object(epoch_frozen, last_complete); + return seastar::make_ready_future<bool>(complete); + }); + }); + }).then_interruptible([this, m, &response](bool complete) { + if (complete) { + auto& push_op = m->pushes[0]; + get_recovering(push_op.soid).set_pulled(); + return seastar::make_ready_future<>(); + } else { + auto reply = crimson::make_message<MOSDPGPull>(); + reply->from = pg.get_pg_whoami(); + reply->set_priority(m->get_priority()); + reply->pgid = pg.get_info().pgid; + reply->map_epoch = m->map_epoch; + reply->min_epoch = m->min_epoch; + reply->set_pulls({std::move(response)}); + return shard_services.send_to_osd(m->from.osd, std::move(reply), pg.get_osdmap_epoch()); + } + }); + }); +} + +RecoveryBackend::interruptible_future<> +ReplicatedRecoveryBackend::_handle_push( + pg_shard_t from, + PushOp &push_op, + PushReplyOp *response, + ceph::os::Transaction *t) +{ + logger().debug("{}", __func__); + + bool first = push_op.before_progress.first; + interval_set<uint64_t> data_zeros; + { + uint64_t offset = push_op.before_progress.data_recovered_to; + uint64_t length = (push_op.after_progress.data_recovered_to - + push_op.before_progress.data_recovered_to); + if (length) { + data_zeros.insert(offset, length); + } + } + bool complete = (push_op.after_progress.data_complete && + push_op.after_progress.omap_complete); + bool clear_omap = !push_op.before_progress.omap_complete; + response->soid = push_op.recovery_info.soid; + + return submit_push_data(push_op.recovery_info, first, complete, clear_omap, + std::move(data_zeros), + std::move(push_op.data_included), + std::move(push_op.data), + std::move(push_op.omap_header), + push_op.attrset, + std::move(push_op.omap_entries), t) + .then_interruptible( + [this, complete, &push_op, t] { + if (complete) { + pg.get_recovery_handler()->on_local_recover( + push_op.recovery_info.soid, push_op.recovery_info, + false, *t); + } + }); +} + +RecoveryBackend::interruptible_future<> +ReplicatedRecoveryBackend::handle_push( + Ref<MOSDPGPush> m) +{ + if (pg.can_discard_replica_op(*m)) { + logger().debug("{}: discarding {}", __func__, *m); + return seastar::now(); + } + if (pg.is_primary()) { + return handle_pull_response(m); + } + + logger().debug("{}: {}", __func__, *m); + return seastar::do_with(PushReplyOp(), [this, m](auto& response) { + PushOp& push_op = m->pushes[0]; // TODO: only one push per message for now + return seastar::do_with(ceph::os::Transaction(), + [this, m, &push_op, &response](auto& t) { + return 
_handle_push(m->from, push_op, &response, &t).then_interruptible( + [this, &t] { + epoch_t epoch_frozen = pg.get_osdmap_epoch(); + logger().debug("ReplicatedRecoveryBackend::handle_push: do_transaction..."); + return interruptor::make_interruptible( + shard_services.get_store().do_transaction(coll, std::move(t))).then_interruptible( + [this, epoch_frozen, last_complete = pg.get_info().last_complete] { + //TODO: this should be grouped with pg.on_local_recover somehow. + pg.get_recovery_handler()->_committed_pushed_object(epoch_frozen, last_complete); + }); + }); + }).then_interruptible([this, m, &response]() mutable { + auto reply = crimson::make_message<MOSDPGPushReply>(); + reply->from = pg.get_pg_whoami(); + reply->set_priority(m->get_priority()); + reply->pgid = pg.get_info().pgid; + reply->map_epoch = m->map_epoch; + reply->min_epoch = m->min_epoch; + std::vector<PushReplyOp> replies = { std::move(response) }; + reply->replies.swap(replies); + return shard_services.send_to_osd(m->from.osd, + std::move(reply), pg.get_osdmap_epoch()); + }); + }); +} + +RecoveryBackend::interruptible_future<std::optional<PushOp>> +ReplicatedRecoveryBackend::_handle_push_reply( + pg_shard_t peer, + const PushReplyOp &op) +{ + const hobject_t& soid = op.soid; + logger().debug("{}, soid {}, from {}", __func__, soid, peer); + auto recovering_iter = recovering.find(soid); + if (recovering_iter == recovering.end() + || !recovering_iter->second->pushing.count(peer)) { + logger().debug("huh, i wasn't pushing {} to osd.{}", soid, peer); + return seastar::make_ready_future<std::optional<PushOp>>(); + } else { + auto& push_info = recovering_iter->second->pushing[peer]; + bool error = push_info.recovery_progress.error; + if (!push_info.recovery_progress.data_complete && !error) { + return build_push_op(push_info.recovery_info, push_info.recovery_progress, + &push_info.stat + ).then_interruptible([&push_info] (auto push_op) { + push_info.recovery_progress = push_op.after_progress; + return seastar::make_ready_future<std::optional<PushOp>>( + std::move(push_op)); + }).handle_exception_interruptible( + [recovering_iter, &push_info, peer] (auto e) { + push_info.recovery_progress.error = true; + recovering_iter->second->set_push_failed(peer, e); + return seastar::make_ready_future<std::optional<PushOp>>(); + }); + } + if (!error) { + pg.get_recovery_handler()->on_peer_recover(peer, + soid, + push_info.recovery_info); + } + recovering_iter->second->set_pushed(peer); + return seastar::make_ready_future<std::optional<PushOp>>(); + } +} + +RecoveryBackend::interruptible_future<> +ReplicatedRecoveryBackend::handle_push_reply( + Ref<MOSDPGPushReply> m) +{ + logger().debug("{}: {}", __func__, *m); + auto from = m->from; + auto& push_reply = m->replies[0]; //TODO: only one reply per message + + return _handle_push_reply(from, push_reply).then_interruptible( + [this, from](std::optional<PushOp> push_op) { + if (push_op) { + auto msg = crimson::make_message<MOSDPGPush>(); + msg->from = pg.get_pg_whoami(); + msg->pgid = pg.get_pgid(); + msg->map_epoch = pg.get_osdmap_epoch(); + msg->min_epoch = pg.get_last_peering_reset(); + msg->set_priority(pg.get_recovery_op_priority()); + msg->pushes.push_back(std::move(*push_op)); + return shard_services.send_to_osd(from.osd, + std::move(msg), + pg.get_osdmap_epoch()); + } else { + return seastar::make_ready_future<>(); + } + }); +} + +std::pair<interval_set<uint64_t>, + bufferlist> +ReplicatedRecoveryBackend::trim_pushed_data( + const interval_set<uint64_t> ©_subset, + const 
interval_set<uint64_t> &intervals_received, + ceph::bufferlist data_received) +{ + logger().debug("{}", __func__); + // what i have is only a subset of what i want + if (intervals_received.subset_of(copy_subset)) { + return {intervals_received, data_received}; + } + // only collect the extents included by copy_subset and intervals_received + interval_set<uint64_t> intervals_usable; + bufferlist data_usable; + intervals_usable.intersection_of(copy_subset, intervals_received); + uint64_t have_off = 0; + for (auto [have_start, have_len] : intervals_received) { + interval_set<uint64_t> want; + want.insert(have_start, have_len); + want.intersection_of(copy_subset); + for (auto [want_start, want_len] : want) { + bufferlist sub; + uint64_t data_off = have_off + (want_start - have_start); + sub.substr_of(data_received, data_off, want_len); + data_usable.claim_append(sub); + } + have_off += have_len; + } + return {intervals_usable, data_usable}; +} + +RecoveryBackend::interruptible_future<hobject_t> +ReplicatedRecoveryBackend::prep_push_target( + const ObjectRecoveryInfo& recovery_info, + bool first, + bool complete, + bool clear_omap, + ObjectStore::Transaction* t, + const map<string, bufferlist, less<>>& attrs, + bufferlist&& omap_header) +{ + if (!first) { + return seastar::make_ready_future<hobject_t>( + get_temp_recovery_object(recovery_info.soid, + recovery_info.version)); + } + + ghobject_t target_oid; + if (complete) { + // overwrite the original object + target_oid = ghobject_t(recovery_info.soid); + } else { + target_oid = ghobject_t(get_temp_recovery_object(recovery_info.soid, + recovery_info.version)); + logger().debug("{}: Adding oid {} in the temp collection", + __func__, target_oid); + add_temp_obj(target_oid.hobj); + } + // create a new object + if (!complete || !recovery_info.object_exist) { + t->remove(coll->get_cid(), target_oid); + t->touch(coll->get_cid(), target_oid); + object_info_t oi; + oi.decode_no_oid(attrs.at(OI_ATTR)); + t->set_alloc_hint(coll->get_cid(), target_oid, + oi.expected_object_size, + oi.expected_write_size, + oi.alloc_hint_flags); + } + if (complete) { + // remove xattr and update later if overwrite on original object + t->rmattrs(coll->get_cid(), target_oid); + // if need update omap, clear the previous content first + if (clear_omap) { + t->omap_clear(coll->get_cid(), target_oid); + } + } + t->truncate(coll->get_cid(), target_oid, recovery_info.size); + if (omap_header.length()) { + t->omap_setheader(coll->get_cid(), target_oid, omap_header); + } + if (complete || !recovery_info.object_exist) { + return seastar::make_ready_future<hobject_t>(target_oid.hobj); + } + // clone overlap content in local object if using a new object + return interruptor::make_interruptible(store->stat(coll, ghobject_t(recovery_info.soid))) + .then_interruptible( + [this, &recovery_info, t, target_oid] (auto st) { + // TODO: pg num bytes counting + uint64_t local_size = std::min(recovery_info.size, (uint64_t)st.st_size); + interval_set<uint64_t> local_intervals_included, local_intervals_excluded; + if (local_size) { + local_intervals_included.insert(0, local_size); + local_intervals_excluded.intersection_of(local_intervals_included, recovery_info.copy_subset); + local_intervals_included.subtract(local_intervals_excluded); + } + for (auto [off, len] : local_intervals_included) { + logger().debug(" clone_range {} {}~{}", + recovery_info.soid, off, len); + t->clone_range(coll->get_cid(), ghobject_t(recovery_info.soid), + target_oid, off, len, off); + } + return 
seastar::make_ready_future<hobject_t>(target_oid.hobj); + }); +} +RecoveryBackend::interruptible_future<> +ReplicatedRecoveryBackend::submit_push_data( + const ObjectRecoveryInfo &recovery_info, + bool first, + bool complete, + bool clear_omap, + interval_set<uint64_t>&& data_zeros, + interval_set<uint64_t>&& intervals_included, + bufferlist&& data_included, + bufferlist&& omap_header, + const map<string, bufferlist, less<>> &attrs, + map<string, bufferlist>&& omap_entries, + ObjectStore::Transaction *t) +{ + logger().debug("{}", __func__); + return prep_push_target(recovery_info, first, complete, + clear_omap, t, attrs, + std::move(omap_header)).then_interruptible( + [this, + &recovery_info, t, + first, complete, + data_zeros=std::move(data_zeros), + intervals_included=std::move(intervals_included), + data_included=std::move(data_included), + omap_entries=std::move(omap_entries), + &attrs](auto target_oid) mutable { + + uint32_t fadvise_flags = CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL; + // Punch zeros for data, if fiemap indicates nothing but it is marked dirty + if (!data_zeros.empty()) { + data_zeros.intersection_of(recovery_info.copy_subset); + assert(intervals_included.subset_of(data_zeros)); + data_zeros.subtract(intervals_included); + + logger().debug("submit_push_data recovering object {} copy_subset: {} " + "intervals_included: {} data_zeros: {}", + recovery_info.soid, recovery_info.copy_subset, + intervals_included, data_zeros); + + for (auto [start, len] : data_zeros) { + t->zero(coll->get_cid(), ghobject_t(target_oid), start, len); + } + } + uint64_t off = 0; + for (auto [start, len] : intervals_included) { + bufferlist bit; + bit.substr_of(data_included, off, len); + t->write(coll->get_cid(), ghobject_t(target_oid), + start, len, bit, fadvise_flags); + off += len; + } + + if (!omap_entries.empty()) + t->omap_setkeys(coll->get_cid(), ghobject_t(target_oid), omap_entries); + if (!attrs.empty()) + t->setattrs(coll->get_cid(), ghobject_t(target_oid), attrs); + + if (complete) { + if (!first) { + logger().debug("submit_push_data: Removing oid {} from the temp collection", + target_oid); + clear_temp_obj(target_oid); + t->remove(coll->get_cid(), ghobject_t(recovery_info.soid)); + t->collection_move_rename(coll->get_cid(), ghobject_t(target_oid), + coll->get_cid(), ghobject_t(recovery_info.soid)); + } + submit_push_complete(recovery_info, t); + } + logger().debug("submit_push_data: done"); + return seastar::make_ready_future<>(); + }); +} + +void ReplicatedRecoveryBackend::submit_push_complete( + const ObjectRecoveryInfo &recovery_info, + ObjectStore::Transaction *t) +{ + for (const auto& [oid, extents] : recovery_info.clone_subset) { + for (const auto& [off, len] : extents) { + logger().debug(" clone_range {} {}~{}", oid, off, len); + t->clone_range(coll->get_cid(), ghobject_t(oid), ghobject_t(recovery_info.soid), + off, len, off); + } + } +} + +RecoveryBackend::interruptible_future<> +ReplicatedRecoveryBackend::handle_recovery_delete_reply( + Ref<MOSDPGRecoveryDeleteReply> m) +{ + auto& p = m->objects.front(); + hobject_t soid = p.first; + ObjectRecoveryInfo recovery_info; + recovery_info.version = p.second; + pg.get_recovery_handler()->on_peer_recover(m->from, soid, recovery_info); + get_recovering(soid).set_pushed(m->from); + return seastar::now(); +} + +RecoveryBackend::interruptible_future<> +ReplicatedRecoveryBackend::handle_recovery_op( + Ref<MOSDFastDispatchOp> m, + crimson::net::ConnectionRef conn) +{ + switch (m->get_header().type) { + case MSG_OSD_PG_PULL: + return 
handle_pull(boost::static_pointer_cast<MOSDPGPull>(m)); + case MSG_OSD_PG_PUSH: + return handle_push(boost::static_pointer_cast<MOSDPGPush>(m)); + case MSG_OSD_PG_PUSH_REPLY: + return handle_push_reply( + boost::static_pointer_cast<MOSDPGPushReply>(m)); + case MSG_OSD_PG_RECOVERY_DELETE: + return handle_recovery_delete( + boost::static_pointer_cast<MOSDPGRecoveryDelete>(m)); + case MSG_OSD_PG_RECOVERY_DELETE_REPLY: + return handle_recovery_delete_reply( + boost::static_pointer_cast<MOSDPGRecoveryDeleteReply>(m)); + default: + // delegate to parent class for handling backend-agnostic recovery ops. + return RecoveryBackend::handle_recovery_op(std::move(m), conn); + } +} + diff --git a/src/crimson/osd/replicated_recovery_backend.h b/src/crimson/osd/replicated_recovery_backend.h new file mode 100644 index 000000000..b023b7417 --- /dev/null +++ b/src/crimson/osd/replicated_recovery_backend.h @@ -0,0 +1,169 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "crimson/common/interruptible_future.h" +#include "crimson/osd/pg_interval_interrupt_condition.h" +#include "crimson/osd/recovery_backend.h" + +#include "messages/MOSDPGPull.h" +#include "messages/MOSDPGPush.h" +#include "messages/MOSDPGPushReply.h" +#include "messages/MOSDPGRecoveryDelete.h" +#include "messages/MOSDPGRecoveryDeleteReply.h" +#include "os/ObjectStore.h" + +class ReplicatedRecoveryBackend : public RecoveryBackend { +public: + ReplicatedRecoveryBackend(crimson::osd::PG& pg, + crimson::osd::ShardServices& shard_services, + crimson::os::CollectionRef coll, + PGBackend* backend) + : RecoveryBackend(pg, shard_services, coll, backend) + {} + interruptible_future<> handle_recovery_op( + Ref<MOSDFastDispatchOp> m, + crimson::net::ConnectionRef conn) final; + + interruptible_future<> recover_object( + const hobject_t& soid, + eversion_t need) final; + interruptible_future<> recover_delete( + const hobject_t& soid, + eversion_t need) final; + interruptible_future<> push_delete( + const hobject_t& soid, + eversion_t need) final; +protected: + interruptible_future<> handle_pull( + Ref<MOSDPGPull> m); + interruptible_future<> handle_pull_response( + Ref<MOSDPGPush> m); + interruptible_future<> handle_push( + Ref<MOSDPGPush> m); + interruptible_future<> handle_push_reply( + Ref<MOSDPGPushReply> m); + interruptible_future<> handle_recovery_delete( + Ref<MOSDPGRecoveryDelete> m); + interruptible_future<> handle_recovery_delete_reply( + Ref<MOSDPGRecoveryDeleteReply> m); + interruptible_future<PushOp> prep_push( + const hobject_t& soid, + eversion_t need, + pg_shard_t pg_shard); + void prepare_pull( + PullOp& pull_op, + pull_info_t& pull_info, + const hobject_t& soid, + eversion_t need); + std::vector<pg_shard_t> get_shards_to_push( + const hobject_t& soid) const; + interruptible_future<PushOp> build_push_op( + const ObjectRecoveryInfo& recovery_info, + const ObjectRecoveryProgress& progress, + object_stat_sum_t* stat); + /// @returns true if this push op is the last push op for + /// recovery @c pop.soid + interruptible_future<bool> _handle_pull_response( + pg_shard_t from, + PushOp& push_op, + PullOp* response, + ceph::os::Transaction* t); + std::pair<interval_set<uint64_t>, ceph::bufferlist> trim_pushed_data( + const interval_set<uint64_t> ©_subset, + const interval_set<uint64_t> &intervals_received, + ceph::bufferlist data_received); + interruptible_future<> submit_push_data( + const ObjectRecoveryInfo &recovery_info, + bool first, + bool complete, + bool 
clear_omap, + interval_set<uint64_t>&& data_zeros, + interval_set<uint64_t>&& intervals_included, + ceph::bufferlist&& data_included, + ceph::bufferlist&& omap_header, + const std::map<std::string, bufferlist, std::less<>> &attrs, + std::map<std::string, bufferlist>&& omap_entries, + ceph::os::Transaction *t); + void submit_push_complete( + const ObjectRecoveryInfo &recovery_info, + ObjectStore::Transaction *t); + interruptible_future<> _handle_push( + pg_shard_t from, + PushOp& push_op, + PushReplyOp *response, + ceph::os::Transaction *t); + interruptible_future<std::optional<PushOp>> _handle_push_reply( + pg_shard_t peer, + const PushReplyOp &op); + interruptible_future<> on_local_recover_persist( + const hobject_t& soid, + const ObjectRecoveryInfo& _recovery_info, + bool is_delete, + epoch_t epoch_to_freeze); + interruptible_future<> local_recover_delete( + const hobject_t& soid, + eversion_t need, + epoch_t epoch_frozen); + seastar::future<> on_stop() final { + return seastar::now(); + } +private: + /// pull missing object from peer + interruptible_future<> maybe_pull_missing_obj( + const hobject_t& soid, + eversion_t need); + + /// load object context for recovery if it is not ready yet + using load_obc_ertr = crimson::errorator< + crimson::ct_error::object_corrupted>; + using load_obc_iertr = + ::crimson::interruptible::interruptible_errorator< + ::crimson::osd::IOInterruptCondition, + load_obc_ertr>; + + interruptible_future<> maybe_push_shards( + const hobject_t& soid, + eversion_t need); + + /// read the data attached to given object. the size of them is supposed to + /// be relatively small. + /// + /// @return @c oi.version + interruptible_future<eversion_t> read_metadata_for_push_op( + const hobject_t& oid, + const ObjectRecoveryProgress& progress, + ObjectRecoveryProgress& new_progress, + eversion_t ver, + PushOp* push_op); + /// read the remaining extents of object to be recovered and fill push_op + /// with them + /// + /// @param oid object being recovered + /// @param copy_subset extents we want + /// @param offset the offset in object from where we should read + /// @return the new offset + interruptible_future<uint64_t> read_object_for_push_op( + const hobject_t& oid, + const interval_set<uint64_t>& copy_subset, + uint64_t offset, + uint64_t max_len, + PushOp* push_op); + interruptible_future<> read_omap_for_push_op( + const hobject_t& oid, + const ObjectRecoveryProgress& progress, + ObjectRecoveryProgress& new_progress, + uint64_t& max_len, + PushOp* push_op); + interruptible_future<hobject_t> prep_push_target( + const ObjectRecoveryInfo &recovery_info, + bool first, + bool complete, + bool clear_omap, + ObjectStore::Transaction* t, + const std::map<std::string, bufferlist, std::less<>> &attrs, + bufferlist&& omap_header); + using interruptor = crimson::interruptible::interruptor< + crimson::osd::IOInterruptCondition>; +}; diff --git a/src/crimson/osd/scheduler/mclock_scheduler.cc b/src/crimson/osd/scheduler/mclock_scheduler.cc new file mode 100644 index 000000000..006e4816c --- /dev/null +++ b/src/crimson/osd/scheduler/mclock_scheduler.cc @@ -0,0 +1,165 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 Red Hat Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#include <memory> +#include <functional> + +#include "crimson/osd/scheduler/mclock_scheduler.h" +#include "common/dout.h" + +namespace dmc = crimson::dmclock; +using namespace std::placeholders; + +#define dout_context cct +#define dout_subsys ceph_subsys_osd +#undef dout_prefix +#define dout_prefix *_dout + + +namespace crimson::osd::scheduler { + +mClockScheduler::mClockScheduler(ConfigProxy &conf) : + scheduler( + std::bind(&mClockScheduler::ClientRegistry::get_info, + &client_registry, + _1), + dmc::AtLimit::Allow, + conf.get_val<double>("osd_mclock_scheduler_anticipation_timeout")) +{ + conf.add_observer(this); + client_registry.update_from_config(conf); +} + +void mClockScheduler::ClientRegistry::update_from_config(const ConfigProxy &conf) +{ + default_external_client_info.update( + conf.get_val<double>("osd_mclock_scheduler_client_res"), + conf.get_val<uint64_t>("osd_mclock_scheduler_client_wgt"), + conf.get_val<double>("osd_mclock_scheduler_client_lim")); + + internal_client_infos[ + static_cast<size_t>(scheduler_class_t::background_recovery)].update( + conf.get_val<double>("osd_mclock_scheduler_background_recovery_res"), + conf.get_val<uint64_t>("osd_mclock_scheduler_background_recovery_wgt"), + conf.get_val<double>("osd_mclock_scheduler_background_recovery_lim")); + + internal_client_infos[ + static_cast<size_t>(scheduler_class_t::background_best_effort)].update( + conf.get_val<double>("osd_mclock_scheduler_background_best_effort_res"), + conf.get_val<uint64_t>("osd_mclock_scheduler_background_best_effort_wgt"), + conf.get_val<double>("osd_mclock_scheduler_background_best_effort_lim")); +} + +const dmc::ClientInfo *mClockScheduler::ClientRegistry::get_external_client( + const client_profile_id_t &client) const +{ + auto ret = external_client_infos.find(client); + if (ret == external_client_infos.end()) + return &default_external_client_info; + else + return &(ret->second); +} + +const dmc::ClientInfo *mClockScheduler::ClientRegistry::get_info( + const scheduler_id_t &id) const { + switch (id.class_id) { + case scheduler_class_t::immediate: + ceph_assert(0 == "Cannot schedule immediate"); + return (dmc::ClientInfo*)nullptr; + case scheduler_class_t::repop: + case scheduler_class_t::client: + return get_external_client(id.client_profile_id); + default: + ceph_assert(static_cast<size_t>(id.class_id) < internal_client_infos.size()); + return &internal_client_infos[static_cast<size_t>(id.class_id)]; + } +} + +void mClockScheduler::dump(ceph::Formatter &f) const +{ +} + +void mClockScheduler::enqueue(item_t&& item) +{ + auto id = get_scheduler_id(item); + auto cost = item.params.cost; + + if (scheduler_class_t::immediate == item.params.klass) { + immediate.push_front(std::move(item)); + } else { + scheduler.add_request( + std::move(item), + id, + cost); + } +} + +void mClockScheduler::enqueue_front(item_t&& item) +{ + immediate.push_back(std::move(item)); + // TODO: item may not be immediate, update mclock machinery to permit + // putting the item back in the queue +} + +item_t mClockScheduler::dequeue() +{ + if (!immediate.empty()) { + auto ret = std::move(immediate.back()); + immediate.pop_back(); + return ret; + } else { + mclock_queue_t::PullReq result = scheduler.pull_request(); + if (result.is_future()) { + ceph_assert( + 0 == "Not implemented, user would have to be able to be woken up"); + return std::move(*(item_t*)nullptr); + } else if (result.is_none()) { + ceph_assert( + 0 == "Impossible, must have checked empty() first"); + return std::move(*(item_t*)nullptr); 
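      // not reached: the ceph_assert above always fails and aborts before
      // this return is evaluated; the null-pointer "return" only provides a
      // value of the required type.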
+ } else { + ceph_assert(result.is_retn()); + + auto &retn = result.get_retn(); + return std::move(*retn.request); + } + } +} + +const char** mClockScheduler::get_tracked_conf_keys() const +{ + static const char* KEYS[] = { + "osd_mclock_scheduler_client_res", + "osd_mclock_scheduler_client_wgt", + "osd_mclock_scheduler_client_lim", + "osd_mclock_scheduler_background_recovery_res", + "osd_mclock_scheduler_background_recovery_wgt", + "osd_mclock_scheduler_background_recovery_lim", + "osd_mclock_scheduler_background_best_effort_res", + "osd_mclock_scheduler_background_best_effort_wgt", + "osd_mclock_scheduler_background_best_effort_lim", + NULL + }; + return KEYS; +} + +void mClockScheduler::handle_conf_change( + const ConfigProxy& conf, + const std::set<std::string> &changed) +{ + client_registry.update_from_config(conf); +} + +} diff --git a/src/crimson/osd/scheduler/mclock_scheduler.h b/src/crimson/osd/scheduler/mclock_scheduler.h new file mode 100644 index 000000000..153fc758b --- /dev/null +++ b/src/crimson/osd/scheduler/mclock_scheduler.h @@ -0,0 +1,125 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 Red Hat Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#pragma once + +#include <ostream> +#include <map> +#include <vector> + +#include "boost/variant.hpp" + +#include "dmclock/src/dmclock_server.h" + +#include "crimson/osd/scheduler/scheduler.h" +#include "common/config.h" +#include "common/ceph_context.h" + + +namespace crimson::osd::scheduler { + +using client_id_t = uint64_t; +using profile_id_t = uint64_t; + +struct client_profile_id_t { + client_id_t client_id; + profile_id_t profile_id; + auto operator<=>(const client_profile_id_t&) const = default; +}; + + +struct scheduler_id_t { + scheduler_class_t class_id; + client_profile_id_t client_profile_id; + auto operator<=>(const scheduler_id_t&) const = default; +}; + +/** + * Scheduler implementation based on mclock. 
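 *
 * Each scheduler class is backed by a dmclock reservation/weight/limit
 * triple read in ClientRegistry::update_from_config(): the client and
 * repop classes use the osd_mclock_scheduler_client_{res,wgt,lim} options
 * (via the default external-client profile), while background_recovery and
 * background_best_effort use their matching
 * osd_mclock_scheduler_background_*_{res,wgt,lim} options. A sketch of how
 * one such triple is built from the config:
 *
 *   crimson::dmclock::ClientInfo info(
 *     conf.get_val<double>("osd_mclock_scheduler_client_res"),   // reservation
 *     conf.get_val<uint64_t>("osd_mclock_scheduler_client_wgt"), // weight
 *     conf.get_val<double>("osd_mclock_scheduler_client_lim"));  // limit
 *
 * scheduler_class_t::immediate never enters the mclock queue; enqueue()
 * places such items on the `immediate` list and dequeue() drains that list
 * before pulling from the mclock queue.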
+ * + * TODO: explain configs + */ +class mClockScheduler : public Scheduler, md_config_obs_t { + + class ClientRegistry { + std::array< + crimson::dmclock::ClientInfo, + static_cast<size_t>(scheduler_class_t::client) + > internal_client_infos = { + // Placeholder, gets replaced with configured values + crimson::dmclock::ClientInfo(1, 1, 1), + crimson::dmclock::ClientInfo(1, 1, 1) + }; + + crimson::dmclock::ClientInfo default_external_client_info = {1, 1, 1}; + std::map<client_profile_id_t, + crimson::dmclock::ClientInfo> external_client_infos; + const crimson::dmclock::ClientInfo *get_external_client( + const client_profile_id_t &client) const; + public: + void update_from_config(const ConfigProxy &conf); + const crimson::dmclock::ClientInfo *get_info( + const scheduler_id_t &id) const; + } client_registry; + + using mclock_queue_t = crimson::dmclock::PullPriorityQueue< + scheduler_id_t, + item_t, + true, + true, + 2>; + mclock_queue_t scheduler; + std::list<item_t> immediate; + + static scheduler_id_t get_scheduler_id(const item_t &item) { + return scheduler_id_t{ + item.params.klass, + client_profile_id_t{ + item.params.owner, + 0 + } + }; + } + +public: + mClockScheduler(ConfigProxy &conf); + + // Enqueue op in the back of the regular queue + void enqueue(item_t &&item) final; + + // Enqueue the op in the front of the regular queue + void enqueue_front(item_t &&item) final; + + // Return an op to be dispatch + item_t dequeue() final; + + // Returns if the queue is empty + bool empty() const final { + return immediate.empty() && scheduler.empty(); + } + + // Formatted output of the queue + void dump(ceph::Formatter &f) const final; + + void print(std::ostream &ostream) const final { + ostream << "mClockScheduler"; + } + + const char** get_tracked_conf_keys() const final; + void handle_conf_change(const ConfigProxy& conf, + const std::set<std::string> &changed) final; +}; + +} diff --git a/src/crimson/osd/scheduler/scheduler.cc b/src/crimson/osd/scheduler/scheduler.cc new file mode 100644 index 000000000..c85cb388e --- /dev/null +++ b/src/crimson/osd/scheduler/scheduler.cc @@ -0,0 +1,181 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2019 Red Hat Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include <ostream> + +#include <seastar/core/print.hh> + +#include "crimson/osd/scheduler/scheduler.h" +#include "crimson/osd/scheduler/mclock_scheduler.h" +#include "common/WeightedPriorityQueue.h" + +namespace crimson::osd::scheduler { + +std::ostream &operator<<(std::ostream &lhs, const scheduler_class_t &c) +{ + switch (c) { + case scheduler_class_t::background_best_effort: + return lhs << "background_best_effort"; + case scheduler_class_t::background_recovery: + return lhs << "background_recovery"; + case scheduler_class_t::client: + return lhs << "client"; + case scheduler_class_t::repop: + return lhs << "repop"; + case scheduler_class_t::immediate: + return lhs << "immediate"; + default: + return lhs; + } +} + +/** + * Implements Scheduler in terms of OpQueue + * + * Templated on queue type to avoid dynamic dispatch, T should implement + * OpQueue<Scheduleritem_t, client_t>. 
This adapter is mainly responsible for + * the boilerplate priority cutoff/strict concept which is needed for + * OpQueue based implementations. + */ +template <typename T> +class ClassedOpQueueScheduler final : public Scheduler { + const scheduler_class_t cutoff; + T queue; + + using priority_t = uint64_t; + std::array< + priority_t, + static_cast<size_t>(scheduler_class_t::immediate) + > priority_map = { + // Placeholder, gets replaced with configured values + 0, 0, 0 + }; + + static scheduler_class_t get_io_prio_cut(ConfigProxy &conf) { + if (conf.get_val<std::string>("osd_op_queue_cut_off") == "debug_random") { + srand(time(NULL)); + return (rand() % 2 < 1) ? + scheduler_class_t::repop : scheduler_class_t::immediate; + } else if (conf.get_val<std::string>("osd_op_queue_cut_off") == "high") { + return scheduler_class_t::immediate; + } else { + return scheduler_class_t::repop; + } + } + + bool use_strict(scheduler_class_t kl) const { + return static_cast<uint8_t>(kl) >= static_cast<uint8_t>(cutoff); + } + + priority_t get_priority(scheduler_class_t kl) const { + ceph_assert(static_cast<size_t>(kl) < + static_cast<size_t>(scheduler_class_t::immediate)); + return priority_map[static_cast<size_t>(kl)]; + } + +public: + template <typename... Args> + ClassedOpQueueScheduler(ConfigProxy &conf, Args&&... args) : + cutoff(get_io_prio_cut(conf)), + queue(std::forward<Args>(args)...) + { + priority_map[ + static_cast<size_t>(scheduler_class_t::background_best_effort) + ] = conf.get_val<uint64_t>("osd_scrub_priority"); + priority_map[ + static_cast<size_t>(scheduler_class_t::background_recovery) + ] = conf.get_val<uint64_t>("osd_recovery_op_priority"); + priority_map[ + static_cast<size_t>(scheduler_class_t::client) + ] = conf.get_val<uint64_t>("osd_client_op_priority"); + priority_map[ + static_cast<size_t>(scheduler_class_t::repop) + ] = conf.get_val<uint64_t>("osd_client_op_priority"); + } + + void enqueue(item_t &&item) final { + if (use_strict(item.params.klass)) + queue.enqueue_strict( + item.params.owner, get_priority(item.params.klass), std::move(item)); + else + queue.enqueue( + item.params.owner, get_priority(item.params.klass), + item.params.cost, std::move(item)); + } + + void enqueue_front(item_t &&item) final { + if (use_strict(item.params.klass)) + queue.enqueue_strict_front( + item.params.owner, get_priority(item.params.klass), std::move(item)); + else + queue.enqueue_front( + item.params.owner, get_priority(item.params.klass), + item.params.cost, std::move(item)); + } + + bool empty() const final { + return queue.empty(); + } + + item_t dequeue() final { + return queue.dequeue(); + } + + void dump(ceph::Formatter &f) const final { + return queue.dump(&f); + } + + void print(std::ostream &out) const final { + out << "ClassedOpQueueScheduler(queue="; + queue.print(out); + out << ", cutoff=" << cutoff << ")"; + } + + ~ClassedOpQueueScheduler() final {}; +}; + +SchedulerRef make_scheduler(ConfigProxy &conf) +{ + const std::string _type = conf.get_val<std::string>("osd_op_queue"); + const std::string *type = &_type; + if (*type == "debug_random") { + static const std::string index_lookup[] = { "mclock_scheduler", + "wpq" }; + srand(time(NULL)); + unsigned which = rand() % (sizeof(index_lookup) / sizeof(index_lookup[0])); + type = &index_lookup[which]; + } + + if (*type == "wpq" ) { + // default is 'wpq' + return std::make_unique< + ClassedOpQueueScheduler<WeightedPriorityQueue<item_t, client_t>>>( + conf, + conf.get_val<uint64_t>("osd_op_pq_max_tokens_per_priority"), + 
conf->osd_op_pq_min_cost + ); + } else if (*type == "mclock_scheduler") { + return std::make_unique<mClockScheduler>(conf); + } else { + ceph_assert("Invalid choice of wq" == 0); + return std::unique_ptr<mClockScheduler>(); + } +} + +std::ostream &operator<<(std::ostream &lhs, const Scheduler &rhs) { + rhs.print(lhs); + return lhs; +} + +} diff --git a/src/crimson/osd/scheduler/scheduler.h b/src/crimson/osd/scheduler/scheduler.h new file mode 100644 index 000000000..a014991ab --- /dev/null +++ b/src/crimson/osd/scheduler/scheduler.h @@ -0,0 +1,82 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2019 Red Hat Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#pragma once + +#include <seastar/core/future.hh> +#include <ostream> + +#include "crimson/common/config_proxy.h" + +namespace crimson::osd::scheduler { + +enum class scheduler_class_t : uint8_t { + background_best_effort = 0, + background_recovery, + client, + repop, + immediate, +}; + +std::ostream &operator<<(std::ostream &, const scheduler_class_t &); + +using client_t = uint64_t; +using cost_t = uint64_t; + +struct params_t { + cost_t cost = 1; + client_t owner; + scheduler_class_t klass; +}; + +struct item_t { + params_t params; + seastar::promise<> wake; +}; + +/** + * Base interface for classes responsible for choosing + * op processing order in the OSD. + */ +class Scheduler { +public: + // Enqueue op for scheduling + virtual void enqueue(item_t &&item) = 0; + + // Enqueue op for processing as though it were enqueued prior + // to other items already scheduled. 
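  // (mClockScheduler currently only approximates this: it appends the item
  // to its `immediate` list regardless of class; see the TODO in
  // mClockScheduler::enqueue_front.)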
+ virtual void enqueue_front(item_t &&item) = 0; + + // Returns true iff there are no ops scheduled + virtual bool empty() const = 0; + + // Return next op to be processed + virtual item_t dequeue() = 0; + + // Dump formatted representation for the queue + virtual void dump(ceph::Formatter &f) const = 0; + + // Print human readable brief description with relevant parameters + virtual void print(std::ostream &out) const = 0; + + // Destructor + virtual ~Scheduler() {}; +}; + +std::ostream &operator<<(std::ostream &lhs, const Scheduler &); +using SchedulerRef = std::unique_ptr<Scheduler>; + +SchedulerRef make_scheduler(ConfigProxy &); + +} diff --git a/src/crimson/osd/shard_services.cc b/src/crimson/osd/shard_services.cc new file mode 100644 index 000000000..a6431305d --- /dev/null +++ b/src/crimson/osd/shard_services.cc @@ -0,0 +1,761 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <boost/smart_ptr/make_local_shared.hpp> + +#include "crimson/osd/shard_services.h" + +#include "messages/MOSDAlive.h" +#include "messages/MOSDMap.h" +#include "messages/MOSDPGCreated.h" +#include "messages/MOSDPGTemp.h" + +#include "osd/osd_perf_counters.h" +#include "osd/PeeringState.h" +#include "crimson/common/config_proxy.h" +#include "crimson/mgr/client.h" +#include "crimson/mon/MonClient.h" +#include "crimson/net/Messenger.h" +#include "crimson/net/Connection.h" +#include "crimson/os/cyanstore/cyan_store.h" +#include "crimson/osd/osdmap_service.h" +#include "crimson/osd/osd_operations/pg_advance_map.h" +#include "crimson/osd/pg.h" +#include "crimson/osd/pg_meta.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_osd); + } +} + +using std::vector; + +namespace crimson::osd { + +PerShardState::PerShardState( + int whoami, + ceph::mono_time startup_time, + PerfCounters *perf, + PerfCounters *recoverystate_perf, + crimson::os::FuturizedStore &store, + OSDState &osd_state) + : whoami(whoami), + store(store.get_sharded_store()), + osd_state(osd_state), + osdmap_gate("PerShardState::osdmap_gate"), + perf(perf), recoverystate_perf(recoverystate_perf), + throttler(crimson::common::local_conf()), + next_tid( + static_cast<ceph_tid_t>(seastar::this_shard_id()) << + (std::numeric_limits<ceph_tid_t>::digits - 8)), + startup_time(startup_time) +{} + +seastar::future<> PerShardState::dump_ops_in_flight(Formatter *f) const +{ + registry.for_each_op([f](const auto &op) { + op.dump(f); + }); + return seastar::now(); +} + +seastar::future<> PerShardState::stop_pgs() +{ + assert_core(); + return seastar::parallel_for_each( + pg_map.get_pgs(), + [](auto& p) { + return p.second->stop(); + }); +} + +std::map<pg_t, pg_stat_t> PerShardState::get_pg_stats() const +{ + assert_core(); + std::map<pg_t, pg_stat_t> ret; + for (auto [pgid, pg] : pg_map.get_pgs()) { + if (pg->is_primary()) { + auto stats = pg->get_stats(); + // todo: update reported_epoch,reported_seq,last_fresh + stats.reported_epoch = osdmap->get_epoch(); + ret.emplace(pgid.pgid, std::move(stats)); + } + } + return ret; +} + +seastar::future<> PerShardState::broadcast_map_to_pgs( + ShardServices &shard_services, + epoch_t epoch) +{ + assert_core(); + auto &pgs = pg_map.get_pgs(); + return seastar::parallel_for_each( + pgs.begin(), pgs.end(), + [=, &shard_services](auto& pg) { + return shard_services.start_operation<PGAdvanceMap>( + shard_services, + pg.second, epoch, + PeeringCtx{}, false).second; + }); +} + +Ref<PG> PerShardState::get_pg(spg_t pgid) +{ + 
assert_core(); + return pg_map.get_pg(pgid); +} + +HeartbeatStampsRef PerShardState::get_hb_stamps(int peer) +{ + assert_core(); + auto [stamps, added] = heartbeat_stamps.try_emplace(peer); + if (added) { + stamps->second = ceph::make_ref<HeartbeatStamps>(peer); + } + return stamps->second; +} + +OSDSingletonState::OSDSingletonState( + int whoami, + crimson::net::Messenger &cluster_msgr, + crimson::net::Messenger &public_msgr, + crimson::mon::Client &monc, + crimson::mgr::Client &mgrc) + : whoami(whoami), + cluster_msgr(cluster_msgr), + public_msgr(public_msgr), + monc(monc), + mgrc(mgrc), + local_reserver( + &cct, + &finisher, + crimson::common::local_conf()->osd_max_backfills, + crimson::common::local_conf()->osd_min_recovery_priority), + remote_reserver( + &cct, + &finisher, + crimson::common::local_conf()->osd_max_backfills, + crimson::common::local_conf()->osd_min_recovery_priority), + snap_reserver( + &cct, + &finisher, + crimson::common::local_conf()->osd_max_trimming_pgs) +{ + crimson::common::local_conf().add_observer(this); + osdmaps[0] = boost::make_local_shared<OSDMap>(); + + perf = build_osd_logger(&cct); + cct.get_perfcounters_collection()->add(perf); + + recoverystate_perf = build_recoverystate_perf(&cct); + cct.get_perfcounters_collection()->add(recoverystate_perf); +} + +seastar::future<> OSDSingletonState::send_to_osd( + int peer, MessageURef m, epoch_t from_epoch) +{ + if (osdmap->is_down(peer)) { + logger().info("{}: osd.{} is_down", __func__, peer); + return seastar::now(); + } else if (osdmap->get_info(peer).up_from > from_epoch) { + logger().info("{}: osd.{} {} > {}", __func__, peer, + osdmap->get_info(peer).up_from, from_epoch); + return seastar::now(); + } else { + auto conn = cluster_msgr.connect( + osdmap->get_cluster_addrs(peer).front(), CEPH_ENTITY_TYPE_OSD); + return conn->send(std::move(m)); + } +} + +seastar::future<> OSDSingletonState::osdmap_subscribe( + version_t epoch, bool force_request) +{ + logger().info("{}({})", __func__, epoch); + if (monc.sub_want_increment("osdmap", epoch, CEPH_SUBSCRIBE_ONETIME) || + force_request) { + return monc.renew_subs(); + } else { + return seastar::now(); + } +} + +void OSDSingletonState::queue_want_pg_temp( + pg_t pgid, + const vector<int>& want, + bool forced) +{ + auto p = pg_temp_pending.find(pgid); + if (p == pg_temp_pending.end() || + p->second.acting != want || + forced) { + pg_temp_wanted[pgid] = {want, forced}; + } +} + +void OSDSingletonState::remove_want_pg_temp(pg_t pgid) +{ + pg_temp_wanted.erase(pgid); + pg_temp_pending.erase(pgid); +} + +void OSDSingletonState::requeue_pg_temp() +{ + unsigned old_wanted = pg_temp_wanted.size(); + unsigned old_pending = pg_temp_pending.size(); + pg_temp_wanted.merge(pg_temp_pending); + pg_temp_pending.clear(); + logger().debug( + "{}: {} + {} -> {}", + __func__ , + old_wanted, + old_pending, + pg_temp_wanted.size()); +} + +seastar::future<> OSDSingletonState::send_pg_temp() +{ + if (pg_temp_wanted.empty()) + return seastar::now(); + logger().debug("{}: {}", __func__, pg_temp_wanted); + MURef<MOSDPGTemp> ms[2] = {nullptr, nullptr}; + for (auto& [pgid, pg_temp] : pg_temp_wanted) { + auto& m = ms[pg_temp.forced]; + if (!m) { + m = crimson::make_message<MOSDPGTemp>(osdmap->get_epoch()); + m->forced = pg_temp.forced; + } + m->pg_temp.emplace(pgid, pg_temp.acting); + } + pg_temp_pending.merge(pg_temp_wanted); + pg_temp_wanted.clear(); + return seastar::parallel_for_each(std::begin(ms), std::end(ms), + [this](auto& m) { + if (m) { + return monc.send_message(std::move(m)); + } else 
{ + return seastar::now(); + } + }); +} + +std::ostream& operator<<( + std::ostream& out, + const OSDSingletonState::pg_temp_t& pg_temp) +{ + out << pg_temp.acting; + if (pg_temp.forced) { + out << " (forced)"; + } + return out; +} + +seastar::future<> OSDSingletonState::send_pg_created(pg_t pgid) +{ + logger().debug(__func__); + auto o = get_osdmap(); + ceph_assert(o->require_osd_release >= ceph_release_t::luminous); + pg_created.insert(pgid); + return monc.send_message(crimson::make_message<MOSDPGCreated>(pgid)); +} + +seastar::future<> OSDSingletonState::send_pg_created() +{ + logger().debug(__func__); + auto o = get_osdmap(); + ceph_assert(o->require_osd_release >= ceph_release_t::luminous); + return seastar::parallel_for_each(pg_created, + [this](auto &pgid) { + return monc.send_message(crimson::make_message<MOSDPGCreated>(pgid)); + }); +} + +void OSDSingletonState::prune_pg_created() +{ + logger().debug(__func__); + auto o = get_osdmap(); + auto i = pg_created.begin(); + while (i != pg_created.end()) { + auto p = o->get_pg_pool(i->pool()); + if (!p || !p->has_flag(pg_pool_t::FLAG_CREATING)) { + logger().debug("{} pruning {}", __func__, *i); + i = pg_created.erase(i); + } else { + logger().debug(" keeping {}", __func__, *i); + ++i; + } + } +} + +seastar::future<> OSDSingletonState::send_alive(const epoch_t want) +{ + logger().info( + "{} want={} up_thru_wanted={}", + __func__, + want, + up_thru_wanted); + + if (want > up_thru_wanted) { + up_thru_wanted = want; + } else { + logger().debug("{} want={} <= up_thru_wanted={}; skipping", + __func__, want, up_thru_wanted); + return seastar::now(); + } + if (!osdmap->exists(whoami)) { + logger().warn("{} DNE", __func__); + return seastar::now(); + } if (const epoch_t up_thru = osdmap->get_up_thru(whoami); + up_thru_wanted > up_thru) { + logger().debug("{} up_thru_wanted={} up_thru={}", __func__, want, up_thru); + return monc.send_message( + crimson::make_message<MOSDAlive>(osdmap->get_epoch(), want)); + } else { + logger().debug("{} {} <= {}", __func__, want, osdmap->get_up_thru(whoami)); + return seastar::now(); + } +} + +const char** OSDSingletonState::get_tracked_conf_keys() const +{ + static const char* KEYS[] = { + "osd_max_backfills", + "osd_min_recovery_priority", + "osd_max_trimming_pgs", + nullptr + }; + return KEYS; +} + +void OSDSingletonState::handle_conf_change( + const ConfigProxy& conf, + const std::set <std::string> &changed) +{ + if (changed.count("osd_max_backfills")) { + local_reserver.set_max(conf->osd_max_backfills); + remote_reserver.set_max(conf->osd_max_backfills); + } + if (changed.count("osd_min_recovery_priority")) { + local_reserver.set_min_priority(conf->osd_min_recovery_priority); + remote_reserver.set_min_priority(conf->osd_min_recovery_priority); + } + if (changed.count("osd_max_trimming_pgs")) { + snap_reserver.set_max(conf->osd_max_trimming_pgs); + } +} + +seastar::future<OSDSingletonState::local_cached_map_t> +OSDSingletonState::get_local_map(epoch_t e) +{ + // TODO: use LRU cache for managing osdmap, fallback to disk if we have to + if (auto found = osdmaps.find(e); found) { + logger().debug("{} osdmap.{} found in cache", __func__, e); + return seastar::make_ready_future<local_cached_map_t>(std::move(found)); + } else { + logger().debug("{} loading osdmap.{} from disk", __func__, e); + return load_map(e).then([e, this](std::unique_ptr<OSDMap> osdmap) { + return seastar::make_ready_future<local_cached_map_t>( + osdmaps.insert(e, std::move(osdmap))); + }); + } +} + +void OSDSingletonState::store_map_bl( + 
ceph::os::Transaction& t, + epoch_t e, bufferlist&& bl) +{ + meta_coll->store_map(t, e, bl); + map_bl_cache.insert(e, std::move(bl)); +} + +seastar::future<bufferlist> OSDSingletonState::load_map_bl( + epoch_t e) +{ + if (std::optional<bufferlist> found = map_bl_cache.find(e); found) { + logger().debug("{} osdmap.{} found in cache", __func__, e); + return seastar::make_ready_future<bufferlist>(*found); + } else { + logger().debug("{} loading osdmap.{} from disk", __func__, e); + return meta_coll->load_map(e); + } +} + +seastar::future<std::map<epoch_t, bufferlist>> OSDSingletonState::load_map_bls( + epoch_t first, + epoch_t last) +{ + logger().debug("{} loading maps [{},{}]", + __func__, first, last); + ceph_assert(first <= last); + return seastar::map_reduce(boost::make_counting_iterator<epoch_t>(first), + boost::make_counting_iterator<epoch_t>(last + 1), + [this](epoch_t e) { + return load_map_bl(e).then([e](auto&& bl) { + return seastar::make_ready_future<std::pair<epoch_t, bufferlist>>( + std::make_pair(e, std::move(bl))); + }); + }, + std::map<epoch_t, bufferlist>{}, + [](auto&& bls, auto&& epoch_bl) { + bls.emplace(std::move(epoch_bl)); + return std::move(bls); + }); +} + +seastar::future<std::unique_ptr<OSDMap>> OSDSingletonState::load_map(epoch_t e) +{ + auto o = std::make_unique<OSDMap>(); + logger().info("{} osdmap.{}", __func__, e); + if (e == 0) { + return seastar::make_ready_future<std::unique_ptr<OSDMap>>(std::move(o)); + } + return load_map_bl(e).then([o=std::move(o)](bufferlist bl) mutable { + o->decode(bl); + return seastar::make_ready_future<std::unique_ptr<OSDMap>>(std::move(o)); + }); +} + +seastar::future<> OSDSingletonState::store_maps(ceph::os::Transaction& t, + epoch_t start, Ref<MOSDMap> m) +{ + return seastar::do_for_each( + boost::make_counting_iterator(start), + boost::make_counting_iterator(m->get_last() + 1), + [&t, m, this](epoch_t e) { + if (auto p = m->maps.find(e); p != m->maps.end()) { + auto o = std::make_unique<OSDMap>(); + o->decode(p->second); + logger().info("store_maps storing osdmap.{}", e); + store_map_bl(t, e, std::move(std::move(p->second))); + osdmaps.insert(e, std::move(o)); + return seastar::now(); + } else if (auto p = m->incremental_maps.find(e); + p != m->incremental_maps.end()) { + logger().info("store_maps found osdmap.{} incremental map, " + "loading osdmap.{}", e, e - 1); + ceph_assert(std::cmp_greater(e, 0u)); + return load_map(e - 1).then([e, bl=p->second, &t, this](auto o) { + OSDMap::Incremental inc; + auto i = bl.cbegin(); + inc.decode(i); + o->apply_incremental(inc); + bufferlist fbl; + o->encode(fbl, inc.encode_features | CEPH_FEATURE_RESERVED); + logger().info("store_maps storing osdmap.{}", o->get_epoch()); + store_map_bl(t, e, std::move(fbl)); + osdmaps.insert(e, std::move(o)); + return seastar::now(); + }); + } else { + logger().error("MOSDMap lied about what maps it had?"); + return seastar::now(); + } + }); +} + +seastar::future<Ref<PG>> ShardServices::make_pg( + OSDMapService::cached_map_t create_map, + spg_t pgid, + bool do_create) +{ + using ec_profile_t = std::map<std::string, std::string>; + auto get_pool_info_for_pg = [create_map, pgid, this] { + if (create_map->have_pg_pool(pgid.pool())) { + pg_pool_t pi = *create_map->get_pg_pool(pgid.pool()); + std::string name = create_map->get_pool_name(pgid.pool()); + ec_profile_t ec_profile; + if (pi.is_erasure()) { + ec_profile = create_map->get_erasure_code_profile( + pi.erasure_code_profile); + } + return seastar::make_ready_future< + std::tuple<pg_pool_t,std::string, 
ec_profile_t> + >(std::make_tuple( + std::move(pi), + std::move(name), + std::move(ec_profile))); + } else { + // pool was deleted; grab final pg_pool_t off disk. + return get_pool_info(pgid.pool()); + } + }; + auto get_collection = [pgid, do_create, this] { + const coll_t cid{pgid}; + if (do_create) { + return get_store().create_new_collection(cid); + } else { + return get_store().open_collection(cid); + } + }; + return seastar::when_all( + std::move(get_pool_info_for_pg), + std::move(get_collection) + ).then([pgid, create_map, this](auto &&ret) { + auto [pool, name, ec_profile] = std::move(std::get<0>(ret).get0()); + auto coll = std::move(std::get<1>(ret).get0()); + return seastar::make_ready_future<Ref<PG>>( + new PG{ + pgid, + pg_shard_t{local_state.whoami, pgid.shard}, + std::move(coll), + std::move(pool), + std::move(name), + create_map, + *this, + ec_profile}); + }); +} + +seastar::future<Ref<PG>> ShardServices::handle_pg_create_info( + std::unique_ptr<PGCreateInfo> info) { + return seastar::do_with( + std::move(info), + [this](auto &info) + -> seastar::future<Ref<PG>> { + return get_map(info->epoch).then( + [&info, this](cached_map_t startmap) + -> seastar::future<std::tuple<Ref<PG>, cached_map_t>> { + const spg_t &pgid = info->pgid; + if (info->by_mon) { + int64_t pool_id = pgid.pgid.pool(); + const pg_pool_t *pool = get_map()->get_pg_pool(pool_id); + if (!pool) { + logger().debug( + "{} ignoring pgid {}, pool dne", + __func__, + pgid); + local_state.pg_map.pg_creation_canceled(pgid); + return seastar::make_ready_future< + std::tuple<Ref<PG>, OSDMapService::cached_map_t> + >(std::make_tuple(Ref<PG>(), startmap)); + } else if (!pool->is_crimson()) { + logger().debug( + "{} ignoring pgid {}, pool lacks crimson flag", + __func__, + pgid); + local_state.pg_map.pg_creation_canceled(pgid); + return seastar::make_ready_future< + std::tuple<Ref<PG>, OSDMapService::cached_map_t> + >(std::make_tuple(Ref<PG>(), startmap)); + } + ceph_assert(get_map()->require_osd_release >= + ceph_release_t::octopus); + if (!pool->has_flag(pg_pool_t::FLAG_CREATING)) { + // this ensures we do not process old creating messages after the + // pool's initial pgs have been created (and pg are subsequently + // allowed to split or merge). 
+ logger().debug( + "{} dropping {} create, pool does not have CREATING flag set", + __func__, + pgid); + local_state.pg_map.pg_creation_canceled(pgid); + return seastar::make_ready_future< + std::tuple<Ref<PG>, OSDMapService::cached_map_t> + >(std::make_tuple(Ref<PG>(), startmap)); + } + } + return make_pg( + startmap, pgid, true + ).then([startmap=std::move(startmap)](auto pg) mutable { + return seastar::make_ready_future< + std::tuple<Ref<PG>, OSDMapService::cached_map_t> + >(std::make_tuple(std::move(pg), std::move(startmap))); + }); + }).then([this, &info](auto &&ret) + ->seastar::future<Ref<PG>> { + auto [pg, startmap] = std::move(ret); + if (!pg) + return seastar::make_ready_future<Ref<PG>>(Ref<PG>()); + const pg_pool_t* pp = startmap->get_pg_pool(info->pgid.pool()); + + int up_primary, acting_primary; + vector<int> up, acting; + startmap->pg_to_up_acting_osds( + info->pgid.pgid, &up, &up_primary, &acting, &acting_primary); + + int role = startmap->calc_pg_role( + pg_shard_t(local_state.whoami, info->pgid.shard), + acting); + + PeeringCtx rctx; + create_pg_collection( + rctx.transaction, + info->pgid, + info->pgid.get_split_bits(pp->get_pg_num())); + init_pg_ondisk( + rctx.transaction, + info->pgid, + pp); + + pg->init( + role, + up, + up_primary, + acting, + acting_primary, + info->history, + info->past_intervals, + rctx.transaction); + + return start_operation<PGAdvanceMap>( + *this, pg, get_map()->get_epoch(), std::move(rctx), true + ).second.then([pg=pg] { + return seastar::make_ready_future<Ref<PG>>(pg); + }); + }); + }); +} + + +ShardServices::get_or_create_pg_ret +ShardServices::get_or_create_pg( + PGMap::PGCreationBlockingEvent::TriggerI&& trigger, + spg_t pgid, + std::unique_ptr<PGCreateInfo> info) +{ + if (info) { + auto [fut, creating] = local_state.pg_map.wait_for_pg( + std::move(trigger), pgid); + if (!creating) { + local_state.pg_map.set_creating(pgid); + (void)handle_pg_create_info( + std::move(info)); + } + return std::move(fut); + } else { + return get_or_create_pg_ret( + get_or_create_pg_ertr::ready_future_marker{}, + local_state.pg_map.get_pg(pgid)); + } +} + +ShardServices::wait_for_pg_ret +ShardServices::wait_for_pg( + PGMap::PGCreationBlockingEvent::TriggerI&& trigger, spg_t pgid) +{ + return local_state.pg_map.wait_for_pg(std::move(trigger), pgid).first; +} + +seastar::future<Ref<PG>> ShardServices::load_pg(spg_t pgid) + +{ + logger().debug("{}: {}", __func__, pgid); + + return seastar::do_with(PGMeta(get_store(), pgid), [](auto& pg_meta) { + return pg_meta.get_epoch(); + }).then([this](epoch_t e) { + return get_map(e); + }).then([pgid, this](auto&& create_map) { + return make_pg(std::move(create_map), pgid, false); + }).then([this](Ref<PG> pg) { + return pg->read_state(&get_store()).then([pg] { + return seastar::make_ready_future<Ref<PG>>(std::move(pg)); + }); + }).handle_exception([pgid](auto ep) { + logger().info("pg {} saw exception on load {}", pgid, ep); + ceph_abort("Could not load pg" == 0); + return seastar::make_exception_future<Ref<PG>>(ep); + }); +} + +seastar::future<> ShardServices::dispatch_context_transaction( + crimson::os::CollectionRef col, PeeringCtx &ctx) { + if (ctx.transaction.empty()) { + logger().debug("ShardServices::dispatch_context_transaction: empty transaction"); + return seastar::now(); + } + + logger().debug("ShardServices::dispatch_context_transaction: do_transaction ..."); + auto ret = get_store().do_transaction( + col, + std::move(ctx.transaction)); + ctx.reset_transaction(); + return ret; +} + +seastar::future<> 
ShardServices::dispatch_context_messages( + BufferedRecoveryMessages &&ctx) +{ + auto ret = seastar::parallel_for_each(std::move(ctx.message_map), + [this](auto& osd_messages) { + auto& [peer, messages] = osd_messages; + logger().debug("dispatch_context_messages sending messages to {}", peer); + return seastar::parallel_for_each( + std::move(messages), [=, peer=peer, this](auto& m) { + return send_to_osd(peer, std::move(m), local_state.osdmap->get_epoch()); + }); + }); + ctx.message_map.clear(); + return ret; +} + +seastar::future<> ShardServices::dispatch_context( + crimson::os::CollectionRef col, + PeeringCtx &&ctx) +{ + ceph_assert(col || ctx.transaction.empty()); + return seastar::when_all_succeed( + dispatch_context_messages( + BufferedRecoveryMessages{ctx}), + col ? dispatch_context_transaction(col, ctx) : seastar::now() + ).then_unpack([] { + return seastar::now(); + }); +} + +seastar::future<> OSDSingletonState::send_incremental_map( + crimson::net::Connection &conn, + epoch_t first) +{ + logger().info("{}: first osdmap: {} " + "superblock's oldest map: {}", + __func__, first, superblock.oldest_map); + if (first >= superblock.oldest_map) { + return load_map_bls( + first, superblock.newest_map + ).then([this, &conn, first](auto&& bls) { + auto m = crimson::make_message<MOSDMap>( + monc.get_fsid(), + osdmap->get_encoding_features()); + m->cluster_osdmap_trim_lower_bound = first; + m->newest_map = superblock.newest_map; + m->maps = std::move(bls); + return conn.send(std::move(m)); + }); + } else { + return load_map_bl(osdmap->get_epoch() + ).then([this, &conn](auto&& bl) mutable { + auto m = crimson::make_message<MOSDMap>( + monc.get_fsid(), + osdmap->get_encoding_features()); + /* TODO: once we support the tracking of superblock's + * cluster_osdmap_trim_lower_bound, the MOSDMap should + * be populated with this value instead of the oldest_map. + * See: OSD::handle_osd_map for how classic updates the + * cluster's trim lower bound. 
+ */ + m->cluster_osdmap_trim_lower_bound = superblock.oldest_map; + m->newest_map = superblock.newest_map; + m->maps.emplace(osdmap->get_epoch(), std::move(bl)); + return conn.send(std::move(m)); + }); + } +} + +seastar::future<> OSDSingletonState::send_incremental_map_to_osd( + int osd, + epoch_t first) +{ + if (osdmap->is_down(osd)) { + logger().info("{}: osd.{} is_down", __func__, osd); + return seastar::now(); + } else { + auto conn = cluster_msgr.connect( + osdmap->get_cluster_addrs(osd).front(), CEPH_ENTITY_TYPE_OSD); + return send_incremental_map(*conn, first); + } +} + +}; diff --git a/src/crimson/osd/shard_services.h b/src/crimson/osd/shard_services.h new file mode 100644 index 000000000..9b7553e7b --- /dev/null +++ b/src/crimson/osd/shard_services.h @@ -0,0 +1,589 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <memory> + +#include <boost/intrusive_ptr.hpp> +#include <seastar/core/future.hh> + +#include "include/common_fwd.h" +#include "osd_operation.h" +#include "msg/MessageRef.h" +#include "crimson/common/exception.h" +#include "crimson/common/shared_lru.h" +#include "crimson/os/futurized_collection.h" +#include "osd/PeeringState.h" +#include "crimson/osd/osdmap_service.h" +#include "crimson/osd/osdmap_gate.h" +#include "crimson/osd/osd_meta.h" +#include "crimson/osd/object_context.h" +#include "crimson/osd/pg_map.h" +#include "crimson/osd/state.h" +#include "common/AsyncReserver.h" + +namespace crimson::net { + class Messenger; +} + +namespace crimson::mgr { + class Client; +} + +namespace crimson::mon { + class Client; +} + +namespace crimson::os { + class FuturizedStore; +} + +class OSDMap; +class PeeringCtx; +class BufferedRecoveryMessages; + +namespace crimson::osd { + +class PGShardManager; + +/** + * PerShardState + * + * Per-shard state holding instances local to each shard. + */ +class PerShardState { + friend class ShardServices; + friend class PGShardManager; + friend class OSD; + using cached_map_t = OSDMapService::cached_map_t; + using local_cached_map_t = OSDMapService::local_cached_map_t; + + const core_id_t core = seastar::this_shard_id(); +#define assert_core() ceph_assert(seastar::this_shard_id() == core); + + const int whoami; + crimson::os::FuturizedStore::Shard &store; + crimson::common::CephContext cct; + + OSDState &osd_state; + OSD_OSDMapGate osdmap_gate; + + PerfCounters *perf = nullptr; + PerfCounters *recoverystate_perf = nullptr; + + // Op Management + OSDOperationRegistry registry; + OperationThrottler throttler; + + seastar::future<> dump_ops_in_flight(Formatter *f) const; + + epoch_t up_epoch = 0; + OSDMapService::cached_map_t osdmap; + const auto &get_osdmap() const { + assert_core(); + return osdmap; + } + void update_map(OSDMapService::cached_map_t new_osdmap) { + assert_core(); + osdmap = std::move(new_osdmap); + } + void set_up_epoch(epoch_t epoch) { + assert_core(); + up_epoch = epoch; + } + + // prevent creating new osd operations when system is shutting down, + // this is necessary because there are chances that a new operation + // is created, after the interruption of all ongoing operations, and + // creats and waits on a new and may-never-resolve future, in which + // case the shutdown may never succeed. 
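  // start_operation() and start_operation_may_interrupt() below check this
  // flag and throw system_shutdown_exception() once stop_registry() has
  // flipped it.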
+ bool stopping = false; + seastar::future<> stop_registry() { + assert_core(); + crimson::get_logger(ceph_subsys_osd).info("PerShardState::{}", __func__); + stopping = true; + return registry.stop(); + } + + // PGMap state + PGMap pg_map; + + seastar::future<> stop_pgs(); + std::map<pg_t, pg_stat_t> get_pg_stats() const; + seastar::future<> broadcast_map_to_pgs( + ShardServices &shard_services, + epoch_t epoch); + + Ref<PG> get_pg(spg_t pgid); + template <typename F> + void for_each_pg(F &&f) const { + assert_core(); + for (auto &pg : pg_map.get_pgs()) { + std::invoke(f, pg.first, pg.second); + } + } + + template <typename T, typename... Args> + auto start_operation(Args&&... args) { + assert_core(); + if (__builtin_expect(stopping, false)) { + throw crimson::common::system_shutdown_exception(); + } + auto op = registry.create_operation<T>(std::forward<Args>(args)...); + crimson::get_logger(ceph_subsys_osd).info( + "PerShardState::{}, {}", __func__, *op); + auto fut = seastar::yield().then([op] { + return op->start().finally([op /* by copy */] { + // ensure the op's lifetime is appropriate. It is not enough to + // guarantee it's alive at the scheduling stages (i.e. `then()` + // calling) but also during the actual execution (i.e. when passed + // lambdas are actually run). + }); + }); + return std::make_pair(std::move(op), std::move(fut)); + } + + template <typename InterruptorT, typename T, typename... Args> + auto start_operation_may_interrupt(Args&&... args) { + assert_core(); + if (__builtin_expect(stopping, false)) { + throw crimson::common::system_shutdown_exception(); + } + auto op = registry.create_operation<T>(std::forward<Args>(args)...); + crimson::get_logger(ceph_subsys_osd).info( + "PerShardState::{}, {}", __func__, *op); + auto fut = InterruptorT::make_interruptible( + seastar::yield() + ).then_interruptible([op] { + return op->start().finally([op /* by copy */] { + // ensure the op's lifetime is appropriate. It is not enough to + // guarantee it's alive at the scheduling stages (i.e. `then()` + // calling) but also during the actual execution (i.e. when passed + // lambdas are actually run). + }); + }); + return std::make_pair(std::move(op), std::move(fut)); + } + + // tids for ops i issue, prefixed with core id to ensure uniqueness + ceph_tid_t next_tid; + ceph_tid_t get_tid() { + assert_core(); + return next_tid++; + } + + HeartbeatStampsRef get_hb_stamps(int peer); + std::map<int, HeartbeatStampsRef> heartbeat_stamps; + + // Time state + const ceph::mono_time startup_time; + ceph::signedspan get_mnow() const { + assert_core(); + return ceph::mono_clock::now() - startup_time; + } + +public: + PerShardState( + int whoami, + ceph::mono_time startup_time, + PerfCounters *perf, + PerfCounters *recoverystate_perf, + crimson::os::FuturizedStore &store, + OSDState& osd_state); +}; + +/** + * OSDSingletonState + * + * OSD-wide singleton holding instances that need to be accessible + * from all PGs. 
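 *
 * Accessed from ShardServices via with_singleton(), which invoke_on()s
 * PRIMARY_CORE; the FORWARD_TO_OSD_SINGLETON* macros wrap that call so
 * per-shard code can use e.g. send_pg_temp() or osdmap_subscribe() without
 * caring which core this state lives on.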
+ */ +class OSDSingletonState : public md_config_obs_t { + friend class ShardServices; + friend class PGShardManager; + friend class OSD; + using cached_map_t = OSDMapService::cached_map_t; + using local_cached_map_t = OSDMapService::local_cached_map_t; + +public: + OSDSingletonState( + int whoami, + crimson::net::Messenger &cluster_msgr, + crimson::net::Messenger &public_msgr, + crimson::mon::Client &monc, + crimson::mgr::Client &mgrc); + +private: + const int whoami; + + crimson::common::CephContext cct; + PerfCounters *perf = nullptr; + PerfCounters *recoverystate_perf = nullptr; + + SharedLRU<epoch_t, OSDMap> osdmaps; + SimpleLRU<epoch_t, bufferlist, false> map_bl_cache; + + cached_map_t osdmap; + cached_map_t &get_osdmap() { return osdmap; } + void update_map(cached_map_t new_osdmap) { + osdmap = std::move(new_osdmap); + } + + crimson::net::Messenger &cluster_msgr; + crimson::net::Messenger &public_msgr; + + seastar::future<> send_to_osd(int peer, MessageURef m, epoch_t from_epoch); + + crimson::mon::Client &monc; + seastar::future<> osdmap_subscribe(version_t epoch, bool force_request); + + crimson::mgr::Client &mgrc; + + std::unique_ptr<OSDMeta> meta_coll; + template <typename... Args> + void init_meta_coll(Args&&... args) { + meta_coll = std::make_unique<OSDMeta>(std::forward<Args>(args)...); + } + OSDMeta &get_meta_coll() { + assert(meta_coll); + return *meta_coll; + } + + OSDSuperblock superblock; + void set_superblock(OSDSuperblock _superblock) { + superblock = std::move(_superblock); + } + + seastar::future<> send_incremental_map( + crimson::net::Connection &conn, + epoch_t first); + + seastar::future<> send_incremental_map_to_osd(int osd, epoch_t first); + + auto get_pool_info(int64_t poolid) { + return get_meta_coll().load_final_pool_info(poolid); + } + + // global pg temp state + struct pg_temp_t { + std::vector<int> acting; + bool forced = false; + }; + std::map<pg_t, pg_temp_t> pg_temp_wanted; + std::map<pg_t, pg_temp_t> pg_temp_pending; + friend std::ostream& operator<<(std::ostream&, const pg_temp_t&); + + void queue_want_pg_temp(pg_t pgid, const std::vector<int>& want, + bool forced = false); + void remove_want_pg_temp(pg_t pgid); + void requeue_pg_temp(); + seastar::future<> send_pg_temp(); + + std::set<pg_t> pg_created; + seastar::future<> send_pg_created(pg_t pgid); + seastar::future<> send_pg_created(); + void prune_pg_created(); + + struct DirectFinisher { + void queue(Context *c) { + c->complete(0); + } + } finisher; + AsyncReserver<spg_t, DirectFinisher> local_reserver; + AsyncReserver<spg_t, DirectFinisher> remote_reserver; + AsyncReserver<spg_t, DirectFinisher> snap_reserver; + + epoch_t up_thru_wanted = 0; + seastar::future<> send_alive(epoch_t want); + + const char** get_tracked_conf_keys() const final; + void handle_conf_change( + const ConfigProxy& conf, + const std::set <std::string> &changed) final; + + seastar::future<local_cached_map_t> get_local_map(epoch_t e); + seastar::future<std::unique_ptr<OSDMap>> load_map(epoch_t e); + seastar::future<bufferlist> load_map_bl(epoch_t e); + seastar::future<std::map<epoch_t, bufferlist>> + load_map_bls(epoch_t first, epoch_t last); + void store_map_bl(ceph::os::Transaction& t, + epoch_t e, bufferlist&& bl); + seastar::future<> store_maps(ceph::os::Transaction& t, + epoch_t start, Ref<MOSDMap> m); +}; + +/** + * Represents services available to each PG + */ +class ShardServices : public OSDMapService { + friend class PGShardManager; + friend class OSD; + using cached_map_t = OSDMapService::cached_map_t; + using 
local_cached_map_t = OSDMapService::local_cached_map_t; + + PerShardState local_state; + seastar::sharded<OSDSingletonState> &osd_singleton_state; + PGShardMapping& pg_to_shard_mapping; + + template <typename F, typename... Args> + auto with_singleton(F &&f, Args&&... args) { + return osd_singleton_state.invoke_on( + PRIMARY_CORE, + std::forward<F>(f), + std::forward<Args>(args)... + ); + } + +#define FORWARD_CONST(FROM_METHOD, TO_METHOD, TARGET) \ + template <typename... Args> \ + auto FROM_METHOD(Args&&... args) const { \ + return TARGET.TO_METHOD(std::forward<Args>(args)...); \ + } + +#define FORWARD(FROM_METHOD, TO_METHOD, TARGET) \ + template <typename... Args> \ + auto FROM_METHOD(Args&&... args) { \ + return TARGET.TO_METHOD(std::forward<Args>(args)...); \ + } + +#define FORWARD_TO_LOCAL(METHOD) FORWARD(METHOD, METHOD, local_state) +#define FORWARD_TO_LOCAL_CONST(METHOD) FORWARD_CONST( \ + METHOD, METHOD, local_state) \ + +#define FORWARD_TO_OSD_SINGLETON_TARGET(METHOD, TARGET) \ + template <typename... Args> \ + auto METHOD(Args&&... args) { \ + return with_singleton( \ + [](auto &local_state, auto&&... args) { \ + return local_state.TARGET( \ + std::forward<decltype(args)>(args)...); \ + }, std::forward<Args>(args)...); \ + } +#define FORWARD_TO_OSD_SINGLETON(METHOD) \ + FORWARD_TO_OSD_SINGLETON_TARGET(METHOD, METHOD) + +public: + template <typename... PSSArgs> + ShardServices( + seastar::sharded<OSDSingletonState> &osd_singleton_state, + PGShardMapping& pg_to_shard_mapping, + PSSArgs&&... args) + : local_state(std::forward<PSSArgs>(args)...), + osd_singleton_state(osd_singleton_state), + pg_to_shard_mapping(pg_to_shard_mapping) {} + + FORWARD_TO_OSD_SINGLETON(send_to_osd) + + crimson::os::FuturizedStore::Shard &get_store() { + return local_state.store; + } + + auto remove_pg(spg_t pgid) { + local_state.pg_map.remove_pg(pgid); + return pg_to_shard_mapping.remove_pg(pgid); + } + + crimson::common::CephContext *get_cct() { + return &(local_state.cct); + } + + template <typename T, typename... Args> + auto start_operation(Args&&... args) { + return local_state.start_operation<T>(std::forward<Args>(args)...); + } + + template <typename InterruptorT, typename T, typename... Args> + auto start_operation_may_interrupt(Args&&... 
args) { + return local_state.start_operation_may_interrupt< + InterruptorT, T>(std::forward<Args>(args)...); + } + + auto &get_registry() { return local_state.registry; } + + // Loggers + PerfCounters &get_recoverystate_perf_logger() { + return *local_state.recoverystate_perf; + } + PerfCounters &get_perf_logger() { + return *local_state.perf; + } + + // Diagnostics + FORWARD_TO_LOCAL_CONST(dump_ops_in_flight); + + // Local PG Management + seastar::future<Ref<PG>> make_pg( + cached_map_t create_map, + spg_t pgid, + bool do_create); + seastar::future<Ref<PG>> handle_pg_create_info( + std::unique_ptr<PGCreateInfo> info); + + using get_or_create_pg_ertr = PGMap::wait_for_pg_ertr; + using get_or_create_pg_ret = get_or_create_pg_ertr::future<Ref<PG>>; + get_or_create_pg_ret get_or_create_pg( + PGMap::PGCreationBlockingEvent::TriggerI&&, + spg_t pgid, + std::unique_ptr<PGCreateInfo> info); + + using wait_for_pg_ertr = PGMap::wait_for_pg_ertr; + using wait_for_pg_ret = wait_for_pg_ertr::future<Ref<PG>>; + wait_for_pg_ret wait_for_pg( + PGMap::PGCreationBlockingEvent::TriggerI&&, spg_t pgid); + seastar::future<Ref<PG>> load_pg(spg_t pgid); + + /// Dispatch and reset ctx transaction + seastar::future<> dispatch_context_transaction( + crimson::os::CollectionRef col, PeeringCtx &ctx); + + /// Dispatch and reset ctx messages + seastar::future<> dispatch_context_messages( + BufferedRecoveryMessages &&ctx); + + /// Dispatch ctx and dispose of context + seastar::future<> dispatch_context( + crimson::os::CollectionRef col, + PeeringCtx &&ctx); + + /// Dispatch ctx and dispose of ctx, transaction must be empty + seastar::future<> dispatch_context( + PeeringCtx &&ctx) { + return dispatch_context({}, std::move(ctx)); + } + + /// Return per-core tid + ceph_tid_t get_tid() { return local_state.get_tid(); } + + /// Return core-local pg count * number of cores + unsigned get_num_local_pgs() const { + return local_state.pg_map.get_pg_count(); + } + + // OSDMapService + cached_map_t get_map() const final { return local_state.get_osdmap(); } + epoch_t get_up_epoch() const final { return local_state.up_epoch; } + seastar::future<cached_map_t> get_map(epoch_t e) final { + return with_singleton( + [](auto &sstate, epoch_t e) { + return sstate.get_local_map( + e + ).then([](auto lmap) { + return seastar::foreign_ptr<local_cached_map_t>(lmap); + }); + }, e).then([](auto fmap) { + return make_local_shared_foreign(std::move(fmap)); + }); + } + + FORWARD_TO_OSD_SINGLETON(get_pool_info) + FORWARD(with_throttle_while, with_throttle_while, local_state.throttler) + + FORWARD_TO_OSD_SINGLETON(send_incremental_map) + FORWARD_TO_OSD_SINGLETON(send_incremental_map_to_osd) + + FORWARD_TO_OSD_SINGLETON(osdmap_subscribe) + FORWARD_TO_OSD_SINGLETON(queue_want_pg_temp) + FORWARD_TO_OSD_SINGLETON(remove_want_pg_temp) + FORWARD_TO_OSD_SINGLETON(requeue_pg_temp) + FORWARD_TO_OSD_SINGLETON(send_pg_created) + FORWARD_TO_OSD_SINGLETON(send_alive) + FORWARD_TO_OSD_SINGLETON(send_pg_temp) + FORWARD_TO_LOCAL_CONST(get_mnow) + FORWARD_TO_LOCAL(get_hb_stamps) + + FORWARD(pg_created, pg_created, local_state.pg_map) + + FORWARD_TO_OSD_SINGLETON_TARGET( + local_update_priority, + local_reserver.update_priority) + FORWARD_TO_OSD_SINGLETON_TARGET( + local_cancel_reservation, + local_reserver.cancel_reservation) + FORWARD_TO_OSD_SINGLETON_TARGET( + local_dump_reservations, + local_reserver.dump) + FORWARD_TO_OSD_SINGLETON_TARGET( + remote_cancel_reservation, + remote_reserver.cancel_reservation) + FORWARD_TO_OSD_SINGLETON_TARGET( + 
remote_dump_reservations, + remote_reserver.dump) + FORWARD_TO_OSD_SINGLETON_TARGET( + snap_cancel_reservation, + snap_reserver.cancel_reservation) + FORWARD_TO_OSD_SINGLETON_TARGET( + snap_dump_reservations, + snap_reserver.dump) + + Context *invoke_context_on_core(core_id_t core, Context *c) { + if (!c) return nullptr; + return new LambdaContext([core, c](int code) { + std::ignore = seastar::smp::submit_to( + core, + [c, code] { + c->complete(code); + }); + }); + } + seastar::future<> local_request_reservation( + spg_t item, + Context *on_reserved, + unsigned prio, + Context *on_preempt) { + return with_singleton( + [item, prio](OSDSingletonState &singleton, + Context *wrapped_on_reserved, Context *wrapped_on_preempt) { + return singleton.local_reserver.request_reservation( + item, + wrapped_on_reserved, + prio, + wrapped_on_preempt); + }, + invoke_context_on_core(seastar::this_shard_id(), on_reserved), + invoke_context_on_core(seastar::this_shard_id(), on_preempt)); + } + seastar::future<> remote_request_reservation( + spg_t item, + Context *on_reserved, + unsigned prio, + Context *on_preempt) { + return with_singleton( + [item, prio](OSDSingletonState &singleton, + Context *wrapped_on_reserved, Context *wrapped_on_preempt) { + return singleton.remote_reserver.request_reservation( + item, + wrapped_on_reserved, + prio, + wrapped_on_preempt); + }, + invoke_context_on_core(seastar::this_shard_id(), on_reserved), + invoke_context_on_core(seastar::this_shard_id(), on_preempt)); + } + seastar::future<> snap_request_reservation( + spg_t item, + Context *on_reserved, + unsigned prio) { + return with_singleton( + [item, prio](OSDSingletonState &singleton, + Context *wrapped_on_reserved) { + return singleton.snap_reserver.request_reservation( + item, + wrapped_on_reserved, + prio); + }, + invoke_context_on_core(seastar::this_shard_id(), on_reserved)); + } + +#undef FORWARD_CONST +#undef FORWARD +#undef FORWARD_TO_OSD_SINGLETON +#undef FORWARD_TO_LOCAL +#undef FORWARD_TO_LOCAL_CONST +}; + +} + +#if FMT_VERSION >= 90000 +template <> struct fmt::formatter<crimson::osd::OSDSingletonState::pg_temp_t> : fmt::ostream_formatter {}; +#endif diff --git a/src/crimson/osd/state.h b/src/crimson/osd/state.h new file mode 100644 index 000000000..f0676a4ec --- /dev/null +++ b/src/crimson/osd/state.h @@ -0,0 +1,130 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <string_view> +#include <ostream> + +#include <seastar/core/shared_future.hh> + +class OSDMap; + +namespace crimson::osd { + +// seastar::sharded puts start_single on core 0 +constexpr core_id_t PRIMARY_CORE = 0; + +/** + * OSDState + * + * Maintains state representing the OSD's progress from booting through + * shutdown. + * + * Shards other than PRIMARY_CORE may use their local instance to check + * on ACTIVE and STOPPING. All other methods are restricted to + * PRIMARY_CORE (such methods start with an assert to this effect). 
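+ *
+ * ACTIVE and STOPPING are broadcast to every shard: set_active() and
+ * set_stopping() use invoke_on_all(), and when_active() lets any shard wait
+ * for the ACTIVE transition. The other transitions (set_preboot(),
+ * set_booting(), set_prestop()) and their queries are PRIMARY_CORE-only.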
+ */ +class OSDState : public seastar::peering_sharded_service<OSDState> { + + enum class State { + INITIALIZING, + PREBOOT, + BOOTING, + ACTIVE, + PRESTOP, + STOPPING, + WAITING_FOR_HEALTHY, + }; + + State state = State::INITIALIZING; + mutable seastar::shared_promise<> wait_for_active; + + /// Sets local instance state to active, called from set_active + void _set_active() { + state = State::ACTIVE; + wait_for_active.set_value(); + wait_for_active = {}; + } + /// Sets local instance state to stopping, called from set_stopping + void _set_stopping() { + state = State::STOPPING; + wait_for_active.set_exception(crimson::common::system_shutdown_exception{}); + wait_for_active = {}; + } +public: + bool is_initializing() const { + ceph_assert(seastar::this_shard_id() == PRIMARY_CORE); + return state == State::INITIALIZING; + } + bool is_preboot() const { + ceph_assert(seastar::this_shard_id() == PRIMARY_CORE); + return state == State::PREBOOT; + } + bool is_booting() const { + ceph_assert(seastar::this_shard_id() == PRIMARY_CORE); + return state == State::BOOTING; + } + bool is_active() const { + return state == State::ACTIVE; + } + seastar::future<> when_active() const { + return is_active() ? seastar::now() + : wait_for_active.get_shared_future(); + }; + bool is_prestop() const { + ceph_assert(seastar::this_shard_id() == PRIMARY_CORE); + return state == State::PRESTOP; + } + bool is_stopping() const { + return state == State::STOPPING; + } + bool is_waiting_for_healthy() const { + ceph_assert(seastar::this_shard_id() == PRIMARY_CORE); + return state == State::WAITING_FOR_HEALTHY; + } + void set_preboot() { + ceph_assert(seastar::this_shard_id() == PRIMARY_CORE); + state = State::PREBOOT; + } + void set_booting() { + ceph_assert(seastar::this_shard_id() == PRIMARY_CORE); + state = State::BOOTING; + } + /// Sets all shards to active + seastar::future<> set_active() { + ceph_assert(seastar::this_shard_id() == PRIMARY_CORE); + return container().invoke_on_all([](auto& osd_state) { + osd_state._set_active(); + }); + } + void set_prestop() { + ceph_assert(seastar::this_shard_id() == PRIMARY_CORE); + state = State::PRESTOP; + } + /// Sets all shards to stopping + seastar::future<> set_stopping() { + ceph_assert(seastar::this_shard_id() == PRIMARY_CORE); + return container().invoke_on_all([](auto& osd_state) { + osd_state._set_stopping(); + }); + } + std::string_view to_string() const { + switch (state) { + case State::INITIALIZING: return "initializing"; + case State::PREBOOT: return "preboot"; + case State::BOOTING: return "booting"; + case State::ACTIVE: return "active"; + case State::PRESTOP: return "prestop"; + case State::STOPPING: return "stopping"; + case State::WAITING_FOR_HEALTHY: return "waiting_for_healthy"; + default: return "???"; + } + } +}; + +inline std::ostream& +operator<<(std::ostream& os, const OSDState& s) { + return os << s.to_string(); +} +} diff --git a/src/crimson/osd/stop_signal.h b/src/crimson/osd/stop_signal.h new file mode 100644 index 000000000..951f8d4b7 --- /dev/null +++ b/src/crimson/osd/stop_signal.h @@ -0,0 +1,83 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. 
+ * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (C) 2020 Cloudius Systems, Ltd. + */ + +#pragma once + +#include <seastar/core/abort_source.hh> +#include <seastar/core/reactor.hh> +#include <seastar/core/condition-variable.hh> + +/// Seastar apps lib namespace + +namespace seastar_apps_lib { + + +/// \brief Futurized SIGINT/SIGTERM signals handler class +/// +/// Seastar-style helper class that allows easy waiting for SIGINT/SIGTERM signals +/// from your app. +/// +/// Example: +/// \code +/// #include <seastar/apps/lib/stop_signal.hh> +/// ... +/// int main() { +/// ... +/// seastar::thread th([] { +/// seastar_apps_lib::stop_signal stop_signal; +/// <some code> +/// stop_signal.wait().get(); // this will wait till we receive SIGINT or SIGTERM signal +/// }); +/// \endcode +class stop_signal { + seastar::condition_variable _cond; + seastar::abort_source _abort_source; + +private: + void on_signal() { + if (stopping()) { + return; + } + _abort_source.request_abort(); + _cond.broadcast(); + } +public: + stop_signal() { + seastar::engine().handle_signal(SIGINT, [this] { on_signal(); }); + seastar::engine().handle_signal(SIGTERM, [this] { on_signal(); }); + } + ~stop_signal() { + // There's no way to unregister a handler yet, so register a no-op handler instead. + seastar::engine().handle_signal(SIGINT, [] {}); + seastar::engine().handle_signal(SIGTERM, [] {}); + } + seastar::future<> wait() { + return _cond.wait([this] { return _abort_source.abort_requested(); }); + } + bool stopping() const { + return _abort_source.abort_requested(); + } + auto& abort_source() { + return _abort_source; + } +}; +} diff --git a/src/crimson/osd/watch.cc b/src/crimson/osd/watch.cc new file mode 100644 index 000000000..4573333c3 --- /dev/null +++ b/src/crimson/osd/watch.cc @@ -0,0 +1,354 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <algorithm> + +#include <boost/range/adaptor/transformed.hpp> +#include <boost/range/algorithm_ext/insert.hpp> + +#include "crimson/osd/watch.h" +#include "crimson/osd/osd_operations/internal_client_request.h" + +#include "messages/MWatchNotify.h" + + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_osd); + } +} + +namespace crimson::osd { + +// a watcher can remove itself if it has not seen a notification after a period of time. +// in the case, we need to drop it also from the persisted `ObjectState` instance. +// this operation resembles a bit the `_UNWATCH` subop. 
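+//
+// WatchTimeoutRequest below reuses the InternalClientRequest machinery for
+// this: it synthesizes a single CEPH_OSD_OP_WATCH op with
+// CEPH_OSD_WATCH_OP_UNWATCH on behalf of the timed-out watcher, so the
+// object's persisted watch state is updated through the regular op path.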
+class WatchTimeoutRequest final : public InternalClientRequest { +public: + WatchTimeoutRequest(WatchRef watch, Ref<PG> pg) + : InternalClientRequest(std::move(pg)), + watch(std::move(watch)) { + } + + const hobject_t& get_target_oid() const final; + PG::do_osd_ops_params_t get_do_osd_ops_params() const final; + std::vector<OSDOp> create_osd_ops() final; + +private: + WatchRef watch; +}; + +const hobject_t& WatchTimeoutRequest::get_target_oid() const +{ + assert(watch->obc); + return watch->obc->get_oid(); +} + +PG::do_osd_ops_params_t +WatchTimeoutRequest::get_do_osd_ops_params() const +{ + osd_reqid_t reqid; + reqid.name = watch->entity_name; + PG::do_osd_ops_params_t params{ + watch->conn, + reqid, + ceph_clock_now(), + get_pg().get_osdmap_epoch(), + entity_inst_t{ watch->entity_name, watch->winfo.addr }, + 0 + }; + logger().debug("{}: params.reqid={}", __func__, params.reqid); + return params; +} + +std::vector<OSDOp> WatchTimeoutRequest::create_osd_ops() +{ + logger().debug("{}", __func__); + assert(watch); + OSDOp osd_op; + osd_op.op.op = CEPH_OSD_OP_WATCH; + osd_op.op.flags = 0; + osd_op.op.watch.op = CEPH_OSD_WATCH_OP_UNWATCH; + osd_op.op.watch.cookie = watch->winfo.cookie; + return std::vector{std::move(osd_op)}; +} + +Watch::~Watch() +{ + logger().debug("{} gid={} cookie={}", __func__, get_watcher_gid(), get_cookie()); +} + +seastar::future<> Watch::connect(crimson::net::ConnectionRef conn, bool) +{ + if (this->conn == conn) { + logger().debug("conn={} already connected", conn); + return seastar::now(); + } + timeout_timer.cancel(); + timeout_timer.arm(std::chrono::seconds{winfo.timeout_seconds}); + this->conn = std::move(conn); + return seastar::now(); +} + +void Watch::disconnect() +{ + ceph_assert(!conn); + timeout_timer.cancel(); + timeout_timer.arm(std::chrono::seconds{winfo.timeout_seconds}); +} + +seastar::future<> Watch::send_notify_msg(NotifyRef notify) +{ + logger().info("{} for notify(id={})", __func__, notify->ninfo.notify_id); + return conn->send(crimson::make_message<MWatchNotify>( + winfo.cookie, + notify->user_version, + notify->ninfo.notify_id, + CEPH_WATCH_EVENT_NOTIFY, + notify->ninfo.bl, + notify->client_gid)); +} + +seastar::future<> Watch::start_notify(NotifyRef notify) +{ + logger().debug("{} gid={} cookie={} starting notify(id={})", + __func__, get_watcher_gid(), get_cookie(), + notify->ninfo.notify_id); + auto [ it, emplaced ] = in_progress_notifies.emplace(std::move(notify)); + ceph_assert(emplaced); + ceph_assert(is_alive()); + return is_connected() ? send_notify_msg(*it) : seastar::now(); +} + +seastar::future<> Watch::notify_ack( + const uint64_t notify_id, + const ceph::bufferlist& reply_bl) +{ + logger().debug("{} gid={} cookie={} notify_id={}", + __func__, get_watcher_gid(), get_cookie(), notify_id); + const auto it = in_progress_notifies.find(notify_id); + if (it == std::end(in_progress_notifies)) { + logger().error("{} notify_id={} not found on the in-progess list." 
+ " Supressing but this should not happen.", + __func__, notify_id); + return seastar::now(); + } + auto notify = *it; + logger().debug("Watch::notify_ack gid={} cookie={} found notify(id={})", + get_watcher_gid(), + get_cookie(), + notify->get_id()); + // let's ensure we're extending the life-time till end of this method + static_assert(std::is_same_v<decltype(notify), NotifyRef>); + in_progress_notifies.erase(it); + return notify->complete_watcher(shared_from_this(), reply_bl); +} + +seastar::future<> Watch::send_disconnect_msg() +{ + if (!is_connected()) { + return seastar::now(); + } + ceph::bufferlist empty; + return conn->send(crimson::make_message<MWatchNotify>( + winfo.cookie, + 0, + 0, + CEPH_WATCH_EVENT_DISCONNECT, + empty)); +} + +void Watch::discard_state() +{ + logger().debug("{} gid={} cookie={}", __func__, get_watcher_gid(), get_cookie()); + ceph_assert(obc); + in_progress_notifies.clear(); + timeout_timer.cancel(); +} + +void Watch::got_ping(utime_t) +{ + if (is_connected()) { + // using cancel() + arm() as rearm() has no overload for time delta. + timeout_timer.cancel(); + timeout_timer.arm(std::chrono::seconds{winfo.timeout_seconds}); + } +} + +seastar::future<> Watch::remove() +{ + logger().debug("{} gid={} cookie={}", __func__, get_watcher_gid(), get_cookie()); + // in contrast to ceph-osd crimson sends CEPH_WATCH_EVENT_DISCONNECT directly + // from the timeout handler and _after_ CEPH_WATCH_EVENT_NOTIFY_COMPLETE. + // this simplifies the Watch::remove() interface as callers aren't obliged + // anymore to decide whether EVENT_DISCONNECT needs to be send or not -- it + // becomes an implementation detail of Watch. + return seastar::do_for_each(in_progress_notifies, + [this_shared=shared_from_this()] (auto notify) { + logger().debug("Watch::remove gid={} cookie={} notify(id={})", + this_shared->get_watcher_gid(), + this_shared->get_cookie(), + notify->ninfo.notify_id); + return notify->remove_watcher(this_shared); + }).then([this] { + discard_state(); + return seastar::now(); + }); +} + +void Watch::cancel_notify(const uint64_t notify_id) +{ + logger().debug("{} gid={} cookie={} notify(id={})", + __func__, get_watcher_gid(), get_cookie(), + notify_id); + const auto it = in_progress_notifies.find(notify_id); + assert(it != std::end(in_progress_notifies)); + in_progress_notifies.erase(it); +} + +void Watch::do_watch_timeout() +{ + assert(pg); + auto [op, fut] = pg->get_shard_services().start_operation<WatchTimeoutRequest>( + shared_from_this(), pg); + std::ignore = std::move(fut).then([op=std::move(op), this] { + return send_disconnect_msg(); + }); +} + +bool notify_reply_t::operator<(const notify_reply_t& rhs) const +{ + // comparing std::pairs to emphasize our legacy. ceph-osd stores + // notify_replies as std::multimap<std::pair<gid, cookie>, bl>. + // unfortunately, what seems to be an implementation detail, got + // exposed as part of our public API (the `reply_buffer` parameter + // of the `rados_notify` family). 
+ const auto lhsp = std::make_pair(watcher_gid, watcher_cookie); + const auto rhsp = std::make_pair(rhs.watcher_gid, rhs.watcher_cookie); + return lhsp < rhsp; +} + +std::ostream &operator<<(std::ostream &out, const notify_reply_t &rhs) +{ + out << "notify_reply_t{watcher_gid=" << rhs.watcher_gid + << ", watcher_cookie=" << rhs.watcher_cookie << "}"; + return out; +} + +Notify::Notify(crimson::net::ConnectionRef conn, + const notify_info_t& ninfo, + const uint64_t client_gid, + const uint64_t user_version) + : ninfo(ninfo), + conn(std::move(conn)), + client_gid(client_gid), + user_version(user_version) +{} + +Notify::~Notify() +{ + logger().debug("{} for notify(id={})", __func__, ninfo.notify_id); +} + +seastar::future<> Notify::remove_watcher(WatchRef watch) +{ + logger().debug("{} for notify(id={})", __func__, ninfo.notify_id); + + if (discarded || complete) { + logger().debug("{} for notify(id={}) discarded/complete already" + " discarded: {} complete: {}", __func__, + ninfo.notify_id, discarded ,complete); + return seastar::now(); + } + [[maybe_unused]] const auto num_removed = watchers.erase(watch); + assert(num_removed > 0); + if (watchers.empty()) { + complete = true; + [[maybe_unused]] bool was_armed = timeout_timer.cancel(); + assert(was_armed); + return send_completion(); + } else { + return seastar::now(); + } +} + + +seastar::future<> Notify::complete_watcher( + WatchRef watch, + const ceph::bufferlist& reply_bl) +{ + logger().debug("{} for notify(id={})", __func__, ninfo.notify_id); + + if (discarded || complete) { + logger().debug("{} for notify(id={}) discarded/complete already" + " discarded: {} complete: {}", __func__, + ninfo.notify_id, discarded ,complete); + return seastar::now(); + } + notify_replies.emplace(notify_reply_t{ + watch->get_watcher_gid(), + watch->get_cookie(), + reply_bl}); + return remove_watcher(std::move(watch)); +} + +seastar::future<> Notify::send_completion( + std::set<WatchRef> timedout_watchers) +{ + logger().info("{} -- {} in progress watchers, timedout watchers {}", + __func__, watchers.size(), timedout_watchers.size()); + logger().debug("{} sending notify replies: {}", __func__, notify_replies); + + ceph::bufferlist empty; + auto reply = crimson::make_message<MWatchNotify>( + ninfo.cookie, + user_version, + ninfo.notify_id, + CEPH_WATCH_EVENT_NOTIFY_COMPLETE, + empty, + client_gid); + ceph::bufferlist reply_bl; + { + std::vector<std::pair<uint64_t,uint64_t>> missed; + missed.reserve(std::size(timedout_watchers)); + boost::insert( + missed, std::begin(missed), + timedout_watchers | boost::adaptors::transformed([] (auto w) { + return std::make_pair(w->get_watcher_gid(), w->get_cookie()); + })); + ceph::encode(notify_replies, reply_bl); + ceph::encode(missed, reply_bl); + } + reply->set_data(std::move(reply_bl)); + if (!timedout_watchers.empty()) { + reply->return_code = -ETIMEDOUT; + } + return conn->send(std::move(reply)); +} + +void Notify::do_notify_timeout() +{ + logger().debug("{} complete={}", __func__, complete); + if (complete) { + return; + } + // it might be that `this` is kept alive only because of the reference + // a watcher stores and which is being removed by `cancel_notify()`. + // to avoid use-after-free we bump up the ref counter with `guard_ptr`. 
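+  //
+  // every Watch in `watchers` holds a NotifyRef to this Notify; cancel_notify()
+  // erases that reference, so without guard_ptr the last erase could destroy
+  // `this` while the loop (and send_completion()) below is still running.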
+ [[maybe_unused]] auto guard_ptr = shared_from_this(); + for (auto& watcher : watchers) { + logger().debug("canceling watcher cookie={} gid={} use_count={}", + watcher->get_cookie(), + watcher->get_watcher_gid(), + watcher->use_count()); + watcher->cancel_notify(ninfo.notify_id); + } + std::ignore = send_completion(std::move(watchers)); + watchers.clear(); +} + +} // namespace crimson::osd + +#if FMT_VERSION >= 90000 +template <> struct fmt::formatter<crimson::osd::WatchTimeoutRequest> : fmt::ostream_formatter {}; +#endif diff --git a/src/crimson/osd/watch.h b/src/crimson/osd/watch.h new file mode 100644 index 000000000..b3982141d --- /dev/null +++ b/src/crimson/osd/watch.h @@ -0,0 +1,256 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <iterator> +#include <map> +#include <set> + +#include <seastar/core/shared_ptr.hh> + +#include "crimson/net/Connection.h" +#include "crimson/osd/object_context.h" +#include "crimson/osd/pg.h" +#include "include/denc.h" + +namespace crimson::osd { + +class Notify; +using NotifyRef = seastar::shared_ptr<Notify>; + +// NOTE: really need to have this public. Otherwise `shared_from_this()` +// will abort. According to cppreference.com: +// +// "The constructors of std::shared_ptr detect the presence +// of an unambiguous and accessible (ie. public inheritance +// is mandatory) (since C++17) enable_shared_from_this base". +// +// I expect the `seastar::shared_ptr` shares this behaviour. +class Watch : public seastar::enable_shared_from_this<Watch> { + // this is a private tag for the public constructor that turns it into + // de facto private one. The motivation behind the hack is make_shared + // used by create(). + struct private_ctag_t{}; + + std::set<NotifyRef, std::less<>> in_progress_notifies; + crimson::net::ConnectionRef conn; + crimson::osd::ObjectContextRef obc; + + watch_info_t winfo; + entity_name_t entity_name; + Ref<PG> pg; + + seastar::timer<seastar::lowres_clock> timeout_timer; + + seastar::future<> start_notify(NotifyRef); + seastar::future<> send_notify_msg(NotifyRef); + seastar::future<> send_disconnect_msg(); + + friend Notify; + friend class WatchTimeoutRequest; + +public: + Watch(private_ctag_t, + crimson::osd::ObjectContextRef obc, + const watch_info_t& winfo, + const entity_name_t& entity_name, + Ref<PG> pg) + : obc(std::move(obc)), + winfo(winfo), + entity_name(entity_name), + pg(std::move(pg)), + timeout_timer([this] { + return do_watch_timeout(); + }) { + assert(this->pg); + } + ~Watch(); + + seastar::future<> connect(crimson::net::ConnectionRef, bool); + void disconnect(); + bool is_alive() const { + return true; + } + bool is_connected() const { + return static_cast<bool>(conn); + } + void got_ping(utime_t); + + void discard_state(); + + seastar::future<> remove(); + + /// Call when notify_ack received on notify_id + seastar::future<> notify_ack( + uint64_t notify_id, ///< [in] id of acked notify + const ceph::bufferlist& reply_bl); ///< [in] notify reply buffer + + template <class... Args> + static seastar::shared_ptr<Watch> create(Args&&... 
args) { + return seastar::make_shared<Watch>(private_ctag_t{}, + std::forward<Args>(args)...); + }; + + uint64_t get_watcher_gid() const { + return entity_name.num(); + } + auto get_pg() const { + return pg; + } + auto& get_entity() const { + return entity_name; + } + auto& get_cookie() const { + return winfo.cookie; + } + auto& get_peer_addr() const { + return winfo.addr; + } + void cancel_notify(const uint64_t notify_id); + void do_watch_timeout(); +}; + +using WatchRef = seastar::shared_ptr<Watch>; + +struct notify_reply_t { + uint64_t watcher_gid; + uint64_t watcher_cookie; + ceph::bufferlist bl; + + bool operator<(const notify_reply_t& rhs) const; + DENC(notify_reply_t, v, p) { + // there is no versioning / preamble + denc(v.watcher_gid, p); + denc(v.watcher_cookie, p); + denc(v.bl, p); + } +}; +std::ostream &operator<<(std::ostream &out, const notify_reply_t &rhs); + +class Notify : public seastar::enable_shared_from_this<Notify> { + std::set<WatchRef> watchers; + const notify_info_t ninfo; + crimson::net::ConnectionRef conn; + const uint64_t client_gid; + const uint64_t user_version; + bool complete{false}; + bool discarded{false}; + seastar::timer<seastar::lowres_clock> timeout_timer{ + [this] { do_notify_timeout(); } + }; + + ~Notify(); + + /// (gid,cookie) -> reply_bl for everyone who acked the notify + std::multiset<notify_reply_t> notify_replies; + + uint64_t get_id() const { return ninfo.notify_id; } + + /// Sends notify completion if watchers.empty() or timeout + seastar::future<> send_completion( + std::set<WatchRef> timedout_watchers = {}); + + /// Called on Notify timeout + void do_notify_timeout(); + + Notify(crimson::net::ConnectionRef conn, + const notify_info_t& ninfo, + const uint64_t client_gid, + const uint64_t user_version); + template <class WatchIteratorT> + Notify(WatchIteratorT begin, + WatchIteratorT end, + crimson::net::ConnectionRef conn, + const notify_info_t& ninfo, + const uint64_t client_gid, + const uint64_t user_version); + // this is a private tag for the public constructor that turns it into + // de facto private one. The motivation behind the hack is make_shared + // used by create_n_propagate factory. + struct private_ctag_t{}; + + using ptr_t = seastar::shared_ptr<Notify>; + friend bool operator<(const ptr_t& lhs, const ptr_t& rhs) { + assert(lhs); + assert(rhs); + return lhs->get_id() < rhs->get_id(); + } + friend bool operator<(const ptr_t& ptr, const uint64_t id) { + assert(ptr); + return ptr->get_id() < id; + } + friend bool operator<(const uint64_t id, const ptr_t& ptr) { + assert(ptr); + return id < ptr->get_id(); + } + + friend Watch; + +public: + template <class... Args> + Notify(private_ctag_t, Args&&... args) : Notify(std::forward<Args>(args)...) { + } + + template <class WatchIteratorT, class... Args> + static seastar::future<> create_n_propagate( + WatchIteratorT begin, + WatchIteratorT end, + Args&&... 
args); + + seastar::future<> remove_watcher(WatchRef watch); + seastar::future<> complete_watcher(WatchRef watch, + const ceph::bufferlist& reply_bl); +}; + + +template <class WatchIteratorT> +Notify::Notify(WatchIteratorT begin, + WatchIteratorT end, + crimson::net::ConnectionRef conn, + const notify_info_t& ninfo, + const uint64_t client_gid, + const uint64_t user_version) + : watchers(begin, end), + ninfo(ninfo), + conn(std::move(conn)), + client_gid(client_gid), + user_version(user_version) { + assert(!std::empty(watchers)); + if (ninfo.timeout) { + timeout_timer.arm(std::chrono::seconds{ninfo.timeout}); + } +} + +template <class WatchIteratorT, class... Args> +seastar::future<> Notify::create_n_propagate( + WatchIteratorT begin, + WatchIteratorT end, + Args&&... args) +{ + static_assert( + std::is_same_v<typename std::iterator_traits<WatchIteratorT>::value_type, + crimson::osd::WatchRef>); + if (begin == end) { + auto notify = seastar::make_shared<Notify>( + private_ctag_t{}, + std::forward<Args>(args)...); + return notify->send_completion(); + } else { + auto notify = seastar::make_shared<Notify>( + private_ctag_t{}, + begin, end, + std::forward<Args>(args)...); + return seastar::do_for_each(begin, end, [=] (auto& watchref) { + return watchref->start_notify(notify); + }); + } +} + +} // namespace crimson::osd + +WRITE_CLASS_DENC(crimson::osd::notify_reply_t) + +#if FMT_VERSION >= 90000 +template <> struct fmt::formatter<crimson::osd::notify_reply_t> : fmt::ostream_formatter {}; +#endif diff --git a/src/crimson/tools/CMakeLists.txt b/src/crimson/tools/CMakeLists.txt new file mode 100644 index 000000000..fc18ff90b --- /dev/null +++ b/src/crimson/tools/CMakeLists.txt @@ -0,0 +1,22 @@ +add_executable(crimson-store-nbd + store_nbd/store-nbd.cc + store_nbd/tm_driver.cc + store_nbd/fs_driver.cc + store_nbd/block_driver.cc + ) +target_link_libraries(crimson-store-nbd + crimson-os) +install(TARGETS crimson-store-nbd DESTINATION bin) + +add_executable(perf-crimson-msgr perf_crimson_msgr.cc) +target_link_libraries(perf-crimson-msgr crimson) + +add_executable(perf-async-msgr perf_async_msgr.cc) +target_link_libraries(perf-async-msgr ceph-common global ${ALLOC_LIBS}) + +add_executable(perf-staged-fltree perf_staged_fltree.cc) +if(WITH_TESTS) +target_link_libraries(perf-staged-fltree crimson-seastore crimson::gtest) +else() +target_link_libraries(perf-staged-fltree crimson-seastore) +endif() diff --git a/src/crimson/tools/perf_async_msgr.cc b/src/crimson/tools/perf_async_msgr.cc new file mode 100644 index 000000000..38cc84fbb --- /dev/null +++ b/src/crimson/tools/perf_async_msgr.cc @@ -0,0 +1,151 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- + +#include <boost/program_options/variables_map.hpp> +#include <boost/program_options/parsers.hpp> + +#include "auth/Auth.h" +#include "global/global_init.h" +#include "msg/Dispatcher.h" +#include "msg/Messenger.h" +#include "messages/MOSDOp.h" + +#include "auth/DummyAuth.h" + +namespace { + +constexpr int CEPH_OSD_PROTOCOL = 10; + +struct Server { + Server(CephContext* cct, unsigned msg_len) + : dummy_auth(cct), dispatcher(cct, msg_len) + { + msgr.reset(Messenger::create(cct, "async", entity_name_t::OSD(0), "server", 0)); + dummy_auth.auth_registry.refresh_config(); + msgr->set_cluster_protocol(CEPH_OSD_PROTOCOL); + msgr->set_default_policy(Messenger::Policy::stateless_server(0)); + msgr->set_auth_client(&dummy_auth); + msgr->set_auth_server(&dummy_auth); + } + DummyAuthClientServer dummy_auth; + std::unique_ptr<Messenger> 
msgr; + struct ServerDispatcher : Dispatcher { + unsigned msg_len = 0; + bufferlist msg_data; + + ServerDispatcher(CephContext* cct, unsigned msg_len) + : Dispatcher(cct), msg_len(msg_len) + { + msg_data.append_zero(msg_len); + } + bool ms_can_fast_dispatch_any() const override { + return true; + } + bool ms_can_fast_dispatch(const Message* m) const override { + return m->get_type() == CEPH_MSG_OSD_OP; + } + void ms_fast_dispatch(Message* m) override { + ceph_assert(m->get_type() == CEPH_MSG_OSD_OP); + const static pg_t pgid; + const static object_locator_t oloc; + const static hobject_t hobj(object_t(), oloc.key, CEPH_NOSNAP, pgid.ps(), + pgid.pool(), oloc.nspace); + static spg_t spgid(pgid); + MOSDOp *rep = new MOSDOp(0, 0, hobj, spgid, 0, 0, 0); + bufferlist data(msg_data); + rep->write(0, msg_len, data); + rep->set_tid(m->get_tid()); + m->get_connection()->send_message(rep); + m->put(); + } + bool ms_dispatch(Message*) override { + ceph_abort(); + } + bool ms_handle_reset(Connection*) override { + return true; + } + void ms_handle_remote_reset(Connection*) override { + } + bool ms_handle_refused(Connection*) override { + return true; + } + } dispatcher; +}; + +} + +static void run(CephContext* cct, entity_addr_t addr, unsigned bs) +{ + std::cout << "async server listening at " << addr << std::endl; + Server server{cct, bs}; + server.msgr->bind(addr); + server.msgr->add_dispatcher_head(&server.dispatcher); + server.msgr->start(); + server.msgr->wait(); +} + +int main(int argc, char** argv) +{ + namespace po = boost::program_options; + po::options_description desc{"Allowed options"}; + desc.add_options() + ("help,h", "show help message") + ("addr", po::value<std::string>()->default_value("v2:127.0.0.1:9010"), + "server address(crimson only supports msgr v2 protocol)") + ("bs", po::value<unsigned>()->default_value(0), + "server block size") + ("crc-enabled", po::value<bool>()->default_value(false), + "enable CRC checks") + ("threads", po::value<unsigned>()->default_value(3), + "async messenger worker threads"); + po::variables_map vm; + std::vector<std::string> unrecognized_options; + try { + auto parsed = po::command_line_parser(argc, argv) + .options(desc) + .allow_unregistered() + .run(); + po::store(parsed, vm); + if (vm.count("help")) { + std::cout << desc << std::endl; + return 0; + } + po::notify(vm); + unrecognized_options = po::collect_unrecognized(parsed.options, po::include_positional); + } catch(const po::error& e) { + std::cerr << "error: " << e.what() << std::endl; + return 1; + } + + auto addr = vm["addr"].as<std::string>(); + entity_addr_t target_addr; + target_addr.parse(addr.c_str(), nullptr); + ceph_assert_always(target_addr.is_msgr2()); + auto bs = vm["bs"].as<unsigned>(); + auto crc_enabled = vm["crc-enabled"].as<bool>(); + auto worker_threads = vm["threads"].as<unsigned>(); + + std::vector<const char*> args(argv, argv + argc); + auto cct = global_init(nullptr, args, + CEPH_ENTITY_TYPE_CLIENT, + CODE_ENVIRONMENT_UTILITY, + CINIT_FLAG_NO_MON_CONFIG); + common_init_finish(cct.get()); + + if (crc_enabled) { + cct->_conf.set_val("ms_crc_header", "true"); + cct->_conf.set_val("ms_crc_data", "true"); + } else { + cct->_conf.set_val("ms_crc_header", "false"); + cct->_conf.set_val("ms_crc_data", "false"); + } + + cct->_conf.set_val("ms_async_op_threads", fmt::format("{}", worker_threads)); + + std::cout << "server[" << addr + << "](bs=" << bs + << ", crc_enabled=" << crc_enabled + << ", worker_threads=" << worker_threads + << std::endl; + + run(cct.get(), target_addr, bs); +} 
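+
+// Rough usage sketch (binary name from the tools CMakeLists above; the exact
+// build/output path is environment-specific):
+//
+//   ./perf-async-msgr --addr v2:127.0.0.1:9010 --bs 4096
+//
+// perf_crimson_msgr.cc below provides the seastar/crimson counterpart with
+// both client and server modes, so the two messenger stacks can be compared.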
diff --git a/src/crimson/tools/perf_crimson_msgr.cc b/src/crimson/tools/perf_crimson_msgr.cc new file mode 100644 index 000000000..aa5753442 --- /dev/null +++ b/src/crimson/tools/perf_crimson_msgr.cc @@ -0,0 +1,1222 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <map> +#include <boost/program_options.hpp> +#include <boost/iterator/counting_iterator.hpp> + +#include <seastar/core/app-template.hh> +#include <seastar/core/do_with.hh> +#include <seastar/core/future-util.hh> +#include <seastar/core/lowres_clock.hh> +#include <seastar/core/reactor.hh> +#include <seastar/core/sleep.hh> +#include <seastar/core/semaphore.hh> +#include <seastar/core/smp.hh> +#include <seastar/core/thread.hh> + +#include "common/ceph_time.h" +#include "messages/MOSDOp.h" +#include "include/random.h" + +#include "crimson/auth/DummyAuth.h" +#include "crimson/common/log.h" +#include "crimson/common/config_proxy.h" +#include "crimson/net/Connection.h" +#include "crimson/net/Dispatcher.h" +#include "crimson/net/Messenger.h" +#include "crimson/osd/stop_signal.h" + +using namespace std; +using namespace std::chrono_literals; + +using lowres_clock_t = seastar::lowres_system_clock; + +namespace bpo = boost::program_options; + +namespace { + +template<typename Message> +using Ref = boost::intrusive_ptr<Message>; + +seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_ms); +} + +template <typename T, typename... Args> +seastar::future<T*> create_sharded(Args... args) { + // seems we should only construct/stop shards on #0 + return seastar::smp::submit_to(0, [=] { + auto sharded_obj = seastar::make_lw_shared<seastar::sharded<T>>(); + return sharded_obj->start(args...).then([sharded_obj]() { + seastar::engine().at_exit([sharded_obj]() { + return sharded_obj->stop().then([sharded_obj] {}); + }); + return sharded_obj.get(); + }); + }).then([] (seastar::sharded<T> *ptr_shard) { + // return the pointer valid for the caller CPU + return &ptr_shard->local(); + }); +} + +double get_reactor_utilization() { + auto &value_map = seastar::metrics::impl::get_value_map(); + auto found = value_map.find("reactor_utilization"); + assert(found != value_map.end()); + auto &[full_name, metric_family] = *found; + std::ignore = full_name; + assert(metric_family.size() == 1); + const auto& [labels, metric] = *metric_family.begin(); + std::ignore = labels; + auto value = (*metric)(); + return value.ui(); +} + +enum class perf_mode_t { + both, + client, + server +}; + +struct client_config { + entity_addr_t server_addr; + unsigned block_size; + unsigned ramptime; + unsigned msgtime; + unsigned num_clients; + unsigned num_conns; + unsigned depth; + bool skip_core_0; + + std::string str() const { + std::ostringstream out; + out << "client[>> " << server_addr + << "](bs=" << block_size + << ", ramptime=" << ramptime + << ", msgtime=" << msgtime + << ", num_clients=" << num_clients + << ", num_conns=" << num_conns + << ", depth=" << depth + << ", skip_core_0=" << skip_core_0 + << ")"; + return out.str(); + } + + static client_config load(bpo::variables_map& options) { + client_config conf; + entity_addr_t addr; + ceph_assert(addr.parse(options["server-addr"].as<std::string>().c_str(), nullptr)); + ceph_assert_always(addr.is_msgr2()); + + conf.server_addr = addr; + conf.block_size = options["client-bs"].as<unsigned>(); + conf.ramptime = options["ramptime"].as<unsigned>(); + conf.msgtime = options["msgtime"].as<unsigned>(); + conf.num_clients = 
options["clients"].as<unsigned>(); + ceph_assert_always(conf.num_clients > 0); + conf.num_conns = options["conns-per-client"].as<unsigned>(); + ceph_assert_always(conf.num_conns > 0); + conf.depth = options["depth"].as<unsigned>(); + conf.skip_core_0 = options["client-skip-core-0"].as<bool>(); + return conf; + } +}; + +struct server_config { + entity_addr_t addr; + unsigned block_size; + bool is_fixed_cpu; + unsigned core; + + std::string str() const { + std::ostringstream out; + out << "server[" << addr + << "](bs=" << block_size + << ", is_fixed_cpu=" << is_fixed_cpu + << ", core=" << core + << ")"; + return out.str(); + } + + static server_config load(bpo::variables_map& options) { + server_config conf; + entity_addr_t addr; + ceph_assert(addr.parse(options["server-addr"].as<std::string>().c_str(), nullptr)); + ceph_assert_always(addr.is_msgr2()); + + conf.addr = addr; + conf.block_size = options["server-bs"].as<unsigned>(); + conf.is_fixed_cpu = options["server-fixed-cpu"].as<bool>(); + conf.core = options["server-core"].as<unsigned>(); + return conf; + } +}; + +const unsigned SAMPLE_RATE = 256; + +static seastar::future<> run( + perf_mode_t mode, + const client_config& client_conf, + const server_config& server_conf, + bool crc_enabled) +{ + struct test_state { + struct Server final + : public crimson::net::Dispatcher, + public seastar::peering_sharded_service<Server> { + // available only in msgr_sid + crimson::net::MessengerRef msgr; + crimson::auth::DummyAuthClientServer dummy_auth; + const seastar::shard_id msgr_sid; + std::string lname; + + bool is_fixed_cpu = true; + bool is_stopped = false; + std::optional<seastar::future<>> fut_report; + + unsigned conn_count = 0; + unsigned msg_count = 0; + MessageRef last_msg; + + // available in all shards + unsigned msg_len; + bufferlist msg_data; + + Server(seastar::shard_id msgr_sid, unsigned msg_len, bool needs_report) + : msgr_sid{msgr_sid}, + msg_len{msg_len} { + lname = fmt::format("server@{}", msgr_sid); + msg_data.append_zero(msg_len); + + if (seastar::this_shard_id() == msgr_sid && + needs_report) { + start_report(); + } + } + + void ms_handle_connect( + crimson::net::ConnectionRef, + seastar::shard_id) override { + ceph_abort("impossible, server won't connect"); + } + + void ms_handle_accept( + crimson::net::ConnectionRef, + seastar::shard_id new_shard, + bool is_replace) override { + ceph_assert_always(new_shard == seastar::this_shard_id()); + auto &server = container().local(); + ++server.conn_count; + } + + void ms_handle_reset( + crimson::net::ConnectionRef, + bool) override { + auto &server = container().local(); + --server.conn_count; + } + + std::optional<seastar::future<>> ms_dispatch( + crimson::net::ConnectionRef c, MessageRef m) override { + assert(c->get_shard_id() == seastar::this_shard_id()); + ceph_assert(m->get_type() == CEPH_MSG_OSD_OP); + + auto &server = container().local(); + + // server replies with MOSDOp to generate server-side write workload + const static pg_t pgid; + const static object_locator_t oloc; + const static hobject_t hobj(object_t(), oloc.key, CEPH_NOSNAP, pgid.ps(), + pgid.pool(), oloc.nspace); + static spg_t spgid(pgid); + auto rep = crimson::make_message<MOSDOp>(0, 0, hobj, spgid, 0, 0, 0); + bufferlist data(server.msg_data); + rep->write(0, server.msg_len, data); + rep->set_tid(m->get_tid()); + ++server.msg_count; + std::ignore = c->send(std::move(rep)); + + if (server.msg_count % 16 == 0) { + server.last_msg = std::move(m); + } + return {seastar::now()}; + } + + seastar::future<> 
init(const entity_addr_t& addr, bool is_fixed_cpu) { + return container().invoke_on( + msgr_sid, [addr, is_fixed_cpu](auto &server) { + // server msgr is always with nonce 0 + server.msgr = crimson::net::Messenger::create( + entity_name_t::OSD(server.msgr_sid), + server.lname, 0, is_fixed_cpu); + server.msgr->set_default_policy(crimson::net::SocketPolicy::stateless_server(0)); + server.msgr->set_auth_client(&server.dummy_auth); + server.msgr->set_auth_server(&server.dummy_auth); + server.is_fixed_cpu = is_fixed_cpu; + return server.msgr->bind(entity_addrvec_t{addr} + ).safe_then([&server] { + return server.msgr->start({&server}); + }, crimson::net::Messenger::bind_ertr::all_same_way( + [addr] (const std::error_code& e) { + logger().error("Server: " + "there is another instance running at {}", addr); + ceph_abort(); + })); + }); + } + + seastar::future<> shutdown() { + logger().info("{} shutdown...", lname); + return container().invoke_on( + msgr_sid, [](auto &server) { + server.is_stopped = true; + ceph_assert(server.msgr); + server.msgr->stop(); + return server.msgr->shutdown( + ).then([&server] { + if (server.fut_report.has_value()) { + return std::move(server.fut_report.value()); + } else { + return seastar::now(); + } + }); + }); + } + + private: + struct ShardReport { + unsigned msg_count = 0; + + // per-interval metrics + double reactor_utilization; + unsigned conn_count = 0; + int msg_size = 0; + unsigned msg_count_interval = 0; + }; + + // should not be called frequently to impact performance + void get_report(ShardReport& last) { + unsigned last_msg_count = last.msg_count; + int msg_size = -1; + if (last_msg) { + auto msg = boost::static_pointer_cast<MOSDOp>(last_msg); + msg->finish_decode(); + ceph_assert_always(msg->ops.size() == 1); + msg_size = msg->ops[0].op.extent.length; + last_msg.reset(); + } + + last.msg_count = msg_count; + last.reactor_utilization = get_reactor_utilization(); + last.conn_count = conn_count; + last.msg_size = msg_size; + last.msg_count_interval = msg_count - last_msg_count; + } + + struct TimerReport { + unsigned elapsed = 0u; + mono_time start_time = mono_clock::zero(); + std::vector<ShardReport> reports; + + TimerReport(unsigned shards) : reports(shards) {} + }; + + void start_report() { + seastar::promise<> pr_report; + fut_report = pr_report.get_future(); + seastar::do_with( + TimerReport(seastar::smp::count), + [this](auto &report) { + return seastar::do_until( + [this] { return is_stopped; }, + [&report, this] { + return seastar::sleep(2s + ).then([&report, this] { + report.elapsed += 2; + if (is_fixed_cpu) { + return seastar::smp::submit_to(msgr_sid, + [&report, this] { + auto &server = container().local(); + server.get_report(report.reports[seastar::this_shard_id()]); + }).then([&report, this] { + auto now = mono_clock::now(); + auto prv = report.start_time; + report.start_time = now; + if (prv == mono_clock::zero()) { + // cannot compute duration + return; + } + std::chrono::duration<double> duration_d = now - prv; + double duration = duration_d.count(); + auto &ireport = report.reports[msgr_sid]; + double iops = ireport.msg_count_interval / duration; + double throughput_MB = -1; + if (ireport.msg_size >= 0) { + throughput_MB = iops * ireport.msg_size / 1048576; + } + std::ostringstream sout; + sout << setfill(' ') + << report.elapsed + << "(" << std::setw(5) << duration << ") " + << std::setw(9) << iops << "IOPS " + << std::setw(8) << throughput_MB << "MiB/s " + << ireport.reactor_utilization + << "(" << ireport.conn_count << ")"; + std::cout 
<< sout.str() << std::endl; + }); + } else { + return seastar::smp::invoke_on_all([&report, this] { + auto &server = container().local(); + server.get_report(report.reports[seastar::this_shard_id()]); + }).then([&report, this] { + auto now = mono_clock::now(); + auto prv = report.start_time; + report.start_time = now; + if (prv == mono_clock::zero()) { + // cannot compute duration + return; + } + std::chrono::duration<double> duration_d = now - prv; + double duration = duration_d.count(); + unsigned num_msgs = 0; + // -1 means unavailable, -2 means mismatch + int msg_size = -1; + for (auto &i : report.reports) { + if (i.msg_size >= 0) { + if (msg_size == -2) { + // pass + } else if (msg_size == -1) { + msg_size = i.msg_size; + } else { + if (msg_size != i.msg_size) { + msg_size = -2; + } + } + } + num_msgs += i.msg_count_interval; + } + double iops = num_msgs / duration; + double throughput_MB = msg_size; + if (msg_size >= 0) { + throughput_MB = iops * msg_size / 1048576; + } + std::ostringstream sout; + sout << setfill(' ') + << report.elapsed + << "(" << std::setw(5) << duration << ") " + << std::setw(9) << iops << "IOPS " + << std::setw(8) << throughput_MB << "MiB/s "; + for (auto &i : report.reports) { + sout << i.reactor_utilization + << "(" << i.conn_count << ") "; + } + std::cout << sout.str() << std::endl; + }); + } + }); + } + ); + }).then([this] { + logger().info("report is stopped!"); + }).forward_to(std::move(pr_report)); + } + }; + + struct Client final + : public crimson::net::Dispatcher, + public seastar::peering_sharded_service<Client> { + + struct ConnStats { + mono_time connecting_time = mono_clock::zero(); + mono_time connected_time = mono_clock::zero(); + unsigned received_count = 0u; + + mono_time start_time = mono_clock::zero(); + unsigned start_count = 0u; + + unsigned sampled_count = 0u; + double sampled_total_lat_s = 0.0; + + // for reporting only + mono_time finish_time = mono_clock::zero(); + + void start_connecting() { + connecting_time = mono_clock::now(); + } + + void finish_connecting() { + ceph_assert_always(connected_time == mono_clock::zero()); + connected_time = mono_clock::now(); + } + + void start_collect() { + ceph_assert_always(connected_time != mono_clock::zero()); + start_time = mono_clock::now(); + start_count = received_count; + sampled_count = 0u; + sampled_total_lat_s = 0.0; + finish_time = mono_clock::zero(); + } + + void prepare_summary(const ConnStats ¤t) { + *this = current; + finish_time = mono_clock::now(); + } + }; + + struct PeriodStats { + mono_time start_time = mono_clock::zero(); + unsigned start_count = 0u; + unsigned sampled_count = 0u; + double sampled_total_lat_s = 0.0; + + // for reporting only + mono_time finish_time = mono_clock::zero(); + unsigned finish_count = 0u; + unsigned depth = 0u; + + void start_collect(unsigned received_count) { + start_time = mono_clock::now(); + start_count = received_count; + sampled_count = 0u; + sampled_total_lat_s = 0.0; + } + + void reset_period( + unsigned received_count, unsigned _depth, PeriodStats &snapshot) { + snapshot.start_time = start_time; + snapshot.start_count = start_count; + snapshot.sampled_count = sampled_count; + snapshot.sampled_total_lat_s = sampled_total_lat_s; + snapshot.finish_time = mono_clock::now(); + snapshot.finish_count = received_count; + snapshot.depth = _depth; + + start_collect(received_count); + } + }; + + struct JobReport { + std::string name; + unsigned depth = 0; + double connect_time_s = 0; + unsigned total_msgs = 0; + double messaging_time_s = 0; + double 
latency_ms = 0; + double iops = 0; + double throughput_mbps = 0; + + void account(const JobReport &stats) { + depth += stats.depth; + connect_time_s += stats.connect_time_s; + total_msgs += stats.total_msgs; + messaging_time_s += stats.messaging_time_s; + latency_ms += stats.latency_ms; + iops += stats.iops; + throughput_mbps += stats.throughput_mbps; + } + + void report() const { + auto str = fmt::format( + "{}(depth={}):\n" + " connect time: {:08f}s\n" + " messages received: {}\n" + " messaging time: {:08f}s\n" + " latency: {:08f}ms\n" + " IOPS: {:08f}\n" + " out throughput: {:08f}MB/s", + name, depth, connect_time_s, + total_msgs, messaging_time_s, + latency_ms, iops, + throughput_mbps); + std::cout << str << std::endl; + } + }; + + struct ConnectionPriv : public crimson::net::Connection::user_private_t { + unsigned index; + ConnectionPriv(unsigned i) : index{i} {} + }; + + struct ConnState { + crimson::net::MessengerRef msgr; + ConnStats conn_stats; + PeriodStats period_stats; + seastar::semaphore depth; + std::vector<lowres_clock_t::time_point> time_msgs_sent; + unsigned sent_count = 0u; + crimson::net::ConnectionRef active_conn; + bool stop_send = false; + seastar::promise<JobReport> stopped_send_promise; + + ConnState(std::size_t _depth) + : depth{_depth}, + time_msgs_sent{_depth, lowres_clock_t::time_point::min()} {} + + unsigned get_current_units() const { + ceph_assert(depth.available_units() >= 0); + return depth.current(); + } + + seastar::future<JobReport> stop_dispatch_messages() { + stop_send = true; + depth.broken(DepthBroken()); + return stopped_send_promise.get_future(); + } + }; + + const seastar::shard_id sid; + const unsigned id; + const std::optional<unsigned> server_sid; + + const unsigned num_clients; + const unsigned num_conns; + const unsigned msg_len; + bufferlist msg_data; + const unsigned nr_depth; + const unsigned nonce_base; + crimson::auth::DummyAuthClientServer dummy_auth; + + std::vector<ConnState> conn_states; + + Client(unsigned num_clients, + unsigned num_conns, + unsigned msg_len, + unsigned _depth, + unsigned nonce_base, + std::optional<unsigned> server_sid) + : sid{seastar::this_shard_id()}, + id{sid + num_clients - seastar::smp::count}, + server_sid{server_sid}, + num_clients{num_clients}, + num_conns{num_conns}, + msg_len{msg_len}, + nr_depth{_depth}, + nonce_base{nonce_base} { + if (is_active()) { + for (unsigned i = 0; i < num_conns; ++i) { + conn_states.emplace_back(nr_depth); + } + } + msg_data.append_zero(msg_len); + } + + std::string get_name(unsigned i) { + return fmt::format("client{}Conn{}@{}", id, i, sid); + } + + void ms_handle_connect( + crimson::net::ConnectionRef conn, + seastar::shard_id prv_shard) override { + ceph_assert_always(prv_shard == seastar::this_shard_id()); + assert(is_active()); + unsigned index = static_cast<ConnectionPriv&>(conn->get_user_private()).index; + auto &conn_state = conn_states[index]; + conn_state.conn_stats.finish_connecting(); + } + + std::optional<seastar::future<>> ms_dispatch( + crimson::net::ConnectionRef conn, MessageRef m) override { + assert(is_active()); + // server replies with MOSDOp to generate server-side write workload + ceph_assert(m->get_type() == CEPH_MSG_OSD_OP); + + unsigned index = static_cast<ConnectionPriv&>(conn->get_user_private()).index; + assert(index < num_conns); + auto &conn_state = conn_states[index]; + + auto msg_id = m->get_tid(); + if (msg_id % SAMPLE_RATE == 0) { + auto msg_index = msg_id % conn_state.time_msgs_sent.size(); + 
ceph_assert(conn_state.time_msgs_sent[msg_index] != + lowres_clock_t::time_point::min()); + std::chrono::duration<double> cur_latency = + lowres_clock_t::now() - conn_state.time_msgs_sent[msg_index]; + conn_state.conn_stats.sampled_total_lat_s += cur_latency.count(); + ++(conn_state.conn_stats.sampled_count); + conn_state.period_stats.sampled_total_lat_s += cur_latency.count(); + ++(conn_state.period_stats.sampled_count); + conn_state.time_msgs_sent[msg_index] = lowres_clock_t::time_point::min(); + } + + ++(conn_state.conn_stats.received_count); + conn_state.depth.signal(1); + + return {seastar::now()}; + } + + // should start messenger at this shard? + bool is_active() { + ceph_assert(seastar::this_shard_id() == sid); + return sid + num_clients >= seastar::smp::count; + } + + seastar::future<> init() { + return container().invoke_on_all([](auto& client) { + if (client.is_active()) { + return seastar::do_for_each( + boost::make_counting_iterator(0u), + boost::make_counting_iterator(client.num_conns), + [&client](auto i) { + auto &conn_state = client.conn_states[i]; + std::string name = client.get_name(i); + conn_state.msgr = crimson::net::Messenger::create( + entity_name_t::OSD(client.id * client.num_conns + i), + name, client.nonce_base + client.id * client.num_conns + i, true); + conn_state.msgr->set_default_policy(crimson::net::SocketPolicy::lossy_client(0)); + conn_state.msgr->set_auth_client(&client.dummy_auth); + conn_state.msgr->set_auth_server(&client.dummy_auth); + return conn_state.msgr->start({&client}); + }); + } + return seastar::now(); + }); + } + + seastar::future<> shutdown() { + return seastar::do_with( + std::vector<JobReport>(num_clients * num_conns), + [this](auto &all_stats) { + return container().invoke_on_all([&all_stats](auto& client) { + if (!client.is_active()) { + return seastar::now(); + } + + return seastar::parallel_for_each( + boost::make_counting_iterator(0u), + boost::make_counting_iterator(client.num_conns), + [&all_stats, &client](auto i) { + logger().info("{} shutdown...", client.get_name(i)); + auto &conn_state = client.conn_states[i]; + return conn_state.stop_dispatch_messages( + ).then([&all_stats, &client, i](auto stats) { + all_stats[client.id * client.num_conns + i] = stats; + }); + }).then([&client] { + return seastar::do_for_each( + boost::make_counting_iterator(0u), + boost::make_counting_iterator(client.num_conns), + [&client](auto i) { + auto &conn_state = client.conn_states[i]; + ceph_assert(conn_state.msgr); + conn_state.msgr->stop(); + return conn_state.msgr->shutdown(); + }); + }); + }).then([&all_stats, this] { + auto nr_jobs = all_stats.size(); + JobReport summary; + std::vector<JobReport> clients(num_clients); + + for (unsigned i = 0; i < nr_jobs; ++i) { + auto &stats = all_stats[i]; + stats.report(); + clients[i / num_conns].account(stats); + summary.account(stats); + } + + std::cout << std::endl; + std::cout << "per client:" << std::endl; + for (unsigned i = 0; i < num_clients; ++i) { + auto &stats = clients[i]; + stats.name = fmt::format("client{}", i); + stats.connect_time_s /= num_conns; + stats.messaging_time_s /= num_conns; + stats.latency_ms /= num_conns; + stats.report(); + } + + std::cout << std::endl; + summary.name = fmt::format("all", nr_jobs); + summary.connect_time_s /= nr_jobs; + summary.messaging_time_s /= nr_jobs; + summary.latency_ms /= nr_jobs; + summary.report(); + }); + }); + } + + seastar::future<> connect_wait_verify(const entity_addr_t& peer_addr) { + return container().invoke_on_all([peer_addr](auto& client) { 
+ // start clients in active cores + if (client.is_active()) { + for (unsigned i = 0; i < client.num_conns; ++i) { + auto &conn_state = client.conn_states[i]; + conn_state.conn_stats.start_connecting(); + conn_state.active_conn = conn_state.msgr->connect(peer_addr, entity_name_t::TYPE_OSD); + conn_state.active_conn->set_user_private( + std::make_unique<ConnectionPriv>(i)); + } + // make sure handshake won't hurt the performance + return seastar::sleep(1s).then([&client] { + for (unsigned i = 0; i < client.num_conns; ++i) { + auto &conn_state = client.conn_states[i]; + if (conn_state.conn_stats.connected_time == mono_clock::zero()) { + logger().error("\n{} not connected after 1s!\n", + client.get_name(i)); + ceph_assert(false); + } + } + }); + } + return seastar::now(); + }); + } + + private: + class TimerReport { + private: + const unsigned num_clients; + const unsigned num_conns; + const unsigned msgtime; + const unsigned bytes_of_block; + + unsigned elapsed = 0u; + std::vector<PeriodStats> snaps; + std::vector<ConnStats> summaries; + std::vector<double> client_reactor_utilizations; + std::optional<double> server_reactor_utilization; + + public: + TimerReport(unsigned num_clients, unsigned num_conns, unsigned msgtime, unsigned bs) + : num_clients{num_clients}, + num_conns{num_conns}, + msgtime{msgtime}, + bytes_of_block{bs}, + snaps{num_clients * num_conns}, + summaries{num_clients * num_conns}, + client_reactor_utilizations(num_clients) {} + + unsigned get_elapsed() const { return elapsed; } + + PeriodStats& get_snap(unsigned client_id, unsigned i) { + return snaps[client_id * num_conns + i]; + } + + ConnStats& get_summary(unsigned client_id, unsigned i) { + return summaries[client_id * num_conns + i]; + } + + void set_client_reactor_utilization(unsigned client_id, double ru) { + client_reactor_utilizations[client_id] = ru; + } + + void set_server_reactor_utilization(double ru) { + server_reactor_utilization = ru; + } + + bool should_stop() const { + return elapsed >= msgtime; + } + + seastar::future<> ticktock() { + return seastar::sleep(1s).then([this] { + ++elapsed; + }); + } + + void report_header() const { + std::ostringstream sout; + sout << std::setfill(' ') + << std::setw(6) << "sec" + << std::setw(7) << "depth" + << std::setw(10) << "IOPS" + << std::setw(9) << "MB/s" + << std::setw(9) << "lat(ms)"; + std::cout << sout.str() << std::endl; + } + + void report_period() { + std::chrono::duration<double> elapsed_d = 0s; + unsigned depth = 0u; + unsigned ops = 0u; + unsigned sampled_count = 0u; + double sampled_total_lat_s = 0.0; + for (const auto& snap: snaps) { + elapsed_d += (snap.finish_time - snap.start_time); + depth += snap.depth; + ops += (snap.finish_count - snap.start_count); + sampled_count += snap.sampled_count; + sampled_total_lat_s += snap.sampled_total_lat_s; + } + double elapsed_s = elapsed_d.count() / (num_clients * num_conns); + double iops = ops/elapsed_s; + std::ostringstream sout; + sout << setfill(' ') + << std::setw(5) << elapsed_s + << " " + << std::setw(6) << depth + << " " + << std::setw(9) << iops + << " " + << std::setw(8) << iops * bytes_of_block / 1048576 + << " " + << std::setw(8) << (sampled_total_lat_s / sampled_count * 1000) + << " -- "; + if (server_reactor_utilization.has_value()) { + sout << *server_reactor_utilization << " -- "; + } + for (double cru : client_reactor_utilizations) { + sout << cru << ","; + } + std::cout << sout.str() << std::endl; + } + + void report_summary() const { + std::chrono::duration<double> elapsed_d = 0s; + unsigned ops 
= 0u; + unsigned sampled_count = 0u; + double sampled_total_lat_s = 0.0; + for (const auto& summary: summaries) { + elapsed_d += (summary.finish_time - summary.start_time); + ops += (summary.received_count - summary.start_count); + sampled_count += summary.sampled_count; + sampled_total_lat_s += summary.sampled_total_lat_s; + } + double elapsed_s = elapsed_d.count() / (num_clients * num_conns); + double iops = ops / elapsed_s; + std::ostringstream sout; + sout << "--------------" + << " summary " + << "--------------\n" + << setfill(' ') + << std::setw(7) << elapsed_s + << std::setw(6) << "-" + << std::setw(8) << iops + << std::setw(8) << iops * bytes_of_block / 1048576 + << std::setw(8) << (sampled_total_lat_s / sampled_count * 1000) + << "\n"; + std::cout << sout.str() << std::endl; + } + }; + + seastar::future<> report_period(TimerReport& report) { + return container().invoke_on_all([&report] (auto& client) { + if (client.is_active()) { + for (unsigned i = 0; i < client.num_conns; ++i) { + auto &conn_state = client.conn_states[i]; + PeriodStats& snap = report.get_snap(client.id, i); + conn_state.period_stats.reset_period( + conn_state.conn_stats.received_count, + client.nr_depth - conn_state.get_current_units(), + snap); + } + report.set_client_reactor_utilization(client.id, get_reactor_utilization()); + } + if (client.server_sid.has_value() && + seastar::this_shard_id() == *client.server_sid) { + assert(!client.is_active()); + report.set_server_reactor_utilization(get_reactor_utilization()); + } + }).then([&report] { + report.report_period(); + }); + } + + seastar::future<> report_summary(TimerReport& report) { + return container().invoke_on_all([&report] (auto& client) { + if (client.is_active()) { + for (unsigned i = 0; i < client.num_conns; ++i) { + auto &conn_state = client.conn_states[i]; + ConnStats& summary = report.get_summary(client.id, i); + summary.prepare_summary(conn_state.conn_stats); + } + } + }).then([&report] { + report.report_summary(); + }); + } + + public: + seastar::future<> dispatch_with_timer(unsigned ramptime, unsigned msgtime) { + logger().info("[all clients]: start sending MOSDOps from {} clients * {} conns", + num_clients, num_conns); + return container().invoke_on_all([] (auto& client) { + if (client.is_active()) { + for (unsigned i = 0; i < client.num_conns; ++i) { + client.do_dispatch_messages(i); + } + } + }).then([ramptime] { + logger().info("[all clients]: ramping up {} seconds...", ramptime); + return seastar::sleep(std::chrono::seconds(ramptime)); + }).then([this] { + return container().invoke_on_all([] (auto& client) { + if (client.is_active()) { + for (unsigned i = 0; i < client.num_conns; ++i) { + auto &conn_state = client.conn_states[i]; + conn_state.conn_stats.start_collect(); + conn_state.period_stats.start_collect(conn_state.conn_stats.received_count); + } + } + }); + }).then([this, msgtime] { + logger().info("[all clients]: reporting {} seconds...\n", msgtime); + return seastar::do_with( + TimerReport(num_clients, num_conns, msgtime, msg_len), + [this](auto& report) { + report.report_header(); + return seastar::do_until( + [&report] { return report.should_stop(); }, + [&report, this] { + return report.ticktock().then([&report, this] { + // report period every 1s + return report_period(report); + }).then([&report, this] { + // report summary every 10s + if (report.get_elapsed() % 10 == 0) { + return report_summary(report); + } else { + return seastar::now(); + } + }); + } + ).then([&report, this] { + // report the final summary + if 
(report.get_elapsed() % 10 != 0) { + return report_summary(report); + } else { + return seastar::now(); + } + }); + }); + }); + } + + private: + seastar::future<> send_msg(ConnState &conn_state) { + ceph_assert(seastar::this_shard_id() == sid); + conn_state.sent_count += 1; + return conn_state.depth.wait(1 + ).then([this, &conn_state] { + const static pg_t pgid; + const static object_locator_t oloc; + const static hobject_t hobj(object_t(), oloc.key, CEPH_NOSNAP, pgid.ps(), + pgid.pool(), oloc.nspace); + static spg_t spgid(pgid); + auto m = crimson::make_message<MOSDOp>(0, 0, hobj, spgid, 0, 0, 0); + bufferlist data(msg_data); + m->write(0, msg_len, data); + // use tid as the identity of each round + m->set_tid(conn_state.sent_count); + + // sample message latency + if (unlikely(conn_state.sent_count % SAMPLE_RATE == 0)) { + auto index = conn_state.sent_count % conn_state.time_msgs_sent.size(); + ceph_assert(conn_state.time_msgs_sent[index] == + lowres_clock_t::time_point::min()); + conn_state.time_msgs_sent[index] = lowres_clock_t::now(); + } + + return conn_state.active_conn->send(std::move(m)); + }); + } + + class DepthBroken: public std::exception {}; + + seastar::future<JobReport> stop_dispatch_messages(unsigned i) { + auto &conn_state = conn_states[i]; + conn_state.stop_send = true; + conn_state.depth.broken(DepthBroken()); + return conn_state.stopped_send_promise.get_future(); + } + + void do_dispatch_messages(unsigned i) { + ceph_assert(seastar::this_shard_id() == sid); + auto &conn_state = conn_states[i]; + ceph_assert(conn_state.sent_count == 0); + conn_state.conn_stats.start_time = mono_clock::now(); + // forwarded to stopped_send_promise + (void) seastar::do_until( + [&conn_state] { return conn_state.stop_send; }, + [this, &conn_state] { return send_msg(conn_state); } + ).handle_exception_type([] (const DepthBroken& e) { + // ok, stopped by stop_dispatch_messages() + }).then([this, &conn_state, i] { + std::string name = get_name(i); + logger().info("{} {}: stopped sending OSDOPs", + name, *conn_state.active_conn); + + std::chrono::duration<double> dur_conn = + conn_state.conn_stats.connected_time - + conn_state.conn_stats.connecting_time; + std::chrono::duration<double> dur_msg = + mono_clock::now() - conn_state.conn_stats.start_time; + unsigned ops = + conn_state.conn_stats.received_count - + conn_state.conn_stats.start_count; + + JobReport stats; + stats.name = name; + stats.depth = nr_depth; + stats.connect_time_s = dur_conn.count(); + stats.total_msgs = ops; + stats.messaging_time_s = dur_msg.count(); + stats.latency_ms = + conn_state.conn_stats.sampled_total_lat_s / + conn_state.conn_stats.sampled_count * 1000; + stats.iops = ops / dur_msg.count(); + stats.throughput_mbps = ops / dur_msg.count() * msg_len / 1048576; + + conn_state.stopped_send_promise.set_value(stats); + }); + } + }; + }; + + std::optional<unsigned> server_sid; + bool server_needs_report = false; + if (mode == perf_mode_t::both) { + ceph_assert(server_conf.is_fixed_cpu == true); + server_sid = server_conf.core; + } else if (mode == perf_mode_t::server) { + server_needs_report = true; + } + return seastar::when_all( + seastar::futurize_invoke([mode, server_conf, server_needs_report] { + if (mode == perf_mode_t::client) { + return seastar::make_ready_future<test_state::Server*>(nullptr); + } else { + return create_sharded<test_state::Server>( + server_conf.core, + server_conf.block_size, + server_needs_report); + } + }), + seastar::futurize_invoke([mode, client_conf, server_sid] { + if (mode == 
perf_mode_t::server) { + return seastar::make_ready_future<test_state::Client*>(nullptr); + } else { + unsigned nonce_base = ceph::util::generate_random_number<unsigned>(); + logger().info("client nonce_base={}", nonce_base); + return create_sharded<test_state::Client>( + client_conf.num_clients, + client_conf.num_conns, + client_conf.block_size, + client_conf.depth, + nonce_base, + server_sid); + } + }), + crimson::common::sharded_conf().start( + EntityName{}, std::string_view{"ceph"} + ).then([] { + return crimson::common::local_conf().start(); + }).then([crc_enabled] { + return crimson::common::local_conf().set_val( + "ms_crc_data", crc_enabled ? "true" : "false"); + }) + ).then([=](auto&& ret) { + auto server = std::move(std::get<0>(ret).get0()); + auto client = std::move(std::get<1>(ret).get0()); + // reserve core 0 for potentially better performance + if (mode == perf_mode_t::both) { + logger().info("\nperf settings:\n smp={}\n {}\n {}\n", + seastar::smp::count, client_conf.str(), server_conf.str()); + if (client_conf.skip_core_0) { + ceph_assert(seastar::smp::count > client_conf.num_clients); + } else { + ceph_assert(seastar::smp::count >= client_conf.num_clients); + } + ceph_assert(client_conf.num_clients > 0); + ceph_assert(seastar::smp::count > server_conf.core + client_conf.num_clients); + return seastar::when_all_succeed( + // it is not reasonable to allow server/client to shared cores for + // performance benchmarking purposes. + server->init(server_conf.addr, server_conf.is_fixed_cpu), + client->init() + ).then_unpack([client, addr = client_conf.server_addr] { + return client->connect_wait_verify(addr); + }).then([client, ramptime = client_conf.ramptime, + msgtime = client_conf.msgtime] { + return client->dispatch_with_timer(ramptime, msgtime); + }).then([client] { + return client->shutdown(); + }).then([server] { + return server->shutdown(); + }); + } else if (mode == perf_mode_t::client) { + logger().info("\nperf settings:\n smp={}\n {}\n", + seastar::smp::count, client_conf.str()); + if (client_conf.skip_core_0) { + ceph_assert(seastar::smp::count > client_conf.num_clients); + } else { + ceph_assert(seastar::smp::count >= client_conf.num_clients); + } + ceph_assert(client_conf.num_clients > 0); + return client->init( + ).then([client, addr = client_conf.server_addr] { + return client->connect_wait_verify(addr); + }).then([client, ramptime = client_conf.ramptime, + msgtime = client_conf.msgtime] { + return client->dispatch_with_timer(ramptime, msgtime); + }).then([client] { + return client->shutdown(); + }); + } else { // mode == perf_mode_t::server + ceph_assert(seastar::smp::count > server_conf.core); + logger().info("\nperf settings:\n smp={}\n {}\n", + seastar::smp::count, server_conf.str()); + return seastar::async([server, server_conf] { + // FIXME: SIGINT is not received by stop_signal + seastar_apps_lib::stop_signal should_stop; + server->init(server_conf.addr, server_conf.is_fixed_cpu).get(); + should_stop.wait().get(); + server->shutdown().get(); + }); + } + }).finally([] { + return crimson::common::sharded_conf().stop(); + }); +} + +} + +int main(int argc, char** argv) +{ + seastar::app_template app; + app.add_options() + ("mode", bpo::value<unsigned>()->default_value(0), + "0: both, 1:client, 2:server") + ("server-addr", bpo::value<std::string>()->default_value("v2:127.0.0.1:9010"), + "server address(only support msgr v2 protocol)") + ("ramptime", bpo::value<unsigned>()->default_value(5), + "seconds of client ramp-up time") + ("msgtime", 
bpo::value<unsigned>()->default_value(15), + "seconds of client messaging time") + ("clients", bpo::value<unsigned>()->default_value(1), + "number of client messengers") + ("conns-per-client", bpo::value<unsigned>()->default_value(1), + "number of connections per client") + ("client-bs", bpo::value<unsigned>()->default_value(4096), + "client block size") + ("depth", bpo::value<unsigned>()->default_value(512), + "client io depth per job") + ("client-skip-core-0", bpo::value<bool>()->default_value(true), + "client skip core 0") + ("server-fixed-cpu", bpo::value<bool>()->default_value(true), + "server is in the fixed cpu mode, non-fixed doesn't support the mode both") + ("server-core", bpo::value<unsigned>()->default_value(1), + "server messenger running core") + ("server-bs", bpo::value<unsigned>()->default_value(0), + "server block size") + ("crc-enabled", bpo::value<bool>()->default_value(false), + "enable CRC checks"); + return app.run(argc, argv, [&app] { + auto&& config = app.configuration(); + auto mode = config["mode"].as<unsigned>(); + ceph_assert(mode <= 2); + auto _mode = static_cast<perf_mode_t>(mode); + bool crc_enabled = config["crc-enabled"].as<bool>(); + auto server_conf = server_config::load(config); + auto client_conf = client_config::load(config); + return run(_mode, client_conf, server_conf, crc_enabled + ).then([] { + logger().info("\nsuccessful!\n"); + }).handle_exception([] (auto eptr) { + logger().info("\nfailed!\n"); + return seastar::make_exception_future<>(eptr); + }); + }); +} diff --git a/src/crimson/tools/perf_staged_fltree.cc b/src/crimson/tools/perf_staged_fltree.cc new file mode 100644 index 000000000..81b621750 --- /dev/null +++ b/src/crimson/tools/perf_staged_fltree.cc @@ -0,0 +1,178 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#include <boost/program_options.hpp> + +#include <seastar/core/app-template.hh> +#include <seastar/core/thread.hh> + +#include "crimson/common/config_proxy.h" +#include "crimson/common/log.h" +#include "crimson/common/perf_counters_collection.h" +#include "crimson/os/seastore/onode_manager/staged-fltree/tree_utils.h" +#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager.h" + +#include "test/crimson/seastore/onode_tree/test_value.h" +#include "test/crimson/seastore/transaction_manager_test_state.h" + +using namespace crimson::os::seastore::onode; +namespace bpo = boost::program_options; + +seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_test); +} + +template <bool TRACK> +class PerfTree : public TMTestState { + public: + PerfTree(bool is_dummy) : is_dummy{is_dummy} {} + + seastar::future<> run(KVPool<test_item_t>& kvs, double erase_ratio) { + return tm_setup().then([this, &kvs, erase_ratio] { + return seastar::async([this, &kvs, erase_ratio] { + auto tree = std::make_unique<TreeBuilder<TRACK, ExtendedValue>>(kvs, + (is_dummy ? NodeExtentManager::create_dummy(true) + : NodeExtentManager::create_seastore(*tm))); + { + auto t = create_mutate_transaction(); + with_trans_intr(*t, [&](auto &tr){ + return tree->bootstrap(tr); + }).unsafe_get(); + submit_transaction(std::move(t)); + } + { + auto t = create_mutate_transaction(); + with_trans_intr(*t, [&](auto &tr){ + return tree->insert(tr); + }).unsafe_get(); + auto start_time = mono_clock::now(); + submit_transaction(std::move(t)); + std::chrono::duration<double> duration = mono_clock::now() - start_time; + logger().warn("submit_transaction() done! 
{}s", duration.count()); + } + { + // Note: create_weak_transaction() can also work, but too slow. + auto t = create_read_transaction(); + with_trans_intr(*t, [&](auto &tr){ + return tree->get_stats(tr); + }).unsafe_get(); + + with_trans_intr(*t, [&](auto &tr){ + return tree->validate(tr); + }).unsafe_get(); + } + { + auto t = create_mutate_transaction(); + with_trans_intr(*t, [&](auto &tr){ + return tree->erase(tr, kvs.size() * erase_ratio); + }).unsafe_get(); + submit_transaction(std::move(t)); + } + { + auto t = create_read_transaction(); + with_trans_intr(*t, [&](auto &tr){ + return tree->get_stats(tr); + }).unsafe_get(); + + with_trans_intr(*t, [&](auto &tr){ + return tree->validate(tr); + }).unsafe_get(); + } + tree.reset(); + }); + }).then([this] { + return tm_teardown(); + }); + } + + private: + bool is_dummy; +}; + +template <bool TRACK> +seastar::future<> run(const bpo::variables_map& config) { + return seastar::async([&config] { + auto backend = config["backend"].as<std::string>(); + bool is_dummy; + if (backend == "dummy") { + is_dummy = true; + } else if (backend == "seastore") { + is_dummy = false; + } else { + ceph_abort(false && "invalid backend"); + } + auto ns_sizes = config["ns-sizes"].as<std::vector<size_t>>(); + auto oid_sizes = config["oid-sizes"].as<std::vector<size_t>>(); + auto onode_sizes = config["onode-sizes"].as<std::vector<size_t>>(); + auto range2 = config["range2"].as<std::vector<int>>(); + ceph_assert(range2.size() == 2); + auto range1 = config["range1"].as<std::vector<unsigned>>(); + ceph_assert(range1.size() == 2); + auto range0 = config["range0"].as<std::vector<unsigned>>(); + ceph_assert(range0.size() == 2); + auto erase_ratio = config["erase-ratio"].as<double>(); + ceph_assert(erase_ratio >= 0); + ceph_assert(erase_ratio <= 1); + + using crimson::common::sharded_conf; + sharded_conf().start(EntityName{}, std::string_view{"ceph"}).get(); + seastar::engine().at_exit([] { + return sharded_conf().stop(); + }); + + using crimson::common::sharded_perf_coll; + sharded_perf_coll().start().get(); + seastar::engine().at_exit([] { + return sharded_perf_coll().stop(); + }); + + auto kvs = KVPool<test_item_t>::create_raw_range( + ns_sizes, oid_sizes, onode_sizes, + {range2[0], range2[1]}, + {range1[0], range1[1]}, + {range0[0], range0[1]}); + PerfTree<TRACK> perf{is_dummy}; + perf.run(kvs, erase_ratio).get0(); + }); +} + + +int main(int argc, char** argv) +{ + seastar::app_template app; + app.add_options() + ("backend", bpo::value<std::string>()->default_value("dummy"), + "tree backend: dummy, seastore") + ("tracked", bpo::value<bool>()->default_value(false), + "track inserted cursors") + ("ns-sizes", bpo::value<std::vector<size_t>>()->default_value( + {8, 11, 64, 128, 255, 256}), + "sizes of ns strings") + ("oid-sizes", bpo::value<std::vector<size_t>>()->default_value( + {8, 13, 64, 512, 2035, 2048}), + "sizes of oid strings") + ("onode-sizes", bpo::value<std::vector<size_t>>()->default_value( + {8, 16, 128, 576, 992, 1200}), + "sizes of onode") + ("range2", bpo::value<std::vector<int>>()->default_value( + {0, 128}), + "range of shard-pool-crush [a, b)") + ("range1", bpo::value<std::vector<unsigned>>()->default_value( + {0, 10}), + "range of ns-oid strings [a, b)") + ("range0", bpo::value<std::vector<unsigned>>()->default_value( + {0, 4}), + "range of snap-gen [a, b)") + ("erase-ratio", bpo::value<double>()->default_value( + 0.8), + "erase-ratio of all the inserted onodes"); + return app.run(argc, argv, [&app] { + auto&& config = app.configuration(); + auto 
tracked = config["tracked"].as<bool>(); + if (tracked) { + return run<true>(config); + } else { + return run<false>(config); + } + }); +} diff --git a/src/crimson/tools/store_nbd/block_driver.cc b/src/crimson/tools/store_nbd/block_driver.cc new file mode 100644 index 000000000..10e77a34b --- /dev/null +++ b/src/crimson/tools/store_nbd/block_driver.cc @@ -0,0 +1,19 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "fs_driver.h" +#include "block_driver.h" + +#include "tm_driver.h" + +BlockDriverRef get_backend(BlockDriver::config_t config) +{ + if (config.type == "transaction_manager") { + return std::make_unique<TMDriver>(config); + } else if (config.is_futurized_store()) { + return std::make_unique<FSDriver>(config); + } else { + ceph_assert(0 == "invalid option"); + return BlockDriverRef(); + } +} diff --git a/src/crimson/tools/store_nbd/block_driver.h b/src/crimson/tools/store_nbd/block_driver.h new file mode 100644 index 000000000..ea3453ef3 --- /dev/null +++ b/src/crimson/tools/store_nbd/block_driver.h @@ -0,0 +1,134 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <boost/program_options/variables_map.hpp> +#include <boost/program_options/parsers.hpp> + +#include <seastar/core/future.hh> + +#include <string> +#include <optional> + +#include "include/buffer.h" + +/** + * BlockDriver + * + * Simple interface to enable throughput test to compare raw disk to + * transaction_manager, etc + */ +class BlockDriver { +public: + struct config_t { + std::string type; + bool mkfs = false; + unsigned num_pgs = 128; + unsigned log_size = 1000; + unsigned object_size = 4<<20 /* 4MB, rbd default */; + unsigned oi_size = 1<<9 /* 512b */; + unsigned log_entry_size = 1<<9 /* 512b */; + bool prepopulate_log = false; + std::optional<std::string> path; + + bool is_futurized_store() const { + return type == "seastore" || type == "bluestore"; + } + + std::string get_fs_type() const { + ceph_assert(is_futurized_store()); + return type; + } + + bool oi_enabled() const { + return oi_size > 0; + } + + bool log_enabled() const { + return log_entry_size > 0 && log_size > 0; + } + + bool prepopulate_log_enabled() const { + return prepopulate_log; + } + + void populate_options( + boost::program_options::options_description &desc) + { + namespace po = boost::program_options; + desc.add_options() + ("type", + po::value<std::string>() + ->default_value("transaction_manager") + ->notifier([this](auto s) { type = s; }), + "Backend to use, options are transaction_manager, seastore" + ) + ("device-path", + po::value<std::string>() + ->required() + ->notifier([this](auto s) { path = s; }), + "Path to device for backend" + ) + ("num-pgs", + po::value<unsigned>() + ->notifier([this](auto s) { num_pgs = s; }), + "Number of pgs to use for futurized_store backends" + ) + ("log-size", + po::value<unsigned>() + ->notifier([this](auto s) { log_size = s; }), + "Number of log entries per pg to use for futurized_store backends" + ", 0 to disable" + ) + ("log-entry-size", + po::value<unsigned>() + ->notifier([this](auto s) { log_entry_size = s; }), + "Size of each log entry per pg to use for futurized_store backends" + ", 0 to disable" + ) + ("prepopulate-log", + po::value<bool>() + ->notifier([this](auto s) { prepopulate_log = s; }), + "Prepopulate log on mount" + ) + ("object-info-size", + po::value<unsigned>() + ->notifier([this](auto s) { log_entry_size = s; }), + "Size of each log entry 
per pg to use for futurized_store backends" + ", 0 to disable" + ) + ("object-size", + po::value<unsigned>() + ->notifier([this](auto s) { object_size = s; }), + "Object size to use for futurized_store backends" + ) + ("mkfs", + po::value<bool>() + ->default_value(false) + ->notifier([this](auto s) { mkfs = s; }), + "Do mkfs first" + ); + } + }; + + virtual ceph::bufferptr get_buffer(size_t size) = 0; + + virtual seastar::future<> write( + off_t offset, + ceph::bufferptr ptr) = 0; + + virtual seastar::future<ceph::bufferlist> read( + off_t offset, + size_t size) = 0; + + virtual size_t get_size() const = 0; + + virtual seastar::future<> mount() = 0; + virtual seastar::future<> close() = 0; + + virtual ~BlockDriver() {} +}; +using BlockDriverRef = std::unique_ptr<BlockDriver>; + +BlockDriverRef get_backend(BlockDriver::config_t config); diff --git a/src/crimson/tools/store_nbd/fs_driver.cc b/src/crimson/tools/store_nbd/fs_driver.cc new file mode 100644 index 000000000..18f836766 --- /dev/null +++ b/src/crimson/tools/store_nbd/fs_driver.cc @@ -0,0 +1,310 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <boost/iterator/counting_iterator.hpp> +#include <fmt/format.h> + +#include "os/Transaction.h" +#include "fs_driver.h" + +using namespace crimson; +using namespace crimson::os; + +coll_t get_coll(unsigned num) { + return coll_t(spg_t(pg_t(0, num))); +} + +ghobject_t get_log_object(unsigned coll) +{ + return ghobject_t( + shard_id_t::NO_SHARD, + 0, + (coll << 16), + "", + "", + 0, + ghobject_t::NO_GEN); +} + +std::string make_log_key( + unsigned i) +{ + return fmt::format("log_entry_{}", i); +} + +void add_log_entry( + unsigned i, + unsigned entry_size, + std::map<std::string, ceph::buffer::list> *omap) +{ + assert(omap); + bufferlist bl; + bl.append(ceph::buffer::create('0', entry_size)); + + omap->emplace(std::make_pair(make_log_key(i), bl)); +} + +void populate_log( + ceph::os::Transaction &t, + FSDriver::pg_analogue_t &pg, + unsigned entry_size, + unsigned entries) +{ + t.touch(pg.collection->get_cid(), pg.log_object); + // omap_clear not yet implemented, TODO + // t.omap_clear(pg.collection->get_cid(), pg.log_object); + + std::map<std::string, ceph::buffer::list> omap; + for (unsigned i = 0; i < entries; ++i) { + add_log_entry(i, entry_size, &omap); + } + + t.omap_setkeys( + pg.collection->get_cid(), + pg.log_object, + omap); + + pg.log_head = entries; +} + +void update_log( + ceph::os::Transaction &t, + FSDriver::pg_analogue_t &pg, + unsigned entry_size, + unsigned entries) +{ + ++pg.log_head; + std::map<std::string, ceph::buffer::list> key; + add_log_entry(pg.log_head, entry_size, &key); + + t.omap_setkeys( + pg.collection->get_cid(), + pg.log_object, + key); + + + while ((pg.log_head - pg.log_tail) > entries) { + t.omap_rmkey( + pg.collection->get_cid(), + pg.log_object, + make_log_key(pg.log_tail)); + ++pg.log_tail; + } +} + +FSDriver::offset_mapping_t FSDriver::map_offset(off_t offset) +{ + uint32_t objid = offset / config.object_size; + uint32_t collid = objid % config.num_pgs; + return offset_mapping_t{ + collections[collid], + ghobject_t( + shard_id_t::NO_SHARD, + 0, + (collid << 16) | (objid + 1), + "", + "", + 0, + ghobject_t::NO_GEN), + offset % config.object_size + }; +} + +seastar::future<> FSDriver::write( + off_t offset, + bufferptr ptr) +{ + auto mapping = map_offset(offset); + ceph_assert(mapping.offset + ptr.length() <= config.object_size); + ceph::os::Transaction t; + bufferlist bl; + bl.append(ptr); + 
t.write( + mapping.pg.collection->get_cid(), + mapping.object, + mapping.offset, + ptr.length(), + bl, + 0); + + if (config.oi_enabled() ) { + bufferlist attr; + attr.append(ceph::buffer::create(config.oi_size, '0')); + t.setattr( + mapping.pg.collection->get_cid(), + mapping.object, + "_", + attr); + } + + if (config.log_enabled()) { + update_log( + t, + mapping.pg, + config.log_entry_size, + config.log_size); + } + + return sharded_fs->do_transaction( + mapping.pg.collection, + std::move(t)); +} + +seastar::future<bufferlist> FSDriver::read( + off_t offset, + size_t size) +{ + auto mapping = map_offset(offset); + ceph_assert((mapping.offset + size) <= config.object_size); + return sharded_fs->read( + mapping.pg.collection, + mapping.object, + mapping.offset, + size, + 0 + ).handle_error( + crimson::ct_error::enoent::handle([size](auto &e) { + bufferlist bl; + bl.append_zero(size); + return seastar::make_ready_future<bufferlist>(std::move(bl)); + }), + crimson::ct_error::assert_all{"Unrecoverable error in FSDriver::read"} + ).then([size](auto &&bl) { + if (bl.length() < size) { + bl.append_zero(size - bl.length()); + } + return seastar::make_ready_future<bufferlist>(std::move(bl)); + }); +} + +seastar::future<> FSDriver::mkfs() +{ + return init( + ).then([this] { + assert(fs); + uuid_d uuid; + uuid.generate_random(); + return fs->mkfs(uuid).handle_error( + crimson::stateful_ec::handle([] (const auto& ec) { + crimson::get_logger(ceph_subsys_test) + .error("error creating empty object store in {}: ({}) {}", + crimson::common::local_conf().get_val<std::string>("osd_data"), + ec.value(), ec.message()); + std::exit(EXIT_FAILURE); + })); + }).then([this] { + return fs->stop(); + }).then([this] { + return init(); + }).then([this] { + return fs->mount( + ).handle_error( + crimson::stateful_ec::handle([] (const auto& ec) { + crimson::get_logger( + ceph_subsys_test + ).error( + "error mounting object store in {}: ({}) {}", + crimson::common::local_conf().get_val<std::string>("osd_data"), + ec.value(), + ec.message()); + std::exit(EXIT_FAILURE); + })); + }).then([this] { + return seastar::do_for_each( + boost::counting_iterator<unsigned>(0), + boost::counting_iterator<unsigned>(config.num_pgs), + [this](auto i) { + return sharded_fs->create_new_collection(get_coll(i) + ).then([this, i](auto coll) { + ceph::os::Transaction t; + t.create_collection(get_coll(i), 0); + return sharded_fs->do_transaction(coll, std::move(t)); + }); + }); + }).then([this] { + return fs->umount(); + }).then([this] { + return fs->stop(); + }).then([this] { + fs.reset(); + return seastar::now(); + }); +} + +seastar::future<> FSDriver::mount() +{ + ceph_assert(config.path); + return ( + config.mkfs ? 
mkfs() : seastar::now() + ).then([this] { + return init(); + }).then([this] { + return fs->mount( + ).handle_error( + crimson::stateful_ec::handle([] (const auto& ec) { + crimson::get_logger( + ceph_subsys_test + ).error( + "error mounting object store in {}: ({}) {}", + crimson::common::local_conf().get_val<std::string>("osd_data"), + ec.value(), + ec.message()); + std::exit(EXIT_FAILURE); + })); + }).then([this] { + return seastar::do_for_each( + boost::counting_iterator<unsigned>(0), + boost::counting_iterator<unsigned>(config.num_pgs), + [this](auto i) { + return sharded_fs->open_collection(get_coll(i) + ).then([this, i](auto ref) { + collections[i].collection = ref; + collections[i].log_object = get_log_object(i); + if (config.log_enabled()) { + ceph::os::Transaction t; + if (config.prepopulate_log_enabled()) { + populate_log( + t, + collections[i], + config.log_entry_size, + config.log_size); + } + return sharded_fs->do_transaction( + collections[i].collection, + std::move(t)); + } else { + return seastar::now(); + } + }); + }); + }).then([this] { + return fs->stat(); + }).then([this](auto s) { + size = s.total; + }); +}; + +seastar::future<> FSDriver::close() +{ + collections.clear(); + return fs->umount( + ).then([this] { + return fs->stop(); + }).then([this] { + fs.reset(); + return seastar::now(); + }); +} + +seastar::future<> FSDriver::init() +{ + fs.reset(); + fs = FuturizedStore::create( + config.get_fs_type(), + *config.path, + crimson::common::local_conf().get_config_values() + ); + return fs->start().then([this] { + sharded_fs = &(fs->get_sharded_store()); + }); +} diff --git a/src/crimson/tools/store_nbd/fs_driver.h b/src/crimson/tools/store_nbd/fs_driver.h new file mode 100644 index 000000000..89aca075f --- /dev/null +++ b/src/crimson/tools/store_nbd/fs_driver.h @@ -0,0 +1,72 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "block_driver.h" + +#include "crimson/os/futurized_collection.h" +#include "crimson/os/futurized_store.h" + +class FSDriver final : public BlockDriver { +public: + FSDriver(config_t config) + : config(config) + {} + ~FSDriver() final {} + + bufferptr get_buffer(size_t size) final { + return ceph::buffer::create_page_aligned(size); + } + + seastar::future<> write( + off_t offset, + bufferptr ptr) final; + + seastar::future<bufferlist> read( + off_t offset, + size_t size) final; + + size_t get_size() const { + return size; + } + + seastar::future<> mount() final; + + seastar::future<> close() final; + +private: + size_t size = 0; + const config_t config; + std::unique_ptr<crimson::os::FuturizedStore> fs; + crimson::os::FuturizedStore::Shard* sharded_fs; + + struct pg_analogue_t { + crimson::os::CollectionRef collection; + + ghobject_t log_object; + unsigned log_tail = 0; + unsigned log_head = 0; + }; + std::map<unsigned, pg_analogue_t> collections; + + struct offset_mapping_t { + pg_analogue_t &pg; + ghobject_t object; + off_t offset; + }; + offset_mapping_t map_offset(off_t offset); + + seastar::future<> mkfs(); + seastar::future<> init(); + + friend void populate_log( + ceph::os::Transaction &, + pg_analogue_t &, + unsigned, + unsigned); + + friend void update_log( + ceph::os::Transaction &, + FSDriver::pg_analogue_t &, + unsigned, + unsigned); +}; diff --git a/src/crimson/tools/store_nbd/store-nbd.cc b/src/crimson/tools/store_nbd/store-nbd.cc new file mode 100644 index 000000000..9f80c3b2c --- /dev/null +++ b/src/crimson/tools/store_nbd/store-nbd.cc @@ -0,0 +1,456 @@ +// -*- mode:C++; 
tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- + +/** + * crimson-store-nbd + * + * This tool exposes crimson object store internals as an nbd server + * for use with fio in basic benchmarking. + * + * Example usage: + * + * $ ./bin/crimson-store-nbd --device-path /dev/nvme1n1 -c 1 --mkfs true --uds-path /tmp/store_nbd_socket.sock + * + * $ cat nbd.fio + * [global] + * ioengine=nbd + * uri=nbd+unix:///?socket=/tmp/store_nbd_socket.sock + * rw=randrw + * time_based + * runtime=120 + * group_reporting + * iodepth=1 + * size=500G + * + * [job0] + * offset=0 + * + * $ fio nbd.fio + */ + +#include <random> + +#include <boost/program_options/variables_map.hpp> +#include <boost/program_options/parsers.hpp> + +#include <linux/nbd.h> +#include <linux/fs.h> + +#include <seastar/apps/lib/stop_signal.hh> +#include <seastar/core/app-template.hh> +#include <seastar/core/byteorder.hh> +#include <seastar/core/future-util.hh> +#include <seastar/core/gate.hh> +#include <seastar/core/reactor.hh> +#include <seastar/core/rwlock.hh> +#include <seastar/core/thread.hh> +#include <seastar/util/defer.hh> + +#include "crimson/common/config_proxy.h" +#include "crimson/common/log.h" + +#include "block_driver.h" + +namespace po = boost::program_options; + +using namespace ceph; + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_test); + } +} + +struct request_context_t { + uint32_t magic = 0; + uint32_t type = 0; + + char handle[8] = {0}; + + uint64_t from = 0; + uint32_t len = 0; + + unsigned err = 0; + std::optional<bufferptr> in_buffer; + std::optional<bufferlist> out_buffer; + + using ref = std::unique_ptr<request_context_t>; + static ref make_ref() { + return std::make_unique<request_context_t>(); + } + + bool check_magic() const { + auto ret = magic == NBD_REQUEST_MAGIC; + if (!ret) { + logger().error( + "Invalid magic {} should be {}", + magic, + NBD_REQUEST_MAGIC); + } + return ret; + } + + uint32_t get_command() const { + return type & 0xff; + } + + bool has_input_buffer() const { + return get_command() == NBD_CMD_WRITE; + } + + seastar::future<> read_request(seastar::input_stream<char> &in) { + return in.read_exactly(sizeof(struct nbd_request) + ).then([this, &in](auto buf) { + if (buf.size() < sizeof(struct nbd_request)) { + throw std::system_error( + std::make_error_code( + std::errc::connection_reset)); + } + auto p = buf.get(); + magic = seastar::consume_be<uint32_t>(p); + type = seastar::consume_be<uint32_t>(p); + memcpy(handle, p, sizeof(handle)); + p += sizeof(handle); + from = seastar::consume_be<uint64_t>(p); + len = seastar::consume_be<uint32_t>(p); + logger().debug( + "Got request, magic {}, type {}, from {}, len {}", + magic, type, from, len); + + if (!check_magic()) { + throw std::system_error( + std::make_error_code( + std::errc::invalid_argument)); + } + + if (has_input_buffer()) { + return in.read_exactly(len).then([this](auto buf) { + in_buffer = ceph::buffer::create_page_aligned(len); + in_buffer->copy_in(0, len, buf.get()); + return seastar::now(); + }); + } else { + return seastar::now(); + } + }); + } + + seastar::future<> write_reply(seastar::output_stream<char> &out) { + seastar::temporary_buffer<char> buffer{sizeof(struct nbd_reply)}; + auto p = buffer.get_write(); + seastar::produce_be<uint32_t>(p, NBD_REPLY_MAGIC); + seastar::produce_be<uint32_t>(p, err); + logger().debug("write_reply writing err {}", err); + memcpy(p, handle, sizeof(handle)); + return out.write(std::move(buffer)).then([this, &out] { + if (out_buffer) { + return 
seastar::do_for_each( + out_buffer->mut_buffers(), + [&out](bufferptr &ptr) { + logger().debug("write_reply writing {}", ptr.length()); + return out.write( + seastar::temporary_buffer<char>( + ptr.c_str(), + ptr.length(), + seastar::make_deleter([ptr](){})) + ); + }); + } else { + return seastar::now(); + } + }).then([&out] { + return out.flush(); + }); + } +}; + +struct RequestWriter { + seastar::rwlock lock; + seastar::output_stream<char> stream; + seastar::gate gate; + + RequestWriter( + seastar::output_stream<char> &&stream) : stream(std::move(stream)) {} + RequestWriter(RequestWriter &&) = default; + + seastar::future<> complete(request_context_t::ref &&req) { + auto &request = *req; + return lock.write_lock( + ).then([&request, this] { + return request.write_reply(stream); + }).finally([&, this, req=std::move(req)] { + lock.write_unlock(); + logger().debug("complete"); + return seastar::now(); + }); + } + + seastar::future<> close() { + return gate.close().then([this] { + return stream.close(); + }); + } +}; + +/** + * NBDHandler + * + * Simple throughput test for concurrent, single threaded + * writes to an BlockDriver. + */ +class NBDHandler { + BlockDriver &backend; + std::string uds_path; + std::optional<seastar::server_socket> server_socket; + std::optional<seastar::connected_socket> connected_socket; + seastar::gate gate; +public: + struct config_t { + std::string uds_path; + + void populate_options( + po::options_description &desc) + { + desc.add_options() + ("uds-path", + po::value<std::string>() + ->default_value("/tmp/store_nbd_socket.sock") + ->notifier([this](auto s) { + uds_path = s; + }), + "Path to domain socket for nbd" + ); + } + }; + + NBDHandler( + BlockDriver &backend, + config_t config) : + backend(backend), + uds_path(config.uds_path) + {} + + void run(); + seastar::future<> stop(); +}; + +int main(int argc, char** argv) +{ + po::options_description desc{"Allowed options"}; + bool debug = false; + desc.add_options() + ("help,h", "show help message") + ("debug", po::value<bool>(&debug)->default_value(false), + "enable debugging"); + + po::options_description nbd_pattern_options{"NBD Pattern Options"}; + NBDHandler::config_t nbd_config; + nbd_config.populate_options(nbd_pattern_options); + desc.add(nbd_pattern_options); + + po::options_description backend_pattern_options{"Backend Options"}; + BlockDriver::config_t backend_config; + backend_config.populate_options(backend_pattern_options); + desc.add(backend_pattern_options); + + po::variables_map vm; + std::vector<std::string> unrecognized_options; + try { + auto parsed = po::command_line_parser(argc, argv) + .options(desc) + .allow_unregistered() + .run(); + po::store(parsed, vm); + if (vm.count("help")) { + std::cout << desc << std::endl; + return 0; + } + + po::notify(vm); + unrecognized_options = + po::collect_unrecognized(parsed.options, po::include_positional); + } catch(const po::error& e) { + std::cerr << "error: " << e.what() << std::endl; + return 1; + } + std::vector<const char*> args(argv, argv + argc); + + seastar::app_template::config app_cfg; + app_cfg.name = "crimson-store-nbd"; + app_cfg.auto_handle_sigint_sigterm = false; + seastar::app_template app(std::move(app_cfg)); + + std::vector<char*> av{argv[0]}; + std::transform(begin(unrecognized_options), + end(unrecognized_options), + std::back_inserter(av), + [](auto& s) { + return const_cast<char*>(s.c_str()); + }); + return app.run(av.size(), av.data(), [&] { + if (debug) { + seastar::global_logger_registry().set_all_loggers_level( + 
seastar::log_level::debug + ); + } + return seastar::async([&] { + seastar_apps_lib::stop_signal should_stop; + crimson::common::sharded_conf() + .start(EntityName{}, std::string_view{"ceph"}).get(); + auto stop_conf = seastar::defer([] { + crimson::common::sharded_conf().stop().get(); + }); + + auto backend = get_backend(backend_config); + NBDHandler nbd(*backend, nbd_config); + backend->mount().get(); + auto close_backend = seastar::defer([&] { + backend->close().get(); + }); + + logger().debug("Running nbd server..."); + nbd.run(); + auto stop_nbd = seastar::defer([&] { + nbd.stop().get(); + }); + should_stop.wait().get(); + return 0; + }); + }); +} + +class nbd_oldstyle_negotiation_t { + uint64_t magic = seastar::cpu_to_be(0x4e42444d41474943); // "NBDMAGIC" + uint64_t magic2 = seastar::cpu_to_be(0x00420281861253); // "IHAVEOPT" + uint64_t size = 0; + uint32_t flags = seastar::cpu_to_be(0); + char reserved[124] = {0}; + +public: + nbd_oldstyle_negotiation_t(uint64_t size, uint32_t flags) + : size(seastar::cpu_to_be(size)), flags(seastar::cpu_to_be(flags)) {} +} __attribute__((packed)); + +seastar::future<> send_negotiation( + size_t size, + seastar::output_stream<char>& out) +{ + seastar::temporary_buffer<char> buf{sizeof(nbd_oldstyle_negotiation_t)}; + new (buf.get_write()) nbd_oldstyle_negotiation_t(size, 1); + return out.write(std::move(buf) + ).then([&out] { + return out.flush(); + }); +} + +seastar::future<> handle_command( + BlockDriver &backend, + request_context_t::ref request_ref, + RequestWriter &out) +{ + auto &request = *request_ref; + logger().debug("got command {}", request.get_command()); + return ([&] { + switch (request.get_command()) { + case NBD_CMD_WRITE: + return backend.write( + request.from, + *request.in_buffer); + case NBD_CMD_READ: + return backend.read( + request.from, + request.len).then([&] (auto buffer) { + logger().debug("read returned buffer len {}", buffer.length()); + request.out_buffer = buffer; + }); + case NBD_CMD_DISC: + throw std::system_error(std::make_error_code(std::errc::bad_message)); + case NBD_CMD_TRIM: + throw std::system_error(std::make_error_code(std::errc::bad_message)); + default: + throw std::system_error(std::make_error_code(std::errc::bad_message)); + } + })().then([&, request_ref=std::move(request_ref)]() mutable { + logger().debug("handle_command complete"); + return out.complete(std::move(request_ref)); + }); +} + + +seastar::future<> handle_commands( + BlockDriver &backend, + seastar::input_stream<char>& in, + RequestWriter &out) +{ + logger().debug("handle_commands"); + return seastar::keep_doing([&] { + logger().debug("waiting for command"); + auto request_ref = request_context_t::make_ref(); + auto &request = *request_ref; + return request.read_request(in).then( + [&, request_ref=std::move(request_ref)]() mutable { + // keep running in background + (void)seastar::try_with_gate(out.gate, + [&backend, &out, request_ref=std::move(request_ref)]() mutable { + return handle_command(backend, std::move(request_ref), out); + }); + logger().debug("handle_commands after fork"); + }); + }).handle_exception_type([](const seastar::gate_closed_exception&) {}); +} + +void NBDHandler::run() +{ + logger().debug("About to listen on {}", uds_path); + server_socket = seastar::engine().listen( + seastar::socket_address{ + seastar::unix_domain_addr{uds_path}}); + + // keep running in background + (void)seastar::keep_doing([this] { + return seastar::try_with_gate(gate, [this] { + return server_socket->accept().then([this](auto acc) { + 
logger().debug("Accepted"); + connected_socket = std::move(acc.connection); + return seastar::do_with( + connected_socket->input(), + RequestWriter{connected_socket->output()}, + [&, this](auto &input, auto &output) { + return send_negotiation( + backend.get_size(), + output.stream + ).then([&, this] { + return handle_commands(backend, input, output); + }).finally([&] { + std::cout << "closing input and output" << std::endl; + return seastar::when_all(input.close(), + output.close()); + }).discard_result().handle_exception([](auto e) { + logger().error("NBDHandler::run saw exception {}", e); + }); + }); + }).handle_exception_type([] (const std::system_error &e) { + // an ECONNABORTED is expected when we are being stopped. + if (e.code() != std::errc::connection_aborted) { + logger().error("accept failed: {}", e); + } + }); + }); + }).handle_exception_type([](const seastar::gate_closed_exception&) {}); +} + +seastar::future<> NBDHandler::stop() +{ + if (server_socket) { + server_socket->abort_accept(); + } + if (connected_socket) { + connected_socket->shutdown_input(); + connected_socket->shutdown_output(); + } + return gate.close().then([this] { + if (!server_socket.has_value()) { + return seastar::now(); + } + return seastar::remove_file(uds_path); + }); +} diff --git a/src/crimson/tools/store_nbd/tm_driver.cc b/src/crimson/tools/store_nbd/tm_driver.cc new file mode 100644 index 000000000..bd216fd58 --- /dev/null +++ b/src/crimson/tools/store_nbd/tm_driver.cc @@ -0,0 +1,222 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "tm_driver.h" + +using namespace crimson; +using namespace crimson::os; +using namespace crimson::os::seastore; + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_test); + } +} + +seastar::future<> TMDriver::write( + off_t offset, + bufferptr ptr) +{ + logger().debug("Writing offset {}", offset); + assert(offset % device->get_block_size() == 0); + assert((ptr.length() % device->get_block_size()) == 0); + return seastar::do_with(ptr, [this, offset](auto& ptr) { + return repeat_eagain([this, offset, &ptr] { + return tm->with_transaction_intr( + Transaction::src_t::MUTATE, + "write", + [this, offset, &ptr](auto& t) + { + return tm->dec_ref(t, offset + ).si_then([](auto){}).handle_error_interruptible( + crimson::ct_error::enoent::handle([](auto) { return seastar::now(); }), + crimson::ct_error::pass_further_all{} + ).si_then([this, offset, &t, &ptr] { + logger().debug("dec_ref complete"); + return tm->alloc_extent<TestBlock>(t, offset, ptr.length()); + }).si_then([this, offset, &t, &ptr](auto ext) { + boost::ignore_unused(offset); // avoid clang warning; + assert(ext->get_laddr() == (size_t)offset); + assert(ext->get_bptr().length() == ptr.length()); + ext->get_bptr().swap(ptr); + logger().debug("submitting transaction"); + return tm->submit_transaction(t); + }); + }); + }); + }).handle_error( + crimson::ct_error::assert_all{"store-nbd write"} + ); +} + +TMDriver::read_extents_ret TMDriver::read_extents( + Transaction &t, + laddr_t offset, + extent_len_t length) +{ + return seastar::do_with( + lba_pin_list_t(), + lextent_list_t<TestBlock>(), + [this, &t, offset, length](auto &pins, auto &ret) { + return tm->get_pins( + t, offset, length + ).si_then([this, &t, &pins, &ret](auto _pins) { + _pins.swap(pins); + logger().debug("read_extents: mappings {}", pins); + return trans_intr::do_for_each( + pins.begin(), + pins.end(), + [this, &t, &ret](auto &&pin) { + logger().debug( + 
"read_extents: get_extent {}~{}", + pin->get_val(), + pin->get_length()); + return tm->read_pin<TestBlock>( + t, + std::move(pin) + ).si_then([&ret](auto ref) mutable { + ret.push_back(std::make_pair(ref->get_laddr(), ref)); + logger().debug( + "read_extents: got extent {}", + *ref); + return seastar::now(); + }); + }).si_then([&ret] { + return std::move(ret); + }); + }); + }); +} + +seastar::future<bufferlist> TMDriver::read( + off_t offset, + size_t size) +{ + logger().debug("Reading offset {}", offset); + assert(offset % device->get_block_size() == 0); + assert(size % device->get_block_size() == 0); + auto blptrret = std::make_unique<bufferlist>(); + auto &blret = *blptrret; + return repeat_eagain([=, &blret, this] { + return tm->with_transaction_intr( + Transaction::src_t::READ, + "read", + [=, &blret, this](auto& t) + { + return read_extents(t, offset, size + ).si_then([=, &blret](auto ext_list) { + size_t cur = offset; + for (auto &i: ext_list) { + if (cur != i.first) { + assert(cur < i.first); + blret.append_zero(i.first - cur); + cur = i.first; + } + blret.append(i.second->get_bptr()); + cur += i.second->get_bptr().length(); + } + if (blret.length() != size) { + assert(blret.length() < size); + blret.append_zero(size - blret.length()); + } + }); + }); + }).handle_error( + crimson::ct_error::assert_all{"store-nbd read"} + ).then([blptrret=std::move(blptrret)]() mutable { + logger().debug("read complete"); + return std::move(*blptrret); + }); +} + +void TMDriver::init() +{ + std::vector<Device*> sec_devices; +#ifndef NDEBUG + tm = make_transaction_manager(device.get(), sec_devices, true); +#else + tm = make_transaction_manager(device.get(), sec_devices, false); +#endif +} + +void TMDriver::clear() +{ + tm.reset(); +} + +size_t TMDriver::get_size() const +{ + return device->get_available_size() * .5; +} + +seastar::future<> TMDriver::mkfs() +{ + assert(config.path); + logger().debug("mkfs"); + return Device::make_device(*config.path, device_type_t::SSD + ).then([this](DeviceRef dev) { + device = std::move(dev); + seastore_meta_t meta; + meta.seastore_id.generate_random(); + return device->mkfs( + device_config_t{ + true, + (magic_t)std::rand(), + device_type_t::SSD, + 0, + meta, + secondary_device_set_t()}); + }).safe_then([this] { + logger().debug("device mkfs done"); + return device->mount(); + }).safe_then([this] { + init(); + logger().debug("tm mkfs"); + return tm->mkfs(); + }).safe_then([this] { + logger().debug("tm close"); + return tm->close(); + }).safe_then([this] { + logger().debug("sm close"); + return device->close(); + }).safe_then([this] { + clear(); + device.reset(); + logger().debug("mkfs complete"); + return TransactionManager::mkfs_ertr::now(); + }).handle_error( + crimson::ct_error::assert_all{ + "Invalid errror during TMDriver::mkfs" + } + ); +} + +seastar::future<> TMDriver::mount() +{ + return (config.mkfs ? 
mkfs() : seastar::now() + ).then([this] { + return Device::make_device(*config.path, device_type_t::SSD); + }).then([this](DeviceRef dev) { + device = std::move(dev); + return device->mount(); + }).safe_then([this] { + init(); + return tm->mount(); + }).handle_error( + crimson::ct_error::assert_all{ + "Invalid errror during TMDriver::mount" + } + ); +}; + +seastar::future<> TMDriver::close() +{ + return tm->close().safe_then([this] { + clear(); + return device->close(); + }).handle_error( + crimson::ct_error::assert_all{ + "Invalid errror during TMDriver::close" + } + ); +} diff --git a/src/crimson/tools/store_nbd/tm_driver.h b/src/crimson/tools/store_nbd/tm_driver.h new file mode 100644 index 000000000..24aabdeb6 --- /dev/null +++ b/src/crimson/tools/store_nbd/tm_driver.h @@ -0,0 +1,56 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "block_driver.h" + +#include "crimson/os/seastore/cache.h" +#include "crimson/os/seastore/device.h" +#include "crimson/os/seastore/transaction_manager.h" +#include "test/crimson/seastore/test_block.h" + +class TMDriver final : public BlockDriver { +public: + TMDriver(config_t config) : config(config) {} + ~TMDriver() final {} + + bufferptr get_buffer(size_t size) final { + return ceph::buffer::create_page_aligned(size); + } + + seastar::future<> write( + off_t offset, + bufferptr ptr) final; + + seastar::future<bufferlist> read( + off_t offset, + size_t size) final; + + size_t get_size() const final; + + seastar::future<> mount() final; + + seastar::future<> close() final; + +private: + const config_t config; + + using DeviceRef = crimson::os::seastore::DeviceRef; + DeviceRef device; + + using TransactionManager = crimson::os::seastore::TransactionManager; + using TransactionManagerRef = crimson::os::seastore::TransactionManagerRef; + TransactionManagerRef tm; + + seastar::future<> mkfs(); + void init(); + void clear(); + + using read_extents_iertr = TransactionManager::read_extent_iertr; + using read_extents_ret = read_extents_iertr::future< + crimson::os::seastore::lextent_list_t<crimson::os::seastore::TestBlock> + >; + read_extents_ret read_extents( + crimson::os::seastore::Transaction &t, + crimson::os::seastore::laddr_t offset, + crimson::os::seastore::extent_len_t length); +}; |
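
For orientation, FSDriver and TMDriver above are the two concrete backends handed out by get_backend(), and both sit behind the small BlockDriver interface declared in block_driver.h. The sketch below is a minimal, purely illustrative third backend showing how little that interface demands; the MemoryDriver name, its capacity parameter, and the zero-fill read behaviour are invented for this example and are not part of the sources in this commit. It assumes reads reuse the same offsets that were written, which is good enough to exercise the nbd path but not a faithful block device.

#include <map>

#include "block_driver.h"

// Hypothetical illustration only: an in-memory BlockDriver that keeps every
// write in a std::map keyed by offset and serves reads from that map,
// zero-filling anything never written.
class MemoryDriver final : public BlockDriver {
public:
  explicit MemoryDriver(size_t capacity) : capacity(capacity) {}
  ~MemoryDriver() final {}

  ceph::bufferptr get_buffer(size_t size) final {
    // same allocation policy the real drivers use
    return ceph::buffer::create_page_aligned(size);
  }

  seastar::future<> write(off_t offset, ceph::bufferptr ptr) final {
    ceph::bufferlist bl;
    bl.append(ptr);
    blocks[offset] = std::move(bl);
    return seastar::now();
  }

  seastar::future<ceph::bufferlist> read(off_t offset, size_t size) final {
    ceph::bufferlist bl;
    if (auto it = blocks.find(offset); it != blocks.end()) {
      bl = it->second;  // assumes read offsets/sizes match earlier writes
    }
    if (bl.length() < size) {
      bl.append_zero(size - bl.length());  // unwritten ranges read as zeros
    }
    return seastar::make_ready_future<ceph::bufferlist>(std::move(bl));
  }

  size_t get_size() const final { return capacity; }

  seastar::future<> mount() final { return seastar::now(); }
  seastar::future<> close() final { return seastar::now(); }

private:
  const size_t capacity;
  std::map<off_t, ceph::bufferlist> blocks;
};

A caller would drive such a backend the same way the store-nbd main() drives the real ones: mount(), then write()/read() against the offsets carried by the nbd requests, then close().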