Diffstat (limited to 'src/crimson')
232 files changed, 55919 insertions, 0 deletions
diff --git a/src/crimson/CMakeLists.txt b/src/crimson/CMakeLists.txt
new file mode 100644
index 000000000..26f729336
--- /dev/null
+++ b/src/crimson/CMakeLists.txt
@@ -0,0 +1,191 @@
+add_library(crimson::cflags INTERFACE IMPORTED)
+set(crimson_cflag_definitions "WITH_SEASTAR=1")
+# disable concepts to address https://github.com/boostorg/asio/issues/312
+if((CMAKE_CXX_COMPILER_ID STREQUAL GNU AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 10) OR
+   (CMAKE_CXX_COMPILER_ID STREQUAL Clang))
+  list(APPEND crimson_cflag_definitions
+    "BOOST_ASIO_DISABLE_CONCEPTS")
+endif()
+set_target_properties(crimson::cflags PROPERTIES
+  INTERFACE_COMPILE_DEFINITIONS "${crimson_cflag_definitions}"
+  INTERFACE_COMPILE_OPTIONS $<$<COMPILE_LANGUAGE:CXX>:-Wno-non-virtual-dtor>
+  INTERFACE_LINK_LIBRARIES Seastar::seastar)
+
+set(crimson_common_srcs
+  common/assert.cc
+  common/buffer_io.cc
+  common/config_proxy.cc
+  common/formatter.cc
+  common/perf_counters_collection.cc
+  common/log.cc
+  common/throttle.cc
+  common/tri_mutex.cc)
+
+# the specialized version of ceph-common, where
+# - the logging is sent to Seastar backend
+# - and the template parameter of lock_policy is SINGLE
+add_library(crimson-common STATIC
+  ${PROJECT_SOURCE_DIR}/src/common/admin_socket_client.cc
+  ${PROJECT_SOURCE_DIR}/src/common/bit_str.cc
+  ${PROJECT_SOURCE_DIR}/src/common/bloom_filter.cc
+  ${PROJECT_SOURCE_DIR}/src/common/buffer.cc
+  ${PROJECT_SOURCE_DIR}/src/common/buffer_seastar.cc
+  ${PROJECT_SOURCE_DIR}/src/common/ceph_argparse.cc
+  ${PROJECT_SOURCE_DIR}/src/common/ceph_context.cc
+  ${PROJECT_SOURCE_DIR}/src/common/ceph_crypto.cc
+  ${PROJECT_SOURCE_DIR}/src/common/ceph_hash.cc
+  ${PROJECT_SOURCE_DIR}/src/common/ceph_time.cc
+  ${PROJECT_SOURCE_DIR}/src/common/ceph_strings.cc
+  ${PROJECT_SOURCE_DIR}/src/common/ceph_releases.cc
+  ${PROJECT_SOURCE_DIR}/src/common/cmdparse.cc
+  ${PROJECT_SOURCE_DIR}/src/common/common_init.cc
+  ${PROJECT_SOURCE_DIR}/src/common/compat.cc
+  ${PROJECT_SOURCE_DIR}/src/common/code_environment.cc
+  ${PROJECT_SOURCE_DIR}/src/common/config.cc
+  ${PROJECT_SOURCE_DIR}/src/common/config_values.cc
+  ${PROJECT_SOURCE_DIR}/src/common/dout.cc
+  ${PROJECT_SOURCE_DIR}/src/common/entity_name.cc
+  ${PROJECT_SOURCE_DIR}/src/common/environment.cc
+  ${PROJECT_SOURCE_DIR}/src/common/errno.cc
+  ${PROJECT_SOURCE_DIR}/src/common/escape.cc
+  ${PROJECT_SOURCE_DIR}/src/common/hex.cc
+  ${PROJECT_SOURCE_DIR}/src/common/fs_types.cc
+  ${PROJECT_SOURCE_DIR}/src/common/ceph_json.cc
+  ${PROJECT_SOURCE_DIR}/src/common/histogram.cc
+  ${PROJECT_SOURCE_DIR}/src/common/hobject.cc
+  ${PROJECT_SOURCE_DIR}/src/common/hostname.cc
+  ${PROJECT_SOURCE_DIR}/src/common/ipaddr.cc
+  ${PROJECT_SOURCE_DIR}/src/common/mempool.cc
+  ${PROJECT_SOURCE_DIR}/src/common/options.cc
+  ${PROJECT_SOURCE_DIR}/src/common/perf_counters.cc
+  ${PROJECT_SOURCE_DIR}/src/common/perf_histogram.cc
+  ${PROJECT_SOURCE_DIR}/src/common/page.cc
+  ${PROJECT_SOURCE_DIR}/src/common/pick_address.cc
+  ${PROJECT_SOURCE_DIR}/src/common/snap_types.cc
+  ${PROJECT_SOURCE_DIR}/src/common/signal.cc
+  ${PROJECT_SOURCE_DIR}/src/common/str_list.cc
+  ${PROJECT_SOURCE_DIR}/src/common/str_map.cc
+  ${PROJECT_SOURCE_DIR}/src/common/strtol.cc
+  ${PROJECT_SOURCE_DIR}/src/common/reverse.c
+  ${PROJECT_SOURCE_DIR}/src/common/types.cc
+  ${PROJECT_SOURCE_DIR}/src/common/utf8.c
+  ${PROJECT_SOURCE_DIR}/src/common/version.cc
+  ${PROJECT_SOURCE_DIR}/src/common/BackTrace.cc
+  ${PROJECT_SOURCE_DIR}/src/common/ConfUtils.cc
+  ${PROJECT_SOURCE_DIR}/src/common/DecayCounter.cc
+  ${PROJECT_SOURCE_DIR}/src/common/HTMLFormatter.cc
+  ${PROJECT_SOURCE_DIR}/src/common/Formatter.cc
+  ${PROJECT_SOURCE_DIR}/src/common/Graylog.cc
+  ${PROJECT_SOURCE_DIR}/src/common/ostream_temp.cc
+  ${PROJECT_SOURCE_DIR}/src/common/LogEntry.cc
+  ${PROJECT_SOURCE_DIR}/src/common/TextTable.cc
+  ${PROJECT_SOURCE_DIR}/src/common/Thread.cc
+  ${PROJECT_SOURCE_DIR}/src/common/PluginRegistry.cc
+  ${PROJECT_SOURCE_DIR}/src/common/RefCountedObj.cc
+  ${PROJECT_SOURCE_DIR}/src/common/util.cc
+  ${PROJECT_SOURCE_DIR}/src/crush/builder.c
+  ${PROJECT_SOURCE_DIR}/src/crush/mapper.c
+  ${PROJECT_SOURCE_DIR}/src/crush/crush.c
+  ${PROJECT_SOURCE_DIR}/src/crush/hash.c
+  ${PROJECT_SOURCE_DIR}/src/crush/CrushWrapper.cc
+  ${PROJECT_SOURCE_DIR}/src/crush/CrushCompiler.cc
+  ${PROJECT_SOURCE_DIR}/src/crush/CrushTester.cc
+  ${PROJECT_SOURCE_DIR}/src/crush/CrushLocation.cc
+  ${PROJECT_SOURCE_DIR}/src/global/global_context.cc
+  ${PROJECT_SOURCE_DIR}/src/global/pidfile.cc
+  ${PROJECT_SOURCE_DIR}/src/librbd/Features.cc
+  ${PROJECT_SOURCE_DIR}/src/log/Log.cc
+  ${PROJECT_SOURCE_DIR}/src/mgr/ServiceMap.cc
+  ${PROJECT_SOURCE_DIR}/src/mds/inode_backtrace.cc
+  ${PROJECT_SOURCE_DIR}/src/mds/mdstypes.cc
+  ${PROJECT_SOURCE_DIR}/src/mds/cephfs_features.cc
+  ${PROJECT_SOURCE_DIR}/src/mds/FSMap.cc
+  ${PROJECT_SOURCE_DIR}/src/mds/FSMapUser.cc
+  ${PROJECT_SOURCE_DIR}/src/mds/MDSMap.cc
+  ${PROJECT_SOURCE_DIR}/src/msg/msg_types.cc
+  ${PROJECT_SOURCE_DIR}/src/msg/Message.cc
+  ${PROJECT_SOURCE_DIR}/src/mon/PGMap.cc
+  ${PROJECT_SOURCE_DIR}/src/mon/MonCap.cc
+  ${PROJECT_SOURCE_DIR}/src/mon/MonMap.cc
+  ${PROJECT_SOURCE_DIR}/src/osd/osd_types.cc
+  ${PROJECT_SOURCE_DIR}/src/osd/ECMsgTypes.cc
+  ${PROJECT_SOURCE_DIR}/src/osd/HitSet.cc
+  ${PROJECT_SOURCE_DIR}/src/osd/OSDMap.cc
+  ${PROJECT_SOURCE_DIR}/src/osd/PGPeeringEvent.cc
+  ${PROJECT_SOURCE_DIR}/src/xxHash/xxhash.c
+  ${crimson_common_srcs}
+  $<TARGET_OBJECTS:common_mountcephfs_objs>)
+
+target_compile_definitions(crimson-common PRIVATE
+  "CEPH_LIBDIR=\"${CMAKE_INSTALL_FULL_LIBDIR}\""
+  "CEPH_PKGLIBDIR=\"${CEPH_INSTALL_FULL_PKGLIBDIR}\""
+  "CEPH_DATADIR=\"${CEPH_INSTALL_DATADIR}\"")
+
+set(crimson_common_deps
+  Boost::iostreams
+  Boost::random
+  json_spirit)
+
+if(WITH_JAEGER)
+  include_directories(SYSTEM ${CMAKE_BINARY_DIR}/external/include)
+  list(APPEND crimson_common_deps jaeger-base)
+endif()
+
+if(NOT WITH_SYSTEM_BOOST)
+  list(APPEND crimson_common_deps ${ZLIB_LIBRARIES})
+endif()
+
+target_link_libraries(crimson-common
+  PUBLIC
+    crimson::cflags
+  PRIVATE
+    crc32
+    ${crimson_common_deps}
+    OpenSSL::Crypto)
+
+set(crimson_auth_srcs
+  auth/KeyRing.cc
+  ${PROJECT_SOURCE_DIR}/src/auth/AuthClientHandler.cc
+  ${PROJECT_SOURCE_DIR}/src/auth/AuthMethodList.cc
+  ${PROJECT_SOURCE_DIR}/src/auth/AuthRegistry.cc
+  ${PROJECT_SOURCE_DIR}/src/auth/AuthSessionHandler.cc
+  ${PROJECT_SOURCE_DIR}/src/auth/Crypto.cc
+  ${PROJECT_SOURCE_DIR}/src/auth/KeyRing.cc
+  ${PROJECT_SOURCE_DIR}/src/auth/RotatingKeyRing.cc
+  ${PROJECT_SOURCE_DIR}/src/auth/cephx/CephxAuthorizeHandler.cc
+  ${PROJECT_SOURCE_DIR}/src/auth/cephx/CephxClientHandler.cc
+  ${PROJECT_SOURCE_DIR}/src/auth/cephx/CephxProtocol.cc
+  ${PROJECT_SOURCE_DIR}/src/auth/cephx/CephxSessionHandler.cc
+  ${PROJECT_SOURCE_DIR}/src/auth/none/AuthNoneAuthorizeHandler.cc)
+set(crimson_mgr_srcs
+  mgr/client.cc)
+set(crimson_mon_srcs
+  mon/MonClient.cc
+  ${PROJECT_SOURCE_DIR}/src/mon/MonSub.cc)
+set(crimson_net_srcs
+  ${PROJECT_SOURCE_DIR}/src/msg/async/crypto_onwire.cc
+  ${PROJECT_SOURCE_DIR}/src/msg/async/frames_v2.cc
+  net/Errors.cc
+  net/Messenger.cc
+  net/SocketConnection.cc
+  net/SocketMessenger.cc
+  net/Socket.cc
+  net/Protocol.cc
+  net/ProtocolV1.cc
+  net/ProtocolV2.cc
+  net/chained_dispatchers.cc)
+add_library(crimson STATIC
+  ${crimson_auth_srcs}
+  ${crimson_mgr_srcs}
+  ${crimson_mon_srcs}
+  ${crimson_net_srcs})
+target_compile_options(crimson PUBLIC
+  "-ftemplate-backtrace-limit=0")
+target_link_libraries(crimson
+  PUBLIC
+    crimson-common
+    crimson::cflags)
+add_subdirectory(admin)
+add_subdirectory(os)
+add_subdirectory(osd)
+add_subdirectory(tools)
diff --git a/src/crimson/admin/CMakeLists.txt b/src/crimson/admin/CMakeLists.txt
new file mode 100644
index 000000000..aa0771735
--- /dev/null
+++ b/src/crimson/admin/CMakeLists.txt
@@ -0,0 +1,8 @@
+add_library(crimson-admin STATIC
+  admin_socket.cc
+  osd_admin.cc
+  pg_commands.cc)
+
+target_link_libraries(crimson-admin
+  crimson::cflags
+  Boost::MPL)
diff --git a/src/crimson/admin/admin_socket.cc b/src/crimson/admin/admin_socket.cc
new file mode 100644
index 000000000..852185af1
--- /dev/null
+++ b/src/crimson/admin/admin_socket.cc
@@ -0,0 +1,521 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "crimson/admin/admin_socket.h"
+
+#include <boost/algorithm/string/join.hpp>
+#include <fmt/format.h>
+#include <seastar/net/api.hh>
+#include <seastar/net/inet_address.hh>
+#include <seastar/core/future-util.hh>
+#include <seastar/core/reactor.hh>
+#include <seastar/core/sleep.hh>
+#include <seastar/core/thread.hh>
+#include <seastar/util/std-compat.hh>
+
+#include "common/version.h"
+#include "messages/MCommand.h"
+#include "messages/MCommandReply.h"
+#include "crimson/common/log.h"
+#include "crimson/net/Socket.h"
+
+using namespace crimson::common;
+
+namespace {
+seastar::logger& logger()
+{
+  return crimson::get_logger(ceph_subsys_osd);
+}
+} // namespace
+
+namespace crimson::admin {
+
+tell_result_t::tell_result_t(int ret, std::string&& err)
+  : ret{ret}, err(std::move(err))
+{}
+
+tell_result_t::tell_result_t(int ret, std::string&& err, ceph::bufferlist&& out)
+  : ret{ret}, err(std::move(err)), out(std::move(out))
+{}
+
+tell_result_t::tell_result_t(std::unique_ptr<Formatter> formatter)
+{
+  formatter->flush(out);
+}
+
+seastar::future<>
+AdminSocket::register_command(std::unique_ptr<AdminSocketHook>&& hook)
+{
+  return seastar::with_lock(servers_tbl_rwlock,
+    [this, hook = std::move(hook)]() mutable {
+    auto prefix = hook->prefix;
+    auto [it, added] = hooks.emplace(prefix, std::move(hook));
+    // was this server tag already registered?
+ assert(added); + if (added) { + logger().info("register_command(): {})", it->first); + } + return seastar::now(); + }); +} + +/* + * Note: parse_cmd() is executed with servers_tbl_rwlock held as shared + */ +auto AdminSocket::parse_cmd(const std::vector<std::string>& cmd) + -> std::variant<parsed_command_t, tell_result_t> +{ + // preliminaries: + // - create the formatter specified by the cmd parameters + // - locate the "op-code" string (the 'prefix' segment) + // - prepare for command parameters extraction via cmdmap_t + cmdmap_t cmdmap; + ceph::bufferlist out; + + try { + stringstream errss; + // note that cmdmap_from_json() may throw on syntax issues + if (!cmdmap_from_json(cmd, &cmdmap, errss)) { + logger().error("{}: incoming command error: {}", __func__, errss.str()); + out.append("error:"s); + out.append(errss.str()); + return tell_result_t{-EINVAL, "invalid json", std::move(out)}; + } + } catch (const std::runtime_error& e) { + logger().error("{}: incoming command syntax: {}", __func__, cmd); + out.append(string{e.what()}); + return tell_result_t{-EINVAL, "invalid json", std::move(out)}; + } + + string format; + string prefix; + try { + cmd_getval(cmdmap, "format", format); + cmd_getval(cmdmap, "prefix", prefix); + } catch (const bad_cmd_get& e) { + logger().error("{}: invalid syntax: {}", __func__, cmd); + out.append(string{e.what()}); + return tell_result_t{-EINVAL, "invalid json", std::move(out)}; + } + + // match the incoming op-code to one of the registered APIs + if (auto found = hooks.find(prefix); found != hooks.end()) { + return parsed_command_t{ cmdmap, format, *found->second }; + } else { + return tell_result_t{-EINVAL, + fmt::format("unknown command '{}'", prefix), + std::move(out)}; + } +} + +seastar::future<> AdminSocket::finalize_response( + seastar::output_stream<char>& out, ceph::bufferlist&& msgs) +{ + string outbuf_cont = msgs.to_str(); + if (outbuf_cont.empty()) { + outbuf_cont = " {} "; + } + uint32_t response_length = htonl(outbuf_cont.length()); + logger().info("asok response length: {}", outbuf_cont.length()); + + return out.write((char*)&response_length, sizeof(uint32_t)) + .then([&out, outbuf_cont] { return out.write(outbuf_cont.c_str()); }); +} + + +seastar::future<> AdminSocket::handle_command(crimson::net::ConnectionRef conn, + boost::intrusive_ptr<MCommand> m) +{ + return execute_command(m->cmd, std::move(m->get_data())).then( + [conn, tid=m->get_tid()](auto result) { + auto [ret, err, out] = std::move(result); + auto reply = make_message<MCommandReply>(ret, err); + reply->set_tid(tid); + reply->set_data(out); + return conn->send(reply); + }); +} + +seastar::future<> AdminSocket::execute_line(std::string cmdline, + seastar::output_stream<char>& out) +{ + return execute_command({cmdline}, {}).then([&out, this](auto result) { + auto [ret, stderr, stdout] = std::move(result); + if (ret < 0) { + stdout.append(fmt::format("ERROR: {}\n", cpp_strerror(ret))); + stdout.append(stderr); + } + return finalize_response(out, std::move(stdout)); + }); +} + +auto AdminSocket::execute_command(const std::vector<std::string>& cmd, + ceph::bufferlist&& buf) + -> seastar::future<tell_result_t> +{ + return seastar::with_shared(servers_tbl_rwlock, + [cmd, buf=std::move(buf), this]() mutable { + auto maybe_parsed = parse_cmd(cmd); + if (auto parsed = std::get_if<parsed_command_t>(&maybe_parsed); parsed) { + stringstream os; + string desc{parsed->hook.desc}; + if (!validate_cmd(nullptr, desc, parsed->params, os)) { + logger().error("AdminSocket::execute_command: " + 
"failed to validate '{}': {}", cmd, os.str()); + ceph::bufferlist out; + out.append(os); + return seastar::make_ready_future<tell_result_t>( + tell_result_t{-EINVAL, "invalid command json", std::move(out)}); + } + return parsed->hook.call(parsed->params, parsed->format, std::move(buf)); + } else { + auto& result = std::get<tell_result_t>(maybe_parsed); + return seastar::make_ready_future<tell_result_t>(std::move(result)); + } + }); +} + +// an input_stream consumer that reads buffer into a std::string up to the first +// '\0' which indicates the end of command +struct line_consumer { + using tmp_buf = seastar::temporary_buffer<char>; + using consumption_result_type = + typename seastar::input_stream<char>::consumption_result_type; + + seastar::future<consumption_result_type> operator()(tmp_buf&& buf) { + size_t consumed = 0; + for (auto c : buf) { + consumed++; + if (c == '\0') { + buf.trim_front(consumed); + return seastar::make_ready_future<consumption_result_type>( + consumption_result_type::stop_consuming_type(std::move(buf))); + } else { + line.push_back(c); + } + } + return seastar::make_ready_future<consumption_result_type>( + seastar::continue_consuming{}); + } + std::string line; +}; + +seastar::future<> AdminSocket::handle_client(seastar::input_stream<char>& in, + seastar::output_stream<char>& out) +{ + auto consumer = seastar::make_shared<line_consumer>(); + return in.consume(*consumer).then([consumer, &out, this] { + logger().debug("AdminSocket::handle_client: incoming asok string: {}", + consumer->line); + return execute_line(consumer->line, out); + }).then([&out] { + return out.flush(); + }).finally([&out] { + return out.close(); + }).then([&in] { + return in.close(); + }).handle_exception([](auto ep) { + logger().debug("exception on {}: {}", __func__, ep); + }); +} + +seastar::future<> AdminSocket::start(const std::string& path) +{ + if (path.empty()) { + logger().error( + "{}: Admin Socket socket path missing from the configuration", __func__); + return seastar::now(); + } + + logger().debug("{}: asok socket path={}", __func__, path); + auto sock_path = seastar::socket_address{ seastar::unix_domain_addr{ path } }; + try { + server_sock = seastar::engine().listen(sock_path); + } catch (const std::system_error& e) { + logger().error("{}: unable to listen({}): {}", __func__, path, e.what()); + server_sock.reset(); + return seastar::make_ready_future<>(); + } + // listen in background + task = seastar::do_until( + [this] { return stop_gate.is_closed(); }, + [this] { + return seastar::with_gate(stop_gate, [this] { + assert(!connected_sock.has_value()); + return server_sock->accept().then([this](seastar::accept_result acc) { + connected_sock = std::move(acc.connection); + return seastar::do_with(connected_sock->input(), + connected_sock->output(), + [this](auto& input, auto& output) mutable { + return handle_client(input, output); + }).finally([this] { + assert(connected_sock.has_value()); + connected_sock.reset(); + }); + }).handle_exception([this](auto ep) { + if (!stop_gate.is_closed()) { + logger().error("AdminSocket: terminated: {}", ep); + } + }); + }); + }).finally([path] { + return seastar::remove_file(path); + }); + return seastar::make_ready_future<>(); +} + +seastar::future<> AdminSocket::stop() +{ + if (!server_sock) { + return seastar::now(); + } + server_sock->abort_accept(); + if (connected_sock) { + connected_sock->shutdown_input(); + connected_sock->shutdown_output(); + } + return stop_gate.close().then([this] { + assert(task.has_value()); + return task->then([] 
{ + logger().info("AdminSocket: stopped"); + return seastar::now(); + }); + }); +} + +///////////////////////////////////////// +// the internal hooks +///////////////////////////////////////// + +class VersionHook final : public AdminSocketHook { + public: + VersionHook() + : AdminSocketHook{"version", "", "get ceph version"} + {} + seastar::future<tell_result_t> call(const cmdmap_t&, + std::string_view format, + ceph::bufferlist&&) const final + { + unique_ptr<Formatter> f{Formatter::create(format, "json-pretty", "json-pretty")}; + f->open_object_section("version"); + f->dump_string("version", ceph_version_to_str()); + f->dump_string("release", ceph_release_to_str()); + f->dump_string("release_type", ceph_release_type()); + f->close_section(); + return seastar::make_ready_future<tell_result_t>(std::move(f)); + } +}; + +/** + Note that the git_version command is expected to return a 'version' JSON + segment. +*/ +class GitVersionHook final : public AdminSocketHook { + public: + GitVersionHook() + : AdminSocketHook{"git_version", "", "get git sha1"} + {} + seastar::future<tell_result_t> call(const cmdmap_t&, + std::string_view format, + ceph::bufferlist&&) const final + { + unique_ptr<Formatter> f{Formatter::create(format, "json-pretty", "json-pretty")}; + f->open_object_section("version"); + f->dump_string("git_version", git_version_to_str()); + f->close_section(); + return seastar::make_ready_future<tell_result_t>(std::move(f)); + } +}; + +class HelpHook final : public AdminSocketHook { + const AdminSocket& m_as; + + public: + explicit HelpHook(const AdminSocket& as) : + AdminSocketHook{"help", "", "list available commands"}, + m_as{as} + {} + + seastar::future<tell_result_t> call(const cmdmap_t&, + std::string_view format, + ceph::bufferlist&&) const final + { + return seastar::with_shared(m_as.servers_tbl_rwlock, + [format, this] { + unique_ptr<Formatter> f{Formatter::create(format, "json-pretty", "json-pretty")}; + f->open_object_section("help"); + for (const auto& [prefix, hook] : m_as) { + if (!hook->help.empty()) { + f->dump_string(prefix.data(), hook->help); + } + } + f->close_section(); + return seastar::make_ready_future<tell_result_t>(std::move(f)); + }); + } +}; + +class GetdescsHook final : public AdminSocketHook { + const AdminSocket& m_as; + + public: + explicit GetdescsHook(const AdminSocket& as) : + AdminSocketHook{"get_command_descriptions", + "", + "list available commands"}, + m_as{ as } {} + + seastar::future<tell_result_t> call(const cmdmap_t& cmdmap, + std::string_view format, + ceph::bufferlist&&) const final + { + return seastar::with_shared(m_as.servers_tbl_rwlock, [format, this] { + unique_ptr<Formatter> f{Formatter::create(format, "json-pretty", "json-pretty")}; + int cmdnum = 0; + f->open_object_section("command_descriptions"); + for (const auto& [prefix, hook] : m_as) { + auto secname = fmt::format("cmd {:>03}", cmdnum); + auto cmd = fmt::format("{} {}", hook->prefix, hook->desc); + dump_cmd_and_help_to_json(f.get(), CEPH_FEATURES_ALL, secname, + cmd, std::string{hook->help}); + cmdnum++; + } + f->close_section(); + return seastar::make_ready_future<tell_result_t>(std::move(f)); + }); + } +}; + +class InjectArgsHook final : public AdminSocketHook { +public: + InjectArgsHook() + : AdminSocketHook{"injectargs", + "name=injected_args,type=CephString,n=N", + "inject configuration arguments into running daemon"} + {} + seastar::future<tell_result_t> call(const cmdmap_t& cmdmap, + std::string_view format, + ceph::bufferlist&&) const final + { + 
std::vector<std::string> argv; + if (!cmd_getval(cmdmap, "injected_args", argv)) { + return seastar::make_ready_future<tell_result_t>(); + } + const std::string args = boost::algorithm::join(argv, " "); + return local_conf().inject_args(args).then([] { + return seastar::make_ready_future<tell_result_t>(); + }).handle_exception_type([] (const std::invalid_argument& e) { + return seastar::make_ready_future<tell_result_t>( + tell_result_t{-EINVAL, e.what()}); + }); + } +}; + +/** + * listing the configuration values + */ +class ConfigShowHook : public AdminSocketHook { +public: + ConfigShowHook() : + AdminSocketHook{"config show", + "", + "dump current config settings"} + {} + seastar::future<tell_result_t> call(const cmdmap_t&, + std::string_view format, + ceph::bufferlist&& input) const final + { + unique_ptr<Formatter> f{Formatter::create(format, "json-pretty", "json-pretty")}; + f->open_object_section("config_show"); + local_conf().show_config(f.get()); + f->close_section(); + return seastar::make_ready_future<tell_result_t>(std::move(f)); + } +}; + +/** + * fetching the value of a specific configuration item + */ +class ConfigGetHook : public AdminSocketHook { +public: + ConfigGetHook() : + AdminSocketHook("config get", + "name=var,type=CephString", + "config get <field>: get the config value") + {} + seastar::future<tell_result_t> call(const cmdmap_t& cmdmap, + std::string_view format, + ceph::bufferlist&& input) const final + { + std::string var; + [[maybe_unused]] bool found = cmd_getval(cmdmap, "var", var); + assert(found); + std::string conf_val; + if (int r = local_conf().get_val(var, &conf_val); r < 0) { + return seastar::make_ready_future<tell_result_t>( + tell_result_t{r, fmt::format("error getting {}: {}", + var, cpp_strerror(r))}); + } + unique_ptr<Formatter> f{Formatter::create(format, + "json-pretty", + "json-pretty")}; + f->open_object_section("config_get"); + f->dump_string(var, conf_val); + f->close_section(); + return seastar::make_ready_future<tell_result_t>(std::move(f)); + } +}; + +/** + * setting the value of a specific configuration item (an example: + * {"prefix": "config set", "var":"debug_osd", "val": ["30/20"]} ) + */ +class ConfigSetHook : public AdminSocketHook { +public: + ConfigSetHook() + : AdminSocketHook("config set", + "name=var,type=CephString " + "name=val,type=CephString,n=N", + "config set <field> <val> [<val> ...]: set a config variable") + {} + seastar::future<tell_result_t> call(const cmdmap_t& cmdmap, + std::string_view format, + ceph::bufferlist&&) const final + { + std::string var; + std::vector<std::string> new_val; + cmd_getval(cmdmap, "var", var); + cmd_getval(cmdmap, "val", new_val); + // val may be multiple words + const std::string joined_values = boost::algorithm::join(new_val, " "); + return local_conf().set_val(var, joined_values).then([format] { + unique_ptr<Formatter> f{Formatter::create(format, "json-pretty", "json-pretty")}; + f->open_object_section("config_set"); + f->dump_string("success", ""); + f->close_section(); + return seastar::make_ready_future<tell_result_t>(std::move(f)); + }).handle_exception_type([](std::invalid_argument& e) { + return seastar::make_ready_future<tell_result_t>( + tell_result_t{-EINVAL, e.what()}); + }); + } +}; + +/// the hooks that are served directly by the admin_socket server +seastar::future<> AdminSocket::register_admin_commands() +{ + return seastar::when_all_succeed( + register_command(std::make_unique<VersionHook>()), + register_command(std::make_unique<GitVersionHook>()), + 
register_command(std::make_unique<HelpHook>(*this)), + register_command(std::make_unique<GetdescsHook>(*this)), + register_command(std::make_unique<ConfigGetHook>()), + register_command(std::make_unique<ConfigSetHook>()), + register_command(std::make_unique<ConfigShowHook>()), + register_command(std::make_unique<InjectArgsHook>()) + ).then_unpack([] { + return seastar::now(); + }); +} + +} // namespace crimson::admin diff --git a/src/crimson/admin/admin_socket.h b/src/crimson/admin/admin_socket.h new file mode 100644 index 000000000..a842b62a2 --- /dev/null +++ b/src/crimson/admin/admin_socket.h @@ -0,0 +1,201 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#pragma once + +/** + A Crimson-wise version of the src/common/admin_socket.h + + Note: assumed to be running on a single core. +*/ +#include <map> +#include <string> +#include <string_view> + +#include <seastar/core/future.hh> +#include <seastar/core/gate.hh> +#include <seastar/core/iostream.hh> +#include <seastar/core/shared_mutex.hh> +#include <seastar/core/shared_ptr.hh> +#include <seastar/net/api.hh> + +#include "common/cmdparse.h" +#include "include/buffer.h" +#include "crimson/net/Fwd.h" + +using namespace std::literals; + +class MCommand; + +namespace crimson::admin { + +class AdminSocket; + +struct tell_result_t { + int ret = 0; + std::string err; + ceph::bufferlist out; + tell_result_t() = default; + tell_result_t(int ret, std::string&& err); + tell_result_t(int ret, std::string&& err, ceph::bufferlist&& out); + /** + * create a \c tell_result_t indicating the successful completion + * of command + * + * \param formatter the content of formatter will be flushed to the + * output buffer + */ + tell_result_t(std::unique_ptr<Formatter> formatter); +}; + +/** + * An abstract class to be inherited by implementations of asock hooks + */ +class AdminSocketHook { + public: + AdminSocketHook(std::string_view prefix, + std::string_view desc, + std::string_view help) : + prefix{prefix}, desc{desc}, help{help} + {} + /** + * handle command defined by cmdmap + * + * \param cmdmap dictionary holding the named parameters + * \param format the expected format of the output + * \param input the binary input of the command + * \pre \c cmdmap should be validated with \c desc + * \retval an instance of \c tell_result_t + * \note a negative \c ret should be set to indicate that the hook fails to + * fulfill the command either because of an invalid input or other + * failures. in that case, a brief reason of the failure should + * noted in \c err in the returned value + */ + virtual seastar::future<tell_result_t> call(const cmdmap_t& cmdmap, + std::string_view format, + ceph::bufferlist&& input) const = 0; + virtual ~AdminSocketHook() {} + const std::string_view prefix; + const std::string_view desc; + const std::string_view help; +}; + +class AdminSocket : public seastar::enable_lw_shared_from_this<AdminSocket> { + public: + AdminSocket() = default; + ~AdminSocket() = default; + + AdminSocket(const AdminSocket&) = delete; + AdminSocket& operator=(const AdminSocket&) = delete; + AdminSocket(AdminSocket&&) = delete; + AdminSocket& operator=(AdminSocket&&) = delete; + + using hook_server_tag = const void*; + + /** + * create the async Seastar thread that handles asok commands arriving + * over the socket. + */ + seastar::future<> start(const std::string& path); + + seastar::future<> stop(); + + /** + * register an admin socket hook + * + * Commands (APIs) are registered under a command string. 
Incoming + * commands are split by spaces and matched against the longest + * registered command. For example, if 'foo' and 'foo bar' are + * registered, and an incoming command is 'foo bar baz', it is + * matched with 'foo bar', while 'foo fud' will match 'foo'. + * + * \param hook a hook which includes its identifying command string, the + * expected call syntax, and some help text. + * + * A note regarding the help text: if empty, command will not be + * included in 'help' output. + */ + seastar::future<> register_command(std::unique_ptr<AdminSocketHook>&& hook); + + /** + * Registering the APIs that are served directly by the admin_socket server. + */ + seastar::future<> register_admin_commands(); + /** + * handle a command message by replying an MCommandReply with the same tid + * + * \param conn connection over which the incoming command message is received + * \param m message carrying the command vector and optional input buffer + */ + seastar::future<> handle_command(crimson::net::ConnectionRef conn, + boost::intrusive_ptr<MCommand> m); + +private: + /** + * the result of analyzing an incoming command, and locating it in + * the registered APIs collection. + */ + struct parsed_command_t { + cmdmap_t params; + std::string format; + const AdminSocketHook& hook; + }; + // and the shorthand: + seastar::future<> handle_client(seastar::input_stream<char>& inp, + seastar::output_stream<char>& out); + + seastar::future<> execute_line(std::string cmdline, + seastar::output_stream<char>& out); + + seastar::future<> finalize_response(seastar::output_stream<char>& out, + ceph::bufferlist&& msgs); + + seastar::future<tell_result_t> execute_command(const std::vector<std::string>& cmd, + ceph::bufferlist&& buf); + + std::optional<seastar::future<>> task; + std::optional<seastar::server_socket> server_sock; + std::optional<seastar::connected_socket> connected_sock; + + /** + * stopping incoming ASOK requests at shutdown + */ + seastar::gate stop_gate; + + /** + * parse the incoming command vector, find a registered hook by looking up by + * its prefix, perform sanity checks on the parsed parameters with the hook's + * command description + * + * \param cmd a vector of string which presents a command + * \retval on success, a \c parsed_command_t is returned, tell_result_t with + * detailed error messages is returned otherwise + */ + std::variant<parsed_command_t, tell_result_t> + parse_cmd(const std::vector<std::string>& cmd); + + /** + * The servers table is protected by a rw-lock, to be acquired exclusively + * only when registering or removing a server. + * The lock is locked-shared when executing any hook. 
+ */ + mutable seastar::shared_mutex servers_tbl_rwlock; + using hooks_t = std::map<std::string_view, std::unique_ptr<AdminSocketHook>>; + hooks_t hooks; + + public: + /** + * iterator support + */ + hooks_t::const_iterator begin() const { + return hooks.cbegin(); + } + hooks_t::const_iterator end() const { + return hooks.cend(); + } + + friend class AdminSocketTest; + friend class HelpHook; + friend class GetdescsHook; +}; + +} // namespace crimson::admin diff --git a/src/crimson/admin/osd_admin.cc b/src/crimson/admin/osd_admin.cc new file mode 100644 index 000000000..ce6b6695d --- /dev/null +++ b/src/crimson/admin/osd_admin.cc @@ -0,0 +1,191 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "crimson/admin/osd_admin.h" +#include <string> +#include <string_view> + +#include <fmt/format.h> +#include <seastar/core/do_with.hh> +#include <seastar/core/future.hh> +#include <seastar/core/thread.hh> +#include <seastar/core/scollectd_api.hh> + +#include "common/config.h" +#include "crimson/admin/admin_socket.h" +#include "crimson/common/log.h" +#include "crimson/osd/exceptions.h" +#include "crimson/osd/osd.h" + +using crimson::osd::OSD; +using namespace crimson::common; + +namespace crimson::admin { + +using crimson::common::local_conf; + +template <class Hook, class... Args> +std::unique_ptr<AdminSocketHook> make_asok_hook(Args&&... args) +{ + return std::make_unique<Hook>(std::forward<Args>(args)...); +} + +/** + * An OSD admin hook: OSD status + */ +class OsdStatusHook : public AdminSocketHook { +public: + explicit OsdStatusHook(const crimson::osd::OSD& osd) : + AdminSocketHook{"status", "", "OSD status"}, + osd(osd) + {} + seastar::future<tell_result_t> call(const cmdmap_t&, + std::string_view format, + ceph::bufferlist&& input) const final + { + unique_ptr<Formatter> f{Formatter::create(format, "json-pretty", "json-pretty")}; + f->open_object_section("status"); + osd.dump_status(f.get()); + f->close_section(); + return seastar::make_ready_future<tell_result_t>(std::move(f)); + } +private: + const crimson::osd::OSD& osd; +}; +template std::unique_ptr<AdminSocketHook> +make_asok_hook<OsdStatusHook>(const crimson::osd::OSD& osd); + +/** + * An OSD admin hook: send beacon + */ +class SendBeaconHook : public AdminSocketHook { +public: + explicit SendBeaconHook(crimson::osd::OSD& osd) : + AdminSocketHook{"send_beacon", + "", + "send OSD beacon to mon immediately"}, + osd(osd) + {} + seastar::future<tell_result_t> call(const cmdmap_t&, + std::string_view format, + ceph::bufferlist&& input) const final + { + return osd.send_beacon().then([] { + return seastar::make_ready_future<tell_result_t>(); + }); + } +private: + crimson::osd::OSD& osd; +}; +template std::unique_ptr<AdminSocketHook> +make_asok_hook<SendBeaconHook>(crimson::osd::OSD& osd); + +/** + * send the latest pg stats to mgr + */ +class FlushPgStatsHook : public AdminSocketHook { +public: + explicit FlushPgStatsHook(crimson::osd::OSD& osd) : + AdminSocketHook("flush_pg_stats", + "", + "flush pg stats"), + osd{osd} + {} + seastar::future<tell_result_t> call(const cmdmap_t&, + std::string_view format, + ceph::bufferlist&& input) const final + { + uint64_t seq = osd.send_pg_stats(); + unique_ptr<Formatter> f{Formatter::create(format, "json-pretty", "json-pretty")}; + f->dump_unsigned("stat_seq", seq); + return seastar::make_ready_future<tell_result_t>(std::move(f)); + } + +private: + crimson::osd::OSD& osd; +}; +template std::unique_ptr<AdminSocketHook> 
make_asok_hook<FlushPgStatsHook>(crimson::osd::OSD& osd); + +/// dump the history of PGs' peering state +class DumpPGStateHistory final: public AdminSocketHook { +public: + explicit DumpPGStateHistory(const crimson::osd::OSD &osd) : + AdminSocketHook{"dump_pgstate_history", + "", + "dump history of PGs' peering state"}, + osd{osd} + {} + seastar::future<tell_result_t> call(const cmdmap_t&, + std::string_view format, + ceph::bufferlist&& input) const final + { + std::unique_ptr<Formatter> f{Formatter::create(format, + "json-pretty", + "json-pretty")}; + f->open_object_section("pgstate_history"); + osd.dump_pg_state_history(f.get()); + f->close_section(); + return seastar::make_ready_future<tell_result_t>(std::move(f)); + } +private: + const crimson::osd::OSD& osd; +}; +template std::unique_ptr<AdminSocketHook> make_asok_hook<DumpPGStateHistory>(const crimson::osd::OSD& osd); + +/** + * A CephContext admin hook: calling assert (if allowed by + * 'debug_asok_assert_abort') + */ +class AssertAlwaysHook : public AdminSocketHook { +public: + AssertAlwaysHook() : + AdminSocketHook{"assert", + "", + "asserts"} + {} + seastar::future<tell_result_t> call(const cmdmap_t&, + std::string_view format, + ceph::bufferlist&& input) const final + { + if (local_conf().get_val<bool>("debug_asok_assert_abort")) { + ceph_assert_always(0); + return seastar::make_ready_future<tell_result_t>(); + } else { + return seastar::make_ready_future<tell_result_t>( + tell_result_t{-EPERM, "configuration set to disallow asok assert"}); + } + } +}; +template std::unique_ptr<AdminSocketHook> make_asok_hook<AssertAlwaysHook>(); + +/** +* A Seastar admin hook: fetching the values of configured metrics +*/ +class SeastarMetricsHook : public AdminSocketHook { +public: + SeastarMetricsHook() : + AdminSocketHook("perf dump_seastar", + "", + "dump current configured seastar metrics and their values") + {} + seastar::future<tell_result_t> call(const cmdmap_t& cmdmap, + std::string_view format, + ceph::bufferlist&& input) const final + { + std::unique_ptr<Formatter> f{Formatter::create(format, "json-pretty", "json-pretty")}; + f->open_object_section("perf_dump_seastar"); + for (const auto& mf : seastar::scollectd::get_value_map()) { + for (const auto& m : mf.second) { + if (m.second && m.second->is_enabled()) { + auto& metric_function = m.second->get_function(); + f->dump_float(m.second->get_id().full_name(), metric_function().d()); + } + } + } + f->close_section(); + return seastar::make_ready_future<tell_result_t>(std::move(f)); + } +}; +template std::unique_ptr<AdminSocketHook> make_asok_hook<SeastarMetricsHook>(); + +} // namespace crimson::admin diff --git a/src/crimson/admin/osd_admin.h b/src/crimson/admin/osd_admin.h new file mode 100644 index 000000000..395042ea8 --- /dev/null +++ b/src/crimson/admin/osd_admin.h @@ -0,0 +1,22 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#pragma once + +#include <memory> + +#include "admin_socket.h" + +namespace crimson::admin { + +class AssertAlwaysHook; +class FlushPgStatsHook; +class OsdStatusHook; +class SendBeaconHook; +class DumpPGStateHistory; +class SeastarMetricsHook; + + +template<class Hook, class... Args> +std::unique_ptr<AdminSocketHook> make_asok_hook(Args&&... 
args); + +} // namespace crimson::admin diff --git a/src/crimson/admin/pg_commands.cc b/src/crimson/admin/pg_commands.cc new file mode 100644 index 000000000..dacfd515d --- /dev/null +++ b/src/crimson/admin/pg_commands.cc @@ -0,0 +1,159 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "crimson/admin/pg_commands.h" + +#include <memory> +#include <string> +#include <string_view> + +#include <fmt/format.h> +#include <seastar/core/future.hh> + +#include "crimson/admin/admin_socket.h" +#include "crimson/osd/osd.h" +#include "crimson/osd/pg.h" + + +using crimson::osd::OSD; +using crimson::osd::PG; +using namespace crimson::common; + + +namespace crimson::admin::pg { + +class PGCommand : public AdminSocketHook { +public: + // TODO: const correctness of osd + PGCommand(crimson::osd::OSD& osd, + std::string_view prefix, + std::string_view desc, + std::string_view help) + : AdminSocketHook{prefix, desc, help}, osd {osd} + {} + seastar::future<tell_result_t> call(const cmdmap_t& cmdmap, + std::string_view format, + ceph::bufferlist&& input) const final + { + // we have "ceph tell <pgid> <cmd>". and it is the ceph cli's responsibility + // to add "pgid" to the cmd dict. as rados_pg_command() does not set it for + // us. moreover, and "pgid" is not listed in the command description, as user + // command format does not follow the convention of "<prefix> [<args>,...]" + // so we have to verify it on the server side. + std::string pgid_str; + pg_t pgid; + if (!cmd_getval(cmdmap, "pgid", pgid_str)) { + return seastar::make_ready_future<tell_result_t>( + tell_result_t{-EINVAL, "no pgid specified"}); + } else if (!pgid.parse(pgid_str.c_str())) { + return seastar::make_ready_future<tell_result_t>( + tell_result_t{-EINVAL, fmt::format("couldn't parse pgid '{}'", pgid_str)}); + } + // am i the primary for this pg? 
+ const auto osdmap = osd.get_shard_services().get_osdmap(); + spg_t spg_id; + if (!osdmap->get_primary_shard(pgid, &spg_id)) { + return seastar::make_ready_future<tell_result_t>(tell_result_t{ + -ENOENT, fmt::format("pgid '{}' does not exist", pgid_str)}); + } + Ref<PG> pg = osd.get_pg(spg_id); + if (!pg) { + return seastar::make_ready_future<tell_result_t>(tell_result_t{ + -ENOENT, fmt::format("i don't have pgid '{}'", spg_id)}); + } + if (!pg->is_primary()) { + return seastar::make_ready_future<tell_result_t>(tell_result_t{ + -EAGAIN, fmt::format("not primary for pgid '{}'", spg_id)}); + } + return this->do_command(pg, cmdmap, format, std::move(input)); + } + +private: + virtual seastar::future<tell_result_t> + do_command(Ref<PG> pg, + const cmdmap_t& cmdmap, + std::string_view format, + ceph::bufferlist&& input) const = 0; + + OSD& osd; +}; + +class QueryCommand final : public PGCommand { +public: + // TODO: const correctness of osd + explicit QueryCommand(crimson::osd::OSD& osd) : + PGCommand{osd, + "query", + "", + "show details of a specific pg"} + {} +private: + seastar::future<tell_result_t> + do_command(Ref<PG> pg, + const cmdmap_t&, + std::string_view format, + ceph::bufferlist&& input) const final + { + std::unique_ptr<Formatter> f{Formatter::create(format, + "json-pretty", + "json-pretty")}; + f->open_object_section("pg"); + pg->dump_primary(f.get()); + f->close_section(); + return seastar::make_ready_future<tell_result_t>(std::move(f)); + } +}; + +class MarkUnfoundLostCommand final : public PGCommand { +public: + explicit MarkUnfoundLostCommand(crimson::osd::OSD& osd) : + PGCommand{osd, + "mark_unfound_lost", + "name=pgid,type=CephPgid,req=false" + " name=mulcmd,type=CephChoices,strings=revert|delete", + "mark all unfound objects in this pg as lost, either" + " removing or reverting to a prior version if one is" + " available"} + {} + seastar::future<tell_result_t> + do_command(Ref<PG> pg, + const cmdmap_t& cmdmap, + std::string_view, + ceph::bufferlist&&) const final + { + // what to do with the unfound object specifically. + std::string cmd; + int op = -1; + cmd_getval(cmdmap, "mulcmd", cmd); + if (cmd == "revert") { + op = pg_log_entry_t::LOST_REVERT; + } else if (cmd == "delete") { + op = pg_log_entry_t::LOST_DELETE; + } else { + return seastar::make_ready_future<tell_result_t>(tell_result_t{ + -EINVAL, "mode must be 'revert' or 'delete'; mark not yet implemented"}); + } + return pg->mark_unfound_lost(op).then([] { + // TODO + return seastar::make_ready_future<tell_result_t>(); + }); + } +}; + +} // namespace crimson::admin::pg + +namespace crimson::admin { + +template <class Hook, class... Args> +std::unique_ptr<AdminSocketHook> make_asok_hook(Args&&... 
args) +{ + return std::make_unique<Hook>(std::forward<Args>(args)...); +} + +template std::unique_ptr<AdminSocketHook> +make_asok_hook<crimson::admin::pg::QueryCommand>(crimson::osd::OSD& osd); + +template std::unique_ptr<AdminSocketHook> +make_asok_hook<crimson::admin::pg::MarkUnfoundLostCommand>(crimson::osd::OSD& osd); + +} // namespace crimson::admin diff --git a/src/crimson/admin/pg_commands.h b/src/crimson/admin/pg_commands.h new file mode 100644 index 000000000..873b3c923 --- /dev/null +++ b/src/crimson/admin/pg_commands.h @@ -0,0 +1,10 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#pragma once + +namespace crimson::admin::pg { + +class QueryCommand; +class MarkUnfoundLostCommand; + +} // namespace crimson::admin::pg diff --git a/src/crimson/auth/AuthClient.h b/src/crimson/auth/AuthClient.h new file mode 100644 index 000000000..cd21b3838 --- /dev/null +++ b/src/crimson/auth/AuthClient.h @@ -0,0 +1,71 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <cstdint> +#include <string> +#include <tuple> +#include <vector> +#include "include/buffer_fwd.h" +#include "crimson/net/Fwd.h" + +class CryptoKey; + +namespace crimson::auth { + +class error : public std::logic_error { +public: + using std::logic_error::logic_error; +}; + +using method_t = uint32_t; + +// TODO: revisit interfaces for non-dummy implementations +class AuthClient { +public: + virtual ~AuthClient() {} + + struct auth_request_t { + method_t auth_method; + std::vector<uint32_t> preferred_modes; + ceph::bufferlist auth_bl; + }; + /// Build an authentication request to begin the handshake + /// + /// @throw auth::error if unable to build the request + virtual auth_request_t get_auth_request(crimson::net::ConnectionRef conn, + AuthConnectionMetaRef auth_meta) = 0; + + /// Handle server's request to continue the handshake + /// + /// @throw auth::error if unable to build the request + virtual ceph::bufferlist handle_auth_reply_more( + crimson::net::ConnectionRef conn, + AuthConnectionMetaRef auth_meta, + const ceph::bufferlist& bl) = 0; + + /// Handle server's indication that authentication succeeded + /// + /// @return 0 if authenticated, a negative number otherwise + virtual int handle_auth_done( + crimson::net::ConnectionRef conn, + AuthConnectionMetaRef auth_meta, + uint64_t global_id, + uint32_t con_mode, + const bufferlist& bl) = 0; + + /// Handle server's indication that the previous auth attempt failed + /// + /// @return 0 if will try next auth method, a negative number if we have no + /// more options + virtual int handle_auth_bad_method( + crimson::net::ConnectionRef conn, + AuthConnectionMetaRef auth_meta, + uint32_t old_auth_method, + int result, + const std::vector<uint32_t>& allowed_methods, + const std::vector<uint32_t>& allowed_modes) = 0; +}; + +} // namespace crimson::auth diff --git a/src/crimson/auth/AuthServer.h b/src/crimson/auth/AuthServer.h new file mode 100644 index 000000000..d75c8f586 --- /dev/null +++ b/src/crimson/auth/AuthServer.h @@ -0,0 +1,41 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <cstdint> +#include <utility> +#include <vector> +#include "crimson/net/Fwd.h" + +struct AuthAuthorizeHandler; + +namespace crimson::auth { + +class AuthServer { +public: + virtual ~AuthServer() {} + + // Get authentication methods and connection modes for the given peer type + virtual 
std::pair<std::vector<uint32_t>, std::vector<uint32_t>> + get_supported_auth_methods(int peer_type) = 0; + // Get support connection modes for the given peer type and auth method + virtual uint32_t pick_con_mode( + int peer_type, + uint32_t auth_method, + const std::vector<uint32_t>& preferred_modes) = 0; + // return an AuthAuthorizeHandler for the given peer type and auth method + virtual AuthAuthorizeHandler* get_auth_authorize_handler( + int peer_type, + int auth_method) = 0; + // Handle an authentication request on an incoming connection + virtual int handle_auth_request( + crimson::net::ConnectionRef conn, + AuthConnectionMetaRef auth_meta, + bool more, //< true if this is not the first part of the handshake + uint32_t auth_method, + const bufferlist& bl, + bufferlist *reply) = 0; +}; + +} // namespace crimson::auth diff --git a/src/crimson/auth/DummyAuth.h b/src/crimson/auth/DummyAuth.h new file mode 100644 index 000000000..7c26140a2 --- /dev/null +++ b/src/crimson/auth/DummyAuth.h @@ -0,0 +1,78 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "AuthClient.h" +#include "AuthServer.h" + +namespace crimson::auth { + +class DummyAuthClientServer : public AuthClient, + public AuthServer { +public: + DummyAuthClientServer() {} + + // client + std::pair<std::vector<uint32_t>, std::vector<uint32_t>> + get_supported_auth_methods(int peer_type) final { + return {{CEPH_AUTH_NONE}, {CEPH_AUTH_NONE}}; + } + + uint32_t pick_con_mode(int peer_type, + uint32_t auth_method, + const std::vector<uint32_t>& preferred_modes) final { + ceph_assert(auth_method == CEPH_AUTH_NONE); + ceph_assert(preferred_modes.size() && + preferred_modes[0] == CEPH_CON_MODE_CRC); + return CEPH_CON_MODE_CRC; + } + + AuthAuthorizeHandler* get_auth_authorize_handler(int peer_type, + int auth_method) final { + return nullptr; + } + + AuthClient::auth_request_t get_auth_request( + crimson::net::ConnectionRef conn, + AuthConnectionMetaRef auth_meta) override { + return {CEPH_AUTH_NONE, {CEPH_CON_MODE_CRC}, {}}; + } + + ceph::bufferlist handle_auth_reply_more( + crimson::net::ConnectionRef conn, + AuthConnectionMetaRef auth_meta, + const bufferlist& bl) override { + ceph_abort(); + } + + int handle_auth_done( + crimson::net::ConnectionRef conn, + AuthConnectionMetaRef auth_meta, + uint64_t global_id, + uint32_t con_mode, + const bufferlist& bl) override { + return 0; + } + + int handle_auth_bad_method( + crimson::net::ConnectionRef conn, + AuthConnectionMetaRef auth_meta, + uint32_t old_auth_method, + int result, + const std::vector<uint32_t>& allowed_methods, + const std::vector<uint32_t>& allowed_modes) override { + ceph_abort(); + } + + // server + int handle_auth_request( + crimson::net::ConnectionRef conn, + AuthConnectionMetaRef auth_meta, + bool more, + uint32_t auth_method, + const bufferlist& bl, + bufferlist *reply) override { + return 1; + } +}; + +} // namespace crimson::auth diff --git a/src/crimson/auth/KeyRing.cc b/src/crimson/auth/KeyRing.cc new file mode 100644 index 000000000..436e29c1b --- /dev/null +++ b/src/crimson/auth/KeyRing.cc @@ -0,0 +1,79 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "KeyRing.h" + +#include <boost/algorithm/string.hpp> + +#include <seastar/core/do_with.hh> +#include <seastar/core/fstream.hh> +#include <seastar/core/future-util.hh> +#include <seastar/core/reactor.hh> + +#include "common/buffer_seastar.h" +#include "auth/KeyRing.h" +#include 
"include/denc.h" +#include "crimson/common/buffer_io.h" +#include "crimson/common/config_proxy.h" + +namespace crimson::auth { + +seastar::future<KeyRing*> load_from_keyring(KeyRing* keyring) +{ + std::vector<std::string> paths; + boost::split(paths, crimson::common::local_conf()->keyring, + boost::is_any_of(",;")); + std::pair<bool, std::string> found; + return seastar::map_reduce(paths, [](auto path) { + return seastar::engine().file_exists(path).then([path](bool file_exists) { + return std::make_pair(file_exists, path); + }); + }, std::move(found), [](auto found, auto file_exists_and_path) { + if (!found.first && file_exists_and_path.first) { + found = std::move(file_exists_and_path); + } + return found; + }).then([keyring] (auto file_exists_and_path) { + const auto& [exists, path] = file_exists_and_path; + if (exists) { + return read_file(path).then([keyring](auto buf) { + bufferlist bl; + bl.append(buffer::create(std::move(buf))); + auto i = bl.cbegin(); + keyring->decode(i); + return seastar::make_ready_future<KeyRing*>(keyring); + }); + } else { + return seastar::make_ready_future<KeyRing*>(keyring); + } + }); +} + +seastar::future<KeyRing*> load_from_keyfile(KeyRing* keyring) +{ + auto& path = crimson::common::local_conf()->keyfile; + if (!path.empty()) { + return read_file(path).then([keyring](auto buf) { + EntityAuth ea; + ea.key.decode_base64(std::string(buf.begin(), + buf.end())); + keyring->add(crimson::common::local_conf()->name, ea); + return seastar::make_ready_future<KeyRing*>(keyring); + }); + } else { + return seastar::make_ready_future<KeyRing*>(keyring); + } +} + +seastar::future<KeyRing*> load_from_key(KeyRing* keyring) +{ + auto& key = crimson::common::local_conf()->key; + if (!key.empty()) { + EntityAuth ea; + ea.key.decode_base64(key); + keyring->add(crimson::common::local_conf()->name, ea); + } + return seastar::make_ready_future<KeyRing*>(keyring); +} + +} // namespace crimson::auth diff --git a/src/crimson/auth/KeyRing.h b/src/crimson/auth/KeyRing.h new file mode 100644 index 000000000..850f1bb79 --- /dev/null +++ b/src/crimson/auth/KeyRing.h @@ -0,0 +1,15 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <seastar/core/future.hh> + +class KeyRing; + +namespace crimson::auth { + // see KeyRing::from_ceph_context + seastar::future<KeyRing*> load_from_keyring(KeyRing* keyring); + seastar::future<KeyRing*> load_from_keyfile(KeyRing* keyring); + seastar::future<KeyRing*> load_from_key(KeyRing* keyring); +} diff --git a/src/crimson/common/assert.cc b/src/crimson/common/assert.cc new file mode 100644 index 000000000..07610c33f --- /dev/null +++ b/src/crimson/common/assert.cc @@ -0,0 +1,81 @@ +#include <cstdarg> +#include <iostream> + +#include <seastar/util/backtrace.hh> +#include <seastar/core/reactor.hh> + +#include "include/ceph_assert.h" + +#include "crimson/common/log.h" + +namespace ceph { + [[gnu::cold]] void __ceph_assert_fail(const ceph::assert_data &ctx) + { + __ceph_assert_fail(ctx.assertion, ctx.file, ctx.line, ctx.function); + } + + [[gnu::cold]] void __ceph_assert_fail(const char* assertion, + const char* file, int line, + const char* func) + { + seastar::logger& logger = crimson::get_logger(0); + logger.error("{}:{} : In function '{}', ceph_assert(%s)\n" + "{}", + file, line, func, assertion, + seastar::current_backtrace()); + std::cout << std::flush; + abort(); + } + [[gnu::cold]] void __ceph_assertf_fail(const char *assertion, + const char *file, int line, + const char 
*func, const char* msg, + ...) + { + char buf[8096]; + va_list args; + va_start(args, msg); + std::vsnprintf(buf, sizeof(buf), msg, args); + va_end(args); + + seastar::logger& logger = crimson::get_logger(0); + logger.error("{}:{} : In function '{}', ceph_assert(%s)\n" + "{}\n{}\n", + file, line, func, assertion, + buf, + seastar::current_backtrace()); + std::cout << std::flush; + abort(); + } + + [[gnu::cold]] void __ceph_abort(const char* file, int line, + const char* func, const std::string& msg) + { + seastar::logger& logger = crimson::get_logger(0); + logger.error("{}:{} : In function '{}', abort(%s)\n" + "{}", + file, line, func, msg, + seastar::current_backtrace()); + std::cout << std::flush; + abort(); + } + + [[gnu::cold]] void __ceph_abortf(const char* file, int line, + const char* func, const char* fmt, + ...) + { + char buf[8096]; + va_list args; + va_start(args, fmt); + std::vsnprintf(buf, sizeof(buf), fmt, args); + va_end(args); + + seastar::logger& logger = crimson::get_logger(0); + logger.error("{}:{} : In function '{}', abort()\n" + "{}\n{}\n", + file, line, func, + buf, + seastar::current_backtrace()); + std::cout << std::flush; + abort(); + } +} diff --git a/src/crimson/common/auth_handler.h b/src/crimson/common/auth_handler.h new file mode 100644 index 000000000..d4140b6a2 --- /dev/null +++ b/src/crimson/common/auth_handler.h @@ -0,0 +1,17 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +class EntityName; +class AuthCapsInfo; + +namespace crimson::common { +class AuthHandler { +public: + // the peer just got authorized + virtual void handle_authentication(const EntityName& name, + const AuthCapsInfo& caps) = 0; + virtual ~AuthHandler() = default; +}; +} diff --git a/src/crimson/common/buffer_io.cc b/src/crimson/common/buffer_io.cc new file mode 100644 index 000000000..86edf7a6f --- /dev/null +++ b/src/crimson/common/buffer_io.cc @@ -0,0 +1,57 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "buffer_io.h" + +#include <seastar/core/reactor.hh> +#include <seastar/core/fstream.hh> +#include <seastar/core/do_with.hh> + +#include "include/buffer.h" + +namespace crimson { + +seastar::future<> write_file(ceph::buffer::list&& bl, + seastar::sstring fn, + seastar::file_permissions permissions) +{ + const auto flags = (seastar::open_flags::wo | + seastar::open_flags::create | + seastar::open_flags::truncate); + seastar::file_open_options foo; + foo.create_permissions = permissions; + return seastar::open_file_dma(fn, flags, foo).then( + [bl=std::move(bl)](seastar::file f) { + return seastar::make_file_output_stream(f).then( + [bl=std::move(bl), f=std::move(f)](seastar::output_stream<char> out) { + return seastar::do_with(std::move(out), + std::move(f), + std::move(bl), + [](seastar::output_stream<char>& out, + seastar::file& f, + ceph::buffer::list& bl) { + return seastar::do_for_each(bl.buffers(), [&out](auto& buf) { + return out.write(buf.c_str(), buf.length()); + }).then([&out] { + return out.close(); + }); + }); + }); + }); +} + +seastar::future<seastar::temporary_buffer<char>> +read_file(const seastar::sstring fn) +{ + return seastar::open_file_dma(fn, seastar::open_flags::ro).then( + [] (seastar::file f) { + return f.size().then([f = std::move(f)](size_t s) { + return seastar::do_with(seastar::make_file_input_stream(f), + [s](seastar::input_stream<char>& in) { + return in.read_exactly(s); + }); + }); + }); +} + +} diff --git 
a/src/crimson/common/buffer_io.h b/src/crimson/common/buffer_io.h new file mode 100644 index 000000000..c5ece4a6f --- /dev/null +++ b/src/crimson/common/buffer_io.h @@ -0,0 +1,21 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <seastar/core/future.hh> +#include <seastar/core/file-types.hh> + +#include "include/buffer_fwd.h" + +namespace crimson { + seastar::future<> write_file(ceph::buffer::list&& bl, + seastar::sstring fn, + seastar::file_permissions= // 0644 + (seastar::file_permissions::user_read | + seastar::file_permissions::user_write | + seastar::file_permissions::group_read | + seastar::file_permissions::others_read)); + seastar::future<seastar::temporary_buffer<char>> + read_file(const seastar::sstring fn); +} diff --git a/src/crimson/common/config_proxy.cc b/src/crimson/common/config_proxy.cc new file mode 100644 index 000000000..88d4679d5 --- /dev/null +++ b/src/crimson/common/config_proxy.cc @@ -0,0 +1,93 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "config_proxy.h" + +#include <filesystem> + +#include "crimson/common/buffer_io.h" + +namespace crimson::common { + +ConfigProxy::ConfigProxy(const EntityName& name, std::string_view cluster) +{ + if (seastar::this_shard_id() != 0) { + return; + } + // set the initial value on CPU#0 + values.reset(seastar::make_lw_shared<ConfigValues>()); + values.get()->name = name; + values.get()->cluster = cluster; + // and the only copy of md_config_impl<> is allocated on CPU#0 + local_config.reset(new md_config_t{*values, obs_mgr, true}); + if (name.is_mds()) { + local_config->set_val_default(*values, obs_mgr, + "keyring", "$mds_data/keyring"); + } else if (name.is_osd()) { + local_config->set_val_default(*values, obs_mgr, + "keyring", "$osd_data/keyring"); + } +} + +seastar::future<> ConfigProxy::start() +{ + // populate values and config to all other shards + if (!values) { + return seastar::make_ready_future<>(); + } + return container().invoke_on_others([this](auto& proxy) { + return values.copy().then([config=local_config.get(), + &proxy](auto foreign_values) { + proxy.values.reset(); + proxy.values = std::move(foreign_values); + proxy.remote_config = config; + return seastar::make_ready_future<>(); + }); + }); +} + +void ConfigProxy::show_config(ceph::Formatter* f) const { + get_config().show_config(*values, f); +} + +seastar::future<> ConfigProxy::parse_config_files(const std::string& conf_files) +{ + auto conffile_paths = + get_config().get_conffile_paths(*values, + conf_files.empty() ? nullptr : conf_files.c_str(), + &std::cerr, + CODE_ENVIRONMENT_DAEMON); + return seastar::do_with(std::move(conffile_paths), [this] (auto& paths) { + return seastar::repeat([path=paths.begin(), e=paths.end(), this]() mutable { + if (path == e) { + // tried all conffile, none of them works + return seastar::make_ready_future<seastar::stop_iteration>( + seastar::stop_iteration::yes); + } + return crimson::read_file(*path++).then([this](auto&& buf) { + return do_change([buf=std::move(buf), this](ConfigValues& values) { + if (get_config().parse_buffer(values, obs_mgr, + buf.get(), buf.size(), + &std::cerr) == 0) { + get_config().update_legacy_vals(values); + } else { + throw std::invalid_argument("parse error"); + } + }).then([] { + // this one works! 
+ return seastar::make_ready_future<seastar::stop_iteration>( + seastar::stop_iteration::yes); + }); + }).handle_exception_type([] (const std::filesystem::filesystem_error&) { + return seastar::make_ready_future<seastar::stop_iteration>( + seastar::stop_iteration::no); + }).handle_exception_type([] (const std::invalid_argument&) { + return seastar::make_ready_future<seastar::stop_iteration>( + seastar::stop_iteration::no); + }); + }); + }); +} + +ConfigProxy::ShardedConfig ConfigProxy::sharded_conf; +} diff --git a/src/crimson/common/config_proxy.h b/src/crimson/common/config_proxy.h new file mode 100644 index 000000000..f50a63431 --- /dev/null +++ b/src/crimson/common/config_proxy.h @@ -0,0 +1,200 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <seastar/core/reactor.hh> +#include <seastar/core/sharded.hh> +#include "common/config.h" +#include "common/config_obs.h" +#include "common/config_obs_mgr.h" +#include "common/errno.h" + +namespace ceph { +class Formatter; +} + +namespace crimson::common { + +// a facade for managing config. each shard has its own copy of ConfigProxy. +// +// In seastar-osd, there could be multiple instances of @c ConfigValues in a +// single process, as we are using a variant of read-copy-update mechinary to +// update the settings at runtime. +class ConfigProxy : public seastar::peering_sharded_service<ConfigProxy> +{ + using LocalConfigValues = seastar::lw_shared_ptr<ConfigValues>; + seastar::foreign_ptr<LocalConfigValues> values; + + md_config_t* remote_config = nullptr; + std::unique_ptr<md_config_t> local_config; + + using ConfigObserver = ceph::md_config_obs_impl<ConfigProxy>; + ObserverMgr<ConfigObserver> obs_mgr; + + const md_config_t& get_config() const { + return remote_config ? *remote_config : * local_config; + } + md_config_t& get_config() { + return remote_config ? *remote_config : * local_config; + } + + // apply changes to all shards + // @param func a functor which accepts @c "ConfigValues&" + template<typename Func> + seastar::future<> do_change(Func&& func) { + return container().invoke_on(values.get_owner_shard(), + [func = std::move(func)](ConfigProxy& owner) { + // apply the changes to a copy of the values + auto new_values = seastar::make_lw_shared(*owner.values); + new_values->changed.clear(); + func(*new_values); + + // always apply the new settings synchronously on the owner shard, to + // avoid racings with other do_change() calls in parallel. 
+ ObserverMgr<ConfigObserver>::rev_obs_map rev_obs; + owner.values.reset(new_values); + owner.obs_mgr.for_each_change(owner.values->changed, owner, + [&rev_obs](ConfigObserver *obs, + const std::string &key) { + rev_obs[obs].insert(key); + }, nullptr); + for (auto& [obs, keys] : rev_obs) { + obs->handle_conf_change(owner, keys); + } + + return seastar::parallel_for_each(boost::irange(1u, seastar::smp::count), + [&owner, new_values] (auto cpu) { + return owner.container().invoke_on(cpu, + [foreign_values = seastar::make_foreign(new_values)](ConfigProxy& proxy) mutable { + proxy.values.reset(); + proxy.values = std::move(foreign_values); + + ObserverMgr<ConfigObserver>::rev_obs_map rev_obs; + proxy.obs_mgr.for_each_change(proxy.values->changed, proxy, + [&rev_obs](ConfigObserver *obs, const std::string& key) { + rev_obs[obs].insert(key); + }, nullptr); + for (auto& obs_keys : rev_obs) { + obs_keys.first->handle_conf_change(proxy, obs_keys.second); + } + }); + }).finally([new_values] { + new_values->changed.clear(); + }); + }); + } +public: + ConfigProxy(const EntityName& name, std::string_view cluster); + const ConfigValues* operator->() const noexcept { + return values.get(); + } + const ConfigValues get_config_values() { + return *values.get(); + } + ConfigValues* operator->() noexcept { + return values.get(); + } + + // required by sharded<> + seastar::future<> start(); + seastar::future<> stop() { + return seastar::make_ready_future<>(); + } + void add_observer(ConfigObserver* obs) { + obs_mgr.add_observer(obs); + } + void remove_observer(ConfigObserver* obs) { + obs_mgr.remove_observer(obs); + } + seastar::future<> rm_val(const std::string& key) { + return do_change([key, this](ConfigValues& values) { + auto ret = get_config().rm_val(values, key); + if (ret < 0) { + throw std::invalid_argument(cpp_strerror(ret)); + } + }); + } + seastar::future<> set_val(const std::string& key, + const std::string& val) { + return do_change([key, val, this](ConfigValues& values) { + std::stringstream err; + auto ret = get_config().set_val(values, obs_mgr, key, val, &err); + if (ret < 0) { + throw std::invalid_argument(err.str()); + } + }); + } + int get_val(const std::string &key, std::string *val) const { + return get_config().get_val(*values, key, val); + } + template<typename T> + const T get_val(const std::string& key) const { + return get_config().template get_val<T>(*values, key); + } + + int get_all_sections(std::vector<std::string>& sections) const { + return get_config().get_all_sections(sections); + } + + int get_val_from_conf_file(const std::vector<std::string>& sections, + const std::string& key, std::string& out, + bool expand_meta) const { + return get_config().get_val_from_conf_file(*values, sections, key, + out, expand_meta); + } + + unsigned get_osd_pool_default_min_size(uint8_t size) const { + return get_config().get_osd_pool_default_min_size(*values, size); + } + + seastar::future<> + set_mon_vals(const std::map<std::string,std::string,std::less<>>& kv) { + return do_change([kv, this](ConfigValues& values) { + get_config().set_mon_vals(nullptr, values, obs_mgr, kv, nullptr); + }); + } + + seastar::future<> inject_args(const std::string& s) { + return do_change([s, this](ConfigValues& values) { + std::stringstream err; + if (get_config().injectargs(values, obs_mgr, s, &err)) { + throw std::invalid_argument(err.str()); + } + }); + } + void show_config(ceph::Formatter* f) const; + + seastar::future<> parse_argv(std::vector<const char*>& argv) { + // we could pass whatever is unparsed to 
seastar, but seastar::app_template + // is used for driving the seastar application, and + // crimson::common::ConfigProxy is not available until seastar engine is up + // and running, so we have to feed the command line args to app_template + // first, then pass them to ConfigProxy. + return do_change([&argv, this](ConfigValues& values) { + get_config().parse_argv(values, + obs_mgr, + argv, + CONF_CMDLINE); + }); + } + + seastar::future<> parse_config_files(const std::string& conf_files); + + using ShardedConfig = seastar::sharded<ConfigProxy>; + +private: + static ShardedConfig sharded_conf; + friend ConfigProxy& local_conf(); + friend ShardedConfig& sharded_conf(); +}; + +inline ConfigProxy& local_conf() { + return ConfigProxy::sharded_conf.local(); +} + +inline ConfigProxy::ShardedConfig& sharded_conf() { + return ConfigProxy::sharded_conf; +} + +} diff --git a/src/crimson/common/errorator.h b/src/crimson/common/errorator.h new file mode 100644 index 000000000..af1e6ea45 --- /dev/null +++ b/src/crimson/common/errorator.h @@ -0,0 +1,1140 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <exception> +#include <system_error> + +#include <seastar/core/future-util.hh> + +#include "include/ceph_assert.h" + +namespace crimson { + +template<typename Iterator, typename AsyncAction> +inline auto do_for_each(Iterator begin, Iterator end, AsyncAction action) { + using futurator = \ + ::seastar::futurize<std::invoke_result_t<AsyncAction, decltype(*begin)>>; + + if (begin == end) { + return futurator::type::errorator_type::template make_ready_future<>(); + } + while (true) { + auto f = futurator::invoke(action, *begin); + ++begin; + if (begin == end) { + return f; + } + if (!f.available() || seastar::need_preempt()) { + return std::move(f)._then( + [ action = std::move(action), + begin = std::move(begin), + end = std::move(end) + ] () mutable { + return ::crimson::do_for_each(std::move(begin), + std::move(end), + std::move(action)); + }); + } + if (f.failed()) { + return f; + } + } +} +template<typename Container, typename AsyncAction> +inline auto do_for_each(Container& c, AsyncAction action) { + return ::crimson::do_for_each(std::begin(c), std::end(c), std::move(action)); +} + +template<typename AsyncAction> +inline auto do_until(AsyncAction action) { + using errorator_t = + typename ::seastar::futurize_t<std::invoke_result_t<AsyncAction>>::errorator_type; + + while (true) { + auto f = ::seastar::futurize_invoke(action); + if (f.failed()) { + return errorator_t::template make_exception_future2<>( + f.get_exception() + ); + } else if (f.available()) { + if (auto done = f.get0()) { + return errorator_t::template make_ready_future<>(); + } + } else { + return std::move(f)._then( + [action = std::move(action)] (auto &&done) mutable { + if (done) { + return errorator_t::template make_ready_future<>(); + } + return ::crimson::do_until( + std::move(action)); + }); + } + } +} + +// define the interface between error types and errorator +template <class ConcreteErrorT> +class error_t { + static constexpr const std::type_info& get_exception_ptr_type_info() { + return ConcreteErrorT::exception_ptr_type_info(); + } + + std::exception_ptr to_exception_ptr() const { + const auto* concrete_error = static_cast<const ConcreteErrorT*>(this); + return concrete_error->to_exception_ptr(); + } + + decltype(auto) static from_exception_ptr(std::exception_ptr ep) { + return ConcreteErrorT::from_exception_ptr(std::move(ep)); + } + + template 
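A sketch of how these errorator-aware loops are meant to be used. It relies on the errorator and ct_error machinery defined further down in this header; write_all_example and its validation rule are invented for illustration.

#include <vector>
#include "crimson/common/errorator.h"

using loop_ertr = crimson::errorator<crimson::ct_error::enoent>;

loop_ertr::future<> write_all_example(std::vector<int>& items) {
  return crimson::do_for_each(items, [](int item) -> loop_ertr::future<> {
    if (item < 0) {
      // fails the whole loop without throwing; callers see the error
      return crimson::ct_error::enoent::make();
    }
    return loop_ertr::make_ready_future<>();  // stand-in for real async work
  });
}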
<class... AllowedErrorsT> + friend struct errorator; + + template <class ErrorVisitorT, class FuturatorT> + friend class maybe_handle_error_t; + +public: + template <class Func> + static decltype(auto) handle(Func&& func) { + return ConcreteErrorT::handle(std::forward<Func>(func)); + } +}; + +// unthrowable_wrapper ensures compilation failure when somebody +// would like to `throw make_error<...>)()` instead of returning. +// returning allows for the compile-time verification of future's +// AllowedErrorsV and also avoid the burden of throwing. +template <class ErrorT, ErrorT ErrorV> +struct unthrowable_wrapper : error_t<unthrowable_wrapper<ErrorT, ErrorV>> { + unthrowable_wrapper(const unthrowable_wrapper&) = delete; + [[nodiscard]] static const auto& make() { + static constexpr unthrowable_wrapper instance{}; + return instance; + } + + template<class Func> + static auto handle(Func&& func) { + return [ + func = std::forward<Func>(func) + ] (const unthrowable_wrapper&) mutable -> decltype(auto) { + if constexpr (std::is_invocable_v<Func, ErrorT>) { + return std::invoke(std::forward<Func>(func), ErrorV); + } else { + return std::invoke(std::forward<Func>(func)); + } + }; + } + + struct pass_further { + decltype(auto) operator()(const unthrowable_wrapper& e) { + return e; + } + }; + + struct discard { + decltype(auto) operator()(const unthrowable_wrapper&) { + } + }; + + +private: + // can be used only to initialize the `instance` member + explicit unthrowable_wrapper() = default; + + // implement the errorable interface + struct throwable_carrier{}; + static std::exception_ptr carrier_instance; + + static constexpr const std::type_info& exception_ptr_type_info() { + return typeid(throwable_carrier); + } + auto to_exception_ptr() const { + // error codes don't need to instantiate `std::exception_ptr` each + // time as the code is actually a part of the type itself. + // `std::make_exception_ptr()` on modern enough GCCs is quite cheap + // (see the Gleb Natapov's patch eradicating throw/catch there), + // but using one instance per type boils down the overhead to just + // ref-counting. + return carrier_instance; + } + static const auto& from_exception_ptr(std::exception_ptr) { + return make(); + } + + friend class error_t<unthrowable_wrapper<ErrorT, ErrorV>>; +}; + +template <class ErrorT, ErrorT ErrorV> +std::exception_ptr unthrowable_wrapper<ErrorT, ErrorV>::carrier_instance = \ + std::make_exception_ptr< + unthrowable_wrapper<ErrorT, ErrorV>::throwable_carrier>({}); + + +template <class ErrorT> +struct stateful_error_t : error_t<stateful_error_t<ErrorT>> { + template <class... Args> + explicit stateful_error_t(Args&&... 
args) + : ep(std::make_exception_ptr<ErrorT>(std::forward<Args>(args)...)) { + } + + template<class Func> + static auto handle(Func&& func) { + static_assert(std::is_invocable_v<Func, ErrorT>); + return [ + func = std::forward<Func>(func) + ] (stateful_error_t<ErrorT>&& e) mutable -> decltype(auto) { + try { + std::rethrow_exception(e.ep); + } catch (const ErrorT& obj) { + return std::invoke(std::forward<Func>(func), obj); + } + ceph_abort_msg("exception type mismatch – impossible!"); + }; + } + +private: + std::exception_ptr ep; + + explicit stateful_error_t(std::exception_ptr ep) : ep(std::move(ep)) {} + + static constexpr const std::type_info& exception_ptr_type_info() { + return typeid(ErrorT); + } + auto to_exception_ptr() const { + return ep; + } + static stateful_error_t<ErrorT> from_exception_ptr(std::exception_ptr ep) { + return stateful_error_t<ErrorT>(std::move(ep)); + } + + friend class error_t<stateful_error_t<ErrorT>>; +}; + +namespace _impl { + template <class T> struct always_false : std::false_type {}; +}; + +template <class ErrorVisitorT, class FuturatorT> +class maybe_handle_error_t { + const std::type_info& type_info; + typename FuturatorT::type result; + ErrorVisitorT errfunc; + +public: + maybe_handle_error_t(ErrorVisitorT&& errfunc, std::exception_ptr ep) + : type_info(*ep.__cxa_exception_type()), + result(FuturatorT::make_exception_future(std::move(ep))), + errfunc(std::forward<ErrorVisitorT>(errfunc)) { + } + + template <class ErrorT> + void handle() { + static_assert(std::is_invocable<ErrorVisitorT, ErrorT>::value, + "provided Error Visitor is not exhaustive"); + // In C++ throwing an exception isn't the sole way to signal + // error with it. This approach nicely fits cold, infrequent cases + // but when applied to a hot one, it will likely hurt performance. + // + // Alternative approach is to create `std::exception_ptr` on our + // own and place it in the future via `make_exception_future()`. + // When it comes to handling, the pointer can be interrogated for + // pointee's type with `__cxa_exception_type()` instead of costly + // re-throwing (via `std::rethrow_exception()`) and matching with + // `catch`. The limitation here is lack of support for hierarchies + // of exceptions. The code below checks for exact match only while + // `catch` would allow to match against a base class as well. + // However, this shouldn't be a big issue for `errorator` as Error + // Visitors are already checked for exhaustiveness at compile-time. + // + // NOTE: `__cxa_exception_type()` is an extension of the language. + // It should be available both in GCC and Clang but a fallback + // (based on `std::rethrow_exception()` and `catch`) can be made + // to handle other platforms if necessary. + if (type_info == ErrorT::error_t::get_exception_ptr_type_info()) { + // set `state::invalid` in internals of `seastar::future` to not + // call `report_failed_future()` during `operator=()`. + [[maybe_unused]] auto&& ep = std::move(result).get_exception(); + + using return_t = std::invoke_result_t<ErrorVisitorT, ErrorT>; + if constexpr (std::is_assignable_v<decltype(result), return_t>) { + result = std::invoke(std::forward<ErrorVisitorT>(errfunc), + ErrorT::error_t::from_exception_ptr(std::move(ep))); + } else if constexpr (std::is_same_v<return_t, void>) { + // void denotes explicit discarding + // execute for the sake a side effects. Typically this boils down + // to throwing an exception by the handler. 
+ std::invoke(std::forward<ErrorVisitorT>(errfunc), + ErrorT::error_t::from_exception_ptr(std::move(ep))); + } else { + static_assert(_impl::always_false<return_t>::value, + "return of Error Visitor is not assignable to future"); + // do nothing with `ep`. + } + } + } + + auto get_result() && { + return std::move(result); + } +}; + +template <class FuncHead, class... FuncTail> +static constexpr auto composer(FuncHead&& head, FuncTail&&... tail) { + return [ + head = std::forward<FuncHead>(head), + // perfect forwarding in lambda's closure isn't available in C++17 + // using tuple as workaround; see: https://stackoverflow.com/a/49902823 + tail = std::make_tuple(std::forward<FuncTail>(tail)...) + ] (auto&&... args) mutable -> decltype(auto) { + if constexpr (std::is_invocable_v<FuncHead, decltype(args)...>) { + return std::invoke(std::forward<FuncHead>(head), + std::forward<decltype(args)>(args)...); + } else if constexpr (sizeof...(FuncTail) > 0) { + using next_composer_t = decltype(composer<FuncTail...>); + auto&& next = std::apply<next_composer_t>(composer<FuncTail...>, + std::move(tail)); + return std::invoke(std::move(next), + std::forward<decltype(args)>(args)...); + } else { + static_assert( + std::is_invocable_v<FuncHead, decltype(args)...> || + (sizeof...(FuncTail) > 0), + "composition is not exhaustive"); + } + }; +} + +template <class ValueT> +struct errorated_future_marker{}; + +template <class... AllowedErrors> +struct errorator { + template <class T> + static inline constexpr bool is_error_v = std::is_base_of_v<error_t<T>, T>; + + static_assert((... && is_error_v<AllowedErrors>), + "errorator expects presence of ::is_error in all error types"); + + template <class ErrorT> + struct contains_once { + static constexpr bool value = + (0 + ... + std::is_same_v<ErrorT, AllowedErrors>) == 1; + }; + template <class... Errors> + struct contains_once<errorator<Errors...>> { + static constexpr bool value = (... && contains_once<Errors>::value); + }; + template <class T> + static constexpr bool contains_once_v = contains_once<T>::value; + + static_assert((... && contains_once_v<AllowedErrors>), + "no error type in errorator can be duplicated"); + + struct ready_future_marker{}; + struct exception_future_marker{}; + +private: + // see the comment for `using future = _future` below. + template <class> + class _future {}; + template <class ValueT> + class _future<::crimson::errorated_future_marker<ValueT>> + : private seastar::future<ValueT> { + using base_t = seastar::future<ValueT>; + // we need the friendship for the sake of `get_exception() &&` when + // `safe_then()` is going to return an errorated future as a result of + // chaining. In contrast to `seastar::future`, errorator<T...>::future` + // has this member private. + template <class ErrorVisitor, class Futurator> + friend class maybe_handle_error_t; + + // any `seastar::futurize` specialization must be able to access the base. + // see : `satisfy_with_result_of()` far below. + template <typename> + friend class seastar::futurize; + + template <typename T1, typename T2, typename... More> + friend auto seastar::internal::do_with_impl(T1&& rv1, T2&& rv2, More&&... more); + + template <class, class = std::void_t<>> + struct get_errorator { + // generic template for non-errorated things (plain types and + // vanilla seastar::future as well). 
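The no-rethrow dispatch described a few lines above in maybe_handle_error_t is easy to show in isolation. This standalone fragment assumes GCC or Clang, where std::exception_ptr exposes the __cxa_exception_type() extension; crimson/common/exception.h later in this series uses the identical trick.

#include <exception>
#include <typeinfo>

struct example_error {};

bool is_example_error(const std::exception_ptr& ep) {
  // inspect the pointee's type directly instead of rethrowing and catching
  return ep && (*ep.__cxa_exception_type() == typeid(example_error));
}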
+ using type = errorator<>; + }; + template <class FutureT> + struct get_errorator<FutureT, + std::void_t<typename FutureT::errorator_type>> { + using type = typename FutureT::errorator_type; + }; + template <class T> + using get_errorator_t = typename get_errorator<T>::type; + + template <class ValueFuncErroratorT, class... ErrorVisitorRetsT> + struct make_errorator { + // NOP. The generic template. + }; + template <class... ValueFuncAllowedErrors, + class ErrorVisitorRetsHeadT, + class... ErrorVisitorRetsTailT> + struct make_errorator<errorator<ValueFuncAllowedErrors...>, + ErrorVisitorRetsHeadT, + ErrorVisitorRetsTailT...> { + private: + using step_errorator = errorator<ValueFuncAllowedErrors...>; + // add ErrorVisitorRetsHeadT only if 1) it's an error type and + // 2) isn't already included in the errorator's error set. + // It's enough to negate contains_once_v as any errorator<...> + // type is already guaranteed to be free of duplications. + using next_errorator = std::conditional_t< + is_error_v<ErrorVisitorRetsHeadT> && + !step_errorator::template contains_once_v<ErrorVisitorRetsHeadT>, + typename step_errorator::template extend<ErrorVisitorRetsHeadT>, + step_errorator>; + + public: + using type = typename make_errorator<next_errorator, + ErrorVisitorRetsTailT...>::type; + }; + // finish the recursion + template <class... ValueFuncAllowedErrors> + struct make_errorator<errorator<ValueFuncAllowedErrors...>> { + using type = ::crimson::errorator<ValueFuncAllowedErrors...>; + }; + template <class... Args> + using make_errorator_t = typename make_errorator<Args...>::type; + + using base_t::base_t; + + template <class Futurator, class Future, class ErrorVisitor> + [[gnu::noinline]] + static auto _safe_then_handle_errors(Future&& future, + ErrorVisitor&& errfunc) { + maybe_handle_error_t<ErrorVisitor, Futurator> maybe_handle_error( + std::forward<ErrorVisitor>(errfunc), + std::move(future).get_exception() + ); + (maybe_handle_error.template handle<AllowedErrors>() , ...); + return std::move(maybe_handle_error).get_result(); + } + + public: + using errorator_type = ::crimson::errorator<AllowedErrors...>; + using promise_type = seastar::promise<ValueT>; + + using base_t::available; + using base_t::failed; + // need this because of the legacy in PG::do_osd_ops(). + using base_t::handle_exception_type; + + [[gnu::always_inline]] + _future(base_t&& base) + : base_t(std::move(base)) { + } + + template <class... A> + [[gnu::always_inline]] + _future(ready_future_marker, A&&... 
a) + : base_t(::seastar::make_ready_future<ValueT>(std::forward<A>(a)...)) { + } + [[gnu::always_inline]] + _future(exception_future_marker, ::seastar::future_state_base&& state) noexcept + : base_t(::seastar::futurize<base_t>::make_exception_future(std::move(state))) { + } + [[gnu::always_inline]] + _future(exception_future_marker, std::exception_ptr&& ep) noexcept + : base_t(::seastar::futurize<base_t>::make_exception_future(std::move(ep))) { + } + + template <template <class...> class ErroratedFuture, + class = std::void_t< + typename ErroratedFuture< + ::crimson::errorated_future_marker<ValueT>>::errorator_type>> + operator ErroratedFuture<errorated_future_marker<ValueT>> () && { + using dest_errorator_t = \ + typename ErroratedFuture< + ::crimson::errorated_future_marker<ValueT>>::errorator_type; + static_assert(dest_errorator_t::template contains_once_v<errorator_type>, + "conversion is possible to more-or-eq errorated future!"); + return static_cast<base_t&&>(*this); + } + + // initialize future as failed without throwing. `make_exception_future()` + // internally uses `std::make_exception_ptr()`. cppreference.com shouldn't + // be misinterpreted when it says: + // + // "This is done as if executing the following code: + // try { + // throw e; + // } catch(...) { + // return std::current_exception(); + // }", + // + // the "as if" is absolutely crucial because modern GCCs employ optimized + // path for it. See: + // * https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=cce8e59224e18858749a2324bce583bcfd160d6c, + // * https://gcc.gnu.org/ml/gcc-patches/2016-08/msg00373.html. + // + // This behavior, combined with `__cxa_exception_type()` for inspecting + // exception's type, allows for throw/catch-free handling of stateless + // exceptions (which is fine for error codes). Stateful jumbos would be + // actually a bit harder as `_M_get()` is private, and thus rethrowing is + // necessary to get to the state inside. However, it's not unthinkable to + // see another extension bringing operator*() to the exception pointer... + // + // TODO: we don't really need to `make_exception_ptr` each time. It still + // allocates memory underneath while can be replaced with single instance + // per type created on start-up. + template <class ErrorT, + class DecayedT = std::decay_t<ErrorT>, + bool IsError = is_error_v<DecayedT>, + class = std::enable_if_t<IsError>> + _future(ErrorT&& e) + : base_t( + seastar::make_exception_future<ValueT>( + errorator_type::make_exception_ptr(e))) { + static_assert(errorator_type::contains_once_v<DecayedT>, + "ErrorT is not enlisted in errorator"); + } + + template <class ValueFuncT, class ErrorVisitorT> + auto safe_then(ValueFuncT&& valfunc, ErrorVisitorT&& errfunc) { + static_assert((... && std::is_invocable_v<ErrorVisitorT, + AllowedErrors>), + "provided Error Visitor is not exhaustive"); + + using value_func_result_t = + typename std::conditional_t<std::is_void_v<ValueT>, + std::invoke_result<ValueFuncT>, + std::invoke_result<ValueFuncT, ValueT>>::type; + // recognize whether there can be any error coming from the Value + // Function. + using value_func_errorator_t = get_errorator_t<value_func_result_t>; + // mutate the Value Function's errorator to harvest errors coming + // from the Error Visitor. Yes, it's perfectly fine to fail error + // handling at one step and delegate even broader set of issues + // to next continuation. 
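To make the ErrorT&& constructor above concrete, here is a hypothetical crimson-style interface: errors are part of the signature and are returned rather than thrown. The ct_error aliases used below are defined near the end of this header; lookup_example and its table are invented for illustration.

#include <map>
#include <string>
#include "crimson/common/errorator.h"

using lookup_ertr = crimson::errorator<
  crimson::ct_error::enoent,
  crimson::ct_error::input_output_error>;

lookup_ertr::future<int> lookup_example(const std::map<std::string, int>& table,
                                        const std::string& key) {
  if (auto found = table.find(key); found != table.end()) {
    return lookup_ertr::make_ready_future<int>(found->second);
  }
  // builds an already-failed errorated future; nothing is thrown
  return crimson::ct_error::enoent::make();
}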
+ using return_errorator_t = make_errorator_t< + value_func_errorator_t, + std::decay_t<std::invoke_result_t<ErrorVisitorT, AllowedErrors>>...>; + // OK, now we know about all errors next continuation must take + // care about. If Visitor handled everything and the Value Func + // doesn't return any, we'll finish with errorator<>::future + // which is just vanilla seastar::future – that's it, next cont + // finally could use `.then()`! + using futurator_t = \ + typename return_errorator_t::template futurize<value_func_result_t>; + // `seastar::futurize`, used internally by `then_wrapped()`, would + // wrap any non-`seastar::future` type coming from Value Func into + // `seastar::future`. As we really don't want to end with things + // like `seastar::future<errorator::future<...>>`, we need either: + // * convert the errorated future into plain in the lambda below + // and back here or + // * specialize the `seastar::futurize<T>` to get proper kind of + // future directly from `::then_wrapped()`. + // As C++17 doesn't guarantee copy elision when non-same types are + // involved while examination of assemblies from GCC 8.1 confirmed + // extra copying, switch to the second approach has been made. + return this->then_wrapped( + [ valfunc = std::forward<ValueFuncT>(valfunc), + errfunc = std::forward<ErrorVisitorT>(errfunc) + ] (auto&& future) mutable noexcept { + if (__builtin_expect(future.failed(), false)) { + return _safe_then_handle_errors<futurator_t>( + std::move(future), std::forward<ErrorVisitorT>(errfunc)); + } else { + // NOTE: using `seastar::future::get()` here is a bit bloaty + // as the method rechecks availability of future's value and, + // if it's unavailable, does the `::do_wait()` path (yes, it + // targets `seastar::thread`). Actually this is dead code as + // `then_wrapped()` executes the lambda only when the future + // is available (which means: failed or ready). However, GCC + // hasn't optimized it out: + // + // if (__builtin_expect(future.failed(), false)) { + // ea25: 48 83 bd c8 fe ff ff cmpq $0x2,-0x138(%rbp) + // ea2c: 02 + // ea2d: 0f 87 f0 05 00 00 ja f023 <ceph::osd:: + // ... + // /// If get() is called in a \ref seastar::thread context, + // /// then it need not be available; instead, the thread will + // /// be paused until the future becomes available. + // [[gnu::always_inline]] + // std::tuple<T...> get() { + // if (!_state.available()) { + // ea3a: 0f 85 1b 05 00 00 jne ef5b <ceph::osd:: + // } + // ... + // + // I don't perceive this as huge issue. Though, it cannot be + // claimed errorator has 0 overhead on hot path. The perfect + // solution here would be mark the `::get_available_state()` + // as `protected` and use dedicated `get_value()` exactly as + // `::then()` already does. + return futurator_t::invoke(std::forward<ValueFuncT>(valfunc), + std::move(future).get()); + } + }); + } + + /** + * unsafe_thread_get + * + * Only valid within a seastar_thread. Ignores errorator protections + * and throws any contained exceptions. + * + * Should really only be used within test code + * (see test/crimson/gtest_seastar.h). 
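Continuing the hypothetical lookup_example sketch: this is how a caller consumes the result with safe_then(). The value branch and the error visitor are both checked against the errorator's error set at compile time; all_same_way, defined a little further down, funnels every error into one handler, and since every error is handled here a plain seastar::future comes out.

seastar::future<int> lookup_or_default_example(
    const std::map<std::string, int>& table, const std::string& key) {
  return lookup_example(table, key).safe_then(
    [](int value) {
      return seastar::make_ready_future<int>(value);
    },
    lookup_ertr::all_same_way([](const std::error_code&) {
      // enoent and input_output_error both funnel here; fall back to 0
      return seastar::make_ready_future<int>(0);
    }));
}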
+ */ + auto &&unsafe_get() { + return seastar::future<ValueT>::get(); + } + auto unsafe_get0() { + return seastar::future<ValueT>::get0(); + } + + template <class FuncT> + _future finally(FuncT &&func) { + return this->then_wrapped( + [func = std::forward<FuncT>(func)](auto &&result) mutable noexcept { + if constexpr (seastar::is_future<std::invoke_result_t<FuncT>>::value) { + return ::seastar::futurize_invoke(std::forward<FuncT>(func)).then_wrapped( + [result = std::move(result)](auto&& f_res) mutable { + // TODO: f_res.failed() + (void)f_res.discard_result(); + return std::move(result); + }); + } else { + try { + func(); + } catch (...) { + // TODO: rethrow + } + return std::move(result); + } + }); + } + + // taking ErrorFuncOne and ErrorFuncTwo separately from ErrorFuncTail + // to avoid SFINAE + template <class ValueFunc, + class ErrorFuncHead, + class... ErrorFuncTail> + auto safe_then(ValueFunc&& value_func, + ErrorFuncHead&& error_func_head, + ErrorFuncTail&&... error_func_tail) { + static_assert(sizeof...(ErrorFuncTail) > 0); + return safe_then( + std::forward<ValueFunc>(value_func), + composer(std::forward<ErrorFuncHead>(error_func_head), + std::forward<ErrorFuncTail>(error_func_tail)...)); + } + + template <class ValueFunc> + auto safe_then(ValueFunc&& value_func) { + return safe_then(std::forward<ValueFunc>(value_func), + errorator_type::pass_further{}); + } + + template <class Func> + void then(Func&&) = delete; + + template <class ErrorVisitorT> + auto handle_error(ErrorVisitorT&& errfunc) { + static_assert((... && std::is_invocable_v<ErrorVisitorT, + AllowedErrors>), + "provided Error Visitor is not exhaustive"); + using return_errorator_t = make_errorator_t< + errorator<>, + std::decay_t<std::invoke_result_t<ErrorVisitorT, AllowedErrors>>...>; + using futurator_t = \ + typename return_errorator_t::template futurize<::seastar::future<ValueT>>; + return this->then_wrapped( + [ errfunc = std::forward<ErrorVisitorT>(errfunc) + ] (auto&& future) mutable noexcept { + if (__builtin_expect(future.failed(), false)) { + return _safe_then_handle_errors<futurator_t>( + std::move(future), std::forward<ErrorVisitorT>(errfunc)); + } else { + return typename futurator_t::type{ std::move(future) }; + } + }); + } + template <class ErrorFuncHead, + class... ErrorFuncTail> + auto handle_error(ErrorFuncHead&& error_func_head, + ErrorFuncTail&&... error_func_tail) { + static_assert(sizeof...(ErrorFuncTail) > 0); + return this->handle_error( + composer(std::forward<ErrorFuncHead>(error_func_head), + std::forward<ErrorFuncTail>(error_func_tail)...)); + } + + private: + // for ::crimson::do_for_each + template <class Func> + auto _then(Func&& func) { + return base_t::then(std::forward<Func>(func)); + } + template<typename Iterator, typename AsyncAction> + friend inline auto ::crimson::do_for_each(Iterator begin, + Iterator end, + AsyncAction action); + + template<typename AsyncAction> + friend inline auto ::crimson::do_until(AsyncAction action); + + template <typename Result> + friend class ::seastar::future; + + // let seastar::do_with_impl to up-cast us to seastar::future. + template<typename T, typename F> + friend inline auto ::seastar::internal::do_with_impl(T&& rvalue, F&& f); + template<typename T1, typename T2, typename T3_or_F, typename... More> + friend inline auto ::seastar::internal::do_with_impl(T1&& rv1, T2&& rv2, T3_or_F&& rv3, More&&... 
more); + }; + + class Enabler {}; + + template <typename T> + using EnableIf = typename std::enable_if<contains_once_v<std::decay_t<T>>, Enabler>::type; + + template <typename ErrorFunc> + struct all_same_way_t { + ErrorFunc func; + all_same_way_t(ErrorFunc &&error_func) + : func(std::forward<ErrorFunc>(error_func)) {} + + template <typename ErrorT, EnableIf<ErrorT>...> + decltype(auto) operator()(ErrorT&& e) { + using decayed_t = std::decay_t<decltype(e)>; + auto&& handler = + decayed_t::error_t::handle(std::forward<ErrorFunc>(func)); + static_assert(std::is_invocable_v<decltype(handler), ErrorT>); + return std::invoke(std::move(handler), std::forward<ErrorT>(e)); + } + }; + +public: + // HACK: `errorated_future_marker` and `_future` is just a hack to + // specialize `seastar::futurize` for category of class templates: + // `future<...>` from distinct errorators. Such tricks are usually + // performed basing on SFINAE and `std::void_t` to check existence + // of a trait/member (`future<...>::errorator_type` in our case). + // Unfortunately, this technique can't be applied as the `futurize` + // lacks the optional parameter. The problem looks awfully similar + // to following SO item: https://stackoverflow.com/a/38860413. + template <class ValueT=void> + using future = _future<::crimson::errorated_future_marker<ValueT>>; + + // the visitor that forwards handling of all errors to next continuation + struct pass_further { + template <class ErrorT, EnableIf<ErrorT>...> + decltype(auto) operator()(ErrorT&& e) { + static_assert(contains_once_v<std::decay_t<ErrorT>>, + "passing further disallowed ErrorT"); + return std::forward<ErrorT>(e); + } + }; + + struct discard_all { + template <class ErrorT, EnableIf<ErrorT>...> + void operator()(ErrorT&&) { + static_assert(contains_once_v<std::decay_t<ErrorT>>, + "discarding disallowed ErrorT"); + } + }; + + // assert_all{ "TODO" }; + class assert_all { + const char* const msg = nullptr; + public: + template <std::size_t N> + assert_all(const char (&msg)[N]) + : msg(msg) { + } + assert_all() = default; + + template <class ErrorT, EnableIf<ErrorT>...> + void operator()(ErrorT&&) { + static_assert(contains_once_v<std::decay_t<ErrorT>>, + "discarding disallowed ErrorT"); + if (msg) { + ceph_abort_msg(msg); + } else { + ceph_abort(); + } + } + }; + + template <class ErrorFunc> + static decltype(auto) all_same_way(ErrorFunc&& error_func) { + return all_same_way_t<ErrorFunc>{std::forward<ErrorFunc>(error_func)}; + }; + + // get a new errorator by extending current one with new error + template <class... NewAllowedErrorsT> + using extend = errorator<AllowedErrors..., NewAllowedErrorsT...>; + + // get a new errorator by summing and deduplicating error set of + // the errorator `unify<>` is applied on with another errorator + // provided as template parameter. + template <class OtherErroratorT> + struct unify { + // 1st: generic NOP template + }; + template <class OtherAllowedErrorsHead, + class... OtherAllowedErrorsTail> + struct unify<errorator<OtherAllowedErrorsHead, + OtherAllowedErrorsTail...>> { + private: + // 2nd: specialization for errorators with non-empty error set. + // + // split error set of other errorator, passed as template param, + // into head and tail. Mix error set of this errorator with head + // of the other one only if it isn't already present in the set. 
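One more snippet reusing the hypothetical lookup_example: when a caller knows the errors cannot happen (or simply must not propagate), handle_error() with the assert_all visitor shown above consumes the whole error set without touching the value path.

seastar::future<int> must_exist_example(
    const std::map<std::string, int>& table, const std::string& key) {
  // every error is consumed, so the result degrades to a plain seastar::future
  return lookup_example(table, key).handle_error(
    lookup_ertr::assert_all{"lookup_example must not fail for this caller"});
}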
+ using step_errorator = std::conditional_t< + contains_once_v<OtherAllowedErrorsHead> == false, + errorator<AllowedErrors..., OtherAllowedErrorsHead>, + errorator<AllowedErrors...>>; + using rest_errorator = errorator<OtherAllowedErrorsTail...>; + + public: + using type = typename step_errorator::template unify<rest_errorator>::type; + }; + template <class... EmptyPack> + struct unify<errorator<EmptyPack...>> { + // 3rd: recursion finisher + static_assert(sizeof...(EmptyPack) == 0); + using type = errorator<AllowedErrors...>; + }; + + template <typename T=void, typename... A> + static future<T> make_ready_future(A&&... value) { + return future<T>(ready_future_marker(), std::forward<A>(value)...); + } + + template <typename T=void> + static + future<T> make_exception_future2(std::exception_ptr&& ex) noexcept { + return future<T>(exception_future_marker(), std::move(ex)); + } + template <typename T=void> + static + future<T> make_exception_future2(seastar::future_state_base&& state) noexcept { + return future<T>(exception_future_marker(), std::move(state)); + } + template <typename T=void, typename Exception> + static + future<T> make_exception_future2(Exception&& ex) noexcept { + return make_exception_future2<T>(std::make_exception_ptr(std::forward<Exception>(ex))); + } + + static auto now() { + return make_ready_future<>(); + } + +private: + template <class T, class = std::void_t<T>> + class futurize { + using vanilla_futurize = seastar::futurize<T>; + + // explicit specializations for nested type is not allowed unless both + // the member template and the enclosing template are specialized. see + // section temp.expl.spec, N4659 + template <class Stored, int Dummy = 0> + struct stored_to_future { + using type = future<Stored>; + }; + template <int Dummy> + struct stored_to_future <seastar::internal::monostate, Dummy> { + using type = future<>; + }; + + public: + using type = + typename stored_to_future<typename vanilla_futurize::value_type>::type; + + template <class Func, class... Args> + static type invoke(Func&& func, Args&&... args) { + try { + return vanilla_futurize::invoke(std::forward<Func>(func), + std::forward<Args>(args)...); + } catch (...) { + return make_exception_future(std::current_exception()); + } + } + + template <class Func> + static type invoke(Func&& func, seastar::internal::monostate) { + try { + return vanilla_futurize::invoke(std::forward<Func>(func)); + } catch (...) { + return make_exception_future(std::current_exception()); + } + } + + template <typename Arg> + static type make_exception_future(Arg&& arg) { + return vanilla_futurize::make_exception_future(std::forward<Arg>(arg)); + } + }; + template <template <class...> class ErroratedFutureT, + class ValueT> + class futurize<ErroratedFutureT<::crimson::errorated_future_marker<ValueT>>, + std::void_t< + typename ErroratedFutureT< + ::crimson::errorated_future_marker<ValueT>>::errorator_type>> { + public: + using type = ::crimson::errorator<AllowedErrors...>::future<ValueT>; + + template <class Func, class... Args> + static type apply(Func&& func, std::tuple<Args...>&& args) { + try { + return ::seastar::futurize_apply(std::forward<Func>(func), + std::forward<std::tuple<Args...>>(args)); + } catch (...) { + return make_exception_future(std::current_exception()); + } + } + + template <class Func, class... Args> + static type invoke(Func&& func, Args&&... args) { + try { + return ::seastar::futurize_invoke(std::forward<Func>(func), + std::forward<Args>(args)...); + } catch (...) 
{ + return make_exception_future(std::current_exception()); + } + } + + template <class Func> + static type invoke(Func&& func, seastar::internal::monostate) { + try { + return ::seastar::futurize_invoke(std::forward<Func>(func)); + } catch (...) { + return make_exception_future(std::current_exception()); + } + } + + template <typename Arg> + static type make_exception_future(Arg&& arg) { + return ::crimson::errorator<AllowedErrors...>::make_exception_future2<ValueT>(std::forward<Arg>(arg)); + } + }; + + template <class ErrorT> + static std::exception_ptr make_exception_ptr(ErrorT&& e) { + // calling via interface class due to encapsulation and friend relations. + return e.error_t<std::decay_t<ErrorT>>::to_exception_ptr(); + } + + // needed because of: + // * return_errorator_t::template futurize<...> in `safe_then()`, + // * conversion to `std::exception_ptr` in `future::future(ErrorT&&)`. + // the friendship with all errorators is an idea from Kefu to fix build + // issues on GCC 9. This version likely fixes some access violation bug + // we were exploiting before. + template <class...> + friend class errorator; +}; // class errorator, generic template + +// no errors? errorator<>::future is plain seastar::future then! +template <> +class errorator<> { +public: + template <class ValueT> + using future = ::seastar::future<ValueT>; + + template <class T> + using futurize = ::seastar::futurize<T>; + + // get a new errorator by extending current one with new error + template <class... NewAllowedErrors> + using extend = errorator<NewAllowedErrors...>; + + // errorator with empty error set never contains any error + template <class T> + static constexpr bool contains_once_v = false; +}; // class errorator, <> specialization + + +template <class ErroratorOne, + class ErroratorTwo, + class... FurtherErrators> +struct compound_errorator { +private: + // generic template. Empty `FurtherErrators` are handled by + // the specialization below. + static_assert(sizeof...(FurtherErrators) > 0); + using step = + typename compound_errorator<ErroratorOne, ErroratorTwo>::type; + +public: + using type = + typename compound_errorator<step, FurtherErrators...>::type; +}; +template <class ErroratorOne, + class ErroratorTwo> +struct compound_errorator<ErroratorOne, ErroratorTwo> { + // specialization for empty `FurtherErrators` arg pack + using type = + typename ErroratorOne::template unify<ErroratorTwo>::type; +}; +template <class... Args> +using compound_errorator_t = typename compound_errorator<Args...>::type; + +// this is conjunction of two nasty features: C++14's variable template +// and inline global variable of C++17. The latter is crucial to ensure +// the variable will get the same address across all translation units. 
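A short sketch of composing error sets with extend<> and compound_errorator_t shown above. The aliases are hypothetical and use the ct_error shorthands defined just below; the point is that a future with a smaller error set converts implicitly to one with a superset, never the other way around.

using read_ertr  = crimson::errorator<crimson::ct_error::enoent,
                                      crimson::ct_error::input_output_error>;
using write_ertr = read_ertr::extend<crimson::ct_error::enospc>;
using io_ertr    = crimson::compound_errorator_t<read_ertr, write_ertr>;

io_ertr::future<> combined_example(bool write) {
  if (write) {
    return crimson::ct_error::enospc::make();  // allowed: enospc is in io_ertr's set
  }
  return read_ertr::now();  // read_ertr's error set is a subset of io_ertr's
}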
+template <std::errc ErrorV> +inline std::error_code ec = std::make_error_code(ErrorV); + +template <std::errc ErrorV> +using ct_error_code = unthrowable_wrapper<const std::error_code&, ec<ErrorV>>; + +namespace ct_error { + using enoent = ct_error_code<std::errc::no_such_file_or_directory>; + using enodata = ct_error_code<std::errc::no_message_available>; + using invarg = ct_error_code<std::errc::invalid_argument>; + using input_output_error = ct_error_code<std::errc::io_error>; + using object_corrupted = ct_error_code<std::errc::illegal_byte_sequence>; + using permission_denied = ct_error_code<std::errc::permission_denied>; + using operation_not_supported = + ct_error_code<std::errc::operation_not_supported>; + using not_connected = ct_error_code<std::errc::not_connected>; + using timed_out = ct_error_code<std::errc::timed_out>; + using erange = + ct_error_code<std::errc::result_out_of_range>; + using ebadf = + ct_error_code<std::errc::bad_file_descriptor>; + using enospc = + ct_error_code<std::errc::no_space_on_device>; + using value_too_large = ct_error_code<std::errc::value_too_large>; + using eagain = + ct_error_code<std::errc::resource_unavailable_try_again>; + using file_too_large = + ct_error_code<std::errc::file_too_large>; + using address_in_use = ct_error_code<std::errc::address_in_use>; + + struct pass_further_all { + template <class ErrorT> + decltype(auto) operator()(ErrorT&& e) { + return std::forward<ErrorT>(e); + } + }; + + struct discard_all { + template <class ErrorT> + void operator()(ErrorT&&) { + } + }; + + class assert_all { + const char* const msg = nullptr; + public: + template <std::size_t N> + assert_all(const char (&msg)[N]) + : msg(msg) { + } + assert_all() = default; + + template <class ErrorT> + void operator()(ErrorT&&) { + if (msg) { + ceph_abort(msg); + } else { + ceph_abort(); + } + } + }; + + template <class ErrorFunc> + static decltype(auto) all_same_way(ErrorFunc&& error_func) { + return [ + error_func = std::forward<ErrorFunc>(error_func) + ] (auto&& e) mutable -> decltype(auto) { + using decayed_t = std::decay_t<decltype(e)>; + auto&& handler = + decayed_t::error_t::handle(std::forward<ErrorFunc>(error_func)); + return std::invoke(std::move(handler), std::forward<decltype(e)>(e)); + }; + }; +} + +using stateful_errc = stateful_error_t<std::errc>; +using stateful_errint = stateful_error_t<int>; +using stateful_ec = stateful_error_t<std::error_code>; + +} // namespace crimson + + +// open the `seastar` namespace to specialize `futurize`. This is not +// pretty for sure. I just hope it's not worse than e.g. specializing +// `hash` in the `std` namespace. The justification is copy avoidance +// in `future<...>::safe_then()`. See the comments there for details. +namespace seastar { + +// Container is a placeholder for errorator::_future<> template +template <template <class> class Container, + class Value> +struct futurize<Container<::crimson::errorated_future_marker<Value>>> { + using errorator_type = typename Container< + ::crimson::errorated_future_marker<Value>>::errorator_type; + + using type = typename errorator_type::template future<Value>; + using value_type = seastar::internal::future_stored_type_t<Value>; + + template<typename Func, typename... FuncArgs> + [[gnu::always_inline]] + static inline type invoke(Func&& func, FuncArgs&&... args) noexcept { + try { + return func(std::forward<FuncArgs>(args)...); + } catch (...) 
{ + return make_exception_future(std::current_exception()); + } + } + + template <class Func> + [[gnu::always_inline]] + static type invoke(Func&& func, seastar::internal::monostate) noexcept { + try { + return func(); + } catch (...) { + return make_exception_future(std::current_exception()); + } + } + + template <typename Arg> + [[gnu::always_inline]] + static type make_exception_future(Arg&& arg) { + return errorator_type::template make_exception_future2<Value>(std::forward<Arg>(arg)); + } + +private: + template<typename PromiseT, typename Func> + static void satisfy_with_result_of(PromiseT&& pr, Func&& func) { + // this may use the protected variant of `seastar::future::forward_to()` + // because: + // 1. `seastar::future` established a friendship with with all + // specializations of `seastar::futurize`, including this + // one (we're in the `seastar` namespace!) WHILE + // 2. any errorated future declares now the friendship with any + // `seastar::futurize<...>`. + func().forward_to(std::move(pr)); + } + template <typename U> + friend class future; +}; + +template <template <class> class Container, + class Value> +struct continuation_base_from_future<Container<::crimson::errorated_future_marker<Value>>> { + using type = continuation_base<Value>; +}; + +} // namespace seastar diff --git a/src/crimson/common/exception.h b/src/crimson/common/exception.h new file mode 100644 index 000000000..05caf5ebd --- /dev/null +++ b/src/crimson/common/exception.h @@ -0,0 +1,50 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <exception> +#include <seastar/core/future.hh> +#include <seastar/core/future-util.hh> + +#include "crimson/common/log.h" + +namespace crimson::common { + +class system_shutdown_exception final : public std::exception{ +public: + const char* what() const noexcept final { + return "system shutting down"; + } +}; + +class actingset_changed final : public std::exception { +public: + actingset_changed(bool sp) : still_primary(sp) {} + const char* what() const noexcept final { + return "acting set changed"; + } + bool is_primary() const { + return still_primary; + } +private: + const bool still_primary; +}; + +template<typename Func, typename... Args> +inline seastar::future<> handle_system_shutdown(Func&& func, Args&&... args) +{ + return seastar::futurize_invoke(std::forward<Func>(func), + std::forward<Args>(args)...) 
+ .handle_exception([](std::exception_ptr eptr) { + if (*eptr.__cxa_exception_type() == + typeid(crimson::common::system_shutdown_exception)) { + crimson::get_logger(ceph_subsys_osd).debug( + "operation skipped, system shutdown"); + return seastar::now(); + } + std::rethrow_exception(eptr); + }); +} + +} diff --git a/src/crimson/common/fixed_kv_node_layout.h b/src/crimson/common/fixed_kv_node_layout.h new file mode 100644 index 000000000..4c7cc2e76 --- /dev/null +++ b/src/crimson/common/fixed_kv_node_layout.h @@ -0,0 +1,700 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <iostream> + +#include "include/byteorder.h" + +#include "crimson/common/layout.h" + +namespace crimson::common { + +template <typename T, bool is_const> +struct maybe_const_t { +}; +template<typename T> +struct maybe_const_t<T, true> { + using type = const T*; +}; +template<typename T> +struct maybe_const_t<T, false> { + using type = T*; +}; + + +/** + * FixedKVNodeLayout + * + * Reusable implementation of a fixed size block mapping + * K -> V with internal representations KINT and VINT. + * + * Uses absl::container_internal::Layout for the actual memory layout. + * + * The primary interface exposed is centered on the iterator + * and related methods. + * + * Also included are helpers for doing splits and merges as for a btree. + */ +template < + size_t CAPACITY, + typename Meta, + typename MetaInt, + typename K, + typename KINT, + typename V, + typename VINT, + bool VALIDATE_INVARIANTS=true> +class FixedKVNodeLayout { + char *buf = nullptr; + + using L = absl::container_internal::Layout<ceph_le32, MetaInt, KINT, VINT>; + static constexpr L layout{1, 1, CAPACITY, CAPACITY}; + +public: + template <bool is_const> + struct iter_t { + friend class FixedKVNodeLayout; + using parent_t = typename maybe_const_t<FixedKVNodeLayout, is_const>::type; + + parent_t node; + uint16_t offset; + + iter_t( + parent_t parent, + uint16_t offset) : node(parent), offset(offset) {} + + iter_t(const iter_t &) = default; + iter_t(iter_t &&) = default; + iter_t &operator=(const iter_t &) = default; + iter_t &operator=(iter_t &&) = default; + + operator iter_t<!is_const>() const { + static_assert(!is_const); + return iter_t<!is_const>(node, offset); + } + + // Work nicely with for loops without requiring a nested type. 
+ iter_t &operator*() { return *this; } + iter_t *operator->() { return this; } + + iter_t operator++(int) { + auto ret = *this; + ++offset; + return ret; + } + + iter_t &operator++() { + ++offset; + return *this; + } + + uint16_t operator-(const iter_t &rhs) const { + assert(rhs.node == node); + return offset - rhs.offset; + } + + iter_t operator+(uint16_t off) const { + return iter_t( + node, + offset + off); + } + iter_t operator-(uint16_t off) const { + return iter_t( + node, + offset - off); + } + + bool operator==(const iter_t &rhs) const { + assert(node == rhs.node); + return rhs.offset == offset; + } + + bool operator!=(const iter_t &rhs) const { + return !(*this == rhs); + } + + K get_key() const { + return K(node->get_key_ptr()[offset]); + } + + K get_next_key_or_max() const { + auto next = *this + 1; + if (next == node->end()) + return std::numeric_limits<K>::max(); + else + return next->get_key(); + } + + void set_val(V val) const { + static_assert(!is_const); + node->get_val_ptr()[offset] = VINT(val); + } + + V get_val() const { + return V(node->get_val_ptr()[offset]); + }; + + bool contains(K addr) const { + return (get_key() <= addr) && (get_next_key_or_max() > addr); + } + + uint16_t get_offset() const { + return offset; + } + + private: + void set_key(K _lb) const { + static_assert(!is_const); + KINT lb; + lb = _lb; + node->get_key_ptr()[offset] = lb; + } + + typename maybe_const_t<char, is_const>::type get_key_ptr() const { + return reinterpret_cast< + typename maybe_const_t<char, is_const>::type>( + node->get_key_ptr() + offset); + } + + typename maybe_const_t<char, is_const>::type get_val_ptr() const { + return reinterpret_cast< + typename maybe_const_t<char, is_const>::type>( + node->get_val_ptr() + offset); + } + }; + using const_iterator = iter_t<true>; + using iterator = iter_t<false>; + + struct delta_t { + enum class op_t : uint8_t { + INSERT, + REMOVE, + UPDATE, + } op; + KINT key; + VINT val; + + void replay(FixedKVNodeLayout &l) { + switch (op) { + case op_t::INSERT: { + l.insert(l.lower_bound(key), key, val); + break; + } + case op_t::REMOVE: { + auto iter = l.find(key); + assert(iter != l.end()); + l.remove(iter); + break; + } + case op_t::UPDATE: { + auto iter = l.find(key); + assert(iter != l.end()); + l.update(iter, val); + break; + } + default: + assert(0 == "Impossible"); + } + } + + bool operator==(const delta_t &rhs) const { + return op == rhs.op && + key == rhs.key && + val == rhs.val; + } + }; + +public: + class delta_buffer_t { + std::vector<delta_t> buffer; + public: + bool empty() const { + return buffer.empty(); + } + void insert( + const K &key, + const V &val) { + KINT k; + k = key; + buffer.push_back( + delta_t{ + delta_t::op_t::INSERT, + k, + VINT(val) + }); + } + void update( + const K &key, + const V &val) { + KINT k; + k = key; + buffer.push_back( + delta_t{ + delta_t::op_t::UPDATE, + k, + VINT(val) + }); + } + void remove(const K &key) { + KINT k; + k = key; + buffer.push_back( + delta_t{ + delta_t::op_t::REMOVE, + k, + VINT() + }); + } + void replay(FixedKVNodeLayout &node) { + for (auto &i: buffer) { + i.replay(node); + } + } + size_t get_bytes() const { + return buffer.size() * sizeof(delta_t); + } + void copy_out(char *out, size_t len) { + assert(len == get_bytes()); + ::memcpy(out, reinterpret_cast<const void *>(buffer.data()), get_bytes()); + buffer.clear(); + } + void copy_in(const char *out, size_t len) { + assert(empty()); + assert(len % sizeof(delta_t) == 0); + buffer = std::vector( + reinterpret_cast<const delta_t*>(out), + 
reinterpret_cast<const delta_t*>(out + len)); + } + bool operator==(const delta_buffer_t &rhs) const { + return buffer == rhs.buffer; + } + }; + + void journal_insert( + const_iterator _iter, + const K &key, + const V &val, + delta_buffer_t *recorder) { + auto iter = iterator(this, _iter.offset); + if (recorder) { + recorder->insert( + key, + val); + } + insert(iter, key, val); + } + + void journal_update( + const_iterator _iter, + const V &val, + delta_buffer_t *recorder) { + auto iter = iterator(this, _iter.offset); + if (recorder) { + recorder->update(iter->get_key(), val); + } + update(iter, val); + } + + void journal_replace( + const_iterator _iter, + const K &key, + const V &val, + delta_buffer_t *recorder) { + auto iter = iterator(this, _iter.offset); + if (recorder) { + recorder->remove(iter->get_key()); + recorder->insert(key, val); + } + replace(iter, key, val); + } + + + void journal_remove( + const_iterator _iter, + delta_buffer_t *recorder) { + auto iter = iterator(this, _iter.offset); + if (recorder) { + recorder->remove(iter->get_key()); + } + remove(iter); + } + + + FixedKVNodeLayout(char *buf) : + buf(buf) {} + + virtual ~FixedKVNodeLayout() = default; + + const_iterator begin() const { + return const_iterator( + this, + 0); + } + + const_iterator end() const { + return const_iterator( + this, + get_size()); + } + + iterator begin() { + return iterator( + this, + 0); + } + + iterator end() { + return iterator( + this, + get_size()); + } + + const_iterator iter_idx(uint16_t off) const { + return const_iterator( + this, + off); + } + + const_iterator find(K l) const { + auto ret = begin(); + for (; ret != end(); ++ret) { + if (ret->get_key() == l) + break; + } + return ret; + } + iterator find(K l) { + const auto &tref = *this; + return iterator(this, tref.find(l).offset); + } + + const_iterator lower_bound(K l) const { + auto ret = begin(); + for (; ret != end(); ++ret) { + if (ret->get_key() >= l) + break; + } + return ret; + } + iterator lower_bound(K l) { + const auto &tref = *this; + return iterator(this, tref.lower_bound(l).offset); + } + + const_iterator upper_bound(K l) const { + auto ret = begin(); + for (; ret != end(); ++ret) { + if (ret->get_key() > l) + break; + } + return ret; + } + iterator upper_bound(K l) { + const auto &tref = *this; + return iterator(this, tref.upper_bound(l).offset); + } + + const_iterator get_split_pivot() const { + return iter_idx(get_size() / 2); + } + + uint16_t get_size() const { + return *layout.template Pointer<0>(buf); + } + + /** + * set_size + * + * Set size representation to match size + */ + void set_size(uint16_t size) { + *layout.template Pointer<0>(buf) = size; + } + + /** + * get_meta/set_meta + * + * Enables stashing a templated type within the layout. 
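A compact round trip through the journaling interface above, under an invented instantiation: 64-bit keys and values stored little-endian, a dummy metadata type (get_meta()/set_meta() are never exercised), and generously sized buffers. Real users such as the seastore btree nodes supply their own K/KINT/V/VINT and Meta types, so this is only a sketch of the mechanics.

#include <cassert>
#include <cstdint>
#include <vector>
#include "crimson/common/fixed_kv_node_layout.h"
#include "include/byteorder.h"

// little-endian wrapper providing the conversions the layout expects of KINT/VINT
struct u64_le_t {
  ceph_le64 v;
  u64_le_t() = default;
  u64_le_t(uint64_t nv) { v = nv; }
  u64_le_t& operator=(uint64_t nv) { v = nv; return *this; }
  operator uint64_t() const { return uint64_t(v); }
  bool operator==(const u64_le_t& rhs) const { return uint64_t(*this) == uint64_t(rhs); }
};
struct dummy_meta_t {};

using example_layout_t = crimson::common::FixedKVNodeLayout<
  128,                         // CAPACITY
  dummy_meta_t, dummy_meta_t,  // Meta, MetaInt (unused in this sketch)
  uint64_t, u64_le_t,          // K, KINT
  uint64_t, u64_le_t>;         // V, VINT

void journal_roundtrip_example() {
  std::vector<char> a(4096), b(4096);  // comfortably larger than the layout needs
  example_layout_t node{a.data()}, replica{b.data()};
  node.set_size(0);
  replica.set_size(0);

  // mutate the node and record the change as a delta
  example_layout_t::delta_buffer_t deltas;
  node.journal_insert(node.lower_bound(42), 42, 4242, &deltas);

  // replaying the recorded delta on a clean buffer reproduces the same contents
  deltas.replay(replica);
  assert(node == replica);
  assert(replica.find(42)->get_val() == 4242);
}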
+ * Cannot be modified after initial write as it is not represented + * in delta_t + */ + Meta get_meta() const { + MetaInt &metaint = *layout.template Pointer<1>(buf); + return Meta(metaint); + } + void set_meta(const Meta &meta) { + *layout.template Pointer<1>(buf) = MetaInt(meta); + } + + constexpr static size_t get_capacity() { + return CAPACITY; + } + + bool operator==(const FixedKVNodeLayout &rhs) const { + if (get_size() != rhs.get_size()) { + return false; + } + + auto iter = begin(); + auto iter2 = rhs.begin(); + while (iter != end()) { + if (iter->get_key() != iter2->get_key() || + iter->get_val() != iter2->get_val()) { + return false; + } + iter++; + iter2++; + } + return true; + } + + /** + * split_into + * + * Takes *this and splits its contents into left and right. + */ + K split_into( + FixedKVNodeLayout &left, + FixedKVNodeLayout &right) const { + auto piviter = get_split_pivot(); + + left.copy_from_foreign(left.begin(), begin(), piviter); + left.set_size(piviter - begin()); + + right.copy_from_foreign(right.begin(), piviter, end()); + right.set_size(end() - piviter); + + auto [lmeta, rmeta] = get_meta().split_into(piviter->get_key()); + left.set_meta(lmeta); + right.set_meta(rmeta); + + return piviter->get_key(); + } + + /** + * merge_from + * + * Takes two nodes and copies their contents into *this. + * + * precondition: left.size() + right.size() < CAPACITY + */ + void merge_from( + const FixedKVNodeLayout &left, + const FixedKVNodeLayout &right) + { + copy_from_foreign( + end(), + left.begin(), + left.end()); + set_size(left.get_size()); + copy_from_foreign( + end(), + right.begin(), + right.end()); + set_size(left.get_size() + right.get_size()); + set_meta(Meta::merge_from(left.get_meta(), right.get_meta())); + } + + /** + * balance_into_new_nodes + * + * Takes the contents of left and right and copies them into + * replacement_left and replacement_right such that in the + * event that the number of elements is odd the extra goes to + * the left side iff prefer_left. + */ + static K balance_into_new_nodes( + const FixedKVNodeLayout &left, + const FixedKVNodeLayout &right, + bool prefer_left, + FixedKVNodeLayout &replacement_left, + FixedKVNodeLayout &replacement_right) + { + auto total = left.get_size() + right.get_size(); + auto pivot_idx = (left.get_size() + right.get_size()) / 2; + if (total % 2 && prefer_left) { + pivot_idx++; + } + auto replacement_pivot = pivot_idx >= left.get_size() ? 
+ right.iter_idx(pivot_idx - left.get_size())->get_key() : + left.iter_idx(pivot_idx)->get_key(); + + if (pivot_idx < left.get_size()) { + replacement_left.copy_from_foreign( + replacement_left.end(), + left.begin(), + left.iter_idx(pivot_idx)); + replacement_left.set_size(pivot_idx); + + replacement_right.copy_from_foreign( + replacement_right.end(), + left.iter_idx(pivot_idx), + left.end()); + + replacement_right.set_size(left.get_size() - pivot_idx); + replacement_right.copy_from_foreign( + replacement_right.end(), + right.begin(), + right.end()); + replacement_right.set_size(total - pivot_idx); + } else { + replacement_left.copy_from_foreign( + replacement_left.end(), + left.begin(), + left.end()); + replacement_left.set_size(left.get_size()); + + replacement_left.copy_from_foreign( + replacement_left.end(), + right.begin(), + right.iter_idx(pivot_idx - left.get_size())); + replacement_left.set_size(pivot_idx); + + replacement_right.copy_from_foreign( + replacement_right.end(), + right.iter_idx(pivot_idx - left.get_size()), + right.end()); + replacement_right.set_size(total - pivot_idx); + } + + auto [lmeta, rmeta] = Meta::rebalance( + left.get_meta(), right.get_meta(), replacement_pivot); + replacement_left.set_meta(lmeta); + replacement_right.set_meta(rmeta); + return replacement_pivot; + } + +private: + void insert( + iterator iter, + const K &key, + const V &val) { + if (VALIDATE_INVARIANTS) { + if (iter != begin()) { + assert((iter - 1)->get_key() < key); + } + if (iter != end()) { + assert(iter->get_key() > key); + } + assert(get_size() < CAPACITY); + } + copy_from_local(iter + 1, iter, end()); + iter->set_key(key); + iter->set_val(val); + set_size(get_size() + 1); + } + + void update( + iterator iter, + V val) { + assert(iter != end()); + iter->set_val(val); + } + + void replace( + iterator iter, + const K &key, + const V &val) { + assert(iter != end()); + if (VALIDATE_INVARIANTS) { + if (iter != begin()) { + assert((iter - 1)->get_key() < key); + } + if ((iter + 1) != end()) { + assert((iter + 1)->get_key() > key); + } + } + iter->set_key(key); + iter->set_val(val); + } + + void remove(iterator iter) { + assert(iter != end()); + copy_from_local(iter, iter + 1, end()); + set_size(get_size() - 1); + } + + /** + * get_key_ptr + * + * Get pointer to start of key array + */ + KINT *get_key_ptr() { + return layout.template Pointer<2>(buf); + } + const KINT *get_key_ptr() const { + return layout.template Pointer<2>(buf); + } + + /** + * get_val_ptr + * + * Get pointer to start of val array + */ + VINT *get_val_ptr() { + return layout.template Pointer<3>(buf); + } + const VINT *get_val_ptr() const { + return layout.template Pointer<3>(buf); + } + + /** + * node_resolve/unresolve_vals + * + * If the representation for values depends in some way on the + * node in which they are located, users may implement + * resolve/unresolve to enable copy_from_foreign to handle that + * transition. + */ + virtual void node_resolve_vals(iterator from, iterator to) const {} + virtual void node_unresolve_vals(iterator from, iterator to) const {} + + /** + * copy_from_foreign + * + * Copies entries from [from_src, to_src) to tgt. + * + * tgt and from_src must be from different nodes. + * from_src and to_src must be from the same node. 
+ */ + static void copy_from_foreign( + iterator tgt, + const_iterator from_src, + const_iterator to_src) { + assert(tgt->node != from_src->node); + assert(to_src->node == from_src->node); + memcpy( + tgt->get_val_ptr(), from_src->get_val_ptr(), + to_src->get_val_ptr() - from_src->get_val_ptr()); + memcpy( + tgt->get_key_ptr(), from_src->get_key_ptr(), + to_src->get_key_ptr() - from_src->get_key_ptr()); + from_src->node->node_resolve_vals(tgt, tgt + (to_src - from_src)); + tgt->node->node_unresolve_vals(tgt, tgt + (to_src - from_src)); + } + + /** + * copy_from_local + * + * Copies entries from [from_src, to_src) to tgt. + * + * tgt, from_src, and to_src must be from the same node. + */ + static void copy_from_local( + iterator tgt, + iterator from_src, + iterator to_src) { + assert(tgt->node == from_src->node); + assert(to_src->node == from_src->node); + memmove( + tgt->get_val_ptr(), from_src->get_val_ptr(), + to_src->get_val_ptr() - from_src->get_val_ptr()); + memmove( + tgt->get_key_ptr(), from_src->get_key_ptr(), + to_src->get_key_ptr() - from_src->get_key_ptr()); + } +}; + +} diff --git a/src/crimson/common/formatter.cc b/src/crimson/common/formatter.cc new file mode 100644 index 000000000..677216224 --- /dev/null +++ b/src/crimson/common/formatter.cc @@ -0,0 +1,64 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "formatter.h" + +#include <fmt/format.h> +#if FMT_VERSION >= 60000 +#include <fmt/chrono.h> +#else +#include <fmt/time.h> +#endif + + +template <> +struct fmt::formatter<seastar::lowres_system_clock::time_point> { + // ignore the format string + template <typename ParseContext> + constexpr auto parse(ParseContext &ctx) { return ctx.begin(); } + + template <typename FormatContext> + auto format(const seastar::lowres_system_clock::time_point& t, + FormatContext& ctx) { + std::time_t tt = std::chrono::duration_cast<std::chrono::seconds>( + t.time_since_epoch()).count(); + auto milliseconds = (t.time_since_epoch() % + std::chrono::seconds(1)).count(); + return fmt::format_to(ctx.out(), "{:%Y-%m-%d %H:%M:%S} {:03d}", + fmt::localtime(tt), milliseconds); + } +}; + +template <> +struct fmt::formatter<ceph::coarse_real_clock::time_point> { + // ignore the format string + template <typename ParseContext> + constexpr auto parse(ParseContext &ctx) { return ctx.begin(); } + + template <typename FormatContext> + auto format(const ceph::coarse_real_clock::time_point& t, + FormatContext& ctx) { + std::time_t tt = std::chrono::duration_cast<std::chrono::seconds>( + t.time_since_epoch()).count(); + auto milliseconds = (t.time_since_epoch() % + std::chrono::seconds(1)).count(); + return fmt::format_to(ctx.out(), "{:%Y-%m-%d %H:%M:%S} {:03d}", + fmt::localtime(tt), milliseconds); + } +}; + +namespace std { + +ostream& operator<<(ostream& out, + const seastar::lowres_system_clock::time_point& t) +{ + return out << fmt::format("{}", t); +} + +ostream& operator<<(ostream& out, + const ceph::coarse_real_clock::time_point& t) +{ + return out << fmt::format("{}", t); +} + +} diff --git a/src/crimson/common/formatter.h b/src/crimson/common/formatter.h new file mode 100644 index 000000000..1775b0954 --- /dev/null +++ b/src/crimson/common/formatter.h @@ -0,0 +1,15 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <seastar/core/lowres_clock.hh> + +#include "common/ceph_time.h" + +namespace std { + +ostream& operator<<(ostream& out, + const 
seastar::lowres_system_clock::time_point& t); +ostream& operator<<(ostream& out, + const ceph::coarse_real_clock::time_point& t); + +} diff --git a/src/crimson/common/gated.h b/src/crimson/common/gated.h new file mode 100644 index 000000000..7d901b6b1 --- /dev/null +++ b/src/crimson/common/gated.h @@ -0,0 +1,51 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <seastar/core/gate.hh> +#include <seastar/core/future.hh> +#include <seastar/core/future-util.hh> + +#include "crimson/common/exception.h" +#include "crimson/common/log.h" +#include "include/ceph_assert.h" + +namespace crimson::common { + +class Gated { + public: + static seastar::logger& gated_logger() { + return crimson::get_logger(ceph_subsys_osd); + } + template <typename Func, typename T> + inline void dispatch_in_background(const char* what, T& who, Func&& func) { + (void) dispatch(what, who, func); + } + template <typename Func, typename T> + inline seastar::future<> dispatch(const char* what, T& who, Func&& func) { + return seastar::with_gate(pending_dispatch, std::forward<Func>(func) + ).handle_exception([what, &who] (std::exception_ptr eptr) { + if (*eptr.__cxa_exception_type() == typeid(system_shutdown_exception)) { + gated_logger().debug( + "{}, {} skipped, system shutdown", who, what); + return; + } + gated_logger().error( + "{} dispatch() {} caught exception: {}", who, what, eptr); + assert(*eptr.__cxa_exception_type() + == typeid(seastar::gate_closed_exception)); + }); + } + + seastar::future<> close() { + return pending_dispatch.close(); + } + bool is_closed() const { + return pending_dispatch.is_closed(); + } + private: + seastar::gate pending_dispatch; +}; + +}// namespace crimson::common diff --git a/src/crimson/common/layout.h b/src/crimson/common/layout.h new file mode 100644 index 000000000..9d54ecd1d --- /dev/null +++ b/src/crimson/common/layout.h @@ -0,0 +1,737 @@ +// Copyright 2018 The Abseil Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// MOTIVATION AND TUTORIAL +// +// If you want to put in a single heap allocation N doubles followed by M ints, +// it's easy if N and M are known at compile time. +// +// struct S { +// double a[N]; +// int b[M]; +// }; +// +// S* p = new S; +// +// But what if N and M are known only in run time? Class template Layout to the +// rescue! It's a portable generalization of the technique known as struct hack. +// +// // This object will tell us everything we need to know about the memory +// // layout of double[N] followed by int[M]. It's structurally identical to +// // size_t[2] that stores N and M. It's very cheap to create. +// const Layout<double, int> layout(N, M); +// +// // Allocate enough memory for both arrays. `AllocSize()` tells us how much +// // memory is needed. We are free to use any allocation function we want as +// // long as it returns aligned memory. 
+// std::unique_ptr<unsigned char[]> p(new unsigned char[layout.AllocSize()]); +// +// // Obtain the pointer to the array of doubles. +// // Equivalent to `reinterpret_cast<double*>(p.get())`. +// // +// // We could have written layout.Pointer<0>(p) instead. If all the types are +// // unique you can use either form, but if some types are repeated you must +// // use the index form. +// double* a = layout.Pointer<double>(p.get()); +// +// // Obtain the pointer to the array of ints. +// // Equivalent to `reinterpret_cast<int*>(p.get() + N * 8)`. +// int* b = layout.Pointer<int>(p); +// +// If we are unable to specify sizes of all fields, we can pass as many sizes as +// we can to `Partial()`. In return, it'll allow us to access the fields whose +// locations and sizes can be computed from the provided information. +// `Partial()` comes in handy when the array sizes are embedded into the +// allocation. +// +// // size_t[1] containing N, size_t[1] containing M, double[N], int[M]. +// using L = Layout<size_t, size_t, double, int>; +// +// unsigned char* Allocate(size_t n, size_t m) { +// const L layout(1, 1, n, m); +// unsigned char* p = new unsigned char[layout.AllocSize()]; +// *layout.Pointer<0>(p) = n; +// *layout.Pointer<1>(p) = m; +// return p; +// } +// +// void Use(unsigned char* p) { +// // First, extract N and M. +// // Specify that the first array has only one element. Using `prefix` we +// // can access the first two arrays but not more. +// constexpr auto prefix = L::Partial(1); +// size_t n = *prefix.Pointer<0>(p); +// size_t m = *prefix.Pointer<1>(p); +// +// // Now we can get pointers to the payload. +// const L layout(1, 1, n, m); +// double* a = layout.Pointer<double>(p); +// int* b = layout.Pointer<int>(p); +// } +// +// The layout we used above combines fixed-size with dynamically-sized fields. +// This is quite common. Layout is optimized for this use case and generates +// optimal code. All computations that can be performed at compile time are +// indeed performed at compile time. +// +// Efficiency tip: The order of fields matters. In `Layout<T1, ..., TN>` try to +// ensure that `alignof(T1) >= ... >= alignof(TN)`. This way you'll have no +// padding in between arrays. +// +// You can manually override the alignment of an array by wrapping the type in +// `Aligned<T, N>`. `Layout<..., Aligned<T, N>, ...>` has exactly the same API +// and behavior as `Layout<..., T, ...>` except that the first element of the +// array of `T` is aligned to `N` (the rest of the elements follow without +// padding). `N` cannot be less than `alignof(T)`. +// +// `AllocSize()` and `Pointer()` are the most basic methods for dealing with +// memory layouts. Check out the reference or code below to discover more. +// +// EXAMPLE +// +// // Immutable move-only string with sizeof equal to sizeof(void*). The +// // string size and the characters are kept in the same heap allocation. +// class CompactString { +// public: +// CompactString(const char* s = "") { +// const size_t size = strlen(s); +// // size_t[1] followed by char[size + 1]. +// const L layout(1, size + 1); +// p_.reset(new unsigned char[layout.AllocSize()]); +// // If running under ASAN, mark the padding bytes, if any, to catch +// // memory errors. +// layout.PoisonPadding(p_.get()); +// // Store the size in the allocation. +// *layout.Pointer<size_t>(p_.get()) = size; +// // Store the characters in the allocation. 
+// memcpy(layout.Pointer<char>(p_.get()), s, size + 1); +// } +// +// size_t size() const { +// // Equivalent to reinterpret_cast<size_t&>(*p). +// return *L::Partial().Pointer<size_t>(p_.get()); +// } +// +// const char* c_str() const { +// // Equivalent to reinterpret_cast<char*>(p.get() + sizeof(size_t)). +// // The argument in Partial(1) specifies that we have size_t[1] in front +// // of the characters. +// return L::Partial(1).Pointer<char>(p_.get()); +// } +// +// private: +// // Our heap allocation contains a size_t followed by an array of chars. +// using L = Layout<size_t, char>; +// std::unique_ptr<unsigned char[]> p_; +// }; +// +// int main() { +// CompactString s = "hello"; +// assert(s.size() == 5); +// assert(strcmp(s.c_str(), "hello") == 0); +// } +// +// DOCUMENTATION +// +// The interface exported by this file consists of: +// - class `Layout<>` and its public members. +// - The public members of class `internal_layout::LayoutImpl<>`. That class +// isn't intended to be used directly, and its name and template parameter +// list are internal implementation details, but the class itself provides +// most of the functionality in this file. See comments on its members for +// detailed documentation. +// +// `Layout<T1,... Tn>::Partial(count1,..., countm)` (where `m` <= `n`) returns a +// `LayoutImpl<>` object. `Layout<T1,..., Tn> layout(count1,..., countn)` +// creates a `Layout` object, which exposes the same functionality by inheriting +// from `LayoutImpl<>`. + +#ifndef ABSL_CONTAINER_INTERNAL_LAYOUT_H_ +#define ABSL_CONTAINER_INTERNAL_LAYOUT_H_ + +#include <assert.h> +#include <stddef.h> +#include <stdint.h> +#include <ostream> +#include <string> +#include <tuple> +#include <type_traits> +#include <typeinfo> +#include <utility> + +#ifdef ADDRESS_SANITIZER +#include <sanitizer/asan_interface.h> +#endif + +// for C++20 std::span +#include <boost/beast/core/span.hpp> +#include <fmt/format.h> + +#if defined(__GXX_RTTI) +#define ABSL_INTERNAL_HAS_CXA_DEMANGLE +#endif + +#ifdef ABSL_INTERNAL_HAS_CXA_DEMANGLE +#include <cxxabi.h> +#endif + +namespace absl { +namespace container_internal { + +// A type wrapper that instructs `Layout` to use the specific alignment for the +// array. `Layout<..., Aligned<T, N>, ...>` has exactly the same API +// and behavior as `Layout<..., T, ...>` except that the first element of the +// array of `T` is aligned to `N` (the rest of the elements follow without +// padding). +// +// Requires: `N >= alignof(T)` and `N` is a power of 2. 
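// An illustrative example of the wrapper described above (not part of the
// original header; the element types and the counts `n`/`m` are arbitrary):
// wrapping the second element type in `Aligned<char, 64>` starts that array
// on a 64-byte boundary and raises the layout's Alignment() to 64.
//
//   using L = absl::container_internal::Layout<
//       uint64_t, absl::container_internal::Aligned<char, 64>>;
//   L layout(n, m);                      // uint64_t[n], padding, then char[m]
//   char* chars = layout.Pointer<1>(p);  // 64-byte aligned, provided `p` is
//                                        // aligned to L::Alignment() == 64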
+template <class T, size_t N> +struct Aligned; + +namespace internal_layout { + +template <class T> +struct NotAligned {}; + +template <class T, size_t N> +struct NotAligned<const Aligned<T, N>> { + static_assert(sizeof(T) == 0, "Aligned<T, N> cannot be const-qualified"); +}; + +template <size_t> +using IntToSize = size_t; + +template <class> +using TypeToSize = size_t; + +template <class T> +struct Type : NotAligned<T> { + using type = T; +}; + +template <class T, size_t N> +struct Type<Aligned<T, N>> { + using type = T; +}; + +template <class T> +struct SizeOf : NotAligned<T>, std::integral_constant<size_t, sizeof(T)> {}; + +template <class T, size_t N> +struct SizeOf<Aligned<T, N>> : std::integral_constant<size_t, sizeof(T)> {}; + +// Note: workaround for https://gcc.gnu.org/PR88115 +template <class T> +struct AlignOf : NotAligned<T> { + static constexpr size_t value = alignof(T); +}; + +template <class T, size_t N> +struct AlignOf<Aligned<T, N>> { + static_assert(N % alignof(T) == 0, + "Custom alignment can't be lower than the type's alignment"); + static constexpr size_t value = N; +}; + +// Does `Ts...` contain `T`? +template <class T, class... Ts> +using Contains = std::disjunction<std::is_same<T, Ts>...>; + +template <class From, class To> +using CopyConst = + typename std::conditional_t<std::is_const_v<From>, const To, To>; + +// Note: We're not qualifying this with absl:: because it doesn't compile under +// MSVC. +template <class T> +using SliceType = boost::beast::span<T>; + +// This namespace contains no types. It prevents functions defined in it from +// being found by ADL. +namespace adl_barrier { + +template <class Needle, class... Ts> +constexpr size_t Find(Needle, Needle, Ts...) { + static_assert(!Contains<Needle, Ts...>(), "Duplicate element type"); + return 0; +} + +template <class Needle, class T, class... Ts> +constexpr size_t Find(Needle, T, Ts...) { + return adl_barrier::Find(Needle(), Ts()...) + 1; +} + +constexpr bool IsPow2(size_t n) { return !(n & (n - 1)); } + +// Returns `q * m` for the smallest `q` such that `q * m >= n`. +// Requires: `m` is a power of two. It's enforced by IsLegalElementType below. +constexpr size_t Align(size_t n, size_t m) { return (n + m - 1) & ~(m - 1); } + +constexpr size_t Min(size_t a, size_t b) { return b < a ? b : a; } + +constexpr size_t Max(size_t a) { return a; } + +template <class... Ts> +constexpr size_t Max(size_t a, size_t b, Ts... rest) { + return adl_barrier::Max(b < a ? a : b, rest...); +} + +template <class T> +std::string TypeName() { + std::string out; + int status = 0; + char* demangled = nullptr; +#ifdef ABSL_INTERNAL_HAS_CXA_DEMANGLE + demangled = abi::__cxa_demangle(typeid(T).name(), nullptr, nullptr, &status); +#endif + if (status == 0 && demangled != nullptr) { // Demangling succeeded. + out = fmt::format("<{}>", demangled); + free(demangled); + } else { +#if defined(__GXX_RTTI) || defined(_CPPRTTI) + out = fmt::format("<{}>", typeid(T).name()); +#endif + } + return out; +} + +} // namespace adl_barrier + +template <bool C> +using EnableIf = typename std::enable_if_t<C, int>; + +// Can `T` be a template argument of `Layout`? 
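// Quick illustration of the arithmetic helpers defined in adl_barrier above
// (illustration only; the values are arbitrary): Align() rounds up to a
// multiple of a power of two, IsPow2() tests for powers of two.
static_assert(adl_barrier::Align(13, 8) == 16, "13 rounded up to 8-byte alignment");
static_assert(adl_barrier::IsPow2(64) && !adl_barrier::IsPow2(48), "power-of-two test");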
+template <class T> +using IsLegalElementType = std::integral_constant< + bool, !std::is_reference_v<T> && !std::is_volatile_v<T> && + !std::is_reference_v<typename Type<T>::type> && + !std::is_volatile_v<typename Type<T>::type> && + adl_barrier::IsPow2(AlignOf<T>::value)>; + +template <class Elements, class SizeSeq, class OffsetSeq> +class LayoutImpl; + +// Public base class of `Layout` and the result type of `Layout::Partial()`. +// +// `Elements...` contains all template arguments of `Layout` that created this +// instance. +// +// `SizeSeq...` is `[0, NumSizes)` where `NumSizes` is the number of arguments +// passed to `Layout::Partial()` or `Layout::Layout()`. +// +// `OffsetSeq...` is `[0, NumOffsets)` where `NumOffsets` is +// `Min(sizeof...(Elements), NumSizes + 1)` (the number of arrays for which we +// can compute offsets). +template <class... Elements, size_t... SizeSeq, size_t... OffsetSeq> +class LayoutImpl<std::tuple<Elements...>, std::index_sequence<SizeSeq...>, + std::index_sequence<OffsetSeq...>> { + private: + static_assert(sizeof...(Elements) > 0, "At least one field is required"); + static_assert(std::conjunction_v<IsLegalElementType<Elements>...>, + "Invalid element type (see IsLegalElementType)"); + + enum { + NumTypes = sizeof...(Elements), + NumSizes = sizeof...(SizeSeq), + NumOffsets = sizeof...(OffsetSeq), + }; + + // These are guaranteed by `Layout`. + static_assert(NumOffsets == adl_barrier::Min(NumTypes, NumSizes + 1), + "Internal error"); + static_assert(NumTypes > 0, "Internal error"); + + // Returns the index of `T` in `Elements...`. Results in a compilation error + // if `Elements...` doesn't contain exactly one instance of `T`. + template <class T> + static constexpr size_t ElementIndex() { + static_assert(Contains<Type<T>, Type<typename Type<Elements>::type>...>(), + "Type not found"); + return adl_barrier::Find(Type<T>(), + Type<typename Type<Elements>::type>()...); + } + + template <size_t N> + using ElementAlignment = + AlignOf<typename std::tuple_element<N, std::tuple<Elements...>>::type>; + + public: + // Element types of all arrays packed in a tuple. + using ElementTypes = std::tuple<typename Type<Elements>::type...>; + + // Element type of the Nth array. + template <size_t N> + using ElementType = typename std::tuple_element<N, ElementTypes>::type; + + constexpr explicit LayoutImpl(IntToSize<SizeSeq>... sizes) + : size_{sizes...} {} + + // Alignment of the layout, equal to the strictest alignment of all elements. + // All pointers passed to the methods of layout must be aligned to this value. + static constexpr size_t Alignment() { + return adl_barrier::Max(AlignOf<Elements>::value...); + } + + // Offset in bytes of the Nth array. + // + // // int[3], 4 bytes of padding, double[4]. + // Layout<int, double> x(3, 4); + // assert(x.Offset<0>() == 0); // The ints starts from 0. + // assert(x.Offset<1>() == 16); // The doubles starts from 16. + // + // Requires: `N <= NumSizes && N < sizeof...(Ts)`. + template <size_t N, EnableIf<N == 0> = 0> + constexpr size_t Offset() const { + return 0; + } + + template <size_t N, EnableIf<N != 0> = 0> + constexpr size_t Offset() const { + static_assert(N < NumOffsets, "Index out of bounds"); + return adl_barrier::Align( + Offset<N - 1>() + SizeOf<ElementType<N - 1>>() * size_[N - 1], + ElementAlignment<N>::value); + } + + // Offset in bytes of the array with the specified element type. There must + // be exactly one such array and its zero-based index must be at most + // `NumSizes`. 
+ // + // // int[3], 4 bytes of padding, double[4]. + // Layout<int, double> x(3, 4); + // assert(x.Offset<int>() == 0); // The ints starts from 0. + // assert(x.Offset<double>() == 16); // The doubles starts from 16. + template <class T> + constexpr size_t Offset() const { + return Offset<ElementIndex<T>()>(); + } + + // Offsets in bytes of all arrays for which the offsets are known. + constexpr std::array<size_t, NumOffsets> Offsets() const { + return {{Offset<OffsetSeq>()...}}; + } + + // The number of elements in the Nth array. This is the Nth argument of + // `Layout::Partial()` or `Layout::Layout()` (zero-based). + // + // // int[3], 4 bytes of padding, double[4]. + // Layout<int, double> x(3, 4); + // assert(x.Size<0>() == 3); + // assert(x.Size<1>() == 4); + // + // Requires: `N < NumSizes`. + template <size_t N> + constexpr size_t Size() const { + static_assert(N < NumSizes, "Index out of bounds"); + return size_[N]; + } + + // The number of elements in the array with the specified element type. + // There must be exactly one such array and its zero-based index must be + // at most `NumSizes`. + // + // // int[3], 4 bytes of padding, double[4]. + // Layout<int, double> x(3, 4); + // assert(x.Size<int>() == 3); + // assert(x.Size<double>() == 4); + template <class T> + constexpr size_t Size() const { + return Size<ElementIndex<T>()>(); + } + + // The number of elements of all arrays for which they are known. + constexpr std::array<size_t, NumSizes> Sizes() const { + return {{Size<SizeSeq>()...}}; + } + + // Pointer to the beginning of the Nth array. + // + // `Char` must be `[const] [signed|unsigned] char`. + // + // // int[3], 4 bytes of padding, double[4]. + // Layout<int, double> x(3, 4); + // unsigned char* p = new unsigned char[x.AllocSize()]; + // int* ints = x.Pointer<0>(p); + // double* doubles = x.Pointer<1>(p); + // + // Requires: `N <= NumSizes && N < sizeof...(Ts)`. + // Requires: `p` is aligned to `Alignment()`. + template <size_t N, class Char> + CopyConst<Char, ElementType<N>>* Pointer(Char* p) const { + using C = typename std::remove_const<Char>::type; + static_assert( + std::is_same<C, char>() || std::is_same<C, unsigned char>() || + std::is_same<C, signed char>(), + "The argument must be a pointer to [const] [signed|unsigned] char"); + constexpr size_t alignment = Alignment(); + (void)alignment; + assert(reinterpret_cast<uintptr_t>(p) % alignment == 0); + return reinterpret_cast<CopyConst<Char, ElementType<N>>*>(p + Offset<N>()); + } + + // Pointer to the beginning of the array with the specified element type. + // There must be exactly one such array and its zero-based index must be at + // most `NumSizes`. + // + // `Char` must be `[const] [signed|unsigned] char`. + // + // // int[3], 4 bytes of padding, double[4]. + // Layout<int, double> x(3, 4); + // unsigned char* p = new unsigned char[x.AllocSize()]; + // int* ints = x.Pointer<int>(p); + // double* doubles = x.Pointer<double>(p); + // + // Requires: `p` is aligned to `Alignment()`. + template <class T, class Char> + CopyConst<Char, T>* Pointer(Char* p) const { + return Pointer<ElementIndex<T>()>(p); + } + + // Pointers to all arrays for which pointers are known. + // + // `Char` must be `[const] [signed|unsigned] char`. + // + // // int[3], 4 bytes of padding, double[4]. + // Layout<int, double> x(3, 4); + // unsigned char* p = new unsigned char[x.AllocSize()]; + // + // int* ints; + // double* doubles; + // std::tie(ints, doubles) = x.Pointers(p); + // + // Requires: `p` is aligned to `Alignment()`. 
+ // + // Note: We're not using ElementType alias here because it does not compile + // under MSVC. + template <class Char> + std::tuple<CopyConst< + Char, typename std::tuple_element<OffsetSeq, ElementTypes>::type>*...> + Pointers(Char* p) const { + return std::tuple<CopyConst<Char, ElementType<OffsetSeq>>*...>( + Pointer<OffsetSeq>(p)...); + } + + // The Nth array. + // + // `Char` must be `[const] [signed|unsigned] char`. + // + // // int[3], 4 bytes of padding, double[4]. + // Layout<int, double> x(3, 4); + // unsigned char* p = new unsigned char[x.AllocSize()]; + // Span<int> ints = x.Slice<0>(p); + // Span<double> doubles = x.Slice<1>(p); + // + // Requires: `N < NumSizes`. + // Requires: `p` is aligned to `Alignment()`. + template <size_t N, class Char> + SliceType<CopyConst<Char, ElementType<N>>> Slice(Char* p) const { + return SliceType<CopyConst<Char, ElementType<N>>>(Pointer<N>(p), Size<N>()); + } + + // The array with the specified element type. There must be exactly one + // such array and its zero-based index must be less than `NumSizes`. + // + // `Char` must be `[const] [signed|unsigned] char`. + // + // // int[3], 4 bytes of padding, double[4]. + // Layout<int, double> x(3, 4); + // unsigned char* p = new unsigned char[x.AllocSize()]; + // Span<int> ints = x.Slice<int>(p); + // Span<double> doubles = x.Slice<double>(p); + // + // Requires: `p` is aligned to `Alignment()`. + template <class T, class Char> + SliceType<CopyConst<Char, T>> Slice(Char* p) const { + return Slice<ElementIndex<T>()>(p); + } + + // All arrays with known sizes. + // + // `Char` must be `[const] [signed|unsigned] char`. + // + // // int[3], 4 bytes of padding, double[4]. + // Layout<int, double> x(3, 4); + // unsigned char* p = new unsigned char[x.AllocSize()]; + // + // Span<int> ints; + // Span<double> doubles; + // std::tie(ints, doubles) = x.Slices(p); + // + // Requires: `p` is aligned to `Alignment()`. + // + // Note: We're not using ElementType alias here because it does not compile + // under MSVC. + template <class Char> + std::tuple<SliceType<CopyConst< + Char, typename std::tuple_element<SizeSeq, ElementTypes>::type>>...> + Slices(Char* p) const { + // Workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63875 (fixed + // in 6.1). + (void)p; + return std::tuple<SliceType<CopyConst<Char, ElementType<SizeSeq>>>...>( + Slice<SizeSeq>(p)...); + } + + // The size of the allocation that fits all arrays. + // + // // int[3], 4 bytes of padding, double[4]. + // Layout<int, double> x(3, 4); + // unsigned char* p = new unsigned char[x.AllocSize()]; // 48 bytes + // + // Requires: `NumSizes == sizeof...(Ts)`. + constexpr size_t AllocSize() const { + static_assert(NumTypes == NumSizes, "You must specify sizes of all fields"); + return Offset<NumTypes - 1>() + + SizeOf<ElementType<NumTypes - 1>>() * size_[NumTypes - 1]; + } + + // If built with --config=asan, poisons padding bytes (if any) in the + // allocation. The pointer must point to a memory block at least + // `AllocSize()` bytes in length. + // + // `Char` must be `[const] [signed|unsigned] char`. + // + // Requires: `p` is aligned to `Alignment()`. 
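// For orientation (an illustrative aside, not from the original header): this
// is the same Layout machinery the FixedKVNodeLayout further up builds on.
// There a node buffer is described as four arrays (size, meta, keys, values)
// reached through Pointer<0>..Pointer<3>(buf). With hypothetical element
// types standing in for MetaInt/KINT/VINT and a capacity of 128 entries:
//
//   using L = absl::container_internal::Layout<uint16_t, uint32_t,
//                                               uint64_t, uint64_t>;
//   constexpr L node_layout(1, 1, 128, 128);
//   static_assert(node_layout.AllocSize() == 2056);  // size + meta + keys + vals
//   // node_layout.Pointer<0>(buf) -> uint16_t*  (element count)
//   // node_layout.Pointer<2>(buf) -> uint64_t*  (start of the key array)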
+ template <class Char, size_t N = NumOffsets - 1, EnableIf<N == 0> = 0> + void PoisonPadding(const Char* p) const { + Pointer<0>(p); // verify the requirements on `Char` and `p` + } + + template <class Char, size_t N = NumOffsets - 1, EnableIf<N != 0> = 0> + void PoisonPadding(const Char* p) const { + static_assert(N < NumOffsets, "Index out of bounds"); + (void)p; +#ifdef ADDRESS_SANITIZER + PoisonPadding<Char, N - 1>(p); + // The `if` is an optimization. It doesn't affect the observable behaviour. + if (ElementAlignment<N - 1>::value % ElementAlignment<N>::value) { + size_t start = + Offset<N - 1>() + SizeOf<ElementType<N - 1>>() * size_[N - 1]; + ASAN_POISON_MEMORY_REGION(p + start, Offset<N>() - start); + } +#endif + } + + // Human-readable description of the memory layout. Useful for debugging. + // Slow. + // + // // char[5], 3 bytes of padding, int[3], 4 bytes of padding, followed + // // by an unknown number of doubles. + // auto x = Layout<char, int, double>::Partial(5, 3); + // assert(x.DebugString() == + // "@0<char>(1)[5]; @8<int>(4)[3]; @24<double>(8)"); + // + // Each field is in the following format: @offset<type>(sizeof)[size] (<type> + // may be missing depending on the target platform). For example, + // @8<int>(4)[3] means that at offset 8 we have an array of ints, where each + // int is 4 bytes, and we have 3 of those ints. The size of the last field may + // be missing (as in the example above). Only fields with known offsets are + // described. Type names may differ across platforms: one compiler might + // produce "unsigned*" where another produces "unsigned int *". + std::string DebugString() const { + const auto offsets = Offsets(); + const size_t sizes[] = {SizeOf<ElementType<OffsetSeq>>()...}; + const std::string types[] = { + adl_barrier::TypeName<ElementType<OffsetSeq>>()...}; + std::string res = fmt::format("@0{}({})", types[0], sizes[0]); + for (size_t i = 0; i != NumOffsets - 1; ++i) { + res += fmt::format("[{}]; @({})", size_[i], offsets[i + 1], types[i + 1], sizes[i + 1]); + } + // NumSizes is a constant that may be zero. Some compilers cannot see that + // inside the if statement "size_[NumSizes - 1]" must be valid. + int last = static_cast<int>(NumSizes) - 1; + if (NumTypes == NumSizes && last >= 0) { + res += fmt::format("[{}]", size_[last]); + } + return res; + } + + private: + // Arguments of `Layout::Partial()` or `Layout::Layout()`. + size_t size_[NumSizes > 0 ? NumSizes : 1]; +}; + +template <size_t NumSizes, class... Ts> +using LayoutType = LayoutImpl< + std::tuple<Ts...>, std::make_index_sequence<NumSizes>, + std::make_index_sequence<adl_barrier::Min(sizeof...(Ts), NumSizes + 1)>>; + +} // namespace internal_layout + +// Descriptor of arrays of various types and sizes laid out in memory one after +// another. See the top of the file for documentation. +// +// Check out the public API of internal_layout::LayoutImpl above. The type is +// internal to the library but its methods are public, and they are inherited +// by `Layout`. +template <class... Ts> +class Layout : public internal_layout::LayoutType<sizeof...(Ts), Ts...> { + public: + static_assert(sizeof...(Ts) > 0, "At least one field is required"); + static_assert( + std::conjunction_v<internal_layout::IsLegalElementType<Ts>...>, + "Invalid element type (see IsLegalElementType)"); + + // The result type of `Partial()` with `NumSizes` arguments. 
+ template <size_t NumSizes> + using PartialType = internal_layout::LayoutType<NumSizes, Ts...>; + + // `Layout` knows the element types of the arrays we want to lay out in + // memory but not the number of elements in each array. + // `Partial(size1, ..., sizeN)` allows us to specify the latter. The + // resulting immutable object can be used to obtain pointers to the + // individual arrays. + // + // It's allowed to pass fewer array sizes than the number of arrays. E.g., + // if all you need is to the offset of the second array, you only need to + // pass one argument -- the number of elements in the first array. + // + // // int[3] followed by 4 bytes of padding and an unknown number of + // // doubles. + // auto x = Layout<int, double>::Partial(3); + // // doubles start at byte 16. + // assert(x.Offset<1>() == 16); + // + // If you know the number of elements in all arrays, you can still call + // `Partial()` but it's more convenient to use the constructor of `Layout`. + // + // Layout<int, double> x(3, 5); + // + // Note: The sizes of the arrays must be specified in number of elements, + // not in bytes. + // + // Requires: `sizeof...(Sizes) <= sizeof...(Ts)`. + // Requires: all arguments are convertible to `size_t`. + template <class... Sizes> + static constexpr PartialType<sizeof...(Sizes)> Partial(Sizes&&... sizes) { + static_assert(sizeof...(Sizes) <= sizeof...(Ts)); + return PartialType<sizeof...(Sizes)>(std::forward<Sizes>(sizes)...); + } + + // Creates a layout with the sizes of all arrays specified. If you know + // only the sizes of the first N arrays (where N can be zero), you can use + // `Partial()` defined above. The constructor is essentially equivalent to + // calling `Partial()` and passing in all array sizes; the constructor is + // provided as a convenient abbreviation. + // + // Note: The sizes of the arrays must be specified in number of elements, + // not in bytes. + constexpr explicit Layout(internal_layout::TypeToSize<Ts>... sizes) + : internal_layout::LayoutType<sizeof...(Ts), Ts...>(sizes...) 
{} +}; + +} // namespace container_internal +} // namespace absl + +#endif // ABSL_CONTAINER_INTERNAL_LAYOUT_H_ diff --git a/src/crimson/common/log.cc b/src/crimson/common/log.cc new file mode 100644 index 000000000..cae9f6a7b --- /dev/null +++ b/src/crimson/common/log.cc @@ -0,0 +1,21 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "log.h" + +static std::array<seastar::logger, ceph_subsys_get_num()> loggers{ +#define SUBSYS(name, log_level, gather_level) \ + seastar::logger(#name), +#define DEFAULT_SUBSYS(log_level, gather_level) \ + seastar::logger("none"), + #include "common/subsys.h" +#undef SUBSYS +#undef DEFAULT_SUBSYS +}; + +namespace crimson { +seastar::logger& get_logger(int subsys) { + assert(subsys < ceph_subsys_max); + return loggers[subsys]; +} +} diff --git a/src/crimson/common/log.h b/src/crimson/common/log.h new file mode 100644 index 000000000..635349098 --- /dev/null +++ b/src/crimson/common/log.h @@ -0,0 +1,24 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <seastar/util/log.hh> +#include "common/subsys_types.h" + +namespace crimson { +seastar::logger& get_logger(int subsys); +static inline seastar::log_level to_log_level(int level) { + if (level < 0) { + return seastar::log_level::error; + } else if (level < 1) { + return seastar::log_level::warn; + } else if (level < 5) { + return seastar::log_level::info; + } else if (level <= 20) { + return seastar::log_level::debug; + } else { + return seastar::log_level::trace; + } +} +} diff --git a/src/crimson/common/perf_counters_collection.cc b/src/crimson/common/perf_counters_collection.cc new file mode 100644 index 000000000..af80dbcc2 --- /dev/null +++ b/src/crimson/common/perf_counters_collection.cc @@ -0,0 +1,25 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "perf_counters_collection.h" + +namespace crimson::common { +PerfCountersCollection::PerfCountersCollection() +{ + perf_collection = std::make_unique<PerfCountersCollectionImpl>(); +} +PerfCountersCollection::~PerfCountersCollection() +{ + perf_collection->clear(); +} + +PerfCountersCollectionImpl* PerfCountersCollection:: get_perf_collection() +{ + return perf_collection.get(); +} + +PerfCountersCollection::ShardedPerfCountersCollection PerfCountersCollection::sharded_perf_coll; + +} + + diff --git a/src/crimson/common/perf_counters_collection.h b/src/crimson/common/perf_counters_collection.h new file mode 100644 index 000000000..a19630247 --- /dev/null +++ b/src/crimson/common/perf_counters_collection.h @@ -0,0 +1,37 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "common/perf_counters.h" +#include <seastar/core/sharded.hh> + +using crimson::common::PerfCountersCollectionImpl; +namespace crimson::common { +class PerfCountersCollection: public seastar::sharded<PerfCountersCollection> +{ + using ShardedPerfCountersCollection = seastar::sharded<PerfCountersCollection>; + +private: + std::unique_ptr<PerfCountersCollectionImpl> perf_collection; + static ShardedPerfCountersCollection sharded_perf_coll; + friend PerfCountersCollection& local_perf_coll(); + friend ShardedPerfCountersCollection& sharded_perf_coll(); + +public: + PerfCountersCollection(); + ~PerfCountersCollection(); + PerfCountersCollectionImpl* get_perf_collection(); + +}; + +inline 
PerfCountersCollection::ShardedPerfCountersCollection& sharded_perf_coll(){ + return PerfCountersCollection::sharded_perf_coll; +} + +inline PerfCountersCollection& local_perf_coll() { + return PerfCountersCollection::sharded_perf_coll.local(); +} + +} + diff --git a/src/crimson/common/shared_lru.h b/src/crimson/common/shared_lru.h new file mode 100644 index 000000000..4c1da401e --- /dev/null +++ b/src/crimson/common/shared_lru.h @@ -0,0 +1,178 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <memory> +#include <optional> +#include <boost/smart_ptr/local_shared_ptr.hpp> +#include <boost/smart_ptr/weak_ptr.hpp> +#include "simple_lru.h" + +/// SharedLRU does its best to cache objects. It not only tracks the objects +/// in its LRU cache with strong references, it also tracks objects with +/// weak_ptr even if the cache does not hold any strong references to them. so +/// that it can return the objects after they are evicted, as long as they've +/// ever been cached and have not been destroyed yet. +template<class K, class V> +class SharedLRU { + using shared_ptr_t = boost::local_shared_ptr<V>; + using weak_ptr_t = boost::weak_ptr<V>; + using value_type = std::pair<K, shared_ptr_t>; + + // weak_refs is already ordered, and we don't use accessors like + // LRUCache::lower_bound(), so unordered LRUCache would suffice. + SimpleLRU<K, shared_ptr_t, false> cache; + std::map<K, std::pair<weak_ptr_t, V*>> weak_refs; + + struct Deleter { + SharedLRU<K,V>* cache; + const K key; + void operator()(V* ptr) { + cache->_erase_weak(key); + delete ptr; + } + }; + void _erase_weak(const K& key) { + weak_refs.erase(key); + } +public: + SharedLRU(size_t max_size = 20) + : cache{max_size} + {} + ~SharedLRU() { + cache.clear(); + // use plain assert() in utiliy classes to avoid dependencies on logging + assert(weak_refs.empty()); + } + /** + * Returns a reference to the given key, and perform an insertion if such + * key does not already exist + */ + shared_ptr_t operator[](const K& key); + /** + * Returns true iff there are no live references left to anything that has been + * in the cache. + */ + bool empty() const { + return weak_refs.empty(); + } + size_t size() const { + return cache.size(); + } + size_t capacity() const { + return cache.capacity(); + } + /*** + * Inserts a key if not present, or bumps it to the front of the LRU if + * it is, and then gives you a reference to the value. If the key already + * existed, you are responsible for deleting the new value you tried to + * insert. + * + * @param key The key to insert + * @param value The value that goes with the key + * @param existed Set to true if the value was already in the + * map, false otherwise + * @return A reference to the map's value for the given key + */ + shared_ptr_t insert(const K& key, std::unique_ptr<V> value); + // clear all strong reference from the lru. 
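// A small behavioural sketch (illustrative only, not part of this header; it
// would live in its own source file with <cassert>, <memory>, <string> and
// this header included). It demonstrates the property stated in the class
// comment: an entry evicted from the strong LRU stays reachable through the
// tracked weak reference for as long as a caller still holds it.
inline void shared_lru_example() {
  SharedLRU<int, std::string> cache{2};  // strong LRU keeps at most 2 entries
  auto one = cache.insert(1, std::make_unique<std::string>("one"));
  cache.insert(2, std::make_unique<std::string>("two"));
  cache.insert(3, std::make_unique<std::string>("three"));  // evicts key 1 from the LRU
  // `one` still pins the object, so the weak reference locks and find()
  // returns it (and puts it back into the LRU) even though the strong cache
  // had dropped it.
  auto hit = cache.find(1);
  assert(hit && *hit == "one");
}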
+ void clear() { + cache.clear(); + } + shared_ptr_t find(const K& key); + // return the last element that is not greater than key + shared_ptr_t lower_bound(const K& key); + // return the first element that is greater than key + std::optional<value_type> upper_bound(const K& key); + + void erase(const K& key) { + cache.erase(key); + _erase_weak(key); + } +}; + +template<class K, class V> +typename SharedLRU<K,V>::shared_ptr_t +SharedLRU<K,V>::insert(const K& key, std::unique_ptr<V> value) +{ + shared_ptr_t val; + if (auto found = weak_refs.find(key); found != weak_refs.end()) { + val = found->second.first.lock(); + } + if (!val) { + val.reset(value.release(), Deleter{this, key}); + weak_refs.emplace(key, std::make_pair(val, val.get())); + } + cache.insert(key, val); + return val; +} + +template<class K, class V> +typename SharedLRU<K,V>::shared_ptr_t +SharedLRU<K,V>::operator[](const K& key) +{ + if (auto found = cache.find(key); found) { + return *found; + } + shared_ptr_t val; + if (auto found = weak_refs.find(key); found != weak_refs.end()) { + val = found->second.first.lock(); + } + if (!val) { + val.reset(new V{}, Deleter{this, key}); + weak_refs.emplace(key, std::make_pair(val, val.get())); + } + cache.insert(key, val); + return val; +} + +template<class K, class V> +typename SharedLRU<K,V>::shared_ptr_t +SharedLRU<K,V>::find(const K& key) +{ + if (auto found = cache.find(key); found) { + return *found; + } + shared_ptr_t val; + if (auto found = weak_refs.find(key); found != weak_refs.end()) { + val = found->second.first.lock(); + } + if (val) { + cache.insert(key, val); + } + return val; +} + +template<class K, class V> +typename SharedLRU<K,V>::shared_ptr_t +SharedLRU<K,V>::lower_bound(const K& key) +{ + if (weak_refs.empty()) { + return {}; + } + auto found = weak_refs.lower_bound(key); + if (found == weak_refs.end()) { + --found; + } + if (auto val = found->second.first.lock(); val) { + cache.insert(key, val); + return val; + } else { + return {}; + } +} + +template<class K, class V> +std::optional<typename SharedLRU<K,V>::value_type> +SharedLRU<K,V>::upper_bound(const K& key) +{ + for (auto found = weak_refs.upper_bound(key); + found != weak_refs.end(); + ++found) { + if (auto val = found->second.first.lock(); val) { + return std::make_pair(found->first, val); + } + } + return std::nullopt; +} diff --git a/src/crimson/common/simple_lru.h b/src/crimson/common/simple_lru.h new file mode 100644 index 000000000..1419c4885 --- /dev/null +++ b/src/crimson/common/simple_lru.h @@ -0,0 +1,141 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <list> +#include <map> +#include <optional> +#include <type_traits> +#include <unordered_map> + +template <class Key, class Value, bool Ordered> +class SimpleLRU { + static_assert(std::is_default_constructible_v<Value>); + using list_type = std::list<Key>; + template<class K, class V> + using map_t = std::conditional_t<Ordered, + std::map<K, V>, + std::unordered_map<K, V>>; + using map_type = map_t<Key, std::pair<Value, typename list_type::iterator>>; + list_type lru; + map_type cache; + const size_t max_size; + +public: + SimpleLRU(size_t size = 20) + : cache(size), + max_size(size) + {} + size_t size() const { + return cache.size(); + } + size_t capacity() const { + return max_size; + } + using insert_return_type = std::pair<Value, bool>; + insert_return_type insert(const Key& key, Value value); + std::optional<Value> find(const Key& key); + 
std::optional<std::enable_if<Ordered, Value>> lower_bound(const Key& key); + void erase(const Key& key); + void clear(); +private: + // bump the item to the front of the lru list + Value _lru_add(typename map_type::iterator found); + // evict the last element of most recently used list + void _evict(); +}; + +template <class Key, class Value, bool Ordered> +typename SimpleLRU<Key,Value,Ordered>::insert_return_type +SimpleLRU<Key,Value,Ordered>::insert(const Key& key, Value value) +{ + if constexpr(Ordered) { + auto found = cache.lower_bound(key); + if (found != cache.end() && found->first == key) { + // already exists + return {found->second.first, true}; + } else { + if (size() >= capacity()) { + _evict(); + } + lru.push_front(key); + // use lower_bound as hint to save the lookup + cache.emplace_hint(found, key, std::make_pair(value, lru.begin())); + return {std::move(value), false}; + } + } else { + // cache is not ordered + auto found = cache.find(key); + if (found != cache.end()) { + // already exists + return {found->second.first, true}; + } else { + if (size() >= capacity()) { + _evict(); + } + lru.push_front(key); + cache.emplace(key, std::make_pair(value, lru.begin())); + return {std::move(value), false}; + } + } +} + +template <class Key, class Value, bool Ordered> +std::optional<Value> SimpleLRU<Key,Value,Ordered>::find(const Key& key) +{ + if (auto found = cache.find(key); found != cache.end()){ + return _lru_add(found); + } else { + return {}; + } +} + +template <class Key, class Value, bool Ordered> +std::optional<std::enable_if<Ordered, Value>> +SimpleLRU<Key,Value,Ordered>::lower_bound(const Key& key) +{ + if (auto found = cache.lower_bound(key); found != cache.end()) { + return _lru_add(found); + } else { + return {}; + } +} + +template <class Key, class Value, bool Ordered> +void SimpleLRU<Key,Value,Ordered>::clear() +{ + lru.clear(); + cache.clear(); +} + +template <class Key, class Value, bool Ordered> +void SimpleLRU<Key,Value,Ordered>::erase(const Key& key) +{ + if (auto found = cache.find(key); found != cache.end()) { + lru.erase(found->second.second); + cache.erase(found); + } +} + +template <class Key, class Value, bool Ordered> +Value SimpleLRU<Key,Value,Ordered>::_lru_add( + typename SimpleLRU<Key,Value,Ordered>::map_type::iterator found) +{ + auto& [value, in_lru] = found->second; + if (in_lru != lru.begin()){ + // move item to the front + lru.splice(lru.begin(), lru, in_lru); + } + // the item is already at the front + return value; +} + +template <class Key, class Value, bool Ordered> +void SimpleLRU<Key,Value,Ordered>::_evict() +{ + // evict the last element of most recently used list + auto last = --lru.end(); + cache.erase(*last); + lru.erase(last); +} diff --git a/src/crimson/common/throttle.cc b/src/crimson/common/throttle.cc new file mode 100644 index 000000000..bd9195181 --- /dev/null +++ b/src/crimson/common/throttle.cc @@ -0,0 +1,59 @@ +#include "throttle.h" + +namespace crimson::common { + +int64_t Throttle::take(int64_t c) +{ + if (!max) { + return 0; + } + count += c; + return count; +} + +int64_t Throttle::put(int64_t c) +{ + if (!max) { + return 0; + } + if (!c) { + return count; + } + on_free_slots.signal(); + count -= c; + return count; +} + +seastar::future<> Throttle::get(size_t c) +{ + if (!max) { + return seastar::make_ready_future<>(); + } + return on_free_slots.wait([this, c] { + return !_should_wait(c); + }).then([this, c] { + count += c; + return seastar::make_ready_future<>(); + }); +} + +void Throttle::reset_max(size_t m) { + if (max 
== m) { + return; + } + + if (m > max) { + on_free_slots.signal(); + } + max = m; +} + +bool Throttle::_should_wait(size_t c) const { + if (!max) { + return false; + } + return ((c <= max && count + c > max) || // normally stay under max + (c >= max && count > max)); // except for large c +} + +} // namespace crimson::common diff --git a/src/crimson/common/throttle.h b/src/crimson/common/throttle.h new file mode 100644 index 000000000..fea471c8d --- /dev/null +++ b/src/crimson/common/throttle.h @@ -0,0 +1,39 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <seastar/core/condition-variable.hh> +// pull seastar::timer<...>::timer definitions. FIX SEASTAR or reactor.hh +// is obligatory and should be included everywhere? +#include <seastar/core/reactor.hh> + +#include "common/ThrottleInterface.h" + +namespace crimson::common { + +class Throttle final : public ThrottleInterface { + size_t max = 0; + size_t count = 0; + // we cannot change the "count" of seastar::semaphore after it is created, + // so use condition_variable instead. + seastar::condition_variable on_free_slots; +public: + explicit Throttle(size_t m) + : max(m) + {} + int64_t take(int64_t c = 1) override; + int64_t put(int64_t c = 1) override; + seastar::future<> get(size_t c); + size_t get_current() const { + return count; + } + size_t get_max() const { + return max; + } + void reset_max(size_t m); +private: + bool _should_wait(size_t c) const; +}; + +} // namespace crimson::common diff --git a/src/crimson/common/tri_mutex.cc b/src/crimson/common/tri_mutex.cc new file mode 100644 index 000000000..c18aff1a0 --- /dev/null +++ b/src/crimson/common/tri_mutex.cc @@ -0,0 +1,225 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#include "tri_mutex.h" + +seastar::future<> read_lock::lock() +{ + return static_cast<tri_mutex*>(this)->lock_for_read(); +} + +void read_lock::unlock() +{ + static_cast<tri_mutex*>(this)->unlock_for_read(); +} + +seastar::future<> write_lock::lock() +{ + return static_cast<tri_mutex*>(this)->lock_for_write(false); +} + +void write_lock::unlock() +{ + static_cast<tri_mutex*>(this)->unlock_for_write(); +} + +seastar::future<> excl_lock::lock() +{ + return static_cast<tri_mutex*>(this)->lock_for_excl(); +} + +void excl_lock::unlock() +{ + static_cast<tri_mutex*>(this)->unlock_for_excl(); +} + +seastar::future<> excl_lock_from_read::lock() +{ + static_cast<tri_mutex*>(this)->promote_from_read(); + return seastar::make_ready_future<>(); +} + +void excl_lock_from_read::unlock() +{ + static_cast<tri_mutex*>(this)->demote_to_read(); +} + +seastar::future<> excl_lock_from_write::lock() +{ + static_cast<tri_mutex*>(this)->promote_from_write(); + return seastar::make_ready_future<>(); +} + +void excl_lock_from_write::unlock() +{ + static_cast<tri_mutex*>(this)->demote_to_write(); +} + +seastar::future<> excl_lock_from_excl::lock() +{ + return seastar::make_ready_future<>(); +} + +void excl_lock_from_excl::unlock() +{ +} + +tri_mutex::~tri_mutex() +{ + assert(!is_acquired()); +} + +seastar::future<> tri_mutex::lock_for_read() +{ + if (try_lock_for_read()) { + return seastar::make_ready_future<>(); + } + waiters.emplace_back(seastar::promise<>(), type_t::read); + return waiters.back().pr.get_future(); +} + +bool tri_mutex::try_lock_for_read() noexcept +{ + if (!writers && !exclusively_used && waiters.empty()) { + ++readers; + return true; + } else { + return false; + } +} + +void 
tri_mutex::unlock_for_read() +{ + assert(readers > 0); + if (--readers == 0) { + wake(); + } +} + +void tri_mutex::promote_from_read() +{ + assert(readers == 1); + --readers; + exclusively_used = true; +} + +void tri_mutex::demote_to_read() +{ + assert(exclusively_used); + exclusively_used = false; + ++readers; +} + +seastar::future<> tri_mutex::lock_for_write(bool greedy) +{ + if (try_lock_for_write(greedy)) { + return seastar::make_ready_future<>(); + } + waiters.emplace_back(seastar::promise<>(), type_t::write); + return waiters.back().pr.get_future(); +} + +bool tri_mutex::try_lock_for_write(bool greedy) noexcept +{ + if (!readers && !exclusively_used) { + if (greedy || waiters.empty()) { + ++writers; + return true; + } + } + return false; +} + +void tri_mutex::unlock_for_write() +{ + assert(writers > 0); + if (--writers == 0) { + wake(); + } +} + +void tri_mutex::promote_from_write() +{ + assert(writers == 1); + --writers; + exclusively_used = true; +} + +void tri_mutex::demote_to_write() +{ + assert(exclusively_used); + exclusively_used = false; + ++writers; +} + +// for exclusive users +seastar::future<> tri_mutex::lock_for_excl() +{ + if (try_lock_for_excl()) { + return seastar::make_ready_future<>(); + } + waiters.emplace_back(seastar::promise<>(), type_t::exclusive); + return waiters.back().pr.get_future(); +} + +bool tri_mutex::try_lock_for_excl() noexcept +{ + if (!readers && !writers && !exclusively_used) { + exclusively_used = true; + return true; + } else { + return false; + } +} + +void tri_mutex::unlock_for_excl() +{ + assert(exclusively_used); + exclusively_used = false; + wake(); +} + +bool tri_mutex::is_acquired() const +{ + if (readers) { + return true; + } else if (writers) { + return true; + } else if (exclusively_used) { + return true; + } else { + return false; + } +} + +void tri_mutex::wake() +{ + assert(!readers && !writers && !exclusively_used); + type_t type = type_t::none; + while (!waiters.empty()) { + auto& waiter = waiters.front(); + if (type == type_t::exclusive) { + break; + } if (type == type_t::none) { + type = waiter.type; + } else if (type != waiter.type) { + // to be woken in the next batch + break; + } + switch (type) { + case type_t::read: + ++readers; + break; + case type_t::write: + ++writers; + break; + case type_t::exclusive: + exclusively_used = true; + break; + default: + assert(0); + } + waiter.pr.set_value(); + waiters.pop_front(); + } +} diff --git a/src/crimson/common/tri_mutex.h b/src/crimson/common/tri_mutex.h new file mode 100644 index 000000000..127573b3a --- /dev/null +++ b/src/crimson/common/tri_mutex.h @@ -0,0 +1,156 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <seastar/core/future.hh> +#include <seastar/core/circular_buffer.hh> + +class read_lock { +public: + seastar::future<> lock(); + void unlock(); +}; + +class write_lock { +public: + seastar::future<> lock(); + void unlock(); +}; + +class excl_lock { +public: + seastar::future<> lock(); + void unlock(); +}; + +// promote from read to excl +class excl_lock_from_read { +public: + seastar::future<> lock(); + void unlock(); +}; + +// promote from write to excl +class excl_lock_from_write { +public: + seastar::future<> lock(); + void unlock(); +}; + +// promote from excl to excl +class excl_lock_from_excl { +public: + seastar::future<> lock(); + void unlock(); +}; + +/// shared/exclusive mutual exclusion +/// +/// this lock design uses reader and writer is entirely and completely +/// 
independent of the conventional reader/writer lock usage. Here, what we +/// mean is that we can pipeline reads, and we can pipeline writes, but we +/// cannot allow a read while writes are in progress or a write while reads are +/// in progress. Any rmw operation is therefore exclusive. +/// +/// tri_mutex is based on seastar::shared_mutex, but instead of two kinds of +/// waiters, tri_mutex keeps track of three kinds of lock users: +/// - readers +/// - writers +/// - exclusive users +class tri_mutex : private read_lock, + write_lock, + excl_lock, + excl_lock_from_read, + excl_lock_from_write, + excl_lock_from_excl +{ +public: + tri_mutex() = default; + ~tri_mutex(); + + read_lock& for_read() { + return *this; + } + write_lock& for_write() { + return *this; + } + excl_lock& for_excl() { + return *this; + } + excl_lock_from_read& excl_from_read() { + return *this; + } + excl_lock_from_write& excl_from_write() { + return *this; + } + excl_lock_from_write& excl_from_excl() { + return *this; + } + + // for shared readers + seastar::future<> lock_for_read(); + bool try_lock_for_read() noexcept; + void unlock_for_read(); + void promote_from_read(); + void demote_to_read(); + unsigned get_readers() const { + return readers; + } + + // for shared writers + seastar::future<> lock_for_write(bool greedy); + bool try_lock_for_write(bool greedy) noexcept; + void unlock_for_write(); + void promote_from_write(); + void demote_to_write(); + unsigned get_writers() const { + return writers; + } + + // for exclusive users + seastar::future<> lock_for_excl(); + bool try_lock_for_excl() noexcept; + void unlock_for_excl(); + bool is_excl_acquired() const { + return exclusively_used; + } + + bool is_acquired() const; + + /// pass the provided exception to any waiting waiters + template<typename Exception> + void abort(Exception ex) { + while (!waiters.empty()) { + auto& waiter = waiters.front(); + waiter.pr.set_exception(std::make_exception_ptr(ex)); + waiters.pop_front(); + } + } + +private: + void wake(); + unsigned readers = 0; + unsigned writers = 0; + bool exclusively_used = false; + enum class type_t : uint8_t { + read, + write, + exclusive, + none, + }; + struct waiter_t { + waiter_t(seastar::promise<>&& pr, type_t type) + : pr(std::move(pr)), type(type) + {} + seastar::promise<> pr; + type_t type; + }; + seastar::circular_buffer<waiter_t> waiters; + friend class read_lock; + friend class write_lock; + friend class excl_lock; + friend class excl_lock_from_read; + friend class excl_lock_from_write; + friend class excl_lock_from_excl; +}; diff --git a/src/crimson/common/type_helpers.h b/src/crimson/common/type_helpers.h new file mode 100644 index 000000000..4c606581f --- /dev/null +++ b/src/crimson/common/type_helpers.h @@ -0,0 +1,8 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "boost/intrusive_ptr.hpp" + +template<typename T> using Ref = boost::intrusive_ptr<T>; diff --git a/src/crimson/mgr/client.cc b/src/crimson/mgr/client.cc new file mode 100644 index 000000000..5aa8a88ba --- /dev/null +++ b/src/crimson/mgr/client.cc @@ -0,0 +1,166 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "client.h" + +#include <seastar/core/sleep.hh> + +#include "crimson/common/log.h" +#include "crimson/net/Connection.h" +#include "crimson/net/Messenger.h" +#include "messages/MMgrConfigure.h" +#include "messages/MMgrMap.h" +#include "messages/MMgrOpen.h" + +namespace { + 
seastar::logger& logger() + { + return crimson::get_logger(ceph_subsys_mgrc); + } +} + +using crimson::common::local_conf; + +namespace crimson::mgr +{ + +Client::Client(crimson::net::Messenger& msgr, + WithStats& with_stats) + : msgr{msgr}, + with_stats{with_stats}, + report_timer{[this] {report();}} +{} + +seastar::future<> Client::start() +{ + return seastar::now(); +} + +seastar::future<> Client::stop() +{ + logger().info("{}", __func__); + report_timer.cancel(); + auto fut = gate.close(); + if (conn) { + conn->mark_down(); + } + return fut; +} + +std::optional<seastar::future<>> +Client::ms_dispatch(crimson::net::ConnectionRef conn, MessageRef m) +{ + bool dispatched = true; + gate.dispatch_in_background(__func__, *this, [this, conn, &m, &dispatched] { + switch(m->get_type()) { + case MSG_MGR_MAP: + return handle_mgr_map(conn, boost::static_pointer_cast<MMgrMap>(m)); + case MSG_MGR_CONFIGURE: + return handle_mgr_conf(conn, boost::static_pointer_cast<MMgrConfigure>(m)); + default: + dispatched = false; + return seastar::now(); + } + }); + return (dispatched ? std::make_optional(seastar::now()) : std::nullopt); +} + +void Client::ms_handle_connect(crimson::net::ConnectionRef c) +{ + gate.dispatch_in_background(__func__, *this, [this, c] { + if (conn == c) { + // ask for the mgrconfigure message + auto m = ceph::make_message<MMgrOpen>(); + m->daemon_name = local_conf()->name.get_id(); + return conn->send(std::move(m)); + } else { + return seastar::now(); + } + }); +} + +void Client::ms_handle_reset(crimson::net::ConnectionRef c, bool /* is_replace */) +{ + gate.dispatch_in_background(__func__, *this, [this, c] { + if (conn == c) { + report_timer.cancel(); + return reconnect(); + } else { + return seastar::now(); + } + }); +} + +seastar::future<> Client::reconnect() +{ + if (conn) { + conn->mark_down(); + conn = {}; + } + if (!mgrmap.get_available()) { + logger().warn("No active mgr available yet"); + return seastar::now(); + } + auto retry_interval = std::chrono::duration<double>( + local_conf().get_val<double>("mgr_connect_retry_interval")); + auto a_while = std::chrono::duration_cast<seastar::steady_clock_type::duration>( + retry_interval); + return seastar::sleep(a_while).then([this] { + auto peer = mgrmap.get_active_addrs().pick_addr(msgr.get_myaddr().get_type()); + if (peer == entity_addr_t{}) { + // crimson msgr only uses the first bound addr + logger().error("mgr.{} does not have an addr compatible with me", + mgrmap.get_active_name()); + return; + } + conn = msgr.connect(peer, CEPH_ENTITY_TYPE_MGR); + }); +} + +seastar::future<> Client::handle_mgr_map(crimson::net::ConnectionRef, + Ref<MMgrMap> m) +{ + mgrmap = m->get_map(); + if (!conn) { + return reconnect(); + } else if (conn->get_peer_addr() != + mgrmap.get_active_addrs().legacy_addr()) { + return reconnect(); + } else { + return seastar::now(); + } +} + +seastar::future<> Client::handle_mgr_conf(crimson::net::ConnectionRef, + Ref<MMgrConfigure> m) +{ + logger().info("{} {}", __func__, *m); + + auto report_period = std::chrono::seconds{m->stats_period}; + if (report_period.count()) { + if (report_timer.armed()) { + report_timer.rearm(report_timer.get_timeout(), report_period); + } else { + report_timer.arm_periodic(report_period); + } + } else { + report_timer.cancel(); + } + return seastar::now(); +} + +void Client::report() +{ + gate.dispatch_in_background(__func__, *this, [this] { + assert(conn); + auto pg_stats = with_stats.get_stats(); + return conn->send(std::move(pg_stats)); + }); +} + +void Client::print(std::ostream& 
out) const +{ + out << "mgrc "; +} + +} diff --git a/src/crimson/mgr/client.h b/src/crimson/mgr/client.h new file mode 100644 index 000000000..ad7e1fde5 --- /dev/null +++ b/src/crimson/mgr/client.h @@ -0,0 +1,66 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <seastar/core/timer.hh> + +#include "crimson/common/gated.h" +#include "crimson/net/Dispatcher.h" +#include "crimson/net/Fwd.h" +#include "mon/MgrMap.h" + +template<typename Message> using Ref = boost::intrusive_ptr<Message>; +namespace crimson::net { + class Messenger; +} + +class MMgrMap; +class MMgrConfigure; + +namespace crimson::mgr +{ + +// implement WithStats if you want to report stats to mgr periodically +class WithStats { +public: + virtual MessageRef get_stats() const = 0; + virtual ~WithStats() {} +}; + +class Client : public crimson::net::Dispatcher { +public: + Client(crimson::net::Messenger& msgr, + WithStats& with_stats); + seastar::future<> start(); + seastar::future<> stop(); + void report(); + +private: + std::optional<seastar::future<>> ms_dispatch( + crimson::net::ConnectionRef conn, Ref<Message> m) override; + void ms_handle_reset(crimson::net::ConnectionRef conn, bool is_replace) final; + void ms_handle_connect(crimson::net::ConnectionRef conn) final; + seastar::future<> handle_mgr_map(crimson::net::ConnectionRef conn, + Ref<MMgrMap> m); + seastar::future<> handle_mgr_conf(crimson::net::ConnectionRef conn, + Ref<MMgrConfigure> m); + seastar::future<> reconnect(); + + void print(std::ostream&) const; + friend std::ostream& operator<<(std::ostream& out, const Client& client); +private: + MgrMap mgrmap; + crimson::net::Messenger& msgr; + WithStats& with_stats; + crimson::net::ConnectionRef conn; + seastar::timer<seastar::lowres_clock> report_timer; + crimson::common::Gated gate; +}; + +inline std::ostream& operator<<(std::ostream& out, const Client& client) { + client.print(out); + return out; +} + +} diff --git a/src/crimson/mon/MonClient.cc b/src/crimson/mon/MonClient.cc new file mode 100644 index 000000000..9dfbb103a --- /dev/null +++ b/src/crimson/mon/MonClient.cc @@ -0,0 +1,1111 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "MonClient.h" + +#include <random> + +#include <seastar/core/future-util.hh> +#include <seastar/core/lowres_clock.hh> +#include <seastar/core/shared_future.hh> +#include <seastar/util/log.hh> + +#include "auth/AuthClientHandler.h" +#include "auth/RotatingKeyRing.h" + +#include "common/hostname.h" + +#include "crimson/auth/KeyRing.h" +#include "crimson/common/config_proxy.h" +#include "crimson/common/log.h" +#include "crimson/net/Connection.h" +#include "crimson/net/Errors.h" +#include "crimson/net/Messenger.h" + +#include "messages/MAuth.h" +#include "messages/MAuthReply.h" +#include "messages/MConfig.h" +#include "messages/MLogAck.h" +#include "messages/MMonCommand.h" +#include "messages/MMonCommandAck.h" +#include "messages/MMonGetMap.h" +#include "messages/MMonGetVersion.h" +#include "messages/MMonGetVersionReply.h" +#include "messages/MMonMap.h" +#include "messages/MMonSubscribe.h" +#include "messages/MMonSubscribeAck.h" + +namespace { + seastar::logger& logger() + { + return crimson::get_logger(ceph_subsys_monc); + } +} + +namespace crimson::mon { + +using crimson::common::local_conf; + +class Connection { +public: + Connection(const AuthRegistry& auth_registry, + crimson::net::ConnectionRef conn, + KeyRing* keyring); + enum class 
auth_result_t { + success = 0, + failure, + canceled + }; + seastar::future<> handle_auth_reply(Ref<MAuthReply> m); + // v1 + seastar::future<auth_result_t> authenticate_v1( + epoch_t epoch, + const EntityName& name, + uint32_t want_keys); + // v2 + seastar::future<auth_result_t> authenticate_v2(); + auth::AuthClient::auth_request_t + get_auth_request(const EntityName& name, + uint32_t want_keys); + using secret_t = string; + tuple<CryptoKey, secret_t, bufferlist> + handle_auth_reply_more(const ceph::buffer::list& bl); + int handle_auth_bad_method(uint32_t old_auth_method, + int result, + const std::vector<uint32_t>& allowed_methods, + const std::vector<uint32_t>& allowed_modes); + + // v1 and v2 + tuple<CryptoKey, secret_t, int> + handle_auth_done(uint64_t new_global_id, + const ceph::buffer::list& bl); + void close(); + bool is_my_peer(const entity_addr_t& addr) const; + AuthAuthorizer* get_authorizer(entity_type_t peer) const; + KeyStore& get_keys(); + seastar::future<> renew_tickets(); + seastar::future<> renew_rotating_keyring(); + + crimson::net::ConnectionRef get_conn(); + +private: + seastar::future<> setup_session(epoch_t epoch, + const EntityName& name); + std::unique_ptr<AuthClientHandler> create_auth(crimson::auth::method_t, + uint64_t global_id, + const EntityName& name, + uint32_t want_keys); + enum class request_t { + rotating, + general, + }; + seastar::future<std::optional<auth_result_t>> do_auth_single(request_t); + seastar::future<auth_result_t> do_auth(request_t); + +private: + bool closed = false; + // v1 + seastar::shared_promise<Ref<MAuthReply>> reply; + // v2 + using clock_t = seastar::lowres_system_clock; + clock_t::time_point auth_start; + crimson::auth::method_t auth_method = 0; + std::optional<seastar::promise<auth_result_t>> auth_done; + // v1 and v2 + const AuthRegistry& auth_registry; + crimson::net::ConnectionRef conn; + std::unique_ptr<AuthClientHandler> auth; + std::unique_ptr<RotatingKeyRing> rotating_keyring; + uint64_t global_id = 0; + clock_t::time_point last_rotating_renew_sent; +}; + +Connection::Connection(const AuthRegistry& auth_registry, + crimson::net::ConnectionRef conn, + KeyRing* keyring) + : auth_registry{auth_registry}, + conn{conn}, + rotating_keyring{ + std::make_unique<RotatingKeyRing>(nullptr, + CEPH_ENTITY_TYPE_OSD, + keyring)} +{} + +seastar::future<> Connection::handle_auth_reply(Ref<MAuthReply> m) +{ + reply.set_value(m); + reply = {}; + return seastar::now(); +} + +seastar::future<> Connection::renew_tickets() +{ + if (auth->need_tickets()) { + return do_auth(request_t::general).then([](auth_result_t r) { + if (r != auth_result_t::success) { + throw std::system_error( + make_error_code( + crimson::net::error::negotiation_failure)); + } + }); + } + return seastar::now(); +} + +seastar::future<> Connection::renew_rotating_keyring() +{ + auto now = clock_t::now(); + auto ttl = std::chrono::seconds{ + static_cast<long>(crimson::common::local_conf()->auth_service_ticket_ttl)}; + auto cutoff = now - ttl / 4; + if (!rotating_keyring->need_new_secrets(utime_t(cutoff))) { + return seastar::now(); + } + if (now - last_rotating_renew_sent < std::chrono::seconds{1}) { + logger().info("renew_rotating_keyring called too often"); + return seastar::now(); + } + last_rotating_renew_sent = now; + return do_auth(request_t::rotating).then([](auth_result_t r) { + if (r != auth_result_t::success) { + throw std::system_error(make_error_code( + crimson::net::error::negotiation_failure)); + } + }); +} + +AuthAuthorizer* 
Connection::get_authorizer(entity_type_t peer) const +{ + if (auth) { + return auth->build_authorizer(peer); + } else { + return nullptr; + } +} + +KeyStore& Connection::get_keys() { + return *rotating_keyring; +} + +std::unique_ptr<AuthClientHandler> +Connection::create_auth(crimson::auth::method_t protocol, + uint64_t global_id, + const EntityName& name, + uint32_t want_keys) +{ + static crimson::common::CephContext cct; + std::unique_ptr<AuthClientHandler> auth; + auth.reset(AuthClientHandler::create(&cct, + protocol, + rotating_keyring.get())); + if (!auth) { + logger().error("no handler for protocol {}", protocol); + throw std::system_error(make_error_code( + crimson::net::error::negotiation_failure)); + } + auth->init(name); + auth->set_want_keys(want_keys); + auth->set_global_id(global_id); + return auth; +} + +seastar::future<> +Connection::setup_session(epoch_t epoch, + const EntityName& name) +{ + auto m = ceph::make_message<MAuth>(); + m->protocol = CEPH_AUTH_UNKNOWN; + m->monmap_epoch = epoch; + __u8 struct_v = 1; + encode(struct_v, m->auth_payload); + std::vector<crimson::auth::method_t> auth_methods; + auth_registry.get_supported_methods(conn->get_peer_type(), &auth_methods); + encode(auth_methods, m->auth_payload); + encode(name, m->auth_payload); + encode(global_id, m->auth_payload); + return conn->send(m); +} + +seastar::future<std::optional<Connection::auth_result_t>> +Connection::do_auth_single(Connection::request_t what) +{ + auto m = make_message<MAuth>(); + m->protocol = auth->get_protocol(); + auth->prepare_build_request(); + switch (what) { + case request_t::rotating: + auth->build_rotating_request(m->auth_payload); + break; + case request_t::general: + if (int ret = auth->build_request(m->auth_payload); ret) { + logger().error("missing/bad key for '{}'", local_conf()->name); + throw std::system_error(make_error_code( + crimson::net::error::negotiation_failure)); + } + break; + default: + assert(0); + } + logger().info("sending {}", *m); + return conn->send(m).then([this] { + logger().info("waiting"); + return reply.get_shared_future(); + }).then([this] (Ref<MAuthReply> m) { + if (!m) { + ceph_assert(closed); + logger().info("do_auth: connection closed"); + return seastar::make_ready_future<std::optional<Connection::auth_result_t>>( + std::make_optional(auth_result_t::canceled)); + } + logger().info( + "do_auth: mon {} => {} returns {}: {}", + conn->get_messenger()->get_myaddr(), + conn->get_peer_addr(), *m, m->result); + auto p = m->result_bl.cbegin(); + auto ret = auth->handle_response(m->result, p, + nullptr, nullptr); + if (ret != 0 && ret != -EAGAIN) { + logger().error( + "do_auth: got error {} on mon {}", + ret, + conn->get_peer_addr()); + } + return seastar::make_ready_future<std::optional<Connection::auth_result_t>>( + ret == -EAGAIN + ? std::nullopt + : std::make_optional(ret == 0 + ? 
auth_result_t::success + : auth_result_t::failure + )); + }); +} + +seastar::future<Connection::auth_result_t> +Connection::do_auth(Connection::request_t what) { + return seastar::repeat_until_value([this, what]() { + return do_auth_single(what); + }); +} + +seastar::future<Connection::auth_result_t> +Connection::authenticate_v1(epoch_t epoch, + const EntityName& name, + uint32_t want_keys) +{ + return conn->keepalive().then([epoch, name, this] { + return setup_session(epoch, name); + }).then([this] { + return reply.get_shared_future(); + }).then([name, want_keys, this](Ref<MAuthReply> m) { + if (!m) { + logger().error("authenticate_v1 canceled on {}", name); + return seastar::make_ready_future<auth_result_t>(auth_result_t::canceled); + } + global_id = m->global_id; + auth = create_auth(m->protocol, m->global_id, name, want_keys); + switch (auto p = m->result_bl.cbegin(); + auth->handle_response(m->result, p, + nullptr, nullptr)) { + case 0: + // none + return seastar::make_ready_future<auth_result_t>(auth_result_t::success); + case -EAGAIN: + // cephx + return do_auth(request_t::general); + default: + ceph_assert_always(0); + } + }).handle_exception([](auto ep) { + logger().error("authenticate_v1 failed with {}", ep); + return seastar::make_ready_future<auth_result_t>(auth_result_t::canceled); + }); +} + +seastar::future<Connection::auth_result_t> Connection::authenticate_v2() +{ + auth_start = seastar::lowres_system_clock::now(); + return conn->send(make_message<MMonGetMap>()).then([this] { + auth_done.emplace(); + return auth_done->get_future(); + }); +} + +auth::AuthClient::auth_request_t +Connection::get_auth_request(const EntityName& entity_name, + uint32_t want_keys) +{ + // choose method + auth_method = [&] { + std::vector<crimson::auth::method_t> methods; + auth_registry.get_supported_methods(conn->get_peer_type(), &methods); + if (methods.empty()) { + logger().info("get_auth_request no methods is supported"); + throw crimson::auth::error("no methods is supported"); + } + return methods.front(); + }(); + + std::vector<uint32_t> modes; + auth_registry.get_supported_modes(conn->get_peer_type(), auth_method, + &modes); + logger().info("method {} preferred_modes {}", auth_method, modes); + if (modes.empty()) { + throw crimson::auth::error("no modes is supported"); + } + auth = create_auth(auth_method, global_id, entity_name, want_keys); + + using ceph::encode; + bufferlist bl; + // initial request includes some boilerplate... 
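  // (concretely: a one-byte AUTH_MODE_MON marker, the client's entity name,
  // and the current global_id, which stays 0 until the monitor assigns one)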
+ encode((char)AUTH_MODE_MON, bl); + encode(entity_name, bl); + encode(global_id, bl); + // and (maybe) some method-specific initial payload + auth->build_initial_request(&bl); + return {auth_method, modes, bl}; +} + +tuple<CryptoKey, Connection::secret_t, bufferlist> +Connection::handle_auth_reply_more(const ceph::buffer::list& payload) +{ + CryptoKey session_key; + secret_t connection_secret; + bufferlist reply; + auto p = payload.cbegin(); + int r = auth->handle_response(0, p, &session_key, &connection_secret); + if (r == -EAGAIN) { + auth->prepare_build_request(); + auth->build_request(reply); + logger().info(" responding with {} bytes", reply.length()); + return {session_key, connection_secret, reply}; + } else if (r < 0) { + logger().error(" handle_response returned {}", r); + throw crimson::auth::error("unable to build auth"); + } else { + logger().info("authenticated!"); + std::terminate(); + } +} + +tuple<CryptoKey, Connection::secret_t, int> +Connection::handle_auth_done(uint64_t new_global_id, + const ceph::buffer::list& payload) +{ + global_id = new_global_id; + auth->set_global_id(global_id); + auto p = payload.begin(); + CryptoKey session_key; + secret_t connection_secret; + int r = auth->handle_response(0, p, &session_key, &connection_secret); + conn->set_last_keepalive_ack(auth_start); + if (auth_done) { + auth_done->set_value(auth_result_t::success); + auth_done.reset(); + } + return {session_key, connection_secret, r}; +} + +int Connection::handle_auth_bad_method(uint32_t old_auth_method, + int result, + const std::vector<uint32_t>& allowed_methods, + const std::vector<uint32_t>& allowed_modes) +{ + logger().info("old_auth_method {} result {} allowed_methods {}", + old_auth_method, cpp_strerror(result), allowed_methods); + std::vector<uint32_t> auth_supported; + auth_registry.get_supported_methods(conn->get_peer_type(), &auth_supported); + auto p = std::find(auth_supported.begin(), auth_supported.end(), + old_auth_method); + assert(p != auth_supported.end()); + p = std::find_first_of(std::next(p), auth_supported.end(), + allowed_methods.begin(), allowed_methods.end()); + if (p == auth_supported.end()) { + logger().error("server allowed_methods {} but i only support {}", + allowed_methods, auth_supported); + assert(auth_done); + auth_done->set_exception(std::system_error(make_error_code( + crimson::net::error::negotiation_failure))); + return -EACCES; + } + auth_method = *p; + logger().info("will try {} next", auth_method); + return 0; +} + +void Connection::close() +{ + reply.set_value(Ref<MAuthReply>(nullptr)); + reply = {}; + if (auth_done) { + auth_done->set_value(auth_result_t::canceled); + auth_done.reset(); + } + if (conn && !std::exchange(closed, true)) { + conn->mark_down(); + } +} + +bool Connection::is_my_peer(const entity_addr_t& addr) const +{ + ceph_assert(conn); + return conn->get_peer_addr() == addr; +} + +crimson::net::ConnectionRef Connection::get_conn() { + return conn; +} + +Client::Client(crimson::net::Messenger& messenger, + crimson::common::AuthHandler& auth_handler) + // currently, crimson is OSD-only + : want_keys{CEPH_ENTITY_TYPE_MON | + CEPH_ENTITY_TYPE_OSD | + CEPH_ENTITY_TYPE_MGR}, + timer{[this] { tick(); }}, + msgr{messenger}, + auth_registry{&cct}, + auth_handler{auth_handler} +{} + +Client::Client(Client&&) = default; +Client::~Client() = default; + +seastar::future<> Client::start() { + entity_name = crimson::common::local_conf()->name; + auth_registry.refresh_config(); + return load_keyring().then([this] { + return 
monmap.build_initial(crimson::common::local_conf(), false); + }).then([this] { + return authenticate(); + }).then([this] { + auto interval = + std::chrono::duration_cast<seastar::lowres_clock::duration>( + std::chrono::duration<double>( + local_conf().get_val<double>("mon_client_ping_interval"))); + timer.arm_periodic(interval); + }); +} + +seastar::future<> Client::load_keyring() +{ + if (!auth_registry.is_supported_method(msgr.get_mytype(), CEPH_AUTH_CEPHX)) { + return seastar::now(); + } else { + return crimson::auth::load_from_keyring(&keyring).then([](KeyRing* keyring) { + return crimson::auth::load_from_keyfile(keyring); + }).then([](KeyRing* keyring) { + return crimson::auth::load_from_key(keyring); + }).then([](KeyRing*) { + return seastar::now(); + }); + } +} + +void Client::tick() +{ + gate.dispatch_in_background(__func__, *this, [this] { + if (active_con) { + return seastar::when_all_succeed(active_con->get_conn()->keepalive(), + active_con->renew_tickets(), + active_con->renew_rotating_keyring()).then_unpack([] {}); + } else { + return seastar::now(); + } + }); +} + +bool Client::is_hunting() const { + return !active_con; +} + +std::optional<seastar::future<>> +Client::ms_dispatch(crimson::net::ConnectionRef conn, MessageRef m) +{ + bool dispatched = true; + gate.dispatch_in_background(__func__, *this, [this, conn, &m, &dispatched] { + // we only care about these message types + switch (m->get_type()) { + case CEPH_MSG_MON_MAP: + return handle_monmap(conn, boost::static_pointer_cast<MMonMap>(m)); + case CEPH_MSG_AUTH_REPLY: + return handle_auth_reply( + conn, boost::static_pointer_cast<MAuthReply>(m)); + case CEPH_MSG_MON_SUBSCRIBE_ACK: + return handle_subscribe_ack( + boost::static_pointer_cast<MMonSubscribeAck>(m)); + case CEPH_MSG_MON_GET_VERSION_REPLY: + return handle_get_version_reply( + boost::static_pointer_cast<MMonGetVersionReply>(m)); + case MSG_MON_COMMAND_ACK: + return handle_mon_command_ack( + boost::static_pointer_cast<MMonCommandAck>(m)); + case MSG_LOGACK: + return handle_log_ack( + boost::static_pointer_cast<MLogAck>(m)); + case MSG_CONFIG: + return handle_config( + boost::static_pointer_cast<MConfig>(m)); + default: + dispatched = false; + return seastar::now(); + } + }); + return (dispatched ? 
std::make_optional(seastar::now()) : std::nullopt); +} + +void Client::ms_handle_reset(crimson::net::ConnectionRef conn, bool /* is_replace */) +{ + gate.dispatch_in_background(__func__, *this, [this, conn] { + auto found = std::find_if(pending_conns.begin(), pending_conns.end(), + [peer_addr = conn->get_peer_addr()](auto& mc) { + return mc->is_my_peer(peer_addr); + }); + if (found != pending_conns.end()) { + logger().warn("pending conn reset by {}", conn->get_peer_addr()); + (*found)->close(); + return seastar::now(); + } else if (active_con && active_con->is_my_peer(conn->get_peer_addr())) { + logger().warn("active conn reset {}", conn->get_peer_addr()); + active_con.reset(); + return reopen_session(-1).then([this] { + send_pendings(); + return seastar::now(); + }); + } else { + return seastar::now(); + } + }); +} + +std::pair<std::vector<uint32_t>, std::vector<uint32_t>> +Client::get_supported_auth_methods(int peer_type) +{ + std::vector<uint32_t> methods; + std::vector<uint32_t> modes; + auth_registry.get_supported_methods(peer_type, &methods, &modes); + return {methods, modes}; +} + +uint32_t Client::pick_con_mode(int peer_type, + uint32_t auth_method, + const std::vector<uint32_t>& preferred_modes) +{ + return auth_registry.pick_mode(peer_type, auth_method, preferred_modes); +} + +AuthAuthorizeHandler* Client::get_auth_authorize_handler(int peer_type, + int auth_method) +{ + return auth_registry.get_handler(peer_type, auth_method); +} + + +int Client::handle_auth_request(crimson::net::ConnectionRef con, + AuthConnectionMetaRef auth_meta, + bool more, + uint32_t auth_method, + const ceph::bufferlist& payload, + ceph::bufferlist *reply) +{ + // for some channels prior to nautilus (osd heartbeat), we tolerate the lack of + // an authorizer. + if (payload.length() == 0) { + if (con->get_messenger()->get_require_authorizer()) { + return -EACCES; + } else { + auth_handler.handle_authentication({}, {}); + return 1; + } + } + auth_meta->auth_mode = payload[0]; + if (auth_meta->auth_mode < AUTH_MODE_AUTHORIZER || + auth_meta->auth_mode > AUTH_MODE_AUTHORIZER_MAX) { + return -EACCES; + } + AuthAuthorizeHandler* ah = get_auth_authorize_handler(con->get_peer_type(), + auth_method); + if (!ah) { + logger().error("no AuthAuthorizeHandler found for auth method: {}", + auth_method); + return -EOPNOTSUPP; + } + auto authorizer_challenge = &auth_meta->authorizer_challenge; + if (auth_meta->skip_authorizer_challenge) { + logger().info("skipping challenge on {}", con); + authorizer_challenge = nullptr; + } + bool was_challenge = (bool)auth_meta->authorizer_challenge; + EntityName name; + AuthCapsInfo caps_info; + bool is_valid = ah->verify_authorizer( + &cct, + active_con->get_keys(), + payload, + auth_meta->get_connection_secret_length(), + reply, + &name, + &active_con->get_conn()->peer_global_id, + &caps_info, + &auth_meta->session_key, + &auth_meta->connection_secret, + authorizer_challenge); + if (is_valid) { + auth_handler.handle_authentication(name, caps_info); + return 1; + } + if (!more && !was_challenge && auth_meta->authorizer_challenge) { + logger().info("added challenge on {}", con); + return 0; + } else { + logger().info("bad authorizer on {}", con); + return -EACCES; + } +} + +auth::AuthClient::auth_request_t +Client::get_auth_request(crimson::net::ConnectionRef con, + AuthConnectionMetaRef auth_meta) +{ + logger().info("get_auth_request(con={}, auth_method={})", + con, auth_meta->auth_method); + // connection to mon? 
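  // for monitor sessions the pending Connection owns the handshake state and
  // produces the auth request itself; for any other peer type we instead hand
  // back an authorizer built from the tickets held by the active mon session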
+ if (con->get_peer_type() == CEPH_ENTITY_TYPE_MON) { + auto found = std::find_if(pending_conns.begin(), pending_conns.end(), + [peer_addr = con->get_peer_addr()](auto& mc) { + return mc->is_my_peer(peer_addr); + }); + if (found == pending_conns.end()) { + throw crimson::auth::error{"unknown connection"}; + } + return (*found)->get_auth_request(entity_name, want_keys); + } else { + // generate authorizer + if (!active_con) { + logger().error(" but no auth handler is set up"); + throw crimson::auth::error("no auth available"); + } + auto authorizer = active_con->get_authorizer(con->get_peer_type()); + if (!authorizer) { + logger().error("failed to build_authorizer for type {}", + ceph_entity_type_name(con->get_peer_type())); + throw crimson::auth::error("unable to build auth"); + } + auth_meta->authorizer.reset(authorizer); + auth_meta->auth_method = authorizer->protocol; + vector<uint32_t> modes; + auth_registry.get_supported_modes(con->get_peer_type(), + auth_meta->auth_method, + &modes); + return {authorizer->protocol, modes, authorizer->bl}; + } +} + +ceph::bufferlist Client::handle_auth_reply_more(crimson::net::ConnectionRef conn, + AuthConnectionMetaRef auth_meta, + const bufferlist& bl) +{ + if (conn->get_peer_type() == CEPH_ENTITY_TYPE_MON) { + auto found = std::find_if(pending_conns.begin(), pending_conns.end(), + [peer_addr = conn->get_peer_addr()](auto& mc) { + return mc->is_my_peer(peer_addr); + }); + if (found == pending_conns.end()) { + throw crimson::auth::error{"unknown connection"}; + } + bufferlist reply; + tie(auth_meta->session_key, auth_meta->connection_secret, reply) = + (*found)->handle_auth_reply_more(bl); + return reply; + } else { + // authorizer challenges + if (!active_con || !auth_meta->authorizer) { + logger().error("no authorizer?"); + throw crimson::auth::error("no auth available"); + } + auth_meta->authorizer->add_challenge(&cct, bl); + return auth_meta->authorizer->bl; + } +} + +int Client::handle_auth_done(crimson::net::ConnectionRef conn, + AuthConnectionMetaRef auth_meta, + uint64_t global_id, + uint32_t con_mode, + const bufferlist& bl) +{ + if (conn->get_peer_type() == CEPH_ENTITY_TYPE_MON) { + auto found = std::find_if(pending_conns.begin(), pending_conns.end(), + [peer_addr = conn->get_peer_addr()](auto& mc) { + return mc->is_my_peer(peer_addr); + }); + if (found == pending_conns.end()) { + return -ENOENT; + } + int r = 0; + tie(auth_meta->session_key, auth_meta->connection_secret, r) = + (*found)->handle_auth_done(global_id, bl); + return r; + } else { + // verify authorizer reply + auto p = bl.begin(); + if (!auth_meta->authorizer->verify_reply(p, &auth_meta->connection_secret)) { + logger().error("failed verifying authorizer reply"); + return -EACCES; + } + auth_meta->session_key = auth_meta->authorizer->session_key; + return 0; + } +} + + // Handle server's indication that the previous auth attempt failed +int Client::handle_auth_bad_method(crimson::net::ConnectionRef conn, + AuthConnectionMetaRef auth_meta, + uint32_t old_auth_method, + int result, + const std::vector<uint32_t>& allowed_methods, + const std::vector<uint32_t>& allowed_modes) +{ + if (conn->get_peer_type() == CEPH_ENTITY_TYPE_MON) { + auto found = std::find_if(pending_conns.begin(), pending_conns.end(), + [peer_addr = conn->get_peer_addr()](auto& mc) { + return mc->is_my_peer(peer_addr); + }); + if (found != pending_conns.end()) { + return (*found)->handle_auth_bad_method( + old_auth_method, result, + allowed_methods, allowed_modes); + } else { + return -ENOENT; + } + } else { + 
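    // non-mon sessions authenticate with a pre-built authorizer rather than a
    // negotiated method, so there is no alternative method to fall back to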
// huh... + logger().info("hmm, they didn't like {} result {}", + old_auth_method, cpp_strerror(result)); + return -EACCES; + } +} + +seastar::future<> Client::handle_monmap(crimson::net::ConnectionRef conn, + Ref<MMonMap> m) +{ + monmap.decode(m->monmapbl); + const auto peer_addr = conn->get_peer_addr(); + auto cur_mon = monmap.get_name(peer_addr); + logger().info("got monmap {}, mon.{}, is now rank {}", + monmap.epoch, cur_mon, monmap.get_rank(cur_mon)); + sub.got("monmap", monmap.get_epoch()); + + if (monmap.get_addr_name(peer_addr, cur_mon)) { + if (active_con) { + logger().info("handle_monmap: renewing tickets"); + return seastar::when_all_succeed( + active_con->renew_tickets(), + active_con->renew_rotating_keyring()).then_unpack([](){ + logger().info("handle_mon_map: renewed tickets"); + }); + } else { + return seastar::now(); + } + } else { + logger().warn("mon.{} went away", cur_mon); + return reopen_session(-1).then([this] { + send_pendings(); + return seastar::now(); + }); + } +} + +seastar::future<> Client::handle_auth_reply(crimson::net::ConnectionRef conn, + Ref<MAuthReply> m) +{ + logger().info( + "handle_auth_reply mon {} => {} returns {}: {}", + conn->get_messenger()->get_myaddr(), + conn->get_peer_addr(), *m, m->result); + auto found = std::find_if(pending_conns.begin(), pending_conns.end(), + [peer_addr = conn->get_peer_addr()](auto& mc) { + return mc->is_my_peer(peer_addr); + }); + if (found != pending_conns.end()) { + return (*found)->handle_auth_reply(m); + } else if (active_con) { + return active_con->handle_auth_reply(m); + } else { + logger().error("unknown auth reply from {}", conn->get_peer_addr()); + return seastar::now(); + } +} + +seastar::future<> Client::handle_subscribe_ack(Ref<MMonSubscribeAck> m) +{ + sub.acked(m->interval); + return seastar::now(); +} + +Client::get_version_t Client::get_version(const std::string& map) +{ + auto m = make_message<MMonGetVersion>(); + auto tid = ++last_version_req_id; + m->handle = tid; + m->what = map; + auto& req = version_reqs[tid]; + return send_message(m).then([&req] { + return req.get_future(); + }); +} + +seastar::future<> +Client::handle_get_version_reply(Ref<MMonGetVersionReply> m) +{ + if (auto found = version_reqs.find(m->handle); + found != version_reqs.end()) { + auto& result = found->second; + logger().trace("{}: {} returns {}", + __func__, m->handle, m->version); + result.set_value(std::make_tuple(m->version, m->oldest_version)); + version_reqs.erase(found); + } else { + logger().warn("{}: version request with handle {} not found", + __func__, m->handle); + } + return seastar::now(); +} + +seastar::future<> Client::handle_mon_command_ack(Ref<MMonCommandAck> m) +{ + const auto tid = m->get_tid(); + if (auto found = mon_commands.find(tid); + found != mon_commands.end()) { + auto& result = found->second; + logger().trace("{} {}", __func__, tid); + result.set_value(std::make_tuple(m->r, m->rs, std::move(m->get_data()))); + mon_commands.erase(found); + } else { + logger().warn("{} {} not found", __func__, tid); + } + return seastar::now(); +} + +seastar::future<> Client::handle_log_ack(Ref<MLogAck> m) +{ + // XXX + return seastar::now(); +} + +seastar::future<> Client::handle_config(Ref<MConfig> m) +{ + return crimson::common::local_conf().set_mon_vals(m->config); +} + +std::vector<unsigned> Client::get_random_mons(unsigned n) const +{ + uint16_t min_priority = std::numeric_limits<uint16_t>::max(); + for (const auto& m : monmap.mon_info) { + if (m.second.priority < min_priority) { + min_priority = 
m.second.priority; + } + } + vector<unsigned> ranks; + for (auto [name, info] : monmap.mon_info) { + if (info.priority == min_priority) { + ranks.push_back(monmap.get_rank(name)); + } + } + std::random_device rd; + std::default_random_engine rng{rd()}; + std::shuffle(ranks.begin(), ranks.end(), rng); + if (n == 0 || n > ranks.size()) { + return ranks; + } else { + return {ranks.begin(), ranks.begin() + n}; + } +} + +seastar::future<> Client::authenticate() +{ + return reopen_session(-1).then([this] { + send_pendings(); + return seastar::now(); + }); +} + +seastar::future<> Client::stop() +{ + logger().info("{}", __func__); + auto fut = gate.close(); + timer.cancel(); + for (auto& pending_con : pending_conns) { + pending_con->close(); + } + if (active_con) { + active_con->close(); + } + return fut; +} + +seastar::future<> Client::reopen_session(int rank) +{ + logger().info("{} to mon.{}", __func__, rank); + vector<unsigned> mons; + if (rank >= 0) { + mons.push_back(rank); + } else { + const auto parallel = + crimson::common::local_conf().get_val<uint64_t>("mon_client_hunt_parallel"); + mons = get_random_mons(parallel); + } + pending_conns.reserve(mons.size()); + return seastar::parallel_for_each(mons, [this](auto rank) { + // TODO: connect to multiple addrs + auto peer = monmap.get_addrs(rank).pick_addr(msgr.get_myaddr().get_type()); + if (peer == entity_addr_t{}) { + // crimson msgr only uses the first bound addr + logger().warn("mon.{} does not have an addr compatible with me", rank); + return seastar::now(); + } + logger().info("connecting to mon.{}", rank); + return seastar::futurize_invoke( + [peer, this] () -> seastar::future<Connection::auth_result_t> { + auto conn = msgr.connect(peer, CEPH_ENTITY_TYPE_MON); + auto& mc = pending_conns.emplace_back( + std::make_unique<Connection>(auth_registry, conn, &keyring)); + if (conn->get_peer_addr().is_msgr2()) { + return mc->authenticate_v2(); + } else { + return mc->authenticate_v1(monmap.get_epoch(), entity_name, want_keys) + .handle_exception([conn](auto ep) { + conn->mark_down(); + return seastar::make_exception_future<Connection::auth_result_t>(ep); + }); + } + }).then([peer, this](auto result) { + if (result == Connection::auth_result_t::success) { + _finish_auth(peer); + } + logger().debug("reopen_session mon connection attempts complete"); + }).handle_exception([](auto ep) { + logger().error("mon connections failed with ep {}", ep); + return seastar::make_exception_future(ep); + }); + }).then([this] { + if (!active_con) { + return seastar::make_exception_future( + crimson::common::system_shutdown_exception()); + } + return active_con->renew_rotating_keyring(); + }); +} + +void Client::_finish_auth(const entity_addr_t& peer) +{ + if (!is_hunting()) { + return; + } + logger().info("found mon.{}", monmap.get_name(peer)); + + auto found = std::find_if( + pending_conns.begin(), pending_conns.end(), + [peer](auto& conn) { + return conn->is_my_peer(peer); + }); + if (found == pending_conns.end()) { + // Happens if another connection has won the race + ceph_assert(active_con && pending_conns.empty()); + logger().info("no pending connection for mon.{}, peer {}", + monmap.get_name(peer), peer); + return; + } + + ceph_assert(!active_con && !pending_conns.empty()); + active_con = std::move(*found); + found->reset(); + for (auto& conn : pending_conns) { + if (conn) { + conn->close(); + } + } + pending_conns.clear(); +} + +Client::command_result_t +Client::run_command(const std::vector<std::string>& cmd, + const bufferlist& bl) +{ + auto m = 
make_message<MMonCommand>(monmap.fsid); + auto tid = ++last_mon_command_id; + m->set_tid(tid); + m->cmd = cmd; + m->set_data(bl); + auto& req = mon_commands[tid]; + return send_message(m).then([&req] { + return req.get_future(); + }); +} + +seastar::future<> Client::send_message(MessageRef m) +{ + if (active_con) { + if (!pending_messages.empty()) { + send_pendings(); + } + return active_con->get_conn()->send(m); + } + auto& delayed = pending_messages.emplace_back(m); + return delayed.pr.get_future(); +} + +void Client::send_pendings() +{ + if (active_con) { + for (auto& m : pending_messages) { + (void) active_con->get_conn()->send(m.msg); + m.pr.set_value(); + } + pending_messages.clear(); + } +} + +bool Client::sub_want(const std::string& what, version_t start, unsigned flags) +{ + return sub.want(what, start, flags); +} + +void Client::sub_got(const std::string& what, version_t have) +{ + sub.got(what, have); +} + +void Client::sub_unwant(const std::string& what) +{ + sub.unwant(what); +} + +bool Client::sub_want_increment(const std::string& what, + version_t start, + unsigned flags) +{ + return sub.inc_want(what, start, flags); +} + +seastar::future<> Client::renew_subs() +{ + if (!sub.have_new()) { + logger().warn("{} - empty", __func__); + return seastar::now(); + } + logger().trace("{}", __func__); + + auto m = make_message<MMonSubscribe>(); + m->what = sub.get_subs(); + m->hostname = ceph_get_short_hostname(); + return send_message(m).then([this] { + sub.renewed(); + }); +} + +void Client::print(std::ostream& out) const +{ + out << "mon." << entity_name; +} + +} // namespace crimson::mon diff --git a/src/crimson/mon/MonClient.h b/src/crimson/mon/MonClient.h new file mode 100644 index 000000000..e7d2df863 --- /dev/null +++ b/src/crimson/mon/MonClient.h @@ -0,0 +1,183 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <memory> + +#include <seastar/core/future.hh> +#include <seastar/core/gate.hh> +#include <seastar/core/lowres_clock.hh> +#include <seastar/core/timer.hh> + +#include "auth/AuthRegistry.h" +#include "auth/KeyRing.h" +#include "common/ceph_context.h" + +#include "crimson/auth/AuthClient.h" +#include "crimson/auth/AuthServer.h" +#include "crimson/common/auth_handler.h" +#include "crimson/common/gated.h" +#include "crimson/net/Dispatcher.h" +#include "crimson/net/Fwd.h" + +#include "mon/MonMap.h" + +#include "mon/MonSub.h" + +template<typename Message> using Ref = boost::intrusive_ptr<Message>; +namespace crimson::net { + class Messenger; +} + +struct AuthAuthorizeHandler; +class MAuthReply; +struct MMonMap; +struct MMonSubscribeAck; +struct MMonGetVersionReply; +struct MMonCommandAck; +struct MLogAck; +struct MConfig; + +namespace crimson::mon { + +class Connection; + +class Client : public crimson::net::Dispatcher, + public crimson::auth::AuthClient, + public crimson::auth::AuthServer +{ + EntityName entity_name; + KeyRing keyring; + const uint32_t want_keys; + + MonMap monmap; + std::unique_ptr<Connection> active_con; + std::vector<std::unique_ptr<Connection>> pending_conns; + seastar::timer<seastar::lowres_clock> timer; + + crimson::net::Messenger& msgr; + + // commands + using get_version_t = seastar::future<std::tuple<version_t, version_t>>; + + ceph_tid_t last_version_req_id = 0; + std::map<ceph_tid_t, typename get_version_t::promise_type> version_reqs; + + ceph_tid_t last_mon_command_id = 0; + using command_result_t = + seastar::future<std::tuple<std::int32_t, string, ceph::bufferlist>>; 
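  // in-flight mon commands keyed by transaction id; the promise for a tid is
  // fulfilled (and the entry erased) when the matching MMonCommandAck arrives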
+ std::map<ceph_tid_t, typename command_result_t::promise_type> mon_commands; + + MonSub sub; + +public: + Client(crimson::net::Messenger&, crimson::common::AuthHandler&); + Client(Client&&); + ~Client(); + seastar::future<> start(); + seastar::future<> stop(); + + const uuid_d& get_fsid() const { + return monmap.fsid; + } + get_version_t get_version(const std::string& map); + command_result_t run_command(const std::vector<std::string>& cmd, + const bufferlist& bl); + seastar::future<> send_message(MessageRef); + bool sub_want(const std::string& what, version_t start, unsigned flags); + void sub_got(const std::string& what, version_t have); + void sub_unwant(const std::string& what); + bool sub_want_increment(const std::string& what, version_t start, unsigned flags); + seastar::future<> renew_subs(); + + void print(std::ostream&) const; +private: + // AuthServer methods + std::pair<std::vector<uint32_t>, std::vector<uint32_t>> + get_supported_auth_methods(int peer_type) final; + uint32_t pick_con_mode(int peer_type, + uint32_t auth_method, + const std::vector<uint32_t>& preferred_modes) final; + AuthAuthorizeHandler* get_auth_authorize_handler(int peer_type, + int auth_method) final; + int handle_auth_request(crimson::net::ConnectionRef conn, + AuthConnectionMetaRef auth_meta, + bool more, + uint32_t auth_method, + const ceph::bufferlist& payload, + ceph::bufferlist *reply) final; + + crimson::common::CephContext cct; // for auth_registry + AuthRegistry auth_registry; + crimson::common::AuthHandler& auth_handler; + + // AuthClient methods + crimson::auth::AuthClient::auth_request_t + get_auth_request(crimson::net::ConnectionRef conn, + AuthConnectionMetaRef auth_meta) final; + + // Handle server's request to continue the handshake + ceph::bufferlist handle_auth_reply_more(crimson::net::ConnectionRef conn, + AuthConnectionMetaRef auth_meta, + const bufferlist& bl) final; + + // Handle server's indication that authentication succeeded + int handle_auth_done(crimson::net::ConnectionRef conn, + AuthConnectionMetaRef auth_meta, + uint64_t global_id, + uint32_t con_mode, + const bufferlist& bl) final; + + // Handle server's indication that the previous auth attempt failed + int handle_auth_bad_method(crimson::net::ConnectionRef conn, + AuthConnectionMetaRef auth_meta, + uint32_t old_auth_method, + int result, + const std::vector<uint32_t>& allowed_methods, + const std::vector<uint32_t>& allowed_modes) final; + +private: + void tick(); + + std::optional<seastar::future<>> ms_dispatch(crimson::net::ConnectionRef conn, + MessageRef m) override; + void ms_handle_reset(crimson::net::ConnectionRef conn, bool is_replace) override; + + seastar::future<> handle_monmap(crimson::net::ConnectionRef conn, + Ref<MMonMap> m); + seastar::future<> handle_auth_reply(crimson::net::ConnectionRef conn, + Ref<MAuthReply> m); + seastar::future<> handle_subscribe_ack(Ref<MMonSubscribeAck> m); + seastar::future<> handle_get_version_reply(Ref<MMonGetVersionReply> m); + seastar::future<> handle_mon_command_ack(Ref<MMonCommandAck> m); + seastar::future<> handle_log_ack(Ref<MLogAck> m); + seastar::future<> handle_config(Ref<MConfig> m); + + void send_pendings(); +private: + seastar::future<> load_keyring(); + seastar::future<> authenticate(); + + bool is_hunting() const; + seastar::future<> reopen_session(int rank); + std::vector<unsigned> get_random_mons(unsigned n) const; + seastar::future<> _add_conn(unsigned rank, uint64_t global_id); + void _finish_auth(const entity_addr_t& peer); + crimson::common::Gated gate; + + // 
messages that are waiting for the active_con to be available + struct pending_msg_t { + pending_msg_t(MessageRef& m) : msg(m) {} + MessageRef msg; + seastar::promise<> pr; + }; + std::deque<pending_msg_t> pending_messages; +}; + +inline std::ostream& operator<<(std::ostream& out, const Client& client) { + client.print(out); + return out; +} + +} // namespace crimson::mon diff --git a/src/crimson/net/Connection.h b/src/crimson/net/Connection.h new file mode 100644 index 000000000..6af12692e --- /dev/null +++ b/src/crimson/net/Connection.h @@ -0,0 +1,175 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2017 Red Hat, Inc + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#pragma once + +#include <queue> +#include <seastar/core/future.hh> +#include <seastar/core/shared_ptr.hh> + +#include "Fwd.h" + +namespace crimson::net { + +#ifdef UNIT_TESTS_BUILT +class Interceptor; +#endif + +using seq_num_t = uint64_t; + +class Connection : public seastar::enable_shared_from_this<Connection> { + entity_name_t peer_name = {0, entity_name_t::NEW}; + + protected: + entity_addr_t peer_addr; + + // which of the peer_addrs we're connecting to (as client) + // or should reconnect to (as peer) + entity_addr_t target_addr; + + using clock_t = seastar::lowres_system_clock; + clock_t::time_point last_keepalive; + clock_t::time_point last_keepalive_ack; + + void set_peer_type(entity_type_t peer_type) { + // it is not allowed to assign an unknown value when the current + // value is known + assert(!(peer_type == 0 && + peer_name.type() != 0)); + // it is not allowed to assign a different known value when the + // current value is also known. + assert(!(peer_type != 0 && + peer_name.type() != 0 && + peer_type != peer_name.type())); + peer_name._type = peer_type; + } + void set_peer_id(int64_t peer_id) { + // it is not allowed to assign an unknown value when the current + // value is known + assert(!(peer_id == entity_name_t::NEW && + peer_name.num() != entity_name_t::NEW)); + // it is not allowed to assign a different known value when the + // current value is also known. 
+ assert(!(peer_id != entity_name_t::NEW && + peer_name.num() != entity_name_t::NEW && + peer_id != peer_name.num())); + peer_name._num = peer_id; + } + void set_peer_name(entity_name_t name) { + set_peer_type(name.type()); + set_peer_id(name.num()); + } + + public: + uint64_t peer_global_id = 0; + + protected: + uint64_t features = 0; + + public: + void set_features(uint64_t new_features) { + features = new_features; + } + auto get_features() const { + return features; + } + bool has_feature(uint64_t f) const { + return features & f; + } + + public: + Connection() {} + virtual ~Connection() {} + +#ifdef UNIT_TESTS_BUILT + Interceptor *interceptor = nullptr; +#endif + + virtual Messenger* get_messenger() const = 0; + const entity_addr_t& get_peer_addr() const { return peer_addr; } + const entity_addrvec_t get_peer_addrs() const { + return entity_addrvec_t(peer_addr); + } + const auto& get_peer_socket_addr() const { + return target_addr; + } + const entity_name_t& get_peer_name() const { return peer_name; } + entity_type_t get_peer_type() const { return peer_name.type(); } + int64_t get_peer_id() const { return peer_name.num(); } + + bool peer_is_mon() const { return peer_name.is_mon(); } + bool peer_is_mgr() const { return peer_name.is_mgr(); } + bool peer_is_mds() const { return peer_name.is_mds(); } + bool peer_is_osd() const { return peer_name.is_osd(); } + bool peer_is_client() const { return peer_name.is_client(); } + + /// true if the handshake has completed and no errors have been encountered + virtual bool is_connected() const = 0; + +#ifdef UNIT_TESTS_BUILT + virtual bool is_closed() const = 0; + + virtual bool is_closed_clean() const = 0; + + virtual bool peer_wins() const = 0; +#endif + + /// send a message over a connection that has completed its handshake + virtual seastar::future<> send(MessageRef msg) = 0; + + /// send a keepalive message over a connection that has completed its + /// handshake + virtual seastar::future<> keepalive() = 0; + + // close the connection and cancel any any pending futures from read/send, + // without dispatching any reset event + virtual void mark_down() = 0; + + virtual void print(ostream& out) const = 0; + + void set_last_keepalive(clock_t::time_point when) { + last_keepalive = when; + } + void set_last_keepalive_ack(clock_t::time_point when) { + last_keepalive_ack = when; + } + auto get_last_keepalive() const { return last_keepalive; } + auto get_last_keepalive_ack() const { return last_keepalive_ack; } + + struct user_private_t { + virtual ~user_private_t() = default; + }; +private: + unique_ptr<user_private_t> user_private; +public: + bool has_user_private() const { + return user_private != nullptr; + } + void set_user_private(unique_ptr<user_private_t> new_user_private) { + user_private = std::move(new_user_private); + } + user_private_t &get_user_private() { + ceph_assert(user_private); + return *user_private; + } +}; + +inline ostream& operator<<(ostream& out, const Connection& conn) { + out << "["; + conn.print(out); + out << "]"; + return out; +} + +} // namespace crimson::net diff --git a/src/crimson/net/Dispatcher.h b/src/crimson/net/Dispatcher.h new file mode 100644 index 000000000..cc6fd4574 --- /dev/null +++ b/src/crimson/net/Dispatcher.h @@ -0,0 +1,46 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2017 Red Hat, Inc + * + * This is free software; you can redistribute it and/or + * modify it under the terms 
of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#pragma once + +#include "Fwd.h" + +class AuthAuthorizer; + +namespace crimson::net { + +class Dispatcher { + public: + virtual ~Dispatcher() {} + + // Dispatchers are put into a chain as described by chain-of-responsibility + // pattern. If any of the dispatchers claims this message, it returns a valid + // future to prevent other dispatchers from processing it, and this is also + // used to throttle the connection if it's too busy. + virtual std::optional<seastar::future<>> ms_dispatch(ConnectionRef, MessageRef) = 0; + + virtual void ms_handle_accept(ConnectionRef conn) {} + + virtual void ms_handle_connect(ConnectionRef conn) {} + + // a reset event is dispatched when the connection is closed unexpectedly. + // is_replace=true means the reset connection is going to be replaced by + // another accepting connection with the same peer_addr, which currently only + // happens under lossy policy when both sides wish to connect to each other. + virtual void ms_handle_reset(ConnectionRef conn, bool is_replace) {} + + virtual void ms_handle_remote_reset(ConnectionRef conn) {} +}; + +} // namespace crimson::net diff --git a/src/crimson/net/Errors.cc b/src/crimson/net/Errors.cc new file mode 100644 index 000000000..d07c090db --- /dev/null +++ b/src/crimson/net/Errors.cc @@ -0,0 +1,51 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2017 Red Hat, Inc + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "Errors.h" + +namespace crimson::net { + +const std::error_category& net_category() +{ + struct category : public std::error_category { + const char* name() const noexcept override { + return "crimson::net"; + } + + std::string message(int ev) const override { + switch (static_cast<error>(ev)) { + case error::success: + return "success"; + case error::bad_connect_banner: + return "bad connect banner"; + case error::bad_peer_address: + return "bad peer address"; + case error::negotiation_failure: + return "negotiation failure"; + case error::read_eof: + return "read eof"; + case error::corrupted_message: + return "corrupted message"; + case error::protocol_aborted: + return "protocol aborted"; + default: + return "unknown"; + } + } + }; + static category instance; + return instance; +} + +} // namespace crimson::net diff --git a/src/crimson/net/Errors.h b/src/crimson/net/Errors.h new file mode 100644 index 000000000..3a17a103a --- /dev/null +++ b/src/crimson/net/Errors.h @@ -0,0 +1,53 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2017 Red Hat, Inc + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#pragma once + +#include <system_error> + +namespace crimson::net { + +/// net error codes +enum class error { + success = 0, + bad_connect_banner, + bad_peer_address, + negotiation_failure, + read_eof, + corrupted_message, + protocol_aborted, +}; + +/// net error category +const std::error_category& net_category(); + +inline std::error_code make_error_code(error e) +{ + return {static_cast<int>(e), net_category()}; +} + +inline std::error_condition make_error_condition(error e) +{ + return {static_cast<int>(e), net_category()}; +} + +} // namespace crimson::net + +namespace std { + +/// enables implicit conversion to std::error_condition +template <> +struct is_error_condition_enum<crimson::net::error> : public true_type {}; + +} // namespace std diff --git a/src/crimson/net/Fwd.h b/src/crimson/net/Fwd.h new file mode 100644 index 000000000..e10120571 --- /dev/null +++ b/src/crimson/net/Fwd.h @@ -0,0 +1,50 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2017 Red Hat, Inc + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#pragma once + +#include <boost/container/small_vector.hpp> +#include <seastar/core/future.hh> +#include <seastar/core/future-util.hh> +#include <seastar/core/shared_ptr.hh> +#include <seastar/core/sharded.hh> + +#include "msg/Connection.h" +#include "msg/MessageRef.h" +#include "msg/msg_types.h" + +#include "crimson/common/errorator.h" + +using auth_proto_t = int; + +class AuthConnectionMeta; +using AuthConnectionMetaRef = seastar::lw_shared_ptr<AuthConnectionMeta>; + +namespace crimson::net { + +using msgr_tag_t = uint8_t; +using stop_t = seastar::stop_iteration; + +class Connection; +using ConnectionRef = seastar::shared_ptr<Connection>; + +class Dispatcher; +class ChainedDispatchers; +constexpr std::size_t NUM_DISPATCHERS = 4u; +using dispatchers_t = boost::container::small_vector<Dispatcher*, NUM_DISPATCHERS>; + +class Messenger; +using MessengerRef = seastar::shared_ptr<Messenger>; + +} // namespace crimson::net diff --git a/src/crimson/net/Interceptor.h b/src/crimson/net/Interceptor.h new file mode 100644 index 000000000..dfa2183ec --- /dev/null +++ b/src/crimson/net/Interceptor.h @@ -0,0 +1,165 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <variant> +#include <seastar/core/sharded.hh> +#include <seastar/core/sleep.hh> + +#include "Fwd.h" +#include "msg/async/frames_v2.h" + +namespace crimson::net { + +enum class custom_bp_t : uint8_t { + BANNER_WRITE = 0, + BANNER_READ, + BANNER_PAYLOAD_READ, + SOCKET_CONNECTING, + SOCKET_ACCEPTED +}; +inline const char* get_bp_name(custom_bp_t bp) { + uint8_t index = static_cast<uint8_t>(bp); + static const char *const bp_names[] = {"BANNER_WRITE", + "BANNER_READ", + "BANNER_PAYLOAD_READ", + "SOCKET_CONNECTING", + "SOCKET_ACCEPTED"}; + assert(index < std::size(bp_names)); + return bp_names[index]; +} + +enum class bp_type_t { + READ = 0, + WRITE +}; + +enum class bp_action_t { + CONTINUE = 0, + FAULT, + BLOCK, + STALL +}; + +inline std::ostream& operator<<(std::ostream& out, const bp_action_t& action) { + static const char *const action_names[] = {"CONTINUE", + "FAULT", + "BLOCK", + "STALL"}; + assert(static_cast<size_t>(action) < 
std::size(action_names)); + return out << action_names[static_cast<size_t>(action)]; +} + +class socket_blocker { + std::optional<seastar::abort_source> p_blocked; + std::optional<seastar::abort_source> p_unblocked; + + public: + seastar::future<> wait_blocked() { + ceph_assert(!p_blocked); + if (p_unblocked) { + return seastar::make_ready_future<>(); + } else { + p_blocked = seastar::abort_source(); + return seastar::sleep_abortable(10s, *p_blocked).then([] { + throw std::runtime_error( + "Timeout (10s) in socket_blocker::wait_blocked()"); + }).handle_exception_type([] (const seastar::sleep_aborted& e) { + // wait done! + }); + } + } + + seastar::future<> block() { + if (p_blocked) { + p_blocked->request_abort(); + p_blocked = std::nullopt; + } + ceph_assert(!p_unblocked); + p_unblocked = seastar::abort_source(); + return seastar::sleep_abortable(10s, *p_unblocked).then([] { + ceph_abort("Timeout (10s) in socket_blocker::block()"); + }).handle_exception_type([] (const seastar::sleep_aborted& e) { + // wait done! + }); + } + + void unblock() { + ceph_assert(!p_blocked); + ceph_assert(p_unblocked); + p_unblocked->request_abort(); + p_unblocked = std::nullopt; + } +}; + +struct tag_bp_t { + ceph::msgr::v2::Tag tag; + bp_type_t type; + bool operator==(const tag_bp_t& x) const { + return tag == x.tag && type == x.type; + } + bool operator!=(const tag_bp_t& x) const { return !operator==(x); } + bool operator<(const tag_bp_t& x) const { + return std::tie(tag, type) < std::tie(x.tag, x.type); + } +}; + +struct Breakpoint { + using var_t = std::variant<custom_bp_t, tag_bp_t>; + var_t bp; + Breakpoint(custom_bp_t bp) : bp(bp) { } + Breakpoint(ceph::msgr::v2::Tag tag, bp_type_t type) + : bp(tag_bp_t{tag, type}) { } + bool operator==(const Breakpoint& x) const { return bp == x.bp; } + bool operator!=(const Breakpoint& x) const { return !operator==(x); } + bool operator==(const custom_bp_t& x) const { return bp == var_t(x); } + bool operator!=(const custom_bp_t& x) const { return !operator==(x); } + bool operator==(const tag_bp_t& x) const { return bp == var_t(x); } + bool operator!=(const tag_bp_t& x) const { return !operator==(x); } + bool operator<(const Breakpoint& x) const { return bp < x.bp; } +}; + +inline std::ostream& operator<<(std::ostream& out, const Breakpoint& bp) { + if (auto custom_bp = std::get_if<custom_bp_t>(&bp.bp)) { + return out << get_bp_name(*custom_bp); + } else { + auto tag_bp = std::get<tag_bp_t>(bp.bp); + static const char *const tag_names[] = {"NONE", + "HELLO", + "AUTH_REQUEST", + "AUTH_BAD_METHOD", + "AUTH_REPLY_MORE", + "AUTH_REQUEST_MORE", + "AUTH_DONE", + "AUTH_SIGNATURE", + "CLIENT_IDENT", + "SERVER_IDENT", + "IDENT_MISSING_FEATURES", + "SESSION_RECONNECT", + "SESSION_RESET", + "SESSION_RETRY", + "SESSION_RETRY_GLOBAL", + "SESSION_RECONNECT_OK", + "WAIT", + "MESSAGE", + "KEEPALIVE2", + "KEEPALIVE2_ACK", + "ACK"}; + assert(static_cast<size_t>(tag_bp.tag) < std::size(tag_names)); + return out << tag_names[static_cast<size_t>(tag_bp.tag)] + << (tag_bp.type == bp_type_t::WRITE ? 
"_WRITE" : "_READ"); + } +} + +struct Interceptor { + socket_blocker blocker; + virtual ~Interceptor() {} + virtual void register_conn(Connection& conn) = 0; + virtual void register_conn_ready(Connection& conn) = 0; + virtual void register_conn_closed(Connection& conn) = 0; + virtual void register_conn_replaced(Connection& conn) = 0; + virtual bp_action_t intercept(Connection& conn, Breakpoint bp) = 0; +}; + +} // namespace crimson::net diff --git a/src/crimson/net/Messenger.cc b/src/crimson/net/Messenger.cc new file mode 100644 index 000000000..aab476f7a --- /dev/null +++ b/src/crimson/net/Messenger.cc @@ -0,0 +1,17 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "Messenger.h" +#include "SocketMessenger.h" + +namespace crimson::net { + +MessengerRef +Messenger::create(const entity_name_t& name, + const std::string& lname, + const uint64_t nonce) +{ + return seastar::make_shared<SocketMessenger>(name, lname, nonce); +} + +} // namespace crimson::net diff --git a/src/crimson/net/Messenger.h b/src/crimson/net/Messenger.h new file mode 100644 index 000000000..2b39fbf63 --- /dev/null +++ b/src/crimson/net/Messenger.h @@ -0,0 +1,154 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2017 Red Hat, Inc + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#pragma once + +#include "Fwd.h" +#include "crimson/common/throttle.h" +#include "msg/Message.h" +#include "msg/Policy.h" + +class AuthAuthorizer; + +namespace crimson::auth { +class AuthClient; +class AuthServer; +} + +namespace crimson::net { + +#ifdef UNIT_TESTS_BUILT +class Interceptor; +#endif + +using Throttle = crimson::common::Throttle; +using SocketPolicy = ceph::net::Policy<Throttle>; + +class Messenger { + entity_name_t my_name; + entity_addrvec_t my_addrs; + uint32_t crc_flags = 0; + crimson::auth::AuthClient* auth_client = nullptr; + crimson::auth::AuthServer* auth_server = nullptr; + bool require_authorizer = true; + +public: + Messenger(const entity_name_t& name) + : my_name(name) + {} + virtual ~Messenger() {} + +#ifdef UNIT_TESTS_BUILT + Interceptor *interceptor = nullptr; +#endif + + entity_type_t get_mytype() const { return my_name.type(); } + const entity_name_t& get_myname() const { return my_name; } + const entity_addrvec_t& get_myaddrs() const { return my_addrs; } + entity_addr_t get_myaddr() const { return my_addrs.front(); } + virtual seastar::future<> set_myaddrs(const entity_addrvec_t& addrs) { + my_addrs = addrs; + return seastar::now(); + } + + using bind_ertr = crimson::errorator< + crimson::ct_error::address_in_use // The address (range) is already bound + >; + /// bind to the given address + virtual bind_ertr::future<> bind(const entity_addrvec_t& addr) = 0; + + /// try to bind to the first unused port of given address + virtual bind_ertr::future<> try_bind(const entity_addrvec_t& addr, + uint32_t min_port, uint32_t max_port) = 0; + + /// start the messenger + virtual seastar::future<> start(const dispatchers_t&) = 0; + + /// either return an existing connection to the peer, + /// or a new pending connection + virtual ConnectionRef + connect(const entity_addr_t& peer_addr, + const entity_name_t& peer_name) = 0; + + ConnectionRef + connect(const 
entity_addr_t& peer_addr, + const entity_type_t& peer_type) { + return connect(peer_addr, entity_name_t(peer_type, -1)); + } + + // wait for messenger shutdown + virtual seastar::future<> wait() = 0; + + // stop dispatching events and messages + virtual void stop() = 0; + + virtual bool is_started() const = 0; + + // free internal resources before destruction, must be called after stopped, + // and must be called if is bound. + virtual seastar::future<> shutdown() = 0; + + uint32_t get_crc_flags() const { + return crc_flags; + } + void set_crc_data() { + crc_flags |= MSG_CRC_DATA; + } + void set_crc_header() { + crc_flags |= MSG_CRC_HEADER; + } + + crimson::auth::AuthClient* get_auth_client() const { return auth_client; } + void set_auth_client(crimson::auth::AuthClient *ac) { + auth_client = ac; + } + crimson::auth::AuthServer* get_auth_server() const { return auth_server; } + void set_auth_server(crimson::auth::AuthServer *as) { + auth_server = as; + } + + virtual void print(ostream& out) const = 0; + + virtual SocketPolicy get_policy(entity_type_t peer_type) const = 0; + + virtual SocketPolicy get_default_policy() const = 0; + + virtual void set_default_policy(const SocketPolicy& p) = 0; + + virtual void set_policy(entity_type_t peer_type, const SocketPolicy& p) = 0; + + virtual void set_policy_throttler(entity_type_t peer_type, Throttle* throttle) = 0; + + // allow unauthenticated connections. This is needed for compatibility with + // pre-nautilus OSDs, which do not authenticate the heartbeat sessions. + bool get_require_authorizer() const { + return require_authorizer; + } + void set_require_authorizer(bool r) { + require_authorizer = r; + } + static MessengerRef + create(const entity_name_t& name, + const std::string& lname, + const uint64_t nonce); +}; + +inline ostream& operator<<(ostream& out, const Messenger& msgr) { + out << "["; + msgr.print(out); + out << "]"; + return out; +} + +} // namespace crimson::net diff --git a/src/crimson/net/Protocol.cc b/src/crimson/net/Protocol.cc new file mode 100644 index 000000000..50b5c45a3 --- /dev/null +++ b/src/crimson/net/Protocol.cc @@ -0,0 +1,323 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "Protocol.h" + +#include "auth/Auth.h" + +#include "crimson/common/log.h" +#include "crimson/net/Errors.h" +#include "crimson/net/chained_dispatchers.h" +#include "crimson/net/Socket.h" +#include "crimson/net/SocketConnection.h" +#include "msg/Message.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_ms); + } +} + +namespace crimson::net { + +Protocol::Protocol(proto_t type, + ChainedDispatchers& dispatchers, + SocketConnection& conn) + : proto_type(type), + dispatchers(dispatchers), + conn(conn), + auth_meta{seastar::make_lw_shared<AuthConnectionMeta>()} +{} + +Protocol::~Protocol() +{ + ceph_assert(gate.is_closed()); + assert(!exit_open); +} + +void Protocol::close(bool dispatch_reset, + std::optional<std::function<void()>> f_accept_new) +{ + if (closed) { + // already closing + return; + } + + bool is_replace = f_accept_new ? true : false; + logger().info("{} closing: reset {}, replace {}", conn, + dispatch_reset ? "yes" : "no", + is_replace ? 
"yes" : "no"); + + // atomic operations + closed = true; + trigger_close(); + if (f_accept_new) { + (*f_accept_new)(); + } + if (socket) { + socket->shutdown(); + } + set_write_state(write_state_t::drop); + assert(!gate.is_closed()); + auto gate_closed = gate.close(); + + if (dispatch_reset) { + dispatchers.ms_handle_reset( + seastar::static_pointer_cast<SocketConnection>(conn.shared_from_this()), + is_replace); + } + + // asynchronous operations + assert(!close_ready.valid()); + close_ready = std::move(gate_closed).then([this] { + if (socket) { + return socket->close(); + } else { + return seastar::now(); + } + }).then([this] { + logger().debug("{} closed!", conn); + on_closed(); +#ifdef UNIT_TESTS_BUILT + is_closed_clean = true; + if (conn.interceptor) { + conn.interceptor->register_conn_closed(conn); + } +#endif + }).handle_exception([conn_ref = conn.shared_from_this(), this] (auto eptr) { + logger().error("{} closing: close_ready got unexpected exception {}", conn, eptr); + ceph_abort(); + }); +} + +seastar::future<> Protocol::send(MessageRef msg) +{ + if (write_state != write_state_t::drop) { + conn.out_q.push_back(std::move(msg)); + write_event(); + } + return seastar::now(); +} + +seastar::future<> Protocol::keepalive() +{ + if (!need_keepalive) { + need_keepalive = true; + write_event(); + } + return seastar::now(); +} + +void Protocol::notify_keepalive_ack(utime_t _keepalive_ack) +{ + logger().trace("{} got keepalive ack {}", conn, _keepalive_ack); + keepalive_ack = _keepalive_ack; + write_event(); +} + +void Protocol::notify_ack() +{ + if (!conn.policy.lossy) { + ++ack_left; + write_event(); + } +} + +void Protocol::requeue_sent() +{ + assert(write_state != write_state_t::open); + if (conn.sent.empty()) { + return; + } + + conn.out_seq -= conn.sent.size(); + logger().debug("{} requeue {} items, revert out_seq to {}", + conn, conn.sent.size(), conn.out_seq); + for (MessageRef& msg : conn.sent) { + msg->clear_payload(); + msg->set_seq(0); + } + conn.out_q.insert(conn.out_q.begin(), + std::make_move_iterator(conn.sent.begin()), + std::make_move_iterator(conn.sent.end())); + conn.sent.clear(); + write_event(); +} + +void Protocol::requeue_up_to(seq_num_t seq) +{ + assert(write_state != write_state_t::open); + if (conn.sent.empty() && conn.out_q.empty()) { + logger().debug("{} nothing to requeue, reset out_seq from {} to seq {}", + conn, conn.out_seq, seq); + conn.out_seq = seq; + return; + } + logger().debug("{} discarding sent items by seq {} (sent_len={}, out_seq={})", + conn, seq, conn.sent.size(), conn.out_seq); + while (!conn.sent.empty()) { + auto cur_seq = conn.sent.front()->get_seq(); + if (cur_seq == 0 || cur_seq > seq) { + break; + } else { + conn.sent.pop_front(); + } + } + requeue_sent(); +} + +void Protocol::reset_write() +{ + assert(write_state != write_state_t::open); + conn.out_seq = 0; + conn.out_q.clear(); + conn.sent.clear(); + need_keepalive = false; + keepalive_ack = std::nullopt; + ack_left = 0; +} + +void Protocol::ack_writes(seq_num_t seq) +{ + if (conn.policy.lossy) { // lossy connections don't keep sent messages + return; + } + while (!conn.sent.empty() && conn.sent.front()->get_seq() <= seq) { + logger().trace("{} got ack seq {} >= {}, pop {}", + conn, seq, conn.sent.front()->get_seq(), conn.sent.front()); + conn.sent.pop_front(); + } +} + +seastar::future<stop_t> Protocol::try_exit_sweep() { + assert(!is_queued()); + return socket->flush().then([this] { + if (!is_queued()) { + // still nothing pending to send after flush, + // the dispatching can ONLY stop 
now + ceph_assert(write_dispatching); + write_dispatching = false; + if (unlikely(exit_open.has_value())) { + exit_open->set_value(); + exit_open = std::nullopt; + logger().info("{} write_event: nothing queued at {}," + " set exit_open", + conn, get_state_name(write_state)); + } + return seastar::make_ready_future<stop_t>(stop_t::yes); + } else { + // something is pending to send during flushing + return seastar::make_ready_future<stop_t>(stop_t::no); + } + }); +} + +seastar::future<> Protocol::do_write_dispatch_sweep() +{ + return seastar::repeat([this] { + switch (write_state) { + case write_state_t::open: { + size_t num_msgs = conn.out_q.size(); + bool still_queued = is_queued(); + if (unlikely(!still_queued)) { + return try_exit_sweep(); + } + conn.pending_q.clear(); + conn.pending_q.swap(conn.out_q); + if (!conn.policy.lossy) { + conn.sent.insert(conn.sent.end(), + conn.pending_q.begin(), + conn.pending_q.end()); + } + auto acked = ack_left; + assert(acked == 0 || conn.in_seq > 0); + // sweep all pending writes with the concrete Protocol + return socket->write(do_sweep_messages( + conn.pending_q, num_msgs, need_keepalive, keepalive_ack, acked > 0) + ).then([this, prv_keepalive_ack=keepalive_ack, acked] { + need_keepalive = false; + if (keepalive_ack == prv_keepalive_ack) { + keepalive_ack = std::nullopt; + } + assert(ack_left >= acked); + ack_left -= acked; + if (!is_queued()) { + return try_exit_sweep(); + } else { + // messages were enqueued during socket write + return seastar::make_ready_future<stop_t>(stop_t::no); + } + }); + } + case write_state_t::delay: + // delay dispatching writes until open + if (exit_open) { + exit_open->set_value(); + exit_open = std::nullopt; + logger().info("{} write_event: delay and set exit_open ...", conn); + } else { + logger().info("{} write_event: delay ...", conn); + } + return state_changed.get_shared_future() + .then([] { return stop_t::no; }); + case write_state_t::drop: + ceph_assert(write_dispatching); + write_dispatching = false; + if (exit_open) { + exit_open->set_value(); + exit_open = std::nullopt; + logger().info("{} write_event: dropped and set exit_open", conn); + } else { + logger().info("{} write_event: dropped", conn); + } + return seastar::make_ready_future<stop_t>(stop_t::yes); + default: + ceph_assert(false); + } + }).handle_exception_type([this] (const std::system_error& e) { + if (e.code() != std::errc::broken_pipe && + e.code() != std::errc::connection_reset && + e.code() != error::negotiation_failure) { + logger().error("{} write_event(): unexpected error at {} -- {}", + conn, get_state_name(write_state), e); + ceph_abort(); + } + socket->shutdown(); + if (write_state == write_state_t::open) { + logger().info("{} write_event(): fault at {}, going to delay -- {}", + conn, get_state_name(write_state), e); + write_state = write_state_t::delay; + } else { + logger().info("{} write_event(): fault at {} -- {}", + conn, get_state_name(write_state), e); + } + return do_write_dispatch_sweep(); + }); +} + +void Protocol::write_event() +{ + notify_write(); + if (write_dispatching) { + // already dispatching + return; + } + write_dispatching = true; + switch (write_state) { + case write_state_t::open: + [[fallthrough]]; + case write_state_t::delay: + assert(!gate.is_closed()); + gate.dispatch_in_background("do_write_dispatch_sweep", *this, [this] { + return do_write_dispatch_sweep(); + }); + return; + case write_state_t::drop: + write_dispatching = false; + return; + default: + ceph_assert(false); + } +} + +} // namespace crimson::net 
diff --git a/src/crimson/net/Protocol.h b/src/crimson/net/Protocol.h new file mode 100644 index 000000000..dc4e4f2af --- /dev/null +++ b/src/crimson/net/Protocol.h @@ -0,0 +1,173 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <seastar/core/gate.hh> +#include <seastar/core/shared_future.hh> + +#include "crimson/common/gated.h" +#include "crimson/common/log.h" +#include "Fwd.h" +#include "SocketConnection.h" + +namespace crimson::net { + +class Protocol { + public: + enum class proto_t { + none, + v1, + v2 + }; + + Protocol(Protocol&&) = delete; + virtual ~Protocol(); + + virtual bool is_connected() const = 0; + +#ifdef UNIT_TESTS_BUILT + bool is_closed_clean = false; + bool is_closed() const { return closed; } +#endif + + // Reentrant closing + void close(bool dispatch_reset, std::optional<std::function<void()>> f_accept_new=std::nullopt); + seastar::future<> close_clean(bool dispatch_reset) { + close(dispatch_reset); + // it can happen if close_clean() is called inside Dispatcher::ms_handle_reset() + // which will otherwise result in deadlock + assert(close_ready.valid()); + return close_ready.get_future(); + } + + virtual void start_connect(const entity_addr_t& peer_addr, + const entity_name_t& peer_name) = 0; + + virtual void start_accept(SocketRef&& socket, + const entity_addr_t& peer_addr) = 0; + + virtual void print(std::ostream&) const = 0; + protected: + Protocol(proto_t type, + ChainedDispatchers& dispatchers, + SocketConnection& conn); + + virtual void trigger_close() = 0; + + virtual ceph::bufferlist do_sweep_messages( + const std::deque<MessageRef>& msgs, + size_t num_msgs, + bool require_keepalive, + std::optional<utime_t> keepalive_ack, + bool require_ack) = 0; + + virtual void notify_write() {}; + + virtual void on_closed() {} + + public: + const proto_t proto_type; + SocketRef socket; + + protected: + ChainedDispatchers& dispatchers; + SocketConnection &conn; + + AuthConnectionMetaRef auth_meta; + + private: + bool closed = false; + // become valid only after closed == true + seastar::shared_future<> close_ready; + +// the write state-machine + public: + seastar::future<> send(MessageRef msg); + seastar::future<> keepalive(); + +// TODO: encapsulate a SessionedSender class + protected: + // write_state is changed with state atomically, indicating the write + // behavior of the according state. 
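+  // A rough summary of how Protocol.cc uses each state:
+  // - none:  initial value before start_connect()/start_accept()
+  // - delay: writes are queued but not dispatched until the state
+  //          becomes open (used while connecting/accepting)
+  // - open:  do_write_dispatch_sweep() drains out_q, keepalives and acks
+  // - drop:  queued writes are discarded because the connection is closing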
+ enum class write_state_t : uint8_t { + none, + delay, + open, + drop + }; + + static const char* get_state_name(write_state_t state) { + uint8_t index = static_cast<uint8_t>(state); + static const char *const state_names[] = {"none", + "delay", + "open", + "drop"}; + assert(index < std::size(state_names)); + return state_names[index]; + } + + void set_write_state(const write_state_t& state) { + if (write_state == write_state_t::open && + state != write_state_t::open && + write_dispatching) { + exit_open = seastar::shared_promise<>(); + } + write_state = state; + state_changed.set_value(); + state_changed = seastar::shared_promise<>(); + } + + seastar::future<> wait_write_exit() { + if (exit_open) { + return exit_open->get_shared_future(); + } + return seastar::now(); + } + + void notify_keepalive_ack(utime_t keepalive_ack); + + void notify_ack(); + + void requeue_up_to(seq_num_t seq); + + void requeue_sent(); + + void reset_write(); + + bool is_queued() const { + return (!conn.out_q.empty() || + ack_left > 0 || + need_keepalive || + keepalive_ack.has_value()); + } + + void ack_writes(seq_num_t seq); + crimson::common::Gated gate; + + private: + write_state_t write_state = write_state_t::none; + // wait until current state changed + seastar::shared_promise<> state_changed; + + bool need_keepalive = false; + std::optional<utime_t> keepalive_ack = std::nullopt; + uint64_t ack_left = 0; + bool write_dispatching = false; + // If another continuation is trying to close or replace socket when + // write_dispatching is true and write_state is open, + // it needs to wait for exit_open until writing is stopped or failed. + std::optional<seastar::shared_promise<>> exit_open; + + seastar::future<stop_t> try_exit_sweep(); + seastar::future<> do_write_dispatch_sweep(); + void write_event(); +}; + +inline std::ostream& operator<<(std::ostream& out, const Protocol& proto) { + proto.print(out); + return out; +} + + +} // namespace crimson::net diff --git a/src/crimson/net/ProtocolV1.cc b/src/crimson/net/ProtocolV1.cc new file mode 100644 index 000000000..3c604240d --- /dev/null +++ b/src/crimson/net/ProtocolV1.cc @@ -0,0 +1,1014 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "ProtocolV1.h" + +#include <seastar/core/shared_future.hh> +#include <seastar/core/sleep.hh> +#include <seastar/net/packet.hh> + +#include "include/msgr.h" +#include "include/random.h" +#include "auth/Auth.h" +#include "auth/AuthSessionHandler.h" + +#include "crimson/auth/AuthClient.h" +#include "crimson/auth/AuthServer.h" +#include "crimson/common/log.h" +#include "chained_dispatchers.h" +#include "Errors.h" +#include "Socket.h" +#include "SocketConnection.h" +#include "SocketMessenger.h" + +WRITE_RAW_ENCODER(ceph_msg_connect); +WRITE_RAW_ENCODER(ceph_msg_connect_reply); + +using crimson::common::local_conf; + +std::ostream& operator<<(std::ostream& out, const ceph_msg_connect& c) +{ + return out << "connect{features=" << std::hex << c.features << std::dec + << " host_type=" << c.host_type + << " global_seq=" << c.global_seq + << " connect_seq=" << c.connect_seq + << " protocol_version=" << c.protocol_version + << " authorizer_protocol=" << c.authorizer_protocol + << " authorizer_len=" << c.authorizer_len + << " flags=" << std::hex << static_cast<uint16_t>(c.flags) << std::dec << '}'; +} + +std::ostream& operator<<(std::ostream& out, const ceph_msg_connect_reply& r) +{ + return out << "connect_reply{tag=" << static_cast<uint16_t>(r.tag) + << " features=" << std::hex 
<< r.features << std::dec + << " global_seq=" << r.global_seq + << " connect_seq=" << r.connect_seq + << " protocol_version=" << r.protocol_version + << " authorizer_len=" << r.authorizer_len + << " flags=" << std::hex << static_cast<uint16_t>(r.flags) << std::dec << '}'; +} + +namespace { + +seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_ms); +} + +template <typename T> +seastar::net::packet make_static_packet(const T& value) { + return { reinterpret_cast<const char*>(&value), sizeof(value) }; +} + +// store the banner in a non-const string for buffer::create_static() +char banner[] = CEPH_BANNER; +constexpr size_t banner_size = sizeof(CEPH_BANNER)-1; + +constexpr size_t client_header_size = banner_size + sizeof(ceph_entity_addr); +constexpr size_t server_header_size = banner_size + 2 * sizeof(ceph_entity_addr); + +// check that the buffer starts with a valid banner without requiring it to +// be contiguous in memory +void validate_banner(bufferlist::const_iterator& p) +{ + auto b = std::cbegin(banner); + auto end = b + banner_size; + while (b != end) { + const char *buf{nullptr}; + auto remaining = std::distance(b, end); + auto len = p.get_ptr_and_advance(remaining, &buf); + if (!std::equal(buf, buf + len, b)) { + throw std::system_error( + make_error_code(crimson::net::error::bad_connect_banner)); + } + b += len; + } +} + +// return a static bufferptr to the given object +template <typename T> +bufferptr create_static(T& obj) +{ + return buffer::create_static(sizeof(obj), reinterpret_cast<char*>(&obj)); +} + +uint32_t get_proto_version(entity_type_t peer_type, bool connect) +{ + constexpr entity_type_t my_type = CEPH_ENTITY_TYPE_OSD; + // see also OSD.h, unlike other connection of simple/async messenger, + // crimson msgr is only used by osd + constexpr uint32_t CEPH_OSD_PROTOCOL = 10; + if (peer_type == my_type) { + // internal + return CEPH_OSD_PROTOCOL; + } else { + // public + switch (connect ? peer_type : my_type) { + case CEPH_ENTITY_TYPE_OSD: return CEPH_OSDC_PROTOCOL; + case CEPH_ENTITY_TYPE_MDS: return CEPH_MDSC_PROTOCOL; + case CEPH_ENTITY_TYPE_MON: return CEPH_MONC_PROTOCOL; + default: return 0; + } + } +} + +void discard_up_to(std::deque<MessageRef>* queue, + crimson::net::seq_num_t seq) +{ + while (!queue->empty() && + queue->front()->get_seq() < seq) { + queue->pop_front(); + } +} + +} // namespace anonymous + +namespace crimson::net { + +ProtocolV1::ProtocolV1(ChainedDispatchers& dispatchers, + SocketConnection& conn, + SocketMessenger& messenger) + : Protocol(proto_t::v1, dispatchers, conn), messenger{messenger} {} + +ProtocolV1::~ProtocolV1() {} + +bool ProtocolV1::is_connected() const +{ + return state == state_t::open; +} + +// connecting state + +void ProtocolV1::reset_session() +{ + conn.out_q = {}; + conn.sent = {}; + conn.in_seq = 0; + h.connect_seq = 0; + if (HAVE_FEATURE(conn.features, MSG_AUTH)) { + // Set out_seq to a random value, so CRC won't be predictable. + // Constant to limit starting sequence number to 2^31. Nothing special + // about it, just a big number. + constexpr uint64_t SEQ_MASK = 0x7fffffff; + conn.out_seq = ceph::util::generate_random_number<uint64_t>(0, SEQ_MASK); + } else { + // previously, seq #'s always started at 0. 
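+    // (MSG_AUTH was not negotiated, so keep the legacy behavior)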
+ conn.out_seq = 0; + } +} + +seastar::future<stop_t> +ProtocolV1::handle_connect_reply(msgr_tag_t tag) +{ + if (h.auth_payload.length() && !conn.peer_is_mon()) { + if (tag == CEPH_MSGR_TAG_CHALLENGE_AUTHORIZER) { // more + h.auth_more = messenger.get_auth_client()->handle_auth_reply_more( + conn.shared_from_this(), auth_meta, h.auth_payload); + return seastar::make_ready_future<stop_t>(stop_t::no); + } else { + int ret = messenger.get_auth_client()->handle_auth_done( + conn.shared_from_this(), auth_meta, 0, 0, h.auth_payload); + if (ret < 0) { + // fault + logger().warn("{} AuthClient::handle_auth_done() return {}", conn, ret); + throw std::system_error(make_error_code(error::negotiation_failure)); + } + } + } + + switch (tag) { + case CEPH_MSGR_TAG_FEATURES: + logger().error("{} connect protocol feature mispatch", __func__); + throw std::system_error(make_error_code(error::negotiation_failure)); + case CEPH_MSGR_TAG_BADPROTOVER: + logger().error("{} connect protocol version mispatch", __func__); + throw std::system_error(make_error_code(error::negotiation_failure)); + case CEPH_MSGR_TAG_BADAUTHORIZER: + logger().error("{} got bad authorizer", __func__); + throw std::system_error(make_error_code(error::negotiation_failure)); + case CEPH_MSGR_TAG_RESETSESSION: + reset_session(); + return seastar::make_ready_future<stop_t>(stop_t::no); + case CEPH_MSGR_TAG_RETRY_GLOBAL: + return messenger.get_global_seq(h.reply.global_seq).then([this] (auto gs) { + h.global_seq = gs; + return seastar::make_ready_future<stop_t>(stop_t::no); + }); + case CEPH_MSGR_TAG_RETRY_SESSION: + ceph_assert(h.reply.connect_seq > h.connect_seq); + h.connect_seq = h.reply.connect_seq; + return seastar::make_ready_future<stop_t>(stop_t::no); + case CEPH_MSGR_TAG_WAIT: + // TODO: state wait + throw std::system_error(make_error_code(error::negotiation_failure)); + case CEPH_MSGR_TAG_SEQ: + case CEPH_MSGR_TAG_READY: + if (auto missing = (conn.policy.features_required & ~(uint64_t)h.reply.features); + missing) { + logger().error("{} missing required features", __func__); + throw std::system_error(make_error_code(error::negotiation_failure)); + } + return seastar::futurize_invoke([this, tag] { + if (tag == CEPH_MSGR_TAG_SEQ) { + return socket->read_exactly(sizeof(seq_num_t)) + .then([this] (auto buf) { + auto acked_seq = reinterpret_cast<const seq_num_t*>(buf.get()); + discard_up_to(&conn.out_q, *acked_seq); + return socket->write_flush(make_static_packet(conn.in_seq)); + }); + } + // tag CEPH_MSGR_TAG_READY + return seastar::now(); + }).then([this] { + // hooray! 
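+      // connection accepted: record the peer's global_seq, adopt the
+      // negotiated lossy flag and feature set, and, if an authorizer was
+      // exchanged, install the session security handler used for signing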
+ h.peer_global_seq = h.reply.global_seq; + conn.policy.lossy = h.reply.flags & CEPH_MSG_CONNECT_LOSSY; + h.connect_seq++; + h.backoff = 0ms; + conn.set_features(h.reply.features & h.connect.features); + if (auth_meta->authorizer) { + session_security.reset( + get_auth_session_handler(nullptr, + auth_meta->authorizer->protocol, + auth_meta->session_key, + conn.features)); + } else { + session_security.reset(); + } + return seastar::make_ready_future<stop_t>(stop_t::yes); + }); + break; + default: + // unknown tag + logger().error("{} got unknown tag", __func__, int(tag)); + throw std::system_error(make_error_code(error::negotiation_failure)); + } +} + +ceph::bufferlist ProtocolV1::get_auth_payload() +{ + // only non-mons connectings to mons use MAuth messages + if (conn.peer_is_mon() && + messenger.get_mytype() != CEPH_ENTITY_TYPE_MON) { + return {}; + } else { + if (h.auth_more.length()) { + logger().info("using augmented (challenge) auth payload"); + return std::move(h.auth_more); + } else { + auto [auth_method, preferred_modes, auth_bl] = + messenger.get_auth_client()->get_auth_request( + conn.shared_from_this(), auth_meta); + auth_meta->auth_method = auth_method; + return auth_bl; + } + } +} + +seastar::future<stop_t> +ProtocolV1::repeat_connect() +{ + // encode ceph_msg_connect + memset(&h.connect, 0, sizeof(h.connect)); + h.connect.features = conn.policy.features_supported; + h.connect.host_type = messenger.get_myname().type(); + h.connect.global_seq = h.global_seq; + h.connect.connect_seq = h.connect_seq; + h.connect.protocol_version = get_proto_version(conn.get_peer_type(), true); + // this is fyi, actually, server decides! + h.connect.flags = conn.policy.lossy ? CEPH_MSG_CONNECT_LOSSY : 0; + + ceph_assert(messenger.get_auth_client()); + + bufferlist bl; + bufferlist auth_bl = get_auth_payload(); + if (auth_bl.length()) { + h.connect.authorizer_protocol = auth_meta->auth_method; + h.connect.authorizer_len = auth_bl.length(); + bl.append(create_static(h.connect)); + bl.claim_append(auth_bl); + } else { + h.connect.authorizer_protocol = 0; + h.connect.authorizer_len = 0; + bl.append(create_static(h.connect)); + }; + return socket->write_flush(std::move(bl)) + .then([this] { + // read the reply + return socket->read(sizeof(h.reply)); + }).then([this] (bufferlist bl) { + auto p = bl.cbegin(); + ::decode(h.reply, p); + ceph_assert(p.end()); + return socket->read(h.reply.authorizer_len); + }).then([this] (bufferlist bl) { + h.auth_payload = std::move(bl); + return handle_connect_reply(h.reply.tag); + }); +} + +void ProtocolV1::start_connect(const entity_addr_t& _peer_addr, + const entity_name_t& _peer_name) +{ + ceph_assert(state == state_t::none); + logger().trace("{} trigger connecting, was {}", conn, static_cast<int>(state)); + state = state_t::connecting; + set_write_state(write_state_t::delay); + + ceph_assert(!socket); + ceph_assert(!gate.is_closed()); + conn.peer_addr = _peer_addr; + conn.target_addr = _peer_addr; + conn.set_peer_name(_peer_name); + conn.policy = messenger.get_policy(_peer_name.type()); + messenger.register_conn( + seastar::static_pointer_cast<SocketConnection>(conn.shared_from_this())); + gate.dispatch_in_background("start_connect", *this, [this] { + return Socket::connect(conn.peer_addr) + .then([this](SocketRef sock) { + socket = std::move(sock); + if (state != state_t::connecting) { + assert(state == state_t::closing); + return socket->close().then([] { + throw std::system_error(make_error_code(error::protocol_aborted)); + }); + } + return seastar::now(); + 
}).then([this] { + return messenger.get_global_seq(); + }).then([this] (auto gs) { + h.global_seq = gs; + // read server's handshake header + return socket->read(server_header_size); + }).then([this] (bufferlist headerbl) { + auto p = headerbl.cbegin(); + validate_banner(p); + entity_addr_t saddr, caddr; + ::decode(saddr, p); + ::decode(caddr, p); + ceph_assert(p.end()); + if (saddr != conn.peer_addr) { + logger().error("{} my peer_addr {} doesn't match what peer advertized {}", + conn, conn.peer_addr, saddr); + throw std::system_error( + make_error_code(crimson::net::error::bad_peer_address)); + } + if (state != state_t::connecting) { + assert(state == state_t::closing); + throw std::system_error(make_error_code(error::protocol_aborted)); + } + socket->learn_ephemeral_port_as_connector(caddr.get_port()); + if (unlikely(caddr.is_msgr2())) { + logger().warn("{} peer sent a v2 address for me: {}", + conn, caddr); + throw std::system_error( + make_error_code(crimson::net::error::bad_peer_address)); + } + caddr.set_type(entity_addr_t::TYPE_LEGACY); + return messenger.learned_addr(caddr, conn); + }).then([this] { + // encode/send client's handshake header + bufferlist bl; + bl.append(buffer::create_static(banner_size, banner)); + ::encode(messenger.get_myaddr(), bl, 0); + return socket->write_flush(std::move(bl)); + }).then([=] { + return seastar::repeat([this] { + return repeat_connect(); + }); + }).then([this] { + if (state != state_t::connecting) { + assert(state == state_t::closing); + throw std::system_error(make_error_code(error::protocol_aborted)); + } + execute_open(open_t::connected); + }).handle_exception([this] (std::exception_ptr eptr) { + // TODO: handle fault in the connecting state + logger().warn("{} connecting fault: {}", conn, eptr); + close(true); + }); + }); +} + +// accepting state + +seastar::future<stop_t> ProtocolV1::send_connect_reply( + msgr_tag_t tag, bufferlist&& authorizer_reply) +{ + h.reply.tag = tag; + h.reply.features = static_cast<uint64_t>((h.connect.features & + conn.policy.features_supported) | + conn.policy.features_required); + h.reply.authorizer_len = authorizer_reply.length(); + return socket->write(make_static_packet(h.reply)) + .then([this, reply=std::move(authorizer_reply)]() mutable { + return socket->write_flush(std::move(reply)); + }).then([] { + return stop_t::no; + }); +} + +seastar::future<stop_t> ProtocolV1::send_connect_reply_ready( + msgr_tag_t tag, bufferlist&& authorizer_reply) +{ + return messenger.get_global_seq( + ).then([this, tag, auth_len = authorizer_reply.length()] (auto gs) { + h.global_seq = gs; + h.reply.tag = tag; + h.reply.features = conn.policy.features_supported; + h.reply.global_seq = h.global_seq; + h.reply.connect_seq = h.connect_seq; + h.reply.flags = 0; + if (conn.policy.lossy) { + h.reply.flags = h.reply.flags | CEPH_MSG_CONNECT_LOSSY; + } + h.reply.authorizer_len = auth_len; + + session_security.reset( + get_auth_session_handler(nullptr, + auth_meta->auth_method, + auth_meta->session_key, + conn.features)); + + return socket->write(make_static_packet(h.reply)); + }).then([this, reply=std::move(authorizer_reply)]() mutable { + if (reply.length()) { + return socket->write(std::move(reply)); + } else { + return seastar::now(); + } + }).then([this] { + if (h.reply.tag == CEPH_MSGR_TAG_SEQ) { + return socket->write_flush(make_static_packet(conn.in_seq)) + .then([this] { + return socket->read_exactly(sizeof(seq_num_t)); + }).then([this] (auto buf) { + auto acked_seq = reinterpret_cast<const seq_num_t*>(buf.get()); + 
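+      // the peer replied with the last seq it actually received; anything
+      // up to that point no longer needs to be resent, so drop it from out_q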
discard_up_to(&conn.out_q, *acked_seq);
+        });
+    } else {
+      return socket->flush();
+    }
+  }).then([] {
+    return stop_t::yes;
+  });
+}
+
+seastar::future<stop_t> ProtocolV1::replace_existing(
+    SocketConnectionRef existing,
+    bufferlist&& authorizer_reply,
+    bool is_reset_from_peer)
+{
+  msgr_tag_t reply_tag;
+  if (HAVE_FEATURE(h.connect.features, RECONNECT_SEQ) &&
+      !is_reset_from_peer) {
+    reply_tag = CEPH_MSGR_TAG_SEQ;
+  } else {
+    reply_tag = CEPH_MSGR_TAG_READY;
+  }
+  if (!existing->is_lossy()) {
+    // XXX: we decided not to support lossless connection in v1. as the
+    // client's default policy is
+    // Messenger::Policy::lossy_client(CEPH_FEATURE_OSDREPLYMUX) which is
+    // lossy. And by the time of crimson-osd's GA, the in-cluster communication
+    // will all be performed using v2 protocol.
+    ceph_abort("lossless policy not supported for v1");
+  }
+  existing->protocol->close(true);
+  return send_connect_reply_ready(reply_tag, std::move(authorizer_reply));
+}
+
+seastar::future<stop_t> ProtocolV1::handle_connect_with_existing(
+    SocketConnectionRef existing, bufferlist&& authorizer_reply)
+{
+  ProtocolV1 *exproto = dynamic_cast<ProtocolV1*>(existing->protocol.get());
+
+  if (h.connect.global_seq < exproto->peer_global_seq()) {
+    h.reply.global_seq = exproto->peer_global_seq();
+    return send_connect_reply(CEPH_MSGR_TAG_RETRY_GLOBAL);
+  } else if (existing->is_lossy()) {
+    return replace_existing(existing, std::move(authorizer_reply));
+  } else if (h.connect.connect_seq == 0 && exproto->connect_seq() > 0) {
+    return replace_existing(existing, std::move(authorizer_reply), true);
+  } else if (h.connect.connect_seq < exproto->connect_seq()) {
+    // old attempt, or we sent READY but they didn't get it.
+    h.reply.connect_seq = exproto->connect_seq() + 1;
+    return send_connect_reply(CEPH_MSGR_TAG_RETRY_SESSION);
+  } else if (h.connect.connect_seq == exproto->connect_seq()) {
+    // if the existing connection successfully opened, and/or
+    // subsequently went to standby, then the peer should bump
+    // their connect_seq and retry: this is not a connection race
+    // we need to resolve here.
+ if (exproto->get_state() == state_t::open || + exproto->get_state() == state_t::standby) { + if (conn.policy.resetcheck && exproto->connect_seq() == 0) { + return replace_existing(existing, std::move(authorizer_reply)); + } else { + h.reply.connect_seq = exproto->connect_seq() + 1; + return send_connect_reply(CEPH_MSGR_TAG_RETRY_SESSION); + } + } else if (existing->peer_wins()) { + return replace_existing(existing, std::move(authorizer_reply)); + } else { + return send_connect_reply(CEPH_MSGR_TAG_WAIT); + } + } else if (conn.policy.resetcheck && + exproto->connect_seq() == 0) { + return send_connect_reply(CEPH_MSGR_TAG_RESETSESSION); + } else { + return replace_existing(existing, std::move(authorizer_reply)); + } +} + +bool ProtocolV1::require_auth_feature() const +{ + if (h.connect.authorizer_protocol != CEPH_AUTH_CEPHX) { + return false; + } + if (local_conf()->cephx_require_signatures) { + return true; + } + if (h.connect.host_type == CEPH_ENTITY_TYPE_OSD || + h.connect.host_type == CEPH_ENTITY_TYPE_MDS || + h.connect.host_type == CEPH_ENTITY_TYPE_MGR) { + return local_conf()->cephx_cluster_require_signatures; + } else { + return local_conf()->cephx_service_require_signatures; + } +} + +bool ProtocolV1::require_cephx_v2_feature() const +{ + if (h.connect.authorizer_protocol != CEPH_AUTH_CEPHX) { + return false; + } + if (local_conf()->cephx_require_version >= 2) { + return true; + } + if (h.connect.host_type == CEPH_ENTITY_TYPE_OSD || + h.connect.host_type == CEPH_ENTITY_TYPE_MDS || + h.connect.host_type == CEPH_ENTITY_TYPE_MGR) { + return local_conf()->cephx_cluster_require_version >= 2; + } else { + return local_conf()->cephx_service_require_version >= 2; + } +} + +seastar::future<stop_t> ProtocolV1::repeat_handle_connect() +{ + return socket->read(sizeof(h.connect)) + .then([this](bufferlist bl) { + auto p = bl.cbegin(); + ::decode(h.connect, p); + if (conn.get_peer_type() != 0 && + conn.get_peer_type() != h.connect.host_type) { + logger().error("{} repeat_handle_connect(): my peer type does not match" + " what peer advertises {} != {}", + conn, conn.get_peer_type(), h.connect.host_type); + throw std::system_error(make_error_code(error::protocol_aborted)); + } + conn.set_peer_type(h.connect.host_type); + conn.policy = messenger.get_policy(h.connect.host_type); + if (!conn.policy.lossy && !conn.policy.server && conn.target_addr.get_port() <= 0) { + logger().error("{} we don't know how to reconnect to peer {}", + conn, conn.target_addr); + throw std::system_error( + make_error_code(crimson::net::error::bad_peer_address)); + } + return socket->read(h.connect.authorizer_len); + }).then([this] (bufferlist authorizer) { + memset(&h.reply, 0, sizeof(h.reply)); + // TODO: set reply.protocol_version + if (h.connect.protocol_version != get_proto_version(h.connect.host_type, false)) { + return send_connect_reply( + CEPH_MSGR_TAG_BADPROTOVER, bufferlist{}); + } + if (require_auth_feature()) { + conn.policy.features_required |= CEPH_FEATURE_MSG_AUTH; + } + if (require_cephx_v2_feature()) { + conn.policy.features_required |= CEPH_FEATUREMASK_CEPHX_V2; + } + if (auto feat_missing = conn.policy.features_required & ~(uint64_t)h.connect.features; + feat_missing != 0) { + return send_connect_reply( + CEPH_MSGR_TAG_FEATURES, bufferlist{}); + } + + bufferlist authorizer_reply; + auth_meta->auth_method = h.connect.authorizer_protocol; + if (!HAVE_FEATURE((uint64_t)h.connect.features, CEPHX_V2)) { + // peer doesn't support it and we won't get here if we require it + auth_meta->skip_authorizer_challenge = 
true; + } + auto more = static_cast<bool>(auth_meta->authorizer_challenge); + ceph_assert(messenger.get_auth_server()); + int r = messenger.get_auth_server()->handle_auth_request( + conn.shared_from_this(), auth_meta, more, auth_meta->auth_method, authorizer, + &authorizer_reply); + + if (r < 0) { + session_security.reset(); + return send_connect_reply( + CEPH_MSGR_TAG_BADAUTHORIZER, std::move(authorizer_reply)); + } else if (r == 0) { + ceph_assert(authorizer_reply.length()); + return send_connect_reply( + CEPH_MSGR_TAG_CHALLENGE_AUTHORIZER, std::move(authorizer_reply)); + } + + // r > 0 + if (auto existing = messenger.lookup_conn(conn.peer_addr); existing) { + if (existing->protocol->proto_type != proto_t::v1) { + logger().warn("{} existing {} proto version is {} not 1, close existing", + conn, *existing, + static_cast<int>(existing->protocol->proto_type)); + // NOTE: this is following async messenger logic, but we may miss the reset event. + existing->mark_down(); + } else { + return handle_connect_with_existing(existing, std::move(authorizer_reply)); + } + } + if (h.connect.connect_seq > 0) { + return send_connect_reply(CEPH_MSGR_TAG_RESETSESSION, + std::move(authorizer_reply)); + } + h.connect_seq = h.connect.connect_seq + 1; + h.peer_global_seq = h.connect.global_seq; + conn.set_features((uint64_t)conn.policy.features_supported & (uint64_t)h.connect.features); + // TODO: cct + return send_connect_reply_ready(CEPH_MSGR_TAG_READY, std::move(authorizer_reply)); + }); +} + +void ProtocolV1::start_accept(SocketRef&& sock, + const entity_addr_t& _peer_addr) +{ + ceph_assert(state == state_t::none); + logger().trace("{} trigger accepting, was {}", + conn, static_cast<int>(state)); + state = state_t::accepting; + set_write_state(write_state_t::delay); + + ceph_assert(!socket); + // until we know better + conn.target_addr = _peer_addr; + socket = std::move(sock); + messenger.accept_conn( + seastar::static_pointer_cast<SocketConnection>(conn.shared_from_this())); + gate.dispatch_in_background("start_accept", *this, [this] { + // stop learning my_addr before sending it out, so it won't change + return messenger.learned_addr(messenger.get_myaddr(), conn).then([this] { + // encode/send server's handshake header + bufferlist bl; + bl.append(buffer::create_static(banner_size, banner)); + ::encode(messenger.get_myaddr(), bl, 0); + ::encode(conn.target_addr, bl, 0); + return socket->write_flush(std::move(bl)); + }).then([this] { + // read client's handshake header and connect request + return socket->read(client_header_size); + }).then([this] (bufferlist bl) { + auto p = bl.cbegin(); + validate_banner(p); + entity_addr_t addr; + ::decode(addr, p); + ceph_assert(p.end()); + if ((addr.is_legacy() || addr.is_any()) && + addr.is_same_host(conn.target_addr)) { + // good + } else { + logger().error("{} peer advertized an invalid peer_addr: {}," + " which should be v1 and the same host with {}.", + conn, addr, conn.peer_addr); + throw std::system_error( + make_error_code(crimson::net::error::bad_peer_address)); + } + conn.peer_addr = addr; + conn.target_addr = conn.peer_addr; + return seastar::repeat([this] { + return repeat_handle_connect(); + }); + }).then([this] { + if (state != state_t::accepting) { + assert(state == state_t::closing); + throw std::system_error(make_error_code(error::protocol_aborted)); + } + messenger.register_conn( + seastar::static_pointer_cast<SocketConnection>(conn.shared_from_this())); + messenger.unaccept_conn( + 
seastar::static_pointer_cast<SocketConnection>(conn.shared_from_this())); + execute_open(open_t::accepted); + }).handle_exception([this] (std::exception_ptr eptr) { + // TODO: handle fault in the accepting state + logger().warn("{} accepting fault: {}", conn, eptr); + close(false); + }); + }); +} + +// open state + +ceph::bufferlist ProtocolV1::do_sweep_messages( + const std::deque<MessageRef>& msgs, + size_t num_msgs, + bool require_keepalive, + std::optional<utime_t> _keepalive_ack, + bool require_ack) +{ + static const size_t RESERVE_MSG_SIZE = sizeof(CEPH_MSGR_TAG_MSG) + + sizeof(ceph_msg_header) + + sizeof(ceph_msg_footer); + static const size_t RESERVE_MSG_SIZE_OLD = sizeof(CEPH_MSGR_TAG_MSG) + + sizeof(ceph_msg_header) + + sizeof(ceph_msg_footer_old); + + ceph::bufferlist bl; + if (likely(num_msgs)) { + if (HAVE_FEATURE(conn.features, MSG_AUTH)) { + bl.reserve(num_msgs * RESERVE_MSG_SIZE); + } else { + bl.reserve(num_msgs * RESERVE_MSG_SIZE_OLD); + } + } + + if (unlikely(require_keepalive)) { + k.req.stamp = ceph::coarse_real_clock::to_ceph_timespec( + ceph::coarse_real_clock::now()); + logger().trace("{} write keepalive2 {}", conn, k.req.stamp.tv_sec); + bl.append(create_static(k.req)); + } + + if (unlikely(_keepalive_ack.has_value())) { + logger().trace("{} write keepalive2 ack {}", conn, *_keepalive_ack); + k.ack.stamp = ceph_timespec(*_keepalive_ack); + bl.append(create_static(k.ack)); + } + + if (require_ack) { + // XXX: we decided not to support lossless connection in v1. as the + // client's default policy is + // Messenger::Policy::lossy_client(CEPH_FEATURE_OSDREPLYMUX) which is + // lossy. And by the time of crimson-osd's GA, the in-cluster communication + // will all be performed using v2 protocol. + ceph_abort("lossless policy not supported for v1"); + } + + std::for_each(msgs.begin(), msgs.begin()+num_msgs, [this, &bl](const MessageRef& msg) { + ceph_assert(!msg->get_seq() && "message already has seq"); + msg->set_seq(++conn.out_seq); + auto& header = msg->get_header(); + header.src = messenger.get_myname(); + msg->encode(conn.features, messenger.get_crc_flags()); + if (session_security) { + session_security->sign_message(msg.get()); + } + logger().debug("{} --> #{} === {} ({})", + conn, msg->get_seq(), *msg, msg->get_type()); + bl.append(CEPH_MSGR_TAG_MSG); + bl.append((const char*)&header, sizeof(header)); + bl.append(msg->get_payload()); + bl.append(msg->get_middle()); + bl.append(msg->get_data()); + auto& footer = msg->get_footer(); + if (HAVE_FEATURE(conn.features, MSG_AUTH)) { + bl.append((const char*)&footer, sizeof(footer)); + } else { + ceph_msg_footer_old old_footer; + if (messenger.get_crc_flags() & MSG_CRC_HEADER) { + old_footer.front_crc = footer.front_crc; + old_footer.middle_crc = footer.middle_crc; + } else { + old_footer.front_crc = old_footer.middle_crc = 0; + } + if (messenger.get_crc_flags() & MSG_CRC_DATA) { + old_footer.data_crc = footer.data_crc; + } else { + old_footer.data_crc = 0; + } + old_footer.flags = footer.flags; + bl.append((const char*)&old_footer, sizeof(old_footer)); + } + }); + + return bl; +} + +seastar::future<> ProtocolV1::handle_keepalive2_ack() +{ + return socket->read_exactly(sizeof(ceph_timespec)) + .then([this] (auto buf) { + auto t = reinterpret_cast<const ceph_timespec*>(buf.get()); + k.ack_stamp = *t; + logger().trace("{} got keepalive2 ack {}", conn, t->tv_sec); + }); +} + +seastar::future<> ProtocolV1::handle_keepalive2() +{ + return socket->read_exactly(sizeof(ceph_timespec)) + .then([this] (auto buf) { + utime_t 
ack{*reinterpret_cast<const ceph_timespec*>(buf.get())}; + notify_keepalive_ack(ack); + }); +} + +seastar::future<> ProtocolV1::handle_ack() +{ + return socket->read_exactly(sizeof(ceph_le64)) + .then([this] (auto buf) { + auto seq = reinterpret_cast<const ceph_le64*>(buf.get()); + discard_up_to(&conn.sent, *seq); + }); +} + +seastar::future<> ProtocolV1::maybe_throttle() +{ + if (!conn.policy.throttler_bytes) { + return seastar::now(); + } + const auto to_read = (m.header.front_len + + m.header.middle_len + + m.header.data_len); + return conn.policy.throttler_bytes->get(to_read); +} + +seastar::future<> ProtocolV1::read_message() +{ + return socket->read(sizeof(m.header)) + .then([this] (bufferlist bl) { + // throttle the traffic, maybe + auto p = bl.cbegin(); + ::decode(m.header, p); + return maybe_throttle(); + }).then([this] { + // read front + return socket->read(m.header.front_len); + }).then([this] (bufferlist bl) { + m.front = std::move(bl); + // read middle + return socket->read(m.header.middle_len); + }).then([this] (bufferlist bl) { + m.middle = std::move(bl); + // read data + return socket->read(m.header.data_len); + }).then([this] (bufferlist bl) { + m.data = std::move(bl); + // read footer + return socket->read(sizeof(m.footer)); + }).then([this] (bufferlist bl) { + auto p = bl.cbegin(); + ::decode(m.footer, p); + auto conn_ref = seastar::static_pointer_cast<SocketConnection>( + conn.shared_from_this()); + auto msg = ::decode_message(nullptr, 0, m.header, m.footer, + m.front, m.middle, m.data, conn_ref); + if (unlikely(!msg)) { + logger().warn("{} decode message failed", conn); + throw std::system_error{make_error_code(error::corrupted_message)}; + } + constexpr bool add_ref = false; // Message starts with 1 ref + // TODO: change MessageRef with foreign_ptr + auto msg_ref = MessageRef{msg, add_ref}; + + if (session_security) { + if (unlikely(session_security->check_message_signature(msg))) { + logger().warn("{} message signature check failed", conn); + throw std::system_error{make_error_code(error::corrupted_message)}; + } + } + // TODO: set time stamps + msg->set_byte_throttler(conn.policy.throttler_bytes); + + if (unlikely(!conn.update_rx_seq(msg->get_seq()))) { + // skip this message + return seastar::now(); + } + + logger().debug("{} <== #{} === {} ({})", + conn, msg_ref->get_seq(), *msg_ref, msg_ref->get_type()); + // throttle the reading process by the returned future + return dispatchers.ms_dispatch(conn_ref, std::move(msg_ref)); + }); +} + +seastar::future<> ProtocolV1::handle_tags() +{ + return seastar::keep_doing([this] { + // read the next tag + return socket->read_exactly(1) + .then([this] (auto buf) { + switch (buf[0]) { + case CEPH_MSGR_TAG_MSG: + return read_message(); + case CEPH_MSGR_TAG_ACK: + return handle_ack(); + case CEPH_MSGR_TAG_KEEPALIVE: + return seastar::now(); + case CEPH_MSGR_TAG_KEEPALIVE2: + return handle_keepalive2(); + case CEPH_MSGR_TAG_KEEPALIVE2_ACK: + return handle_keepalive2_ack(); + case CEPH_MSGR_TAG_CLOSE: + logger().info("{} got tag close", conn); + throw std::system_error(make_error_code(error::protocol_aborted)); + default: + logger().error("{} got unknown msgr tag {}", + conn, static_cast<int>(buf[0])); + throw std::system_error(make_error_code(error::read_eof)); + } + }); + }); +} + +void ProtocolV1::execute_open(open_t type) +{ + logger().trace("{} trigger open, was {}", conn, static_cast<int>(state)); + state = state_t::open; + set_write_state(write_state_t::open); + + if (type == open_t::connected) { + 
dispatchers.ms_handle_connect( + seastar::static_pointer_cast<SocketConnection>(conn.shared_from_this())); + } else { // type == open_t::accepted + dispatchers.ms_handle_accept( + seastar::static_pointer_cast<SocketConnection>(conn.shared_from_this())); + } + + gate.dispatch_in_background("execute_open", *this, [this] { + // start background processing of tags + return handle_tags() + .handle_exception_type([this] (const std::system_error& e) { + logger().warn("{} open fault: {}", conn, e); + if (e.code() == error::protocol_aborted || + e.code() == std::errc::connection_reset || + e.code() == error::read_eof) { + close(true); + return seastar::now(); + } else { + throw e; + } + }).handle_exception([this] (std::exception_ptr eptr) { + // TODO: handle fault in the open state + logger().warn("{} open fault: {}", conn, eptr); + close(true); + }); + }); +} + +// closing state + +void ProtocolV1::trigger_close() +{ + logger().trace("{} trigger closing, was {}", + conn, static_cast<int>(state)); + messenger.closing_conn( + seastar::static_pointer_cast<SocketConnection>( + conn.shared_from_this())); + + if (state == state_t::accepting) { + messenger.unaccept_conn(seastar::static_pointer_cast<SocketConnection>( + conn.shared_from_this())); + } else if (state >= state_t::connecting && state < state_t::closing) { + messenger.unregister_conn(seastar::static_pointer_cast<SocketConnection>( + conn.shared_from_this())); + } else { + // cannot happen + ceph_assert(false); + } + + if (!socket) { + ceph_assert(state == state_t::connecting); + } + + state = state_t::closing; +} + +void ProtocolV1::on_closed() +{ + messenger.closed_conn( + seastar::static_pointer_cast<SocketConnection>( + conn.shared_from_this())); +} + +seastar::future<> ProtocolV1::fault() +{ + if (conn.policy.lossy) { + messenger.unregister_conn(seastar::static_pointer_cast<SocketConnection>( + conn.shared_from_this())); + } + // XXX: we decided not to support lossless connection in v1. as the + // client's default policy is + // Messenger::Policy::lossy_client(CEPH_FEATURE_OSDREPLYMUX) which is + // lossy. And by the time of crimson-osd's GA, the in-cluster communication + // will all be performed using v2 protocol. 
+ ceph_abort("lossless policy not supported for v1"); + return seastar::now(); +} + +void ProtocolV1::print(std::ostream& out) const +{ + out << conn; +} + +} // namespace crimson::net diff --git a/src/crimson/net/ProtocolV1.h b/src/crimson/net/ProtocolV1.h new file mode 100644 index 000000000..ed6df8954 --- /dev/null +++ b/src/crimson/net/ProtocolV1.h @@ -0,0 +1,137 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "Protocol.h" + +class AuthAuthorizer; +class AuthSessionHandler; + +namespace crimson::net { + +class ProtocolV1 final : public Protocol { + public: + ProtocolV1(ChainedDispatchers& dispatchers, + SocketConnection& conn, + SocketMessenger& messenger); + ~ProtocolV1() override; + void print(std::ostream&) const final; + private: + void on_closed() override; + bool is_connected() const override; + + void start_connect(const entity_addr_t& peer_addr, + const entity_name_t& peer_name) override; + + void start_accept(SocketRef&& socket, + const entity_addr_t& peer_addr) override; + + void trigger_close() override; + + ceph::bufferlist do_sweep_messages( + const std::deque<MessageRef>& msgs, + size_t num_msgs, + bool require_keepalive, + std::optional<utime_t> keepalive_ack, + bool require_ack) override; + + private: + SocketMessenger &messenger; + + enum class state_t { + none, + accepting, + connecting, + open, + standby, + wait, + closing + }; + state_t state = state_t::none; + + // state for handshake + struct Handshake { + ceph_msg_connect connect; + ceph_msg_connect_reply reply; + ceph::bufferlist auth_payload; // auth(orizer) payload read off the wire + ceph::bufferlist auth_more; // connect-side auth retry (we added challenge) + std::chrono::milliseconds backoff; + uint32_t connect_seq = 0; + uint32_t peer_global_seq = 0; + uint32_t global_seq; + } h; + + std::unique_ptr<AuthSessionHandler> session_security; + + // state for an incoming message + struct MessageReader { + ceph_msg_header header; + ceph_msg_footer footer; + bufferlist front; + bufferlist middle; + bufferlist data; + } m; + + struct Keepalive { + struct { + const char tag = CEPH_MSGR_TAG_KEEPALIVE2; + ceph_timespec stamp; + } __attribute__((packed)) req; + struct { + const char tag = CEPH_MSGR_TAG_KEEPALIVE2_ACK; + ceph_timespec stamp; + } __attribute__((packed)) ack; + ceph_timespec ack_stamp; + } k; + + private: + // connecting + void reset_session(); + seastar::future<stop_t> handle_connect_reply(crimson::net::msgr_tag_t tag); + seastar::future<stop_t> repeat_connect(); + ceph::bufferlist get_auth_payload(); + + // accepting + seastar::future<stop_t> send_connect_reply( + msgr_tag_t tag, bufferlist&& authorizer_reply = {}); + seastar::future<stop_t> send_connect_reply_ready( + msgr_tag_t tag, bufferlist&& authorizer_reply); + seastar::future<stop_t> replace_existing( + SocketConnectionRef existing, + bufferlist&& authorizer_reply, + bool is_reset_from_peer = false); + seastar::future<stop_t> handle_connect_with_existing( + SocketConnectionRef existing, bufferlist&& authorizer_reply); + bool require_auth_feature() const; + bool require_cephx_v2_feature() const; + seastar::future<stop_t> repeat_handle_connect(); + + // open + seastar::future<> handle_keepalive2_ack(); + seastar::future<> handle_keepalive2(); + seastar::future<> handle_ack(); + seastar::future<> maybe_throttle(); + seastar::future<> read_message(); + seastar::future<> handle_tags(); + + enum class open_t { + connected, + accepted + }; + void execute_open(open_t 
type); + + // replacing + // the number of connections initiated in this session, increment when a + // new connection is established + uint32_t connect_seq() const { return h.connect_seq; } + // the client side should connect us with a gseq. it will be reset with + // the one of exsting connection if it's greater. + uint32_t peer_global_seq() const { return h.peer_global_seq; } + // current state of ProtocolV1 + state_t get_state() const { return state; } + + seastar::future<> fault(); +}; + +} // namespace crimson::net diff --git a/src/crimson/net/ProtocolV2.cc b/src/crimson/net/ProtocolV2.cc new file mode 100644 index 000000000..b7137b8b8 --- /dev/null +++ b/src/crimson/net/ProtocolV2.cc @@ -0,0 +1,2139 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "ProtocolV2.h" + +#include <seastar/core/lowres_clock.hh> +#include <fmt/format.h> +#include "include/msgr.h" +#include "include/random.h" + +#include "crimson/auth/AuthClient.h" +#include "crimson/auth/AuthServer.h" +#include "crimson/common/formatter.h" + +#include "chained_dispatchers.h" +#include "Errors.h" +#include "Socket.h" +#include "SocketConnection.h" +#include "SocketMessenger.h" + +#ifdef UNIT_TESTS_BUILT +#include "Interceptor.h" +#endif + +using namespace ceph::msgr::v2; +using crimson::common::local_conf; + +namespace { + +// TODO: apply the same logging policy to Protocol V1 +// Log levels in V2 Protocol: +// * error level, something error that cause connection to terminate: +// - fatal errors; +// - bugs; +// * warn level: something unusual that identifies connection fault or replacement: +// - unstable network; +// - incompatible peer; +// - auth failure; +// - connection race; +// - connection reset; +// * info level, something very important to show connection lifecycle, +// which doesn't happen very frequently; +// * debug level, important logs for debugging, including: +// - all the messages sent/received (-->/<==); +// - all the frames exchanged (WRITE/GOT); +// - important fields updated (UPDATE); +// - connection state transitions (TRIGGER); +// * trace level, trivial logs showing: +// - the exact bytes being sent/received (SEND/RECV(bytes)); +// - detailed information of sub-frames; +// - integrity checks; +// - etc. 
+seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_ms); +} + +[[noreturn]] void abort_in_fault() { + throw std::system_error(make_error_code(crimson::net::error::negotiation_failure)); +} + +[[noreturn]] void abort_protocol() { + throw std::system_error(make_error_code(crimson::net::error::protocol_aborted)); +} + +[[noreturn]] void abort_in_close(crimson::net::ProtocolV2& proto, bool dispatch_reset) { + proto.close(dispatch_reset); + abort_protocol(); +} + +inline void expect_tag(const Tag& expected, + const Tag& actual, + crimson::net::SocketConnection& conn, + const char *where) { + if (actual != expected) { + logger().warn("{} {} received wrong tag: {}, expected {}", + conn, where, + static_cast<uint32_t>(actual), + static_cast<uint32_t>(expected)); + abort_in_fault(); + } +} + +inline void unexpected_tag(const Tag& unexpected, + crimson::net::SocketConnection& conn, + const char *where) { + logger().warn("{} {} received unexpected tag: {}", + conn, where, static_cast<uint32_t>(unexpected)); + abort_in_fault(); +} + +inline uint64_t generate_client_cookie() { + return ceph::util::generate_random_number<uint64_t>( + 1, std::numeric_limits<uint64_t>::max()); +} + +} // namespace anonymous + +namespace crimson::net { + +#ifdef UNIT_TESTS_BUILT +void intercept(Breakpoint bp, bp_type_t type, + SocketConnection& conn, SocketRef& socket) { + if (conn.interceptor) { + auto action = conn.interceptor->intercept(conn, Breakpoint(bp)); + socket->set_trap(type, action, &conn.interceptor->blocker); + } +} + +#define INTERCEPT_CUSTOM(bp, type) \ +intercept({bp}, type, conn, socket) + +#define INTERCEPT_FRAME(tag, type) \ +intercept({static_cast<Tag>(tag), type}, \ + type, conn, socket) + +#define INTERCEPT_N_RW(bp) \ +if (conn.interceptor) { \ + auto action = conn.interceptor->intercept(conn, {bp}); \ + ceph_assert(action != bp_action_t::BLOCK); \ + if (action == bp_action_t::FAULT) { \ + abort_in_fault(); \ + } \ +} + +#else +#define INTERCEPT_CUSTOM(bp, type) +#define INTERCEPT_FRAME(tag, type) +#define INTERCEPT_N_RW(bp) +#endif + +seastar::future<> ProtocolV2::Timer::backoff(double seconds) +{ + logger().warn("{} waiting {} seconds ...", conn, seconds); + cancel(); + last_dur_ = seconds; + as = seastar::abort_source(); + auto dur = std::chrono::duration_cast<seastar::lowres_clock::duration>( + std::chrono::duration<double>(seconds)); + return seastar::sleep_abortable(dur, *as + ).handle_exception_type([this] (const seastar::sleep_aborted& e) { + logger().debug("{} wait aborted", conn); + abort_protocol(); + }); +} + +ProtocolV2::ProtocolV2(ChainedDispatchers& dispatchers, + SocketConnection& conn, + SocketMessenger& messenger) + : Protocol(proto_t::v2, dispatchers, conn), + messenger{messenger}, + protocol_timer{conn} +{} + +ProtocolV2::~ProtocolV2() {} + +bool ProtocolV2::is_connected() const { + return state == state_t::READY || + state == state_t::ESTABLISHING || + state == state_t::REPLACING; +} + +void ProtocolV2::start_connect(const entity_addr_t& _peer_addr, + const entity_name_t& _peer_name) +{ + ceph_assert(state == state_t::NONE); + ceph_assert(!socket); + ceph_assert(!gate.is_closed()); + conn.peer_addr = _peer_addr; + conn.target_addr = _peer_addr; + conn.set_peer_name(_peer_name); + conn.policy = messenger.get_policy(_peer_name.type()); + client_cookie = generate_client_cookie(); + logger().info("{} ProtocolV2::start_connect(): peer_addr={}, peer_name={}, cc={}" + " policy(lossy={}, server={}, standby={}, resetcheck={})", + conn, _peer_addr, _peer_name, 
client_cookie, + conn.policy.lossy, conn.policy.server, + conn.policy.standby, conn.policy.resetcheck); + messenger.register_conn( + seastar::static_pointer_cast<SocketConnection>(conn.shared_from_this())); + execute_connecting(); +} + +void ProtocolV2::start_accept(SocketRef&& sock, + const entity_addr_t& _peer_addr) +{ + ceph_assert(state == state_t::NONE); + ceph_assert(!socket); + // until we know better + conn.target_addr = _peer_addr; + socket = std::move(sock); + logger().info("{} ProtocolV2::start_accept(): target_addr={}", conn, _peer_addr); + messenger.accept_conn( + seastar::static_pointer_cast<SocketConnection>(conn.shared_from_this())); + execute_accepting(); +} + +// TODO: Frame related implementations, probably to a separate class. + +void ProtocolV2::enable_recording() +{ + rxbuf.clear(); + txbuf.clear(); + record_io = true; +} + +seastar::future<Socket::tmp_buf> ProtocolV2::read_exactly(size_t bytes) +{ + if (unlikely(record_io)) { + return socket->read_exactly(bytes) + .then([this] (auto bl) { + rxbuf.append(buffer::create(bl.share())); + return bl; + }); + } else { + return socket->read_exactly(bytes); + }; +} + +seastar::future<bufferlist> ProtocolV2::read(size_t bytes) +{ + if (unlikely(record_io)) { + return socket->read(bytes) + .then([this] (auto buf) { + rxbuf.append(buf); + return buf; + }); + } else { + return socket->read(bytes); + } +} + +seastar::future<> ProtocolV2::write(bufferlist&& buf) +{ + if (unlikely(record_io)) { + txbuf.append(buf); + } + return socket->write(std::move(buf)); +} + +seastar::future<> ProtocolV2::write_flush(bufferlist&& buf) +{ + if (unlikely(record_io)) { + txbuf.append(buf); + } + return socket->write_flush(std::move(buf)); +} + +size_t ProtocolV2::get_current_msg_size() const +{ + ceph_assert(rx_frame_asm.get_num_segments() > 0); + size_t sum = 0; + // we don't include SegmentIndex::Msg::HEADER. 
+ for (size_t idx = 1; idx < rx_frame_asm.get_num_segments(); idx++) { + sum += rx_frame_asm.get_segment_logical_len(idx); + } + return sum; +} + +seastar::future<Tag> ProtocolV2::read_main_preamble() +{ + rx_preamble.clear(); + return read_exactly(rx_frame_asm.get_preamble_onwire_len()) + .then([this] (auto bl) { + rx_segments_data.clear(); + try { + rx_preamble.append(buffer::create(std::move(bl))); + const Tag tag = rx_frame_asm.disassemble_preamble(rx_preamble); + INTERCEPT_FRAME(tag, bp_type_t::READ); + return tag; + } catch (FrameError& e) { + logger().warn("{} read_main_preamble: {}", conn, e.what()); + abort_in_fault(); + } + }); +} + +seastar::future<> ProtocolV2::read_frame_payload() +{ + ceph_assert(rx_segments_data.empty()); + + return seastar::do_until( + [this] { return rx_frame_asm.get_num_segments() == rx_segments_data.size(); }, + [this] { + // TODO: create aligned and contiguous buffer from socket + const size_t seg_idx = rx_segments_data.size(); + if (uint16_t alignment = rx_frame_asm.get_segment_align(seg_idx); + alignment != segment_t::DEFAULT_ALIGNMENT) { + logger().trace("{} cannot allocate {} aligned buffer at segment desc index {}", + conn, alignment, rx_segments_data.size()); + } + uint32_t onwire_len = rx_frame_asm.get_segment_onwire_len(seg_idx); + // TODO: create aligned and contiguous buffer from socket + return read_exactly(onwire_len).then([this] (auto tmp_bl) { + logger().trace("{} RECV({}) frame segment[{}]", + conn, tmp_bl.size(), rx_segments_data.size()); + bufferlist segment; + segment.append(buffer::create(std::move(tmp_bl))); + rx_segments_data.emplace_back(std::move(segment)); + }); + } + ).then([this] { + return read_exactly(rx_frame_asm.get_epilogue_onwire_len()); + }).then([this] (auto bl) { + logger().trace("{} RECV({}) frame epilogue", conn, bl.size()); + bool ok = false; + try { + rx_frame_asm.disassemble_first_segment(rx_preamble, rx_segments_data[0]); + bufferlist rx_epilogue; + rx_epilogue.append(buffer::create(std::move(bl))); + ok = rx_frame_asm.disassemble_remaining_segments(rx_segments_data.data(), rx_epilogue); + } catch (FrameError& e) { + logger().error("read_frame_payload: {} {}", conn, e.what()); + abort_in_fault(); + } catch (ceph::crypto::onwire::MsgAuthError&) { + logger().error("read_frame_payload: {} bad auth tag", conn); + abort_in_fault(); + } + // we do have a mechanism that allows transmitter to start sending message + // and abort after putting entire data field on wire. This will be used by + // the kernel client to avoid unnecessary buffering. 
+ if (!ok) { + // TODO + ceph_assert(false); + } + }); +} + +template <class F> +seastar::future<> ProtocolV2::write_frame(F &frame, bool flush) +{ + auto bl = frame.get_buffer(tx_frame_asm); + const auto main_preamble = reinterpret_cast<const preamble_block_t*>(bl.front().c_str()); + logger().trace("{} SEND({}) frame: tag={}, num_segments={}, crc={}", + conn, bl.length(), (int)main_preamble->tag, + (int)main_preamble->num_segments, main_preamble->crc); + INTERCEPT_FRAME(main_preamble->tag, bp_type_t::WRITE); + if (flush) { + return write_flush(std::move(bl)); + } else { + return write(std::move(bl)); + } +} + +void ProtocolV2::trigger_state(state_t _state, write_state_t _write_state, bool reentrant) +{ + if (!reentrant && _state == state) { + logger().error("{} is not allowed to re-trigger state {}", + conn, get_state_name(state)); + ceph_assert(false); + } + logger().debug("{} TRIGGER {}, was {}", + conn, get_state_name(_state), get_state_name(state)); + state = _state; + set_write_state(_write_state); +} + +void ProtocolV2::fault(bool backoff, const char* func_name, std::exception_ptr eptr) +{ + if (conn.policy.lossy) { + logger().info("{} {}: fault at {} on lossy channel, going to CLOSING -- {}", + conn, func_name, get_state_name(state), eptr); + close(true); + } else if (conn.policy.server || + (conn.policy.standby && + (!is_queued() && conn.sent.empty()))) { + logger().info("{} {}: fault at {} with nothing to send, going to STANDBY -- {}", + conn, func_name, get_state_name(state), eptr); + execute_standby(); + } else if (backoff) { + logger().info("{} {}: fault at {}, going to WAIT -- {}", + conn, func_name, get_state_name(state), eptr); + execute_wait(false); + } else { + logger().info("{} {}: fault at {}, going to CONNECTING -- {}", + conn, func_name, get_state_name(state), eptr); + execute_connecting(); + } +} + +void ProtocolV2::reset_session(bool full) +{ + server_cookie = 0; + connect_seq = 0; + conn.in_seq = 0; + if (full) { + client_cookie = generate_client_cookie(); + peer_global_seq = 0; + reset_write(); + dispatchers.ms_handle_remote_reset( + seastar::static_pointer_cast<SocketConnection>(conn.shared_from_this())); + } +} + +seastar::future<std::tuple<entity_type_t, entity_addr_t>> +ProtocolV2::banner_exchange(bool is_connect) +{ + // 1. prepare and send banner + bufferlist banner_payload; + encode((uint64_t)CEPH_MSGR2_SUPPORTED_FEATURES, banner_payload, 0); + encode((uint64_t)CEPH_MSGR2_REQUIRED_FEATURES, banner_payload, 0); + + bufferlist bl; + bl.append(CEPH_BANNER_V2_PREFIX, strlen(CEPH_BANNER_V2_PREFIX)); + auto len_payload = static_cast<uint16_t>(banner_payload.length()); + encode(len_payload, bl, 0); + bl.claim_append(banner_payload); + logger().debug("{} SEND({}) banner: len_payload={}, supported={}, " + "required={}, banner=\"{}\"", + conn, bl.length(), len_payload, + CEPH_MSGR2_SUPPORTED_FEATURES, CEPH_MSGR2_REQUIRED_FEATURES, + CEPH_BANNER_V2_PREFIX); + INTERCEPT_CUSTOM(custom_bp_t::BANNER_WRITE, bp_type_t::WRITE); + return write_flush(std::move(bl)).then([this] { + // 2. read peer banner + unsigned banner_len = strlen(CEPH_BANNER_V2_PREFIX) + sizeof(ceph_le16); + INTERCEPT_CUSTOM(custom_bp_t::BANNER_READ, bp_type_t::READ); + return read_exactly(banner_len); // or read exactly? + }).then([this] (auto bl) { + // 3. 
process peer banner and read banner_payload + unsigned banner_prefix_len = strlen(CEPH_BANNER_V2_PREFIX); + logger().debug("{} RECV({}) banner: \"{}\"", + conn, bl.size(), + std::string((const char*)bl.get(), banner_prefix_len)); + + if (memcmp(bl.get(), CEPH_BANNER_V2_PREFIX, banner_prefix_len) != 0) { + if (memcmp(bl.get(), CEPH_BANNER, strlen(CEPH_BANNER)) == 0) { + logger().warn("{} peer is using V1 protocol", conn); + } else { + logger().warn("{} peer sent bad banner", conn); + } + abort_in_fault(); + } + bl.trim_front(banner_prefix_len); + + uint16_t payload_len; + bufferlist buf; + buf.append(buffer::create(std::move(bl))); + auto ti = buf.cbegin(); + try { + decode(payload_len, ti); + } catch (const buffer::error &e) { + logger().warn("{} decode banner payload len failed", conn); + abort_in_fault(); + } + logger().debug("{} GOT banner: payload_len={}", conn, payload_len); + INTERCEPT_CUSTOM(custom_bp_t::BANNER_PAYLOAD_READ, bp_type_t::READ); + return read(payload_len); + }).then([this, is_connect] (bufferlist bl) { + // 4. process peer banner_payload and send HelloFrame + auto p = bl.cbegin(); + uint64_t peer_supported_features; + uint64_t peer_required_features; + try { + decode(peer_supported_features, p); + decode(peer_required_features, p); + } catch (const buffer::error &e) { + logger().warn("{} decode banner payload failed", conn); + abort_in_fault(); + } + logger().debug("{} RECV({}) banner features: supported={} required={}", + conn, bl.length(), + peer_supported_features, peer_required_features); + + // Check feature bit compatibility + uint64_t supported_features = CEPH_MSGR2_SUPPORTED_FEATURES; + uint64_t required_features = CEPH_MSGR2_REQUIRED_FEATURES; + if ((required_features & peer_supported_features) != required_features) { + logger().error("{} peer does not support all required features" + " required={} peer_supported={}", + conn, required_features, peer_supported_features); + abort_in_close(*this, is_connect); + } + if ((supported_features & peer_required_features) != peer_required_features) { + logger().error("{} we do not support all peer required features" + " peer_required={} supported={}", + conn, peer_required_features, supported_features); + abort_in_close(*this, is_connect); + } + this->peer_required_features = peer_required_features; + if (this->peer_required_features == 0) { + this->connection_features = msgr2_required; + } + const bool is_rev1 = HAVE_MSGR2_FEATURE(peer_supported_features, REVISION_1); + tx_frame_asm.set_is_rev1(is_rev1); + rx_frame_asm.set_is_rev1(is_rev1); + + auto hello = HelloFrame::Encode(messenger.get_mytype(), + conn.target_addr); + logger().debug("{} WRITE HelloFrame: my_type={}, peer_addr={}", + conn, ceph_entity_type_name(messenger.get_mytype()), + conn.target_addr); + return write_frame(hello); + }).then([this] { + //5. read peer HelloFrame + return read_main_preamble(); + }).then([this] (Tag tag) { + expect_tag(Tag::HELLO, tag, conn, __func__); + return read_frame_payload(); + }).then([this] { + // 6. 
process peer HelloFrame + auto hello = HelloFrame::Decode(rx_segments_data.back()); + logger().debug("{} GOT HelloFrame: my_type={} peer_addr={}", + conn, ceph_entity_type_name(hello.entity_type()), + hello.peer_addr()); + return seastar::make_ready_future<std::tuple<entity_type_t, entity_addr_t>>( + std::make_tuple(hello.entity_type(), hello.peer_addr())); + }); +} + +// CONNECTING state + +seastar::future<> ProtocolV2::handle_auth_reply() +{ + return read_main_preamble() + .then([this] (Tag tag) { + switch (tag) { + case Tag::AUTH_BAD_METHOD: + return read_frame_payload().then([this] { + // handle_auth_bad_method() logic + auto bad_method = AuthBadMethodFrame::Decode(rx_segments_data.back()); + logger().warn("{} GOT AuthBadMethodFrame: method={} result={}, " + "allowed_methods={}, allowed_modes={}", + conn, bad_method.method(), cpp_strerror(bad_method.result()), + bad_method.allowed_methods(), bad_method.allowed_modes()); + ceph_assert(messenger.get_auth_client()); + int r = messenger.get_auth_client()->handle_auth_bad_method( + conn.shared_from_this(), auth_meta, + bad_method.method(), bad_method.result(), + bad_method.allowed_methods(), bad_method.allowed_modes()); + if (r < 0) { + logger().warn("{} auth_client handle_auth_bad_method returned {}", + conn, r); + abort_in_fault(); + } + return client_auth(bad_method.allowed_methods()); + }); + case Tag::AUTH_REPLY_MORE: + return read_frame_payload().then([this] { + // handle_auth_reply_more() logic + auto auth_more = AuthReplyMoreFrame::Decode(rx_segments_data.back()); + logger().debug("{} GOT AuthReplyMoreFrame: payload_len={}", + conn, auth_more.auth_payload().length()); + ceph_assert(messenger.get_auth_client()); + // let execute_connecting() take care of the thrown exception + auto reply = messenger.get_auth_client()->handle_auth_reply_more( + conn.shared_from_this(), auth_meta, auth_more.auth_payload()); + auto more_reply = AuthRequestMoreFrame::Encode(reply); + logger().debug("{} WRITE AuthRequestMoreFrame: payload_len={}", + conn, reply.length()); + return write_frame(more_reply); + }).then([this] { + return handle_auth_reply(); + }); + case Tag::AUTH_DONE: + return read_frame_payload().then([this] { + // handle_auth_done() logic + auto auth_done = AuthDoneFrame::Decode(rx_segments_data.back()); + logger().debug("{} GOT AuthDoneFrame: gid={}, con_mode={}, payload_len={}", + conn, auth_done.global_id(), + ceph_con_mode_name(auth_done.con_mode()), + auth_done.auth_payload().length()); + ceph_assert(messenger.get_auth_client()); + int r = messenger.get_auth_client()->handle_auth_done( + conn.shared_from_this(), auth_meta, + auth_done.global_id(), + auth_done.con_mode(), + auth_done.auth_payload()); + if (r < 0) { + logger().warn("{} auth_client handle_auth_done returned {}", conn, r); + abort_in_fault(); + } + auth_meta->con_mode = auth_done.con_mode(); + session_stream_handlers = ceph::crypto::onwire::rxtx_t::create_handler_pair( + nullptr, *auth_meta, tx_frame_asm.get_is_rev1(), false); + return finish_auth(); + }); + default: { + unexpected_tag(tag, conn, __func__); + return seastar::now(); + } + } + }); +} + +seastar::future<> ProtocolV2::client_auth(std::vector<uint32_t> &allowed_methods) +{ + // send_auth_request() logic + ceph_assert(messenger.get_auth_client()); + + try { + auto [auth_method, preferred_modes, bl] = + messenger.get_auth_client()->get_auth_request(conn.shared_from_this(), auth_meta); + auth_meta->auth_method = auth_method; + auto frame = AuthRequestFrame::Encode(auth_method, preferred_modes, bl); + 
logger().debug("{} WRITE AuthRequestFrame: method={}," + " preferred_modes={}, payload_len={}", + conn, auth_method, preferred_modes, bl.length()); + return write_frame(frame).then([this] { + return handle_auth_reply(); + }); + } catch (const crimson::auth::error& e) { + logger().error("{} get_initial_auth_request returned {}", conn, e); + abort_in_close(*this, true); + return seastar::now(); + } +} + +seastar::future<ProtocolV2::next_step_t> +ProtocolV2::process_wait() +{ + return read_frame_payload().then([this] { + // handle_wait() logic + logger().debug("{} GOT WaitFrame", conn); + WaitFrame::Decode(rx_segments_data.back()); + return next_step_t::wait; + }); +} + +seastar::future<ProtocolV2::next_step_t> +ProtocolV2::client_connect() +{ + // send_client_ident() logic + uint64_t flags = 0; + if (conn.policy.lossy) { + flags |= CEPH_MSG_CONNECT_LOSSY; + } + + auto client_ident = ClientIdentFrame::Encode( + messenger.get_myaddrs(), + conn.target_addr, + messenger.get_myname().num(), + global_seq, + conn.policy.features_supported, + conn.policy.features_required | msgr2_required, flags, + client_cookie); + + logger().debug("{} WRITE ClientIdentFrame: addrs={}, target={}, gid={}," + " gs={}, features_supported={}, features_required={}," + " flags={}, cookie={}", + conn, messenger.get_myaddrs(), conn.target_addr, + messenger.get_myname().num(), global_seq, + conn.policy.features_supported, + conn.policy.features_required | msgr2_required, + flags, client_cookie); + return write_frame(client_ident).then([this] { + return read_main_preamble(); + }).then([this] (Tag tag) { + switch (tag) { + case Tag::IDENT_MISSING_FEATURES: + return read_frame_payload().then([this] { + // handle_ident_missing_features() logic + auto ident_missing = IdentMissingFeaturesFrame::Decode(rx_segments_data.back()); + logger().warn("{} GOT IdentMissingFeaturesFrame: features={}" + " (client does not support all server features)", + conn, ident_missing.features()); + abort_in_fault(); + return next_step_t::none; + }); + case Tag::WAIT: + return process_wait(); + case Tag::SERVER_IDENT: + return read_frame_payload().then([this] { + // handle_server_ident() logic + requeue_sent(); + auto server_ident = ServerIdentFrame::Decode(rx_segments_data.back()); + logger().debug("{} GOT ServerIdentFrame:" + " addrs={}, gid={}, gs={}," + " features_supported={}, features_required={}," + " flags={}, cookie={}", + conn, + server_ident.addrs(), server_ident.gid(), + server_ident.global_seq(), + server_ident.supported_features(), + server_ident.required_features(), + server_ident.flags(), server_ident.cookie()); + + // is this who we intended to talk to? + // be a bit forgiving here, since we may be connecting based on addresses parsed out + // of mon_host or something. 
+ if (!server_ident.addrs().contains(conn.target_addr)) { + logger().warn("{} peer identifies as {}, does not include {}", + conn, server_ident.addrs(), conn.target_addr); + throw std::system_error( + make_error_code(crimson::net::error::bad_peer_address)); + } + + server_cookie = server_ident.cookie(); + + // TODO: change peer_addr to entity_addrvec_t + if (server_ident.addrs().front() != conn.peer_addr) { + logger().warn("{} peer advertises as {}, does not match {}", + conn, server_ident.addrs(), conn.peer_addr); + throw std::system_error( + make_error_code(crimson::net::error::bad_peer_address)); + } + if (conn.get_peer_id() != entity_name_t::NEW && + conn.get_peer_id() != server_ident.gid()) { + logger().error("{} connection peer id ({}) does not match " + "what it should be ({}) during connecting, close", + conn, server_ident.gid(), conn.get_peer_id()); + abort_in_close(*this, true); + } + conn.set_peer_id(server_ident.gid()); + conn.set_features(server_ident.supported_features() & + conn.policy.features_supported); + peer_global_seq = server_ident.global_seq(); + + bool lossy = server_ident.flags() & CEPH_MSG_CONNECT_LOSSY; + if (lossy != conn.policy.lossy) { + logger().warn("{} UPDATE Policy(lossy={}) from server flags", conn, lossy); + conn.policy.lossy = lossy; + } + if (lossy && (connect_seq != 0 || server_cookie != 0)) { + logger().warn("{} UPDATE cs=0({}) sc=0({}) for lossy policy", + conn, connect_seq, server_cookie); + connect_seq = 0; + server_cookie = 0; + } + + return seastar::make_ready_future<next_step_t>(next_step_t::ready); + }); + default: { + unexpected_tag(tag, conn, "post_client_connect"); + return seastar::make_ready_future<next_step_t>(next_step_t::none); + } + } + }); +} + +seastar::future<ProtocolV2::next_step_t> +ProtocolV2::client_reconnect() +{ + // send_reconnect() logic + auto reconnect = ReconnectFrame::Encode(messenger.get_myaddrs(), + client_cookie, + server_cookie, + global_seq, + connect_seq, + conn.in_seq); + logger().debug("{} WRITE ReconnectFrame: addrs={}, client_cookie={}," + " server_cookie={}, gs={}, cs={}, msg_seq={}", + conn, messenger.get_myaddrs(), + client_cookie, server_cookie, + global_seq, connect_seq, conn.in_seq); + return write_frame(reconnect).then([this] { + return read_main_preamble(); + }).then([this] (Tag tag) { + switch (tag) { + case Tag::SESSION_RETRY_GLOBAL: + return read_frame_payload().then([this] { + // handle_session_retry_global() logic + auto retry = RetryGlobalFrame::Decode(rx_segments_data.back()); + logger().warn("{} GOT RetryGlobalFrame: gs={}", + conn, retry.global_seq()); + return messenger.get_global_seq(retry.global_seq()).then([this] (auto gs) { + global_seq = gs; + logger().warn("{} UPDATE: gs={} for retry global", conn, global_seq); + return client_reconnect(); + }); + }); + case Tag::SESSION_RETRY: + return read_frame_payload().then([this] { + // handle_session_retry() logic + auto retry = RetryFrame::Decode(rx_segments_data.back()); + logger().warn("{} GOT RetryFrame: cs={}", + conn, retry.connect_seq()); + connect_seq = retry.connect_seq() + 1; + logger().warn("{} UPDATE: cs={}", conn, connect_seq); + return client_reconnect(); + }); + case Tag::SESSION_RESET: + return read_frame_payload().then([this] { + // handle_session_reset() logic + auto reset = ResetFrame::Decode(rx_segments_data.back()); + logger().warn("{} GOT ResetFrame: full={}", conn, reset.full()); + reset_session(reset.full()); + return client_connect(); + }); + case Tag::WAIT: + return process_wait(); + case Tag::SESSION_RECONNECT_OK: + 
return read_frame_payload().then([this] { + // handle_reconnect_ok() logic + auto reconnect_ok = ReconnectOkFrame::Decode(rx_segments_data.back()); + logger().debug("{} GOT ReconnectOkFrame: msg_seq={}", + conn, reconnect_ok.msg_seq()); + requeue_up_to(reconnect_ok.msg_seq()); + return seastar::make_ready_future<next_step_t>(next_step_t::ready); + }); + default: { + unexpected_tag(tag, conn, "post_client_reconnect"); + return seastar::make_ready_future<next_step_t>(next_step_t::none); + } + } + }); +} + +void ProtocolV2::execute_connecting() +{ + trigger_state(state_t::CONNECTING, write_state_t::delay, true); + if (socket) { + socket->shutdown(); + } + gated_execute("execute_connecting", [this] { + return messenger.get_global_seq().then([this] (auto gs) { + global_seq = gs; + assert(client_cookie != 0); + if (!conn.policy.lossy && server_cookie != 0) { + ++connect_seq; + logger().debug("{} UPDATE: gs={}, cs={} for reconnect", + conn, global_seq, connect_seq); + } else { // conn.policy.lossy || server_cookie == 0 + assert(connect_seq == 0); + assert(server_cookie == 0); + logger().debug("{} UPDATE: gs={} for connect", conn, global_seq); + } + + return wait_write_exit(); + }).then([this] { + if (unlikely(state != state_t::CONNECTING)) { + logger().debug("{} triggered {} before Socket::connect()", + conn, get_state_name(state)); + abort_protocol(); + } + if (socket) { + gate.dispatch_in_background("close_sockect_connecting", *this, + [sock = std::move(socket)] () mutable { + return sock->close().then([sock = std::move(sock)] {}); + }); + } + INTERCEPT_N_RW(custom_bp_t::SOCKET_CONNECTING); + return Socket::connect(conn.peer_addr); + }).then([this](SocketRef sock) { + logger().debug("{} socket connected", conn); + if (unlikely(state != state_t::CONNECTING)) { + logger().debug("{} triggered {} during Socket::connect()", + conn, get_state_name(state)); + return sock->close().then([sock = std::move(sock)] { + abort_protocol(); + }); + } + socket = std::move(sock); + return seastar::now(); + }).then([this] { + auth_meta = seastar::make_lw_shared<AuthConnectionMeta>(); + session_stream_handlers = { nullptr, nullptr }; + enable_recording(); + return banner_exchange(true); + }).then([this] (auto&& ret) { + auto [_peer_type, _my_addr_from_peer] = std::move(ret); + if (conn.get_peer_type() != _peer_type) { + logger().warn("{} connection peer type does not match what peer advertises {} != {}", + conn, ceph_entity_type_name(conn.get_peer_type()), + ceph_entity_type_name(_peer_type)); + abort_in_close(*this, true); + } + if (unlikely(state != state_t::CONNECTING)) { + logger().debug("{} triggered {} during banner_exchange(), abort", + conn, get_state_name(state)); + abort_protocol(); + } + socket->learn_ephemeral_port_as_connector(_my_addr_from_peer.get_port()); + if (unlikely(_my_addr_from_peer.is_legacy())) { + logger().warn("{} peer sent a legacy address for me: {}", + conn, _my_addr_from_peer); + throw std::system_error( + make_error_code(crimson::net::error::bad_peer_address)); + } + _my_addr_from_peer.set_type(entity_addr_t::TYPE_MSGR2); + return messenger.learned_addr(_my_addr_from_peer, conn); + }).then([this] { + return client_auth(); + }).then([this] { + if (server_cookie == 0) { + ceph_assert(connect_seq == 0); + return client_connect(); + } else { + ceph_assert(connect_seq > 0); + return client_reconnect(); + } + }).then([this] (next_step_t next) { + if (unlikely(state != state_t::CONNECTING)) { + logger().debug("{} triggered {} at the end of execute_connecting()", + conn, 
get_state_name(state)); + abort_protocol(); + } + switch (next) { + case next_step_t::ready: { + logger().info("{} connected:" + " gs={}, pgs={}, cs={}, client_cookie={}," + " server_cookie={}, in_seq={}, out_seq={}, out_q={}", + conn, global_seq, peer_global_seq, connect_seq, + client_cookie, server_cookie, conn.in_seq, + conn.out_seq, conn.out_q.size()); + execute_ready(true); + break; + } + case next_step_t::wait: { + logger().info("{} execute_connecting(): going to WAIT", conn); + execute_wait(true); + break; + } + default: { + ceph_abort("impossible next step"); + } + } + }).handle_exception([this] (std::exception_ptr eptr) { + if (state != state_t::CONNECTING) { + logger().info("{} execute_connecting(): protocol aborted at {} -- {}", + conn, get_state_name(state), eptr); + assert(state == state_t::CLOSING || + state == state_t::REPLACING); + return; + } + + if (conn.policy.server || + (conn.policy.standby && + (!is_queued() && conn.sent.empty()))) { + logger().info("{} execute_connecting(): fault at {} with nothing to send," + " going to STANDBY -- {}", + conn, get_state_name(state), eptr); + execute_standby(); + } else { + logger().info("{} execute_connecting(): fault at {}, going to WAIT -- {}", + conn, get_state_name(state), eptr); + execute_wait(false); + } + }); + }); +} + +// ACCEPTING state + +seastar::future<> ProtocolV2::_auth_bad_method(int r) +{ + // _auth_bad_method() logic + ceph_assert(r < 0); + auto [allowed_methods, allowed_modes] = + messenger.get_auth_server()->get_supported_auth_methods(conn.get_peer_type()); + auto bad_method = AuthBadMethodFrame::Encode( + auth_meta->auth_method, r, allowed_methods, allowed_modes); + logger().warn("{} WRITE AuthBadMethodFrame: method={}, result={}, " + "allowed_methods={}, allowed_modes={})", + conn, auth_meta->auth_method, cpp_strerror(r), + allowed_methods, allowed_modes); + return write_frame(bad_method).then([this] { + return server_auth(); + }); +} + +seastar::future<> ProtocolV2::_handle_auth_request(bufferlist& auth_payload, bool more) +{ + // _handle_auth_request() logic + ceph_assert(messenger.get_auth_server()); + bufferlist reply; + int r = messenger.get_auth_server()->handle_auth_request( + conn.shared_from_this(), auth_meta, + more, auth_meta->auth_method, auth_payload, + &reply); + switch (r) { + // successful + case 1: { + auto auth_done = AuthDoneFrame::Encode( + conn.peer_global_id, auth_meta->con_mode, reply); + logger().debug("{} WRITE AuthDoneFrame: gid={}, con_mode={}, payload_len={}", + conn, conn.peer_global_id, + ceph_con_mode_name(auth_meta->con_mode), reply.length()); + return write_frame(auth_done).then([this] { + ceph_assert(auth_meta); + session_stream_handlers = ceph::crypto::onwire::rxtx_t::create_handler_pair( + nullptr, *auth_meta, tx_frame_asm.get_is_rev1(), true); + return finish_auth(); + }); + } + // auth more + case 0: { + auto more = AuthReplyMoreFrame::Encode(reply); + logger().debug("{} WRITE AuthReplyMoreFrame: payload_len={}", + conn, reply.length()); + return write_frame(more).then([this] { + return read_main_preamble(); + }).then([this] (Tag tag) { + expect_tag(Tag::AUTH_REQUEST_MORE, tag, conn, __func__); + return read_frame_payload(); + }).then([this] { + auto auth_more = AuthRequestMoreFrame::Decode(rx_segments_data.back()); + logger().debug("{} GOT AuthRequestMoreFrame: payload_len={}", + conn, auth_more.auth_payload().length()); + return _handle_auth_request(auth_more.auth_payload(), true); + }); + } + case -EBUSY: { + logger().warn("{} auth_server handle_auth_request returned 
-EBUSY", conn); + abort_in_fault(); + return seastar::now(); + } + default: { + logger().warn("{} auth_server handle_auth_request returned {}", conn, r); + return _auth_bad_method(r); + } + } +} + +seastar::future<> ProtocolV2::server_auth() +{ + return read_main_preamble() + .then([this] (Tag tag) { + expect_tag(Tag::AUTH_REQUEST, tag, conn, __func__); + return read_frame_payload(); + }).then([this] { + // handle_auth_request() logic + auto request = AuthRequestFrame::Decode(rx_segments_data.back()); + logger().debug("{} GOT AuthRequestFrame: method={}, preferred_modes={}," + " payload_len={}", + conn, request.method(), request.preferred_modes(), + request.auth_payload().length()); + auth_meta->auth_method = request.method(); + auth_meta->con_mode = messenger.get_auth_server()->pick_con_mode( + conn.get_peer_type(), auth_meta->auth_method, + request.preferred_modes()); + if (auth_meta->con_mode == CEPH_CON_MODE_UNKNOWN) { + logger().warn("{} auth_server pick_con_mode returned mode CEPH_CON_MODE_UNKNOWN", conn); + return _auth_bad_method(-EOPNOTSUPP); + } + return _handle_auth_request(request.auth_payload(), false); + }); +} + +bool ProtocolV2::validate_peer_name(const entity_name_t& peer_name) const +{ + auto my_peer_name = conn.get_peer_name(); + if (my_peer_name.type() != peer_name.type()) { + return false; + } + if (my_peer_name.num() != entity_name_t::NEW && + peer_name.num() != entity_name_t::NEW && + my_peer_name.num() != peer_name.num()) { + return false; + } + return true; +} + +seastar::future<ProtocolV2::next_step_t> +ProtocolV2::send_wait() +{ + auto wait = WaitFrame::Encode(); + logger().debug("{} WRITE WaitFrame", conn); + return write_frame(wait).then([] { + return next_step_t::wait; + }); +} + +seastar::future<ProtocolV2::next_step_t> +ProtocolV2::reuse_connection( + ProtocolV2* existing_proto, bool do_reset, + bool reconnect, uint64_t conn_seq, uint64_t msg_seq) +{ + existing_proto->trigger_replacing(reconnect, + do_reset, + std::move(socket), + std::move(auth_meta), + std::move(session_stream_handlers), + peer_global_seq, + client_cookie, + conn.get_peer_name(), + connection_features, + tx_frame_asm.get_is_rev1(), + rx_frame_asm.get_is_rev1(), + conn_seq, + msg_seq); +#ifdef UNIT_TESTS_BUILT + if (conn.interceptor) { + conn.interceptor->register_conn_replaced(conn); + } +#endif + // close this connection because all the necessary information is delivered + // to the exisiting connection, and jump to error handling code to abort the + // current state. 
+ abort_in_close(*this, false); + return seastar::make_ready_future<next_step_t>(next_step_t::none); +} + +seastar::future<ProtocolV2::next_step_t> +ProtocolV2::handle_existing_connection(SocketConnectionRef existing_conn) +{ + // handle_existing_connection() logic + ProtocolV2 *existing_proto = dynamic_cast<ProtocolV2*>( + existing_conn->protocol.get()); + ceph_assert(existing_proto); + logger().debug("{}(gs={}, pgs={}, cs={}, cc={}, sc={}) connecting," + " found existing {}(state={}, gs={}, pgs={}, cs={}, cc={}, sc={})", + conn, global_seq, peer_global_seq, connect_seq, + client_cookie, server_cookie, + existing_conn, get_state_name(existing_proto->state), + existing_proto->global_seq, + existing_proto->peer_global_seq, + existing_proto->connect_seq, + existing_proto->client_cookie, + existing_proto->server_cookie); + + if (!validate_peer_name(existing_conn->get_peer_name())) { + logger().error("{} server_connect: my peer_name doesn't match" + " the existing connection {}, abort", conn, existing_conn); + abort_in_fault(); + } + + if (existing_proto->state == state_t::REPLACING) { + logger().warn("{} server_connect: racing replace happened while" + " replacing existing connection {}, send wait.", + conn, *existing_conn); + return send_wait(); + } + + if (existing_proto->peer_global_seq > peer_global_seq) { + logger().warn("{} server_connect:" + " this is a stale connection, because peer_global_seq({})" + " < existing->peer_global_seq({}), close this connection" + " in favor of existing connection {}", + conn, peer_global_seq, + existing_proto->peer_global_seq, *existing_conn); + abort_in_fault(); + } + + if (existing_conn->policy.lossy) { + // existing connection can be thrown out in favor of this one + logger().warn("{} server_connect:" + " existing connection {} is a lossy channel. Close existing in favor of" + " this connection", conn, *existing_conn); + execute_establishing(existing_conn, true); + return seastar::make_ready_future<next_step_t>(next_step_t::ready); + } + + if (existing_proto->server_cookie != 0) { + if (existing_proto->client_cookie != client_cookie) { + // Found previous session + // peer has reset and we're going to reuse the existing connection + // by replacing the socket + logger().warn("{} server_connect:" + " found new session (cs={})" + " when existing {} is with stale session (cs={}, ss={})," + " peer must have reset", + conn, client_cookie, + *existing_conn, existing_proto->client_cookie, + existing_proto->server_cookie); + return reuse_connection(existing_proto, conn.policy.resetcheck); + } else { + // session establishment interrupted between client_ident and server_ident, + // continuing... + logger().warn("{} server_connect: found client session with existing {}" + " matched (cs={}, ss={}), continuing session establishment", + conn, *existing_conn, client_cookie, existing_proto->server_cookie); + return reuse_connection(existing_proto); + } + } else { + // Looks like a connection race: server and client are both connecting to + // each other at the same time. 
+ if (existing_proto->client_cookie != client_cookie) { + if (existing_conn->peer_wins()) { + logger().warn("{} server_connect: connection race detected (cs={}, e_cs={}, ss=0)" + " and win, reusing existing {}", + conn, client_cookie, existing_proto->client_cookie, *existing_conn); + return reuse_connection(existing_proto); + } else { + logger().warn("{} server_connect: connection race detected (cs={}, e_cs={}, ss=0)" + " and lose to existing {}, ask client to wait", + conn, client_cookie, existing_proto->client_cookie, *existing_conn); + return existing_conn->keepalive().then([this] { + return send_wait(); + }); + } + } else { + logger().warn("{} server_connect: found client session with existing {}" + " matched (cs={}, ss={}), continuing session establishment", + conn, *existing_conn, client_cookie, existing_proto->server_cookie); + return reuse_connection(existing_proto); + } + } +} + +seastar::future<ProtocolV2::next_step_t> +ProtocolV2::server_connect() +{ + return read_frame_payload().then([this] { + // handle_client_ident() logic + auto client_ident = ClientIdentFrame::Decode(rx_segments_data.back()); + logger().debug("{} GOT ClientIdentFrame: addrs={}, target={}," + " gid={}, gs={}, features_supported={}," + " features_required={}, flags={}, cookie={}", + conn, client_ident.addrs(), client_ident.target_addr(), + client_ident.gid(), client_ident.global_seq(), + client_ident.supported_features(), + client_ident.required_features(), + client_ident.flags(), client_ident.cookie()); + + if (client_ident.addrs().empty() || + client_ident.addrs().front() == entity_addr_t()) { + logger().warn("{} oops, client_ident.addrs() is empty", conn); + throw std::system_error( + make_error_code(crimson::net::error::bad_peer_address)); + } + if (!messenger.get_myaddrs().contains(client_ident.target_addr())) { + logger().warn("{} peer is trying to reach {} which is not us ({})", + conn, client_ident.target_addr(), messenger.get_myaddrs()); + throw std::system_error( + make_error_code(crimson::net::error::bad_peer_address)); + } + // TODO: change peer_addr to entity_addrvec_t + entity_addr_t paddr = client_ident.addrs().front(); + if ((paddr.is_msgr2() || paddr.is_any()) && + paddr.is_same_host(conn.target_addr)) { + // good + } else { + logger().warn("{} peer's address {} is not v2 or not the same host with {}", + conn, paddr, conn.target_addr); + throw std::system_error( + make_error_code(crimson::net::error::bad_peer_address)); + } + conn.peer_addr = paddr; + logger().debug("{} UPDATE: peer_addr={}", conn, conn.peer_addr); + conn.target_addr = conn.peer_addr; + if (!conn.policy.lossy && !conn.policy.server && conn.target_addr.get_port() <= 0) { + logger().warn("{} we don't know how to reconnect to peer {}", + conn, conn.target_addr); + throw std::system_error( + make_error_code(crimson::net::error::bad_peer_address)); + } + + if (conn.get_peer_id() != entity_name_t::NEW && + conn.get_peer_id() != client_ident.gid()) { + logger().error("{} client_ident peer_id ({}) does not match" + " what it should be ({}) during accepting, abort", + conn, client_ident.gid(), conn.get_peer_id()); + abort_in_fault(); + } + conn.set_peer_id(client_ident.gid()); + client_cookie = client_ident.cookie(); + + uint64_t feat_missing = + (conn.policy.features_required | msgr2_required) & + ~(uint64_t)client_ident.supported_features(); + if (feat_missing) { + auto ident_missing_features = IdentMissingFeaturesFrame::Encode(feat_missing); + logger().warn("{} WRITE IdentMissingFeaturesFrame: features={} (peer missing)", + conn, 
feat_missing); + return write_frame(ident_missing_features).then([] { + return next_step_t::wait; + }); + } + connection_features = + client_ident.supported_features() & conn.policy.features_supported; + logger().debug("{} UPDATE: connection_features={}", conn, connection_features); + + peer_global_seq = client_ident.global_seq(); + + // Looks good so far, let's check if there is already an existing connection + // to this peer. + + SocketConnectionRef existing_conn = messenger.lookup_conn(conn.peer_addr); + + if (existing_conn) { + if (existing_conn->protocol->proto_type != proto_t::v2) { + logger().warn("{} existing connection {} proto version is {}, close existing", + conn, *existing_conn, + static_cast<int>(existing_conn->protocol->proto_type)); + // should unregister the existing from msgr atomically + // NOTE: this is following async messenger logic, but we may miss the reset event. + execute_establishing(existing_conn, false); + return seastar::make_ready_future<next_step_t>(next_step_t::ready); + } else { + return handle_existing_connection(existing_conn); + } + } else { + execute_establishing(nullptr, true); + return seastar::make_ready_future<next_step_t>(next_step_t::ready); + } + }); +} + +seastar::future<ProtocolV2::next_step_t> +ProtocolV2::read_reconnect() +{ + return read_main_preamble() + .then([this] (Tag tag) { + expect_tag(Tag::SESSION_RECONNECT, tag, conn, "read_reconnect"); + return server_reconnect(); + }); +} + +seastar::future<ProtocolV2::next_step_t> +ProtocolV2::send_retry(uint64_t connect_seq) +{ + auto retry = RetryFrame::Encode(connect_seq); + logger().warn("{} WRITE RetryFrame: cs={}", conn, connect_seq); + return write_frame(retry).then([this] { + return read_reconnect(); + }); +} + +seastar::future<ProtocolV2::next_step_t> +ProtocolV2::send_retry_global(uint64_t global_seq) +{ + auto retry = RetryGlobalFrame::Encode(global_seq); + logger().warn("{} WRITE RetryGlobalFrame: gs={}", conn, global_seq); + return write_frame(retry).then([this] { + return read_reconnect(); + }); +} + +seastar::future<ProtocolV2::next_step_t> +ProtocolV2::send_reset(bool full) +{ + auto reset = ResetFrame::Encode(full); + logger().warn("{} WRITE ResetFrame: full={}", conn, full); + return write_frame(reset).then([this] { + return read_main_preamble(); + }).then([this] (Tag tag) { + expect_tag(Tag::CLIENT_IDENT, tag, conn, "post_send_reset"); + return server_connect(); + }); +} + +seastar::future<ProtocolV2::next_step_t> +ProtocolV2::server_reconnect() +{ + return read_frame_payload().then([this] { + // handle_reconnect() logic + auto reconnect = ReconnectFrame::Decode(rx_segments_data.back()); + + logger().debug("{} GOT ReconnectFrame: addrs={}, client_cookie={}," + " server_cookie={}, gs={}, cs={}, msg_seq={}", + conn, reconnect.addrs(), + reconnect.client_cookie(), reconnect.server_cookie(), + reconnect.global_seq(), reconnect.connect_seq(), + reconnect.msg_seq()); + + // can peer_addrs be changed on-the-fly? 
+ // TODO: change peer_addr to entity_addrvec_t + entity_addr_t paddr = reconnect.addrs().front(); + if (paddr.is_msgr2() || paddr.is_any()) { + // good + } else { + logger().warn("{} peer's address {} is not v2", conn, paddr); + throw std::system_error( + make_error_code(crimson::net::error::bad_peer_address)); + } + if (conn.peer_addr == entity_addr_t()) { + conn.peer_addr = paddr; + } else if (conn.peer_addr != paddr) { + logger().error("{} peer identifies as {}, while conn.peer_addr={}," + " reconnect failed", + conn, paddr, conn.peer_addr); + throw std::system_error( + make_error_code(crimson::net::error::bad_peer_address)); + } + peer_global_seq = reconnect.global_seq(); + + SocketConnectionRef existing_conn = messenger.lookup_conn(conn.peer_addr); + + if (!existing_conn) { + // there is no existing connection therefore cannot reconnect to previous + // session + logger().warn("{} server_reconnect: no existing connection from address {}," + " reseting client", conn, conn.peer_addr); + return send_reset(true); + } + + if (existing_conn->protocol->proto_type != proto_t::v2) { + logger().warn("{} server_reconnect: existing connection {} proto version is {}," + "close existing and reset client.", + conn, *existing_conn, + static_cast<int>(existing_conn->protocol->proto_type)); + // NOTE: this is following async messenger logic, but we may miss the reset event. + existing_conn->mark_down(); + return send_reset(true); + } + + ProtocolV2 *existing_proto = dynamic_cast<ProtocolV2*>( + existing_conn->protocol.get()); + ceph_assert(existing_proto); + logger().debug("{}(gs={}, pgs={}, cs={}, cc={}, sc={}) re-connecting," + " found existing {}(state={}, gs={}, pgs={}, cs={}, cc={}, sc={})", + conn, global_seq, peer_global_seq, reconnect.connect_seq(), + reconnect.client_cookie(), reconnect.server_cookie(), + existing_conn, + get_state_name(existing_proto->state), + existing_proto->global_seq, + existing_proto->peer_global_seq, + existing_proto->connect_seq, + existing_proto->client_cookie, + existing_proto->server_cookie); + + if (!validate_peer_name(existing_conn->get_peer_name())) { + logger().error("{} server_reconnect: my peer_name doesn't match" + " the existing connection {}, abort", conn, existing_conn); + abort_in_fault(); + } + + if (existing_proto->state == state_t::REPLACING) { + logger().warn("{} server_reconnect: racing replace happened while " + " replacing existing connection {}, retry global.", + conn, *existing_conn); + return send_retry_global(existing_proto->peer_global_seq); + } + + if (existing_proto->client_cookie != reconnect.client_cookie()) { + logger().warn("{} server_reconnect:" + " client_cookie mismatch with existing connection {}," + " cc={} rcc={}. I must have reset, reseting client.", + conn, *existing_conn, + existing_proto->client_cookie, reconnect.client_cookie()); + return send_reset(conn.policy.resetcheck); + } else if (existing_proto->server_cookie == 0) { + // this happens when: + // - a connects to b + // - a sends client_ident + // - b gets client_ident, sends server_ident and sets cookie X + // - connection fault + // - b reconnects to a with cookie X, connect_seq=1 + // - a has cookie==0 + logger().warn("{} server_reconnect: I was a client (cc={}) and didn't received the" + " server_ident with existing connection {}." 
+ " Asking peer to resume session establishment", + conn, existing_proto->client_cookie, *existing_conn); + return send_reset(false); + } + + if (existing_proto->peer_global_seq > reconnect.global_seq()) { + logger().warn("{} server_reconnect: stale global_seq: exist_pgs({}) > peer_gs({})," + " with existing connection {}," + " ask client to retry global", + conn, existing_proto->peer_global_seq, + reconnect.global_seq(), *existing_conn); + return send_retry_global(existing_proto->peer_global_seq); + } + + if (existing_proto->connect_seq > reconnect.connect_seq()) { + logger().warn("{} server_reconnect: stale peer connect_seq peer_cs({}) < exist_cs({})," + " with existing connection {}, ask client to retry", + conn, reconnect.connect_seq(), + existing_proto->connect_seq, *existing_conn); + return send_retry(existing_proto->connect_seq); + } else if (existing_proto->connect_seq == reconnect.connect_seq()) { + // reconnect race: both peers are sending reconnect messages + if (existing_conn->peer_wins()) { + logger().warn("{} server_reconnect: reconnect race detected (cs={})" + " and win, reusing existing {}", + conn, reconnect.connect_seq(), *existing_conn); + return reuse_connection( + existing_proto, false, + true, reconnect.connect_seq(), reconnect.msg_seq()); + } else { + logger().warn("{} server_reconnect: reconnect race detected (cs={})" + " and lose to existing {}, ask client to wait", + conn, reconnect.connect_seq(), *existing_conn); + return send_wait(); + } + } else { // existing_proto->connect_seq < reconnect.connect_seq() + logger().warn("{} server_reconnect: stale exsiting connect_seq exist_cs({}) < peer_cs({})," + " reusing existing {}", + conn, existing_proto->connect_seq, + reconnect.connect_seq(), *existing_conn); + return reuse_connection( + existing_proto, false, + true, reconnect.connect_seq(), reconnect.msg_seq()); + } + }); +} + +void ProtocolV2::execute_accepting() +{ + trigger_state(state_t::ACCEPTING, write_state_t::none, false); + gate.dispatch_in_background("execute_accepting", *this, [this] { + return seastar::futurize_invoke([this] { + INTERCEPT_N_RW(custom_bp_t::SOCKET_ACCEPTED); + auth_meta = seastar::make_lw_shared<AuthConnectionMeta>(); + session_stream_handlers = { nullptr, nullptr }; + enable_recording(); + return banner_exchange(false); + }).then([this] (auto&& ret) { + auto [_peer_type, _my_addr_from_peer] = std::move(ret); + ceph_assert(conn.get_peer_type() == 0); + conn.set_peer_type(_peer_type); + + conn.policy = messenger.get_policy(_peer_type); + logger().info("{} UPDATE: peer_type={}," + " policy(lossy={} server={} standby={} resetcheck={})", + conn, ceph_entity_type_name(_peer_type), + conn.policy.lossy, conn.policy.server, + conn.policy.standby, conn.policy.resetcheck); + if (messenger.get_myaddr().get_port() != _my_addr_from_peer.get_port() || + messenger.get_myaddr().get_nonce() != _my_addr_from_peer.get_nonce()) { + logger().warn("{} my_addr_from_peer {} port/nonce doesn't match myaddr {}", + conn, _my_addr_from_peer, messenger.get_myaddr()); + throw std::system_error( + make_error_code(crimson::net::error::bad_peer_address)); + } + return messenger.learned_addr(_my_addr_from_peer, conn); + }).then([this] { + return server_auth(); + }).then([this] { + return read_main_preamble(); + }).then([this] (Tag tag) { + switch (tag) { + case Tag::CLIENT_IDENT: + return server_connect(); + case Tag::SESSION_RECONNECT: + return server_reconnect(); + default: { + unexpected_tag(tag, conn, "post_server_auth"); + return 
seastar::make_ready_future<next_step_t>(next_step_t::none); + } + } + }).then([this] (next_step_t next) { + switch (next) { + case next_step_t::ready: + assert(state != state_t::ACCEPTING); + break; + case next_step_t::wait: + if (unlikely(state != state_t::ACCEPTING)) { + logger().debug("{} triggered {} at the end of execute_accepting()", + conn, get_state_name(state)); + abort_protocol(); + } + logger().info("{} execute_accepting(): going to SERVER_WAIT", conn); + execute_server_wait(); + break; + default: + ceph_abort("impossible next step"); + } + }).handle_exception([this] (std::exception_ptr eptr) { + logger().info("{} execute_accepting(): fault at {}, going to CLOSING -- {}", + conn, get_state_name(state), eptr); + close(false); + }); + }); +} + +// CONNECTING or ACCEPTING state + +seastar::future<> ProtocolV2::finish_auth() +{ + ceph_assert(auth_meta); + + const auto sig = auth_meta->session_key.empty() ? sha256_digest_t() : + auth_meta->session_key.hmac_sha256(nullptr, rxbuf); + auto sig_frame = AuthSignatureFrame::Encode(sig); + ceph_assert(record_io); + record_io = false; + rxbuf.clear(); + logger().debug("{} WRITE AuthSignatureFrame: signature={}", conn, sig); + return write_frame(sig_frame).then([this] { + return read_main_preamble(); + }).then([this] (Tag tag) { + expect_tag(Tag::AUTH_SIGNATURE, tag, conn, "post_finish_auth"); + return read_frame_payload(); + }).then([this] { + // handle_auth_signature() logic + auto sig_frame = AuthSignatureFrame::Decode(rx_segments_data.back()); + logger().debug("{} GOT AuthSignatureFrame: signature={}", conn, sig_frame.signature()); + + const auto actual_tx_sig = auth_meta->session_key.empty() ? + sha256_digest_t() : auth_meta->session_key.hmac_sha256(nullptr, txbuf); + if (sig_frame.signature() != actual_tx_sig) { + logger().warn("{} pre-auth signature mismatch actual_tx_sig={}" + " sig_frame.signature()={}", + conn, actual_tx_sig, sig_frame.signature()); + abort_in_fault(); + } + txbuf.clear(); + }); +} + +// ESTABLISHING + +void ProtocolV2::execute_establishing( + SocketConnectionRef existing_conn, bool dispatch_reset) { + if (unlikely(state != state_t::ACCEPTING)) { + logger().debug("{} triggered {} before execute_establishing()", + conn, get_state_name(state)); + abort_protocol(); + } + + auto accept_me = [this] { + messenger.register_conn( + seastar::static_pointer_cast<SocketConnection>( + conn.shared_from_this())); + messenger.unaccept_conn( + seastar::static_pointer_cast<SocketConnection>( + conn.shared_from_this())); + }; + + trigger_state(state_t::ESTABLISHING, write_state_t::delay, false); + if (existing_conn) { + existing_conn->protocol->close(dispatch_reset, std::move(accept_me)); + if (unlikely(state != state_t::ESTABLISHING)) { + logger().warn("{} triggered {} during execute_establishing(), " + "the accept event will not be delivered!", + conn, get_state_name(state)); + abort_protocol(); + } + } else { + accept_me(); + } + + dispatchers.ms_handle_accept( + seastar::static_pointer_cast<SocketConnection>(conn.shared_from_this())); + + gated_execute("execute_establishing", [this] { + return seastar::futurize_invoke([this] { + return send_server_ident(); + }).then([this] { + if (unlikely(state != state_t::ESTABLISHING)) { + logger().debug("{} triggered {} at the end of execute_establishing()", + conn, get_state_name(state)); + abort_protocol(); + } + logger().info("{} established: gs={}, pgs={}, cs={}, client_cookie={}," + " server_cookie={}, in_seq={}, out_seq={}, out_q={}", + conn, global_seq, peer_global_seq, connect_seq, + 
client_cookie, server_cookie, conn.in_seq, + conn.out_seq, conn.out_q.size()); + execute_ready(false); + }).handle_exception([this] (std::exception_ptr eptr) { + if (state != state_t::ESTABLISHING) { + logger().info("{} execute_establishing() protocol aborted at {} -- {}", + conn, get_state_name(state), eptr); + assert(state == state_t::CLOSING || + state == state_t::REPLACING); + return; + } + fault(false, "execute_establishing()", eptr); + }); + }); +} + +// ESTABLISHING or REPLACING state + +seastar::future<> +ProtocolV2::send_server_ident() +{ + // send_server_ident() logic + + // refered to async-conn v2: not assign gs to global_seq + return messenger.get_global_seq().then([this] (auto gs) { + logger().debug("{} UPDATE: gs={} for server ident", conn, global_seq); + + // this is required for the case when this connection is being replaced + requeue_up_to(0); + conn.in_seq = 0; + + if (!conn.policy.lossy) { + server_cookie = ceph::util::generate_random_number<uint64_t>(1, -1ll); + } + + uint64_t flags = 0; + if (conn.policy.lossy) { + flags = flags | CEPH_MSG_CONNECT_LOSSY; + } + + auto server_ident = ServerIdentFrame::Encode( + messenger.get_myaddrs(), + messenger.get_myname().num(), + gs, + conn.policy.features_supported, + conn.policy.features_required | msgr2_required, + flags, + server_cookie); + + logger().debug("{} WRITE ServerIdentFrame: addrs={}, gid={}," + " gs={}, features_supported={}, features_required={}," + " flags={}, cookie={}", + conn, messenger.get_myaddrs(), messenger.get_myname().num(), + gs, conn.policy.features_supported, + conn.policy.features_required | msgr2_required, + flags, server_cookie); + + conn.set_features(connection_features); + + return write_frame(server_ident); + }); +} + +// REPLACING state + +void ProtocolV2::trigger_replacing(bool reconnect, + bool do_reset, + SocketRef&& new_socket, + AuthConnectionMetaRef&& new_auth_meta, + ceph::crypto::onwire::rxtx_t new_rxtx, + uint64_t new_peer_global_seq, + uint64_t new_client_cookie, + entity_name_t new_peer_name, + uint64_t new_conn_features, + bool tx_is_rev1, + bool rx_is_rev1, + uint64_t new_connect_seq, + uint64_t new_msg_seq) +{ + trigger_state(state_t::REPLACING, write_state_t::delay, false); + if (socket) { + socket->shutdown(); + } + dispatchers.ms_handle_accept( + seastar::static_pointer_cast<SocketConnection>(conn.shared_from_this())); + gate.dispatch_in_background("trigger_replacing", *this, + [this, + reconnect, + do_reset, + new_socket = std::move(new_socket), + new_auth_meta = std::move(new_auth_meta), + new_rxtx = std::move(new_rxtx), + tx_is_rev1, rx_is_rev1, + new_client_cookie, new_peer_name, + new_conn_features, new_peer_global_seq, + new_connect_seq, new_msg_seq] () mutable { + return wait_write_exit().then([this, do_reset] { + if (do_reset) { + reset_session(true); + } + protocol_timer.cancel(); + return execution_done.get_future(); + }).then([this, + reconnect, + new_socket = std::move(new_socket), + new_auth_meta = std::move(new_auth_meta), + new_rxtx = std::move(new_rxtx), + tx_is_rev1, rx_is_rev1, + new_client_cookie, new_peer_name, + new_conn_features, new_peer_global_seq, + new_connect_seq, new_msg_seq] () mutable { + if (unlikely(state != state_t::REPLACING)) { + return new_socket->close().then([sock = std::move(new_socket)] { + abort_protocol(); + }); + } + + if (socket) { + gate.dispatch_in_background("close_socket_replacing", *this, + [sock = std::move(socket)] () mutable { + return sock->close().then([sock = std::move(sock)] {}); + }); + } + socket = std::move(new_socket); 
+ auth_meta = std::move(new_auth_meta); + session_stream_handlers = std::move(new_rxtx); + record_io = false; + peer_global_seq = new_peer_global_seq; + + if (reconnect) { + connect_seq = new_connect_seq; + // send_reconnect_ok() logic + requeue_up_to(new_msg_seq); + auto reconnect_ok = ReconnectOkFrame::Encode(conn.in_seq); + logger().debug("{} WRITE ReconnectOkFrame: msg_seq={}", conn, conn.in_seq); + return write_frame(reconnect_ok); + } else { + client_cookie = new_client_cookie; + assert(conn.get_peer_type() == new_peer_name.type()); + if (conn.get_peer_id() == entity_name_t::NEW) { + conn.set_peer_id(new_peer_name.num()); + } + connection_features = new_conn_features; + tx_frame_asm.set_is_rev1(tx_is_rev1); + rx_frame_asm.set_is_rev1(rx_is_rev1); + return send_server_ident(); + } + }).then([this, reconnect] { + if (unlikely(state != state_t::REPLACING)) { + logger().debug("{} triggered {} at the end of trigger_replacing()", + conn, get_state_name(state)); + abort_protocol(); + } + logger().info("{} replaced ({}):" + " gs={}, pgs={}, cs={}, client_cookie={}, server_cookie={}," + " in_seq={}, out_seq={}, out_q={}", + conn, reconnect ? "reconnected" : "connected", + global_seq, peer_global_seq, connect_seq, client_cookie, + server_cookie, conn.in_seq, conn.out_seq, conn.out_q.size()); + execute_ready(false); + }).handle_exception([this] (std::exception_ptr eptr) { + if (state != state_t::REPLACING) { + logger().info("{} trigger_replacing(): protocol aborted at {} -- {}", + conn, get_state_name(state), eptr); + assert(state == state_t::CLOSING); + return; + } + fault(true, "trigger_replacing()", eptr); + }); + }); +} + +// READY state + +ceph::bufferlist ProtocolV2::do_sweep_messages( + const std::deque<MessageRef>& msgs, + size_t num_msgs, + bool require_keepalive, + std::optional<utime_t> _keepalive_ack, + bool require_ack) +{ + ceph::bufferlist bl; + + if (unlikely(require_keepalive)) { + auto keepalive_frame = KeepAliveFrame::Encode(); + bl.append(keepalive_frame.get_buffer(tx_frame_asm)); + INTERCEPT_FRAME(ceph::msgr::v2::Tag::KEEPALIVE2, bp_type_t::WRITE); + } + + if (unlikely(_keepalive_ack.has_value())) { + auto keepalive_ack_frame = KeepAliveFrameAck::Encode(*_keepalive_ack); + bl.append(keepalive_ack_frame.get_buffer(tx_frame_asm)); + INTERCEPT_FRAME(ceph::msgr::v2::Tag::KEEPALIVE2_ACK, bp_type_t::WRITE); + } + + if (require_ack && !num_msgs) { + auto ack_frame = AckFrame::Encode(conn.in_seq); + bl.append(ack_frame.get_buffer(tx_frame_asm)); + INTERCEPT_FRAME(ceph::msgr::v2::Tag::ACK, bp_type_t::WRITE); + } + + std::for_each(msgs.begin(), msgs.begin()+num_msgs, [this, &bl](const MessageRef& msg) { + // TODO: move to common code + // set priority + msg->get_header().src = messenger.get_myname(); + + msg->encode(conn.features, 0); + + ceph_assert(!msg->get_seq() && "message already has seq"); + msg->set_seq(++conn.out_seq); + + ceph_msg_header &header = msg->get_header(); + ceph_msg_footer &footer = msg->get_footer(); + + ceph_msg_header2 header2{header.seq, header.tid, + header.type, header.priority, + header.version, + init_le32(0), header.data_off, + init_le64(conn.in_seq), + footer.flags, header.compat_version, + header.reserved}; + + auto message = MessageFrame::Encode(header2, + msg->get_payload(), msg->get_middle(), msg->get_data()); + logger().debug("{} --> #{} === {} ({})", + conn, msg->get_seq(), *msg, msg->get_type()); + bl.append(message.get_buffer(tx_frame_asm)); + INTERCEPT_FRAME(ceph::msgr::v2::Tag::MESSAGE, bp_type_t::WRITE); + }); + + return bl; +} + 
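A minimal standalone sketch of the per-sweep frame ordering produced by do_sweep_messages() above (illustrative only, not part of this patch; the helper name sweep_frame_order and the string tags are hypothetical):

#include <cstddef>
#include <string>
#include <vector>

// Mirrors the append order above: an optional KEEPALIVE2, then an optional
// KEEPALIVE2_ACK, a bare ACK only when no messages are queued (each MESSAGE
// frame already carries ack_seq = conn.in_seq in its header2), and finally
// one MESSAGE frame per swept message.
std::vector<std::string> sweep_frame_order(bool require_keepalive,
                                           bool has_keepalive_ack,
                                           bool require_ack,
                                           std::size_t num_msgs)
{
  std::vector<std::string> frames;
  if (require_keepalive) {
    frames.push_back("KEEPALIVE2");
  }
  if (has_keepalive_ack) {
    frames.push_back("KEEPALIVE2_ACK");
  }
  if (require_ack && num_msgs == 0) {
    frames.push_back("ACK");
  }
  for (std::size_t i = 0; i < num_msgs; ++i) {
    frames.push_back("MESSAGE");
  }
  return frames;
}

For example, a sweep with one queued message and a pending keepalive ack yields {KEEPALIVE2_ACK, MESSAGE}, while a sweep with no messages but require_ack set emits a single ACK frame.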
+seastar::future<> ProtocolV2::read_message(utime_t throttle_stamp) +{ + return read_frame_payload() + .then([this, throttle_stamp] { + utime_t recv_stamp{seastar::lowres_system_clock::now()}; + + // we need to get the size before std::moving segments data + const size_t cur_msg_size = get_current_msg_size(); + auto msg_frame = MessageFrame::Decode(rx_segments_data); + // XXX: paranoid copy just to avoid oops + ceph_msg_header2 current_header = msg_frame.header(); + + logger().trace("{} got {} + {} + {} byte message," + " envelope type={} src={} off={} seq={}", + conn, msg_frame.front_len(), msg_frame.middle_len(), + msg_frame.data_len(), current_header.type, conn.get_peer_name(), + current_header.data_off, current_header.seq); + + ceph_msg_header header{current_header.seq, + current_header.tid, + current_header.type, + current_header.priority, + current_header.version, + init_le32(msg_frame.front_len()), + init_le32(msg_frame.middle_len()), + init_le32(msg_frame.data_len()), + current_header.data_off, + conn.get_peer_name(), + current_header.compat_version, + current_header.reserved, + init_le32(0)}; + ceph_msg_footer footer{init_le32(0), init_le32(0), + init_le32(0), init_le64(0), current_header.flags}; + + auto conn_ref = seastar::static_pointer_cast<SocketConnection>( + conn.shared_from_this()); + Message *message = decode_message(nullptr, 0, header, footer, + msg_frame.front(), msg_frame.middle(), msg_frame.data(), conn_ref); + if (!message) { + logger().warn("{} decode message failed", conn); + abort_in_fault(); + } + + // store reservation size in message, so we don't get confused + // by messages entering the dispatch queue through other paths. + message->set_dispatch_throttle_size(cur_msg_size); + + message->set_throttle_stamp(throttle_stamp); + message->set_recv_stamp(recv_stamp); + message->set_recv_complete_stamp(utime_t{seastar::lowres_system_clock::now()}); + + // check received seq#. if it is old, drop the message. + // note that incoming messages may skip ahead. this is convenient for the + // client side queueing because messages can't be renumbered, but the (kernel) + // client will occasionally pull a message out of the sent queue to send + // elsewhere. in that case it doesn't matter if we "got" it or not. + uint64_t cur_seq = conn.in_seq; + if (message->get_seq() <= cur_seq) { + logger().error("{} got old message {} <= {} {}, discarding", + conn, message->get_seq(), cur_seq, *message); + if (HAVE_FEATURE(conn.features, RECONNECT_SEQ) && + local_conf()->ms_die_on_old_message) { + ceph_assert(0 == "old msgs despite reconnect_seq feature"); + } + return seastar::now(); + } else if (message->get_seq() > cur_seq + 1) { + logger().error("{} missed message? skipped from seq {} to {}", + conn, cur_seq, message->get_seq()); + if (local_conf()->ms_die_on_skipped_message) { + ceph_assert(0 == "skipped incoming seq"); + } + } + + // note last received message. 
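  // A condensed view of the sequence-number checks just above (summary added
  // for clarity; not in the original patch):
  //   seq <= in_seq      old or duplicate message: discarded; fatal only when
  //                      the peer supports RECONNECT_SEQ and
  //                      ms_die_on_old_message is set
  //   seq == in_seq + 1  the expected next message: accepted
  //   seq >  in_seq + 1  messages were skipped: logged, fatal only when
  //                      ms_die_on_skipped_message is set, otherwise accepted
  // Both accepted cases fall through to the in_seq update below.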
+ conn.in_seq = message->get_seq(); + logger().debug("{} <== #{} === {} ({})", + conn, message->get_seq(), *message, message->get_type()); + notify_ack(); + ack_writes(current_header.ack_seq); + + // TODO: change MessageRef with seastar::shared_ptr + auto msg_ref = MessageRef{message, false}; + // throttle the reading process by the returned future + return dispatchers.ms_dispatch(conn_ref, std::move(msg_ref)); + }); +} + +void ProtocolV2::execute_ready(bool dispatch_connect) +{ + assert(conn.policy.lossy || (client_cookie != 0 && server_cookie != 0)); + trigger_state(state_t::READY, write_state_t::open, false); + if (dispatch_connect) { + dispatchers.ms_handle_connect( + seastar::static_pointer_cast<SocketConnection>(conn.shared_from_this())); + } +#ifdef UNIT_TESTS_BUILT + if (conn.interceptor) { + conn.interceptor->register_conn_ready(conn); + } +#endif + gated_execute("execute_ready", [this] { + protocol_timer.cancel(); + return seastar::keep_doing([this] { + return read_main_preamble() + .then([this] (Tag tag) { + switch (tag) { + case Tag::MESSAGE: { + return seastar::futurize_invoke([this] { + // throttle_message() logic + if (!conn.policy.throttler_messages) { + return seastar::now(); + } + // TODO: message throttler + ceph_assert(false); + return seastar::now(); + }).then([this] { + // throttle_bytes() logic + if (!conn.policy.throttler_bytes) { + return seastar::now(); + } + size_t cur_msg_size = get_current_msg_size(); + if (!cur_msg_size) { + return seastar::now(); + } + logger().trace("{} wants {} bytes from policy throttler {}/{}", + conn, cur_msg_size, + conn.policy.throttler_bytes->get_current(), + conn.policy.throttler_bytes->get_max()); + return conn.policy.throttler_bytes->get(cur_msg_size); + }).then([this] { + // TODO: throttle_dispatch_queue() logic + utime_t throttle_stamp{seastar::lowres_system_clock::now()}; + return read_message(throttle_stamp); + }); + } + case Tag::ACK: + return read_frame_payload().then([this] { + // handle_message_ack() logic + auto ack = AckFrame::Decode(rx_segments_data.back()); + logger().debug("{} GOT AckFrame: seq={}", conn, ack.seq()); + ack_writes(ack.seq()); + }); + case Tag::KEEPALIVE2: + return read_frame_payload().then([this] { + // handle_keepalive2() logic + auto keepalive_frame = KeepAliveFrame::Decode(rx_segments_data.back()); + logger().debug("{} GOT KeepAliveFrame: timestamp={}", + conn, keepalive_frame.timestamp()); + notify_keepalive_ack(keepalive_frame.timestamp()); + conn.set_last_keepalive(seastar::lowres_system_clock::now()); + }); + case Tag::KEEPALIVE2_ACK: + return read_frame_payload().then([this] { + // handle_keepalive2_ack() logic + auto keepalive_ack_frame = KeepAliveFrameAck::Decode(rx_segments_data.back()); + conn.set_last_keepalive_ack( + seastar::lowres_system_clock::time_point{keepalive_ack_frame.timestamp()}); + logger().debug("{} GOT KeepAliveFrameAck: timestamp={}", + conn, conn.last_keepalive_ack); + }); + default: { + unexpected_tag(tag, conn, "execute_ready"); + return seastar::now(); + } + } + }); + }).handle_exception([this] (std::exception_ptr eptr) { + if (state != state_t::READY) { + logger().info("{} execute_ready(): protocol aborted at {} -- {}", + conn, get_state_name(state), eptr); + assert(state == state_t::REPLACING || + state == state_t::CLOSING); + return; + } + fault(false, "execute_ready()", eptr); + }); + }); +} + +// STANDBY state + +void ProtocolV2::execute_standby() +{ + trigger_state(state_t::STANDBY, write_state_t::delay, true); + if (socket) { + socket->shutdown(); + } +} + +void 
ProtocolV2::notify_write() +{ + if (unlikely(state == state_t::STANDBY && !conn.policy.server)) { + logger().info("{} notify_write(): at {}, going to CONNECTING", + conn, get_state_name(state)); + execute_connecting(); + } +} + +// WAIT state + +void ProtocolV2::execute_wait(bool max_backoff) +{ + trigger_state(state_t::WAIT, write_state_t::delay, true); + if (socket) { + socket->shutdown(); + } + gated_execute("execute_wait", [this, max_backoff] { + double backoff = protocol_timer.last_dur(); + if (max_backoff) { + backoff = local_conf().get_val<double>("ms_max_backoff"); + } else if (backoff > 0) { + backoff = std::min(local_conf().get_val<double>("ms_max_backoff"), 2 * backoff); + } else { + backoff = local_conf().get_val<double>("ms_initial_backoff"); + } + return protocol_timer.backoff(backoff).then([this] { + if (unlikely(state != state_t::WAIT)) { + logger().debug("{} triggered {} at the end of execute_wait()", + conn, get_state_name(state)); + abort_protocol(); + } + logger().info("{} execute_wait(): going to CONNECTING", conn); + execute_connecting(); + }).handle_exception([this] (std::exception_ptr eptr) { + logger().info("{} execute_wait(): protocol aborted at {} -- {}", + conn, get_state_name(state), eptr); + assert(state == state_t::REPLACING || + state == state_t::CLOSING); + }); + }); +} + +// SERVER_WAIT state + +void ProtocolV2::execute_server_wait() +{ + trigger_state(state_t::SERVER_WAIT, write_state_t::delay, false); + gated_execute("execute_server_wait", [this] { + return read_exactly(1).then([this] (auto bl) { + logger().warn("{} SERVER_WAIT got read, abort", conn); + abort_in_fault(); + }).handle_exception([this] (std::exception_ptr eptr) { + logger().info("{} execute_server_wait(): fault at {}, going to CLOSING -- {}", + conn, get_state_name(state), eptr); + close(false); + }); + }); +} + +// CLOSING state + +void ProtocolV2::trigger_close() +{ + messenger.closing_conn( + seastar::static_pointer_cast<SocketConnection>( + conn.shared_from_this())); + + if (state == state_t::ACCEPTING || state == state_t::SERVER_WAIT) { + messenger.unaccept_conn( + seastar::static_pointer_cast<SocketConnection>( + conn.shared_from_this())); + } else if (state >= state_t::ESTABLISHING && state < state_t::CLOSING) { + messenger.unregister_conn( + seastar::static_pointer_cast<SocketConnection>( + conn.shared_from_this())); + } else { + // cannot happen + ceph_assert(false); + } + + protocol_timer.cancel(); + trigger_state(state_t::CLOSING, write_state_t::drop, false); +} + +void ProtocolV2::on_closed() +{ + messenger.closed_conn( + seastar::static_pointer_cast<SocketConnection>( + conn.shared_from_this())); +} + +void ProtocolV2::print(std::ostream& out) const +{ + out << conn; +} + +} // namespace crimson::net diff --git a/src/crimson/net/ProtocolV2.h b/src/crimson/net/ProtocolV2.h new file mode 100644 index 000000000..be9a22816 --- /dev/null +++ b/src/crimson/net/ProtocolV2.h @@ -0,0 +1,225 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <seastar/core/sleep.hh> + +#include "Protocol.h" +#include "msg/async/frames_v2.h" +#include "msg/async/crypto_onwire.h" + +namespace crimson::net { + +class ProtocolV2 final : public Protocol { + public: + ProtocolV2(ChainedDispatchers& dispatchers, + SocketConnection& conn, + SocketMessenger& messenger); + ~ProtocolV2() override; + void print(std::ostream&) const final; + private: + void on_closed() override; + bool is_connected() const override; + + void 
start_connect(const entity_addr_t& peer_addr, + const entity_name_t& peer_name) override; + + void start_accept(SocketRef&& socket, + const entity_addr_t& peer_addr) override; + + void trigger_close() override; + + ceph::bufferlist do_sweep_messages( + const std::deque<MessageRef>& msgs, + size_t num_msgs, + bool require_keepalive, + std::optional<utime_t> keepalive_ack, + bool require_ack) override; + + void notify_write() override; + + private: + SocketMessenger &messenger; + + enum class state_t { + NONE = 0, + ACCEPTING, + SERVER_WAIT, + ESTABLISHING, + CONNECTING, + READY, + STANDBY, + WAIT, + REPLACING, + CLOSING + }; + state_t state = state_t::NONE; + + static const char *get_state_name(state_t state) { + const char *const statenames[] = {"NONE", + "ACCEPTING", + "SERVER_WAIT", + "ESTABLISHING", + "CONNECTING", + "READY", + "STANDBY", + "WAIT", + "REPLACING", + "CLOSING"}; + return statenames[static_cast<int>(state)]; + } + + void trigger_state(state_t state, write_state_t write_state, bool reentrant); + + uint64_t connection_features = 0; + uint64_t peer_required_features = 0; + + uint64_t client_cookie = 0; + uint64_t server_cookie = 0; + uint64_t global_seq = 0; + uint64_t peer_global_seq = 0; + uint64_t connect_seq = 0; + + seastar::shared_future<> execution_done = seastar::now(); + + template <typename Func> + void gated_execute(const char* what, Func&& func) { + gate.dispatch_in_background(what, *this, [this, &func] { + execution_done = seastar::futurize_invoke(std::forward<Func>(func)); + return execution_done.get_future(); + }); + } + + class Timer { + double last_dur_ = 0.0; + const SocketConnection& conn; + std::optional<seastar::abort_source> as; + public: + Timer(SocketConnection& conn) : conn(conn) {} + double last_dur() const { return last_dur_; } + seastar::future<> backoff(double seconds); + void cancel() { + last_dur_ = 0.0; + if (as) { + as->request_abort(); + as = std::nullopt; + } + } + }; + Timer protocol_timer; + + // TODO: Frame related implementations, probably to a separate class. 
+ private: + bool record_io = false; + ceph::bufferlist rxbuf; + ceph::bufferlist txbuf; + + void enable_recording(); + seastar::future<Socket::tmp_buf> read_exactly(size_t bytes); + seastar::future<bufferlist> read(size_t bytes); + seastar::future<> write(bufferlist&& buf); + seastar::future<> write_flush(bufferlist&& buf); + + ceph::crypto::onwire::rxtx_t session_stream_handlers; + ceph::msgr::v2::FrameAssembler tx_frame_asm{&session_stream_handlers, false}; + ceph::msgr::v2::FrameAssembler rx_frame_asm{&session_stream_handlers, false}; + ceph::bufferlist rx_preamble; + ceph::msgr::v2::segment_bls_t rx_segments_data; + + size_t get_current_msg_size() const; + seastar::future<ceph::msgr::v2::Tag> read_main_preamble(); + seastar::future<> read_frame_payload(); + template <class F> + seastar::future<> write_frame(F &frame, bool flush=true); + + private: + void fault(bool backoff, const char* func_name, std::exception_ptr eptr); + void reset_session(bool full); + seastar::future<std::tuple<entity_type_t, entity_addr_t>> + banner_exchange(bool is_connect); + + enum class next_step_t { + ready, + wait, + none, // protocol should have been aborted or failed + }; + + // CONNECTING (client) + seastar::future<> handle_auth_reply(); + inline seastar::future<> client_auth() { + std::vector<uint32_t> empty; + return client_auth(empty); + } + seastar::future<> client_auth(std::vector<uint32_t> &allowed_methods); + + seastar::future<next_step_t> process_wait(); + seastar::future<next_step_t> client_connect(); + seastar::future<next_step_t> client_reconnect(); + void execute_connecting(); + + // ACCEPTING (server) + seastar::future<> _auth_bad_method(int r); + seastar::future<> _handle_auth_request(bufferlist& auth_payload, bool more); + seastar::future<> server_auth(); + + bool validate_peer_name(const entity_name_t& peer_name) const; + seastar::future<next_step_t> send_wait(); + seastar::future<next_step_t> reuse_connection(ProtocolV2* existing_proto, + bool do_reset=false, + bool reconnect=false, + uint64_t conn_seq=0, + uint64_t msg_seq=0); + + seastar::future<next_step_t> handle_existing_connection(SocketConnectionRef existing_conn); + seastar::future<next_step_t> server_connect(); + + seastar::future<next_step_t> read_reconnect(); + seastar::future<next_step_t> send_retry(uint64_t connect_seq); + seastar::future<next_step_t> send_retry_global(uint64_t global_seq); + seastar::future<next_step_t> send_reset(bool full); + seastar::future<next_step_t> server_reconnect(); + + void execute_accepting(); + + // CONNECTING/ACCEPTING + seastar::future<> finish_auth(); + + // ESTABLISHING + void execute_establishing(SocketConnectionRef existing_conn, bool dispatch_reset); + + // ESTABLISHING/REPLACING (server) + seastar::future<> send_server_ident(); + + // REPLACING (server) + void trigger_replacing(bool reconnect, + bool do_reset, + SocketRef&& new_socket, + AuthConnectionMetaRef&& new_auth_meta, + ceph::crypto::onwire::rxtx_t new_rxtx, + uint64_t new_peer_global_seq, + // !reconnect + uint64_t new_client_cookie, + entity_name_t new_peer_name, + uint64_t new_conn_features, + bool tx_is_rev1, + bool rx_is_rev1, + // reconnect + uint64_t new_connect_seq, + uint64_t new_msg_seq); + + // READY + seastar::future<> read_message(utime_t throttle_stamp); + void execute_ready(bool dispatch_connect); + + // STANDBY + void execute_standby(); + + // WAIT + void execute_wait(bool max_backoff); + + // SERVER_WAIT + void execute_server_wait(); +}; + +} // namespace crimson::net diff --git a/src/crimson/net/Socket.cc 
b/src/crimson/net/Socket.cc new file mode 100644 index 000000000..8ad106dbd --- /dev/null +++ b/src/crimson/net/Socket.cc @@ -0,0 +1,276 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "Socket.h" + +#include <seastar/core/when_all.hh> + +#include "crimson/common/log.h" +#include "Errors.h" + +namespace crimson::net { + +namespace { + +seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_ms); +} + +// an input_stream consumer that reads buffer segments into a bufferlist up to +// the given number of remaining bytes +struct bufferlist_consumer { + bufferlist& bl; + size_t& remaining; + + bufferlist_consumer(bufferlist& bl, size_t& remaining) + : bl(bl), remaining(remaining) {} + + using tmp_buf = seastar::temporary_buffer<char>; + using consumption_result_type = typename seastar::input_stream<char>::consumption_result_type; + + // consume some or all of a buffer segment + seastar::future<consumption_result_type> operator()(tmp_buf&& data) { + if (remaining >= data.size()) { + // consume the whole buffer + remaining -= data.size(); + bl.append(buffer::create_foreign(std::move(data))); + if (remaining > 0) { + // return none to request more segments + return seastar::make_ready_future<consumption_result_type>( + seastar::continue_consuming{}); + } else { + // return an empty buffer to singal that we're done + return seastar::make_ready_future<consumption_result_type>( + consumption_result_type::stop_consuming_type({})); + } + } + if (remaining > 0) { + // consume the front + bl.append(buffer::create_foreign(data.share(0, remaining))); + data.trim_front(remaining); + remaining = 0; + } + // give the rest back to signal that we're done + return seastar::make_ready_future<consumption_result_type>( + consumption_result_type::stop_consuming_type{std::move(data)}); + }; +}; + +} // anonymous namespace + +seastar::future<bufferlist> Socket::read(size_t bytes) +{ +#ifdef UNIT_TESTS_BUILT + return try_trap_pre(next_trap_read).then([bytes, this] { +#endif + if (bytes == 0) { + return seastar::make_ready_future<bufferlist>(); + } + r.buffer.clear(); + r.remaining = bytes; + return in.consume(bufferlist_consumer{r.buffer, r.remaining}).then([this] { + if (r.remaining) { // throw on short reads + throw std::system_error(make_error_code(error::read_eof)); + } + return seastar::make_ready_future<bufferlist>(std::move(r.buffer)); + }); +#ifdef UNIT_TESTS_BUILT + }).then([this] (auto buf) { + return try_trap_post(next_trap_read + ).then([buf = std::move(buf)] () mutable { + return std::move(buf); + }); + }); +#endif +} + +seastar::future<seastar::temporary_buffer<char>> +Socket::read_exactly(size_t bytes) { +#ifdef UNIT_TESTS_BUILT + return try_trap_pre(next_trap_read).then([bytes, this] { +#endif + if (bytes == 0) { + return seastar::make_ready_future<seastar::temporary_buffer<char>>(); + } + return in.read_exactly(bytes).then([](auto buf) { + if (buf.empty()) { + throw std::system_error(make_error_code(error::read_eof)); + } + return seastar::make_ready_future<tmp_buf>(std::move(buf)); + }); +#ifdef UNIT_TESTS_BUILT + }).then([this] (auto buf) { + return try_trap_post(next_trap_read + ).then([buf = std::move(buf)] () mutable { + return std::move(buf); + }); + }); +#endif +} + +void Socket::shutdown() { + socket.shutdown_input(); + socket.shutdown_output(); +} + +static inline seastar::future<> +close_and_handle_errors(seastar::output_stream<char>& out) +{ + return out.close().handle_exception_type([] (const std::system_error& 
e) { + if (e.code() != std::errc::broken_pipe && + e.code() != std::errc::connection_reset) { + logger().error("Socket::close(): unexpected error {}", e); + ceph_abort(); + } + // can happen when out is already shutdown, ignore + }); +} + +seastar::future<> Socket::close() { +#ifndef NDEBUG + ceph_assert(!closed); + closed = true; +#endif + return seastar::when_all_succeed( + in.close(), + close_and_handle_errors(out) + ).then_unpack([] { + return seastar::make_ready_future<>(); + }).handle_exception([] (auto eptr) { + logger().error("Socket::close(): unexpected exception {}", eptr); + ceph_abort(); + }); +} + +#ifdef UNIT_TESTS_BUILT +seastar::future<> Socket::try_trap_pre(bp_action_t& trap) { + auto action = trap; + trap = bp_action_t::CONTINUE; + switch (action) { + case bp_action_t::CONTINUE: + break; + case bp_action_t::FAULT: + logger().info("[Test] got FAULT"); + throw std::system_error(make_error_code(crimson::net::error::negotiation_failure)); + case bp_action_t::BLOCK: + logger().info("[Test] got BLOCK"); + return blocker->block(); + case bp_action_t::STALL: + trap = action; + break; + default: + ceph_abort("unexpected action from trap"); + } + return seastar::make_ready_future<>(); +} + +seastar::future<> Socket::try_trap_post(bp_action_t& trap) { + auto action = trap; + trap = bp_action_t::CONTINUE; + switch (action) { + case bp_action_t::CONTINUE: + break; + case bp_action_t::STALL: + logger().info("[Test] got STALL and block"); + shutdown(); + return blocker->block(); + default: + ceph_abort("unexpected action from trap"); + } + return seastar::make_ready_future<>(); +} + +void Socket::set_trap(bp_type_t type, bp_action_t action, socket_blocker* blocker_) { + blocker = blocker_; + if (type == bp_type_t::READ) { + ceph_assert(next_trap_read == bp_action_t::CONTINUE); + next_trap_read = action; + } else { // type == bp_type_t::WRITE + if (next_trap_write == bp_action_t::CONTINUE) { + next_trap_write = action; + } else if (next_trap_write == bp_action_t::FAULT) { + // do_sweep_messages() may combine multiple write events into one socket write + ceph_assert(action == bp_action_t::FAULT || action == bp_action_t::CONTINUE); + } else { + ceph_abort(); + } + } +} +#endif + +FixedCPUServerSocket::listen_ertr::future<> +FixedCPUServerSocket::listen(entity_addr_t addr) +{ + assert(seastar::this_shard_id() == cpu); + logger().trace("FixedCPUServerSocket::listen({})...", addr); + return container().invoke_on_all([addr] (auto& ss) { + ss.addr = addr; + seastar::socket_address s_addr(addr.in4_addr()); + seastar::listen_options lo; + lo.reuse_address = true; + lo.set_fixed_cpu(ss.cpu); + ss.listener = seastar::listen(s_addr, lo); + }).then([] { + return true; + }).handle_exception_type([addr] (const std::system_error& e) { + if (e.code() == std::errc::address_in_use) { + logger().trace("FixedCPUServerSocket::listen({}): address in use", addr); + } else { + logger().error("FixedCPUServerSocket::listen({}): " + "got unexpeted error {}", addr, e); + ceph_abort(); + } + return false; + }).then([] (bool success) -> listen_ertr::future<> { + if (success) { + return listen_ertr::now(); + } else { + return crimson::ct_error::address_in_use::make(); + } + }); +} + +seastar::future<> FixedCPUServerSocket::shutdown() +{ + assert(seastar::this_shard_id() == cpu); + logger().trace("FixedCPUServerSocket({})::shutdown()...", addr); + return container().invoke_on_all([] (auto& ss) { + if (ss.listener) { + ss.listener->abort_accept(); + } + return ss.shutdown_gate.close(); + }).then([this] { + return 
reset(); + }); +} + +seastar::future<> FixedCPUServerSocket::destroy() +{ + assert(seastar::this_shard_id() == cpu); + return shutdown().then([this] { + // we should only construct/stop shards on #0 + return container().invoke_on(0, [] (auto& ss) { + assert(ss.service); + return ss.service->stop().finally([cleanup = std::move(ss.service)] {}); + }); + }); +} + +seastar::future<FixedCPUServerSocket*> FixedCPUServerSocket::create() +{ + auto cpu = seastar::this_shard_id(); + // we should only construct/stop shards on #0 + return seastar::smp::submit_to(0, [cpu] { + auto service = std::make_unique<sharded_service_t>(); + return service->start(cpu, construct_tag{} + ).then([service = std::move(service)] () mutable { + auto p_shard = service.get(); + p_shard->local().service = std::move(service); + return p_shard; + }); + }).then([] (auto p_shard) { + return &p_shard->local(); + }); +} + +} // namespace crimson::net diff --git a/src/crimson/net/Socket.h b/src/crimson/net/Socket.h new file mode 100644 index 000000000..d39a2517f --- /dev/null +++ b/src/crimson/net/Socket.h @@ -0,0 +1,268 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <seastar/core/gate.hh> +#include <seastar/core/reactor.hh> +#include <seastar/core/sharded.hh> +#include <seastar/net/packet.hh> + +#include "include/buffer.h" + +#include "crimson/common/log.h" +#include "Errors.h" +#include "Fwd.h" + +#ifdef UNIT_TESTS_BUILT +#include "Interceptor.h" +#endif + +namespace crimson::net { + +class Socket; +using SocketRef = std::unique_ptr<Socket>; + +class Socket +{ + struct construct_tag {}; + + public: + // if acceptor side, peer is using a different port (ephemeral_port) + // if connector side, I'm using a different port (ephemeral_port) + enum class side_t { + acceptor, + connector + }; + + Socket(seastar::connected_socket&& _socket, side_t _side, uint16_t e_port, construct_tag) + : sid{seastar::this_shard_id()}, + socket(std::move(_socket)), + in(socket.input()), + // the default buffer size 8192 is too small that may impact our write + // performance. 
see seastar::net::connected_socket::output() + out(socket.output(65536)), + side(_side), + ephemeral_port(e_port) {} + + ~Socket() { +#ifndef NDEBUG + assert(closed); +#endif + } + + Socket(Socket&& o) = delete; + + static seastar::future<SocketRef> + connect(const entity_addr_t& peer_addr) { + return seastar::connect(peer_addr.in4_addr() + ).then([] (seastar::connected_socket socket) { + return std::make_unique<Socket>( + std::move(socket), side_t::connector, 0, construct_tag{}); + }); + } + + /// read the requested number of bytes into a bufferlist + seastar::future<bufferlist> read(size_t bytes); + using tmp_buf = seastar::temporary_buffer<char>; + using packet = seastar::net::packet; + seastar::future<tmp_buf> read_exactly(size_t bytes); + + seastar::future<> write(packet&& buf) { +#ifdef UNIT_TESTS_BUILT + return try_trap_pre(next_trap_write).then([buf = std::move(buf), this] () mutable { +#endif + return out.write(std::move(buf)); +#ifdef UNIT_TESTS_BUILT + }).then([this] { + return try_trap_post(next_trap_write); + }); +#endif + } + seastar::future<> flush() { + return out.flush(); + } + seastar::future<> write_flush(packet&& buf) { +#ifdef UNIT_TESTS_BUILT + return try_trap_pre(next_trap_write).then([buf = std::move(buf), this] () mutable { +#endif + return out.write(std::move(buf)).then([this] { return out.flush(); }); +#ifdef UNIT_TESTS_BUILT + }).then([this] { + return try_trap_post(next_trap_write); + }); +#endif + } + + // preemptively disable further reads or writes, can only be shutdown once. + void shutdown(); + + /// Socket can only be closed once. + seastar::future<> close(); + + // shutdown input_stream only, for tests + void force_shutdown_in() { + socket.shutdown_input(); + } + + // shutdown output_stream only, for tests + void force_shutdown_out() { + socket.shutdown_output(); + } + + side_t get_side() const { + return side; + } + + uint16_t get_ephemeral_port() const { + return ephemeral_port; + } + + // learn my ephemeral_port as connector. + // unfortunately, there's no way to identify which port I'm using as + // connector with current seastar interface. 
+ void learn_ephemeral_port_as_connector(uint16_t port) { + assert(side == side_t::connector && + (ephemeral_port == 0 || ephemeral_port == port)); + ephemeral_port = port; + } + + private: + const seastar::shard_id sid; + seastar::connected_socket socket; + seastar::input_stream<char> in; + seastar::output_stream<char> out; + side_t side; + uint16_t ephemeral_port; + +#ifndef NDEBUG + bool closed = false; +#endif + + /// buffer state for read() + struct { + bufferlist buffer; + size_t remaining; + } r; + +#ifdef UNIT_TESTS_BUILT + public: + void set_trap(bp_type_t type, bp_action_t action, socket_blocker* blocker_); + + private: + bp_action_t next_trap_read = bp_action_t::CONTINUE; + bp_action_t next_trap_write = bp_action_t::CONTINUE; + socket_blocker* blocker = nullptr; + seastar::future<> try_trap_pre(bp_action_t& trap); + seastar::future<> try_trap_post(bp_action_t& trap); + +#endif + friend class FixedCPUServerSocket; +}; + +class FixedCPUServerSocket + : public seastar::peering_sharded_service<FixedCPUServerSocket> { + const seastar::shard_id cpu; + entity_addr_t addr; + std::optional<seastar::server_socket> listener; + seastar::gate shutdown_gate; + + using sharded_service_t = seastar::sharded<FixedCPUServerSocket>; + std::unique_ptr<sharded_service_t> service; + + struct construct_tag {}; + + static seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_ms); + } + + seastar::future<> reset() { + return container().invoke_on_all([] (auto& ss) { + assert(ss.shutdown_gate.is_closed()); + ss.shutdown_gate = seastar::gate(); + ss.addr = entity_addr_t(); + ss.listener.reset(); + }); + } + +public: + FixedCPUServerSocket(seastar::shard_id cpu, construct_tag) : cpu{cpu} {} + ~FixedCPUServerSocket() { + assert(!listener); + // detect whether user have called destroy() properly + ceph_assert(!service); + } + + FixedCPUServerSocket(FixedCPUServerSocket&&) = delete; + FixedCPUServerSocket(const FixedCPUServerSocket&) = delete; + FixedCPUServerSocket& operator=(const FixedCPUServerSocket&) = delete; + + using listen_ertr = crimson::errorator< + crimson::ct_error::address_in_use // The address is already bound + >; + listen_ertr::future<> listen(entity_addr_t addr); + + // fn_accept should be a nothrow function of type + // seastar::future<>(SocketRef, entity_addr_t) + template <typename Func> + seastar::future<> accept(Func&& fn_accept) { + assert(seastar::this_shard_id() == cpu); + logger().trace("FixedCPUServerSocket({})::accept()...", addr); + return container().invoke_on_all( + [fn_accept = std::move(fn_accept)] (auto& ss) mutable { + assert(ss.listener); + // gate accepting + // FixedCPUServerSocket::shutdown() will drain the continuations in the gate + // so ignore the returned future + std::ignore = seastar::with_gate(ss.shutdown_gate, + [&ss, fn_accept = std::move(fn_accept)] () mutable { + return seastar::keep_doing([&ss, fn_accept = std::move(fn_accept)] () mutable { + return ss.listener->accept().then( + [&ss, fn_accept = std::move(fn_accept)] + (seastar::accept_result accept_result) mutable { + // assert seastar::listen_options::set_fixed_cpu() works + assert(seastar::this_shard_id() == ss.cpu); + auto [socket, paddr] = std::move(accept_result); + entity_addr_t peer_addr; + peer_addr.set_sockaddr(&paddr.as_posix_sockaddr()); + peer_addr.set_type(entity_addr_t::TYPE_ANY); + SocketRef _socket = std::make_unique<Socket>( + std::move(socket), Socket::side_t::acceptor, + peer_addr.get_port(), Socket::construct_tag{}); + std::ignore = seastar::with_gate(ss.shutdown_gate, + 
[socket = std::move(_socket), peer_addr, + &ss, fn_accept = std::move(fn_accept)] () mutable { + logger().trace("FixedCPUServerSocket({})::accept(): " + "accepted peer {}", ss.addr, peer_addr); + return fn_accept(std::move(socket), peer_addr + ).handle_exception([&ss, peer_addr] (auto eptr) { + logger().error("FixedCPUServerSocket({})::accept(): " + "fn_accept(s, {}) got unexpected exception {}", + ss.addr, peer_addr, eptr); + ceph_abort(); + }); + }); + }); + }).handle_exception_type([&ss] (const std::system_error& e) { + if (e.code() == std::errc::connection_aborted || + e.code() == std::errc::invalid_argument) { + logger().trace("FixedCPUServerSocket({})::accept(): stopped ({})", + ss.addr, e); + } else { + throw; + } + }).handle_exception([&ss] (auto eptr) { + logger().error("FixedCPUServerSocket({})::accept(): " + "got unexpected exception {}", ss.addr, eptr); + ceph_abort(); + }); + }); + }); + } + + seastar::future<> shutdown(); + seastar::future<> destroy(); + static seastar::future<FixedCPUServerSocket*> create(); +}; + +} // namespace crimson::net diff --git a/src/crimson/net/SocketConnection.cc b/src/crimson/net/SocketConnection.cc new file mode 100644 index 000000000..623dca32f --- /dev/null +++ b/src/crimson/net/SocketConnection.cc @@ -0,0 +1,150 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2017 Red Hat, Inc + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "SocketConnection.h" + +#include "ProtocolV1.h" +#include "ProtocolV2.h" +#include "SocketMessenger.h" + +#ifdef UNIT_TESTS_BUILT +#include "Interceptor.h" +#endif + +using namespace crimson::net; +using crimson::common::local_conf; + +SocketConnection::SocketConnection(SocketMessenger& messenger, + ChainedDispatchers& dispatchers, + bool is_msgr2) + : messenger(messenger) +{ + if (is_msgr2) { + protocol = std::make_unique<ProtocolV2>(dispatchers, *this, messenger); + } else { + protocol = std::make_unique<ProtocolV1>(dispatchers, *this, messenger); + } +#ifdef UNIT_TESTS_BUILT + if (messenger.interceptor) { + interceptor = messenger.interceptor; + interceptor->register_conn(*this); + } +#endif +} + +SocketConnection::~SocketConnection() {} + +crimson::net::Messenger* +SocketConnection::get_messenger() const { + return &messenger; +} + +bool SocketConnection::is_connected() const +{ + assert(seastar::this_shard_id() == shard_id()); + return protocol->is_connected(); +} + +#ifdef UNIT_TESTS_BUILT +bool SocketConnection::is_closed() const +{ + assert(seastar::this_shard_id() == shard_id()); + return protocol->is_closed(); +} + +bool SocketConnection::is_closed_clean() const +{ + assert(seastar::this_shard_id() == shard_id()); + return protocol->is_closed_clean; +} + +#endif +bool SocketConnection::peer_wins() const +{ + return (messenger.get_myaddr() > peer_addr || policy.server); +} + +seastar::future<> SocketConnection::send(MessageRef msg) +{ + assert(seastar::this_shard_id() == shard_id()); + return protocol->send(std::move(msg)); +} + +seastar::future<> SocketConnection::keepalive() +{ + assert(seastar::this_shard_id() == shard_id()); + return protocol->keepalive(); +} + +void SocketConnection::mark_down() +{ + assert(seastar::this_shard_id() == shard_id()); + protocol->close(false); +} + +bool 
SocketConnection::update_rx_seq(seq_num_t seq) +{ + if (seq <= in_seq) { + if (HAVE_FEATURE(features, RECONNECT_SEQ) && + local_conf()->ms_die_on_old_message) { + ceph_abort_msg("old msgs despite reconnect_seq feature"); + } + return false; + } else if (seq > in_seq + 1) { + if (local_conf()->ms_die_on_skipped_message) { + ceph_abort_msg("skipped incoming seq"); + } + return false; + } else { + in_seq = seq; + return true; + } +} + +void +SocketConnection::start_connect(const entity_addr_t& _peer_addr, + const entity_name_t& _peer_name) +{ + protocol->start_connect(_peer_addr, _peer_name); +} + +void +SocketConnection::start_accept(SocketRef&& sock, + const entity_addr_t& _peer_addr) +{ + protocol->start_accept(std::move(sock), _peer_addr); +} + +seastar::future<> +SocketConnection::close_clean(bool dispatch_reset) +{ + return protocol->close_clean(dispatch_reset); +} + +seastar::shard_id SocketConnection::shard_id() const { + return messenger.shard_id(); +} + +void SocketConnection::print(ostream& out) const { + messenger.print(out); + if (!protocol->socket) { + out << " >> " << get_peer_name() << " " << peer_addr; + } else if (protocol->socket->get_side() == Socket::side_t::acceptor) { + out << " >> " << get_peer_name() << " " << peer_addr + << "@" << protocol->socket->get_ephemeral_port(); + } else { // protocol->socket->get_side() == Socket::side_t::connector + out << "@" << protocol->socket->get_ephemeral_port() + << " >> " << get_peer_name() << " " << peer_addr; + } +} diff --git a/src/crimson/net/SocketConnection.h b/src/crimson/net/SocketConnection.h new file mode 100644 index 000000000..9c977c7cf --- /dev/null +++ b/src/crimson/net/SocketConnection.h @@ -0,0 +1,106 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2017 Red Hat, Inc + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#pragma once + +#include <seastar/core/sharded.hh> + +#include "msg/Policy.h" +#include "crimson/common/throttle.h" +#include "crimson/net/Connection.h" +#include "crimson/net/Socket.h" + +namespace crimson::net { + +class Protocol; +class SocketMessenger; +class SocketConnection; +using SocketConnectionRef = seastar::shared_ptr<SocketConnection>; + +class SocketConnection : public Connection { + SocketMessenger& messenger; + std::unique_ptr<Protocol> protocol; + + ceph::net::Policy<crimson::common::Throttle> policy; + + /// the seq num of the last transmitted message + seq_num_t out_seq = 0; + /// the seq num of the last received message + seq_num_t in_seq = 0; + /// update the seq num of last received message + /// @returns true if the @c seq is valid, and @c in_seq is updated, + /// false otherwise. 
+ bool update_rx_seq(seq_num_t seq); + + // messages to be resent after connection gets reset + std::deque<MessageRef> out_q; + std::deque<MessageRef> pending_q; + // messages sent, but not yet acked by peer + std::deque<MessageRef> sent; + + seastar::shard_id shard_id() const; + + public: + SocketConnection(SocketMessenger& messenger, + ChainedDispatchers& dispatchers, + bool is_msgr2); + ~SocketConnection() override; + + Messenger* get_messenger() const override; + + bool is_connected() const override; + +#ifdef UNIT_TESTS_BUILT + bool is_closed_clean() const override; + + bool is_closed() const override; + + bool peer_wins() const override; +#else + bool peer_wins() const; +#endif + + seastar::future<> send(MessageRef msg) override; + + seastar::future<> keepalive() override; + + void mark_down() override; + + void print(ostream& out) const override; + + /// start a handshake from the client's perspective, + /// only call when SocketConnection first construct + void start_connect(const entity_addr_t& peer_addr, + const entity_name_t& peer_name); + /// start a handshake from the server's perspective, + /// only call when SocketConnection first construct + void start_accept(SocketRef&& socket, + const entity_addr_t& peer_addr); + + seastar::future<> close_clean(bool dispatch_reset); + + bool is_server_side() const { + return policy.server; + } + + bool is_lossy() const { + return policy.lossy; + } + + friend class Protocol; + friend class ProtocolV1; + friend class ProtocolV2; +}; + +} // namespace crimson::net diff --git a/src/crimson/net/SocketMessenger.cc b/src/crimson/net/SocketMessenger.cc new file mode 100644 index 000000000..db9421e79 --- /dev/null +++ b/src/crimson/net/SocketMessenger.cc @@ -0,0 +1,351 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2017 Red Hat, Inc + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#include "SocketMessenger.h" + +#include <tuple> +#include <boost/functional/hash.hpp> + +#include "auth/Auth.h" +#include "Errors.h" +#include "Socket.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_ms); + } +} + +namespace crimson::net { + +SocketMessenger::SocketMessenger(const entity_name_t& myname, + const std::string& logic_name, + uint32_t nonce) + : Messenger{myname}, + master_sid{seastar::this_shard_id()}, + logic_name{logic_name}, + nonce{nonce} +{} + +seastar::future<> SocketMessenger::set_myaddrs(const entity_addrvec_t& addrs) +{ + assert(seastar::this_shard_id() == master_sid); + auto my_addrs = addrs; + for (auto& addr : my_addrs.v) { + addr.nonce = nonce; + } + return Messenger::set_myaddrs(my_addrs); +} + +SocketMessenger::bind_ertr::future<> SocketMessenger::do_bind(const entity_addrvec_t& addrs) +{ + assert(seastar::this_shard_id() == master_sid); + ceph_assert(addrs.front().get_family() == AF_INET); + return set_myaddrs(addrs).then([this] { + if (!listener) { + return FixedCPUServerSocket::create().then([this] (auto _listener) { + listener = _listener; + }); + } else { + return seastar::now(); + } + }).then([this] () -> bind_ertr::future<> { + const entity_addr_t listen_addr = get_myaddr(); + logger().debug("{} do_bind: try listen {}...", *this, listen_addr); + if (!listener) { + logger().warn("{} do_bind: listener doesn't exist", *this); + return bind_ertr::now(); + } + return listener->listen(listen_addr); + }); +} + +SocketMessenger::bind_ertr::future<> +SocketMessenger::bind(const entity_addrvec_t& addrs) +{ + return do_bind(addrs).safe_then([this] { + logger().info("{} bind: done", *this); + }); +} + +SocketMessenger::bind_ertr::future<> +SocketMessenger::try_bind(const entity_addrvec_t& addrs, + uint32_t min_port, uint32_t max_port) +{ + auto addr = addrs.front(); + if (addr.get_port() != 0) { + return do_bind(addrs).safe_then([this] { + logger().info("{} try_bind: done", *this); + }); + } + ceph_assert(min_port <= max_port); + return seastar::do_with(uint32_t(min_port), + [this, max_port, addr] (auto& port) { + return seastar::repeat_until_value([this, max_port, addr, &port] { + auto to_bind = addr; + to_bind.set_port(port); + return do_bind(entity_addrvec_t{to_bind} + ).safe_then([this] () -> seastar::future<std::optional<bool>> { + logger().info("{} try_bind: done", *this); + return seastar::make_ready_future<std::optional<bool>>( + std::make_optional<bool>(true)); + }, bind_ertr::all_same_way([this, max_port, &port] + (const std::error_code& e) mutable + -> seastar::future<std::optional<bool>> { + assert(e == std::errc::address_in_use); + logger().trace("{} try_bind: {} already used", *this, port); + if (port == max_port) { + return seastar::make_ready_future<std::optional<bool>>( + std::make_optional<bool>(false)); + } + ++port; + return seastar::make_ready_future<std::optional<bool>>(); + })); + }).then([] (bool success) -> bind_ertr::future<> { + if (success) { + return bind_ertr::now(); + } else { + return crimson::ct_error::address_in_use::make(); + } + }); + }); +} + +seastar::future<> SocketMessenger::start( + const dispatchers_t& _dispatchers) { + assert(seastar::this_shard_id() == master_sid); + + dispatchers.assign(_dispatchers); + if (listener) { + // make sure we have already bound to a valid address + ceph_assert(get_myaddr().is_legacy() || get_myaddr().is_msgr2()); + ceph_assert(get_myaddr().get_port() > 0); + + return listener->accept([this] (SocketRef socket, entity_addr_t peer_addr) { + 
assert(seastar::this_shard_id() == master_sid); + SocketConnectionRef conn = seastar::make_shared<SocketConnection>( + *this, dispatchers, get_myaddr().is_msgr2()); + conn->start_accept(std::move(socket), peer_addr); + return seastar::now(); + }); + } + return seastar::now(); +} + +crimson::net::ConnectionRef +SocketMessenger::connect(const entity_addr_t& peer_addr, const entity_name_t& peer_name) +{ + assert(seastar::this_shard_id() == master_sid); + + // make sure we connect to a valid peer_addr + ceph_assert(peer_addr.is_legacy() || peer_addr.is_msgr2()); + ceph_assert(peer_addr.get_port() > 0); + + if (auto found = lookup_conn(peer_addr); found) { + logger().debug("{} connect to existing", *found); + return found->shared_from_this(); + } + SocketConnectionRef conn = seastar::make_shared<SocketConnection>( + *this, dispatchers, peer_addr.is_msgr2()); + conn->start_connect(peer_addr, peer_name); + return conn->shared_from_this(); +} + +seastar::future<> SocketMessenger::shutdown() +{ + assert(seastar::this_shard_id() == master_sid); + return seastar::futurize_invoke([this] { + assert(dispatchers.empty()); + if (listener) { + auto d_listener = listener; + listener = nullptr; + return d_listener->destroy(); + } else { + return seastar::now(); + } + // close all connections + }).then([this] { + return seastar::parallel_for_each(accepting_conns, [] (auto conn) { + return conn->close_clean(false); + }); + }).then([this] { + ceph_assert(accepting_conns.empty()); + return seastar::parallel_for_each(connections, [] (auto conn) { + return conn.second->close_clean(false); + }); + }).then([this] { + return seastar::parallel_for_each(closing_conns, [] (auto conn) { + return conn->close_clean(false); + }); + }).then([this] { + ceph_assert(connections.empty()); + shutdown_promise.set_value(); + }); +} + +seastar::future<> SocketMessenger::learned_addr(const entity_addr_t &peer_addr_for_me, const SocketConnection& conn) +{ + assert(seastar::this_shard_id() == master_sid); + if (!need_addr) { + if ((!get_myaddr().is_any() && + get_myaddr().get_type() != peer_addr_for_me.get_type()) || + get_myaddr().get_family() != peer_addr_for_me.get_family() || + !get_myaddr().is_same_host(peer_addr_for_me)) { + logger().warn("{} peer_addr_for_me {} type/family/IP doesn't match myaddr {}", + conn, peer_addr_for_me, get_myaddr()); + throw std::system_error( + make_error_code(crimson::net::error::bad_peer_address)); + } + return seastar::now(); + } + + if (get_myaddr().get_type() == entity_addr_t::TYPE_NONE) { + // Not bound + entity_addr_t addr = peer_addr_for_me; + addr.set_type(entity_addr_t::TYPE_ANY); + addr.set_port(0); + need_addr = false; + return set_myaddrs(entity_addrvec_t{addr} + ).then([this, &conn, peer_addr_for_me] { + logger().info("{} learned myaddr={} (unbound) from {}", + conn, get_myaddr(), peer_addr_for_me); + }); + } else { + // Already bound + if (!get_myaddr().is_any() && + get_myaddr().get_type() != peer_addr_for_me.get_type()) { + logger().warn("{} peer_addr_for_me {} type doesn't match myaddr {}", + conn, peer_addr_for_me, get_myaddr()); + throw std::system_error( + make_error_code(crimson::net::error::bad_peer_address)); + } + if (get_myaddr().get_family() != peer_addr_for_me.get_family()) { + logger().warn("{} peer_addr_for_me {} family doesn't match myaddr {}", + conn, peer_addr_for_me, get_myaddr()); + throw std::system_error( + make_error_code(crimson::net::error::bad_peer_address)); + } + if (get_myaddr().is_blank_ip()) { + entity_addr_t addr = peer_addr_for_me; + 
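      // keep the IP the peer observed for us, but retain the locally
      // configured address type and port (descriptive comment added for
      // clarity; not in the original patch)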
addr.set_type(get_myaddr().get_type()); + addr.set_port(get_myaddr().get_port()); + need_addr = false; + return set_myaddrs(entity_addrvec_t{addr} + ).then([this, &conn, peer_addr_for_me] { + logger().info("{} learned myaddr={} (blank IP) from {}", + conn, get_myaddr(), peer_addr_for_me); + }); + } else if (!get_myaddr().is_same_host(peer_addr_for_me)) { + logger().warn("{} peer_addr_for_me {} IP doesn't match myaddr {}", + conn, peer_addr_for_me, get_myaddr()); + throw std::system_error( + make_error_code(crimson::net::error::bad_peer_address)); + } else { + need_addr = false; + return seastar::now(); + } + } +} + +SocketPolicy SocketMessenger::get_policy(entity_type_t peer_type) const +{ + return policy_set.get(peer_type); +} + +SocketPolicy SocketMessenger::get_default_policy() const +{ + return policy_set.get_default(); +} + +void SocketMessenger::set_default_policy(const SocketPolicy& p) +{ + policy_set.set_default(p); +} + +void SocketMessenger::set_policy(entity_type_t peer_type, + const SocketPolicy& p) +{ + policy_set.set(peer_type, p); +} + +void SocketMessenger::set_policy_throttler(entity_type_t peer_type, + Throttle* throttle) +{ + // only byte throttler is used in OSD + policy_set.set_throttlers(peer_type, throttle, nullptr); +} + +crimson::net::SocketConnectionRef SocketMessenger::lookup_conn(const entity_addr_t& addr) +{ + if (auto found = connections.find(addr); + found != connections.end()) { + return found->second; + } else { + return nullptr; + } +} + +void SocketMessenger::accept_conn(SocketConnectionRef conn) +{ + accepting_conns.insert(conn); +} + +void SocketMessenger::unaccept_conn(SocketConnectionRef conn) +{ + accepting_conns.erase(conn); +} + +void SocketMessenger::register_conn(SocketConnectionRef conn) +{ + auto [i, added] = connections.emplace(conn->get_peer_addr(), conn); + std::ignore = i; + ceph_assert(added); +} + +void SocketMessenger::unregister_conn(SocketConnectionRef conn) +{ + ceph_assert(conn); + auto found = connections.find(conn->get_peer_addr()); + ceph_assert(found != connections.end()); + ceph_assert(found->second == conn); + connections.erase(found); +} + +void SocketMessenger::closing_conn(SocketConnectionRef conn) +{ + closing_conns.push_back(conn); +} + +void SocketMessenger::closed_conn(SocketConnectionRef conn) +{ + for (auto it = closing_conns.begin(); + it != closing_conns.end();) { + if (*it == conn) { + it = closing_conns.erase(it); + } else { + it++; + } + } +} + +seastar::future<uint32_t> +SocketMessenger::get_global_seq(uint32_t old) +{ + if (old > global_seq) { + global_seq = old; + } + return seastar::make_ready_future<uint32_t>(++global_seq); +} + +} // namespace crimson::net diff --git a/src/crimson/net/SocketMessenger.h b/src/crimson/net/SocketMessenger.h new file mode 100644 index 000000000..44c1d3c21 --- /dev/null +++ b/src/crimson/net/SocketMessenger.h @@ -0,0 +1,122 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2017 Red Hat, Inc + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#pragma once + +#include <map> +#include <set> +#include <vector> +#include <seastar/core/gate.hh> +#include <seastar/core/reactor.hh> +#include <seastar/core/sharded.hh> +#include <seastar/core/shared_future.hh> + +#include "crimson/net/chained_dispatchers.h" +#include "Messenger.h" +#include "SocketConnection.h" + +namespace crimson::net { + +class FixedCPUServerSocket; + +class SocketMessenger final : public Messenger { + const seastar::shard_id master_sid; + seastar::promise<> shutdown_promise; + + FixedCPUServerSocket* listener = nullptr; + ChainedDispatchers dispatchers; + std::map<entity_addr_t, SocketConnectionRef> connections; + std::set<SocketConnectionRef> accepting_conns; + std::vector<SocketConnectionRef> closing_conns; + ceph::net::PolicySet<Throttle> policy_set; + // Distinguish messengers with meaningful names for debugging + const std::string logic_name; + const uint32_t nonce; + // specifying we haven't learned our addr; set false when we find it. + bool need_addr = true; + uint32_t global_seq = 0; + bool started = false; + + bind_ertr::future<> do_bind(const entity_addrvec_t& addr); + + public: + SocketMessenger(const entity_name_t& myname, + const std::string& logic_name, + uint32_t nonce); + ~SocketMessenger() override { ceph_assert(!listener); } + + seastar::future<> set_myaddrs(const entity_addrvec_t& addr) override; + + // Messenger interfaces are assumed to be called from its own shard, but its + // behavior should be symmetric when called from any shard. + bind_ertr::future<> bind(const entity_addrvec_t& addr) override; + + bind_ertr::future<> try_bind(const entity_addrvec_t& addr, + uint32_t min_port, uint32_t max_port) override; + + seastar::future<> start(const dispatchers_t& dispatchers) override; + + ConnectionRef connect(const entity_addr_t& peer_addr, + const entity_name_t& peer_name) override; + // can only wait once + seastar::future<> wait() override { + assert(seastar::this_shard_id() == master_sid); + return shutdown_promise.get_future(); + } + + void stop() override { + dispatchers.clear(); + } + + bool is_started() const override { + return !dispatchers.empty(); + } + + seastar::future<> shutdown() override; + + void print(ostream& out) const override { + out << get_myname() + << "(" << logic_name + << ") " << get_myaddr(); + } + + SocketPolicy get_policy(entity_type_t peer_type) const override; + + SocketPolicy get_default_policy() const override; + + void set_default_policy(const SocketPolicy& p) override; + + void set_policy(entity_type_t peer_type, const SocketPolicy& p) override; + + void set_policy_throttler(entity_type_t peer_type, Throttle* throttle) override; + + public: + seastar::future<uint32_t> get_global_seq(uint32_t old=0); + seastar::future<> learned_addr(const entity_addr_t &peer_addr_for_me, + const SocketConnection& conn); + + SocketConnectionRef lookup_conn(const entity_addr_t& addr); + void accept_conn(SocketConnectionRef); + void unaccept_conn(SocketConnectionRef); + void register_conn(SocketConnectionRef); + void unregister_conn(SocketConnectionRef); + void closing_conn(SocketConnectionRef); + void closed_conn(SocketConnectionRef); + seastar::shard_id shard_id() const { + assert(seastar::this_shard_id() == master_sid); + return master_sid; + } +}; + +} // namespace crimson::net diff --git a/src/crimson/net/chained_dispatchers.cc b/src/crimson/net/chained_dispatchers.cc new file mode 100644 index 000000000..b13d40c8f --- /dev/null +++ b/src/crimson/net/chained_dispatchers.cc @@ -0,0 +1,93 @@ +#include 
"crimson/common/log.h" +#include "crimson/net/chained_dispatchers.h" +#include "crimson/net/Connection.h" +#include "crimson/net/Dispatcher.h" +#include "msg/Message.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_ms); + } +} + +namespace crimson::net { + +seastar::future<> +ChainedDispatchers::ms_dispatch(crimson::net::ConnectionRef conn, + MessageRef m) { + try { + for (auto& dispatcher : dispatchers) { + auto dispatched = dispatcher->ms_dispatch(conn, m); + if (dispatched.has_value()) { + return std::move(*dispatched + ).handle_exception([conn] (std::exception_ptr eptr) { + logger().error("{} got unexpected exception in ms_dispatch() throttling {}", + *conn, eptr); + ceph_abort(); + }); + } + } + } catch (...) { + logger().error("{} got unexpected exception in ms_dispatch() {}", + *conn, std::current_exception()); + ceph_abort(); + } + if (!dispatchers.empty()) { + logger().error("ms_dispatch unhandled message {}", *m); + } + return seastar::now(); +} + +void +ChainedDispatchers::ms_handle_accept(crimson::net::ConnectionRef conn) { + try { + for (auto& dispatcher : dispatchers) { + dispatcher->ms_handle_accept(conn); + } + } catch (...) { + logger().error("{} got unexpected exception in ms_handle_accept() {}", + *conn, std::current_exception()); + ceph_abort(); + } +} + +void +ChainedDispatchers::ms_handle_connect(crimson::net::ConnectionRef conn) { + try { + for(auto& dispatcher : dispatchers) { + dispatcher->ms_handle_connect(conn); + } + } catch (...) { + logger().error("{} got unexpected exception in ms_handle_connect() {}", + *conn, std::current_exception()); + ceph_abort(); + } +} + +void +ChainedDispatchers::ms_handle_reset(crimson::net::ConnectionRef conn, bool is_replace) { + try { + for (auto& dispatcher : dispatchers) { + dispatcher->ms_handle_reset(conn, is_replace); + } + } catch (...) { + logger().error("{} got unexpected exception in ms_handle_reset() {}", + *conn, std::current_exception()); + ceph_abort(); + } +} + +void +ChainedDispatchers::ms_handle_remote_reset(crimson::net::ConnectionRef conn) { + try { + for (auto& dispatcher : dispatchers) { + dispatcher->ms_handle_remote_reset(conn); + } + } catch (...) 
{ + logger().error("{} got unexpected exception in ms_handle_remote_reset() {}", + *conn, std::current_exception()); + ceph_abort(); + } +} + +} diff --git a/src/crimson/net/chained_dispatchers.h b/src/crimson/net/chained_dispatchers.h new file mode 100644 index 000000000..712b0894b --- /dev/null +++ b/src/crimson/net/chained_dispatchers.h @@ -0,0 +1,36 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "Fwd.h" +#include "crimson/common/log.h" + +namespace crimson::net { + +class Dispatcher; + +class ChainedDispatchers { +public: + void assign(const dispatchers_t& _dispatchers) { + assert(empty()); + assert(!_dispatchers.empty()); + dispatchers = _dispatchers; + } + void clear() { + dispatchers.clear(); + } + bool empty() const { + return dispatchers.empty(); + } + seastar::future<> ms_dispatch(crimson::net::ConnectionRef, MessageRef); + void ms_handle_accept(crimson::net::ConnectionRef conn); + void ms_handle_connect(crimson::net::ConnectionRef conn); + void ms_handle_reset(crimson::net::ConnectionRef conn, bool is_replace); + void ms_handle_remote_reset(crimson::net::ConnectionRef conn); + + private: + dispatchers_t dispatchers; +}; + +} diff --git a/src/crimson/os/CMakeLists.txt b/src/crimson/os/CMakeLists.txt new file mode 100644 index 000000000..f221dd7c1 --- /dev/null +++ b/src/crimson/os/CMakeLists.txt @@ -0,0 +1,15 @@ +add_library(crimson-os STATIC + futurized_store.cc + ${PROJECT_SOURCE_DIR}/src/os/Transaction.cc) +add_subdirectory(cyanstore) + +if(WITH_BLUESTORE) + add_subdirectory(alienstore) +endif() + +add_subdirectory(seastore) +target_link_libraries(crimson-os + crimson-cyanstore + crimson-alienstore + crimson-seastore + crimson) diff --git a/src/crimson/os/alienstore/CMakeLists.txt b/src/crimson/os/alienstore/CMakeLists.txt new file mode 100644 index 000000000..659a3c6ce --- /dev/null +++ b/src/crimson/os/alienstore/CMakeLists.txt @@ -0,0 +1,76 @@ +include_directories(SYSTEM "${CMAKE_SOURCE_DIR}/src/rocksdb/include") + +add_library(alien::cflags INTERFACE IMPORTED) +set_target_properties(alien::cflags PROPERTIES + INTERFACE_COMPILE_DEFINITIONS "WITH_SEASTAR;WITH_ALIEN" + INTERFACE_INCLUDE_DIRECTORIES $<TARGET_PROPERTY:Seastar::seastar,INTERFACE_INCLUDE_DIRECTORIES>) + +add_library(crimson-alien-common STATIC + ${PROJECT_SOURCE_DIR}/src/common/admin_socket.cc + ${PROJECT_SOURCE_DIR}/src/common/blkdev.cc + ${PROJECT_SOURCE_DIR}/src/common/ceph_context.cc + ${PROJECT_SOURCE_DIR}/src/common/ceph_crypto.cc + ${PROJECT_SOURCE_DIR}/src/common/condition_variable_debug.cc + ${PROJECT_SOURCE_DIR}/src/common/cmdparse.cc + ${PROJECT_SOURCE_DIR}/src/common/Finisher.cc + ${PROJECT_SOURCE_DIR}/src/common/HeartbeatMap.cc + ${PROJECT_SOURCE_DIR}/src/common/PluginRegistry.cc + ${PROJECT_SOURCE_DIR}/src/common/lockdep.cc + ${PROJECT_SOURCE_DIR}/src/common/mutex_debug.cc + ${PROJECT_SOURCE_DIR}/src/common/perf_counters.cc + ${PROJECT_SOURCE_DIR}/src/common/perf_counters_collection.cc + ${PROJECT_SOURCE_DIR}/src/common/RefCountedObj.cc + ${PROJECT_SOURCE_DIR}/src/common/shared_mutex_debug.cc + ${PROJECT_SOURCE_DIR}/src/common/SubProcess.cc + ${PROJECT_SOURCE_DIR}/src/common/Throttle.cc + ${PROJECT_SOURCE_DIR}/src/common/Timer.cc + ${PROJECT_SOURCE_DIR}/src/common/TrackedOp.cc + ${PROJECT_SOURCE_DIR}/src/common/WorkQueue.cc + ${PROJECT_SOURCE_DIR}/src/common/util.cc + ${PROJECT_SOURCE_DIR}/src/crush/CrushLocation.cc + ${PROJECT_SOURCE_DIR}/src/global/global_context.cc + $<TARGET_OBJECTS:compressor_objs> + 
$<TARGET_OBJECTS:common_prioritycache_obj>) +target_link_libraries(crimson-alien-common + crimson-common + alien::cflags) + +set(alien_store_srcs + alien_store.cc + thread_pool.cc + ${PROJECT_SOURCE_DIR}/src/os/ObjectStore.cc + ${PROJECT_SOURCE_DIR}/src/os/bluestore/Allocator.cc + ${PROJECT_SOURCE_DIR}/src/os/bluestore/AvlAllocator.cc + ${PROJECT_SOURCE_DIR}/src/os/bluestore/BitmapFreelistManager.cc + ${PROJECT_SOURCE_DIR}/src/os/bluestore/BlueFS.cc + ${PROJECT_SOURCE_DIR}/src/os/bluestore/bluefs_types.cc + ${PROJECT_SOURCE_DIR}/src/os/bluestore/BlueRocksEnv.cc + ${PROJECT_SOURCE_DIR}/src/os/bluestore/BlueStore.cc + ${PROJECT_SOURCE_DIR}/src/os/bluestore/bluestore_types.cc + ${PROJECT_SOURCE_DIR}/src/os/bluestore/fastbmap_allocator_impl.cc + ${PROJECT_SOURCE_DIR}/src/os/bluestore/FreelistManager.cc + ${PROJECT_SOURCE_DIR}/src/os/bluestore/HybridAllocator.cc + ${PROJECT_SOURCE_DIR}/src/os/bluestore/StupidAllocator.cc + ${PROJECT_SOURCE_DIR}/src/os/bluestore/BitmapAllocator.cc) +if(WITH_ZBD) + list(APPEND alien_store_srcs + ${PROJECT_SOURCE_DIR}/src/os/bluestore/zoned_types.cc + ${PROJECT_SOURCE_DIR}/src/os/bluestore/ZonedFreelistManager.cc + ${PROJECT_SOURCE_DIR}/src/os/bluestore/ZonedAllocator.cc) +endif() +add_library(crimson-alienstore STATIC + ${alien_store_srcs}) +if(WITH_LTTNG) + add_dependencies(crimson-alienstore bluestore-tp) +endif() +target_link_libraries(crimson-alienstore + PRIVATE + alien::cflags + fmt::fmt + kv + heap_profiler + crimson-alien-common + ${BLKID_LIBRARIES} + ${UDEV_LIBRARIES} + crimson + blk) diff --git a/src/crimson/os/alienstore/alien_collection.h b/src/crimson/os/alienstore/alien_collection.h new file mode 100644 index 000000000..98b8fdef4 --- /dev/null +++ b/src/crimson/os/alienstore/alien_collection.h @@ -0,0 +1,26 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "os/ObjectStore.h" + +#include "crimson/os/futurized_collection.h" +#include "crimson/os/futurized_store.h" +#include "alien_store.h" + +namespace crimson::os { + +class AlienCollection final : public FuturizedCollection { +public: + AlienCollection(ObjectStore::CollectionHandle ch) + : FuturizedCollection(ch->cid), + collection(ch) {} + + ~AlienCollection() {} + +private: + ObjectStore::CollectionHandle collection; + friend AlienStore; +}; +} diff --git a/src/crimson/os/alienstore/alien_store.cc b/src/crimson/os/alienstore/alien_store.cc new file mode 100644 index 000000000..cb5553254 --- /dev/null +++ b/src/crimson/os/alienstore/alien_store.cc @@ -0,0 +1,575 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "alien_collection.h" +#include "alien_store.h" + +#include <map> +#include <string_view> +#include <boost/algorithm/string/trim.hpp> +#include <fmt/format.h> +#include <fmt/ostream.h> + +#include <seastar/core/alien.hh> +#include <seastar/core/future-util.hh> +#include <seastar/core/reactor.hh> + +#include "common/ceph_context.h" +#include "global/global_context.h" +#include "include/Context.h" +#include "os/bluestore/BlueStore.h" +#include "os/ObjectStore.h" +#include "os/Transaction.h" + +#include "crimson/common/log.h" +#include "crimson/os/futurized_store.h" + +namespace { + seastar::logger& logger() + { + return crimson::get_logger(ceph_subsys_filestore); + } + +class OnCommit final: public Context +{ + int cpuid; + Context *oncommit; + seastar::promise<> &alien_done; +public: + OnCommit( + int id, + seastar::promise<> &done, + Context 
*oncommit, + ceph::os::Transaction& txn) + : cpuid(id), oncommit(oncommit), + alien_done(done) {} + + void finish(int) final { + return seastar::alien::submit_to(cpuid, [this] { + if (oncommit) oncommit->complete(0); + alien_done.set_value(); + return seastar::make_ready_future<>(); + }).wait(); + } +}; +} + +namespace crimson::os { + +AlienStore::AlienStore(const std::string& path, const ConfigValues& values) + : path{path} +{ + cct = std::make_unique<CephContext>(CEPH_ENTITY_TYPE_OSD); + g_ceph_context = cct.get(); + cct->_conf.set_config_values(values); + store = std::make_unique<BlueStore>(cct.get(), path); + + long cpu_id = 0; + if (long nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); nr_cpus != -1) { + cpu_id = nr_cpus - 1; + } else { + logger().error("{}: unable to get nproc: {}", __func__, errno); + cpu_id = -1; + } + tp = std::make_unique<crimson::os::ThreadPool>(1, 128, cpu_id); +} + +seastar::future<> AlienStore::start() +{ + return tp->start(); +} + +seastar::future<> AlienStore::stop() +{ + return tp->submit([this] { + for (auto [cid, ch]: coll_map) + static_cast<AlienCollection*>(ch.get())->collection.reset(); + store.reset(); + }).then([this] { + return tp->stop(); + }); +} + +AlienStore::~AlienStore() = default; + +seastar::future<> AlienStore::mount() +{ + logger().debug("{}", __func__); + return tp->submit([this] { + return store->mount(); + }).then([] (int r) { + assert(r == 0); + return seastar::now(); + }); +} + +seastar::future<> AlienStore::umount() +{ + logger().info("{}", __func__); + return transaction_gate.close().then([this] { + return tp->submit([this] { + return store->umount(); + }); + }).then([] (int r) { + assert(r == 0); + return seastar::now(); + }); +} + +seastar::future<> AlienStore::mkfs(uuid_d osd_fsid) +{ + logger().debug("{}", __func__); + store->set_fsid(osd_fsid); + return tp->submit([this] { + return store->mkfs(); + }).then([] (int r) { + assert(r == 0); + return seastar::now(); + }); +} + +seastar::future<std::tuple<std::vector<ghobject_t>, ghobject_t>> +AlienStore::list_objects(CollectionRef ch, + const ghobject_t& start, + const ghobject_t& end, + uint64_t limit) const +{ + logger().debug("{}", __func__); + return seastar::do_with(std::vector<ghobject_t>(), ghobject_t(), + [=] (auto &objects, auto &next) { + objects.reserve(limit); + return tp->submit([=, &objects, &next] { + auto c = static_cast<AlienCollection*>(ch.get()); + return store->collection_list(c->collection, start, end, + store->get_ideal_list_max(), + &objects, &next); + }).then([&objects, &next] (int r) { + assert(r == 0); + return seastar::make_ready_future<std::tuple<std::vector<ghobject_t>, ghobject_t>>( + std::make_tuple(std::move(objects), std::move(next))); + }); + }); +} + +seastar::future<CollectionRef> AlienStore::create_new_collection(const coll_t& cid) +{ + logger().debug("{}", __func__); + return tp->submit([this, cid] { + return store->create_new_collection(cid); + }).then([this, cid] (ObjectStore::CollectionHandle c) { + CollectionRef ch; + auto cp = coll_map.find(c->cid); + if (cp == coll_map.end()) { + ch = new AlienCollection(c); + coll_map[c->cid] = ch; + } else { + ch = cp->second; + auto ach = static_cast<AlienCollection*>(ch.get()); + if (ach->collection != c) { + ach->collection = c; + } + } + return seastar::make_ready_future<CollectionRef>(ch); + }); + +} + +seastar::future<CollectionRef> AlienStore::open_collection(const coll_t& cid) +{ + logger().debug("{}", __func__); + return tp->submit([this, cid] { + return store->open_collection(cid); + }).then([this] 
(ObjectStore::CollectionHandle c) { + CollectionRef ch; + auto cp = coll_map.find(c->cid); + if (cp == coll_map.end()){ + ch = new AlienCollection(c); + coll_map[c->cid] = ch; + } else { + ch = cp->second; + auto ach = static_cast<AlienCollection*>(ch.get()); + if (ach->collection != c){ + ach->collection = c; + } + } + return seastar::make_ready_future<CollectionRef>(ch); + }); +} + +seastar::future<std::vector<coll_t>> AlienStore::list_collections() +{ + logger().debug("{}", __func__); + + return seastar::do_with(std::vector<coll_t>{}, [=] (auto &ls) { + return tp->submit([this, &ls] { + return store->list_collections(ls); + }).then([&ls] (int r) { + assert(r == 0); + return seastar::make_ready_future<std::vector<coll_t>>(std::move(ls)); + }); + }); +} + +AlienStore::read_errorator::future<ceph::bufferlist> +AlienStore::read(CollectionRef ch, + const ghobject_t& oid, + uint64_t offset, + size_t len, + uint32_t op_flags) +{ + logger().debug("{}", __func__); + return seastar::do_with(ceph::bufferlist{}, [=] (auto &bl) { + return tp->submit([=, &bl] { + auto c = static_cast<AlienCollection*>(ch.get()); + return store->read(c->collection, oid, offset, len, bl, op_flags); + }).then([&bl] (int r) -> read_errorator::future<ceph::bufferlist> { + if (r == -ENOENT) { + return crimson::ct_error::enoent::make(); + } else if (r == -EIO) { + return crimson::ct_error::input_output_error::make(); + } else { + return read_errorator::make_ready_future<ceph::bufferlist>(std::move(bl)); + } + }); + }); +} + +AlienStore::read_errorator::future<ceph::bufferlist> +AlienStore::readv(CollectionRef ch, + const ghobject_t& oid, + interval_set<uint64_t>& m, + uint32_t op_flags) +{ + logger().debug("{}", __func__); + return seastar::do_with(ceph::bufferlist{}, + [this, ch, oid, &m, op_flags](auto& bl) { + return tp->submit([this, ch, oid, &m, op_flags, &bl] { + auto c = static_cast<AlienCollection*>(ch.get()); + return store->readv(c->collection, oid, m, bl, op_flags); + }).then([&bl](int r) -> read_errorator::future<ceph::bufferlist> { + if (r == -ENOENT) { + return crimson::ct_error::enoent::make(); + } else if (r == -EIO) { + return crimson::ct_error::input_output_error::make(); + } else { + return read_errorator::make_ready_future<ceph::bufferlist>(std::move(bl)); + } + }); + }); +} + +AlienStore::get_attr_errorator::future<ceph::bufferptr> +AlienStore::get_attr(CollectionRef ch, + const ghobject_t& oid, + std::string_view name) const +{ + logger().debug("{}", __func__); + return seastar::do_with(ceph::bufferptr{}, [=] (auto &value) { + return tp->submit([=, &value] { + auto c =static_cast<AlienCollection*>(ch.get()); + return store->getattr(c->collection, oid, + static_cast<std::string>(name).c_str(), value); + }).then([oid, &value] (int r) -> get_attr_errorator::future<ceph::bufferptr> { + if (r == -ENOENT) { + return crimson::ct_error::enoent::make(); + } else if (r == -ENODATA) { + return crimson::ct_error::enodata::make(); + } else { + return get_attr_errorator::make_ready_future<ceph::bufferptr>( + std::move(value)); + } + }); + }); +} + +AlienStore::get_attrs_ertr::future<AlienStore::attrs_t> +AlienStore::get_attrs(CollectionRef ch, + const ghobject_t& oid) +{ + logger().debug("{}", __func__); + return seastar::do_with(attrs_t{}, [=] (auto &aset) { + return tp->submit([=, &aset] { + auto c = static_cast<AlienCollection*>(ch.get()); + return store->getattrs(c->collection, oid, + reinterpret_cast<map<string,bufferptr>&>(aset)); + }).then([&aset] (int r) -> get_attrs_ertr::future<attrs_t> { + if (r == 
-ENOENT) { + return crimson::ct_error::enoent::make(); + } else { + return get_attrs_ertr::make_ready_future<attrs_t>(std::move(aset)); + } + }); + }); +} + +auto AlienStore::omap_get_values(CollectionRef ch, + const ghobject_t& oid, + const set<string>& keys) + -> read_errorator::future<omap_values_t> +{ + logger().debug("{}", __func__); + return seastar::do_with(omap_values_t{}, [=] (auto &values) { + return tp->submit([=, &values] { + auto c = static_cast<AlienCollection*>(ch.get()); + return store->omap_get_values(c->collection, oid, keys, + reinterpret_cast<map<string, bufferlist>*>(&values)); + }).then([&values] (int r) -> read_errorator::future<omap_values_t> { + if (r == -ENOENT) { + return crimson::ct_error::enoent::make(); + } else { + assert(r == 0); + return read_errorator::make_ready_future<omap_values_t>(std::move(values)); + } + }); + }); +} + +auto AlienStore::omap_get_values(CollectionRef ch, + const ghobject_t &oid, + const std::optional<string> &start) + -> read_errorator::future<std::tuple<bool, omap_values_t>> +{ + logger().debug("{} with_start", __func__); + return seastar::do_with(omap_values_t{}, [=] (auto &values) { + return tp->submit([=, &values] { + auto c = static_cast<AlienCollection*>(ch.get()); + return store->omap_get_values(c->collection, oid, start, + reinterpret_cast<map<string, bufferlist>*>(&values)); + }).then([&values] (int r) + -> read_errorator::future<std::tuple<bool, omap_values_t>> { + if (r == -ENOENT) { + return crimson::ct_error::enoent::make(); + } else if (r < 0){ + logger().error("omap_get_values(start): {}", r); + return crimson::ct_error::input_output_error::make(); + } else { + return read_errorator::make_ready_future<std::tuple<bool, omap_values_t>>( + std::make_tuple(true, std::move(values))); + } + }); + }); +} + +seastar::future<> AlienStore::do_transaction(CollectionRef ch, + ceph::os::Transaction&& txn) +{ + logger().debug("{}", __func__); + auto id = seastar::this_shard_id(); + auto done = seastar::promise<>(); + return seastar::do_with( + std::move(txn), + std::move(done), + [this, ch, id] (auto &txn, auto &done) { + return seastar::with_gate(transaction_gate, [this, ch, id, &txn, &done] { + return tp_mutex.lock().then ([this, ch, id, &txn, &done] { + Context *crimson_wrapper = + ceph::os::Transaction::collect_all_contexts(txn); + return tp->submit([this, ch, id, crimson_wrapper, &txn, &done] { + txn.register_on_commit(new OnCommit(id, done, crimson_wrapper, txn)); + auto c = static_cast<AlienCollection*>(ch.get()); + return store->queue_transaction(c->collection, std::move(txn)); + }); + }).then([this, &done] (int r) { + assert(r == 0); + tp_mutex.unlock(); + return done.get_future(); + }); + }); + }); +} + +seastar::future<> AlienStore::write_meta(const std::string& key, + const std::string& value) +{ + logger().debug("{}", __func__); + return tp->submit([=] { + return store->write_meta(key, value); + }).then([] (int r) { + assert(r == 0); + return seastar::make_ready_future<>(); + }); +} + +seastar::future<std::tuple<int, std::string>> +AlienStore::read_meta(const std::string& key) +{ + logger().debug("{}", __func__); + return tp->submit([this, key] { + std::string value; + int r = store->read_meta(key, &value); + if (r > 0) { + value.resize(r); + boost::algorithm::trim_right_if(value, + [] (unsigned char c) {return isspace(c);}); + } else { + value.clear(); + } + return std::make_pair(r, value); + }).then([] (auto entry) { + return seastar::make_ready_future<std::tuple<int, std::string>>( + std::move(entry)); + }); +} + 
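
[annotation, not part of the diff] The AlienStore methods above all follow one bridging pattern: keep an out-parameter alive with do_with, run the blocking errno-style ObjectStore call on the thread pool via tp->submit, then translate the integer return code into either an errorator error (enoent, input_output_error) or a ready value. The following self-contained sketch shows that pattern in plain C++; ReadResult, blocking_read and async_read are hypothetical names, std::async stands in for ThreadPool::submit, and a std::variant stands in for crimson's errorator futures.

    #include <cerrno>
    #include <future>
    #include <iostream>
    #include <string>
    #include <variant>

    enum class Err { enoent, io };
    using ReadResult = std::variant<std::string, Err>;   // value-or-error, like read_errorator

    // errno-style "ObjectStore" call, executed off the reactor thread
    int blocking_read(const std::string& oid, std::string* out) {
      if (oid.empty()) {
        return -ENOENT;
      }
      *out = "payload-for-" + oid;
      return 0;
    }

    std::future<ReadResult> async_read(std::string oid) {
      return std::async(std::launch::async, [oid = std::move(oid)] () -> ReadResult {
        std::string bl;                       // out-parameter kept alive for the call,
        int r = blocking_read(oid, &bl);      // like do_with(bufferlist{}, ...)
        if (r == -ENOENT) {
          return Err::enoent;                 // maps to crimson::ct_error::enoent
        } else if (r == -EIO) {
          return Err::io;                     // maps to crimson::ct_error::input_output_error
        }
        return bl;                            // maps to make_ready_future<bufferlist>(...)
      });
    }

    int main() {
      auto res = async_read("rbd_data.0").get();
      if (auto* v = std::get_if<std::string>(&res)) {
        std::cout << "read ok: " << *v << "\n";
      } else {
        std::cout << "read failed\n";
      }
    }
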
+uuid_d AlienStore::get_fsid() const +{ + logger().debug("{}", __func__); + return store->get_fsid(); +} + +seastar::future<store_statfs_t> AlienStore::stat() const +{ + logger().info("{}", __func__); + return seastar::do_with(store_statfs_t{}, [this] (store_statfs_t &st) { + return tp->submit([this, &st] { + return store->statfs(&st, nullptr); + }).then([&st] (int r) { + assert(r == 0); + return seastar::make_ready_future<store_statfs_t>(std::move(st)); + }); + }); +} + +unsigned AlienStore::get_max_attr_name_length() const +{ + logger().info("{}", __func__); + return 256; +} + +seastar::future<struct stat> AlienStore::stat( + CollectionRef ch, + const ghobject_t& oid) +{ + return seastar::do_with((struct stat){}, [this, ch, oid](auto& st) { + return tp->submit([this, ch, oid, &st] { + auto c = static_cast<AlienCollection*>(ch.get()); + store->stat(c->collection, oid, &st); + return st; + }); + }); +} + +auto AlienStore::omap_get_header(CollectionRef ch, + const ghobject_t& oid) + -> read_errorator::future<ceph::bufferlist> +{ + return seastar::do_with(ceph::bufferlist(), [=](auto& bl) { + return tp->submit([=, &bl] { + auto c = static_cast<AlienCollection*>(ch.get()); + return store->omap_get_header(c->collection, oid, &bl); + }).then([&bl] (int r) -> read_errorator::future<ceph::bufferlist> { + if (r == -ENOENT) { + return crimson::ct_error::enoent::make(); + } else if (r < 0) { + logger().error("omap_get_header: {}", r); + return crimson::ct_error::input_output_error::make(); + } else { + return read_errorator::make_ready_future<ceph::bufferlist>(std::move(bl)); + } + }); + }); +} + +seastar::future<std::map<uint64_t, uint64_t>> AlienStore::fiemap( + CollectionRef ch, + const ghobject_t& oid, + uint64_t off, + uint64_t len) +{ + return seastar::do_with(std::map<uint64_t, uint64_t>(), [=](auto& destmap) { + return tp->submit([=, &destmap] { + auto c = static_cast<AlienCollection*>(ch.get()); + return store->fiemap(c->collection, oid, off, len, destmap); + }).then([&destmap] (int i) { + return seastar::make_ready_future + <std::map<uint64_t, uint64_t>> + (std::move(destmap)); + }); + }); +} + +seastar::future<FuturizedStore::OmapIteratorRef> AlienStore::get_omap_iterator( + CollectionRef ch, + const ghobject_t& oid) +{ + return tp->submit([=] { + auto c = static_cast<AlienCollection*>(ch.get()); + auto iter = store->get_omap_iterator(c->collection, oid); + return FuturizedStore::OmapIteratorRef( + new AlienStore::AlienOmapIterator(iter, + this)); + }); +} + +//TODO: each iterator op needs one submit, this is not efficient, +// needs further optimization. 
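
[annotation, not part of the diff] The TODO above flags the cost behind the AlienOmapIterator methods that follow: every seek, bound or next goes through one tp->submit, so walking N keys pays N+1 cross-thread round trips (one seek_to_first plus N next calls). A minimal illustrative sketch, assuming a plain std::map as the backing store; OffloadedIterator and its submit() helper are hypothetical, with std::async standing in for the thread pool hop.

    #include <future>
    #include <iostream>
    #include <map>
    #include <string>

    class OffloadedIterator {
      using Map = std::map<std::string, std::string>;
      const Map& omap_;
      Map::const_iterator it_;
      int submits_ = 0;                         // counts cross-thread hops

      template <typename Fn>
      void submit(Fn&& fn) {                    // stand-in for tp->submit(...)
        ++submits_;
        std::async(std::launch::async, std::forward<Fn>(fn)).get();
      }

    public:
      explicit OffloadedIterator(const Map& m) : omap_(m), it_(m.end()) {}
      void seek_to_first() { submit([this] { it_ = omap_.begin(); }); }
      void next()          { submit([this] { ++it_; }); }
      bool valid() const   { return it_ != omap_.end(); }   // local, like valid() above
      const std::string& key() const { return it_->first; }
      int submits() const  { return submits_; }
    };

    int main() {
      std::map<std::string, std::string> omap{{"a", "1"}, {"b", "2"}, {"c", "3"}};
      OffloadedIterator it(omap);
      for (it.seek_to_first(); it.valid(); it.next()) {
        std::cout << it.key() << "\n";
      }
      std::cout << "submits: " << it.submits() << "\n";      // 4 hops for 3 keys
    }
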
+seastar::future<> AlienStore::AlienOmapIterator::seek_to_first() +{ + return store->tp->submit([=] { + return iter->seek_to_first(); + }).then([] (int r) { + assert(r == 0); + return seastar::now(); + }); +} + +seastar::future<> AlienStore::AlienOmapIterator::upper_bound( + const std::string& after) +{ + return store->tp->submit([this, after] { + return iter->upper_bound(after); + }).then([] (int r) { + assert(r == 0); + return seastar::now(); + }); +} + +seastar::future<> AlienStore::AlienOmapIterator::lower_bound( + const std::string& to) +{ + return store->tp->submit([this, to] { + return iter->lower_bound(to); + }).then([] (int r) { + assert(r == 0); + return seastar::now(); + }); +} + +seastar::future<> AlienStore::AlienOmapIterator::next() +{ + return store->tp->submit([this] { + return iter->next(); + }).then([] (int r) { + assert(r == 0); + return seastar::now(); + }); +} + +bool AlienStore::AlienOmapIterator::valid() const +{ + return iter->valid(); +} + +std::string AlienStore::AlienOmapIterator::key() +{ + return iter->key(); +} + +seastar::future<std::string> AlienStore::AlienOmapIterator::tail_key() +{ + return store->tp->submit([this] { + return iter->tail_key(); + }); +} + +ceph::buffer::list AlienStore::AlienOmapIterator::value() +{ + return iter->value(); +} + +int AlienStore::AlienOmapIterator::status() const +{ + return iter->status(); +} + +} diff --git a/src/crimson/os/alienstore/alien_store.h b/src/crimson/os/alienstore/alien_store.h new file mode 100644 index 000000000..92739340e --- /dev/null +++ b/src/crimson/os/alienstore/alien_store.h @@ -0,0 +1,125 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <seastar/core/future.hh> +#include <seastar/core/shared_mutex.hh> + +#include "common/ceph_context.h" +#include "os/ObjectStore.h" +#include "osd/osd_types.h" + +#include "crimson/os/alienstore/thread_pool.h" +#include "crimson/os/futurized_collection.h" +#include "crimson/os/futurized_store.h" + +namespace ceph::os { +class Transaction; +} + +namespace crimson::os { +class AlienStore final : public FuturizedStore { +public: + class AlienOmapIterator final : public OmapIterator { + public: + AlienOmapIterator(ObjectMap::ObjectMapIterator& it, + AlienStore* store) : iter(it), store(store) {} + seastar::future<> seek_to_first(); + seastar::future<> upper_bound(const std::string& after); + seastar::future<> lower_bound(const std::string& to); + bool valid() const; + seastar::future<> next(); + std::string key(); + seastar::future<std::string> tail_key(); + ceph::buffer::list value(); + int status() const; + private: + ObjectMap::ObjectMapIterator iter; + AlienStore* store; + }; + AlienStore(const std::string& path, const ConfigValues& values); + ~AlienStore() final; + + seastar::future<> start() final; + seastar::future<> stop() final; + seastar::future<> mount() final; + seastar::future<> umount() final; + + seastar::future<> mkfs(uuid_d new_osd_fsid) final; + read_errorator::future<ceph::bufferlist> read(CollectionRef c, + const ghobject_t& oid, + uint64_t offset, + size_t len, + uint32_t op_flags = 0) final; + read_errorator::future<ceph::bufferlist> readv(CollectionRef c, + const ghobject_t& oid, + interval_set<uint64_t>& m, + uint32_t op_flags = 0) final; + + + get_attr_errorator::future<ceph::bufferptr> get_attr(CollectionRef c, + const ghobject_t& oid, + std::string_view name) const final; + get_attrs_ertr::future<attrs_t> get_attrs(CollectionRef c, + const ghobject_t& oid) final; + + 
read_errorator::future<omap_values_t> omap_get_values( + CollectionRef c, + const ghobject_t& oid, + const omap_keys_t& keys) final; + + /// Retrieves paged set of values > start (if present) + read_errorator::future<std::tuple<bool, omap_values_t>> omap_get_values( + CollectionRef c, ///< [in] collection + const ghobject_t &oid, ///< [in] oid + const std::optional<std::string> &start ///< [in] start, empty for begin + ) final; ///< @return <done, values> values.empty() iff done + + seastar::future<std::tuple<std::vector<ghobject_t>, ghobject_t>> list_objects( + CollectionRef c, + const ghobject_t& start, + const ghobject_t& end, + uint64_t limit) const final; + + seastar::future<CollectionRef> create_new_collection(const coll_t& cid) final; + seastar::future<CollectionRef> open_collection(const coll_t& cid) final; + seastar::future<std::vector<coll_t>> list_collections() final; + + seastar::future<> do_transaction(CollectionRef c, + ceph::os::Transaction&& txn) final; + + seastar::future<> write_meta(const std::string& key, + const std::string& value) final; + seastar::future<std::tuple<int, std::string>> read_meta( + const std::string& key) final; + uuid_d get_fsid() const final; + seastar::future<store_statfs_t> stat() const final; + unsigned get_max_attr_name_length() const final; + seastar::future<struct stat> stat( + CollectionRef, + const ghobject_t&) final; + read_errorator::future<ceph::bufferlist> omap_get_header( + CollectionRef, + const ghobject_t&) final; + seastar::future<std::map<uint64_t, uint64_t>> fiemap( + CollectionRef, + const ghobject_t&, + uint64_t off, + uint64_t len) final; + seastar::future<FuturizedStore::OmapIteratorRef> get_omap_iterator( + CollectionRef ch, + const ghobject_t& oid) final; + +private: + constexpr static unsigned MAX_KEYS_PER_OMAP_GET_CALL = 32; + mutable std::unique_ptr<crimson::os::ThreadPool> tp; + const std::string path; + uint64_t used_bytes = 0; + std::unique_ptr<ObjectStore> store; + std::unique_ptr<CephContext> cct; + seastar::gate transaction_gate; + std::unordered_map<coll_t, CollectionRef> coll_map; + seastar::shared_mutex tp_mutex; +}; +} diff --git a/src/crimson/os/alienstore/thread_pool.cc b/src/crimson/os/alienstore/thread_pool.cc new file mode 100644 index 000000000..e127d87d5 --- /dev/null +++ b/src/crimson/os/alienstore/thread_pool.cc @@ -0,0 +1,80 @@ +#include "thread_pool.h" + +#include <chrono> +#include <pthread.h> + +#include "include/ceph_assert.h" +#include "crimson/common/config_proxy.h" + +using crimson::common::local_conf; + +namespace crimson::os { + +ThreadPool::ThreadPool(size_t n_threads, + size_t queue_sz, + long cpu_id) + : queue_size{round_up_to(queue_sz, seastar::smp::count)}, + pending{queue_size} +{ + auto queue_max_wait = std::chrono::seconds(local_conf()->threadpool_empty_queue_max_wait); + for (size_t i = 0; i < n_threads; i++) { + threads.emplace_back([this, cpu_id, queue_max_wait] { + if (cpu_id >= 0) { + pin(cpu_id); + } + loop(queue_max_wait); + }); + } +} + +ThreadPool::~ThreadPool() +{ + for (auto& thread : threads) { + thread.join(); + } +} + +void ThreadPool::pin(unsigned cpu_id) +{ + cpu_set_t cs; + CPU_ZERO(&cs); + CPU_SET(cpu_id, &cs); + [[maybe_unused]] auto r = pthread_setaffinity_np(pthread_self(), + sizeof(cs), &cs); + ceph_assert(r == 0); +} + +void ThreadPool::loop(std::chrono::milliseconds queue_max_wait) +{ + for (;;) { + WorkItem* work_item = nullptr; + { + std::unique_lock lock{mutex}; + cond.wait_for(lock, queue_max_wait, + [this, &work_item] { + return pending.pop(work_item) || 
is_stopping(); + }); + } + if (work_item) { + work_item->process(); + } else if (is_stopping()) { + break; + } + } +} + +seastar::future<> ThreadPool::start() +{ + auto slots_per_shard = queue_size / seastar::smp::count; + return submit_queue.start(slots_per_shard); +} + +seastar::future<> ThreadPool::stop() +{ + return submit_queue.stop().then([this] { + stopping = true; + cond.notify_all(); + }); +} + +} // namespace crimson::os diff --git a/src/crimson/os/alienstore/thread_pool.h b/src/crimson/os/alienstore/thread_pool.h new file mode 100644 index 000000000..27840da18 --- /dev/null +++ b/src/crimson/os/alienstore/thread_pool.h @@ -0,0 +1,132 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#pragma once + +#include <atomic> +#include <condition_variable> +#include <tuple> +#include <type_traits> +#include <boost/lockfree/queue.hpp> +#include <boost/optional.hpp> +#include <seastar/core/future.hh> +#include <seastar/core/gate.hh> +#include <seastar/core/reactor.hh> +#include <seastar/core/semaphore.hh> +#include <seastar/core/sharded.hh> + +namespace crimson::os { + +struct WorkItem { + virtual ~WorkItem() {} + virtual void process() = 0; +}; + +template<typename Func> +struct Task final : WorkItem { + using T = std::invoke_result_t<Func>; + using future_stored_type_t = + std::conditional_t<std::is_void_v<T>, + seastar::internal::future_stored_type_t<>, + seastar::internal::future_stored_type_t<T>>; + using futurator_t = seastar::futurize<T>; +public: + explicit Task(Func&& f) + : func(std::move(f)) + {} + void process() override { + try { + if constexpr (std::is_void_v<T>) { + func(); + state.set(); + } else { + state.set(func()); + } + } catch (...) { + state.set_exception(std::current_exception()); + } + on_done.write_side().signal(1); + } + typename futurator_t::type get_future() { + return on_done.wait().then([this](size_t) { + if (state.failed()) { + return futurator_t::make_exception_future(state.get_exception()); + } else { + return futurator_t::from_tuple(state.get_value()); + } + }); + } +private: + Func func; + seastar::future_state<future_stored_type_t> state; + seastar::readable_eventfd on_done; +}; + +struct SubmitQueue { + seastar::semaphore free_slots; + seastar::gate pending_tasks; + explicit SubmitQueue(size_t num_free_slots) + : free_slots(num_free_slots) + {} + seastar::future<> stop() { + return pending_tasks.close(); + } +}; + +/// an engine for scheduling non-seastar tasks from seastar fibers +class ThreadPool { + std::atomic<bool> stopping = false; + std::mutex mutex; + std::condition_variable cond; + std::vector<std::thread> threads; + seastar::sharded<SubmitQueue> submit_queue; + const size_t queue_size; + boost::lockfree::queue<WorkItem*> pending; + + void loop(std::chrono::milliseconds queue_max_wait); + bool is_stopping() const { + return stopping.load(std::memory_order_relaxed); + } + static void pin(unsigned cpu_id); + seastar::semaphore& local_free_slots() { + return submit_queue.local().free_slots; + } + ThreadPool(const ThreadPool&) = delete; + ThreadPool& operator=(const ThreadPool&) = delete; +public: + /** + * @param queue_sz the depth of pending queue. before a task is scheduled, + * it waits in this queue. we will round this number to + * multiple of the number of cores. + * @param n_threads the number of threads in this thread pool. 
+ * @param cpu the CPU core to which this thread pool is assigned + * @note each @c Task has its own crimson::thread::Condition, which possesses + * an fd, so we should keep the size of queue under a reasonable limit. + */ + ThreadPool(size_t n_threads, size_t queue_sz, long cpu); + ~ThreadPool(); + seastar::future<> start(); + seastar::future<> stop(); + template<typename Func, typename...Args> + auto submit(Func&& func, Args&&... args) { + auto packaged = [func=std::move(func), + args=std::forward_as_tuple(args...)] { + return std::apply(std::move(func), std::move(args)); + }; + return seastar::with_gate(submit_queue.local().pending_tasks, + [packaged=std::move(packaged), this] { + return local_free_slots().wait() + .then([packaged=std::move(packaged), this] { + auto task = new Task{std::move(packaged)}; + auto fut = task->get_future(); + pending.push(task); + cond.notify_one(); + return fut.finally([task, this] { + local_free_slots().signal(); + delete task; + }); + }); + }); + } +}; + +} // namespace crimson::os diff --git a/src/crimson/os/cyanstore/CMakeLists.txt b/src/crimson/os/cyanstore/CMakeLists.txt new file mode 100644 index 000000000..65f2b5498 --- /dev/null +++ b/src/crimson/os/cyanstore/CMakeLists.txt @@ -0,0 +1,7 @@ +add_library(crimson-cyanstore STATIC + cyan_store.cc + cyan_collection.cc + cyan_object.cc) +target_link_libraries(crimson-cyanstore + crimson + crimson-os) diff --git a/src/crimson/os/cyanstore/cyan_collection.cc b/src/crimson/os/cyanstore/cyan_collection.cc new file mode 100644 index 000000000..f44234e84 --- /dev/null +++ b/src/crimson/os/cyanstore/cyan_collection.cc @@ -0,0 +1,76 @@ +#include "cyan_collection.h" + +#include "cyan_object.h" + +namespace crimson::os +{ + +Collection::Collection(const coll_t& c) + : FuturizedCollection{c} +{} + +Collection::~Collection() = default; + +Collection::ObjectRef Collection::create_object() const +{ + return new crimson::os::Object; +} + +Collection::ObjectRef Collection::get_object(ghobject_t oid) +{ + auto o = object_hash.find(oid); + if (o == object_hash.end()) + return ObjectRef(); + return o->second; +} + +Collection::ObjectRef Collection::get_or_create_object(ghobject_t oid) +{ + auto result = object_hash.emplace(oid, ObjectRef{}); + if (result.second) + object_map[oid] = result.first->second = create_object(); + return result.first->second; +} + +uint64_t Collection::used_bytes() const +{ + uint64_t result = 0; + for (auto& obj : object_map) { + result += obj.second->get_size(); + } + return result; +} + +void Collection::encode(bufferlist& bl) const +{ + ENCODE_START(1, 1, bl); + encode(xattr, bl); + encode(use_page_set, bl); + uint32_t s = object_map.size(); + encode(s, bl); + for (auto& [oid, obj] : object_map) { + encode(oid, bl); + obj->encode(bl); + } + ENCODE_FINISH(bl); +} + +void Collection::decode(bufferlist::const_iterator& p) +{ + DECODE_START(1, p); + decode(xattr, p); + decode(use_page_set, p); + uint32_t s; + decode(s, p); + while (s--) { + ghobject_t k; + decode(k, p); + auto o = create_object(); + o->decode(p); + object_map.insert(make_pair(k, o)); + object_hash.insert(make_pair(k, o)); + } + DECODE_FINISH(p); +} + +} diff --git a/src/crimson/os/cyanstore/cyan_collection.h b/src/crimson/os/cyanstore/cyan_collection.h new file mode 100644 index 000000000..068e427d8 --- /dev/null +++ b/src/crimson/os/cyanstore/cyan_collection.h @@ -0,0 +1,51 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <string> +#include 
<unordered_map> +#include <boost/intrusive_ptr.hpp> +#include <boost/smart_ptr/intrusive_ref_counter.hpp> + +#include "include/buffer.h" +#include "osd/osd_types.h" + +#include "crimson/os/futurized_collection.h" + +namespace crimson::os { + +class Object; +/** + * a collection also orders transactions + * + * Any transactions queued under a given collection will be applied in + * sequence. Transactions queued under different collections may run + * in parallel. + * + * ObjectStore users may get collection handles with open_collection() (or, + * for bootstrapping a new collection, create_new_collection()). + */ +struct Collection final : public FuturizedCollection { + using ObjectRef = boost::intrusive_ptr<Object>; + int bits = 0; + // always use bufferlist object for testing + bool use_page_set = false; + std::unordered_map<ghobject_t, ObjectRef> object_hash; ///< for lookup + std::map<ghobject_t, ObjectRef> object_map; ///< for iteration + std::map<std::string,bufferptr> xattr; + bool exists = true; + + Collection(const coll_t& c); + ~Collection() final; + + ObjectRef create_object() const; + ObjectRef get_object(ghobject_t oid); + ObjectRef get_or_create_object(ghobject_t oid); + uint64_t used_bytes() const; + + void encode(bufferlist& bl) const; + void decode(bufferlist::const_iterator& p); +}; + +} diff --git a/src/crimson/os/cyanstore/cyan_object.cc b/src/crimson/os/cyanstore/cyan_object.cc new file mode 100644 index 000000000..34bc13b7f --- /dev/null +++ b/src/crimson/os/cyanstore/cyan_object.cc @@ -0,0 +1,89 @@ +#include "cyan_object.h" +#include "include/encoding.h" + +namespace crimson::os { + +size_t Object::get_size() const { + return data.length(); +} + +ceph::bufferlist Object::read(uint64_t offset, uint64_t len) +{ + bufferlist ret; + ret.substr_of(data, offset, len); + return ret; +} + +int Object::write(uint64_t offset, const bufferlist &src) +{ + unsigned len = src.length(); + // before + bufferlist newdata; + if (get_size() >= offset) { + newdata.substr_of(data, 0, offset); + } else { + if (get_size()) { + newdata.substr_of(data, 0, get_size()); + } + newdata.append_zero(offset - get_size()); + } + + newdata.append(src); + + // after + if (get_size() > offset + len) { + bufferlist tail; + tail.substr_of(data, offset + len, get_size() - (offset + len)); + newdata.append(tail); + } + + data = std::move(newdata); + return 0; +} + +int Object::clone(Object *src, uint64_t srcoff, uint64_t len, + uint64_t dstoff) +{ + bufferlist bl; + if (srcoff == dstoff && len == src->get_size()) { + data = src->data; + return 0; + } + bl.substr_of(src->data, srcoff, len); + return write(dstoff, bl); + +} + +int Object::truncate(uint64_t size) +{ + if (get_size() > size) { + bufferlist bl; + bl.substr_of(data, 0, size); + data = std::move(bl); + } else if (get_size() == size) { + // do nothing + } else { + data.append_zero(size - get_size()); + } + return 0; +} + +void Object::encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(data, bl); + encode(xattr, bl); + encode(omap_header, bl); + encode(omap, bl); + ENCODE_FINISH(bl); +} + +void Object::decode(bufferlist::const_iterator& p) { + DECODE_START(1, p); + decode(data, p); + decode(xattr, p); + decode(omap_header, p); + decode(omap, p); + DECODE_FINISH(p); +} + +} diff --git a/src/crimson/os/cyanstore/cyan_object.h b/src/crimson/os/cyanstore/cyan_object.h new file mode 100644 index 000000000..f19b87212 --- /dev/null +++ b/src/crimson/os/cyanstore/cyan_object.h @@ -0,0 +1,45 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; 
indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#pragma once + +#include <cstddef> +#include <map> +#include <string> +#include <boost/intrusive_ptr.hpp> +#include <boost/smart_ptr/intrusive_ref_counter.hpp> + +#include "include/buffer.h" + +namespace crimson::os { + +struct Object : public boost::intrusive_ref_counter< + Object, + boost::thread_unsafe_counter> +{ + using bufferlist = ceph::bufferlist; + + bufferlist data; + // use transparent comparator for better performance, see + // https://en.cppreference.com/w/cpp/utility/functional/less_void + std::map<std::string,bufferptr,std::less<>> xattr; + bufferlist omap_header; + std::map<std::string,bufferlist> omap; + + typedef boost::intrusive_ptr<Object> Ref; + + Object() = default; + + // interface for object data + size_t get_size() const; + ceph::bufferlist read(uint64_t offset, uint64_t len); + int write(uint64_t offset, const bufferlist &bl); + int clone(Object *src, uint64_t srcoff, uint64_t len, + uint64_t dstoff); + int truncate(uint64_t offset); + + void encode(bufferlist& bl) const; + void decode(bufferlist::const_iterator& p); +}; +using ObjectRef = boost::intrusive_ptr<Object>; + +} diff --git a/src/crimson/os/cyanstore/cyan_store.cc b/src/crimson/os/cyanstore/cyan_store.cc new file mode 100644 index 000000000..eb93d72ec --- /dev/null +++ b/src/crimson/os/cyanstore/cyan_store.cc @@ -0,0 +1,835 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "cyan_store.h" + +#include <boost/algorithm/string/trim.hpp> +#include <fmt/format.h> +#include <fmt/ostream.h> + +#include "common/safe_io.h" +#include "os/Transaction.h" + +#include "crimson/common/buffer_io.h" +#include "crimson/common/config_proxy.h" +#include "cyan_collection.h" +#include "cyan_object.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_filestore); + } +} + +using crimson::common::local_conf; + +namespace crimson::os { + +using ObjectRef = boost::intrusive_ptr<Object>; + +CyanStore::CyanStore(const std::string& path) + : path{path} +{} + +CyanStore::~CyanStore() = default; + +seastar::future<> CyanStore::mount() +{ + ceph::bufferlist bl; + std::string fn = path + "/collections"; + std::string err; + if (int r = bl.read_file(fn.c_str(), &err); r < 0) { + throw std::runtime_error("read_file"); + } + + std::set<coll_t> collections; + auto p = bl.cbegin(); + ceph::decode(collections, p); + + for (auto& coll : collections) { + std::string fn = fmt::format("{}/{}", path, coll); + ceph::bufferlist cbl; + if (int r = cbl.read_file(fn.c_str(), &err); r < 0) { + throw std::runtime_error("read_file"); + } + boost::intrusive_ptr<Collection> c{new Collection{coll}}; + auto p = cbl.cbegin(); + c->decode(p); + coll_map[coll] = c; + used_bytes += c->used_bytes(); + } + return seastar::now(); +} + +seastar::future<> CyanStore::umount() +{ + return seastar::do_with(std::set<coll_t>{}, [this](auto& collections) { + return seastar::do_for_each(coll_map, [&collections, this](auto& coll) { + auto& [col, ch] = coll; + collections.insert(col); + ceph::bufferlist bl; + ceph_assert(ch); + ch->encode(bl); + std::string fn = fmt::format("{}/{}", path, col); + return crimson::write_file(std::move(bl), fn); + }).then([&collections, this] { + ceph::bufferlist bl; + ceph::encode(collections, bl); + std::string fn = fmt::format("{}/collections", path); + return crimson::write_file(std::move(bl), fn); + }); + }); +} + +seastar::future<> CyanStore::mkfs(uuid_d new_osd_fsid) +{ + return 
read_meta("fsid").then([=](auto&& ret) { + auto& [r, fsid_str] = ret; + if (r == -ENOENT) { + if (new_osd_fsid.is_zero()) { + osd_fsid.generate_random(); + } else { + osd_fsid = new_osd_fsid; + } + return write_meta("fsid", fmt::format("{}", osd_fsid)); + } else if (r < 0) { + throw std::runtime_error("read_meta"); + } else { + logger().info("{} already has fsid {}", __func__, fsid_str); + if (!osd_fsid.parse(fsid_str.c_str())) { + throw std::runtime_error("failed to parse fsid"); + } else if (osd_fsid != new_osd_fsid) { + logger().error("on-disk fsid {} != provided {}", osd_fsid, new_osd_fsid); + throw std::runtime_error("unmatched osd_fsid"); + } else { + return seastar::now(); + } + } + }).then([this]{ + std::string fn = path + "/collections"; + ceph::bufferlist bl; + std::set<coll_t> collections; + ceph::encode(collections, bl); + return crimson::write_file(std::move(bl), fn); + }).then([this] { + return write_meta("type", "memstore"); + }); +} + +seastar::future<store_statfs_t> CyanStore::stat() const +{ + logger().debug("{}", __func__); + store_statfs_t st; + st.total = crimson::common::local_conf().get_val<Option::size_t>("memstore_device_bytes"); + st.available = st.total - used_bytes; + return seastar::make_ready_future<store_statfs_t>(std::move(st)); +} + +seastar::future<std::tuple<std::vector<ghobject_t>, ghobject_t>> +CyanStore::list_objects(CollectionRef ch, + const ghobject_t& start, + const ghobject_t& end, + uint64_t limit) const +{ + auto c = static_cast<Collection*>(ch.get()); + logger().debug("{} {} {} {} {}", + __func__, c->get_cid(), start, end, limit); + std::vector<ghobject_t> objects; + objects.reserve(limit); + ghobject_t next = ghobject_t::get_max(); + for (const auto& [oid, obj] : + boost::make_iterator_range(c->object_map.lower_bound(start), + c->object_map.end())) { + std::ignore = obj; + if (oid >= end || objects.size() >= limit) { + next = oid; + break; + } + objects.push_back(oid); + } + return seastar::make_ready_future<std::tuple<std::vector<ghobject_t>, ghobject_t>>( + std::make_tuple(std::move(objects), next)); +} + +seastar::future<CollectionRef> CyanStore::create_new_collection(const coll_t& cid) +{ + auto c = new Collection{cid}; + new_coll_map[cid] = c; + return seastar::make_ready_future<CollectionRef>(c); +} + +seastar::future<CollectionRef> CyanStore::open_collection(const coll_t& cid) +{ + return seastar::make_ready_future<CollectionRef>(_get_collection(cid)); +} + +seastar::future<std::vector<coll_t>> CyanStore::list_collections() +{ + std::vector<coll_t> collections; + for (auto& coll : coll_map) { + collections.push_back(coll.first); + } + return seastar::make_ready_future<std::vector<coll_t>>(std::move(collections)); +} + +CyanStore::read_errorator::future<ceph::bufferlist> CyanStore::read( + CollectionRef ch, + const ghobject_t& oid, + uint64_t offset, + size_t len, + uint32_t op_flags) +{ + auto c = static_cast<Collection*>(ch.get()); + logger().debug("{} {} {} {}~{}", + __func__, c->get_cid(), oid, offset, len); + if (!c->exists) { + return crimson::ct_error::enoent::make(); + } + ObjectRef o = c->get_object(oid); + if (!o) { + return crimson::ct_error::enoent::make(); + } + if (offset >= o->get_size()) + return read_errorator::make_ready_future<ceph::bufferlist>(); + size_t l = len; + if (l == 0 && offset == 0) // note: len == 0 means read the entire object + l = o->get_size(); + else if (offset + l > o->get_size()) + l = o->get_size() - offset; + return read_errorator::make_ready_future<ceph::bufferlist>(o->read(offset, l)); +} + 
+CyanStore::read_errorator::future<ceph::bufferlist> CyanStore::readv( + CollectionRef ch, + const ghobject_t& oid, + interval_set<uint64_t>& m, + uint32_t op_flags) +{ + return seastar::do_with(ceph::bufferlist{}, + [this, ch, oid, &m, op_flags](auto& bl) { + return crimson::do_for_each(m, + [this, ch, oid, op_flags, &bl](auto& p) { + return read(ch, oid, p.first, p.second, op_flags) + .safe_then([&bl](auto ret) { + bl.claim_append(ret); + }); + }).safe_then([&bl] { + return read_errorator::make_ready_future<ceph::bufferlist>(std::move(bl)); + }); + }); +} + + +CyanStore::get_attr_errorator::future<ceph::bufferptr> CyanStore::get_attr( + CollectionRef ch, + const ghobject_t& oid, + std::string_view name) const +{ + auto c = static_cast<Collection*>(ch.get()); + logger().debug("{} {} {}", + __func__, c->get_cid(), oid); + auto o = c->get_object(oid); + if (!o) { + return crimson::ct_error::enoent::make(); + } + if (auto found = o->xattr.find(name); found != o->xattr.end()) { + return get_attr_errorator::make_ready_future<ceph::bufferptr>(found->second); + } else { + return crimson::ct_error::enodata::make(); + } +} + +CyanStore::get_attrs_ertr::future<CyanStore::attrs_t> CyanStore::get_attrs( + CollectionRef ch, + const ghobject_t& oid) +{ + auto c = static_cast<Collection*>(ch.get()); + logger().debug("{} {} {}", + __func__, c->get_cid(), oid); + auto o = c->get_object(oid); + if (!o) { + return crimson::ct_error::enoent::make(); + } + return get_attrs_ertr::make_ready_future<attrs_t>(o->xattr); +} + +auto CyanStore::omap_get_values(CollectionRef ch, + const ghobject_t& oid, + const omap_keys_t& keys) + -> read_errorator::future<omap_values_t> +{ + auto c = static_cast<Collection*>(ch.get()); + logger().debug("{} {} {}", __func__, c->get_cid(), oid); + auto o = c->get_object(oid); + if (!o) { + return crimson::ct_error::enoent::make(); + } + omap_values_t values; + for (auto& key : keys) { + if (auto found = o->omap.find(key); found != o->omap.end()) { + values.insert(*found); + } + } + return seastar::make_ready_future<omap_values_t>(std::move(values)); +} + +auto +CyanStore::omap_get_values(CollectionRef ch, + const ghobject_t &oid, + const std::optional<string> &start) + -> read_errorator::future<std::tuple<bool, omap_values_t>> +{ + auto c = static_cast<Collection*>(ch.get()); + logger().debug("{} {} {}", __func__, c->get_cid(), oid); + auto o = c->get_object(oid); + if (!o) { + return crimson::ct_error::enoent::make(); + } + omap_values_t values; + for (auto i = start ? 
o->omap.upper_bound(*start) : o->omap.begin(); + values.size() < MAX_KEYS_PER_OMAP_GET_CALL && i != o->omap.end(); + ++i) { + values.insert(*i); + } + return seastar::make_ready_future<std::tuple<bool, omap_values_t>>( + std::make_tuple(true, std::move(values))); +} + +auto +CyanStore::omap_get_header(CollectionRef ch, + const ghobject_t& oid) + -> read_errorator::future<ceph::bufferlist> +{ + auto c = static_cast<Collection*>(ch.get()); + auto o = c->get_object(oid); + if (!o) { + return crimson::ct_error::enoent::make(); + } + + return read_errorator::make_ready_future<ceph::bufferlist>( + o->omap_header); +} + +seastar::future<> CyanStore::do_transaction(CollectionRef ch, + ceph::os::Transaction&& t) +{ + using ceph::os::Transaction; + int r = 0; + try { + auto i = t.begin(); + while (i.have_op()) { + r = 0; + switch (auto op = i.decode_op(); op->op) { + case Transaction::OP_NOP: + break; + case Transaction::OP_REMOVE: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + r = _remove(cid, oid); + if (r == -ENOENT) { + r = 0; + } + } + break; + case Transaction::OP_TOUCH: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + r = _touch(cid, oid); + } + break; + case Transaction::OP_WRITE: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + uint64_t off = op->off; + uint64_t len = op->len; + uint32_t fadvise_flags = i.get_fadvise_flags(); + ceph::bufferlist bl; + i.decode_bl(bl); + r = _write(cid, oid, off, len, bl, fadvise_flags); + } + break; + case Transaction::OP_ZERO: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + uint64_t off = op->off; + uint64_t len = op->len; + r = _zero(cid, oid, off, len); + } + break; + case Transaction::OP_TRUNCATE: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + uint64_t off = op->off; + r = _truncate(cid, oid, off); + } + break; + case Transaction::OP_SETATTR: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + std::string name = i.decode_string(); + ceph::bufferlist bl; + i.decode_bl(bl); + std::map<std::string, bufferptr> to_set; + to_set[name] = bufferptr(bl.c_str(), bl.length()); + r = _setattrs(cid, oid, to_set); + } + break; + case Transaction::OP_RMATTR: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + std::string name = i.decode_string(); + r = _rm_attr(cid, oid, name); + } + break; + case Transaction::OP_MKCOLL: + { + coll_t cid = i.get_cid(op->cid); + r = _create_collection(cid, op->split_bits); + } + break; + case Transaction::OP_OMAP_CLEAR: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + r = _omap_clear(cid, oid); + } + break; + case Transaction::OP_OMAP_SETKEYS: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + std::map<std::string, ceph::bufferlist> aset; + i.decode_attrset(aset); + r = _omap_set_values(cid, oid, std::move(aset)); + } + break; + case Transaction::OP_OMAP_SETHEADER: + { + const coll_t &cid = i.get_cid(op->cid); + const ghobject_t &oid = i.get_oid(op->oid); + ceph::bufferlist bl; + i.decode_bl(bl); + r = _omap_set_header(cid, oid, bl); + } + break; + case Transaction::OP_OMAP_RMKEYS: + { + const coll_t &cid = i.get_cid(op->cid); + const ghobject_t &oid = i.get_oid(op->oid); + omap_keys_t keys; + i.decode_keyset(keys); + r = _omap_rmkeys(cid, oid, keys); + } + break; + case Transaction::OP_OMAP_RMKEYRANGE: + { + const coll_t &cid = i.get_cid(op->cid); + const 
ghobject_t &oid = i.get_oid(op->oid); + string first, last; + first = i.decode_string(); + last = i.decode_string(); + r = _omap_rmkeyrange(cid, oid, first, last); + } + break; + case Transaction::OP_COLL_HINT: + { + ceph::bufferlist hint; + i.decode_bl(hint); + // ignored + break; + } + default: + logger().error("bad op {}", static_cast<unsigned>(op->op)); + abort(); + } + if (r < 0) { + break; + } + } + } catch (std::exception &e) { + logger().error("{} got exception {}", __func__, e); + r = -EINVAL; + } + if (r < 0) { + logger().error(" transaction dump:\n"); + JSONFormatter f(true); + f.open_object_section("transaction"); + t.dump(&f); + f.close_section(); + std::stringstream str; + f.flush(str); + logger().error("{}", str.str()); + ceph_assert(r == 0); + } + for (auto i : { + t.get_on_applied(), + t.get_on_commit(), + t.get_on_applied_sync()}) { + if (i) { + i->complete(0); + } + } + return seastar::now(); +} + +int CyanStore::_remove(const coll_t& cid, const ghobject_t& oid) +{ + logger().debug("{} cid={} oid={}", + __func__, cid, oid); + auto c = _get_collection(cid); + if (!c) + return -ENOENT; + + auto i = c->object_hash.find(oid); + if (i == c->object_hash.end()) + return -ENOENT; + used_bytes -= i->second->get_size(); + c->object_hash.erase(i); + c->object_map.erase(oid); + return 0; +} + +int CyanStore::_touch(const coll_t& cid, const ghobject_t& oid) +{ + logger().debug("{} cid={} oid={}", + __func__, cid, oid); + auto c = _get_collection(cid); + if (!c) + return -ENOENT; + + c->get_or_create_object(oid); + return 0; +} + +int CyanStore::_write(const coll_t& cid, const ghobject_t& oid, + uint64_t offset, size_t len, const ceph::bufferlist& bl, + uint32_t fadvise_flags) +{ + logger().debug("{} {} {} {} ~ {}", + __func__, cid, oid, offset, len); + assert(len == bl.length()); + + auto c = _get_collection(cid); + if (!c) + return -ENOENT; + + ObjectRef o = c->get_or_create_object(oid); + if (len > 0 && !local_conf()->memstore_debug_omit_block_device_write) { + const ssize_t old_size = o->get_size(); + o->write(offset, bl); + used_bytes += (o->get_size() - old_size); + } + + return 0; +} + +int CyanStore::_zero(const coll_t& cid, const ghobject_t& oid, + uint64_t offset, size_t len) +{ + logger().debug("{} {} {} {} ~ {}", + __func__, cid, oid, offset, len); + + ceph::buffer::list bl; + bl.append_zero(len); + return _write(cid, oid, offset, len, bl, 0); +} + +int CyanStore::_omap_clear( + const coll_t& cid, + const ghobject_t& oid) +{ + logger().debug("{} {} {}", __func__, cid, oid); + + auto c = _get_collection(cid); + if (!c) { + return -ENOENT; + } + ObjectRef o = c->get_object(oid); + if (!o) { + return -ENOENT; + } + o->omap.clear(); + o->omap_header.clear(); + return 0; +} + +int CyanStore::_omap_set_values( + const coll_t& cid, + const ghobject_t& oid, + std::map<std::string, ceph::bufferlist> &&aset) +{ + logger().debug( + "{} {} {} {} keys", + __func__, cid, oid, aset.size()); + + auto c = _get_collection(cid); + if (!c) + return -ENOENT; + + ObjectRef o = c->get_or_create_object(oid); + for (auto &&i: aset) { + o->omap.insert(std::move(i)); + } + return 0; +} + +int CyanStore::_omap_set_header( + const coll_t& cid, + const ghobject_t& oid, + const ceph::bufferlist &header) +{ + logger().debug( + "{} {} {} {} bytes", + __func__, cid, oid, header.length()); + + auto c = _get_collection(cid); + if (!c) + return -ENOENT; + + ObjectRef o = c->get_or_create_object(oid); + o->omap_header = header; + return 0; +} + +int CyanStore::_omap_rmkeys( + const coll_t& cid, + const 
ghobject_t& oid, + const omap_keys_t& aset) +{ + logger().debug( + "{} {} {} {} keys", + __func__, cid, oid, aset.size()); + + auto c = _get_collection(cid); + if (!c) + return -ENOENT; + + ObjectRef o = c->get_or_create_object(oid); + for (auto &i: aset) { + o->omap.erase(i); + } + return 0; +} + +int CyanStore::_omap_rmkeyrange( + const coll_t& cid, + const ghobject_t& oid, + const std::string &first, + const std::string &last) +{ + logger().debug( + "{} {} {} first={} last={}", + __func__, cid, oid, first, last); + + auto c = _get_collection(cid); + if (!c) + return -ENOENT; + + ObjectRef o = c->get_or_create_object(oid); + for (auto i = o->omap.lower_bound(first); + i != o->omap.end() && i->first <= last; + o->omap.erase(i++)); + return 0; +} + +int CyanStore::_truncate(const coll_t& cid, const ghobject_t& oid, uint64_t size) +{ + logger().debug("{} cid={} oid={} size={}", + __func__, cid, oid, size); + auto c = _get_collection(cid); + if (!c) + return -ENOENT; + + ObjectRef o = c->get_object(oid); + if (!o) + return -ENOENT; + if (local_conf()->memstore_debug_omit_block_device_write) + return 0; + const ssize_t old_size = o->get_size(); + int r = o->truncate(size); + used_bytes += (o->get_size() - old_size); + return r; +} + +int CyanStore::_setattrs(const coll_t& cid, const ghobject_t& oid, + std::map<std::string,bufferptr>& aset) +{ + logger().debug("{} cid={} oid={}", + __func__, cid, oid); + auto c = _get_collection(cid); + if (!c) + return -ENOENT; + + ObjectRef o = c->get_object(oid); + if (!o) + return -ENOENT; + for (std::map<std::string, bufferptr>::const_iterator p = aset.begin(); + p != aset.end(); ++p) + o->xattr[p->first] = p->second; + return 0; +} + +int CyanStore::_rm_attr(const coll_t& cid, const ghobject_t& oid, + std::string_view name) +{ + logger().debug("{} cid={} oid={} name={}", __func__, cid, oid, name); + auto c = _get_collection(cid); + if (!c) { + return -ENOENT; + } + ObjectRef o = c->get_object(oid); + if (!o) { + return -ENOENT; + } + auto i = o->xattr.find(name); + if (i == o->xattr.end()) { + return -ENODATA; + } + o->xattr.erase(i); + return 0; +} + +int CyanStore::_create_collection(const coll_t& cid, int bits) +{ + auto result = coll_map.try_emplace(cid); + if (!result.second) + return -EEXIST; + auto p = new_coll_map.find(cid); + assert(p != new_coll_map.end()); + result.first->second = p->second; + result.first->second->bits = bits; + new_coll_map.erase(p); + return 0; +} + +boost::intrusive_ptr<Collection> CyanStore::_get_collection(const coll_t& cid) +{ + auto cp = coll_map.find(cid); + if (cp == coll_map.end()) + return {}; + return cp->second; +} + +seastar::future<> CyanStore::write_meta(const std::string& key, + const std::string& value) +{ + std::string v = value; + v += "\n"; + if (int r = safe_write_file(path.c_str(), key.c_str(), + v.c_str(), v.length(), 0600); + r < 0) { + throw std::runtime_error{fmt::format("unable to write_meta({})", key)}; + } + return seastar::make_ready_future<>(); +} + +seastar::future<std::tuple<int, std::string>> +CyanStore::read_meta(const std::string& key) +{ + std::string fsid(4096, '\0'); + int r = safe_read_file(path.c_str(), key.c_str(), fsid.data(), fsid.size()); + if (r > 0) { + fsid.resize(r); + // drop trailing newlines + boost::algorithm::trim_right_if(fsid, + [](unsigned char c) {return isspace(c);}); + } else { + fsid.clear(); + } + return seastar::make_ready_future<std::tuple<int, std::string>>( + std::make_tuple(r, fsid)); +} + +uuid_d CyanStore::get_fsid() const +{ + return osd_fsid; +} + 
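
[annotation, not part of the diff] The _omap_rmkeyrange helper above erases every key k with first <= k <= last, i.e. both endpoints are inclusive as written (the loop condition is i->first <= last). A small self-contained illustration of that behavior on a plain std::map; rmkeyrange() is a hypothetical stand-in for the member function.

    #include <cassert>
    #include <map>
    #include <string>

    void rmkeyrange(std::map<std::string, std::string>& omap,
                    const std::string& first, const std::string& last) {
      for (auto i = omap.lower_bound(first);
           i != omap.end() && i->first <= last;
           /* advance via erase */) {
        i = omap.erase(i);                  // same effect as omap.erase(i++) above
      }
    }

    int main() {
      std::map<std::string, std::string> omap{
        {"a", "1"}, {"b", "2"}, {"c", "3"}, {"d", "4"}};
      rmkeyrange(omap, "b", "c");
      assert(omap.count("a") == 1);         // before the range: kept
      assert(omap.count("b") == 0);         // lower endpoint: erased
      assert(omap.count("c") == 0);         // upper endpoint: erased (inclusive)
      assert(omap.count("d") == 1);         // after the range: kept
    }
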
+unsigned CyanStore::get_max_attr_name_length() const +{ + // arbitrary limitation exactly like in the case of MemStore. + return 256; +} + +seastar::future<FuturizedStore::OmapIteratorRef> CyanStore::get_omap_iterator( + CollectionRef ch, + const ghobject_t& oid) +{ + auto c = static_cast<Collection*>(ch.get()); + auto o = c->get_object(oid); + if (!o) { + throw std::runtime_error(fmt::format("object does not exist: {}", oid)); + } + return seastar::make_ready_future<FuturizedStore::OmapIteratorRef>( + new CyanStore::CyanOmapIterator(o)); +} + +seastar::future<std::map<uint64_t, uint64_t>> +CyanStore::fiemap( + CollectionRef ch, + const ghobject_t& oid, + uint64_t off, + uint64_t len) +{ + auto c = static_cast<Collection*>(ch.get()); + + ObjectRef o = c->get_object(oid); + if (!o) { + throw std::runtime_error(fmt::format("object does not exist: {}", oid)); + } + std::map<uint64_t, uint64_t> m{{0, o->get_size()}}; + return seastar::make_ready_future<std::map<uint64_t, uint64_t>>(std::move(m)); +} + +seastar::future<struct stat> +CyanStore::stat( + CollectionRef ch, + const ghobject_t& oid) +{ + auto c = static_cast<Collection*>(ch.get()); + auto o = c->get_object(oid); + if (!o) { + throw std::runtime_error(fmt::format("object does not exist: {}", oid)); + } + struct stat st; + st.st_size = o->get_size(); + return seastar::make_ready_future<struct stat>(std::move(st)); +} + +seastar::future<> CyanStore::CyanOmapIterator::seek_to_first() +{ + iter = obj->omap.begin(); + return seastar::make_ready_future<>(); +} + +seastar::future<> CyanStore::CyanOmapIterator::upper_bound(const std::string& after) +{ + iter = obj->omap.upper_bound(after); + return seastar::make_ready_future<>(); +} + +seastar::future<> CyanStore::CyanOmapIterator::lower_bound(const std::string &to) +{ + iter = obj->omap.lower_bound(to); + return seastar::make_ready_future<>(); +} + +bool CyanStore::CyanOmapIterator::valid() const +{ + return iter != obj->omap.end(); +} + +seastar::future<> CyanStore::CyanOmapIterator::next() +{ + ++iter; + return seastar::make_ready_future<>(); +} + +} diff --git a/src/crimson/os/cyanstore/cyan_store.h b/src/crimson/os/cyanstore/cyan_store.h new file mode 100644 index 000000000..07a8ff29e --- /dev/null +++ b/src/crimson/os/cyanstore/cyan_store.h @@ -0,0 +1,185 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <string> +#include <unordered_map> +#include <map> +#include <typeinfo> +#include <vector> + +#include <optional> +#include <seastar/core/future.hh> +#include <seastar/core/future-util.hh> + +#include "osd/osd_types.h" +#include "include/uuid.h" + +#include "crimson/os/cyanstore/cyan_object.h" +#include "crimson/os/futurized_store.h" + +namespace ceph::os { +class Transaction; +} + +namespace crimson::os { +class Collection; + +class CyanStore final : public FuturizedStore { + constexpr static unsigned MAX_KEYS_PER_OMAP_GET_CALL = 32; + + const std::string path; + std::unordered_map<coll_t, boost::intrusive_ptr<Collection>> coll_map; + std::map<coll_t, boost::intrusive_ptr<Collection>> new_coll_map; + uint64_t used_bytes = 0; + uuid_d osd_fsid; + +public: + class CyanOmapIterator final : public OmapIterator { + public: + CyanOmapIterator() {} + CyanOmapIterator(ObjectRef obj) : obj(obj) { + iter = obj->omap.begin(); + } + seastar::future<> seek_to_first() final; + seastar::future<> upper_bound(const std::string &after) final; + seastar::future<> lower_bound(const std::string &to) final; + bool valid() 
const final; + seastar::future<> next() final; + std::string key() final { + return iter->first; + } + virtual seastar::future<std::string> tail_key(){ + return seastar::make_ready_future<std::string>((++obj->omap.end())->first); + } + virtual ceph::buffer::list value() { + return iter->second; + } + virtual int status() const { + return iter != obj->omap.end() ? 0 : -1; + } + virtual ~CyanOmapIterator() {} + private: + std::map<std::string, bufferlist>::const_iterator iter; + ObjectRef obj; + }; + + CyanStore(const std::string& path); + ~CyanStore() final; + + seastar::future<> stop() final { + return seastar::now(); + } + seastar::future<> mount() final; + seastar::future<> umount() final; + + seastar::future<> mkfs(uuid_d new_osd_fsid) final; + seastar::future<store_statfs_t> stat() const final; + seastar::future<struct stat> stat( + CollectionRef c, + const ghobject_t& oid) final; + + read_errorator::future<ceph::bufferlist> read( + CollectionRef c, + const ghobject_t& oid, + uint64_t offset, + size_t len, + uint32_t op_flags = 0) final; + read_errorator::future<ceph::bufferlist> readv( + CollectionRef c, + const ghobject_t& oid, + interval_set<uint64_t>& m, + uint32_t op_flags = 0) final; + + get_attr_errorator::future<ceph::bufferptr> get_attr( + CollectionRef c, + const ghobject_t& oid, + std::string_view name) const final; + get_attrs_ertr::future<attrs_t> get_attrs( + CollectionRef c, + const ghobject_t& oid); + + read_errorator::future<omap_values_t> omap_get_values( + CollectionRef c, + const ghobject_t& oid, + const omap_keys_t& keys) final; + + /// Retrieves paged set of values > start (if present) + read_errorator::future<std::tuple<bool, omap_values_t>> omap_get_values( + CollectionRef c, ///< [in] collection + const ghobject_t &oid, ///< [in] oid + const std::optional<std::string> &start ///< [in] start, empty for begin + ) final; ///< @return <done, values> values.empty() iff done + + seastar::future<std::tuple<std::vector<ghobject_t>, ghobject_t>> list_objects( + CollectionRef c, + const ghobject_t& start, + const ghobject_t& end, + uint64_t limit) const final; + + read_errorator::future<ceph::bufferlist> omap_get_header( + CollectionRef c, + const ghobject_t& oid) final; + + seastar::future<CollectionRef> create_new_collection(const coll_t& cid) final; + seastar::future<CollectionRef> open_collection(const coll_t& cid) final; + seastar::future<std::vector<coll_t>> list_collections() final; + + seastar::future<> do_transaction(CollectionRef ch, + ceph::os::Transaction&& txn) final; + + seastar::future<> write_meta(const std::string& key, + const std::string& value) final; + seastar::future<std::tuple<int, std::string>> + read_meta(const std::string& key) final; + uuid_d get_fsid() const final; + unsigned get_max_attr_name_length() const final; + + seastar::future<OmapIteratorRef> get_omap_iterator( + CollectionRef c, + const ghobject_t& oid); + + seastar::future<std::map<uint64_t, uint64_t>> fiemap(CollectionRef c, + const ghobject_t& oid, + uint64_t off, + uint64_t len); + +private: + int _remove(const coll_t& cid, const ghobject_t& oid); + int _touch(const coll_t& cid, const ghobject_t& oid); + int _write(const coll_t& cid, const ghobject_t& oid, + uint64_t offset, size_t len, const ceph::bufferlist& bl, + uint32_t fadvise_flags); + int _zero(const coll_t& cid, const ghobject_t& oid, + uint64_t offset, size_t len); + int _omap_clear( + const coll_t& cid, + const ghobject_t& oid); + int _omap_set_values( + const coll_t& cid, + const ghobject_t& oid, + 
std::map<std::string, ceph::bufferlist> &&aset); + int _omap_set_header( + const coll_t& cid, + const ghobject_t& oid, + const ceph::bufferlist &header); + int _omap_rmkeys( + const coll_t& cid, + const ghobject_t& oid, + const omap_keys_t& aset); + int _omap_rmkeyrange( + const coll_t& cid, + const ghobject_t& oid, + const std::string &first, + const std::string &last); + int _truncate(const coll_t& cid, const ghobject_t& oid, uint64_t size); + int _setattrs(const coll_t& cid, const ghobject_t& oid, + std::map<std::string,bufferptr>& aset); + int _rm_attr(const coll_t& cid, const ghobject_t& oid, + string_view name); + int _create_collection(const coll_t& cid, int bits); + boost::intrusive_ptr<Collection> _get_collection(const coll_t& cid); +}; + +} diff --git a/src/crimson/os/futurized_collection.h b/src/crimson/os/futurized_collection.h new file mode 100644 index 000000000..06f7d2f47 --- /dev/null +++ b/src/crimson/os/futurized_collection.h @@ -0,0 +1,37 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <boost/intrusive_ptr.hpp> +#include <boost/smart_ptr/intrusive_ref_counter.hpp> +#include <seastar/core/future.hh> + +#include "osd/osd_types.h" + +namespace crimson::os { +class FuturizedStore; + +class FuturizedCollection + : public boost::intrusive_ref_counter<FuturizedCollection, + boost::thread_unsafe_counter> +{ +public: + FuturizedCollection(const coll_t& cid) + : cid{cid} {} + virtual ~FuturizedCollection() {} + virtual seastar::future<> flush() { + return seastar::make_ready_future<>(); + } + virtual seastar::future<bool> flush_commit() { + return seastar::make_ready_future<bool>(true); + } + const coll_t& get_cid() const { + return cid; + } +private: + const coll_t cid; +}; + +using CollectionRef = boost::intrusive_ptr<FuturizedCollection>; +} diff --git a/src/crimson/os/futurized_store.cc b/src/crimson/os/futurized_store.cc new file mode 100644 index 000000000..bb73c3478 --- /dev/null +++ b/src/crimson/os/futurized_store.cc @@ -0,0 +1,22 @@ +#include "futurized_store.h" +#include "cyanstore/cyan_store.h" +#include "alienstore/alien_store.h" + +namespace crimson::os { + +std::unique_ptr<FuturizedStore> +FuturizedStore::create(const std::string& type, + const std::string& data, + const ConfigValues& values) +{ + if (type == "memstore") { + return std::make_unique<crimson::os::CyanStore>(data); + } else if (type == "bluestore") { + return std::make_unique<crimson::os::AlienStore>(data, values); + } else { + ceph_abort_msgf("unsupported objectstore type: %s", type.c_str()); + return {}; + } +} + +} diff --git a/src/crimson/os/futurized_store.h b/src/crimson/os/futurized_store.h new file mode 100644 index 000000000..bb173056b --- /dev/null +++ b/src/crimson/os/futurized_store.h @@ -0,0 +1,167 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <string> +#include <map> +#include <optional> +#include <vector> + +#include <seastar/core/future.hh> + +#include "crimson/osd/exceptions.h" +#include "include/buffer_fwd.h" +#include "include/uuid.h" +#include "osd/osd_types.h" + +namespace ceph::os { +class Transaction; +} + +namespace crimson::os { +class FuturizedCollection; + +class FuturizedStore { + +public: + class OmapIterator { + public: + virtual seastar::future<> seek_to_first() = 0; + virtual seastar::future<> upper_bound(const std::string &after) = 0; + virtual seastar::future<> lower_bound(const std::string &to) = 0; + 
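    // Illustrative consumption pattern (sketch only; seastar::do_until is
    // the generic Seastar loop primitive, nothing crimson-specific):
    //
    //   OmapIteratorRef it = ...;
    //   return it->seek_to_first().then([it] {
    //     return seastar::do_until(
    //       [it] { return !it->valid(); },
    //       [it] {
    //         // consume it->key() / it->value() here
    //         return it->next();
    //       });
    //   });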
virtual bool valid() const { + return false; + } + virtual seastar::future<> next() = 0; + virtual std::string key() { + return {}; + } + virtual seastar::future<std::string> tail_key() { + return seastar::make_ready_future<std::string>(); + } + virtual ceph::buffer::list value() { + return {}; + } + virtual int status() const { + return 0; + } + virtual ~OmapIterator() {} + private: + unsigned count = 0; + friend void intrusive_ptr_add_ref(FuturizedStore::OmapIterator* iter); + friend void intrusive_ptr_release(FuturizedStore::OmapIterator* iter); + }; + using OmapIteratorRef = boost::intrusive_ptr<OmapIterator>; + + static std::unique_ptr<FuturizedStore> create(const std::string& type, + const std::string& data, + const ConfigValues& values); + FuturizedStore() = default; + virtual ~FuturizedStore() = default; + + // no copying + explicit FuturizedStore(const FuturizedStore& o) = delete; + const FuturizedStore& operator=(const FuturizedStore& o) = delete; + + virtual seastar::future<> start() { + return seastar::now(); + } + virtual seastar::future<> stop() = 0; + virtual seastar::future<> mount() = 0; + virtual seastar::future<> umount() = 0; + + virtual seastar::future<> mkfs(uuid_d new_osd_fsid) = 0; + virtual seastar::future<store_statfs_t> stat() const = 0; + + using CollectionRef = boost::intrusive_ptr<FuturizedCollection>; + using read_errorator = crimson::errorator<crimson::ct_error::enoent, + crimson::ct_error::input_output_error>; + virtual read_errorator::future<ceph::bufferlist> read( + CollectionRef c, + const ghobject_t& oid, + uint64_t offset, + size_t len, + uint32_t op_flags = 0) = 0; + virtual read_errorator::future<ceph::bufferlist> readv( + CollectionRef c, + const ghobject_t& oid, + interval_set<uint64_t>& m, + uint32_t op_flags = 0) = 0; + + using get_attr_errorator = crimson::errorator< + crimson::ct_error::enoent, + crimson::ct_error::enodata>; + virtual get_attr_errorator::future<ceph::bufferptr> get_attr( + CollectionRef c, + const ghobject_t& oid, + std::string_view name) const = 0; + + using get_attrs_ertr = crimson::errorator< + crimson::ct_error::enoent>; + using attrs_t = std::map<std::string, ceph::bufferptr, std::less<>>; + virtual get_attrs_ertr::future<attrs_t> get_attrs( + CollectionRef c, + const ghobject_t& oid) = 0; + virtual seastar::future<struct stat> stat( + CollectionRef c, + const ghobject_t& oid) = 0; + + using omap_values_t = std::map<std::string, bufferlist, std::less<>>; + using omap_keys_t = std::set<std::string>; + virtual read_errorator::future<omap_values_t> omap_get_values( + CollectionRef c, + const ghobject_t& oid, + const omap_keys_t& keys) = 0; + virtual seastar::future<std::tuple<std::vector<ghobject_t>, ghobject_t>> list_objects( + CollectionRef c, + const ghobject_t& start, + const ghobject_t& end, + uint64_t limit) const = 0; + virtual read_errorator::future<std::tuple<bool, omap_values_t>> omap_get_values( + CollectionRef c, ///< [in] collection + const ghobject_t &oid, ///< [in] oid + const std::optional<std::string> &start ///< [in] start, empty for begin + ) = 0; ///< @return <done, values> values.empty() iff done + + virtual read_errorator::future<bufferlist> omap_get_header( + CollectionRef c, + const ghobject_t& oid) = 0; + + virtual seastar::future<CollectionRef> create_new_collection(const coll_t& cid) = 0; + virtual seastar::future<CollectionRef> open_collection(const coll_t& cid) = 0; + virtual seastar::future<std::vector<coll_t>> list_collections() = 0; + + virtual seastar::future<> do_transaction(CollectionRef ch, 
+ ceph::os::Transaction&& txn) = 0; + virtual seastar::future<OmapIteratorRef> get_omap_iterator( + CollectionRef ch, + const ghobject_t& oid) = 0; + virtual seastar::future<std::map<uint64_t, uint64_t>> fiemap( + CollectionRef ch, + const ghobject_t& oid, + uint64_t off, + uint64_t len) = 0; + + virtual seastar::future<> write_meta(const std::string& key, + const std::string& value) = 0; + virtual seastar::future<std::tuple<int, std::string>> read_meta( + const std::string& key) = 0; + virtual uuid_d get_fsid() const = 0; + virtual unsigned get_max_attr_name_length() const = 0; +}; + +inline void intrusive_ptr_add_ref(FuturizedStore::OmapIterator* iter) { + assert(iter); + iter->count++; +} + +inline void intrusive_ptr_release(FuturizedStore::OmapIterator* iter) { + assert(iter); + assert(iter->count > 0); + if ((--iter->count) == 0) { + delete iter; + } +} + +} diff --git a/src/crimson/os/seastore/CMakeLists.txt b/src/crimson/os/seastore/CMakeLists.txt new file mode 100644 index 000000000..77f8465cf --- /dev/null +++ b/src/crimson/os/seastore/CMakeLists.txt @@ -0,0 +1,37 @@ +add_library(crimson-seastore STATIC + cached_extent.cc + seastore_types.cc + segment_manager/ephemeral.cc + segment_manager/block.cc + transaction_manager.cc + journal.cc + cache.cc + lba_manager.cc + segment_cleaner.cc + lba_manager/btree/btree_lba_manager.cc + lba_manager/btree/lba_btree_node_impl.cc + lba_manager/btree/btree_range_pin.cc + onode.cc + onode_manager/simple-fltree/onode_block.cc + onode_manager/simple-fltree/onode_delta.cc + onode_manager/simple-fltree/onode_node.cc + onode_manager/staged-fltree/node.cc + onode_manager/staged-fltree/node_extent_manager.cc + onode_manager/staged-fltree/node_extent_manager/seastore.cc + onode_manager/staged-fltree/node_extent_mutable.cc + onode_manager/staged-fltree/node_impl.cc + onode_manager/staged-fltree/stages/item_iterator_stage.cc + onode_manager/staged-fltree/stages/key_layout.cc + onode_manager/staged-fltree/stages/node_stage_layout.cc + onode_manager/staged-fltree/stages/node_stage.cc + onode_manager/staged-fltree/stages/sub_items_stage.cc + onode_manager/staged-fltree/super.cc + onode_manager/staged-fltree/tree.cc + extentmap_manager.cc + extentmap_manager/btree/extentmap_btree_node_impl.cc + extentmap_manager/btree/btree_extentmap_manager.cc + seastore.cc + ../../../test/crimson/seastore/test_block.cc + ) +target_link_libraries(crimson-seastore + crimson) diff --git a/src/crimson/os/seastore/cache.cc b/src/crimson/os/seastore/cache.cc new file mode 100644 index 000000000..6a406c1b8 --- /dev/null +++ b/src/crimson/os/seastore/cache.cc @@ -0,0 +1,541 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "crimson/os/seastore/cache.h" +#include "crimson/common/log.h" + +// included for get_extent_by_type +#include "crimson/os/seastore/extentmap_manager/btree/extentmap_btree_node_impl.h" +#include "crimson/os/seastore/lba_manager/btree/lba_btree_node_impl.h" +#include "crimson/os/seastore/onode_manager/simple-fltree/onode_block.h" +#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/seastore.h" +#include "test/crimson/seastore/test_block.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_filestore); + } +} + +namespace crimson::os::seastore { + +Cache::Cache(SegmentManager &segment_manager) : + segment_manager(segment_manager) {} + +Cache::~Cache() +{ + for (auto &i: extents) { + logger().error("~Cache: extent {} still alive", i); + } + 
ceph_assert(extents.empty()); +} + +Cache::retire_extent_ret Cache::retire_extent_if_cached( + Transaction &t, paddr_t addr) +{ + if (auto ext = t.write_set.find_offset(addr); ext != t.write_set.end()) { + logger().debug("{}: found {} in t.write_set", __func__, addr); + t.add_to_retired_set(CachedExtentRef(&*ext)); + return retire_extent_ertr::now(); + } else if (auto iter = extents.find_offset(addr); + iter != extents.end()) { + auto ret = CachedExtentRef(&*iter); + return ret->wait_io().then([&t, ret=std::move(ret)]() mutable { + t.add_to_retired_set(ret); + return retire_extent_ertr::now(); + }); + } else { + return retire_extent_ertr::now(); + } +} + +void Cache::add_extent(CachedExtentRef ref) +{ + assert(ref->is_valid()); + extents.insert(*ref); + + if (ref->is_dirty()) { + add_to_dirty(ref); + } else { + ceph_assert(!ref->primary_ref_list_hook.is_linked()); + } + logger().debug("add_extent: {}", *ref); +} + +void Cache::mark_dirty(CachedExtentRef ref) +{ + if (ref->is_dirty()) { + assert(ref->primary_ref_list_hook.is_linked()); + return; + } + + add_to_dirty(ref); + ref->state = CachedExtent::extent_state_t::DIRTY; + + logger().debug("mark_dirty: {}", *ref); +} + +void Cache::add_to_dirty(CachedExtentRef ref) +{ + assert(ref->is_valid()); + assert(!ref->primary_ref_list_hook.is_linked()); + intrusive_ptr_add_ref(&*ref); + dirty.push_back(*ref); +} + +void Cache::remove_extent(CachedExtentRef ref) +{ + logger().debug("remove_extent: {}", *ref); + assert(ref->is_valid()); + extents.erase(*ref); + + if (ref->is_dirty()) { + ceph_assert(ref->primary_ref_list_hook.is_linked()); + dirty.erase(dirty.s_iterator_to(*ref)); + intrusive_ptr_release(&*ref); + } else { + ceph_assert(!ref->primary_ref_list_hook.is_linked()); + } +} + +void Cache::replace_extent(CachedExtentRef next, CachedExtentRef prev) +{ + assert(next->get_paddr() == prev->get_paddr()); + assert(next->version == prev->version + 1); + extents.replace(*next, *prev); + + if (prev->is_dirty()) { + ceph_assert(prev->primary_ref_list_hook.is_linked()); + auto prev_it = dirty.iterator_to(*prev); + dirty.insert(prev_it, *next); + dirty.erase(prev_it); + intrusive_ptr_release(&*prev); + intrusive_ptr_add_ref(&*next); + } else { + add_to_dirty(next); + } +} + +CachedExtentRef Cache::alloc_new_extent_by_type( + Transaction &t, ///< [in, out] current transaction + extent_types_t type, ///< [in] type tag + segment_off_t length ///< [in] length +) +{ + switch (type) { + case extent_types_t::ROOT: + assert(0 == "ROOT is never directly alloc'd"); + return CachedExtentRef(); + case extent_types_t::LADDR_INTERNAL: + return alloc_new_extent<lba_manager::btree::LBAInternalNode>(t, length); + case extent_types_t::LADDR_LEAF: + return alloc_new_extent<lba_manager::btree::LBALeafNode>(t, length); + case extent_types_t::ONODE_BLOCK: + return alloc_new_extent<OnodeBlock>(t, length); + case extent_types_t::EXTMAP_INNER: + return alloc_new_extent<extentmap_manager::ExtMapInnerNode>(t, length); + case extent_types_t::EXTMAP_LEAF: + return alloc_new_extent<extentmap_manager::ExtMapLeafNode>(t, length); + case extent_types_t::TEST_BLOCK: + return alloc_new_extent<TestBlock>(t, length); + case extent_types_t::TEST_BLOCK_PHYSICAL: + return alloc_new_extent<TestBlockPhysical>(t, length); + case extent_types_t::NONE: { + ceph_assert(0 == "NONE is an invalid extent type"); + return CachedExtentRef(); + } + default: + ceph_assert(0 == "impossible"); + return CachedExtentRef(); + } +} + +CachedExtentRef Cache::duplicate_for_write( + Transaction &t, + 
CachedExtentRef i) { + if (i->is_pending()) + return i; + + auto ret = i->duplicate_for_write(); + if (ret->get_type() == extent_types_t::ROOT) { + // root must be loaded before mutate + assert(t.root == i); + t.root = ret->cast<RootBlock>(); + } else { + ret->last_committed_crc = i->last_committed_crc; + ret->prior_instance = i; + t.add_mutated_extent(ret); + } + + ret->version++; + ret->state = CachedExtent::extent_state_t::MUTATION_PENDING; + logger().debug("Cache::duplicate_for_write: {} -> {}", *i, *ret); + return ret; +} + +std::optional<record_t> Cache::try_construct_record(Transaction &t) +{ + // First, validate read set + for (auto &i: t.read_set) { + if (i->state == CachedExtent::extent_state_t::INVALID) + return std::nullopt; + } + + record_t record; + + t.write_set.clear(); + + // Add new copy of mutated blocks, set_io_wait to block until written + record.deltas.reserve(t.mutated_block_list.size()); + for (auto &i: t.mutated_block_list) { + if (!i->is_valid()) { + logger().debug("try_construct_record: ignoring invalid {}", *i); + continue; + } + logger().debug("try_construct_record: mutating {}", *i); + + assert(i->prior_instance); + replace_extent(i, i->prior_instance); + + i->prepare_write(); + i->set_io_wait(); + + assert(i->get_version() > 0); + auto final_crc = i->get_crc32c(); + record.deltas.push_back( + delta_info_t{ + i->get_type(), + i->get_paddr(), + (i->is_logical() + ? i->cast<LogicalCachedExtent>()->get_laddr() + : L_ADDR_NULL), + i->last_committed_crc, + final_crc, + (segment_off_t)i->get_length(), + i->get_version() - 1, + i->get_delta() + }); + i->last_committed_crc = final_crc; + } + + if (t.root) { + logger().debug( + "{}: writing out root delta for {}", + __func__, + *t.root); + record.deltas.push_back( + delta_info_t{ + extent_types_t::ROOT, + paddr_t{}, + L_ADDR_NULL, + 0, + 0, + 0, + t.root->get_version() - 1, + t.root->get_delta() + }); + } + + // Transaction is now a go, set up in-memory cache state + // invalidate now invalid blocks + for (auto &i: t.retired_set) { + logger().debug("try_construct_record: retiring {}", *i); + ceph_assert(i->is_valid()); + remove_extent(i); + i->state = CachedExtent::extent_state_t::INVALID; + } + + record.extents.reserve(t.fresh_block_list.size()); + for (auto &i: t.fresh_block_list) { + logger().debug("try_construct_record: fresh block {}", *i); + bufferlist bl; + i->prepare_write(); + bl.append(i->get_bptr()); + if (i->get_type() == extent_types_t::ROOT) { + assert(0 == "ROOT never gets written as a fresh block"); + } + + assert(bl.length() == i->get_length()); + record.extents.push_back(extent_t{ + i->get_type(), + i->is_logical() + ? 
i->cast<LogicalCachedExtent>()->get_laddr() + : L_ADDR_NULL, + std::move(bl) + }); + } + + return std::make_optional<record_t>(std::move(record)); +} + +void Cache::complete_commit( + Transaction &t, + paddr_t final_block_start, + journal_seq_t seq, + SegmentCleaner *cleaner) +{ + if (t.root) { + remove_extent(root); + root = t.root; + root->state = CachedExtent::extent_state_t::DIRTY; + root->on_delta_write(final_block_start); + root->dirty_from = seq; + add_extent(root); + logger().debug("complete_commit: new root {}", *t.root); + } + + for (auto &i: t.fresh_block_list) { + i->set_paddr(final_block_start.add_relative(i->get_paddr())); + i->last_committed_crc = i->get_crc32c(); + i->on_initial_write(); + + if (!i->is_valid()) { + logger().debug("complete_commit: invalid {}", *i); + continue; + } + + i->state = CachedExtent::extent_state_t::CLEAN; + logger().debug("complete_commit: fresh {}", *i); + add_extent(i); + if (cleaner) { + cleaner->mark_space_used( + i->get_paddr(), + i->get_length()); + } + } + + // Add new copy of mutated blocks, set_io_wait to block until written + for (auto &i: t.mutated_block_list) { + logger().debug("complete_commit: mutated {}", *i); + assert(i->prior_instance); + i->on_delta_write(final_block_start); + i->prior_instance = CachedExtentRef(); + if (!i->is_valid()) { + logger().debug("complete_commit: not dirtying invalid {}", *i); + continue; + } + i->state = CachedExtent::extent_state_t::DIRTY; + if (i->version == 1) { + i->dirty_from = seq; + } + } + + if (cleaner) { + for (auto &i: t.retired_set) { + cleaner->mark_space_free( + i->get_paddr(), + i->get_length()); + } + } + + for (auto &i: t.mutated_block_list) { + i->complete_io(); + } +} + +void Cache::init() { + if (root) { + // initial creation will do mkfs followed by mount each of which calls init + remove_extent(root); + root = nullptr; + } + root = new RootBlock(); + root->state = CachedExtent::extent_state_t::DIRTY; + add_extent(root); +} + +Cache::mkfs_ertr::future<> Cache::mkfs(Transaction &t) +{ + return get_root(t).safe_then([this, &t](auto croot) { + duplicate_for_write(t, croot); + return mkfs_ertr::now(); + }); +} + +Cache::close_ertr::future<> Cache::close() +{ + root.reset(); + for (auto i = dirty.begin(); i != dirty.end(); ) { + auto ptr = &*i; + dirty.erase(i++); + intrusive_ptr_release(ptr); + } + return close_ertr::now(); +} + +Cache::replay_delta_ret +Cache::replay_delta( + journal_seq_t journal_seq, + paddr_t record_base, + const delta_info_t &delta) +{ + if (delta.type == extent_types_t::ROOT) { + logger().debug("replay_delta: found root delta"); + root->apply_delta_and_adjust_crc(record_base, delta.bl); + root->dirty_from = journal_seq; + return replay_delta_ertr::now(); + } else { + auto get_extent_if_cached = [this](paddr_t addr) + -> replay_delta_ertr::future<CachedExtentRef> { + auto retiter = extents.find_offset(addr); + if (retiter != extents.end()) { + return replay_delta_ertr::make_ready_future<CachedExtentRef>(&*retiter); + } else { + return replay_delta_ertr::make_ready_future<CachedExtentRef>(); + } + }; + auto extent_fut = delta.pversion == 0 ? 
+ get_extent_by_type( + delta.type, + delta.paddr, + delta.laddr, + delta.length) : + get_extent_if_cached( + delta.paddr); + return extent_fut.safe_then([=, &delta](auto extent) { + if (!extent) { + assert(delta.pversion > 0); + logger().debug( + "replay_delta: replaying {}, extent not present so delta is obsolete", + delta); + return; + } + + logger().debug( + "replay_delta: replaying {} on {}", + *extent, + delta); + + assert(extent->version == delta.pversion); + + assert(extent->last_committed_crc == delta.prev_crc); + extent->apply_delta_and_adjust_crc(record_base, delta.bl); + assert(extent->last_committed_crc == delta.final_crc); + + if (extent->version == 0) { + extent->dirty_from = journal_seq; + } + extent->version++; + mark_dirty(extent); + }); + } +} + +Cache::get_next_dirty_extents_ret Cache::get_next_dirty_extents( + journal_seq_t seq) +{ + std::vector<CachedExtentRef> ret; + for (auto i = dirty.begin(); i != dirty.end(); ++i) { + CachedExtentRef cand; + if (i->dirty_from < seq) { + assert(ret.empty() || ret.back()->dirty_from <= i->dirty_from); + ret.push_back(&*i); + } else { + break; + } + } + return seastar::do_with( + std::move(ret), + [](auto &ret) { + return seastar::do_for_each( + ret, + [](auto &ext) { + logger().debug( + "get_next_dirty_extents: waiting on {}", + *ext); + return ext->wait_io(); + }).then([&ret]() mutable { + return seastar::make_ready_future<std::vector<CachedExtentRef>>( + std::move(ret)); + }); + }); +} + +Cache::get_root_ret Cache::get_root(Transaction &t) +{ + if (t.root) { + return get_root_ret( + get_root_ertr::ready_future_marker{}, + t.root); + } else { + auto ret = root; + return ret->wait_io().then([ret, &t] { + t.root = ret; + return get_root_ret( + get_root_ertr::ready_future_marker{}, + ret); + }); + } +} + +using StagedOnodeBlock = crimson::os::seastore::onode::SeastoreNodeExtent; + +Cache::get_extent_ertr::future<CachedExtentRef> Cache::get_extent_by_type( + extent_types_t type, + paddr_t offset, + laddr_t laddr, + segment_off_t length) +{ + return [=] { + switch (type) { + case extent_types_t::ROOT: + assert(0 == "ROOT is never directly read"); + return get_extent_ertr::make_ready_future<CachedExtentRef>(); + case extent_types_t::LADDR_INTERNAL: + return get_extent<lba_manager::btree::LBAInternalNode>(offset, length + ).safe_then([](auto extent) { + return CachedExtentRef(extent.detach(), false /* add_ref */); + }); + case extent_types_t::LADDR_LEAF: + return get_extent<lba_manager::btree::LBALeafNode>(offset, length + ).safe_then([](auto extent) { + return CachedExtentRef(extent.detach(), false /* add_ref */); + }); + case extent_types_t::EXTMAP_INNER: + return get_extent<extentmap_manager::ExtMapInnerNode>(offset, length + ).safe_then([](auto extent) { + return CachedExtentRef(extent.detach(), false /* add_ref */); + }); + case extent_types_t::EXTMAP_LEAF: + return get_extent<extentmap_manager::ExtMapLeafNode>(offset, length + ).safe_then([](auto extent) { + return CachedExtentRef(extent.detach(), false /* add_ref */); + }); + case extent_types_t::ONODE_BLOCK: + return get_extent<OnodeBlock>(offset, length + ).safe_then([](auto extent) { + return CachedExtentRef(extent.detach(), false /* add_ref */); + }); + case extent_types_t::ONODE_BLOCK_STAGED: + return get_extent<StagedOnodeBlock>(offset, length + ).safe_then([](auto extent) { + return CachedExtentRef(extent.detach(), false /* add_ref */); + }); + case extent_types_t::TEST_BLOCK: + return get_extent<TestBlock>(offset, length + ).safe_then([](auto extent) { + return 
CachedExtentRef(extent.detach(), false /* add_ref */); + }); + case extent_types_t::TEST_BLOCK_PHYSICAL: + return get_extent<TestBlockPhysical>(offset, length + ).safe_then([](auto extent) { + return CachedExtentRef(extent.detach(), false /* add_ref */); + }); + case extent_types_t::NONE: { + ceph_assert(0 == "NONE is an invalid extent type"); + return get_extent_ertr::make_ready_future<CachedExtentRef>(); + } + default: + ceph_assert(0 == "impossible"); + return get_extent_ertr::make_ready_future<CachedExtentRef>(); + } + }().safe_then([laddr](CachedExtentRef e) { + assert(e->is_logical() == (laddr != L_ADDR_NULL)); + if (e->is_logical()) { + e->cast<LogicalCachedExtent>()->set_laddr(laddr); + } + return get_extent_ertr::make_ready_future<CachedExtentRef>(e); + }); +} + +} diff --git a/src/crimson/os/seastore/cache.h b/src/crimson/os/seastore/cache.h new file mode 100644 index 000000000..624272162 --- /dev/null +++ b/src/crimson/os/seastore/cache.h @@ -0,0 +1,516 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <iostream> + +#include "seastar/core/shared_future.hh" + +#include "include/buffer.h" +#include "crimson/os/seastore/seastore_types.h" +#include "crimson/os/seastore/transaction.h" +#include "crimson/os/seastore/segment_manager.h" +#include "crimson/common/errorator.h" +#include "crimson/os/seastore/cached_extent.h" +#include "crimson/os/seastore/root_block.h" +#include "crimson/os/seastore/segment_cleaner.h" + +namespace crimson::os::seastore { + +/** + * Cache + * + * This component is responsible for buffer management, including + * transaction lifecycle. + * + * Seastore transactions are expressed as an atomic combination of + * 1) newly written blocks + * 2) logical mutations to existing physical blocks + * + * See record_t + * + * As such, any transaction has 3 components: + * 1) read_set: references to extents read during the transaction + * See Transaction::read_set + * 2) write_set: references to extents to be written as: + * a) new physical blocks, see Transaction::fresh_block_list + * b) mutations to existing physical blocks, + * see Transaction::mutated_block_list + * 3) retired_set: extent refs to be retired either due to 2b or + * due to releasing the extent generally. + + * In the case of 2b, the CachedExtent will have been copied into + * a fresh CachedExtentRef such that the source extent ref is present + * in the read set and the newly allocated extent is present in the + * write_set. + * + * A transaction has 3 phases: + * 1) construction: user calls Cache::get_transaction() and populates + * the returned transaction by calling Cache methods + * 2) submission: user calls Cache::try_start_transaction(). If + * succcessful, the user may construct a record and submit the + * transaction to the journal. + * 3) completion: once the transaction is durable, the user must call + * Cache::complete_transaction() with the block offset to complete + * the transaction. + * + * Internally, in phase 1, the fields in Transaction are filled in. + * - reads may block if the referenced extent is being written + * - once a read obtains a particular CachedExtentRef for a paddr_t, + * it'll always get the same one until overwritten + * - once a paddr_t is overwritten or written, subsequent reads of + * that addr will get the new ref + * + * In phase 2, if all extents in the read set are valid (not expired), + * we can commit (otherwise, we fail and the user must retry). 
+ * - Expire all extents in the retired_set (they must all be valid) + * - Remove all extents in the retired_set from Cache::extents + * - Mark all extents in the write_set wait_io(), add promises to + * transaction + * - Merge Transaction::write_set into Cache::extents + * + * After phase 2, the user will submit the record to the journal. + * Once complete, we perform phase 3: + * - For each CachedExtent in block_list, call + * CachedExtent::complete_initial_write(paddr_t) with the block's + * final offset (inferred from the extent's position in the block_list + * and extent lengths). + * - For each block in mutation_list, call + * CachedExtent::delta_written(paddr_t) with the address of the start + * of the record + * - Complete all promises with the final record start paddr_t + */ +class Cache { +public: + Cache(SegmentManager &segment_manager); + ~Cache(); + + /** + * drop_from_cache + * + * Drop extent from cache. Intended for use when + * ref refers to a logically dead extent as during + * replay. + */ + void drop_from_cache(CachedExtentRef ref) { + remove_extent(ref); + } + + /// Declare ref retired in t + void retire_extent(Transaction &t, CachedExtentRef ref) { + t.add_to_retired_set(ref); + } + + /// Declare paddr retired in t, noop if not cached + using retire_extent_ertr = crimson::errorator< + crimson::ct_error::input_output_error>; + using retire_extent_ret = retire_extent_ertr::future<>; + retire_extent_ret retire_extent_if_cached( + Transaction &t, paddr_t addr); + + /** + * get_root + * + * returns ref to current root or t.root if modified in t + */ + using get_root_ertr = crimson::errorator< + crimson::ct_error::input_output_error>; + using get_root_ret = get_root_ertr::future<RootBlockRef>; + get_root_ret get_root(Transaction &t); + + /** + * get_root_fast + * + * returns t.root and assume it is already present/read in t + */ + RootBlockRef get_root_fast(Transaction &t) { + assert(t.root); + return t.root; + } + + /** + * get_extent + * + * returns ref to extent at offset~length of type T either from + * - extent_set if already in cache + * - disk + */ + using get_extent_ertr = crimson::errorator< + crimson::ct_error::input_output_error>; + template <typename T> + get_extent_ertr::future<TCachedExtentRef<T>> get_extent( + paddr_t offset, ///< [in] starting addr + segment_off_t length ///< [in] length + ) { + if (auto iter = extents.find_offset(offset); + iter != extents.end()) { + auto ret = TCachedExtentRef<T>(static_cast<T*>(&*iter)); + return ret->wait_io().then([ret=std::move(ret)]() mutable { + return get_extent_ertr::make_ready_future<TCachedExtentRef<T>>( + std::move(ret)); + }); + } else { + auto ref = CachedExtent::make_cached_extent_ref<T>( + alloc_cache_buf(length)); + ref->set_io_wait(); + ref->set_paddr(offset); + ref->state = CachedExtent::extent_state_t::CLEAN; + + return segment_manager.read( + offset, + length, + ref->get_bptr()).safe_then( + [this, ref=std::move(ref)]() mutable { + /* TODO: crc should be checked against LBA manager */ + ref->last_committed_crc = ref->get_crc32c(); + + ref->on_clean_read(); + ref->complete_io(); + add_extent(ref); + return get_extent_ertr::make_ready_future<TCachedExtentRef<T>>( + std::move(ref)); + }, + get_extent_ertr::pass_further{}, + crimson::ct_error::discard_all{}); + } + } + + /** + * get_extent_if_cached + * + * Returns extent at offset if in cache + */ + Transaction::get_extent_ret get_extent_if_cached( + Transaction &t, + paddr_t offset, + CachedExtentRef *out) { + auto result = t.get_extent(offset, out); + if 
(result != Transaction::get_extent_ret::ABSENT) { + return result; + } else if (auto iter = extents.find_offset(offset); + iter != extents.end()) { + if (out) + *out = &*iter; + return Transaction::get_extent_ret::PRESENT; + } else { + return Transaction::get_extent_ret::ABSENT; + } + } + + /** + * get_extent + * + * returns ref to extent at offset~length of type T either from + * - t if modified by t + * - extent_set if already in cache + * - disk + * + * t *must not* have retired offset + */ + template <typename T> + get_extent_ertr::future<TCachedExtentRef<T>> get_extent( + Transaction &t, ///< [in,out] current transaction + paddr_t offset, ///< [in] starting addr + segment_off_t length ///< [in] length + ) { + CachedExtentRef ret; + auto result = t.get_extent(offset, &ret); + if (result != Transaction::get_extent_ret::ABSENT) { + assert(result != Transaction::get_extent_ret::RETIRED); + return get_extent_ertr::make_ready_future<TCachedExtentRef<T>>( + ret->cast<T>()); + } else { + return get_extent<T>(offset, length).safe_then( + [&t](auto ref) mutable { + t.add_to_read_set(ref); + return get_extent_ertr::make_ready_future<TCachedExtentRef<T>>( + std::move(ref)); + }); + } + } + + /** + * get_extent_by_type + * + * Based on type, instantiate the correct concrete type + * and read in the extent at location offset~length. + */ + get_extent_ertr::future<CachedExtentRef> get_extent_by_type( + extent_types_t type, ///< [in] type tag + paddr_t offset, ///< [in] starting addr + laddr_t laddr, ///< [in] logical address if logical + segment_off_t length ///< [in] length + ); + + get_extent_ertr::future<CachedExtentRef> get_extent_by_type( + Transaction &t, + extent_types_t type, + paddr_t offset, + laddr_t laddr, + segment_off_t length) { + CachedExtentRef ret; + auto status = get_extent_if_cached(t, offset, &ret); + if (status == Transaction::get_extent_ret::RETIRED) { + return get_extent_ertr::make_ready_future<CachedExtentRef>(); + } else if (status == Transaction::get_extent_ret::PRESENT) { + return get_extent_ertr::make_ready_future<CachedExtentRef>(ret); + } else { + return get_extent_by_type(type, offset, laddr, length + ).safe_then([=, &t](CachedExtentRef ret) { + t.add_to_read_set(ret); + return get_extent_ertr::make_ready_future<CachedExtentRef>( + std::move(ret)); + }); + } + } + + /** + * get_extents + * + * returns refs to extents in extents from: + * - t if modified by t + * - extent_set if already in cache + * - disk + */ + template<typename T> + get_extent_ertr::future<t_pextent_list_t<T>> get_extents( + Transaction &t, ///< [in, out] current transaction + paddr_list_t &&extents ///< [in] extent list for lookup + ) { + auto retref = std::make_unique<t_pextent_list_t<T>>(); + auto &ret = *retref; + auto ext = std::make_unique<paddr_list_t>(std::move(extents)); + return crimson::do_for_each( + ext->begin(), + ext->end(), + [this, &t, &ret](auto &p) { + auto &[offset, len] = p; + return get_extent(t, offset, len).safe_then([&ret](auto cext) { + ret.push_back(std::move(cext)); + }); + }).safe_then([retref=std::move(retref), ext=std::move(ext)]() mutable { + return get_extent_ertr::make_ready_future<t_pextent_list_t<T>>( + std::move(*retref)); + }); + } + + /** + * alloc_new_extent + * + * Allocates a fresh extent. addr will be relative until commit. 
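   *
   * Usage sketch (TestBlock stands in for any concrete CachedExtent type;
   * the literal 4096 is just an example length):
   *
   *   auto extent = cache.alloc_new_extent<TestBlock>(t, 4096);
   *   // fill extent->get_bptr(), then build the record with
   *   // try_construct_record(t) and submit it to the journal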
+ */ + template <typename T> + TCachedExtentRef<T> alloc_new_extent( + Transaction &t, ///< [in, out] current transaction + segment_off_t length ///< [in] length + ) { + auto ret = CachedExtent::make_cached_extent_ref<T>( + alloc_cache_buf(length)); + t.add_fresh_extent(ret); + ret->state = CachedExtent::extent_state_t::INITIAL_WRITE_PENDING; + return ret; + } + + /** + * alloc_new_extent + * + * Allocates a fresh extent. addr will be relative until commit. + */ + CachedExtentRef alloc_new_extent_by_type( + Transaction &t, ///< [in, out] current transaction + extent_types_t type, ///< [in] type tag + segment_off_t length ///< [in] length + ); + + /** + * Allocates mutable buffer from extent_set on offset~len + * + * TODO: Note, currently all implementations literally copy the + * buffer. This needn't be true, CachedExtent implementations could + * choose to refer to the same buffer unmodified until commit and just + * buffer the mutations in an ancillary data structure. + * + * @param current transaction + * @param extent to duplicate + * @return mutable extent + */ + CachedExtentRef duplicate_for_write( + Transaction &t, ///< [in, out] current transaction + CachedExtentRef i ///< [in] ref to existing extent + ); + + /** + * try_construct_record + * + * First checks for conflicts. If a racing write has mutated/retired + * an extent mutated by this transaction, nullopt will be returned. + * + * Otherwise, a record will be returned valid for use with Journal. + */ + std::optional<record_t> try_construct_record( + Transaction &t ///< [in, out] current transaction + ); + + /** + * complete_commit + * + * Must be called upon completion of write. Releases blocks on mutating + * extents, fills in addresses, and calls relevant callbacks on fresh + * and mutated exents. + */ + void complete_commit( + Transaction &t, ///< [in, out] current transaction + paddr_t final_block_start, ///< [in] offset of initial block + journal_seq_t seq, ///< [in] journal commit seq + SegmentCleaner *cleaner=nullptr ///< [out] optional segment stat listener + ); + + /** + * init + */ + void init(); + + /** + * mkfs + * + * Alloc initial root node and add to t. The intention is for other + * components to use t to adjust the resulting root ref prior to commit. + */ + using mkfs_ertr = crimson::errorator< + crimson::ct_error::input_output_error>; + mkfs_ertr::future<> mkfs(Transaction &t); + + /** + * close + * + * TODO: should flush dirty blocks + */ + using close_ertr = crimson::errorator< + crimson::ct_error::input_output_error>; + close_ertr::future<> close(); + + /** + * replay_delta + * + * Intended for use in Journal::delta. For each delta, should decode delta, + * read relevant block from disk or cache (using correct type), and call + * CachedExtent::apply_delta marking the extent dirty. + */ + using replay_delta_ertr = crimson::errorator< + crimson::ct_error::input_output_error>; + using replay_delta_ret = replay_delta_ertr::future<>; + replay_delta_ret replay_delta( + journal_seq_t seq, + paddr_t record_block_base, + const delta_info_t &delta); + + /** + * init_cached_extents + * + * Calls passed lambda for each dirty cached block. Intended for use + * after replay to allow lba_manager (or w/e) to read in any ancestor + * blocks. 
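   *
   * Call shape (sketch; the LBA-manager hook named here is hypothetical):
   *
   *   cache.init_cached_extents(t, [&](auto &t, auto &e) {
   *     return lba_manager.init_cached_extent(t, e);
   *   });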
+ */ + using init_cached_extents_ertr = crimson::errorator< + crimson::ct_error::input_output_error>; + using init_cached_extents_ret = replay_delta_ertr::future<>; + template <typename F> + init_cached_extents_ret init_cached_extents( + Transaction &t, + F &&f) + { + std::vector<CachedExtentRef> dirty; + for (auto &e : extents) { + dirty.push_back(CachedExtentRef(&e)); + } + return seastar::do_with( + std::forward<F>(f), + std::move(dirty), + [&t](auto &f, auto &refs) mutable { + return crimson::do_for_each( + refs, + [&t, &f](auto &e) { return f(t, e); }); + }); + } + + /** + * update_extent_from_transaction + * + * Updates passed extent based on t. If extent has been retired, + * a null result will be returned. + */ + CachedExtentRef update_extent_from_transaction( + Transaction &t, + CachedExtentRef extent) { + if (extent->get_type() == extent_types_t::ROOT) { + if (t.root) { + return t.root; + } else { + return extent; + } + } else { + auto result = t.get_extent(extent->get_paddr(), &extent); + if (result == Transaction::get_extent_ret::RETIRED) { + return CachedExtentRef(); + } else { + return extent; + } + } + } + + /** + * print + * + * Dump summary of contents (TODO) + */ + std::ostream &print( + std::ostream &out) const { + return out; + } + + /// returns extents with dirty_from < seq + using get_next_dirty_extents_ertr = crimson::errorator<>; + using get_next_dirty_extents_ret = get_next_dirty_extents_ertr::future< + std::vector<CachedExtentRef>>; + get_next_dirty_extents_ret get_next_dirty_extents( + journal_seq_t seq); + +private: + SegmentManager &segment_manager; ///< ref to segment_manager + RootBlockRef root; ///< ref to current root + ExtentIndex extents; ///< set of live extents + + /** + * dirty + * + * holds refs to dirty extents. Ordered by CachedExtent::dirty_from. 
+ */ + CachedExtent::list dirty; + + /// alloc buffer for cached extent + bufferptr alloc_cache_buf(size_t size) { + // TODO: memory pooling etc + auto bp = ceph::bufferptr( + buffer::create_page_aligned(size)); + bp.zero(); + return bp; + } + + /// Add extent to extents handling dirty and refcounting + void add_extent(CachedExtentRef ref); + + /// Mark exising extent ref dirty -- mainly for replay + void mark_dirty(CachedExtentRef ref); + + /// Add dirty extent to dirty list + void add_to_dirty(CachedExtentRef ref); + + /// Remove extent from extents handling dirty and refcounting + void remove_extent(CachedExtentRef ref); + + /// Replace prev with next + void replace_extent(CachedExtentRef next, CachedExtentRef prev); +}; + +} diff --git a/src/crimson/os/seastore/cached_extent.cc b/src/crimson/os/seastore/cached_extent.cc new file mode 100644 index 000000000..7019b9fb8 --- /dev/null +++ b/src/crimson/os/seastore/cached_extent.cc @@ -0,0 +1,96 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "crimson/os/seastore/cached_extent.h" + +#include "crimson/common/log.h" + +namespace { + [[maybe_unused]] seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_filestore); + } +} + +namespace crimson::os::seastore { + +#ifdef DEBUG_CACHED_EXTENT_REF + +void intrusive_ptr_add_ref(CachedExtent *ptr) +{ + intrusive_ptr_add_ref( + static_cast<boost::intrusive_ref_counter< + CachedExtent, + boost::thread_unsafe_counter>*>(ptr)); + logger().debug("intrusive_ptr_add_ref: {}", *ptr); +} + +void intrusive_ptr_release(CachedExtent *ptr) +{ + logger().debug("intrusive_ptr_release: {}", *ptr); + intrusive_ptr_release( + static_cast<boost::intrusive_ref_counter< + CachedExtent, + boost::thread_unsafe_counter>*>(ptr)); +} + +#endif + +std::ostream &operator<<(std::ostream &out, CachedExtent::extent_state_t state) +{ + switch (state) { + case CachedExtent::extent_state_t::INITIAL_WRITE_PENDING: + return out << "INITIAL_WRITE_PENDING"; + case CachedExtent::extent_state_t::MUTATION_PENDING: + return out << "MUTATION_PENDING"; + case CachedExtent::extent_state_t::CLEAN: + return out << "CLEAN"; + case CachedExtent::extent_state_t::DIRTY: + return out << "DIRTY"; + case CachedExtent::extent_state_t::INVALID: + return out << "INVALID"; + default: + return out << "UNKNOWN"; + } +} + +std::ostream &operator<<(std::ostream &out, const CachedExtent &ext) +{ + return ext.print(out); +} + +CachedExtent::~CachedExtent() +{ + if (parent_index) { + parent_index->erase(*this); + } +} + +std::ostream &LogicalCachedExtent::print_detail(std::ostream &out) const +{ + out << ", laddr=" << laddr; + if (pin) { + out << ", pin=" << *pin; + } else { + out << ", pin=empty"; + } + return print_detail_l(out); +} + +std::ostream &operator<<(std::ostream &out, const LBAPin &rhs) +{ + return out << "LBAPin(" << rhs.get_laddr() << "~" << rhs.get_length() + << "->" << rhs.get_paddr(); +} + +std::ostream &operator<<(std::ostream &out, const lba_pin_list_t &rhs) +{ + bool first = true; + out << '['; + for (auto &i: rhs) { + out << (first ? 
"" : ",") << *i; + first = false; + } + return out << ']'; +} + +} diff --git a/src/crimson/os/seastore/cached_extent.h b/src/crimson/os/seastore/cached_extent.h new file mode 100644 index 000000000..974988489 --- /dev/null +++ b/src/crimson/os/seastore/cached_extent.h @@ -0,0 +1,659 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <iostream> + +#include <boost/intrusive/list.hpp> +#include <boost/intrusive_ptr.hpp> +#include <boost/smart_ptr/intrusive_ref_counter.hpp> + +#include "seastar/core/shared_future.hh" + +#include "include/buffer.h" +#include "crimson/common/errorator.h" +#include "crimson/os/seastore/seastore_types.h" + +namespace crimson::os::seastore { + +class CachedExtent; +using CachedExtentRef = boost::intrusive_ptr<CachedExtent>; + +// #define DEBUG_CACHED_EXTENT_REF +#ifdef DEBUG_CACHED_EXTENT_REF + +void intrusive_ptr_add_ref(CachedExtent *); +void intrusive_ptr_release(CachedExtent *); + +#endif + +template <typename T> +using TCachedExtentRef = boost::intrusive_ptr<T>; + +/** + * CachedExtent + */ +namespace onode { + class DummyNodeExtent; + class TestReplayExtent; +} +class ExtentIndex; +class CachedExtent : public boost::intrusive_ref_counter< + CachedExtent, boost::thread_unsafe_counter> { + enum class extent_state_t : uint8_t { + INITIAL_WRITE_PENDING, // In Transaction::write_set and fresh_block_list + MUTATION_PENDING, // In Transaction::write_set and mutated_block_list + CLEAN, // In Cache::extent_index, Transaction::read_set + // during write, contents match disk, version == 0 + DIRTY, // Same as CLEAN, but contents do not match disk, + // version > 0 + INVALID // Part of no ExtentIndex set + } state = extent_state_t::INVALID; + friend std::ostream &operator<<(std::ostream &, extent_state_t); + // allow a dummy extent to pretend it is at a specific state + friend class onode::DummyNodeExtent; + friend class onode::TestReplayExtent; + + uint32_t last_committed_crc = 0; + + // Points at current version while in state MUTATION_PENDING + CachedExtentRef prior_instance; + + /** + * dirty_from + * + * When dirty, indiciates the oldest journal entry which mutates + * this extent. + */ + journal_seq_t dirty_from; + +public: + /** + * duplicate_for_write + * + * Implementation should return a fresh CachedExtentRef + * which represents a copy of *this until on_delta_write() + * is complete, at which point the user may assume *this + * will be in state INVALID. As such, the implementation + * may involve a copy of get_bptr(), or an ancillary + * structure which defers updating the actual buffer until + * on_delta_write(). + */ + virtual CachedExtentRef duplicate_for_write() = 0; + + /** + * prepare_write + * + * Called prior to reading buffer. + * Implemenation may use this callback to fully write out + * updates to the buffer. + */ + virtual void prepare_write() {} + + /** + * on_initial_write + * + * Called after commit of extent. State will be CLEAN. + * Implentation may use this call to fixup the buffer + * with the newly available absolute get_paddr(). + */ + virtual void on_initial_write() {} + + /** + * on_clean_read + * + * Called after read of initially written extent. + * State will be CLEAN. Implentation may use this + * call to fixup the buffer with the newly available + * absolute get_paddr(). + */ + virtual void on_clean_read() {} + + /** + * on_delta_write + * + * Called after commit of delta. State will be DIRTY. 
+ * Implentation may use this call to fixup any relative + * references in the the buffer with the passed + * record_block_offset record location. + */ + virtual void on_delta_write(paddr_t record_block_offset) {} + + /** + * get_type + * + * Returns concrete type. + */ + virtual extent_types_t get_type() const = 0; + + virtual bool is_logical() const { + return false; + } + + friend std::ostream &operator<<(std::ostream &, extent_state_t); + virtual std::ostream &print_detail(std::ostream &out) const { return out; } + std::ostream &print(std::ostream &out) const { + out << "CachedExtent(addr=" << this + << ", type=" << get_type() + << ", version=" << version + << ", dirty_from=" << dirty_from + << ", paddr=" << get_paddr() + << ", state=" << state + << ", last_committed_crc=" << last_committed_crc + << ", refcount=" << use_count(); + print_detail(out); + return out << ")"; + } + + /** + * get_delta + * + * Must return a valid delta usable in apply_delta() in submit_transaction + * if state == MUTATION_PENDING. + */ + virtual ceph::bufferlist get_delta() = 0; + + /** + * apply_delta + * + * bl is a delta obtained previously from get_delta. The versions will + * match. Implementation should mutate buffer based on bl. base matches + * the address passed on_delta_write. + * + * Implementation *must* use set_last_committed_crc to update the crc to + * what the crc of the buffer would have been at submission. For physical + * extents that use base to adjust internal record-relative deltas, this + * means that the crc should be of the buffer after applying the delta, + * but before that adjustment. We do it this way because the crc in the + * commit path does not yet know the record base address. + * + * LogicalCachedExtent overrides this method and provides a simpler + * apply_delta override for LogicalCachedExtent implementers. + */ + virtual void apply_delta_and_adjust_crc( + paddr_t base, const ceph::bufferlist &bl) = 0; + + /** + * Called on dirty CachedExtent implementation after replay. + * Implementation should perform any reads/in-memory-setup + * necessary. (for instance, the lba implementation will use this + * to load in lba_manager blocks) + */ + using complete_load_ertr = crimson::errorator< + crimson::ct_error::input_output_error>; + virtual complete_load_ertr::future<> complete_load() { + return complete_load_ertr::now(); + } + + /** + * cast + * + * Returns a TCachedExtentRef of the specified type. + * TODO: add dynamic check that the requested type is actually correct. 
+ */ + template <typename T> + TCachedExtentRef<T> cast() { + return TCachedExtentRef<T>(static_cast<T*>(this)); + } + template <typename T> + TCachedExtentRef<const T> cast() const { + return TCachedExtentRef<const T>(static_cast<const T*>(this)); + } + + /// Returns true if extent is part of an open transaction + bool is_pending() const { + return state == extent_state_t::INITIAL_WRITE_PENDING || + state == extent_state_t::MUTATION_PENDING; + } + + /// Returns true if extent has a pending delta + bool is_mutation_pending() const { + return state == extent_state_t::MUTATION_PENDING; + } + + /// Returns true if extent is a fresh extent + bool is_initial_pending() const { + return state == extent_state_t::INITIAL_WRITE_PENDING; + } + + /// Returns true if extent is clean (does not have deltas on disk) + bool is_clean() const { + ceph_assert(is_valid()); + return state == extent_state_t::INITIAL_WRITE_PENDING || + state == extent_state_t::CLEAN; + } + + /// Returns true if extent is dirty (has deltas on disk) + bool is_dirty() const { + ceph_assert(is_valid()); + return !is_clean(); + } + + /// Returns true if extent has not been superceded or retired + bool is_valid() const { + return state != extent_state_t::INVALID; + } + + /** + * get_dirty_from + * + * Return journal location of oldest relevant delta. + */ + auto get_dirty_from() const { return dirty_from; } + + + /** + * get_paddr + * + * Returns current address of extent. If is_initial_pending(), address will + * be relative, otherwise address will be absolute. + */ + paddr_t get_paddr() const { return poffset; } + + /// Returns length of extent + extent_len_t get_length() const { return ptr.length(); } + + /// Returns version, get_version() == 0 iff is_clean() + extent_version_t get_version() const { + return version; + } + + /// Returns crc32c of buffer + uint32_t get_crc32c() { + return ceph_crc32c( + 1, + reinterpret_cast<const unsigned char *>(get_bptr().c_str()), + get_length()); + } + + /// Get ref to raw buffer + bufferptr &get_bptr() { return ptr; } + const bufferptr &get_bptr() const { return ptr; } + + /// Compare by paddr + friend bool operator< (const CachedExtent &a, const CachedExtent &b) { + return a.poffset < b.poffset; + } + friend bool operator> (const CachedExtent &a, const CachedExtent &b) { + return a.poffset > b.poffset; + } + friend bool operator== (const CachedExtent &a, const CachedExtent &b) { + return a.poffset == b.poffset; + } + + virtual ~CachedExtent(); + +private: + friend struct paddr_cmp; + friend struct ref_paddr_cmp; + friend class ExtentIndex; + + /// Pointer to containing index (or null) + ExtentIndex *parent_index = nullptr; + + /// hook for intrusive extent_index + boost::intrusive::set_member_hook<> extent_index_hook; + using index_member_options = boost::intrusive::member_hook< + CachedExtent, + boost::intrusive::set_member_hook<>, + &CachedExtent::extent_index_hook>; + using index = boost::intrusive::set<CachedExtent, index_member_options>; + friend class ExtentIndex; + friend class Transaction; + + /// hook for intrusive ref list (mainly dirty or lru list) + boost::intrusive::list_member_hook<> primary_ref_list_hook; + using primary_ref_list_member_options = boost::intrusive::member_hook< + CachedExtent, + boost::intrusive::list_member_hook<>, + &CachedExtent::primary_ref_list_hook>; + using list = boost::intrusive::list< + CachedExtent, + primary_ref_list_member_options>; + + /// Actual data contents + ceph::bufferptr ptr; + + /// number of deltas since initial write + extent_version_t 
version = EXTENT_VERSION_NULL; + + /// address of original block -- relative iff is_pending() and is_clean() + paddr_t poffset; + + /// used to wait while in-progress commit completes + std::optional<seastar::shared_promise<>> io_wait_promise; + void set_io_wait() { + ceph_assert(!io_wait_promise); + io_wait_promise = seastar::shared_promise<>(); + } + void complete_io() { + ceph_assert(io_wait_promise); + io_wait_promise->set_value(); + io_wait_promise = std::nullopt; + } + seastar::future<> wait_io() { + if (!io_wait_promise) { + return seastar::now(); + } else { + return io_wait_promise->get_shared_future(); + } + } + +protected: + CachedExtent(CachedExtent &&other) = delete; + CachedExtent(ceph::bufferptr &&ptr) : ptr(std::move(ptr)) {} + CachedExtent(const CachedExtent &other) + : state(other.state), + dirty_from(other.dirty_from), + ptr(other.ptr.c_str(), other.ptr.length()), + version(other.version), + poffset(other.poffset) {} + + struct share_buffer_t {}; + CachedExtent(const CachedExtent &other, share_buffer_t) : + state(other.state), + dirty_from(other.dirty_from), + ptr(other.ptr), + version(other.version), + poffset(other.poffset) {} + + + friend class Cache; + template <typename T> + static TCachedExtentRef<T> make_cached_extent_ref(bufferptr &&ptr) { + return new T(std::move(ptr)); + } + + CachedExtentRef get_prior_instance() { + return prior_instance; + } + + /// Sets last_committed_crc + void set_last_committed_crc(uint32_t crc) { + last_committed_crc = crc; + } + + void set_paddr(paddr_t offset) { poffset = offset; } + + /** + * maybe_generate_relative + * + * There are three kinds of addresses one might want to + * store within an extent: + * - addr for a block within the same transaction relative to the + * physical location of this extent in the + * event that we will read it in the initial read of the extent + * - addr relative to the physical location of the next record to a + * block within that record to contain a delta for this extent in + * the event that we'll read it from a delta and overlay it onto a + * dirty representation of the extent. + * - absolute addr to a block already written outside of the current + * transaction. + * + * This helper checks addr and the current state to create the correct + * reference. 
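A standalone sketch of the address fixup that maybe_generate_relative() (shown next) performs for the three cases described above. The simplified toy_paddr carries only a record-relative flag and an offset; the real paddr_t also carries a segment id, so this is an illustration of the decision logic only.

// Toy model of maybe_generate_relative(); types and names are hypothetical.
#include <cassert>
#include <cstdint>

struct toy_paddr {
  bool record_relative = false;  // relative to the start of the record being written
  int64_t off = 0;
};

enum class toy_state { initial_write_pending, mutation_pending, clean };

// Absolute addresses pass through; a mutation-pending extent keeps a
// record-relative address as-is (its delta is interpreted against the record
// base); a fresh extent rewrites the address relative to its own
// (record-relative) position within the record, i.e. addr - get_paddr().
toy_paddr maybe_generate_relative(toy_state state, toy_paddr self, toy_paddr addr) {
  if (!addr.record_relative) return addr;
  if (state == toy_state::mutation_pending) return addr;
  assert(state == toy_state::initial_write_pending && self.record_relative);
  return {true, addr.off - self.off};
}

int main() {
  toy_paddr self{true, 4096};      // this extent sits 4096 bytes into the record
  toy_paddr sibling{true, 12288};  // another fresh extent in the same record
  auto rel = maybe_generate_relative(toy_state::initial_write_pending, self, sibling);
  assert(rel.record_relative && rel.off == 8192);
}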
+ */ + paddr_t maybe_generate_relative(paddr_t addr) { + if (!addr.is_relative()) { + return addr; + } else if (is_mutation_pending()) { + return addr; + } else { + ceph_assert(is_initial_pending()); + ceph_assert(get_paddr().is_record_relative()); + return addr - get_paddr(); + } + } + +}; + +std::ostream &operator<<(std::ostream &, CachedExtent::extent_state_t); +std::ostream &operator<<(std::ostream &, const CachedExtent&); + +/// Compare extents by paddr +struct paddr_cmp { + bool operator()(paddr_t lhs, const CachedExtent &rhs) const { + return lhs < rhs.poffset; + } + bool operator()(const CachedExtent &lhs, paddr_t rhs) const { + return lhs.poffset < rhs; + } +}; + +/// Compare extent refs by paddr +struct ref_paddr_cmp { + using is_transparent = paddr_t; + bool operator()(const CachedExtentRef &lhs, const CachedExtentRef &rhs) const { + return lhs->poffset < rhs->poffset; + } + bool operator()(const paddr_t &lhs, const CachedExtentRef &rhs) const { + return lhs < rhs->poffset; + } + bool operator()(const CachedExtentRef &lhs, const paddr_t &rhs) const { + return lhs->poffset < rhs; + } +}; + +template <typename T, typename C> +class addr_extent_list_base_t + : public std::list<std::pair<T, C>> {}; + +using pextent_list_t = addr_extent_list_base_t<paddr_t, CachedExtentRef>; + +template <typename T, typename C, typename Cmp> +class addr_extent_set_base_t + : public std::set<C, Cmp> {}; + +using pextent_set_t = addr_extent_set_base_t< + paddr_t, + CachedExtentRef, + ref_paddr_cmp + >; + +template <typename T> +using t_pextent_list_t = addr_extent_list_base_t<paddr_t, TCachedExtentRef<T>>; + +/** + * ExtentIndex + * + * Index of CachedExtents by poffset; does not hold a reference, so the + * user must ensure each extent is removed prior to deletion + */ +class ExtentIndex { + friend class Cache; + CachedExtent::index extent_index; +public: + auto get_overlap(paddr_t addr, segment_off_t len) { + auto bottom = extent_index.upper_bound(addr, paddr_cmp()); + if (bottom != extent_index.begin()) + --bottom; + if (bottom != extent_index.end() && + bottom->get_paddr().add_offset(bottom->get_length()) <= addr) + ++bottom; + + auto top = extent_index.lower_bound(addr.add_offset(len), paddr_cmp()); + return std::make_pair( + bottom, + top + ); + } + + void clear() { + extent_index.clear(); + } + + void insert(CachedExtent &extent) { + // sanity check + auto [a, b] = get_overlap( + extent.get_paddr(), + extent.get_length()); + ceph_assert(a == b); + + extent_index.insert(extent); + extent.parent_index = this; + } + + void erase(CachedExtent &extent) { + extent_index.erase(extent); + extent.parent_index = nullptr; + } + + void replace(CachedExtent &to, CachedExtent &from) { + extent_index.replace_node(extent_index.s_iterator_to(from), to); + from.parent_index = nullptr; + to.parent_index = this; + } + + bool empty() const { + return extent_index.empty(); + } + + auto find_offset(paddr_t offset) { + return extent_index.find(offset, paddr_cmp()); + } + + auto begin() { + return extent_index.begin(); + } + + auto end() { + return extent_index.end(); + } + + void merge(ExtentIndex &&other) { + for (auto it = other.extent_index.begin(); + it != other.extent_index.end(); + ) { + auto &ext = *it; + ++it; + other.extent_index.erase(ext); + extent_index.insert(ext); + } + } + + template <typename T> + void remove(T &l) { + for (auto &ext : l) { + extent_index.erase(ext); + } + } +}; + +class LogicalCachedExtent; +class LBAPin; +using LBAPinRef = std::unique_ptr<LBAPin>; +class LBAPin { +public: + virtual void
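A standalone sketch of the overlap query used by ExtentIndex::get_overlap() above: an ordered index keyed by start offset, where the predecessor of the query start may still overlap it and the range is closed by the first entry starting at or past the query end. A std::map of (start offset, length) stands in for the intrusive set keyed by paddr; this is a simplification, not the real container.

// Toy model of the get_overlap() bounds computation.
#include <cassert>
#include <cstdint>
#include <iterator>
#include <map>
#include <utility>

using index_t = std::map<uint64_t, uint64_t>;  // start offset -> length

// Return iterators bounding every extent that overlaps [addr, addr + len).
std::pair<index_t::iterator, index_t::iterator>
get_overlap(index_t &idx, uint64_t addr, uint64_t len) {
  auto bottom = idx.upper_bound(addr);     // first extent starting after addr
  if (bottom != idx.begin())
    --bottom;                              // the predecessor may still overlap addr
  if (bottom != idx.end() && bottom->first + bottom->second <= addr)
    ++bottom;                              // ...or it may end before addr
  auto top = idx.lower_bound(addr + len);  // first extent starting at/after the end
  return {bottom, top};
}

int main() {
  index_t idx{{0, 4096}, {4096, 4096}, {16384, 4096}};
  auto [a, b] = get_overlap(idx, 4096, 8192);  // query [4096, 12288)
  assert(a->first == 4096);                    // only the second extent overlaps
  assert(std::next(a) == b);
  auto [c, d] = get_overlap(idx, 8192, 4096);  // query [8192, 12288): no overlap
  assert(c == d);
}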
link_extent(LogicalCachedExtent *ref) = 0; + virtual void take_pin(LBAPin &pin) = 0; + virtual extent_len_t get_length() const = 0; + virtual paddr_t get_paddr() const = 0; + virtual laddr_t get_laddr() const = 0; + virtual LBAPinRef duplicate() const = 0; + + virtual ~LBAPin() {} +}; +std::ostream &operator<<(std::ostream &out, const LBAPin &rhs); + +using lba_pin_list_t = std::list<LBAPinRef>; + +std::ostream &operator<<(std::ostream &out, const lba_pin_list_t &rhs); + + +/** + * LogicalCachedExtent + * + * CachedExtent with associated lba mapping. + * + * Users of TransactionManager should be using extents derived from + * LogicalCachedExtent. + */ +class LogicalCachedExtent : public CachedExtent { +public: + template <typename... T> + LogicalCachedExtent(T&&... t) : CachedExtent(std::forward<T>(t)...) {} + + void set_pin(LBAPinRef &&npin) { + assert(!pin); + pin = std::move(npin); + laddr = pin->get_laddr(); + pin->link_extent(this); + } + + bool has_pin() const { + return !!pin; + } + + LBAPin &get_pin() { + assert(pin); + return *pin; + } + + laddr_t get_laddr() const { + assert(laddr != L_ADDR_NULL); + return laddr; + } + + void set_laddr(laddr_t nladdr) { + laddr = nladdr; + } + + void apply_delta_and_adjust_crc( + paddr_t base, const ceph::bufferlist &bl) final { + apply_delta(bl); + set_last_committed_crc(get_crc32c()); + } + + bool is_logical() const final { + return true; + } + + std::ostream &print_detail(std::ostream &out) const final; +protected: + virtual void apply_delta(const ceph::bufferlist &bl) = 0; + virtual std::ostream &print_detail_l(std::ostream &out) const { + return out; + } + + virtual void logical_on_delta_write() {} + + void on_delta_write(paddr_t record_block_offset) final { + assert(get_prior_instance()); + pin->take_pin(*(get_prior_instance()->cast<LogicalCachedExtent>()->pin)); + logical_on_delta_write(); + } + +private: + laddr_t laddr = L_ADDR_NULL; + LBAPinRef pin; +}; + +using LogicalCachedExtentRef = TCachedExtentRef<LogicalCachedExtent>; +struct ref_laddr_cmp { + using is_transparent = laddr_t; + bool operator()(const LogicalCachedExtentRef &lhs, + const LogicalCachedExtentRef &rhs) const { + return lhs->get_laddr() < rhs->get_laddr(); + } + bool operator()(const laddr_t &lhs, + const LogicalCachedExtentRef &rhs) const { + return lhs < rhs->get_laddr(); + } + bool operator()(const LogicalCachedExtentRef &lhs, + const laddr_t &rhs) const { + return lhs->get_laddr() < rhs; + } +}; + +using lextent_set_t = addr_extent_set_base_t< + laddr_t, + LogicalCachedExtentRef, + ref_laddr_cmp + >; + +template <typename T> +using lextent_list_t = addr_extent_list_base_t< + laddr_t, TCachedExtentRef<T>>; + +} diff --git a/src/crimson/os/seastore/extentmap_manager.cc b/src/crimson/os/seastore/extentmap_manager.cc new file mode 100644 index 000000000..32de3a6ed --- /dev/null +++ b/src/crimson/os/seastore/extentmap_manager.cc @@ -0,0 +1,32 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#include <experimental/iterator> +#include <iostream> + +#include "crimson/os/seastore/transaction_manager.h" +#include "crimson/os/seastore/extentmap_manager.h" +#include "crimson/os/seastore/extentmap_manager/btree/btree_extentmap_manager.h" +namespace crimson::os::seastore::extentmap_manager { + +ExtentMapManagerRef create_extentmap_manager(TransactionManager &trans_manager) { + return ExtentMapManagerRef(new BtreeExtentMapManager(trans_manager)); +} + +} + +namespace crimson::os::seastore { + +std::ostream 
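A small sketch of the pin/extent linkage that LogicalCachedExtent::set_pin() establishes above: the extent takes ownership of its LBA mapping pin, caches the logical address, and the pin is linked back to the extent. ToyPin and ToyExtent are hypothetical stand-ins, not the real LBAPin/LogicalCachedExtent types.

// Toy model of set_pin()/link_extent() ownership and back-linking.
#include <cassert>
#include <cstdint>
#include <memory>

using laddr_t = uint64_t;
struct ToyExtent;

struct ToyPin {
  laddr_t laddr;
  ToyExtent *extent = nullptr;            // back-pointer set by link_extent()
  explicit ToyPin(laddr_t l) : laddr(l) {}
  void link_extent(ToyExtent *e) { extent = e; }
};

struct ToyExtent {
  laddr_t laddr = 0;
  std::unique_ptr<ToyPin> pin;            // extent owns its mapping pin

  // Mirrors set_pin(): take ownership, cache the logical address, link back.
  void set_pin(std::unique_ptr<ToyPin> npin) {
    assert(!pin);
    pin = std::move(npin);
    laddr = pin->laddr;
    pin->link_extent(this);
  }
};

int main() {
  ToyExtent e;
  e.set_pin(std::make_unique<ToyPin>(0x1000));
  assert(e.laddr == 0x1000 && e.pin->extent == &e);
}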
&operator<<(std::ostream &out, const extent_mapping_t &rhs) +{ + return out << "extent_mapping_t (" << rhs.logical_offset << "~" << rhs.length + << "->" << rhs.laddr << ")"; +} + +std::ostream &operator<<(std::ostream &out, const extent_map_list_t &rhs) +{ + out << '['; + std::copy(std::begin(rhs), std::end(rhs), std::experimental::make_ostream_joiner(out, ", ")); + return out << ']'; +} + +} diff --git a/src/crimson/os/seastore/extentmap_manager.h b/src/crimson/os/seastore/extentmap_manager.h new file mode 100644 index 000000000..7d5223b94 --- /dev/null +++ b/src/crimson/os/seastore/extentmap_manager.h @@ -0,0 +1,124 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <iosfwd> +#include <list> + +#include <boost/intrusive_ptr.hpp> +#include <boost/smart_ptr/intrusive_ref_counter.hpp> + +#include <seastar/core/future.hh> + +#include "crimson/osd/exceptions.h" +#include "crimson/os/seastore/seastore_types.h" +#include "crimson/os/seastore/transaction_manager.h" + +#define PAGE_SIZE 4096 +#define EXTMAP_BLOCK_SIZE 4096 + +namespace crimson::os::seastore { + +struct lext_map_val_t { + laddr_t laddr; + extent_len_t length = 0; + + lext_map_val_t( + laddr_t laddr, + extent_len_t length) + : laddr(laddr), length(length) {} + +}; + +class extent_mapping_t +{ +public: + objaddr_t logical_offset = 0; //offset in object + laddr_t laddr; // lextent start address aligned with block size. + extent_len_t length = 0; + explicit extent_mapping_t(objaddr_t lo) : logical_offset(lo) { } + + extent_mapping_t( + objaddr_t lo, + laddr_t laddr, + extent_len_t length) + : logical_offset(lo), laddr(laddr), length(length) {} + + ~extent_mapping_t() {} +}; + +enum class extmap_root_state_t : uint8_t { + INITIAL = 0, + MUTATED = 1, + NONE = 0xFF +}; + +using extent_map_list_t = std::list<extent_mapping_t>; +std::ostream &operator<<(std::ostream &out, const extent_mapping_t &rhs); +std::ostream &operator<<(std::ostream &out, const extent_map_list_t &rhs); + +struct extmap_root_t { + depth_t depth = 0; + extmap_root_state_t state; + laddr_t extmap_root_laddr; + extmap_root_t(depth_t dep, laddr_t laddr) + : depth(dep), + extmap_root_laddr(laddr) { state = extmap_root_state_t::INITIAL; } +}; + +/** + * Abstract interface for managing the mapping from object-internal offsets + * to logical addresses; each onode has its own extentmap tree. + */ +class ExtentMapManager { +public: + using initialize_extmap_ertr = TransactionManager::alloc_extent_ertr; + using initialize_extmap_ret = initialize_extmap_ertr::future<extmap_root_t>; + virtual initialize_extmap_ret initialize_extmap(Transaction &t) = 0; + + /* find_lextents + * + * Return a list of all extent_mapping_t overlapping any portion of lo~len, + * or, if no overlapping extent_mapping_t is found, the next extent after the range.
+ */ + using find_lextent_ertr = TransactionManager::read_extent_ertr; + using find_lextent_ret = find_lextent_ertr::future<extent_map_list_t>; + virtual find_lextent_ret + find_lextent(const extmap_root_t &extmap_root, Transaction &t, objaddr_t lo, extent_len_t len) = 0; + + /* add_lextent + * + * add a new mapping (object offset -> laddr, length) to the extent map; + * return the added extent_mapping_t + */ + using add_lextent_ertr = TransactionManager::read_extent_ertr; + using add_lextent_ret = add_lextent_ertr::future<extent_mapping_t>; + virtual add_lextent_ret + add_lextent(extmap_root_t &extmap_root, Transaction &t, objaddr_t lo, lext_map_val_t val) = 0; + + /* rm_lextent + * + * remove an existing extent mapping from the extent map; + * return true if the mapping was removed, false otherwise + */ + using rm_lextent_ertr = TransactionManager::read_extent_ertr; + using rm_lextent_ret = rm_lextent_ertr::future<bool>; + virtual rm_lextent_ret rm_lextent(extmap_root_t &extmap_root, Transaction &t, objaddr_t lo, lext_map_val_t val) = 0; + + virtual ~ExtentMapManager() {} +}; +using ExtentMapManagerRef = std::unique_ptr<ExtentMapManager>; + +namespace extentmap_manager { +/* create an ExtentMapManager for an extentmap. + * For a new extmap, initialize_extmap must be called after create_extentmap_manager + * to initialize the extent map before it is used; + * for an existing extmap, initialize_extmap is not needed. + */ +ExtentMapManagerRef create_extentmap_manager( + TransactionManager &trans_manager); + +} + +} diff --git a/src/crimson/os/seastore/extentmap_manager/btree/btree_extentmap_manager.cc b/src/crimson/os/seastore/extentmap_manager/btree/btree_extentmap_manager.cc new file mode 100644 index 000000000..f7609d3e8 --- /dev/null +++ b/src/crimson/os/seastore/extentmap_manager/btree/btree_extentmap_manager.cc @@ -0,0 +1,118 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#include <sys/mman.h> +#include <string.h> + +#include "crimson/common/log.h" + +#include "include/buffer.h" +#include "crimson/os/seastore/extentmap_manager/btree/btree_extentmap_manager.h" +#include "crimson/os/seastore/extentmap_manager/btree/extentmap_btree_node_impl.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_filestore); + } +} + +namespace crimson::os::seastore::extentmap_manager { + +BtreeExtentMapManager::BtreeExtentMapManager( + TransactionManager &tm) + : tm(tm) {} + +BtreeExtentMapManager::initialize_extmap_ret +BtreeExtentMapManager::initialize_extmap(Transaction &t) +{ + + logger().debug("{}", __func__); + return tm.alloc_extent<ExtMapLeafNode>(t, L_ADDR_MIN, EXTMAP_BLOCK_SIZE) + .safe_then([](auto&& root_extent) { + root_extent->set_size(0); + extmap_node_meta_t meta{1}; + root_extent->set_meta(meta); + extmap_root_t extmap_root = extmap_root_t(1, root_extent->get_laddr()); + return initialize_extmap_ertr::make_ready_future<extmap_root_t>(extmap_root); + }); +} + +BtreeExtentMapManager::get_root_ret +BtreeExtentMapManager::get_extmap_root(const extmap_root_t &extmap_root, Transaction &t) +{ + assert(extmap_root.extmap_root_laddr != L_ADDR_NULL); + laddr_t laddr = extmap_root.extmap_root_laddr; + return extmap_load_extent(get_ext_context(t), laddr, extmap_root.depth); +} + +BtreeExtentMapManager::find_lextent_ret +BtreeExtentMapManager::find_lextent(const extmap_root_t &extmap_root, Transaction &t, + objaddr_t lo, extent_len_t len) +{ + logger().debug("{}: {}, {}", __func__, lo, len); + return
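A standalone model of the ExtentMapManager semantics declared above (add_lextent / find_lextents / rm_lextent), with a plain std::map and synchronous calls standing in for the btree-backed, errorator-based interface; the type and function names below are simplified stand-ins, not the real API.

// Toy in-memory extent map mirroring the interface semantics.
#include <cassert>
#include <cstdint>
#include <map>
#include <vector>

using objaddr_t = uint32_t;
using laddr_t = uint64_t;
struct mapping_t { objaddr_t lo; laddr_t laddr; uint32_t len; };

struct ToyExtentMap {
  std::map<objaddr_t, mapping_t> m;  // keyed by object offset

  mapping_t add_lextent(objaddr_t lo, laddr_t laddr, uint32_t len) {
    return m[lo] = mapping_t{lo, laddr, len};
  }

  // All mappings overlapping any portion of [lo, lo + len).
  std::vector<mapping_t> find_lextents(objaddr_t lo, uint32_t len) const {
    std::vector<mapping_t> out;
    for (const auto &[k, v] : m)
      if (k < lo + len && k + v.len > lo)
        out.push_back(v);
    return out;
  }

  // Remove only an exact (offset, laddr, length) match, as rm_lextent does.
  bool rm_lextent(objaddr_t lo, laddr_t laddr, uint32_t len) {
    auto it = m.find(lo);
    if (it == m.end() || it->second.laddr != laddr || it->second.len != len)
      return false;
    m.erase(it);
    return true;
  }
};

int main() {
  ToyExtentMap em;                     // analogue of initialize_extmap + create
  em.add_lextent(0, 0x10000, 4096);
  em.add_lextent(8192, 0x20000, 4096);
  assert(em.find_lextents(0, 16384).size() == 2);
  assert(em.rm_lextent(8192, 0x20000, 4096));
  assert(em.find_lextents(8192, 4096).empty());
}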
get_extmap_root(extmap_root, t).safe_then([this, &t, lo, len](auto&& extent) { + return extent->find_lextent(get_ext_context(t), lo, len); + }).safe_then([](auto &&e) { + logger().debug("{}: found_lextent {}", __func__, e); + return find_lextent_ret( + find_lextent_ertr::ready_future_marker{}, + std::move(e)); + }); + +} + +BtreeExtentMapManager::add_lextent_ret +BtreeExtentMapManager::add_lextent(extmap_root_t &extmap_root, Transaction &t, + objaddr_t lo, lext_map_val_t val) +{ + logger().debug("{}: {}, {}, {}", __func__, lo, val.laddr, val.length); + return get_extmap_root(extmap_root, t).safe_then([this, &extmap_root, &t, lo, val](auto &&root) { + return insert_lextent(extmap_root, t, root, lo, val); + }).safe_then([](auto ret) { + logger().debug("{}: {}", __func__, ret); + return add_lextent_ret( + add_lextent_ertr::ready_future_marker{}, + std::move(ret)); + }); + +} + +BtreeExtentMapManager::insert_lextent_ret +BtreeExtentMapManager::insert_lextent(extmap_root_t &extmap_root, Transaction &t, + ExtMapNodeRef root, objaddr_t logical_offset, lext_map_val_t val) +{ + auto split = insert_lextent_ertr::make_ready_future<ExtMapNodeRef>(root); + if (root->at_max_capacity()) { + logger().debug("{}::splitting root {}", __func__, *root); + split = root->extmap_alloc_extent<ExtMapInnerNode>(get_ext_context(t), EXTMAP_BLOCK_SIZE) + .safe_then([this, &extmap_root, root, &t, logical_offset](auto&& nroot) { + extmap_node_meta_t meta{root->get_node_meta().depth + 1}; + nroot->set_meta(meta); + nroot->journal_insert(nroot->begin(), OBJ_ADDR_MIN, + root->get_laddr(), nullptr); + extmap_root.extmap_root_laddr = nroot->get_laddr(); + extmap_root.depth = root->get_node_meta().depth + 1; + extmap_root.state = extmap_root_state_t::MUTATED; + return nroot->split_entry(get_ext_context(t), logical_offset, nroot->begin(), root); + }); + } + return split.safe_then([this, &t, logical_offset, val](ExtMapNodeRef node) { + return node->insert(get_ext_context(t), logical_offset, val); + }); +} + +BtreeExtentMapManager::rm_lextent_ret +BtreeExtentMapManager::rm_lextent(extmap_root_t &extmap_root, Transaction &t, objaddr_t lo, lext_map_val_t val) +{ + logger().debug("{}: {}, {}, {}", __func__, lo, val.laddr, val.length); + return get_extmap_root(extmap_root, t).safe_then([this, &t, lo, val](auto extent) { + return extent->rm_lextent(get_ext_context(t), lo, val); + }).safe_then([](auto removed) { + logger().debug("{}: {}", __func__, removed); + return rm_lextent_ret( + rm_lextent_ertr::ready_future_marker{}, + removed); + }); +} + + +} diff --git a/src/crimson/os/seastore/extentmap_manager/btree/btree_extentmap_manager.h b/src/crimson/os/seastore/extentmap_manager/btree/btree_extentmap_manager.h new file mode 100644 index 000000000..db676f41d --- /dev/null +++ b/src/crimson/os/seastore/extentmap_manager/btree/btree_extentmap_manager.h @@ -0,0 +1,64 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <iostream> + +#include <boost/intrusive_ptr.hpp> +#include <boost/smart_ptr/intrusive_ref_counter.hpp> +#include <seastar/core/future.hh> + +#include "include/ceph_assert.h" +#include "crimson/osd/exceptions.h" + +#include "crimson/os/seastore/extentmap_manager.h" +#include "crimson/os/seastore/extentmap_manager/btree/extentmap_btree_node.h" +#include "crimson/os/seastore/seastore_types.h" +#include "crimson/os/seastore/transaction_manager.h" + +namespace crimson::os::seastore::extentmap_manager { +/** + * BtreeExtentMapManager + * + * Uses a 
btree to track : + * objaddr_t -> laddr_t mapping for each onode extentmap + */ + +class BtreeExtentMapManager : public ExtentMapManager { + TransactionManager &tm; + + ext_context_t get_ext_context(Transaction &t) { + return ext_context_t{tm,t}; + } + + /* get_extmap_root + * + * load extent map tree root node + */ + using get_root_ertr = TransactionManager::read_extent_ertr; + using get_root_ret = get_root_ertr::future<ExtMapNodeRef>; + get_root_ret get_extmap_root(const extmap_root_t &extmap_root, Transaction &t); + + using insert_lextent_ertr = TransactionManager::read_extent_ertr; + using insert_lextent_ret = insert_lextent_ertr::future<extent_mapping_t >; + insert_lextent_ret insert_lextent(extmap_root_t &extmap_root, Transaction &t, + ExtMapNodeRef extent, objaddr_t lo, + lext_map_val_t val); + +public: + explicit BtreeExtentMapManager(TransactionManager &tm); + + initialize_extmap_ret initialize_extmap(Transaction &t) final; + + find_lextent_ret find_lextent(const extmap_root_t &extmap_root, Transaction &t, objaddr_t lo, extent_len_t len) final; + + add_lextent_ret add_lextent(extmap_root_t &extmap_root, Transaction &t, objaddr_t lo, lext_map_val_t val) final; + + rm_lextent_ret rm_lextent(extmap_root_t &extmap_root, Transaction &t, objaddr_t lo, lext_map_val_t val) final; + + +}; +using BtreeExtentMapManagerRef = std::unique_ptr<BtreeExtentMapManager>; + +} diff --git a/src/crimson/os/seastore/extentmap_manager/btree/extentmap_btree_node.h b/src/crimson/os/seastore/extentmap_manager/btree/extentmap_btree_node.h new file mode 100644 index 000000000..3937bd049 --- /dev/null +++ b/src/crimson/os/seastore/extentmap_manager/btree/extentmap_btree_node.h @@ -0,0 +1,145 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + + +#pragma once + +#include <boost/iterator/counting_iterator.hpp> + +#include "crimson/common/log.h" +#include "crimson/os/seastore/seastore_types.h" +#include "crimson/os/seastore/transaction_manager.h" +#include "crimson/os/seastore/extentmap_manager.h" + +namespace crimson::os::seastore::extentmap_manager{ + +struct ext_context_t { + TransactionManager &tm; + Transaction &t; +}; + +struct extmap_node_meta_t { + depth_t depth = 0; + + std::pair<extmap_node_meta_t, extmap_node_meta_t> split_into(objaddr_t pivot) const { + return std::make_pair( + extmap_node_meta_t{depth}, + extmap_node_meta_t{depth}); + } + + static extmap_node_meta_t merge_from( + const extmap_node_meta_t &lhs, const extmap_node_meta_t &rhs) { + assert(lhs.depth == rhs.depth); + return extmap_node_meta_t{lhs.depth}; + } + + static std::pair<extmap_node_meta_t, extmap_node_meta_t> + rebalance(const extmap_node_meta_t &lhs, const extmap_node_meta_t &rhs, laddr_t pivot) { + assert(lhs.depth == rhs.depth); + return std::make_pair( + extmap_node_meta_t{lhs.depth}, + extmap_node_meta_t{lhs.depth}); + } +}; + +struct ExtMapNode : LogicalCachedExtent { + using ExtMapNodeRef = TCachedExtentRef<ExtMapNode>; + + ExtMapNode(ceph::bufferptr &&ptr) : LogicalCachedExtent(std::move(ptr)) {} + ExtMapNode(const ExtMapNode &other) + : LogicalCachedExtent(other) {} + + using find_lextent_ertr = ExtentMapManager::find_lextent_ertr; + using find_lextent_ret = ExtentMapManager::find_lextent_ret; + virtual find_lextent_ret find_lextent(ext_context_t ec, + objaddr_t lo, extent_len_t len) = 0; + + using insert_ertr = TransactionManager::read_extent_ertr; + using insert_ret = insert_ertr::future<extent_mapping_t>; + virtual insert_ret insert(ext_context_t ec, objaddr_t lo, 
lext_map_val_t val) = 0; + + using rm_lextent_ertr = TransactionManager::read_extent_ertr; + using rm_lextent_ret = rm_lextent_ertr::future<bool>; + virtual rm_lextent_ret rm_lextent(ext_context_t ec, objaddr_t lo, lext_map_val_t val) = 0; + + using split_children_ertr = TransactionManager::alloc_extent_ertr; + using split_children_ret = split_children_ertr::future + <std::tuple<ExtMapNodeRef, ExtMapNodeRef, uint32_t>>; + virtual split_children_ret make_split_children(ext_context_t ec) = 0; + + using full_merge_ertr = TransactionManager::alloc_extent_ertr; + using full_merge_ret = full_merge_ertr::future<ExtMapNodeRef>; + virtual full_merge_ret make_full_merge(ext_context_t ec, ExtMapNodeRef right) = 0; + + using make_balanced_ertr = TransactionManager::alloc_extent_ertr; + using make_balanced_ret = make_balanced_ertr::future + <std::tuple<ExtMapNodeRef, ExtMapNodeRef, uint32_t>>; + virtual make_balanced_ret + make_balanced(ext_context_t ec, ExtMapNodeRef right, bool prefer_left) = 0; + + virtual extmap_node_meta_t get_node_meta() const = 0; + + virtual bool at_max_capacity() const = 0; + virtual bool at_min_capacity() const = 0; + virtual unsigned get_node_size() const = 0; + virtual ~ExtMapNode() = default; + + using alloc_ertr = TransactionManager::alloc_extent_ertr; + template<class T> + alloc_ertr::future<TCachedExtentRef<T>> + extmap_alloc_extent(ext_context_t ec, extent_len_t len) { + return ec.tm.alloc_extent<T>(ec.t, L_ADDR_MIN, len).safe_then( + [](auto&& extent) { + return alloc_ertr::make_ready_future<TCachedExtentRef<T>>(std::move(extent)); + }); + } + + template<class T> + alloc_ertr::future<std::pair<TCachedExtentRef<T>, TCachedExtentRef<T>>> + extmap_alloc_2extents(ext_context_t ec, extent_len_t len) { + return seastar::do_with(std::pair<TCachedExtentRef<T>, TCachedExtentRef<T>>(), + [ec, len] (auto &extents) { + return crimson::do_for_each(boost::make_counting_iterator(0), + boost::make_counting_iterator(2), + [ec, len, &extents] (auto i) { + return ec.tm.alloc_extent<T>(ec.t, L_ADDR_MIN, len).safe_then( + [i, &extents](auto &&node) { + if (i == 0) + extents.first = node; + if (i == 1) + extents.second = node; + }); + }).safe_then([&extents] { + return alloc_ertr::make_ready_future + <std::pair<TCachedExtentRef<T>, TCachedExtentRef<T>>>(std::move(extents)); + }); + }); + } + + using retire_ertr = crimson::errorator< + crimson::ct_error::enoent, + crimson::ct_error::input_output_error>; + using retire_ret = retire_ertr::future<std::list<unsigned>>; + retire_ret + extmap_retire_node(ext_context_t ec, std::list<laddr_t> dec_laddrs) { + return seastar::do_with(std::move(dec_laddrs), std::list<unsigned>(), + [ec] (auto &&dec_laddrs, auto &refcnt) { + return crimson::do_for_each(dec_laddrs.begin(), dec_laddrs.end(), + [ec, &refcnt] (auto &laddr) { + return ec.tm.dec_ref(ec.t, laddr).safe_then([&refcnt] (auto ref) { + refcnt.push_back(ref); + }); + }).safe_then([&refcnt] { + return retire_ertr::make_ready_future<std::list<unsigned>>(std::move(refcnt)); + }); + }); + } + +}; + +using ExtMapNodeRef = ExtMapNode::ExtMapNodeRef; + +TransactionManager::read_extent_ertr::future<ExtMapNodeRef> +extmap_load_extent(ext_context_t ec, laddr_t laddr, depth_t depth); + +} diff --git a/src/crimson/os/seastore/extentmap_manager/btree/extentmap_btree_node_impl.cc b/src/crimson/os/seastore/extentmap_manager/btree/extentmap_btree_node_impl.cc new file mode 100644 index 000000000..7bf8680a5 --- /dev/null +++ b/src/crimson/os/seastore/extentmap_manager/btree/extentmap_btree_node_impl.cc @@ -0,0 
+1,373 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <sys/mman.h> +#include <string.h> + +#include <memory> +#include <string.h> + +#include "include/buffer.h" +#include "include/byteorder.h" +#include "crimson/os/seastore/transaction_manager.h" +#include "crimson/os/seastore/extentmap_manager/btree/extentmap_btree_node.h" +#include "crimson/os/seastore/extentmap_manager/btree/extentmap_btree_node_impl.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_filestore); + } +} + +namespace crimson::os::seastore::extentmap_manager { + +std::ostream &ExtMapInnerNode::print_detail_l(std::ostream &out) const +{ + return out << ", size=" << get_size() + << ", depth=" << get_meta().depth; +} + +ExtMapInnerNode::find_lextent_ret +ExtMapInnerNode::find_lextent(ext_context_t ec, objaddr_t lo, extent_len_t len) +{ + auto [begin, end] = bound(lo, lo + len); + auto result_up = std::make_unique<extent_map_list_t>(); + auto &result = *result_up; + return crimson::do_for_each( + std::move(begin), + std::move(end), + [this, ec, &result, lo, len](const auto &val) mutable { + return extmap_load_extent(ec, val.get_val(), get_meta().depth - 1).safe_then( + [ec, &result, lo, len](auto extent) mutable { + return extent->find_lextent(ec, lo, len).safe_then( + [&result](auto item_list) mutable { + result.splice(result.end(), item_list, + item_list.begin(), item_list.end()); + }); + }); + }).safe_then([result=std::move(result_up)] { + return find_lextent_ret( + find_lextent_ertr::ready_future_marker{}, + std::move(*result)); + }); +} + +ExtMapInnerNode::insert_ret +ExtMapInnerNode::insert(ext_context_t ec, objaddr_t lo, lext_map_val_t val) +{ + auto insertion_pt = get_containing_child(lo); + assert(insertion_pt != end()); + return extmap_load_extent(ec, insertion_pt->get_val(), get_meta().depth - 1).safe_then( + [this, ec, insertion_pt, lo, val=std::move(val)](auto extent) mutable { + return extent->at_max_capacity() ? 
+ split_entry(ec, lo, insertion_pt, extent) : + insert_ertr::make_ready_future<ExtMapNodeRef>(std::move(extent)); + }).safe_then([ec, lo, val=std::move(val)](ExtMapNodeRef extent) mutable { + return extent->insert(ec, lo, val); + }); +} + +ExtMapInnerNode::rm_lextent_ret +ExtMapInnerNode::rm_lextent(ext_context_t ec, objaddr_t lo, lext_map_val_t val) +{ + auto rm_pt = get_containing_child(lo); + return extmap_load_extent(ec, rm_pt->get_val(), get_meta().depth - 1).safe_then( + [this, ec, rm_pt, lo, val=std::move(val)](auto extent) mutable { + if (extent->at_min_capacity() && get_node_size() > 1) { + return merge_entry(ec, lo, rm_pt, extent); + } else { + return merge_entry_ertr::make_ready_future<ExtMapNodeRef>(std::move(extent)); + } + }).safe_then([ec, lo, val](ExtMapNodeRef extent) mutable { + return extent->rm_lextent(ec, lo, val); + }); +} + +ExtMapInnerNode::split_children_ret +ExtMapInnerNode::make_split_children(ext_context_t ec) +{ + logger().debug("{}: {}", "ExtMapInnerNode", __func__); + return extmap_alloc_2extents<ExtMapInnerNode>(ec, EXTMAP_BLOCK_SIZE) + .safe_then([this] (auto &&ext_pair) { + auto [left, right] = ext_pair; + return split_children_ret( + split_children_ertr::ready_future_marker{}, + std::make_tuple(left, right, split_into(*left, *right))); + }); +} + +ExtMapInnerNode::full_merge_ret +ExtMapInnerNode::make_full_merge(ext_context_t ec, ExtMapNodeRef right) +{ + logger().debug("{}: {}", "ExtMapInnerNode", __func__); + return extmap_alloc_extent<ExtMapInnerNode>(ec, EXTMAP_BLOCK_SIZE) + .safe_then([this, right] (auto &&replacement) { + replacement->merge_from(*this, *right->cast<ExtMapInnerNode>()); + return full_merge_ret( + full_merge_ertr::ready_future_marker{}, + std::move(replacement)); + }); +} + +ExtMapInnerNode::make_balanced_ret +ExtMapInnerNode::make_balanced(ext_context_t ec, ExtMapNodeRef _right, bool prefer_left) +{ + logger().debug("{}: {}", "ExtMapInnerNode", __func__); + ceph_assert(_right->get_type() == type); + return extmap_alloc_2extents<ExtMapInnerNode>(ec, EXTMAP_BLOCK_SIZE) + .safe_then([this, _right, prefer_left] (auto &&replacement_pair){ + auto [replacement_left, replacement_right] = replacement_pair; + auto &right = *_right->cast<ExtMapInnerNode>(); + return make_balanced_ret( + make_balanced_ertr::ready_future_marker{}, + std::make_tuple(replacement_left, replacement_right, + balance_into_new_nodes(*this, right, prefer_left, + *replacement_left, *replacement_right))); + }); +} + +ExtMapInnerNode::split_entry_ret +ExtMapInnerNode::split_entry(ext_context_t ec, objaddr_t lo, + internal_iterator_t iter, ExtMapNodeRef entry) +{ + logger().debug("{}: {}", "ExtMapInnerNode", __func__); + if (!is_pending()) { + auto mut = ec.tm.get_mutable_extent(ec.t, this)->cast<ExtMapInnerNode>(); + auto mut_iter = mut->iter_idx(iter->get_offset()); + return mut->split_entry(ec, lo, mut_iter, entry); + } + ceph_assert(!at_max_capacity()); + return entry->make_split_children(ec) + .safe_then([this, ec, lo, iter, entry] (auto tuple){ + auto [left, right, pivot] = tuple; + journal_update(iter, left->get_laddr(), maybe_get_delta_buffer()); + journal_insert(iter + 1, pivot, right->get_laddr(), maybe_get_delta_buffer()); + logger().debug( + "ExtMapInnerNode::split_entry *this {} entry {} into left {} right {}", + *this, *entry, *left, *right); + //retire extent + return ec.tm.dec_ref(ec.t, entry->get_laddr()) + .safe_then([lo, left = left, right = right, pivot = pivot] (auto ret) { + return split_entry_ertr::make_ready_future<ExtMapNodeRef>( + pivot > lo ? 
left : right); + }); + }); +} + +ExtMapInnerNode::merge_entry_ret +ExtMapInnerNode::merge_entry(ext_context_t ec, objaddr_t lo, + internal_iterator_t iter, ExtMapNodeRef entry) +{ + if (!is_pending()) { + auto mut = ec.tm.get_mutable_extent(ec.t, this)->cast<ExtMapInnerNode>(); + auto mut_iter = mut->iter_idx(iter->get_offset()); + return mut->merge_entry(ec, lo, mut_iter, entry); + } + logger().debug("ExtMapInnerNode: merge_entry: {}, {}", *this, *entry); + auto is_left = (iter + 1) == end(); + auto donor_iter = is_left ? iter - 1 : iter + 1; + return extmap_load_extent(ec, donor_iter->get_val(), get_meta().depth - 1) + .safe_then([this, ec, lo, iter, entry, donor_iter, is_left] + (auto &&donor) mutable { + auto [l, r] = is_left ? + std::make_pair(donor, entry) : std::make_pair(entry, donor); + auto [liter, riter] = is_left ? + std::make_pair(donor_iter, iter) : std::make_pair(iter, donor_iter); + if (donor->at_min_capacity()) { + return l->make_full_merge(ec, r) + .safe_then([this, ec, entry, l = l, r = r, liter = liter, riter = riter] + (auto &&replacement){ + journal_update(liter, replacement->get_laddr(), maybe_get_delta_buffer()); + journal_remove(riter, maybe_get_delta_buffer()); + //retire extent + std::list<laddr_t> dec_laddrs; + dec_laddrs.push_back(l->get_laddr()); + dec_laddrs.push_back(r->get_laddr()); + return extmap_retire_node(ec, dec_laddrs) + .safe_then([replacement] (auto &&ret) { + return merge_entry_ertr::make_ready_future<ExtMapNodeRef>(replacement); + }); + }); + } else { + logger().debug("ExtMapInnerNode::merge_entry balanced l {} r {}", + *l, *r); + return l->make_balanced(ec, r, !is_left) + .safe_then([this, ec, lo, entry, l = l, r = r, liter = liter, riter = riter] + (auto tuple) { + auto [replacement_l, replacement_r, pivot] = tuple; + journal_update(liter, replacement_l->get_laddr(), maybe_get_delta_buffer()); + journal_replace(riter, pivot, replacement_r->get_laddr(), + maybe_get_delta_buffer()); + // retire extent + std::list<laddr_t> dec_laddrs; + dec_laddrs.push_back(l->get_laddr()); + dec_laddrs.push_back(r->get_laddr()); + return extmap_retire_node(ec, dec_laddrs) + .safe_then([lo, pivot = pivot, replacement_l = replacement_l, replacement_r = replacement_r] + (auto &&ret) { + return merge_entry_ertr::make_ready_future<ExtMapNodeRef>( + lo >= pivot ? 
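A standalone sketch of the pivot routing used by split_entry() and merge_entry() above: after a child node is split (or rebalanced) around a pivot key, the operation continues in the left node for keys below the pivot and in the right node otherwise. A plain vector of keys stands in for an ExtMapNode here; this is an illustration of the routing rule only.

// Toy model of split-around-a-pivot and the left/right routing decision.
#include <cassert>
#include <cstdint>
#include <utility>
#include <vector>

using objaddr_t = uint32_t;
using node_t = std::vector<objaddr_t>;  // stand-in for an ExtMapNode's keys

// Split a full node in half; the pivot is the first key of the right node.
std::pair<std::pair<node_t, node_t>, objaddr_t> split(const node_t &full) {
  auto mid = full.begin() + full.size() / 2;
  return {{node_t(full.begin(), mid), node_t(mid, full.end())}, *mid};
}

int main() {
  node_t full{0, 100, 200, 300};
  auto [children, pivot] = split(full);
  auto &[left, right] = children;
  assert(pivot == 200);

  objaddr_t lo = 150;
  // Mirrors "pivot > lo ? left : right" in split_entry().
  const node_t &target = (pivot > lo) ? left : right;
  assert(&target == &left);

  lo = 250;
  // Mirrors "lo >= pivot ? replacement_r : replacement_l" in merge_entry().
  const node_t &target2 = (lo >= pivot) ? right : left;
  assert(&target2 == &right);
}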
replacement_r : replacement_l); + }); + }); + } + }); +} + + +ExtMapInnerNode::internal_iterator_t +ExtMapInnerNode::get_containing_child(objaddr_t lo) +{ + // TODO: binary search + for (auto i = begin(); i != end(); ++i) { + if (i.contains(lo)) + return i; + } + ceph_assert(0 == "invalid"); + return end(); +} + +std::ostream &ExtMapLeafNode::print_detail_l(std::ostream &out) const +{ + return out << ", size=" << get_size() + << ", depth=" << get_meta().depth; +} + +ExtMapLeafNode::find_lextent_ret +ExtMapLeafNode::find_lextent(ext_context_t ec, objaddr_t lo, extent_len_t len) +{ + logger().debug( + "ExtMapLeafNode::find_lextent {}~{}", lo, len); + auto ret = extent_map_list_t(); + auto [from, to] = get_leaf_entries(lo, len); + if (from == to && to != end()) + ++to; + for (; from != to; ++from) { + auto val = (*from).get_val(); + ret.emplace_back( + extent_mapping_t( + (*from).get_key(), + val.laddr, + val.length)); + logger().debug("ExtMapLeafNode::find_lextent find {}~{}", lo, val.laddr); + } + return find_lextent_ertr::make_ready_future<extent_map_list_t>( + std::move(ret)); +} + +ExtMapLeafNode::insert_ret +ExtMapLeafNode::insert(ext_context_t ec, objaddr_t lo, lext_map_val_t val) +{ + ceph_assert(!at_max_capacity()); + if (!is_pending()) { + auto mut = ec.tm.get_mutable_extent(ec.t, this)->cast<ExtMapLeafNode>(); + return mut->insert(ec, lo, val); + } + auto insert_pt = lower_bound(lo); + journal_insert(insert_pt, lo, val, maybe_get_delta_buffer()); + + logger().debug( + "ExtMapLeafNode::insert: inserted {}->{} {}", + insert_pt.get_key(), + insert_pt.get_val().laddr, + insert_pt.get_val().length); + return insert_ertr::make_ready_future<extent_mapping_t>( + extent_mapping_t(lo, val.laddr, val.length)); +} + +ExtMapLeafNode::rm_lextent_ret +ExtMapLeafNode::rm_lextent(ext_context_t ec, objaddr_t lo, lext_map_val_t val) +{ + if (!is_pending()) { + auto mut = ec.tm.get_mutable_extent(ec.t, this)->cast<ExtMapLeafNode>(); + return mut->rm_lextent(ec, lo, val); + } + + auto [rm_pt, rm_end] = get_leaf_entries(lo, val.length); + if (lo == rm_pt->get_key() && val.laddr == rm_pt->get_val().laddr + && val.length == rm_pt->get_val().length) { + journal_remove(rm_pt, maybe_get_delta_buffer()); + logger().debug( + "ExtMapLeafNode::rm_lextent: removed {}->{} {}", + rm_pt.get_key(), + rm_pt.get_val().laddr, + rm_pt.get_val().length); + return rm_lextent_ertr::make_ready_future<bool>(true); + } else { + return rm_lextent_ertr::make_ready_future<bool>(false); + } +} + +ExtMapLeafNode::split_children_ret +ExtMapLeafNode::make_split_children(ext_context_t ec) +{ + logger().debug("{}: {}", "ExtMapLeafNode", __func__); + return extmap_alloc_2extents<ExtMapLeafNode>(ec, EXTMAP_BLOCK_SIZE) + .safe_then([this] (auto &&ext_pair) { + auto [left, right] = ext_pair; + return split_children_ret( + split_children_ertr::ready_future_marker{}, + std::make_tuple(left, right, split_into(*left, *right))); + }); +} + +ExtMapLeafNode::full_merge_ret +ExtMapLeafNode::make_full_merge(ext_context_t ec, ExtMapNodeRef right) +{ + logger().debug("{}: {}", "ExtMapLeafNode", __func__); + return extmap_alloc_extent<ExtMapLeafNode>(ec, EXTMAP_BLOCK_SIZE) + .safe_then([this, right] (auto &&replacement) { + replacement->merge_from(*this, *right->cast<ExtMapLeafNode>()); + return full_merge_ret( + full_merge_ertr::ready_future_marker{}, + std::move(replacement)); + }); +} +ExtMapLeafNode::make_balanced_ret +ExtMapLeafNode::make_balanced(ext_context_t ec, ExtMapNodeRef _right, bool prefer_left) +{ + logger().debug("{}: {}", 
"ExtMapLeafNode", __func__); + ceph_assert(_right->get_type() == type); + return extmap_alloc_2extents<ExtMapLeafNode>(ec, EXTMAP_BLOCK_SIZE) + .safe_then([this, _right, prefer_left] (auto &&replacement_pair) { + auto [replacement_left, replacement_right] = replacement_pair; + auto &right = *_right->cast<ExtMapLeafNode>(); + return make_balanced_ret( + make_balanced_ertr::ready_future_marker{}, + std::make_tuple( + replacement_left, replacement_right, + balance_into_new_nodes( + *this, right, prefer_left, + *replacement_left, *replacement_right))); + }); +} + + +std::pair<ExtMapLeafNode::internal_iterator_t, ExtMapLeafNode::internal_iterator_t> +ExtMapLeafNode::get_leaf_entries(objaddr_t addr, extent_len_t len) +{ + return bound(addr, addr + len); +} + + +TransactionManager::read_extent_ertr::future<ExtMapNodeRef> +extmap_load_extent(ext_context_t ec, laddr_t laddr, depth_t depth) +{ + ceph_assert(depth > 0); + if (depth > 1) { + return ec.tm.read_extents<ExtMapInnerNode>(ec.t, laddr, EXTMAP_BLOCK_SIZE).safe_then( + [](auto&& extents) { + assert(extents.size() == 1); + [[maybe_unused]] auto [laddr, e] = extents.front(); + return TransactionManager::read_extent_ertr::make_ready_future<ExtMapNodeRef>(std::move(e)); + }); + } else { + return ec.tm.read_extents<ExtMapLeafNode>(ec.t, laddr, EXTMAP_BLOCK_SIZE).safe_then( + [](auto&& extents) { + assert(extents.size() == 1); + [[maybe_unused]] auto [laddr, e] = extents.front(); + return TransactionManager::read_extent_ertr::make_ready_future<ExtMapNodeRef>(std::move(e)); + }); + } +} + +} diff --git a/src/crimson/os/seastore/extentmap_manager/btree/extentmap_btree_node_impl.h b/src/crimson/os/seastore/extentmap_manager/btree/extentmap_btree_node_impl.h new file mode 100644 index 000000000..f5da8cdc2 --- /dev/null +++ b/src/crimson/os/seastore/extentmap_manager/btree/extentmap_btree_node_impl.h @@ -0,0 +1,281 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once +#include "include/buffer.h" + +#include "crimson/common/fixed_kv_node_layout.h" +#include "crimson/common/errorator.h" +#include "crimson/os/seastore/extentmap_manager.h" +#include "crimson/os/seastore/seastore_types.h" +#include "crimson/os/seastore/extentmap_manager/btree/extentmap_btree_node.h" + +namespace crimson::os::seastore::extentmap_manager { + +struct extmap_node_meta_le_t { + depth_le_t depth = init_les32(0); + + extmap_node_meta_le_t() = default; + extmap_node_meta_le_t(const extmap_node_meta_le_t &) = default; + explicit extmap_node_meta_le_t(const extmap_node_meta_t &val) + : depth(init_les32(val.depth)) {} + + operator extmap_node_meta_t() const { + return extmap_node_meta_t{ depth }; + } +}; + +/** + * ExtMapInnerNode + * + * Abstracts operations on and layout of internal nodes for the + * Extentmap Tree. + * + * Layout (4k): + * num_entries: uint32_t 4b + * meta : depth 4b + * (padding) : 8b + * keys : objaddr_t[340] (340*4)b + * values : laddr_t[340] (340*8)b + * = 4096 + */ +constexpr size_t INNER_NODE_CAPACITY = + (EXTMAP_BLOCK_SIZE - sizeof(uint32_t) - sizeof(extmap_node_meta_t)) + / (sizeof (objaddr_t) + sizeof(laddr_t)); + +struct ExtMapInnerNode + : ExtMapNode, + common::FixedKVNodeLayout< + INNER_NODE_CAPACITY, + extmap_node_meta_t, extmap_node_meta_le_t, + objaddr_t, ceph_le32, + laddr_t, laddr_le_t> { + using internal_iterator_t = const_iterator; + template <typename... T> + ExtMapInnerNode(T&&... 
t) : + ExtMapNode(std::forward<T>(t)...), + FixedKVNodeLayout(get_bptr().c_str()) {} + + static constexpr extent_types_t type = extent_types_t::EXTMAP_INNER; + + extmap_node_meta_t get_node_meta() const final {return get_meta();} + + CachedExtentRef duplicate_for_write() final { + assert(delta_buffer.empty()); + return CachedExtentRef(new ExtMapInnerNode(*this)); + }; + + delta_buffer_t delta_buffer; + delta_buffer_t *maybe_get_delta_buffer() { + return is_mutation_pending() ? &delta_buffer : nullptr; + } + + find_lextent_ret find_lextent(ext_context_t ec, objaddr_t lo, extent_len_t len) final; + + insert_ret insert(ext_context_t ec, objaddr_t lo, lext_map_val_t val) final; + + rm_lextent_ret rm_lextent(ext_context_t ec, objaddr_t lo, lext_map_val_t val) final; + + split_children_ret make_split_children(ext_context_t ec) final; + + full_merge_ret make_full_merge(ext_context_t ec, ExtMapNodeRef right) final; + + make_balanced_ret make_balanced(ext_context_t ec, ExtMapNodeRef _right, bool prefer_left) final; + + std::ostream &print_detail_l(std::ostream &out) const final; + + extent_types_t get_type() const final { + return type; + } + + ceph::bufferlist get_delta() final { + assert(!delta_buffer.empty()); + ceph::buffer::ptr bptr(delta_buffer.get_bytes()); + delta_buffer.copy_out(bptr.c_str(), bptr.length()); + ceph::bufferlist bl; + bl.push_back(bptr); + return bl; + } + + void apply_delta(const ceph::bufferlist &_bl) final { + assert(_bl.length()); + ceph::bufferlist bl = _bl; + bl.rebuild(); + delta_buffer_t buffer; + buffer.copy_in(bl.front().c_str(), bl.front().length()); + buffer.replay(*this); + } + + bool at_max_capacity() const final { + return get_size() == get_capacity(); + } + + bool at_min_capacity() const { + return get_size() == get_capacity() / 2; + } + + unsigned get_node_size() const { + return get_size(); + } + + /* get the iterator containing [l, r] + */ + std::pair<internal_iterator_t, internal_iterator_t> bound( + objaddr_t l, objaddr_t r) { + auto retl = begin(); + for (; retl != end(); ++retl) { + if (retl->get_next_key_or_max() > l) + break; + } + auto retr = retl; + for (; retr != end(); ++retr) { + if (retr->get_key() >= r) + break; + } + return {retl, retr}; + } + + using split_entry_ertr = TransactionManager::read_extent_ertr; + using split_entry_ret = split_entry_ertr::future<ExtMapNodeRef>; + split_entry_ret split_entry(ext_context_t ec, objaddr_t lo, + internal_iterator_t, ExtMapNodeRef entry); + using merge_entry_ertr = TransactionManager::read_extent_ertr; + using merge_entry_ret = merge_entry_ertr::future<ExtMapNodeRef>; + merge_entry_ret merge_entry(ext_context_t ec, objaddr_t lo, + internal_iterator_t iter, ExtMapNodeRef entry); + internal_iterator_t get_containing_child(objaddr_t lo); + +}; + +/** + * ExtMapLeafNode + * + * Abstracts operations on and layout of leaf nodes for the + * ExtentMap Tree. 
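A worked check of the INNER_NODE_CAPACITY formula above and of the leaf layout described just below, assuming the field sizes implied by the layout comments (4-byte num_entries and node meta, 4-byte objaddr_t, 8-byte laddr_t, 16-byte lext_map_val_t, 4096-byte blocks); these sizes are assumptions here, not taken from the patch.

// Compile-time check of the node capacity arithmetic.
#include <cstddef>
#include <cstdint>

constexpr std::size_t BLOCK = 4096;                    // EXTMAP_BLOCK_SIZE
constexpr std::size_t HEADER = sizeof(std::uint32_t)   // num_entries
                             + 4;                      // meta (depth), assumed 4 bytes
constexpr std::size_t INNER_CAP =
    (BLOCK - HEADER) / (4 /* objaddr_t */ + 8 /* laddr_t */);

static_assert(INNER_CAP == 340, "matches the objaddr_t[340] / laddr_t[340] layout");
// 8 + 340 * 12 = 4088, leaving the 8 bytes of padding noted in the layout comment.
static_assert(HEADER + INNER_CAP * (4 + 8) + 8 == BLOCK, "inner node fills one block");
// The leaf node defined next stores 16-byte lext_map_val_t values:
// (4096 - 8) / (4 + 16) = 204 entries, again with 8 bytes of padding.
static_assert((BLOCK - HEADER) / (4 + 16) == 204, "leaf capacity");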
+ * + * Layout (4k): + * num_entries: uint32_t 4b + * meta : depth 4b + * (padding) : 8b + * keys : objaddr_t[204] (204*4)b + * values : lext_map_val_t[204] (204*16)b + * = 4096 + */ +constexpr size_t LEAF_NODE_CAPACITY = + (EXTMAP_BLOCK_SIZE - sizeof(uint32_t) - sizeof(extmap_node_meta_t)) + / (sizeof(objaddr_t) + sizeof(lext_map_val_t)); + +struct lext_map_val_le_t { + laddr_le_t laddr; + extent_len_le_t length = init_extent_len_le_t(0); + + lext_map_val_le_t() = default; + lext_map_val_le_t(const lext_map_val_le_t &) = default; + explicit lext_map_val_le_t(const lext_map_val_t &val) + : laddr(laddr_le_t(val.laddr)), + length(init_extent_len_le_t(val.length)) {} + + operator lext_map_val_t() const { + return lext_map_val_t{laddr, length}; + } +}; + +struct ExtMapLeafNode + : ExtMapNode, + common::FixedKVNodeLayout< + LEAF_NODE_CAPACITY, + extmap_node_meta_t, extmap_node_meta_le_t, + objaddr_t, ceph_le32, + lext_map_val_t, lext_map_val_le_t> { + using internal_iterator_t = const_iterator; + template <typename... T> + ExtMapLeafNode(T&&... t) : + ExtMapNode(std::forward<T>(t)...), + FixedKVNodeLayout(get_bptr().c_str()) {} + + static constexpr extent_types_t type = extent_types_t::EXTMAP_LEAF; + + extmap_node_meta_t get_node_meta() const final { return get_meta(); } + + CachedExtentRef duplicate_for_write() final { + assert(delta_buffer.empty()); + return CachedExtentRef(new ExtMapLeafNode(*this)); + }; + + delta_buffer_t delta_buffer; + delta_buffer_t *maybe_get_delta_buffer() { + return is_mutation_pending() ? &delta_buffer : nullptr; + } + + find_lextent_ret find_lextent(ext_context_t ec, objaddr_t lo, extent_len_t len) final; + + insert_ret insert(ext_context_t ec, objaddr_t lo, lext_map_val_t val) final; + + rm_lextent_ret rm_lextent(ext_context_t ec, objaddr_t lo, lext_map_val_t val) final; + + split_children_ret make_split_children(ext_context_t ec) final; + + full_merge_ret make_full_merge(ext_context_t ec, ExtMapNodeRef right) final; + + make_balanced_ret make_balanced(ext_context_t ec, ExtMapNodeRef _right, bool prefer_left) final; + + extent_types_t get_type() const final { + return type; + } + + ceph::bufferlist get_delta() final { + assert(!delta_buffer.empty()); + ceph::buffer::ptr bptr(delta_buffer.get_bytes()); + delta_buffer.copy_out(bptr.c_str(), bptr.length()); + ceph::bufferlist bl; + bl.push_back(bptr); + return bl; + } + + void apply_delta(const ceph::bufferlist &_bl) final { + assert(_bl.length()); + ceph::bufferlist bl = _bl; + bl.rebuild(); + delta_buffer_t buffer; + buffer.copy_in(bl.front().c_str(), bl.front().length()); + buffer.replay(*this); + } + + std::ostream &print_detail_l(std::ostream &out) const final; + + bool at_max_capacity() const final { + return get_size() == get_capacity(); + } + + bool at_min_capacity() const final { + return get_size() == get_capacity() / 2; + } + + unsigned get_node_size() const { + return get_size(); + } + + /* get the iterator containing [l, r] + */ + std::pair<internal_iterator_t, internal_iterator_t> bound( + objaddr_t l, objaddr_t r) { + auto retl = begin(); + for (; retl != end(); ++retl) { + if (retl->get_key() >= l || (retl->get_key() + retl->get_val().length) > l) + break; + } + auto retr = retl; + for (; retr != end(); ++retr) { + if (retr->get_key() >= r) + break; + } + return {retl, retr}; + } + + std::pair<internal_iterator_t, internal_iterator_t> + get_leaf_entries(objaddr_t lo, extent_len_t len); + +}; +using ExtentMapLeafNodeRef = TCachedExtentRef<ExtMapLeafNode>; + +} diff --git 
a/src/crimson/os/seastore/journal.cc b/src/crimson/os/seastore/journal.cc new file mode 100644 index 000000000..39875fb56 --- /dev/null +++ b/src/crimson/os/seastore/journal.cc @@ -0,0 +1,756 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <iostream> + +#include <boost/iterator/counting_iterator.hpp> + +#include "crimson/os/seastore/journal.h" + +#include "include/intarith.h" +#include "crimson/os/seastore/segment_manager.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_filestore); + } +} + +namespace crimson::os::seastore { + +std::ostream &operator<<(std::ostream &out, const segment_header_t &header) +{ + return out << "segment_header_t(" + << "segment_seq=" << header.journal_segment_seq + << ", physical_segment_id=" << header.physical_segment_id + << ", journal_tail=" << header.journal_tail + << ", segment_nonce=" << header.segment_nonce + << ")"; +} + +segment_nonce_t generate_nonce( + segment_seq_t seq, + const seastore_meta_t &meta) +{ + return ceph_crc32c( + seq, + reinterpret_cast<const unsigned char *>(meta.seastore_id.bytes()), + sizeof(meta.seastore_id.uuid)); +} + +Journal::Journal(SegmentManager &segment_manager) + : block_size(segment_manager.get_block_size()), + max_record_length( + segment_manager.get_segment_size() - + p2align(ceph::encoded_sizeof_bounded<segment_header_t>(), + size_t(block_size))), + segment_manager(segment_manager) {} + + +Journal::initialize_segment_ertr::future<segment_seq_t> +Journal::initialize_segment(Segment &segment) +{ + auto new_tail = segment_provider->get_journal_tail_target(); + logger().debug( + "initialize_segment {} journal_tail_target {}", + segment.get_segment_id(), + new_tail); + // write out header + ceph_assert(segment.get_write_ptr() == 0); + bufferlist bl; + + segment_seq_t seq = next_journal_segment_seq++; + current_segment_nonce = generate_nonce( + seq, segment_manager.get_meta()); + auto header = segment_header_t{ + seq, + segment.get_segment_id(), + segment_provider->get_journal_tail_target(), + current_segment_nonce}; + encode(header, bl); + + bufferptr bp( + ceph::buffer::create_page_aligned( + segment_manager.get_block_size())); + bp.zero(); + auto iter = bl.cbegin(); + iter.copy(bl.length(), bp.c_str()); + bl.clear(); + bl.append(bp); + + written_to = segment_manager.get_block_size(); + committed_to = 0; + return segment.write(0, bl).safe_then( + [=] { + segment_provider->update_journal_tail_committed(new_tail); + return seq; + }, + initialize_segment_ertr::pass_further{}, + crimson::ct_error::assert_all{ "TODO" }); +} + +ceph::bufferlist Journal::encode_record( + record_size_t rsize, + record_t &&record) +{ + bufferlist data_bl; + for (auto &i: record.extents) { + data_bl.append(i.bl); + } + + bufferlist bl; + record_header_t header{ + rsize.mdlength, + rsize.dlength, + (uint32_t)record.deltas.size(), + (uint32_t)record.extents.size(), + current_segment_nonce, + committed_to, + data_bl.crc32c(-1) + }; + encode(header, bl); + + auto metadata_crc_filler = bl.append_hole(sizeof(uint32_t)); + + for (const auto &i: record.extents) { + encode(extent_info_t(i), bl); + } + for (const auto &i: record.deltas) { + encode(i, bl); + } + if (bl.length() % block_size != 0) { + bl.append_zero( + block_size - (bl.length() % block_size)); + } + ceph_assert(bl.length() == rsize.mdlength); + + + auto bliter = bl.cbegin(); + auto metadata_crc = bliter.crc32c( + ceph::encoded_sizeof_bounded<record_header_t>(), + -1); + bliter += 
sizeof(checksum_t); /* crc hole again */ + metadata_crc = bliter.crc32c( + bliter.get_remaining(), + metadata_crc); + ceph_le32 metadata_crc_le; + metadata_crc_le = metadata_crc; + metadata_crc_filler.copy_in( + sizeof(checksum_t), + reinterpret_cast<const char *>(&metadata_crc_le)); + + bl.claim_append(data_bl); + ceph_assert(bl.length() == (rsize.dlength + rsize.mdlength)); + + return bl; +} + +bool Journal::validate_metadata(const bufferlist &bl) +{ + auto bliter = bl.cbegin(); + auto test_crc = bliter.crc32c( + ceph::encoded_sizeof_bounded<record_header_t>(), + -1); + ceph_le32 recorded_crc_le; + ::decode(recorded_crc_le, bliter); + uint32_t recorded_crc = recorded_crc_le; + test_crc = bliter.crc32c( + bliter.get_remaining(), + test_crc); + return test_crc == recorded_crc; +} + +Journal::read_validate_data_ret Journal::read_validate_data( + paddr_t record_base, + const record_header_t &header) +{ + return segment_manager.read( + record_base.add_offset(header.mdlength), + header.dlength + ).safe_then([=, &header](auto bptr) { + bufferlist bl; + bl.append(bptr); + return bl.crc32c(-1) == header.data_crc; + }); +} + +Journal::write_record_ret Journal::write_record( + record_size_t rsize, + record_t &&record) +{ + ceph::bufferlist to_write = encode_record( + rsize, std::move(record)); + auto target = written_to; + assert((to_write.length() % block_size) == 0); + written_to += to_write.length(); + logger().debug( + "write_record, mdlength {}, dlength {}, target {}", + rsize.mdlength, + rsize.dlength, + target); + return current_journal_segment->write(target, to_write).handle_error( + write_record_ertr::pass_further{}, + crimson::ct_error::assert_all{ "TODO" }).safe_then([this, target] { + committed_to = target; + return write_record_ret( + write_record_ertr::ready_future_marker{}, + paddr_t{ + current_journal_segment->get_segment_id(), + target}); + }); +} + +Journal::record_size_t Journal::get_encoded_record_length( + const record_t &record) const { + extent_len_t metadata = + (extent_len_t)ceph::encoded_sizeof_bounded<record_header_t>(); + metadata += sizeof(checksum_t) /* crc */; + metadata += record.extents.size() * + ceph::encoded_sizeof_bounded<extent_info_t>(); + extent_len_t data = 0; + for (const auto &i: record.deltas) { + metadata += ceph::encoded_sizeof(i); + } + for (const auto &i: record.extents) { + data += i.bl.length(); + } + metadata = p2roundup(metadata, block_size); + return record_size_t{metadata, data}; +} + +bool Journal::needs_roll(segment_off_t length) const +{ + return length + written_to > + current_journal_segment->get_write_capacity(); +} + +Journal::roll_journal_segment_ertr::future<segment_seq_t> +Journal::roll_journal_segment() +{ + auto old_segment_id = current_journal_segment ? + current_journal_segment->get_segment_id() : + NULL_SEG_ID; + + return (current_journal_segment ? 
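A standalone sketch of the record sizing and segment-roll check implemented by get_encoded_record_length() and needs_roll() above: metadata (header, crc, per-extent info, deltas) is rounded up to a block boundary, and a new journal segment is needed when the whole record would overflow the current one. The concrete header and extent-info sizes below are made-up placeholders; the real code derives them from the bounded encoded sizes of record_header_t and extent_info_t.

// Toy model of record size accounting and needs_roll().
#include <cassert>
#include <cstdint>
#include <vector>

constexpr uint32_t block_size = 4096;

constexpr uint32_t p2roundup(uint32_t x, uint32_t align) {
  return (x + align - 1) / align * align;
}

struct record_size_t { uint32_t mdlength; uint32_t dlength; };

record_size_t get_encoded_record_length(
    const std::vector<uint32_t> &delta_sizes,     // encoded size of each delta
    const std::vector<uint32_t> &extent_sizes) {  // data length of each extent
  uint32_t md = 64 /* header size, placeholder */ + sizeof(uint32_t) /* crc */;
  md += uint32_t(extent_sizes.size()) * 16 /* extent_info_t size, placeholder */;
  for (auto d : delta_sizes) md += d;
  uint32_t data = 0;
  for (auto e : extent_sizes) data += e;
  return {p2roundup(md, block_size), data};
}

int main() {
  auto rs = get_encoded_record_length({100, 200}, {4096, 8192});
  assert(rs.mdlength == 4096 && rs.dlength == 12288);

  // needs_roll(): the whole record must fit in the current journal segment.
  uint32_t written_to = 1 << 20, segment_capacity = 1 << 20;
  bool needs_roll = rs.mdlength + rs.dlength + written_to > segment_capacity;
  assert(needs_roll);
}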
+ current_journal_segment->close() : + Segment::close_ertr::now()).safe_then([this] { + return segment_provider->get_segment(); + }).safe_then([this](auto segment) { + return segment_manager.open(segment); + }).safe_then([this](auto sref) { + current_journal_segment = sref; + written_to = 0; + return initialize_segment(*current_journal_segment); + }).safe_then([=](auto seq) { + if (old_segment_id != NULL_SEG_ID) { + segment_provider->close_segment(old_segment_id); + } + segment_provider->set_journal_segment( + current_journal_segment->get_segment_id(), + seq); + return seq; + }).handle_error( + roll_journal_segment_ertr::pass_further{}, + crimson::ct_error::all_same_way([] { ceph_assert(0 == "TODO"); }) + ); +} + +Journal::read_segment_header_ret +Journal::read_segment_header(segment_id_t segment) +{ + return segment_manager.read(paddr_t{segment, 0}, block_size + ).handle_error( + read_segment_header_ertr::pass_further{}, + crimson::ct_error::assert_all{} + ).safe_then([=](bufferptr bptr) -> read_segment_header_ret { + logger().debug("segment {} bptr size {}", segment, bptr.length()); + + segment_header_t header; + bufferlist bl; + bl.push_back(bptr); + + logger().debug( + "Journal::read_segment_header: segment {} block crc {}", + segment, + bl.begin().crc32c(block_size, 0)); + + auto bp = bl.cbegin(); + try { + decode(header, bp); + } catch (ceph::buffer::error &e) { + logger().debug( + "Journal::read_segment_header: segment {} unable to decode " + "header, skipping", + segment); + return crimson::ct_error::enodata::make(); + } + logger().debug( + "Journal::read_segment_header: segment {} header {}", + segment, + header); + return read_segment_header_ret( + read_segment_header_ertr::ready_future_marker{}, + header); + }); +} + +Journal::open_for_write_ret Journal::open_for_write() +{ + return roll_journal_segment().safe_then([this](auto seq) { + return open_for_write_ret( + open_for_write_ertr::ready_future_marker{}, + journal_seq_t{ + seq, + paddr_t{ + current_journal_segment->get_segment_id(), + static_cast<segment_off_t>(block_size)} + }); + }); +} + +Journal::find_replay_segments_fut Journal::find_replay_segments() +{ + return seastar::do_with( + std::vector<std::pair<segment_id_t, segment_header_t>>(), + [this](auto &&segments) mutable { + return crimson::do_for_each( + boost::make_counting_iterator(segment_id_t{0}), + boost::make_counting_iterator(segment_manager.get_num_segments()), + [this, &segments](auto i) { + return read_segment_header(i + ).safe_then([this, &segments, i](auto header) mutable { + if (generate_nonce( + header.journal_segment_seq, + segment_manager.get_meta()) != header.segment_nonce) { + logger().debug( + "find_replay_segments: nonce mismatch segment {} header {}", + i, + header); + assert(0 == "impossible"); + return find_replay_segments_ertr::now(); + } + + segments.emplace_back(i, std::move(header)); + return find_replay_segments_ertr::now(); + }).handle_error( + crimson::ct_error::enoent::handle([i](auto) { + logger().debug( + "find_replay_segments: segment {} not available for read", + i); + return find_replay_segments_ertr::now(); + }), + crimson::ct_error::enodata::handle([i](auto) { + logger().debug( + "find_replay_segments: segment {} header undecodable", + i); + return find_replay_segments_ertr::now(); + }), + find_replay_segments_ertr::pass_further{}, + crimson::ct_error::assert_all{} + ); + }).safe_then([this, &segments]() mutable -> find_replay_segments_fut { + logger().debug( + "find_replay_segments: have {} segments", + segments.size()); + if 
(segments.empty()) { + return crimson::ct_error::input_output_error::make(); + } + std::sort( + segments.begin(), + segments.end(), + [](const auto &lt, const auto &rt) { + return lt.second.journal_segment_seq < + rt.second.journal_segment_seq; + }); + + next_journal_segment_seq = + segments.rbegin()->second.journal_segment_seq + 1; + std::for_each( + segments.begin(), + segments.end(), + [this](auto &seg) { + segment_provider->init_mark_segment_closed( + seg.first, + seg.second.journal_segment_seq); + }); + + auto journal_tail = segments.rbegin()->second.journal_tail; + segment_provider->update_journal_tail_committed(journal_tail); + auto replay_from = journal_tail.offset; + logger().debug( + "Journal::find_replay_segments: journal_tail={}", + journal_tail); + auto from = segments.begin(); + if (replay_from != P_ADDR_NULL) { + from = std::find_if( + segments.begin(), + segments.end(), + [&replay_from](const auto &seg) -> bool { + return seg.first == replay_from.segment; + }); + if (from->second.journal_segment_seq != journal_tail.segment_seq) { + logger().error( + "find_replay_segments: journal_tail {} does not match {}", + journal_tail, + from->second); + assert(0 == "invalid"); + } + } else { + replay_from = paddr_t{from->first, (segment_off_t)block_size}; + } + auto ret = replay_segments_t(segments.end() - from); + std::transform( + from, segments.end(), ret.begin(), + [this](const auto &p) { + auto ret = journal_seq_t{ + p.second.journal_segment_seq, + paddr_t{p.first, (segment_off_t)block_size}}; + logger().debug( + "Journal::find_replay_segments: replaying from {}", + ret); + return std::make_pair(ret, p.second); + }); + ret[0].first.offset = replay_from; + return find_replay_segments_fut( + find_replay_segments_ertr::ready_future_marker{}, + std::move(ret)); + }); + }); +} + +Journal::read_validate_record_metadata_ret Journal::read_validate_record_metadata( + paddr_t start, + segment_nonce_t nonce) +{ + if (start.offset + block_size > (int64_t)segment_manager.get_segment_size()) { + return read_validate_record_metadata_ret( + read_validate_record_metadata_ertr::ready_future_marker{}, + std::nullopt); + } + return segment_manager.read(start, block_size + ).safe_then( + [=](bufferptr bptr) mutable + -> read_validate_record_metadata_ret { + logger().debug("read_validate_record_metadata: reading {}", start); + bufferlist bl; + bl.append(bptr); + auto bp = bl.cbegin(); + record_header_t header; + try { + decode(header, bp); + } catch (ceph::buffer::error &e) { + return read_validate_record_metadata_ret( + read_validate_record_metadata_ertr::ready_future_marker{}, + std::nullopt); + } + if (header.segment_nonce != nonce) { + return read_validate_record_metadata_ret( + read_validate_record_metadata_ertr::ready_future_marker{}, + std::nullopt); + } + if (header.mdlength > block_size) { + if (start.offset + header.mdlength > + (int64_t)segment_manager.get_segment_size()) { + return crimson::ct_error::input_output_error::make(); + } + return segment_manager.read( + {start.segment, start.offset + (segment_off_t)block_size}, + header.mdlength - block_size).safe_then( + [header=std::move(header), bl=std::move(bl)]( + auto &&bptail) mutable { + bl.push_back(bptail); + return read_validate_record_metadata_ret( + read_validate_record_metadata_ertr::ready_future_marker{}, + std::make_pair(std::move(header), std::move(bl))); + }); + } else { + return read_validate_record_metadata_ret( + read_validate_record_metadata_ertr::ready_future_marker{}, + std::make_pair(std::move(header), std::move(bl)) + ); +
} + }).safe_then([=](auto p) { + if (p && validate_metadata(p->second)) { + return read_validate_record_metadata_ret( + read_validate_record_metadata_ertr::ready_future_marker{}, + std::move(*p) + ); + } else { + return read_validate_record_metadata_ret( + read_validate_record_metadata_ertr::ready_future_marker{}, + std::nullopt); + } + }); +} + +std::optional<std::vector<delta_info_t>> Journal::try_decode_deltas( + record_header_t header, + const bufferlist &bl) +{ + auto bliter = bl.cbegin(); + bliter += ceph::encoded_sizeof_bounded<record_header_t>(); + bliter += sizeof(checksum_t) /* crc */; + bliter += header.extents * ceph::encoded_sizeof_bounded<extent_info_t>(); + logger().debug("{}: decoding {} deltas", __func__, header.deltas); + std::vector<delta_info_t> deltas(header.deltas); + for (auto &&i : deltas) { + try { + decode(i, bliter); + } catch (ceph::buffer::error &e) { + return std::nullopt; + } + } + return deltas; +} + +std::optional<std::vector<extent_info_t>> Journal::try_decode_extent_infos( + record_header_t header, + const bufferlist &bl) +{ + auto bliter = bl.cbegin(); + bliter += ceph::encoded_sizeof_bounded<record_header_t>(); + bliter += sizeof(checksum_t) /* crc */; + logger().debug("{}: decoding {} extents", __func__, header.extents); + std::vector<extent_info_t> extent_infos(header.extents); + for (auto &&i : extent_infos) { + try { + decode(i, bliter); + } catch (ceph::buffer::error &e) { + return std::nullopt; + } + } + return extent_infos; +} + +Journal::replay_ertr::future<> +Journal::replay_segment( + journal_seq_t seq, + segment_header_t header, + delta_handler_t &handler) +{ + logger().debug("replay_segment: starting at {}", seq); + return seastar::do_with( + scan_valid_records_cursor(seq.offset), + found_record_handler_t( + [=, &handler](paddr_t base, + const record_header_t &header, + const bufferlist &mdbuf) { + auto deltas = try_decode_deltas( + header, + mdbuf); + if (!deltas) { + // This should be impossible, we did check the crc on the mdbuf + logger().error( + "Journal::replay_segment unable to decode deltas for record {}", + base); + assert(deltas); + } + + return seastar::do_with( + std::move(*deltas), + [=](auto &deltas) { + return crimson::do_for_each( + deltas, + [=](auto &delta) { + /* The journal may validly contain deltas for extents in + * since released segments. We can detect those cases by + * checking whether the segment in question currently has a + * sequence number > the current journal segment seq. We can + * safetly skip these deltas because the extent must already + * have been rewritten. + * + * Note, this comparison exploits the fact that + * SEGMENT_SEQ_NULL is a large number. 
+ */ + if (delta.paddr != P_ADDR_NULL && + (segment_provider->get_seq(delta.paddr.segment) > + seq.segment_seq)) { + return replay_ertr::now(); + } else { + return handler( + journal_seq_t{seq.segment_seq, base}, + base.add_offset(header.mdlength), + delta); + } + }); + }); + }), + [=](auto &cursor, auto &dhandler) { + return scan_valid_records( + cursor, + header.segment_nonce, + std::numeric_limits<size_t>::max(), + dhandler).safe_then([](auto){}); + }); +} + +Journal::replay_ret Journal::replay(delta_handler_t &&delta_handler) +{ + return seastar::do_with( + std::move(delta_handler), replay_segments_t(), + [this](auto &handler, auto &segments) mutable -> replay_ret { + return find_replay_segments().safe_then( + [this, &handler, &segments](auto replay_segs) mutable { + logger().debug("replay: found {} segments", replay_segs.size()); + segments = std::move(replay_segs); + return crimson::do_for_each(segments, [this, &handler](auto i) mutable { + return replay_segment(i.first, i.second, handler); + }); + }); + }); +} + +Journal::scan_extents_ret Journal::scan_extents( + scan_extents_cursor &cursor, + extent_len_t bytes_to_read) +{ + auto ret = std::make_unique<scan_extents_ret_bare>(); + auto &retref = *ret; + return read_segment_header(cursor.get_offset().segment + ).handle_error( + scan_extents_ertr::pass_further{}, + crimson::ct_error::assert_all{} + ).safe_then([&](auto segment_header) { + auto segment_nonce = segment_header.segment_nonce; + return seastar::do_with( + found_record_handler_t( + [&]( + paddr_t base, + const record_header_t &header, + const bufferlist &mdbuf) mutable { + + auto infos = try_decode_extent_infos( + header, + mdbuf); + if (!infos) { + // This should be impossible, we did check the crc on the mdbuf + logger().error( + "Journal::scan_extents unable to decode extents for record {}", + base); + assert(infos); + } + + paddr_t extent_offset = base.add_offset(header.mdlength); + for (const auto &i : *infos) { + retref.emplace_back(extent_offset, i); + extent_offset.offset += i.len; + } + return scan_extents_ertr::now(); + }), + [=, &cursor](auto &dhandler) { + return scan_valid_records( + cursor, + segment_nonce, + std::numeric_limits<size_t>::max(), + dhandler).safe_then([](auto){}); + }); + }).safe_then([ret=std::move(ret)] { + return std::move(*ret); + }); +} + +Journal::scan_valid_records_ret Journal::scan_valid_records( + scan_valid_records_cursor &cursor, + segment_nonce_t nonce, + size_t budget, + found_record_handler_t &handler) +{ + if (cursor.offset.offset == 0) { + cursor.offset.offset = block_size; + } + auto retref = std::make_unique<size_t>(0); + auto budget_used = *retref; + return crimson::do_until( + [=, &cursor, &budget_used, &handler]() mutable + -> scan_valid_records_ertr::future<bool> { + return [=, &handler, &cursor, &budget_used] { + if (!cursor.last_valid_header_found) { + return read_validate_record_metadata(cursor.offset, nonce + ).safe_then([=, &cursor](auto md) { + logger().debug( + "Journal::scan_valid_records: read complete {}", + cursor.offset); + if (!md) { + logger().debug( + "Journal::scan_valid_records: found invalid header at {}, presumably at end", + cursor.offset); + cursor.last_valid_header_found = true; + return scan_valid_records_ertr::now(); + } else { + logger().debug( + "Journal::scan_valid_records: valid record read at {}", + cursor.offset); + cursor.last_committed = paddr_t{ + cursor.offset.segment, + md->first.committed_to}; + cursor.pending_records.emplace_back( + cursor.offset, + md->first, + md->second); + 
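// Note on the flow here: the statement below advances the cursor past this
// record (mdlength and dlength are both block aligned), and the chained
// continuation then drains pending_records only up to cursor.last_committed;
// a record beyond that watermark stays queued until a later header (or the
// final data-validation pass) confirms it.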
cursor.offset.offset += + md->first.dlength + md->first.mdlength; + return scan_valid_records_ertr::now(); + } + }).safe_then([=, &cursor, &budget_used, &handler] { + return crimson::do_until( + [=, &budget_used, &cursor, &handler] { + logger().debug( + "Journal::scan_valid_records: valid record read, processing queue"); + if (cursor.pending_records.empty()) { + /* This is only possible if the segment is empty. + * A record's last_committed must be prior to its own + * location since it itself cannot yet have been committed + * at its own time of submission. Thus, the most recently + * read record must always fall after cursor.last_committed */ + return scan_valid_records_ertr::make_ready_future<bool>(true); + } + auto &next = cursor.pending_records.front(); + if (next.offset > cursor.last_committed) { + return scan_valid_records_ertr::make_ready_future<bool>(true); + } + budget_used += + next.header.dlength + next.header.mdlength; + return handler( + next.offset, + next.header, + next.mdbuffer + ).safe_then([&cursor] { + cursor.pending_records.pop_front(); + return scan_valid_records_ertr::make_ready_future<bool>(false); + }); + }); + }); + } else { + assert(!cursor.pending_records.empty()); + auto &next = cursor.pending_records.front(); + return read_validate_data(next.offset, next.header + ).safe_then([=, &budget_used, &next, &cursor, &handler](auto valid) { + if (!valid) { + cursor.pending_records.clear(); + return scan_valid_records_ertr::now(); + } + budget_used += + next.header.dlength + next.header.mdlength; + return handler( + next.offset, + next.header, + next.mdbuffer + ).safe_then([&cursor] { + cursor.pending_records.pop_front(); + return scan_valid_records_ertr::now(); + }); + }); + } + }().safe_then([=, &budget_used, &cursor] { + return scan_valid_records_ertr::make_ready_future<bool>( + cursor.is_complete() || budget_used >= budget); + }); + }).safe_then([retref=std::move(retref)]() mutable -> scan_valid_records_ret { + return scan_valid_records_ret( + scan_valid_records_ertr::ready_future_marker{}, + std::move(*retref)); + }); +} + + +} diff --git a/src/crimson/os/seastore/journal.h b/src/crimson/os/seastore/journal.h new file mode 100644 index 000000000..7424d78b3 --- /dev/null +++ b/src/crimson/os/seastore/journal.h @@ -0,0 +1,405 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "crimson/common/log.h" + +#include <boost/intrusive_ptr.hpp> + +#include <seastar/core/future.hh> + +#include "include/ceph_assert.h" +#include "include/buffer.h" +#include "include/denc.h" + +#include "crimson/os/seastore/segment_manager.h" +#include "crimson/os/seastore/seastore_types.h" +#include "crimson/osd/exceptions.h" + +namespace crimson::os::seastore { + +using segment_nonce_t = uint32_t; + + +/** + * Segment header + * + * Every segment contains an encoded segment_header_t in the first block.
+ * Our strategy for finding the journal replay point is: + * 1) Find the segment with the highest journal_segment_seq + * 2) Replay starting at record located at that segment's journal_tail + */ +struct segment_header_t { + segment_seq_t journal_segment_seq; + segment_id_t physical_segment_id; // debugging + + journal_seq_t journal_tail; + segment_nonce_t segment_nonce; + + DENC(segment_header_t, v, p) { + DENC_START(1, 1, p); + denc(v.journal_segment_seq, p); + denc(v.physical_segment_id, p); + denc(v.journal_tail, p); + denc(v.segment_nonce, p); + DENC_FINISH(p); + } +}; +std::ostream &operator<<(std::ostream &out, const segment_header_t &header); + +struct record_header_t { + // Fixed portion + extent_len_t mdlength; // block aligned, length of metadata + extent_len_t dlength; // block aligned, length of data + uint32_t deltas; // number of deltas + uint32_t extents; // number of extents + segment_nonce_t segment_nonce;// nonce of containing segment + segment_off_t committed_to; // records in this segment prior to committed_to + // have been fully written + checksum_t data_crc; // crc of data payload + + + DENC(record_header_t, v, p) { + DENC_START(1, 1, p); + denc(v.mdlength, p); + denc(v.dlength, p); + denc(v.deltas, p); + denc(v.extents, p); + denc(v.segment_nonce, p); + denc(v.committed_to, p); + denc(v.data_crc, p); + DENC_FINISH(p); + } +}; + +struct extent_info_t { + extent_types_t type = extent_types_t::NONE; + laddr_t addr = L_ADDR_NULL; + extent_len_t len = 0; + + extent_info_t() = default; + extent_info_t(const extent_t &et) + : type(et.type), addr(et.addr), len(et.bl.length()) {} + + DENC(extent_info_t, v, p) { + DENC_START(1, 1, p); + denc(v.type, p); + denc(v.addr, p); + denc(v.len, p); + DENC_FINISH(p); + } +}; + +/** + * Callback interface for managing available segments + */ +class JournalSegmentProvider { +public: + using get_segment_ertr = crimson::errorator< + crimson::ct_error::input_output_error>; + using get_segment_ret = get_segment_ertr::future<segment_id_t>; + virtual get_segment_ret get_segment() = 0; + + virtual void close_segment(segment_id_t) {} + + virtual void set_journal_segment( + segment_id_t segment, + segment_seq_t seq) {} + + virtual journal_seq_t get_journal_tail_target() const = 0; + virtual void update_journal_tail_committed(journal_seq_t tail_committed) = 0; + + virtual void init_mark_segment_closed( + segment_id_t segment, segment_seq_t seq) {} + + virtual segment_seq_t get_seq(segment_id_t id) { return 0; } + + virtual ~JournalSegmentProvider() {} +}; + +/** + * Manages stream of atomically written records to a SegmentManager. + */ +class Journal { +public: + Journal(SegmentManager &segment_manager); + + /** + * Sets the JournalSegmentProvider. + * + * Not provided in constructor to allow the provider to not own + * or construct the Journal (TransactionManager). + * + * Note, Journal does not own this ptr, user must ensure that + * *provider outlives Journal. + */ + void set_segment_provider(JournalSegmentProvider *provider) { + segment_provider = provider; + } + + /** + * initializes journal for new writes -- must run prior to calls + * to submit_record. Should be called after replay if not a new + * Journal. 
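To make the intended call order concrete, a minimal usage sketch follows (illustrative only, not part of the diff; it assumes a SegmentManager sm and a JournalSegmentProvider provider supplied by the owner, in practice the TransactionManager, and it elides error handling and future sequencing):

  Journal journal(sm);
  journal.set_segment_provider(&provider);
  // 1) On an existing store, replay committed records first.
  auto replay_fut = journal.replay(
    [](journal_seq_t seq, paddr_t record_block_base, const delta_info_t &delta) {
      // apply `delta` to the extent it describes
      return Journal::replay_ertr::now();
    });
  // 2) Once replay has completed, open the journal for new writes...
  auto open_fut = journal.open_for_write();  // resolves to the starting journal_seq_t
  // 3) ...and submit records; the future resolves to the paddr of the first
  //    block past the record metadata plus the record's journal_seq_t.
  record_t record;  // filled in with deltas/extents by the caller
  auto submit_fut = journal.submit_record(std::move(record));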
+ */ + using open_for_write_ertr = crimson::errorator< + crimson::ct_error::input_output_error + >; + using open_for_write_ret = open_for_write_ertr::future<journal_seq_t>; + open_for_write_ret open_for_write(); + + /** + * close journal + * + * TODO: should probably flush and disallow further writes + */ + using close_ertr = crimson::errorator< + crimson::ct_error::input_output_error>; + close_ertr::future<> close() { return close_ertr::now(); } + + /** + * submit_record + * + * @param record record to write; resolves to offset of first block and seq + */ + using submit_record_ertr = crimson::errorator< + crimson::ct_error::erange, + crimson::ct_error::input_output_error + >; + using submit_record_ret = submit_record_ertr::future< + std::pair<paddr_t, journal_seq_t> + >; + submit_record_ret submit_record(record_t &&record) { + auto rsize = get_encoded_record_length(record); + auto total = rsize.mdlength + rsize.dlength; + if (total > max_record_length) { + return crimson::ct_error::erange::make(); + } + auto roll = needs_roll(total) + ? roll_journal_segment().safe_then([](auto){}) + : roll_journal_segment_ertr::now(); + return roll.safe_then( + [this, rsize, record=std::move(record)]() mutable { + return write_record(rsize, std::move(record) + ).safe_then([this, rsize](auto addr) { + return std::make_pair( + addr.add_offset(rsize.mdlength), + get_journal_seq(addr)); + }); + }); + } + + /** + * Read deltas and pass to delta_handler + * + * record_block_start (argument to delta_handler) is the start + * of the first block in the record + */ + using replay_ertr = SegmentManager::read_ertr; + using replay_ret = replay_ertr::future<>; + using delta_handler_t = std::function< + replay_ret(journal_seq_t seq, + paddr_t record_block_base, + const delta_info_t&)>; + replay_ret replay(delta_handler_t &&delta_handler); + + /** + * scan_extents + * + * Scans records beginning at addr until the first record boundary after + * addr + bytes_to_read. + * + * Returns list<extent, extent_info> + * cursor.is_complete() will be true when no further extents exist in segment. + */ + class scan_valid_records_cursor; + using scan_extents_cursor = scan_valid_records_cursor; + using scan_extents_ertr = SegmentManager::read_ertr; + using scan_extents_ret_bare = std::list<std::pair<paddr_t, extent_info_t>>; + using scan_extents_ret = scan_extents_ertr::future<scan_extents_ret_bare>; + scan_extents_ret scan_extents( + scan_extents_cursor &cursor, + extent_len_t bytes_to_read + ); + + +private: + const extent_len_t block_size; + const extent_len_t max_record_length; + + JournalSegmentProvider *segment_provider = nullptr; + SegmentManager &segment_manager; + + segment_seq_t next_journal_segment_seq = 0; + segment_nonce_t current_segment_nonce = 0; + + SegmentRef current_journal_segment; + segment_off_t written_to = 0; + segment_off_t committed_to = 0; + + journal_seq_t get_journal_seq(paddr_t addr) { + return journal_seq_t{next_journal_segment_seq-1, addr}; + } + + /// prepare segment for writes, writes out segment header + using initialize_segment_ertr = crimson::errorator< + crimson::ct_error::input_output_error>; + initialize_segment_ertr::future<segment_seq_t> initialize_segment( + Segment &segment); + + struct record_size_t { + extent_len_t mdlength = 0; + extent_len_t dlength = 0; + + record_size_t( + extent_len_t mdlength, + extent_len_t dlength) + : mdlength(mdlength), dlength(dlength) {} + }; + + /** + * Return <mdlength, dlength> pair denoting length of + * metadata and blocks respectively.
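As a concrete illustration of this computation, a self-contained sketch follows (the encoded sizes and the 4096-byte block are assumptions for the example, not values taken from the code):

  #include <cstdint>
  #include <vector>

  // Assumed example constants -- not the real encoded sizes.
  constexpr uint32_t block_size = 4096;       // segment block size
  constexpr uint32_t header_size = 44;        // encoded record_header_t (assumed)
  constexpr uint32_t crc_size = 4;            // checksum_t
  constexpr uint32_t extent_info_size = 14;   // encoded extent_info_t (assumed)

  constexpr uint32_t p2roundup(uint32_t v, uint32_t align) {
    return (v + align - 1) / align * align;
  }

  struct sketch_record_size_t { uint32_t mdlength; uint32_t dlength; };

  // Mirrors get_encoded_record_length: metadata is header + crc + extent infos
  // + encoded deltas, rounded up to a block; data is the sum of extent lengths,
  // summed as-is (write_record asserts the total is block aligned).
  sketch_record_size_t sketch_encoded_length(
      const std::vector<uint32_t> &delta_sizes,     // encoded size of each delta
      const std::vector<uint32_t> &extent_lengths)  // data length of each extent
  {
    uint32_t metadata =
      header_size + crc_size +
      extent_info_size * static_cast<uint32_t>(extent_lengths.size());
    for (auto d : delta_sizes) metadata += d;
    uint32_t data = 0;
    for (auto e : extent_lengths) data += e;
    return {p2roundup(metadata, block_size), data};
  }

  // e.g. sketch_encoded_length({100, 100, 100}, {4096, 8192})
  //   -> {mdlength = 4096, dlength = 12288}: one metadata block, three data blocks.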
+ */ + record_size_t get_encoded_record_length( + const record_t &record) const; + + /// create encoded record bl + ceph::bufferlist encode_record( + record_size_t rsize, + record_t &&record); + + /// validate embedded metadata checksum + static bool validate_metadata(const bufferlist &bl); + + /// read and validate data + using read_validate_data_ertr = SegmentManager::read_ertr; + using read_validate_data_ret = read_validate_data_ertr::future<bool>; + read_validate_data_ret read_validate_data( + paddr_t record_base, + const record_header_t &header ///< caller must ensure lifetime through + /// future resolution + ); + + + /// do record write + using write_record_ertr = crimson::errorator< + crimson::ct_error::input_output_error>; + using write_record_ret = write_record_ertr::future<paddr_t>; + write_record_ret write_record( + record_size_t rsize, + record_t &&record); + + /// close current segment and initialize next one + using roll_journal_segment_ertr = crimson::errorator< + crimson::ct_error::input_output_error>; + roll_journal_segment_ertr::future<segment_seq_t> roll_journal_segment(); + + /// returns true iff current segment has insufficient space + bool needs_roll(segment_off_t length) const; + + using read_segment_header_ertr = crimson::errorator< + crimson::ct_error::enoent, + crimson::ct_error::enodata, + crimson::ct_error::input_output_error + >; + using read_segment_header_ret = read_segment_header_ertr::future< + segment_header_t>; + read_segment_header_ret read_segment_header(segment_id_t segment); + + /// return ordered vector of segments to replay + using replay_segments_t = std::vector< + std::pair<journal_seq_t, segment_header_t>>; + using find_replay_segments_ertr = crimson::errorator< + crimson::ct_error::input_output_error + >; + using find_replay_segments_fut = find_replay_segments_ertr::future< + replay_segments_t>; + find_replay_segments_fut find_replay_segments(); + + /// attempts to decode deltas from bl, return nullopt if unsuccessful + std::optional<std::vector<delta_info_t>> try_decode_deltas( + record_header_t header, + const bufferlist &bl); + + /// attempts to decode extent infos from bl, return nullopt if unsuccessful + std::optional<std::vector<extent_info_t>> try_decode_extent_infos( + record_header_t header, + const bufferlist &bl); + + /// read record metadata for record starting at start + using read_validate_record_metadata_ertr = replay_ertr; + using read_validate_record_metadata_ret = + read_validate_record_metadata_ertr::future< + std::optional<std::pair<record_header_t, bufferlist>> + >; + read_validate_record_metadata_ret read_validate_record_metadata( + paddr_t start, + segment_nonce_t nonce); + +public: + /// scan segment for end incrementally + struct scan_valid_records_cursor { + bool last_valid_header_found = false; + paddr_t offset; + paddr_t last_committed; + + struct found_record_t { + paddr_t offset; + record_header_t header; + bufferlist mdbuffer; + + found_record_t( + paddr_t offset, + const record_header_t &header, + const bufferlist &mdbuffer) + : offset(offset), header(header), mdbuffer(mdbuffer) {} + }; + std::deque<found_record_t> pending_records; + + bool is_complete() const { + return last_valid_header_found && pending_records.empty(); + } + + paddr_t get_offset() const { + return offset; + } + + scan_valid_records_cursor( + paddr_t offset) + : offset(offset) {} + }; +private: + + using scan_valid_records_ertr = SegmentManager::read_ertr; + using scan_valid_records_ret = scan_valid_records_ertr::future< + size_t>; + using 
found_record_handler_t = std::function< + scan_valid_records_ertr::future<>( + paddr_t record_block_base, + // callee may assume header and bl will remain valid until + // returned future resolves + const record_header_t &header, + const bufferlist &bl)>; + scan_valid_records_ret scan_valid_records( + scan_valid_records_cursor &cursor, ///< [in, out] cursor, updated during call + segment_nonce_t nonce, ///< [in] nonce for segment + size_t budget, ///< [in] max budget to use + found_record_handler_t &handler ///< [in] handler for records + ); ///< @return used budget + + /// replays records starting at start through end of segment + replay_ertr::future<> + replay_segment( + journal_seq_t start, ///< [in] starting addr, seq + segment_header_t header, ///< [in] segment header + delta_handler_t &delta_handler ///< [in] processes deltas in order + ); + +}; + +} +WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::segment_header_t) +WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::record_header_t) +WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::extent_info_t) diff --git a/src/crimson/os/seastore/lba_manager.cc b/src/crimson/os/seastore/lba_manager.cc new file mode 100644 index 000000000..73411dcf7 --- /dev/null +++ b/src/crimson/os/seastore/lba_manager.cc @@ -0,0 +1,17 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "crimson/os/seastore/segment_manager.h" +#include "crimson/os/seastore/cache.h" +#include "crimson/os/seastore/lba_manager.h" +#include "crimson/os/seastore/lba_manager/btree/btree_lba_manager.h" + +namespace crimson::os::seastore::lba_manager { + +LBAManagerRef create_lba_manager( + SegmentManager &segment_manager, + Cache &cache) { + return LBAManagerRef(new btree::BtreeLBAManager(segment_manager, cache)); +} + +} diff --git a/src/crimson/os/seastore/lba_manager.h b/src/crimson/os/seastore/lba_manager.h new file mode 100644 index 000000000..ad90f4c4f --- /dev/null +++ b/src/crimson/os/seastore/lba_manager.h @@ -0,0 +1,207 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <iostream> + +#include <boost/intrusive_ptr.hpp> +#include <boost/smart_ptr/intrusive_ref_counter.hpp> + +#include <seastar/core/future.hh> + +#include "include/ceph_assert.h" +#include "include/buffer_fwd.h" +#include "include/interval_set.h" +#include "common/interval_map.h" + +#include "crimson/osd/exceptions.h" + +#include "crimson/os/seastore/cache.h" +#include "crimson/os/seastore/seastore_types.h" +#include "crimson/os/seastore/segment_manager.h" + +namespace crimson::os::seastore { + +/** + * Abstract interface for managing the logical to physical mapping + */ +class LBAManager { +public: + using mkfs_ertr = crimson::errorator< + crimson::ct_error::input_output_error>; + using mkfs_ret = mkfs_ertr::future<>; + virtual mkfs_ret mkfs( + Transaction &t + ) = 0; + + /** + * Fetches mappings for laddr_t in range [offset, offset + len) + * + * Future will not resolve until all pins have resolved (set_paddr called) + */ + using get_mapping_ertr = crimson::errorator< + crimson::ct_error::input_output_error>; + using get_mapping_ret = get_mapping_ertr::future<lba_pin_list_t>; + virtual get_mapping_ret get_mapping( + Transaction &t, + laddr_t offset, extent_len_t length) = 0; + + /** + * Fetches mappings for laddr_t in range [offset, offset + len) + * + * Future will not result until all pins have resolved (set_paddr called) + */ + using get_mappings_ertr = 
crimson::errorator< + crimson::ct_error::input_output_error>; + using get_mappings_ret = get_mapping_ertr::future<lba_pin_list_t>; + virtual get_mappings_ret get_mappings( + Transaction &t, + laddr_list_t &&extent_lisk) = 0; + + /** + * Allocates a new mapping referenced by LBARef + * + * Offset will be relative to the block offset of the record + * This mapping will block from transaction submission until set_paddr + * is called on the LBAPin. + */ + using alloc_extent_ertr = crimson::errorator< + crimson::ct_error::input_output_error>; + using alloc_extent_ret = alloc_extent_ertr::future<LBAPinRef>; + virtual alloc_extent_ret alloc_extent( + Transaction &t, + laddr_t hint, + extent_len_t len, + paddr_t addr) = 0; + + /** + * Creates a new absolute mapping. + * + * off~len must be unreferenced + */ + using set_extent_ertr = crimson::errorator< + crimson::ct_error::input_output_error, + crimson::ct_error::invarg>; + using set_extent_ret = set_extent_ertr::future<LBAPinRef>; + virtual set_extent_ret set_extent( + Transaction &t, + laddr_t off, extent_len_t len, paddr_t addr) = 0; + + + struct ref_update_result_t { + unsigned refcount = 0; + paddr_t addr; + }; + using ref_ertr = crimson::errorator< + crimson::ct_error::enoent, + crimson::ct_error::input_output_error>; + using ref_ret = ref_ertr::future<ref_update_result_t>; + + /** + * Decrements ref count on extent + * + * @return returns resulting refcount + */ + virtual ref_ret decref_extent( + Transaction &t, + laddr_t addr) = 0; + + /** + * Increments ref count on extent + * + * @return returns resulting refcount + */ + virtual ref_ret incref_extent( + Transaction &t, + laddr_t addr) = 0; + + using complete_transaction_ertr = crimson::errorator< + crimson::ct_error::input_output_error>; + using complete_transaction_ret = complete_transaction_ertr::future<>; + virtual complete_transaction_ret complete_transaction( + Transaction &t) = 0; + + /** + * Should be called after replay on each cached extent. + * Implementation must initialize the LBAPin on any + * LogicalCachedExtent's and may also read in any dependent + * structures, etc. 
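Stepping back, a sketch of how a caller drives this interface within a single transaction (illustrative only; lba_manager, t, and the concrete addresses and lengths are assumptions, and error handling plus future sequencing are elided):

  // Assumes: LBAManager &lba_manager; Transaction &t.
  laddr_t hint = 0;
  extent_len_t len = 4096;
  paddr_t where = make_record_relative_paddr(0);  // record relative, fixed at commit

  // Reserve a logical range and map it to the (still relative) physical address.
  auto alloc_fut = lba_manager.alloc_extent(t, hint, len, where);

  // Later lookups and refcount changes go through the same transaction.
  auto map_fut = lba_manager.get_mapping(t, /*offset=*/0, len);
  auto inc_fut = lba_manager.incref_extent(t, /*addr=*/0);
  auto dec_fut = lba_manager.decref_extent(t, /*addr=*/0);

  // After the transaction commits, let the manager update its internal state.
  auto done_fut = lba_manager.complete_transaction(t);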
+ */ + using init_cached_extent_ertr = crimson::errorator< + crimson::ct_error::input_output_error>; + using init_cached_extent_ret = init_cached_extent_ertr::future<>; + virtual init_cached_extent_ret init_cached_extent( + Transaction &t, + CachedExtentRef e) = 0; + + /** + * Calls f for each mapping in [begin, end) + */ + using scan_mappings_ertr = SegmentManager::read_ertr; + using scan_mappings_ret = scan_mappings_ertr::future<>; + using scan_mappings_func_t = std::function< + void(laddr_t, paddr_t, extent_len_t)>; + virtual scan_mappings_ret scan_mappings( + Transaction &t, + laddr_t begin, + laddr_t end, + scan_mappings_func_t &&f) = 0; + + /** + * Calls f for each mapped space usage in [begin, end) + */ + using scan_mapped_space_ertr = SegmentManager::read_ertr; + using scan_mapped_space_ret = scan_mapped_space_ertr::future<>; + using scan_mapped_space_func_t = std::function< + void(paddr_t, extent_len_t)>; + virtual scan_mapped_space_ret scan_mapped_space( + Transaction &t, + scan_mapped_space_func_t &&f) = 0; + + /** + * rewrite_extent + * + * rewrite extent into passed transaction + */ + using rewrite_extent_ertr = crimson::errorator< + crimson::ct_error::input_output_error>; + using rewrite_extent_ret = rewrite_extent_ertr::future<>; + virtual rewrite_extent_ret rewrite_extent( + Transaction &t, + CachedExtentRef extent) = 0; + + /** + * get_physical_extent_if_live + * + * Returns extent at addr/laddr if still live (if laddr + * still points at addr). Extent must be an internal, physical + * extent. + * + * Returns a null CachedExtentRef if extent is not live. + */ + using get_physical_extent_if_live_ertr = crimson::errorator< + crimson::ct_error::input_output_error>; + using get_physical_extent_if_live_ret = + get_physical_extent_if_live_ertr::future<CachedExtentRef>; + virtual get_physical_extent_if_live_ret get_physical_extent_if_live( + Transaction &t, + extent_types_t type, + paddr_t addr, + laddr_t laddr, + segment_off_t len) = 0; + + virtual void add_pin(LBAPin &pin) = 0; + + virtual ~LBAManager() {} +}; +using LBAManagerRef = std::unique_ptr<LBAManager>; + +class Cache; +namespace lba_manager { +LBAManagerRef create_lba_manager( + SegmentManager &segment_manager, + Cache &cache); +} + +} diff --git a/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc new file mode 100644 index 000000000..a837ae37e --- /dev/null +++ b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc @@ -0,0 +1,580 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <sys/mman.h> +#include <string.h> + +#include "crimson/common/log.h" + +#include "include/buffer.h" +#include "crimson/os/seastore/lba_manager/btree/btree_lba_manager.h" +#include "crimson/os/seastore/lba_manager/btree/lba_btree_node_impl.h" + + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_filestore); + } +} + +namespace crimson::os::seastore::lba_manager::btree { + +BtreeLBAManager::mkfs_ret BtreeLBAManager::mkfs( + Transaction &t) +{ + logger().debug("BtreeLBAManager::mkfs"); + return cache.get_root(t).safe_then([this, &t](auto croot) { + auto root_leaf = cache.alloc_new_extent<LBALeafNode>( + t, + LBA_BLOCK_SIZE); + root_leaf->set_size(0); + lba_node_meta_t meta{0, L_ADDR_MAX, 1}; + root_leaf->set_meta(meta); + root_leaf->pin.set_range(meta); + croot->get_root() = + root_t{ + 1, + 0, + root_leaf->get_paddr(), + make_record_relative_paddr(0), + 
L_ADDR_NULL}; + return mkfs_ertr::now(); + }); +} + +BtreeLBAManager::get_root_ret +BtreeLBAManager::get_root(Transaction &t) +{ + return cache.get_root(t).safe_then([this, &t](auto croot) { + logger().debug( + "BtreeLBAManager::get_root: reading root at {} depth {}", + paddr_t{croot->get_root().lba_root_addr}, + unsigned(croot->get_root().lba_depth)); + return get_lba_btree_extent( + get_context(t), + croot->get_root().lba_depth, + croot->get_root().lba_root_addr, + paddr_t()); + }); +} + +BtreeLBAManager::get_mapping_ret +BtreeLBAManager::get_mapping( + Transaction &t, + laddr_t offset, extent_len_t length) +{ + logger().debug("BtreeLBAManager::get_mapping: {}, {}", offset, length); + return get_root( + t).safe_then([this, &t, offset, length](auto extent) { + return extent->lookup_range( + get_context(t), + offset, length + ).safe_then([extent](auto ret) { return ret; }); + }).safe_then([](auto &&e) { + logger().debug("BtreeLBAManager::get_mapping: got mapping {}", e); + return get_mapping_ret( + get_mapping_ertr::ready_future_marker{}, + std::move(e)); + }); +} + + +BtreeLBAManager::get_mappings_ret +BtreeLBAManager::get_mappings( + Transaction &t, + laddr_list_t &&list) +{ + logger().debug("BtreeLBAManager::get_mappings: {}", list); + auto l = std::make_unique<laddr_list_t>(std::move(list)); + auto retptr = std::make_unique<lba_pin_list_t>(); + auto &ret = *retptr; + return crimson::do_for_each( + l->begin(), + l->end(), + [this, &t, &ret](const auto &p) { + return get_mapping(t, p.first, p.second).safe_then( + [&ret](auto res) { + ret.splice(ret.end(), res, res.begin(), res.end()); + }); + }).safe_then([l=std::move(l), retptr=std::move(retptr)]() mutable { + return std::move(*retptr); + }); +} + +BtreeLBAManager::alloc_extent_ret +BtreeLBAManager::alloc_extent( + Transaction &t, + laddr_t hint, + extent_len_t len, + paddr_t addr) +{ + // TODO: we can certainly combine the lookup and the insert. 
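// Two phases below: (1) find_hole walks down from the root looking for an
// unmapped range of length `len` in [hint, L_ADDR_MAX); (2) insert_mapping
// records {len, addr, refcount=1, checksum=0} at the laddr found.  The
// ceph_assert documents the assumption that a hole is always found.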
+ return get_root( + t).safe_then([this, &t, hint, len](auto extent) { + logger().debug( + "BtreeLBAManager::alloc_extent: beginning search at {}", + *extent); + return extent->find_hole( + get_context(t), + hint, + L_ADDR_MAX, + len).safe_then([extent](auto ret) { + return std::make_pair(ret, extent); + }); + }).safe_then([this, &t, len, addr](auto allocation_pair) { + auto &[laddr, extent] = allocation_pair; + ceph_assert(laddr != L_ADDR_MAX); + return insert_mapping( + t, + extent, + laddr, + { len, addr, 1, 0 } + ).safe_then([laddr=laddr, addr, len](auto pin) { + logger().debug( + "BtreeLBAManager::alloc_extent: alloc {}~{} for {}", + laddr, + len, + addr); + return alloc_extent_ret( + alloc_extent_ertr::ready_future_marker{}, + LBAPinRef(pin.release())); + }); + }); +} + +BtreeLBAManager::set_extent_ret +BtreeLBAManager::set_extent( + Transaction &t, + laddr_t off, extent_len_t len, paddr_t addr) +{ + return get_root( + t).safe_then([this, &t, off, len, addr](auto root) { + return insert_mapping( + t, + root, + off, + { len, addr, 1, 0 }); + }).safe_then([](auto ret) { + return set_extent_ret( + set_extent_ertr::ready_future_marker{}, + LBAPinRef(ret.release())); + }); +} + +static bool is_lba_node(extent_types_t type) +{ + return type == extent_types_t::LADDR_INTERNAL || + type == extent_types_t::LADDR_LEAF; +} + +static bool is_lba_node(const CachedExtent &e) +{ + return is_lba_node(e.get_type()); +} + +btree_range_pin_t &BtreeLBAManager::get_pin(CachedExtent &e) +{ + if (is_lba_node(e)) { + return e.cast<LBANode>()->pin; + } else if (e.is_logical()) { + return static_cast<BtreeLBAPin &>( + e.cast<LogicalCachedExtent>()->get_pin()).pin; + } else { + ceph_abort_msg("impossible"); + } +} + +static depth_t get_depth(const CachedExtent &e) +{ + if (is_lba_node(e)) { + return e.cast<LBANode>()->get_node_meta().depth; + } else if (e.is_logical()) { + return 0; + } else { + ceph_assert(0 == "currently impossible"); + return 0; + } +} + +BtreeLBAManager::complete_transaction_ret +BtreeLBAManager::complete_transaction( + Transaction &t) +{ + std::vector<CachedExtentRef> to_clear; + to_clear.reserve(t.get_retired_set().size()); + for (auto &e: t.get_retired_set()) { + if (e->is_logical() || is_lba_node(*e)) + to_clear.push_back(e); + } + // need to call check_parent from leaf->parent + std::sort( + to_clear.begin(), to_clear.end(), + [](auto &l, auto &r) { return get_depth(*l) < get_depth(*r); }); + + for (auto &e: to_clear) { + auto &pin = get_pin(*e); + logger().debug("{}: retiring {}, {}", __func__, *e, pin); + pin_set.retire(pin); + } + + // ...but add_pin from parent->leaf + std::vector<CachedExtentRef> to_link; + to_link.reserve(t.get_fresh_block_list().size()); + for (auto &e: t.get_fresh_block_list()) { + if (e->is_valid() && (is_lba_node(*e) || e->is_logical())) + to_link.push_back(e); + } + std::sort( + to_link.begin(), to_link.end(), + [](auto &l, auto &r) -> bool { return get_depth(*l) > get_depth(*r); }); + + for (auto &e : to_link) { + logger().debug("{}: linking {}", __func__, *e); + pin_set.add_pin(get_pin(*e)); + } + + for (auto &e: to_clear) { + auto &pin = get_pin(*e); + logger().debug("{}: checking {}, {}", __func__, *e, pin); + pin_set.check_parent(pin); + } + return complete_transaction_ertr::now(); +} + +BtreeLBAManager::init_cached_extent_ret BtreeLBAManager::init_cached_extent( + Transaction &t, + CachedExtentRef e) +{ + logger().debug("{}: {}", __func__, *e); + return get_root(t).safe_then( + [this, &t, e=std::move(e)](LBANodeRef root) mutable { + if 
(is_lba_node(*e)) { + auto lban = e->cast<LBANode>(); + logger().debug("init_cached_extent: lba node, getting root"); + return root->lookup( + op_context_t{cache, pin_set, t}, + lban->get_node_meta().begin, + lban->get_node_meta().depth + ).safe_then([this, e=std::move(e)](LBANodeRef c) { + if (c->get_paddr() == e->get_paddr()) { + assert(&*c == &*e); + logger().debug("init_cached_extent: {} initialized", *e); + } else { + // e is obsolete + logger().debug("init_cached_extent: {} obsolete", *e); + cache.drop_from_cache(e); + } + return init_cached_extent_ertr::now(); + }); + } else if (e->is_logical()) { + auto logn = e->cast<LogicalCachedExtent>(); + return root->lookup_range( + op_context_t{cache, pin_set, t}, + logn->get_laddr(), + logn->get_length()).safe_then( + [this, logn=std::move(logn)](auto pins) { + if (pins.size() == 1) { + auto pin = std::move(pins.front()); + pins.pop_front(); + if (pin->get_paddr() == logn->get_paddr()) { + logn->set_pin(std::move(pin)); + pin_set.add_pin( + static_cast<BtreeLBAPin&>(logn->get_pin()).pin); + logger().debug("init_cached_extent: {} initialized", *logn); + } else { + // paddr doesn't match, remapped, obsolete + logger().debug("init_cached_extent: {} obsolete", *logn); + cache.drop_from_cache(logn); + } + } else { + // set of extents changed, obsolete + logger().debug("init_cached_extent: {} obsolete", *logn); + cache.drop_from_cache(logn); + } + return init_cached_extent_ertr::now(); + }); + } else { + logger().debug("init_cached_extent: {} skipped", *e); + return init_cached_extent_ertr::now(); + } + }); +} + +BtreeLBAManager::scan_mappings_ret BtreeLBAManager::scan_mappings( + Transaction &t, + laddr_t begin, + laddr_t end, + scan_mappings_func_t &&f) +{ + return seastar::do_with( + std::move(f), + LBANodeRef(), + [=, &t](auto &f, auto &lbarootref) { + return get_root(t).safe_then( + [=, &t, &f](LBANodeRef lbaroot) mutable { + lbarootref = lbaroot; + return lbaroot->scan_mappings( + get_context(t), + begin, + end, + f); + }); + }); +} + +BtreeLBAManager::scan_mapped_space_ret BtreeLBAManager::scan_mapped_space( + Transaction &t, + scan_mapped_space_func_t &&f) +{ + return seastar::do_with( + std::move(f), + LBANodeRef(), + [=, &t](auto &f, auto &lbarootref) { + return get_root(t).safe_then( + [=, &t, &f](LBANodeRef lbaroot) mutable { + lbarootref = lbaroot; + return lbaroot->scan_mapped_space( + get_context(t), + f); + }); + }); +} + +BtreeLBAManager::rewrite_extent_ret BtreeLBAManager::rewrite_extent( + Transaction &t, + CachedExtentRef extent) +{ + if (extent->is_logical()) { + auto lextent = extent->cast<LogicalCachedExtent>(); + cache.retire_extent(t, extent); + auto nlextent = cache.alloc_new_extent_by_type( + t, + lextent->get_type(), + lextent->get_length())->cast<LogicalCachedExtent>(); + lextent->get_bptr().copy_out( + 0, + lextent->get_length(), + nlextent->get_bptr().c_str()); + nlextent->set_laddr(lextent->get_laddr()); + nlextent->set_pin(lextent->get_pin().duplicate()); + + logger().debug( + "{}: rewriting {} into {}", + __func__, + *lextent, + *nlextent); + + return update_mapping( + t, + lextent->get_laddr(), + [prev_addr = lextent->get_paddr(), addr = nlextent->get_paddr()]( + const lba_map_val_t &in) { + lba_map_val_t ret = in; + ceph_assert(in.paddr == prev_addr); + ret.paddr = addr; + return ret; + }).safe_then([nlextent](auto e) {}).handle_error( + rewrite_extent_ertr::pass_further{}, + /* ENOENT in particular should be impossible */ + crimson::ct_error::assert_all{} + ); + } else if (is_lba_node(*extent)) { + auto 
lba_extent = extent->cast<LBANode>(); + cache.retire_extent(t, extent); + auto nlba_extent = cache.alloc_new_extent_by_type( + t, + lba_extent->get_type(), + lba_extent->get_length())->cast<LBANode>(); + lba_extent->get_bptr().copy_out( + 0, + lba_extent->get_length(), + nlba_extent->get_bptr().c_str()); + nlba_extent->pin.set_range(nlba_extent->get_node_meta()); + + /* This is a bit underhanded. Any relative addrs here must necessarily + * be record relative as we are rewriting a dirty extent. Thus, we + * are using resolve_relative_addrs with a (likely negative) block + * relative offset to correct them to block-relative offsets adjusted + * for our new transaction location. + * + * Upon commit, these now block relative addresses will be interpretted + * against the real final address. + */ + nlba_extent->resolve_relative_addrs( + make_record_relative_paddr(0) - nlba_extent->get_paddr()); + + return update_internal_mapping( + t, + nlba_extent->get_node_meta().depth, + nlba_extent->get_node_meta().begin, + nlba_extent->get_paddr()).safe_then( + [](auto) {}, + rewrite_extent_ertr::pass_further {}, + crimson::ct_error::assert_all{}); + } else { + return rewrite_extent_ertr::now(); + } +} + +BtreeLBAManager::get_physical_extent_if_live_ret +BtreeLBAManager::get_physical_extent_if_live( + Transaction &t, + extent_types_t type, + paddr_t addr, + laddr_t laddr, + segment_off_t len) +{ + ceph_assert(is_lba_node(type)); + return cache.get_extent_by_type( + t, + type, + addr, + laddr, + len + ).safe_then([=, &t](CachedExtentRef extent) { + return get_root(t).safe_then([=, &t](LBANodeRef root) { + auto lba_node = extent->cast<LBANode>(); + return root->lookup( + op_context_t{cache, pin_set, t}, + lba_node->get_node_meta().begin, + lba_node->get_node_meta().depth).safe_then([=](LBANodeRef c) { + if (c->get_paddr() == lba_node->get_paddr()) { + return get_physical_extent_if_live_ret( + get_physical_extent_if_live_ertr::ready_future_marker{}, + lba_node); + } else { + cache.drop_from_cache(lba_node); + return get_physical_extent_if_live_ret( + get_physical_extent_if_live_ertr::ready_future_marker{}, + CachedExtentRef()); + } + }); + }); + }); +} + +BtreeLBAManager::BtreeLBAManager( + SegmentManager &segment_manager, + Cache &cache) + : segment_manager(segment_manager), + cache(cache) {} + +BtreeLBAManager::insert_mapping_ret BtreeLBAManager::insert_mapping( + Transaction &t, + LBANodeRef root, + laddr_t laddr, + lba_map_val_t val) +{ + auto split = insert_mapping_ertr::future<LBANodeRef>( + insert_mapping_ertr::ready_future_marker{}, + root); + if (root->at_max_capacity()) { + split = cache.get_root(t).safe_then( + [this, root, laddr, &t](RootBlockRef croot) { + logger().debug( + "BtreeLBAManager::insert_mapping: splitting root {}", + *croot); + { + auto mut_croot = cache.duplicate_for_write(t, croot); + croot = mut_croot->cast<RootBlock>(); + } + auto nroot = cache.alloc_new_extent<LBAInternalNode>(t, LBA_BLOCK_SIZE); + lba_node_meta_t meta{0, L_ADDR_MAX, root->get_node_meta().depth + 1}; + nroot->set_meta(meta); + nroot->pin.set_range(meta); + nroot->journal_insert( + nroot->begin(), + L_ADDR_MIN, + root->get_paddr(), + nullptr); + croot->get_root().lba_root_addr = nroot->get_paddr(); + croot->get_root().lba_depth = root->get_node_meta().depth + 1; + return nroot->split_entry( + get_context(t), + laddr, nroot->begin(), root); + }); + } + return split.safe_then([this, &t, laddr, val](LBANodeRef node) { + return node->insert( + get_context(t), + laddr, val); + }); +} + 
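update_refcount below is one instance of the general update_mapping pattern: the caller supplies a pure function from the old lba_map_val_t to the new value and the tree applies it in place. A hedged sketch of another updater of the same shape (illustrative, not part of the diff):

  // Redirect a mapping to a new physical address, leaving len/refcount alone
  // (the same shape of updater that rewrite_extent passes in).
  auto redirect_to = [](paddr_t new_addr) {
    return [new_addr](const lba_map_val_t &in) {
      lba_map_val_t out = in;
      out.paddr = new_addr;
      return out;
    };
  };
  // Used from inside BtreeLBAManager (update_mapping is private):
  //   update_mapping(t, laddr, redirect_to(new_paddr));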
+BtreeLBAManager::update_refcount_ret BtreeLBAManager::update_refcount( + Transaction &t, + laddr_t addr, + int delta) +{ + return update_mapping( + t, + addr, + [delta](const lba_map_val_t &in) { + lba_map_val_t out = in; + ceph_assert((int)out.refcount + delta >= 0); + out.refcount += delta; + return out; + }).safe_then([](auto result) { + return ref_update_result_t{result.refcount, result.paddr}; + }); +} + +BtreeLBAManager::update_mapping_ret BtreeLBAManager::update_mapping( + Transaction &t, + laddr_t addr, + update_func_t &&f) +{ + return get_root(t + ).safe_then([this, f=std::move(f), &t, addr](LBANodeRef root) mutable { + return root->mutate_mapping( + get_context(t), + addr, + std::move(f)); + }); +} + +BtreeLBAManager::update_internal_mapping_ret +BtreeLBAManager::update_internal_mapping( + Transaction &t, + depth_t depth, + laddr_t laddr, + paddr_t paddr) +{ + return cache.get_root(t).safe_then([=, &t](RootBlockRef croot) { + if (depth == croot->get_root().lba_depth) { + logger().debug( + "update_internal_mapping: updating lba root to: {}->{}", + laddr, + paddr); + { + auto mut_croot = cache.duplicate_for_write(t, croot); + croot = mut_croot->cast<RootBlock>(); + } + ceph_assert(laddr == 0); + auto old_paddr = croot->get_root().lba_root_addr; + croot->get_root().lba_root_addr = paddr; + return update_internal_mapping_ret( + update_internal_mapping_ertr::ready_future_marker{}, + old_paddr); + } else { + logger().debug( + "update_internal_mapping: updating lba node at depth {} to: {}->{}", + depth, + laddr, + paddr); + return get_lba_btree_extent( + get_context(t), + croot->get_root().lba_depth, + croot->get_root().lba_root_addr, + paddr_t()).safe_then([=, &t](LBANodeRef broot) { + return broot->mutate_internal_address( + get_context(t), + depth, + laddr, + paddr); + }); + } + }); +} + +} diff --git a/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h new file mode 100644 index 000000000..640d56734 --- /dev/null +++ b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h @@ -0,0 +1,188 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <iostream> + +#include <boost/intrusive_ptr.hpp> +#include <boost/smart_ptr/intrusive_ref_counter.hpp> +#include <seastar/core/future.hh> + +#include "include/ceph_assert.h" +#include "include/buffer_fwd.h" +#include "include/interval_set.h" +#include "common/interval_map.h" +#include "crimson/osd/exceptions.h" + +#include "crimson/os/seastore/seastore_types.h" +#include "crimson/os/seastore/lba_manager.h" +#include "crimson/os/seastore/cache.h" +#include "crimson/os/seastore/segment_manager.h" + +#include "crimson/os/seastore/lba_manager/btree/lba_btree_node.h" + +namespace crimson::os::seastore::lba_manager::btree { + +/** + * BtreeLBAManager + * + * Uses a wandering btree to track two things: + * 1) lba state including laddr_t -> paddr_t mapping + * 2) reverse paddr_t -> laddr_t mapping for gc (TODO) + * + * Generally, any transaction will involve + * 1) deltas against lba tree nodes + * 2) new lba tree nodes + * - Note, there must necessarily be a delta linking + * these new nodes into the tree -- might be a + * bootstrap_state_t delta if new root + * + * get_mappings, alloc_extent_*, etc populate a Transaction + * which then gets submitted + */ +class BtreeLBAManager : public LBAManager { +public: + BtreeLBAManager( + SegmentManager &segment_manager, + Cache &cache); + + mkfs_ret 
mkfs( + Transaction &t) final; + + get_mapping_ret get_mapping( + Transaction &t, + laddr_t offset, extent_len_t length) final; + + get_mappings_ret get_mappings( + Transaction &t, + laddr_list_t &&list) final; + + alloc_extent_ret alloc_extent( + Transaction &t, + laddr_t hint, + extent_len_t len, + paddr_t addr) final; + + set_extent_ret set_extent( + Transaction &t, + laddr_t off, extent_len_t len, paddr_t addr) final; + + ref_ret decref_extent( + Transaction &t, + laddr_t addr) final { + return update_refcount(t, addr, -1); + } + + ref_ret incref_extent( + Transaction &t, + laddr_t addr) final { + return update_refcount(t, addr, 1); + } + + complete_transaction_ret complete_transaction( + Transaction &t) final; + + init_cached_extent_ret init_cached_extent( + Transaction &t, + CachedExtentRef e) final; + + scan_mappings_ret scan_mappings( + Transaction &t, + laddr_t begin, + laddr_t end, + scan_mappings_func_t &&f) final; + + scan_mapped_space_ret scan_mapped_space( + Transaction &t, + scan_mapped_space_func_t &&f) final; + + rewrite_extent_ret rewrite_extent( + Transaction &t, + CachedExtentRef extent); + + get_physical_extent_if_live_ret get_physical_extent_if_live( + Transaction &t, + extent_types_t type, + paddr_t addr, + laddr_t laddr, + segment_off_t len) final; + + void add_pin(LBAPin &pin) final { + auto *bpin = reinterpret_cast<BtreeLBAPin*>(&pin); + pin_set.add_pin(bpin->pin); + bpin->parent = nullptr; + } + +private: + SegmentManager &segment_manager; + Cache &cache; + + btree_pin_set_t pin_set; + + op_context_t get_context(Transaction &t) { + return op_context_t{cache, pin_set, t}; + } + + static btree_range_pin_t &get_pin(CachedExtent &e); + + + /** + * get_root + * + * Get a reference to the root LBANode. + */ + using get_root_ertr = Cache::get_extent_ertr; + using get_root_ret = get_root_ertr::future<LBANodeRef>; + get_root_ret get_root(Transaction &); + + /** + * insert_mapping + * + * Insert a lba mapping into the tree + */ + using insert_mapping_ertr = crimson::errorator< + crimson::ct_error::input_output_error>; + using insert_mapping_ret = insert_mapping_ertr::future<LBAPinRef>; + insert_mapping_ret insert_mapping( + Transaction &t, ///< [in,out] transaction + LBANodeRef root, ///< [in] root node + laddr_t laddr, ///< [in] logical addr to insert + lba_map_val_t val ///< [in] mapping to insert + ); + + /** + * update_refcount + * + * Updates refcount, returns resulting refcount + */ + using update_refcount_ret = ref_ret; + update_refcount_ret update_refcount( + Transaction &t, + laddr_t addr, + int delta); + + /** + * update_mapping + * + * Updates mapping, removes if f returns nullopt + */ + using update_mapping_ertr = ref_ertr; + using update_mapping_ret = ref_ertr::future<lba_map_val_t>; + using update_func_t = LBANode::mutate_func_t; + update_mapping_ret update_mapping( + Transaction &t, + laddr_t addr, + update_func_t &&f); + + using update_internal_mapping_ertr = LBANode::mutate_internal_address_ertr; + using update_internal_mapping_ret = LBANode::mutate_internal_address_ret; + update_internal_mapping_ret update_internal_mapping( + Transaction &t, + depth_t depth, + laddr_t laddr, + paddr_t paddr); +}; +using BtreeLBAManagerRef = std::unique_ptr<BtreeLBAManager>; + +} diff --git a/src/crimson/os/seastore/lba_manager/btree/btree_range_pin.cc b/src/crimson/os/seastore/lba_manager/btree/btree_range_pin.cc new file mode 100644 index 000000000..a86c3cc57 --- /dev/null +++ b/src/crimson/os/seastore/lba_manager/btree/btree_range_pin.cc @@ -0,0 +1,153 @@ +// -*- 
mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "crimson/common/log.h" + +#include "crimson/os/seastore/lba_manager/btree/btree_range_pin.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_filestore); + } +} + +namespace crimson::os::seastore::lba_manager::btree { + +void btree_range_pin_t::take_pin(btree_range_pin_t &other) +{ + assert(other.extent); + assert(other.pins); + other.pins->replace_pin(*this, other); + pins = other.pins; + other.pins = nullptr; + + if (other.has_ref()) { + other.drop_ref(); + acquire_ref(); + } +} + +btree_range_pin_t::~btree_range_pin_t() +{ + assert(!pins == !is_linked()); + assert(!ref); + if (pins) { + logger().debug("{}: removing {}", __func__, *this); + pins->remove_pin(*this, true); + } + extent = nullptr; +} + +void btree_pin_set_t::replace_pin(btree_range_pin_t &to, btree_range_pin_t &from) +{ + pins.replace_node(pins.iterator_to(from), to); +} + +void btree_pin_set_t::remove_pin(btree_range_pin_t &pin, bool do_check_parent) +{ + logger().debug("{}: {}", __func__, pin); + assert(pin.is_linked()); + assert(pin.pins); + assert(!pin.ref); + + pins.erase(pin); + pin.pins = nullptr; + + if (do_check_parent) { + check_parent(pin); + } +} + +btree_range_pin_t *btree_pin_set_t::maybe_get_parent( + const lba_node_meta_t &meta) +{ + auto cmeta = meta; + cmeta.depth++; + auto iter = pins.upper_bound(cmeta, btree_range_pin_t::meta_cmp_t()); + if (iter == pins.begin()) { + return nullptr; + } else { + --iter; + if (iter->range.is_parent_of(meta)) { + return &*iter; + } else { + return nullptr; + } + } +} + +const btree_range_pin_t *btree_pin_set_t::maybe_get_first_child( + const lba_node_meta_t &meta) const +{ + if (meta.depth == 0) { + return nullptr; + } + + auto cmeta = meta; + cmeta.depth--; + + auto iter = pins.lower_bound(cmeta, btree_range_pin_t::meta_cmp_t()); + if (iter == pins.end()) { + return nullptr; + } else if (meta.is_parent_of(iter->range)) { + return &*iter; + } else { + return nullptr; + } +} + +void btree_pin_set_t::release_if_no_children(btree_range_pin_t &pin) +{ + assert(pin.is_linked()); + if (maybe_get_first_child(pin.range) == nullptr) { + pin.drop_ref(); + } +} + +void btree_pin_set_t::add_pin(btree_range_pin_t &pin) +{ + assert(!pin.is_linked()); + assert(!pin.pins); + assert(!pin.ref); + + auto [prev, inserted] = pins.insert(pin); + if (!inserted) { + logger().error("{}: unable to add {}, found {}", __func__, pin, *prev); + assert(0 == "impossible"); + return; + } + pin.pins = this; + if (!pin.is_root()) { + auto *parent = maybe_get_parent(pin.range); + assert(parent); + if (!parent->has_ref()) { + logger().debug("{}: acquiring parent {}", __func__, + static_cast<void*>(parent)); + parent->acquire_ref(); + } else { + logger().debug("{}: parent has ref {}", __func__, + static_cast<void*>(parent)); + } + } + if (maybe_get_first_child(pin.range) != nullptr) { + logger().debug("{}: acquiring self {}", __func__, pin); + pin.acquire_ref(); + } +} + +void btree_pin_set_t::retire(btree_range_pin_t &pin) +{ + pin.drop_ref(); + remove_pin(pin, false); +} + +void btree_pin_set_t::check_parent(btree_range_pin_t &pin) +{ + auto parent = maybe_get_parent(pin.range); + if (parent) { + logger().debug("{}: releasing parent {}", __func__, *parent); + release_if_no_children(*parent); + } +} + +} diff --git a/src/crimson/os/seastore/lba_manager/btree/btree_range_pin.h b/src/crimson/os/seastore/lba_manager/btree/btree_range_pin.h new file mode 100644 index 
000000000..3fa218fc8 --- /dev/null +++ b/src/crimson/os/seastore/lba_manager/btree/btree_range_pin.h @@ -0,0 +1,274 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <boost/intrusive/set.hpp> + +#include "crimson/os/seastore/cached_extent.h" +#include "crimson/os/seastore/seastore_types.h" + +namespace crimson::os::seastore::lba_manager::btree { + +class LBANode; +using LBANodeRef = TCachedExtentRef<LBANode>; + +struct lba_node_meta_t { + laddr_t begin = 0; + laddr_t end = 0; + depth_t depth = 0; + + bool is_parent_of(const lba_node_meta_t &other) const { + return (depth == other.depth + 1) && + (begin <= other.begin) && + (end >= other.end); + } + + std::pair<lba_node_meta_t, lba_node_meta_t> split_into(laddr_t pivot) const { + return std::make_pair( + lba_node_meta_t{begin, pivot, depth}, + lba_node_meta_t{pivot, end, depth}); + } + + static lba_node_meta_t merge_from( + const lba_node_meta_t &lhs, const lba_node_meta_t &rhs) { + assert(lhs.depth == rhs.depth); + return lba_node_meta_t{lhs.begin, rhs.end, lhs.depth}; + } + + static std::pair<lba_node_meta_t, lba_node_meta_t> + rebalance(const lba_node_meta_t &lhs, const lba_node_meta_t &rhs, laddr_t pivot) { + assert(lhs.depth == rhs.depth); + return std::make_pair( + lba_node_meta_t{lhs.begin, pivot, lhs.depth}, + lba_node_meta_t{pivot, rhs.end, lhs.depth}); + } + + bool is_root() const { + return begin == 0 && end == L_ADDR_MAX; + } +}; + +inline std::ostream &operator<<( + std::ostream &lhs, + const lba_node_meta_t &rhs) +{ + return lhs << "btree_node_meta_t(" + << "begin=" << rhs.begin + << ", end=" << rhs.end + << ", depth=" << rhs.depth + << ")"; +} + +/** + * btree_range_pin_t + * + * Element tracked by btree_pin_set_t below. Encapsulates the intrusive_set + * hook, the lba_node_meta_t representing the lba range covered by a node, + * and extent and ref members intended to hold a reference when the extent + * should be pinned. 
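A small worked example of the lba_node_meta_t helpers above (the numbers are arbitrary; depth 1 here stands for some internal level):

  lba_node_meta_t meta{0, 1000, 1};            // covers laddrs [0, 1000) at depth 1
  auto [left, right] = meta.split_into(400);   // -> [0, 400) and [400, 1000), depth 1
  auto merged = lba_node_meta_t::merge_from(left, right);  // -> [0, 1000) again

  lba_node_meta_t parent{0, 1000, 2};
  bool covers = parent.is_parent_of(meta);     // true: depth 2 == 1 + 1, range contains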
+ */ +class btree_pin_set_t; +class btree_range_pin_t : public boost::intrusive::set_base_hook<> { + friend class btree_pin_set_t; + lba_node_meta_t range; + + btree_pin_set_t *pins = nullptr; + + // We need to be able to remember extent without holding a reference, + // but we can do it more compactly -- TODO + CachedExtent *extent = nullptr; + CachedExtentRef ref; + + using index_t = boost::intrusive::set<btree_range_pin_t>; + + static auto get_tuple(const lba_node_meta_t &meta) { + return std::make_tuple(-meta.depth, meta.begin); + } + + void acquire_ref() { + ref = CachedExtentRef(extent); + } + + void drop_ref() { + ref.reset(); + } + +public: + btree_range_pin_t() = default; + btree_range_pin_t(CachedExtent *extent) + : extent(extent) {} + btree_range_pin_t(const btree_range_pin_t &rhs, CachedExtent *extent) + : range(rhs.range), extent(extent) {} + + bool has_ref() const { + return !!ref; + } + + bool is_root() const { + return range.is_root(); + } + + void set_range(const lba_node_meta_t &nrange) { + range = nrange; + } + void set_extent(CachedExtent *nextent) { + assert(!extent); + extent = nextent; + } + + void take_pin(btree_range_pin_t &other); + + friend bool operator<( + const btree_range_pin_t &lhs, const btree_range_pin_t &rhs) { + return get_tuple(lhs.range) < get_tuple(rhs.range); + } + friend bool operator>( + const btree_range_pin_t &lhs, const btree_range_pin_t &rhs) { + return get_tuple(lhs.range) > get_tuple(rhs.range); + } + friend bool operator==( + const btree_range_pin_t &lhs, const btree_range_pin_t &rhs) { + return get_tuple(lhs.range) == rhs.get_tuple(rhs.range); + } + + struct meta_cmp_t { + bool operator()( + const btree_range_pin_t &lhs, const lba_node_meta_t &rhs) const { + return get_tuple(lhs.range) < get_tuple(rhs); + } + bool operator()( + const lba_node_meta_t &lhs, const btree_range_pin_t &rhs) const { + return get_tuple(lhs) < get_tuple(rhs.range); + } + }; + + friend std::ostream &operator<<( + std::ostream &lhs, + const btree_range_pin_t &rhs) { + return lhs << "btree_range_pin_t(" + << "begin=" << rhs.range.begin + << ", end=" << rhs.range.end + << ", depth=" << rhs.range.depth + << ", extent=" << rhs.extent + << ")"; + } + + friend class BtreeLBAPin; + ~btree_range_pin_t(); +}; + +/** + * btree_pin_set_t + * + * Ensures that for every cached node, all parent LBANodes required + * to map it are present in cache. Relocating these nodes can + * therefore be done without further reads or cache space. + * + * Contains a btree_range_pin_t for every clean or dirty LBANode + * or LogicalCachedExtent instance in cache at any point in time. + * For any LBANode, the contained btree_range_pin_t will hold + * a reference to that node pinning it in cache as long as that + * node has children in the set. This invariant can be violated + * only by calling retire_extent and is repaired by calling + * check_parent synchronously after adding any new extents. 
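The retire/check_parent contract described above is easiest to read as a sequence. A rough sketch of how a caller is expected to drive the set when one pinned extent supersedes another (assuming both pins are already attached to extents and have their ranges set):

  // during transaction completion; pins is the cache-wide btree_pin_set_t
  pins.retire(old_pin);        // drop old_pin's ref and unlink it; parents deliberately not checked
  pins.add_pin(new_pin);       // pin the parent again (and new_pin itself, if it has children)
  pins.check_parent(new_pin);  // repair the invariant: release a parent left with no children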
+ */ +class btree_pin_set_t { + friend class btree_range_pin_t; + using pins_t = btree_range_pin_t::index_t; + pins_t pins; + + pins_t::iterator get_iter(btree_range_pin_t &pin) { + return pins_t::s_iterator_to(pin); + } + + /// Removes pin from set optionally checking whether parent has other children + void remove_pin(btree_range_pin_t &pin, bool check_parent); + + void replace_pin(btree_range_pin_t &to, btree_range_pin_t &from); + + /// Returns parent pin if exists + btree_range_pin_t *maybe_get_parent(const lba_node_meta_t &pin); + + /// Returns earliest child pin if exist + const btree_range_pin_t *maybe_get_first_child(const lba_node_meta_t &pin) const; + + /// Releases pin if it has no children + void release_if_no_children(btree_range_pin_t &pin); + +public: + /// Adds pin to set, assumes set is consistent + void add_pin(btree_range_pin_t &pin); + + /** + * retire/check_parent + * + * See BtreeLBAManager::complete_transaction. + * retire removes the specified pin from the set, but does not + * check parents. After any new extents are added to the set, + * the caller is required to call check_parent to restore the + * invariant. + */ + void retire(btree_range_pin_t &pin); + void check_parent(btree_range_pin_t &pin); + + ~btree_pin_set_t() { + assert(pins.empty()); + } +}; + +class BtreeLBAPin : public LBAPin { + friend class BtreeLBAManager; + + /** + * parent + * + * populated until link_extent is called to ensure cache residence + * until add_pin is called. + */ + CachedExtentRef parent; + + paddr_t paddr; + btree_range_pin_t pin; + +public: + BtreeLBAPin() = default; + + BtreeLBAPin( + CachedExtentRef parent, + paddr_t paddr, + lba_node_meta_t &&meta) + : parent(parent), paddr(paddr) { + pin.set_range(std::move(meta)); + } + + void link_extent(LogicalCachedExtent *ref) final { + pin.set_extent(ref); + } + + extent_len_t get_length() const final { + assert(pin.range.end > pin.range.begin); + return pin.range.end - pin.range.begin; + } + + paddr_t get_paddr() const final { + return paddr; + } + + laddr_t get_laddr() const final { + return pin.range.begin; + } + + LBAPinRef duplicate() const final { + auto ret = std::unique_ptr<BtreeLBAPin>(new BtreeLBAPin); + ret->pin.set_range(pin.range); + ret->paddr = paddr; + return ret; + } + + void take_pin(LBAPin &opin) final { + pin.take_pin(static_cast<BtreeLBAPin&>(opin).pin); + } +}; + +} diff --git a/src/crimson/os/seastore/lba_manager/btree/lba_btree_node.h b/src/crimson/os/seastore/lba_manager/btree/lba_btree_node.h new file mode 100644 index 000000000..b6f33a1ae --- /dev/null +++ b/src/crimson/os/seastore/lba_manager/btree/lba_btree_node.h @@ -0,0 +1,269 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <sys/mman.h> +#include <memory> +#include <string.h> + +#include "crimson/common/log.h" +#include "crimson/os/seastore/lba_manager/btree/btree_range_pin.h" +#include "crimson/os/seastore/lba_manager.h" + +namespace crimson::os::seastore::lba_manager::btree { + +struct op_context_t { + Cache &cache; + btree_pin_set_t &pins; + Transaction &trans; +}; + +/** + * lba_map_val_t + * + * struct representing a single lba mapping + */ +struct lba_map_val_t { + extent_len_t len = 0; ///< length of mapping + paddr_t paddr; ///< physical addr of mapping + uint32_t refcount = 0; ///< refcount + uint32_t checksum = 0; ///< checksum of original block written at paddr (TODO) + + lba_map_val_t( + extent_len_t len, + paddr_t paddr, + uint32_t refcount, + uint32_t checksum) 
+ : len(len), paddr(paddr), refcount(refcount), checksum(checksum) {} +}; + +class BtreeLBAPin; +using BtreeLBAPinRef = std::unique_ptr<BtreeLBAPin>; + +/** + * LBANode + * + * Base class enabling recursive lookup between internal and leaf nodes. + */ +struct LBANode : CachedExtent { + using LBANodeRef = TCachedExtentRef<LBANode>; + using lookup_range_ertr = LBAManager::get_mapping_ertr; + using lookup_range_ret = LBAManager::get_mapping_ret; + + btree_range_pin_t pin; + + LBANode(ceph::bufferptr &&ptr) : CachedExtent(std::move(ptr)), pin(this) {} + LBANode(const LBANode &rhs) + : CachedExtent(rhs), pin(rhs.pin, this) {} + + virtual lba_node_meta_t get_node_meta() const = 0; + + /** + * lookup + * + * Returns the node at the specified depth responsible + * for laddr + */ + using lookup_ertr = crimson::errorator< + crimson::ct_error::input_output_error>; + using lookup_ret = lookup_ertr::future<LBANodeRef>; + virtual lookup_ret lookup( + op_context_t c, + laddr_t addr, + depth_t depth) = 0; + + /** + * lookup_range + * + * Returns mappings within range [addr, addr+len) + */ + virtual lookup_range_ret lookup_range( + op_context_t c, + laddr_t addr, + extent_len_t len) = 0; + + /** + * insert + * + * Recursively inserts into subtree rooted at *this. Caller + * must already have handled splitting if at_max_capacity(). + * + * Precondition: !at_max_capacity() + */ + using insert_ertr = crimson::errorator< + crimson::ct_error::input_output_error + >; + using insert_ret = insert_ertr::future<LBAPinRef>; + virtual insert_ret insert( + op_context_t c, + laddr_t laddr, + lba_map_val_t val) = 0; + + /** + * find_hole + * + * Finds minimum hole of size len in [min, max) + * + * @return addr of hole, L_ADDR_NULL if unfound + */ + using find_hole_ertr = crimson::errorator< + crimson::ct_error::input_output_error>; + using find_hole_ret = find_hole_ertr::future<laddr_t>; + virtual find_hole_ret find_hole( + op_context_t c, + laddr_t min, + laddr_t max, + extent_len_t len) = 0; + + /** + * scan_mappings + * + * Call f for all mappings in [begin, end) + */ + using scan_mappings_ertr = LBAManager::scan_mappings_ertr; + using scan_mappings_ret = LBAManager::scan_mappings_ret; + using scan_mappings_func_t = LBAManager::scan_mappings_func_t; + virtual scan_mappings_ret scan_mappings( + op_context_t c, + laddr_t begin, + laddr_t end, + scan_mappings_func_t &f) = 0; + + using scan_mapped_space_ertr = LBAManager::scan_mapped_space_ertr; + using scan_mapped_space_ret = LBAManager::scan_mapped_space_ret; + using scan_mapped_space_func_t = LBAManager::scan_mapped_space_func_t; + virtual scan_mapped_space_ret scan_mapped_space( + op_context_t c, + scan_mapped_space_func_t &f) = 0; + + /** + * mutate_mapping + * + * Lookups up laddr, calls f on value. If f returns a value, inserts it. + * If it returns nullopt, removes the value. + * Caller must already have merged if at_min_capacity(). + * + * Recursive calls use mutate_mapping_internal. 
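Despite the mention of nullopt above, the mutate_func_t defined just below returns a plain lba_map_val_t; the leaf implementation removes the mapping when the returned refcount drops to zero. The typical mutator is therefore just a refcount adjustment, roughly (names invented for illustration):

  auto decref = [](const lba_map_val_t &v) {
    auto updated = v;
    --updated.refcount;   // leaf journals an update while > 0, a removal once it reaches 0
    return updated;
  };
  // node->mutate_mapping(c, laddr, std::move(decref));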
+ * + * Precondition: !at_min_capacity() + */ + using mutate_mapping_ertr = crimson::errorator< + crimson::ct_error::enoent, ///< mapping does not exist + crimson::ct_error::input_output_error + >; + using mutate_mapping_ret = mutate_mapping_ertr::future< + lba_map_val_t>; + using mutate_func_t = std::function< + lba_map_val_t(const lba_map_val_t &v) + >; + virtual mutate_mapping_ret mutate_mapping( + op_context_t c, + laddr_t laddr, + mutate_func_t &&f) = 0; + virtual mutate_mapping_ret mutate_mapping_internal( + op_context_t c, + laddr_t laddr, + bool is_root, + mutate_func_t &&f) = 0; + + /** + * mutate_internal_address + * + * Looks up internal node mapping at laddr, depth and + * updates the mapping to paddr. Returns previous paddr + * (for debugging purposes). + */ + using mutate_internal_address_ertr = crimson::errorator< + crimson::ct_error::enoent, ///< mapping does not exist + crimson::ct_error::input_output_error + >; + using mutate_internal_address_ret = mutate_internal_address_ertr::future< + paddr_t>; + virtual mutate_internal_address_ret mutate_internal_address( + op_context_t c, + depth_t depth, + laddr_t laddr, + paddr_t paddr) = 0; + + /** + * make_split_children + * + * Generates appropriately typed left and right nodes formed from the + * contents of *this. + * + * Returns <left, right, pivot> where pivot is the first value of right. + */ + virtual std::tuple< + LBANodeRef, + LBANodeRef, + laddr_t> + make_split_children( + op_context_t c) = 0; + + /** + * make_full_merge + * + * Returns a single node formed from merging *this and right. + * Precondition: at_min_capacity() && right.at_min_capacity() + */ + virtual LBANodeRef make_full_merge( + op_context_t c, + LBANodeRef &right) = 0; + + /** + * make_balanced + * + * Returns nodes formed by balancing the contents of *this and right. + * + * Returns <left, right, pivot> where pivot is the first value of right. + */ + virtual std::tuple< + LBANodeRef, + LBANodeRef, + laddr_t> + make_balanced( + op_context_t c, + LBANodeRef &right, + bool prefer_left) = 0; + + virtual bool at_max_capacity() const = 0; + virtual bool at_min_capacity() const = 0; + + virtual ~LBANode() = default; + + void on_delta_write(paddr_t record_block_offset) final { + // All in-memory relative addrs are necessarily record-relative + assert(get_prior_instance()); + pin.take_pin(get_prior_instance()->cast<LBANode>()->pin); + resolve_relative_addrs(record_block_offset); + } + + void on_initial_write() final { + // All in-memory relative addrs are necessarily block-relative + resolve_relative_addrs(get_paddr()); + } + + void on_clean_read() final { + // From initial write of block, relative addrs are necessarily block-relative + resolve_relative_addrs(get_paddr()); + } + + virtual void resolve_relative_addrs(paddr_t base) = 0; +}; +using LBANodeRef = LBANode::LBANodeRef; + +/** + * get_lba_btree_extent + * + * Fetches node at depth of the appropriate type. 
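Every recursive step in the node implementations funnels through this helper: the parent passes its child's stored paddr plus its own paddr as the base, and gets back a typed, pinned LBANodeRef one level down. Roughly the call shape used by the internal node (a paraphrase of LBAInternalNode::lookup, not a verbatim excerpt):

  return get_lba_btree_extent(
    c,
    get_meta().depth - 1,   // children live one level below this node
    iter->get_val(),        // child paddr as stored; may be block- or record-relative
    get_paddr()             // base against which a relative child paddr is resolved
  ).safe_then([c, addr, depth](LBANodeRef child) {
    return child->lookup(c, addr, depth);
  });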
+ */ +Cache::get_extent_ertr::future<LBANodeRef> get_lba_btree_extent( + op_context_t c, ///< [in] context structure + depth_t depth, ///< [in] depth of node to fetch + paddr_t offset, ///< [in] physical addr of node + paddr_t base ///< [in] depending on user, block addr or record addr + /// in case offset is relative +); + +} diff --git a/src/crimson/os/seastore/lba_manager/btree/lba_btree_node_impl.cc b/src/crimson/os/seastore/lba_manager/btree/lba_btree_node_impl.cc new file mode 100644 index 000000000..5e400803b --- /dev/null +++ b/src/crimson/os/seastore/lba_manager/btree/lba_btree_node_impl.cc @@ -0,0 +1,701 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <sys/mman.h> +#include <string.h> + +#include <memory> +#include <string.h> + +#include "include/buffer.h" +#include "include/byteorder.h" + +#include "crimson/os/seastore/lba_manager/btree/lba_btree_node_impl.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_filestore); + } +} + +namespace crimson::os::seastore::lba_manager::btree { + +std::ostream &LBAInternalNode::print_detail(std::ostream &out) const +{ + return out << ", size=" << get_size() + << ", meta=" << get_meta(); +} + +LBAInternalNode::lookup_ret LBAInternalNode::lookup( + op_context_t c, + laddr_t addr, + depth_t depth) +{ + auto meta = get_meta(); + if (depth == get_meta().depth) { + return lookup_ret( + lookup_ertr::ready_future_marker{}, + this); + } + assert(meta.begin <= addr); + assert(meta.end > addr); + auto iter = lower_bound(addr); + return get_lba_btree_extent( + c, + meta.depth - 1, + iter->get_val(), + get_paddr()).safe_then([c, addr, depth](auto child) { + return child->lookup(c, addr, depth); + }).finally([ref=LBANodeRef(this)] {}); +} + +LBAInternalNode::lookup_range_ret LBAInternalNode::lookup_range( + op_context_t c, + laddr_t addr, + extent_len_t len) +{ + auto [begin, end] = bound(addr, addr + len); + auto result_up = std::make_unique<lba_pin_list_t>(); + auto &result = *result_up; + return crimson::do_for_each( + std::move(begin), + std::move(end), + [this, c, &result, addr, len](const auto &val) mutable { + return get_lba_btree_extent( + c, + get_meta().depth - 1, + val.get_val(), + get_paddr()).safe_then( + [c, &result, addr, len](auto extent) mutable { + return extent->lookup_range( + c, + addr, + len).safe_then( + [&result](auto pin_list) mutable { + result.splice(result.end(), pin_list, + pin_list.begin(), pin_list.end()); + }); + }); + }).safe_then([result=std::move(result_up), ref=LBANodeRef(this)] { + return lookup_range_ertr::make_ready_future<lba_pin_list_t>( + std::move(*result)); + }); +} + +LBAInternalNode::insert_ret LBAInternalNode::insert( + op_context_t c, + laddr_t laddr, + lba_map_val_t val) +{ + auto insertion_pt = get_containing_child(laddr); + return get_lba_btree_extent( + c, + get_meta().depth - 1, + insertion_pt->get_val(), + get_paddr()).safe_then( + [this, insertion_pt, c, laddr, val=std::move(val)]( + auto extent) mutable { + return extent->at_max_capacity() ? 
+ split_entry(c, laddr, insertion_pt, extent) : + insert_ertr::make_ready_future<LBANodeRef>(std::move(extent)); + }).safe_then([c, laddr, val=std::move(val)]( + LBANodeRef extent) mutable { + return extent->insert(c, laddr, val); + }); +} + +LBAInternalNode::mutate_mapping_ret LBAInternalNode::mutate_mapping( + op_context_t c, + laddr_t laddr, + mutate_func_t &&f) +{ + return mutate_mapping_internal(c, laddr, true, std::move(f)); +} + +LBAInternalNode::mutate_mapping_ret LBAInternalNode::mutate_mapping_internal( + op_context_t c, + laddr_t laddr, + bool is_root, + mutate_func_t &&f) +{ + auto mutation_pt = get_containing_child(laddr); + if (mutation_pt == end()) { + assert(0 == "impossible"); + return crimson::ct_error::enoent::make(); + } + return get_lba_btree_extent( + c, + get_meta().depth - 1, + mutation_pt->get_val(), + get_paddr() + ).safe_then([=](LBANodeRef extent) { + if (extent->at_min_capacity() && get_size() > 1) { + return merge_entry( + c, + laddr, + mutation_pt, + extent, + is_root); + } else { + return merge_ertr::make_ready_future<LBANodeRef>( + std::move(extent)); + } + }).safe_then([c, laddr, f=std::move(f)](LBANodeRef extent) mutable { + return extent->mutate_mapping_internal(c, laddr, false, std::move(f)); + }); +} + +LBAInternalNode::mutate_internal_address_ret LBAInternalNode::mutate_internal_address( + op_context_t c, + depth_t depth, + laddr_t laddr, + paddr_t paddr) +{ + if (get_meta().depth == (depth + 1)) { + if (!is_pending()) { + return c.cache.duplicate_for_write(c.trans, this)->cast<LBAInternalNode>( + )->mutate_internal_address( + c, + depth, + laddr, + paddr); + } + auto iter = get_containing_child(laddr); + if (iter->get_key() != laddr) { + return crimson::ct_error::enoent::make(); + } + + auto old_paddr = iter->get_val(); + + journal_update( + iter, + maybe_generate_relative(paddr), + maybe_get_delta_buffer()); + + return mutate_internal_address_ret( + mutate_internal_address_ertr::ready_future_marker{}, + old_paddr + ); + } else { + auto iter = get_containing_child(laddr); + return get_lba_btree_extent( + c, + get_meta().depth - 1, + iter->get_val(), + get_paddr() + ).safe_then([=](auto node) { + return node->mutate_internal_address( + c, + depth, + laddr, + paddr); + }); + } +} + +LBAInternalNode::find_hole_ret LBAInternalNode::find_hole( + op_context_t c, + laddr_t min_addr, + laddr_t max_addr, + extent_len_t len) +{ + logger().debug( + "LBAInternalNode::find_hole min={}, max={}, len={}, *this={}", + min_addr, max_addr, len, *this); + auto [begin, end] = bound(min_addr, max_addr); + return seastar::repeat_until_value( + [i=begin, e=end, c, min_addr, len, this]() mutable { + if (i == e) { + return seastar::make_ready_future<std::optional<laddr_t>>( + std::make_optional<laddr_t>(L_ADDR_NULL)); + } + return get_lba_btree_extent(c, + get_meta().depth - 1, + i->get_val(), + get_paddr()).safe_then( + [c, min_addr, len, i](auto extent) mutable { + auto lb = std::max(min_addr, i->get_key()); + auto ub = i->get_next_key_or_max(); + logger().debug("LBAInternalNode::find_hole extent {} lb {} ub {}", + *extent, lb, ub); + return extent->find_hole(c, lb, ub, len); + }).safe_then([&i](auto addr) mutable -> std::optional<laddr_t> { + if (addr == L_ADDR_NULL) { + ++i; + return {}; + } else { + return addr; + } + }, + // TODO: GCC enters a dead loop if crimson::do_until() is used + // or erroratorized future is returned + crimson::ct_error::assert_all{ "fix me - APIv6" }); + }); +} + +LBAInternalNode::scan_mappings_ret LBAInternalNode::scan_mappings( + op_context_t 
c, + laddr_t begin, + laddr_t end, + scan_mappings_func_t &f) +{ + auto [biter, eiter] = bound(begin, end); + return crimson::do_for_each( + std::move(biter), + std::move(eiter), + [=, &f](auto &viter) { + return get_lba_btree_extent( + c, + get_meta().depth - 1, + viter->get_val(), + get_paddr()).safe_then([=, &f](auto child) { + return child->scan_mappings(c, begin, end, f); + }); + }).safe_then([ref=LBANodeRef(this)]{}); +} + +LBAInternalNode::scan_mapped_space_ret LBAInternalNode::scan_mapped_space( + op_context_t c, + scan_mapped_space_func_t &f) +{ + f(get_paddr(), get_length()); + return crimson::do_for_each( + begin(), end(), + [=, &f](auto &viter) { + return get_lba_btree_extent( + c, + get_meta().depth - 1, + viter->get_val(), + get_paddr()).safe_then([=, &f](auto child) { + return child->scan_mapped_space(c, f); + }); + }).safe_then([ref=LBANodeRef(this)]{}); +} + + +void LBAInternalNode::resolve_relative_addrs(paddr_t base) +{ + for (auto i: *this) { + if (i->get_val().is_relative()) { + auto updated = base.add_relative(i->get_val()); + logger().debug( + "LBAInternalNode::resolve_relative_addrs {} -> {}", + i->get_val(), + updated); + i->set_val(updated); + } + } +} + + +LBAInternalNode::split_ret +LBAInternalNode::split_entry( + op_context_t c, + laddr_t addr, + internal_iterator_t iter, LBANodeRef entry) +{ + if (!is_pending()) { + auto mut = c.cache.duplicate_for_write( + c.trans, this)->cast<LBAInternalNode>(); + auto mut_iter = mut->iter_idx(iter->get_offset()); + return mut->split_entry(c, addr, mut_iter, entry); + } + + ceph_assert(!at_max_capacity()); + auto [left, right, pivot] = entry->make_split_children(c); + + journal_update( + iter, + maybe_generate_relative(left->get_paddr()), + maybe_get_delta_buffer()); + journal_insert( + iter + 1, + pivot, + maybe_generate_relative(right->get_paddr()), + maybe_get_delta_buffer()); + + c.cache.retire_extent(c.trans, entry); + + logger().debug( + "LBAInternalNode::split_entry *this {} entry {} into left {} right {}", + *this, + *entry, + *left, + *right); + + return split_ertr::make_ready_future<LBANodeRef>( + pivot > addr ? left : right + ); +} + +LBAInternalNode::merge_ret +LBAInternalNode::merge_entry( + op_context_t c, + laddr_t addr, + internal_iterator_t iter, + LBANodeRef entry, + bool is_root) +{ + if (!is_pending()) { + auto mut = c.cache.duplicate_for_write(c.trans, this)->cast<LBAInternalNode>(); + auto mut_iter = mut->iter_idx(iter->get_offset()); + return mut->merge_entry(c, addr, mut_iter, entry, is_root); + } + + logger().debug( + "LBAInternalNode: merge_entry: {}, {}", + *this, + *entry); + auto donor_is_left = (iter + 1) == end(); + auto donor_iter = donor_is_left ? iter - 1 : iter + 1; + return get_lba_btree_extent( + c, + get_meta().depth - 1, + donor_iter->get_val(), + get_paddr() + ).safe_then([=](auto donor) mutable { + auto [l, r] = donor_is_left ? + std::make_pair(donor, entry) : std::make_pair(entry, donor); + auto [liter, riter] = donor_is_left ? 
+ std::make_pair(donor_iter, iter) : std::make_pair(iter, donor_iter); + if (donor->at_min_capacity()) { + auto replacement = l->make_full_merge( + c, + r); + + journal_update( + liter, + maybe_generate_relative(replacement->get_paddr()), + maybe_get_delta_buffer()); + journal_remove(riter, maybe_get_delta_buffer()); + + c.cache.retire_extent(c.trans, l); + c.cache.retire_extent(c.trans, r); + + if (is_root && get_size() == 1) { + return c.cache.get_root(c.trans).safe_then([=](RootBlockRef croot) { + { + auto mut_croot = c.cache.duplicate_for_write(c.trans, croot); + croot = mut_croot->cast<RootBlock>(); + } + croot->root.lba_root_addr = begin()->get_val(); + logger().debug( + "LBAInternalNode::merge_entry: collapsing root {} to addr {}", + *this, + begin()->get_val()); + croot->root.lba_depth = get_meta().depth - 1; + c.cache.retire_extent(c.trans, this); + return merge_ertr::make_ready_future<LBANodeRef>(replacement); + }); + } else { + return merge_ertr::make_ready_future<LBANodeRef>(replacement); + } + } else { + logger().debug( + "LBAInternalEntry::merge_entry balanced l {} r {}", + *l, + *r); + auto [replacement_l, replacement_r, pivot] = + l->make_balanced( + c, + r, + !donor_is_left); + + journal_update( + liter, + maybe_generate_relative(replacement_l->get_paddr()), + maybe_get_delta_buffer()); + journal_replace( + riter, + pivot, + maybe_generate_relative(replacement_r->get_paddr()), + maybe_get_delta_buffer()); + + c.cache.retire_extent(c.trans, l); + c.cache.retire_extent(c.trans, r); + return merge_ertr::make_ready_future<LBANodeRef>( + addr >= pivot ? replacement_r : replacement_l + ); + } + }); +} + + +LBAInternalNode::internal_iterator_t +LBAInternalNode::get_containing_child(laddr_t laddr) +{ + // TODO: binary search + for (auto i = begin(); i != end(); ++i) { + if (i.contains(laddr)) + return i; + } + ceph_assert(0 == "invalid"); + return end(); +} + +std::ostream &LBALeafNode::print_detail(std::ostream &out) const +{ + return out << ", size=" << get_size() + << ", meta=" << get_meta(); +} + +LBALeafNode::lookup_range_ret LBALeafNode::lookup_range( + op_context_t c, + laddr_t addr, + extent_len_t len) +{ + logger().debug( + "LBALeafNode::lookup_range {}~{}", + addr, + len); + auto ret = lba_pin_list_t(); + auto [i, end] = get_leaf_entries(addr, len); + for (; i != end; ++i) { + auto val = i->get_val(); + auto begin = i->get_key(); + ret.emplace_back( + std::make_unique<BtreeLBAPin>( + this, + val.paddr.maybe_relative_to(get_paddr()), + lba_node_meta_t{ begin, begin + val.len, 0})); + } + return lookup_range_ertr::make_ready_future<lba_pin_list_t>( + std::move(ret)); +} + +LBALeafNode::insert_ret LBALeafNode::insert( + op_context_t c, + laddr_t laddr, + lba_map_val_t val) +{ + ceph_assert(!at_max_capacity()); + + if (!is_pending()) { + return c.cache.duplicate_for_write(c.trans, this + )->cast<LBALeafNode>()->insert(c, laddr, val); + } + + val.paddr = maybe_generate_relative(val.paddr); + logger().debug( + "LBALeafNode::insert: inserting {}~{} -> {}", + laddr, + val.len, + val.paddr); + + auto insert_pt = lower_bound(laddr); + journal_insert(insert_pt, laddr, val, maybe_get_delta_buffer()); + + logger().debug( + "LBALeafNode::insert: inserted {}~{} -> {}", + insert_pt.get_key(), + insert_pt.get_val().len, + insert_pt.get_val().paddr); + auto begin = insert_pt.get_key(); + return insert_ret( + insert_ertr::ready_future_marker{}, + std::make_unique<BtreeLBAPin>( + this, + val.paddr.maybe_relative_to(get_paddr()), + lba_node_meta_t{ begin, begin + val.len, 0})); +} + 
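A successful leaf insert, as above, hands back a pin whose logical range is exactly [laddr, laddr + len) at depth 0, with the physical address made absolute against the leaf's own paddr when it was stored relative. For an invented 4 KiB mapping (blk_paddr is hypothetical) the caller-visible result looks like:

  lba_map_val_t val{4096, blk_paddr, /* refcount */ 1, /* checksum */ 0};
  leaf->insert(c, 0x1000, val).safe_then([](LBAPinRef pin) {
    assert(pin->get_laddr()  == 0x1000);
    assert(pin->get_length() == 4096);
    // pin->get_paddr() is val.paddr, resolved via maybe_relative_to(leaf->get_paddr())
  });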
+LBALeafNode::mutate_mapping_ret LBALeafNode::mutate_mapping( + op_context_t c, + laddr_t laddr, + mutate_func_t &&f) +{ + return mutate_mapping_internal(c, laddr, true, std::move(f)); +} + +LBALeafNode::mutate_mapping_ret LBALeafNode::mutate_mapping_internal( + op_context_t c, + laddr_t laddr, + bool is_root, + mutate_func_t &&f) +{ + auto mutation_pt = find(laddr); + if (mutation_pt == end()) { + return crimson::ct_error::enoent::make(); + } + + if (!is_pending()) { + return c.cache.duplicate_for_write(c.trans, this)->cast<LBALeafNode>( + )->mutate_mapping_internal( + c, + laddr, + is_root, + std::move(f)); + } + + auto cur = mutation_pt.get_val(); + auto mutated = f(cur); + + mutated.paddr = maybe_generate_relative(mutated.paddr); + + logger().debug( + "{}: mutate addr {}: {} -> {}", + __func__, + laddr, + cur.paddr, + mutated.paddr); + + if (mutated.refcount > 0) { + journal_update(mutation_pt, mutated, maybe_get_delta_buffer()); + return mutate_mapping_ret( + mutate_mapping_ertr::ready_future_marker{}, + mutated); + } else { + journal_remove(mutation_pt, maybe_get_delta_buffer()); + return mutate_mapping_ret( + mutate_mapping_ertr::ready_future_marker{}, + mutated); + } +} + +LBALeafNode::mutate_internal_address_ret LBALeafNode::mutate_internal_address( + op_context_t c, + depth_t depth, + laddr_t laddr, + paddr_t paddr) +{ + ceph_assert(0 == "Impossible"); + return mutate_internal_address_ret( + mutate_internal_address_ertr::ready_future_marker{}, + paddr); +} + +LBALeafNode::find_hole_ret LBALeafNode::find_hole( + op_context_t c, + laddr_t min, + laddr_t max, + extent_len_t len) +{ + logger().debug( + "LBALeafNode::find_hole min={} max={}, len={}, *this={}", + min, max, len, *this); + auto [liter, uiter] = bound(min, max); + for (auto i = liter; i != uiter; ++i) { + auto ub = i->get_key(); + if (min + len <= ub) { + return find_hole_ret( + find_hole_ertr::ready_future_marker{}, + min); + } else { + min = i->get_key() + i->get_val().len; + } + } + if (min + len <= max) { + return find_hole_ret( + find_hole_ertr::ready_future_marker{}, + min); + } else { + return find_hole_ret( + find_hole_ertr::ready_future_marker{}, + L_ADDR_MAX); + } +} + +LBALeafNode::scan_mappings_ret LBALeafNode::scan_mappings( + op_context_t c, + laddr_t begin, + laddr_t end, + scan_mappings_func_t &f) +{ + auto [biter, eiter] = bound(begin, end); + for (auto i = biter; i != eiter; ++i) { + auto val = i->get_val(); + f(i->get_key(), val.paddr, val.len); + } + return scan_mappings_ertr::now(); +} + +LBALeafNode::scan_mapped_space_ret LBALeafNode::scan_mapped_space( + op_context_t c, + scan_mapped_space_func_t &f) +{ + f(get_paddr(), get_length()); + for (auto i = begin(); i != end(); ++i) { + auto val = i->get_val(); + f(val.paddr, val.len); + } + return scan_mappings_ertr::now(); +} + + +void LBALeafNode::resolve_relative_addrs(paddr_t base) +{ + for (auto i: *this) { + if (i->get_val().paddr.is_relative()) { + auto val = i->get_val(); + val.paddr = base.add_relative(val.paddr); + logger().debug( + "LBALeafNode::resolve_relative_addrs {} -> {}", + i->get_val().paddr, + val.paddr); + i->set_val(val); + } + } +} + +std::pair<LBALeafNode::internal_iterator_t, LBALeafNode::internal_iterator_t> +LBALeafNode::get_leaf_entries(laddr_t addr, extent_len_t len) +{ + return bound(addr, addr + len); +} + +Cache::get_extent_ertr::future<LBANodeRef> get_lba_btree_extent( + op_context_t c, + depth_t depth, + paddr_t offset, + paddr_t base) +{ + offset = offset.maybe_relative_to(base); + ceph_assert(depth > 0); + if (depth > 
1) { + logger().debug( + "get_lba_btree_extent: reading internal at offset {}, depth {}", + offset, + depth); + return c.cache.get_extent<LBAInternalNode>( + c.trans, + offset, + LBA_BLOCK_SIZE).safe_then([c](auto ret) { + auto meta = ret->get_meta(); + if (ret->get_size()) { + ceph_assert(meta.begin <= ret->begin()->get_key()); + ceph_assert(meta.end > (ret->end() - 1)->get_key()); + } + if (!ret->is_pending() && !ret->pin.is_linked()) { + ret->pin.set_range(meta); + c.pins.add_pin(ret->pin); + } + return LBANodeRef(ret.detach(), /* add_ref = */ false); + }); + } else { + logger().debug( + "get_lba_btree_extent: reading leaf at offset {}, depth {}", + offset, + depth); + return c.cache.get_extent<LBALeafNode>( + c.trans, + offset, + LBA_BLOCK_SIZE).safe_then([offset, c](auto ret) { + logger().debug( + "get_lba_btree_extent: read leaf at offset {} {}", + offset, + *ret); + auto meta = ret->get_meta(); + if (ret->get_size()) { + ceph_assert(meta.begin <= ret->begin()->get_key()); + ceph_assert(meta.end > (ret->end() - 1)->get_key()); + } + if (!ret->is_pending() && !ret->pin.is_linked()) { + ret->pin.set_range(meta); + c.pins.add_pin(ret->pin); + } + return LBANodeRef(ret.detach(), /* add_ref = */ false); + }); + } +} + +} diff --git a/src/crimson/os/seastore/lba_manager/btree/lba_btree_node_impl.h b/src/crimson/os/seastore/lba_manager/btree/lba_btree_node_impl.h new file mode 100644 index 000000000..230eef682 --- /dev/null +++ b/src/crimson/os/seastore/lba_manager/btree/lba_btree_node_impl.h @@ -0,0 +1,555 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <sys/mman.h> +#include <string.h> + +#include <memory> +#include <string.h> + +#include "include/buffer.h" + +#include "crimson/common/fixed_kv_node_layout.h" +#include "crimson/common/errorator.h" +#include "crimson/os/seastore/lba_manager.h" +#include "crimson/os/seastore/seastore_types.h" +#include "crimson/os/seastore/cache.h" +#include "crimson/os/seastore/cached_extent.h" +#include "crimson/os/seastore/lba_manager/btree/lba_btree_node.h" +#include "crimson/os/seastore/lba_manager/btree/btree_range_pin.h" + +namespace crimson::os::seastore::lba_manager::btree { + +constexpr size_t LBA_BLOCK_SIZE = 4096; + +/** + * lba_node_meta_le_t + * + * On disk layout for lba_node_meta_t + */ +struct lba_node_meta_le_t { + laddr_le_t begin = laddr_le_t(0); + laddr_le_t end = laddr_le_t(0); + depth_le_t depth = init_les32(0); + + lba_node_meta_le_t() = default; + lba_node_meta_le_t(const lba_node_meta_le_t &) = default; + explicit lba_node_meta_le_t(const lba_node_meta_t &val) + : begin(init_le64(val.begin)), + end(init_le64(val.end)), + depth(init_les32(val.depth)) {} + + operator lba_node_meta_t() const { + return lba_node_meta_t{ begin, end, depth }; + } +}; + + +/** + * LBAInternalNode + * + * Abstracts operations on and layout of internal nodes for the + * LBA Tree. 
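lba_node_meta_le_t above is just the fixed-endian, on-disk mirror of lba_node_meta_t, and the two conversions are meant to round-trip. A quick sanity sketch (values arbitrary):

  lba_node_meta_t meta{0, L_ADDR_MAX, 2};   // a depth-2 root-style range
  lba_node_meta_le_t le{meta};              // explicit encode to the little-endian form
  lba_node_meta_t back = le;                // implicit conversion back for in-memory use
  assert(back.begin == meta.begin && back.end == meta.end && back.depth == meta.depth);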
+ * + * Layout (4k): + * size : uint32_t[1] 4b + * (padding) : 4b + * meta : lba_node_meta_le_t[3] (1*24)b + * keys : laddr_t[255] (254*8)b + * values : paddr_t[255] (254*8)b + * = 4096 + + * TODO: make the above capacity calculation part of FixedKVNodeLayout + * TODO: the above alignment probably isn't portable without further work + */ +constexpr size_t INTERNAL_NODE_CAPACITY = 254; +struct LBAInternalNode + : LBANode, + common::FixedKVNodeLayout< + INTERNAL_NODE_CAPACITY, + lba_node_meta_t, lba_node_meta_le_t, + laddr_t, laddr_le_t, + paddr_t, paddr_le_t> { + using internal_iterator_t = const_iterator; + template <typename... T> + LBAInternalNode(T&&... t) : + LBANode(std::forward<T>(t)...), + FixedKVNodeLayout(get_bptr().c_str()) {} + + static constexpr extent_types_t type = extent_types_t::LADDR_INTERNAL; + + lba_node_meta_t get_node_meta() const final { return get_meta(); } + + CachedExtentRef duplicate_for_write() final { + assert(delta_buffer.empty()); + return CachedExtentRef(new LBAInternalNode(*this)); + }; + + delta_buffer_t delta_buffer; + delta_buffer_t *maybe_get_delta_buffer() { + return is_mutation_pending() ? &delta_buffer : nullptr; + } + + lookup_ret lookup(op_context_t c, laddr_t addr, depth_t depth) final; + + lookup_range_ret lookup_range( + op_context_t c, + laddr_t addr, + extent_len_t len) final; + + insert_ret insert( + op_context_t c, + laddr_t laddr, + lba_map_val_t val) final; + + mutate_mapping_ret mutate_mapping( + op_context_t c, + laddr_t laddr, + mutate_func_t &&f) final; + mutate_mapping_ret mutate_mapping_internal( + op_context_t c, + laddr_t laddr, + bool is_root, + mutate_func_t &&f) final; + + mutate_internal_address_ret mutate_internal_address( + op_context_t c, + depth_t depth, + laddr_t laddr, + paddr_t paddr) final; + + find_hole_ret find_hole( + op_context_t c, + laddr_t min, + laddr_t max, + extent_len_t len) final; + + scan_mappings_ret scan_mappings( + op_context_t c, + laddr_t begin, + laddr_t end, + scan_mappings_func_t &f) final; + + scan_mapped_space_ret scan_mapped_space( + op_context_t c, + scan_mapped_space_func_t &f) final; + + std::tuple<LBANodeRef, LBANodeRef, laddr_t> + make_split_children(op_context_t c) final { + auto left = c.cache.alloc_new_extent<LBAInternalNode>( + c.trans, LBA_BLOCK_SIZE); + auto right = c.cache.alloc_new_extent<LBAInternalNode>( + c.trans, LBA_BLOCK_SIZE); + auto pivot = split_into(*left, *right); + left->pin.set_range(left->get_meta()); + right->pin.set_range(right->get_meta()); + return std::make_tuple( + left, + right, + pivot); + } + + LBANodeRef make_full_merge( + op_context_t c, + LBANodeRef &right) final { + auto replacement = c.cache.alloc_new_extent<LBAInternalNode>( + c.trans, LBA_BLOCK_SIZE); + replacement->merge_from(*this, *right->cast<LBAInternalNode>()); + replacement->pin.set_range(replacement->get_meta()); + return replacement; + } + + std::tuple<LBANodeRef, LBANodeRef, laddr_t> + make_balanced( + op_context_t c, + LBANodeRef &_right, + bool prefer_left) final { + ceph_assert(_right->get_type() == type); + auto &right = *_right->cast<LBAInternalNode>(); + auto replacement_left = c.cache.alloc_new_extent<LBAInternalNode>( + c.trans, LBA_BLOCK_SIZE); + auto replacement_right = c.cache.alloc_new_extent<LBAInternalNode>( + c.trans, LBA_BLOCK_SIZE); + + auto pivot = balance_into_new_nodes( + *this, + right, + prefer_left, + *replacement_left, + *replacement_right); + + replacement_left->pin.set_range(replacement_left->get_meta()); + 
replacement_right->pin.set_range(replacement_right->get_meta()); + return std::make_tuple( + replacement_left, + replacement_right, + pivot); + } + + /** + * Internal relative addresses on read or in memory prior to commit + * are either record or block relative depending on whether this + * physical node is is_initial_pending() or just is_pending(). + * + * User passes appropriate base depending on lifecycle and + * resolve_relative_addrs fixes up relative internal references + * based on base. + */ + void resolve_relative_addrs(paddr_t base) final; + void node_resolve_vals(iterator from, iterator to) const final { + if (is_initial_pending()) { + for (auto i = from; i != to; ++i) { + if (i->get_val().is_relative()) { + assert(i->get_val().is_block_relative()); + i->set_val(get_paddr().add_relative(i->get_val())); + } + } + } + } + void node_unresolve_vals(iterator from, iterator to) const final { + if (is_initial_pending()) { + for (auto i = from; i != to; ++i) { + if (i->get_val().is_relative()) { + assert(i->get_val().is_record_relative()); + i->set_val(i->get_val() - get_paddr()); + } + } + } + } + + extent_types_t get_type() const final { + return type; + } + + std::ostream &print_detail(std::ostream &out) const final; + + ceph::bufferlist get_delta() final { + assert(!delta_buffer.empty()); + ceph::buffer::ptr bptr(delta_buffer.get_bytes()); + delta_buffer.copy_out(bptr.c_str(), bptr.length()); + ceph::bufferlist bl; + bl.push_back(bptr); + return bl; + } + + void apply_delta_and_adjust_crc( + paddr_t base, const ceph::bufferlist &_bl) final { + assert(_bl.length()); + ceph::bufferlist bl = _bl; + bl.rebuild(); + delta_buffer_t buffer; + buffer.copy_in(bl.front().c_str(), bl.front().length()); + buffer.replay(*this); + set_last_committed_crc(get_crc32c()); + resolve_relative_addrs(base); + } + + bool at_max_capacity() const final { + return get_size() == get_capacity(); + } + + bool at_min_capacity() const { + return get_size() == (get_capacity() / 2); + } + + /// returns iterators containing [l, r) + std::pair<internal_iterator_t, internal_iterator_t> bound( + laddr_t l, laddr_t r) { + // TODO: inefficient + auto retl = begin(); + for (; retl != end(); ++retl) { + if (retl->get_next_key_or_max() > l) + break; + } + auto retr = retl; + for (; retr != end(); ++retr) { + if (retr->get_key() >= r) + break; + } + return std::make_pair(retl, retr); + } + + using split_ertr = crimson::errorator< + crimson::ct_error::input_output_error + >; + using split_ret = split_ertr::future<LBANodeRef>; + split_ret split_entry( + op_context_t c, + laddr_t addr, + internal_iterator_t, + LBANodeRef entry); + + using merge_ertr = crimson::errorator< + crimson::ct_error::input_output_error + >; + using merge_ret = merge_ertr::future<LBANodeRef>; + merge_ret merge_entry( + op_context_t c, + laddr_t addr, + internal_iterator_t, + LBANodeRef entry, + bool is_root); + + /// returns iterator for subtree containing laddr + internal_iterator_t get_containing_child(laddr_t laddr); +}; + +/** + * LBALeafNode + * + * Abstracts operations on and layout of leaf nodes for the + * LBA Tree. 
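Both the internal node above and the leaf node below expose the same two thresholds, and the recursive code in the .cc keys off them: a full child is split before an insert descends into it, and a half-empty child is merged or rebalanced before a removal descends. Paraphrasing those decision points (the real code chains them through errorated futures):

  // LBAInternalNode::insert: split a full child first
  if (child->at_max_capacity())                     // get_size() == get_capacity()
    child = split_entry(c, laddr, insertion_pt, child);
  // LBAInternalNode::mutate_mapping_internal: merge or balance an under-filled child first
  if (child->at_min_capacity() && get_size() > 1)   // get_size() == get_capacity() / 2
    child = merge_entry(c, laddr, mutation_pt, child, is_root);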
+ * + * Layout (4k): + * size : uint32_t[1] 4b + * (padding) : 4b + * meta : lba_node_meta_le_t[3] (1*24)b + * keys : laddr_t[170] (145*8)b + * values : lba_map_val_t[170] (145*20)b + * = 4092 + * + * TODO: update FixedKVNodeLayout to handle the above calculation + * TODO: the above alignment probably isn't portable without further work + */ +constexpr size_t LEAF_NODE_CAPACITY = 145; + +/** + * lba_map_val_le_t + * + * On disk layout for lba_map_val_t. + */ +struct lba_map_val_le_t { + extent_len_le_t len = init_extent_len_le_t(0); + paddr_le_t paddr; + ceph_le32 refcount = init_le32(0); + ceph_le32 checksum = init_le32(0); + + lba_map_val_le_t() = default; + lba_map_val_le_t(const lba_map_val_le_t &) = default; + explicit lba_map_val_le_t(const lba_map_val_t &val) + : len(init_extent_len_le_t(val.len)), + paddr(paddr_le_t(val.paddr)), + refcount(init_le32(val.refcount)), + checksum(init_le32(val.checksum)) {} + + operator lba_map_val_t() const { + return lba_map_val_t{ len, paddr, refcount, checksum }; + } +}; + +struct LBALeafNode + : LBANode, + common::FixedKVNodeLayout< + LEAF_NODE_CAPACITY, + lba_node_meta_t, lba_node_meta_le_t, + laddr_t, laddr_le_t, + lba_map_val_t, lba_map_val_le_t> { + using internal_iterator_t = const_iterator; + template <typename... T> + LBALeafNode(T&&... t) : + LBANode(std::forward<T>(t)...), + FixedKVNodeLayout(get_bptr().c_str()) {} + + static constexpr extent_types_t type = extent_types_t::LADDR_LEAF; + + lba_node_meta_t get_node_meta() const final { return get_meta(); } + + CachedExtentRef duplicate_for_write() final { + assert(delta_buffer.empty()); + return CachedExtentRef(new LBALeafNode(*this)); + }; + + delta_buffer_t delta_buffer; + delta_buffer_t *maybe_get_delta_buffer() { + return is_mutation_pending() ? 
&delta_buffer : nullptr; + } + + lookup_ret lookup(op_context_t c, laddr_t addr, depth_t depth) final + { + return lookup_ret( + lookup_ertr::ready_future_marker{}, + this); + } + + lookup_range_ret lookup_range( + op_context_t c, + laddr_t addr, + extent_len_t len) final; + + insert_ret insert( + op_context_t c, + laddr_t laddr, + lba_map_val_t val) final; + + mutate_mapping_ret mutate_mapping( + op_context_t c, + laddr_t laddr, + mutate_func_t &&f) final; + mutate_mapping_ret mutate_mapping_internal( + op_context_t c, + laddr_t laddr, + bool is_root, + mutate_func_t &&f) final; + + mutate_internal_address_ret mutate_internal_address( + op_context_t c, + depth_t depth, + laddr_t laddr, + paddr_t paddr) final; + + find_hole_ret find_hole( + op_context_t c, + laddr_t min, + laddr_t max, + extent_len_t len) final; + + scan_mappings_ret scan_mappings( + op_context_t c, + laddr_t begin, + laddr_t end, + scan_mappings_func_t &f) final; + + scan_mapped_space_ret scan_mapped_space( + op_context_t c, + scan_mapped_space_func_t &f) final; + + std::tuple<LBANodeRef, LBANodeRef, laddr_t> + make_split_children(op_context_t c) final { + auto left = c.cache.alloc_new_extent<LBALeafNode>( + c.trans, LBA_BLOCK_SIZE); + auto right = c.cache.alloc_new_extent<LBALeafNode>( + c.trans, LBA_BLOCK_SIZE); + auto pivot = split_into(*left, *right); + left->pin.set_range(left->get_meta()); + right->pin.set_range(right->get_meta()); + return std::make_tuple( + left, + right, + pivot); + } + + LBANodeRef make_full_merge( + op_context_t c, + LBANodeRef &right) final { + auto replacement = c.cache.alloc_new_extent<LBALeafNode>( + c.trans, LBA_BLOCK_SIZE); + replacement->merge_from(*this, *right->cast<LBALeafNode>()); + replacement->pin.set_range(replacement->get_meta()); + return replacement; + } + + std::tuple<LBANodeRef, LBANodeRef, laddr_t> + make_balanced( + op_context_t c, + LBANodeRef &_right, + bool prefer_left) final { + ceph_assert(_right->get_type() == type); + auto &right = *_right->cast<LBALeafNode>(); + auto replacement_left = c.cache.alloc_new_extent<LBALeafNode>( + c.trans, LBA_BLOCK_SIZE); + auto replacement_right = c.cache.alloc_new_extent<LBALeafNode>( + c.trans, LBA_BLOCK_SIZE); + + auto pivot = balance_into_new_nodes( + *this, + right, + prefer_left, + *replacement_left, + *replacement_right); + + replacement_left->pin.set_range(replacement_left->get_meta()); + replacement_right->pin.set_range(replacement_right->get_meta()); + return std::make_tuple( + replacement_left, + replacement_right, + pivot); + } + + // See LBAInternalNode, same concept + void resolve_relative_addrs(paddr_t base) final; + void node_resolve_vals(iterator from, iterator to) const final { + if (is_initial_pending()) { + for (auto i = from; i != to; ++i) { + auto val = i->get_val(); + if (val.paddr.is_relative()) { + assert(val.paddr.is_block_relative()); + val.paddr = get_paddr().add_relative(val.paddr); + i->set_val(val); + } + } + } + } + void node_unresolve_vals(iterator from, iterator to) const final { + if (is_initial_pending()) { + for (auto i = from; i != to; ++i) { + auto val = i->get_val(); + if (val.paddr.is_relative()) { + auto val = i->get_val(); + assert(val.paddr.is_record_relative()); + val.paddr = val.paddr - get_paddr(); + i->set_val(val); + } + } + } + } + + ceph::bufferlist get_delta() final { + assert(!delta_buffer.empty()); + ceph::buffer::ptr bptr(delta_buffer.get_bytes()); + delta_buffer.copy_out(bptr.c_str(), bptr.length()); + ceph::bufferlist bl; + bl.push_back(bptr); + return bl; + } + + void 
apply_delta_and_adjust_crc( + paddr_t base, const ceph::bufferlist &_bl) final { + assert(_bl.length()); + ceph::bufferlist bl = _bl; + bl.rebuild(); + delta_buffer_t buffer; + buffer.copy_in(bl.front().c_str(), bl.front().length()); + buffer.replay(*this); + set_last_committed_crc(get_crc32c()); + resolve_relative_addrs(base); + } + + extent_types_t get_type() const final { + return type; + } + + std::ostream &print_detail(std::ostream &out) const final; + + bool at_max_capacity() const final { + return get_size() == get_capacity(); + } + + bool at_min_capacity() const final { + return get_size() == (get_capacity() / 2); + } + + /// returns iterators <lb, ub> containing addresses [l, r) + std::pair<internal_iterator_t, internal_iterator_t> bound( + laddr_t l, laddr_t r) { + // TODO: inefficient + auto retl = begin(); + for (; retl != end(); ++retl) { + if (retl->get_key() >= l || (retl->get_key() + retl->get_val().len) > l) + break; + } + auto retr = retl; + for (; retr != end(); ++retr) { + if (retr->get_key() >= r) + break; + } + return std::make_pair(retl, retr); + } + + std::pair<internal_iterator_t, internal_iterator_t> + get_leaf_entries(laddr_t addr, extent_len_t len); +}; +using LBALeafNodeRef = TCachedExtentRef<LBALeafNode>; + +} diff --git a/src/crimson/os/seastore/onode.cc b/src/crimson/os/seastore/onode.cc new file mode 100644 index 000000000..a8b925b70 --- /dev/null +++ b/src/crimson/os/seastore/onode.cc @@ -0,0 +1,44 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "onode.h" +#include "include/encoding.h" + +namespace crimson::os::seastore { + +size_t Onode::size() const +{ + return ceph::encoded_sizeof(*this); +} + +void Onode::encode(void* buffer, size_t len) +{ + struct [[gnu::packed]] encoded_t { + uint8_t struct_v; + uint8_t struct_compat; + uint32_t struct_len; + uint32_t len; + char data[]; + }; + auto p = reinterpret_cast<encoded_t*>(buffer); + assert(std::numeric_limits<uint16_t>::max() >= size()); + assert(len >= size()); + p->struct_v = 1; + p->struct_compat = 1; + p->struct_len = sizeof(encoded_t) + payload.size(); + p->len = payload.size(); + std::memcpy(p->data, payload.data(), payload.size()); +} + +bool operator==(const Onode& lhs, const Onode& rhs) +{ + return lhs.get() == rhs.get(); +} + +std::ostream& operator<<(std::ostream &out, const Onode &rhs) +{ + return out << rhs.get(); +} + +} + diff --git a/src/crimson/os/seastore/onode.h b/src/crimson/os/seastore/onode.h new file mode 100644 index 000000000..4d7783028 --- /dev/null +++ b/src/crimson/os/seastore/onode.h @@ -0,0 +1,48 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <iostream> +#include <limits> + +#include <boost/intrusive_ptr.hpp> +#include <boost/smart_ptr/intrusive_ref_counter.hpp> + +#include "include/buffer.h" +#include "include/denc.h" + +namespace crimson::os::seastore { + +// in-memory onode, in addition to the stuff that should be persisted to disk, +// it may contain intrusive hooks for LRU, rw locks etc +class Onode : public boost::intrusive_ref_counter< + Onode, + boost::thread_unsafe_counter> +{ +public: + Onode(std::string_view s) + : payload{s} + {} + size_t size() const; + const std::string& get() const { + return payload; + } + void encode(void* buffer, size_t len); + DENC(Onode, v, p) { + DENC_START(1, 1, p); + denc(v.payload, p); + DENC_FINISH(p); + } + +private: + // dummy payload + std::string payload; +}; + +bool operator==(const 
Onode& lhs, const Onode& rhs); +std::ostream& operator<<(std::ostream &out, const Onode &rhs); +using OnodeRef = boost::intrusive_ptr<Onode>; +} + +WRITE_CLASS_DENC(crimson::os::seastore::Onode) diff --git a/src/crimson/os/seastore/onode_manager.h b/src/crimson/os/seastore/onode_manager.h new file mode 100644 index 000000000..0a03b7fdf --- /dev/null +++ b/src/crimson/os/seastore/onode_manager.h @@ -0,0 +1,57 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <iostream> + +#include <boost/intrusive_ptr.hpp> +#include <boost/smart_ptr/intrusive_ref_counter.hpp> +#include <seastar/core/future.hh> + +#include "include/buffer_fwd.h" +#include "include/ceph_assert.h" +#include "common/hobject.h" + +#include "crimson/os/seastore/onode.h" +#include "crimson/os/seastore/seastore_types.h" +#include "crimson/os/seastore/transaction_manager.h" +#include "crimson/osd/exceptions.h" + +namespace crimson::os::seastore { + +class OnodeManager { +public: + using open_ertr = crimson::errorator< + crimson::ct_error::input_output_error>; + virtual open_ertr::future<OnodeRef> get_or_create_onode( + Transaction &trans, + const ghobject_t &hoid) { + return open_ertr::make_ready_future<OnodeRef>(); + } + virtual open_ertr::future<std::vector<OnodeRef>> get_or_create_onodes( + Transaction &trans, + const std::vector<ghobject_t> &hoids) { + return open_ertr::make_ready_future<std::vector<OnodeRef>>(); + } + + using write_ertr= crimson::errorator< + crimson::ct_error::input_output_error>; + virtual write_ertr::future<> write_dirty( + Transaction &trans, + const std::vector<OnodeRef> &onodes) { + return write_ertr::now(); + } + virtual ~OnodeManager() {} +}; +using OnodeManagerRef = std::unique_ptr<OnodeManager>; + +namespace onode_manager { + +OnodeManagerRef create_ephemeral() { + return OnodeManagerRef(); +} + +} + +} diff --git a/src/crimson/os/seastore/onode_manager/simple-fltree/onode_block.cc b/src/crimson/os/seastore/onode_manager/simple-fltree/onode_block.cc new file mode 100644 index 000000000..b05ea76a3 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/simple-fltree/onode_block.cc @@ -0,0 +1,71 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "onode_block.h" + +namespace crimson::os::seastore { + +ceph::bufferlist OnodeBlock::get_delta() +{ + bufferlist bl; + assert(deltas.size() <= std::numeric_limits<uint8_t>::max()); + uint8_t n_deltas = deltas.size(); + ceph::encode(n_deltas, bl); + for (auto& delta : deltas) { + delta->encode(bl); + } + return bl; +} + +void OnodeBlock::logical_on_delta_write() +{ + // journal submitted to disk, now update the memory + apply_pending_changes(true); +} + +void OnodeBlock::apply_delta(const ceph::bufferlist &bl) +{ + assert(deltas.empty()); + + auto p = bl.cbegin(); + uint8_t n_deltas = 0; + ceph::decode(n_deltas, p); + for (uint8_t i = 0; i < n_deltas; i++) { + delta_t delta; + delta.decode(p); + mutate(std::move(delta)); + } + apply_pending_changes(true); +} + +void OnodeBlock::mutate(delta_t&& d) +{ + if (is_initial_pending()) { + char* const p = get_bptr().c_str(); + mutate_func(p, d); + } + deltas.push_back(std::make_unique<delta_t>(std::move(d))); +} + +void OnodeBlock::apply_pending_changes(bool do_cleanup) +{ + if (!is_mutation_pending()) { + return; + } + if (share_buffer) { + // do a deep copy so i can change my own copy + get_bptr() = ceph::bufferptr{get_bptr().c_str(), + get_bptr().length()}; + share_buffer = 
false; + } + assert(mutate_func); + char* const p = get_bptr().c_str(); + for (auto& delta : deltas) { + mutate_func(p, *delta); + if (do_cleanup) { + delta.reset(); + } + } +} + +} diff --git a/src/crimson/os/seastore/onode_manager/simple-fltree/onode_block.h b/src/crimson/os/seastore/onode_manager/simple-fltree/onode_block.h new file mode 100644 index 000000000..0025d9847 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/simple-fltree/onode_block.h @@ -0,0 +1,65 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <cstdint> +#include <boost/container/small_vector.hpp> + +#include "crimson/os/seastore/transaction_manager.h" +#include "onode_delta.h" + +namespace crimson::os::seastore { + +// TODO s/CachedExtent/LogicalCachedExtent/ +struct OnodeBlock final : LogicalCachedExtent { + using Ref = TCachedExtentRef<OnodeBlock>; + + template <typename... T> + OnodeBlock(T&&... t) : LogicalCachedExtent(std::forward<T>(t)...) {} + OnodeBlock(OnodeBlock&& block) = delete; + OnodeBlock(const OnodeBlock& block, CachedExtent::share_buffer_t tag) noexcept + : LogicalCachedExtent{block, tag}, + share_buffer{true} + {} + + CachedExtentRef duplicate_for_write() final { + return new OnodeBlock{*this, CachedExtent::share_buffer_t{}}; + } + + // could materialize the pending changes to the underlying buffer here, + // but since we write the change to the buffer immediately, let skip + // this for now. + void prepare_write() final {} + + // queries + static constexpr extent_types_t TYPE = extent_types_t::ONODE_BLOCK; + extent_types_t get_type() const final { + return TYPE; + } + + // have to stash all the changes before on_delta_write() is called, + // otherwise we could pollute the extent with pending mutations + // before the transaction carrying these mutations is committed to + // disk + ceph::bufferlist get_delta() final; + void logical_on_delta_write() final; + void apply_delta(const ceph::bufferlist &bl) final; + + void sync() { + apply_pending_changes(false); + } + void mutate(delta_t&& d); + using mutate_func_t = std::function<void (char*, const delta_t&)>; + void set_delta_applier(mutate_func_t&& func) { + mutate_func = std::move(func); + } +private: + // before looking at the extent, we need to make sure the content is up to date + void apply_pending_changes(bool do_cleanup); + // assuming we don't stash too many deltas to a single block + // otherwise a fullwrite op is necessary + boost::container::small_vector<std::unique_ptr<delta_t>, 2> deltas; + mutate_func_t mutate_func; + bool share_buffer = false; +}; + +} diff --git a/src/crimson/os/seastore/onode_manager/simple-fltree/onode_delta.cc b/src/crimson/os/seastore/onode_manager/simple-fltree/onode_delta.cc new file mode 100644 index 000000000..869685d45 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/simple-fltree/onode_delta.cc @@ -0,0 +1,188 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#include "onode_delta.h" + +delta_t::delta_t(delta_t&& delta) +{ + assert(op == op_t::nop); + op = delta.op; + n = delta.n; + oid = std::move(delta.oid); + onode = std::move(delta.onode); + keys = std::move(delta.keys); + cells = std::move(delta.cells); + delta.op = op_t::nop; +} + +delta_t& delta_t::operator=(delta_t&& delta) +{ + assert(op == op_t::nop); + op = delta.op; + n = delta.n; + oid = std::move(delta.oid); + onode = std::move(delta.onode); + keys = std::move(delta.keys); + cells = std::move(delta.cells); + 
delta.op = op_t::nop; + return *this; +} + +delta_t delta_t::nop() +{ + return delta_t{op_t::nop}; +} + +delta_t delta_t::insert_onode(unsigned slot, const ghobject_t& oid, OnodeRef onode) +{ + delta_t delta{op_t::insert_onode}; + delta.n = slot; + delta.oid = oid; + delta.onode = onode; + return delta; +} + +delta_t delta_t::update_onode(unsigned slot, const ghobject_t& oid, OnodeRef onode) +{ + delta_t delta{op_t::update_onode}; + delta.n = slot; + delta.oid = oid; + delta.onode = onode; + return delta; +} + +delta_t delta_t::insert_child(unsigned slot, + const ghobject_t& oid, + crimson::os::seastore::laddr_t addr) +{ + delta_t delta{op_t::insert_child}; + delta.n = slot; + delta.oid = oid; + delta.addr = addr; + return delta; +} + +delta_t delta_t::update_key(unsigned slot, const ghobject_t& oid) +{ + delta_t delta{op_t::update_key}; + delta.n = slot; + delta.oid = oid; + return delta; +} + +delta_t delta_t::shift_left(unsigned n) +{ + delta_t delta{op_t::shift_left}; + delta.n = n; + return delta; +} + +delta_t delta_t::trim_right(unsigned n) +{ + delta_t delta{op_t::trim_right}; + delta.n = n; + return delta; +} + +delta_t delta_t::insert_front(ceph::buffer::ptr keys, + ceph::buffer::ptr cells) +{ + delta_t delta{op_t::insert_front}; + delta.keys = std::move(keys); + delta.cells = std::move(cells); + return delta; +} + +delta_t delta_t::insert_back(ceph::buffer::ptr keys, + ceph::buffer::ptr cells) +{ + delta_t delta{op_t::insert_back}; + delta.keys = std::move(keys); + delta.cells = std::move(cells); + return delta; +} + +delta_t delta_t::remove_from(unsigned slot) +{ + delta_t delta{op_t::remove_from}; + delta.n = slot; + return delta; +} + +void delta_t::encode(ceph::bufferlist& bl) +{ + using ceph::encode; + switch (op) { + case op_t::insert_onode: + [[fallthrough]]; + case op_t::update_onode: + // the slot # is not encoded, because we can alway figure it out + // when we have to replay the delta by looking the oid up in the + // node block + encode(oid, bl); + encode(*onode, bl); + break; + case op_t::insert_child: + encode(oid, bl); + encode(addr, bl); + case op_t::update_key: + encode(n, bl); + encode(oid, bl); + break; + case op_t::shift_left: + encode(n, bl); + break; + case op_t::trim_right: + encode(n, bl); + break; + case op_t::insert_front: + [[fallthrough]]; + case op_t::insert_back: + encode(n, bl); + encode(keys, bl); + encode(cells, bl); + break; + case op_t::remove_from: + encode(n, bl); + break; + default: + assert(0 == "unknown onode op"); + } +} + +void delta_t::decode(ceph::bufferlist::const_iterator& p) { + using ceph::decode; + decode(op, p); + switch (op) { + case op_t::insert_onode: + [[fallthrough]]; + case op_t::update_onode: + decode(oid, p); + decode(*onode, p); + break; + case op_t::insert_child: + [[fallthrough]]; + case op_t::update_key: + decode(n, p); + decode(oid, p); + break; + case op_t::shift_left: + decode(n, p); + break; + case op_t::trim_right: + decode(n, p); + break; + case op_t::insert_front: + [[fallthrough]]; + case op_t::insert_back: + decode(n, p); + decode(keys, p); + decode(cells, p); + break; + case op_t::remove_from: + decode(n, p); + break; + default: + assert(0 == "unknown onode op"); + } +} diff --git a/src/crimson/os/seastore/onode_manager/simple-fltree/onode_delta.h b/src/crimson/os/seastore/onode_manager/simple-fltree/onode_delta.h new file mode 100644 index 000000000..3e7e7315e --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/simple-fltree/onode_delta.h @@ -0,0 +1,70 @@ +// -*- mode:C++; tab-width:8; 
c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <cstdint> + +#include "common/hobject.h" +#include "include/buffer_fwd.h" + +#include "crimson/os/seastore/onode.h" +#include "crimson/os/seastore/seastore_types.h" + +using crimson::os::seastore::OnodeRef; + +struct delta_t { + enum class op_t : uint8_t { + nop, + insert_onode, + update_onode, + insert_child, + update_key, + shift_left, + trim_right, + insert_front, + insert_back, + remove_from, + // finer grained op? + // - changing the embedded extent map of given oid + // - mutating the embedded xattrs of given oid + } op = op_t::nop; + + unsigned n = 0; + ghobject_t oid; + crimson::os::seastore::laddr_t addr = 0; + OnodeRef onode; + ceph::bufferptr keys; + ceph::bufferptr cells; + + delta_t() = default; + delta_t(op_t op) + : op{op} + {} + delta_t(delta_t&& delta); + delta_t& operator=(delta_t&& delta); + + static delta_t nop(); + static delta_t insert_onode(unsigned slot, const ghobject_t& oid, OnodeRef onode); + static delta_t update_onode(unsigned slot, const ghobject_t& oid, OnodeRef onode); + static delta_t insert_child(unsigned slot, const ghobject_t& oid, crimson::os::seastore::laddr_t addr); + static delta_t update_key(unsigned slot, const ghobject_t& oid); + static delta_t shift_left(unsigned n); + static delta_t trim_right(unsigned n); + static delta_t insert_front(ceph::buffer::ptr keys, + ceph::buffer::ptr cells); + static delta_t insert_back(ceph::buffer::ptr keys, + ceph::buffer::ptr cells); + static delta_t remove_from(unsigned slot); + + // shortcuts + static delta_t insert_item(unsigned slot, const ghobject_t& oid, OnodeRef onode) { + return insert_onode(slot, oid, onode); + } + static delta_t insert_item(unsigned slot, const ghobject_t& oid, crimson::os::seastore::laddr_t addr) { + return insert_child(slot, oid, addr); + } + + void encode(ceph::bufferlist& bl); + void decode(ceph::bufferlist::const_iterator& p); +}; diff --git a/src/crimson/os/seastore/onode_manager/simple-fltree/onode_node.cc b/src/crimson/os/seastore/onode_manager/simple-fltree/onode_node.cc new file mode 100644 index 000000000..fdcaa2fcb --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/simple-fltree/onode_node.cc @@ -0,0 +1,567 @@ +#include "onode_node.h" + +template<size_t BlockSize, int N, ntype_t NodeType> +auto node_t<BlockSize, N, NodeType>::key_at(unsigned slot) const + -> std::pair<const key_prefix_t&, const key_suffix_t&> +{ + auto& key = keys[slot]; + if constexpr (item_in_key) { + return {key, key_suffix_t{}}; + } else { + auto p = from_end(key.offset); + return {key, *reinterpret_cast<const key_suffix_t*>(p)}; + } +} + +// update an existing oid with the specified item +template<size_t BlockSize, int N, ntype_t NodeType> +ghobject_t +node_t<BlockSize, N, NodeType>::get_oid_at(unsigned slot, + const ghobject_t& oid) const +{ + auto [prefix, suffix] = key_at(slot); + ghobject_t updated = oid; + prefix.update_oid(updated); + suffix.update_oid(updated); + return updated; +} + +template<size_t BlockSize, int N, ntype_t NodeType> +auto node_t<BlockSize, N, NodeType>::item_at(const key_prefix_t& key) const + -> const_item_t +{ + if constexpr (item_in_key) { + return key.child_addr; + } else { + assert(key.offset < BlockSize); + auto p = from_end(key.offset); + auto partial_key = reinterpret_cast<const key_suffix_t*>(p); + p += size_of(*partial_key); + return *reinterpret_cast<const item_t*>(p); + } +} + +template<size_t BlockSize, int N, ntype_t NodeType> +void node_t<BlockSize, N, 
NodeType>::dump(std::ostream& os) const +{ + for (uint16_t i = 0; i < count; i++) { + const auto& [prefix, suffix] = key_at(i); + os << " [" << i << '/' << count - 1 << "]\n" + << " key1 = (" << prefix << ")\n" + << " key2 = (" << suffix << ")\n"; + const auto& item = item_at(prefix); + if (_is_leaf()) { + os << " item = " << item << "\n"; + } else { + os << " child = " << std::hex << item << std::dec << "\n"; + } + } +} + +template<size_t BlockSize, int N, ntype_t NodeType> +char* node_t<BlockSize, N, NodeType>::from_end(uint16_t offset) +{ + auto end = reinterpret_cast<char*>(this) + BlockSize; + return end - static_cast<int>(offset); +} + +template<size_t BlockSize, int N, ntype_t NodeType> +const char* node_t<BlockSize, N, NodeType>::from_end(uint16_t offset) const +{ + auto end = reinterpret_cast<const char*>(this) + BlockSize; + return end - static_cast<int>(offset); +} + +template<size_t BlockSize, int N, ntype_t NodeType> +uint16_t node_t<BlockSize, N, NodeType>::used_space() const +{ + if constexpr (item_in_key) { + return count * sizeof(key_prefix_t); + } else { + if (count) { + return keys[count - 1].offset + count * sizeof(key_prefix_t); + } else { + return 0; + } + } +} + +template<size_t BlockSize, int N, ntype_t NodeType> +uint16_t node_t<BlockSize, N, NodeType>::capacity() +{ + auto p = reinterpret_cast<node_t*>(0); + return BlockSize - (reinterpret_cast<char*>(p->keys) - + reinterpret_cast<char*>(p)); +} + +// TODO: if it's allowed to update 2 siblings at the same time, we can have +// B* tree +template<size_t BlockSize, int N, ntype_t NodeType> +constexpr uint16_t node_t<BlockSize, N, NodeType>::min_size() +{ + return capacity() / 2; +} + +template<size_t BlockSize, int N, ntype_t NodeType> +constexpr std::pair<int16_t, int16_t> +node_t<BlockSize, N, NodeType>::bytes_to_add(uint16_t size) +{ + assert(size < min_size()); + return {min_size() - size, capacity() - size}; +} + +template<size_t BlockSize, int N, ntype_t NodeType> +constexpr std::pair<int16_t, int16_t> +node_t<BlockSize, N, NodeType>::bytes_to_remove(uint16_t size) +{ + assert(size > capacity()); + return {size - capacity(), size - min_size()}; +} + +template<size_t BlockSize, int N, ntype_t NodeType> +size_state_t node_t<BlockSize, N, NodeType>::size_state(uint16_t size) const +{ + if (size > capacity()) { + return size_state_t::overflow; + } else if (size < capacity() / 2) { + return size_state_t::underflow; + } else { + return size_state_t::okay; + } +} + +template<size_t BlockSize, int N, ntype_t NodeType> +bool node_t<BlockSize, N, NodeType>::is_underflow(uint16_t size) const +{ + switch (size_state(size)) { + case size_state_t::underflow: + return true; + case size_state_t::okay: + return false; + default: + assert(0); + return false; + } +} + +template<size_t BlockSize, int N, ntype_t NodeType> +int16_t node_t<BlockSize, N, NodeType>::size_with_key(unsigned slot, + const ghobject_t& oid) const +{ + if constexpr (item_in_key) { + return capacity(); + } else { + // the size of fixed key does not change + [[maybe_unused]] const auto& [prefix, suffix] = key_at(slot); + return capacity() + key_suffix_t::size_from(oid) - suffix.size(); + } +} + +template<size_t BlockSize, int N, ntype_t NodeType> +ordering_t node_t<BlockSize, N, NodeType>::compare_with_slot(unsigned slot, + const ghobject_t& oid) const +{ + const auto& [prefix, suffix] = key_at(slot); + if (auto result = prefix.compare(oid); result != ordering_t::equivalent) { + return result; + } else { + return suffix.compare(oid); + } +} + +/// return the 
slot number of the first slot that is greater or equal to +/// key +template<size_t BlockSize, int N, ntype_t NodeType> +std::pair<unsigned, bool> node_t<BlockSize, N, NodeType>::lower_bound(const ghobject_t& oid) const +{ + unsigned s = 0, e = count; + while (s != e) { + unsigned mid = (s + e) / 2; + switch (compare_with_slot(mid, oid)) { + case ordering_t::less: + s = ++mid; + break; + case ordering_t::greater: + e = mid; + break; + case ordering_t::equivalent: + assert(mid == 0 || mid < count); + return {mid, true}; + } + } + return {s, false}; +} + +template<size_t BlockSize, int N, ntype_t NodeType> +uint16_t node_t<BlockSize, N, NodeType>::size_of_item(const ghobject_t& oid, + const item_t& item) +{ + if constexpr (item_in_key) { + return sizeof(key_prefix_t); + } else { + return (sizeof(key_prefix_t) + + key_suffix_t::size_from(oid) + size_of(item)); + } +} + +template<size_t BlockSize, int N, ntype_t NodeType> +bool node_t<BlockSize, N, NodeType>::is_overflow(const ghobject_t& oid, + const item_t& item) const +{ + return free_space() < size_of_item(oid, item); +} + +template<size_t BlockSize, int N, ntype_t NodeType> +bool node_t<BlockSize, N, NodeType>::is_overflow(const ghobject_t& oid, + const OnodeRef& item) const +{ + return free_space() < (sizeof(key_prefix_t) + key_suffix_t::size_from(oid) + item->size()); +} + +// inserts an item into the given slot, pushing all subsequent keys forward +// @note if the item is not embedded in key, shift the right half as well +template<size_t BlockSize, int N, ntype_t NodeType> +void node_t<BlockSize, N, NodeType>::insert_at(unsigned slot, + const ghobject_t& oid, + const item_t& item) +{ + assert(!is_overflow(oid, item)); + assert(slot <= count); + if constexpr (item_in_key) { + // shift the keys right + key_prefix_t* key = keys + slot; + key_prefix_t* last_key = keys + count; + std::copy_backward(key, last_key, last_key + 1); + key->set(oid, item); + } else { + const uint16_t size = key_suffix_t::size_from(oid) + size_of(item); + uint16_t offset = size; + if (slot > 0) { + offset += keys[slot - 1].offset; + } + if (slot < count) { + // V + // | |... // ...|//////|| | + // | |... 
// ...|//////| | | + // shift the partial keys and items left + auto first = keys[slot - 1].offset; + auto last = keys[count - 1].offset; + std::memmove(from_end(last + size), from_end(last), last - first); + // shift the keys right and update the pointers + for (key_prefix_t* dst = keys + count; dst > keys + slot; dst--) { + key_prefix_t* src = dst - 1; + *dst = *src; + dst->offset += size; + } + } + keys[slot].set(oid, offset); + auto p = from_end(offset); + auto partial_key = reinterpret_cast<key_suffix_t*>(p); + partial_key->set(oid); + p += size_of(*partial_key); + auto item_ptr = reinterpret_cast<item_t*>(p); + *item_ptr = item; + } + count++; + assert(used_space() <= capacity()); +} + +// used by InnerNode for updating the keys indexing its children when their lower boundaries +// is updated +template<size_t BlockSize, int N, ntype_t NodeType> +void node_t<BlockSize, N, NodeType>::update_key_at(unsigned slot, const ghobject_t& oid) +{ + if constexpr (is_leaf()) { + assert(0); + } else if constexpr (item_in_key) { + keys[slot].update(oid); + } else { + const auto& [prefix, suffix] = key_at(slot); + int16_t delta = key_suffix_t::size_from(oid) - suffix.size(); + if (delta > 0) { + // shift the cells sitting at its left side + auto first = keys[slot].offset; + auto last = keys[count - 1].offset; + std::memmove(from_end(last + delta), from_end(last), last - first); + // update the pointers + for (key_prefix_t* key = keys + slot; key < keys + count; key++) { + key->offset += delta; + } + } + keys[slot].update(oid); + auto p = from_end(keys[slot].offset); + auto partial_key = reinterpret_cast<key_suffix_t*>(p); + partial_key->set(oid); + // we don't update item here + } +} + +template<size_t BlockSize, int N, ntype_t NodeType> +std::pair<unsigned, uint16_t> +node_t<BlockSize, N, NodeType>::calc_grab_front(uint16_t min_grab, + uint16_t max_grab) const +{ + // TODO: split by likeness + uint16_t grabbed = 0; + uint16_t used = used_space(); + int n = 0; + for (; n < count; n++) { + const auto& [prefix, suffix] = key_at(n); + uint16_t to_grab = sizeof(prefix) + size_of(suffix); + if constexpr (!item_in_key) { + const auto& item = item_at(prefix); + to_grab += size_of(item); + } + if (grabbed + to_grab > max_grab) { + break; + } + grabbed += to_grab; + } + if (grabbed >= min_grab) { + if (n == count) { + return {n, grabbed}; + } else if (!is_underflow(used - grabbed)) { + return {n, grabbed}; + } + } + return {0, 0}; +} + +template<size_t BlockSize, int N, ntype_t NodeType> +std::pair<unsigned, uint16_t> +node_t<BlockSize, N, NodeType>::calc_grab_back(uint16_t min_grab, + uint16_t max_grab) const +{ + // TODO: split by likeness + uint16_t grabbed = 0; + uint16_t used = used_space(); + for (int i = count - 1; i >= 0; i--) { + const auto& [prefix, suffix] = key_at(i); + uint16_t to_grab = sizeof(prefix) + size_of(suffix); + if constexpr (!item_in_key) { + const auto& item = item_at(prefix); + to_grab += size_of(item); + } + grabbed += to_grab; + if (is_underflow(used - grabbed)) { + return {0, 0}; + } else if (grabbed > max_grab) { + return {0, 0}; + } else if (grabbed >= min_grab) { + return {i + 1, grabbed}; + } + } + return {0, 0}; +} + +template<size_t BlockSize, int N, ntype_t NodeType> +template<int LeftN, class Mover> +void node_t<BlockSize, N, NodeType>::grab_from_left(node_t<BlockSize, LeftN, NodeType>& left, + unsigned n, uint16_t bytes, + Mover& mover) +{ + // TODO: rebuild keys if moving across different layouts + // group by likeness + shift_right(n, bytes); + 
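+  // make room at the front of this node, then have the mover transfer the
+  // last n elements of the left sibling into slots [0, n); e.g. grabbing
+  // n = 2 from a left sibling |a b c d| lands |c d| in this node's first
+  // two slots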
mover.move_from(left.count - n, 0, n); +} + +template<size_t BlockSize, int N, ntype_t NodeType> +template<int RightN, class Mover> +delta_t node_t<BlockSize, N, NodeType>::acquire_right(node_t<BlockSize, RightN, NodeType>& right, + unsigned whoami, Mover& mover) +{ + mover.move_from(0, count, right.count); + return mover.to_delta(); +} + +template<size_t BlockSize, int N, ntype_t NodeType> +template<int RightN, class Mover> +void node_t<BlockSize, N, NodeType>::grab_from_right(node_t<BlockSize, RightN, NodeType>& right, + unsigned n, uint16_t bytes, + Mover& mover) +{ + mover.move_from(0, count, n); + right.shift_left(n, 0); +} + +template<size_t BlockSize, int N, ntype_t NodeType> +template<int LeftN, class Mover> +void node_t<BlockSize, N, NodeType>::push_to_left(node_t<BlockSize, LeftN, NodeType>& left, + unsigned n, uint16_t bytes, + Mover& mover) +{ + left.grab_from_right(*this, n, bytes, mover); +} + +template<size_t BlockSize, int N, ntype_t NodeType> +template<int RightN, class Mover> +void node_t<BlockSize, N, NodeType>::push_to_right(node_t<BlockSize, RightN, NodeType>& right, + unsigned n, uint16_t bytes, + Mover& mover) +{ + right.grab_from_left(*this, n, bytes, mover); +} + +// [to, from) are removed, so we need to shift left +// actually there are only two use cases: +// - to = 0: for giving elements in bulk +// - to = from - 1: for removing a single element +// old: |////|.....| |.....|/|........| +// new: |.....| |.....||........| +template<size_t BlockSize, int N, ntype_t NodeType> +void node_t<BlockSize, N, NodeType>::shift_left(unsigned from, unsigned to) +{ + assert(from < count); + assert(to < from); + if constexpr (item_in_key) { + std::copy(keys + from, keys + count, keys + to); + } else { + const uint16_t cell_hi = keys[count - 1].offset; + const uint16_t cell_lo = keys[from - 1].offset; + const uint16_t offset_delta = keys[from].offset - keys[to].offset; + for (auto src_key = keys + from, dst_key = keys + to; + src_key != keys + count; + ++src_key, ++dst_key) { + // shift the keys left + *dst_key = *src_key; + // update the pointers + dst_key->offset -= offset_delta; + } + // and cells + auto dst = from_end(cell_hi); + std::memmove(dst + offset_delta, dst, cell_hi - cell_lo); + } + count -= (from - to); +} + +template<size_t BlockSize, int N, ntype_t NodeType> +void node_t<BlockSize, N, NodeType>::insert_front(const ceph::bufferptr& keys_buf, + const ceph::bufferptr& cells_buf) +{ + unsigned n = keys_buf.length() / sizeof(key_prefix_t); + shift_right(n, cells_buf.length()); + keys_buf.copy_out(0, keys_buf.length(), reinterpret_cast<char*>(keys)); + if constexpr (item_in_key) { + assert(cells_buf.length() == 0); + } else { + cells_buf.copy_out(0, cells_buf.length(), from_end(keys[n - 1].offset)); + } +} + +template<size_t BlockSize, int N, ntype_t NodeType> +void node_t<BlockSize, N, NodeType>::insert_back(const ceph::bufferptr& keys_buf, + const ceph::bufferptr& cells_buf) +{ + keys_buf.copy_out(0, keys_buf.length(), reinterpret_cast<char*>(keys + count)); + count += keys_buf.length() / sizeof(key_prefix_t); + if constexpr (item_in_key) { + assert(cells_buf.length() == 0); + } else { + cells_buf.copy_out(0, cells_buf.length(), from_end(keys[count - 1].offset)); + } +} + +// one or more elements are inserted, so we need to shift the elements right +// actually there are only two use cases: +// - bytes != 0: for inserting bytes before from +// - bytes = 0: for inserting a single element before from +// old: ||.....| +// new: |/////|.....| +template<size_t BlockSize, 
int N, ntype_t NodeType> +void node_t<BlockSize, N, NodeType>::shift_right(unsigned n, unsigned bytes) +{ + assert(bytes + used_space() < capacity()); + // shift the keys left + std::copy_backward(keys, keys + count, keys + count + n); + count += n; + if constexpr (!item_in_key) { + uint16_t cells = keys[count - 1].offset; + // copy the partial keys and items + std::memmove(from_end(cells + bytes), from_end(cells), cells); + // update the pointers + for (auto key = keys + n; key < keys + count; ++key) { + key->offset += bytes; + } + } +} + +// shift all keys after slot is removed. +// @note if the item is not embdedded in key, all items sitting at the left +// side of it will be shifted right +template<size_t BlockSize, int N, ntype_t NodeType> +void node_t<BlockSize, N, NodeType>::remove_from(unsigned slot) +{ + assert(slot < count); + if (unsigned next = slot + 1; next < count) { + shift_left(next, slot); + } else { + // slot is the last one + count--; + } +} + +template<size_t BlockSize, int N, ntype_t NodeType> +void node_t<BlockSize, N, NodeType>::trim_right(unsigned n) +{ + count = n; +} + +template<size_t BlockSize, int N, ntype_t NodeType> +void node_t<BlockSize, N, NodeType>::play_delta(const delta_t& delta) +{ + switch (delta.op) { + case delta_t::op_t::insert_onode: + if constexpr (is_leaf()) { + auto [slot, found] = lower_bound(delta.oid); + assert(!found); + assert(delta.onode->size() <= std::numeric_limits<unsigned>::max()); + ceph::bufferptr buf{static_cast<unsigned>(delta.onode->size())}; + delta.onode->encode(buf.c_str(), buf.length()); + auto onode = reinterpret_cast<const onode_t*>(buf.c_str()); + return insert_at(slot, delta.oid, *onode); + } else { + throw std::invalid_argument("wrong node type"); + } + case delta_t::op_t::update_onode: + // TODO + assert(0 == "not implemented"); + break; + case delta_t::op_t::insert_child: + if constexpr (is_leaf()) { + throw std::invalid_argument("wrong node type"); + } else { + auto [slot, found] = lower_bound(delta.oid); + assert(!found); + insert_at(slot, delta.oid, delta.addr); + } + case delta_t::op_t::update_key: + if constexpr (is_leaf()) { + throw std::invalid_argument("wrong node type"); + } else { + return update_key_at(delta.n, delta.oid); + } + case delta_t::op_t::shift_left: + return shift_left(delta.n, 0); + case delta_t::op_t::trim_right: + return trim_right(delta.n); + case delta_t::op_t::insert_front: + return insert_front(delta.keys, delta.cells); + case delta_t::op_t::insert_back: + return insert_back(delta.keys, delta.cells); + case delta_t::op_t::remove_from: + return remove_from(delta.n); + default: + assert(0 == "unknown onode delta"); + } +} + +// explicit instantiate the node_t classes used by test_node.cc +template class node_t<512, 0, ntype_t::inner>; +template class node_t<512, 0, ntype_t::leaf>; +template class node_t<512, 1, ntype_t::inner>; +template class node_t<512, 1, ntype_t::leaf>; +template class node_t<512, 2, ntype_t::inner>; +template class node_t<512, 2, ntype_t::leaf>; +template class node_t<512, 3, ntype_t::inner>; +template class node_t<512, 3, ntype_t::leaf>; diff --git a/src/crimson/os/seastore/onode_manager/simple-fltree/onode_node.h b/src/crimson/os/seastore/onode_manager/simple-fltree/onode_node.h new file mode 100644 index 000000000..d833a6682 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/simple-fltree/onode_node.h @@ -0,0 +1,942 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#include <algorithm> +#include 
<cstdint> +#include <type_traits> +#include <variant> + +#include "common/hobject.h" +#include "crimson/common/layout.h" +#include "crimson/os/seastore/onode.h" +#include "crimson/os/seastore/seastore_types.h" +#include "onode_delta.h" + +namespace asci = absl::container_internal; + +namespace boost::beast { + template<class T> + bool operator==(const span<T>& lhs, const span<T>& rhs) { + return std::equal( + lhs.begin(), lhs.end(), + rhs.begin(), rhs.end()); + } +} + +// on-disk onode +// it only keeps the bits necessary to rebuild an in-memory onode +struct [[gnu::packed]] onode_t { + onode_t& operator=(const onode_t& onode) { + len = onode.len; + std::memcpy(data, onode.data, len); + return *this; + } + size_t size() const { + return sizeof(*this) + len; + } + OnodeRef decode() const { + return new crimson::os::seastore::Onode(std::string_view{data, len}); + } + uint8_t struct_v = 1; + uint8_t struct_compat = 1; + // TODO: + // - use uint16_t for length, as the size of an onode should be less + // than a block (16K for now) + // - drop struct_len + uint32_t struct_len = 0; + uint32_t len; + char data[]; +}; + +static inline std::ostream& operator<<(std::ostream& os, const onode_t& onode) { + return os << *onode.decode(); +} + +using crimson::os::seastore::laddr_t; + +struct [[gnu::packed]] child_addr_t { + laddr_t data; + child_addr_t(laddr_t data) + : data{data} + {} + child_addr_t& operator=(laddr_t addr) { + data = addr; + return *this; + } + laddr_t get() const { + return data; + } + operator laddr_t() const { + return data; + } + size_t size() const { + return sizeof(laddr_t); + } +}; + +// poor man's operator<=> +enum class ordering_t { + less, + equivalent, + greater, +}; + +template<class L, class R> +ordering_t compare_element(const L& x, const R& y) +{ + if constexpr (std::is_arithmetic_v<L>) { + static_assert(std::is_arithmetic_v<R>); + if (x < y) { + return ordering_t::less; + } else if (x > y) { + return ordering_t::greater; + } else { + return ordering_t::equivalent; + } + } else { + // string_view::compare(), string::compare(), ... + auto result = x.compare(y); + if (result < 0) { + return ordering_t::less; + } else if (result > 0) { + return ordering_t::greater; + } else { + return ordering_t::equivalent; + } + } +} + +template<typename L, typename R> +constexpr ordering_t tuple_cmp(const L&, const R&, std::index_sequence<>) +{ + return ordering_t::equivalent; +} + +template<typename L, typename R, + size_t Head, size_t... Tail> +constexpr ordering_t tuple_cmp(const L& x, const R& y, + std::index_sequence<Head, Tail...>) +{ + auto ordering = compare_element(std::get<Head>(x), std::get<Head>(y)); + if (ordering != ordering_t::equivalent) { + return ordering; + } else { + return tuple_cmp(x, y, std::index_sequence<Tail...>()); + } +} + +template<typename... Ls, typename... 
Rs> +constexpr ordering_t cmp(const std::tuple<Ls...>& x, + const std::tuple<Rs...>& y) +{ + static_assert(sizeof...(Ls) == sizeof...(Rs)); + return tuple_cmp(x, y, std::index_sequence_for<Ls...>()); +} + +enum class likes_t { + yes, + no, + maybe, +}; + +struct [[gnu::packed]] variable_key_suffix { + uint64_t snap; + uint64_t gen; + uint8_t nspace_len; + uint8_t name_len; + char data[]; + struct index_t { + enum { + nspace_data = 0, + name_data = 1, + }; + }; + using layout_type = asci::Layout<char, char>; + layout_type cell_layout() const { + return layout_type{nspace_len, name_len}; + } + void set(const ghobject_t& oid) { + snap = oid.hobj.snap; + gen = oid.generation; + nspace_len = oid.hobj.nspace.size(); + name_len = oid.hobj.oid.name.size(); + auto layout = cell_layout(); + std::memcpy(layout.Pointer<index_t::nspace_data>(data), + oid.hobj.nspace.data(), oid.hobj.nspace.size()); + std::memcpy(layout.Pointer<index_t::name_data>(data), + oid.hobj.oid.name.data(), oid.hobj.oid.name.size()); + } + + void update_oid(ghobject_t& oid) const { + oid.hobj.snap = snap; + oid.generation = gen; + oid.hobj.nspace = nspace(); + oid.hobj.oid.name = name(); + } + + variable_key_suffix& operator=(const variable_key_suffix& key) { + snap = key.snap; + gen = key.gen; + auto layout = cell_layout(); + auto nspace = key.nspace(); + std::copy_n(nspace.data(), nspace.size(), + layout.Pointer<index_t::nspace_data>(data)); + auto name = key.name(); + std::copy_n(name.data(), name.size(), + layout.Pointer<index_t::name_data>(data)); + return *this; + } + const std::string_view nspace() const { + auto layout = cell_layout(); + auto nspace = layout.Slice<index_t::nspace_data>(data); + return {nspace.data(), nspace.size()}; + } + const std::string_view name() const { + auto layout = cell_layout(); + auto name = layout.Slice<index_t::name_data>(data); + return {name.data(), name.size()}; + } + size_t size() const { + return sizeof(*this) + nspace_len + name_len; + } + static size_t size_from(const ghobject_t& oid) { + return (sizeof(variable_key_suffix) + + oid.hobj.nspace.size() + + oid.hobj.oid.name.size()); + } + ordering_t compare(const ghobject_t& oid) const { + return cmp(std::tie(nspace(), name(), snap, gen), + std::tie(oid.hobj.nspace, oid.hobj.oid.name, oid.hobj.snap.val, + oid.generation)); + } + bool likes(const variable_key_suffix& key) const { + return nspace() == key.nspace() && name() == key.name(); + } +}; + +static inline std::ostream& operator<<(std::ostream& os, const variable_key_suffix& k) { + if (k.snap != CEPH_NOSNAP) { + os << "s" << k.snap << ","; + } + if (k.gen != ghobject_t::NO_GEN) { + os << "g" << k.gen << ","; + } + return os << k.nspace() << "/" << k.name(); +} + +// should use [[no_unique_address]] in C++20 +struct empty_key_suffix { + static constexpr ordering_t compare(const ghobject_t&) { + return ordering_t::equivalent; + } + static void set(const ghobject_t&) {} + static constexpr size_t size() { + return 0; + } + static size_t size_from(const ghobject_t&) { + return 0; + } + static void update_oid(ghobject_t&) {} +}; + +static inline std::ostream& operator<<(std::ostream& os, const empty_key_suffix&) +{ + return os; +} + +enum class ntype_t : uint8_t { + leaf = 0u, + inner, +}; + +constexpr ntype_t flip_ntype(ntype_t ntype) noexcept +{ + if (ntype == ntype_t::leaf) { + return ntype_t::inner; + } else { + return ntype_t::leaf; + } +} + +template<int N, ntype_t NodeType> +struct FixedKeyPrefix {}; + +template<ntype_t NodeType> +struct FixedKeyPrefix<0, NodeType> +{ + static 
constexpr bool item_in_key = false; + int8_t shard = -1; + int64_t pool = -1; + uint32_t hash = 0; + uint16_t offset = 0; + + FixedKeyPrefix() = default; + FixedKeyPrefix(const ghobject_t& oid, uint16_t offset) + : shard{oid.shard_id}, + pool{oid.hobj.pool}, + hash{oid.hobj.get_hash()}, + offset{offset} + {} + + void set(const ghobject_t& oid, uint16_t new_offset) { + shard = oid.shard_id; + pool = oid.hobj.pool; + hash = oid.hobj.get_hash(); + offset = new_offset; + } + + void set(const FixedKeyPrefix& k, uint16_t new_offset) { + shard = k.shard; + pool = k.pool; + hash = k.hash; + offset = new_offset; + } + + void update(const ghobject_t& oid) { + shard = oid.shard_id; + pool = oid.hobj.pool; + hash = oid.hobj.get_hash(); + } + + void update_oid(ghobject_t& oid) const { + oid.set_shard(shard_id_t{shard}); + oid.hobj.pool = pool; + oid.hobj.set_hash(hash); + } + + ordering_t compare(const ghobject_t& oid) const { + // so std::tie() can bind them by reference + int8_t rhs_shard = oid.shard_id; + uint32_t rhs_hash = oid.hobj.get_hash(); + return cmp(std::tie(shard, pool, hash), + std::tie(rhs_shard, oid.hobj.pool, rhs_hash)); + } + // @return true if i likes @c k, we will can be pushed down to next level + // in the same node + likes_t likes(const FixedKeyPrefix& k) const { + if (shard == k.shard && pool == k.pool) { + return likes_t::yes; + } else { + return likes_t::no; + } + } +}; + +template<ntype_t NodeType> +std::ostream& operator<<(std::ostream& os, const FixedKeyPrefix<0, NodeType>& k) { + if (k.shard != shard_id_t::NO_SHARD) { + os << "s" << k.shard; + } + return os << "p=" << k.pool << "," + << "h=" << std::hex << k.hash << std::dec << "," + << ">" << k.offset; +} + +// all elements in this node share the same <shard, pool> +template<ntype_t NodeType> +struct FixedKeyPrefix<1, NodeType> { + static constexpr bool item_in_key = false; + uint32_t hash = 0; + uint16_t offset = 0; + + FixedKeyPrefix() = default; + FixedKeyPrefix(uint32_t hash, uint16_t offset) + : hash{hash}, + offset{offset} + {} + FixedKeyPrefix(const ghobject_t& oid, uint16_t offset) + : FixedKeyPrefix(oid.hobj.get_hash(), offset) + {} + void set(const ghobject_t& oid, uint16_t new_offset) { + hash = oid.hobj.get_hash(); + offset = new_offset; + } + template<int N> + void set(const FixedKeyPrefix<N, NodeType>& k, uint16_t new_offset) { + static_assert(N < 2, "only N0, N1 have hash"); + hash = k.hash; + offset = new_offset; + } + void update_oid(ghobject_t& oid) const { + oid.hobj.set_hash(hash); + } + void update(const ghobject_t& oid) { + hash = oid.hobj.get_hash(); + } + ordering_t compare(const ghobject_t& oid) const { + return compare_element(hash, oid.hobj.get_hash()); + } + likes_t likes(const FixedKeyPrefix& k) const { + return hash == k.hash ? 
likes_t::yes : likes_t::no; + } +}; + +template<ntype_t NodeType> +std::ostream& operator<<(std::ostream& os, const FixedKeyPrefix<1, NodeType>& k) { + return os << "0x" << std::hex << k.hash << std::dec << "," + << ">" << k.offset; +} + +// all elements in this node must share the same <shard, pool, hash> +template<ntype_t NodeType> +struct FixedKeyPrefix<2, NodeType> { + static constexpr bool item_in_key = false; + uint16_t offset = 0; + + FixedKeyPrefix() = default; + + static constexpr ordering_t compare(const ghobject_t& oid) { + // need to compare the cell + return ordering_t::equivalent; + } + // always defer to my cell for likeness + constexpr likes_t likes(const FixedKeyPrefix&) const { + return likes_t::maybe; + } + void set(const ghobject_t&, uint16_t new_offset) { + offset = new_offset; + } + template<int N> + void set(const FixedKeyPrefix<N, NodeType>&, uint16_t new_offset) { + offset = new_offset; + } + void update(const ghobject_t&) {} + void update_oid(ghobject_t&) const {} +}; + +template<ntype_t NodeType> +std::ostream& operator<<(std::ostream& os, const FixedKeyPrefix<2, NodeType>& k) { + return os << ">" << k.offset; +} + +struct fixed_key_3 { + uint64_t snap = 0; + uint64_t gen = 0; + + fixed_key_3() = default; + fixed_key_3(const ghobject_t& oid) + : snap{oid.hobj.snap}, gen{oid.generation} + {} + ordering_t compare(const ghobject_t& oid) const { + return cmp(std::tie(snap, gen), + std::tie(oid.hobj.snap.val, oid.generation)); + } + // no object likes each other at this level + constexpr likes_t likes(const fixed_key_3&) const { + return likes_t::no; + } + void update_with_oid(const ghobject_t& oid) { + snap = oid.hobj.snap; + gen = oid.generation; + } + void update_oid(ghobject_t& oid) const { + oid.hobj.snap = snap; + oid.generation = gen; + } +}; + +static inline std::ostream& operator<<(std::ostream& os, const fixed_key_3& k) { + if (k.snap != CEPH_NOSNAP) { + os << "s" << k.snap << ","; + } + if (k.gen != ghobject_t::NO_GEN) { + os << "g" << k.gen << ","; + } + return os; +} + +// all elements in this node must share the same <shard, pool, hash, namespace, oid> +// but the unlike other FixedKeyPrefix<>, a node with FixedKeyPrefix<3> does not have +// variable_sized_key, so if it is an inner node, we can just embed the child +// addr right in the key. +template<> +struct FixedKeyPrefix<3, ntype_t::inner> : public fixed_key_3 { + // the item is embedded in the key + static constexpr bool item_in_key = true; + laddr_t child_addr = 0; + + FixedKeyPrefix() = default; + void set(const ghobject_t& oid, laddr_t new_child_addr) { + update_with_oid(oid); + child_addr = new_child_addr; + } + // unlikely get called, though.. 
+ void update(const ghobject_t& oid) {} + template<int N> + std::enable_if_t<N < 3> set(const FixedKeyPrefix<N, ntype_t::inner>&, + laddr_t new_child_addr) { + child_addr = new_child_addr; + } + void set(const FixedKeyPrefix& k, laddr_t new_child_addr) { + snap = k.snap; + gen = k.gen; + child_addr = new_child_addr; + } + void set(const variable_key_suffix& k, laddr_t new_child_addr) { + snap = k.snap; + gen = k.gen; + child_addr = new_child_addr; + } +}; + +template<> +struct FixedKeyPrefix<3, ntype_t::leaf> : public fixed_key_3 { + static constexpr bool item_in_key = false; + uint16_t offset = 0; + + FixedKeyPrefix() = default; + void set(const ghobject_t& oid, uint16_t new_offset) { + update_with_oid(oid); + offset = new_offset; + } + void set(const FixedKeyPrefix& k, uint16_t new_offset) { + snap = k.snap; + gen = k.gen; + offset = new_offset; + } + template<int N> + std::enable_if_t<N < 3> set(const FixedKeyPrefix<N, ntype_t::leaf>&, + uint16_t new_offset) { + offset = new_offset; + } +}; + +struct tag_t { + template<int N, ntype_t node_type> + static constexpr tag_t create() { + static_assert(std::clamp(N, 0, 3) == N); + return tag_t{N, static_cast<uint8_t>(node_type)}; + } + bool is_leaf() const { + return type() == ntype_t::leaf; + } + int layout() const { + return layout_type; + } + ntype_t type() const { + return ntype_t{node_type}; + } + int layout_type : 4; + uint8_t node_type : 4; +}; + +static inline std::ostream& operator<<(std::ostream& os, const tag_t& tag) { + return os << "n=" << tag.layout() << ", leaf=" << tag.is_leaf(); +} + +// for calculating size of variable-sized item/key +template<class T> +size_t size_of(const T& t) { + using decayed_t = std::decay_t<T>; + if constexpr (std::is_scalar_v<decayed_t>) { + return sizeof(decayed_t); + } else { + return t.size(); + } +} + +enum class size_state_t { + okay, + underflow, + overflow, +}; + +// layout of a node of B+ tree +// +// it is different from a typical B+ tree in following ways +// - the size of keys is not necessarily fixed, neither is the size of value. +// - the max number of elements in a node is determined by the total size of +// the keys and values in the node +// - in internal nodes, each key maps to the logical address of the child +// node whose minimum key is greater or equal to that key. +template<size_t BlockSize, + int N, + ntype_t NodeType> +struct node_t { + static_assert(std::clamp(N, 0, 3) == N); + constexpr static ntype_t node_type = NodeType; + constexpr static int node_n = N; + + using key_prefix_t = FixedKeyPrefix<N, NodeType>; + using item_t = std::conditional_t<NodeType == ntype_t::leaf, + onode_t, + child_addr_t>; + using const_item_t = std::conditional_t<NodeType == ntype_t::leaf, + const onode_t&, + child_addr_t>; + static constexpr bool item_in_key = key_prefix_t::item_in_key; + using key_suffix_t = std::conditional_t<N < 3, + variable_key_suffix, + empty_key_suffix>; + + std::pair<const key_prefix_t&, const key_suffix_t&> + key_at(unsigned slot) const; + + // update an existing oid with the specified item + ghobject_t get_oid_at(unsigned slot, const ghobject_t& oid) const; + const_item_t item_at(const key_prefix_t& key) const; + void dump(std::ostream& os) const; + + // for debugging only. 
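+  // is_leaf() is decided at compile time from the NodeType template
+  // parameter, while _is_leaf() below consults the node's on-disk tag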
+ static constexpr bool is_leaf() { + return node_type == ntype_t::leaf; + } + + bool _is_leaf() const { + return tag.is_leaf(); + } + + char* from_end(uint16_t offset); + const char* from_end(uint16_t offset) const; + uint16_t used_space() const; + uint16_t free_space() const { + return capacity() - used_space(); + } + static uint16_t capacity(); + // TODO: if it's allowed to update 2 siblings at the same time, we can have + // B* tree + static constexpr uint16_t min_size(); + + + // calculate the allowable bounds on bytes to remove from an overflow node + // with specified size + // @param size the overflowed size + // @return <minimum bytes to grab, maximum bytes to grab> + static constexpr std::pair<int16_t, int16_t> bytes_to_remove(uint16_t size); + + // calculate the allowable bounds on bytes to add to an underflow node + // with specified size + // @param size the underflowed size + // @return <minimum bytes to push, maximum bytes to push> + static constexpr std::pair<int16_t, int16_t> bytes_to_add(uint16_t size); + + size_state_t size_state(uint16_t size) const; + bool is_underflow(uint16_t size) const; + int16_t size_with_key(unsigned slot, const ghobject_t& oid) const; + ordering_t compare_with_slot(unsigned slot, const ghobject_t& oid) const; + /// return the slot number of the first slot that is greater or equal to + /// key + std::pair<unsigned, bool> lower_bound(const ghobject_t& oid) const; + static uint16_t size_of_item(const ghobject_t& oid, const item_t& item); + bool is_overflow(const ghobject_t& oid, const item_t& item) const; + bool is_overflow(const ghobject_t& oid, const OnodeRef& item) const; + + // inserts an item into the given slot, pushing all subsequent keys forward + // @note if the item is not embedded in key, shift the right half as well + void insert_at(unsigned slot, const ghobject_t& oid, const item_t& item); + // used by InnerNode for updating the keys indexing its children when their lower boundaries + // is updated + void update_key_at(unsigned slot, const ghobject_t& oid); + // try to figure out the number of elements and total size when trying to + // rebalance by moving the elements from the front of this node when its + // left sibling node is underflow + // + // @param min_grab lower bound of the number of bytes to move + // @param max_grab upper bound of the number of bytes to move + // @return the number of element to grab + // @note return {0, 0} if current node would be underflow if + // @c min_grab bytes of elements are taken from it + std::pair<unsigned, uint16_t> calc_grab_front(uint16_t min_grab, uint16_t max_grab) const; + // try to figure out the number of elements and their total size when trying to + // rebalance by moving the elements from the end of this node when its right + // sibling node is underflow + // + // @param min_grab lower bound of the number of bytes to move + // @param max_grab upper bound of the number of bytes to move + // @return the number of element to grab + // @note return {0, 0} if current node would be underflow if + // @c min_grab bytes of elements are taken from it + std::pair<unsigned, uint16_t> calc_grab_back(uint16_t min_grab, uint16_t max_grab) const; + template<int LeftN, class Mover> void grab_from_left( + node_t<BlockSize, LeftN, NodeType>& left, + unsigned n, uint16_t bytes, + Mover& mover); + template<int RightN, class Mover> + delta_t acquire_right(node_t<BlockSize, RightN, NodeType>& right, + unsigned whoami, Mover& mover); + // transfer n elements at the front of given node to me + template<int 
RightN, class Mover> + void grab_from_right(node_t<BlockSize, RightN, NodeType>& right, + unsigned n, uint16_t bytes, + Mover& mover); + template<int LeftN, class Mover> + void push_to_left(node_t<BlockSize, LeftN, NodeType>& left, + unsigned n, uint16_t bytes, + Mover& mover); + template<int RightN, class Mover> + void push_to_right(node_t<BlockSize, RightN, NodeType>& right, + unsigned n, uint16_t bytes, + Mover& mover); + // [to, from) are removed, so we need to shift left + // actually there are only two use cases: + // - to = 0: for giving elements in bulk + // - to = from - 1: for removing a single element + // old: |////|.....| |.....|/|........| + // new: |.....| |.....||........| + void shift_left(unsigned from, unsigned to); + void insert_front(const ceph::bufferptr& keys_buf, const ceph::bufferptr& cells_buf); + void insert_back(const ceph::bufferptr& keys_buf, const ceph::bufferptr& cells_buf); + // one or more elements are inserted, so we need to shift the elements right + // actually there are only two use cases: + // - bytes != 0: for inserting bytes before from + // - bytes = 0: for inserting a single element before from + // old: ||.....| + // new: |/////|.....| + void shift_right(unsigned n, unsigned bytes); + // shift all keys after slot is removed. + // @note if the item is not embdedded in key, all items sitting at the left + // side of it will be shifted right + void remove_from(unsigned slot); + void trim_right(unsigned n); + void play_delta(const delta_t& delta); + // /-------------------------------| + // | V + // |header|k0|k1|k2|... | / / |k2'v2|k1'v1|k0'.v0| v_m | + // |<-- count -->| + tag_t tag = tag_t::create<N, NodeType>(); + // the count of values in the node + uint16_t count = 0; + key_prefix_t keys[]; +}; + +template<class parent_t, + class from_t, + class to_t, + typename=void> +class EntryMover { +public: + // a "trap" mover + EntryMover(const parent_t&, from_t&, to_t& dst, unsigned) { + assert(0); + } + void move_from(unsigned, unsigned, unsigned) { + assert(0); + } + delta_t get_delta() { + return delta_t::nop(); + } +}; + +// lower the layout, for instance, from L0 to L1, no reference oid is used +template<class parent_t, + class from_t, + class to_t> +class EntryMover<parent_t, + from_t, + to_t, + std::enable_if_t<from_t::node_n < to_t::node_n>> +{ +public: + EntryMover(const parent_t&, from_t& src, to_t& dst, unsigned) + : src{src}, dst{dst} + {} + void move_from(unsigned src_first, unsigned dst_first, unsigned n) + { + ceph::bufferptr keys_buf{n * sizeof(to_t::key_prefix_t)}; + ceph::bufferptr cells_buf; + auto dst_keys = reinterpret_cast<typename to_t::key_prefix_t*>(keys_buf.c_str()); + if constexpr (to_t::item_in_key) { + for (unsigned i = 0; i < n; i++) { + const auto& [prefix, suffix] = src.key_at(src_first + i); + dst_keys[i].set(suffix, src.item_at(prefix)); + } + } else { + // copy keys + uint16_t src_offset = src_first > 0 ? src.keys[src_first - 1].offset : 0; + uint16_t dst_offset = dst_first > 0 ? dst.keys[dst_first - 1].offset : 0; + for (unsigned i = 0; i < n; i++) { + auto& src_key = src.keys[src_first + i]; + uint16_t offset = src_key.offset - src_offset + dst_offset; + dst_keys[i].set(src_key, offset); + } + // copy cells in bulk, yay! 
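+      // cells grow backward from the end of the block, so the cells of
+      // slots [src_first, src_first + n) form a single contiguous range of
+      // total_cell_size bytes starting at src.from_end(src_end)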
+ auto src_end = src.keys[src_first + n - 1].offset; + uint16_t total_cell_size = src_end - src_offset; + cells_buf = ceph::bufferptr{total_cell_size}; + cells_buf.copy_in(0, total_cell_size, src.from_end(src_end)); + } + if (dst_first == dst.count) { + dst_delta = delta_t::insert_back(keys_buf, cells_buf); + } else { + dst_delta = delta_t::insert_front(keys_buf, cells_buf); + } + if (src_first > 0 && src_first + n == src.count) { + src_delta = delta_t::trim_right(src_first); + } else if (src_first == 0 && n < src.count) { + src_delta = delta_t::shift_left(n); + } else if (src_first == 0 && n == src.count) { + // the caller will retire the src extent + } else { + // grab in the middle? + assert(0); + } + } + + delta_t from_delta() { + return std::move(src_delta); + } + delta_t to_delta() { + return std::move(dst_delta); + } +private: + const from_t& src; + const to_t& dst; + delta_t dst_delta; + delta_t src_delta; +}; + +// lift the layout, for instance, from L2 to L0, need a reference oid +template<class parent_t, + class from_t, + class to_t> +class EntryMover<parent_t, from_t, to_t, + std::enable_if_t<(from_t::node_n > to_t::node_n)>> +{ +public: + EntryMover(const parent_t& parent, from_t& src, to_t& dst, unsigned from_slot) + : src{src}, dst{dst}, ref_oid{parent->get_oid_at(from_slot, {})} + {} + void move_from(unsigned src_first, unsigned dst_first, unsigned n) + { + ceph::bufferptr keys_buf{n * sizeof(to_t::key_prefix_t)}; + ceph::bufferptr cells_buf; + auto dst_keys = reinterpret_cast<typename to_t::key_prefix_t*>(keys_buf.c_str()); + uint16_t in_node_offset = dst_first > 0 ? dst.keys[dst_first - 1].offset : 0; + static_assert(!std::is_same_v<typename to_t::key_suffix_t, empty_key_suffix>); + // copy keys + uint16_t buf_offset = 0; + for (unsigned i = 0; i < n; i++) { + auto& src_key = src.keys[src_first + i]; + if constexpr (std::is_same_v<typename from_t::key_suffix_t, empty_key_suffix>) { + // heterogeneous partial key, have to rebuild dst partial key from oid + src_key.update_oid(ref_oid); + const auto& src_item = src.item_at(src_key); + size_t key2_size = to_t::key_suffix_t::size_from(ref_oid); + buf_offset += key2_size + size_of(src_item); + dst_keys[i].set(ref_oid, in_node_offset + buf_offset); + auto p = from_end(cells_buf, buf_offset); + auto partial_key = reinterpret_cast<typename to_t::key_suffix_t*>(p); + partial_key->set(ref_oid); + p += key2_size; + auto dst_item = reinterpret_cast<typename to_t::item_t*>(p); + *dst_item = src_item; + } else { + // homogeneous partial key, just update the pointers + uint16_t src_offset = src_first > 0 ? src.keys[src_first - 1].offset : 0; + uint16_t dst_offset = dst_first > 0 ? dst.keys[dst_first - 1].offset : 0; + uint16_t offset = src_key.offset - src_offset + dst_offset; + dst_keys[i].set(ref_oid, in_node_offset + offset); + } + } + if constexpr (std::is_same_v<typename to_t::key_suffix_t, + typename from_t::key_suffix_t>) { + // copy cells in bulk, yay! + uint16_t src_offset = src_first > 0 ? 
src.keys[src_first - 1].offset : 0; + uint16_t src_end = src.keys[src_first + n - 1].offset; + uint16_t total_cell_size = src_end - src_offset; + cells_buf.copy_in(0, total_cell_size, src.from_end(src_end)); + } + if (dst_first == dst.count) { + dst_delta = delta_t::insert_back(keys_buf, cells_buf); + } else { + dst_delta = delta_t::insert_front(keys_buf, cells_buf); + } + if (src_first + n == src.count && src_first > 0) { + src_delta = delta_t::trim_right(src_first); + } else { + // the caller will retire the src extent + assert(src_first == 0); + } + } + + delta_t from_delta() { + return std::move(src_delta); + } + delta_t to_delta() { + return std::move(dst_delta); + } +private: + char* from_end(ceph::bufferptr& ptr, uint16_t offset) { + return ptr.end_c_str() - static_cast<int>(offset); + } +private: + const from_t& src; + const to_t& dst; + delta_t dst_delta; + delta_t src_delta; + ghobject_t ref_oid; +}; + +// identical layout, yay! +template<class parent_t, + class child_t> +class EntryMover<parent_t, child_t, child_t> +{ +public: + EntryMover(const parent_t&, child_t& src, child_t& dst, unsigned) + : src{src}, dst{dst} + {} + + void move_from(unsigned src_first, unsigned dst_first, unsigned n) + { + ceph::bufferptr keys_buf{static_cast<unsigned>(n * sizeof(typename child_t::key_prefix_t))}; + ceph::bufferptr cells_buf; + auto dst_keys = reinterpret_cast<typename child_t::key_prefix_t*>(keys_buf.c_str()); + + // copy keys + std::copy(src.keys + src_first, src.keys + src_first + n, + dst_keys); + if constexpr (!child_t::item_in_key) { + uint16_t src_offset = src_first > 0 ? src.keys[src_first - 1].offset : 0; + uint16_t dst_offset = dst_first > 0 ? dst.keys[dst_first - 1].offset : 0; + const int offset_delta = dst_offset - src_offset; + // update the pointers + for (unsigned i = 0; i < n; i++) { + dst_keys[i].offset += offset_delta; + } + // copy cells in bulk, yay! + auto src_end = src.keys[src_first + n - 1].offset; + uint16_t total_cell_size = src_end - src_offset; + cells_buf = ceph::bufferptr{total_cell_size}; + cells_buf.copy_in(0, total_cell_size, src.from_end(src_end)); + } + if (dst_first == dst.count) { + dst_delta = delta_t::insert_back(std::move(keys_buf), std::move(cells_buf)); + } else { + dst_delta = delta_t::insert_front(std::move(keys_buf), std::move(cells_buf)); + } + if (src_first + n == src.count && src_first > 0) { + src_delta = delta_t::trim_right(n); + } else if (src_first == 0 && n < src.count) { + src_delta = delta_t::shift_left(n); + } else if (src_first == 0 && n == src.count) { + // the caller will retire the src extent + } else { + // grab in the middle? 
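+      // not supported: the grabbed run must be a prefix or a suffix (or the
+      // whole) of the source node, as handled by the branches above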
+ assert(0); + } + } + + delta_t from_delta() { + return std::move(src_delta); + } + + delta_t to_delta() { + return std::move(dst_delta); + } +private: + char* from_end(ceph::bufferptr& ptr, uint16_t offset) { + return ptr.end_c_str() - static_cast<int>(offset); + } +private: + const child_t& src; + const child_t& dst; + delta_t src_delta; + delta_t dst_delta; +}; + +template<class parent_t, class from_t, class to_t> +EntryMover<parent_t, from_t, to_t> +make_mover(const parent_t& parent, from_t& src, to_t& dst, unsigned from_slot) { + return EntryMover<parent_t, from_t, to_t>(parent, src, dst, from_slot); +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/fwd.h b/src/crimson/os/seastore/onode_manager/staged-fltree/fwd.h new file mode 100644 index 000000000..4908c691f --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/fwd.h @@ -0,0 +1,93 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <algorithm> +#include <cstring> +#include <limits> +#include <memory> +#include <string> + +#include "crimson/common/errorator.h" +#include "crimson/os/seastore/cached_extent.h" +#include "crimson/os/seastore/seastore_types.h" +#include "crimson/os/seastore/transaction.h" + +namespace crimson::os::seastore::onode { + +using crimson::os::seastore::Transaction; +using crimson::os::seastore::TransactionRef; +using crimson::os::seastore::make_transaction; +using crimson::os::seastore::laddr_t; +using crimson::os::seastore::L_ADDR_MIN; +using crimson::os::seastore::L_ADDR_NULL; +using crimson::os::seastore::extent_len_t; + +class DeltaRecorder; +class NodeExtent; +class NodeExtentManager; +class RootNodeTracker; +using DeltaRecorderURef = std::unique_ptr<DeltaRecorder>; +using NodeExtentRef = crimson::os::seastore::TCachedExtentRef<NodeExtent>; +using NodeExtentManagerURef = std::unique_ptr<NodeExtentManager>; +using RootNodeTrackerURef = std::unique_ptr<RootNodeTracker>; +struct context_t { + NodeExtentManager& nm; + Transaction& t; +}; + +class LeafNodeImpl; +class InternalNodeImpl; +class NodeImpl; +using LeafNodeImplURef = std::unique_ptr<LeafNodeImpl>; +using InternalNodeImplURef = std::unique_ptr<InternalNodeImpl>; +using NodeImplURef = std::unique_ptr<NodeImpl>; + +using level_t = uint8_t; +// a type only to index within a node, 32 bits should be enough +using index_t = uint32_t; +constexpr auto INDEX_END = std::numeric_limits<index_t>::max(); +constexpr auto INDEX_LAST = INDEX_END - 0x4; +constexpr auto INDEX_UPPER_BOUND = INDEX_END - 0x8; +inline bool is_valid_index(index_t index) { return index < INDEX_UPPER_BOUND; } + +// TODO: decide by NODE_BLOCK_SIZE +using node_offset_t = uint16_t; +constexpr node_offset_t DISK_BLOCK_SIZE = 1u << 12; +constexpr node_offset_t NODE_BLOCK_SIZE = DISK_BLOCK_SIZE * 1u; + +enum class MatchKindBS : int8_t { NE = -1, EQ = 0 }; + +enum class MatchKindCMP : int8_t { LT = -1, EQ = 0, GT }; +inline MatchKindCMP toMatchKindCMP(int value) { + if (value > 0) { + return MatchKindCMP::GT; + } else if (value < 0) { + return MatchKindCMP::LT; + } else { + return MatchKindCMP::EQ; + } +} +template <typename Type> +MatchKindCMP toMatchKindCMP(const Type& l, const Type& r) { + int match = l - r; + return toMatchKindCMP(match); +} + +inline MatchKindCMP toMatchKindCMP( + std::string_view l, std::string_view r) { + return toMatchKindCMP(l.compare(r)); +} + +inline MatchKindCMP reverse(MatchKindCMP cmp) { + if (cmp == MatchKindCMP::LT) { + return MatchKindCMP::GT; + } 
else if (cmp == MatchKindCMP::GT) { + return MatchKindCMP::LT; + } else { + return cmp; + } +} + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/node.cc new file mode 100644 index 000000000..3df458f08 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node.cc @@ -0,0 +1,809 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#include "node.h" + +#include <cassert> +#include <exception> +#include <sstream> + +#include "common/likely.h" +#include "crimson/common/log.h" +#include "node_extent_manager.h" +#include "node_impl.h" +#include "stages/node_stage_layout.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_filestore); + } +} + +namespace crimson::os::seastore::onode { + +using node_ertr = Node::node_ertr; +template <class ValueT=void> +using node_future = Node::node_future<ValueT>; + +/* + * tree_cursor_t + */ + +tree_cursor_t::tree_cursor_t(Ref<LeafNode> node, const search_position_t& pos) + : leaf_node{node}, position{pos} { + assert(!is_end()); + leaf_node->do_track_cursor<true>(*this); +} + +tree_cursor_t::tree_cursor_t( + Ref<LeafNode> node, const search_position_t& pos, + const key_view_t& key, const onode_t* _p_value, layout_version_t v) + : leaf_node{node}, position{pos} { + assert(!is_end()); + update_kv(key, _p_value, v); + leaf_node->do_track_cursor<true>(*this); +} + +tree_cursor_t::tree_cursor_t(Ref<LeafNode> node) + : leaf_node{node}, position{search_position_t::end()} { + assert(is_end()); + assert(leaf_node->is_level_tail()); +} + +tree_cursor_t::~tree_cursor_t() { + if (!is_end()) { + leaf_node->do_untrack_cursor(*this); + } +} + +const key_view_t& tree_cursor_t::get_key_view() const { + ensure_kv(); + return *key_view; +} + +const onode_t* tree_cursor_t::get_p_value() const { + ensure_kv(); + return p_value; +} + +template <bool VALIDATE> +void tree_cursor_t::update_track( + Ref<LeafNode> node, const search_position_t& pos) { + // the cursor must be already untracked + // track the new node and new pos + assert(!pos.is_end()); + assert(!is_end()); + leaf_node = node; + position = pos; + key_view.reset(); + p_value = nullptr; + leaf_node->do_track_cursor<VALIDATE>(*this); +} +template void tree_cursor_t::update_track<true>(Ref<LeafNode>, const search_position_t&); +template void tree_cursor_t::update_track<false>(Ref<LeafNode>, const search_position_t&); + +void tree_cursor_t::update_kv( + const key_view_t& key, const onode_t* _p_value, layout_version_t v) const { + assert(!is_end()); + assert(_p_value); + assert(std::make_tuple(key, _p_value, v) == leaf_node->get_kv(position)); + key_view = key; + p_value = _p_value; + node_version = v; +} + +void tree_cursor_t::ensure_kv() const { + assert(!is_end()); + if (!p_value || node_version != leaf_node->get_layout_version()) { + // NOTE: the leaf node is always present when we hold its reference. 
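+    // re-read the cached key/value from the leaf and remember its layout
+    // version, so a later change of the leaf's layout invalidates this
+    // cache again (see the check above)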
+ std::tie(key_view, p_value, node_version) = leaf_node->get_kv(position); + } + assert(p_value); +} + +/* + * Node + */ + +Node::Node(NodeImplURef&& impl) : impl{std::move(impl)} {} + +Node::~Node() { + // XXX: tolerate failure between allocate() and as_child() + if (is_root()) { + super->do_untrack_root(*this); + } else { + _parent_info->ptr->do_untrack_child(*this); + } +} + +level_t Node::level() const { + return impl->level(); +} + +node_future<Node::search_result_t> Node::lower_bound( + context_t c, const key_hobj_t& key) { + return seastar::do_with( + MatchHistory(), [this, c, &key](auto& history) { + return lower_bound_tracked(c, key, history); + } + ); +} + +node_future<std::pair<Ref<tree_cursor_t>, bool>> Node::insert( + context_t c, const key_hobj_t& key, const onode_t& value) { + return seastar::do_with( + MatchHistory(), [this, c, &key, &value](auto& history) { + return lower_bound_tracked(c, key, history + ).safe_then([c, &key, &value, &history](auto result) { + if (result.match() == MatchKindBS::EQ) { + return node_ertr::make_ready_future<std::pair<Ref<tree_cursor_t>, bool>>( + std::make_pair(result.p_cursor, false)); + } else { + auto leaf_node = result.p_cursor->get_leaf_node(); + return leaf_node->insert_value( + c, key, value, result.p_cursor->get_position(), history, result.mstat + ).safe_then([](auto p_cursor) { + return node_ertr::make_ready_future<std::pair<Ref<tree_cursor_t>, bool>>( + std::make_pair(p_cursor, true)); + }); + } + }); + } + ); +} + +node_future<tree_stats_t> Node::get_tree_stats(context_t c) { + return seastar::do_with( + tree_stats_t(), [this, c](auto& stats) { + return do_get_tree_stats(c, stats).safe_then([&stats] { + return stats; + }); + } + ); +} + +std::ostream& Node::dump(std::ostream& os) const { + return impl->dump(os); +} + +std::ostream& Node::dump_brief(std::ostream& os) const { + return impl->dump_brief(os); +} + +void Node::test_make_destructable( + context_t c, NodeExtentMutable& mut, Super::URef&& _super) { + impl->test_set_tail(mut); + make_root(c, std::move(_super)); +} + +node_future<> Node::mkfs(context_t c, RootNodeTracker& root_tracker) { + return LeafNode::allocate_root(c, root_tracker + ).safe_then([](auto ret) { /* FIXME: discard_result(); */ }); +} + +node_future<Ref<Node>> Node::load_root(context_t c, RootNodeTracker& root_tracker) { + return c.nm.get_super(c.t, root_tracker + ).safe_then([c, &root_tracker](auto&& _super) { + auto root_addr = _super->get_root_laddr(); + assert(root_addr != L_ADDR_NULL); + return Node::load(c, root_addr, true + ).safe_then([c, _super = std::move(_super), + &root_tracker](auto root) mutable { + assert(root->impl->field_type() == field_type_t::N0); + root->as_root(std::move(_super)); + std::ignore = c; // as only used in an assert + std::ignore = root_tracker; + assert(root == root_tracker.get_root(c.t)); + return node_ertr::make_ready_future<Ref<Node>>(root); + }); + }); +} + +void Node::make_root(context_t c, Super::URef&& _super) { + _super->write_root_laddr(c, impl->laddr()); + as_root(std::move(_super)); +} + +void Node::as_root(Super::URef&& _super) { + assert(!super && !_parent_info); + assert(_super->get_root_laddr() == impl->laddr()); + assert(impl->is_level_tail()); + super = std::move(_super); + super->do_track_root(*this); +} + +node_future<> Node::upgrade_root(context_t c) { + assert(is_root()); + assert(impl->is_level_tail()); + assert(impl->field_type() == field_type_t::N0); + super->do_untrack_root(*this); + return InternalNode::allocate_root(c, impl->level(), impl->laddr(), 
std::move(super) + ).safe_then([this](auto new_root) { + as_child(search_position_t::end(), new_root); + }); +} + +template <bool VALIDATE> +void Node::as_child(const search_position_t& pos, Ref<InternalNode> parent_node) { + assert(!super); + _parent_info = parent_info_t{pos, parent_node}; + parent_info().ptr->do_track_child<VALIDATE>(*this); +} +template void Node::as_child<true>(const search_position_t&, Ref<InternalNode>); +template void Node::as_child<false>(const search_position_t&, Ref<InternalNode>); + +node_future<> Node::insert_parent(context_t c, Ref<Node> right_node) { + assert(!is_root()); + // TODO(cross-node string dedup) + return parent_info().ptr->apply_child_split( + c, parent_info().position, this, right_node); +} + +node_future<Ref<Node>> Node::load( + context_t c, laddr_t addr, bool expect_is_level_tail) { + // NOTE: + // *option1: all types of node have the same length; + // option2: length is defined by node/field types; + // option3: length is totally flexible; + return c.nm.read_extent(c.t, addr, NODE_BLOCK_SIZE + ).safe_then([expect_is_level_tail](auto extent) { + auto [node_type, field_type] = extent->get_types(); + if (node_type == node_type_t::LEAF) { + auto impl = LeafNodeImpl::load(extent, field_type, expect_is_level_tail); + return Ref<Node>(new LeafNode(impl.get(), std::move(impl))); + } else if (node_type == node_type_t::INTERNAL) { + auto impl = InternalNodeImpl::load(extent, field_type, expect_is_level_tail); + return Ref<Node>(new InternalNode(impl.get(), std::move(impl))); + } else { + ceph_abort("impossible path"); + } + }); +} + +/* + * InternalNode + */ + +InternalNode::InternalNode(InternalNodeImpl* impl, NodeImplURef&& impl_ref) + : Node(std::move(impl_ref)), impl{impl} {} + +node_future<> InternalNode::apply_child_split( + context_t c, const search_position_t& pos, + Ref<Node> left_child, Ref<Node> right_child) { +#ifndef NDEBUG + if (pos.is_end()) { + assert(impl->is_level_tail()); + } +#endif + impl->prepare_mutate(c); + + auto left_key = left_child->impl->get_largest_key_view(); + auto left_child_addr = left_child->impl->laddr(); + auto left_child_addr_packed = laddr_packed_t{left_child_addr}; + auto right_key = right_child->impl->get_largest_key_view(); + auto right_child_addr = right_child->impl->laddr(); + logger().debug("OTree::Internal::Insert: " + "pos({}), left_child({}, {:#x}), right_child({}, {:#x}) ...", + pos, left_key, left_child_addr, right_key, right_child_addr); + // update pos => left_child to pos => right_child + impl->replace_child_addr(pos, right_child_addr, left_child_addr); + replace_track(pos, right_child, left_child); + + search_position_t insert_pos = pos; + auto [insert_stage, insert_size] = impl->evaluate_insert( + left_key, left_child_addr, insert_pos); + auto free_size = impl->free_size(); + if (free_size >= insert_size) { + // insert + [[maybe_unused]] auto p_value = impl->insert( + left_key, left_child_addr_packed, insert_pos, insert_stage, insert_size); + assert(impl->free_size() == free_size - insert_size); + assert(insert_pos <= pos); + assert(p_value->value == left_child_addr); + track_insert(insert_pos, insert_stage, left_child, right_child); + validate_tracked_children(); + return node_ertr::now(); + } + // split and insert + Ref<InternalNode> this_ref = this; + return (is_root() ? 
upgrade_root(c) : node_ertr::now() + ).safe_then([this, c] { + return InternalNode::allocate( + c, impl->field_type(), impl->is_level_tail(), impl->level()); + }).safe_then([this_ref, this, c, left_key, left_child, right_child, + insert_pos, insert_stage=insert_stage, insert_size=insert_size](auto fresh_right) mutable { + auto right_node = fresh_right.node; + auto left_child_addr = left_child->impl->laddr(); + auto left_child_addr_packed = laddr_packed_t{left_child_addr}; + auto [split_pos, is_insert_left, p_value] = impl->split_insert( + fresh_right.mut, *right_node->impl, left_key, left_child_addr_packed, + insert_pos, insert_stage, insert_size); + assert(p_value->value == left_child_addr); + track_split(split_pos, right_node); + if (is_insert_left) { + track_insert(insert_pos, insert_stage, left_child); + } else { + right_node->track_insert(insert_pos, insert_stage, left_child); + } + validate_tracked_children(); + right_node->validate_tracked_children(); + + // propagate index to parent + return insert_parent(c, right_node); + // TODO (optimize) + // try to acquire space from siblings before split... see btrfs + }); +} + +node_future<Ref<InternalNode>> InternalNode::allocate_root( + context_t c, level_t old_root_level, + laddr_t old_root_addr, Super::URef&& super) { + return InternalNode::allocate(c, field_type_t::N0, true, old_root_level + 1 + ).safe_then([c, old_root_addr, + super = std::move(super)](auto fresh_node) mutable { + auto root = fresh_node.node; + auto p_value = root->impl->get_p_value(search_position_t::end()); + fresh_node.mut.copy_in_absolute( + const_cast<laddr_packed_t*>(p_value), old_root_addr); + root->make_root_from(c, std::move(super), old_root_addr); + return root; + }); +} + +node_future<Ref<tree_cursor_t>> +InternalNode::lookup_smallest(context_t c) { + auto position = search_position_t::begin(); + laddr_t child_addr = impl->get_p_value(position)->value; + return get_or_track_child(c, position, child_addr + ).safe_then([c](auto child) { + return child->lookup_smallest(c); + }); +} + +node_future<Ref<tree_cursor_t>> +InternalNode::lookup_largest(context_t c) { + // NOTE: unlike LeafNode::lookup_largest(), this only works for the tail + // internal node to return the tail child address. 
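+  // search_position_t::end() addresses the tail slot, whose value is the
+  // address of the right-most child; recurse into that child to find the
+  // largest key of the sub-tree.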
+ auto position = search_position_t::end(); + laddr_t child_addr = impl->get_p_value(position)->value; + return get_or_track_child(c, position, child_addr).safe_then([c](auto child) { + return child->lookup_largest(c); + }); +} + +node_future<Node::search_result_t> +InternalNode::lower_bound_tracked( + context_t c, const key_hobj_t& key, MatchHistory& history) { + auto result = impl->lower_bound(key, history); + return get_or_track_child(c, result.position, result.p_value->value + ).safe_then([c, &key, &history](auto child) { + // XXX(multi-type): pass result.mstat to child + return child->lower_bound_tracked(c, key, history); + }); +} + +node_future<> InternalNode::do_get_tree_stats( + context_t c, tree_stats_t& stats) { + auto nstats = impl->get_stats(); + stats.size_persistent_internal += nstats.size_persistent; + stats.size_filled_internal += nstats.size_filled; + stats.size_logical_internal += nstats.size_logical; + stats.size_overhead_internal += nstats.size_overhead; + stats.size_value_internal += nstats.size_value; + stats.num_kvs_internal += nstats.num_kvs; + stats.num_nodes_internal += 1; + + Ref<const InternalNode> this_ref = this; + return seastar::do_with( + search_position_t(), [this, this_ref, c, &stats](auto& pos) { + pos = search_position_t::begin(); + return crimson::do_until( + [this, this_ref, c, &stats, &pos]() -> node_future<bool> { + auto child_addr = impl->get_p_value(pos)->value; + return get_or_track_child(c, pos, child_addr + ).safe_then([c, &stats](auto child) { + return child->do_get_tree_stats(c, stats); + }).safe_then([this, this_ref, &pos] { + if (pos.is_end()) { + return node_ertr::make_ready_future<bool>(true); + } else { + impl->next_position(pos); + if (pos.is_end()) { + if (impl->is_level_tail()) { + return node_ertr::make_ready_future<bool>(false); + } else { + return node_ertr::make_ready_future<bool>(true); + } + } else { + return node_ertr::make_ready_future<bool>(false); + } + } + }); + }); + } + ); +} + +node_future<> InternalNode::test_clone_root( + context_t c_other, RootNodeTracker& tracker_other) const { + assert(is_root()); + assert(impl->is_level_tail()); + assert(impl->field_type() == field_type_t::N0); + Ref<const InternalNode> this_ref = this; + return InternalNode::allocate(c_other, field_type_t::N0, true, impl->level() + ).safe_then([this, c_other, &tracker_other](auto fresh_other) { + impl->test_copy_to(fresh_other.mut); + auto cloned_root = fresh_other.node; + return c_other.nm.get_super(c_other.t, tracker_other + ).safe_then([c_other, cloned_root](auto&& super_other) { + cloned_root->make_root_new(c_other, std::move(super_other)); + return cloned_root; + }); + }).safe_then([this_ref, this, c_other](auto cloned_root) { + // clone tracked children + // In some unit tests, the children are stubbed out that they + // don't exist in NodeExtentManager, and are only tracked in memory. + return crimson::do_for_each( + tracked_child_nodes.begin(), + tracked_child_nodes.end(), + [this_ref, c_other, cloned_root](auto& kv) { + assert(kv.first == kv.second->parent_info().position); + return kv.second->test_clone_non_root(c_other, cloned_root); + } + ); + }); +} + +node_future<Ref<Node>> InternalNode::get_or_track_child( + context_t c, const search_position_t& position, laddr_t child_addr) { + bool level_tail = position.is_end(); + Ref<Node> child; + auto found = tracked_child_nodes.find(position); + Ref<InternalNode> this_ref = this; + return (found == tracked_child_nodes.end() + ? 
(logger().trace("OTree::Internal: load child untracked at {:#x}, pos({}), level={}", + child_addr, position, level() - 1), + Node::load(c, child_addr, level_tail + ).safe_then([this, position] (auto child) { + child->as_child(position, this); + return child; + })) + : (logger().trace("OTree::Internal: load child tracked at {:#x}, pos({}), level={}", + child_addr, position, level() - 1), + node_ertr::make_ready_future<Ref<Node>>(found->second)) + ).safe_then([this_ref, this, position, child_addr] (auto child) { + assert(child_addr == child->impl->laddr()); + assert(position == child->parent_info().position); + std::ignore = position; + std::ignore = child_addr; + validate_child(*child); + return child; + }); +} + +void InternalNode::track_insert( + const search_position_t& insert_pos, match_stage_t insert_stage, + Ref<Node> insert_child, Ref<Node> nxt_child) { + // update tracks + auto pos_upper_bound = insert_pos; + pos_upper_bound.index_by_stage(insert_stage) = INDEX_UPPER_BOUND; + auto first = tracked_child_nodes.lower_bound(insert_pos); + auto last = tracked_child_nodes.lower_bound(pos_upper_bound); + std::vector<Node*> nodes; + std::for_each(first, last, [&nodes](auto& kv) { + nodes.push_back(kv.second); + }); + tracked_child_nodes.erase(first, last); + for (auto& node : nodes) { + auto _pos = node->parent_info().position; + assert(!_pos.is_end()); + ++_pos.index_by_stage(insert_stage); + node->as_child(_pos, this); + } + // track insert + insert_child->as_child(insert_pos, this); + +#ifndef NDEBUG + // validate left_child is before right_child + if (nxt_child) { + auto iter = tracked_child_nodes.find(insert_pos); + ++iter; + assert(iter->second == nxt_child); + } +#endif +} + +void InternalNode::replace_track( + const search_position_t& position, Ref<Node> new_child, Ref<Node> old_child) { + assert(tracked_child_nodes[position] == old_child); + tracked_child_nodes.erase(position); + new_child->as_child(position, this); + assert(tracked_child_nodes[position] == new_child); +} + +void InternalNode::track_split( + const search_position_t& split_pos, Ref<InternalNode> right_node) { + auto first = tracked_child_nodes.lower_bound(split_pos); + auto iter = first; + while (iter != tracked_child_nodes.end()) { + search_position_t new_pos = iter->first; + new_pos -= split_pos; + iter->second->as_child<false>(new_pos, right_node); + ++iter; + } + tracked_child_nodes.erase(first, tracked_child_nodes.end()); +} + +void InternalNode::validate_child(const Node& child) const { +#ifndef NDEBUG + assert(impl->level() - 1 == child.impl->level()); + assert(this == child.parent_info().ptr); + auto& child_pos = child.parent_info().position; + assert(impl->get_p_value(child_pos)->value == child.impl->laddr()); + if (child_pos.is_end()) { + assert(impl->is_level_tail()); + assert(child.impl->is_level_tail()); + } else { + assert(!child.impl->is_level_tail()); + assert(impl->get_key_view(child_pos) == child.impl->get_largest_key_view()); + } + // XXX(multi-type) + assert(impl->field_type() <= child.impl->field_type()); +#endif +} + +node_future<InternalNode::fresh_node_t> InternalNode::allocate( + context_t c, field_type_t field_type, bool is_level_tail, level_t level) { + return InternalNodeImpl::allocate(c, field_type, is_level_tail, level + ).safe_then([](auto&& fresh_impl) { + auto node = Ref<InternalNode>(new InternalNode( + fresh_impl.impl.get(), std::move(fresh_impl.impl))); + return fresh_node_t{node, fresh_impl.mut}; + }); +} + +/* + * LeafNode + */ + +LeafNode::LeafNode(LeafNodeImpl* impl, 
NodeImplURef&& impl_ref) + : Node(std::move(impl_ref)), impl{impl} {} + +bool LeafNode::is_level_tail() const { + return impl->is_level_tail(); +} + +std::tuple<key_view_t, const onode_t*, layout_version_t> LeafNode::get_kv( + const search_position_t& pos) const { + key_view_t key_view; + auto p_value = impl->get_p_value(pos, &key_view); + return {key_view, p_value, layout_version}; +} + +node_future<Ref<tree_cursor_t>> +LeafNode::lookup_smallest(context_t) { + if (unlikely(impl->is_empty())) { + assert(is_root()); + return node_ertr::make_ready_future<Ref<tree_cursor_t>>( + new tree_cursor_t(this)); + } + auto pos = search_position_t::begin(); + key_view_t index_key; + auto p_value = impl->get_p_value(pos, &index_key); + return node_ertr::make_ready_future<Ref<tree_cursor_t>>( + get_or_track_cursor(pos, index_key, p_value)); +} + +node_future<Ref<tree_cursor_t>> +LeafNode::lookup_largest(context_t) { + if (unlikely(impl->is_empty())) { + assert(is_root()); + return node_ertr::make_ready_future<Ref<tree_cursor_t>>( + new tree_cursor_t(this)); + } + search_position_t pos; + const onode_t* p_value = nullptr; + key_view_t index_key; + impl->get_largest_slot(pos, index_key, &p_value); + return node_ertr::make_ready_future<Ref<tree_cursor_t>>( + get_or_track_cursor(pos, index_key, p_value)); +} + +node_future<Node::search_result_t> +LeafNode::lower_bound_tracked( + context_t c, const key_hobj_t& key, MatchHistory& history) { + key_view_t index_key; + auto result = impl->lower_bound(key, history, &index_key); + Ref<tree_cursor_t> cursor; + if (result.position.is_end()) { + assert(!result.p_value); + cursor = new tree_cursor_t(this); + } else { + cursor = get_or_track_cursor(result.position, index_key, result.p_value); + } + return node_ertr::make_ready_future<search_result_t>( + search_result_t{cursor, result.mstat}); +} + +node_future<> LeafNode::do_get_tree_stats(context_t, tree_stats_t& stats) { + auto nstats = impl->get_stats(); + stats.size_persistent_leaf += nstats.size_persistent; + stats.size_filled_leaf += nstats.size_filled; + stats.size_logical_leaf += nstats.size_logical; + stats.size_overhead_leaf += nstats.size_overhead; + stats.size_value_leaf += nstats.size_value; + stats.num_kvs_leaf += nstats.num_kvs; + stats.num_nodes_leaf += 1; + return node_ertr::now(); +} + +node_future<> LeafNode::test_clone_root( + context_t c_other, RootNodeTracker& tracker_other) const { + assert(is_root()); + assert(impl->is_level_tail()); + assert(impl->field_type() == field_type_t::N0); + Ref<const LeafNode> this_ref = this; + return LeafNode::allocate(c_other, field_type_t::N0, true + ).safe_then([this, c_other, &tracker_other](auto fresh_other) { + impl->test_copy_to(fresh_other.mut); + auto cloned_root = fresh_other.node; + return c_other.nm.get_super(c_other.t, tracker_other + ).safe_then([c_other, cloned_root](auto&& super_other) { + cloned_root->make_root_new(c_other, std::move(super_other)); + }); + }).safe_then([this_ref]{}); +} + +node_future<Ref<tree_cursor_t>> LeafNode::insert_value( + context_t c, const key_hobj_t& key, const onode_t& value, + const search_position_t& pos, const MatchHistory& history, + match_stat_t mstat) { +#ifndef NDEBUG + if (pos.is_end()) { + assert(impl->is_level_tail()); + } +#endif + logger().debug("OTree::Leaf::Insert: " + "pos({}), {}, {}, {}, mstat({}) ...", + pos, key, value, history, mstat); + search_position_t insert_pos = pos; + auto [insert_stage, insert_size] = impl->evaluate_insert( + key, value, history, mstat, insert_pos); + auto free_size = 
impl->free_size(); + if (free_size >= insert_size) { + // insert + on_layout_change(); + impl->prepare_mutate(c); + auto p_value = impl->insert(key, value, insert_pos, insert_stage, insert_size); + assert(impl->free_size() == free_size - insert_size); + assert(insert_pos <= pos); + assert(p_value->size == value.size); + auto ret = track_insert(insert_pos, insert_stage, p_value); + validate_tracked_cursors(); + return node_ertr::make_ready_future<Ref<tree_cursor_t>>(ret); + } + // split and insert + Ref<LeafNode> this_ref = this; + return (is_root() ? upgrade_root(c) : node_ertr::now() + ).safe_then([this, c] { + return LeafNode::allocate(c, impl->field_type(), impl->is_level_tail()); + }).safe_then([this_ref, this, c, &key, &value, + insert_pos, insert_stage=insert_stage, insert_size=insert_size](auto fresh_right) mutable { + auto right_node = fresh_right.node; + // no need to bump version for right node, as it is fresh + on_layout_change(); + impl->prepare_mutate(c); + auto [split_pos, is_insert_left, p_value] = impl->split_insert( + fresh_right.mut, *right_node->impl, key, value, + insert_pos, insert_stage, insert_size); + assert(p_value->size == value.size); + track_split(split_pos, right_node); + Ref<tree_cursor_t> ret; + if (is_insert_left) { + ret = track_insert(insert_pos, insert_stage, p_value); + } else { + ret = right_node->track_insert(insert_pos, insert_stage, p_value); + } + validate_tracked_cursors(); + right_node->validate_tracked_cursors(); + + // propagate insert to parent + return insert_parent(c, right_node).safe_then([ret] { + return ret; + }); + // TODO (optimize) + // try to acquire space from siblings before split... see btrfs + }); +} + +node_future<Ref<LeafNode>> LeafNode::allocate_root( + context_t c, RootNodeTracker& root_tracker) { + return LeafNode::allocate(c, field_type_t::N0, true + ).safe_then([c, &root_tracker](auto fresh_node) { + auto root = fresh_node.node; + return c.nm.get_super(c.t, root_tracker + ).safe_then([c, root](auto&& super) { + root->make_root_new(c, std::move(super)); + return root; + }); + }); +} + +Ref<tree_cursor_t> LeafNode::get_or_track_cursor( + const search_position_t& position, + const key_view_t& key, const onode_t* p_value) { + assert(!position.is_end()); + assert(p_value); + Ref<tree_cursor_t> p_cursor; + auto found = tracked_cursors.find(position); + if (found == tracked_cursors.end()) { + p_cursor = new tree_cursor_t(this, position, key, p_value, layout_version); + } else { + p_cursor = found->second; + assert(p_cursor->get_leaf_node() == this); + assert(p_cursor->get_position() == position); + p_cursor->update_kv(key, p_value, layout_version); + } + return p_cursor; +} + +void LeafNode::validate_cursor(tree_cursor_t& cursor) const { +#ifndef NDEBUG + assert(this == cursor.get_leaf_node().get()); + assert(!cursor.is_end()); + auto [key, val, ver] = get_kv(cursor.get_position()); + assert(key == cursor.get_key_view()); + assert(val == cursor.get_p_value()); +#endif +} + +Ref<tree_cursor_t> LeafNode::track_insert( + const search_position_t& insert_pos, match_stage_t insert_stage, + const onode_t* p_onode) { + // update cursor position + auto pos_upper_bound = insert_pos; + pos_upper_bound.index_by_stage(insert_stage) = INDEX_UPPER_BOUND; + auto first = tracked_cursors.lower_bound(insert_pos); + auto last = tracked_cursors.lower_bound(pos_upper_bound); + std::vector<tree_cursor_t*> p_cursors; + std::for_each(first, last, [&p_cursors](auto& kv) { + p_cursors.push_back(kv.second); + }); + tracked_cursors.erase(first, last); + for 
(auto& p_cursor : p_cursors) { + search_position_t new_pos = p_cursor->get_position(); + ++new_pos.index_by_stage(insert_stage); + p_cursor->update_track<true>(this, new_pos); + } + + // track insert + // TODO: getting key_view_t from stage::proceed_insert() and + // stage::append_insert() has not supported yet + return new tree_cursor_t(this, insert_pos); +} + +void LeafNode::track_split( + const search_position_t& split_pos, Ref<LeafNode> right_node) { + // update cursor ownership and position + auto first = tracked_cursors.lower_bound(split_pos); + auto iter = first; + while (iter != tracked_cursors.end()) { + search_position_t new_pos = iter->first; + new_pos -= split_pos; + iter->second->update_track<false>(right_node, new_pos); + ++iter; + } + tracked_cursors.erase(first, tracked_cursors.end()); +} + +node_future<LeafNode::fresh_node_t> LeafNode::allocate( + context_t c, field_type_t field_type, bool is_level_tail) { + return LeafNodeImpl::allocate(c, field_type, is_level_tail + ).safe_then([](auto&& fresh_impl) { + auto node = Ref<LeafNode>(new LeafNode( + fresh_impl.impl.get(), std::move(fresh_impl.impl))); + return fresh_node_t{node, fresh_impl.mut}; + }); +} + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node.h new file mode 100644 index 000000000..d6af489e7 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node.h @@ -0,0 +1,476 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <map> +#include <memory> +#include <ostream> +#include <boost/smart_ptr/intrusive_ref_counter.hpp> + +#include "crimson/common/type_helpers.h" + +#include "node_extent_mutable.h" +#include "stages/key_layout.h" +#include "stages/stage_types.h" +#include "super.h" +#include "tree_types.h" + +/** + * Tree example (2 levels): + * + * Root node keys: [ 3 7 ] + * values: [p1 p2 p3] + * / | \ + * ------- | ------- + * | | | + * V V V + * Leaf node keys: [ 1 2 3] [ 4 5 7] [ 9 11 12] + * values: [v1 v2 v3] [v4 v5 v6] [v7 v8 v9] + * + * Tree structure properties: + * - As illustrated above, the parent key is strictly equal to its left child's + * largest key; + * - If a tree is indexing multiple seastore transactions, each transaction + * will be mapped to a Super which points to a distinct root node. So the + * transactions are isolated at tree level. However, tree nodes from + * different transactions can reference the same seastore CachedExtent before + * modification; + * - The resources of the transactional tree are tracked by tree_cursor_ts held + * by users. As long as any cursor is alive, the according tree hierarchy is + * alive and keeps tracked. See the reversed resource management sections + * below; + */ + +namespace crimson::os::seastore::onode { + +class LeafNode; +class InternalNode; + +/** + * tree_cursor_t + * + * A cursor points to a position (LeafNode and search_position_t) of the tree + * where it can find the according key and value pair. The position is updated + * by LeafNode insert/split/delete/merge internally and is kept valid. It also + * caches the key-value information for a specific node layout version. + * + * Exposes public interfaces for Btree::Cursor. 
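+ *
+ * Illustrative usage only (not part of the interface contract), assuming
+ * a cursor obtained from a tree lookup:
+ *
+ *   Ref<tree_cursor_t> cursor = ...;  // e.g. returned by a Btree lookup
+ *   if (!cursor->is_end()) {
+ *     const key_view_t& key = cursor->get_key_view();
+ *     const onode_t* onode = cursor->get_p_value();
+ *     // both remain valid; they are refreshed internally whenever the
+ *     // leaf node's layout version changes.
+ *   }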
+ */ +using layout_version_t = uint32_t; +class tree_cursor_t final + : public boost::intrusive_ref_counter< + tree_cursor_t, boost::thread_unsafe_counter> { + public: + // public to Btree + ~tree_cursor_t(); + tree_cursor_t(const tree_cursor_t&) = delete; + tree_cursor_t(tree_cursor_t&&) = delete; + tree_cursor_t& operator=(const tree_cursor_t&) = delete; + tree_cursor_t& operator=(tree_cursor_t&&) = delete; + + /** + * is_end + * + * Represents one-past-the-last of all the sorted key-value + * pairs in the tree. An end cursor won't contain valid key-value + * information. + */ + bool is_end() const { return position.is_end(); } + + /// Returns the key view in tree if it is not an end cursor. + const key_view_t& get_key_view() const; + + /// Returns the value pointer in tree if it is not an end cursor. + const onode_t* get_p_value() const; + + private: + tree_cursor_t(Ref<LeafNode>, const search_position_t&); + tree_cursor_t(Ref<LeafNode>, const search_position_t&, + const key_view_t& key, const onode_t*, layout_version_t); + // lookup reaches the end, contain leaf node for further insert + tree_cursor_t(Ref<LeafNode>); + const search_position_t& get_position() const { return position; } + Ref<LeafNode> get_leaf_node() { return leaf_node; } + template <bool VALIDATE> + void update_track(Ref<LeafNode>, const search_position_t&); + void update_kv(const key_view_t&, const onode_t*, layout_version_t) const; + void ensure_kv() const; + + private: + /** + * Reversed resource management (tree_cursor_t) + * + * tree_cursor_t holds a reference to the LeafNode, so the LeafNode will be + * alive as long as any of it's cursors is still referenced by user. + */ + Ref<LeafNode> leaf_node; + search_position_t position; + + // cached information + mutable std::optional<key_view_t> key_view; + mutable const onode_t* p_value; + mutable layout_version_t node_version; + + friend class LeafNode; + friend class Node; // get_position(), get_leaf_node() +}; + +/** + * Node + * + * An abstracted class for both InternalNode and LeafNode. + * + * Exposes public interfaces for Btree. + */ +class Node + : public boost::intrusive_ref_counter< + Node, boost::thread_unsafe_counter> { + public: + // public to Btree + using node_ertr = crimson::errorator< + crimson::ct_error::input_output_error, + crimson::ct_error::invarg, + crimson::ct_error::enoent, + crimson::ct_error::erange>; + template <class ValueT=void> + using node_future = node_ertr::future<ValueT>; + + struct search_result_t { + bool is_end() const { return p_cursor->is_end(); } + Ref<tree_cursor_t> p_cursor; + match_stat_t mstat; + + MatchKindBS match() const { + assert(mstat >= MSTAT_MIN && mstat <= MSTAT_MAX); + return (mstat == MSTAT_EQ ? MatchKindBS::EQ : MatchKindBS::NE); + } + }; + + virtual ~Node(); + Node(const Node&) = delete; + Node(Node&&) = delete; + Node& operator=(const Node&) = delete; + Node& operator=(Node&&) = delete; + + /** + * level + * + * A positive value denotes the level (or height) of this node in tree. + * 0 means LeafNode, positive means InternalNode. + */ + level_t level() const; + + /** + * lookup_smallest + * + * Returns a cursor pointing to the smallest key in the sub-tree formed by + * this node. + * + * Returns an end cursor if it is an empty root node. + */ + virtual node_future<Ref<tree_cursor_t>> lookup_smallest(context_t) = 0; + + /** + * lookup_largest + * + * Returns a cursor pointing to the largest key in the sub-tree formed by + * this node. + * + * Returns an end cursor if it is an empty root node. 
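+   *
+   * (A leaf node returns its largest slot directly; an internal node
+   * recurses into its tail child, see InternalNode::lookup_largest().)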
+ */ + virtual node_future<Ref<tree_cursor_t>> lookup_largest(context_t) = 0; + + /** + * lower_bound + * + * Returns a cursor pointing to the first element in the range [first, last) + * of the sub-tree which does not compare less than the input key. The + * result also denotes whether the pointed key is equal to the input key. + * + * Returns an end cursor with MatchKindBS::NE if: + * - It is an empty root node; + * - Or the input key is larger than all the keys in the sub-tree; + */ + node_future<search_result_t> lower_bound(context_t c, const key_hobj_t& key); + + /** + * insert + * + * Try to insert a key-value pair into the sub-tree formed by this node. + * + * Returns a boolean denoting whether the insertion is successful: + * - If true, the returned cursor points to the inserted element in tree; + * - If false, the returned cursor points to the conflicting element in tree; + */ + node_future<std::pair<Ref<tree_cursor_t>, bool>> insert( + context_t, const key_hobj_t&, const onode_t&); + + /// Recursively collects the statistics of the sub-tree formed by this node + node_future<tree_stats_t> get_tree_stats(context_t); + + /// Returns an ostream containing a dump of all the elements in the node. + std::ostream& dump(std::ostream&) const; + + /// Returns an ostream containing an one-line summary of this node. + std::ostream& dump_brief(std::ostream&) const; + + /// Initializes the tree by allocating an empty root node. + static node_future<> mkfs(context_t, RootNodeTracker&); + + /// Loads the tree root. The tree must be initialized. + static node_future<Ref<Node>> load_root(context_t, RootNodeTracker&); + + // Only for unit test purposes. + void test_make_destructable(context_t, NodeExtentMutable&, Super::URef&&); + virtual node_future<> test_clone_root(context_t, RootNodeTracker&) const = 0; + + protected: + virtual node_future<> test_clone_non_root(context_t, Ref<InternalNode>) const { + ceph_abort("impossible path"); + } + virtual node_future<search_result_t> lower_bound_tracked( + context_t, const key_hobj_t&, MatchHistory&) = 0; + virtual node_future<> do_get_tree_stats(context_t, tree_stats_t&) = 0; + + protected: + Node(NodeImplURef&&); + bool is_root() const { + assert((super && !_parent_info.has_value()) || + (!super && _parent_info.has_value())); + return !_parent_info.has_value(); + } + + // as root + void make_root(context_t c, Super::URef&& _super); + void make_root_new(context_t c, Super::URef&& _super) { + assert(_super->get_root_laddr() == L_ADDR_NULL); + make_root(c, std::move(_super)); + } + void make_root_from(context_t c, Super::URef&& _super, laddr_t from_addr) { + assert(_super->get_root_laddr() == from_addr); + make_root(c, std::move(_super)); + } + void as_root(Super::URef&& _super); + node_future<> upgrade_root(context_t); + + // as child/non-root + template <bool VALIDATE = true> + void as_child(const search_position_t&, Ref<InternalNode>); + struct parent_info_t { + search_position_t position; + Ref<InternalNode> ptr; + }; + const parent_info_t& parent_info() const { return *_parent_info; } + node_future<> insert_parent(context_t, Ref<Node> right_node); + + private: + /** + * Reversed resource management (Node) + * + * Root Node holds a reference to its parent Super class, so its parent + * will be alive as long as this root node is alive. + * + * None-root Node holds a reference to its parent Node, so its parent will + * be alive as long as any of it's children is alive. 
+ */ + // as root + Super::URef super; + // as child/non-root + std::optional<parent_info_t> _parent_info; + + private: + static node_future<Ref<Node>> load(context_t, laddr_t, bool expect_is_level_tail); + + NodeImplURef impl; + friend class InternalNode; +}; +inline std::ostream& operator<<(std::ostream& os, const Node& node) { + return node.dump_brief(os); +} + +/** + * InternalNode + * + * A concrete implementation of Node class that represents an internal tree + * node. Its level is always positive and its values are logical block + * addresses to its child nodes. An internal node cannot be empty. + */ +class InternalNode final : public Node { + public: + // public to Node + InternalNode(InternalNodeImpl*, NodeImplURef&&); + ~InternalNode() override { assert(tracked_child_nodes.empty()); } + InternalNode(const InternalNode&) = delete; + InternalNode(InternalNode&&) = delete; + InternalNode& operator=(const InternalNode&) = delete; + InternalNode& operator=(InternalNode&&) = delete; + + node_future<> apply_child_split( + context_t, const search_position_t&, Ref<Node> left, Ref<Node> right); + template <bool VALIDATE> + void do_track_child(Node& child) { + if constexpr (VALIDATE) { + validate_child(child); + } + auto& child_pos = child.parent_info().position; + assert(tracked_child_nodes.find(child_pos) == tracked_child_nodes.end()); + tracked_child_nodes[child_pos] = &child; + } + void do_untrack_child(const Node& child) { + auto& child_pos = child.parent_info().position; + assert(tracked_child_nodes.find(child_pos)->second == &child); + [[maybe_unused]] auto removed = tracked_child_nodes.erase(child_pos); + assert(removed); + } + + static node_future<Ref<InternalNode>> allocate_root( + context_t, level_t, laddr_t, Super::URef&&); + + protected: + node_future<Ref<tree_cursor_t>> lookup_smallest(context_t) override; + node_future<Ref<tree_cursor_t>> lookup_largest(context_t) override; + node_future<search_result_t> lower_bound_tracked( + context_t, const key_hobj_t&, MatchHistory&) override; + node_future<> do_get_tree_stats(context_t, tree_stats_t&) override; + + node_future<> test_clone_root(context_t, RootNodeTracker&) const override; + + private: + // XXX: extract a common tracker for InternalNode to track Node, + // and LeafNode to track tree_cursor_t. + node_future<Ref<Node>> get_or_track_child(context_t, const search_position_t&, laddr_t); + void track_insert( + const search_position_t&, match_stage_t, Ref<Node>, Ref<Node> nxt_child = nullptr); + void replace_track(const search_position_t&, Ref<Node> new_child, Ref<Node> old_child); + void track_split(const search_position_t&, Ref<InternalNode>); + void validate_tracked_children() const { +#ifndef NDEBUG + for (auto& kv : tracked_child_nodes) { + assert(kv.first == kv.second->parent_info().position); + validate_child(*kv.second); + } +#endif + } + void validate_child(const Node& child) const; + + struct fresh_node_t { + Ref<InternalNode> node; + NodeExtentMutable mut; + std::pair<Ref<Node>, NodeExtentMutable> make_pair() { + return std::make_pair(Ref<Node>(node), mut); + } + }; + static node_future<fresh_node_t> allocate(context_t, field_type_t, bool, level_t); + + private: + /** + * Reversed resource management (InternalNode) + * + * InteralNode keeps track of its child nodes which are still alive in + * memory, and their positions will be updated throughout + * insert/split/delete/merge operations of this node. 
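+   *
+   * Children are tracked by raw pointer because each child already holds
+   * a Ref to this parent (see Node::as_child()), so holding a Ref back
+   * would create a cycle; a child untracks itself from its parent in
+   * ~Node().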
+ */ + // XXX: leverage intrusive data structure to control memory overhead + std::map<search_position_t, Node*> tracked_child_nodes; + InternalNodeImpl* impl; +}; + +/** + * LeafNode + * + * A concrete implementation of Node class that represents a leaf tree node. + * Its level is always 0. A leaf node can only be empty if it is root. + */ +class LeafNode final : public Node { + public: + // public to tree_cursor_t + ~LeafNode() override { assert(tracked_cursors.empty()); } + LeafNode(const LeafNode&) = delete; + LeafNode(LeafNode&&) = delete; + LeafNode& operator=(const LeafNode&) = delete; + LeafNode& operator=(LeafNode&&) = delete; + + bool is_level_tail() const; + layout_version_t get_layout_version() const { return layout_version; } + std::tuple<key_view_t, const onode_t*, layout_version_t> get_kv( + const search_position_t&) const; + template <bool VALIDATE> + void do_track_cursor(tree_cursor_t& cursor) { + if constexpr (VALIDATE) { + validate_cursor(cursor); + } + auto& cursor_pos = cursor.get_position(); + assert(tracked_cursors.find(cursor_pos) == tracked_cursors.end()); + tracked_cursors[cursor_pos] = &cursor; + } + void do_untrack_cursor(tree_cursor_t& cursor) { + validate_cursor(cursor); + auto& cursor_pos = cursor.get_position(); + assert(tracked_cursors.find(cursor_pos)->second == &cursor); + [[maybe_unused]] auto removed = tracked_cursors.erase(cursor_pos); + assert(removed); + } + + protected: + node_future<Ref<tree_cursor_t>> lookup_smallest(context_t) override; + node_future<Ref<tree_cursor_t>> lookup_largest(context_t) override; + node_future<search_result_t> lower_bound_tracked( + context_t, const key_hobj_t&, MatchHistory&) override; + node_future<> do_get_tree_stats(context_t, tree_stats_t&) override; + + node_future<> test_clone_root(context_t, RootNodeTracker&) const override; + + private: + LeafNode(LeafNodeImpl*, NodeImplURef&&); + node_future<Ref<tree_cursor_t>> insert_value( + context_t, const key_hobj_t&, const onode_t&, + const search_position_t&, const MatchHistory&, + match_stat_t mstat); + static node_future<Ref<LeafNode>> allocate_root(context_t, RootNodeTracker&); + friend class Node; + + private: + // XXX: extract a common tracker for InternalNode to track Node, + // and LeafNode to track tree_cursor_t. + Ref<tree_cursor_t> get_or_track_cursor( + const search_position_t&, const key_view_t&, const onode_t*); + Ref<tree_cursor_t> track_insert( + const search_position_t&, match_stage_t, const onode_t*); + void track_split(const search_position_t&, Ref<LeafNode>); + void validate_tracked_cursors() const { +#ifndef NDEBUG + for (auto& kv : tracked_cursors) { + assert(kv.first == kv.second->get_position()); + validate_cursor(*kv.second); + } +#endif + } + void validate_cursor(tree_cursor_t& cursor) const; + // invalidate p_value pointers in tree_cursor_t + void on_layout_change() { ++layout_version; } + + struct fresh_node_t { + Ref<LeafNode> node; + NodeExtentMutable mut; + std::pair<Ref<Node>, NodeExtentMutable> make_pair() { + return std::make_pair(Ref<Node>(node), mut); + } + }; + static node_future<fresh_node_t> allocate(context_t, field_type_t, bool); + + private: + /** + * Reversed resource management (LeafNode) + * + * LeafNode keeps track of the referencing cursors which are still alive in + * memory, and their positions will be updated throughout + * insert/split/delete/merge operations of this node. 
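+   *
+   * Cursors are tracked by raw pointer for the same reason: each cursor
+   * holds a Ref to this LeafNode and untracks itself in ~tree_cursor_t().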
+ */ + // XXX: leverage intrusive data structure to control memory overhead + std::map<search_position_t, tree_cursor_t*> tracked_cursors; + LeafNodeImpl* impl; + layout_version_t layout_version = 0; +}; + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_delta_recorder.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_delta_recorder.h new file mode 100644 index 000000000..d08a99015 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_delta_recorder.h @@ -0,0 +1,42 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "include/buffer.h" +#include "node_types.h" + +namespace crimson::os::seastore::onode { + +/** + * DeltaRecorder + * + * An abstracted class to encapsulate different implementations to apply delta + * to a specific node layout. + */ +class DeltaRecorder { + public: + virtual ~DeltaRecorder() { + assert(is_empty()); + } + + bool is_empty() const { + return encoded.length() == 0; + } + + ceph::bufferlist get_delta() { + assert(!is_empty()); + return std::move(encoded); + } + + virtual node_type_t node_type() const = 0; + virtual field_type_t field_type() const = 0; + virtual void apply_delta(ceph::bufferlist::const_iterator&, + NodeExtentMutable&) = 0; + + protected: + DeltaRecorder() = default; + ceph::bufferlist encoded; +}; + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_accessor.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_accessor.h new file mode 100644 index 000000000..94782f50d --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_accessor.h @@ -0,0 +1,413 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "crimson/common/log.h" +#include "node_extent_manager.h" +#include "node_delta_recorder.h" +#include "node_layout_replayable.h" + +#ifndef NDEBUG +#include "node_extent_manager/test_replay.h" +#endif + +namespace crimson::os::seastore::onode { + +/** + * DeltaRecorderT + * + * Responsible to encode and decode delta, and apply delta for a specific node + * layout. 
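+ *
+ * Each encode_*() call appends one self-describing record to the pending
+ * bufferlist, e.g. for an insertion (sketch of the field order encoded
+ * below):
+ *
+ *   op_t::INSERT | key | value | insert_pos | insert_stage | insert_size
+ *
+ * During replay, apply_delta() decodes the records in order and
+ * re-applies them to the extent through NodeLayoutReplayableT.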
+ */ +template <typename FieldType, node_type_t NODE_TYPE> +class DeltaRecorderT final: public DeltaRecorder { + enum class op_t : uint8_t { + INSERT, + SPLIT, + SPLIT_INSERT, + UPDATE_CHILD_ADDR, + }; + + public: + using layout_t = NodeLayoutReplayableT<FieldType, NODE_TYPE>; + using node_stage_t = typename layout_t::node_stage_t; + using position_t = typename layout_t::position_t; + using StagedIterator = typename layout_t::StagedIterator; + using value_t = typename layout_t::value_t; + static constexpr auto FIELD_TYPE = layout_t::FIELD_TYPE; + + ~DeltaRecorderT() override = default; + + template <KeyT KT> + void encode_insert( + const full_key_t<KT>& key, + const value_t& value, + const position_t& insert_pos, + const match_stage_t& insert_stage, + const node_offset_t& insert_size) { + ceph::encode(op_t::INSERT, encoded); + encode_key<KT>(key, encoded); + encode_value(value, encoded); + insert_pos.encode(encoded); + ceph::encode(insert_stage, encoded); + ceph::encode(insert_size, encoded); + } + + void encode_split( + const StagedIterator& split_at, + const char* p_node_start) { + ceph::encode(op_t::SPLIT, encoded); + split_at.encode(p_node_start, encoded); + } + + template <KeyT KT> + void encode_split_insert( + const StagedIterator& split_at, + const full_key_t<KT>& key, + const value_t& value, + const position_t& insert_pos, + const match_stage_t& insert_stage, + const node_offset_t& insert_size, + const char* p_node_start) { + ceph::encode(op_t::SPLIT_INSERT, encoded); + split_at.encode(p_node_start, encoded); + encode_key<KT>(key, encoded); + encode_value(value, encoded); + insert_pos.encode(encoded); + ceph::encode(insert_stage, encoded); + ceph::encode(insert_size, encoded); + } + + void encode_update_child_addr( + const laddr_t new_addr, + const laddr_packed_t* p_addr, + const char* p_node_start) { + ceph::encode(op_t::UPDATE_CHILD_ADDR, encoded); + ceph::encode(new_addr, encoded); + int node_offset = reinterpret_cast<const char*>(p_addr) - p_node_start; + assert(node_offset > 0 && node_offset <= NODE_BLOCK_SIZE); + ceph::encode(static_cast<node_offset_t>(node_offset), encoded); + } + + static DeltaRecorderURef create() { + return std::unique_ptr<DeltaRecorder>(new DeltaRecorderT()); + } + + protected: + DeltaRecorderT() = default; + node_type_t node_type() const override { return NODE_TYPE; } + field_type_t field_type() const override { return FIELD_TYPE; } + void apply_delta(ceph::bufferlist::const_iterator& delta, + NodeExtentMutable& node) override { + assert(is_empty()); + node_stage_t stage(reinterpret_cast<const FieldType*>(node.get_read())); + op_t op; + try { + ceph::decode(op, delta); + switch (op) { + case op_t::INSERT: { + logger().debug("OTree::Extent::Replay: decoding INSERT ..."); + auto key = key_hobj_t::decode(delta); + + std::unique_ptr<char[]> value_storage_heap; + value_t value_storage_stack; + auto p_value = decode_value(delta, value_storage_heap, value_storage_stack); + + auto insert_pos = position_t::decode(delta); + match_stage_t insert_stage; + ceph::decode(insert_stage, delta); + node_offset_t insert_size; + ceph::decode(insert_size, delta); + logger().debug("OTree::Extent::Replay: apply {}, {}, " + "insert_pos({}), insert_stage={}, insert_size={}B ...", + key, *p_value, insert_pos, insert_stage, insert_size); + layout_t::template insert<KeyT::HOBJ>( + node, stage, key, *p_value, insert_pos, insert_stage, insert_size); + break; + } + case op_t::SPLIT: { + logger().debug("OTree::Extent::Replay: decoding SPLIT ..."); + auto split_at = 
StagedIterator::decode(stage.p_start(), delta); + logger().debug("OTree::Extent::Replay: apply split_at={} ...", split_at); + layout_t::split(node, stage, split_at); + break; + } + case op_t::SPLIT_INSERT: { + logger().debug("OTree::Extent::Replay: decoding SPLIT_INSERT ..."); + auto split_at = StagedIterator::decode(stage.p_start(), delta); + auto key = key_hobj_t::decode(delta); + + std::unique_ptr<char[]> value_storage_heap; + value_t value_storage_stack; + auto p_value = decode_value(delta, value_storage_heap, value_storage_stack); + + auto insert_pos = position_t::decode(delta); + match_stage_t insert_stage; + ceph::decode(insert_stage, delta); + node_offset_t insert_size; + ceph::decode(insert_size, delta); + logger().debug("OTree::Extent::Replay: apply split_at={}, {}, {}, " + "insert_pos({}), insert_stage={}, insert_size={}B ...", + split_at, key, *p_value, insert_pos, insert_stage, insert_size); + layout_t::template split_insert<KeyT::HOBJ>( + node, stage, split_at, key, *p_value, insert_pos, insert_stage, insert_size); + break; + } + case op_t::UPDATE_CHILD_ADDR: { + logger().debug("OTree::Extent::Replay: decoding UPDATE_CHILD_ADDR ..."); + laddr_t new_addr; + ceph::decode(new_addr, delta); + node_offset_t update_offset; + ceph::decode(update_offset, delta); + auto p_addr = reinterpret_cast<laddr_packed_t*>( + node.get_write() + update_offset); + logger().debug("OTree::Extent::Replay: apply {:#x} to offset {:#x} ...", + new_addr, update_offset); + layout_t::update_child_addr(node, new_addr, p_addr); + break; + } + default: + logger().error("OTree::Extent::Replay: got unknown op {} when replay {:#x}", + op, node.get_laddr()); + ceph_abort(); + } + } catch (buffer::error& e) { + logger().error("OTree::Extent::Replay: got decode error {} when replay {:#x}", + e, node.get_laddr()); + ceph_abort(); + } + } + + private: + static void encode_value(const value_t& value, ceph::bufferlist& encoded) { + if constexpr (std::is_same_v<value_t, laddr_packed_t>) { + // NODE_TYPE == node_type_t::INTERNAL + ceph::encode(value.value, encoded); + } else if constexpr (std::is_same_v<value_t, onode_t>) { + // NODE_TYPE == node_type_t::LEAF + value.encode(encoded); + } else { + ceph_abort("impossible path"); + } + } + + static value_t* decode_value(ceph::bufferlist::const_iterator& delta, + std::unique_ptr<char[]>& value_storage_heap, + value_t& value_storage_stack) { + if constexpr (std::is_same_v<value_t, laddr_packed_t>) { + // NODE_TYPE == node_type_t::INTERNAL + laddr_t value; + ceph::decode(value, delta); + value_storage_stack.value = value; + return &value_storage_stack; + } else if constexpr (std::is_same_v<value_t, onode_t>) { + // NODE_TYPE == node_type_t::LEAF + auto value_config = onode_t::decode(delta); + value_storage_heap = onode_t::allocate(value_config); + return reinterpret_cast<onode_t*>(value_storage_heap.get()); + } else { + ceph_abort("impossible path"); + } + } + + static seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_filestore); + } +}; + +/** + * NodeExtentAccessorT + * + * This component is responsible to reference and mutate the underlying + * NodeExtent, record mutation parameters when needed, and apply the recorded + * modifications for a specific node layout. 
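+ *
+ * Behaviour depends on the extent state (see the "Possible states" note
+ * near the bottom of the class):
+ * - INITIAL_WRITE_PENDING: mutate in place, no delta recording needed;
+ * - MUTATION_PENDING: mutate in place and record the delta;
+ * - CLEAN/DIRTY: read-only until prepare_mutate() duplicates the extent.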
+ */ +template <typename FieldType, node_type_t NODE_TYPE> +class NodeExtentAccessorT { + public: + using layout_t = NodeLayoutReplayableT<FieldType, NODE_TYPE>; + using node_stage_t = typename layout_t::node_stage_t; + using position_t = typename layout_t::position_t; + using recorder_t = DeltaRecorderT<FieldType, NODE_TYPE>; + using StagedIterator = typename layout_t::StagedIterator; + using value_t = typename layout_t::value_t; + static constexpr auto FIELD_TYPE = layout_t::FIELD_TYPE; + + NodeExtentAccessorT(NodeExtentRef extent) + : extent{extent}, + node_stage{reinterpret_cast<const FieldType*>(extent->get_read())} { + if (no_recording()) { + mut.emplace(extent->get_mutable()); + assert(extent->get_recorder() == nullptr); + recorder = nullptr; + } else if (needs_recording()) { + mut.emplace(extent->get_mutable()); + auto p_recorder = extent->get_recorder(); + assert(p_recorder != nullptr); + assert(p_recorder->node_type() == NODE_TYPE); + assert(p_recorder->field_type() == FIELD_TYPE); + recorder = static_cast<recorder_t*>(p_recorder); + } else if (needs_mutate()) { + // mut is empty + assert(extent->get_recorder() == nullptr || + extent->get_recorder()->is_empty()); + recorder = nullptr; + } else { + ceph_abort("impossible path"); + } +#ifndef NDEBUG + auto ref_recorder = recorder_t::create(); + test_recorder = static_cast<recorder_t*>(ref_recorder.get()); + test_extent = TestReplayExtent::create( + extent->get_length(), std::move(ref_recorder)); +#endif + } + ~NodeExtentAccessorT() = default; + NodeExtentAccessorT(const NodeExtentAccessorT&) = delete; + NodeExtentAccessorT(NodeExtentAccessorT&&) = delete; + NodeExtentAccessorT& operator=(const NodeExtentAccessorT&) = delete; + NodeExtentAccessorT& operator=(NodeExtentAccessorT&&) = delete; + + const node_stage_t& read() const { return node_stage; } + laddr_t get_laddr() const { return extent->get_laddr(); } + + // must be called before any mutate attempes. + // for the safety of mixed read and mutate, call before read. 
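+  // If the extent is CLEAN/DIRTY, duplicate it for write via
+  // NodeExtent::mutate() and attach a fresh DeltaRecorderT so that the
+  // following *_replayable() calls get recorded; otherwise this is a
+  // no-op.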
+ void prepare_mutate(context_t c) { + if (needs_mutate()) { + auto ref_recorder = recorder_t::create(); + recorder = static_cast<recorder_t*>(ref_recorder.get()); + extent = extent->mutate(c, std::move(ref_recorder)); + assert(needs_recording()); + node_stage = node_stage_t( + reinterpret_cast<const FieldType*>(extent->get_read())); + assert(recorder == static_cast<recorder_t*>(extent->get_recorder())); + mut.emplace(extent->get_mutable()); + } + } + + template <KeyT KT> + const value_t* insert_replayable( + const full_key_t<KT>& key, + const value_t& value, + position_t& insert_pos, + match_stage_t& insert_stage, + node_offset_t& insert_size) { + assert(!needs_mutate()); + if (needs_recording()) { + recorder->template encode_insert<KT>( + key, value, insert_pos, insert_stage, insert_size); + } +#ifndef NDEBUG + test_extent->prepare_replay(extent); + test_recorder->template encode_insert<KT>( + key, value, insert_pos, insert_stage, insert_size); +#endif + auto ret = layout_t::template insert<KT>( + *mut, read(), key, value, + insert_pos, insert_stage, insert_size); +#ifndef NDEBUG + test_extent->replay_and_verify(extent); +#endif + return ret; + } + + void split_replayable(StagedIterator& split_at) { + assert(!needs_mutate()); + if (needs_recording()) { + recorder->encode_split(split_at, read().p_start()); + } +#ifndef NDEBUG + test_extent->prepare_replay(extent); + test_recorder->template encode_split(split_at, read().p_start()); +#endif + layout_t::split(*mut, read(), split_at); +#ifndef NDEBUG + test_extent->replay_and_verify(extent); +#endif + } + + template <KeyT KT> + const value_t* split_insert_replayable( + StagedIterator& split_at, + const full_key_t<KT>& key, + const value_t& value, + position_t& insert_pos, + match_stage_t& insert_stage, + node_offset_t& insert_size) { + assert(!needs_mutate()); + if (needs_recording()) { + recorder->template encode_split_insert<KT>( + split_at, key, value, insert_pos, insert_stage, insert_size, + read().p_start()); + } +#ifndef NDEBUG + test_extent->prepare_replay(extent); + test_recorder->template encode_split_insert<KT>( + split_at, key, value, insert_pos, insert_stage, insert_size, + read().p_start()); +#endif + auto ret = layout_t::template split_insert<KT>( + *mut, read(), split_at, key, value, + insert_pos, insert_stage, insert_size); +#ifndef NDEBUG + test_extent->replay_and_verify(extent); +#endif + return ret; + } + + void update_child_addr_replayable( + const laddr_t new_addr, laddr_packed_t* p_addr) { + assert(!needs_mutate()); + if (needs_recording()) { + recorder->encode_update_child_addr(new_addr, p_addr, read().p_start()); + } +#ifndef NDEBUG + test_extent->prepare_replay(extent); + test_recorder->encode_update_child_addr(new_addr, p_addr, read().p_start()); +#endif + layout_t::update_child_addr(*mut, new_addr, p_addr); +#ifndef NDEBUG + test_extent->replay_and_verify(extent); +#endif + } + + void test_copy_to(NodeExtentMutable& to) const { + assert(extent->get_length() == to.get_length()); + std::memcpy(to.get_write(), extent->get_read(), extent->get_length()); + } + + private: + /** + * Possible states with CachedExtent::extent_state_t: + * INITIAL_WRITE_PENDING -- can mutate, no recording + * MUTATION_PENDING -- can mutate, needs recording + * CLEAN/DIRTY -- pending mutate + * INVALID -- impossible + */ + bool no_recording() const { + return extent->is_initial_pending(); + } + bool needs_recording() const { + return extent->is_mutation_pending(); + } + bool needs_mutate() const { + assert(extent->is_valid()); + return 
!extent->is_pending(); + } + + NodeExtentRef extent; + node_stage_t node_stage; + std::optional<NodeExtentMutable> mut; + // owned by extent + recorder_t* recorder; + +#ifndef NDEBUG + // verify record replay using a different memory block + TestReplayExtent::Ref test_extent; + recorder_t* test_recorder; +#endif +}; + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager.cc new file mode 100644 index 000000000..bd22d4b67 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager.cc @@ -0,0 +1,35 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#include "node_extent_manager.h" + +#include "node_extent_manager/dummy.h" +#include "node_extent_manager/seastore.h" +#include "stages/node_stage_layout.h" + +namespace crimson::os::seastore::onode { + +std::pair<node_type_t, field_type_t> NodeExtent::get_types() const { + const auto header = reinterpret_cast<const node_header_t*>(get_read()); + auto node_type = header->get_node_type(); + auto field_type = header->get_field_type(); + if (!field_type.has_value()) { + throw std::runtime_error("load failed: bad field type"); + } + return {node_type, *field_type}; +} + +NodeExtentManagerURef NodeExtentManager::create_dummy(bool is_sync) { + if (is_sync) { + return NodeExtentManagerURef(new DummyNodeExtentManager<true>()); + } else { + return NodeExtentManagerURef(new DummyNodeExtentManager<false>()); + } +} + +NodeExtentManagerURef NodeExtentManager::create_seastore( + TransactionManager& tm, laddr_t min_laddr) { + return NodeExtentManagerURef(new SeastoreNodeExtentManager(tm, min_laddr)); +} + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager.h new file mode 100644 index 000000000..77b230e03 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager.h @@ -0,0 +1,86 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "crimson/common/type_helpers.h" +#include "crimson/os/seastore/cached_extent.h" +#include "crimson/os/seastore/transaction_manager.h" + +#include "fwd.h" +#include "super.h" +#include "node_extent_mutable.h" +#include "node_types.h" + +/** + * node_extent_manager.h + * + * Contains general interfaces for different backends (Dummy and Seastore). + */ + +namespace crimson::os::seastore::onode { + +using crimson::os::seastore::LogicalCachedExtent; +class NodeExtent : public LogicalCachedExtent { + public: + virtual ~NodeExtent() = default; + std::pair<node_type_t, field_type_t> get_types() const; + const char* get_read() const { + return get_bptr().c_str(); + } + NodeExtentMutable get_mutable() { + assert(is_pending()); + return do_get_mutable(); + } + + virtual DeltaRecorder* get_recorder() const = 0; + virtual NodeExtentRef mutate(context_t, DeltaRecorderURef&&) = 0; + + protected: + template <typename... T> + NodeExtent(T&&... t) : LogicalCachedExtent(std::forward<T>(t)...) 
{} + + NodeExtentMutable do_get_mutable() { + return NodeExtentMutable(*this); + } + + /** + * Abstracted interfaces to implement: + * - CacheExtent::duplicate_for_write() -> CachedExtentRef + * - CacheExtent::get_type() -> extent_types_t + * - CacheExtent::get_delta() -> ceph::bufferlist + * - LogicalCachedExtent::apply_delta(const ceph::bufferlist) -> void + */ + + private: + friend class NodeExtentMutable; +}; + +using crimson::os::seastore::TransactionManager; +class NodeExtentManager { + public: + virtual ~NodeExtentManager() = default; + using tm_ertr = crimson::errorator< + crimson::ct_error::input_output_error, + crimson::ct_error::invarg, + crimson::ct_error::enoent, + crimson::ct_error::erange>; + template <class ValueT=void> + using tm_future = tm_ertr::future<ValueT>; + + virtual bool is_read_isolated() const = 0; + virtual tm_future<NodeExtentRef> read_extent( + Transaction&, laddr_t, extent_len_t) = 0; + virtual tm_future<NodeExtentRef> alloc_extent(Transaction&, extent_len_t) = 0; + virtual tm_future<Super::URef> get_super(Transaction&, RootNodeTracker&) = 0; + virtual std::ostream& print(std::ostream& os) const = 0; + + static NodeExtentManagerURef create_dummy(bool is_sync); + static NodeExtentManagerURef create_seastore( + TransactionManager& tm, laddr_t min_laddr = L_ADDR_MIN); +}; +inline std::ostream& operator<<(std::ostream& os, const NodeExtentManager& nm) { + return nm.print(os); +} + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/dummy.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/dummy.h new file mode 100644 index 000000000..830ea4a7d --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/dummy.h @@ -0,0 +1,156 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <chrono> +#include <seastar/core/sleep.hh> + +#include "include/buffer_raw.h" + +#include "crimson/common/log.h" +#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager.h" + +/** + * dummy.h + * + * Dummy backend implementations for test purposes. 
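+ *
+ * Extents live in an in-memory map keyed by a fake laddr_t derived from
+ * the buffer address; the asynchronous variant (SYNC=false) adds a 1us
+ * sleep before each operation to exercise the futurized code paths.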
+ */ + +namespace crimson::os::seastore::onode { + +class DummySuper final: public Super { + public: + DummySuper(Transaction& t, RootNodeTracker& tracker, laddr_t* p_root_laddr) + : Super(t, tracker), p_root_laddr{p_root_laddr} {} + ~DummySuper() override = default; + protected: + laddr_t get_root_laddr() const override { return *p_root_laddr; } + void write_root_laddr(context_t, laddr_t addr) override { + logger().info("OTree::Dummy: update root {:#x} ...", addr); + *p_root_laddr = addr; + } + private: + static seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_filestore); + } + laddr_t* p_root_laddr; +}; + +class DummyNodeExtent final: public NodeExtent { + public: + DummyNodeExtent(ceph::bufferptr &&ptr) : NodeExtent(std::move(ptr)) { + state = extent_state_t::INITIAL_WRITE_PENDING; + } + ~DummyNodeExtent() override = default; + protected: + NodeExtentRef mutate(context_t, DeltaRecorderURef&&) override { + ceph_abort("impossible path"); } + DeltaRecorder* get_recorder() const override { + return nullptr; } + CachedExtentRef duplicate_for_write() override { + ceph_abort("impossible path"); } + extent_types_t get_type() const override { + return extent_types_t::TEST_BLOCK; } + ceph::bufferlist get_delta() override { + ceph_abort("impossible path"); } + void apply_delta(const ceph::bufferlist&) override { + ceph_abort("impossible path"); } +}; + +template <bool SYNC> +class DummyNodeExtentManager final: public NodeExtentManager { + static constexpr size_t ALIGNMENT = 4096; + public: + ~DummyNodeExtentManager() override = default; + protected: + bool is_read_isolated() const override { return false; } + + tm_future<NodeExtentRef> read_extent( + Transaction& t, laddr_t addr, extent_len_t len) override { + logger().trace("OTree::Dummy: reading {}B at {:#x} ...", len, addr); + if constexpr (SYNC) { + return read_extent_sync(t, addr, len); + } else { + using namespace std::chrono_literals; + return seastar::sleep(1us).then([this, &t, addr, len] { + return read_extent_sync(t, addr, len); + }); + } + } + + tm_future<NodeExtentRef> alloc_extent( + Transaction& t, extent_len_t len) override { + logger().trace("OTree::Dummy: allocating {}B ...", len); + if constexpr (SYNC) { + return alloc_extent_sync(t, len); + } else { + using namespace std::chrono_literals; + return seastar::sleep(1us).then([this, &t, len] { + return alloc_extent_sync(t, len); + }); + } + } + + tm_future<Super::URef> get_super( + Transaction& t, RootNodeTracker& tracker) override { + logger().trace("OTree::Dummy: get root ..."); + if constexpr (SYNC) { + return get_super_sync(t, tracker); + } else { + using namespace std::chrono_literals; + return seastar::sleep(1us).then([this, &t, &tracker] { + return get_super_sync(t, tracker); + }); + } + } + + std::ostream& print(std::ostream& os) const override { + return os << "DummyNodeExtentManager(sync=" << SYNC << ")"; + } + + private: + tm_future<NodeExtentRef> read_extent_sync( + Transaction& t, laddr_t addr, extent_len_t len) { + auto iter = allocate_map.find(addr); + assert(iter != allocate_map.end()); + auto extent = iter->second; + logger().trace("OTree::Dummy: read {}B at {:#x}", + extent->get_length(), extent->get_laddr()); + assert(extent->get_laddr() == addr); + assert(extent->get_length() == len); + return tm_ertr::make_ready_future<NodeExtentRef>(extent); + } + + tm_future<NodeExtentRef> alloc_extent_sync( + Transaction& t, extent_len_t len) { + assert(len % ALIGNMENT == 0); + auto r = ceph::buffer::create_aligned(len, ALIGNMENT); + auto addr = 
reinterpret_cast<laddr_t>(r->get_data()); + auto bp = ceph::bufferptr(std::move(r)); + auto extent = Ref<DummyNodeExtent>(new DummyNodeExtent(std::move(bp))); + extent->set_laddr(addr); + assert(allocate_map.find(extent->get_laddr()) == allocate_map.end()); + allocate_map.insert({extent->get_laddr(), extent}); + logger().debug("OTree::Dummy: allocated {}B at {:#x}", + extent->get_length(), extent->get_laddr()); + assert(extent->get_length() == len); + return tm_ertr::make_ready_future<NodeExtentRef>(extent); + } + + tm_future<Super::URef> get_super_sync( + Transaction& t, RootNodeTracker& tracker) { + logger().debug("OTree::Dummy: got root {:#x}", root_laddr); + return tm_ertr::make_ready_future<Super::URef>( + Super::URef(new DummySuper(t, tracker, &root_laddr))); + } + + static seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_filestore); + } + + std::map<laddr_t, Ref<DummyNodeExtent>> allocate_map; + laddr_t root_laddr = L_ADDR_NULL; +}; + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/seastore.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/seastore.cc new file mode 100644 index 000000000..8d88485bf --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/seastore.cc @@ -0,0 +1,88 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "seastore.h" + +#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_accessor.h" +#include "crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage_layout.h" + +namespace { + +seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_filestore); +} + +} + +namespace crimson::os::seastore::onode { + +static DeltaRecorderURef create_recorder( + node_type_t node_type, field_type_t field_type) { + if (node_type == node_type_t::LEAF) { + if (field_type == field_type_t::N0) { + return DeltaRecorderT<node_fields_0_t, node_type_t::LEAF>::create(); + } else if (field_type == field_type_t::N1) { + return DeltaRecorderT<node_fields_1_t, node_type_t::LEAF>::create(); + } else if (field_type == field_type_t::N2) { + return DeltaRecorderT<node_fields_2_t, node_type_t::LEAF>::create(); + } else if (field_type == field_type_t::N3) { + return DeltaRecorderT<leaf_fields_3_t, node_type_t::LEAF>::create(); + } else { + ceph_abort("impossible path"); + } + } else if (node_type == node_type_t::INTERNAL) { + if (field_type == field_type_t::N0) { + return DeltaRecorderT<node_fields_0_t, node_type_t::INTERNAL>::create(); + } else if (field_type == field_type_t::N1) { + return DeltaRecorderT<node_fields_1_t, node_type_t::INTERNAL>::create(); + } else if (field_type == field_type_t::N2) { + return DeltaRecorderT<node_fields_2_t, node_type_t::INTERNAL>::create(); + } else if (field_type == field_type_t::N3) { + return DeltaRecorderT<internal_fields_3_t, node_type_t::INTERNAL>::create(); + } else { + ceph_abort("impossible path"); + } + } else { + ceph_abort("impossible path"); + } +} + +void SeastoreSuper::write_root_laddr(context_t c, laddr_t addr) { + logger().info("OTree::Seastore: update root {:#x} ...", addr); + root_addr = addr; + auto nm = static_cast<SeastoreNodeExtentManager*>(&c.nm); + nm->get_tm().write_onode_root(c.t, addr); +} + +NodeExtentRef SeastoreNodeExtent::mutate( + context_t c, DeltaRecorderURef&& _recorder) { + logger().debug("OTree::Seastore: mutate {:#x} ...", get_laddr()); + auto nm = static_cast<SeastoreNodeExtentManager*>(&c.nm); + auto 
extent = nm->get_tm().get_mutable_extent(c.t, this); + auto ret = extent->cast<SeastoreNodeExtent>(); + assert(!ret->recorder || ret->recorder->is_empty()); + ret->recorder = std::move(_recorder); + return ret; +} + +void SeastoreNodeExtent::apply_delta(const ceph::bufferlist& bl) { + logger().debug("OTree::Seastore: replay {:#x} ...", get_laddr()); + if (!recorder) { + auto [node_type, field_type] = get_types(); + recorder = create_recorder(node_type, field_type); + } else { +#ifndef NDEBUG + auto [node_type, field_type] = get_types(); + assert(recorder->node_type() == node_type); + assert(recorder->field_type() == field_type); +#endif + } + assert(is_clean()); + auto node = do_get_mutable(); + auto p = bl.cbegin(); + while (p != bl.end()) { + recorder->apply_delta(p, node); + } +} + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/seastore.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/seastore.h new file mode 100644 index 000000000..f80b99fab --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/seastore.h @@ -0,0 +1,126 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "crimson/common/log.h" +#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager.h" +#include "crimson/os/seastore/onode_manager/staged-fltree/node_delta_recorder.h" + +/** + * seastore.h + * + * Seastore backend implementations. + */ + +namespace crimson::os::seastore::onode { + +class SeastoreSuper final: public Super { + public: + SeastoreSuper(Transaction& t, RootNodeTracker& tracker, + laddr_t root_addr, TransactionManager& tm) + : Super(t, tracker), root_addr{root_addr}, tm{tm} {} + ~SeastoreSuper() override = default; + protected: + laddr_t get_root_laddr() const override { + return root_addr; + } + void write_root_laddr(context_t c, laddr_t addr) override; + private: + static seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_filestore); + } + laddr_t root_addr; + TransactionManager& tm; +}; + +class SeastoreNodeExtent final: public NodeExtent { + public: + SeastoreNodeExtent(ceph::bufferptr &&ptr) + : NodeExtent(std::move(ptr)) {} + SeastoreNodeExtent(const SeastoreNodeExtent& other) + : NodeExtent(other) {} + ~SeastoreNodeExtent() override = default; + protected: + NodeExtentRef mutate(context_t, DeltaRecorderURef&&) override; + + DeltaRecorder* get_recorder() const override { + return recorder.get(); + } + + CachedExtentRef duplicate_for_write() override { + return CachedExtentRef(new SeastoreNodeExtent(*this)); + } + extent_types_t get_type() const override { + return extent_types_t::ONODE_BLOCK_STAGED; + } + ceph::bufferlist get_delta() override { + assert(recorder); + return recorder->get_delta(); + } + void apply_delta(const ceph::bufferlist&) override; + private: + DeltaRecorderURef recorder; +}; + +class SeastoreNodeExtentManager final: public NodeExtentManager { + public: + SeastoreNodeExtentManager(TransactionManager& tm, laddr_t min) + : tm{tm}, addr_min{min} {}; + ~SeastoreNodeExtentManager() override = default; + TransactionManager& get_tm() { return tm; } + protected: + bool is_read_isolated() const override { return true; } + + tm_future<NodeExtentRef> read_extent( + Transaction& t, laddr_t addr, extent_len_t len) override { + logger().debug("OTree::Seastore: reading {}B at {:#x} ...", len, addr); + return tm.read_extents<SeastoreNodeExtent>(t, addr, len + ).safe_then([addr, 
len](auto&& extents) { + assert(extents.size() == 1); + [[maybe_unused]] auto [laddr, e] = extents.front(); + logger().trace("OTree::Seastore: read {}B at {:#x}", + e->get_length(), e->get_laddr()); + assert(e->get_laddr() == addr); + assert(e->get_length() == len); + std::ignore = addr; + std::ignore = len; + return NodeExtentRef(e); + }); + } + + tm_future<NodeExtentRef> alloc_extent( + Transaction& t, extent_len_t len) override { + logger().debug("OTree::Seastore: allocating {}B ...", len); + return tm.alloc_extent<SeastoreNodeExtent>(t, addr_min, len + ).safe_then([len](auto extent) { + logger().debug("OTree::Seastore: allocated {}B at {:#x}", + extent->get_length(), extent->get_laddr()); + assert(extent->get_length() == len); + std::ignore = len; + return NodeExtentRef(extent); + }); + } + + tm_future<Super::URef> get_super( + Transaction& t, RootNodeTracker& tracker) override { + logger().trace("OTree::Seastore: get root ..."); + return tm.read_onode_root(t).safe_then([this, &t, &tracker](auto root_addr) { + logger().debug("OTree::Seastore: got root {:#x}", root_addr); + return Super::URef(new SeastoreSuper(t, tracker, root_addr, tm)); + }); + } + + std::ostream& print(std::ostream& os) const override { + return os << "SeastoreNodeExtentManager"; + } + + private: + static seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_filestore); + } + TransactionManager& tm; + const laddr_t addr_min; +}; + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/test_replay.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/test_replay.h new file mode 100644 index 000000000..240c88932 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/test_replay.h @@ -0,0 +1,67 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "crimson/os/seastore/onode_manager/staged-fltree/node_delta_recorder.h" +#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager.h" + +/** test_replay.h + * + * A special version of NodeExtent to help verify delta encode, decode and + * replay in recorder_t under debug build. 
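+ *
+ * [Editor's sketch] The intended debug-only flow, roughly; `length`,
+ * `recorder` and the two extents below are hypothetical placeholders:
+ *
+ *   auto test = TestReplayExtent::create(length, std::move(recorder));
+ *   test->prepare_replay(extent_before_mutation);    // copy in the pre-image
+ *   // ... the real mutation happens elsewhere; its deltas end up in the recorder ...
+ *   test->replay_and_verify(extent_after_mutation);  // re-apply deltas, then memcmp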
+ */ + +namespace crimson::os::seastore::onode { + +class TestReplayExtent final: public NodeExtent { + public: + using Ref = crimson::os::seastore::TCachedExtentRef<TestReplayExtent>; + + void prepare_replay(NodeExtentRef from_extent) { + assert(get_length() == from_extent->get_length()); + auto mut = do_get_mutable(); + std::memcpy(mut.get_write(), from_extent->get_read(), get_length()); + } + + void replay_and_verify(NodeExtentRef replayed_extent) { + assert(get_length() == replayed_extent->get_length()); + auto mut = do_get_mutable(); + auto bl = recorder->get_delta(); + assert(bl.length()); + auto p = bl.cbegin(); + recorder->apply_delta(p, mut); + assert(p == bl.end()); + auto cmp = std::memcmp(get_read(), replayed_extent->get_read(), get_length()); + ceph_assert(cmp == 0 && "replay mismatch!"); + } + + static Ref create(extent_len_t length, DeltaRecorderURef&& recorder) { + auto r = ceph::buffer::create_aligned(length, 4096); + auto bp = ceph::bufferptr(std::move(r)); + return new TestReplayExtent(std::move(bp), std::move(recorder)); + } + + protected: + NodeExtentRef mutate(context_t, DeltaRecorderURef&&) override { + ceph_abort("impossible path"); } + DeltaRecorder* get_recorder() const override { + ceph_abort("impossible path"); } + CachedExtentRef duplicate_for_write() override { + ceph_abort("impossible path"); } + extent_types_t get_type() const override { + return extent_types_t::TEST_BLOCK; } + ceph::bufferlist get_delta() override { + ceph_abort("impossible path"); } + void apply_delta(const ceph::bufferlist&) override { + ceph_abort("impossible path"); } + + private: + TestReplayExtent(ceph::bufferptr&& ptr, DeltaRecorderURef&& recorder) + : NodeExtent(std::move(ptr)), recorder(std::move(recorder)) { + state = extent_state_t::MUTATION_PENDING; + } + DeltaRecorderURef recorder; +}; + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.cc new file mode 100644 index 000000000..048c4000d --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.cc @@ -0,0 +1,39 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#include "node_extent_mutable.h" +#include "node_extent_manager.h" + +namespace crimson::os::seastore::onode { + +NodeExtentMutable::NodeExtentMutable(NodeExtent& extent) + : extent{extent} { + assert(extent.is_pending() || // during mutation + extent.is_clean()); // during replay +} + +const char* NodeExtentMutable::get_read() const { + assert(extent.is_pending() || // during mutation + extent.is_clean()); // during replay + return extent.get_bptr().c_str(); +} + +char* NodeExtentMutable::get_write() { + assert(extent.is_pending() || // during mutation + extent.is_clean()); // during replay + return extent.get_bptr().c_str(); +} + +extent_len_t NodeExtentMutable::get_length() const { + return extent.get_length(); +} + +laddr_t NodeExtentMutable::get_laddr() const { + return extent.get_laddr(); +} + +const char* NodeExtentMutable::buf_upper_bound() const { + return get_read() + get_length(); +} + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.h new file mode 100644 index 000000000..52f10a013 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.h @@ -0,0 +1,80 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; 
indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#include <cstring> + +#include "fwd.h" + +#pragma once + +namespace crimson::os::seastore::onode { + +class NodeExtent; + +/** + * NodeExtentMutable + * + * A thin wrapper of NodeExtent to make sure that only the newly allocated + * or the duplicated NodeExtent is mutable, and the memory modifications are + * safe within the extent range. + */ +class NodeExtentMutable { + public: + void copy_in_absolute(void* dst, const void* src, extent_len_t len) { + assert((char*)dst >= get_write()); + assert((char*)dst + len <= buf_upper_bound()); + std::memcpy(dst, src, len); + } + template <typename T> + void copy_in_absolute(void* dst, const T& src) { + copy_in_absolute(dst, &src, sizeof(T)); + } + + const void* copy_in_relative( + extent_len_t dst_offset, const void* src, extent_len_t len) { + auto dst = get_write() + dst_offset; + copy_in_absolute(dst, src, len); + return dst; + } + template <typename T> + const T* copy_in_relative( + extent_len_t dst_offset, const T& src) { + auto dst = copy_in_relative(dst_offset, &src, sizeof(T)); + return static_cast<const T*>(dst); + } + + void shift_absolute(const void* src, extent_len_t len, int offset) { + assert((const char*)src >= get_write()); + assert((const char*)src + len <= buf_upper_bound()); + char* to = (char*)src + offset; + assert(to >= get_write()); + assert(to + len <= buf_upper_bound()); + if (len != 0) { + std::memmove(to, src, len); + } + } + void shift_relative(extent_len_t src_offset, extent_len_t len, int offset) { + shift_absolute(get_write() + src_offset, len, offset); + } + + template <typename T> + void validate_inplace_update(const T& updated) { + assert((const char*)&updated >= get_write()); + assert((const char*)&updated + sizeof(T) <= buf_upper_bound()); + } + + const char* get_read() const; + char* get_write(); + extent_len_t get_length() const; + laddr_t get_laddr() const; + + private: + explicit NodeExtentMutable(NodeExtent&); + const char* buf_upper_bound() const; + + NodeExtent& extent; + + friend class NodeExtent; +}; + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_impl.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/node_impl.cc new file mode 100644 index 000000000..59d792b1a --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_impl.cc @@ -0,0 +1,76 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#include "node_impl.h" +#include "node_layout.h" + +namespace crimson::os::seastore::onode { + +#ifdef UNIT_TESTS_BUILT +last_split_info_t last_split = {}; +#endif + +// XXX: branchless allocation +InternalNodeImpl::alloc_ertr::future<InternalNodeImpl::fresh_impl_t> +InternalNodeImpl::allocate( + context_t c, field_type_t type, bool is_level_tail, level_t level) { + if (type == field_type_t::N0) { + return InternalNode0::allocate(c, is_level_tail, level); + } else if (type == field_type_t::N1) { + return InternalNode1::allocate(c, is_level_tail, level); + } else if (type == field_type_t::N2) { + return InternalNode2::allocate(c, is_level_tail, level); + } else if (type == field_type_t::N3) { + return InternalNode3::allocate(c, is_level_tail, level); + } else { + ceph_abort("impossible path"); + } +} + +LeafNodeImpl::alloc_ertr::future<LeafNodeImpl::fresh_impl_t> +LeafNodeImpl::allocate( + context_t c, field_type_t type, bool is_level_tail) { + if (type == field_type_t::N0) { + return LeafNode0::allocate(c, is_level_tail, 0); + } else if (type == 
field_type_t::N1) { + return LeafNode1::allocate(c, is_level_tail, 0); + } else if (type == field_type_t::N2) { + return LeafNode2::allocate(c, is_level_tail, 0); + } else if (type == field_type_t::N3) { + return LeafNode3::allocate(c, is_level_tail, 0); + } else { + ceph_abort("impossible path"); + } +} + +InternalNodeImplURef InternalNodeImpl::load( + NodeExtentRef extent, field_type_t type, bool expect_is_level_tail) { + if (type == field_type_t::N0) { + return InternalNode0::load(extent, expect_is_level_tail); + } else if (type == field_type_t::N1) { + return InternalNode1::load(extent, expect_is_level_tail); + } else if (type == field_type_t::N2) { + return InternalNode2::load(extent, expect_is_level_tail); + } else if (type == field_type_t::N3) { + return InternalNode3::load(extent, expect_is_level_tail); + } else { + ceph_abort("impossible path"); + } +} + +LeafNodeImplURef LeafNodeImpl::load( + NodeExtentRef extent, field_type_t type, bool expect_is_level_tail) { + if (type == field_type_t::N0) { + return LeafNode0::load(extent, expect_is_level_tail); + } else if (type == field_type_t::N1) { + return LeafNode1::load(extent, expect_is_level_tail); + } else if (type == field_type_t::N2) { + return LeafNode2::load(extent, expect_is_level_tail); + } else if (type == field_type_t::N3) { + return LeafNode3::load(extent, expect_is_level_tail); + } else { + ceph_abort("impossible path"); + } +} + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_impl.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_impl.h new file mode 100644 index 000000000..3267cda2b --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_impl.h @@ -0,0 +1,197 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <ostream> + +#include "node_extent_mutable.h" +#include "node_types.h" +#include "stages/stage_types.h" + +namespace crimson::os::seastore::onode { + +#ifdef UNIT_TESTS_BUILT +enum class InsertType { BEGIN, LAST, MID }; +struct split_expectation_t { + match_stage_t split_stage; + match_stage_t insert_stage; + bool is_insert_left; + InsertType insert_type; +}; +struct last_split_info_t { + search_position_t split_pos; + match_stage_t insert_stage; + bool is_insert_left; + InsertType insert_type; + bool match(const split_expectation_t& e) const { + match_stage_t split_stage; + if (split_pos.nxt.nxt.index == 0) { + if (split_pos.nxt.index == 0) { + split_stage = 2; + } else { + split_stage = 1; + } + } else { + split_stage = 0; + } + return split_stage == e.split_stage && + insert_stage == e.insert_stage && + is_insert_left == e.is_insert_left && + insert_type == e.insert_type; + } + bool match_split_pos(const search_position_t& pos) const { + return split_pos == pos; + } +}; +extern last_split_info_t last_split; +#endif + +struct key_hobj_t; +struct key_view_t; +class NodeExtentMutable; + +/** + * NodeImpl + * + * Hides type specific node layout implementations for Node. 
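+ *
+ * [Editor's sketch] Callers only ever hold these interface types, so a
+ * hypothetical helper (not part of this change) can stay layout-agnostic:
+ *
+ *   void dump_node(const NodeImpl& impl, std::ostream& os) {
+ *     os << "level=" << (unsigned)impl.level()
+ *        << " free=" << impl.free_size() << "B ";
+ *     impl.dump_brief(os);
+ *   }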
+ */ +class NodeImpl { + public: + using alloc_ertr = crimson::errorator< + crimson::ct_error::input_output_error, + crimson::ct_error::invarg, + crimson::ct_error::enoent, + crimson::ct_error::erange>; + virtual ~NodeImpl() = default; + + virtual field_type_t field_type() const = 0; + virtual laddr_t laddr() const = 0; + virtual void prepare_mutate(context_t) = 0; + virtual bool is_level_tail() const = 0; + virtual bool is_empty() const = 0; + virtual level_t level() const = 0; + virtual node_offset_t free_size() const = 0; + virtual key_view_t get_key_view(const search_position_t&) const = 0; + virtual key_view_t get_largest_key_view() const = 0; + virtual void next_position(search_position_t&) const = 0; + + virtual node_stats_t get_stats() const = 0; + virtual std::ostream& dump(std::ostream&) const = 0; + virtual std::ostream& dump_brief(std::ostream&) const = 0; + virtual void validate_layout() const = 0; + + virtual void test_copy_to(NodeExtentMutable&) const = 0; + virtual void test_set_tail(NodeExtentMutable&) = 0; + + protected: + NodeImpl() = default; +}; + +/** + * InternalNodeImpl + * + * Hides type specific node layout implementations for InternalNode. + */ +class InternalNodeImpl : public NodeImpl { + public: + struct internal_marker_t {}; + virtual ~InternalNodeImpl() = default; + + #pragma GCC diagnostic ignored "-Woverloaded-virtual" + virtual const laddr_packed_t* get_p_value( + const search_position_t&, + key_view_t* = nullptr, internal_marker_t = {}) const { + ceph_abort("impossible path"); + } + #pragma GCC diagnostic ignored "-Woverloaded-virtual" + virtual lookup_result_t<node_type_t::INTERNAL> lower_bound( + const key_hobj_t&, MatchHistory&, + key_view_t* = nullptr, internal_marker_t = {}) const { + ceph_abort("impossible path"); + } + #pragma GCC diagnostic ignored "-Woverloaded-virtual" + virtual const laddr_packed_t* insert( + const key_view_t&, const laddr_packed_t&, search_position_t&, match_stage_t&, node_offset_t&) { + ceph_abort("impossible path"); + } + #pragma GCC diagnostic ignored "-Woverloaded-virtual" + virtual std::tuple<search_position_t, bool, const laddr_packed_t*> split_insert( + NodeExtentMutable&, NodeImpl&, const key_view_t&, const laddr_packed_t&, + search_position_t&, match_stage_t&, node_offset_t&) { + ceph_abort("impossible path"); + } + + virtual void replace_child_addr(const search_position_t&, laddr_t dst, laddr_t src) = 0; + virtual std::tuple<match_stage_t, node_offset_t> evaluate_insert( + const key_view_t&, const laddr_t&, search_position_t&) const = 0; + + struct fresh_impl_t { + InternalNodeImplURef impl; + NodeExtentMutable mut; + std::pair<NodeImplURef, NodeExtentMutable> make_pair() { + return {std::move(impl), mut}; + } + }; + static alloc_ertr::future<fresh_impl_t> allocate(context_t, field_type_t, bool, level_t); + static InternalNodeImplURef load(NodeExtentRef, field_type_t, bool); + + protected: + InternalNodeImpl() = default; +}; + +/** + * LeafNodeImpl + * + * Hides type specific node layout implementations for LeafNode. 
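+ *
+ * [Editor's sketch] Concrete leaf layouts are only reachable through the
+ * static factories declared below; `c` (a context_t) and `extent` are
+ * hypothetical inputs:
+ *
+ *   LeafNodeImpl::allocate(c, field_type_t::N0, true /*is_level_tail*/
+ *   ).safe_then([](auto fresh) {
+ *     LeafNodeImplURef impl = std::move(fresh.impl);  // the new leaf impl
+ *     NodeExtentMutable mut = fresh.mut;              // mutable view of its extent
+ *   });
+ *   auto loaded = LeafNodeImpl::load(extent, field_type_t::N0,
+ *                                    false /*expect_is_level_tail*/);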
+ */ +class LeafNodeImpl : public NodeImpl { + public: + struct leaf_marker_t {}; + virtual ~LeafNodeImpl() = default; + + #pragma GCC diagnostic ignored "-Woverloaded-virtual" + virtual const onode_t* get_p_value( + const search_position_t&, + key_view_t* = nullptr, leaf_marker_t={}) const { + ceph_abort("impossible path"); + } + #pragma GCC diagnostic ignored "-Woverloaded-virtual" + virtual lookup_result_t<node_type_t::LEAF> lower_bound( + const key_hobj_t&, MatchHistory&, + key_view_t* = nullptr, leaf_marker_t = {}) const { + ceph_abort("impossible path"); + } + #pragma GCC diagnostic ignored "-Woverloaded-virtual" + virtual const onode_t* insert( + const key_hobj_t&, const onode_t&, search_position_t&, match_stage_t&, node_offset_t&) { + ceph_abort("impossible path"); + } + #pragma GCC diagnostic ignored "-Woverloaded-virtual" + virtual std::tuple<search_position_t, bool, const onode_t*> split_insert( + NodeExtentMutable&, NodeImpl&, const key_hobj_t&, const onode_t&, + search_position_t&, match_stage_t&, node_offset_t&) { + ceph_abort("impossible path"); + } + + virtual void get_largest_slot( + search_position_t&, key_view_t&, const onode_t**) const = 0; + virtual std::tuple<match_stage_t, node_offset_t> evaluate_insert( + const key_hobj_t&, const onode_t&, + const MatchHistory&, match_stat_t, search_position_t&) const = 0; + + struct fresh_impl_t { + LeafNodeImplURef impl; + NodeExtentMutable mut; + std::pair<NodeImplURef, NodeExtentMutable> make_pair() { + return {std::move(impl), mut}; + } + }; + static alloc_ertr::future<fresh_impl_t> allocate(context_t, field_type_t, bool); + static LeafNodeImplURef load(NodeExtentRef, field_type_t, bool); + + protected: + LeafNodeImpl() = default; +}; + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_layout.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_layout.h new file mode 100644 index 000000000..916d17424 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_layout.h @@ -0,0 +1,613 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <ostream> +#include <sstream> + +#include "common/likely.h" +#include "crimson/common/log.h" +#include "node_extent_accessor.h" +#include "node_impl.h" +#include "stages/node_stage_layout.h" + +namespace crimson::os::seastore::onode { + +template <node_type_t NODE_TYPE> struct insert_key_type; +template <> struct insert_key_type<node_type_t::INTERNAL> { + static constexpr auto type = KeyT::VIEW; }; +template <> struct insert_key_type<node_type_t::LEAF> { + static constexpr auto type = KeyT::HOBJ; }; + +template <node_type_t NODE_TYPE> struct node_impl_type; +template <> struct node_impl_type<node_type_t::INTERNAL> { + using type = InternalNodeImpl; }; +template <> struct node_impl_type<node_type_t::LEAF> { + using type = LeafNodeImpl; }; + +template <node_type_t NODE_TYPE> struct node_marker_type; +template <> struct node_marker_type<node_type_t::INTERNAL> { + using type = InternalNodeImpl::internal_marker_t; }; +template <> struct node_marker_type<node_type_t::LEAF> { + using type = LeafNodeImpl::leaf_marker_t; }; + +/** + * NodeLayoutT + * + * Contains templated and concrete implementations for both InternalNodeImpl + * and LeafNodeImpl under a specific node layout. 
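+ *
+ * [Editor's note] Each concrete layout is an instantiation of this template;
+ * e.g. the dispatch in node_impl.cc for a leaf with field_type_t::N0 resolves
+ * to the alias defined at the end of this header:
+ *
+ *   using LeafNode0 = NodeLayoutT<node_fields_0_t, node_type_t::LEAF>;
+ *   // LeafNodeImpl::allocate(c, field_type_t::N0, is_level_tail)
+ *   // forwards to LeafNode0::allocate(c, is_level_tail, 0)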
+ */ +template <typename FieldType, node_type_t NODE_TYPE> +class NodeLayoutT final : public InternalNodeImpl, public LeafNodeImpl { + public: + using URef = std::unique_ptr<NodeLayoutT>; + using extent_t = NodeExtentAccessorT<FieldType, NODE_TYPE>; + using parent_t = typename node_impl_type<NODE_TYPE>::type; + using marker_t = typename node_marker_type<NODE_TYPE>::type; + using node_stage_t = typename extent_t::node_stage_t; + using position_t = typename extent_t::position_t; + using value_t = typename extent_t::value_t; + static constexpr auto FIELD_TYPE = extent_t::FIELD_TYPE; + static constexpr auto KEY_TYPE = insert_key_type<NODE_TYPE>::type; + static constexpr auto STAGE = STAGE_T::STAGE; + + NodeLayoutT(const NodeLayoutT&) = delete; + NodeLayoutT(NodeLayoutT&&) = delete; + NodeLayoutT& operator=(const NodeLayoutT&) = delete; + NodeLayoutT& operator=(NodeLayoutT&&) = delete; + ~NodeLayoutT() override = default; + + static URef load(NodeExtentRef extent, bool expect_is_level_tail) { + std::unique_ptr<NodeLayoutT> ret(new NodeLayoutT(extent)); + assert(ret->is_level_tail() == expect_is_level_tail); + return ret; + } + + using alloc_ertr = NodeExtentManager::tm_ertr; + static alloc_ertr::future<typename parent_t::fresh_impl_t> allocate( + context_t c, bool is_level_tail, level_t level) { + // NOTE: Currently, all the node types have the same size for simplicity. + // But depending on the requirement, we may need to make node size + // configurable by field_type_t and node_type_t, or totally flexible. + return c.nm.alloc_extent(c.t, node_stage_t::EXTENT_SIZE + ).safe_then([is_level_tail, level](auto extent) { + assert(extent->is_initial_pending()); + auto mut = extent->get_mutable(); + node_stage_t::bootstrap_extent( + mut, FIELD_TYPE, NODE_TYPE, is_level_tail, level); + return typename parent_t::fresh_impl_t{ + std::unique_ptr<parent_t>(new NodeLayoutT(extent)), mut}; + }); + } + + protected: + /* + * NodeImpl + */ + field_type_t field_type() const override { return FIELD_TYPE; } + laddr_t laddr() const override { return extent.get_laddr(); } + void prepare_mutate(context_t c) override { return extent.prepare_mutate(c); } + bool is_level_tail() const override { return extent.read().is_level_tail(); } + bool is_empty() const override { return extent.read().keys() == 0; } + level_t level() const override { return extent.read().level(); } + node_offset_t free_size() const override { return extent.read().free_size(); } + + key_view_t get_key_view(const search_position_t& position) const override { + key_view_t ret; + STAGE_T::get_key_view(extent.read(), cast_down<STAGE>(position), ret); + return ret; + } + + key_view_t get_largest_key_view() const override { + key_view_t index_key; + STAGE_T::template lookup_largest_slot<false, true, false>( + extent.read(), nullptr, &index_key, nullptr); + return index_key; + } + + void next_position(search_position_t& pos) const override { + assert(!pos.is_end()); + bool find_next = STAGE_T::next_position(extent.read(), cast_down<STAGE>(pos)); + if (find_next) { + pos = search_position_t::end(); + } + } + + node_stats_t get_stats() const override { + node_stats_t stats; + auto& node_stage = extent.read(); + key_view_t index_key; + if (node_stage.keys()) { + STAGE_T::get_stats(node_stage, stats, index_key); + } + stats.size_persistent = node_stage_t::EXTENT_SIZE; + stats.size_filled = filled_size(); + if constexpr (NODE_TYPE == node_type_t::INTERNAL) { + if (is_level_tail()) { + stats.size_logical += sizeof(value_t); + stats.size_value += sizeof(value_t); 
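+        // the tail child address of a level-tail internal node has no key,
+        // so only its value size is accounted; still count it as one kv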
+ stats.num_kvs += 1; + } + } + return stats; + } + + std::ostream& dump(std::ostream& os) const override { + auto& node_stage = extent.read(); + auto p_start = node_stage.p_start(); + dump_brief(os); + auto stats = get_stats(); + os << " num_kvs=" << stats.num_kvs + << ", logical=" << stats.size_logical + << "B, overhead=" << stats.size_overhead + << "B, value=" << stats.size_value << "B"; + os << ":\n header: " << node_stage_t::header_size() << "B"; + size_t size = 0u; + if (node_stage.keys()) { + STAGE_T::dump(node_stage, os, " ", size, p_start); + } else { + size += node_stage_t::header_size(); + if (NODE_TYPE == node_type_t::LEAF || !node_stage.is_level_tail()) { + os << " empty!"; + } + } + if constexpr (NODE_TYPE == node_type_t::INTERNAL) { + if (node_stage.is_level_tail()) { + size += sizeof(laddr_t); + auto value_ptr = node_stage.get_end_p_laddr(); + int offset = reinterpret_cast<const char*>(value_ptr) - p_start; + os << "\n tail value: 0x" + << std::hex << value_ptr->value << std::dec + << " " << size << "B" + << " @" << offset << "B"; + } + } + assert(size == filled_size()); + return os; + } + + std::ostream& dump_brief(std::ostream& os) const override { + auto& node_stage = extent.read(); + os << "Node" << NODE_TYPE << FIELD_TYPE + << "@0x" << std::hex << extent.get_laddr() + << "+" << node_stage_t::EXTENT_SIZE << std::dec + << (node_stage.is_level_tail() ? "$" : "") + << "(level=" << (unsigned)node_stage.level() + << ", filled=" << filled_size() << "B" + << ", free=" << node_stage.free_size() << "B" + << ")"; + return os; + } + + void validate_layout() const override { +#ifndef NDEBUG + STAGE_T::validate(extent.read()); +#endif + } + + void test_copy_to(NodeExtentMutable& to) const override { + extent.test_copy_to(to); + } + + void test_set_tail(NodeExtentMutable& mut) override { + node_stage_t::update_is_level_tail(mut, extent.read(), true); + } + + /* + * Common + */ + const value_t* get_p_value(const search_position_t& position, + key_view_t* index_key=nullptr, marker_t={}) const override { + auto& node_stage = extent.read(); + if constexpr (NODE_TYPE == node_type_t::INTERNAL) { + assert(!index_key); + if (position.is_end()) { + assert(is_level_tail()); + return node_stage.get_end_p_laddr(); + } + } else { + assert(!position.is_end()); + } + if (index_key) { + return STAGE_T::template get_p_value<true>( + node_stage, cast_down<STAGE>(position), index_key); + } else { + return STAGE_T::get_p_value(node_stage, cast_down<STAGE>(position)); + } + } + + lookup_result_t<NODE_TYPE> lower_bound( + const key_hobj_t& key, MatchHistory& history, + key_view_t* index_key=nullptr, marker_t={}) const override { + auto& node_stage = extent.read(); + if constexpr (NODE_TYPE == node_type_t::LEAF) { + if (unlikely(node_stage.keys() == 0)) { + history.set<STAGE_LEFT>(MatchKindCMP::LT); + return lookup_result_t<NODE_TYPE>::end(); + } + } + + typename STAGE_T::result_t result_raw; + if (index_key) { + result_raw = STAGE_T::template lower_bound<true>( + node_stage, key, history, index_key); +#ifndef NDEBUG + if (!result_raw.is_end()) { + full_key_t<KeyT::VIEW> index; + STAGE_T::get_key_view(node_stage, result_raw.position, index); + assert(index == *index_key); + } +#endif + } else { + result_raw = STAGE_T::lower_bound(node_stage, key, history); + } +#ifndef NDEBUG + if (result_raw.is_end()) { + assert(result_raw.mstat == MSTAT_END); + } else { + full_key_t<KeyT::VIEW> index; + STAGE_T::get_key_view(node_stage, result_raw.position, index); + assert_mstat(key, index, result_raw.mstat); + } 
+#endif + + // calculate MSTAT_LT3 + if constexpr (FIELD_TYPE == field_type_t::N0) { + // currently only internal node checks mstat + if constexpr (NODE_TYPE == node_type_t::INTERNAL) { + if (result_raw.mstat == MSTAT_LT2) { + auto cmp = compare_to<KeyT::HOBJ>( + key, node_stage[result_raw.position.index].shard_pool); + assert(cmp != MatchKindCMP::GT); + if (cmp != MatchKindCMP::EQ) { + result_raw.mstat = MSTAT_LT3; + } + } + } + } + + auto result = normalize(std::move(result_raw)); + if (result.is_end()) { + assert(node_stage.is_level_tail()); + assert(result.p_value == nullptr); + if constexpr (NODE_TYPE == node_type_t::INTERNAL) { + result.p_value = node_stage.get_end_p_laddr(); + } + } else { + assert(result.p_value != nullptr); + } + return result; + } + + const value_t* insert( + const full_key_t<KEY_TYPE>& key, const value_t& value, + search_position_t& insert_pos, match_stage_t& insert_stage, + node_offset_t& insert_size) override { + logger().debug("OTree::Layout::Insert: begin at " + "insert_pos({}), insert_stage={}, insert_size={}B ...", + insert_pos, insert_stage, insert_size); + if (unlikely(logger().is_enabled(seastar::log_level::trace))) { + std::ostringstream sos; + dump(sos); + logger().trace("OTree::Layout::Insert: -- dump\n{}", sos.str()); + } + auto ret = extent.template insert_replayable<KEY_TYPE>( + key, value, cast_down<STAGE>(insert_pos), insert_stage, insert_size); + logger().debug("OTree::Layout::Insert: done at " + "insert_pos({}), insert_stage={}, insert_size={}B", + insert_pos, insert_stage, insert_size); + if (unlikely(logger().is_enabled(seastar::log_level::trace))) { + std::ostringstream sos; + dump(sos); + logger().trace("OTree::Layout::Insert: -- dump\n{}", sos.str()); + } + validate_layout(); + assert(get_key_view(insert_pos) == key); + return ret; + } + + std::tuple<search_position_t, bool, const value_t*> split_insert( + NodeExtentMutable& right_mut, NodeImpl& right_impl, + const full_key_t<KEY_TYPE>& key, const value_t& value, + search_position_t& _insert_pos, match_stage_t& insert_stage, + node_offset_t& insert_size) override { + logger().info("OTree::Layout::Split: begin at " + "insert_pos({}), insert_stage={}, insert_size={}B, " + "{:#x}=>{:#x} ...", + _insert_pos, insert_stage, insert_size, + laddr(), right_impl.laddr()); + if (unlikely(logger().is_enabled(seastar::log_level::debug))) { + std::ostringstream sos; + dump(sos); + logger().debug("OTree::Layout::Split: -- dump\n{}", sos.str()); + } +#ifdef UNIT_TESTS_BUILT + auto insert_stage_pre = insert_stage; +#endif + + auto& insert_pos = cast_down<STAGE>(_insert_pos); + auto& node_stage = extent.read(); + typename STAGE_T::StagedIterator split_at; + bool is_insert_left; + size_t split_size; + size_t target_split_size; + { + size_t empty_size = node_stage.size_before(0); + size_t filled_kv_size = filled_size() - empty_size; + /** NODE_BLOCK_SIZE considerations + * + * Generally, + * target_split_size = (filled_size + insert_size) / 2 + * We can have two locate_split() strategies: + * A. the simpler one is to locate the largest split position where + * the estimated left_node_size <= target_split_size; + * B. the fair one takes a further step to calculate the next slot of + * P KiB, and if left_node_size + P/2 < target_split_size, compensate + * the split position to include the next slot; (TODO) + * + * Say that the node_block_size = N KiB, the largest allowed + * insert_size = 1/I * N KiB (I > 1). 
We want to identify the minimal 'I'
 + * that won't lead to "double split" effect, meaning after a split,
 + * the right node size is still larger than N KiB and needs to split
 + * again. I think "double split" makes split much more complicated and
 + * we can no longer identify whether the node is safe under concurrent
 + * operations.
 + *
 + * We need to evaluate the worst case in order to identify 'I'. This means:
 + * - filled_size ~= N KiB
 + * - insert_size == N/I KiB
 + * - target_split_size ~= (I+1)/2I * N KiB
 + * To simplify the below calculations, node_block_size is normalized to 1.
 + *
 + * With strategy A, the worst case is when left_node_size cannot include
 + * the next slot that will just overflow the target_split_size:
 + * - left_node_size + 1/I ~= (I+1)/2I
 + * - left_node_size ~= (I-1)/2I
 + * - right_node_size ~= 1 + 1/I - left_node_size ~= (I+3)/2I
 + * The right_node_size cannot be larger than the node_block_size in the
 + * worst case, which means (I+3)/2I < 1, so I > 3, meaning the largest
 + * possible insert_size must be smaller than 1/3 of the node_block_size.
 + *
 + * With strategy B, the worst case is when left_node_size cannot include
 + * the next slot that will just overflow the threshold
 + * target_split_size - 1/2I, thus:
 + * - left_node_size ~= (I+1)/2I - 1/2I ~= 1/2
 + * - right_node_size ~= 1 + 1/I - 1/2 ~= (I+2)/2I < node_block_size(1)
 + * - I > 2
 + * This means the largest possible insert_size must be smaller than 1/2 of
 + * the node_block_size, which is better than strategy A.
 + *
 + * In order to avoid "double split", there is another side-effect we need
 + * to take into consideration: if split happens with snap-gen indexes, the
 + * corresponding ns-oid string needs to be copied to the right node. That is
 + * to say: right_node_size + string_size < node_block_size.
 + *
 + * Say that the largest allowed string size is 1/S of the largest allowed
 + * insert_size N/I KiB. If we go with strategy B, the equation should be
 + * changed to:
 + * - right_node_size ~= (I+2)/2I + 1/(I*S) < 1
 + * - I > 2 + 2/S (S > 1)
 + *
 + * Now back to NODE_BLOCK_SIZE calculation, if we have limits of at most
 + * X KiB ns-oid string and Y KiB of onode_t to store in this BTree, then:
 + * - largest_insert_size ~= X+Y KiB
 + * - 1/S == X/(X+Y)
 + * - I > (4X+2Y)/(X+Y)
 + * - node_block_size(N) == I * insert_size > 4X+2Y KiB
 + *
 + * In conclusion,
 + * (TODO) the current node block size (4 KiB) is too small to
 + * store an entire 2 KiB ns-oid string. We need to consider a larger
 + * node_block_size.
 + *
 + * We are setting X = Y = 640 B in order not to break the current
 + * implementations with a 4 KiB node.
 + *
 + * (TODO) Implement smarter logic to check when "double split" happens. 
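+ *
+ * [Editor's worked example] Plugging the X = Y = 640 B limits above into the
+ * strategy-B bound against the current 4 KiB node (illustrative only; no new
+ * constraint is introduced here):
+ * - I > (4*640 + 2*640) / (640 + 640) = 3840 / 1280 = 3
+ * - node_block_size > 4X + 2Y = 3840 B, which the 4096 B node satisfies
+ * - largest_insert_size ~= X + Y = 1280 B < 4096/3 ~= 1365 B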
+ */ + target_split_size = empty_size + (filled_kv_size + insert_size) / 2; + assert(insert_size < (node_stage.total_size() - empty_size) / 2); + + std::optional<bool> _is_insert_left; + split_at.set(node_stage); + split_size = 0; + bool locate_nxt = STAGE_T::recursively_locate_split_inserted( + split_size, 0, target_split_size, insert_pos, + insert_stage, insert_size, _is_insert_left, split_at); + is_insert_left = *_is_insert_left; + logger().debug("OTree::Layout::Split: -- located " + "split_at({}), insert_pos({}), is_insert_left={}, " + "split_size={}B(target={}B, current={}B)", + split_at, insert_pos, is_insert_left, + split_size, target_split_size, filled_size()); + // split_size can be larger than target_split_size in strategy B + // assert(split_size <= target_split_size); + if (locate_nxt) { + assert(insert_stage == STAGE); + assert(split_at.get().is_last()); + split_at.set_end(); + assert(insert_pos.index == split_at.index()); + } + } + + auto append_at = split_at; + // TODO(cross-node string dedup) + typename STAGE_T::template StagedAppender<KEY_TYPE> right_appender; + right_appender.init(&right_mut, right_mut.get_write()); + const value_t* p_value = nullptr; + if (!is_insert_left) { + // right node: append [start(append_at), insert_pos) + STAGE_T::template append_until<KEY_TYPE>( + append_at, right_appender, insert_pos, insert_stage); + logger().debug("OTree::Layout::Split: -- right appended until " + "insert_pos({}), insert_stage={}, insert/append the rest ...", + insert_pos, insert_stage); + // right node: append [insert_pos(key, value)] + bool is_front_insert = (insert_pos == position_t::begin()); + [[maybe_unused]] bool is_end = STAGE_T::template append_insert<KEY_TYPE>( + key, value, append_at, right_appender, + is_front_insert, insert_stage, p_value); + assert(append_at.is_end() == is_end); + } else { + logger().debug("OTree::Layout::Split: -- right appending ..."); + } + + // right node: append (insert_pos, end) + auto pos_end = position_t::end(); + STAGE_T::template append_until<KEY_TYPE>( + append_at, right_appender, pos_end, STAGE); + assert(append_at.is_end()); + right_appender.wrap(); + if (unlikely(logger().is_enabled(seastar::log_level::debug))) { + std::ostringstream sos; + right_impl.dump(sos); + logger().debug("OTree::Layout::Split: -- right node dump\n{}", sos.str()); + } + right_impl.validate_layout(); + + // mutate left node + if (is_insert_left) { + logger().debug("OTree::Layout::Split: -- left trim/insert at " + "insert_pos({}), insert_stage={} ...", + insert_pos, insert_stage); + p_value = extent.template split_insert_replayable<KEY_TYPE>( + split_at, key, value, insert_pos, insert_stage, insert_size); + assert(get_key_view(_insert_pos) == key); + } else { + logger().debug("OTree::Layout::Split: -- left trim ..."); + assert(right_impl.get_key_view(_insert_pos) == key); + extent.split_replayable(split_at); + } + if (unlikely(logger().is_enabled(seastar::log_level::debug))) { + std::ostringstream sos; + dump(sos); + logger().debug("OTree::Layout::Split: -- left node dump\n{}", sos.str()); + } + validate_layout(); + assert(p_value); + + auto split_pos = normalize(split_at.get_pos()); + logger().info("OTree::Layout::Split: done at " + "insert_pos({}), insert_stage={}, insert_size={}B, split_at({}), " + "is_insert_left={}, split_size={}B(target={}B)", + _insert_pos, insert_stage, insert_size, split_pos, + is_insert_left, split_size, target_split_size); + assert(split_size == filled_size()); + +#ifdef UNIT_TESTS_BUILT + InsertType insert_type; + 
search_position_t last_pos; + if (is_insert_left) { + STAGE_T::template lookup_largest_slot<true, false, false>( + extent.read(), &cast_down_fill_0<STAGE>(last_pos), nullptr, nullptr); + } else { + node_stage_t right_stage{reinterpret_cast<FieldType*>(right_mut.get_write())}; + STAGE_T::template lookup_largest_slot<true, false, false>( + right_stage, &cast_down_fill_0<STAGE>(last_pos), nullptr, nullptr); + } + if (_insert_pos == search_position_t::begin()) { + insert_type = InsertType::BEGIN; + } else if (_insert_pos == last_pos) { + insert_type = InsertType::LAST; + } else { + insert_type = InsertType::MID; + } + last_split = {split_pos, insert_stage_pre, is_insert_left, insert_type}; +#endif + return {split_pos, is_insert_left, p_value}; + } + + /* + * InternalNodeImpl + */ + void replace_child_addr( + const search_position_t& pos, laddr_t dst, laddr_t src) override { + if constexpr (NODE_TYPE == node_type_t::INTERNAL) { + const laddr_packed_t* p_value = get_p_value(pos); + assert(p_value->value == src); + extent.update_child_addr_replayable(dst, const_cast<laddr_packed_t*>(p_value)); + } else { + ceph_abort("impossible path"); + } + } + + std::tuple<match_stage_t, node_offset_t> evaluate_insert( + const key_view_t& key, const laddr_t& value, + search_position_t& insert_pos) const override { + if constexpr (NODE_TYPE == node_type_t::INTERNAL) { + auto packed_value = laddr_packed_t{value}; + auto& node_stage = extent.read(); + match_stage_t insert_stage; + node_offset_t insert_size; + if (unlikely(!node_stage.keys())) { + assert(insert_pos.is_end()); + insert_stage = STAGE; + insert_size = STAGE_T::template insert_size<KeyT::VIEW>(key, packed_value); + } else { + std::tie(insert_stage, insert_size) = STAGE_T::evaluate_insert( + node_stage, key, packed_value, cast_down<STAGE>(insert_pos), false); + } + return {insert_stage, insert_size}; + } else { + ceph_abort("impossible path"); + } + } + + /* + * LeafNodeImpl + */ + void get_largest_slot(search_position_t& pos, + key_view_t& index_key, const onode_t** pp_value) const override { + if constexpr (NODE_TYPE == node_type_t::LEAF) { + STAGE_T::template lookup_largest_slot<true, true, true>( + extent.read(), &cast_down_fill_0<STAGE>(pos), &index_key, pp_value); + } else { + ceph_abort("impossible path"); + } + } + + std::tuple<match_stage_t, node_offset_t> evaluate_insert( + const key_hobj_t& key, const onode_t& value, + const MatchHistory& history, match_stat_t mstat, + search_position_t& insert_pos) const override { + if constexpr (NODE_TYPE == node_type_t::LEAF) { + if (unlikely(is_empty())) { + assert(insert_pos.is_end()); + return {STAGE, STAGE_T::template insert_size<KeyT::HOBJ>(key, value)}; + } else { + return STAGE_T::evaluate_insert( + key, value, history, mstat, cast_down<STAGE>(insert_pos)); + } + } else { + ceph_abort("impossible path"); + } + } + + private: + NodeLayoutT(NodeExtentRef extent) : extent{extent} {} + + node_offset_t filled_size() const { + auto& node_stage = extent.read(); + auto ret = node_stage.size_before(node_stage.keys()); + assert(ret == node_stage.total_size() - node_stage.free_size()); + return ret; + } + + static seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_filestore); + } + + extent_t extent; +}; + +using InternalNode0 = NodeLayoutT<node_fields_0_t, node_type_t::INTERNAL>; +using InternalNode1 = NodeLayoutT<node_fields_1_t, node_type_t::INTERNAL>; +using InternalNode2 = NodeLayoutT<node_fields_2_t, node_type_t::INTERNAL>; +using InternalNode3 = NodeLayoutT<internal_fields_3_t, 
node_type_t::INTERNAL>; +using LeafNode0 = NodeLayoutT<node_fields_0_t, node_type_t::LEAF>; +using LeafNode1 = NodeLayoutT<node_fields_1_t, node_type_t::LEAF>; +using LeafNode2 = NodeLayoutT<node_fields_2_t, node_type_t::LEAF>; +using LeafNode3 = NodeLayoutT<leaf_fields_3_t, node_type_t::LEAF>; + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_layout_replayable.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_layout_replayable.h new file mode 100644 index 000000000..c1499d609 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_layout_replayable.h @@ -0,0 +1,75 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "node_extent_mutable.h" +#include "stages/node_stage.h" +#include "stages/stage.h" + +#define STAGE_T node_to_stage_t<node_stage_t> + +namespace crimson::os::seastore::onode { + +/** + * NodeLayoutReplayableT + * + * Contains templated logics to modify the layout of a NodeExtend which are + * also replayable. Used by NodeExtentAccessorT at runtime and by + * DeltaRecorderT during replay. + */ +template <typename FieldType, node_type_t NODE_TYPE> +struct NodeLayoutReplayableT { + using node_stage_t = node_extent_t<FieldType, NODE_TYPE>; + using position_t = typename STAGE_T::position_t; + using StagedIterator = typename STAGE_T::StagedIterator; + using value_t = value_type_t<NODE_TYPE>; + static constexpr auto FIELD_TYPE = FieldType::FIELD_TYPE; + + template <KeyT KT> + static const value_t* insert( + NodeExtentMutable& mut, + const node_stage_t& node_stage, + const full_key_t<KT>& key, + const value_t& value, + position_t& insert_pos, + match_stage_t& insert_stage, + node_offset_t& insert_size) { + auto p_value = STAGE_T::template proceed_insert<KT, false>( + mut, node_stage, key, value, insert_pos, insert_stage, insert_size); + return p_value; + } + + static void split( + NodeExtentMutable& mut, + const node_stage_t& node_stage, + StagedIterator& split_at) { + node_stage_t::update_is_level_tail(mut, node_stage, false); + STAGE_T::trim(mut, split_at); + } + + template <KeyT KT> + static const value_t* split_insert( + NodeExtentMutable& mut, + const node_stage_t& node_stage, + StagedIterator& split_at, + const full_key_t<KT>& key, + const value_t& value, + position_t& insert_pos, + match_stage_t& insert_stage, + node_offset_t& insert_size) { + node_stage_t::update_is_level_tail(mut, node_stage, false); + STAGE_T::trim(mut, split_at); + auto p_value = STAGE_T::template proceed_insert<KT, true>( + mut, node_stage, key, value, insert_pos, insert_stage, insert_size); + return p_value; + } + + static void update_child_addr( + NodeExtentMutable& mut, const laddr_t new_addr, laddr_packed_t* p_addr) { + assert(NODE_TYPE == node_type_t::INTERNAL); + mut.copy_in_absolute(p_addr, new_addr); + } +}; + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_types.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_types.h new file mode 100644 index 000000000..6774544c7 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_types.h @@ -0,0 +1,64 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <cassert> +#include <ostream> + +#include "fwd.h" + +namespace crimson::os::seastore::onode { + +constexpr uint8_t FIELD_TYPE_MAGIC = 0x25; +enum class field_type_t : uint8_t { + N0 = FIELD_TYPE_MAGIC, + N1, + N2, + N3, + _MAX +}; +inline 
uint8_t to_unsigned(field_type_t type) { + auto value = static_cast<uint8_t>(type); + assert(value >= FIELD_TYPE_MAGIC); + assert(value < static_cast<uint8_t>(field_type_t::_MAX)); + return value - FIELD_TYPE_MAGIC; +} +inline std::ostream& operator<<(std::ostream &os, field_type_t type) { + const char* const names[] = {"0", "1", "2", "3"}; + auto index = to_unsigned(type); + os << names[index]; + return os; +} + +enum class node_type_t : uint8_t { + LEAF = 0, + INTERNAL +}; +inline std::ostream& operator<<(std::ostream &os, const node_type_t& type) { + const char* const names[] = {"L", "I"}; + auto index = static_cast<uint8_t>(type); + assert(index <= 1u); + os << names[index]; + return os; +} + +struct laddr_packed_t { + laddr_t value; +} __attribute__((packed)); +inline std::ostream& operator<<(std::ostream& os, const laddr_packed_t& laddr) { + return os << "laddr_packed(0x" << std::hex << laddr.value << std::dec << ")"; +} + +using match_stat_t = int8_t; +constexpr match_stat_t MSTAT_END = -2; // index is search_position_t::end() +constexpr match_stat_t MSTAT_EQ = -1; // key == index +constexpr match_stat_t MSTAT_LT0 = 0; // key == index [pool/shard crush ns/oid]; key < index [snap/gen] +constexpr match_stat_t MSTAT_LT1 = 1; // key == index [pool/shard crush]; key < index [ns/oid] +constexpr match_stat_t MSTAT_LT2 = 2; // key < index [pool/shard crush ns/oid] || + // key == index [pool/shard]; key < index [crush] +constexpr match_stat_t MSTAT_LT3 = 3; // key < index [pool/shard] +constexpr match_stat_t MSTAT_MIN = MSTAT_END; +constexpr match_stat_t MSTAT_MAX = MSTAT_LT3; + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/item_iterator_stage.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/item_iterator_stage.cc new file mode 100644 index 000000000..443c6cabd --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/item_iterator_stage.cc @@ -0,0 +1,165 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#include "item_iterator_stage.h" + +#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.h" + +namespace crimson::os::seastore::onode { + +#define ITER_T item_iterator_t<NODE_TYPE> +#define ITER_INST(NT) item_iterator_t<NT> + +template <node_type_t NODE_TYPE> +template <KeyT KT> +memory_range_t ITER_T::insert_prefix( + NodeExtentMutable& mut, const ITER_T& iter, const full_key_t<KT>& key, + bool is_end, node_offset_t size, const char* p_left_bound) { + // 1. insert range + char* p_insert; + if (is_end) { + assert(!iter.has_next()); + p_insert = const_cast<char*>(iter.p_start()); + } else { + p_insert = const_cast<char*>(iter.p_end()); + } + char* p_insert_front = p_insert - size; + + // 2. shift memory + const char* p_shift_start = p_left_bound; + const char* p_shift_end = p_insert; + mut.shift_absolute(p_shift_start, + p_shift_end - p_shift_start, + -(int)size); + + // 3. 
append header + p_insert -= sizeof(node_offset_t); + node_offset_t back_offset = (p_insert - p_insert_front); + mut.copy_in_absolute(p_insert, back_offset); + ns_oid_view_t::append<KT>(mut, key, p_insert); + + return {p_insert_front, p_insert}; +} +#define IP_TEMPLATE(NT, KT) \ + template memory_range_t ITER_INST(NT)::insert_prefix<KT>( \ + NodeExtentMutable&, const ITER_INST(NT)&, const full_key_t<KT>&, \ + bool, node_offset_t, const char*) +IP_TEMPLATE(node_type_t::LEAF, KeyT::VIEW); +IP_TEMPLATE(node_type_t::INTERNAL, KeyT::VIEW); +IP_TEMPLATE(node_type_t::LEAF, KeyT::HOBJ); +IP_TEMPLATE(node_type_t::INTERNAL, KeyT::HOBJ); + +template <node_type_t NODE_TYPE> +void ITER_T::update_size( + NodeExtentMutable& mut, const ITER_T& iter, int change) { + node_offset_t offset = iter.get_back_offset(); + int new_size = change + offset; + assert(new_size > 0 && new_size < NODE_BLOCK_SIZE); + mut.copy_in_absolute( + (void*)iter.get_item_range().p_end, node_offset_t(new_size)); +} + +template <node_type_t NODE_TYPE> +node_offset_t ITER_T::trim_until(NodeExtentMutable&, const ITER_T& iter) { + assert(iter.index() != 0); + size_t ret = iter.p_end() - iter.p_items_start; + assert(ret < NODE_BLOCK_SIZE); + return ret; +} + +template <node_type_t NODE_TYPE> +node_offset_t ITER_T::trim_at( + NodeExtentMutable& mut, const ITER_T& iter, node_offset_t trimmed) { + size_t trim_size = iter.p_start() - iter.p_items_start + trimmed; + assert(trim_size < NODE_BLOCK_SIZE); + assert(iter.get_back_offset() > trimmed); + node_offset_t new_offset = iter.get_back_offset() - trimmed; + mut.copy_in_absolute((void*)iter.item_range.p_end, new_offset); + return trim_size; +} + +#define ITER_TEMPLATE(NT) template class ITER_INST(NT) +ITER_TEMPLATE(node_type_t::LEAF); +ITER_TEMPLATE(node_type_t::INTERNAL); + +#define APPEND_T ITER_T::Appender<KT> + +template <node_type_t NODE_TYPE> +template <KeyT KT> +bool APPEND_T::append(const ITER_T& src, index_t& items) { + auto p_end = src.p_end(); + bool append_till_end = false; + if (is_valid_index(items)) { + for (auto i = 1u; i <= items; ++i) { + if (!src.has_next()) { + assert(i == items); + append_till_end = true; + break; + } + ++src; + } + } else { + if (items == INDEX_END) { + append_till_end = true; + } else { + assert(items == INDEX_LAST); + } + items = 0; + while (src.has_next()) { + ++src; + ++items; + } + if (append_till_end) { + ++items; + } + } + + const char* p_start; + if (append_till_end) { + p_start = src.p_start(); + } else { + p_start = src.p_end(); + } + assert(p_end >= p_start); + size_t append_size = p_end - p_start; + p_append -= append_size; + p_mut->copy_in_absolute(p_append, p_start, append_size); + return append_till_end; +} + +template <node_type_t NODE_TYPE> +template <KeyT KT> +std::tuple<NodeExtentMutable*, char*> +APPEND_T::open_nxt(const key_get_type& partial_key) { + p_append -= sizeof(node_offset_t); + p_offset_while_open = p_append; + ns_oid_view_t::append(*p_mut, partial_key, p_append); + return {p_mut, p_append}; +} + +template <node_type_t NODE_TYPE> +template <KeyT KT> +std::tuple<NodeExtentMutable*, char*> +APPEND_T::open_nxt(const full_key_t<KT>& key) { + p_append -= sizeof(node_offset_t); + p_offset_while_open = p_append; + ns_oid_view_t::append<KT>(*p_mut, key, p_append); + return {p_mut, p_append}; +} + +template <node_type_t NODE_TYPE> +template <KeyT KT> +void APPEND_T::wrap_nxt(char* _p_append) { + assert(_p_append < p_append); + p_mut->copy_in_absolute( + p_offset_while_open, node_offset_t(p_offset_while_open - _p_append)); + p_append = 
_p_append; +} + +#define APPEND_TEMPLATE(NT, KT) template class ITER_INST(NT)::Appender<KT> +APPEND_TEMPLATE(node_type_t::LEAF, KeyT::VIEW); +APPEND_TEMPLATE(node_type_t::INTERNAL, KeyT::VIEW); +APPEND_TEMPLATE(node_type_t::LEAF, KeyT::HOBJ); +APPEND_TEMPLATE(node_type_t::INTERNAL, KeyT::HOBJ); + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/item_iterator_stage.h b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/item_iterator_stage.h new file mode 100644 index 000000000..bb68eec8f --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/item_iterator_stage.h @@ -0,0 +1,180 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "crimson/os/seastore/onode_manager/staged-fltree/node_types.h" +#include "key_layout.h" +#include "stage_types.h" + +namespace crimson::os::seastore::onode { + +class NodeExtentMutable; + +/** + * item_iterator_t + * + * The STAGE_STRING implementation for node N0/N1, implements staged contract + * as an iterative container to resolve crush hash conflicts. + * + * The layout of the contaner to index ns, oid strings storing n items: + * + * # <--------- container range ---------> # + * #<~># items [i+1, n) # + * # # items [0, i) #<~># + * # # <------ item i -------------> # # + * # # <--- item_range ---> | # # + * # # | # # + * # # next-stage | ns-oid | back_ # # + * # # contaner | strings | offset # # + * #...# range | | #...# + * ^ ^ | ^ + * | | | | + * | +---------------------------+ | + * + p_items_start p_items_end + + */ +template <node_type_t NODE_TYPE> +class item_iterator_t { + using value_t = value_type_t<NODE_TYPE>; + public: + item_iterator_t(const memory_range_t& range) + : p_items_start(range.p_start), p_items_end(range.p_end) { + assert(p_items_start < p_items_end); + next_item_range(p_items_end); + } + + const char* p_start() const { return item_range.p_start; } + const char* p_end() const { return item_range.p_end + sizeof(node_offset_t); } + const memory_range_t& get_item_range() const { return item_range; } + node_offset_t get_back_offset() const { return back_offset; } + + // container type system + using key_get_type = const ns_oid_view_t&; + static constexpr auto CONTAINER_TYPE = ContainerType::ITERATIVE; + index_t index() const { return _index; } + key_get_type get_key() const { + if (!key.has_value()) { + key = ns_oid_view_t(item_range.p_end); + assert(item_range.p_start < (*key).p_start()); + } + return *key; + } + node_offset_t size() const { + size_t ret = item_range.p_end - item_range.p_start + sizeof(node_offset_t); + assert(ret < NODE_BLOCK_SIZE); + return ret; + }; + node_offset_t size_to_nxt() const { + size_t ret = get_key().size() + sizeof(node_offset_t); + assert(ret < NODE_BLOCK_SIZE); + return ret; + } + node_offset_t size_overhead() const { + return sizeof(node_offset_t) + get_key().size_overhead(); + } + memory_range_t get_nxt_container() const { + return {item_range.p_start, get_key().p_start()}; + } + bool has_next() const { + assert(p_items_start <= item_range.p_start); + return p_items_start < item_range.p_start; + } + const item_iterator_t<NODE_TYPE>& operator++() const { + assert(has_next()); + next_item_range(item_range.p_start); + key.reset(); + ++_index; + return *this; + } + void encode(const char* p_node_start, ceph::bufferlist& encoded) const { + int start_offset = p_items_start - p_node_start; + int end_offset = p_items_end - p_node_start; + assert(start_offset > 0 && 
start_offset < NODE_BLOCK_SIZE); + assert(end_offset > 0 && end_offset <= NODE_BLOCK_SIZE); + ceph::encode(static_cast<node_offset_t>(start_offset), encoded); + ceph::encode(static_cast<node_offset_t>(end_offset), encoded); + ceph::encode(_index, encoded); + } + + static item_iterator_t decode(const char* p_node_start, + ceph::bufferlist::const_iterator& delta) { + node_offset_t start_offset; + ceph::decode(start_offset, delta); + node_offset_t end_offset; + ceph::decode(end_offset, delta); + assert(start_offset < end_offset); + assert(end_offset <= NODE_BLOCK_SIZE); + index_t index; + ceph::decode(index, delta); + + item_iterator_t ret({p_node_start + start_offset, + p_node_start + end_offset}); + while (index > 0) { + ++ret; + --index; + } + return ret; + } + + static node_offset_t header_size() { return 0u; } + + template <KeyT KT> + static node_offset_t estimate_insert( + const full_key_t<KT>& key, const value_t&) { + return ns_oid_view_t::estimate_size<KT>(key) + sizeof(node_offset_t); + } + + template <KeyT KT> + static memory_range_t insert_prefix( + NodeExtentMutable& mut, const item_iterator_t<NODE_TYPE>& iter, + const full_key_t<KT>& key, bool is_end, + node_offset_t size, const char* p_left_bound); + + static void update_size( + NodeExtentMutable& mut, const item_iterator_t<NODE_TYPE>& iter, int change); + + static node_offset_t trim_until(NodeExtentMutable&, const item_iterator_t<NODE_TYPE>&); + static node_offset_t trim_at( + NodeExtentMutable&, const item_iterator_t<NODE_TYPE>&, node_offset_t trimmed); + + template <KeyT KT> + class Appender; + + private: + void next_item_range(const char* p_end) const { + auto p_item_end = p_end - sizeof(node_offset_t); + assert(p_items_start < p_item_end); + back_offset = reinterpret_cast<const node_offset_packed_t*>(p_item_end)->value; + assert(back_offset); + const char* p_item_start = p_item_end - back_offset; + assert(p_items_start <= p_item_start); + item_range = {p_item_start, p_item_end}; + } + + const char* p_items_start; + const char* p_items_end; + mutable memory_range_t item_range; + mutable node_offset_t back_offset; + mutable std::optional<ns_oid_view_t> key; + mutable index_t _index = 0u; +}; + +template <node_type_t NODE_TYPE> +template <KeyT KT> +class item_iterator_t<NODE_TYPE>::Appender { + public: + Appender(NodeExtentMutable* p_mut, char* p_append) + : p_mut{p_mut}, p_append{p_append} {} + bool append(const item_iterator_t<NODE_TYPE>& src, index_t& items); + char* wrap() { return p_append; } + std::tuple<NodeExtentMutable*, char*> open_nxt(const key_get_type&); + std::tuple<NodeExtentMutable*, char*> open_nxt(const full_key_t<KT>&); + void wrap_nxt(char* _p_append); + + private: + NodeExtentMutable* p_mut; + char* p_append; + char* p_offset_while_open; +}; + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/key_layout.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/key_layout.cc new file mode 100644 index 000000000..d60bb8d09 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/key_layout.cc @@ -0,0 +1,32 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#include "key_layout.h" + +#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.h" + +namespace crimson::os::seastore::onode { + +void string_key_view_t::append_str( + NodeExtentMutable& mut, std::string_view str, char*& p_append) { + assert(is_valid_size(str.length())); + p_append -= sizeof(string_size_t); + string_size_t 
len = str.length(); + mut.copy_in_absolute(p_append, len); + p_append -= len; + mut.copy_in_absolute(p_append, str.data(), len); +} + +void string_key_view_t::append_dedup( + NodeExtentMutable& mut, const Type& dedup_type, char*& p_append) { + p_append -= sizeof(string_size_t); + if (dedup_type == Type::MIN) { + mut.copy_in_absolute(p_append, MIN); + } else if (dedup_type == Type::MAX) { + mut.copy_in_absolute(p_append, MAX); + } else { + ceph_abort("impossible path"); + } +} + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/key_layout.h b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/key_layout.h new file mode 100644 index 000000000..cc1f546c1 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/key_layout.h @@ -0,0 +1,846 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <cassert> +#include <limits> +#include <optional> +#include <ostream> + +#include "common/hobject.h" +#include "crimson/os/seastore/onode_manager/staged-fltree/fwd.h" + +namespace crimson::os::seastore::onode { + +using shard_t = int8_t; +using pool_t = int64_t; +using crush_hash_t = uint32_t; +using snap_t = uint64_t; +using gen_t = uint64_t; +static_assert(sizeof(shard_t) == sizeof(ghobject_t().shard_id.id)); +static_assert(sizeof(pool_t) == sizeof(ghobject_t().hobj.pool)); +static_assert(sizeof(crush_hash_t) == sizeof(ghobject_t().hobj.get_hash())); +static_assert(sizeof(snap_t) == sizeof(ghobject_t().hobj.snap.val)); +static_assert(sizeof(gen_t) == sizeof(ghobject_t().generation)); + +class NodeExtentMutable; +class key_view_t; +class key_hobj_t; +enum class KeyT { VIEW, HOBJ }; +template <KeyT> struct _full_key_type; +template<> struct _full_key_type<KeyT::VIEW> { using type = key_view_t; }; +template<> struct _full_key_type<KeyT::HOBJ> { using type = key_hobj_t; }; +template <KeyT type> +using full_key_t = typename _full_key_type<type>::type; + +struct node_offset_packed_t { + node_offset_t value; +} __attribute__((packed)); + +// TODO: consider alignments +struct shard_pool_t { + bool operator==(const shard_pool_t& x) const { + return (shard == x.shard && pool == x.pool); + } + bool operator!=(const shard_pool_t& x) const { return !(*this == x); } + + template <KeyT KT> + static shard_pool_t from_key(const full_key_t<KT>& key); + + shard_t shard; + pool_t pool; +} __attribute__((packed)); +inline std::ostream& operator<<(std::ostream& os, const shard_pool_t& sp) { + return os << (unsigned)sp.shard << "," << sp.pool; +} +inline MatchKindCMP compare_to(const shard_pool_t& l, const shard_pool_t& r) { + auto ret = toMatchKindCMP(l.shard, r.shard); + if (ret != MatchKindCMP::EQ) + return ret; + return toMatchKindCMP(l.pool, r.pool); +} + +struct crush_t { + bool operator==(const crush_t& x) const { return crush == x.crush; } + bool operator!=(const crush_t& x) const { return !(*this == x); } + + template <KeyT KT> + static crush_t from_key(const full_key_t<KT>& key); + + crush_hash_t crush; +} __attribute__((packed)); +inline std::ostream& operator<<(std::ostream& os, const crush_t& c) { + return os << c.crush; +} +inline MatchKindCMP compare_to(const crush_t& l, const crush_t& r) { + return toMatchKindCMP(l.crush, r.crush); +} + +struct shard_pool_crush_t { + bool operator==(const shard_pool_crush_t& x) const { + return (shard_pool == x.shard_pool && crush == x.crush); + } + bool operator!=(const shard_pool_crush_t& x) const { return !(*this == x); } + + template <KeyT 
KT> + static shard_pool_crush_t from_key(const full_key_t<KT>& key); + + shard_pool_t shard_pool; + crush_t crush; +} __attribute__((packed)); +inline std::ostream& operator<<(std::ostream& os, const shard_pool_crush_t& spc) { + return os << spc.shard_pool << "," << spc.crush; +} +inline MatchKindCMP compare_to( + const shard_pool_crush_t& l, const shard_pool_crush_t& r) { + auto ret = compare_to(l.shard_pool, r.shard_pool); + if (ret != MatchKindCMP::EQ) + return ret; + return compare_to(l.crush, r.crush); +} + +struct snap_gen_t { + bool operator==(const snap_gen_t& x) const { + return (snap == x.snap && gen == x.gen); + } + bool operator!=(const snap_gen_t& x) const { return !(*this == x); } + + template <KeyT KT> + static snap_gen_t from_key(const full_key_t<KT>& key); + + snap_t snap; + gen_t gen; +} __attribute__((packed)); +inline std::ostream& operator<<(std::ostream& os, const snap_gen_t& sg) { + return os << sg.snap << "," << sg.gen; +} +inline MatchKindCMP compare_to(const snap_gen_t& l, const snap_gen_t& r) { + auto ret = toMatchKindCMP(l.snap, r.snap); + if (ret != MatchKindCMP::EQ) + return ret; + return toMatchKindCMP(l.gen, r.gen); +} + +/** + * string_key_view_t + * + * The layout to store char array as an oid or an ns string which may be + * compressed. + * + * If compressed, the physical block only stores an unsigned int of + * string_size_t, with value 0 denoting Type::MIN, and value max() denoting + * Type::MAX. + * + * If not compressed (Type::STR), the physical block stores the char array and + * a valid string_size_t value. + */ +struct string_key_view_t { + enum class Type {MIN, STR, MAX}; + // presumably the maximum string length is 2KiB + using string_size_t = uint16_t; + static constexpr auto MAX = std::numeric_limits<string_size_t>::max(); + static constexpr auto MIN = string_size_t(0u); + static auto is_valid_size(size_t size) { + return (size > MIN && size < MAX); + } + + string_key_view_t(const char* p_end) { + p_length = p_end - sizeof(string_size_t); + std::memcpy(&length, p_length, sizeof(string_size_t)); + if (is_valid_size(length)) { + auto _p_key = p_length - length; + p_key = static_cast<const char*>(_p_key); + } else { + assert(length == MAX || length == MIN); + p_key = nullptr; + } + } + Type type() const { + if (length == MIN) { + return Type::MIN; + } else if (length == MAX) { + return Type::MAX; + } else { + assert(is_valid_size(length)); + return Type::STR; + } + } + const char* p_start() const { + if (p_key) { + return p_key; + } else { + return p_length; + } + } + const char* p_next_end() const { + if (p_key) { + return p_start(); + } else { + return p_length + sizeof(string_size_t); + } + } + node_offset_t size() const { + size_t ret = length + sizeof(string_size_t); + assert(ret < NODE_BLOCK_SIZE); + return ret; + } + node_offset_t size_logical() const { + assert(type() == Type::STR); + assert(is_valid_size(length)); + return length; + } + node_offset_t size_overhead() const { + assert(type() == Type::STR); + return sizeof(string_size_t); + } + + std::string_view to_string_view() const { + assert(type() == Type::STR); + assert(is_valid_size(length)); + return {p_key, length}; + } + bool operator==(const string_key_view_t& x) const { + if (type() == x.type() && type() != Type::STR) + return true; + if (type() != x.type()) + return false; + if (length != x.length) + return false; + return (memcmp(p_key, x.p_key, length) == 0); + } + bool operator!=(const string_key_view_t& x) const { return !(*this == x); } + + static void append_str( + 
NodeExtentMutable&, std::string_view, char*& p_append); + + static void test_append_str(std::string_view str, char*& p_append) { + assert(is_valid_size(str.length())); + p_append -= sizeof(string_size_t); + string_size_t len = str.length(); + std::memcpy(p_append, &len, sizeof(string_size_t)); + p_append -= len; + std::memcpy(p_append, str.data(), len); + } + + static void append_dedup( + NodeExtentMutable&, const Type& dedup_type, char*& p_append); + + static void test_append_dedup(const Type& dedup_type, char*& p_append) { + p_append -= sizeof(string_size_t); + string_size_t len; + if (dedup_type == Type::MIN) { + len = MIN; + } else if (dedup_type == Type::MAX) { + len = MAX; + } else { + ceph_abort("impossible path"); + } + std::memcpy(p_append, &len, sizeof(string_size_t)); + } + + const char* p_key; + const char* p_length; + // TODO: remove if p_length is aligned + string_size_t length; +}; + +/** + * string_view_masked_t + * + * A common class to hide the underlying string implementation regardless of a + * string_key_view_t (maybe compressed), a string/string_view, or a compressed + * string. And leverage this consistant class to do compare, print, convert and + * append operations. + */ +class string_view_masked_t { + public: + using string_size_t = string_key_view_t::string_size_t; + using Type = string_key_view_t::Type; + explicit string_view_masked_t(const string_key_view_t& index) + : type{index.type()} { + if (type == Type::STR) { + view = index.to_string_view(); + } + } + explicit string_view_masked_t(std::string_view str) + : type{Type::STR}, view{str} { + assert(string_key_view_t::is_valid_size(view.size())); + } + + Type get_type() const { return type; } + std::string_view to_string_view() const { + assert(get_type() == Type::STR); + return view; + } + string_size_t size() const { + assert(get_type() == Type::STR); + assert(string_key_view_t::is_valid_size(view.size())); + return view.size(); + } + bool operator==(const string_view_masked_t& x) const { + if (get_type() == x.get_type() && get_type() != Type::STR) + return true; + if (get_type() != x.get_type()) + return false; + if (size() != x.size()) + return false; + return (memcmp(view.data(), x.view.data(), size()) == 0); + } + bool operator!=(const string_view_masked_t& x) const { return !(*this == x); } + void encode(ceph::bufferlist& bl) const { + if (get_type() == Type::MIN) { + ceph::encode(string_key_view_t::MIN, bl); + } else if (get_type() == Type::MAX) { + ceph::encode(string_key_view_t::MAX, bl); + } else { + ceph::encode(size(), bl); + ceph::encode_nohead(view, bl); + } + } + static auto min() { return string_view_masked_t{Type::MIN}; } + static auto max() { return string_view_masked_t{Type::MAX}; } + static string_view_masked_t decode( + std::string& str_storage, ceph::bufferlist::const_iterator& delta) { + string_size_t size; + ceph::decode(size, delta); + if (size == string_key_view_t::MIN) { + return min(); + } else if (size == string_key_view_t::MAX) { + return max(); + } else { + ceph::decode_nohead(size, str_storage, delta); + return string_view_masked_t(str_storage); + } + } + + private: + explicit string_view_masked_t(Type type) + : type{type} {} + + Type type; + std::string_view view; +}; +inline MatchKindCMP compare_to(const string_view_masked_t& l, const string_view_masked_t& r) { + using Type = string_view_masked_t::Type; + auto l_type = l.get_type(); + auto r_type = r.get_type(); + if (l_type == Type::STR && r_type == Type::STR) { + assert(l.size() && r.size()); + return 
toMatchKindCMP(l.to_string_view(), r.to_string_view()); + } else if (l_type == r_type) { + return MatchKindCMP::EQ; + } else if (l_type == Type::MIN || r_type == Type::MAX) { + return MatchKindCMP::LT; + } else { // l_type == Type::MAX || r_type == Type::MIN + return MatchKindCMP::GT; + } +} +inline MatchKindCMP compare_to(std::string_view l, const string_view_masked_t& r) { + using Type = string_view_masked_t::Type; + assert(l.length()); + auto r_type = r.get_type(); + if (r_type == Type::MIN) { + return MatchKindCMP::GT; + } else if (r_type == Type::MAX) { + return MatchKindCMP::LT; + } else { // r_type == Type::STR + assert(r.size()); + return toMatchKindCMP(l, r.to_string_view()); + } +} +inline MatchKindCMP compare_to(const string_view_masked_t& l, std::string_view r) { + return reverse(compare_to(r, l)); +} +inline std::ostream& operator<<(std::ostream& os, const string_view_masked_t& masked) { + using Type = string_view_masked_t::Type; + auto type = masked.get_type(); + if (type == Type::MIN) { + return os << "MIN"; + } else if (type == Type::MAX) { + return os << "MAX"; + } else { // type == Type::STR + auto view = masked.to_string_view(); + if (view.length() <= 12) { + os << "\"" << view << "\""; + } else { + os << "\"" << std::string_view(view.data(), 4) << ".." + << std::string_view(view.data() + view.length() - 2, 2) + << "/" << view.length() << "B\""; + } + return os; + } +} + +struct ns_oid_view_t { + using string_size_t = string_key_view_t::string_size_t; + using Type = string_key_view_t::Type; + + ns_oid_view_t(const char* p_end) : nspace(p_end), oid(nspace.p_next_end()) {} + Type type() const { return oid.type(); } + const char* p_start() const { return oid.p_start(); } + node_offset_t size() const { + if (type() == Type::STR) { + size_t ret = nspace.size() + oid.size(); + assert(ret < NODE_BLOCK_SIZE); + return ret; + } else { + return sizeof(string_size_t); + } + } + node_offset_t size_logical() const { + assert(type() == Type::STR); + return nspace.size_logical() + oid.size_logical(); + } + node_offset_t size_overhead() const { + assert(type() == Type::STR); + return nspace.size_overhead() + oid.size_overhead(); + } + bool operator==(const ns_oid_view_t& x) const { + return (string_view_masked_t{nspace} == string_view_masked_t{x.nspace} && + string_view_masked_t{oid} == string_view_masked_t{x.oid}); + } + bool operator!=(const ns_oid_view_t& x) const { return !(*this == x); } + + template <KeyT KT> + static node_offset_t estimate_size(const full_key_t<KT>& key); + + template <KeyT KT> + static void append(NodeExtentMutable&, + const full_key_t<KT>& key, + char*& p_append); + + static void append(NodeExtentMutable& mut, + const ns_oid_view_t& view, + char*& p_append) { + if (view.type() == Type::STR) { + string_key_view_t::append_str(mut, view.nspace.to_string_view(), p_append); + string_key_view_t::append_str(mut, view.oid.to_string_view(), p_append); + } else { + string_key_view_t::append_dedup(mut, view.type(), p_append); + } + } + + template <KeyT KT> + static void test_append(const full_key_t<KT>& key, char*& p_append); + + string_key_view_t nspace; + string_key_view_t oid; +}; +inline std::ostream& operator<<(std::ostream& os, const ns_oid_view_t& ns_oid) { + return os << string_view_masked_t{ns_oid.nspace} << "," + << string_view_masked_t{ns_oid.oid}; +} +inline MatchKindCMP compare_to(const ns_oid_view_t& l, const ns_oid_view_t& r) { + auto ret = compare_to(string_view_masked_t{l.nspace}, + string_view_masked_t{r.nspace}); + if (ret != MatchKindCMP::EQ) + return 
ret; + return compare_to(string_view_masked_t{l.oid}, + string_view_masked_t{r.oid}); +} + +/** + * key_hobj_t + * + * A specialized implementation of a full_key_t storing a ghobject_t passed + * from user. + */ +class key_hobj_t { + public: + explicit key_hobj_t(const ghobject_t& ghobj) : ghobj{ghobj} {} + /* + * common interfaces as a full_key_t + */ + shard_t shard() const { + return ghobj.shard_id; + } + pool_t pool() const { + return ghobj.hobj.pool; + } + crush_hash_t crush() const { + return ghobj.hobj.get_hash(); + } + std::string_view nspace() const { + // TODO(cross-node string dedup) + return ghobj.hobj.nspace; + } + string_view_masked_t nspace_masked() const { + // TODO(cross-node string dedup) + return string_view_masked_t{nspace()}; + } + std::string_view oid() const { + // TODO(cross-node string dedup) + return ghobj.hobj.oid.name; + } + string_view_masked_t oid_masked() const { + // TODO(cross-node string dedup) + return string_view_masked_t{oid()}; + } + ns_oid_view_t::Type dedup_type() const { + return _dedup_type; + } + snap_t snap() const { + return ghobj.hobj.snap; + } + gen_t gen() const { + return ghobj.generation; + } + + bool operator==(const full_key_t<KeyT::VIEW>& o) const; + bool operator==(const full_key_t<KeyT::HOBJ>& o) const; + bool operator!=(const full_key_t<KeyT::VIEW>& o) const { + return !operator==(o); + } + bool operator!=(const full_key_t<KeyT::HOBJ>& o) const { + return !operator==(o); + } + + std::ostream& dump(std::ostream& os) const { + os << "key_hobj(" << (unsigned)shard() << "," + << pool() << "," << crush() << "; " + << string_view_masked_t{nspace()} << "," + << string_view_masked_t{oid()} << "; " + << snap() << "," << gen() << ")"; + return os; + } + + static key_hobj_t decode(ceph::bufferlist::const_iterator& delta) { + shard_t shard; + ceph::decode(shard, delta); + pool_t pool; + ceph::decode(pool, delta); + crush_hash_t crush; + ceph::decode(crush, delta); + std::string nspace; + auto nspace_masked = string_view_masked_t::decode(nspace, delta); + // TODO(cross-node string dedup) + assert(nspace_masked.get_type() == string_view_masked_t::Type::STR); + std::string oid; + auto oid_masked = string_view_masked_t::decode(oid, delta); + // TODO(cross-node string dedup) + assert(oid_masked.get_type() == string_view_masked_t::Type::STR); + snap_t snap; + ceph::decode(snap, delta); + gen_t gen; + ceph::decode(gen, delta); + return key_hobj_t(ghobject_t( + shard_id_t(shard), pool, crush, nspace, oid, snap, gen)); + } + + private: + ns_oid_view_t::Type _dedup_type = ns_oid_view_t::Type::STR; + ghobject_t ghobj; +}; +inline std::ostream& operator<<(std::ostream& os, const key_hobj_t& key) { + return key.dump(os); +} + +/** + * key_view_t + * + * A specialized implementation of a full_key_t pointing to the locations + * storing the full key in a tree node. 
+ */ +class key_view_t { + public: + /** + * common interfaces as a full_key_t + */ + shard_t shard() const { + return shard_pool_packed().shard; + } + pool_t pool() const { + return shard_pool_packed().pool; + } + crush_hash_t crush() const { + return crush_packed().crush; + } + std::string_view nspace() const { + // TODO(cross-node string dedup) + return ns_oid_view().nspace.to_string_view(); + } + string_view_masked_t nspace_masked() const { + // TODO(cross-node string dedup) + return string_view_masked_t{ns_oid_view().nspace}; + } + std::string_view oid() const { + // TODO(cross-node string dedup) + return ns_oid_view().oid.to_string_view(); + } + string_view_masked_t oid_masked() const { + // TODO(cross-node string dedup) + return string_view_masked_t{ns_oid_view().oid}; + } + ns_oid_view_t::Type dedup_type() const { + return ns_oid_view().type(); + } + snap_t snap() const { + return snap_gen_packed().snap; + } + gen_t gen() const { + return snap_gen_packed().gen; + } + + bool operator==(const full_key_t<KeyT::VIEW>& o) const; + bool operator==(const full_key_t<KeyT::HOBJ>& o) const; + bool operator!=(const full_key_t<KeyT::VIEW>& o) const { + return !operator==(o); + } + bool operator!=(const full_key_t<KeyT::HOBJ>& o) const { + return !operator==(o); + } + + /** + * key_view_t specific interfaces + */ + bool has_shard_pool() const { + return p_shard_pool != nullptr; + } + bool has_crush() const { + return p_crush != nullptr; + } + bool has_ns_oid() const { + return p_ns_oid.has_value(); + } + bool has_snap_gen() const { + return p_snap_gen != nullptr; + } + + const shard_pool_t& shard_pool_packed() const { + assert(has_shard_pool()); + return *p_shard_pool; + } + const crush_t& crush_packed() const { + assert(has_crush()); + return *p_crush; + } + const ns_oid_view_t& ns_oid_view() const { + assert(has_ns_oid()); + return *p_ns_oid; + } + const snap_gen_t& snap_gen_packed() const { + assert(has_snap_gen()); + return *p_snap_gen; + } + + size_t size_logical() const { + return sizeof(shard_t) + sizeof(pool_t) + sizeof(crush_hash_t) + + sizeof(snap_t) + sizeof(gen_t) + ns_oid_view().size_logical(); + } + + ghobject_t to_ghobj() const { + return ghobject_t( + shard_id_t(shard()), pool(), crush(), + std::string(nspace()), std::string(oid()), snap(), gen()); + } + + void replace(const crush_t& key) { p_crush = &key; } + void set(const crush_t& key) { + assert(!has_crush()); + replace(key); + } + void replace(const shard_pool_crush_t& key) { p_shard_pool = &key.shard_pool; } + void set(const shard_pool_crush_t& key) { + set(key.crush); + assert(!has_shard_pool()); + replace(key); + } + void replace(const ns_oid_view_t& key) { p_ns_oid = key; } + void set(const ns_oid_view_t& key) { + assert(!has_ns_oid()); + replace(key); + } + void replace(const snap_gen_t& key) { p_snap_gen = &key; } + void set(const snap_gen_t& key) { + assert(!has_snap_gen()); + replace(key); + } + + std::ostream& dump(std::ostream& os) const { + os << "key_view("; + if (has_shard_pool()) { + os << (unsigned)shard() << "," << pool() << ","; + } else { + os << "X,X,"; + } + if (has_crush()) { + os << crush() << "; "; + } else { + os << "X; "; + } + if (has_ns_oid()) { + os << ns_oid_view() << "; "; + } else { + os << "X,X; "; + } + if (has_snap_gen()) { + os << snap() << "," << gen() << ")"; + } else { + os << "X,X)"; + } + return os; + } + + private: + const shard_pool_t* p_shard_pool = nullptr; + const crush_t* p_crush = nullptr; + std::optional<ns_oid_view_t> p_ns_oid; + const snap_gen_t* p_snap_gen = nullptr; +}; + 
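+// Editorial note, not part of the original patch: key_hobj_t and key_view_t
+// expose the same read-only accessor set (shard/pool/crush, nspace/oid,
+// snap/gen), which is what lets the templated helpers below accept either
+// full_key_t<KeyT::HOBJ> or full_key_t<KeyT::VIEW>.  The function name below
+// is hypothetical and only illustrates that symmetry; it mirrors
+// key_view_t::to_ghobj().
+template <KeyT KT>
+ghobject_t to_ghobject_sketch(const full_key_t<KT>& key) {
+  // identical accessor calls work whether the key wraps a user-provided
+  // ghobject_t (KeyT::HOBJ) or points into the key slices stored in a
+  // node (KeyT::VIEW)
+  return ghobject_t(shard_id_t(key.shard()), key.pool(), key.crush(),
+                    std::string(key.nspace()), std::string(key.oid()),
+                    key.snap(), key.gen());
+}
+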
+template <KeyT KT> +void encode_key(const full_key_t<KT>& key, ceph::bufferlist& bl) { + ceph::encode(key.shard(), bl); + ceph::encode(key.pool(), bl); + ceph::encode(key.crush(), bl); + key.nspace_masked().encode(bl); + key.oid_masked().encode(bl); + ceph::encode(key.snap(), bl); + ceph::encode(key.gen(), bl); +} + +inline MatchKindCMP compare_to(std::string_view l, std::string_view r) { + return toMatchKindCMP(l, r); +} +template <KeyT TypeL, KeyT TypeR> +bool compare_full_key(const full_key_t<TypeL>& l, const full_key_t<TypeR>& r) { + if (l.shard() != r.shard()) + return false; + if (l.pool() != r.pool()) + return false; + if (l.crush() != r.crush()) + return false; + if (compare_to(l.nspace(), r.nspace()) != MatchKindCMP::EQ) + return false; + if (compare_to(l.oid(), r.oid()) != MatchKindCMP::EQ) + return false; + if (l.snap() != r.snap()) + return false; + if (l.gen() != r.gen()) + return false; + return true; +} + +inline bool key_hobj_t::operator==(const full_key_t<KeyT::VIEW>& o) const { + return compare_full_key<KeyT::HOBJ, KeyT::VIEW>(*this, o); +} +inline bool key_hobj_t::operator==(const full_key_t<KeyT::HOBJ>& o) const { + return compare_full_key<KeyT::HOBJ, KeyT::HOBJ>(*this, o); +} +inline bool key_view_t::operator==(const full_key_t<KeyT::VIEW>& o) const { + return compare_full_key<KeyT::VIEW, KeyT::VIEW>(*this, o); +} +inline bool key_view_t::operator==(const full_key_t<KeyT::HOBJ>& o) const { + return compare_full_key<KeyT::VIEW, KeyT::HOBJ>(*this, o); +} + +inline std::ostream& operator<<(std::ostream& os, const key_view_t& key) { + return key.dump(os); +} + +template <KeyT Type> +MatchKindCMP compare_to(const full_key_t<Type>& key, const shard_pool_t& target) { + auto ret = toMatchKindCMP(key.shard(), target.shard); + if (ret != MatchKindCMP::EQ) + return ret; + return toMatchKindCMP(key.pool(), target.pool); +} + +template <KeyT Type> +MatchKindCMP compare_to(const full_key_t<Type>& key, const crush_t& target) { + return toMatchKindCMP(key.crush(), target.crush); +} + +template <KeyT Type> +MatchKindCMP compare_to(const full_key_t<Type>& key, const shard_pool_crush_t& target) { + auto ret = compare_to<Type>(key, target.shard_pool); + if (ret != MatchKindCMP::EQ) + return ret; + return compare_to<Type>(key, target.crush); +} + +template <KeyT Type> +MatchKindCMP compare_to(const full_key_t<Type>& key, const ns_oid_view_t& target) { + auto ret = compare_to(key.nspace(), string_view_masked_t{target.nspace}); + if (ret != MatchKindCMP::EQ) + return ret; + return compare_to(key.oid(), string_view_masked_t{target.oid}); +} + +template <KeyT Type> +MatchKindCMP compare_to(const full_key_t<Type>& key, const snap_gen_t& target) { + auto ret = toMatchKindCMP(key.snap(), target.snap); + if (ret != MatchKindCMP::EQ) + return ret; + return toMatchKindCMP(key.gen(), target.gen); +} + +template <KeyT KT> +shard_pool_t shard_pool_t::from_key(const full_key_t<KT>& key) { + if constexpr (KT == KeyT::VIEW) { + return key.shard_pool_packed(); + } else { + return {key.shard(), key.pool()}; + } +} + +template <KeyT KT> +crush_t crush_t::from_key(const full_key_t<KT>& key) { + if constexpr (KT == KeyT::VIEW) { + return key.crush_packed(); + } else { + return {key.crush()}; + } +} + +template <KeyT KT> +shard_pool_crush_t shard_pool_crush_t::from_key(const full_key_t<KT>& key) { + return {shard_pool_t::from_key<KT>(key), crush_t::from_key<KT>(key)}; +} + +template <KeyT KT> +snap_gen_t snap_gen_t::from_key(const full_key_t<KT>& key) { + if constexpr (KT == KeyT::VIEW) { + return 
key.snap_gen_packed(); + } else { + return {key.snap(), key.gen()}; + } +} + +template <KeyT KT> +node_offset_t ns_oid_view_t::estimate_size(const full_key_t<KT>& key) { + if constexpr (KT == KeyT::VIEW) { + return key.ns_oid_view().size(); + } else { + if (key.dedup_type() != Type::STR) { + // size after deduplication + return sizeof(string_size_t); + } else { + return 2 * sizeof(string_size_t) + key.nspace().size() + key.oid().size(); + } + } +} + +template <KeyT KT> +void ns_oid_view_t::append( + NodeExtentMutable& mut, const full_key_t<KT>& key, char*& p_append) { + if (key.dedup_type() == Type::STR) { + string_key_view_t::append_str(mut, key.nspace(), p_append); + string_key_view_t::append_str(mut, key.oid(), p_append); + } else { + string_key_view_t::append_dedup(mut, key.dedup_type(), p_append); + } +} + +template <KeyT KT> +void ns_oid_view_t::test_append(const full_key_t<KT>& key, char*& p_append) { + if (key.dedup_type() == Type::STR) { + string_key_view_t::test_append_str(key.nspace(), p_append); + string_key_view_t::test_append_str(key.oid(), p_append); + } else { + string_key_view_t::test_append_dedup(key.dedup_type(), p_append); + } +} + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage.cc new file mode 100644 index 000000000..4a5988185 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage.cc @@ -0,0 +1,318 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#include "node_stage.h" + +#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.h" +#include "node_stage_layout.h" + +namespace crimson::os::seastore::onode { + +#define NODE_T node_extent_t<FieldType, NODE_TYPE> +#define NODE_INST(FT, NT) node_extent_t<FT, NT> + +template <typename FieldType, node_type_t NODE_TYPE> +const char* NODE_T::p_left_bound() const { + if constexpr (std::is_same_v<FieldType, internal_fields_3_t>) { + // N3 internal node doesn't have the right part + return nullptr; + } else { + auto ret = p_start() + fields().get_item_end_offset(keys()); + if constexpr (NODE_TYPE == node_type_t::INTERNAL) { + if (is_level_tail()) { + ret -= sizeof(laddr_t); + } + } + return ret; + } +} + +template <typename FieldType, node_type_t NODE_TYPE> +node_offset_t NODE_T::size_to_nxt_at(index_t index) const { + assert(index < keys()); + if constexpr (FIELD_TYPE == field_type_t::N0 || + FIELD_TYPE == field_type_t::N1) { + return FieldType::estimate_insert_one(); + } else if constexpr (FIELD_TYPE == field_type_t::N2) { + auto p_end = p_start() + p_fields->get_item_end_offset(index); + return FieldType::estimate_insert_one() + ns_oid_view_t(p_end).size(); + } else { + ceph_abort("N3 node is not nested"); + } +} + +template <typename FieldType, node_type_t NODE_TYPE> +memory_range_t NODE_T::get_nxt_container(index_t index) const { + if constexpr (std::is_same_v<FieldType, internal_fields_3_t>) { + ceph_abort("N3 internal node doesn't have the right part"); + } else { + node_offset_t item_start_offset = p_fields->get_item_start_offset(index); + node_offset_t item_end_offset = p_fields->get_item_end_offset(index); + assert(item_start_offset < item_end_offset); + auto item_p_start = p_start() + item_start_offset; + auto item_p_end = p_start() + item_end_offset; + if constexpr (FIELD_TYPE == field_type_t::N2) { + // range for sub_items_t<NODE_TYPE> + item_p_end = ns_oid_view_t(item_p_end).p_start(); + 
assert(item_p_start < item_p_end); + } else { + // range for item_iterator_t<NODE_TYPE> + } + return {item_p_start, item_p_end}; + } +} + +template <typename FieldType, node_type_t NODE_TYPE> +void NODE_T::bootstrap_extent( + NodeExtentMutable& mut, + field_type_t field_type, node_type_t node_type, + bool is_level_tail, level_t level) { + node_header_t::bootstrap_extent( + mut, field_type, node_type, is_level_tail, level); + mut.copy_in_relative( + sizeof(node_header_t), typename FieldType::num_keys_t(0u)); +} + +template <typename FieldType, node_type_t NODE_TYPE> +void NODE_T::update_is_level_tail( + NodeExtentMutable& mut, const node_extent_t& extent, bool value) { + node_header_t::update_is_level_tail(mut, extent.p_fields->header, value); +} + +template <typename FieldType, node_type_t NODE_TYPE> +template <KeyT KT> +memory_range_t NODE_T::insert_prefix_at( + NodeExtentMutable& mut, const node_extent_t& node, const full_key_t<KT>& key, + index_t index, node_offset_t size, const char* p_left_bound) { + if constexpr (FIELD_TYPE == field_type_t::N0 || + FIELD_TYPE == field_type_t::N1) { + assert(index <= node.keys()); + assert(p_left_bound == node.p_left_bound()); + assert(size > FieldType::estimate_insert_one()); + auto size_right = size - FieldType::estimate_insert_one(); + const char* p_insert = node.p_start() + node.fields().get_item_end_offset(index); + const char* p_insert_front = p_insert - size_right; + FieldType::template insert_at<KT>(mut, key, node.fields(), index, size_right); + mut.shift_absolute(p_left_bound, + p_insert - p_left_bound, + -(int)size_right); + return {p_insert_front, p_insert}; + } else if constexpr (FIELD_TYPE == field_type_t::N2) { + ceph_abort("not implemented"); + } else { + ceph_abort("impossible"); + } +} +#define IPA_TEMPLATE(FT, NT, KT) \ + template memory_range_t NODE_INST(FT, NT)::insert_prefix_at<KT>( \ + NodeExtentMutable&, const node_extent_t&, const full_key_t<KT>&, \ + index_t, node_offset_t, const char*) +IPA_TEMPLATE(node_fields_0_t, node_type_t::INTERNAL, KeyT::VIEW); +IPA_TEMPLATE(node_fields_1_t, node_type_t::INTERNAL, KeyT::VIEW); +IPA_TEMPLATE(node_fields_2_t, node_type_t::INTERNAL, KeyT::VIEW); +IPA_TEMPLATE(node_fields_0_t, node_type_t::LEAF, KeyT::VIEW); +IPA_TEMPLATE(node_fields_1_t, node_type_t::LEAF, KeyT::VIEW); +IPA_TEMPLATE(node_fields_2_t, node_type_t::LEAF, KeyT::VIEW); +IPA_TEMPLATE(node_fields_0_t, node_type_t::INTERNAL, KeyT::HOBJ); +IPA_TEMPLATE(node_fields_1_t, node_type_t::INTERNAL, KeyT::HOBJ); +IPA_TEMPLATE(node_fields_2_t, node_type_t::INTERNAL, KeyT::HOBJ); +IPA_TEMPLATE(node_fields_0_t, node_type_t::LEAF, KeyT::HOBJ); +IPA_TEMPLATE(node_fields_1_t, node_type_t::LEAF, KeyT::HOBJ); +IPA_TEMPLATE(node_fields_2_t, node_type_t::LEAF, KeyT::HOBJ); + +template <typename FieldType, node_type_t NODE_TYPE> +void NODE_T::update_size_at( + NodeExtentMutable& mut, const node_extent_t& node, index_t index, int change) { + assert(index < node.keys()); + FieldType::update_size_at(mut, node.fields(), index, change); +} + +template <typename FieldType, node_type_t NODE_TYPE> +node_offset_t NODE_T::trim_until( + NodeExtentMutable& mut, const node_extent_t& node, index_t index) { + assert(!node.is_level_tail()); + auto keys = node.keys(); + assert(index <= keys); + if (index == keys) { + return 0; + } + if constexpr (std::is_same_v<FieldType, internal_fields_3_t>) { + ceph_abort("not implemented"); + } else { + mut.copy_in_absolute( + (void*)&node.p_fields->num_keys, num_keys_t(index)); + } + // no need to calculate trim size for node + 
return 0; +} + +template <typename FieldType, node_type_t NODE_TYPE> +node_offset_t NODE_T::trim_at( + NodeExtentMutable& mut, const node_extent_t& node, + index_t index, node_offset_t trimmed) { + assert(!node.is_level_tail()); + assert(index < node.keys()); + if constexpr (std::is_same_v<FieldType, internal_fields_3_t>) { + ceph_abort("not implemented"); + } else { + node_offset_t offset = node.p_fields->get_item_start_offset(index); + size_t new_offset = offset + trimmed; + assert(new_offset < node.p_fields->get_item_end_offset(index)); + mut.copy_in_absolute(const_cast<void*>(node.p_fields->p_offset(index)), + node_offset_t(new_offset)); + mut.copy_in_absolute( + (void*)&node.p_fields->num_keys, num_keys_t(index + 1)); + } + // no need to calculate trim size for node + return 0; +} + +#define NODE_TEMPLATE(FT, NT) template class NODE_INST(FT, NT) +NODE_TEMPLATE(node_fields_0_t, node_type_t::INTERNAL); +NODE_TEMPLATE(node_fields_1_t, node_type_t::INTERNAL); +NODE_TEMPLATE(node_fields_2_t, node_type_t::INTERNAL); +NODE_TEMPLATE(internal_fields_3_t, node_type_t::INTERNAL); +NODE_TEMPLATE(node_fields_0_t, node_type_t::LEAF); +NODE_TEMPLATE(node_fields_1_t, node_type_t::LEAF); +NODE_TEMPLATE(node_fields_2_t, node_type_t::LEAF); +NODE_TEMPLATE(leaf_fields_3_t, node_type_t::LEAF); + +#define APPEND_T node_extent_t<FieldType, NODE_TYPE>::Appender<KT> + +template <typename FieldType, node_type_t NODE_TYPE> +template <KeyT KT> +void APPEND_T::append(const node_extent_t& src, index_t from, index_t items) { + assert(from <= src.keys()); + if (p_src == nullptr) { + p_src = &src; + } else { + assert(p_src == &src); + } + if (items == 0) { + return; + } + assert(from < src.keys()); + assert(from + items <= src.keys()); + num_keys += items; + if constexpr (std::is_same_v<FieldType, internal_fields_3_t>) { + ceph_abort("impossible path"); + } else { + // append left part forwards + node_offset_t offset_left_start = src.fields().get_key_start_offset(from); + node_offset_t offset_left_end = src.fields().get_key_start_offset(from + items); + node_offset_t left_size = offset_left_end - offset_left_start; + if (num_keys == 0) { + // no need to adjust offset + assert(from == 0); + assert(p_start + offset_left_start == p_append_left); + p_mut->copy_in_absolute(p_append_left, + src.p_start() + offset_left_start, left_size); + } else { + node_offset_t step_size = FieldType::estimate_insert_one(); + node_offset_t offset_base = src.fields().get_item_end_offset(from); + int offset_change = p_append_right - p_start - offset_base; + auto p_offset_dst = p_append_left; + if constexpr (FIELD_TYPE != field_type_t::N2) { + // copy keys + p_mut->copy_in_absolute(p_append_left, + src.p_start() + offset_left_start, left_size); + // point to offset for update + p_offset_dst += sizeof(typename FieldType::key_t); + } + for (auto i = from; i < from + items; ++i) { + p_mut->copy_in_absolute(p_offset_dst, + node_offset_t(src.fields().get_item_start_offset(i) + offset_change)); + p_offset_dst += step_size; + } + assert(p_append_left + left_size + sizeof(typename FieldType::key_t) == + p_offset_dst); + } + p_append_left += left_size; + + // append right part backwards + node_offset_t offset_right_start = src.fields().get_item_end_offset(from + items); + node_offset_t offset_right_end = src.fields().get_item_end_offset(from); + node_offset_t right_size = offset_right_end - offset_right_start; + p_append_right -= right_size; + p_mut->copy_in_absolute(p_append_right, + src.p_start() + offset_right_start, right_size); + } +} + 
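+// Editorial note, not part of the original patch: the Appender::append above
+// keeps two cursors into the destination extent.  p_append_left advances
+// forward through the fixed-size left part (keys and/or right offsets),
+// p_append_right retreats backward through the variable-size right part
+// (items), and every copied offset is shifted by `offset_change` so it stays
+// valid relative to the new extent.  A split or merge would presumably drive
+// it roughly as sketched below; `mut`, `p_dst` (a freshly bootstrapped
+// destination node) and `split_index` are hypothetical:
+//
+//   typename node_extent_t<FieldType, NODE_TYPE>::template Appender<KeyT::VIEW>
+//       appender(&mut, p_dst);
+//   appender.append(src, 0, split_index);  // bulk-copy the first split_index slots
+//   char* p_left_end = appender.wrap();    // writes num_keys (and the tail laddr
+//                                          // for a level-tail internal node)
+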
+template <typename FieldType, node_type_t NODE_TYPE> +template <KeyT KT> +void APPEND_T::append( + const full_key_t<KT>& key, const value_t& value, const value_t*& p_value) { + if constexpr (FIELD_TYPE == field_type_t::N3) { + ceph_abort("not implemented"); + } else { + ceph_abort("should not happen"); + } +} + +template <typename FieldType, node_type_t NODE_TYPE> +template <KeyT KT> +std::tuple<NodeExtentMutable*, char*> +APPEND_T::open_nxt(const key_get_type& partial_key) { + if constexpr (FIELD_TYPE == field_type_t::N0 || + FIELD_TYPE == field_type_t::N1) { + FieldType::append_key(*p_mut, partial_key, p_append_left); + } else if constexpr (FIELD_TYPE == field_type_t::N2) { + FieldType::append_key(*p_mut, partial_key, p_append_right); + } else { + ceph_abort("impossible path"); + } + return {p_mut, p_append_right}; +} + +template <typename FieldType, node_type_t NODE_TYPE> +template <KeyT KT> +std::tuple<NodeExtentMutable*, char*> +APPEND_T::open_nxt(const full_key_t<KT>& key) { + if constexpr (FIELD_TYPE == field_type_t::N0 || + FIELD_TYPE == field_type_t::N1) { + FieldType::template append_key<KT>(*p_mut, key, p_append_left); + } else if constexpr (FIELD_TYPE == field_type_t::N2) { + FieldType::template append_key<KT>(*p_mut, key, p_append_right); + } else { + ceph_abort("impossible path"); + } + return {p_mut, p_append_right}; +} + +template <typename FieldType, node_type_t NODE_TYPE> +template <KeyT KT> +char* APPEND_T::wrap() { + assert(p_append_left <= p_append_right); + assert(p_src); + if constexpr (NODE_TYPE == node_type_t::INTERNAL) { + if (p_src->is_level_tail()) { + laddr_t tail_value = p_src->get_end_p_laddr()->value; + p_append_right -= sizeof(laddr_t); + assert(p_append_left <= p_append_right); + p_mut->copy_in_absolute(p_append_right, tail_value); + } + } + p_mut->copy_in_absolute(p_start + offsetof(FieldType, num_keys), num_keys); + return p_append_left; +} + +#define APPEND_TEMPLATE(FT, NT, KT) template class node_extent_t<FT, NT>::Appender<KT> +APPEND_TEMPLATE(node_fields_0_t, node_type_t::INTERNAL, KeyT::VIEW); +APPEND_TEMPLATE(node_fields_1_t, node_type_t::INTERNAL, KeyT::VIEW); +APPEND_TEMPLATE(node_fields_2_t, node_type_t::INTERNAL, KeyT::VIEW); +APPEND_TEMPLATE(internal_fields_3_t, node_type_t::INTERNAL, KeyT::VIEW); +APPEND_TEMPLATE(node_fields_0_t, node_type_t::LEAF, KeyT::VIEW); +APPEND_TEMPLATE(node_fields_1_t, node_type_t::LEAF, KeyT::VIEW); +APPEND_TEMPLATE(node_fields_2_t, node_type_t::LEAF, KeyT::VIEW); +APPEND_TEMPLATE(leaf_fields_3_t, node_type_t::LEAF, KeyT::VIEW); +APPEND_TEMPLATE(node_fields_0_t, node_type_t::INTERNAL, KeyT::HOBJ); +APPEND_TEMPLATE(node_fields_1_t, node_type_t::INTERNAL, KeyT::HOBJ); +APPEND_TEMPLATE(node_fields_2_t, node_type_t::INTERNAL, KeyT::HOBJ); +APPEND_TEMPLATE(internal_fields_3_t, node_type_t::INTERNAL, KeyT::HOBJ); +APPEND_TEMPLATE(node_fields_0_t, node_type_t::LEAF, KeyT::HOBJ); +APPEND_TEMPLATE(node_fields_1_t, node_type_t::LEAF, KeyT::HOBJ); +APPEND_TEMPLATE(node_fields_2_t, node_type_t::LEAF, KeyT::HOBJ); +APPEND_TEMPLATE(leaf_fields_3_t, node_type_t::LEAF, KeyT::HOBJ); + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage.h b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage.h new file mode 100644 index 000000000..cf0ca463c --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage.h @@ -0,0 +1,226 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include 
"crimson/os/seastore/onode_manager/staged-fltree/node_types.h" +#include "key_layout.h" +#include "stage_types.h" + +namespace crimson::os::seastore::onode { + +class NodeExtentMutable; + +/** + * node_extent_t + * + * The top indexing stage implementation for node N0/N1/N2/N3, implements + * staged contract as an indexable container, and provides access to node + * header. + * + * The specific field layout are defined by FieldType which are + * node_fields_0_t, node_fields_1_t, node_fields_2_t, internal_fields_3_t and + * leaf_fields_3_t. Diagrams see node_stage_layout.h. + */ +template <typename FieldType, node_type_t _NODE_TYPE> +class node_extent_t { + public: + using value_t = value_type_t<_NODE_TYPE>; + using num_keys_t = typename FieldType::num_keys_t; + static constexpr node_type_t NODE_TYPE = _NODE_TYPE; + static constexpr field_type_t FIELD_TYPE = FieldType::FIELD_TYPE; + static constexpr node_offset_t EXTENT_SIZE = + (FieldType::SIZE + DISK_BLOCK_SIZE - 1u) / DISK_BLOCK_SIZE * DISK_BLOCK_SIZE; + + // TODO: remove + node_extent_t() = default; + + node_extent_t(const FieldType* p_fields) : p_fields{p_fields} { + validate(*p_fields); + } + + const char* p_start() const { return fields_start(*p_fields); } + + const char* off_to_ptr(node_offset_t off) const { + assert(off <= FieldType::SIZE); + return p_start() + off; + } + + node_offset_t ptr_to_off(const void* ptr) const { + auto _ptr = static_cast<const char*>(ptr); + assert(_ptr >= p_start()); + auto off = _ptr - p_start(); + assert(off <= FieldType::SIZE); + return off; + } + + bool is_level_tail() const { return p_fields->is_level_tail(); } + level_t level() const { return p_fields->header.level; } + node_offset_t free_size() const { + return p_fields->template free_size_before<NODE_TYPE>(keys()); + } + node_offset_t total_size() const { return p_fields->total_size(); } + const char* p_left_bound() const; + template <node_type_t T = NODE_TYPE> + std::enable_if_t<T == node_type_t::INTERNAL, const laddr_packed_t*> + get_end_p_laddr() const { + assert(is_level_tail()); + if constexpr (FIELD_TYPE == field_type_t::N3) { + return &p_fields->child_addrs[keys()]; + } else { + auto offset_start = p_fields->get_item_end_offset(keys()); + assert(offset_start <= FieldType::SIZE); + offset_start -= sizeof(laddr_packed_t); + auto p_addr = p_start() + offset_start; + return reinterpret_cast<const laddr_packed_t*>(p_addr); + } + } + + // container type system + using key_get_type = typename FieldType::key_get_type; + static constexpr auto CONTAINER_TYPE = ContainerType::INDEXABLE; + index_t keys() const { return p_fields->num_keys; } + key_get_type operator[] (index_t index) const { return p_fields->get_key(index); } + node_offset_t size_before(index_t index) const { + auto free_size = p_fields->template free_size_before<NODE_TYPE>(index); + assert(total_size() >= free_size); + return total_size() - free_size; + } + node_offset_t size_to_nxt_at(index_t index) const; + node_offset_t size_overhead_at(index_t index) const { + return FieldType::ITEM_OVERHEAD; } + memory_range_t get_nxt_container(index_t index) const; + + template <typename T = FieldType> + std::enable_if_t<T::FIELD_TYPE == field_type_t::N3, const value_t*> + get_p_value(index_t index) const { + assert(index < keys()); + if constexpr (NODE_TYPE == node_type_t::INTERNAL) { + return &p_fields->child_addrs[index]; + } else { + auto range = get_nxt_container(index); + auto ret = reinterpret_cast<const onode_t*>(range.p_start); + assert(range.p_start + ret->size == range.p_end); + 
return ret; + } + } + + void encode(const char* p_node_start, ceph::bufferlist& encoded) const { + assert(p_node_start == p_start()); + // nothing to encode as the container range is the entire extent + } + + static node_extent_t decode(const char* p_node_start, + ceph::bufferlist::const_iterator& delta) { + // nothing to decode + return node_extent_t(reinterpret_cast<const FieldType*>(p_node_start)); + } + + static void validate(const FieldType& fields) { +#ifndef NDEBUG + assert(fields.header.get_node_type() == NODE_TYPE); + assert(fields.header.get_field_type() == FieldType::FIELD_TYPE); + if constexpr (NODE_TYPE == node_type_t::INTERNAL) { + assert(fields.header.level > 0u); + } else { + assert(fields.header.level == 0u); + } +#endif + } + + static void bootstrap_extent( + NodeExtentMutable&, field_type_t, node_type_t, bool, level_t); + + static void update_is_level_tail(NodeExtentMutable&, const node_extent_t&, bool); + + static node_offset_t header_size() { return FieldType::HEADER_SIZE; } + + template <KeyT KT> + static node_offset_t estimate_insert( + const full_key_t<KT>& key, const value_t& value) { + auto size = FieldType::estimate_insert_one(); + if constexpr (FIELD_TYPE == field_type_t::N2) { + size += ns_oid_view_t::estimate_size<KT>(key); + } else if constexpr (FIELD_TYPE == field_type_t::N3 && + NODE_TYPE == node_type_t::LEAF) { + size += value.size; + } + return size; + } + + template <KeyT KT> + static const value_t* insert_at( + NodeExtentMutable& mut, const node_extent_t&, + const full_key_t<KT>& key, const value_t& value, + index_t index, node_offset_t size, const char* p_left_bound) { + if constexpr (FIELD_TYPE == field_type_t::N3) { + ceph_abort("not implemented"); + } else { + ceph_abort("impossible"); + } + } + + template <KeyT KT> + static memory_range_t insert_prefix_at( + NodeExtentMutable&, const node_extent_t&, + const full_key_t<KT>& key, + index_t index, node_offset_t size, const char* p_left_bound); + + static void update_size_at( + NodeExtentMutable&, const node_extent_t&, index_t index, int change); + + static node_offset_t trim_until( + NodeExtentMutable&, const node_extent_t&, index_t index); + static node_offset_t trim_at(NodeExtentMutable&, const node_extent_t&, + index_t index, node_offset_t trimmed); + + template <KeyT KT> + class Appender; + + private: + const FieldType& fields() const { return *p_fields; } + const FieldType* p_fields; +}; + +template <typename FieldType, node_type_t NODE_TYPE> +template <KeyT KT> +class node_extent_t<FieldType, NODE_TYPE>::Appender { + public: + Appender(NodeExtentMutable* p_mut, char* p_append) + : p_mut{p_mut}, p_start{p_append} { +#ifndef NDEBUG + auto p_fields = reinterpret_cast<const FieldType*>(p_append); + assert(*(p_fields->header.get_field_type()) == FIELD_TYPE); + assert(p_fields->header.get_node_type() == NODE_TYPE); + assert(p_fields->num_keys == 0); +#endif + p_append_left = p_start + FieldType::HEADER_SIZE; + p_append_right = p_start + FieldType::SIZE; + } + void append(const node_extent_t& src, index_t from, index_t items); + void append(const full_key_t<KT>&, const value_t&, const value_t*&); + char* wrap(); + std::tuple<NodeExtentMutable*, char*> open_nxt(const key_get_type&); + std::tuple<NodeExtentMutable*, char*> open_nxt(const full_key_t<KT>&); + void wrap_nxt(char* p_append) { + if constexpr (FIELD_TYPE != field_type_t::N3) { + assert(p_append < p_append_right); + assert(p_append_left < p_append); + p_append_right = p_append; + FieldType::append_offset(*p_mut, p_append - p_start, 
p_append_left); + ++num_keys; + } else { + ceph_abort("not implemented"); + } + } + + private: + const node_extent_t* p_src = nullptr; + NodeExtentMutable* p_mut; + char* p_start; + char* p_append_left; + char* p_append_right; + num_keys_t num_keys = 0; +}; + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage_layout.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage_layout.cc new file mode 100644 index 000000000..81bfac72a --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage_layout.cc @@ -0,0 +1,96 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#include "node_stage_layout.h" + +#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.h" + +namespace crimson::os::seastore::onode { + +void node_header_t::bootstrap_extent( + NodeExtentMutable& mut, + field_type_t field_type, node_type_t node_type, + bool is_level_tail, level_t level) { + node_header_t header; + header.set_field_type(field_type); + header.set_node_type(node_type); + header.set_is_level_tail(is_level_tail); + header.level = level; + mut.copy_in_relative(0, header); +} + +void node_header_t::update_is_level_tail( + NodeExtentMutable& mut, const node_header_t& header, bool value) { + auto& _header = const_cast<node_header_t&>(header); + _header.set_is_level_tail(value); + mut.validate_inplace_update(_header); +} + +#define F013_T _node_fields_013_t<SlotType> +#define F013_INST(ST) _node_fields_013_t<ST> + +template <typename SlotType> +void F013_T::update_size_at( + NodeExtentMutable& mut, const me_t& node, index_t index, int change) { + assert(index <= node.num_keys); + for (const auto* p_slot = &node.slots[index]; + p_slot < &node.slots[node.num_keys]; + ++p_slot) { + node_offset_t offset = p_slot->right_offset; + mut.copy_in_absolute( + (void*)&(p_slot->right_offset), + node_offset_t(offset - change)); + } +} + +template <typename SlotType> +void F013_T::append_key( + NodeExtentMutable& mut, const key_t& key, char*& p_append) { + mut.copy_in_absolute(p_append, key); + p_append += sizeof(key_t); +} + +template <typename SlotType> +void F013_T::append_offset( + NodeExtentMutable& mut, node_offset_t offset_to_right, char*& p_append) { + mut.copy_in_absolute(p_append, offset_to_right); + p_append += sizeof(node_offset_t); +} + +template <typename SlotType> +template <KeyT KT> +void F013_T::insert_at( + NodeExtentMutable& mut, const full_key_t<KT>& key, + const me_t& node, index_t index, node_offset_t size_right) { + assert(index <= node.num_keys); + update_size_at(mut, node, index, size_right); + auto p_insert = const_cast<char*>(fields_start(node)) + + node.get_key_start_offset(index); + auto p_shift_end = fields_start(node) + node.get_key_start_offset(node.num_keys); + mut.shift_absolute(p_insert, p_shift_end - p_insert, estimate_insert_one()); + mut.copy_in_absolute((void*)&node.num_keys, num_keys_t(node.num_keys + 1)); + append_key(mut, key_t::template from_key<KT>(key), p_insert); + append_offset(mut, node.get_item_end_offset(index) - size_right, p_insert); +} +#define IA_TEMPLATE(ST, KT) template void F013_INST(ST):: \ + insert_at<KT>(NodeExtentMutable&, const full_key_t<KT>&, \ + const F013_INST(ST)&, index_t, node_offset_t) +IA_TEMPLATE(slot_0_t, KeyT::VIEW); +IA_TEMPLATE(slot_1_t, KeyT::VIEW); +IA_TEMPLATE(slot_3_t, KeyT::VIEW); +IA_TEMPLATE(slot_0_t, KeyT::HOBJ); +IA_TEMPLATE(slot_1_t, KeyT::HOBJ); +IA_TEMPLATE(slot_3_t, KeyT::HOBJ); + +#define 
F013_TEMPLATE(ST) template struct F013_INST(ST) +F013_TEMPLATE(slot_0_t); +F013_TEMPLATE(slot_1_t); +F013_TEMPLATE(slot_3_t); + +void node_fields_2_t::append_offset( + NodeExtentMutable& mut, node_offset_t offset_to_right, char*& p_append) { + mut.copy_in_absolute(p_append, offset_to_right); + p_append += sizeof(node_offset_t); +} + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage_layout.h b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage_layout.h new file mode 100644 index 000000000..14ba95bf4 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/node_stage_layout.h @@ -0,0 +1,366 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "key_layout.h" +#include "crimson/os/seastore/onode_manager/staged-fltree/node_types.h" + +namespace crimson::os::seastore::onode { + +class NodeExtentMutable; + +struct node_header_t { + static constexpr unsigned FIELD_TYPE_BITS = 6u; + static_assert(static_cast<uint8_t>(field_type_t::_MAX) <= 1u << FIELD_TYPE_BITS); + static constexpr unsigned NODE_TYPE_BITS = 1u; + static constexpr unsigned B_LEVEL_TAIL_BITS = 1u; + using bits_t = uint8_t; + + node_header_t() {} + std::optional<field_type_t> get_field_type() const { + if (field_type >= FIELD_TYPE_MAGIC && + field_type < static_cast<uint8_t>(field_type_t::_MAX)) { + return static_cast<field_type_t>(field_type); + } else { + return std::nullopt; + } + } + node_type_t get_node_type() const { + return static_cast<node_type_t>(node_type); + } + bool get_is_level_tail() const { + return is_level_tail; + } + + static void bootstrap_extent( + NodeExtentMutable&, field_type_t, node_type_t, bool, level_t); + + static void update_is_level_tail(NodeExtentMutable&, const node_header_t&, bool); + + bits_t field_type : FIELD_TYPE_BITS; + bits_t node_type : NODE_TYPE_BITS; + bits_t is_level_tail : B_LEVEL_TAIL_BITS; + static_assert(sizeof(bits_t) * 8 == + FIELD_TYPE_BITS + NODE_TYPE_BITS + B_LEVEL_TAIL_BITS); + level_t level; + + private: + void set_field_type(field_type_t type) { + field_type = static_cast<uint8_t>(type); + } + void set_node_type(node_type_t type) { + node_type = static_cast<uint8_t>(type); + } + void set_is_level_tail(bool value) { + is_level_tail = static_cast<uint8_t>(value); + } +} __attribute__((packed)); + +template <typename FixedKeyType, field_type_t _FIELD_TYPE> +struct _slot_t { + using key_t = FixedKeyType; + static constexpr field_type_t FIELD_TYPE = _FIELD_TYPE; + static constexpr node_offset_t OVERHEAD = sizeof(_slot_t) - sizeof(key_t); + + key_t key; + node_offset_t right_offset; +} __attribute__((packed)); +using slot_0_t = _slot_t<shard_pool_crush_t, field_type_t::N0>; +using slot_1_t = _slot_t<crush_t, field_type_t::N1>; +using slot_3_t = _slot_t<snap_gen_t, field_type_t::N3>; + +struct node_range_t { + node_offset_t start; + node_offset_t end; +}; + +template <typename FieldType> +const char* fields_start(const FieldType& node) { + return reinterpret_cast<const char*>(&node); +} + +template <node_type_t NODE_TYPE, typename FieldType> +node_range_t fields_free_range_before( + const FieldType& node, index_t index) { + assert(index <= node.num_keys); + node_offset_t offset_start = node.get_key_start_offset(index); + node_offset_t offset_end = + (index == 0 ? 
FieldType::SIZE + : node.get_item_start_offset(index - 1)); + if constexpr (NODE_TYPE == node_type_t::INTERNAL) { + if (node.is_level_tail() && index == node.num_keys) { + offset_end -= sizeof(laddr_t); + } + } + assert(offset_start <= offset_end); + assert(offset_end - offset_start < FieldType::SIZE); + return {offset_start, offset_end}; +} + +/** + * _node_fields_013_t (node_fields_0_t, node_fields_1_t, leaf_fields_3_t + * + * The STAGE_LEFT layout implementation for node N0/N1, or the STAGE_RIGHT + * layout implementation for leaf node N3. + * + * The node layout storing n slots: + * + * # <----------------------------- node range --------------------------------------> # + * # #<~># free space # + * # <----- left part -----------------------------> # <~# <----- right slots -------> # + * # # <---- left slots -------------> #~> # # + * # # slots [2, n) |<~># #<~>| right slots [2, n) # + * # # <- slot 0 -> | <- slot 1 -> | # # | <-- s1 --> | <-- s0 --> # + * # # | | # # | | # + * # | num_ # | right | | right | # # | next-stage | next-stage # + * # header | keys # key | offset | key | offset | # # | container | container # + * # | # 0 | 0 | 1 | 1 |...#...#...| or onode 1 | or onode 0 # + * | | ^ ^ + * | | | | + * | +----------------+ | + * +--------------------------------------------+ + */ +template <typename SlotType> +struct _node_fields_013_t { + // TODO: decide by NODE_BLOCK_SIZE, sizeof(SlotType), sizeof(laddr_t) + // and the minimal size of variable_key. + using num_keys_t = uint8_t; + using key_t = typename SlotType::key_t; + using key_get_type = const key_t&; + using me_t = _node_fields_013_t<SlotType>; + static constexpr field_type_t FIELD_TYPE = SlotType::FIELD_TYPE; + static constexpr node_offset_t SIZE = NODE_BLOCK_SIZE; + static constexpr node_offset_t HEADER_SIZE = + sizeof(node_header_t) + sizeof(num_keys_t); + static constexpr node_offset_t ITEM_OVERHEAD = SlotType::OVERHEAD; + + bool is_level_tail() const { return header.get_is_level_tail(); } + node_offset_t total_size() const { return SIZE; } + key_get_type get_key(index_t index) const { + assert(index < num_keys); + return slots[index].key; + } + node_offset_t get_key_start_offset(index_t index) const { + assert(index <= num_keys); + auto offset = HEADER_SIZE + sizeof(SlotType) * index; + assert(offset < SIZE); + return offset; + } + node_offset_t get_item_start_offset(index_t index) const { + assert(index < num_keys); + auto offset = slots[index].right_offset; + assert(offset <= SIZE); + return offset; + } + const void* p_offset(index_t index) const { + assert(index < num_keys); + return &slots[index].right_offset; + } + node_offset_t get_item_end_offset(index_t index) const { + return index == 0 ? 
SIZE : get_item_start_offset(index - 1); + } + template <node_type_t NODE_TYPE> + node_offset_t free_size_before(index_t index) const { + auto range = fields_free_range_before<NODE_TYPE>(*this, index); + return range.end - range.start; + } + + static node_offset_t estimate_insert_one() { return sizeof(SlotType); } + template <KeyT KT> + static void insert_at( + NodeExtentMutable&, const full_key_t<KT>& key, + const me_t& node, index_t index, node_offset_t size_right); + static void update_size_at( + NodeExtentMutable&, const me_t& node, index_t index, int change); + static void append_key( + NodeExtentMutable&, const key_t& key, char*& p_append); + template <KeyT KT> + static void append_key( + NodeExtentMutable& mut, const full_key_t<KT>& key, char*& p_append) { + append_key(mut, key_t::template from_key<KT>(key), p_append); + } + static void append_offset( + NodeExtentMutable& mut, node_offset_t offset_to_right, char*& p_append); + + node_header_t header; + num_keys_t num_keys = 0u; + SlotType slots[]; +} __attribute__((packed)); +using node_fields_0_t = _node_fields_013_t<slot_0_t>; +using node_fields_1_t = _node_fields_013_t<slot_1_t>; + +/** + * node_fields_2_t + * + * The STAGE_STRING layout implementation for node N2. + * + * The node layout storing n slots: + * + * # <--------------------------------- node range ----------------------------------------> # + * # #<~># free space # + * # <------- left part ---------------> # <~# <--------- right slots ---------------------> # + * # # <---- offsets ----> #~> #<~>| slots [2, n) # + * # # offsets [2, n) |<~># # | <----- slot 1 ----> | <----- slot 0 ----> # + * # # | # # | | # + * # | num_ # offset | offset | # # | next-stage | ns-oid | next-stage | ns-oid # + * # header | keys # 0 | 1 |...#...#...| container1 | 1 | container0 | 0 # + * | | ^ ^ + * | | | | + * | +----------------+ | + * +-----------------------------------------------+ + */ +struct node_fields_2_t { + // TODO: decide by NODE_BLOCK_SIZE, sizeof(node_off_t), sizeof(laddr_t) + // and the minimal size of variable_key. + using num_keys_t = uint8_t; + using key_t = ns_oid_view_t; + using key_get_type = key_t; + static constexpr field_type_t FIELD_TYPE = field_type_t::N2; + static constexpr node_offset_t SIZE = NODE_BLOCK_SIZE; + static constexpr node_offset_t HEADER_SIZE = + sizeof(node_header_t) + sizeof(num_keys_t); + static constexpr node_offset_t ITEM_OVERHEAD = sizeof(node_offset_t); + + bool is_level_tail() const { return header.get_is_level_tail(); } + node_offset_t total_size() const { return SIZE; } + key_get_type get_key(index_t index) const { + assert(index < num_keys); + node_offset_t item_end_offset = + (index == 0 ? SIZE : offsets[index - 1]); + assert(item_end_offset <= SIZE); + const char* p_start = fields_start(*this); + return key_t(p_start + item_end_offset); + } + node_offset_t get_key_start_offset(index_t index) const { + assert(index <= num_keys); + auto offset = HEADER_SIZE + sizeof(node_offset_t) * num_keys; + assert(offset <= SIZE); + return offset; + } + node_offset_t get_item_start_offset(index_t index) const { + assert(index < num_keys); + auto offset = offsets[index]; + assert(offset <= SIZE); + return offset; + } + const void* p_offset(index_t index) const { + assert(index < num_keys); + return &offsets[index]; + } + node_offset_t get_item_end_offset(index_t index) const { + return index == 0 ? 
SIZE : get_item_start_offset(index - 1); + } + template <node_type_t NODE_TYPE> + node_offset_t free_size_before(index_t index) const { + auto range = fields_free_range_before<NODE_TYPE>(*this, index); + return range.end - range.start; + } + + static node_offset_t estimate_insert_one() { return sizeof(node_offset_t); } + template <KeyT KT> + static void insert_at( + NodeExtentMutable& mut, const full_key_t<KT>& key, + const node_fields_2_t& node, index_t index, node_offset_t size_right) { + ceph_abort("not implemented"); + } + static void update_size_at( + NodeExtentMutable& mut, const node_fields_2_t& node, index_t index, int change) { + ceph_abort("not implemented"); + } + static void append_key( + NodeExtentMutable& mut, const key_t& key, char*& p_append) { + ns_oid_view_t::append(mut, key, p_append); + } + template <KeyT KT> + static void append_key( + NodeExtentMutable& mut, const full_key_t<KT>& key, char*& p_append) { + ns_oid_view_t::append<KT>(mut, key, p_append); + } + static void append_offset( + NodeExtentMutable& mut, node_offset_t offset_to_right, char*& p_append); + + node_header_t header; + num_keys_t num_keys = 0u; + node_offset_t offsets[]; +} __attribute__((packed)); + +/** + * internal_fields_3_t + * + * The STAGE_RIGHT layout implementation for N2. + * + * The node layout storing 3 children: + * + * # <---------------- node range ---------------------------> # + * # # <-- keys ---> # <---- laddrs -----------> # + * # free space: # |<~># |<~># + * # # | # | # + * # | num_ # key | key | # laddr | laddr | laddr | # + * # header | keys # 0 | 1 |...# 0 | 1 | 2 |...# + */ +// TODO: decide by NODE_BLOCK_SIZE, sizeof(snap_gen_t), sizeof(laddr_t) +static constexpr unsigned MAX_NUM_KEYS_I3 = 170u; +template <unsigned MAX_NUM_KEYS> +struct _internal_fields_3_t { + using key_get_type = const snap_gen_t&; + using me_t = _internal_fields_3_t<MAX_NUM_KEYS>; + // TODO: decide by NODE_BLOCK_SIZE, sizeof(snap_gen_t), sizeof(laddr_t) + using num_keys_t = uint8_t; + static constexpr field_type_t FIELD_TYPE = field_type_t::N3; + static constexpr node_offset_t SIZE = sizeof(me_t); + static constexpr node_offset_t HEADER_SIZE = + sizeof(node_header_t) + sizeof(num_keys_t); + static constexpr node_offset_t ITEM_OVERHEAD = 0u; + + bool is_level_tail() const { return header.get_is_level_tail(); } + node_offset_t total_size() const { + if (is_level_tail()) { + return SIZE - sizeof(snap_gen_t); + } else { + return SIZE; + } + } + key_get_type get_key(index_t index) const { + assert(index < num_keys); + return keys[index]; + } + template <node_type_t NODE_TYPE> + std::enable_if_t<NODE_TYPE == node_type_t::INTERNAL, node_offset_t> + free_size_before(index_t index) const { + assert(index <= num_keys); + assert(num_keys <= (is_level_tail() ? 
MAX_NUM_KEYS - 1 : MAX_NUM_KEYS)); + auto free = (MAX_NUM_KEYS - index) * (sizeof(snap_gen_t) + sizeof(laddr_t)); + if (is_level_tail() && index == num_keys) { + free -= (sizeof(snap_gen_t) + sizeof(laddr_t)); + } + assert(free < SIZE); + return free; + } + + static node_offset_t estimate_insert_one() { + return sizeof(snap_gen_t) + sizeof(laddr_t); + } + template <KeyT KT> + static void insert_at( + NodeExtentMutable& mut, const full_key_t<KT>& key, + const me_t& node, index_t index, node_offset_t size_right) { + ceph_abort("not implemented"); + } + static void update_size_at( + NodeExtentMutable& mut, const me_t& node, index_t index, int change) { + ceph_abort("not implemented"); + } + + node_header_t header; + num_keys_t num_keys = 0u; + snap_gen_t keys[MAX_NUM_KEYS]; + laddr_packed_t child_addrs[MAX_NUM_KEYS]; +} __attribute__((packed)); +static_assert(_internal_fields_3_t<MAX_NUM_KEYS_I3>::SIZE <= NODE_BLOCK_SIZE && + _internal_fields_3_t<MAX_NUM_KEYS_I3 + 1>::SIZE > NODE_BLOCK_SIZE); +using internal_fields_3_t = _internal_fields_3_t<MAX_NUM_KEYS_I3>; + +using leaf_fields_3_t = _node_fields_013_t<slot_3_t>; + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage.h b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage.h new file mode 100644 index 000000000..cac167a98 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage.h @@ -0,0 +1,2186 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <cassert> +#include <optional> +#include <ostream> +#include <sstream> +#include <type_traits> + +#include "common/likely.h" + +#include "sub_items_stage.h" +#include "item_iterator_stage.h" + +namespace crimson::os::seastore::onode { + +struct search_result_bs_t { + index_t index; + MatchKindBS match; +}; +template <typename FGetKey> +search_result_bs_t binary_search( + const full_key_t<KeyT::HOBJ>& key, + index_t begin, index_t end, FGetKey&& f_get_key) { + assert(begin <= end); + while (begin < end) { + auto total = begin + end; + auto mid = total >> 1; + // do not copy if return value is reference + decltype(f_get_key(mid)) target = f_get_key(mid); + auto match = compare_to<KeyT::HOBJ>(key, target); + if (match == MatchKindCMP::LT) { + end = mid; + } else if (match == MatchKindCMP::GT) { + begin = mid + 1; + } else { + return {mid, MatchKindBS::EQ}; + } + } + return {begin , MatchKindBS::NE}; +} + +template <typename PivotType, typename FGet> +search_result_bs_t binary_search_r( + index_t rend, index_t rbegin, FGet&& f_get, const PivotType& key) { + assert(rend <= rbegin); + while (rend < rbegin) { + auto total = rend + rbegin + 1; + auto mid = total >> 1; + // do not copy if return value is reference + decltype(f_get(mid)) target = f_get(mid); + int match = target - key; + if (match < 0) { + rend = mid; + } else if (match > 0) { + rbegin = mid - 1; + } else { + return {mid, MatchKindBS::EQ}; + } + } + return {rbegin, MatchKindBS::NE}; +} + +inline bool matchable(field_type_t type, match_stat_t mstat) { + assert(mstat >= MSTAT_MIN && mstat <= MSTAT_MAX); + /* + * compressed prefix by field type: + * N0: NONE + * N1: pool/shard + * N2: pool/shard crush + * N3: pool/shard crush ns/oid + * + * if key matches the node's compressed prefix, return true + * else, return false + */ +#ifndef NDEBUG + if (mstat == MSTAT_END) { + assert(type == field_type_t::N0); + } +#endif + return mstat + to_unsigned(type) < 4; +} + +inline void assert_mstat( + const 
full_key_t<KeyT::HOBJ>& key, + const full_key_t<KeyT::VIEW>& index, + match_stat_t mstat) { + assert(mstat >= MSTAT_MIN && mstat <= MSTAT_LT2); + // key < index ... + switch (mstat) { + case MSTAT_EQ: + break; + case MSTAT_LT0: + assert(compare_to<KeyT::HOBJ>(key, index.snap_gen_packed()) == MatchKindCMP::LT); + break; + case MSTAT_LT1: + assert(compare_to<KeyT::HOBJ>(key, index.ns_oid_view()) == MatchKindCMP::LT); + break; + case MSTAT_LT2: + if (index.has_shard_pool()) { + assert(compare_to<KeyT::HOBJ>(key, shard_pool_crush_t{ + index.shard_pool_packed(), index.crush_packed()}) == MatchKindCMP::LT); + } else { + assert(compare_to<KeyT::HOBJ>(key, index.crush_packed()) == MatchKindCMP::LT); + } + break; + default: + ceph_abort("impossible path"); + } + // key == index ... + switch (mstat) { + case MSTAT_EQ: + assert(compare_to<KeyT::HOBJ>(key, index.snap_gen_packed()) == MatchKindCMP::EQ); + case MSTAT_LT0: + if (!index.has_ns_oid()) + break; + assert(index.ns_oid_view().type() == ns_oid_view_t::Type::MAX || + compare_to<KeyT::HOBJ>(key, index.ns_oid_view()) == MatchKindCMP::EQ); + case MSTAT_LT1: + if (!index.has_crush()) + break; + assert(compare_to<KeyT::HOBJ>(key, index.crush_packed()) == MatchKindCMP::EQ); + if (!index.has_shard_pool()) + break; + assert(compare_to<KeyT::HOBJ>(key, index.shard_pool_packed()) == MatchKindCMP::EQ); + default: + break; + } +} + +#define NXT_STAGE_T staged<next_param_t> + +enum class TrimType { BEFORE, AFTER, AT }; + +/** + * staged + * + * Implements recursive logic that modifies or reads the node layout + * (N0/N1/N2/N3 * LEAF/INTERNAL) with the multi-stage design. The specific + * stage implementation is flexible. So the implementations for different + * stages can be assembled independently, as long as they follow the + * definitions of container interfaces. + * + * Multi-stage is designed to index different portions of onode keys + * stage-by-stage. There are at most 3 stages for a node: + * - STAGE_LEFT: index shard-pool-crush for N0, or index crush for N1 node; + * - STAGE_STRING: index ns-oid for N0/N1/N2 nodes; + * - STAGE_RIGHT: index snap-gen for N0/N1/N2/N3 nodes; + * + * The intention is to consolidate the high-level indexing implementations at + * the level of stage, so we don't need to write them repeatedly for every + * stage and for every node type. + */ +template <typename Params> +struct staged { + static_assert(Params::STAGE >= STAGE_BOTTOM); + static_assert(Params::STAGE <= STAGE_TOP); + using container_t = typename Params::container_t; + using key_get_type = typename container_t::key_get_type; + using next_param_t = typename Params::next_param_t; + using position_t = staged_position_t<Params::STAGE>; + using result_t = staged_result_t<Params::NODE_TYPE, Params::STAGE>; + using value_t = value_type_t<Params::NODE_TYPE>; + static constexpr auto CONTAINER_TYPE = container_t::CONTAINER_TYPE; + static constexpr bool IS_BOTTOM = (Params::STAGE == STAGE_BOTTOM); + static constexpr auto NODE_TYPE = Params::NODE_TYPE; + static constexpr auto STAGE = Params::STAGE; + + template <bool is_exclusive> + static void _left_or_right(index_t& split_index, index_t insert_index, + std::optional<bool>& is_insert_left) { + assert(!is_insert_left.has_value()); + assert(is_valid_index(split_index)); + if constexpr (is_exclusive) { + if (split_index <= insert_index) { + // ...[s_index-1] |!| (i_index) [s_index]... + // offset i_position to right + is_insert_left = false; + } else { + // ...[s_index-1] (i_index)) |?[s_index]| ... 
+ // ...(i_index)...[s_index-1] |?[s_index]| ... + is_insert_left = true; + --split_index; + } + } else { + if (split_index < insert_index) { + // ...[s_index-1] |?[s_index]| ...[(i_index)[s_index_k]... + is_insert_left = false; + } else if (split_index > insert_index) { + // ...[(i_index)s_index-1] |?[s_index]| ... + // ...[(i_index)s_index_k]...[s_index-1] |?[s_index]| ... + is_insert_left = true; + } else { + // ...[s_index-1] |?[(i_index)s_index]| ... + // i_to_left = std::nullopt; + } + } + } + + template <ContainerType CTYPE, typename Enable = void> class _iterator_t; + template <ContainerType CTYPE> + class _iterator_t<CTYPE, std::enable_if_t<CTYPE == ContainerType::INDEXABLE>> { + /* + * indexable container type system: + * CONTAINER_TYPE = ContainerType::INDEXABLE + * keys() const -> index_t + * operator[](index_t) const -> key_get_type + * size_before(index_t) const -> node_offset_t + * size_overhead_at(index_t) const -> node_offset_t + * (IS_BOTTOM) get_p_value(index_t) const -> const value_t* + * (!IS_BOTTOM) size_to_nxt_at(index_t) const -> node_offset_t + * (!IS_BOTTOM) get_nxt_container(index_t) const + * encode(p_node_start, encoded) + * decode(p_node_start, delta) -> container_t + * static: + * header_size() -> node_offset_t + * estimate_insert(key, value) -> node_offset_t + * (IS_BOTTOM) insert_at(mut, src, key, value, + * index, size, p_left_bound) -> const value_t* + * (!IS_BOTTOM) insert_prefix_at(mut, src, key, + * index, size, p_left_bound) -> memory_range_t + * (!IS_BOTTOM) update_size_at(mut, src, index, size) + * trim_until(mut, container, index) -> trim_size + * (!IS_BOTTOM) trim_at(mut, container, index, trimmed) -> trim_size + * + * Appender::append(const container_t& src, from, items) + */ + public: + using me_t = _iterator_t<CTYPE>; + + _iterator_t(const container_t& container) : container{container} { + assert(container.keys()); + } + + index_t index() const { + return _index; + } + key_get_type get_key() const { + assert(!is_end()); + return container[_index]; + } + node_offset_t size_to_nxt() const { + assert(!is_end()); + return container.size_to_nxt_at(_index); + } + template <typename T = typename NXT_STAGE_T::container_t> + std::enable_if_t<!IS_BOTTOM, T> get_nxt_container() const { + assert(!is_end()); + return container.get_nxt_container(_index); + } + template <typename T = value_t> + std::enable_if_t<IS_BOTTOM, const T*> get_p_value() const { + assert(!is_end()); + return container.get_p_value(_index); + } + bool is_last() const { + return _index + 1 == container.keys(); + } + bool is_end() const { return _index == container.keys(); } + node_offset_t size() const { + assert(!is_end()); + assert(header_size() == container.size_before(0)); + assert(container.size_before(_index + 1) > container.size_before(_index)); + return container.size_before(_index + 1) - + container.size_before(_index); + } + node_offset_t size_overhead() const { + assert(!is_end()); + return container.size_overhead_at(_index); + } + + me_t& operator++() { + assert(!is_end()); + assert(!is_last()); + ++_index; + return *this; + } + void seek_at(index_t index) { + assert(index < container.keys()); + seek_till_end(index); + } + void seek_till_end(index_t index) { + assert(!is_end()); + assert(this->index() == 0); + assert(index <= container.keys()); + _index = index; + } + void seek_last() { + assert(!is_end()); + assert(index() == 0); + _index = container.keys() - 1; + } + void set_end() { + assert(!is_end()); + assert(is_last()); + ++_index; + } + // Note: possible to return an 
end iterator + MatchKindBS seek(const full_key_t<KeyT::HOBJ>& key, bool exclude_last) { + assert(!is_end()); + assert(index() == 0); + index_t end_index = container.keys(); + if (exclude_last) { + assert(end_index); + --end_index; + assert(compare_to<KeyT::HOBJ>(key, container[end_index]) == MatchKindCMP::LT); + } + auto ret = binary_search(key, _index, end_index, + [this] (index_t index) { return container[index]; }); + _index = ret.index; + return ret.match; + } + + template <KeyT KT, typename T = value_t> + std::enable_if_t<IS_BOTTOM, const T*> insert( + NodeExtentMutable& mut, const full_key_t<KT>& key, + const value_t& value, node_offset_t insert_size, const char* p_left_bound) { + return container_t::template insert_at<KT>( + mut, container, key, value, _index, insert_size, p_left_bound); + } + + template <KeyT KT, typename T = memory_range_t> + std::enable_if_t<!IS_BOTTOM, T> insert_prefix( + NodeExtentMutable& mut, const full_key_t<KT>& key, + node_offset_t size, const char* p_left_bound) { + return container_t::template insert_prefix_at<KT>( + mut, container, key, _index, size, p_left_bound); + } + + template <typename T = void> + std::enable_if_t<!IS_BOTTOM, T> + update_size(NodeExtentMutable& mut, node_offset_t insert_size) { + assert(!is_end()); + container_t::update_size_at(mut, container, _index, insert_size); + } + + // Note: possible to return an end iterator when is_exclusive is true + template <bool is_exclusive> + size_t seek_split_inserted( + size_t start_size, size_t extra_size, size_t target_size, + index_t& insert_index, size_t insert_size, + std::optional<bool>& is_insert_left) { + assert(!is_end()); + assert(index() == 0); + // replace insert_index placeholder + if constexpr (!is_exclusive) { + if (insert_index == INDEX_LAST) { + insert_index = container.keys() - 1; + } + } else { + if (insert_index == INDEX_END) { + insert_index = container.keys(); + } + } + assert(insert_index <= container.keys()); + + auto start_size_1 = start_size + extra_size; + auto f_get_used_size = [this, start_size, start_size_1, + insert_index, insert_size] (index_t index) { + size_t current_size; + if (unlikely(index == 0)) { + current_size = start_size; + } else { + current_size = start_size_1; + if (index > insert_index) { + current_size += insert_size; + if constexpr (is_exclusive) { + --index; + } + } + // already includes header size + current_size += container.size_before(index); + } + return current_size; + }; + index_t s_end; + if constexpr (is_exclusive) { + s_end = container.keys(); + } else { + s_end = container.keys() - 1; + } + _index = binary_search_r(0, s_end, f_get_used_size, target_size).index; + size_t current_size = f_get_used_size(_index); + assert(current_size <= target_size); + + _left_or_right<is_exclusive>(_index, insert_index, is_insert_left); + return current_size; + } + + size_t seek_split(size_t start_size, size_t extra_size, size_t target_size) { + assert(!is_end()); + assert(index() == 0); + auto start_size_1 = start_size + extra_size; + auto f_get_used_size = [this, start_size, start_size_1] (index_t index) { + size_t current_size; + if (unlikely(index == 0)) { + current_size = start_size; + } else { + // already includes header size + current_size = start_size_1 + container.size_before(index); + } + return current_size; + }; + _index = binary_search_r( + 0, container.keys() - 1, f_get_used_size, target_size).index; + size_t current_size = f_get_used_size(_index); + assert(current_size <= target_size); + return current_size; + } + + // Note: possible to 
return an end iterater if to_index == INDEX_END + template <KeyT KT> + void copy_out_until( + typename container_t::template Appender<KT>& appender, index_t& to_index) { + auto num_keys = container.keys(); + index_t items; + if (to_index == INDEX_END) { + items = num_keys - _index; + appender.append(container, _index, items); + _index = num_keys; + to_index = _index; + } else if (to_index == INDEX_LAST) { + assert(!is_end()); + items = num_keys - 1 - _index; + appender.append(container, _index, items); + _index = num_keys - 1; + to_index = _index; + } else { + assert(_index <= to_index); + assert(to_index <= num_keys); + items = to_index - _index; + appender.append(container, _index, items); + _index = to_index; + } + } + + node_offset_t trim_until(NodeExtentMutable& mut) { + return container_t::trim_until(mut, container, _index); + } + + template <typename T = node_offset_t> + std::enable_if_t<!IS_BOTTOM, T> + trim_at(NodeExtentMutable& mut, node_offset_t trimmed) { + return container_t::trim_at(mut, container, _index, trimmed); + } + + void encode(const char* p_node_start, ceph::bufferlist& encoded) const { + container.encode(p_node_start, encoded); + ceph::encode(_index, encoded); + } + + static me_t decode(const char* p_node_start, + ceph::bufferlist::const_iterator& delta) { + auto container = container_t::decode(p_node_start, delta); + auto ret = me_t(container); + index_t index; + ceph::decode(index, delta); + ret.seek_till_end(index); + return ret; + } + + static node_offset_t header_size() { + return container_t::header_size(); + } + + template <KeyT KT> + static node_offset_t estimate_insert( + const full_key_t<KT>& key, const value_t& value) { + return container_t::template estimate_insert<KT>(key, value); + } + + private: + container_t container; + index_t _index = 0; + }; + + template <ContainerType CTYPE> + class _iterator_t<CTYPE, std::enable_if_t<CTYPE == ContainerType::ITERATIVE>> { + /* + * iterative container type system (!IS_BOTTOM): + * CONTAINER_TYPE = ContainerType::ITERATIVE + * index() const -> index_t + * get_key() const -> key_get_type + * size() const -> node_offset_t + * size_to_nxt() const -> node_offset_t + * size_overhead() const -> node_offset_t + * get_nxt_container() const + * has_next() const -> bool + * encode(p_node_start, encoded) + * decode(p_node_start, delta) -> container_t + * operator++() + * static: + * header_size() -> node_offset_t + * estimate_insert(key, value) -> node_offset_t + * insert_prefix(mut, src, key, is_end, size, p_left_bound) -> memory_range_t + * update_size(mut, src, size) + * trim_until(mut, container) -> trim_size + * trim_at(mut, container, trimmed) -> trim_size + */ + // currently the iterative iterator is only implemented with STAGE_STRING + // for in-node space efficiency + static_assert(STAGE == STAGE_STRING); + public: + using me_t = _iterator_t<CTYPE>; + + _iterator_t(const container_t& container) : container{container} {} + + index_t index() const { + if (is_end()) { + return container.index() + 1; + } else { + return container.index(); + } + } + key_get_type get_key() const { + assert(!is_end()); + return container.get_key(); + } + node_offset_t size_to_nxt() const { + assert(!is_end()); + return container.size_to_nxt(); + } + const typename NXT_STAGE_T::container_t get_nxt_container() const { + assert(!is_end()); + return container.get_nxt_container(); + } + bool is_last() const { + assert(!is_end()); + return !container.has_next(); + } + bool is_end() const { +#ifndef NDEBUG + if (_is_end) { + 
assert(!container.has_next()); + } +#endif + return _is_end; + } + node_offset_t size() const { + assert(!is_end()); + return container.size(); + } + node_offset_t size_overhead() const { + assert(!is_end()); + return container.size_overhead(); + } + + me_t& operator++() { + assert(!is_end()); + assert(!is_last()); + ++container; + return *this; + } + void seek_at(index_t index) { + assert(!is_end()); + assert(this->index() == 0); + while (index > 0) { + assert(container.has_next()); + ++container; + --index; + } + } + void seek_till_end(index_t index) { + assert(!is_end()); + assert(this->index() == 0); + while (index > 0) { + if (!container.has_next()) { + assert(index == 1); + set_end(); + break; + } + ++container; + --index; + } + } + void seek_last() { + assert(!is_end()); + assert(index() == 0); + while (container.has_next()) { + ++container; + } + } + void set_end() { + assert(!is_end()); + assert(is_last()); + _is_end = true; + } + // Note: possible to return an end iterator + MatchKindBS seek(const full_key_t<KeyT::HOBJ>& key, bool exclude_last) { + assert(!is_end()); + assert(index() == 0); + do { + if (exclude_last && is_last()) { + assert(compare_to<KeyT::HOBJ>(key, get_key()) == MatchKindCMP::LT); + return MatchKindBS::NE; + } + auto match = compare_to<KeyT::HOBJ>(key, get_key()); + if (match == MatchKindCMP::LT) { + return MatchKindBS::NE; + } else if (match == MatchKindCMP::EQ) { + return MatchKindBS::EQ; + } else { + if (container.has_next()) { + ++container; + } else { + // end + break; + } + } + } while (true); + assert(!exclude_last); + set_end(); + return MatchKindBS::NE; + } + + template <KeyT KT> + memory_range_t insert_prefix( + NodeExtentMutable& mut, const full_key_t<KT>& key, + node_offset_t size, const char* p_left_bound) { + return container_t::template insert_prefix<KT>( + mut, container, key, is_end(), size, p_left_bound); + } + + void update_size(NodeExtentMutable& mut, node_offset_t insert_size) { + assert(!is_end()); + container_t::update_size(mut, container, insert_size); + } + + // Note: possible to return an end iterator when is_exclusive is true + // insert_index can still be INDEX_LAST or INDEX_END + template <bool is_exclusive> + size_t seek_split_inserted( + size_t start_size, size_t extra_size, size_t target_size, + index_t& insert_index, size_t insert_size, + std::optional<bool>& is_insert_left) { + assert(!is_end()); + assert(index() == 0); + size_t current_size = start_size; + index_t split_index = 0; + extra_size += header_size(); + do { + if constexpr (!is_exclusive) { + if (is_last()) { + assert(split_index == index()); + if (insert_index == INDEX_LAST) { + insert_index = index(); + } + assert(insert_index <= index()); + break; + } + } + + size_t nxt_size = current_size; + if (split_index == 0) { + nxt_size += extra_size; + } + if (split_index == insert_index) { + nxt_size += insert_size; + if constexpr (is_exclusive) { + if (nxt_size > target_size) { + break; + } + current_size = nxt_size; + ++split_index; + } + } + nxt_size += size(); + if (nxt_size > target_size) { + break; + } + current_size = nxt_size; + + if constexpr (is_exclusive) { + if (is_last()) { + assert(split_index == index()); + set_end(); + split_index = index(); + if (insert_index == INDEX_END) { + insert_index = index(); + } + assert(insert_index == index()); + break; + } else { + ++(*this); + ++split_index; + } + } else { + ++(*this); + ++split_index; + } + } while (true); + assert(current_size <= target_size); + + _left_or_right<is_exclusive>(split_index, insert_index, 
is_insert_left); + assert(split_index == index()); + return current_size; + } + + size_t seek_split(size_t start_size, size_t extra_size, size_t target_size) { + assert(!is_end()); + assert(index() == 0); + size_t current_size = start_size; + do { + if (is_last()) { + break; + } + + size_t nxt_size = current_size; + if (index() == 0) { + nxt_size += extra_size; + } + nxt_size += size(); + if (nxt_size > target_size) { + break; + } + current_size = nxt_size; + ++(*this); + } while (true); + assert(current_size <= target_size); + return current_size; + } + + // Note: possible to return an end iterater if to_index == INDEX_END + template <KeyT KT> + void copy_out_until( + typename container_t::template Appender<KT>& appender, index_t& to_index) { + if (is_end()) { + assert(!container.has_next()); + if (to_index == INDEX_END) { + to_index = index(); + } + assert(to_index == index()); + return; + } + index_t items; + if (to_index == INDEX_END || to_index == INDEX_LAST) { + items = to_index; + } else { + assert(is_valid_index(to_index)); + assert(index() <= to_index); + items = to_index - index(); + } + if (appender.append(container, items)) { + set_end(); + } + to_index = index(); + } + + node_offset_t trim_until(NodeExtentMutable& mut) { + if (is_end()) { + return 0; + } + return container_t::trim_until(mut, container); + } + + node_offset_t trim_at(NodeExtentMutable& mut, node_offset_t trimmed) { + assert(!is_end()); + return container_t::trim_at(mut, container, trimmed); + } + + void encode(const char* p_node_start, ceph::bufferlist& encoded) const { + container.encode(p_node_start, encoded); + uint8_t is_end = _is_end; + ceph::encode(is_end, encoded); + } + + static me_t decode(const char* p_node_start, + ceph::bufferlist::const_iterator& delta) { + auto container = container_t::decode(p_node_start, delta); + auto ret = me_t(container); + uint8_t is_end; + ceph::decode(is_end, delta); + if (is_end) { + ret.set_end(); + } + return ret; + } + + static node_offset_t header_size() { + return container_t::header_size(); + } + + template <KeyT KT> + static node_offset_t estimate_insert(const full_key_t<KT>& key, const value_t& value) { + return container_t::template estimate_insert<KT>(key, value); + } + + private: + container_t container; + bool _is_end = false; + }; + + /* + * iterator_t encapsulates both indexable and iterative implementations + * from a *non-empty* container. 
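+ * Both _iterator_t specializations above provide the interface below, so the
+ * staged algorithms can stay agnostic of the underlying container type: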
+ * cstr(const container_t&) + * access: + * index() -> index_t + * get_key() -> key_get_type (const reference or value type) + * is_last() -> bool + * is_end() -> bool + * size() -> node_offset_t + * size_overhead() -> node_offset_t + * (IS_BOTTOM) get_p_value() -> const value_t* + * (!IS_BOTTOM) get_nxt_container() -> nxt_stage::container_t + * (!IS_BOTTOM) size_to_nxt() -> node_offset_t + * seek: + * operator++() -> iterator_t& + * seek_at(index) + * seek_till_end(index) + * seek_last() + * set_end() + * seek(key, exclude_last) -> MatchKindBS + * insert: + * (IS_BOTTOM) insert(mut, key, value, size, p_left_bound) -> p_value + * (!IS_BOTTOM) insert_prefix(mut, key, size, p_left_bound) -> memory_range_t + * (!IS_BOTTOM) update_size(mut, size) + * split: + * seek_split_inserted<bool is_exclusive>( + * start_size, extra_size, target_size, insert_index, insert_size, + * std::optional<bool>& is_insert_left) + * -> insert to left/right/unknown (!exclusive) + * -> insert to left/right (exclusive, can be end) + * -> split_size + * seek_split(start_size, extra_size, target_size) -> split_size + * copy_out_until(appender, to_index) (can be end) + * trim_until(mut) -> trim_size + * (!IS_BOTTOM) trim_at(mut, trimmed) -> trim_size + * denc: + * encode(p_node_start, encoded) + * decode(p_node_start, delta) -> iterator_t + * static: + * header_size() -> node_offset_t + * estimate_insert(key, value) -> node_offset_t + */ + using iterator_t = _iterator_t<CONTAINER_TYPE>; + /* TODO: detailed comments + * - trim_until(mut) -> trim_size + * * keep 0 to i - 1, and remove the rest, return the size trimmed. + * * if this is the end iterator, do nothing and return 0. + * * if this is the start iterator, normally needs to go to the higher + * stage to trim the entire container. + * - trim_at(mut, trimmed) -> trim_size + * * trim happens inside the current iterator, causing the size reduced by + * <trimmed>, return the total size trimmed. + */ + + /* + * Lookup internals (hide?) 
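+ * A brief map of the helpers below (summarized from their definitions):
+ * - smallest_result(): build a result at the begin position of the next stage
+ * - nxt_lower_bound(): recurse lower_bound() into the next-stage container
+ * - lookup_largest_slot(): locate the right-most slot, recursing stage by stage
+ * - get_p_value()/get_key_view(): access a slot by a known position
+ * - lower_bound(): lower-bound search in this stage, guided by MatchHistory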
+ */ + + template <bool GET_KEY> + static result_t smallest_result( + const iterator_t& iter, full_key_t<KeyT::VIEW>* index_key) { + static_assert(!IS_BOTTOM); + assert(!iter.is_end()); + auto pos_smallest = NXT_STAGE_T::position_t::begin(); + auto nxt_container = iter.get_nxt_container(); + auto value_ptr = NXT_STAGE_T::template get_p_value<GET_KEY>( + nxt_container, pos_smallest, index_key); + if constexpr (GET_KEY) { + index_key->set(iter.get_key()); + } + return result_t{{iter.index(), pos_smallest}, value_ptr, STAGE}; + } + + template <bool GET_KEY> + static result_t nxt_lower_bound( + const full_key_t<KeyT::HOBJ>& key, iterator_t& iter, + MatchHistory& history, full_key_t<KeyT::VIEW>* index_key) { + static_assert(!IS_BOTTOM); + assert(!iter.is_end()); + auto nxt_container = iter.get_nxt_container(); + auto nxt_result = NXT_STAGE_T::template lower_bound<GET_KEY>( + nxt_container, key, history, index_key); + if (nxt_result.is_end()) { + if (iter.is_last()) { + return result_t::end(); + } else { + return smallest_result<GET_KEY>(++iter, index_key); + } + } else { + if constexpr (GET_KEY) { + index_key->set(iter.get_key()); + } + return result_t::from_nxt(iter.index(), nxt_result); + } + } + + template <bool GET_POS, bool GET_KEY, bool GET_VAL> + static void lookup_largest_slot( + const container_t& container, position_t* p_position, + full_key_t<KeyT::VIEW>* p_index_key, const value_t** pp_value) { + auto iter = iterator_t(container); + iter.seek_last(); + if constexpr (GET_KEY) { + assert(p_index_key); + p_index_key->set(iter.get_key()); + } + if constexpr (GET_POS) { + assert(p_position); + p_position->index = iter.index(); + } + if constexpr (IS_BOTTOM) { + if constexpr (GET_VAL) { + assert(pp_value); + *pp_value = iter.get_p_value(); + } + } else { + auto nxt_container = iter.get_nxt_container(); + if constexpr (GET_POS) { + NXT_STAGE_T::template lookup_largest_slot<true, GET_KEY, GET_VAL>( + nxt_container, &p_position->nxt, p_index_key, pp_value); + } else { + NXT_STAGE_T::template lookup_largest_slot<false, GET_KEY, GET_VAL>( + nxt_container, nullptr, p_index_key, pp_value); + } + } + } + + template <bool GET_KEY = false> + static const value_t* get_p_value( + const container_t& container, const position_t& position, + full_key_t<KeyT::VIEW>* index_key = nullptr) { + auto iter = iterator_t(container); + iter.seek_at(position.index); + if constexpr (GET_KEY) { + index_key->set(iter.get_key()); + } + if constexpr (!IS_BOTTOM) { + auto nxt_container = iter.get_nxt_container(); + return NXT_STAGE_T::template get_p_value<GET_KEY>( + nxt_container, position.nxt, index_key); + } else { + return iter.get_p_value(); + } + } + + static void get_key_view( + const container_t& container, + const position_t& position, + full_key_t<KeyT::VIEW>& index_key) { + auto iter = iterator_t(container); + iter.seek_at(position.index); + index_key.set(iter.get_key()); + if constexpr (!IS_BOTTOM) { + auto nxt_container = iter.get_nxt_container(); + return NXT_STAGE_T::get_key_view(nxt_container, position.nxt, index_key); + } + } + + template <bool GET_KEY = false> + static result_t lower_bound( + const container_t& container, + const full_key_t<KeyT::HOBJ>& key, + MatchHistory& history, + full_key_t<KeyT::VIEW>* index_key = nullptr) { + bool exclude_last = false; + if (history.get<STAGE>().has_value()) { + if (*history.get<STAGE>() == MatchKindCMP::EQ) { + // lookup is short-circuited + if constexpr (!IS_BOTTOM) { + assert(history.get<STAGE - 1>().has_value()); + if (history.is_GT<STAGE - 1>()) { + auto 
iter = iterator_t(container); + bool test_key_equal; + if constexpr (STAGE == STAGE_STRING) { + // TODO(cross-node string dedup) + // test_key_equal = (iter.get_key().type() == ns_oid_view_t::Type::MIN); + auto cmp = compare_to<KeyT::HOBJ>(key, iter.get_key()); + assert(cmp != MatchKindCMP::GT); + test_key_equal = (cmp == MatchKindCMP::EQ); + } else { + auto cmp = compare_to<KeyT::HOBJ>(key, iter.get_key()); + // From history, key[stage] == parent[stage][index - 1] + // which should be the smallest possible value for all + // index[stage][*] + assert(cmp != MatchKindCMP::GT); + test_key_equal = (cmp == MatchKindCMP::EQ); + } + if (test_key_equal) { + return nxt_lower_bound<GET_KEY>(key, iter, history, index_key); + } else { + // key[stage] < index[stage][left-most] + return smallest_result<GET_KEY>(iter, index_key); + } + } + } + // IS_BOTTOM || !history.is_GT<STAGE - 1>() + auto iter = iterator_t(container); + iter.seek_last(); + if constexpr (STAGE == STAGE_STRING) { + // TODO(cross-node string dedup) + // assert(iter.get_key().type() == ns_oid_view_t::Type::MAX); + assert(compare_to<KeyT::HOBJ>(key, iter.get_key()) == MatchKindCMP::EQ); + } else { + assert(compare_to<KeyT::HOBJ>(key, iter.get_key()) == MatchKindCMP::EQ); + } + if constexpr (GET_KEY) { + index_key->set(iter.get_key()); + } + if constexpr (IS_BOTTOM) { + auto value_ptr = iter.get_p_value(); + return result_t{{iter.index()}, value_ptr, MSTAT_EQ}; + } else { + auto nxt_container = iter.get_nxt_container(); + auto nxt_result = NXT_STAGE_T::template lower_bound<GET_KEY>( + nxt_container, key, history, index_key); + // !history.is_GT<STAGE - 1>() means + // key[stage+1 ...] <= index[stage+1 ...][*] + assert(!nxt_result.is_end()); + return result_t::from_nxt(iter.index(), nxt_result); + } + } else if (*history.get<STAGE>() == MatchKindCMP::LT) { + exclude_last = true; + } + } + auto iter = iterator_t(container); + auto bs_match = iter.seek(key, exclude_last); + if (iter.is_end()) { + assert(!exclude_last); + assert(bs_match == MatchKindBS::NE); + history.set<STAGE>(MatchKindCMP::GT); + return result_t::end(); + } + history.set<STAGE>(bs_match == MatchKindBS::EQ ? + MatchKindCMP::EQ : MatchKindCMP::LT); + if constexpr (IS_BOTTOM) { + if constexpr (GET_KEY) { + index_key->set(iter.get_key()); + } + auto value_ptr = iter.get_p_value(); + return result_t{{iter.index()}, value_ptr, + (bs_match == MatchKindBS::EQ ? 
MSTAT_EQ : MSTAT_LT0)}; + } else { + if (bs_match == MatchKindBS::EQ) { + return nxt_lower_bound<GET_KEY>(key, iter, history, index_key); + } else { + return smallest_result<GET_KEY>(iter, index_key); + } + } + } + + template <KeyT KT> + static node_offset_t insert_size(const full_key_t<KT>& key, const value_t& value) { + if constexpr (IS_BOTTOM) { + return iterator_t::template estimate_insert<KT>(key, value); + } else { + return iterator_t::template estimate_insert<KT>(key, value) + + NXT_STAGE_T::iterator_t::header_size() + + NXT_STAGE_T::template insert_size<KT>(key, value); + } + } + + template <KeyT KT> + static node_offset_t insert_size_at( + match_stage_t stage, const full_key_t<KeyT::HOBJ>& key, const value_t& value) { + if (stage == STAGE) { + return insert_size<KT>(key, value); + } else { + assert(stage < STAGE); + return NXT_STAGE_T::template insert_size_at<KT>(stage, key, value); + } + } + + template <typename T = std::tuple<match_stage_t, node_offset_t>> + static std::enable_if_t<NODE_TYPE == node_type_t::INTERNAL, T> evaluate_insert( + const container_t& container, const full_key_t<KeyT::VIEW>& key, + const value_t& value, position_t& position, bool evaluate_last) { + auto iter = iterator_t(container); + auto& index = position.index; + if (evaluate_last || index == INDEX_END) { + iter.seek_last(); + index = iter.index(); + // evaluate the previous index + } else { + assert(is_valid_index(index)); + // evaluate the current index + iter.seek_at(index); + auto match = compare_to<KeyT::VIEW>(key, iter.get_key()); + if (match == MatchKindCMP::EQ) { + if constexpr (IS_BOTTOM) { + ceph_abort("insert conflict at current index!"); + } else { + // insert into the current index + auto nxt_container = iter.get_nxt_container(); + return NXT_STAGE_T::evaluate_insert( + nxt_container, key, value, position.nxt, false); + } + } else { + assert(match == MatchKindCMP::LT); + if (index == 0) { + // already the first index, so insert at the current index + return {STAGE, insert_size<KeyT::VIEW>(key, value)}; + } + --index; + iter = iterator_t(container); + iter.seek_at(index); + // proceed to evaluate the previous index + } + } + + // XXX(multi-type): when key is from a different type of node + auto match = compare_to<KeyT::VIEW>(key, iter.get_key()); + if (match == MatchKindCMP::GT) { + // key doesn't match both indexes, so insert at the current index + ++index; + return {STAGE, insert_size<KeyT::VIEW>(key, value)}; + } else { + assert(match == MatchKindCMP::EQ); + if constexpr (IS_BOTTOM) { + // ceph_abort? 
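+ // an EQ match at the bottom stage means the key is already indexed,
+ // so this insert is a conflict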
+ ceph_abort("insert conflict at the previous index!"); + } else { + // insert into the previous index + auto nxt_container = iter.get_nxt_container(); + return NXT_STAGE_T::evaluate_insert( + nxt_container, key, value, position.nxt, true); + } + } + } + + template <typename T = bool> + static std::enable_if_t<NODE_TYPE == node_type_t::LEAF, T> + compensate_insert_position_at(match_stage_t stage, position_t& position) { + auto& index = position.index; + if (stage == STAGE) { + assert(index == 0); + // insert at the end of the current stage + index = INDEX_END; + return true; + } else { + if constexpr (IS_BOTTOM) { + ceph_abort("impossible path"); + } else { + assert(stage < STAGE); + bool compensate = NXT_STAGE_T:: + compensate_insert_position_at(stage, position.nxt); + if (compensate) { + assert(is_valid_index(index)); + if (index == 0) { + // insert into the *last* index of the current stage + index = INDEX_LAST; + return true; + } else { + --index; + return false; + } + } else { + return false; + } + } + } + } + + static void patch_insert_end(position_t& insert_pos, match_stage_t insert_stage) { + assert(insert_stage <= STAGE); + if (insert_stage == STAGE) { + insert_pos.index = INDEX_END; + } else if constexpr (!IS_BOTTOM) { + insert_pos.index = INDEX_LAST; + NXT_STAGE_T::patch_insert_end(insert_pos.nxt, insert_stage); + } + } + + template <typename T = std::tuple<match_stage_t, node_offset_t>> + static std::enable_if_t<NODE_TYPE == node_type_t::LEAF, T> evaluate_insert( + const full_key_t<KeyT::HOBJ>& key, const onode_t& value, + const MatchHistory& history, match_stat_t mstat, position_t& position) { + match_stage_t insert_stage = STAGE_TOP; + while (*history.get_by_stage(insert_stage) == MatchKindCMP::EQ) { + assert(insert_stage != STAGE_BOTTOM && "insert conflict!"); + --insert_stage; + } + + if (history.is_GT()) { + if (position.is_end()) { + // no need to compensate insert position + assert(insert_stage <= STAGE && "impossible insert stage"); + } else if (position == position_t::begin()) { + // I must be short-circuited by staged::smallest_result() + // in staged::lower_bound(), so we need to rely on mstat instead + assert(mstat >= MSTAT_LT0 && mstat <= MSTAT_LT3); + if (mstat == MSTAT_LT0) { + insert_stage = STAGE_RIGHT; + } else if (mstat == MSTAT_LT1) { + insert_stage = STAGE_STRING; + } else { + insert_stage = STAGE_LEFT; + } + // XXX(multi-type): need to upgrade node type before inserting an + // incompatible index at front. 
+ assert(insert_stage <= STAGE && "incompatible insert"); + } else { + assert(insert_stage <= STAGE && "impossible insert stage"); + [[maybe_unused]] bool ret = compensate_insert_position_at(insert_stage, position); + assert(!ret); + } + } + + if (position.is_end()) { + patch_insert_end(position, insert_stage); + } + + node_offset_t insert_size = insert_size_at<KeyT::HOBJ>(insert_stage, key, value); + + return {insert_stage, insert_size}; + } + + template <KeyT KT> + static const value_t* insert_new( + NodeExtentMutable& mut, const memory_range_t& range, + const full_key_t<KT>& key, const value_t& value) { + char* p_insert = const_cast<char*>(range.p_end); + const value_t* p_value = nullptr; + StagedAppender<KT> appender; + appender.init(&mut, p_insert); + appender.append(key, value, p_value); + [[maybe_unused]] const char* p_insert_front = appender.wrap(); + assert(p_insert_front == range.p_start); + return p_value; + } + + template <KeyT KT, bool SPLIT> + static const value_t* proceed_insert_recursively( + NodeExtentMutable& mut, const container_t& container, + const full_key_t<KT>& key, const value_t& value, + position_t& position, match_stage_t& stage, + node_offset_t& _insert_size, const char* p_left_bound) { + // proceed insert from right to left + assert(stage <= STAGE); + auto iter = iterator_t(container); + auto& index = position.index; + + bool do_insert = false; + if (stage == STAGE) { + if (index == INDEX_END) { + iter.seek_last(); + iter.set_end(); + index = iter.index(); + } else { + assert(is_valid_index(index)); + iter.seek_till_end(index); + } + do_insert = true; + } else { // stage < STAGE + if (index == INDEX_LAST) { + iter.seek_last(); + index = iter.index(); + } else { + assert(is_valid_index(index)); + iter.seek_till_end(index); + } + if constexpr (SPLIT) { + if (iter.is_end()) { + // insert at the higher stage due to split + do_insert = true; + _insert_size = insert_size<KT>(key, value); + stage = STAGE; + } + } else { + assert(!iter.is_end()); + } + } + + if (do_insert) { + if constexpr (!IS_BOTTOM) { + position.nxt = position_t::nxt_t::begin(); + } + assert(_insert_size == insert_size<KT>(key, value)); + if constexpr (IS_BOTTOM) { + return iter.template insert<KT>( + mut, key, value, _insert_size, p_left_bound); + } else { + auto range = iter.template insert_prefix<KT>( + mut, key, _insert_size, p_left_bound); + return NXT_STAGE_T::template insert_new<KT>(mut, range, key, value); + } + } else { + if constexpr (!IS_BOTTOM) { + auto nxt_container = iter.get_nxt_container(); + auto p_value = NXT_STAGE_T::template proceed_insert_recursively<KT, SPLIT>( + mut, nxt_container, key, value, + position.nxt, stage, _insert_size, p_left_bound); + iter.update_size(mut, _insert_size); + return p_value; + } else { + ceph_abort("impossible path"); + } + } + } + + template <KeyT KT, bool SPLIT> + static const value_t* proceed_insert( + NodeExtentMutable& mut, const container_t& container, + const full_key_t<KT>& key, const value_t& value, + position_t& position, match_stage_t& stage, node_offset_t& _insert_size) { + auto p_left_bound = container.p_left_bound(); + if (unlikely(!container.keys())) { + if (position.is_end()) { + position = position_t::begin(); + assert(stage == STAGE); + assert(_insert_size == insert_size<KT>(key, value)); + } else if (position == position_t::begin()) { + // when insert into a trimmed and empty left node + stage = STAGE; + _insert_size = insert_size<KT>(key, value); + } else { + ceph_abort("impossible path"); + } + if constexpr (IS_BOTTOM) { + return 
container_t::template insert_at<KT>( + mut, container, key, value, 0, _insert_size, p_left_bound); + } else { + auto range = container_t::template insert_prefix_at<KT>( + mut, container, key, 0, _insert_size, p_left_bound); + return NXT_STAGE_T::template insert_new<KT>(mut, range, key, value); + } + } else { + return proceed_insert_recursively<KT, SPLIT>( + mut, container, key, value, + position, stage, _insert_size, p_left_bound); + } + } + + static std::ostream& dump(const container_t& container, + std::ostream& os, + const std::string& prefix, + size_t& size, + const char* p_start) { + auto iter = iterator_t(container); + assert(!iter.is_end()); + std::string prefix_blank(prefix.size(), ' '); + const std::string* p_prefix = &prefix; + size += iterator_t::header_size(); + do { + std::ostringstream sos; + sos << *p_prefix << iter.get_key() << ": "; + std::string i_prefix = sos.str(); + if constexpr (!IS_BOTTOM) { + auto nxt_container = iter.get_nxt_container(); + size += iter.size_to_nxt(); + NXT_STAGE_T::dump(nxt_container, os, i_prefix, size, p_start); + } else { + auto value_ptr = iter.get_p_value(); + int offset = reinterpret_cast<const char*>(value_ptr) - p_start; + size += iter.size(); + os << "\n" << i_prefix; + if constexpr (NODE_TYPE == node_type_t::LEAF) { + os << *value_ptr; + } else { + os << "0x" << std::hex << value_ptr->value << std::dec; + } + os << " " << size << "B" + << " @" << offset << "B"; + } + if (iter.is_last()) { + break; + } else { + ++iter; + p_prefix = &prefix_blank; + } + } while (true); + return os; + } + + static void validate(const container_t& container) { + auto iter = iterator_t(container); + assert(!iter.is_end()); + auto key = iter.get_key(); + do { + if constexpr (!IS_BOTTOM) { + auto nxt_container = iter.get_nxt_container(); + NXT_STAGE_T::validate(nxt_container); + } + if (iter.is_last()) { + break; + } else { + ++iter; + assert(compare_to(key, iter.get_key()) == MatchKindCMP::LT); + key = iter.get_key(); + } + } while (true); + } + + static void get_stats(const container_t& container, node_stats_t& stats, + full_key_t<KeyT::VIEW>& index_key) { + auto iter = iterator_t(container); + assert(!iter.is_end()); + stats.size_overhead += iterator_t::header_size(); + do { + index_key.replace(iter.get_key()); + stats.size_overhead += iter.size_overhead(); + if constexpr (!IS_BOTTOM) { + auto nxt_container = iter.get_nxt_container(); + NXT_STAGE_T::get_stats(nxt_container, stats, index_key); + } else { + ++stats.num_kvs; + size_t kv_logical_size = index_key.size_logical(); + size_t value_size; + if constexpr (NODE_TYPE == node_type_t::LEAF) { + value_size = iter.get_p_value()->size; + } else { + value_size = sizeof(value_t); + } + stats.size_value += value_size; + kv_logical_size += value_size; + stats.size_logical += kv_logical_size; + } + if (iter.is_last()) { + break; + } else { + ++iter; + } + } while (true); + } + + static bool next_position(const container_t& container, position_t& pos) { + auto iter = iterator_t(container); + assert(!iter.is_end()); + iter.seek_at(pos.index); + bool find_next; + if constexpr (!IS_BOTTOM) { + auto nxt_container = iter.get_nxt_container(); + find_next = NXT_STAGE_T::next_position(nxt_container, pos.nxt); + } else { + find_next = true; + } + if (find_next) { + if (iter.is_last()) { + return true; + } else { + pos.index = iter.index() + 1; + if constexpr (!IS_BOTTOM) { + pos.nxt = NXT_STAGE_T::position_t::begin(); + } + return false; + } + } else { + return false; + } + } + + struct _BaseEmpty {}; + class 
_BaseWithNxtIterator { + protected: + typename NXT_STAGE_T::StagedIterator _nxt; + }; + class StagedIterator + : std::conditional_t<IS_BOTTOM, _BaseEmpty, _BaseWithNxtIterator> { + public: + StagedIterator() = default; + bool valid() const { return iter.has_value(); } + index_t index() const { + return iter->index(); + } + bool is_end() const { return iter->is_end(); } + bool in_progress() const { + assert(valid()); + if constexpr (!IS_BOTTOM) { + if (this->_nxt.valid()) { + if (this->_nxt.index() == 0) { + return this->_nxt.in_progress(); + } else { + return true; + } + } else { + return false; + } + } else { + return false; + } + } + key_get_type get_key() const { return iter->get_key(); } + + iterator_t& get() { return *iter; } + void set(const container_t& container) { + assert(!valid()); + iter = iterator_t(container); + } + void set_end() { iter->set_end(); } + typename NXT_STAGE_T::StagedIterator& nxt() { + if constexpr (!IS_BOTTOM) { + if (!this->_nxt.valid()) { + auto nxt_container = iter->get_nxt_container(); + this->_nxt.set(nxt_container); + } + return this->_nxt; + } else { + ceph_abort("impossible path"); + } + } + typename NXT_STAGE_T::StagedIterator& get_nxt() { + if constexpr (!IS_BOTTOM) { + return this->_nxt; + } else { + ceph_abort("impossible path"); + } + } + StagedIterator& operator++() { + if (iter->is_last()) { + iter->set_end(); + } else { + ++(*iter); + } + if constexpr (!IS_BOTTOM) { + this->_nxt.reset(); + } + return *this; + } + void reset() { + if (valid()) { + iter.reset(); + if constexpr (!IS_BOTTOM) { + this->_nxt.reset(); + } + } + } + std::ostream& print(std::ostream& os, bool is_top) const { + if (valid()) { + if (iter->is_end()) { + return os << "END"; + } else { + os << index(); + } + } else { + if (is_top) { + return os << "invalid StagedIterator!"; + } else { + os << "0!"; + } + } + if constexpr (!IS_BOTTOM) { + os << ", "; + return this->_nxt.print(os, false); + } else { + return os; + } + } + position_t get_pos() const { + if (valid()) { + if constexpr (IS_BOTTOM) { + return position_t{index()}; + } else { + return position_t{index(), this->_nxt.get_pos()}; + } + } else { + return position_t::begin(); + } + } + void encode(const char* p_node_start, ceph::bufferlist& encoded) const { + uint8_t present = static_cast<bool>(iter); + ceph::encode(present, encoded); + if (iter.has_value()) { + iter->encode(p_node_start, encoded); + if constexpr (!IS_BOTTOM) { + this->_nxt.encode(p_node_start, encoded); + } + } + } + static StagedIterator decode(const char* p_node_start, + ceph::bufferlist::const_iterator& delta) { + StagedIterator ret; + uint8_t present; + ceph::decode(present, delta); + if (present) { + ret.iter = iterator_t::decode(p_node_start, delta); + if constexpr (!IS_BOTTOM) { + ret._nxt = NXT_STAGE_T::StagedIterator::decode(p_node_start, delta); + } + } + return ret; + } + friend std::ostream& operator<<(std::ostream& os, const StagedIterator& iter) { + return iter.print(os, true); + } + private: + std::optional<iterator_t> iter; + }; + + static bool recursively_locate_split( + size_t& current_size, size_t extra_size, + size_t target_size, StagedIterator& split_at) { + assert(current_size <= target_size); + iterator_t& split_iter = split_at.get(); + current_size = split_iter.seek_split(current_size, extra_size, target_size); + assert(current_size <= target_size); + assert(!split_iter.is_end()); + if (split_iter.index() == 0) { + extra_size += iterator_t::header_size(); + } else { + extra_size = 0; + } + bool locate_nxt; + if constexpr 
(!IS_BOTTOM) { + locate_nxt = NXT_STAGE_T::recursively_locate_split( + current_size, extra_size + split_iter.size_to_nxt(), + target_size, split_at.nxt()); + } else { // IS_BOTTOM + // located upper_bound, fair split strategy + size_t nxt_size = split_iter.size() + extra_size; + assert(current_size + nxt_size > target_size); + if (current_size + nxt_size/2 < target_size) { + // include next + current_size += nxt_size; + locate_nxt = true; + } else { + // exclude next + locate_nxt = false; + } + } + if (locate_nxt) { + if (split_iter.is_last()) { + return true; + } else { + ++split_at; + return false; + } + } else { + return false; + } + } + + static bool recursively_locate_split_inserted( + size_t& current_size, size_t extra_size, size_t target_size, + position_t& insert_pos, match_stage_t insert_stage, size_t insert_size, + std::optional<bool>& is_insert_left, StagedIterator& split_at) { + assert(current_size <= target_size); + assert(!is_insert_left.has_value()); + iterator_t& split_iter = split_at.get(); + auto& insert_index = insert_pos.index; + if (insert_stage == STAGE) { + current_size = split_iter.template seek_split_inserted<true>( + current_size, extra_size, target_size, + insert_index, insert_size, is_insert_left); + assert(is_insert_left.has_value()); + assert(current_size <= target_size); + if (split_iter.index() == 0) { + if (insert_index == 0) { + if (*is_insert_left == false) { + extra_size += iterator_t::header_size(); + } else { + extra_size = 0; + } + } else { + extra_size += iterator_t::header_size(); + } + } else { + extra_size = 0; + } + if (*is_insert_left == false && split_iter.index() == insert_index) { + // split_iter can be end + // found the lower-bound of target_size + // ...[s_index-1] |!| (i_index) [s_index]... + + // located upper-bound, fair split strategy + // look at the next slot (the insert item) + size_t nxt_size = insert_size + extra_size; + assert(current_size + nxt_size > target_size); + if (current_size + nxt_size/2 < target_size) { + // include next + *is_insert_left = true; + current_size += nxt_size; + if (split_iter.is_end()) { + // ...[s_index-1] (i_index) |!| + return true; + } else { + return false; + } + } else { + // exclude next + return false; + } + } else { + // Already considered insert effect in the current stage. + // Look into the next stage to identify the target_size lower-bound w/o + // insert effect. 
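+ // hence the recursion below (when !IS_BOTTOM) uses recursively_locate_split(),
+ // the variant without insert parameters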
+ assert(!split_iter.is_end()); + bool locate_nxt; + if constexpr (!IS_BOTTOM) { + locate_nxt = NXT_STAGE_T::recursively_locate_split( + current_size, extra_size + split_iter.size_to_nxt(), + target_size, split_at.nxt()); + } else { // IS_BOTTOM + // located upper-bound, fair split strategy + // look at the next slot + size_t nxt_size = split_iter.size() + extra_size; + assert(current_size + nxt_size > target_size); + if (current_size + nxt_size/2 < target_size) { + // include next + current_size += nxt_size; + locate_nxt = true; + } else { + // exclude next + locate_nxt = false; + } + } + if (locate_nxt) { + if (split_iter.is_last()) { + auto end_index = split_iter.index() + 1; + if (insert_index == INDEX_END) { + insert_index = end_index; + } + assert(insert_index <= end_index); + if (insert_index == end_index) { + assert(*is_insert_left == false); + split_iter.set_end(); + // ...[s_index-1] |!| (i_index) + return false; + } else { + assert(*is_insert_left == true); + return true; + } + } else { + ++split_at; + return false; + } + } else { + return false; + } + } + } else { + if constexpr (!IS_BOTTOM) { + assert(insert_stage < STAGE); + current_size = split_iter.template seek_split_inserted<false>( + current_size, extra_size, target_size, + insert_index, insert_size, is_insert_left); + assert(!split_iter.is_end()); + assert(current_size <= target_size); + if (split_iter.index() == 0) { + extra_size += iterator_t::header_size(); + } else { + extra_size = 0; + } + bool locate_nxt; + if (!is_insert_left.has_value()) { + // Considered insert effect in the current stage, and insert happens + // in the lower stage. + // Look into the next stage to identify the target_size lower-bound w/ + // insert effect. + assert(split_iter.index() == insert_index); + locate_nxt = NXT_STAGE_T::recursively_locate_split_inserted( + current_size, extra_size + split_iter.size_to_nxt(), target_size, + insert_pos.nxt, insert_stage, insert_size, + is_insert_left, split_at.nxt()); + assert(is_insert_left.has_value()); +#ifndef NDEBUG + if (locate_nxt) { + assert(*is_insert_left == true); + } +#endif + } else { + // is_insert_left.has_value() == true + // Insert will *not* happen in the lower stage. 
+ // Need to look into the next stage to identify the target_size + // lower-bound w/ insert effect + assert(split_iter.index() != insert_index); + locate_nxt = NXT_STAGE_T::recursively_locate_split( + current_size, extra_size + split_iter.size_to_nxt(), + target_size, split_at.nxt()); +#ifndef NDEBUG + if (split_iter.index() < insert_index) { + assert(*is_insert_left == false); + } else { + assert(*is_insert_left == true); + } +#endif + } + if (locate_nxt) { + if (split_iter.is_last()) { + return true; + } else { + ++split_at; + return false; + } + } else { + return false; + } + } else { + ceph_abort("impossible path"); + return false;; + } + } + } + + /* + * container appender type system + * container_t::Appender(NodeExtentMutable& mut, char* p_append) + * append(const container_t& src, index_t from, index_t items) + * wrap() -> char* + * IF !IS_BOTTOM: + * open_nxt(const key_get_type&) + * open_nxt(const full_key_t&) + * -> std::tuple<NodeExtentMutable&, char*> + * wrap_nxt(char* p_append) + * ELSE + * append(const full_key_t& key, const value_t& value) + */ + template <KeyT KT> + struct _BaseWithNxtAppender { + typename NXT_STAGE_T::template StagedAppender<KT> _nxt; + }; + template <KeyT KT> + class StagedAppender + : std::conditional_t<IS_BOTTOM, _BaseEmpty, _BaseWithNxtAppender<KT>> { + public: + StagedAppender() = default; + ~StagedAppender() { + assert(!require_wrap_nxt); + assert(!valid()); + } + bool valid() const { return appender.has_value(); } + index_t index() const { + assert(valid()); + return _index; + } + bool in_progress() const { return require_wrap_nxt; } + // TODO: pass by reference + void init(NodeExtentMutable* p_mut, char* p_start) { + assert(!valid()); + appender = typename container_t::template Appender<KT>(p_mut, p_start); + _index = 0; + } + // possible to make src_iter end if to_index == INDEX_END + void append_until(StagedIterator& src_iter, index_t& to_index) { + assert(!require_wrap_nxt); + auto s_index = src_iter.index(); + src_iter.get().template copy_out_until<KT>(*appender, to_index); + assert(src_iter.index() == to_index); + assert(to_index >= s_index); + auto increment = (to_index - s_index); + if (increment) { + _index += increment; + if constexpr (!IS_BOTTOM) { + src_iter.get_nxt().reset(); + } + } + } + void append(const full_key_t<KT>& key, + const value_t& value, const value_t*& p_value) { + assert(!require_wrap_nxt); + if constexpr (!IS_BOTTOM) { + auto& nxt = open_nxt(key); + nxt.append(key, value, p_value); + wrap_nxt(); + } else { + appender->append(key, value, p_value); + ++_index; + } + } + char* wrap() { + assert(valid()); + assert(_index > 0); + if constexpr (!IS_BOTTOM) { + if (require_wrap_nxt) { + wrap_nxt(); + } + } + auto ret = appender->wrap(); + appender.reset(); + return ret; + } + typename NXT_STAGE_T::template StagedAppender<KT>& + open_nxt(key_get_type paritial_key) { + assert(!require_wrap_nxt); + if constexpr (!IS_BOTTOM) { + require_wrap_nxt = true; + auto [p_mut, p_append] = appender->open_nxt(paritial_key); + this->_nxt.init(p_mut, p_append); + return this->_nxt; + } else { + ceph_abort("impossible path"); + } + } + typename NXT_STAGE_T::template StagedAppender<KT>& + open_nxt(const full_key_t<KT>& key) { + assert(!require_wrap_nxt); + if constexpr (!IS_BOTTOM) { + require_wrap_nxt = true; + auto [p_mut, p_append] = appender->open_nxt(key); + this->_nxt.init(p_mut, p_append); + return this->_nxt; + } else { + ceph_abort("impossible path"); + } + } + typename NXT_STAGE_T::template StagedAppender<KT>& get_nxt() { + if 
constexpr (!IS_BOTTOM) { + assert(require_wrap_nxt); + return this->_nxt; + } else { + ceph_abort("impossible path"); + } + } + void wrap_nxt() { + if constexpr (!IS_BOTTOM) { + assert(require_wrap_nxt); + require_wrap_nxt = false; + auto p_append = this->_nxt.wrap(); + appender->wrap_nxt(p_append); + ++_index; + } else { + ceph_abort("impossible path"); + } + } + private: + std::optional<typename container_t::template Appender<KT>> appender; + index_t _index; + bool require_wrap_nxt = false; + }; + + template <KeyT KT> + static void _append_range( + StagedIterator& src_iter, StagedAppender<KT>& appender, index_t& to_index) { + if (src_iter.is_end()) { + // append done + assert(to_index == INDEX_END); + to_index = src_iter.index(); + } else if constexpr (!IS_BOTTOM) { + if (appender.in_progress()) { + // appender has appended something at the current item, + // cannot append the current item as-a-whole + index_t to_index_nxt = INDEX_END; + NXT_STAGE_T::template _append_range<KT>( + src_iter.nxt(), appender.get_nxt(), to_index_nxt); + ++src_iter; + appender.wrap_nxt(); + } else if (src_iter.in_progress()) { + // src_iter is not at the beginning of the current item, + // cannot append the current item as-a-whole + index_t to_index_nxt = INDEX_END; + NXT_STAGE_T::template _append_range<KT>( + src_iter.nxt(), appender.open_nxt(src_iter.get_key()), to_index_nxt); + ++src_iter; + appender.wrap_nxt(); + } else { + // we can safely append the current item as-a-whole + } + } + appender.append_until(src_iter, to_index); + } + + template <KeyT KT> + static void _append_into(StagedIterator& src_iter, StagedAppender<KT>& appender, + position_t& position, match_stage_t stage) { + assert(position.index == src_iter.index()); + // reaches the last item + if (stage == STAGE) { + // done, end recursion + if constexpr (!IS_BOTTOM) { + position.nxt = position_t::nxt_t::begin(); + } + } else { + assert(stage < STAGE); + // proceed append in the next stage + NXT_STAGE_T::template append_until<KT>( + src_iter.nxt(), appender.open_nxt(src_iter.get_key()), + position.nxt, stage); + } + } + + template <KeyT KT> + static void append_until(StagedIterator& src_iter, StagedAppender<KT>& appender, + position_t& position, match_stage_t stage) { + index_t from_index = src_iter.index(); + index_t& to_index = position.index; + assert(from_index <= to_index); + if constexpr (IS_BOTTOM) { + assert(stage == STAGE); + appender.append_until(src_iter, to_index); + } else { + assert(stage <= STAGE); + if (src_iter.index() == to_index) { + _append_into<KT>(src_iter, appender, position, stage); + } else { + if (to_index == INDEX_END) { + assert(stage == STAGE); + } else if (to_index == INDEX_LAST) { + assert(stage < STAGE); + } + _append_range<KT>(src_iter, appender, to_index); + _append_into<KT>(src_iter, appender, position, stage); + } + } + to_index -= from_index; + } + + template <KeyT KT> + static bool append_insert( + const full_key_t<KT>& key, const value_t& value, + StagedIterator& src_iter, StagedAppender<KT>& appender, + bool is_front_insert, match_stage_t& stage, const value_t*& p_value) { + assert(src_iter.valid()); + if (stage == STAGE) { + appender.append(key, value, p_value); + if (src_iter.is_end()) { + return true; + } else { + return false; + } + } else { + assert(stage < STAGE); + if constexpr (!IS_BOTTOM) { + auto nxt_is_end = NXT_STAGE_T::template append_insert<KT>( + key, value, src_iter.get_nxt(), appender.get_nxt(), + is_front_insert, stage, p_value); + if (nxt_is_end) { + appender.wrap_nxt(); + ++src_iter; + 
if (is_front_insert) { + stage = STAGE; + } + if (src_iter.is_end()) { + return true; + } + } + return false; + } else { + ceph_abort("impossible path"); + } + } + } + + /* TrimType: + * BEFORE: remove the entire container, normally means the according higher + * stage iterator needs to be trimmed as-a-whole. + * AFTER: retain the entire container, normally means the trim should be + * start from the next iterator at the higher stage. + * AT: trim happens in the current container, and the according higher + * stage iterator needs to be adjusted by the trimmed size. + */ + static std::tuple<TrimType, node_offset_t> + recursively_trim(NodeExtentMutable& mut, StagedIterator& trim_at) { + if (!trim_at.valid()) { + return {TrimType::BEFORE, 0u}; + } + if (trim_at.is_end()) { + return {TrimType::AFTER, 0u}; + } + + auto& iter = trim_at.get(); + if constexpr (!IS_BOTTOM) { + auto [type, trimmed] = NXT_STAGE_T::recursively_trim( + mut, trim_at.get_nxt()); + node_offset_t trim_size; + if (type == TrimType::AFTER) { + if (iter.is_last()) { + return {TrimType::AFTER, 0u}; + } + ++trim_at; + trim_size = iter.trim_until(mut); + } else if (type == TrimType::BEFORE) { + if (iter.index() == 0) { + return {TrimType::BEFORE, 0u}; + } + trim_size = iter.trim_until(mut); + } else { + trim_size = iter.trim_at(mut, trimmed); + } + return {TrimType::AT, trim_size}; + } else { + if (iter.index() == 0) { + return {TrimType::BEFORE, 0u}; + } else { + auto trimmed = iter.trim_until(mut); + return {TrimType::AT, trimmed}; + } + } + } + + static void trim(NodeExtentMutable& mut, StagedIterator& trim_at) { + auto [type, trimmed] = recursively_trim(mut, trim_at); + if (type == TrimType::BEFORE) { + assert(trim_at.valid()); + auto& iter = trim_at.get(); + iter.trim_until(mut); + } + } +}; + +/** + * Configurations for struct staged + * + * staged_params_* assembles different container_t implementations (defined by + * stated::_iterator_t) by STAGE, and constructs the final multi-stage + * implementations for different node layouts defined by + * node_extent_t<FieldType, NODE_TYPE>. + * + * The specialized implementations for different layouts are accessible through + * the helper type node_to_stage_t<node_extent_t<FieldType, NODE_TYPE>>. + * + * Specifically, the settings of 8 layouts are: + * + * The layout (N0, LEAF/INTERNAL) has 3 stages: + * - STAGE_LEFT: node_extent_t<node_fields_0_t, LEAF/INTERNAL> + * - STAGE_STRING: item_iterator_t<LEAF/INTERNAL> + * - STAGE_RIGHT: sub_items_t<LEAF/INTERNAL> + * + * The layout (N1, LEAF/INTERNAL) has 3 stages: + * - STAGE_LEFT: node_extent_t<node_fields_1_t, LEAF/INTERNAL> + * - STAGE_STRING: item_iterator_t<LEAF/INTERNAL> + * - STAGE_RIGHT: sub_items_t<LEAF/INTERNAL> + * + * The layout (N2, LEAF/INTERNAL) has 2 stages: + * - STAGE_STRING: node_extent_t<node_fields_2_t, LEAF/INTERNAL> + * - STAGE_RIGHT: sub_items_t<LEAF/INTERNAL> + * + * The layout (N3, LEAF) has 1 stage: + * - STAGE_RIGHT: node_extent_t<leaf_fields_3_t, LEAF> + * + * The layout (N3, INTERNAL) has 1 stage: + * - STAGE_RIGHT: node_extent_t<internal_fields_3_t, INTERNAL> + */ + +template <node_type_t _NODE_TYPE> +struct staged_params_subitems { + using container_t = sub_items_t<_NODE_TYPE>; + static constexpr auto NODE_TYPE = _NODE_TYPE; + static constexpr auto STAGE = STAGE_RIGHT; + + // dummy type in order to make our type system work + // any better solution to get rid of this? 
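+ // Presumably the self-referential next_param_t only needs to name a
+ // well-formed type so that staged<staged_params_subitems<...>> can declare
+ // its NXT_STAGE_T alias; every use of NXT_STAGE_T above is guarded by
+ // `if constexpr (!IS_BOTTOM)`, so the bottom stage never instantiates it.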
+ using next_param_t = staged_params_subitems<NODE_TYPE>; +}; + +template <node_type_t _NODE_TYPE> +struct staged_params_item_iterator { + using container_t = item_iterator_t<_NODE_TYPE>; + static constexpr auto NODE_TYPE = _NODE_TYPE; + static constexpr auto STAGE = STAGE_STRING; + + using next_param_t = staged_params_subitems<NODE_TYPE>; +}; + +template <typename NodeType> +struct staged_params_node_01 { + using container_t = NodeType; + static constexpr auto NODE_TYPE = NodeType::NODE_TYPE; + static constexpr auto STAGE = STAGE_LEFT; + + using next_param_t = staged_params_item_iterator<NODE_TYPE>; +}; + +template <typename NodeType> +struct staged_params_node_2 { + using container_t = NodeType; + static constexpr auto NODE_TYPE = NodeType::NODE_TYPE; + static constexpr auto STAGE = STAGE_STRING; + + using next_param_t = staged_params_subitems<NODE_TYPE>; +}; + +template <typename NodeType> +struct staged_params_node_3 { + using container_t = NodeType; + static constexpr auto NODE_TYPE = NodeType::NODE_TYPE; + static constexpr auto STAGE = STAGE_RIGHT; + + // dummy type in order to make our type system work + // any better solution to get rid of this? + using next_param_t = staged_params_node_3<NodeType>; +}; + +template <typename NodeType, typename Enable = void> struct _node_to_stage_t; +template <typename NodeType> +struct _node_to_stage_t<NodeType, + std::enable_if_t<NodeType::FIELD_TYPE == field_type_t::N0 || + NodeType::FIELD_TYPE == field_type_t::N1>> { + using type = staged<staged_params_node_01<NodeType>>; +}; +template <typename NodeType> +struct _node_to_stage_t<NodeType, + std::enable_if_t<NodeType::FIELD_TYPE == field_type_t::N2>> { + using type = staged<staged_params_node_2<NodeType>>; +}; +template <typename NodeType> +struct _node_to_stage_t<NodeType, + std::enable_if_t<NodeType::FIELD_TYPE == field_type_t::N3>> { + using type = staged<staged_params_node_3<NodeType>>; +}; +template <typename NodeType> +using node_to_stage_t = typename _node_to_stage_t<NodeType>::type; + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage_types.h b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage_types.h new file mode 100644 index 000000000..a9d5cef3b --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/stage_types.h @@ -0,0 +1,411 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <cassert> +#include <optional> +#include <ostream> + +#include "crimson/os/seastore/onode_manager/staged-fltree/fwd.h" +#include "crimson/os/seastore/onode_manager/staged-fltree/node_types.h" +#include "crimson/os/seastore/onode_manager/staged-fltree/tree_types.h" + +namespace crimson::os::seastore::onode { + +using match_stage_t = int8_t; +constexpr match_stage_t STAGE_LEFT = 2; // shard/pool/crush +constexpr match_stage_t STAGE_STRING = 1; // nspace/oid +constexpr match_stage_t STAGE_RIGHT = 0; // snap/gen +constexpr auto STAGE_TOP = STAGE_LEFT; +constexpr auto STAGE_BOTTOM = STAGE_RIGHT; +constexpr bool is_valid_stage(match_stage_t stage) { + return std::clamp(stage, STAGE_BOTTOM, STAGE_TOP) == stage; +} +// TODO: replace by +// using match_history_t = int8_t; +// left_m, str_m, right_m +// 3: GT, +// 2: EQ, GT, +// 1: EQ, EQ, GT +// 0: EQ, EQ, EQ +// -1: EQ, EQ, LT +// -2: EQ, LT, +// -3: LT, + +struct MatchHistory { + template <match_stage_t STAGE> + const std::optional<MatchKindCMP>& get() const { + static_assert(is_valid_stage(STAGE)); + if constexpr 
(STAGE == STAGE_RIGHT) { + return right_match; + } else if (STAGE == STAGE_STRING) { + return string_match; + } else { + return left_match; + } + } + + const std::optional<MatchKindCMP>& + get_by_stage(match_stage_t stage) const { + assert(is_valid_stage(stage)); + if (stage == STAGE_RIGHT) { + return right_match; + } else if (stage == STAGE_STRING) { + return string_match; + } else { + return left_match; + } + } + + template <match_stage_t STAGE = STAGE_TOP> + const bool is_GT() const; + + template <match_stage_t STAGE> + void set(MatchKindCMP match) { + static_assert(is_valid_stage(STAGE)); + if constexpr (STAGE < STAGE_TOP) { + assert(*get<STAGE + 1>() == MatchKindCMP::EQ); + } + assert(!get<STAGE>().has_value() || *get<STAGE>() != MatchKindCMP::EQ); + const_cast<std::optional<MatchKindCMP>&>(get<STAGE>()) = match; + } + + std::ostream& dump(std::ostream& os) const { + os << "history("; + dump_each(os, left_match) << ", "; + dump_each(os, string_match) << ", "; + dump_each(os, right_match) << ")"; + return os; + } + + std::ostream& dump_each( + std::ostream& os, const std::optional<MatchKindCMP>& match) const { + if (!match.has_value()) { + return os << "--"; + } else if (*match == MatchKindCMP::LT) { + return os << "LT"; + } else if (*match == MatchKindCMP::EQ) { + return os << "EQ"; + } else if (*match == MatchKindCMP::GT) { + return os << "GT"; + } else { + ceph_abort("impossble path"); + } + } + + std::optional<MatchKindCMP> left_match; + std::optional<MatchKindCMP> string_match; + std::optional<MatchKindCMP> right_match; +}; +inline std::ostream& operator<<(std::ostream& os, const MatchHistory& pos) { + return pos.dump(os); +} + +template <match_stage_t STAGE> +struct _check_GT_t { + static bool eval(const MatchHistory* history) { + return history->get<STAGE>() && + (*history->get<STAGE>() == MatchKindCMP::GT || + (*history->get<STAGE>() == MatchKindCMP::EQ && + _check_GT_t<STAGE - 1>::eval(history))); + } +}; +template <> +struct _check_GT_t<STAGE_RIGHT> { + static bool eval(const MatchHistory* history) { + return history->get<STAGE_RIGHT>() && + *history->get<STAGE_RIGHT>() == MatchKindCMP::GT; + } +}; +template <match_stage_t STAGE> +const bool MatchHistory::is_GT() const { + static_assert(is_valid_stage(STAGE)); + if constexpr (STAGE < STAGE_TOP) { + assert(get<STAGE + 1>() == MatchKindCMP::EQ); + } + return _check_GT_t<STAGE>::eval(this); +} + +template <match_stage_t STAGE> +struct staged_position_t { + static_assert(is_valid_stage(STAGE)); + using me_t = staged_position_t<STAGE>; + using nxt_t = staged_position_t<STAGE - 1>; + bool is_end() const { + if (index == INDEX_END) { + return true; + } else { + assert(is_valid_index(index)); + return false; + } + } + index_t& index_by_stage(match_stage_t stage) { + assert(stage <= STAGE); + if (STAGE == stage) { + return index; + } else { + return nxt.index_by_stage(stage); + } + } + + int cmp(const me_t& o) const { + if (index > o.index) { + return 1; + } else if (index < o.index) { + return -1; + } else { + return nxt.cmp(o.nxt); + } + } + bool operator>(const me_t& o) const { return cmp(o) > 0; } + bool operator>=(const me_t& o) const { return cmp(o) >= 0; } + bool operator<(const me_t& o) const { return cmp(o) < 0; } + bool operator<=(const me_t& o) const { return cmp(o) <= 0; } + bool operator==(const me_t& o) const { return cmp(o) == 0; } + bool operator!=(const me_t& o) const { return cmp(o) != 0; } + + me_t& operator-=(const me_t& o) { + assert(is_valid_index(o.index)); + assert(index >= o.index); + if (index != INDEX_END) { 
+ assert(is_valid_index(index)); + index -= o.index; + if (index == 0) { + nxt -= o.nxt; + } + } + return *this; + } + + void encode(ceph::bufferlist& encoded) const { + ceph::encode(index, encoded); + nxt.encode(encoded); + } + + static me_t decode(ceph::bufferlist::const_iterator& delta) { + me_t ret; + ceph::decode(ret.index, delta); + ret.nxt = nxt_t::decode(delta); + return ret; + } + + static me_t begin() { return {0u, nxt_t::begin()}; } + static me_t end() { + return {INDEX_END, nxt_t::end()}; + } + + index_t index; + nxt_t nxt; +}; +template <match_stage_t STAGE> +std::ostream& operator<<(std::ostream& os, const staged_position_t<STAGE>& pos) { + if (pos.index == INDEX_END) { + os << "END"; + } else if (pos.index == INDEX_LAST) { + os << "LAST"; + } else { + os << pos.index; + assert(is_valid_index(pos.index)); + } + return os << ", " << pos.nxt; +} + +template <> +struct staged_position_t<STAGE_BOTTOM> { + using me_t = staged_position_t<STAGE_BOTTOM>; + bool is_end() const { + if (index == INDEX_END) { + return true; + } else { + assert(is_valid_index(index)); + return false; + } + } + index_t& index_by_stage(match_stage_t stage) { + assert(stage == STAGE_BOTTOM); + return index; + } + + int cmp(const staged_position_t<STAGE_BOTTOM>& o) const { + if (index > o.index) { + return 1; + } else if (index < o.index) { + return -1; + } else { + return 0; + } + } + bool operator>(const me_t& o) const { return cmp(o) > 0; } + bool operator>=(const me_t& o) const { return cmp(o) >= 0; } + bool operator<(const me_t& o) const { return cmp(o) < 0; } + bool operator<=(const me_t& o) const { return cmp(o) <= 0; } + bool operator==(const me_t& o) const { return cmp(o) == 0; } + bool operator!=(const me_t& o) const { return cmp(o) != 0; } + + me_t& operator-=(const me_t& o) { + assert(is_valid_index(o.index)); + assert(index >= o.index); + if (index != INDEX_END) { + assert(is_valid_index(index)); + index -= o.index; + } + return *this; + } + + void encode(ceph::bufferlist& encoded) const { + ceph::encode(index, encoded); + } + + static me_t decode(ceph::bufferlist::const_iterator& delta) { + me_t ret; + ceph::decode(ret.index, delta); + return ret; + } + + static me_t begin() { return {0u}; } + static me_t end() { return {INDEX_END}; } + + index_t index; +}; +template <> +inline std::ostream& operator<<(std::ostream& os, const staged_position_t<STAGE_BOTTOM>& pos) { + if (pos.index == INDEX_END) { + os << "END"; + } else if (pos.index == INDEX_LAST) { + os << "LAST"; + } else { + os << pos.index; + assert(is_valid_index(pos.index)); + } + return os; +} + +using search_position_t = staged_position_t<STAGE_TOP>; + +template <match_stage_t STAGE> +const staged_position_t<STAGE>& cast_down(const search_position_t& pos) { + if constexpr (STAGE == STAGE_LEFT) { + return pos; + } else if constexpr (STAGE == STAGE_STRING) { +#ifndef NDEBUG + if (pos.is_end()) { + assert(pos.nxt.is_end()); + } else { + assert(pos.index == 0u); + } +#endif + return pos.nxt; + } else if constexpr (STAGE == STAGE_RIGHT) { +#ifndef NDEBUG + if (pos.is_end()) { + assert(pos.nxt.nxt.is_end()); + } else { + assert(pos.index == 0u); + assert(pos.nxt.index == 0u); + } +#endif + return pos.nxt.nxt; + } else { + ceph_abort("impossible path"); + } +} + +template <match_stage_t STAGE> +staged_position_t<STAGE>& cast_down(search_position_t& pos) { + const search_position_t& _pos = pos; + return const_cast<staged_position_t<STAGE>&>(cast_down<STAGE>(_pos)); +} + +template <match_stage_t STAGE> +staged_position_t<STAGE>& 
cast_down_fill_0(search_position_t& pos) { + if constexpr (STAGE == STAGE_LEFT) { + return pos; + } if constexpr (STAGE == STAGE_STRING) { + pos.index = 0; + return pos.nxt; + } else if constexpr (STAGE == STAGE_RIGHT) { + pos.index = 0; + pos.nxt.index = 0; + return pos.nxt.nxt; + } else { + ceph_abort("impossible path"); + } +} + +inline search_position_t&& normalize(search_position_t&& pos) { return std::move(pos); } + +template <match_stage_t STAGE, typename = std::enable_if_t<STAGE != STAGE_TOP>> +search_position_t normalize(staged_position_t<STAGE>&& pos) { + if (pos.is_end()) { + return search_position_t::end(); + } + if constexpr (STAGE == STAGE_STRING) { + return {0u, std::move(pos)}; + } else if (STAGE == STAGE_RIGHT) { + return {0u, {0u, std::move(pos)}}; + } else { + ceph_abort("impossible path"); + } +} + +struct memory_range_t { + const char* p_start; + const char* p_end; +}; + +enum class ContainerType { ITERATIVE, INDEXABLE }; + +template <node_type_t> struct value_type; +template<> struct value_type<node_type_t::INTERNAL> { using type = laddr_packed_t; }; +template<> struct value_type<node_type_t::LEAF> { using type = onode_t; }; +template <node_type_t NODE_TYPE> +using value_type_t = typename value_type<NODE_TYPE>::type; + +template <node_type_t NODE_TYPE, match_stage_t STAGE> +struct staged_result_t { + using me_t = staged_result_t<NODE_TYPE, STAGE>; + bool is_end() const { return position.is_end(); } + + static me_t end() { + return {staged_position_t<STAGE>::end(), nullptr, MSTAT_END}; + } + template <typename T = me_t> + static std::enable_if_t<STAGE != STAGE_BOTTOM, T> from_nxt( + index_t index, const staged_result_t<NODE_TYPE, STAGE - 1>& nxt_stage_result) { + return {{index, nxt_stage_result.position}, + nxt_stage_result.p_value, + nxt_stage_result.mstat}; + } + + staged_position_t<STAGE> position; + const value_type_t<NODE_TYPE>* p_value; + match_stat_t mstat; +}; + +template <node_type_t NODE_TYPE> +using lookup_result_t = staged_result_t<NODE_TYPE, STAGE_TOP>; + +template <node_type_t NODE_TYPE> +lookup_result_t<NODE_TYPE>&& normalize( + lookup_result_t<NODE_TYPE>&& result) { return std::move(result); } + +template <node_type_t NODE_TYPE, match_stage_t STAGE, + typename = std::enable_if_t<STAGE != STAGE_TOP>> +lookup_result_t<NODE_TYPE> normalize( + staged_result_t<NODE_TYPE, STAGE>&& result) { + // FIXME: assert result.mstat correct + return {normalize(std::move(result.position)), result.p_value, result.mstat}; +} + +struct node_stats_t { + size_t size_persistent = 0; + size_t size_filled = 0; + // filled by staged::get_stats() + size_t size_logical = 0; + size_t size_overhead = 0; + size_t size_value = 0; + unsigned num_kvs = 0; +}; + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/sub_items_stage.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/sub_items_stage.cc new file mode 100644 index 000000000..aaca6c3c6 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/sub_items_stage.cc @@ -0,0 +1,208 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#include "sub_items_stage.h" + +#include "crimson/os/seastore/onode_manager/staged-fltree/node_extent_mutable.h" + +namespace crimson::os::seastore::onode { + +template <KeyT KT> +const laddr_packed_t* internal_sub_items_t::insert_at( + NodeExtentMutable& mut, const internal_sub_items_t& sub_items, + const full_key_t<KT>& key, const laddr_packed_t& value, + index_t index, node_offset_t size, 
const char* p_left_bound) { + assert(index <= sub_items.keys()); + assert(size == estimate_insert<KT>(key, value)); + const char* p_shift_start = p_left_bound; + const char* p_shift_end = reinterpret_cast<const char*>( + sub_items.p_first_item + 1 - index); + mut.shift_absolute(p_shift_start, p_shift_end - p_shift_start, -(int)size); + + auto p_insert = const_cast<char*>(p_shift_end) - size; + auto item = internal_sub_item_t{snap_gen_t::from_key<KT>(key), value}; + mut.copy_in_absolute(p_insert, item); + return &reinterpret_cast<internal_sub_item_t*>(p_insert)->value; +} +#define IA_TEMPLATE(KT) \ + template const laddr_packed_t* internal_sub_items_t::insert_at<KT>( \ + NodeExtentMutable&, const internal_sub_items_t&, const full_key_t<KT>&, \ + const laddr_packed_t&, index_t, node_offset_t, const char*) +IA_TEMPLATE(KeyT::VIEW); +IA_TEMPLATE(KeyT::HOBJ); + +node_offset_t internal_sub_items_t::trim_until( + NodeExtentMutable&, internal_sub_items_t& items, index_t index) { + assert(index != 0); + auto keys = items.keys(); + assert(index <= keys); + size_t ret = sizeof(internal_sub_item_t) * (keys - index); + assert(ret < NODE_BLOCK_SIZE); + return ret; +} + +template <KeyT KT> +void internal_sub_items_t::Appender<KT>::append( + const internal_sub_items_t& src, index_t from, index_t items) { + assert(from <= src.keys()); + if (items == 0) { + return; + } + assert(from < src.keys()); + assert(from + items <= src.keys()); + node_offset_t size = sizeof(internal_sub_item_t) * items; + p_append -= size; + p_mut->copy_in_absolute(p_append, src.p_first_item + 1 - from - items, size); +} + +template <KeyT KT> +void internal_sub_items_t::Appender<KT>::append( + const full_key_t<KT>& key, const laddr_packed_t& value, + const laddr_packed_t*& p_value) { + p_append -= sizeof(internal_sub_item_t); + auto item = internal_sub_item_t{snap_gen_t::from_key<KT>(key), value}; + p_mut->copy_in_absolute(p_append, item); + p_value = &reinterpret_cast<internal_sub_item_t*>(p_append)->value; +} + +template <KeyT KT> +const onode_t* leaf_sub_items_t::insert_at( + NodeExtentMutable& mut, const leaf_sub_items_t& sub_items, + const full_key_t<KT>& key, const onode_t& value, + index_t index, node_offset_t size, const char* p_left_bound) { + assert(index <= sub_items.keys()); + assert(size == estimate_insert<KT>(key, value)); + // a. [... item(index)] << size + const char* p_shift_start = p_left_bound; + const char* p_shift_end = sub_items.get_item_end(index); + mut.shift_absolute(p_shift_start, p_shift_end - p_shift_start, -(int)size); + + // b. insert item + auto p_insert = const_cast<char*>(p_shift_end - size); + auto p_value = reinterpret_cast<const onode_t*>(p_insert); + mut.copy_in_absolute(p_insert, &value, value.size); + p_insert += value.size; + mut.copy_in_absolute(p_insert, snap_gen_t::template from_key<KT>(key)); + assert(p_insert + sizeof(snap_gen_t) + sizeof(node_offset_t) == p_shift_end); + + // c. compensate affected offsets + auto item_size = value.size + sizeof(snap_gen_t); + for (auto i = index; i < sub_items.keys(); ++i) { + const node_offset_packed_t& offset_i = sub_items.get_offset(i); + mut.copy_in_absolute((void*)&offset_i, node_offset_t(offset_i.value + item_size)); + } + + // d. [item(index-1) ... item(0) ... offset(index)] <<< sizeof(node_offset_t) + const char* p_offset = (index == 0 ? 
+ (const char*)&sub_items.get_offset(0) + sizeof(node_offset_t) : + (const char*)&sub_items.get_offset(index - 1)); + p_shift_start = p_shift_end; + p_shift_end = p_offset; + mut.shift_absolute(p_shift_start, p_shift_end - p_shift_start, -(int)sizeof(node_offset_t)); + + // e. insert offset + node_offset_t offset_to_item_start = item_size + sub_items.get_offset_to_end(index); + mut.copy_in_absolute( + const_cast<char*>(p_shift_end) - sizeof(node_offset_t), offset_to_item_start); + + // f. update num_sub_keys + mut.copy_in_absolute((void*)sub_items.p_num_keys, num_keys_t(sub_items.keys() + 1)); + + return p_value; +} +template const onode_t* leaf_sub_items_t::insert_at<KeyT::HOBJ>( + NodeExtentMutable&, const leaf_sub_items_t&, const full_key_t<KeyT::HOBJ>&, + const onode_t&, index_t, node_offset_t, const char*); + +node_offset_t leaf_sub_items_t::trim_until( + NodeExtentMutable& mut, leaf_sub_items_t& items, index_t index) { + assert(index != 0); + auto keys = items.keys(); + assert(index <= keys); + if (index == keys) { + return 0; + } + index_t trim_items = keys - index; + const char* p_items_start = items.p_start(); + const char* p_shift_start = items.get_item_end(index); + const char* p_shift_end = items.get_item_end(0); + size_t size_trim_offsets = sizeof(node_offset_t) * trim_items; + mut.shift_absolute(p_shift_start, p_shift_end - p_shift_start, + size_trim_offsets); + mut.copy_in_absolute((void*)items.p_num_keys, num_keys_t(index)); + size_t ret = size_trim_offsets + (p_shift_start - p_items_start); + assert(ret < NODE_BLOCK_SIZE); + return ret; +} + +template class internal_sub_items_t::Appender<KeyT::VIEW>; +template class internal_sub_items_t::Appender<KeyT::HOBJ>; + +// helper type for the visitor +template<class... Ts> struct overloaded : Ts... { using Ts::operator()...; }; +// explicit deduction guide +template<class... Ts> overloaded(Ts...) 
-> overloaded<Ts...>; + +template <KeyT KT> +char* leaf_sub_items_t::Appender<KT>::wrap() { + auto p_cur = p_append; + num_keys_t num_keys = 0; + for (auto i = 0u; i < cnt; ++i) { + auto& a = appends[i]; + std::visit(overloaded { + [&] (const range_items_t& arg) { num_keys += arg.items; }, + [&] (const kv_item_t& arg) { ++num_keys; } + }, a); + } + assert(num_keys); + p_cur -= sizeof(num_keys_t); + p_mut->copy_in_absolute(p_cur, num_keys); + + node_offset_t last_offset = 0; + for (auto i = 0u; i < cnt; ++i) { + auto& a = appends[i]; + std::visit(overloaded { + [&] (const range_items_t& arg) { + int compensate = (last_offset - op_src->get_offset_to_end(arg.from)); + node_offset_t offset; + for (auto i = arg.from; i < arg.from + arg.items; ++i) { + offset = op_src->get_offset(i).value + compensate; + p_cur -= sizeof(node_offset_t); + p_mut->copy_in_absolute(p_cur, offset); + } + last_offset = offset; + }, + [&] (const kv_item_t& arg) { + last_offset += sizeof(snap_gen_t) + arg.p_value->size; + p_cur -= sizeof(node_offset_t); + p_mut->copy_in_absolute(p_cur, last_offset); + } + }, a); + } + + for (auto i = 0u; i < cnt; ++i) { + auto& a = appends[i]; + std::visit(overloaded { + [&] (const range_items_t& arg) { + auto _p_start = op_src->get_item_end(arg.from + arg.items); + size_t _len = op_src->get_item_end(arg.from) - _p_start; + p_cur -= _len; + p_mut->copy_in_absolute(p_cur, _p_start, _len); + }, + [&] (const kv_item_t& arg) { + assert(pp_value); + p_cur -= sizeof(snap_gen_t); + p_mut->copy_in_absolute(p_cur, snap_gen_t::template from_key<KT>(*arg.p_key)); + p_cur -= arg.p_value->size; + p_mut->copy_in_absolute(p_cur, arg.p_value, arg.p_value->size); + *pp_value = reinterpret_cast<const onode_t*>(p_cur); + } + }, a); + } + return p_cur; +} + +template class leaf_sub_items_t::Appender<KeyT::VIEW>; +template class leaf_sub_items_t::Appender<KeyT::HOBJ>; + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/stages/sub_items_stage.h b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/sub_items_stage.h new file mode 100644 index 000000000..8ef5f7472 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/stages/sub_items_stage.h @@ -0,0 +1,341 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <variant> + +#include "crimson/os/seastore/onode_manager/staged-fltree/node_types.h" +#include "key_layout.h" +#include "stage_types.h" + +namespace crimson::os::seastore::onode { + +class NodeExtentMutable; + +struct internal_sub_item_t { + const snap_gen_t& get_key() const { return key; } + const laddr_packed_t* get_p_value() const { return &value; } + + snap_gen_t key; + laddr_packed_t value; +} __attribute__((packed)); + +/** + * internal_sub_items_t + * + * The STAGE_RIGHT implementation for internal node N0/N1/N2, implements staged + * contract as an indexable container to index snap-gen to child node + * addresses. 
+ * + * The layout of the contaner storing n sub-items: + * + * # <--------- container range -----------> # + * #<~># sub-items [2, n) # + * # # <- sub-item 1 -> # <- sub-item 0 -> # + * #...# snap-gen | laddr # snap-gen | laddr # + * ^ + * | + * p_first_item + + */ +class internal_sub_items_t { + public: + using num_keys_t = index_t; + + internal_sub_items_t(const memory_range_t& range) { + assert(range.p_start < range.p_end); + assert((range.p_end - range.p_start) % sizeof(internal_sub_item_t) == 0); + num_items = (range.p_end - range.p_start) / sizeof(internal_sub_item_t); + assert(num_items > 0); + auto _p_first_item = range.p_end - sizeof(internal_sub_item_t); + p_first_item = reinterpret_cast<const internal_sub_item_t*>(_p_first_item); + } + + // container type system + using key_get_type = const snap_gen_t&; + static constexpr auto CONTAINER_TYPE = ContainerType::INDEXABLE; + num_keys_t keys() const { return num_items; } + key_get_type operator[](index_t index) const { + assert(index < num_items); + return (p_first_item - index)->get_key(); + } + node_offset_t size_before(index_t index) const { + size_t ret = index * sizeof(internal_sub_item_t); + assert(ret < NODE_BLOCK_SIZE); + return ret; + } + const laddr_packed_t* get_p_value(index_t index) const { + assert(index < num_items); + return (p_first_item - index)->get_p_value(); + } + node_offset_t size_overhead_at(index_t index) const { return 0u; } + void encode(const char* p_node_start, ceph::bufferlist& encoded) const { + auto p_end = reinterpret_cast<const char*>(p_first_item) + + sizeof(internal_sub_item_t); + auto p_start = p_end - num_items * sizeof(internal_sub_item_t); + int start_offset = p_start - p_node_start; + int end_offset = p_end - p_node_start; + assert(start_offset > 0 && + start_offset < end_offset && + end_offset < NODE_BLOCK_SIZE); + ceph::encode(static_cast<node_offset_t>(start_offset), encoded); + ceph::encode(static_cast<node_offset_t>(end_offset), encoded); + } + + static internal_sub_items_t decode( + const char* p_node_start, ceph::bufferlist::const_iterator& delta) { + node_offset_t start_offset; + ceph::decode(start_offset, delta); + node_offset_t end_offset; + ceph::decode(end_offset, delta); + assert(start_offset < end_offset); + assert(end_offset <= NODE_BLOCK_SIZE); + return internal_sub_items_t({p_node_start + start_offset, + p_node_start + end_offset}); + } + + static node_offset_t header_size() { return 0u; } + + template <KeyT KT> + static node_offset_t estimate_insert( + const full_key_t<KT>&, const laddr_packed_t&) { + return sizeof(internal_sub_item_t); + } + + template <KeyT KT> + static const laddr_packed_t* insert_at( + NodeExtentMutable&, const internal_sub_items_t&, + const full_key_t<KT>&, const laddr_packed_t&, + index_t index, node_offset_t size, const char* p_left_bound); + + static node_offset_t trim_until(NodeExtentMutable&, internal_sub_items_t&, index_t); + + template <KeyT KT> + class Appender; + + private: + index_t num_items; + const internal_sub_item_t* p_first_item; +}; + +template <KeyT KT> +class internal_sub_items_t::Appender { + public: + Appender(NodeExtentMutable* p_mut, char* p_append) + : p_mut{p_mut}, p_append{p_append} {} + void append(const internal_sub_items_t& src, index_t from, index_t items); + void append(const full_key_t<KT>&, const laddr_packed_t&, const laddr_packed_t*&); + char* wrap() { return p_append; } + private: + NodeExtentMutable* p_mut; + char* p_append; +}; + +/** + * leaf_sub_items_t + * + * The STAGE_RIGHT implementation for leaf node 
N0/N1/N2, implements staged + * contract as an indexable container to index snap-gen to onode_t. + * + * The layout of the contaner storing n sub-items: + * + * # <------------------------ container range -------------------------------> # + * # <---------- sub-items ----------------> # <--- offsets ---------# # + * #<~># sub-items [2, n) #<~>| offsets [2, n) # # + * # # <- sub-item 1 -> # <- sub-item 0 -> # | # # + * #...# snap-gen | onode # snap-gen | onode #...| offset1 | offset0 # num_keys # + * ^ ^ ^ + * | | | + * p_items_end + p_offsets + | + * p_num_keys + + */ +class leaf_sub_items_t { + public: + // TODO: decide by NODE_BLOCK_SIZE, sizeof(snap_gen_t), + // and the minimal size of onode_t + using num_keys_t = uint8_t; + + leaf_sub_items_t(const memory_range_t& range) { + assert(range.p_start < range.p_end); + auto _p_num_keys = range.p_end - sizeof(num_keys_t); + assert(range.p_start < _p_num_keys); + p_num_keys = reinterpret_cast<const num_keys_t*>(_p_num_keys); + assert(keys()); + auto _p_offsets = _p_num_keys - sizeof(node_offset_t); + assert(range.p_start < _p_offsets); + p_offsets = reinterpret_cast<const node_offset_packed_t*>(_p_offsets); + p_items_end = reinterpret_cast<const char*>(&get_offset(keys() - 1)); + assert(range.p_start < p_items_end); + assert(range.p_start == p_start()); + } + + bool operator==(const leaf_sub_items_t& x) { + return (p_num_keys == x.p_num_keys && + p_offsets == x.p_offsets && + p_items_end == x.p_items_end); + } + + const char* p_start() const { return get_item_end(keys()); } + + const node_offset_packed_t& get_offset(index_t index) const { + assert(index < keys()); + return *(p_offsets - index); + } + + const node_offset_t get_offset_to_end(index_t index) const { + assert(index <= keys()); + return index == 0 ? 
0 : get_offset(index - 1).value; + } + + const char* get_item_start(index_t index) const { + return p_items_end - get_offset(index).value; + } + + const char* get_item_end(index_t index) const { + return p_items_end - get_offset_to_end(index); + } + + // container type system + using key_get_type = const snap_gen_t&; + static constexpr auto CONTAINER_TYPE = ContainerType::INDEXABLE; + num_keys_t keys() const { return *p_num_keys; } + key_get_type operator[](index_t index) const { + assert(index < keys()); + auto pointer = get_item_end(index); + assert(get_item_start(index) < pointer); + pointer -= sizeof(snap_gen_t); + assert(get_item_start(index) < pointer); + return *reinterpret_cast<const snap_gen_t*>(pointer); + } + node_offset_t size_before(index_t index) const { + assert(index <= keys()); + size_t ret; + if (index == 0) { + ret = sizeof(num_keys_t); + } else { + --index; + ret = sizeof(num_keys_t) + + (index + 1) * sizeof(node_offset_t) + + get_offset(index).value; + } + assert(ret < NODE_BLOCK_SIZE); + return ret; + } + node_offset_t size_overhead_at(index_t index) const { return sizeof(node_offset_t); } + const onode_t* get_p_value(index_t index) const { + assert(index < keys()); + auto pointer = get_item_start(index); + auto value = reinterpret_cast<const onode_t*>(pointer); + assert(pointer + value->size + sizeof(snap_gen_t) == get_item_end(index)); + return value; + } + void encode(const char* p_node_start, ceph::bufferlist& encoded) const { + auto p_end = reinterpret_cast<const char*>(p_num_keys) + + sizeof(num_keys_t); + int start_offset = p_start() - p_node_start; + int end_offset = p_end - p_node_start; + assert(start_offset > 0 && + start_offset < end_offset && + end_offset < NODE_BLOCK_SIZE); + ceph::encode(static_cast<node_offset_t>(start_offset), encoded); + ceph::encode(static_cast<node_offset_t>(end_offset), encoded); + } + + static leaf_sub_items_t decode( + const char* p_node_start, ceph::bufferlist::const_iterator& delta) { + node_offset_t start_offset; + ceph::decode(start_offset, delta); + node_offset_t end_offset; + ceph::decode(end_offset, delta); + assert(start_offset < end_offset); + assert(end_offset <= NODE_BLOCK_SIZE); + return leaf_sub_items_t({p_node_start + start_offset, + p_node_start + end_offset}); + } + + static node_offset_t header_size() { return sizeof(num_keys_t); } + + template <KeyT KT> + static node_offset_t estimate_insert(const full_key_t<KT>&, const onode_t& value) { + return value.size + sizeof(snap_gen_t) + sizeof(node_offset_t); + } + + template <KeyT KT> + static const onode_t* insert_at( + NodeExtentMutable&, const leaf_sub_items_t&, + const full_key_t<KT>&, const onode_t&, + index_t index, node_offset_t size, const char* p_left_bound); + + static node_offset_t trim_until(NodeExtentMutable&, leaf_sub_items_t&, index_t index); + + template <KeyT KT> + class Appender; + + private: + // TODO: support unaligned access + const num_keys_t* p_num_keys; + const node_offset_packed_t* p_offsets; + const char* p_items_end; +}; + +constexpr index_t APPENDER_LIMIT = 3u; + +template <KeyT KT> +class leaf_sub_items_t::Appender { + struct range_items_t { + index_t from; + index_t items; + }; + struct kv_item_t { + const full_key_t<KT>* p_key; + const onode_t* p_value; + }; + using var_t = std::variant<range_items_t, kv_item_t>; + + public: + Appender(NodeExtentMutable* p_mut, char* p_append) + : p_mut{p_mut}, p_append{p_append} { + } + + void append(const leaf_sub_items_t& src, index_t from, index_t items) { + assert(cnt <= APPENDER_LIMIT); + 
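+ // Presumably APPENDER_LIMIT == 3 covers the worst case of a split with an
+ // insertion: a copied range left of the insert point, the single inserted
+ // key-value, and a copied range right of it.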
assert(from <= src.keys()); + if (items == 0) { + return; + } + if (op_src) { + assert(*op_src == src); + } else { + op_src = src; + } + assert(from < src.keys()); + assert(from + items <= src.keys()); + appends[cnt] = range_items_t{from, items}; + ++cnt; + } + void append(const full_key_t<KT>& key, + const onode_t& value, const onode_t*& p_value) { + assert(pp_value == nullptr); + assert(cnt <= APPENDER_LIMIT); + appends[cnt] = kv_item_t{&key, &value}; + ++cnt; + pp_value = &p_value; + } + char* wrap(); + + private: + std::optional<leaf_sub_items_t> op_src; + const onode_t** pp_value = nullptr; + NodeExtentMutable* p_mut; + char* p_append; + var_t appends[APPENDER_LIMIT]; + index_t cnt = 0; +}; + +template <node_type_t> struct _sub_items_t; +template<> struct _sub_items_t<node_type_t::INTERNAL> { using type = internal_sub_items_t; }; +template<> struct _sub_items_t<node_type_t::LEAF> { using type = leaf_sub_items_t; }; +template <node_type_t NODE_TYPE> +using sub_items_t = typename _sub_items_t<NODE_TYPE>::type; + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/super.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/super.cc new file mode 100644 index 000000000..5a28f5097 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/super.cc @@ -0,0 +1,26 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#include "super.h" +#include "node.h" + +namespace crimson::os::seastore::onode { + +Ref<Node> RootNodeTrackerIsolated::get_root(Transaction& t) const { + auto iter = tracked_supers.find(&t); + if (iter == tracked_supers.end()) { + return nullptr; + } else { + return iter->second->get_p_root(); + } +} + +Ref<Node> RootNodeTrackerShared::get_root(Transaction&) const { + if (is_clean()) { + return nullptr; + } else { + return tracked_super->get_p_root(); + } +} + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/super.h b/src/crimson/os/seastore/onode_manager/staged-fltree/super.h new file mode 100644 index 000000000..5eefee9ff --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/super.h @@ -0,0 +1,143 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <memory> + +#include "crimson/common/type_helpers.h" + +#include "fwd.h" + +namespace crimson::os::seastore::onode { + +class Node; +class Super; + +/** + * RootNodeTracker + * + * An abstracted tracker to get the root node by Transaction. + */ +class RootNodeTracker { + public: + virtual ~RootNodeTracker() = default; + virtual bool is_clean() const = 0; + virtual Ref<Node> get_root(Transaction&) const = 0; + static RootNodeTrackerURef create(bool read_isolated); + protected: + RootNodeTracker() = default; + RootNodeTracker(const RootNodeTracker&) = delete; + RootNodeTracker(RootNodeTracker&&) = delete; + RootNodeTracker& operator=(const RootNodeTracker&) = delete; + RootNodeTracker& operator=(RootNodeTracker&&) = delete; + virtual void do_track_super(Transaction&, Super&) = 0; + virtual void do_untrack_super(Transaction&, Super&) = 0; + friend class Super; +}; + +/** + * Super + * + * The parent of root node. It contains the relationship between a Transaction + * and a root node address. 
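+ *
+ * A rough usage sketch (the call sites below are hypothetical and only
+ * illustrate the tracking relationship; the real flow lives in Node and the
+ * backend-specific Super implementations):
+ *
+ *   Super::URef super = ...;             // constructing a concrete Super
+ *                                        // registers it with the tracker
+ *                                        // for the owning Transaction
+ *   super->do_track_root(root_node);     // the root Node attaches itself
+ *   Node* p_root = super->get_p_root();  // later lookups resolve through it
+ *   // and RootNodeTracker::get_root(t) maps the Transaction back to p_root.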
+ */ +class Super { + public: + using URef = std::unique_ptr<Super>; + Super(const Super&) = delete; + Super(Super&&) = delete; + Super& operator=(const Super&) = delete; + Super& operator=(Super&&) = delete; + virtual ~Super() { + assert(tracked_root_node == nullptr); + tracker.do_untrack_super(t, *this); + } + + virtual laddr_t get_root_laddr() const = 0; + virtual void write_root_laddr(context_t, laddr_t) = 0; + + void do_track_root(Node& root) { + assert(tracked_root_node == nullptr); + tracked_root_node = &root; + } + void do_untrack_root(Node& root) { + assert(tracked_root_node == &root); + tracked_root_node = nullptr; + } + Node* get_p_root() const { + assert(tracked_root_node != nullptr); + return tracked_root_node; + } + + protected: + Super(Transaction& t, RootNodeTracker& tracker) + : t{t}, tracker{tracker} { + tracker.do_track_super(t, *this); + } + + private: + Transaction& t; + RootNodeTracker& tracker; + Node* tracked_root_node = nullptr; +}; + +/** + * RootNodeTrackerIsolated + * + * A concrete RootNodeTracker implementation which provides root node isolation + * between Transactions for Seastore backend. + */ +class RootNodeTrackerIsolated final : public RootNodeTracker { + public: + ~RootNodeTrackerIsolated() override { assert(is_clean()); } + protected: + bool is_clean() const override { + return tracked_supers.empty(); + } + void do_track_super(Transaction& t, Super& super) override { + assert(tracked_supers.find(&t) == tracked_supers.end()); + tracked_supers[&t] = &super; + } + void do_untrack_super(Transaction& t, Super& super) override { + [[maybe_unused]] auto removed = tracked_supers.erase(&t); + assert(removed); + } + ::Ref<Node> get_root(Transaction& t) const override; + std::map<Transaction*, Super*> tracked_supers; +}; + +/** + * RootNodeTrackerShared + * + * A concrete RootNodeTracker implementation which has no isolation between + * Transactions for Dummy backend. 
+ */ +class RootNodeTrackerShared final : public RootNodeTracker { + public: + ~RootNodeTrackerShared() override { assert(is_clean()); } + protected: + bool is_clean() const override { + return tracked_super == nullptr; + } + void do_track_super(Transaction&, Super& super) override { + assert(is_clean()); + tracked_super = &super; + } + void do_untrack_super(Transaction&, Super& super) override { + assert(tracked_super == &super); + tracked_super = nullptr; + } + ::Ref<Node> get_root(Transaction&) const override; + Super* tracked_super = nullptr; +}; + +inline RootNodeTrackerURef RootNodeTracker::create(bool read_isolated) { + if (read_isolated) { + return RootNodeTrackerURef(new RootNodeTrackerIsolated()); + } else { + return RootNodeTrackerURef(new RootNodeTrackerShared()); + } +} + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/tree.cc b/src/crimson/os/seastore/onode_manager/staged-fltree/tree.cc new file mode 100644 index 000000000..2c8c21652 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/tree.cc @@ -0,0 +1,235 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#include "tree.h" + +#include "node.h" +#include "node_extent_manager.h" +#include "stages/key_layout.h" +#include "super.h" + +namespace crimson::os::seastore::onode { + +using btree_ertr = Btree::btree_ertr; +template <class ValueT=void> +using btree_future = Btree::btree_future<ValueT>; +using Cursor = Btree::Cursor; + +Cursor::Cursor(Btree* p_tree, Ref<tree_cursor_t> _p_cursor) + : p_tree(p_tree) { + if (_p_cursor->is_end()) { + // no need to hold the leaf node + } else { + p_cursor = _p_cursor; + } +} +Cursor::Cursor(Btree* p_tree) : p_tree{p_tree} {} +Cursor::Cursor(const Cursor&) = default; +Cursor::Cursor(Cursor&&) noexcept = default; +Cursor& Cursor::operator=(const Cursor&) = default; +Cursor& Cursor::operator=(Cursor&&) = default; +Cursor::~Cursor() = default; + +bool Cursor::is_end() const { + if (p_cursor) { + assert(!p_cursor->is_end()); + return false; + } else { + return true; + } +} + +ghobject_t Cursor::get_ghobj() const { + return p_cursor->get_key_view().to_ghobj(); +} + +const onode_t* Cursor::value() const { + return p_cursor->get_p_value(); +} + +bool Cursor::operator==(const Cursor& x) const { + return p_cursor == x.p_cursor; +} + +Cursor& Cursor::operator++() { + // TODO + return *this; +} + +Cursor Cursor::operator++(int) { + Cursor tmp = *this; + ++*this; + return tmp; +} + +Cursor Cursor::make_end(Btree* p_tree) { + return {p_tree}; +} + +Btree::Btree(NodeExtentManagerURef&& _nm) + : nm{std::move(_nm)}, + root_tracker{RootNodeTracker::create(nm->is_read_isolated())} {} + +Btree::~Btree() { assert(root_tracker->is_clean()); } + +btree_future<> Btree::mkfs(Transaction& t) { + return Node::mkfs(get_context(t), *root_tracker); +} + +btree_future<Cursor> Btree::begin(Transaction& t) { + return get_root(t).safe_then([this, &t](auto root) { + return root->lookup_smallest(get_context(t)); + }).safe_then([this](auto cursor) { + return Cursor{this, cursor}; + }); +} + +btree_future<Cursor> Btree::last(Transaction& t) { + return get_root(t).safe_then([this, &t](auto root) { + return root->lookup_largest(get_context(t)); + }).safe_then([this](auto cursor) { + return Cursor(this, cursor); + }); +} + +Cursor Btree::end() { + return Cursor::make_end(this); +} + +btree_future<bool> +Btree::contains(Transaction& t, const ghobject_t& obj) { + return seastar::do_with( + full_key_t<KeyT::HOBJ>(obj), + [this, &t](auto& 
key) -> btree_future<bool> { + return get_root(t).safe_then([this, &t, &key](auto root) { + // TODO: improve lower_bound() + return root->lower_bound(get_context(t), key); + }).safe_then([](auto result) { + return MatchKindBS::EQ == result.match(); + }); + } + ); +} + +btree_future<Cursor> +Btree::find(Transaction& t, const ghobject_t& obj) { + return seastar::do_with( + full_key_t<KeyT::HOBJ>(obj), + [this, &t](auto& key) -> btree_future<Cursor> { + return get_root(t).safe_then([this, &t, &key](auto root) { + // TODO: improve lower_bound() + return root->lower_bound(get_context(t), key); + }).safe_then([this](auto result) { + if (result.match() == MatchKindBS::EQ) { + return Cursor(this, result.p_cursor); + } else { + return Cursor::make_end(this); + } + }); + } + ); +} + +btree_future<Cursor> +Btree::lower_bound(Transaction& t, const ghobject_t& obj) { + return seastar::do_with( + full_key_t<KeyT::HOBJ>(obj), + [this, &t](auto& key) -> btree_future<Cursor> { + return get_root(t).safe_then([this, &t, &key](auto root) { + return root->lower_bound(get_context(t), key); + }).safe_then([this](auto result) { + return Cursor(this, result.p_cursor); + }); + } + ); +} + +btree_future<std::pair<Cursor, bool>> +Btree::insert(Transaction& t, const ghobject_t& obj, const onode_t& value) { + return seastar::do_with( + full_key_t<KeyT::HOBJ>(obj), + [this, &t, &value](auto& key) -> btree_future<std::pair<Cursor, bool>> { + return get_root(t).safe_then([this, &t, &key, &value](auto root) { + return root->insert(get_context(t), key, value); + }).safe_then([this](auto ret) { + auto& [cursor, success] = ret; + return std::make_pair(Cursor(this, cursor), success); + }); + } + ); +} + +btree_future<size_t> Btree::erase(Transaction& t, const ghobject_t& obj) { + // TODO + return btree_ertr::make_ready_future<size_t>(0u); +} + +btree_future<Cursor> Btree::erase(Cursor& pos) { + // TODO + return btree_ertr::make_ready_future<Cursor>( + Cursor::make_end(this)); +} + +btree_future<Cursor> +Btree::erase(Cursor& first, Cursor& last) { + // TODO + return btree_ertr::make_ready_future<Cursor>( + Cursor::make_end(this)); +} + +btree_future<size_t> Btree::height(Transaction& t) { + return get_root(t).safe_then([](auto root) { + return size_t(root->level() + 1); + }); +} + +btree_future<tree_stats_t> Btree::get_stats_slow(Transaction& t) { + return get_root(t).safe_then([this, &t](auto root) { + unsigned height = root->level() + 1; + return root->get_tree_stats(get_context(t) + ).safe_then([height](auto stats) { + stats.height = height; + return btree_ertr::make_ready_future<tree_stats_t>(stats); + }); + }); +} + +std::ostream& Btree::dump(Transaction& t, std::ostream& os) { + auto root = root_tracker->get_root(t); + if (root) { + root->dump(os); + } else { + os << "empty tree!"; + } + return os; +} + +std::ostream& Btree::print(std::ostream& os) const { + return os << "BTree-" << *nm; +} + +btree_future<Ref<Node>> Btree::get_root(Transaction& t) { + auto root = root_tracker->get_root(t); + if (root) { + return btree_ertr::make_ready_future<Ref<Node>>(root); + } else { + return Node::load_root(get_context(t), *root_tracker); + } +} + +bool Btree::test_is_clean() const { + return root_tracker->is_clean(); +} + +btree_future<> Btree::test_clone_from( + Transaction& t, Transaction& t_from, Btree& from) { + // Note: assume the tree to clone is tracked correctly in memory. + // In some unit tests, parts of the tree are stubbed out that they + // should not be loaded from NodeExtentManager. 
+ return from.get_root(t_from + ).safe_then([this, &t](auto root_from) { + return root_from->test_clone_root(get_context(t), *root_tracker); + }); +} + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/tree.h b/src/crimson/os/seastore/onode_manager/staged-fltree/tree.h new file mode 100644 index 000000000..7ee618cb3 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/tree.h @@ -0,0 +1,119 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <ostream> + +#include "common/hobject.h" +#include "crimson/common/type_helpers.h" + +#include "fwd.h" +#include "tree_types.h" + +/** + * tree.h + * + * An example implementation to expose tree interfaces to users. The current + * interface design is based on: + * - ceph::os::Transaction::create/touch/remove() + * - ceph::ObjectStore::collection_list() + * - ceph::BlueStore::get_onode() + * - db->get_iterator(PREFIIX_OBJ) by ceph::BlueStore::fsck() + * + * TODO: Redesign the interfaces based on real onode manager requirements. + */ + +namespace crimson::os::seastore::onode { + +class Node; +class Btree { + public: + using btree_ertr = crimson::errorator< + crimson::ct_error::input_output_error, + crimson::ct_error::invarg, + crimson::ct_error::enoent, + crimson::ct_error::erange>; + template <class ValueT=void> + using btree_future = btree_ertr::future<ValueT>; + + Btree(NodeExtentManagerURef&& nm); + Btree(const Btree&) = delete; + Btree(Btree&&) = delete; + Btree& operator=(const Btree&) = delete; + Btree& operator=(Btree&&) = delete; + ~Btree(); + + btree_future<> mkfs(Transaction&); + + class Cursor; + // lookup + btree_future<Cursor> begin(Transaction&); + btree_future<Cursor> last(Transaction&); + Cursor end(); + btree_future<bool> contains(Transaction&, const ghobject_t&); + btree_future<Cursor> find(Transaction&, const ghobject_t&); + btree_future<Cursor> lower_bound(Transaction&, const ghobject_t&); + + // modifiers + // TODO: replace onode_t + btree_future<std::pair<Cursor, bool>> + insert(Transaction&, const ghobject_t&, const onode_t&); + btree_future<size_t> erase(Transaction&, const ghobject_t& key); + btree_future<Cursor> erase(Cursor& pos); + btree_future<Cursor> erase(Cursor& first, Cursor& last); + + // stats + btree_future<size_t> height(Transaction&); + btree_future<tree_stats_t> get_stats_slow(Transaction&); + std::ostream& dump(Transaction&, std::ostream&); + std::ostream& print(std::ostream& os) const; + + // test_only + bool test_is_clean() const; + btree_future<> test_clone_from(Transaction& t, Transaction& t_from, Btree& from); + + private: + context_t get_context(Transaction& t) { return {*nm, t}; } + btree_future<Ref<Node>> get_root(Transaction& t); + + NodeExtentManagerURef nm; + RootNodeTrackerURef root_tracker; + + friend class DummyChildPool; +}; +inline std::ostream& operator<<(std::ostream& os, const Btree& tree) { + return tree.print(os); +} + +class tree_cursor_t; +class Btree::Cursor { + public: + Cursor(const Cursor&); + Cursor(Cursor&&) noexcept; + Cursor& operator=(const Cursor&); + Cursor& operator=(Cursor&&); + ~Cursor(); + + bool is_end() const; + // XXX: return key_view_t to avoid unecessary ghobject_t constructions + ghobject_t get_ghobj() const; + const onode_t* value() const; + bool operator==(const Cursor& x) const; + bool operator!=(const Cursor& x) const { return !(*this == x); } + Cursor& operator++(); + Cursor operator++(int); + + private: + Cursor(Btree*, Ref<tree_cursor_t>); + 
Cursor(Btree*); + + static Cursor make_end(Btree*); + + Btree* p_tree; + Ref<tree_cursor_t> p_cursor; + + friend class Btree; +}; + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/tree_types.h b/src/crimson/os/seastore/onode_manager/staged-fltree/tree_types.h new file mode 100644 index 000000000..0bb345e0a --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/tree_types.h @@ -0,0 +1,125 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <ostream> + +namespace crimson::os::seastore::onode { + +// TODO: Redesign according to real requirement from onode manager +struct onode_t { + // onode should be smaller than a node + uint16_t size; // address up to 64 KiB sized node + uint16_t id; + // omap, extent_map, inline data + + bool operator==(const onode_t& o) const { return size == o.size && id == o.id; } + bool operator!=(const onode_t& o) const { return !(*this == o); } + + void encode(ceph::bufferlist& encoded) const { + ceph::encode(size, encoded); + ceph::encode(id, encoded); + } + static onode_t decode(ceph::bufferlist::const_iterator& delta) { + uint16_t size; + ceph::decode(size, delta); + uint16_t id; + ceph::decode(id, delta); + onode_t ret{size, id}; + return ret; + } + static void validate_tail_magic(const onode_t& onode) { + auto p_target = (const char*)&onode + onode.size - sizeof(uint32_t); + uint32_t target; + std::memcpy(&target, p_target, sizeof(uint32_t)); + ceph_assert(target == onode.size * 137); + } + static std::unique_ptr<char[]> allocate(const onode_t& config) { + ceph_assert(config.size >= sizeof(onode_t) + sizeof(uint32_t)); + + auto ret = std::make_unique<char[]>(config.size); + char* p_mem = ret.get(); + auto p_onode = reinterpret_cast<onode_t*>(p_mem); + *p_onode = config; + + uint32_t tail_magic = config.size * 137; + p_mem += (config.size - sizeof(uint32_t)); + std::memcpy(p_mem, &tail_magic, sizeof(uint32_t)); + validate_tail_magic(*p_onode); + + return ret; + } +} __attribute__((packed)); +inline std::ostream& operator<<(std::ostream& os, const onode_t& node) { + return os << "onode(" << node.id << ", " << node.size << "B)"; +} + +struct tree_stats_t { + size_t size_persistent_leaf = 0; + size_t size_persistent_internal = 0; + size_t size_filled_leaf = 0; + size_t size_filled_internal = 0; + size_t size_logical_leaf = 0; + size_t size_logical_internal = 0; + size_t size_overhead_leaf = 0; + size_t size_overhead_internal = 0; + size_t size_value_leaf = 0; + size_t size_value_internal = 0; + unsigned num_kvs_leaf = 0; + unsigned num_kvs_internal = 0; + unsigned num_nodes_leaf = 0; + unsigned num_nodes_internal = 0; + unsigned height = 0; + + size_t size_persistent() const { + return size_persistent_leaf + size_persistent_internal; } + size_t size_filled() const { + return size_filled_leaf + size_filled_internal; } + size_t size_logical() const { + return size_logical_leaf + size_logical_internal; } + size_t size_overhead() const { + return size_overhead_leaf + size_overhead_internal; } + size_t size_value() const { + return size_value_leaf + size_value_internal; } + unsigned num_kvs() const { + return num_kvs_leaf + num_kvs_internal; } + unsigned num_nodes() const { + return num_nodes_leaf + num_nodes_internal; } + + double ratio_fullness() const { + return (double)size_filled() / size_persistent(); } + double ratio_key_compression() const { + return (double)(size_filled() - size_value()) / (size_logical() - size_value()); } + double 
ratio_overhead() const { + return (double)size_overhead() / size_filled(); } + double ratio_keys_leaf() const { + return (double)num_kvs_leaf / num_kvs(); } + double ratio_nodes_leaf() const { + return (double)num_nodes_leaf / num_nodes(); } + double ratio_filled_leaf() const { + return (double)size_filled_leaf / size_filled(); } +}; +inline std::ostream& operator<<(std::ostream& os, const tree_stats_t& stats) { + os << "Tree stats:" + << "\n height = " << stats.height + << "\n num values = " << stats.num_kvs_leaf + << "\n num nodes = " << stats.num_nodes() + << " (leaf=" << stats.num_nodes_leaf + << ", internal=" << stats.num_nodes_internal << ")" + << "\n size persistent = " << stats.size_persistent() << "B" + << "\n size filled = " << stats.size_filled() << "B" + << " (value=" << stats.size_value_leaf << "B" + << ", rest=" << stats.size_filled() - stats.size_value_leaf << "B)" + << "\n size logical = " << stats.size_logical() << "B" + << "\n size overhead = " << stats.size_overhead() << "B" + << "\n ratio fullness = " << stats.ratio_fullness() + << "\n ratio keys leaf = " << stats.ratio_keys_leaf() + << "\n ratio nodes leaf = " << stats.ratio_nodes_leaf() + << "\n ratio filled leaf = " << stats.ratio_filled_leaf() + << "\n ratio key compression = " << stats.ratio_key_compression(); + assert(stats.num_kvs_internal + 1 == stats.num_nodes()); + return os; +} + +} diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/tree_utils.h b/src/crimson/os/seastore/onode_manager/staged-fltree/tree_utils.h new file mode 100644 index 000000000..536052003 --- /dev/null +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/tree_utils.h @@ -0,0 +1,333 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <cassert> +#include <cstring> +#include <random> +#include <string> +#include <sstream> +#include <utility> +#include <vector> + +#include "crimson/common/log.h" +#include "stages/key_layout.h" +#include "tree.h" + +/** + * tree_utils.h + * + * Contains shared logic for unit tests and perf tool. 
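To make the relationship between the KVPool and TreeBuilder helpers declared below concrete, here is a rough driver sketch. It is illustrative only: constructing the NodeExtentManagerURef is not shown, and the pool, builder and transaction are assumed to be kept alive (e.g. as test fixture members) until the future resolves.

  // Assumed available: NodeExtentManagerURef nm; Transaction& t;
  KVPool kvs({8, 11, 64, 256},        // candidate ns/oid string lengths
             {8, 16, 128, 512},       // candidate onode sizes
             {2, 5}, {2, 5}, {2, 5}); // index ranges for the key components
  TreeBuilder<true> builder(kvs, std::move(nm));
  auto fut = builder.bootstrap(t
  ).safe_then([&] { return builder.insert(t); }
  ).safe_then([&] { return builder.get_stats(t); }
  ).safe_then([&] { return builder.validate(t); });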
+ */ + +namespace crimson::os::seastore::onode { + +class Onodes { + public: + Onodes(size_t n) { + for (size_t i = 1; i <= n; ++i) { + auto p_onode = &create(i * 8); + onodes.push_back(p_onode); + } + } + + Onodes(std::vector<size_t> sizes) { + for (auto& size : sizes) { + auto p_onode = &create(size); + onodes.push_back(p_onode); + } + } + + ~Onodes() = default; + + const onode_t& create(size_t size) { + ceph_assert(size <= std::numeric_limits<uint16_t>::max()); + onode_t config{static_cast<uint16_t>(size), id++}; + auto onode = onode_t::allocate(config); + auto p_onode = onode.get(); + tracked_onodes.push_back(std::move(onode)); + return *reinterpret_cast<onode_t*>(p_onode); + } + + const onode_t& pick() const { + auto index = rd() % onodes.size(); + return *onodes[index]; + } + + const onode_t& pick_largest() const { + return *onodes[onodes.size() - 1]; + } + + static void validate_cursor( + const Btree::Cursor& cursor, const ghobject_t& key, const onode_t& onode) { + ceph_assert(!cursor.is_end()); + ceph_assert(cursor.get_ghobj() == key); + ceph_assert(cursor.value()); + ceph_assert(cursor.value() != &onode); + ceph_assert(*cursor.value() == onode); + onode_t::validate_tail_magic(*cursor.value()); + } + + private: + uint16_t id = 0; + mutable std::random_device rd; + std::vector<const onode_t*> onodes; + std::vector<std::unique_ptr<char[]>> tracked_onodes; +}; + +class KVPool { + struct kv_conf_t { + unsigned index2; + unsigned index1; + unsigned index0; + size_t ns_size; + size_t oid_size; + const onode_t* p_value; + + ghobject_t get_ghobj() const { + assert(index1 < 10); + std::ostringstream os_ns; + os_ns << "ns" << index1; + unsigned current_size = (unsigned)os_ns.tellp(); + assert(ns_size >= current_size); + os_ns << std::string(ns_size - current_size, '_'); + + std::ostringstream os_oid; + os_oid << "oid" << index1; + current_size = (unsigned)os_oid.tellp(); + assert(oid_size >= current_size); + os_oid << std::string(oid_size - current_size, '_'); + + return ghobject_t(shard_id_t(index2), index2, index2, + os_ns.str(), os_oid.str(), index0, index0); + } + }; + using kv_vector_t = std::vector<kv_conf_t>; + + public: + using kv_t = std::pair<ghobject_t, const onode_t*>; + + KVPool(const std::vector<size_t>& str_sizes, + const std::vector<size_t>& onode_sizes, + const std::pair<unsigned, unsigned>& range2, + const std::pair<unsigned, unsigned>& range1, + const std::pair<unsigned, unsigned>& range0) + : str_sizes{str_sizes}, onodes{onode_sizes} { + ceph_assert(range2.first < range2.second); + ceph_assert(range2.second - 1 <= (unsigned)std::numeric_limits<shard_t>::max()); + ceph_assert(range2.second - 1 <= std::numeric_limits<crush_hash_t>::max()); + ceph_assert(range1.first < range1.second); + ceph_assert(range1.second - 1 <= 9); + ceph_assert(range0.first < range0.second); + std::random_device rd; + for (unsigned i = range2.first; i < range2.second; ++i) { + for (unsigned j = range1.first; j < range1.second; ++j) { + auto ns_size = (unsigned)str_sizes[rd() % str_sizes.size()]; + auto oid_size = (unsigned)str_sizes[rd() % str_sizes.size()]; + for (unsigned k = range0.first; k < range0.second; ++k) { + kvs.emplace_back(kv_conf_t{i, j, k, ns_size, oid_size, &onodes.pick()}); + } + } + } + random_kvs = kvs; + std::random_shuffle(random_kvs.begin(), random_kvs.end()); + } + + class iterator_t { + public: + iterator_t() = default; + iterator_t(const iterator_t&) = default; + iterator_t(iterator_t&&) = default; + iterator_t& operator=(const iterator_t&) = default; + iterator_t& 
operator=(iterator_t&&) = default; + + kv_t get_kv() const { + assert(!is_end()); + auto& conf = (*p_kvs)[i]; + return std::make_pair(conf.get_ghobj(), conf.p_value); + } + bool is_end() const { return !p_kvs || i >= p_kvs->size(); } + size_t index() const { return i; } + + iterator_t& operator++() { + assert(!is_end()); + ++i; + return *this; + } + + iterator_t operator++(int) { + iterator_t tmp = *this; + ++*this; + return tmp; + } + + private: + iterator_t(const kv_vector_t& kvs) : p_kvs{&kvs} {} + + const kv_vector_t* p_kvs = nullptr; + size_t i = 0; + friend class KVPool; + }; + + iterator_t begin() const { + return iterator_t(kvs); + } + + iterator_t random_begin() const { + return iterator_t(random_kvs); + } + + size_t size() const { + return kvs.size(); + } + + private: + std::vector<size_t> str_sizes; + Onodes onodes; + kv_vector_t kvs; + kv_vector_t random_kvs; +}; + +template <bool TRACK> +class TreeBuilder { + public: + using ertr = Btree::btree_ertr; + template <class ValueT=void> + using future = ertr::future<ValueT>; + + TreeBuilder(KVPool& kvs, NodeExtentManagerURef&& nm) + : kvs{kvs} { + tree.emplace(std::move(nm)); + } + + future<> bootstrap(Transaction& t) { + std::ostringstream oss; +#ifndef NDEBUG + oss << "debug=on, "; +#else + oss << "debug=off, "; +#endif +#ifdef UNIT_TESTS_BUILT + oss << "UNIT_TEST_BUILT=on, "; +#else + oss << "UNIT_TEST_BUILT=off, "; +#endif + if constexpr (TRACK) { + oss << "track=on, "; + } else { + oss << "track=off, "; + } + oss << *tree; + logger().warn("TreeBuilder: {}, bootstrapping ...", oss.str()); + return tree->mkfs(t); + } + + future<> insert(Transaction& t) { + kv_iter = kvs.random_begin(); + auto cursors = seastar::make_lw_shared<std::vector<Btree::Cursor>>(); + logger().warn("start inserting {} kvs ...", kvs.size()); + auto start_time = mono_clock::now(); + return crimson::do_until([&t, this, cursors]() -> future<bool> { + if (kv_iter.is_end()) { + return ertr::make_ready_future<bool>(true); + } + auto [key, p_value] = kv_iter.get_kv(); + logger().debug("[{}] {} -> {}", kv_iter.index(), key_hobj_t{key}, *p_value); + return tree->insert(t, key, *p_value + ).safe_then([&t, this, cursors](auto ret) { + auto& [cursor, success] = ret; + assert(success == true); + if constexpr (TRACK) { + cursors->emplace_back(cursor); + } +#ifndef NDEBUG + auto [key, p_value] = kv_iter.get_kv(); + Onodes::validate_cursor(cursor, key, *p_value); + return tree->lower_bound(t, key).safe_then([this, cursor](auto cursor_) { + auto [key, p_value] = kv_iter.get_kv(); + ceph_assert(cursor_.get_ghobj() == key); + ceph_assert(cursor_.value() == cursor.value()); + ++kv_iter; + return ertr::make_ready_future<bool>(false); + }); +#else + ++kv_iter; + return ertr::make_ready_future<bool>(false); +#endif + }); + }).safe_then([&t, this, start_time, cursors] { + std::chrono::duration<double> duration = mono_clock::now() - start_time; + logger().warn("Insert done! 
{}s", duration.count()); + if (!cursors->empty()) { + logger().info("Verifing tracked cursors ..."); + kv_iter = kvs.random_begin(); + return seastar::do_with( + cursors->begin(), [&t, this, cursors](auto& c_iter) { + return crimson::do_until([&t, this, &c_iter, cursors]() -> future<bool> { + if (kv_iter.is_end()) { + logger().info("Verify done!"); + return ertr::make_ready_future<bool>(true); + } + assert(c_iter != cursors->end()); + auto [k, v] = kv_iter.get_kv(); + // validate values in tree keep intact + return tree->lower_bound(t, k).safe_then([this, &c_iter](auto cursor) { + auto [k, v] = kv_iter.get_kv(); + Onodes::validate_cursor(cursor, k, *v); + // validate values in cursors keep intact + Onodes::validate_cursor(*c_iter, k, *v); + ++kv_iter; + ++c_iter; + return ertr::make_ready_future<bool>(false); + }); + }); + }); + } else { + return ertr::now(); + } + }); + } + + future<> get_stats(Transaction& t) { + return tree->get_stats_slow(t + ).safe_then([this](auto stats) { + logger().warn("{}", stats); + }); + } + + void reload(NodeExtentManagerURef&& nm) { + tree.emplace(std::move(nm)); + } + + future<> validate(Transaction& t) { + logger().info("Verifing insertion ..."); + return seastar::do_with( + kvs.begin(), [&t, this] (auto& kvs_iter) { + return crimson::do_until([&t, this, &kvs_iter]() -> future<bool> { + if (kvs_iter.is_end()) { + logger().info("Verify done!"); + return ertr::make_ready_future<bool>(true); + } + auto [k, v] = kvs_iter.get_kv(); + return tree->lower_bound(t, k + ).safe_then([&kvs_iter, k=k, v=v] (auto cursor) { + Onodes::validate_cursor(cursor, k, *v); + ++kvs_iter; + return ertr::make_ready_future<bool>(false); + }); + }); + }); + } + + private: + static seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_filestore); + } + + KVPool& kvs; + std::optional<Btree> tree; + KVPool::iterator_t kv_iter; +}; + +} diff --git a/src/crimson/os/seastore/root_block.h b/src/crimson/os/seastore/root_block.h new file mode 100644 index 000000000..4a5024caa --- /dev/null +++ b/src/crimson/os/seastore/root_block.h @@ -0,0 +1,109 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "crimson/os/seastore/cached_extent.h" + +namespace crimson::os::seastore { + +/** + * root_t + * + * Contains information required to find metadata roots. + * TODO: generalize this to permit more than one lba_manager implementation + */ +struct __attribute__((aligned(8), packed)) root_t { + depth_t lba_depth = 0; + depth_t segment_depth = 0; + paddr_t lba_root_addr; + paddr_t segment_root; + laddr_t onode_root = L_ADDR_NULL; + + void adjust_addrs_from_base(paddr_t base) { + if (lba_root_addr.is_relative()) { + lba_root_addr = base.add_record_relative(lba_root_addr); + } + } +}; + +/** + * RootBlock + * + * Holds the physical addresses of all metadata roots. + * In-memory values may be + * - absolute: reference to block which predates the current transaction + * - record_relative: reference to block updated in this transaction + * if !pending() + * + * Journal replay only considers deltas and must always discover the most + * recent value for the RootBlock. Because the contents of root_t above are + * very small, it's simplest to stash the entire root_t value into the delta + * and never actually write the RootBlock to a physical location (safe since + * nothing references the location of the RootBlock). 
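Concretely, "stashing the entire root_t value into the delta" is just a byte copy of the packed struct into a bufferlist and back, which is what get_delta() and apply_delta_and_adjust_crc() below do. A stand-alone sketch of that round trip, illustrative only, with the crc handling and base-address adjustment left out:

  #include <cassert>
  #include <cstring>
  #include "include/buffer.h"

  // Serialize a trivially-copyable struct into a bufferlist ...
  template <typename T>
  ceph::bufferlist pod_to_bl(const T& v) {
    ceph::buffer::ptr bptr(sizeof(T));
    std::memcpy(bptr.c_str(), &v, sizeof(T));
    ceph::bufferlist bl;
    bl.append(bptr);
    return bl;
  }

  // ... and copy it back out, making the payload contiguous first.
  template <typename T>
  T pod_from_bl(const ceph::bufferlist& in) {
    assert(in.length() == sizeof(T));
    ceph::bufferlist bl = in;
    bl.rebuild();
    T out;
    std::memcpy(&out, bl.front().c_str(), sizeof(T));
    return out;
  }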
+ * + * As a result, Cache treats the root differently in a few ways including: + * - state will only ever be DIRTY or MUTATION_PENDING + * - RootBlock's never show up in the transaction fresh or dirty lists -- + * there's a special Transaction::root member for when the root needs to + * be mutated. + * + * TODO: Journal trimming will need to be aware of the most recent RootBlock + * delta location, or, even easier, just always write one out with the + * mutation which changes the journal trim bound. + */ +struct RootBlock : CachedExtent { + constexpr static segment_off_t SIZE = 4<<10; + using Ref = TCachedExtentRef<RootBlock>; + + root_t root; + + RootBlock() : CachedExtent(0) {} + + RootBlock(const RootBlock &rhs) = default; + + CachedExtentRef duplicate_for_write() final { + return CachedExtentRef(new RootBlock(*this)); + }; + + static constexpr extent_types_t TYPE = extent_types_t::ROOT; + extent_types_t get_type() const final { + return extent_types_t::ROOT; + } + + /// dumps root as delta + ceph::bufferlist get_delta() final { + ceph::bufferlist bl; + ceph::buffer::ptr bptr(sizeof(root_t)); + *reinterpret_cast<root_t*>(bptr.c_str()) = root; + bl.append(bptr); + return bl; + } + + /// overwrites root + void apply_delta_and_adjust_crc(paddr_t base, const ceph::bufferlist &_bl) final { + assert(_bl.length() == sizeof(root_t)); + ceph::bufferlist bl = _bl; + bl.rebuild(); + root = *reinterpret_cast<const root_t*>(bl.front().c_str()); + root.adjust_addrs_from_base(base); + } + + /// Patches relative addrs in memory based on record commit addr + void on_delta_write(paddr_t record_block_offset) final { + root.adjust_addrs_from_base(record_block_offset); + } + + complete_load_ertr::future<> complete_load() final { + ceph_abort_msg("Root is only written via deltas"); + } + + void on_initial_write() final { + ceph_abort_msg("Root is only written via deltas"); + } + + root_t &get_root() { return root; } +}; +using RootBlockRef = RootBlock::Ref; + +} diff --git a/src/crimson/os/seastore/seastore.cc b/src/crimson/os/seastore/seastore.cc new file mode 100644 index 000000000..50c148cea --- /dev/null +++ b/src/crimson/os/seastore/seastore.cc @@ -0,0 +1,532 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "seastore.h" + +#include <boost/algorithm/string/trim.hpp> +#include <fmt/format.h> +#include <fmt/ostream.h> + +#include "common/safe_io.h" +#include "os/Transaction.h" + +#include "crimson/common/buffer_io.h" +#include "crimson/common/config_proxy.h" + +#include "crimson/os/futurized_collection.h" + +#include "crimson/os/seastore/segment_manager/ephemeral.h" +#include "crimson/os/seastore/transaction_manager.h" +#include "crimson/os/seastore/onode_manager.h" +#include "crimson/os/seastore/cache.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_filestore); + } +} + +using crimson::common::local_conf; + +namespace crimson::os::seastore { + +struct SeastoreCollection final : public FuturizedCollection { + template <typename... T> + SeastoreCollection(T&&... args) : + FuturizedCollection(std::forward<T>(args)...) 
{} +}; + +SeaStore::SeaStore(const std::string& path) + : segment_manager(segment_manager::create_test_ephemeral() /* TODO */), + segment_cleaner( + std::make_unique<SegmentCleaner>( + SegmentCleaner::config_t::default_from_segment_manager( + *segment_manager))), + cache(std::make_unique<Cache>(*segment_manager)), + journal(new Journal(*segment_manager)), + lba_manager( + lba_manager::create_lba_manager(*segment_manager, *cache)), + transaction_manager( + new TransactionManager( + *segment_manager, + *segment_cleaner, + *journal, + *cache, + *lba_manager)), + onode_manager(onode_manager::create_ephemeral()) +{ + journal->set_segment_provider(&*segment_cleaner); + segment_cleaner->set_extent_callback(&*transaction_manager); +} + +SeaStore::~SeaStore() = default; + +seastar::future<> SeaStore::stop() +{ + return seastar::now(); +} + +seastar::future<> SeaStore::mount() +{ + return seastar::now(); +} + +seastar::future<> SeaStore::umount() +{ + return seastar::now(); +} + +seastar::future<> SeaStore::mkfs(uuid_d new_osd_fsid) +{ + return seastar::now(); +} + +seastar::future<store_statfs_t> SeaStore::stat() const +{ + logger().debug("{}", __func__); + store_statfs_t st; + return seastar::make_ready_future<store_statfs_t>(st); +} + +seastar::future<std::tuple<std::vector<ghobject_t>, ghobject_t>> +SeaStore::list_objects(CollectionRef ch, + const ghobject_t& start, + const ghobject_t& end, + uint64_t limit) const +{ + return seastar::make_ready_future<std::tuple<std::vector<ghobject_t>, ghobject_t>>( + std::make_tuple(std::vector<ghobject_t>(), end)); +} + +seastar::future<CollectionRef> SeaStore::create_new_collection(const coll_t& cid) +{ + auto c = _get_collection(cid); + return seastar::make_ready_future<CollectionRef>(c); +} + +seastar::future<CollectionRef> SeaStore::open_collection(const coll_t& cid) +{ + return seastar::make_ready_future<CollectionRef>(_get_collection(cid)); +} + +seastar::future<std::vector<coll_t>> SeaStore::list_collections() +{ + return seastar::make_ready_future<std::vector<coll_t>>(); +} + +SeaStore::read_errorator::future<ceph::bufferlist> SeaStore::read( + CollectionRef ch, + const ghobject_t& oid, + uint64_t offset, + size_t len, + uint32_t op_flags) +{ + return read_errorator::make_ready_future<ceph::bufferlist>(); +} + +SeaStore::read_errorator::future<ceph::bufferlist> SeaStore::readv( + CollectionRef ch, + const ghobject_t& oid, + interval_set<uint64_t>& m, + uint32_t op_flags) +{ + return read_errorator::make_ready_future<ceph::bufferlist>(); +} + +SeaStore::get_attr_errorator::future<ceph::bufferptr> SeaStore::get_attr( + CollectionRef ch, + const ghobject_t& oid, + std::string_view name) const +{ + auto c = static_cast<SeastoreCollection*>(ch.get()); + logger().debug("{} {} {}", + __func__, c->get_cid(), oid); + return crimson::ct_error::enoent::make(); +} + +SeaStore::get_attrs_ertr::future<SeaStore::attrs_t> SeaStore::get_attrs( + CollectionRef ch, + const ghobject_t& oid) +{ + auto c = static_cast<SeastoreCollection*>(ch.get()); + logger().debug("{} {} {}", + __func__, c->get_cid(), oid); + return crimson::ct_error::enoent::make(); +} + +seastar::future<struct stat> stat( + CollectionRef c, + const ghobject_t& oid) +{ + return seastar::make_ready_future<struct stat>(); +} + + +seastar::future<struct stat> SeaStore::stat( + CollectionRef c, + const ghobject_t& oid) +{ + struct stat st; + return seastar::make_ready_future<struct stat>(st); +} + +auto +SeaStore::omap_get_header( + CollectionRef c, + const ghobject_t& oid) + -> 
read_errorator::future<bufferlist> +{ + return seastar::make_ready_future<bufferlist>(); +} + +auto +SeaStore::omap_get_values( + CollectionRef ch, + const ghobject_t& oid, + const omap_keys_t& keys) + -> read_errorator::future<omap_values_t> +{ + auto c = static_cast<SeastoreCollection*>(ch.get()); + logger().debug("{} {} {}", + __func__, c->get_cid(), oid); + return seastar::make_ready_future<omap_values_t>(); +} + +auto +SeaStore::omap_get_values( + CollectionRef ch, + const ghobject_t &oid, + const std::optional<string> &start) + -> read_errorator::future<std::tuple<bool, SeaStore::omap_values_t>> +{ + auto c = static_cast<SeastoreCollection*>(ch.get()); + logger().debug( + "{} {} {}", + __func__, c->get_cid(), oid); + return seastar::make_ready_future<std::tuple<bool, omap_values_t>>( + std::make_tuple(false, omap_values_t())); +} + +seastar::future<FuturizedStore::OmapIteratorRef> get_omap_iterator( + CollectionRef ch, + const ghobject_t& oid) +{ + return seastar::make_ready_future<FuturizedStore::OmapIteratorRef>(); +} + +seastar::future<std::map<uint64_t, uint64_t>> fiemap( + CollectionRef ch, + const ghobject_t& oid, + uint64_t off, + uint64_t len) +{ + return seastar::make_ready_future<std::map<uint64_t, uint64_t>>(); +} + +seastar::future<> SeaStore::do_transaction( + CollectionRef _ch, + ceph::os::Transaction&& _t) +{ + return seastar::do_with( + _t.begin(), + transaction_manager->create_transaction(), + std::vector<OnodeRef>(), + std::move(_t), + std::move(_ch), + [this](auto &iter, auto &trans, auto &onodes, auto &t, auto &ch) { + return onode_manager->get_or_create_onodes( + *trans, iter.get_objects()).safe_then( + [this, &iter, &trans, &onodes, &t, &ch](auto &&read_onodes) { + onodes = std::move(read_onodes); + return seastar::do_until( + [&iter]() { return iter.have_op(); }, + [this, &iter, &trans, &onodes, &t, &ch]() { + return _do_transaction_step(trans, ch, onodes, iter).safe_then( + [this, &trans] { + return transaction_manager->submit_transaction(std::move(trans)); + }).handle_error( + // TODO: add errorator::do_until + crimson::ct_error::eagain::handle([]() { + // TODO retry + }), + write_ertr::all_same_way([&t](auto e) { + logger().error(" transaction dump:\n"); + JSONFormatter f(true); + f.open_object_section("transaction"); + t.dump(&f); + f.close_section(); + std::stringstream str; + f.flush(str); + logger().error("{}", str.str()); + abort(); + })); + }); + }).safe_then([this, &trans, &onodes]() { + return onode_manager->write_dirty(*trans, onodes); + }).safe_then([]() { + // TODO: complete transaction! 
+ return; + }).handle_error( + write_ertr::all_same_way([&t](auto e) { + logger().error(" transaction dump:\n"); + JSONFormatter f(true); + f.open_object_section("transaction"); + t.dump(&f); + f.close_section(); + std::stringstream str; + f.flush(str); + logger().error("{}", str.str()); + abort(); + })).then([&t]() { + for (auto i : { + t.get_on_applied(), + t.get_on_commit(), + t.get_on_applied_sync()}) { + if (i) { + i->complete(0); + } + } + }); + }); +} + +SeaStore::write_ertr::future<> SeaStore::_do_transaction_step( + TransactionRef &trans, + CollectionRef &col, + std::vector<OnodeRef> &onodes, + ceph::os::Transaction::iterator &i) +{ + auto get_onode = [&onodes](size_t i) -> OnodeRef& { + ceph_assert(i < onodes.size()); + return onodes[i]; + }; + + using ceph::os::Transaction; + try { + switch (auto op = i.decode_op(); op->op) { + case Transaction::OP_NOP: + return write_ertr::now(); + case Transaction::OP_REMOVE: + { + return _remove(trans, get_onode(op->oid)); + } + break; + case Transaction::OP_TOUCH: + { + return _touch(trans, get_onode(op->oid)); + } + break; + case Transaction::OP_WRITE: + { + uint64_t off = op->off; + uint64_t len = op->len; + uint32_t fadvise_flags = i.get_fadvise_flags(); + ceph::bufferlist bl; + i.decode_bl(bl); + return _write(trans, get_onode(op->oid), off, len, bl, fadvise_flags); + } + break; + case Transaction::OP_TRUNCATE: + { + uint64_t off = op->off; + return _truncate(trans, get_onode(op->oid), off); + } + break; + case Transaction::OP_SETATTR: + { + std::string name = i.decode_string(); + ceph::bufferlist bl; + i.decode_bl(bl); + std::map<std::string, bufferptr> to_set; + to_set[name] = bufferptr(bl.c_str(), bl.length()); + return _setattrs(trans, get_onode(op->oid), to_set); + } + break; + case Transaction::OP_MKCOLL: + { + coll_t cid = i.get_cid(op->cid); + return _create_collection(trans, cid, op->split_bits); + } + break; + case Transaction::OP_OMAP_SETKEYS: + { + std::map<std::string, ceph::bufferlist> aset; + i.decode_attrset(aset); + return _omap_set_values(trans, get_onode(op->oid), std::move(aset)); + } + break; + case Transaction::OP_OMAP_SETHEADER: + { + ceph::bufferlist bl; + i.decode_bl(bl); + return _omap_set_header(trans, get_onode(op->oid), bl); + } + break; + case Transaction::OP_OMAP_RMKEYS: + { + omap_keys_t keys; + i.decode_keyset(keys); + return _omap_rmkeys(trans, get_onode(op->oid), keys); + } + break; + case Transaction::OP_OMAP_RMKEYRANGE: + { + string first, last; + first = i.decode_string(); + last = i.decode_string(); + return _omap_rmkeyrange(trans, get_onode(op->oid), first, last); + } + break; + case Transaction::OP_COLL_HINT: + { + ceph::bufferlist hint; + i.decode_bl(hint); + return write_ertr::now(); + } + default: + logger().error("bad op {}", static_cast<unsigned>(op->op)); + return crimson::ct_error::input_output_error::make(); + } + } catch (std::exception &e) { + logger().error("{} got exception {}", __func__, e); + return crimson::ct_error::input_output_error::make(); + } +} + +SeaStore::write_ertr::future<> SeaStore::_remove( + TransactionRef &trans, + OnodeRef &onode) +{ + logger().debug("{} onode={}", + __func__, *onode); + return write_ertr::now(); +} + +SeaStore::write_ertr::future<> SeaStore::_touch( + TransactionRef &trans, + OnodeRef &onode) +{ + logger().debug("{} onode={}", + __func__, *onode); + return write_ertr::now(); +} + +SeaStore::write_ertr::future<> SeaStore::_write( + TransactionRef &trans, + OnodeRef &onode, + uint64_t offset, size_t len, const ceph::bufferlist& bl, + uint32_t 
fadvise_flags) +{ + logger().debug("{}: {} {} ~ {}", + __func__, *onode, offset, len); + assert(len == bl.length()); + +/* + return onode_manager->get_or_create_onode(cid, oid).safe_then([=, &bl](auto ref) { + return; + }).handle_error( + crimson::ct_error::enoent::handle([]() { + return; + }), + OnodeManager::open_ertr::pass_further{} + ); + */ + return write_ertr::now(); +} + +SeaStore::write_ertr::future<> SeaStore::_omap_set_values( + TransactionRef &trans, + OnodeRef &onode, + std::map<std::string, ceph::bufferlist> &&aset) +{ + logger().debug( + "{}: {} {} keys", + __func__, *onode, aset.size()); + + return write_ertr::now(); +} + +SeaStore::write_ertr::future<> SeaStore::_omap_set_header( + TransactionRef &trans, + OnodeRef &onode, + const ceph::bufferlist &header) +{ + logger().debug( + "{}: {} {} bytes", + __func__, *onode, header.length()); + return write_ertr::now(); +} + +SeaStore::write_ertr::future<> SeaStore::_omap_rmkeys( + TransactionRef &trans, + OnodeRef &onode, + const omap_keys_t& aset) +{ + logger().debug( + "{} {} {} keys", + __func__, *onode, aset.size()); + return write_ertr::now(); +} + +SeaStore::write_ertr::future<> SeaStore::_omap_rmkeyrange( + TransactionRef &trans, + OnodeRef &onode, + const std::string &first, + const std::string &last) +{ + logger().debug( + "{} {} first={} last={}", + __func__, *onode, first, last); + return write_ertr::now(); +} + +SeaStore::write_ertr::future<> SeaStore::_truncate( + TransactionRef &trans, + OnodeRef &onode, + uint64_t size) +{ + logger().debug("{} onode={} size={}", + __func__, *onode, size); + return write_ertr::now(); +} + +SeaStore::write_ertr::future<> SeaStore::_setattrs( + TransactionRef &trans, + OnodeRef &onode, + std::map<std::string,bufferptr>& aset) +{ + logger().debug("{} onode={}", + __func__, *onode); + return write_ertr::now(); +} + +SeaStore::write_ertr::future<> SeaStore::_create_collection( + TransactionRef &trans, + const coll_t& cid, int bits) +{ + return write_ertr::now(); +} + +boost::intrusive_ptr<SeastoreCollection> SeaStore::_get_collection(const coll_t& cid) +{ + return new SeastoreCollection{cid}; +} + +seastar::future<> SeaStore::write_meta(const std::string& key, + const std::string& value) +{ + return seastar::make_ready_future<>(); +} + +seastar::future<std::tuple<int, std::string>> SeaStore::read_meta(const std::string& key) +{ + return seastar::make_ready_future<std::tuple<int, std::string>>( + std::make_tuple(0, ""s)); +} + +uuid_d SeaStore::get_fsid() const +{ + return osd_fsid; +} + +} diff --git a/src/crimson/os/seastore/seastore.h b/src/crimson/os/seastore/seastore.h new file mode 100644 index 000000000..798442c34 --- /dev/null +++ b/src/crimson/os/seastore/seastore.h @@ -0,0 +1,181 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <string> +#include <unordered_map> +#include <map> +#include <typeinfo> +#include <vector> + +#include <optional> +#include <seastar/core/future.hh> + +#include "osd/osd_types.h" +#include "include/uuid.h" + +#include "os/Transaction.h" +#include "crimson/os/seastore/segment_cleaner.h" +#include "crimson/os/futurized_store.h" +#include "transaction.h" + +namespace crimson::os::seastore { + +class SeastoreCollection; +class SegmentManager; +class OnodeManager; +class Onode; +using OnodeRef = boost::intrusive_ptr<Onode>; +class Journal; +class LBAManager; +class TransactionManager; +class Cache; + +class SeaStore final : public FuturizedStore { + uuid_d osd_fsid; + +public: + + 
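For orientation, a caller-side sketch of the FuturizedStore surface that SeaStore implements. This is purely illustrative: error handling and object lifetimes are elided, it is assumed to run inside a started Seastar application, and the path argument is currently unused by the implementation (which creates an ephemeral test segment manager).

  auto store = std::make_unique<SeaStore>("/tmp/seastore-dev");
  auto fut = store->mount().then([&store] {
    return store->create_new_collection(coll_t::meta());
  }).then([&store](CollectionRef ch) {
    ceph::os::Transaction txn;
    txn.touch(ch->get_cid(), ghobject_t());
    return store->do_transaction(std::move(ch), std::move(txn));
  });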
SeaStore(const std::string& path); + ~SeaStore() final; + + seastar::future<> stop() final; + seastar::future<> mount() final; + seastar::future<> umount() final; + + seastar::future<> mkfs(uuid_d new_osd_fsid) final; + seastar::future<store_statfs_t> stat() const final; + + read_errorator::future<ceph::bufferlist> read( + CollectionRef c, + const ghobject_t& oid, + uint64_t offset, + size_t len, + uint32_t op_flags = 0) final; + read_errorator::future<ceph::bufferlist> readv( + CollectionRef c, + const ghobject_t& oid, + interval_set<uint64_t>& m, + uint32_t op_flags = 0) final; + get_attr_errorator::future<ceph::bufferptr> get_attr( + CollectionRef c, + const ghobject_t& oid, + std::string_view name) const final; + get_attrs_ertr::future<attrs_t> get_attrs( + CollectionRef c, + const ghobject_t& oid) final; + + seastar::future<struct stat> stat( + CollectionRef c, + const ghobject_t& oid) final; + + read_errorator::future<omap_values_t> omap_get_values( + CollectionRef c, + const ghobject_t& oid, + const omap_keys_t& keys) final; + + /// Retrieves paged set of values > start (if present) + read_errorator::future<std::tuple<bool, omap_values_t>> omap_get_values( + CollectionRef c, ///< [in] collection + const ghobject_t &oid, ///< [in] oid + const std::optional<std::string> &start ///< [in] start, empty for begin + ) final; ///< @return <done, values> values.empty() iff done + + read_errorator::future<bufferlist> omap_get_header( + CollectionRef c, + const ghobject_t& oid) final; + + seastar::future<std::tuple<std::vector<ghobject_t>, ghobject_t>> list_objects( + CollectionRef c, + const ghobject_t& start, + const ghobject_t& end, + uint64_t limit) const final; + + seastar::future<CollectionRef> create_new_collection(const coll_t& cid) final; + seastar::future<CollectionRef> open_collection(const coll_t& cid) final; + seastar::future<std::vector<coll_t>> list_collections() final; + + seastar::future<> do_transaction( + CollectionRef ch, + ceph::os::Transaction&& txn) final; + + seastar::future<OmapIteratorRef> get_omap_iterator( + CollectionRef ch, + const ghobject_t& oid) final; + seastar::future<std::map<uint64_t, uint64_t>> fiemap( + CollectionRef ch, + const ghobject_t& oid, + uint64_t off, + uint64_t len) final; + + seastar::future<> write_meta(const std::string& key, + const std::string& value) final; + seastar::future<std::tuple<int, std::string>> read_meta(const std::string& key) final; + uuid_d get_fsid() const final; + + unsigned get_max_attr_name_length() const final { + return 256; + } + +private: + std::unique_ptr<SegmentManager> segment_manager; + std::unique_ptr<SegmentCleaner> segment_cleaner; + std::unique_ptr<Cache> cache; + std::unique_ptr<Journal> journal; + std::unique_ptr<LBAManager> lba_manager; + std::unique_ptr<TransactionManager> transaction_manager; + std::unique_ptr<OnodeManager> onode_manager; + + + using write_ertr = crimson::errorator< + crimson::ct_error::input_output_error>; + write_ertr::future<> _do_transaction_step( + TransactionRef &trans, + CollectionRef &col, + std::vector<OnodeRef> &onodes, + ceph::os::Transaction::iterator &i); + + write_ertr::future<> _remove( + TransactionRef &trans, + OnodeRef &onode); + write_ertr::future<> _touch( + TransactionRef &trans, + OnodeRef &onode); + write_ertr::future<> _write( + TransactionRef &trans, + OnodeRef &onode, + uint64_t offset, size_t len, const ceph::bufferlist& bl, + uint32_t fadvise_flags); + write_ertr::future<> _omap_set_values( + TransactionRef &trans, + OnodeRef &onode, + std::map<std::string, 
ceph::bufferlist> &&aset); + write_ertr::future<> _omap_set_header( + TransactionRef &trans, + OnodeRef &onode, + const ceph::bufferlist &header); + write_ertr::future<> _omap_rmkeys( + TransactionRef &trans, + OnodeRef &onode, + const omap_keys_t& aset); + write_ertr::future<> _omap_rmkeyrange( + TransactionRef &trans, + OnodeRef &onode, + const std::string &first, + const std::string &last); + write_ertr::future<> _truncate( + TransactionRef &trans, + OnodeRef &onode, uint64_t size); + write_ertr::future<> _setattrs( + TransactionRef &trans, + OnodeRef &onode, + std::map<std::string,bufferptr>& aset); + write_ertr::future<> _create_collection( + TransactionRef &trans, + const coll_t& cid, int bits); + + boost::intrusive_ptr<SeastoreCollection> _get_collection(const coll_t& cid); +}; + +} diff --git a/src/crimson/os/seastore/seastore_types.cc b/src/crimson/os/seastore/seastore_types.cc new file mode 100644 index 000000000..ff43b1e51 --- /dev/null +++ b/src/crimson/os/seastore/seastore_types.cc @@ -0,0 +1,105 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "crimson/os/seastore/seastore_types.h" + +namespace crimson::os::seastore { + +std::ostream &segment_to_stream(std::ostream &out, const segment_id_t &t) +{ + if (t == NULL_SEG_ID) + return out << "NULL_SEG"; + else if (t == BLOCK_REL_SEG_ID) + return out << "BLOCK_REL_SEG"; + else if (t == RECORD_REL_SEG_ID) + return out << "RECORD_REL_SEG"; + else if (t == FAKE_SEG_ID) + return out << "FAKE_SEG"; + else + return out << t; +} + +std::ostream &offset_to_stream(std::ostream &out, const segment_off_t &t) +{ + if (t == NULL_SEG_OFF) + return out << "NULL_OFF"; + else + return out << t; +} + +std::ostream &operator<<(std::ostream &out, const paddr_t &rhs) +{ + out << "paddr_t<"; + segment_to_stream(out, rhs.segment); + out << ", "; + offset_to_stream(out, rhs.offset); + return out << ">"; +} + +std::ostream &operator<<(std::ostream &out, const journal_seq_t &seq) +{ + return out << "journal_seq_t(segment_seq=" + << seq.segment_seq << ", offset=" + << seq.offset + << ")"; +} + +std::ostream &operator<<(std::ostream &out, extent_types_t t) +{ + switch (t) { + case extent_types_t::ROOT: + return out << "ROOT"; + case extent_types_t::LADDR_INTERNAL: + return out << "LADDR_INTERNAL"; + case extent_types_t::LADDR_LEAF: + return out << "LADDR_LEAF"; + case extent_types_t::EXTMAP_INNER: + return out << "EXTMAP_INNER"; + case extent_types_t::EXTMAP_LEAF: + return out << "EXTMAP_LEAF"; + case extent_types_t::ONODE_BLOCK_STAGED: + return out << "ONODE_BLOCK_STAGED"; + case extent_types_t::TEST_BLOCK: + return out << "TEST_BLOCK"; + case extent_types_t::TEST_BLOCK_PHYSICAL: + return out << "TEST_BLOCK_PHYSICAL"; + case extent_types_t::NONE: + return out << "NONE"; + default: + return out << "UNKNOWN"; + } +} + +std::ostream &operator<<(std::ostream &out, const laddr_list_t &rhs) +{ + bool first = false; + for (auto &i: rhs) { + out << (first ? '[' : ',') << '(' << i.first << ',' << i.second << ')'; + first = true; + } + return out << ']'; +} +std::ostream &operator<<(std::ostream &out, const paddr_list_t &rhs) +{ + bool first = false; + for (auto &i: rhs) { + out << (first ? 
'[' : ',') << '(' << i.first << ',' << i.second << ')'; + first = true; + } + return out << ']'; +} + +std::ostream &operator<<(std::ostream &lhs, const delta_info_t &rhs) +{ + return lhs << "delta_info_t(" + << "type: " << rhs.type + << ", paddr: " << rhs.paddr + << ", laddr: " << rhs.laddr + << ", prev_crc: " << rhs.prev_crc + << ", final_crc: " << rhs.final_crc + << ", length: " << rhs.length + << ", pversion: " << rhs.pversion + << ")"; +} + +} diff --git a/src/crimson/os/seastore/seastore_types.h b/src/crimson/os/seastore/seastore_types.h new file mode 100644 index 000000000..cb8480268 --- /dev/null +++ b/src/crimson/os/seastore/seastore_types.h @@ -0,0 +1,369 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <limits> +#include <iostream> + +#include "include/byteorder.h" +#include "include/denc.h" +#include "include/buffer.h" +#include "include/cmp.h" +#include "include/uuid.h" + +namespace crimson::os::seastore { + +using depth_t = int32_t; +using depth_le_t = ceph_les32; + +using checksum_t = uint32_t; + +// Immutable metadata for seastore to set at mkfs time +struct seastore_meta_t { + uuid_d seastore_id; + + DENC(seastore_meta_t, v, p) { + DENC_START(1, 1, p); + denc(v.seastore_id, p); + DENC_FINISH(p); + } +}; + +// Identifies segment location on disk, see SegmentManager, +using segment_id_t = uint32_t; +constexpr segment_id_t NULL_SEG_ID = + std::numeric_limits<segment_id_t>::max() - 1; +/* Used to denote relative paddr_t */ +constexpr segment_id_t RECORD_REL_SEG_ID = + std::numeric_limits<segment_id_t>::max() - 2; +constexpr segment_id_t BLOCK_REL_SEG_ID = + std::numeric_limits<segment_id_t>::max() - 3; + +// for tests which generate fake paddrs +constexpr segment_id_t FAKE_SEG_ID = + std::numeric_limits<segment_id_t>::max() - 4; + +std::ostream &segment_to_stream(std::ostream &, const segment_id_t &t); + +// Offset within a segment on disk, see SegmentManager +// may be negative for relative offsets +using segment_off_t = int32_t; +constexpr segment_off_t NULL_SEG_OFF = + std::numeric_limits<segment_id_t>::max(); + +std::ostream &offset_to_stream(std::ostream &, const segment_off_t &t); + +/* Monotonically increasing segment seq, uniquely identifies + * the incarnation of a segment */ +using segment_seq_t = uint32_t; +static constexpr segment_seq_t NULL_SEG_SEQ = + std::numeric_limits<segment_seq_t>::max(); + +// Offset of delta within a record +using record_delta_idx_t = uint32_t; +constexpr record_delta_idx_t NULL_DELTA_IDX = + std::numeric_limits<record_delta_idx_t>::max(); + +/** + * paddr_t + * + * <segment, offset> offset on disk, see SegmentManager + * + * May be absolute, record_relative, or block_relative. + * + * Blocks get read independently of the surrounding record, + * so paddrs embedded directly within a block need to refer + * to other blocks within the same record by a block_relative + * addr relative to the block's own offset. By contrast, + * deltas to existing blocks need to use record_relative + * addrs relative to the first block of the record. + * + * Fresh extents during a transaction are refered to by + * record_relative paddrs. 
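A small worked example of the address arithmetic described above, with segment and offset values invented for illustration:

  // A record written at absolute offset 4096 of segment 7, with a block
  // starting 8192 bytes into that record:
  paddr_t record_base{7, 4096};
  paddr_t block_addr = record_base.add_record_relative(
    make_record_relative_paddr(8192));             // absolute {7, 12288}

  // A pointer stored inside that block to another block 12288 bytes into the
  // same record is kept block_relative to the block's own offset:
  paddr_t rel = make_record_relative_paddr(12288)
              - make_record_relative_paddr(8192);  // block_relative, offset 4096
  paddr_t resolved = rel.maybe_relative_to(block_addr); // absolute {7, 16384}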
+ */ +struct paddr_t { + segment_id_t segment = NULL_SEG_ID; + segment_off_t offset = NULL_SEG_OFF; + + bool is_relative() const { + return segment == RECORD_REL_SEG_ID || + segment == BLOCK_REL_SEG_ID; + } + + bool is_record_relative() const { + return segment == RECORD_REL_SEG_ID; + } + + bool is_block_relative() const { + return segment == BLOCK_REL_SEG_ID; + } + + paddr_t add_offset(segment_off_t o) const { + return paddr_t{segment, offset + o}; + } + + paddr_t add_relative(paddr_t o) const { + assert(o.is_relative()); + return paddr_t{segment, offset + o.offset}; + } + + paddr_t add_block_relative(paddr_t o) const { + // special version mainly for documentation purposes + assert(o.is_block_relative()); + return add_relative(o); + } + + paddr_t add_record_relative(paddr_t o) const { + // special version mainly for documentation purposes + assert(o.is_record_relative()); + return add_relative(o); + } + + /** + * paddr_t::operator- + * + * Only defined for record_relative paddr_ts. Yields a + * block_relative address. + */ + paddr_t operator-(paddr_t rhs) const { + assert(rhs.is_relative() && is_relative()); + assert(rhs.segment == segment); + return paddr_t{ + BLOCK_REL_SEG_ID, + offset - rhs.offset + }; + } + + /** + * maybe_relative_to + * + * Helper for the case where an in-memory paddr_t may be + * either block_relative or absolute (not record_relative). + * + * base must be either absolute or record_relative. + */ + paddr_t maybe_relative_to(paddr_t base) const { + assert(!base.is_block_relative()); + if (is_block_relative()) + return base.add_block_relative(*this); + else + return *this; + } + + DENC(paddr_t, v, p) { + DENC_START(1, 1, p); + denc(v.segment, p); + denc(v.offset, p); + DENC_FINISH(p); + } +}; +WRITE_CMP_OPERATORS_2(paddr_t, segment, offset) +WRITE_EQ_OPERATORS_2(paddr_t, segment, offset) +constexpr paddr_t P_ADDR_NULL = paddr_t{}; +constexpr paddr_t P_ADDR_MIN = paddr_t{0, 0}; +constexpr paddr_t make_record_relative_paddr(segment_off_t off) { + return paddr_t{RECORD_REL_SEG_ID, off}; +} +constexpr paddr_t make_block_relative_paddr(segment_off_t off) { + return paddr_t{BLOCK_REL_SEG_ID, off}; +} +constexpr paddr_t make_fake_paddr(segment_off_t off) { + return paddr_t{FAKE_SEG_ID, off}; +} + +struct paddr_le_t { + ceph_le32 segment = init_le32(NULL_SEG_ID); + ceph_les32 offset = init_les32(NULL_SEG_OFF); + + paddr_le_t() = default; + paddr_le_t(ceph_le32 segment, ceph_les32 offset) + : segment(segment), offset(offset) {} + paddr_le_t(segment_id_t segment, segment_off_t offset) + : segment(init_le32(segment)), offset(init_les32(offset)) {} + paddr_le_t(const paddr_t &addr) : paddr_le_t(addr.segment, addr.offset) {} + + operator paddr_t() const { + return paddr_t{segment, offset}; + } +}; + +std::ostream &operator<<(std::ostream &out, const paddr_t &rhs); + +using objaddr_t = uint32_t; +constexpr objaddr_t OBJ_ADDR_MIN = std::numeric_limits<objaddr_t>::min(); + +/* Monotonically increasing identifier for the location of a + * journal_record. 
+ */ +struct journal_seq_t { + segment_seq_t segment_seq = 0; + paddr_t offset; + + DENC(journal_seq_t, v, p) { + DENC_START(1, 1, p); + denc(v.segment_seq, p); + denc(v.offset, p); + DENC_FINISH(p); + } +}; +WRITE_CMP_OPERATORS_2(journal_seq_t, segment_seq, offset) +WRITE_EQ_OPERATORS_2(journal_seq_t, segment_seq, offset) + +std::ostream &operator<<(std::ostream &out, const journal_seq_t &seq); + +static constexpr journal_seq_t NO_DELTAS = journal_seq_t{ + NULL_SEG_SEQ, + P_ADDR_NULL +}; + +// logical addr, see LBAManager, TransactionManager +using laddr_t = uint64_t; +constexpr laddr_t L_ADDR_MIN = std::numeric_limits<laddr_t>::min(); +constexpr laddr_t L_ADDR_MAX = std::numeric_limits<laddr_t>::max(); +constexpr laddr_t L_ADDR_NULL = std::numeric_limits<laddr_t>::max(); +constexpr laddr_t L_ADDR_ROOT = std::numeric_limits<laddr_t>::max() - 1; +constexpr laddr_t L_ADDR_LBAT = std::numeric_limits<laddr_t>::max() - 2; + +struct laddr_le_t { + ceph_le64 laddr = init_le64(L_ADDR_NULL); + + laddr_le_t() = default; + laddr_le_t(const laddr_le_t &) = default; + explicit laddr_le_t(const laddr_t &addr) + : laddr(init_le64(addr)) {} + + operator laddr_t() const { + return laddr_t(laddr); + } + laddr_le_t& operator=(laddr_t addr) { + ceph_le64 val; + val = addr; + laddr = val; + return *this; + } +}; + +// logical offset, see LBAManager, TransactionManager +using extent_len_t = uint32_t; +constexpr extent_len_t EXTENT_LEN_MAX = + std::numeric_limits<extent_len_t>::max(); + +using extent_len_le_t = ceph_le32; +inline extent_len_le_t init_extent_len_le_t(extent_len_t len) { + return init_le32(len); +} + +struct laddr_list_t : std::list<std::pair<laddr_t, extent_len_t>> { + template <typename... T> + laddr_list_t(T&&... args) + : std::list<std::pair<laddr_t, extent_len_t>>(std::forward<T>(args)...) {} +}; +struct paddr_list_t : std::list<std::pair<paddr_t, extent_len_t>> { + template <typename... T> + paddr_list_t(T&&... args) + : std::list<std::pair<paddr_t, extent_len_t>>(std::forward<T>(args)...) {} +}; + +std::ostream &operator<<(std::ostream &out, const laddr_list_t &rhs); +std::ostream &operator<<(std::ostream &out, const paddr_list_t &rhs); + +/* identifies type of extent, used for interpretting deltas, managing + * writeback. 
+ * + * Note that any new extent type needs to be added to + * Cache::get_extent_by_type in cache.cc + */ +enum class extent_types_t : uint8_t { + ROOT = 0, + LADDR_INTERNAL = 1, + LADDR_LEAF = 2, + ONODE_BLOCK = 3, + EXTMAP_INNER = 4, + EXTMAP_LEAF = 5, + ONODE_BLOCK_STAGED = 6, + + // Test Block Types + TEST_BLOCK = 0xF0, + TEST_BLOCK_PHYSICAL = 0xF1, + + // None + NONE = 0xFF +}; + +inline bool is_logical_type(extent_types_t type) { + switch (type) { + case extent_types_t::ROOT: + case extent_types_t::LADDR_INTERNAL: + case extent_types_t::LADDR_LEAF: + return false; + default: + return true; + } +} + +std::ostream &operator<<(std::ostream &out, extent_types_t t); + +/* description of a new physical extent */ +struct extent_t { + extent_types_t type; ///< type of extent + laddr_t addr; ///< laddr of extent (L_ADDR_NULL for non-logical) + ceph::bufferlist bl; ///< payload, bl.length() == length, aligned +}; + +using extent_version_t = uint32_t; +constexpr extent_version_t EXTENT_VERSION_NULL = 0; + +/* description of a mutation to a physical extent */ +struct delta_info_t { + extent_types_t type = extent_types_t::NONE; ///< delta type + paddr_t paddr; ///< physical address + laddr_t laddr = L_ADDR_NULL; ///< logical address + uint32_t prev_crc = 0; + uint32_t final_crc = 0; + segment_off_t length = NULL_SEG_OFF; ///< extent length + extent_version_t pversion; ///< prior version + ceph::bufferlist bl; ///< payload + + DENC(delta_info_t, v, p) { + DENC_START(1, 1, p); + denc(v.type, p); + denc(v.paddr, p); + denc(v.laddr, p); + denc(v.prev_crc, p); + denc(v.final_crc, p); + denc(v.length, p); + denc(v.pversion, p); + denc(v.bl, p); + DENC_FINISH(p); + } + + bool operator==(const delta_info_t &rhs) const { + return ( + type == rhs.type && + paddr == rhs.paddr && + laddr == rhs.laddr && + prev_crc == rhs.prev_crc && + final_crc == rhs.final_crc && + length == rhs.length && + pversion == rhs.pversion && + bl == rhs.bl + ); + } + + friend std::ostream &operator<<(std::ostream &lhs, const delta_info_t &rhs); +}; + +std::ostream &operator<<(std::ostream &lhs, const delta_info_t &rhs); + +struct record_t { + std::vector<extent_t> extents; + std::vector<delta_info_t> deltas; +}; + +} + +WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::seastore_meta_t) +WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::paddr_t) +WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::journal_seq_t) +WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::delta_info_t) diff --git a/src/crimson/os/seastore/segment_cleaner.cc b/src/crimson/os/seastore/segment_cleaner.cc new file mode 100644 index 000000000..3597c21df --- /dev/null +++ b/src/crimson/os/seastore/segment_cleaner.cc @@ -0,0 +1,340 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "crimson/common/log.h" + +#include "crimson/os/seastore/transaction.h" +#include "crimson/os/seastore/segment_cleaner.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_filestore); + } +} + +namespace crimson::os::seastore { + +bool SpaceTrackerSimple::equals(const SpaceTrackerI &_other) const +{ + const auto &other = static_cast<const SpaceTrackerSimple&>(_other); + + if (other.live_bytes_by_segment.size() != live_bytes_by_segment.size()) { + logger().error("{}: different segment counts, bug in test"); + assert(0 == "segment counts should match"); + return false; + } + + bool all_match = true; + for (segment_id_t i = 0; i < live_bytes_by_segment.size(); ++i) { + if (other.live_bytes_by_segment[i] 
!= live_bytes_by_segment[i]) { + all_match = false; + logger().debug( + "{}: segment_id {} live bytes mismatch *this: {}, other: {}", + __func__, + i, + live_bytes_by_segment[i], + other.live_bytes_by_segment[i]); + } + } + return all_match; +} + +int64_t SpaceTrackerDetailed::SegmentMap::allocate( + segment_id_t segment, + segment_off_t offset, + extent_len_t len, + const extent_len_t block_size) +{ + assert(offset % block_size == 0); + assert(len % block_size == 0); + + const auto b = (offset / block_size); + const auto e = (offset + len) / block_size; + + bool error = false; + for (auto i = b; i < e; ++i) { + if (bitmap[i]) { + if (!error) { + logger().error( + "SegmentMap::allocate found allocated in {}, {} ~ {}", + segment, + offset, + len); + error = true; + } + logger().debug( + "SegmentMap::allocate block {} allocated", + i * block_size); + } + bitmap[i] = true; + } + return update_usage(block_size); +} + +int64_t SpaceTrackerDetailed::SegmentMap::release( + segment_id_t segment, + segment_off_t offset, + extent_len_t len, + const extent_len_t block_size) +{ + assert(offset % block_size == 0); + assert(len % block_size == 0); + + const auto b = (offset / block_size); + const auto e = (offset + len) / block_size; + + bool error = false; + for (auto i = b; i < e; ++i) { + if (!bitmap[i]) { + if (!error) { + logger().error( + "SegmentMap::release found unallocated in {}, {} ~ {}", + segment, + offset, + len); + error = true; + } + logger().debug( + "SegmentMap::release block {} unallocated", + i * block_size); + } + bitmap[i] = false; + } + return update_usage(-(int64_t)block_size); +} + +bool SpaceTrackerDetailed::equals(const SpaceTrackerI &_other) const +{ + const auto &other = static_cast<const SpaceTrackerDetailed&>(_other); + + if (other.segment_usage.size() != segment_usage.size()) { + logger().error("{}: different segment counts, bug in test"); + assert(0 == "segment counts should match"); + return false; + } + + bool all_match = true; + for (segment_id_t i = 0; i < segment_usage.size(); ++i) { + if (other.segment_usage[i].get_usage() != segment_usage[i].get_usage()) { + all_match = false; + logger().error( + "{}: segment_id {} live bytes mismatch *this: {}, other: {}", + __func__, + i, + segment_usage[i].get_usage(), + other.segment_usage[i].get_usage()); + } + } + return all_match; +} + +void SpaceTrackerDetailed::SegmentMap::dump_usage(extent_len_t block_size) const +{ + for (unsigned i = 0; i < bitmap.size(); ++i) { + if (bitmap[i]) { + logger().debug(" {} still live", i * block_size); + } + } +} + +void SpaceTrackerDetailed::dump_usage(segment_id_t id) const +{ + logger().debug("SpaceTrackerDetailed::dump_usage {}", id); + segment_usage[id].dump_usage(block_size); +} + +SegmentCleaner::get_segment_ret SegmentCleaner::get_segment() +{ + for (size_t i = 0; i < segments.size(); ++i) { + if (segments[i].is_empty()) { + mark_open(i); + logger().debug("{}: returning segment {}", __func__, i); + return get_segment_ret( + get_segment_ertr::ready_future_marker{}, + i); + } + } + assert(0 == "out of space handling todo"); + return get_segment_ret( + get_segment_ertr::ready_future_marker{}, + 0); +} + +void SegmentCleaner::update_journal_tail_target(journal_seq_t target) +{ + logger().debug( + "{}: {}", + __func__, + target); + assert(journal_tail_target == journal_seq_t() || target >= journal_tail_target); + if (journal_tail_target == journal_seq_t() || target > journal_tail_target) { + journal_tail_target = target; + } +} + +void 
SegmentCleaner::update_journal_tail_committed(journal_seq_t committed) +{ + if (journal_tail_committed == journal_seq_t() || + committed > journal_tail_committed) { + logger().debug( + "{}: update journal_tail_committed {}", + __func__, + committed); + journal_tail_committed = committed; + } + if (journal_tail_target == journal_seq_t() || + committed > journal_tail_target) { + logger().debug( + "{}: update journal_tail_target {}", + __func__, + committed); + journal_tail_target = committed; + } +} + +void SegmentCleaner::close_segment(segment_id_t segment) +{ + mark_closed(segment); +} + +SegmentCleaner::do_immediate_work_ret SegmentCleaner::do_immediate_work( + Transaction &t) +{ + auto next_target = get_dirty_tail_limit(); + logger().debug( + "{}: journal_tail_target={} get_dirty_tail_limit()={}", + __func__, + journal_tail_target, + next_target); + + logger().debug( + "SegmentCleaner::do_immediate_work gc total {}, available {}, unavailable {}, used {} available_ratio {}, reclaim_ratio {}, bytes_to_gc_for_available {}, bytes_to_gc_for_reclaim {}", + get_total_bytes(), + get_available_bytes(), + get_unavailable_bytes(), + get_used_bytes(), + get_available_ratio(), + get_reclaim_ratio(), + get_immediate_bytes_to_gc_for_available(), + get_immediate_bytes_to_gc_for_reclaim()); + + auto dirty_fut = do_immediate_work_ertr::now(); + if (journal_tail_target < next_target) { + dirty_fut = rewrite_dirty(t, next_target); + } + return dirty_fut.safe_then([=, &t] { + return do_gc(t, get_immediate_bytes_to_gc()); + }).handle_error( + do_immediate_work_ertr::pass_further{}, + crimson::ct_error::assert_all{} + ); +} + +SegmentCleaner::do_deferred_work_ret SegmentCleaner::do_deferred_work( + Transaction &t) +{ + return do_deferred_work_ret( + do_deferred_work_ertr::ready_future_marker{}, + ceph::timespan()); +} + +SegmentCleaner::rewrite_dirty_ret SegmentCleaner::rewrite_dirty( + Transaction &t, + journal_seq_t limit) +{ + return ecb->get_next_dirty_extents( + limit + ).then([=, &t](auto dirty_list) { + if (dirty_list.empty()) { + return do_immediate_work_ertr::now(); + } else { + update_journal_tail_target(dirty_list.front()->get_dirty_from()); + } + return seastar::do_with( + std::move(dirty_list), + [this, &t](auto &dirty_list) { + return crimson::do_for_each( + dirty_list, + [this, &t](auto &e) { + logger().debug( + "SegmentCleaner::do_immediate_work cleaning {}", + *e); + return ecb->rewrite_extent(t, e); + }); + }); + }); +} + +SegmentCleaner::do_gc_ret SegmentCleaner::do_gc( + Transaction &t, + size_t bytes) +{ + if (bytes == 0) { + return do_gc_ertr::now(); + } + + if (!scan_cursor) { + paddr_t next = P_ADDR_NULL; + next.segment = get_next_gc_target(); + if (next == P_ADDR_NULL) { + logger().debug( + "SegmentCleaner::do_gc: no segments to gc"); + return do_gc_ertr::now(); + } + next.offset = 0; + scan_cursor = + std::make_unique<ExtentCallbackInterface::scan_extents_cursor>( + next); + logger().debug( + "SegmentCleaner::do_gc: starting gc on segment {}", + scan_cursor->get_offset().segment); + } + + return ecb->scan_extents( + *scan_cursor, + bytes + ).safe_then([=, &t](auto addrs) { + return seastar::do_with( + std::move(addrs), + [=, &t](auto &addr_list) { + return crimson::do_for_each( + addr_list, + [=, &t](auto &addr_pair) { + auto &[addr, info] = addr_pair; + logger().debug( + "SegmentCleaner::do_gc: checking addr {}", + addr); + return ecb->get_extent_if_live( + t, + info.type, + addr, + info.addr, + info.len + ).safe_then([addr=addr, &t, this](CachedExtentRef ext) { + if (!ext) { + 
logger().debug( + "SegmentCleaner::do_gc: addr {} dead, skipping", + addr); + return ExtentCallbackInterface::rewrite_extent_ertr::now(); + } else { + logger().debug( + "SegmentCleaner::do_gc: addr {} alive, gc'ing {}", + addr, + *ext); + } + return ecb->rewrite_extent( + t, + ext); + }); + }).safe_then([&t, this] { + if (scan_cursor->is_complete()) { + t.mark_segment_to_release(scan_cursor->get_offset().segment); + scan_cursor.reset(); + } + return ExtentCallbackInterface::release_segment_ertr::now(); + }); + }); + }); +} + +} diff --git a/src/crimson/os/seastore/segment_cleaner.h b/src/crimson/os/seastore/segment_cleaner.h new file mode 100644 index 000000000..38ebd05bc --- /dev/null +++ b/src/crimson/os/seastore/segment_cleaner.h @@ -0,0 +1,691 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <boost/intrusive/set.hpp> + +#include "common/ceph_time.h" + +#include "crimson/os/seastore/cached_extent.h" +#include "crimson/os/seastore/journal.h" +#include "crimson/os/seastore/seastore_types.h" +#include "crimson/os/seastore/segment_manager.h" + +namespace crimson::os::seastore { +class Transaction; + +struct segment_info_t { + Segment::segment_state_t state = Segment::segment_state_t::EMPTY; + + // Will be non-null for any segments in the current journal + segment_seq_t journal_segment_seq = NULL_SEG_SEQ; + + + bool is_in_journal(journal_seq_t tail_committed) const { + return journal_segment_seq != NULL_SEG_SEQ && + tail_committed.segment_seq <= journal_segment_seq; + } + + bool is_empty() const { + return state == Segment::segment_state_t::EMPTY; + } + + bool is_closed() const { + return state == Segment::segment_state_t::CLOSED; + } + + bool is_open() const { + return state == Segment::segment_state_t::OPEN; + } +}; + +class SpaceTrackerI { +public: + virtual int64_t allocate( + segment_id_t segment, + segment_off_t offset, + extent_len_t len) = 0; + + virtual int64_t release( + segment_id_t segment, + segment_off_t offset, + extent_len_t len) = 0; + + virtual int64_t get_usage( + segment_id_t segment) const = 0; + + virtual bool equals(const SpaceTrackerI &other) const = 0; + + virtual std::unique_ptr<SpaceTrackerI> make_empty() const = 0; + + virtual void dump_usage(segment_id_t) const = 0; + + virtual void reset() = 0; + + virtual ~SpaceTrackerI() = default; +}; +using SpaceTrackerIRef = std::unique_ptr<SpaceTrackerI>; + +class SpaceTrackerSimple : public SpaceTrackerI { + // Tracks live space for each segment + std::vector<int64_t> live_bytes_by_segment; + + int64_t update_usage(segment_id_t segment, int64_t delta) { + assert(segment < live_bytes_by_segment.size()); + live_bytes_by_segment[segment] += delta; + assert(live_bytes_by_segment[segment] >= 0); + return live_bytes_by_segment[segment]; + } +public: + SpaceTrackerSimple(size_t num_segments) + : live_bytes_by_segment(num_segments, 0) {} + + int64_t allocate( + segment_id_t segment, + segment_off_t offset, + extent_len_t len) final { + return update_usage(segment, len); + } + + int64_t release( + segment_id_t segment, + segment_off_t offset, + extent_len_t len) final { + return update_usage(segment, -len); + } + + int64_t get_usage(segment_id_t segment) const final { + assert(segment < live_bytes_by_segment.size()); + return live_bytes_by_segment[segment]; + } + + void dump_usage(segment_id_t) const final {} + + void reset() final { + for (auto &i: live_bytes_by_segment) + i = 0; + } + + SpaceTrackerIRef make_empty() const final { + return 
SpaceTrackerIRef( + new SpaceTrackerSimple(live_bytes_by_segment.size())); + } + + bool equals(const SpaceTrackerI &other) const; +}; + +class SpaceTrackerDetailed : public SpaceTrackerI { + class SegmentMap { + int64_t used = 0; + std::vector<bool> bitmap; + + public: + SegmentMap(size_t blocks) : bitmap(blocks, false) {} + + int64_t update_usage(int64_t delta) { + used += delta; + return used; + } + + int64_t allocate( + segment_id_t segment, + segment_off_t offset, + extent_len_t len, + const extent_len_t block_size); + + int64_t release( + segment_id_t segment, + segment_off_t offset, + extent_len_t len, + const extent_len_t block_size); + + int64_t get_usage() const { + return used; + } + + void dump_usage(extent_len_t block_size) const; + + void reset() { + used = 0; + for (auto &&i: bitmap) { + i = false; + } + } + }; + const size_t block_size; + const size_t segment_size; + + // Tracks live space for each segment + std::vector<SegmentMap> segment_usage; + +public: + SpaceTrackerDetailed(size_t num_segments, size_t segment_size, size_t block_size) + : block_size(block_size), + segment_size(segment_size), + segment_usage(num_segments, segment_size / block_size) {} + + int64_t allocate( + segment_id_t segment, + segment_off_t offset, + extent_len_t len) final { + assert(segment < segment_usage.size()); + return segment_usage[segment].allocate(segment, offset, len, block_size); + } + + int64_t release( + segment_id_t segment, + segment_off_t offset, + extent_len_t len) final { + assert(segment < segment_usage.size()); + return segment_usage[segment].release(segment, offset, len, block_size); + } + + int64_t get_usage(segment_id_t segment) const final { + assert(segment < segment_usage.size()); + return segment_usage[segment].get_usage(); + } + + void dump_usage(segment_id_t seg) const final; + + void reset() final { + for (auto &i: segment_usage) + i.reset(); + } + + SpaceTrackerIRef make_empty() const final { + return SpaceTrackerIRef( + new SpaceTrackerDetailed( + segment_usage.size(), + segment_size, + block_size)); + } + + bool equals(const SpaceTrackerI &other) const; +}; + + +class SegmentCleaner : public JournalSegmentProvider { +public: + /// Config + struct config_t { + size_t num_segments = 0; + size_t segment_size = 0; + size_t block_size = 0; + size_t target_journal_segments = 0; + size_t max_journal_segments = 0; + + double reclaim_ratio_hard_limit = 0; + // don't apply reclaim ratio with available space below this + double reclaim_ratio_usage_min = 0; + + double available_ratio_hard_limit = 0; + + static config_t default_from_segment_manager( + SegmentManager &manager) { + return config_t{ + manager.get_num_segments(), + static_cast<size_t>(manager.get_segment_size()), + (size_t)manager.get_block_size(), + 2, + 4, + .5, + .95, + .2 + }; + } + }; + + /// Callback interface for querying and operating on segments + class ExtentCallbackInterface { + public: + virtual ~ExtentCallbackInterface() = default; + /** + * get_next_dirty_extent + * + * returns all extents with dirty_from < bound + */ + using get_next_dirty_extents_ertr = crimson::errorator<>; + using get_next_dirty_extents_ret = get_next_dirty_extents_ertr::future< + std::vector<CachedExtentRef>>; + virtual get_next_dirty_extents_ret get_next_dirty_extents( + journal_seq_t bound ///< [in] return extents with dirty_from < bound + ) = 0; + + /** + * rewrite_extent + * + * Updates t with operations moving the passed extents to a new + * segment. 
extent may be invalid, implementation must correctly + * handle finding the current instance if it is still alive and + * otherwise ignore it. + */ + using rewrite_extent_ertr = crimson::errorator< + crimson::ct_error::input_output_error>; + using rewrite_extent_ret = rewrite_extent_ertr::future<>; + virtual rewrite_extent_ret rewrite_extent( + Transaction &t, + CachedExtentRef extent) = 0; + + /** + * get_extent_if_live + * + * Returns extent at specified location if still referenced by + * lba_manager and not removed by t. + * + * See TransactionManager::get_extent_if_live and + * LBAManager::get_physical_extent_if_live. + */ + using get_extent_if_live_ertr = crimson::errorator< + crimson::ct_error::input_output_error>; + using get_extent_if_live_ret = get_extent_if_live_ertr::future< + CachedExtentRef>; + virtual get_extent_if_live_ret get_extent_if_live( + Transaction &t, + extent_types_t type, + paddr_t addr, + laddr_t laddr, + segment_off_t len) = 0; + + /** + * scan_extents + * + * Interface shim for Journal::scan_extents + */ + using scan_extents_cursor = Journal::scan_valid_records_cursor; + using scan_extents_ertr = Journal::scan_extents_ertr; + using scan_extents_ret = Journal::scan_extents_ret; + virtual scan_extents_ret scan_extents( + scan_extents_cursor &cursor, + extent_len_t bytes_to_read) = 0; + + /** + * release_segment + * + * Release segment. + */ + using release_segment_ertr = SegmentManager::release_ertr; + using release_segment_ret = release_segment_ertr::future<>; + virtual release_segment_ret release_segment( + segment_id_t id) = 0; + }; + +private: + const config_t config; + + SpaceTrackerIRef space_tracker; + std::vector<segment_info_t> segments; + size_t empty_segments; + int64_t used_bytes = 0; + bool init_complete = false; + + journal_seq_t journal_tail_target; + journal_seq_t journal_tail_committed; + journal_seq_t journal_head; + + ExtentCallbackInterface *ecb = nullptr; + +public: + SegmentCleaner(config_t config, bool detailed = false) + : config(config), + space_tracker( + detailed ? 
+ (SpaceTrackerI*)new SpaceTrackerDetailed( + config.num_segments, + config.segment_size, + config.block_size) : + (SpaceTrackerI*)new SpaceTrackerSimple( + config.num_segments)), + segments(config.num_segments), + empty_segments(config.num_segments) {} + + get_segment_ret get_segment() final; + + void close_segment(segment_id_t segment) final; + + void set_journal_segment( + segment_id_t segment, segment_seq_t seq) final { + assert(segment < segments.size()); + segments[segment].journal_segment_seq = seq; + assert(segments[segment].is_open()); + } + + journal_seq_t get_journal_tail_target() const final { + return journal_tail_target; + } + + void update_journal_tail_committed(journal_seq_t committed) final; + + void update_journal_tail_target(journal_seq_t target); + + void init_journal_tail(journal_seq_t tail) { + journal_tail_target = journal_tail_committed = tail; + } + + void set_journal_head(journal_seq_t head) { + assert(journal_head == journal_seq_t() || head >= journal_head); + journal_head = head; + } + + void init_mark_segment_closed(segment_id_t segment, segment_seq_t seq) final { + crimson::get_logger(ceph_subsys_filestore).debug( + "SegmentCleaner::init_mark_segment_closed: segment {}, seq {}", + segment, + seq); + mark_closed(segment); + segments[segment].journal_segment_seq = seq; + } + + segment_seq_t get_seq(segment_id_t id) final { + return segments[id].journal_segment_seq; + } + + void mark_segment_released(segment_id_t segment) { + return mark_empty(segment); + } + + void mark_space_used( + paddr_t addr, + extent_len_t len, + bool init_scan = false) { + assert(addr.segment < segments.size()); + + if (!init_scan && !init_complete) + return; + + if (!init_scan) { + assert(segments[addr.segment].state == Segment::segment_state_t::OPEN); + } + + used_bytes += len; + [[maybe_unused]] auto ret = space_tracker->allocate( + addr.segment, + addr.offset, + len); + assert(ret > 0); + } + + void mark_space_free( + paddr_t addr, + extent_len_t len) { + if (!init_complete) + return; + + used_bytes -= len; + assert(addr.segment < segments.size()); + + [[maybe_unused]] auto ret = space_tracker->release( + addr.segment, + addr.offset, + len); + assert(ret >= 0); + } + + segment_id_t get_next_gc_target() const { + segment_id_t ret = NULL_SEG_ID; + int64_t least_live_bytes = std::numeric_limits<int64_t>::max(); + for (segment_id_t i = 0; i < segments.size(); ++i) { + if (segments[i].is_closed() && + !segments[i].is_in_journal(journal_tail_committed) && + space_tracker->get_usage(i) < least_live_bytes) { + ret = i; + least_live_bytes = space_tracker->get_usage(i); + } + } + if (ret != NULL_SEG_ID) { + crimson::get_logger(ceph_subsys_filestore).debug( + "SegmentCleaner::get_next_gc_target: segment {} seq {}", + ret, + segments[ret].journal_segment_seq); + } + return ret; + } + + SpaceTrackerIRef get_empty_space_tracker() const { + return space_tracker->make_empty(); + } + + void complete_init() { init_complete = true; } + + void set_extent_callback(ExtentCallbackInterface *cb) { + ecb = cb; + } + + bool debug_check_space(const SpaceTrackerI &tracker) { + return space_tracker->equals(tracker); + } + + /** + * do_immediate_work + * + * Should be invoked prior to submission of any transaction, + * will piggy-back work required to maintain deferred work + * constraints. 
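+ *
+ * Concretely (see the implementation earlier in this patch): dirty
+ * extents older than get_dirty_tail_limit() are rewritten via
+ * rewrite_dirty(), and up to get_immediate_bytes_to_gc() bytes of gc
+ * work are performed on t via do_gc().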
+ */ + using do_immediate_work_ertr = crimson::errorator< + crimson::ct_error::input_output_error>; + using do_immediate_work_ret = do_immediate_work_ertr::future<>; + do_immediate_work_ret do_immediate_work( + Transaction &t); + + + /** + * do_deferred_work + * + * Should be called at idle times -- will perform background + * operations based on deferred work constraints. + * + * If returned timespan is non-zero, caller should pause calling + * back into do_deferred_work before returned timespan has elapsed, + * or a foreground operation occurs. + */ + using do_deferred_work_ertr = crimson::errorator< + crimson::ct_error::input_output_error>; + using do_deferred_work_ret = do_deferred_work_ertr::future< + ceph::timespan + >; + do_deferred_work_ret do_deferred_work( + Transaction &t); + +private: + + // journal status helpers + + /** + * rewrite_dirty + * + * Writes out dirty blocks dirtied earlier than limit. + */ + using rewrite_dirty_ertr = crimson::errorator< + crimson::ct_error::input_output_error>; + using rewrite_dirty_ret = rewrite_dirty_ertr::future<>; + rewrite_dirty_ret rewrite_dirty( + Transaction &t, + journal_seq_t limit); + + journal_seq_t get_dirty_tail() const { + auto ret = journal_head; + ret.segment_seq -= std::min( + static_cast<size_t>(ret.segment_seq), + config.target_journal_segments); + return ret; + } + + journal_seq_t get_dirty_tail_limit() const { + auto ret = journal_head; + ret.segment_seq -= std::min( + static_cast<size_t>(ret.segment_seq), + config.max_journal_segments); + return ret; + } + + // GC status helpers + std::unique_ptr<ExtentCallbackInterface::scan_extents_cursor> scan_cursor; + + /** + * do_gc + * + * Performs bytes worth of gc work on t. + */ + using do_gc_ertr = SegmentManager::read_ertr; + using do_gc_ret = do_gc_ertr::future<>; + do_gc_ret do_gc( + Transaction &t, + size_t bytes); + + size_t get_bytes_used_current_segment() const { + assert(journal_head != journal_seq_t()); + return journal_head.offset.offset; + } + + size_t get_bytes_available_current_segment() const { + return config.segment_size - get_bytes_used_current_segment(); + } + + /** + * get_bytes_scanned_current_segment + * + * Returns the number of bytes from the current gc segment that + * have been scanned. + */ + size_t get_bytes_scanned_current_segment() const { + if (!scan_cursor) + return 0; + + return scan_cursor->get_offset().offset; + } + + size_t get_available_bytes() const { + return (empty_segments * config.segment_size) + + get_bytes_available_current_segment() + + get_bytes_scanned_current_segment(); + } + + size_t get_total_bytes() const { + return config.segment_size * config.num_segments; + } + + size_t get_unavailable_bytes() const { + return get_total_bytes() - get_available_bytes(); + } + + /// Returns bytes currently occupied by live extents (not journal) + size_t get_used_bytes() const { + return used_bytes; + } + + /// Returns the number of bytes in unavailable segments that are not live + size_t get_reclaimable_bytes() const { + return get_unavailable_bytes() - get_used_bytes(); + } + + /** + * get_reclaim_ratio + * + * Returns the ratio of unavailable space that is not currently used. 
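+ * Computed as get_reclaimable_bytes() / get_unavailable_bytes(), or 0
+ * when no space is unavailable.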
+ */ + double get_reclaim_ratio() const { + if (get_unavailable_bytes() == 0) return 0; + return (double)get_reclaimable_bytes() / (double)get_unavailable_bytes(); + } + + /** + * get_available_ratio + * + * Returns ratio of available space to write to total space + */ + double get_available_ratio() const { + return (double)get_available_bytes() / (double)get_total_bytes(); + } + + /** + * get_immediate_bytes_to_gc_for_reclaim + * + * Returns the number of bytes to gc in order to bring the + * reclaim ratio below reclaim_ratio_usage_min. + */ + size_t get_immediate_bytes_to_gc_for_reclaim() const { + if (get_reclaim_ratio() < config.reclaim_ratio_hard_limit) + return 0; + + const size_t unavailable_target = std::max( + get_used_bytes() / (1.0 - config.reclaim_ratio_hard_limit), + (1 - config.reclaim_ratio_usage_min) * get_total_bytes()); + + if (unavailable_target > get_unavailable_bytes()) + return 0; + + return (get_unavailable_bytes() - unavailable_target) / get_reclaim_ratio(); + } + + /** + * get_immediate_bytes_to_gc_for_available + * + * Returns the number of bytes to gc in order to bring the + * the ratio of available disk space to total disk space above + * available_ratio_hard_limit. + */ + size_t get_immediate_bytes_to_gc_for_available() const { + if (get_available_ratio() > config.available_ratio_hard_limit) { + return 0; + } + + const double ratio_to_make_available = config.available_ratio_hard_limit - + get_available_ratio(); + return ratio_to_make_available * (double)get_total_bytes() + / get_reclaim_ratio(); + } + + /** + * get_immediate_bytes_to_gc + * + * Returns number of bytes to gc in order to restore any strict + * limits. + */ + size_t get_immediate_bytes_to_gc() const { + // number of bytes to gc in order to correct reclaim ratio + size_t for_reclaim = get_immediate_bytes_to_gc_for_reclaim(); + + // number of bytes to gc in order to correct available_ratio + size_t for_available = get_immediate_bytes_to_gc_for_available(); + + return std::max(for_reclaim, for_available); + } + + void mark_closed(segment_id_t segment) { + assert(segments.size() > segment); + if (init_complete) { + assert(segments[segment].is_open()); + } else { + assert(segments[segment].is_empty()); + assert(empty_segments > 0); + --empty_segments; + } + crimson::get_logger(ceph_subsys_filestore).debug( + "mark_closed: empty_segments: {}", + empty_segments); + segments[segment].state = Segment::segment_state_t::CLOSED; + } + + void mark_empty(segment_id_t segment) { + assert(segments.size() > segment); + assert(segments[segment].is_closed()); + assert(segments.size() > empty_segments); + ++empty_segments; + if (space_tracker->get_usage(segment) != 0) { + space_tracker->dump_usage(segment); + assert(space_tracker->get_usage(segment) == 0); + } + segments[segment].state = Segment::segment_state_t::EMPTY; + } + + void mark_open(segment_id_t segment) { + assert(segments.size() > segment); + assert(segments[segment].is_empty()); + assert(empty_segments > 0); + --empty_segments; + segments[segment].state = Segment::segment_state_t::OPEN; + } +}; + +} diff --git a/src/crimson/os/seastore/segment_manager.h b/src/crimson/os/seastore/segment_manager.h new file mode 100644 index 000000000..61c6509d1 --- /dev/null +++ b/src/crimson/os/seastore/segment_manager.h @@ -0,0 +1,128 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <iosfwd> + +#include <boost/intrusive_ptr.hpp> +#include <boost/smart_ptr/intrusive_ref_counter.hpp> 
+#include <seastar/core/future.hh> + +#include "include/ceph_assert.h" +#include "crimson/os/seastore/seastore_types.h" +#include "include/buffer_fwd.h" +#include "crimson/osd/exceptions.h" + +namespace crimson::os::seastore { + +class Segment : public boost::intrusive_ref_counter< + Segment, + boost::thread_unsafe_counter>{ +public: + + enum class segment_state_t : uint8_t { + EMPTY = 0, + OPEN = 1, + CLOSED = 2 + }; + + /** + * get_segment_id + */ + virtual segment_id_t get_segment_id() const = 0; + + /** + * min next write location + */ + virtual segment_off_t get_write_ptr() const = 0; + + /** + * max capacity + */ + virtual segment_off_t get_write_capacity() const = 0; + + /** + * close + * + * Closes segment for writes. Won't complete until + * outstanding writes to this segment are complete. + */ + using close_ertr = crimson::errorator< + crimson::ct_error::input_output_error, + crimson::ct_error::invarg, + crimson::ct_error::enoent>; + virtual close_ertr::future<> close() = 0; + + + /** + * write + * + * @param offset offset of write, must be aligned to <> and >= write pointer, advances + * write pointer + * @param bl buffer to write, will be padded if not aligned + */ + using write_ertr = crimson::errorator< + crimson::ct_error::input_output_error, // media error or corruption + crimson::ct_error::invarg, // if offset is < write pointer or misaligned + crimson::ct_error::ebadf, // segment closed + crimson::ct_error::enospc // write exceeds segment size + >; + virtual write_ertr::future<> write( + segment_off_t offset, ceph::bufferlist bl) = 0; + + virtual ~Segment() {} +}; +using SegmentRef = boost::intrusive_ptr<Segment>; + +constexpr size_t PADDR_SIZE = sizeof(paddr_t); + +class SegmentManager { +public: + using open_ertr = crimson::errorator< + crimson::ct_error::input_output_error, + crimson::ct_error::invarg, + crimson::ct_error::enoent>; + virtual open_ertr::future<SegmentRef> open(segment_id_t id) = 0; + + using release_ertr = crimson::errorator< + crimson::ct_error::input_output_error, + crimson::ct_error::invarg, + crimson::ct_error::enoent>; + virtual release_ertr::future<> release(segment_id_t id) = 0; + + using read_ertr = crimson::errorator< + crimson::ct_error::input_output_error, + crimson::ct_error::invarg, + crimson::ct_error::enoent, + crimson::ct_error::erange>; + virtual read_ertr::future<> read( + paddr_t addr, + size_t len, + ceph::bufferptr &out) = 0; + read_ertr::future<ceph::bufferptr> read( + paddr_t addr, + size_t len) { + auto ptrref = std::make_unique<ceph::bufferptr>( + buffer::create_page_aligned(len)); + return read(addr, len, *ptrref).safe_then( + [ptrref=std::move(ptrref)]() mutable { + return read_ertr::make_ready_future<bufferptr>(std::move(*ptrref)); + }); + } + + /* Methods for discovering device geometry, segmentid set, etc */ + virtual size_t get_size() const = 0; + virtual segment_off_t get_block_size() const = 0; + virtual segment_off_t get_segment_size() const = 0; + virtual segment_id_t get_num_segments() const { + ceph_assert(get_size() % get_segment_size() == 0); + return ((segment_id_t)(get_size() / get_segment_size())); + } + virtual const seastore_meta_t &get_meta() const = 0; + + virtual ~SegmentManager() {} +}; +using SegmentManagerRef = std::unique_ptr<SegmentManager>; + +} diff --git a/src/crimson/os/seastore/segment_manager/block.cc b/src/crimson/os/seastore/segment_manager/block.cc new file mode 100644 index 000000000..6a4991d42 --- /dev/null +++ b/src/crimson/os/seastore/segment_manager/block.cc @@ -0,0 +1,402 @@ +// -*- 
mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <sys/mman.h> +#include <string.h> + +#include "crimson/common/log.h" + +#include "include/buffer.h" +#include "crimson/os/seastore/segment_manager/block.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_filestore); + } +} + + +namespace crimson::os::seastore::segment_manager::block { + +static write_ertr::future<> do_write( + seastar::file &device, + uint64_t offset, + bufferptr &bptr) +{ + logger().debug( + "block: do_write offset {} len {}", + offset, + bptr.length()); + return device.dma_write( + offset, + bptr.c_str(), + bptr.length() + ).handle_exception([](auto e) -> write_ertr::future<size_t> { + logger().error( + "do_write: dma_write got error {}", + e); + return crimson::ct_error::input_output_error::make(); + }).then([length=bptr.length()](auto result) + -> write_ertr::future<> { + if (result != length) { + return crimson::ct_error::input_output_error::make(); + } + return write_ertr::now(); + }); +} + +static read_ertr::future<> do_read( + seastar::file &device, + uint64_t offset, + bufferptr &bptr) +{ + logger().debug( + "block: do_read offset {} len {}", + offset, + bptr.length()); + return device.dma_read( + offset, + bptr.c_str(), + bptr.length() + ).handle_exception([](auto e) -> read_ertr::future<size_t> { + logger().error( + "do_read: dma_read got error {}", + e); + return crimson::ct_error::input_output_error::make(); + }).then([length=bptr.length()](auto result) -> read_ertr::future<> { + if (result != length) { + return crimson::ct_error::input_output_error::make(); + } + return read_ertr::now(); + }); +} + +write_ertr::future<> +SegmentStateTracker::write_out( + seastar::file &device, + uint64_t offset) +{ + return do_write(device, offset, bptr); +} + +write_ertr::future<> +SegmentStateTracker::read_in( + seastar::file &device, + uint64_t offset) +{ + return do_read( + device, + offset, + bptr); +} + +static +block_sm_superblock_t make_superblock( + const BlockSegmentManager::mkfs_config_t &config, + const seastar::stat_data &data) +{ + logger().debug( + "{}: size {}, block_size {}, allocated_size {}, configured_size {}", + __func__, + data.size, + data.block_size, + data.allocated_size, + config.total_size); + size_t size = (data.size == 0) ? 
config.total_size : data.size; + size_t raw_segments = size / config.segment_size; + size_t tracker_size = SegmentStateTracker::get_raw_size( + raw_segments, + data.block_size); + size_t segments = (size - tracker_size - data.block_size) + / config.segment_size; + return block_sm_superblock_t{ + size, + config.segment_size, + data.block_size, + segments, + data.block_size, + tracker_size + data.block_size, + config.meta + }; +} + +using open_device_ret = + BlockSegmentManager::access_ertr::future< + std::pair<seastar::file, seastar::stat_data> + >; +static +open_device_ret open_device(const std::string &in_path, seastar::open_flags mode) +{ + return seastar::do_with( + in_path, + [mode](auto &path) { + return seastar::file_stat(path, seastar::follow_symlink::yes + ).then([mode, &path](auto stat) mutable { + return seastar::open_file_dma(path, mode).then([=](auto file) { + logger().debug("open_device: open successful"); + return std::make_pair(file, stat); + }); + }).handle_exception([](auto e) -> open_device_ret { + logger().error( + "open_device: got error {}", + e); + return crimson::ct_error::input_output_error::make(); + }); + }); +} + + +static +BlockSegmentManager::access_ertr::future<> +write_superblock(seastar::file &device, block_sm_superblock_t sb) +{ + assert(ceph::encoded_sizeof_bounded<block_sm_superblock_t>() < + sb.block_size); + return seastar::do_with( + bufferptr(ceph::buffer::create_page_aligned(sb.block_size)), + [=, &device](auto &bp) { + bufferlist bl; + encode(sb, bl); + auto iter = bl.begin(); + assert(bl.length() < sb.block_size); + iter.copy(bl.length(), bp.c_str()); + logger().debug("write_superblock: doing writeout"); + return do_write(device, 0, bp); + }); +} + +static +BlockSegmentManager::access_ertr::future<block_sm_superblock_t> +read_superblock(seastar::file &device, seastar::stat_data sd) +{ + assert(ceph::encoded_sizeof_bounded<block_sm_superblock_t>() < + sd.block_size); + return seastar::do_with( + bufferptr(ceph::buffer::create_page_aligned(sd.block_size)), + [=, &device](auto &bp) { + return do_read( + device, + 0, + bp + ).safe_then([=, &bp] { + bufferlist bl; + bl.push_back(bp); + block_sm_superblock_t ret; + auto bliter = bl.cbegin(); + decode(ret, bliter); + return BlockSegmentManager::access_ertr::future<block_sm_superblock_t>( + BlockSegmentManager::access_ertr::ready_future_marker{}, + ret); + }); + }); +} + +BlockSegment::BlockSegment( + BlockSegmentManager &manager, segment_id_t id) + : manager(manager), id(id) {} + +segment_off_t BlockSegment::get_write_capacity() const +{ + return manager.get_segment_size(); +} + +Segment::close_ertr::future<> BlockSegment::close() +{ + manager.segment_close(id); + return close_ertr::now(); +} + +Segment::write_ertr::future<> BlockSegment::write( + segment_off_t offset, ceph::bufferlist bl) +{ + if (offset < write_pointer || offset % manager.superblock.block_size != 0) + return crimson::ct_error::invarg::make(); + + if (offset + bl.length() > manager.superblock.segment_size) + return crimson::ct_error::enospc::make(); + + write_pointer = offset + bl.length(); + return manager.segment_write({id, offset}, bl); +} + +Segment::close_ertr::future<> BlockSegmentManager::segment_close(segment_id_t id) +{ + assert(tracker); + tracker->set(id, segment_state_t::CLOSED); + return tracker->write_out(device, superblock.tracker_offset); +} + +Segment::write_ertr::future<> BlockSegmentManager::segment_write( + paddr_t addr, + ceph::bufferlist bl, + bool ignore_check) +{ + assert((bl.length() % superblock.block_size) == 
0); + logger().debug( + "segment_write to segment {} at offset {}, physical offset {}, len {}", + addr.segment, + addr.offset, + get_offset(addr), + bl.length()); + + + // TODO send an iovec and avoid the copy -- bl should have aligned + // constituent buffers and they will remain unmodified until the write + // completes + return seastar::do_with( + bufferptr(ceph::buffer::create_page_aligned(bl.length())), + [&](auto &bp) { + auto iter = bl.cbegin(); + iter.copy(bl.length(), bp.c_str()); + return do_write(device, get_offset(addr), bp); + }); +} + +BlockSegmentManager::~BlockSegmentManager() +{ +} + +BlockSegmentManager::mount_ret BlockSegmentManager::mount(mount_config_t config) +{ + return open_device( + config.path, seastar::open_flags::rw | seastar::open_flags::dsync + ).safe_then([=](auto p) { + device = std::move(p.first); + auto sd = p.second; + return read_superblock(device, sd); + }).safe_then([=](auto sb) { + superblock = sb; + tracker = std::make_unique<SegmentStateTracker>( + superblock.segments, + superblock.block_size); + return tracker->read_in( + device, + superblock.tracker_offset + ).safe_then([this] { + for (segment_id_t i = 0; i < tracker->get_capacity(); ++i) { + if (tracker->get(i) == segment_state_t::OPEN) { + tracker->set(i, segment_state_t::CLOSED); + } + } + return tracker->write_out(device, superblock.tracker_offset); + }); + }); +} + +BlockSegmentManager::mkfs_ret BlockSegmentManager::mkfs(mkfs_config_t config) +{ + return seastar::do_with( + seastar::file{}, + seastar::stat_data{}, + block_sm_superblock_t{}, + std::unique_ptr<SegmentStateTracker>(), + [=](auto &device, auto &stat, auto &sb, auto &tracker) { + return open_device( + config.path, seastar::open_flags::rw + ).safe_then([&, config](auto p) { + device = p.first; + stat = p.second; + sb = make_superblock(config, stat); + return write_superblock(device, sb); + }).safe_then([&] { + logger().debug("BlockSegmentManager::mkfs: superblock written"); + tracker.reset(new SegmentStateTracker(sb.segments, sb.block_size)); + return tracker->write_out(device, sb.tracker_offset); + }).finally([&] { + return device.close(); + }).safe_then([] { + logger().debug("BlockSegmentManager::mkfs: complete"); + return mkfs_ertr::now(); + }); + }); +} + +BlockSegmentManager::close_ertr::future<> BlockSegmentManager::close() +{ + return device.close(); +} + +SegmentManager::open_ertr::future<SegmentRef> BlockSegmentManager::open( + segment_id_t id) +{ + if (id >= get_num_segments()) { + logger().error("BlockSegmentManager::open: invalid segment {}", id); + return crimson::ct_error::invarg::make(); + } + + if (tracker->get(id) != segment_state_t::EMPTY) { + logger().error( + "BlockSegmentManager::open: invalid segment {} state {}", + id, + tracker->get(id)); + return crimson::ct_error::invarg::make(); + } + + tracker->set(id, segment_state_t::OPEN); + return tracker->write_out(device, superblock.tracker_offset + ).safe_then([this, id] { + return open_ertr::future<SegmentRef>( + open_ertr::ready_future_marker{}, + SegmentRef(new BlockSegment(*this, id))); + }); +} + +SegmentManager::release_ertr::future<> BlockSegmentManager::release( + segment_id_t id) +{ + logger().debug("BlockSegmentManager::release: {}", id); + + if (id >= get_num_segments()) { + logger().error( + "BlockSegmentManager::release: invalid segment {}", + id); + return crimson::ct_error::invarg::make(); + } + + if (tracker->get(id) != segment_state_t::CLOSED) { + logger().error( + "BlockSegmentManager::release: invalid segment {} state {}", + id, + 
tracker->get(id)); + return crimson::ct_error::invarg::make(); + } + + tracker->set(id, segment_state_t::EMPTY); + return tracker->write_out(device, superblock.tracker_offset); +} + +SegmentManager::read_ertr::future<> BlockSegmentManager::read( + paddr_t addr, + size_t len, + ceph::bufferptr &out) +{ + if (addr.segment >= get_num_segments()) { + logger().error( + "BlockSegmentManager::read: invalid segment {}", + addr); + return crimson::ct_error::invarg::make(); + } + + if (addr.offset + len > superblock.segment_size) { + logger().error( + "BlockSegmentManager::read: invalid offset {}~{}!", + addr, + len); + return crimson::ct_error::invarg::make(); + } + + if (tracker->get(addr.segment) == segment_state_t::EMPTY) { + logger().error( + "BlockSegmentManager::read: read on invalid segment {} state {}", + addr.segment, + tracker->get(addr.segment)); + return crimson::ct_error::enoent::make(); + } + + return do_read( + device, + get_offset(addr), + out); +} + +} diff --git a/src/crimson/os/seastore/segment_manager/block.h b/src/crimson/os/seastore/segment_manager/block.h new file mode 100644 index 000000000..927b13e4e --- /dev/null +++ b/src/crimson/os/seastore/segment_manager/block.h @@ -0,0 +1,222 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <boost/intrusive_ptr.hpp> +#include <boost/smart_ptr/intrusive_ref_counter.hpp> + +#include <seastar/core/file.hh> +#include <seastar/core/future.hh> +#include <seastar/core/reactor.hh> + +#include "crimson/common/layout.h" + +#include "crimson/os/seastore/segment_manager.h" + +namespace crimson::os::seastore::segment_manager::block { + +struct block_sm_superblock_t { + size_t size = 0; + size_t segment_size = 0; + size_t block_size = 0; + + size_t segments = 0; + uint64_t tracker_offset = 0; + uint64_t first_segment_offset = 0; + + seastore_meta_t meta; + + DENC(block_sm_superblock_t, v, p) { + DENC_START(1, 1, p); + denc(v.size, p); + denc(v.segment_size, p); + denc(v.block_size, p); + denc(v.segments, p); + denc(v.tracker_offset, p); + denc(v.first_segment_offset, p); + denc(v.meta, p); + DENC_FINISH(p); + } +}; + +using write_ertr = crimson::errorator< + crimson::ct_error::input_output_error>; +using read_ertr = crimson::errorator< + crimson::ct_error::input_output_error>; + +/** + * SegmentStateTracker + * + * Tracks lifecycle state of each segment using space at the beginning + * of the drive. 
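+ * One byte of state is stored per segment; the tracker's size is rounded
+ * up to a whole number of device blocks (see get_raw_size()).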
+ */ +class SegmentStateTracker { + using segment_state_t = Segment::segment_state_t; + + bufferptr bptr; + + using L = absl::container_internal::Layout<uint8_t>; + const L layout; + +public: + static size_t get_raw_size(size_t segments, size_t block_size) { + return p2roundup(segments, block_size); + } + + SegmentStateTracker(size_t segments, size_t block_size) + : bptr(ceph::buffer::create_page_aligned( + get_raw_size(segments, block_size))), + layout(bptr.length()) + { + ::memset( + bptr.c_str(), + static_cast<char>(segment_state_t::EMPTY), + bptr.length()); + } + + size_t get_size() const { + return bptr.length(); + } + + size_t get_capacity() const { + return bptr.length(); + } + + segment_state_t get(segment_id_t offset) const { + assert(offset < get_capacity()); + return static_cast<segment_state_t>( + layout.template Pointer<0>( + bptr.c_str())[offset]); + } + + void set(segment_id_t offset, segment_state_t state) { + assert(offset < get_capacity()); + layout.template Pointer<0>(bptr.c_str())[offset] = + static_cast<uint8_t>(state); + } + + write_ertr::future<> write_out( + seastar::file &device, + uint64_t offset); + + read_ertr::future<> read_in( + seastar::file &device, + uint64_t offset); +}; + +class BlockSegmentManager; +class BlockSegment final : public Segment { + friend class BlockSegmentManager; + BlockSegmentManager &manager; + const segment_id_t id; + segment_off_t write_pointer = 0; +public: + BlockSegment(BlockSegmentManager &manager, segment_id_t id); + + segment_id_t get_segment_id() const final { return id; } + segment_off_t get_write_capacity() const final; + segment_off_t get_write_ptr() const final { return write_pointer; } + close_ertr::future<> close() final; + write_ertr::future<> write(segment_off_t offset, ceph::bufferlist bl) final; + + ~BlockSegment() {} +}; + +/** + * BlockSegmentManager + * + * Implements SegmentManager on a conventional block device. + * SegmentStateTracker uses space at the start of the device to store + * state analogous to that of the segments of a ZNS device. 
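+ *
+ * On-disk layout, as produced by mkfs()/make_superblock(): the superblock
+ * occupies the first block, the tracker follows at tracker_offset, and
+ * segment data starts at first_segment_offset.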
+ */ +class BlockSegmentManager final : public SegmentManager { +public: + using access_ertr = crimson::errorator< + crimson::ct_error::input_output_error, + crimson::ct_error::permission_denied, + crimson::ct_error::enoent>; + + + struct mount_config_t { + std::string path; + }; + using mount_ertr = access_ertr; + using mount_ret = access_ertr::future<>; + mount_ret mount(mount_config_t); + + struct mkfs_config_t { + std::string path; + size_t segment_size = 0; + size_t total_size = 0; + seastore_meta_t meta; + }; + using mkfs_ertr = access_ertr; + using mkfs_ret = mkfs_ertr::future<>; + static mkfs_ret mkfs(mkfs_config_t); + + using close_ertr = crimson::errorator< + crimson::ct_error::input_output_error + >; + close_ertr::future<> close(); + + BlockSegmentManager() = default; + ~BlockSegmentManager(); + + open_ertr::future<SegmentRef> open(segment_id_t id) final; + + release_ertr::future<> release(segment_id_t id) final; + + read_ertr::future<> read( + paddr_t addr, + size_t len, + ceph::bufferptr &out) final; + + size_t get_size() const final { + return superblock.size; + } + segment_off_t get_block_size() const { + return superblock.block_size; + } + segment_off_t get_segment_size() const { + return superblock.segment_size; + } + + // public so tests can bypass segment interface when simpler + Segment::write_ertr::future<> segment_write( + paddr_t addr, + ceph::bufferlist bl, + bool ignore_check=false); + +private: + friend class BlockSegment; + using segment_state_t = Segment::segment_state_t; + + + std::unique_ptr<SegmentStateTracker> tracker; + block_sm_superblock_t superblock; + seastar::file device; + + size_t get_offset(paddr_t addr) { + return superblock.first_segment_offset + + (addr.segment * superblock.segment_size) + + addr.offset; + } + + const seastore_meta_t &get_meta() const { + return superblock.meta; + } + + std::vector<segment_state_t> segment_state; + + char *buffer = nullptr; + + Segment::close_ertr::future<> segment_close(segment_id_t id); +}; + +} + +WRITE_CLASS_DENC_BOUNDED( + crimson::os::seastore::segment_manager::block::block_sm_superblock_t +) + diff --git a/src/crimson/os/seastore/segment_manager/ephemeral.cc b/src/crimson/os/seastore/segment_manager/ephemeral.cc new file mode 100644 index 000000000..3250303ad --- /dev/null +++ b/src/crimson/os/seastore/segment_manager/ephemeral.cc @@ -0,0 +1,226 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <sys/mman.h> +#include <string.h> + +#include "seastar/core/sleep.hh" + +#include "crimson/common/log.h" + +#include "include/buffer.h" +#include "crimson/os/seastore/segment_manager/ephemeral.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_filestore); + } +} + +namespace crimson::os::seastore::segment_manager { + +std::ostream &operator<<(std::ostream &lhs, const ephemeral_config_t &c) { + return lhs << "ephemeral_config_t(size=" << c.size << ", block_size=" << c.block_size + << ", segment_size=" << c.segment_size << ")"; +} + +EphemeralSegmentManagerRef create_test_ephemeral() { + return EphemeralSegmentManagerRef( + new EphemeralSegmentManager(DEFAULT_TEST_EPHEMERAL)); +} + +EphemeralSegment::EphemeralSegment( + EphemeralSegmentManager &manager, segment_id_t id) + : manager(manager), id(id) {} + +segment_off_t EphemeralSegment::get_write_capacity() const +{ + return manager.get_segment_size(); +} + +Segment::close_ertr::future<> EphemeralSegment::close() +{ + manager.segment_close(id); + return 
close_ertr::now().safe_then([] { + return seastar::sleep(std::chrono::milliseconds(1)); + }); +} + +Segment::write_ertr::future<> EphemeralSegment::write( + segment_off_t offset, ceph::bufferlist bl) +{ + if (offset < write_pointer || offset % manager.config.block_size != 0) + return crimson::ct_error::invarg::make(); + + if (offset + bl.length() > (size_t)manager.get_segment_size()) + return crimson::ct_error::enospc::make(); + + return manager.segment_write({id, offset}, bl); +} + +Segment::close_ertr::future<> EphemeralSegmentManager::segment_close(segment_id_t id) +{ + if (segment_state[id] != segment_state_t::OPEN) + return crimson::ct_error::invarg::make(); + + segment_state[id] = segment_state_t::CLOSED; + return Segment::close_ertr::now().safe_then([] { + return seastar::sleep(std::chrono::milliseconds(1)); + }); +} + +Segment::write_ertr::future<> EphemeralSegmentManager::segment_write( + paddr_t addr, + ceph::bufferlist bl, + bool ignore_check) +{ + logger().debug( + "segment_write to segment {} at offset {}, physical offset {}, len {}, crc {}", + addr.segment, + addr.offset, + get_offset(addr), + bl.length(), + bl.crc32c(1)); + if (!ignore_check && segment_state[addr.segment] != segment_state_t::OPEN) + return crimson::ct_error::invarg::make(); + + bl.begin().copy(bl.length(), buffer + get_offset(addr)); + return Segment::write_ertr::now().safe_then([] { + return seastar::sleep(std::chrono::milliseconds(1)); + }); +} + +EphemeralSegmentManager::init_ertr::future<> EphemeralSegmentManager::init() +{ + logger().debug( + "Initing ephemeral segment manager with config {}", + config); + + meta = seastore_meta_t{}; + + if (config.block_size % (4<<10) != 0) { + return crimson::ct_error::invarg::make(); + } + if (config.segment_size % config.block_size != 0) { + return crimson::ct_error::invarg::make(); + } + if (config.size % config.segment_size != 0) { + return crimson::ct_error::invarg::make(); + } + + auto addr = ::mmap( + nullptr, + config.size, + PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, + -1, + 0); + + segment_state.resize(config.size / config.segment_size, segment_state_t::EMPTY); + + if (addr == MAP_FAILED) + return crimson::ct_error::enospc::make(); + + buffer = (char*)addr; + + ::memset(buffer, 0, config.size); + return init_ertr::now().safe_then([] { + return seastar::sleep(std::chrono::milliseconds(1)); + }); +} + +EphemeralSegmentManager::~EphemeralSegmentManager() +{ + if (buffer) { + ::munmap(buffer, config.size); + } +} + +void EphemeralSegmentManager::remount() +{ + for (auto &i : segment_state) { + if (i == Segment::segment_state_t::OPEN) + i = Segment::segment_state_t::CLOSED; + } +} + +SegmentManager::open_ertr::future<SegmentRef> EphemeralSegmentManager::open( + segment_id_t id) +{ + if (id >= get_num_segments()) { + logger().error("EphemeralSegmentManager::open: invalid segment {}", id); + return crimson::ct_error::invarg::make(); + } + + if (segment_state[id] != segment_state_t::EMPTY) { + logger().error("EphemeralSegmentManager::open: segment {} not empty", id); + return crimson::ct_error::invarg::make(); + } + + segment_state[id] = segment_state_t::OPEN; + return open_ertr::make_ready_future<SegmentRef>(new EphemeralSegment(*this, id)); +} + +SegmentManager::release_ertr::future<> EphemeralSegmentManager::release( + segment_id_t id) +{ + logger().debug("EphemeralSegmentManager::release: {}", id); + + if (id >= get_num_segments()) { + logger().error( + "EphemeralSegmentManager::release: invalid segment {}", + id); + return 
crimson::ct_error::invarg::make(); + } + + if (segment_state[id] != segment_state_t::CLOSED) { + logger().error( + "EphemeralSegmentManager::release: segment id {} not closed", + id); + return crimson::ct_error::invarg::make(); + } + + ::memset(buffer + get_offset({id, 0}), 0, config.segment_size); + segment_state[id] = segment_state_t::EMPTY; + return release_ertr::now().safe_then([] { + return seastar::sleep(std::chrono::milliseconds(1)); + }); +} + +SegmentManager::read_ertr::future<> EphemeralSegmentManager::read( + paddr_t addr, + size_t len, + ceph::bufferptr &out) +{ + if (addr.segment >= get_num_segments()) { + logger().error( + "EphemeralSegmentManager::read: invalid segment {}", + addr); + return crimson::ct_error::invarg::make(); + } + + if (addr.offset + len > config.segment_size) { + logger().error( + "EphemeralSegmentManager::read: invalid offset {}~{}!", + addr, + len); + return crimson::ct_error::invarg::make(); + } + + out.copy_in(0, len, buffer + get_offset(addr)); + + bufferlist bl; + bl.push_back(out); + logger().debug( + "segment_read to segment {} at offset {}, physical offset {}, length {}, crc {}", + addr.segment, + addr.offset, + get_offset(addr), + len, + bl.begin().crc32c(len, 1)); + + return read_ertr::now().safe_then([] { + return seastar::sleep(std::chrono::milliseconds(1)); + }); +} + +} diff --git a/src/crimson/os/seastore/segment_manager/ephemeral.h b/src/crimson/os/seastore/segment_manager/ephemeral.h new file mode 100644 index 000000000..9f19cb4d0 --- /dev/null +++ b/src/crimson/os/seastore/segment_manager/ephemeral.h @@ -0,0 +1,111 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <boost/intrusive_ptr.hpp> +#include <boost/smart_ptr/intrusive_ref_counter.hpp> +#include <seastar/core/future.hh> + +#include "crimson/os/seastore/segment_manager.h" + +#include "crimson/os/seastore/segment_manager/ephemeral.h" + +namespace crimson::os::seastore::segment_manager { + +class EphemeralSegmentManager; +using EphemeralSegmentManagerRef = std::unique_ptr<EphemeralSegmentManager>; + +struct ephemeral_config_t { + size_t size = 0; + size_t block_size = 0; + size_t segment_size = 0; +}; + +constexpr ephemeral_config_t DEFAULT_TEST_EPHEMERAL = { + 1 << 30, + 4 << 10, + 8 << 20 +}; + +std::ostream &operator<<(std::ostream &, const ephemeral_config_t &); +EphemeralSegmentManagerRef create_test_ephemeral(); + +class EphemeralSegment final : public Segment { + friend class EphemeralSegmentManager; + EphemeralSegmentManager &manager; + const segment_id_t id; + segment_off_t write_pointer = 0; +public: + EphemeralSegment(EphemeralSegmentManager &manager, segment_id_t id); + + segment_id_t get_segment_id() const final { return id; } + segment_off_t get_write_capacity() const final; + segment_off_t get_write_ptr() const final { return write_pointer; } + close_ertr::future<> close() final; + write_ertr::future<> write(segment_off_t offset, ceph::bufferlist bl) final; + + ~EphemeralSegment() {} +}; + +class EphemeralSegmentManager final : public SegmentManager { + friend class EphemeralSegment; + using segment_state_t = Segment::segment_state_t; + + const ephemeral_config_t config; + std::optional<seastore_meta_t> meta; + + size_t get_offset(paddr_t addr) { + return (addr.segment * config.segment_size) + addr.offset; + } + + std::vector<segment_state_t> segment_state; + + char *buffer = nullptr; + + Segment::close_ertr::future<> segment_close(segment_id_t id); + +public: + 
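+ // In-memory SegmentManager backed by an anonymous mmap, used by tests
+ // (see create_test_ephemeral()); operations resolve after a short
+ // artificial seastar::sleep.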
EphemeralSegmentManager(ephemeral_config_t config) : config(config) {} + ~EphemeralSegmentManager(); + + using init_ertr = crimson::errorator< + crimson::ct_error::enospc, + crimson::ct_error::invarg, + crimson::ct_error::erange>; + init_ertr::future<> init(); + + open_ertr::future<SegmentRef> open(segment_id_t id) final; + + release_ertr::future<> release(segment_id_t id) final; + + read_ertr::future<> read( + paddr_t addr, + size_t len, + ceph::bufferptr &out) final; + + size_t get_size() const final { + return config.size; + } + segment_off_t get_block_size() const final { + return config.block_size; + } + segment_off_t get_segment_size() const final { + return config.segment_size; + } + + const seastore_meta_t &get_meta() const final { + assert(meta); + return *meta; + } + + void remount(); + + // public so tests can bypass segment interface when simpler + Segment::write_ertr::future<> segment_write( + paddr_t addr, + ceph::bufferlist bl, + bool ignore_check=false); +}; + +} diff --git a/src/crimson/os/seastore/transaction.h b/src/crimson/os/seastore/transaction.h new file mode 100644 index 000000000..e189d1d32 --- /dev/null +++ b/src/crimson/os/seastore/transaction.h @@ -0,0 +1,145 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <iostream> + +#include "crimson/os/seastore/seastore_types.h" +#include "crimson/os/seastore/cached_extent.h" +#include "crimson/os/seastore/root_block.h" + +namespace crimson::os::seastore { + +/** + * Transaction + * + * Representation of in-progress mutation. Used exclusively through Cache methods. + */ +class Transaction { +public: + using Ref = std::unique_ptr<Transaction>; + enum class get_extent_ret { + PRESENT, + ABSENT, + RETIRED + }; + get_extent_ret get_extent(paddr_t addr, CachedExtentRef *out) { + if (retired_set.count(addr)) { + return get_extent_ret::RETIRED; + } else if (auto iter = write_set.find_offset(addr); + iter != write_set.end()) { + if (out) + *out = CachedExtentRef(&*iter); + return get_extent_ret::PRESENT; + } else if ( + auto iter = read_set.find(addr); + iter != read_set.end()) { + if (out) + *out = CachedExtentRef(*iter); + return get_extent_ret::PRESENT; + } else { + return get_extent_ret::ABSENT; + } + } + + void add_to_retired_set(CachedExtentRef ref) { + ceph_assert(!is_weak()); + if (!ref->is_initial_pending()) { + // && retired_set.count(ref->get_paddr()) == 0 + // If it's already in the set, insert here will be a noop, + // which is what we want. 
+ retired_set.insert(ref); + } else { + ref->state = CachedExtent::extent_state_t::INVALID; + } + if (ref->is_pending()) { + write_set.erase(*ref); + } + } + + void add_to_read_set(CachedExtentRef ref) { + if (is_weak()) return; + + ceph_assert(read_set.count(ref) == 0); + read_set.insert(ref); + } + + void add_fresh_extent(CachedExtentRef ref) { + ceph_assert(!is_weak()); + fresh_block_list.push_back(ref); + ref->set_paddr(make_record_relative_paddr(offset)); + offset += ref->get_length(); + write_set.insert(*ref); + } + + void add_mutated_extent(CachedExtentRef ref) { + ceph_assert(!is_weak()); + mutated_block_list.push_back(ref); + write_set.insert(*ref); + } + + void mark_segment_to_release(segment_id_t segment) { + assert(to_release == NULL_SEG_ID); + to_release = segment; + } + + segment_id_t get_segment_to_release() const { + return to_release; + } + + const auto &get_fresh_block_list() { + return fresh_block_list; + } + + const auto &get_mutated_block_list() { + return mutated_block_list; + } + + const auto &get_retired_set() { + return retired_set; + } + + bool is_weak() const { + return weak; + } + +private: + friend class Cache; + friend Ref make_transaction(); + friend Ref make_weak_transaction(); + + /** + * If set, *this may not be used to perform writes and will not provide + * consistentency allowing operations using to avoid maintaining a read_set. + */ + const bool weak; + + RootBlockRef root; ///< ref to root if read or written by transaction + + segment_off_t offset = 0; ///< relative offset of next block + + pextent_set_t read_set; ///< set of extents read by paddr + ExtentIndex write_set; ///< set of extents written by paddr + + std::list<CachedExtentRef> fresh_block_list; ///< list of fresh blocks + std::list<CachedExtentRef> mutated_block_list; ///< list of mutated blocks + + pextent_set_t retired_set; ///< list of extents mutated by this transaction + + ///< if != NULL_SEG_ID, release this segment after completion + segment_id_t to_release = NULL_SEG_ID; + + Transaction(bool weak) : weak(weak) {} +}; +using TransactionRef = Transaction::Ref; + +inline TransactionRef make_transaction() { + return std::unique_ptr<Transaction>(new Transaction(false)); +} + +inline TransactionRef make_weak_transaction() { + return std::unique_ptr<Transaction>(new Transaction(true)); +} + +} diff --git a/src/crimson/os/seastore/transaction_manager.cc b/src/crimson/os/seastore/transaction_manager.cc new file mode 100644 index 000000000..7b86631e2 --- /dev/null +++ b/src/crimson/os/seastore/transaction_manager.cc @@ -0,0 +1,306 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "include/denc.h" +#include "include/intarith.h" + +#include "crimson/common/log.h" + +#include "crimson/os/seastore/transaction_manager.h" +#include "crimson/os/seastore/segment_manager.h" +#include "crimson/os/seastore/journal.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_filestore); + } +} + +namespace crimson::os::seastore { + +TransactionManager::TransactionManager( + SegmentManager &segment_manager, + SegmentCleaner &segment_cleaner, + Journal &journal, + Cache &cache, + LBAManager &lba_manager) + : segment_manager(segment_manager), + segment_cleaner(segment_cleaner), + cache(cache), + lba_manager(lba_manager), + journal(journal) +{} + +TransactionManager::mkfs_ertr::future<> TransactionManager::mkfs() +{ + return journal.open_for_write().safe_then([this](auto addr) { + 
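+ // The journal is now open; record its head with the cleaner, then
+ // bootstrap the cache and LBA manager in a single initial transaction.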
logger().debug("TransactionManager::mkfs: about to do_with"); + segment_cleaner.set_journal_head(addr); + return seastar::do_with( + create_transaction(), + [this](auto &transaction) { + logger().debug("TransactionManager::mkfs: about to cache.mkfs"); + cache.init(); + return cache.mkfs(*transaction + ).safe_then([this, &transaction] { + return lba_manager.mkfs(*transaction); + }).safe_then([this, &transaction] { + logger().debug("TransactionManager::mkfs: about to submit_transaction"); + return submit_transaction(std::move(transaction)).handle_error( + crimson::ct_error::eagain::handle([] { + ceph_assert(0 == "eagain impossible"); + return mkfs_ertr::now(); + }), + mkfs_ertr::pass_further{} + ); + }); + }); + }).safe_then([this] { + return journal.close(); + }); +} + +TransactionManager::mount_ertr::future<> TransactionManager::mount() +{ + cache.init(); + return journal.replay([this](auto seq, auto paddr, const auto &e) { + return cache.replay_delta(seq, paddr, e); + }).safe_then([this] { + return journal.open_for_write(); + }).safe_then([this](auto addr) { + segment_cleaner.set_journal_head(addr); + return seastar::do_with( + make_weak_transaction(), + [this](auto &t) { + return cache.init_cached_extents(*t, [this](auto &t, auto &e) { + return lba_manager.init_cached_extent(t, e); + }).safe_then([this, &t] { + assert(segment_cleaner.debug_check_space( + *segment_cleaner.get_empty_space_tracker())); + return lba_manager.scan_mapped_space( + *t, + [this](paddr_t addr, extent_len_t len) { + logger().debug("TransactionManager::mount: marking {}~{} used", + addr, + len); + segment_cleaner.mark_space_used( + addr, + len , + /* init_scan = */ true); + }); + }); + }); + }).safe_then([this] { + segment_cleaner.complete_init(); + }).handle_error( + mount_ertr::pass_further{}, + crimson::ct_error::all_same_way([] { + ceph_assert(0 == "unhandled error"); + return mount_ertr::now(); + })); +} + +TransactionManager::close_ertr::future<> TransactionManager::close() { + return cache.close( + ).safe_then([this] { + return journal.close(); + }); +} + +TransactionManager::ref_ret TransactionManager::inc_ref( + Transaction &t, + LogicalCachedExtentRef &ref) +{ + return lba_manager.incref_extent(t, ref->get_laddr()).safe_then([](auto r) { + return r.refcount; + }).handle_error( + ref_ertr::pass_further{}, + ct_error::all_same_way([](auto e) { + ceph_assert(0 == "unhandled error, TODO"); + })); +} + +TransactionManager::ref_ret TransactionManager::inc_ref( + Transaction &t, + laddr_t offset) +{ + return lba_manager.incref_extent(t, offset).safe_then([](auto result) { + return result.refcount; + }); +} + +TransactionManager::ref_ret TransactionManager::dec_ref( + Transaction &t, + LogicalCachedExtentRef &ref) +{ + return lba_manager.decref_extent(t, ref->get_laddr() + ).safe_then([this, &t, ref](auto ret) { + if (ret.refcount == 0) { + logger().debug( + "TransactionManager::dec_ref: extent {} refcount 0", + *ref); + cache.retire_extent(t, ref); + } + return ret.refcount; + }); +} + +TransactionManager::ref_ret TransactionManager::dec_ref( + Transaction &t, + laddr_t offset) +{ + return lba_manager.decref_extent(t, offset + ).safe_then([this, offset, &t](auto result) -> ref_ret { + if (result.refcount == 0) { + logger().debug( + "TransactionManager::dec_ref: offset {} refcount 0", + offset); + return cache.retire_extent_if_cached(t, result.addr).safe_then([] { + return ref_ret( + ref_ertr::ready_future_marker{}, + 0); + }); + } else { + return ref_ret( + ref_ertr::ready_future_marker{}, + result.refcount); + } 
+ }); +} + +TransactionManager::submit_transaction_ertr::future<> +TransactionManager::submit_transaction( + TransactionRef t) +{ + logger().debug("TransactionManager::submit_transaction"); + return segment_cleaner.do_immediate_work(*t + ).safe_then([this, t=std::move(t)]() mutable -> submit_transaction_ertr::future<> { + auto record = cache.try_construct_record(*t); + if (!record) { + return crimson::ct_error::eagain::make(); + } + + return journal.submit_record(std::move(*record) + ).safe_then([this, t=std::move(t)](auto p) mutable { + auto [addr, journal_seq] = p; + segment_cleaner.set_journal_head(journal_seq); + cache.complete_commit(*t, addr, journal_seq, &segment_cleaner); + lba_manager.complete_transaction(*t); + auto to_release = t->get_segment_to_release(); + if (to_release != NULL_SEG_ID) { + segment_cleaner.mark_segment_released(to_release); + return segment_manager.release(to_release); + } else { + return SegmentManager::release_ertr::now(); + } + }).handle_error( + submit_transaction_ertr::pass_further{}, + crimson::ct_error::all_same_way([](auto e) { + ceph_assert(0 == "Hit error submitting to journal"); + })); + }); +} + +TransactionManager::get_next_dirty_extents_ret +TransactionManager::get_next_dirty_extents(journal_seq_t seq) +{ + return cache.get_next_dirty_extents(seq); +} + +TransactionManager::rewrite_extent_ret TransactionManager::rewrite_extent( + Transaction &t, + CachedExtentRef extent) +{ + { + auto updated = cache.update_extent_from_transaction(t, extent); + if (!updated) { + logger().debug( + "{}: {} is already retired, skipping", + __func__, + *extent); + return rewrite_extent_ertr::now(); + } + extent = updated; + } + + if (extent->get_type() == extent_types_t::ROOT) { + logger().debug( + "{}: marking root {} for rewrite", + __func__, + *extent); + cache.duplicate_for_write(t, extent); + return rewrite_extent_ertr::now(); + } + return lba_manager.rewrite_extent(t, extent); +} + +TransactionManager::get_extent_if_live_ret TransactionManager::get_extent_if_live( + Transaction &t, + extent_types_t type, + paddr_t addr, + laddr_t laddr, + segment_off_t len) +{ + CachedExtentRef ret; + auto status = cache.get_extent_if_cached(t, addr, &ret); + if (status != Transaction::get_extent_ret::ABSENT) { + return get_extent_if_live_ret( + get_extent_if_live_ertr::ready_future_marker{}, + ret); + } + + if (is_logical_type(type)) { + return lba_manager.get_mapping( + t, + laddr, + len).safe_then([=, &t](lba_pin_list_t pins) { + ceph_assert(pins.size() <= 1); + if (pins.empty()) { + return get_extent_if_live_ret( + get_extent_if_live_ertr::ready_future_marker{}, + CachedExtentRef()); + } + + auto pin = std::move(pins.front()); + pins.pop_front(); + ceph_assert(pin->get_laddr() == laddr); + ceph_assert(pin->get_length() == (extent_len_t)len); + if (pin->get_paddr() == addr) { + return cache.get_extent_by_type( + t, + type, + addr, + laddr, + len).safe_then( + [this, pin=std::move(pin)](CachedExtentRef ret) mutable { + auto lref = ret->cast<LogicalCachedExtent>(); + if (!lref->has_pin()) { + lref->set_pin(std::move(pin)); + lba_manager.add_pin(lref->get_pin()); + } + return get_extent_if_live_ret( + get_extent_if_live_ertr::ready_future_marker{}, + ret); + }); + } else { + return get_extent_if_live_ret( + get_extent_if_live_ertr::ready_future_marker{}, + CachedExtentRef()); + } + }); + } else { + logger().debug( + "TransactionManager::get_extent_if_live: non-logical extent {}", + addr); + return lba_manager.get_physical_extent_if_live( + t, + type, + addr, + laddr, + len); + 
} +} + +TransactionManager::~TransactionManager() {} + +} diff --git a/src/crimson/os/seastore/transaction_manager.h b/src/crimson/os/seastore/transaction_manager.h new file mode 100644 index 000000000..d28fd0b87 --- /dev/null +++ b/src/crimson/os/seastore/transaction_manager.h @@ -0,0 +1,296 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <iostream> +#include <optional> +#include <vector> +#include <utility> +#include <functional> + +#include <boost/intrusive_ptr.hpp> +#include <boost/smart_ptr/intrusive_ref_counter.hpp> + +#include <seastar/core/future.hh> + +#include "include/ceph_assert.h" +#include "include/buffer.h" + +#include "crimson/osd/exceptions.h" + +#include "crimson/os/seastore/segment_cleaner.h" +#include "crimson/os/seastore/seastore_types.h" +#include "crimson/os/seastore/cache.h" +#include "crimson/os/seastore/segment_manager.h" +#include "crimson/os/seastore/lba_manager.h" +#include "crimson/os/seastore/journal.h" + +namespace crimson::os::seastore { +class Journal; + +/** + * TransactionManager + * + * Abstraction hiding reading and writing to persistence. + * Exposes transaction based interface with read isolation. + */ +class TransactionManager : public SegmentCleaner::ExtentCallbackInterface { +public: + TransactionManager( + SegmentManager &segment_manager, + SegmentCleaner &segment_cleaner, + Journal &journal, + Cache &cache, + LBAManager &lba_manager); + + /// Writes initial metadata to disk + using mkfs_ertr = crimson::errorator< + crimson::ct_error::input_output_error + >; + mkfs_ertr::future<> mkfs(); + + /// Reads initial metadata from disk + using mount_ertr = crimson::errorator< + crimson::ct_error::input_output_error + >; + mount_ertr::future<> mount(); + + /// Closes transaction_manager + using close_ertr = crimson::errorator< + crimson::ct_error::input_output_error + >; + close_ertr::future<> close(); + + /// Creates empty transaction + TransactionRef create_transaction() { + return make_transaction(); + } + + /// Creates weak transaction + TransactionRef create_weak_transaction() { + return make_weak_transaction(); + } + + /** + * Read extents corresponding to specified lba range + */ + using read_extent_ertr = SegmentManager::read_ertr; + template <typename T> + using read_extent_ret = read_extent_ertr::future<lextent_list_t<T>>; + template <typename T> + read_extent_ret<T> read_extents( + Transaction &t, + laddr_t offset, + extent_len_t length) + { + std::unique_ptr<lextent_list_t<T>> ret = + std::make_unique<lextent_list_t<T>>(); + auto &ret_ref = *ret; + std::unique_ptr<lba_pin_list_t> pin_list = + std::make_unique<lba_pin_list_t>(); + auto &pin_list_ref = *pin_list; + return lba_manager.get_mapping( + t, offset, length + ).safe_then([this, &t, &pin_list_ref, &ret_ref](auto pins) { + crimson::get_logger(ceph_subsys_filestore).debug( + "read_extents: mappings {}", + pins); + pins.swap(pin_list_ref); + return crimson::do_for_each( + pin_list_ref.begin(), + pin_list_ref.end(), + [this, &t, &ret_ref](auto &pin) { + crimson::get_logger(ceph_subsys_filestore).debug( + "read_extents: get_extent {}~{}", + pin->get_paddr(), + pin->get_length()); + return cache.get_extent<T>( + t, + pin->get_paddr(), + pin->get_length() + ).safe_then([this, &pin, &ret_ref](auto ref) mutable { + if (!ref->has_pin()) { + ref->set_pin(std::move(pin)); + lba_manager.add_pin(ref->get_pin()); + } + ret_ref.push_back(std::make_pair(ref->get_laddr(), ref)); + 
crimson::get_logger(ceph_subsys_filestore).debug( + "read_extents: got extent {}", + *ref); + return read_extent_ertr::now(); + }); + }); + }).safe_then([ret=std::move(ret), pin_list=std::move(pin_list)]() mutable { + return read_extent_ret<T>( + read_extent_ertr::ready_future_marker{}, + std::move(*ret)); + }); + } + + /// Obtain mutable copy of extent + LogicalCachedExtentRef get_mutable_extent(Transaction &t, LogicalCachedExtentRef ref) { + auto &logger = crimson::get_logger(ceph_subsys_filestore); + auto ret = cache.duplicate_for_write( + t, + ref)->cast<LogicalCachedExtent>(); + if (!ret->has_pin()) { + logger.debug( + "{}: duplicating {} for write: {}", + __func__, + *ref, + *ret); + ret->set_pin(ref->get_pin().duplicate()); + } else { + logger.debug( + "{}: {} already pending", + __func__, + *ref); + assert(ref->is_pending()); + assert(&*ref == &*ret); + } + return ret; + } + + + using ref_ertr = LBAManager::ref_ertr; + using ref_ret = ref_ertr::future<unsigned>; + + /// Add refcount for ref + ref_ret inc_ref( + Transaction &t, + LogicalCachedExtentRef &ref); + + /// Add refcount for offset + ref_ret inc_ref( + Transaction &t, + laddr_t offset); + + /// Remove refcount for ref + ref_ret dec_ref( + Transaction &t, + LogicalCachedExtentRef &ref); + + /// Remove refcount for offset + ref_ret dec_ref( + Transaction &t, + laddr_t offset); + + /** + * alloc_extent + * + * Allocates a new block of type T with the minimum lba range of size len + * greater than hint. + */ + using alloc_extent_ertr = SegmentManager::read_ertr; + template <typename T> + using alloc_extent_ret = alloc_extent_ertr::future<TCachedExtentRef<T>>; + template <typename T> + alloc_extent_ret<T> alloc_extent( + Transaction &t, + laddr_t hint, + extent_len_t len) { + auto ext = cache.alloc_new_extent<T>( + t, + len); + return lba_manager.alloc_extent( + t, + hint, + len, + ext->get_paddr() + ).safe_then([ext=std::move(ext)](auto &&ref) mutable { + ext->set_pin(std::move(ref)); + return alloc_extent_ertr::make_ready_future<TCachedExtentRef<T>>( + std::move(ext)); + }); + } + + /** + * submit_transaction + * + * Atomically submits transaction to persistence + */ + using submit_transaction_ertr = crimson::errorator< + crimson::ct_error::eagain, // Caller should retry transaction from beginning + crimson::ct_error::input_output_error // Media error + >; + submit_transaction_ertr::future<> submit_transaction(TransactionRef); + + /// SegmentCleaner::ExtentCallbackInterface + + using SegmentCleaner::ExtentCallbackInterface::get_next_dirty_extents_ret; + get_next_dirty_extents_ret get_next_dirty_extents( + journal_seq_t seq) final; + + using SegmentCleaner::ExtentCallbackInterface::rewrite_extent_ret; + rewrite_extent_ret rewrite_extent( + Transaction &t, + CachedExtentRef extent) final; + + using SegmentCleaner::ExtentCallbackInterface::get_extent_if_live_ret; + get_extent_if_live_ret get_extent_if_live( + Transaction &t, + extent_types_t type, + paddr_t addr, + laddr_t laddr, + segment_off_t len) final; + + using scan_extents_cursor = + SegmentCleaner::ExtentCallbackInterface::scan_extents_cursor; + using scan_extents_ertr = + SegmentCleaner::ExtentCallbackInterface::scan_extents_ertr; + using scan_extents_ret = + SegmentCleaner::ExtentCallbackInterface::scan_extents_ret; + scan_extents_ret scan_extents( + scan_extents_cursor &cursor, + extent_len_t bytes_to_read) final { + return journal.scan_extents(cursor, bytes_to_read); + } + + using release_segment_ret = + SegmentCleaner::ExtentCallbackInterface::release_segment_ret; + 
release_segment_ret release_segment( + segment_id_t id) final { + return segment_manager.release(id); + } + + /** + * read_onode_root + * + * Get onode-tree root logical address + */ + using read_onode_root_ertr = crimson::errorator< + crimson::ct_error::input_output_error + >; + using read_onode_root_ret = read_onode_root_ertr::future<laddr_t>; + read_onode_root_ret read_onode_root(Transaction &t) { + return cache.get_root(t).safe_then([](auto croot) { + return croot->get_root().onode_root; + }); + } + + /** + * write_onode_root + * + * Write onode-tree root logical address, must be called after read. + */ + void write_onode_root(Transaction &t, laddr_t addr) { + auto croot = cache.get_root_fast(t); + croot = cache.duplicate_for_write(t, croot)->cast<RootBlock>(); + croot->get_root().onode_root = addr; + } + + ~TransactionManager(); + +private: + friend class Transaction; + + SegmentManager &segment_manager; + SegmentCleaner &segment_cleaner; + Cache &cache; + LBAManager &lba_manager; + Journal &journal; +}; +using TransactionManagerRef = std::unique_ptr<TransactionManager>; + +} diff --git a/src/crimson/osd/CMakeLists.txt b/src/crimson/osd/CMakeLists.txt new file mode 100644 index 000000000..898f70c42 --- /dev/null +++ b/src/crimson/osd/CMakeLists.txt @@ -0,0 +1,57 @@ +add_executable(crimson-osd + backfill_state.cc + ec_backend.cc + heartbeat.cc + main.cc + osd.cc + osd_meta.cc + pg.cc + pg_backend.cc + pg_meta.cc + replicated_backend.cc + shard_services.cc + object_context.cc + ops_executer.cc + osd_operation.cc + osd_operations/client_request.cc + osd_operations/compound_peering_request.cc + osd_operations/peering_event.cc + osd_operations/pg_advance_map.cc + osd_operations/replicated_request.cc + osd_operations/background_recovery.cc + osd_operations/recovery_subrequest.cc + pg_recovery.cc + recovery_backend.cc + replicated_recovery_backend.cc + scheduler/scheduler.cc + scheduler/mclock_scheduler.cc + osdmap_gate.cc + pg_map.cc + objclass.cc + ${PROJECT_SOURCE_DIR}/src/objclass/class_api.cc + ${PROJECT_SOURCE_DIR}/src/osd/ClassHandler.cc + ${PROJECT_SOURCE_DIR}/src/osd/osd_op_util.cc + ${PROJECT_SOURCE_DIR}/src/osd/OSDCap.cc + ${PROJECT_SOURCE_DIR}/src/osd/PeeringState.cc + ${PROJECT_SOURCE_DIR}/src/osd/PGPeeringEvent.cc + ${PROJECT_SOURCE_DIR}/src/osd/PGStateUtils.cc + ${PROJECT_SOURCE_DIR}/src/osd/MissingLoc.cc + ${PROJECT_SOURCE_DIR}/src/osd/PGLog.cc + ${PROJECT_SOURCE_DIR}/src/osd/recovery_types.cc + ${PROJECT_SOURCE_DIR}/src/osd/osd_perf_counters.cc + watch.cc + ) +target_link_libraries(crimson-osd + crimson-admin + crimson-common + crimson-os + crimson + fmt::fmt + Boost::MPL + dmclock::dmclock) +set_target_properties(crimson-osd PROPERTIES + POSITION_INDEPENDENT_CODE ${EXE_LINKER_USE_PIE}) +install(TARGETS crimson-osd DESTINATION bin) +if(WITH_TESTS) + add_dependencies(tests crimson-osd) +endif() diff --git a/src/crimson/osd/acked_peers.h b/src/crimson/osd/acked_peers.h new file mode 100644 index 000000000..b2f2562c0 --- /dev/null +++ b/src/crimson/osd/acked_peers.h @@ -0,0 +1,14 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <vector> + +namespace crimson::osd { + struct peer_shard_t { + pg_shard_t shard; + eversion_t last_complete_ondisk; + }; + using acked_peers_t = std::vector<peer_shard_t>; +} diff --git a/src/crimson/osd/backfill_facades.h b/src/crimson/osd/backfill_facades.h new file mode 100644 index 000000000..683dc6ea6 --- /dev/null +++ b/src/crimson/osd/backfill_facades.h @@ -0,0 +1,73 
@@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "crimson/osd/backfill_state.h"
+#include "crimson/osd/pg.h"
+#include "osd/PeeringState.h"
+
+namespace crimson::osd {
+
+// PeeringFacade -- main implementation of the BackfillState::PeeringFacade
+// interface. We have the abstraction to decouple BackfillState from
+// PeeringState, and thus cut dependencies in unit testing. The second
+// implementation is BackfillFixture::PeeringFacade and sits in
+// test_backfill.cc.
+struct PeeringFacade final : BackfillState::PeeringFacade {
+  PeeringState& peering_state;
+
+  hobject_t earliest_backfill() const override {
+    return peering_state.earliest_backfill();
+  }
+
+  const std::set<pg_shard_t>& get_backfill_targets() const override {
+    return peering_state.get_backfill_targets();
+  }
+
+  const hobject_t& get_peer_last_backfill(pg_shard_t peer) const override {
+    return peering_state.get_peer_info(peer).last_backfill;
+  }
+
+  const eversion_t& get_last_update() const override {
+    return peering_state.get_info().last_update;
+  }
+
+  const eversion_t& get_log_tail() const override {
+    return peering_state.get_info().log_tail;
+  }
+
+  void scan_log_after(eversion_t v, scan_log_func_t f) const override {
+    peering_state.get_pg_log().get_log().scan_log_after(v, std::move(f));
+  }
+
+  bool is_backfill_target(pg_shard_t peer) const override {
+    return peering_state.is_backfill_target(peer);
+  }
+  void update_complete_backfill_object_stats(const hobject_t &hoid,
+                                             const pg_stat_t &stats) override {
+    peering_state.update_complete_backfill_object_stats(hoid, stats);
+  }
+
+  bool is_backfilling() const override {
+    return peering_state.is_backfilling();
+  }
+
+  PeeringFacade(PeeringState& peering_state)
+    : peering_state(peering_state) {
+  }
+};
+
+// PGFacade -- a facade (in the GoF-defined meaning) simplifying the huge
+// interface of crimson's PG class. The motivation is to have an inventory
+// of behaviour that must be provided by a unit test's mock.
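// [editorial note, not part of the original patch] As stated above, these
// facades exist so that a unit test can substitute its own implementation.
// For illustration only, a hypothetical mock of the BackfillState::PGFacade
// interface (which declares a single pure virtual, see backfill_state.h
// below) could be as small as:
//
//   struct MockPGFacade final : BackfillState::PGFacade {
//     eversion_t projected_last_update;
//     const eversion_t& get_projected_last_update() const override {
//       return projected_last_update;
//     }
//   };
//
// MockPGFacade is an editor's sketch; the real test doubles live in
// test_backfill.cc, as mentioned above.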
+struct PGFacade final : BackfillState::PGFacade { + PG& pg; + + const eversion_t& get_projected_last_update() const override { + return pg.projected_last_update; + } + + PGFacade(PG& pg) : pg(pg) {} +}; + +} // namespace crimson::osd diff --git a/src/crimson/osd/backfill_state.cc b/src/crimson/osd/backfill_state.cc new file mode 100644 index 000000000..57f845f92 --- /dev/null +++ b/src/crimson/osd/backfill_state.cc @@ -0,0 +1,556 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <algorithm> +#include <boost/type_index.hpp> + +#include "crimson/osd/backfill_state.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_osd); + } +} + +namespace crimson::osd { + +BackfillState::BackfillState( + BackfillState::BackfillListener& backfill_listener, + std::unique_ptr<BackfillState::PeeringFacade> peering_state, + std::unique_ptr<BackfillState::PGFacade> pg) + : backfill_machine(*this, + backfill_listener, + std::move(peering_state), + std::move(pg)), + progress_tracker( + std::make_unique<BackfillState::ProgressTracker>(backfill_machine)) +{ + logger().debug("{}:{}", __func__, __LINE__); + backfill_machine.initiate(); +} + +template <class S> +BackfillState::StateHelper<S>::StateHelper() +{ + logger().debug("enter {}", + boost::typeindex::type_id<S>().pretty_name()); +} + +template <class S> +BackfillState::StateHelper<S>::~StateHelper() +{ + logger().debug("exit {}", + boost::typeindex::type_id<S>().pretty_name()); +} + +BackfillState::~BackfillState() = default; + +BackfillState::BackfillMachine::BackfillMachine( + BackfillState& backfill_state, + BackfillState::BackfillListener& backfill_listener, + std::unique_ptr<BackfillState::PeeringFacade> peering_state, + std::unique_ptr<BackfillState::PGFacade> pg) + : backfill_state(backfill_state), + backfill_listener(backfill_listener), + peering_state(std::move(peering_state)), + pg(std::move(pg)) +{} + +BackfillState::BackfillMachine::~BackfillMachine() = default; + +BackfillState::Initial::Initial(my_context ctx) + : my_base(ctx) +{ + backfill_state().last_backfill_started = peering_state().earliest_backfill(); + logger().debug("{}: bft={} from {}", + __func__, peering_state().get_backfill_targets(), + backfill_state().last_backfill_started); + for (const auto& bt : peering_state().get_backfill_targets()) { + logger().debug("{}: target shard {} from {}", + __func__, bt, peering_state().get_peer_last_backfill(bt)); + } + ceph_assert(peering_state().get_backfill_targets().size()); + ceph_assert(!backfill_state().last_backfill_started.is_max()); +} + +boost::statechart::result +BackfillState::Initial::react(const BackfillState::Triggered& evt) +{ + logger().debug("{}: backfill triggered", __func__); + ceph_assert(backfill_state().last_backfill_started == \ + peering_state().earliest_backfill()); + ceph_assert(peering_state().is_backfilling()); + // initialize BackfillIntervals + for (const auto& bt : peering_state().get_backfill_targets()) { + backfill_state().peer_backfill_info[bt].reset( + peering_state().get_peer_last_backfill(bt)); + } + backfill_state().backfill_info.reset(backfill_state().last_backfill_started); + if (Enqueuing::all_enqueued(peering_state(), + backfill_state().backfill_info, + backfill_state().peer_backfill_info)) { + logger().debug("{}: switching to Done state", __func__); + return transit<BackfillState::Done>(); + } else { + logger().debug("{}: switching to Enqueuing state", __func__); + return 
transit<BackfillState::Enqueuing>(); + } +} + + +// -- Enqueuing +void BackfillState::Enqueuing::maybe_update_range() +{ + if (auto& primary_bi = backfill_state().backfill_info; + primary_bi.version >= pg().get_projected_last_update()) { + logger().info("{}: bi is current", __func__); + ceph_assert(primary_bi.version == pg().get_projected_last_update()); + } else if (primary_bi.version >= peering_state().get_log_tail()) { +#if 0 + if (peering_state().get_pg_log().get_log().empty() && + pg().get_projected_log().empty()) { + /* Because we don't move log_tail on split, the log might be + * empty even if log_tail != last_update. However, the only + * way to get here with an empty log is if log_tail is actually + * eversion_t(), because otherwise the entry which changed + * last_update since the last scan would have to be present. + */ + ceph_assert(primary_bi.version == eversion_t()); + return; + } +#endif + logger().debug("{}: bi is old, ({}) can be updated with log to {}", + __func__, + primary_bi.version, + pg().get_projected_last_update()); + logger().debug("{}: scanning pg log first", __func__); + peering_state().scan_log_after(primary_bi.version, + [&](const pg_log_entry_t& e) { + logger().debug("maybe_update_range(lambda): updating from version {}", + e.version); + if (e.soid >= primary_bi.begin && e.soid < primary_bi.end) { + if (e.is_update()) { + logger().debug("maybe_update_range(lambda): {} updated to ver {}", + e.soid, e.version); + primary_bi.objects.erase(e.soid); + primary_bi.objects.insert(std::make_pair(e.soid, + e.version)); + } else if (e.is_delete()) { + logger().debug("maybe_update_range(lambda): {} removed", + e.soid); + primary_bi.objects.erase(e.soid); + } + } + }); + primary_bi.version = pg().get_projected_last_update(); + } else { + ceph_abort_msg( + "scan_range should have raised primary_bi.version past log_tail"); + } +} + +void BackfillState::Enqueuing::trim_backfill_infos() +{ + for (const auto& bt : peering_state().get_backfill_targets()) { + backfill_state().peer_backfill_info[bt].trim_to( + std::max(peering_state().get_peer_last_backfill(bt), + backfill_state().last_backfill_started)); + } + backfill_state().backfill_info.trim_to( + backfill_state().last_backfill_started); +} + +/* static */ bool BackfillState::Enqueuing::all_enqueued( + const PeeringFacade& peering_state, + const BackfillInterval& backfill_info, + const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info) +{ + const bool all_local_enqueued = \ + backfill_info.extends_to_end() && backfill_info.empty(); + const bool all_peer_enqueued = std::all_of( + std::begin(peer_backfill_info), + std::end(peer_backfill_info), + [] (const auto& kv) { + [[maybe_unused]] const auto& [ shard, peer_backfill_info ] = kv; + return peer_backfill_info.extends_to_end() && peer_backfill_info.empty(); + }); + return all_local_enqueued && all_peer_enqueued; +} + +hobject_t BackfillState::Enqueuing::earliest_peer_backfill( + const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info) const +{ + hobject_t e = hobject_t::get_max(); + for (const pg_shard_t& bt : peering_state().get_backfill_targets()) { + const auto iter = peer_backfill_info.find(bt); + ceph_assert(iter != peer_backfill_info.end()); + e = std::min(e, iter->second.begin); + } + return e; +} + +bool BackfillState::Enqueuing::should_rescan_replicas( + const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info, + const BackfillInterval& backfill_info) const +{ + const auto& targets = peering_state().get_backfill_targets(); + return 
std::any_of(std::begin(targets), std::end(targets), + [&] (const auto& bt) { + return ReplicasScanning::replica_needs_scan(peer_backfill_info.at(bt), + backfill_info); + }); +} + +bool BackfillState::Enqueuing::should_rescan_primary( + const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info, + const BackfillInterval& backfill_info) const +{ + return backfill_info.begin <= earliest_peer_backfill(peer_backfill_info) && + !backfill_info.extends_to_end(); +} + +void BackfillState::Enqueuing::trim_backfilled_object_from_intervals( + BackfillState::Enqueuing::result_t&& result, + hobject_t& last_backfill_started, + std::map<pg_shard_t, BackfillInterval>& peer_backfill_info) +{ + std::for_each(std::begin(result.pbi_targets), std::end(result.pbi_targets), + [&peer_backfill_info] (const auto& bt) { + peer_backfill_info.at(bt).pop_front(); + }); + last_backfill_started = std::move(result.new_last_backfill_started); +} + +BackfillState::Enqueuing::result_t +BackfillState::Enqueuing::remove_on_peers(const hobject_t& check) +{ + // set `new_last_backfill_started` to `check` + result_t result { {}, check }; + for (const auto& bt : peering_state().get_backfill_targets()) { + const auto& pbi = backfill_state().peer_backfill_info.at(bt); + if (pbi.begin == check) { + result.pbi_targets.insert(bt); + const auto& version = pbi.objects.begin()->second; + backfill_state().progress_tracker->enqueue_drop(pbi.begin); + backfill_listener().enqueue_drop(bt, pbi.begin, version); + } + } + logger().debug("{}: BACKFILL removing {} from peers {}", + __func__, check, result.pbi_targets); + ceph_assert(!result.pbi_targets.empty()); + return result; +} + +BackfillState::Enqueuing::result_t +BackfillState::Enqueuing::update_on_peers(const hobject_t& check) +{ + logger().debug("{}: check={}", __func__, check); + const auto& primary_bi = backfill_state().backfill_info; + result_t result { {}, primary_bi.begin }; + + for (const auto& bt : peering_state().get_backfill_targets()) { + const auto& peer_bi = backfill_state().peer_backfill_info.at(bt); + + // Find all check peers that have the wrong version + if (const eversion_t& obj_v = primary_bi.objects.begin()->second; + check == primary_bi.begin && check == peer_bi.begin) { + if(peer_bi.objects.begin()->second != obj_v && + backfill_state().progress_tracker->enqueue_push(primary_bi.begin)) { + backfill_listener().enqueue_push(primary_bi.begin, obj_v); + } else { + // it's fine, keep it! OR already recovering + } + result.pbi_targets.insert(bt); + } else { + // Only include peers that we've caught up to their backfill line + // otherwise, they only appear to be missing this object + // because their peer_bi.begin > backfill_info.begin. 
+ if (primary_bi.begin > peering_state().get_peer_last_backfill(bt) && + backfill_state().progress_tracker->enqueue_push(primary_bi.begin)) { + backfill_listener().enqueue_push(primary_bi.begin, obj_v); + } + } + } + return result; +} + +bool BackfillState::Enqueuing::Enqueuing::all_emptied( + const BackfillInterval& local_backfill_info, + const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info) const +{ + const auto& targets = peering_state().get_backfill_targets(); + const auto replicas_emptied = + std::all_of(std::begin(targets), std::end(targets), + [&] (const auto& bt) { + return peer_backfill_info.at(bt).empty(); + }); + return local_backfill_info.empty() && replicas_emptied; +} + +BackfillState::Enqueuing::Enqueuing(my_context ctx) + : my_base(ctx) +{ + auto& primary_bi = backfill_state().backfill_info; + + // update our local interval to cope with recent changes + primary_bi.begin = backfill_state().last_backfill_started; + if (primary_bi.version < peering_state().get_log_tail()) { + // it might be that the OSD is so flooded with modifying operations + // that backfill will be spinning here over and over. For the sake + // of performance and complexity we don't synchronize with entire PG. + // similar can happen in classical OSD. + logger().warn("{}: bi is old, rescanning of local backfill_info", + __func__); + post_event(RequestPrimaryScanning{}); + return; + } else { + maybe_update_range(); + } + trim_backfill_infos(); + + while (!all_emptied(primary_bi, backfill_state().peer_backfill_info)) { + if (!backfill_listener().budget_available()) { + post_event(RequestWaiting{}); + return; + } else if (should_rescan_replicas(backfill_state().peer_backfill_info, + primary_bi)) { + // Count simultaneous scans as a single op and let those complete + post_event(RequestReplicasScanning{}); + return; + } + // Get object within set of peers to operate on and the set of targets + // for which that object applies. + if (const hobject_t check = \ + earliest_peer_backfill(backfill_state().peer_backfill_info); + check < primary_bi.begin) { + // Don't increment ops here because deletions + // are cheap and not replied to unlike real recovery_ops, + // and we can't increment ops without requeueing ourself + // for recovery. + auto result = remove_on_peers(check); + trim_backfilled_object_from_intervals(std::move(result), + backfill_state().last_backfill_started, + backfill_state().peer_backfill_info); + } else { + auto result = update_on_peers(check); + trim_backfilled_object_from_intervals(std::move(result), + backfill_state().last_backfill_started, + backfill_state().peer_backfill_info); + primary_bi.pop_front(); + } + backfill_listener().maybe_flush(); + } + + if (should_rescan_primary(backfill_state().peer_backfill_info, + primary_bi)) { + // need to grab one another chunk of the object namespace and restart + // the queueing. 
+ logger().debug("{}: reached end for current local chunk", + __func__); + post_event(RequestPrimaryScanning{}); + } else if (backfill_state().progress_tracker->tracked_objects_completed()) { + post_event(RequestDone{}); + } else { + logger().debug("{}: reached end for both local and all peers " + "but still has in-flight operations", __func__); + post_event(RequestWaiting{}); + } +} + +// -- PrimaryScanning +BackfillState::PrimaryScanning::PrimaryScanning(my_context ctx) + : my_base(ctx) +{ + backfill_state().backfill_info.version = peering_state().get_last_update(); + backfill_listener().request_primary_scan( + backfill_state().backfill_info.begin); +} + +boost::statechart::result +BackfillState::PrimaryScanning::react(PrimaryScanned evt) +{ + logger().debug("{}", __func__); + backfill_state().backfill_info = std::move(evt.result); + return transit<Enqueuing>(); +} + +boost::statechart::result +BackfillState::PrimaryScanning::react(ObjectPushed evt) +{ + logger().debug("PrimaryScanning::react() on ObjectPushed; evt.object={}", + evt.object); + backfill_state().progress_tracker->complete_to(evt.object, evt.stat); + return discard_event(); +} + +// -- ReplicasScanning +bool BackfillState::ReplicasScanning::replica_needs_scan( + const BackfillInterval& replica_backfill_info, + const BackfillInterval& local_backfill_info) +{ + return replica_backfill_info.empty() && \ + replica_backfill_info.begin <= local_backfill_info.begin && \ + !replica_backfill_info.extends_to_end(); +} + +BackfillState::ReplicasScanning::ReplicasScanning(my_context ctx) + : my_base(ctx) +{ + for (const auto& bt : peering_state().get_backfill_targets()) { + if (const auto& pbi = backfill_state().peer_backfill_info.at(bt); + replica_needs_scan(pbi, backfill_state().backfill_info)) { + logger().debug("{}: scanning peer osd.{} from {}", + __func__, bt, pbi.end); + backfill_listener().request_replica_scan(bt, pbi.end, hobject_t{}); + + ceph_assert(waiting_on_backfill.find(bt) == \ + waiting_on_backfill.end()); + waiting_on_backfill.insert(bt); + } + } + ceph_assert(!waiting_on_backfill.empty()); + // TODO: start_recovery_op(hobject_t::get_max()); // XXX: was pbi.end +} + +#if 0 +BackfillState::ReplicasScanning::~ReplicasScanning() +{ + // TODO: finish_recovery_op(hobject_t::get_max()); +} +#endif + +boost::statechart::result +BackfillState::ReplicasScanning::react(ReplicaScanned evt) +{ + logger().debug("{}: got scan result from osd={}, result={}", + __func__, evt.from, evt.result); + // TODO: maybe we'll be able to move waiting_on_backfill from + // the machine to the state. 
+  ceph_assert(peering_state().is_backfill_target(evt.from));
+  if (waiting_on_backfill.erase(evt.from)) {
+    backfill_state().peer_backfill_info[evt.from] = std::move(evt.result);
+    if (waiting_on_backfill.empty()) {
+      ceph_assert(backfill_state().peer_backfill_info.size() == \
+                  peering_state().get_backfill_targets().size());
+      return transit<Enqueuing>();
+    }
+  } else {
+    // we canceled backfill for a while due to a too-full condition, and this
+    // is an extra response from a non-too-full peer
+    logger().debug("{}: canceled backfill (too full?)", __func__);
+  }
+  return discard_event();
+}
+
+boost::statechart::result
+BackfillState::ReplicasScanning::react(ObjectPushed evt)
+{
+  logger().debug("ReplicasScanning::react() on ObjectPushed; evt.object={}",
+                 evt.object);
+  backfill_state().progress_tracker->complete_to(evt.object, evt.stat);
+  return discard_event();
+}
+
+
+// -- Waiting
+BackfillState::Waiting::Waiting(my_context ctx)
+  : my_base(ctx)
+{
+}
+
+boost::statechart::result
+BackfillState::Waiting::react(ObjectPushed evt)
+{
+  logger().debug("Waiting::react() on ObjectPushed; evt.object={}",
+                 evt.object);
+  backfill_state().progress_tracker->complete_to(evt.object, evt.stat);
+  if (!Enqueuing::all_enqueued(peering_state(),
+                               backfill_state().backfill_info,
+                               backfill_state().peer_backfill_info)) {
+    return transit<Enqueuing>();
+  } else if (backfill_state().progress_tracker->tracked_objects_completed()) {
+    return transit<Done>();
+  } else {
+    // we still have something to wait on
+    logger().debug("Waiting::react() on ObjectPushed; still waiting");
+    return discard_event();
+  }
+}
+
+// -- Done
+BackfillState::Done::Done(my_context ctx)
+  : my_base(ctx)
+{
+  logger().info("{}: backfill is done", __func__);
+  backfill_listener().backfilled();
+}
+
+// -- Crashed
+BackfillState::Crashed::Crashed()
+{
+  ceph_abort_msg("BackfillState::Crashed: this should not happen");
+}
+
+// ProgressTracker is an intermediary between the BackfillListener and
+// BackfillMachine + its states. All requests to push or drop an object
+// are directed through it. The same happens with notifications about
+// completing given operations, which are generated by BackfillListener
+// and dispatched as e.g. ObjectPushed events.
+// This allows ProgressTracker to track the list of in-flight operations,
+// which is essential to decide whether the entire machine should switch
+// from Waiting to Done or stay in Waiting.
+// ProgressTracker also coordinates .last_backfill_started and stats
+// updates.
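// [editorial note, not part of the original patch] For orientation, the
// member functions below give a registry entry the following life-cycle:
//   enqueue_push(obj)  -- registers {enqueued_push, no stats} unless the
//                         object is already tracked;
//   enqueue_drop(obj)  -- registers {enqueued_drop, empty stats};
//   complete_to(obj)   -- marks the entry {completed_push, stats}, then pops
//                         every leading entry (in hobject_t order) that is
//                         no longer an enqueued_push, feeding its stats back
//                         to PeeringState, and finally advances the peers'
//                         last_backfill (to obj, or to MAX once everything
//                         is both enqueued and completed).
// tracked_objects_completed() simply reports whether the registry is empty.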
+bool BackfillState::ProgressTracker::tracked_objects_completed() const +{ + return registry.empty(); +} + +bool BackfillState::ProgressTracker::enqueue_push(const hobject_t& obj) +{ + [[maybe_unused]] const auto [it, first_seen] = registry.try_emplace( + obj, registry_item_t{op_stage_t::enqueued_push, std::nullopt}); + return first_seen; +} + +void BackfillState::ProgressTracker::enqueue_drop(const hobject_t& obj) +{ + registry.try_emplace( + obj, registry_item_t{op_stage_t::enqueued_drop, pg_stat_t{}}); +} + +void BackfillState::ProgressTracker::complete_to( + const hobject_t& obj, + const pg_stat_t& stats) +{ + logger().debug("{}: obj={}", + __func__, obj); + if (auto completion_iter = registry.find(obj); + completion_iter != std::end(registry)) { + completion_iter->second = \ + registry_item_t{ op_stage_t::completed_push, stats }; + } else { + ceph_abort_msg("completing untracked object shall not happen"); + } + for (auto it = std::begin(registry); + it != std::end(registry) && + it->second.stage != op_stage_t::enqueued_push; + it = registry.erase(it)) { + auto& [soid, item] = *it; + assert(item.stats); + peering_state().update_complete_backfill_object_stats( + soid, + *item.stats); + } + if (Enqueuing::all_enqueued(peering_state(), + backfill_state().backfill_info, + backfill_state().peer_backfill_info) && + tracked_objects_completed()) { + backfill_state().last_backfill_started = hobject_t::get_max(); + backfill_listener().update_peers_last_backfill(hobject_t::get_max()); + } else { + backfill_listener().update_peers_last_backfill(obj); + } +} + +} // namespace crimson::osd diff --git a/src/crimson/osd/backfill_state.h b/src/crimson/osd/backfill_state.h new file mode 100644 index 000000000..4bd2991fb --- /dev/null +++ b/src/crimson/osd/backfill_state.h @@ -0,0 +1,382 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <optional> + +#include <boost/statechart/custom_reaction.hpp> +#include <boost/statechart/event.hpp> +#include <boost/statechart/event_base.hpp> +#include <boost/statechart/simple_state.hpp> +#include <boost/statechart/state.hpp> +#include <boost/statechart/state_machine.hpp> +#include <boost/statechart/transition.hpp> + +#include "osd/recovery_types.h" + +namespace crimson::osd { + +namespace sc = boost::statechart; + +struct BackfillState { + struct BackfillListener; + struct PeeringFacade; + struct PGFacade; + + // events comes first + struct PrimaryScanned : sc::event<PrimaryScanned> { + BackfillInterval result; + PrimaryScanned(BackfillInterval&& result) + : result(std::move(result)) { + } + }; + + struct ReplicaScanned : sc::event<ReplicaScanned> { + pg_shard_t from; + BackfillInterval result; + ReplicaScanned(pg_shard_t from, BackfillInterval&& result) + : from(std::move(from)), + result(std::move(result)) { + } + }; + + struct ObjectPushed : sc::event<ObjectPushed> { + // TODO: implement replica management; I don't want to follow + // current convention where the backend layer is responsible + // for tracking replicas. 
+ hobject_t object; + pg_stat_t stat; + ObjectPushed(hobject_t object) + : object(std::move(object)) { + } + }; + + struct Triggered : sc::event<Triggered> { + }; + +private: + // internal events + struct RequestPrimaryScanning : sc::event<RequestPrimaryScanning> { + }; + + struct RequestReplicasScanning : sc::event<RequestReplicasScanning> { + }; + + struct RequestWaiting : sc::event<RequestWaiting> { + }; + + struct RequestDone : sc::event<RequestDone> { + }; + + class ProgressTracker; + +public: + + struct Initial; + struct Enqueuing; + struct PrimaryScanning; + struct ReplicasScanning; + struct Waiting; + struct Done; + + struct BackfillMachine : sc::state_machine<BackfillMachine, Initial> { + BackfillMachine(BackfillState& backfill_state, + BackfillListener& backfill_listener, + std::unique_ptr<PeeringFacade> peering_state, + std::unique_ptr<PGFacade> pg); + ~BackfillMachine(); + BackfillState& backfill_state; + BackfillListener& backfill_listener; + std::unique_ptr<PeeringFacade> peering_state; + std::unique_ptr<PGFacade> pg; + }; + +private: + template <class S> + struct StateHelper { + StateHelper(); + ~StateHelper(); + + BackfillState& backfill_state() { + return static_cast<S*>(this) \ + ->template context<BackfillMachine>().backfill_state; + } + BackfillListener& backfill_listener() { + return static_cast<S*>(this) \ + ->template context<BackfillMachine>().backfill_listener; + } + PeeringFacade& peering_state() { + return *static_cast<S*>(this) \ + ->template context<BackfillMachine>().peering_state; + } + PGFacade& pg() { + return *static_cast<S*>(this)->template context<BackfillMachine>().pg; + } + + const PeeringFacade& peering_state() const { + return *static_cast<const S*>(this) \ + ->template context<BackfillMachine>().peering_state; + } + const BackfillState& backfill_state() const { + return static_cast<const S*>(this) \ + ->template context<BackfillMachine>().backfill_state; + } + }; + +public: + + // states + struct Crashed : sc::simple_state<Crashed, BackfillMachine>, + StateHelper<Crashed> { + explicit Crashed(); + }; + + struct Initial : sc::state<Initial, BackfillMachine>, + StateHelper<Initial> { + using reactions = boost::mpl::list< + sc::custom_reaction<Triggered>, + sc::transition<sc::event_base, Crashed>>; + explicit Initial(my_context); + // initialize after triggering backfill by on_activate_complete(). + // transit to Enqueuing. + sc::result react(const Triggered&); + }; + + struct Enqueuing : sc::state<Enqueuing, BackfillMachine>, + StateHelper<Enqueuing> { + using reactions = boost::mpl::list< + sc::transition<RequestPrimaryScanning, PrimaryScanning>, + sc::transition<RequestReplicasScanning, ReplicasScanning>, + sc::transition<RequestWaiting, Waiting>, + sc::transition<RequestDone, Done>, + sc::transition<sc::event_base, Crashed>>; + explicit Enqueuing(my_context); + + // indicate whether there is any remaining work to do when it comes + // to comparing the hobject_t namespace between primary and replicas. + // true doesn't necessarily mean backfill is done -- there could be + // in-flight pushes or drops which had been enqueued but aren't + // completed yet. + static bool all_enqueued( + const PeeringFacade& peering_state, + const BackfillInterval& backfill_info, + const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info); + + private: + void maybe_update_range(); + void trim_backfill_infos(); + + // these methods take BackfillIntervals instead of extracting them from + // the state to emphasize the relationships across the main loop. 
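// [editorial note, not part of the original patch] The helpers declared
// below are consumed by Enqueuing's constructor (see backfill_state.cc
// above), whose main loop is roughly: while neither the local interval nor
// the peers' intervals are emptied, either wait for budget, request a
// replica rescan, drop an object that only peers still hold (its earliest
// peer entry sorts before the primary's begin), or push/keep the primary's
// head object; once emptied, it requests a primary rescan, reports Done, or
// keeps Waiting for in-flight pushes.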
+    bool all_emptied(
+      const BackfillInterval& local_backfill_info,
+      const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info) const;
+    hobject_t earliest_peer_backfill(
+      const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info) const;
+    bool should_rescan_replicas(
+      const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info,
+      const BackfillInterval& backfill_info) const;
+    // indicate whether a particular acting primary needs to be scanned again
+    // to process the next piece of the hobject_t namespace.
+    // the logic is per analogy to replica_needs_scan(). See comments there.
+    bool should_rescan_primary(
+      const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info,
+      const BackfillInterval& backfill_info) const;
+
+    // the result_t is intermediary between {remove,update}_on_peers() and
+    // updating BackfillIntervals in trim_backfilled_object_from_intervals.
+    // This step is important because it affects the main loop's condition,
+    // and thus deserves to be exposed instead of being called deeply from
+    // {remove,update}_on_peers().
+    struct [[nodiscard]] result_t {
+      std::set<pg_shard_t> pbi_targets;
+      hobject_t new_last_backfill_started;
+    };
+    void trim_backfilled_object_from_intervals(
+      result_t&&,
+      hobject_t& last_backfill_started,
+      std::map<pg_shard_t, BackfillInterval>& peer_backfill_info);
+    result_t remove_on_peers(const hobject_t& check);
+    result_t update_on_peers(const hobject_t& check);
+  };
+
+  struct PrimaryScanning : sc::state<PrimaryScanning, BackfillMachine>,
+                           StateHelper<PrimaryScanning> {
+    using reactions = boost::mpl::list<
+      sc::custom_reaction<ObjectPushed>,
+      sc::custom_reaction<PrimaryScanned>,
+      sc::transition<sc::event_base, Crashed>>;
+    explicit PrimaryScanning(my_context);
+    sc::result react(ObjectPushed);
+    // collect scanning result and transit to Enqueuing.
+    sc::result react(PrimaryScanned);
+  };
+
+  struct ReplicasScanning : sc::state<ReplicasScanning, BackfillMachine>,
+                            StateHelper<ReplicasScanning> {
+    using reactions = boost::mpl::list<
+      sc::custom_reaction<ObjectPushed>,
+      sc::custom_reaction<ReplicaScanned>,
+      sc::transition<sc::event_base, Crashed>>;
+    explicit ReplicasScanning(my_context);
+    // collect scanning result; if all results are collected, transition
+    // to Enqueuing will happen.
+    sc::result react(ObjectPushed);
+    sc::result react(ReplicaScanned);
+
+    // indicate whether a particular peer should be scanned to retrieve
+    // the BackfillInterval for a new range of the hobject_t namespace.
+    // true when bi.objects is exhausted, the replica bi's end is not MAX,
+    // and the primary bi's begin is further than the replica's one.
+ static bool replica_needs_scan( + const BackfillInterval& replica_backfill_info, + const BackfillInterval& local_backfill_info); + + private: + std::set<pg_shard_t> waiting_on_backfill; + }; + + struct Waiting : sc::state<Waiting, BackfillMachine>, + StateHelper<Waiting> { + using reactions = boost::mpl::list< + sc::custom_reaction<ObjectPushed>, + sc::transition<sc::event_base, Crashed>>; + explicit Waiting(my_context); + sc::result react(ObjectPushed); + }; + + struct Done : sc::state<Done, BackfillMachine>, + StateHelper<Done> { + using reactions = boost::mpl::list< + sc::transition<sc::event_base, Crashed>>; + explicit Done(my_context); + }; + + BackfillState(BackfillListener& backfill_listener, + std::unique_ptr<PeeringFacade> peering_state, + std::unique_ptr<PGFacade> pg); + ~BackfillState(); + + void process_event( + boost::intrusive_ptr<const sc::event_base> evt) { + backfill_machine.process_event(*std::move(evt)); + } + + hobject_t get_last_backfill_started() const { + return last_backfill_started; + } +private: + hobject_t last_backfill_started; + BackfillInterval backfill_info; + std::map<pg_shard_t, BackfillInterval> peer_backfill_info; + BackfillMachine backfill_machine; + std::unique_ptr<ProgressTracker> progress_tracker; +}; + +// BackfillListener -- an interface used by the backfill FSM to request +// low-level services like issueing `MOSDPGPush` or `MOSDPGBackfillRemove`. +// The goals behind the interface are: 1) unittestability; 2) possibility +// to retrofit classical OSD with BackfillState. For the second reason we +// never use `seastar::future` -- instead responses to the requests are +// conveyed as events; see ObjectPushed as an example. +struct BackfillState::BackfillListener { + virtual void request_replica_scan( + const pg_shard_t& target, + const hobject_t& begin, + const hobject_t& end) = 0; + + virtual void request_primary_scan( + const hobject_t& begin) = 0; + + virtual void enqueue_push( + const hobject_t& obj, + const eversion_t& v) = 0; + + virtual void enqueue_drop( + const pg_shard_t& target, + const hobject_t& obj, + const eversion_t& v) = 0; + + virtual void maybe_flush() = 0; + + virtual void update_peers_last_backfill( + const hobject_t& new_last_backfill) = 0; + + virtual bool budget_available() const = 0; + + virtual void backfilled() = 0; + + virtual ~BackfillListener() = default; +}; + +// PeeringFacade -- a facade (in the GoF-defined meaning) simplifying +// the interface of PeeringState. The motivation is to have an inventory +// of behaviour that must be provided by a unit test's mock. +struct BackfillState::PeeringFacade { + virtual hobject_t earliest_backfill() const = 0; + virtual const std::set<pg_shard_t>& get_backfill_targets() const = 0; + virtual const hobject_t& get_peer_last_backfill(pg_shard_t peer) const = 0; + virtual const eversion_t& get_last_update() const = 0; + virtual const eversion_t& get_log_tail() const = 0; + + // the performance impact of `std::function` has not been considered yet. + // If there is any proof (from e.g. profiling) about its significance, we + // can switch back to the template variant. 
+ using scan_log_func_t = std::function<void(const pg_log_entry_t&)>; + virtual void scan_log_after(eversion_t, scan_log_func_t) const = 0; + + virtual bool is_backfill_target(pg_shard_t peer) const = 0; + virtual void update_complete_backfill_object_stats(const hobject_t &hoid, + const pg_stat_t &stats) = 0; + virtual bool is_backfilling() const = 0; + virtual ~PeeringFacade() {} +}; + +// PGFacade -- a facade (in the GoF-defined meaning) simplifying the huge +// interface of crimson's PG class. The motivation is to have an inventory +// of behaviour that must be provided by a unit test's mock. +struct BackfillState::PGFacade { + virtual const eversion_t& get_projected_last_update() const = 0; + virtual ~PGFacade() {} +}; + +class BackfillState::ProgressTracker { + // TODO: apply_stat, + enum class op_stage_t { + enqueued_push, + enqueued_drop, + completed_push, + }; + + struct registry_item_t { + op_stage_t stage; + std::optional<pg_stat_t> stats; + }; + + BackfillMachine& backfill_machine; + std::map<hobject_t, registry_item_t> registry; + + BackfillState& backfill_state() { + return backfill_machine.backfill_state; + } + PeeringFacade& peering_state() { + return *backfill_machine.peering_state; + } + BackfillListener& backfill_listener() { + return backfill_machine.backfill_listener; + } + +public: + ProgressTracker(BackfillMachine& backfill_machine) + : backfill_machine(backfill_machine) { + } + + bool tracked_objects_completed() const; + + bool enqueue_push(const hobject_t&); + void enqueue_drop(const hobject_t&); + void complete_to(const hobject_t&, const pg_stat_t&); +}; + +} // namespace crimson::osd diff --git a/src/crimson/osd/ec_backend.cc b/src/crimson/osd/ec_backend.cc new file mode 100644 index 000000000..c6516d50a --- /dev/null +++ b/src/crimson/osd/ec_backend.cc @@ -0,0 +1,35 @@ +#include "ec_backend.h" + +#include "crimson/osd/shard_services.h" + +ECBackend::ECBackend(shard_id_t shard, + ECBackend::CollectionRef coll, + crimson::osd::ShardServices& shard_services, + const ec_profile_t&, + uint64_t) + : PGBackend{shard, coll, &shard_services.get_store()} +{ + // todo +} + +ECBackend::ll_read_errorator::future<ceph::bufferlist> +ECBackend::_read(const hobject_t& hoid, + const uint64_t off, + const uint64_t len, + const uint32_t flags) +{ + // todo + return seastar::make_ready_future<bufferlist>(); +} + +seastar::future<crimson::osd::acked_peers_t> +ECBackend::_submit_transaction(std::set<pg_shard_t>&& pg_shards, + const hobject_t& hoid, + ceph::os::Transaction&& txn, + const osd_op_params_t& osd_op_p, + epoch_t min_epoch, epoch_t max_epoch, + std::vector<pg_log_entry_t>&& log_entries) +{ + // todo + return seastar::make_ready_future<crimson::osd::acked_peers_t>(); +} diff --git a/src/crimson/osd/ec_backend.h b/src/crimson/osd/ec_backend.h new file mode 100644 index 000000000..e15b19970 --- /dev/null +++ b/src/crimson/osd/ec_backend.h @@ -0,0 +1,38 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <boost/intrusive_ptr.hpp> +#include <seastar/core/future.hh> +#include "include/buffer_fwd.h" +#include "osd/osd_types.h" +#include "pg_backend.h" + +class ECBackend : public PGBackend +{ +public: + ECBackend(shard_id_t shard, + CollectionRef coll, + crimson::osd::ShardServices& shard_services, + const ec_profile_t& ec_profile, + uint64_t stripe_width); + seastar::future<> stop() final { + return seastar::now(); + } + void on_actingset_changed(peering_info_t pi) final {} +private: + 
ll_read_errorator::future<ceph::bufferlist> _read(const hobject_t& hoid, + uint64_t off, + uint64_t len, + uint32_t flags) override; + seastar::future<crimson::osd::acked_peers_t> + _submit_transaction(std::set<pg_shard_t>&& pg_shards, + const hobject_t& hoid, + ceph::os::Transaction&& txn, + const osd_op_params_t& req, + epoch_t min_epoch, epoch_t max_epoch, + std::vector<pg_log_entry_t>&& log_entries) final; + CollectionRef coll; + crimson::os::FuturizedStore* store; +}; diff --git a/src/crimson/osd/exceptions.h b/src/crimson/osd/exceptions.h new file mode 100644 index 000000000..2783ed252 --- /dev/null +++ b/src/crimson/osd/exceptions.h @@ -0,0 +1,46 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <exception> +#include <system_error> + +#include "crimson/common/errorator.h" + +namespace crimson::osd { +class error : private std::system_error { +public: + error(const std::errc ec) + : system_error(std::make_error_code(ec)) { + } + + using system_error::code; + using system_error::what; + + friend error make_error(int ret); + +private: + error(const int ret) noexcept + : system_error(ret, std::system_category()) { + } +}; + +inline error make_error(const int ret) { + return error{ret}; +} + +struct object_not_found : public error { + object_not_found() : error(std::errc::no_such_file_or_directory) {} +}; + +struct invalid_argument : public error { + invalid_argument() : error(std::errc::invalid_argument) {} +}; + +// FIXME: error handling +struct permission_denied : public error { + permission_denied() : error(std::errc::operation_not_permitted) {} +}; + +} // namespace crimson::osd diff --git a/src/crimson/osd/heartbeat.cc b/src/crimson/osd/heartbeat.cc new file mode 100644 index 000000000..81ec06ecd --- /dev/null +++ b/src/crimson/osd/heartbeat.cc @@ -0,0 +1,680 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "heartbeat.h" + +#include <boost/range/join.hpp> + +#include "messages/MOSDPing.h" +#include "messages/MOSDFailure.h" + +#include "crimson/common/config_proxy.h" +#include "crimson/common/formatter.h" +#include "crimson/net/Connection.h" +#include "crimson/net/Messenger.h" +#include "crimson/osd/shard_services.h" +#include "crimson/mon/MonClient.h" + +#include "osd/OSDMap.h" + +using crimson::common::local_conf; + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_osd); + } +} + +Heartbeat::Heartbeat(osd_id_t whoami, + const crimson::osd::ShardServices& service, + crimson::mon::Client& monc, + crimson::net::MessengerRef front_msgr, + crimson::net::MessengerRef back_msgr) + : whoami{whoami}, + service{service}, + monc{monc}, + front_msgr{front_msgr}, + back_msgr{back_msgr}, + // do this in background + timer{[this] { + heartbeat_check(); + (void)send_heartbeats(); + }}, + failing_peers{*this} +{} + +seastar::future<> Heartbeat::start(entity_addrvec_t front_addrs, + entity_addrvec_t back_addrs) +{ + logger().info("heartbeat: start"); + // i only care about the address, so any unused port would work + for (auto& addr : boost::join(front_addrs.v, back_addrs.v)) { + addr.set_port(0); + } + + using crimson::net::SocketPolicy; + front_msgr->set_policy(entity_name_t::TYPE_OSD, + SocketPolicy::lossy_client(0)); + back_msgr->set_policy(entity_name_t::TYPE_OSD, + SocketPolicy::lossy_client(0)); + return seastar::when_all_succeed(start_messenger(*front_msgr, + front_addrs), + start_messenger(*back_msgr, + 
back_addrs)) + .then_unpack([this] { + timer.arm_periodic( + std::chrono::seconds(local_conf()->osd_heartbeat_interval)); + }); +} + +seastar::future<> +Heartbeat::start_messenger(crimson::net::Messenger& msgr, + const entity_addrvec_t& addrs) +{ + return msgr.try_bind(addrs, + local_conf()->ms_bind_port_min, + local_conf()->ms_bind_port_max) + .safe_then([this, &msgr]() mutable { + return msgr.start({this}); + }, crimson::net::Messenger::bind_ertr::all_same_way( + [] (const std::error_code& e) { + logger().error("heartbeat messenger try_bind(): address range is unavailable."); + ceph_abort(); + })); +} + +seastar::future<> Heartbeat::stop() +{ + logger().info("{}", __func__); + timer.cancel(); + front_msgr->stop(); + back_msgr->stop(); + return gate.close().then([this] { + return seastar::when_all_succeed(front_msgr->shutdown(), + back_msgr->shutdown()); + }).then_unpack([] { + return seastar::now(); + }); +} + +const entity_addrvec_t& Heartbeat::get_front_addrs() const +{ + return front_msgr->get_myaddrs(); +} + +const entity_addrvec_t& Heartbeat::get_back_addrs() const +{ + return back_msgr->get_myaddrs(); +} + +void Heartbeat::set_require_authorizer(bool require_authorizer) +{ + if (front_msgr->get_require_authorizer() != require_authorizer) { + front_msgr->set_require_authorizer(require_authorizer); + back_msgr->set_require_authorizer(require_authorizer); + } +} + +void Heartbeat::add_peer(osd_id_t _peer, epoch_t epoch) +{ + assert(whoami != _peer); + auto [iter, added] = peers.try_emplace(_peer, *this, _peer); + auto& peer = iter->second; + peer.set_epoch(epoch); +} + +Heartbeat::osds_t Heartbeat::remove_down_peers() +{ + osds_t old_osds; // osds not added in this epoch + for (auto i = peers.begin(); i != peers.end(); ) { + auto osdmap = service.get_osdmap_service().get_map(); + const auto& [osd, peer] = *i; + if (!osdmap->is_up(osd)) { + i = peers.erase(i); + } else { + if (peer.get_epoch() < osdmap->get_epoch()) { + old_osds.push_back(osd); + } + ++i; + } + } + return old_osds; +} + +void Heartbeat::add_reporter_peers(int whoami) +{ + auto osdmap = service.get_osdmap_service().get_map(); + // include next and previous up osds to ensure we have a fully-connected set + set<int> want; + if (auto next = osdmap->get_next_up_osd_after(whoami); next >= 0) { + want.insert(next); + } + if (auto prev = osdmap->get_previous_up_osd_before(whoami); prev >= 0) { + want.insert(prev); + } + // make sure we have at least **min_down** osds coming from different + // subtree level (e.g., hosts) for fast failure detection. + auto min_down = local_conf().get_val<uint64_t>("mon_osd_min_down_reporters"); + auto subtree = local_conf().get_val<string>("mon_osd_reporter_subtree_level"); + osdmap->get_random_up_osds_by_subtree( + whoami, subtree, min_down, want, &want); + auto epoch = osdmap->get_epoch(); + for (int osd : want) { + add_peer(osd, epoch); + }; +} + +void Heartbeat::update_peers(int whoami) +{ + const auto min_peers = static_cast<size_t>( + local_conf().get_val<int64_t>("osd_heartbeat_min_peers")); + add_reporter_peers(whoami); + auto extra = remove_down_peers(); + // too many? + for (auto& osd : extra) { + if (peers.size() <= min_peers) { + break; + } + remove_peer(osd); + } + // or too few? 
+ auto osdmap = service.get_osdmap_service().get_map(); + auto epoch = osdmap->get_epoch(); + for (auto next = osdmap->get_next_up_osd_after(whoami); + peers.size() < min_peers && next >= 0 && next != whoami; + next = osdmap->get_next_up_osd_after(next)) { + add_peer(next, epoch); + } +} + +Heartbeat::osds_t Heartbeat::get_peers() const +{ + osds_t osds; + osds.reserve(peers.size()); + for (auto& peer : peers) { + osds.push_back(peer.first); + } + return osds; +} + +void Heartbeat::remove_peer(osd_id_t peer) +{ + assert(peers.count(peer) == 1); + peers.erase(peer); +} + +std::optional<seastar::future<>> +Heartbeat::ms_dispatch(crimson::net::ConnectionRef conn, MessageRef m) +{ + bool dispatched = true; + gate.dispatch_in_background(__func__, *this, [this, conn, &m, &dispatched] { + switch (m->get_type()) { + case MSG_OSD_PING: + return handle_osd_ping(conn, boost::static_pointer_cast<MOSDPing>(m)); + default: + dispatched = false; + return seastar::now(); + } + }); + return (dispatched ? std::make_optional(seastar::now()) : std::nullopt); +} + +void Heartbeat::ms_handle_reset(crimson::net::ConnectionRef conn, bool is_replace) +{ + auto peer = conn->get_peer_id(); + if (conn->get_peer_type() != entity_name_t::TYPE_OSD || + peer == entity_name_t::NEW) { + return; + } + if (auto found = peers.find(peer); + found != peers.end()) { + found->second.handle_reset(conn, is_replace); + } +} + +void Heartbeat::ms_handle_connect(crimson::net::ConnectionRef conn) +{ + auto peer = conn->get_peer_id(); + if (conn->get_peer_type() != entity_name_t::TYPE_OSD || + peer == entity_name_t::NEW) { + return; + } + if (auto found = peers.find(peer); + found != peers.end()) { + found->second.handle_connect(conn); + } +} + +void Heartbeat::ms_handle_accept(crimson::net::ConnectionRef conn) +{ + auto peer = conn->get_peer_id(); + if (conn->get_peer_type() != entity_name_t::TYPE_OSD || + peer == entity_name_t::NEW) { + return; + } + if (auto found = peers.find(peer); + found != peers.end()) { + found->second.handle_accept(conn); + } +} + +seastar::future<> Heartbeat::handle_osd_ping(crimson::net::ConnectionRef conn, + Ref<MOSDPing> m) +{ + switch (m->op) { + case MOSDPing::PING: + return handle_ping(conn, m); + case MOSDPing::PING_REPLY: + return handle_reply(conn, m); + case MOSDPing::YOU_DIED: + return handle_you_died(); + default: + return seastar::now(); + } +} + +seastar::future<> Heartbeat::handle_ping(crimson::net::ConnectionRef conn, + Ref<MOSDPing> m) +{ + auto min_message = static_cast<uint32_t>( + local_conf()->osd_heartbeat_min_size); + auto reply = + make_message<MOSDPing>( + m->fsid, + service.get_osdmap_service().get_map()->get_epoch(), + MOSDPing::PING_REPLY, + m->ping_stamp, + m->mono_ping_stamp, + service.get_mnow(), + service.get_osdmap_service().get_up_epoch(), + min_message); + return conn->send(reply); +} + +seastar::future<> Heartbeat::handle_reply(crimson::net::ConnectionRef conn, + Ref<MOSDPing> m) +{ + const osd_id_t from = m->get_source().num(); + auto found = peers.find(from); + if (found == peers.end()) { + // stale reply + return seastar::now(); + } + auto& peer = found->second; + return peer.handle_reply(conn, m); +} + +seastar::future<> Heartbeat::handle_you_died() +{ + // TODO: ask for newer osdmap + return seastar::now(); +} + +void Heartbeat::heartbeat_check() +{ + failure_queue_t failure_queue; + const auto now = clock::now(); + for (const auto& [osd, peer] : peers) { + auto failed_since = peer.failed_since(now); + if (!clock::is_zero(failed_since)) { + failure_queue.emplace(osd, 
failed_since);
+    }
+  }
+  if (!failure_queue.empty()) {
+    // send_failures can run in the background, because
+    //  1. when send_failures returns, the messages are not yet actually
+    //  sent, so the sending operation is still in flight. This might look
+    //  risky: if the OSD shuts down, the remaining part of the sending
+    //  operation could reference OSD and Heartbeat instances that have
+    //  already been deleted. However, that remaining work holds no
+    //  reference back to the OSD or Heartbeat instances, so it does not
+    //  run into the above risk.
+    //  2. messages are sent in order, so if a later check finds a
+    //  previously "failed" peer to be healthy, the "still alive" message
+    //  is sent after the earlier "osd failure" message, which is safe.
+    (void)send_failures(std::move(failure_queue));
+  }
+}
+
+seastar::future<> Heartbeat::send_heartbeats()
+{
+  const auto mnow = service.get_mnow();
+  const auto now = clock::now();
+
+  std::vector<seastar::future<>> futures;
+  for (auto& [osd, peer] : peers) {
+    peer.send_heartbeat(now, mnow, futures);
+  }
+  return seastar::when_all_succeed(futures.begin(), futures.end());
+}
+
+seastar::future<> Heartbeat::send_failures(failure_queue_t&& failure_queue)
+{
+  std::vector<seastar::future<>> futures;
+  const auto now = clock::now();
+  for (auto [osd, failed_since] : failure_queue) {
+    failing_peers.add_pending(osd, failed_since, now, futures);
+  }
+
+  return seastar::when_all_succeed(futures.begin(), futures.end());
+}
+
+void Heartbeat::print(std::ostream& out) const
+{
+  out << "heartbeat";
+}
+
+Heartbeat::Connection::~Connection()
+{
+  if (conn) {
+    conn->mark_down();
+  }
+}
+
+bool Heartbeat::Connection::matches(crimson::net::ConnectionRef _conn) const
+{
+  return (conn && conn == _conn);
+}
+
+void Heartbeat::Connection::accepted(crimson::net::ConnectionRef accepted_conn)
+{
+  if (!conn) {
+    if (accepted_conn->get_peer_addr() == listener.get_peer_addr(type)) {
+      logger().info("Heartbeat::Connection::accepted(): "
+                    "{} racing resolved", *this);
+      conn = accepted_conn;
+      set_connected();
+    }
+  } else if (conn == accepted_conn) {
+    set_connected();
+  }
+}
+
+void Heartbeat::Connection::replaced()
+{
+  assert(!is_connected);
+  auto replaced_conn = conn;
+  // set the racing connection, will be handled by handle_accept()
+  conn = msgr.connect(replaced_conn->get_peer_addr(),
+                      replaced_conn->get_peer_name());
+  racing_detected = true;
+  logger().warn("Heartbeat::Connection::replaced(): {} racing", *this);
+  assert(conn != replaced_conn);
+  assert(conn->is_connected());
+}
+
+void Heartbeat::Connection::reset()
+{
+  conn = nullptr;
+  if (is_connected) {
+    is_connected = false;
+    listener.decrease_connected();
+  }
+  if (!racing_detected || is_winner_side) {
+    connect();
+  } else {
+    logger().info("Heartbeat::Connection::reset(): "
+                  "{} racing detected and lose, "
+                  "waiting for peer connect me", *this);
+  }
+}
+
+seastar::future<> Heartbeat::Connection::send(MessageRef msg)
+{
+  assert(is_connected);
+  return conn->send(msg);
+}
+
+void Heartbeat::Connection::validate()
+{
+  assert(is_connected);
+  auto peer_addr = listener.get_peer_addr(type);
+  if (conn->get_peer_addr() != peer_addr) {
+    logger().info("Heartbeat::Connection::validate(): "
+                  "{} has new address {} over {}, reset",
+                  *this, peer_addr, conn->get_peer_addr());
+    conn->mark_down();
+    racing_detected = false;
+    reset();
+  }
+}
+
+void Heartbeat::Connection::retry()
+{
+  racing_detected = false;
+  if (!is_connected) {
+    if (conn) {
+      conn->mark_down();
+      reset();
+    } else {
+      connect();
+    }
+  }
+}
+
+void Heartbeat::Connection::set_connected()
+{
+  assert(!is_connected);
+  is_connected = true;
+  listener.increase_connected();
+}
+
+void Heartbeat::Connection::connect()
+{
+  assert(!conn);
+  auto addr = listener.get_peer_addr(type);
+  conn = msgr.connect(addr, entity_name_t(CEPH_ENTITY_TYPE_OSD, peer));
+  if (conn->is_connected()) {
+    set_connected();
+  }
+}
+
+Heartbeat::clock::time_point
+Heartbeat::Session::failed_since(Heartbeat::clock::time_point now) const
+{
+  if (do_health_screen(now) == health_state::UNHEALTHY) {
+    auto oldest_deadline = ping_history.begin()->second.deadline;
+    auto failed_since = std::min(last_rx_back, last_rx_front);
+    if (clock::is_zero(failed_since)) {
+      logger().error("Heartbeat::Session::failed_since(): no reply from osd.{} "
+                     "ever on either front or back, first ping sent {} "
+                     "(oldest deadline {})",
+                     peer, first_tx, oldest_deadline);
+      failed_since = first_tx;
+    } else {
+      logger().error("Heartbeat::Session::failed_since(): no reply from osd.{} "
+                     "since back {} front {} (oldest deadline {})",
+                     peer, last_rx_back, last_rx_front, oldest_deadline);
+    }
+    return failed_since;
+  } else {
+    return clock::zero();
+  }
+}
+
+void Heartbeat::Session::set_inactive_history(clock::time_point now)
+{
+  assert(!connected);
+  if (ping_history.empty()) {
+    const utime_t sent_stamp{now};
+    const auto deadline =
+      now + std::chrono::seconds(local_conf()->osd_heartbeat_grace);
+    ping_history.emplace(sent_stamp, reply_t{deadline, 0});
+  } else { // the entry is already added
+    assert(ping_history.size() == 1);
+  }
+}
+
+Heartbeat::Peer::Peer(Heartbeat& heartbeat, osd_id_t peer)
+  : ConnectionListener(2), heartbeat{heartbeat}, peer{peer}, session{peer},
+    con_front(peer, heartbeat.whoami > peer, Connection::type_t::front,
+              *heartbeat.front_msgr, *this),
+    con_back(peer, heartbeat.whoami > peer, Connection::type_t::back,
+             *heartbeat.back_msgr, *this)
+{
+  logger().info("Heartbeat::Peer: osd.{} added", peer);
+}
+
+Heartbeat::Peer::~Peer()
+{
+  logger().info("Heartbeat::Peer: osd.{} removed", peer);
+}
+
+void Heartbeat::Peer::send_heartbeat(
+  clock::time_point now, ceph::signedspan mnow,
+  std::vector<seastar::future<>>& futures)
+{
+  session.set_tx(now);
+  if (session.is_started()) {
+    do_send_heartbeat(now, mnow, &futures);
+    for_each_conn([] (auto& conn) {
+      conn.validate();
+    });
+  } else {
+    // we should send MOSDPing but still cannot at this moment
+    if (pending_send) {
+      // we have already been pending for an entire heartbeat interval
+      logger().warn("Heartbeat::Peer::send_heartbeat(): "
+                    "heartbeat to osd.{} is still pending...", peer);
+      for_each_conn([] (auto& conn) {
+        conn.retry();
+      });
+    } else {
+      logger().info("Heartbeat::Peer::send_heartbeat(): "
+                    "heartbeat to osd.{} is pending send...", peer);
+      session.set_inactive_history(now);
+      pending_send = true;
+    }
+  }
+}
+
+seastar::future<> Heartbeat::Peer::handle_reply(
+  crimson::net::ConnectionRef conn, Ref<MOSDPing> m)
+{
+  if (!session.is_started()) {
+    // we haven't sent any ping yet
+    return seastar::now();
+  }
+  type_t type;
+  if (con_front.matches(conn)) {
+    type = type_t::front;
+  } else if (con_back.matches(conn)) {
+    type = type_t::back;
+  } else {
+    return seastar::now();
+  }
+  const auto now = clock::now();
+  if (session.on_pong(m->ping_stamp, type, now)) {
+    if (session.do_health_screen(now) == Session::health_state::HEALTHY) {
+      return heartbeat.failing_peers.cancel_one(peer);
+    }
+  }
+  return seastar::now();
+}
+
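The reply path above leans on Session's ping_history bookkeeping: each ping is recorded expecting two acknowledgements, one pong over the front connection and one over the back, and an entry is only pruned, together with anything older, once both have arrived. Below is a minimal, self-contained sketch of that bookkeeping; PingLog, Kind and the plain integer stamps are illustrative stand-ins, not the actual Heartbeat::Session types.

    #include <cstdint>
    #include <iterator>
    #include <map>

    // Illustrative stand-ins for utime_t and the front/back connection kinds.
    using stamp_t = uint64_t;
    enum class Kind { front, back };

    struct PingLog {
      struct reply_t {
        stamp_t deadline;
        int unacknowledged = 0;  // 2 == waiting for both front and back pongs
      };
      std::map<stamp_t, reply_t> history;

      void on_ping(stamp_t sent, stamp_t deadline) {
        history.emplace(sent, reply_t{deadline, 2});
      }

      // Returns true if the pong matched an outstanding ping.
      bool on_pong(stamp_t sent, Kind) {
        auto it = history.find(sent);
        if (it == history.end()) {
          return false;  // stale pong, superseded by newer pings
        }
        if (--it->second.unacknowledged == 0) {
          // both connections answered: this ping and everything older is done
          history.erase(history.begin(), std::next(it));
        }
        return true;
      }
    };

In the real Session the same map also drives do_health_screen(): an entry whose deadline has passed marks the peer UNHEALTHY, which is what failed_since() and heartbeat_check() ultimately act on.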
+entity_addr_t Heartbeat::Peer::get_peer_addr(type_t type) +{ + const auto osdmap = heartbeat.service.get_osdmap_service().get_map(); + if (type == type_t::front) { + return osdmap->get_hb_front_addrs(peer).front(); + } else { + return osdmap->get_hb_back_addrs(peer).front(); + } +} + +void Heartbeat::Peer::on_connected() +{ + logger().info("Heartbeat::Peer: osd.{} connected (send={})", + peer, pending_send); + session.on_connected(); + if (pending_send) { + pending_send = false; + do_send_heartbeat(clock::now(), heartbeat.service.get_mnow(), nullptr); + } +} + +void Heartbeat::Peer::on_disconnected() +{ + logger().info("Heartbeat::Peer: osd.{} disconnected", peer); + session.on_disconnected(); +} + +void Heartbeat::Peer::do_send_heartbeat( + Heartbeat::clock::time_point now, + ceph::signedspan mnow, + std::vector<seastar::future<>>* futures) +{ + const utime_t sent_stamp{now}; + const auto deadline = + now + std::chrono::seconds(local_conf()->osd_heartbeat_grace); + session.on_ping(sent_stamp, deadline); + for_each_conn([&, this] (auto& conn) { + auto min_message = static_cast<uint32_t>( + local_conf()->osd_heartbeat_min_size); + auto ping = make_message<MOSDPing>( + heartbeat.monc.get_fsid(), + heartbeat.service.get_osdmap_service().get_map()->get_epoch(), + MOSDPing::PING, + sent_stamp, + mnow, + mnow, + heartbeat.service.get_osdmap_service().get_up_epoch(), + min_message); + if (futures) { + futures->push_back(conn.send(std::move(ping))); + } + }); +} + +bool Heartbeat::FailingPeers::add_pending( + osd_id_t peer, + clock::time_point failed_since, + clock::time_point now, + std::vector<seastar::future<>>& futures) +{ + if (failure_pending.count(peer)) { + return false; + } + auto failed_for = chrono::duration_cast<chrono::seconds>( + now - failed_since).count(); + auto osdmap = heartbeat.service.get_osdmap_service().get_map(); + auto failure_report = + make_message<MOSDFailure>(heartbeat.monc.get_fsid(), + peer, + osdmap->get_addrs(peer), + static_cast<int>(failed_for), + osdmap->get_epoch()); + failure_pending.emplace(peer, failure_info_t{failed_since, + osdmap->get_addrs(peer)}); + futures.push_back(heartbeat.monc.send_message(failure_report)); + logger().info("{}: osd.{} failed for {}", __func__, peer, failed_for); + return true; +} + +seastar::future<> Heartbeat::FailingPeers::cancel_one(osd_id_t peer) +{ + if (auto pending = failure_pending.find(peer); + pending != failure_pending.end()) { + auto fut = send_still_alive(peer, pending->second.addrs); + failure_pending.erase(peer); + return fut; + } + return seastar::now(); +} + +seastar::future<> +Heartbeat::FailingPeers::send_still_alive( + osd_id_t osd, const entity_addrvec_t& addrs) +{ + auto still_alive = make_message<MOSDFailure>( + heartbeat.monc.get_fsid(), + osd, + addrs, + 0, + heartbeat.service.get_osdmap_service().get_map()->get_epoch(), + MOSDFailure::FLAG_ALIVE); + logger().info("{}: osd.{}", __func__, osd); + return heartbeat.monc.send_message(still_alive); +} diff --git a/src/crimson/osd/heartbeat.h b/src/crimson/osd/heartbeat.h new file mode 100644 index 000000000..4947e871f --- /dev/null +++ b/src/crimson/osd/heartbeat.h @@ -0,0 +1,455 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <cstdint> +#include <seastar/core/future.hh> +#include "common/ceph_time.h" +#include "crimson/common/gated.h" +#include "crimson/net/Dispatcher.h" +#include "crimson/net/Fwd.h" + +class MOSDPing; + +namespace crimson::osd { + class ShardServices; +} + 
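The FailingPeers helper declared further down in this header (and implemented in heartbeat.cc above) keeps failure reporting idempotent: add_pending() reports a peer to the monitor at most once, and cancel_one() retracts the report with a FLAG_ALIVE message once the peer answers again. A simplified, self-contained sketch of that bookkeeping follows; the plain callback stands in for the monitor client and MOSDFailure, so it is an illustration rather than the actual API.

    #include <functional>
    #include <map>
    #include <string>

    // Stand-ins for osd_id_t, the clock and the monitor client.
    using osd_id = int;
    using time_point = long;

    struct PendingFailures {
      std::function<void(std::string)> send_to_mon;  // placeholder for monc.send_message()
      std::map<osd_id, time_point> pending;

      // Report a peer at most once until the report is cancelled.
      bool add_pending(osd_id peer, time_point failed_since) {
        if (pending.count(peer)) {
          return false;                    // already reported, don't repeat
        }
        pending.emplace(peer, failed_since);
        send_to_mon("MOSDFailure(osd." + std::to_string(peer) + ")");
        return true;
      }

      // A healthy pong arrived after the report: retract it.
      void cancel_one(osd_id peer) {
        if (pending.erase(peer)) {
          send_to_mon("MOSDFailure(osd." + std::to_string(peer) + ", FLAG_ALIVE)");
        }
      }
    };

send_failures() and Peer::handle_reply() above drive exactly these two entry points.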
+namespace crimson::mon { + class Client; +} + +template<typename Message> using Ref = boost::intrusive_ptr<Message>; + +class Heartbeat : public crimson::net::Dispatcher { +public: + using osd_id_t = int; + + Heartbeat(osd_id_t whoami, + const crimson::osd::ShardServices& service, + crimson::mon::Client& monc, + crimson::net::MessengerRef front_msgr, + crimson::net::MessengerRef back_msgr); + + seastar::future<> start(entity_addrvec_t front, + entity_addrvec_t back); + seastar::future<> stop(); + + using osds_t = std::vector<osd_id_t>; + void add_peer(osd_id_t peer, epoch_t epoch); + void update_peers(int whoami); + void remove_peer(osd_id_t peer); + osds_t get_peers() const; + + const entity_addrvec_t& get_front_addrs() const; + const entity_addrvec_t& get_back_addrs() const; + + void set_require_authorizer(bool); + + // Dispatcher methods + std::optional<seastar::future<>> ms_dispatch( + crimson::net::ConnectionRef conn, MessageRef m) override; + void ms_handle_reset(crimson::net::ConnectionRef conn, bool is_replace) override; + void ms_handle_connect(crimson::net::ConnectionRef conn) override; + void ms_handle_accept(crimson::net::ConnectionRef conn) override; + + void print(std::ostream&) const; +private: + seastar::future<> handle_osd_ping(crimson::net::ConnectionRef conn, + Ref<MOSDPing> m); + seastar::future<> handle_ping(crimson::net::ConnectionRef conn, + Ref<MOSDPing> m); + seastar::future<> handle_reply(crimson::net::ConnectionRef conn, + Ref<MOSDPing> m); + seastar::future<> handle_you_died(); + + /// remove down OSDs + /// @return peers not added in this epoch + osds_t remove_down_peers(); + /// add enough reporters for fast failure detection + void add_reporter_peers(int whoami); + + seastar::future<> start_messenger(crimson::net::Messenger& msgr, + const entity_addrvec_t& addrs); +private: + const osd_id_t whoami; + const crimson::osd::ShardServices& service; + crimson::mon::Client& monc; + crimson::net::MessengerRef front_msgr; + crimson::net::MessengerRef back_msgr; + + seastar::timer<seastar::lowres_clock> timer; + // use real_clock so it can be converted to utime_t + using clock = ceph::coarse_real_clock; + + class ConnectionListener; + class Connection; + class Session; + class Peer; + using peers_map_t = std::map<osd_id_t, Peer>; + peers_map_t peers; + + // osds which are considered failed + // osd_id => when was the last time that both front and back pings were acked + // or sent. 
+ // use for calculating how long the OSD has been unresponsive + using failure_queue_t = std::map<osd_id_t, clock::time_point>; + seastar::future<> send_failures(failure_queue_t&& failure_queue); + seastar::future<> send_heartbeats(); + void heartbeat_check(); + + // osds we've reported to monior as failed ones, but they are not marked down + // yet + crimson::common::Gated gate; + + class FailingPeers { + public: + FailingPeers(Heartbeat& heartbeat) : heartbeat(heartbeat) {} + bool add_pending(osd_id_t peer, + clock::time_point failed_since, + clock::time_point now, + std::vector<seastar::future<>>& futures); + seastar::future<> cancel_one(osd_id_t peer); + + private: + seastar::future<> send_still_alive(osd_id_t, const entity_addrvec_t&); + + Heartbeat& heartbeat; + + struct failure_info_t { + clock::time_point failed_since; + entity_addrvec_t addrs; + }; + std::map<osd_id_t, failure_info_t> failure_pending; + } failing_peers; +}; + +inline std::ostream& operator<<(std::ostream& out, const Heartbeat& hb) { + hb.print(out); + return out; +} + +/* + * Event driven interface for Heartbeat::Peer to be notified when both hb_front + * and hb_back are connected, or connection is lost. + */ +class Heartbeat::ConnectionListener { + public: + ConnectionListener(size_t connections) : connections{connections} {} + + void increase_connected() { + assert(connected < connections); + ++connected; + if (connected == connections) { + on_connected(); + } + } + void decrease_connected() { + assert(connected > 0); + if (connected == connections) { + on_disconnected(); + } + --connected; + } + enum class type_t { front, back }; + virtual entity_addr_t get_peer_addr(type_t) = 0; + + protected: + virtual void on_connected() = 0; + virtual void on_disconnected() = 0; + + private: + const size_t connections; + size_t connected = 0; +}; + +class Heartbeat::Connection { + public: + using type_t = ConnectionListener::type_t; + Connection(osd_id_t peer, bool is_winner_side, type_t type, + crimson::net::Messenger& msgr, + ConnectionListener& listener) + : peer{peer}, type{type}, + msgr{msgr}, listener{listener}, + is_winner_side{is_winner_side} { + connect(); + } + Connection(const Connection&) = delete; + Connection(Connection&&) = delete; + Connection& operator=(const Connection&) = delete; + Connection& operator=(Connection&&) = delete; + + ~Connection(); + + bool matches(crimson::net::ConnectionRef _conn) const; + void connected() { + set_connected(); + } + void accepted(crimson::net::ConnectionRef); + void replaced(); + void reset(); + seastar::future<> send(MessageRef msg); + void validate(); + // retry connection if still pending + void retry(); + + private: + void set_connected(); + void connect(); + + const osd_id_t peer; + const type_t type; + crimson::net::Messenger& msgr; + ConnectionListener& listener; + +/* + * Resolve the following racing when both me and peer are trying to connect + * each other symmetrically, under SocketPolicy::lossy_client: + * + * OSD.A OSD.B + * - - + * |-[1]----> <----[2]-| + * \ / + * \ / + * delay.. X delay.. + * / \ + * |-[1]x> / \ <x[2]-| + * |<-[2]--- ---[1]->| + * |(reset#1) (reset#2)| + * |(reconnectB) (reconnectA)| + * |-[2]---> <---[1]-| + * delay.. delay.. + * (remote close populated) + * |-[2]x> <x[1]-| + * |(reset#2) (reset#1)| + * | ... ... | + * (dead loop!) + * + * Our solution is to remember if such racing was happened recently, and + * establish connection asymmetrically only from the winner side whose osd-id + * is larger. 
+ */
+  const bool is_winner_side;
+  bool racing_detected = false;
+
+  crimson::net::ConnectionRef conn;
+  bool is_connected = false;
+
+  friend std::ostream& operator<<(std::ostream& os, const Connection& c) {
+    if (c.type == type_t::front) {
+      return os << "con_front(osd." << c.peer << ")";
+    } else {
+      return os << "con_back(osd." << c.peer << ")";
+    }
+  }
+};
+
+/*
+ * Track the ping history and ping replies (the pongs) from the same session;
+ * clean up the history once hb_front or hb_back loses its connection, and
+ * restart the session once both connections are connected again.
+ *
+ * We cannot simply remove the entire Heartbeat::Peer once hb_front or hb_back
+ * loses connection, because we would end up with the following dead loop:
+ *
+ *   OSD.A                                OSD.B
+ *   -                                    -
+ *   hb_front reset <--(network)--- hb_front close
+ *        |                              ^
+ *        |                              |
+ *   remove Peer B  (dead loop!)    remove Peer A
+ *        |                              |
+ *        V                              |
+ *   hb_back close ----(network)---> hb_back reset
+ */
+class Heartbeat::Session {
+ public:
+  Session(osd_id_t peer) : peer{peer} {}
+
+  void set_epoch(epoch_t epoch_) { epoch = epoch_; }
+  epoch_t get_epoch() const { return epoch; }
+  bool is_started() const { return connected; }
+  bool pinged() const {
+    if (clock::is_zero(first_tx)) {
+      // I can never receive a pong without sending any ping message first.
+      assert(clock::is_zero(last_rx_front) &&
+             clock::is_zero(last_rx_back));
+      return false;
+    } else {
+      return true;
+    }
+  }
+
+  enum class health_state {
+    UNKNOWN,
+    UNHEALTHY,
+    HEALTHY,
+  };
+  health_state do_health_screen(clock::time_point now) const {
+    if (!pinged()) {
+      // we are neither healthy nor unhealthy because we haven't sent anything yet
+      return health_state::UNKNOWN;
+    } else if (!ping_history.empty() && ping_history.begin()->second.deadline < now) {
+      return health_state::UNHEALTHY;
+    } else if (!clock::is_zero(last_rx_front) &&
+               !clock::is_zero(last_rx_back)) {
+      // only declare ourselves healthy once we have received the first
+      // replies from both front/back connections
+      return health_state::HEALTHY;
+    } else {
+      return health_state::UNKNOWN;
+    }
+  }
+
+  clock::time_point failed_since(clock::time_point now) const;
+
+  void set_tx(clock::time_point now) {
+    if (!pinged()) {
+      first_tx = now;
+    }
+    last_tx = now;
+  }
+
+  void on_connected() {
+    assert(!connected);
+    connected = true;
+    ping_history.clear();
+  }
+
+  void on_ping(const utime_t& sent_stamp,
+               const clock::time_point& deadline) {
+    assert(connected);
+    [[maybe_unused]] auto [reply, added] =
+      ping_history.emplace(sent_stamp, reply_t{deadline, 2});
+  }
+
+  bool on_pong(const utime_t& ping_stamp,
+               Connection::type_t type,
+               clock::time_point now) {
+    assert(connected);
+    auto ping = ping_history.find(ping_stamp);
+    if (ping == ping_history.end()) {
+      // old replies, deprecated by newly sent pings.
+      return false;
+    }
+    auto& unacked = ping->second.unacknowledged;
+    assert(unacked);
+    if (type == Connection::type_t::front) {
+      last_rx_front = now;
+      unacked--;
+    } else {
+      last_rx_back = now;
+      unacked--;
+    }
+    if (unacked == 0) {
+      ping_history.erase(ping_history.begin(), ++ping);
+    }
+    return true;
+  }
+
+  void on_disconnected() {
+    assert(connected);
+    connected = false;
+    if (!ping_history.empty()) {
+      // we lost our ping_history of the last session, but still need to keep
+      // the oldest deadline for unhealthy check.
+ auto oldest = ping_history.begin(); + auto sent_stamp = oldest->first; + auto deadline = oldest->second.deadline; + ping_history.clear(); + ping_history.emplace(sent_stamp, reply_t{deadline, 0}); + } + } + + // maintain an entry in ping_history for unhealthy check + void set_inactive_history(clock::time_point); + + private: + const osd_id_t peer; + bool connected = false; + // time we sent our first ping request + clock::time_point first_tx; + // last time we sent a ping request + clock::time_point last_tx; + // last time we got a ping reply on the front side + clock::time_point last_rx_front; + // last time we got a ping reply on the back side + clock::time_point last_rx_back; + // most recent epoch we wanted this peer + epoch_t epoch; + + struct reply_t { + clock::time_point deadline; + // one sent over front conn, another sent over back conn + uint8_t unacknowledged = 0; + }; + // history of inflight pings, arranging by timestamp we sent + std::map<utime_t, reply_t> ping_history; +}; + +class Heartbeat::Peer final : private Heartbeat::ConnectionListener { + public: + Peer(Heartbeat&, osd_id_t); + ~Peer(); + Peer(Peer&&) = delete; + Peer(const Peer&) = delete; + Peer& operator=(Peer&&) = delete; + Peer& operator=(const Peer&) = delete; + + void set_epoch(epoch_t epoch) { session.set_epoch(epoch); } + epoch_t get_epoch() const { return session.get_epoch(); } + + // if failure, return time_point since last active + // else, return clock::zero() + clock::time_point failed_since(clock::time_point now) const { + return session.failed_since(now); + } + void send_heartbeat( + clock::time_point, ceph::signedspan, std::vector<seastar::future<>>&); + seastar::future<> handle_reply(crimson::net::ConnectionRef, Ref<MOSDPing>); + void handle_reset(crimson::net::ConnectionRef conn, bool is_replace) { + for_each_conn([&] (auto& _conn) { + if (_conn.matches(conn)) { + if (is_replace) { + _conn.replaced(); + } else { + _conn.reset(); + } + } + }); + } + void handle_connect(crimson::net::ConnectionRef conn) { + for_each_conn([&] (auto& _conn) { + if (_conn.matches(conn)) { + _conn.connected(); + } + }); + } + void handle_accept(crimson::net::ConnectionRef conn) { + for_each_conn([&] (auto& _conn) { + _conn.accepted(conn); + }); + } + + private: + entity_addr_t get_peer_addr(type_t type) override; + void on_connected() override; + void on_disconnected() override; + void do_send_heartbeat( + clock::time_point, ceph::signedspan, std::vector<seastar::future<>>*); + + template <typename Func> + void for_each_conn(Func&& f) { + f(con_front); + f(con_back); + } + + Heartbeat& heartbeat; + const osd_id_t peer; + Session session; + // if need to send heartbeat when session connected + bool pending_send = false; + Connection con_front; + Connection con_back; +}; diff --git a/src/crimson/osd/main.cc b/src/crimson/osd/main.cc new file mode 100644 index 000000000..a90903e72 --- /dev/null +++ b/src/crimson/osd/main.cc @@ -0,0 +1,230 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#include <sys/types.h> +#include <unistd.h> + +#include <iostream> +#include <random> + +#include <seastar/core/app-template.hh> +#include <seastar/core/print.hh> +#include <seastar/core/thread.hh> +#include <seastar/util/std-compat.hh> + +#include "auth/KeyRing.h" +#include "common/ceph_argparse.h" +#include "crimson/common/buffer_io.h" +#include "crimson/common/config_proxy.h" +#include "crimson/net/Messenger.h" +#include "global/pidfile.h" + +#include "osd.h" + +using config_t = 
crimson::common::ConfigProxy; + +void usage(const char* prog) { + std::cout << "usage: " << prog << " -i <ID>\n" + << " --help-seastar show Seastar help messages\n"; + generic_server_usage(); +} + +auto partition_args(seastar::app_template& app, char** argv_begin, char** argv_end) +{ + namespace bpo = boost::program_options; + // collect all options consumed by seastar::app_template + auto parsed = bpo::command_line_parser(std::distance(argv_begin, argv_end), + argv_begin) + .options(app.get_options_description()).allow_unregistered().run(); + auto unknown_args = bpo::collect_unrecognized(parsed.options, + bpo::include_positional); + std::vector<const char*> ceph_args, app_args; + // ceph_argparse_early_args() and + // seastar::smp::get_options_description() use "-c" for different + // options. and ceph wins + auto consume_conf_arg = [&](char** argv) { + if (std::strcmp(*argv, "-c") == 0) { + ceph_args.push_back(*argv++); + if (argv != argv_end) { + ceph_args.push_back(*argv++); + } + } + return argv; + }; + auto unknown = unknown_args.begin(); + auto consume_unknown_arg = [&](char** argv) { + for (; unknown != unknown_args.end() && + argv != argv_end && + *unknown == *argv; ++argv, ++unknown) { + if (std::strcmp(*argv, "--help-seastar") == 0) { + app_args.push_back("--help"); + } else { + ceph_args.push_back(*argv); + } + } + return argv; + }; + for (auto argv = argv_begin; argv != argv_end;) { + if (auto next_arg = consume_conf_arg(argv); next_arg != argv) { + argv = next_arg; + } else if (auto next_arg = consume_unknown_arg(argv); next_arg != argv) { + argv = next_arg; + } else { + app_args.push_back(*argv++); + } + } + return make_pair(std::move(ceph_args), std::move(app_args)); +} + +using crimson::common::local_conf; + +seastar::future<> make_keyring() +{ + const auto path = local_conf().get_val<string>("keyring"); + return seastar::file_exists(path).then([path](bool exists) { + KeyRing keyring; + EntityName name{local_conf()->name}; + EntityAuth auth; + if (exists && + keyring.load(nullptr, path) == 0 && + keyring.get_auth(name, auth)) { + seastar::fprint(std::cerr, "already have key in keyring: %s\n", path); + return seastar::now(); + } else { + auth.key.create(std::make_unique<CephContext>().get(), CEPH_CRYPTO_AES); + keyring.add(name, auth); + bufferlist bl; + keyring.encode_plaintext(bl); + const auto permissions = (seastar::file_permissions::user_read | + seastar::file_permissions::user_write); + return crimson::write_file(std::move(bl), path, permissions); + } + }).handle_exception_type([path](const std::filesystem::filesystem_error& e) { + seastar::fprint(std::cerr, "FATAL: writing new keyring to %s: %s\n", path, e.what()); + throw e; + }); +} + +uint64_t get_nonce() +{ + if (auto pid = getpid(); pid != 1) { + return pid; + } else { + // we're running in a container; use a random number instead! + std::random_device rd; + std::default_random_engine rng{rd()}; + return std::uniform_int_distribution<uint64_t>{}(rng); + } +} + +int main(int argc, char* argv[]) +{ + seastar::app_template app; + app.add_options() + ("mkkey", "generate a new secret key. 
" + "This is normally used in combination with --mkfs") + ("mkfs", "create a [new] data directory") + ("debug", "enable debug output on all loggers"); + + auto [ceph_args, app_args] = partition_args(app, argv, argv + argc); + if (ceph_argparse_need_usage(ceph_args) && + std::find(app_args.begin(), app_args.end(), "--help") == app_args.end()) { + usage(argv[0]); + return EXIT_SUCCESS; + } + std::string cluster_name{"ceph"}; + std::string conf_file_list; + // ceph_argparse_early_args() could _exit(), while local_conf() won't ready + // until it's started. so do the boilerplate-settings parsing here. + auto init_params = ceph_argparse_early_args(ceph_args, + CEPH_ENTITY_TYPE_OSD, + &cluster_name, + &conf_file_list); + seastar::sharded<crimson::osd::OSD> osd; + using crimson::common::sharded_conf; + using crimson::common::sharded_perf_coll; + try { + return app.run_deprecated(app_args.size(), const_cast<char**>(app_args.data()), + [&, &ceph_args=ceph_args] { + auto& config = app.configuration(); + return seastar::async([&] { + if (config.count("debug")) { + seastar::global_logger_registry().set_all_loggers_level( + seastar::log_level::debug + ); + } + sharded_conf().start(init_params.name, cluster_name).get(); + seastar::engine().at_exit([] { + return sharded_conf().stop(); + }); + sharded_perf_coll().start().get(); + seastar::engine().at_exit([] { + return sharded_perf_coll().stop(); + }); + local_conf().parse_config_files(conf_file_list).get(); + local_conf().parse_argv(ceph_args).get(); + if (const auto ret = pidfile_write(local_conf()->pid_file); + ret == -EACCES || ret == -EAGAIN) { + ceph_abort_msg( + "likely there is another crimson-osd instance with the same id"); + } else if (ret < 0) { + ceph_abort_msg(fmt::format("pidfile_write failed with {} {}", + ret, cpp_strerror(-ret))); + } + // just ignore SIGHUP, we don't reread settings + seastar::engine().handle_signal(SIGHUP, [] {}); + const int whoami = std::stoi(local_conf()->name.get_id()); + const auto nonce = get_nonce(); + crimson::net::MessengerRef cluster_msgr, client_msgr; + crimson::net::MessengerRef hb_front_msgr, hb_back_msgr; + for (auto [msgr, name] : {make_pair(std::ref(cluster_msgr), "cluster"s), + make_pair(std::ref(client_msgr), "client"s), + make_pair(std::ref(hb_front_msgr), "hb_front"s), + make_pair(std::ref(hb_back_msgr), "hb_back"s)}) { + msgr = crimson::net::Messenger::create(entity_name_t::OSD(whoami), name, + nonce); + if (local_conf()->ms_crc_data) { + msgr->set_crc_data(); + } + if (local_conf()->ms_crc_header) { + msgr->set_crc_header(); + } + } + osd.start_single(whoami, nonce, + cluster_msgr, client_msgr, + hb_front_msgr, hb_back_msgr).get(); + if (config.count("mkkey")) { + make_keyring().handle_exception([](std::exception_ptr) { + seastar::engine().exit(1); + }).get(); + } + if (config.count("mkfs")) { + osd.invoke_on( + 0, + &crimson::osd::OSD::mkfs, + local_conf().get_val<uuid_d>("osd_uuid"), + local_conf().get_val<uuid_d>("fsid")).get(); + } + seastar::engine().at_exit([&] { + return osd.stop(); + }); + if (config.count("mkkey") || config.count("mkfs")) { + seastar::engine().exit(0); + } else { + osd.invoke_on(0, &crimson::osd::OSD::start).get(); + } + }); + }); + } catch (...) 
{
+    seastar::fprint(std::cerr, "FATAL: Exception during startup, aborting: %s\n", std::current_exception());
+    return EXIT_FAILURE;
+  }
+}
+
+/*
+ * Local Variables:
+ * compile-command: "make -j4 \
+ *   -C ../../../build \
+ *   crimson-osd"
+ * End:
+ */
diff --git a/src/crimson/osd/objclass.cc b/src/crimson/osd/objclass.cc
new file mode 100644
index 000000000..bc3284e26
--- /dev/null
+++ b/src/crimson/osd/objclass.cc
@@ -0,0 +1,484 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <cstdarg>
+#include <cstring>
+#include "common/ceph_context.h"
+#include "common/ceph_releases.h"
+#include "common/config.h"
+#include "common/debug.h"
+
+#include "crimson/osd/exceptions.h"
+#include "crimson/osd/ops_executer.h"
+#include "crimson/osd/pg_backend.h"
+
+#include "objclass/objclass.h"
+#include "osd/ClassHandler.h"
+
+#include "auth/Crypto.h"
+#include "common/armor.h"
+
+static inline int execute_osd_op(cls_method_context_t hctx, OSDOp& op)
+{
+  // we can expect the memory under `ret` will still be fine after
+  // executing the osd op as we're running inside `seastar::thread`
+  // created for us by `seastar::async` in `::do_op_call()`.
+  int ret = 0;
+  using osd_op_errorator = crimson::osd::OpsExecuter::osd_op_errorator;
+  reinterpret_cast<crimson::osd::OpsExecuter*>(hctx)->execute_op(op).handle_error(
+    osd_op_errorator::all_same_way([&ret] (const std::error_code& err) {
+      assert(err.value() > 0);
+      ret = -err.value();
+      return seastar::now();
+    })).get(); // we're blocking here which requires `seastar::thread`.
+  return ret;
+}
+
+int cls_call(cls_method_context_t hctx, const char *cls, const char *method,
+             char *indata, int datalen,
+             char **outdata, int *outdatalen)
+{
+// FIXME, HACK: this is for testing only. Let's use the dynamic linker to verify
+// our dependencies
+  return 0;
+}
+
+int cls_getxattr(cls_method_context_t hctx,
+                 const char *name,
+                 char **outdata,
+                 int *outdatalen)
+{
+  return 0;
+}
+
+int cls_setxattr(cls_method_context_t hctx,
+                 const char *name,
+                 const char *value,
+                 int val_len)
+{
+  return 0;
+}
+
+int cls_read(cls_method_context_t hctx,
+             int ofs, int len,
+             char **outdata,
+             int *outdatalen)
+{
+  return 0;
+}
+
+int cls_get_request_origin(cls_method_context_t hctx, entity_inst_t *origin)
+{
+  assert(origin);
+
+  try {
+    const auto& message = \
+      reinterpret_cast<crimson::osd::OpsExecuter*>(hctx)->get_message();
+    *origin = message.get_orig_source_inst();
+    return 0;
+  } catch (crimson::osd::error& e) {
+    return -e.code().value();
+  }
+}
+
+int cls_cxx_create(cls_method_context_t hctx, const bool exclusive)
+{
+  OSDOp op{CEPH_OSD_OP_CREATE};
+  op.op.flags = (exclusive ?
CEPH_OSD_OP_FLAG_EXCL : 0); + return execute_osd_op(hctx, op); +} + +int cls_cxx_remove(cls_method_context_t hctx) +{ + OSDOp op{CEPH_OSD_OP_DELETE}; + return execute_osd_op(hctx, op); +} + +int cls_cxx_stat(cls_method_context_t hctx, uint64_t *size, time_t *mtime) +{ + OSDOp op{CEPH_OSD_OP_STAT}; + if (const auto ret = execute_osd_op(hctx, op); ret < 0) { + return ret; + } + utime_t ut; + uint64_t s; + try { + auto iter = op.outdata.cbegin(); + decode(s, iter); + decode(ut, iter); + } catch (buffer::error& err) { + return -EIO; + } + if (size) { + *size = s; + } + if (mtime) { + *mtime = ut.sec(); + } + return 0; +} + +int cls_cxx_stat2(cls_method_context_t hctx, + uint64_t *size, + ceph::real_time *mtime) +{ + return 0; +} + +int cls_cxx_read2(cls_method_context_t hctx, + int ofs, + int len, + bufferlist *outbl, + uint32_t op_flags) +{ + OSDOp op{CEPH_OSD_OP_SYNC_READ}; + op.op.extent.offset = ofs; + op.op.extent.length = len; + op.op.flags = op_flags; + if (const auto ret = execute_osd_op(hctx, op); ret < 0) { + return ret; + } + *outbl = std::move(op.outdata); + return outbl->length(); +} + +int cls_cxx_write2(cls_method_context_t hctx, + int ofs, + int len, + bufferlist *inbl, + uint32_t op_flags) +{ + OSDOp op{CEPH_OSD_OP_WRITE}; + op.op.extent.offset = ofs; + op.op.extent.length = len; + op.op.flags = op_flags; + op.indata = *inbl; + return execute_osd_op(hctx, op); +} + +int cls_cxx_write_full(cls_method_context_t hctx, bufferlist * const inbl) +{ + OSDOp op{CEPH_OSD_OP_WRITEFULL}; + op.op.extent.offset = 0; + op.op.extent.length = inbl->length(); + op.indata = *inbl; + return execute_osd_op(hctx, op); +} + +int cls_cxx_replace(cls_method_context_t hctx, + int ofs, + int len, + bufferlist *inbl) +{ + { + OSDOp top{CEPH_OSD_OP_TRUNCATE}; + top.op.extent.offset = 0; + top.op.extent.length = 0; + if (const auto ret = execute_osd_op(hctx, top); ret < 0) { + return ret; + } + } + + { + OSDOp wop{CEPH_OSD_OP_WRITE}; + wop.op.extent.offset = ofs; + wop.op.extent.length = len; + wop.indata = *inbl; + if (const auto ret = execute_osd_op(hctx, wop); ret < 0) { + return ret; + } + } + return 0; +} + +int cls_cxx_truncate(cls_method_context_t hctx, int ofs) +{ + OSDOp op{CEPH_OSD_OP_TRUNCATE}; + op.op.extent.offset = ofs; + op.op.extent.length = 0; + return execute_osd_op(hctx, op); +} + +int cls_cxx_write_zero(cls_method_context_t hctx, int offset, int len) +{ + OSDOp op{CEPH_OSD_OP_ZERO}; + op.op.extent.offset = offset; + op.op.extent.length = len; + return execute_osd_op(hctx, op); +} + +int cls_cxx_getxattr(cls_method_context_t hctx, + const char *name, + bufferlist *outbl) +{ + OSDOp op{CEPH_OSD_OP_GETXATTR}; + op.op.xattr.name_len = strlen(name); + op.indata.append(name, op.op.xattr.name_len); + if (const auto ret = execute_osd_op(hctx, op); ret < 0) { + return ret; + } + *outbl = std::move(op.outdata); + return outbl->length(); +} + +int cls_cxx_getxattrs(cls_method_context_t hctx, + map<string, bufferlist> *attrset) +{ + return 0; +} + +int cls_cxx_setxattr(cls_method_context_t hctx, + const char *name, + bufferlist *inbl) +{ + OSDOp op{CEPH_OSD_OP_SETXATTR}; + op.op.xattr.name_len = std::strlen(name); + op.op.xattr.value_len = inbl->length(); + op.indata.append(name, op.op.xattr.name_len); + op.indata.append(*inbl); + return execute_osd_op(hctx, op); +} + +int cls_cxx_snap_revert(cls_method_context_t hctx, snapid_t snapid) +{ + OSDOp op{op = CEPH_OSD_OP_ROLLBACK}; + op.op.snap.snapid = snapid; + return execute_osd_op(hctx, op); +} + +int cls_cxx_map_get_all_vals(cls_method_context_t 
hctx, + map<string, bufferlist>* vals, + bool *more) +{ + return 0; +} + +int cls_cxx_map_get_keys(cls_method_context_t hctx, + const std::string& start_obj, + const uint64_t max_to_get, + std::set<std::string>* const keys, + bool* const more) +{ + OSDOp op{CEPH_OSD_OP_OMAPGETKEYS}; + encode(start_obj, op.indata); + encode(max_to_get, op.indata); + if (const auto ret = execute_osd_op(hctx, op); ret < 0) { + return ret; + } + try { + auto iter = op.outdata.cbegin(); + decode(*keys, iter); + decode(*more, iter); + } catch (buffer::error&) { + return -EIO; + } + return keys->size(); +} + +int cls_cxx_map_get_vals(cls_method_context_t hctx, + const std::string& start_obj, + const std::string& filter_prefix, + const uint64_t max_to_get, + std::map<std::string, ceph::bufferlist> *vals, + bool* const more) +{ + OSDOp op{CEPH_OSD_OP_OMAPGETVALS}; + encode(start_obj, op.indata); + encode(max_to_get, op.indata); + encode(filter_prefix, op.indata); + if (const auto ret = execute_osd_op(hctx, op); ret < 0) { + return ret; + } + try { + auto iter = op.outdata.cbegin(); + decode(*vals, iter); + decode(*more, iter); + } catch (buffer::error&) { + return -EIO; + } + return vals->size(); +} + +int cls_cxx_map_get_vals_by_keys(cls_method_context_t hctx, + const std::set<std::string> &keys, + std::map<std::string, ceph::bufferlist> *vals) +{ + OSDOp op{CEPH_OSD_OP_OMAPGETVALSBYKEYS}; + encode(keys, op.indata); + if (const auto ret = execute_osd_op(hctx, op); ret < 0) { + return ret; + } + try { + auto iter = op.outdata.cbegin(); + decode(*vals, iter); + } catch (buffer::error&) { + return -EIO; + } + return 0; +} + +int cls_cxx_map_read_header(cls_method_context_t hctx, bufferlist *outbl) +{ + OSDOp op{CEPH_OSD_OP_OMAPGETHEADER}; + if (const auto ret = execute_osd_op(hctx, op); ret < 0) { + return ret; + } + *outbl = std::move(op.outdata); + return 0; +} + +int cls_cxx_map_get_val(cls_method_context_t hctx, + const string &key, + bufferlist *outbl) +{ + OSDOp op{CEPH_OSD_OP_OMAPGETVALSBYKEYS}; + { + std::set<std::string> k{key}; + encode(k, op.indata); + } + if (const auto ret = execute_osd_op(hctx, op); ret < 0) { + return ret; + } + std::map<std::string, ceph::bufferlist> m; + try { + auto iter = op.outdata.cbegin(); + decode(m, iter); + } catch (buffer::error&) { + return -EIO; + } + if (auto iter = std::begin(m); iter != std::end(m)) { + *outbl = std::move(iter->second); + return 0; + } else { + return -ENOENT; + } +} + +int cls_cxx_map_set_val(cls_method_context_t hctx, + const string &key, + bufferlist *inbl) +{ + OSDOp op{CEPH_OSD_OP_OMAPSETVALS}; + { + std::map<std::string, ceph::bufferlist> m; + m[key] = *inbl; + encode(m, op.indata); + } + return execute_osd_op(hctx, op); +} + +int cls_cxx_map_set_vals(cls_method_context_t hctx, + const std::map<string, ceph::bufferlist> *map) +{ + OSDOp op{CEPH_OSD_OP_OMAPSETVALS}; + encode(*map, op.indata); + return execute_osd_op(hctx, op); +} + +int cls_cxx_map_clear(cls_method_context_t hctx) +{ + return 0; +} + +int cls_cxx_map_write_header(cls_method_context_t hctx, bufferlist *inbl) +{ + OSDOp op{CEPH_OSD_OP_OMAPSETHEADER}; + op.indata = std::move(*inbl); + return execute_osd_op(hctx, op); +} + +int cls_cxx_map_remove_range(cls_method_context_t hctx, + const std::string& key_begin, + const std::string& key_end) +{ + OSDOp op{CEPH_OSD_OP_OMAPRMKEYRANGE}; + encode(key_begin, op.indata); + encode(key_end, op.indata); + return execute_osd_op(hctx, op); +} + +int cls_cxx_map_remove_key(cls_method_context_t hctx, const string &key) +{ + return 0; +} + +int 
cls_cxx_list_watchers(cls_method_context_t hctx, + obj_list_watch_response_t *watchers) +{ + return 0; +} + +uint64_t cls_current_version(cls_method_context_t hctx) +{ + return 0; +} + + +int cls_current_subop_num(cls_method_context_t hctx) +{ + auto* ox = reinterpret_cast<crimson::osd::OpsExecuter*>(hctx); + // in contrast to classical OSD, crimson doesn't count OP_CALL and + // OP_STAT which seems fine regarding how the plugins we take care + // about use this part of API. + return ox->get_processed_rw_ops_num(); +} + +uint64_t cls_get_features(cls_method_context_t hctx) +{ + return 0; +} + +uint64_t cls_get_client_features(cls_method_context_t hctx) +{ + try { + const auto& message = \ + reinterpret_cast<crimson::osd::OpsExecuter*>(hctx)->get_message(); + return message.get_features(); + } catch (crimson::osd::error& e) { + return -e.code().value(); + } +} + +uint64_t cls_get_pool_stripe_width(cls_method_context_t hctx) +{ + auto* ox = reinterpret_cast<crimson::osd::OpsExecuter*>(hctx); + return ox->get_pool_stripe_width(); +} + +ceph_release_t cls_get_required_osd_release(cls_method_context_t hctx) +{ + // FIXME + return ceph_release_t::nautilus; +} + +ceph_release_t cls_get_min_compatible_client(cls_method_context_t hctx) +{ + // FIXME + return ceph_release_t::nautilus; +} + +int cls_get_snapset_seq(cls_method_context_t hctx, uint64_t *snap_seq) +{ + return 0; +} + +int cls_cxx_chunk_write_and_set(cls_method_context_t hctx, + int ofs, + int len, + bufferlist *write_inbl, + uint32_t op_flags, + bufferlist *set_inbl, + int set_len) +{ + return 0; +} + +int cls_get_manifest_ref_count(cls_method_context_t hctx, string fp_oid) +{ + return 0; +} + +uint64_t cls_get_osd_min_alloc_size(cls_method_context_t hctx) { + // FIXME + return 4096; +} diff --git a/src/crimson/osd/object_context.cc b/src/crimson/osd/object_context.cc new file mode 100644 index 000000000..bbc71d3f9 --- /dev/null +++ b/src/crimson/osd/object_context.cc @@ -0,0 +1,34 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "crimson/osd/object_context.h" + +#include "common/Formatter.h" +#include "crimson/common/config_proxy.h" + +namespace crimson::osd { + +ObjectContextRegistry::ObjectContextRegistry(crimson::common::ConfigProxy &conf) +{ + obc_lru.set_target_size(conf.get_val<uint64_t>("crimson_osd_obc_lru_size")); + conf.add_observer(this); +} + +const char** ObjectContextRegistry::get_tracked_conf_keys() const +{ + static const char* KEYS[] = { + "crimson_osd_obc_lru_size", + nullptr + }; + return KEYS; +} + +void ObjectContextRegistry::handle_conf_change( + const crimson::common::ConfigProxy& conf, + const std::set <std::string> &changed) +{ + obc_lru.set_target_size(conf.get_val<uint64_t>("crimson_osd_obc_lru_size")); +} + + +} diff --git a/src/crimson/osd/object_context.h b/src/crimson/osd/object_context.h new file mode 100644 index 000000000..be238851e --- /dev/null +++ b/src/crimson/osd/object_context.h @@ -0,0 +1,189 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <map> +#include <optional> +#include <utility> +#include <seastar/core/shared_future.hh> +#include <seastar/core/shared_ptr.hh> + +#include "common/intrusive_lru.h" +#include "osd/object_state.h" +#include "crimson/common/exception.h" +#include "crimson/common/tri_mutex.h" +#include "crimson/osd/osd_operation.h" + +namespace ceph { + class Formatter; +} + +namespace crimson::common { + class ConfigProxy; +} + 
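ObjectContextRegistry above sizes its obc LRU from crimson_osd_obc_lru_size and re-applies the value whenever that option changes, by registering itself as a config observer in its constructor. The sketch below is a simplified, self-contained analogue of that observer contract; Config, ObcCache and Registry are stand-ins, not the actual ConfigProxy or md_config_obs_t interfaces.

    #include <cstdint>
    #include <map>
    #include <set>
    #include <string>

    // Stand-ins for the config proxy and the obc LRU cache.
    struct Config {
      std::map<std::string, uint64_t> values;
      uint64_t get(const std::string& key) const { return values.at(key); }
    };

    struct ObcCache {
      uint64_t target_size = 0;
      void set_target_size(uint64_t n) { target_size = n; }
    };

    struct Registry {
      ObcCache cache;

      explicit Registry(const Config& conf) {
        // mirrors ObjectContextRegistry's constructor: apply the initial value
        cache.set_target_size(conf.get("crimson_osd_obc_lru_size"));
      }

      // mirrors get_tracked_conf_keys(): the options we want change callbacks for
      std::set<std::string> tracked_keys() const {
        return {"crimson_osd_obc_lru_size"};
      }

      // mirrors handle_conf_change(): re-apply the value when it changes
      void handle_conf_change(const Config& conf,
                              const std::set<std::string>& changed) {
        if (changed.count("crimson_osd_obc_lru_size")) {
          cache.set_target_size(conf.get("crimson_osd_obc_lru_size"));
        }
      }
    };

The real observer returns its tracked keys as a null-terminated const char* array and is registered with conf.add_observer(this), as shown in object_context.cc above.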
+namespace crimson::osd { + +class Watch; + +template <typename OBC> +struct obc_to_hoid { + using type = hobject_t; + const type &operator()(const OBC &obc) { + return obc.obs.oi.soid; + } +}; + +class ObjectContext : public ceph::common::intrusive_lru_base< + ceph::common::intrusive_lru_config< + hobject_t, ObjectContext, obc_to_hoid<ObjectContext>>> +{ +public: + Ref head; // Ref defined as part of ceph::common::intrusive_lru_base + ObjectState obs; + std::optional<SnapSet> ss; + bool loaded : 1; + // the watch / notify machinery rather stays away from the hot and + // frequented paths. std::map is used mostly because of developer's + // convenience. + using watch_key_t = std::pair<uint64_t, entity_name_t>; + std::map<watch_key_t, seastar::shared_ptr<crimson::osd::Watch>> watchers; + + ObjectContext(const hobject_t &hoid) : obs(hoid), loaded(false) {} + + const hobject_t &get_oid() const { + return obs.oi.soid; + } + + bool is_head() const { + return get_oid().is_head(); + } + + const SnapSet &get_ro_ss() const { + if (is_head()) { + ceph_assert(ss); + return *ss; + } else { + ceph_assert(head); + return head->get_ro_ss(); + } + } + + void set_head_state(ObjectState &&_obs, SnapSet &&_ss) { + ceph_assert(is_head()); + obs = std::move(_obs); + ss = std::move(_ss); + loaded = true; + } + + void set_clone_state(ObjectState &&_obs, Ref &&_head) { + ceph_assert(!is_head()); + obs = std::move(_obs); + head = _head; + loaded = true; + } + + /// pass the provided exception to any waiting consumers of this ObjectContext + template<typename Exception> + void interrupt(Exception ex) { + lock.abort(std::move(ex)); + if (recovery_read_marker) { + drop_recovery_read(); + } + } + +private: + tri_mutex lock; + bool recovery_read_marker = false; + + template <typename Lock, typename Func> + auto _with_lock(Lock&& lock, Func&& func) { + Ref obc = this; + return lock.lock().then([&lock, func = std::forward<Func>(func), obc]() mutable { + return seastar::futurize_invoke(func).finally([&lock, obc] { + lock.unlock(); + }); + }); + } + +public: + template<RWState::State Type, typename Func> + auto with_lock(Func&& func) { + switch (Type) { + case RWState::RWWRITE: + return _with_lock(lock.for_write(), std::forward<Func>(func)); + case RWState::RWREAD: + return _with_lock(lock.for_read(), std::forward<Func>(func)); + case RWState::RWEXCL: + return _with_lock(lock.for_excl(), std::forward<Func>(func)); + case RWState::RWNONE: + return seastar::futurize_invoke(std::forward<Func>(func)); + default: + assert(0 == "noop"); + } + } + template<RWState::State Type, typename Func> + auto with_promoted_lock(Func&& func) { + switch (Type) { + case RWState::RWWRITE: + return _with_lock(lock.excl_from_write(), std::forward<Func>(func)); + case RWState::RWREAD: + return _with_lock(lock.excl_from_read(), std::forward<Func>(func)); + case RWState::RWEXCL: + return _with_lock(lock.excl_from_excl(), std::forward<Func>(func)); + case RWState::RWNONE: + return _with_lock(lock.for_excl(), std::forward<Func>(func)); + default: + assert(0 == "noop"); + } + } + + bool empty() const { + return !lock.is_acquired(); + } + bool is_request_pending() const { + return lock.is_acquired(); + } + + bool get_recovery_read() { + if (lock.try_lock_for_read()) { + recovery_read_marker = true; + return true; + } else { + return false; + } + } + void wait_recovery_read() { + assert(lock.get_readers() > 0); + recovery_read_marker = true; + } + void drop_recovery_read() { + assert(recovery_read_marker); + recovery_read_marker = false; + } + bool 
maybe_get_excl() { + return lock.try_lock_for_excl(); + } +}; +using ObjectContextRef = ObjectContext::Ref; + +class ObjectContextRegistry : public md_config_obs_t { + ObjectContext::lru_t obc_lru; + +public: + ObjectContextRegistry(crimson::common::ConfigProxy &conf); + + std::pair<ObjectContextRef, bool> get_cached_obc(const hobject_t &hoid) { + return obc_lru.get_or_create(hoid); + } + ObjectContextRef maybe_get_cached_obc(const hobject_t &hoid) { + return obc_lru.get(hoid); + } + + const char** get_tracked_conf_keys() const final; + void handle_conf_change(const crimson::common::ConfigProxy& conf, + const std::set <std::string> &changed) final; +}; + +} // namespace crimson::osd diff --git a/src/crimson/osd/ops_executer.cc b/src/crimson/osd/ops_executer.cc new file mode 100644 index 000000000..6b6614e93 --- /dev/null +++ b/src/crimson/osd/ops_executer.cc @@ -0,0 +1,980 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "ops_executer.h" + +#include <boost/range/adaptor/filtered.hpp> +#include <boost/range/adaptor/map.hpp> +#include <boost/range/adaptor/transformed.hpp> +#include <boost/range/algorithm_ext/push_back.hpp> +#include <boost/range/algorithm/max_element.hpp> +#include <boost/range/numeric.hpp> + +#include <fmt/format.h> +#include <fmt/ostream.h> + +#include <seastar/core/thread.hh> + +#include "crimson/osd/exceptions.h" +#include "crimson/osd/pg.h" +#include "crimson/osd/watch.h" +#include "osd/ClassHandler.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_osd); + } +} + +namespace crimson::osd { + +OpsExecuter::call_errorator::future<> OpsExecuter::do_op_call(OSDOp& osd_op) +{ + std::string cname, mname; + ceph::bufferlist indata; + try { + auto bp = std::begin(osd_op.indata); + bp.copy(osd_op.op.cls.class_len, cname); + bp.copy(osd_op.op.cls.method_len, mname); + bp.copy(osd_op.op.cls.indata_len, indata); + } catch (buffer::error&) { + logger().warn("call unable to decode class + method + indata"); + return crimson::ct_error::invarg::make(); + } + + // NOTE: opening a class can actually result in dlopen(), and thus + // blocking the entire reactor. Thankfully to ClassHandler's cache + // this is supposed to be extremely infrequent. 
+  ClassHandler::ClassData* cls;
+  int r = ClassHandler::get_instance().open_class(cname, &cls);
+  if (r) {
+    logger().warn("class {} open got {}", cname, cpp_strerror(r));
+    if (r == -ENOENT) {
+      return crimson::ct_error::operation_not_supported::make();
+    } else if (r == -EPERM) {
+      // propagate permission errors
+      return crimson::ct_error::permission_denied::make();
+    }
+    return crimson::ct_error::input_output_error::make();
+  }
+
+  ClassHandler::ClassMethod* method = cls->get_method(mname);
+  if (!method) {
+    logger().warn("call method {}.{} does not exist", cname, mname);
+    return crimson::ct_error::operation_not_supported::make();
+  }
+
+  const auto flags = method->get_flags();
+  if (!obc->obs.exists && (flags & CLS_METHOD_WR) == 0) {
+    return crimson::ct_error::enoent::make();
+  }
+
+#if 0
+  if (flags & CLS_METHOD_WR) {
+    ctx->user_modify = true;
+  }
+#endif
+
+  logger().debug("calling method {}.{}, num_read={}, num_write={}",
+                 cname, mname, num_read, num_write);
+  const auto prev_rd = num_read;
+  const auto prev_wr = num_write;
+  return seastar::async(
+    [this, method, indata=std::move(indata)]() mutable {
+      ceph::bufferlist outdata;
+      auto cls_context = reinterpret_cast<cls_method_context_t>(this);
+      const auto ret = method->exec(cls_context, indata, outdata);
+      return std::make_pair(ret, std::move(outdata));
+    }
+  ).then(
+    [this, prev_rd, prev_wr, &osd_op, flags]
+    (auto outcome) -> call_errorator::future<> {
+      auto& [ret, outdata] = outcome;
+      osd_op.rval = ret;
+
+      logger().debug("do_op_call: method returned ret={}, outdata.length()={}"
+                     " while num_read={}, num_write={}",
+                     ret, outdata.length(), num_read, num_write);
+      if (num_read > prev_rd && !(flags & CLS_METHOD_RD)) {
+        logger().error("method tried to read object but is not marked RD");
+        osd_op.rval = -EIO;
+        return crimson::ct_error::input_output_error::make();
+      }
+      if (num_write > prev_wr && !(flags & CLS_METHOD_WR)) {
+        logger().error("method tried to update object but is not marked WR");
+        osd_op.rval = -EIO;
+        return crimson::ct_error::input_output_error::make();
+      }
+      // ceph-osd has this implemented in `PrimaryLogPG::execute_ctx`,
+      // grep for `ignore_out_data`.
+      using crimson::common::local_conf;
+      if (op_info.allows_returnvec() &&
+          op_info.may_write() &&
+          ret >= 0 &&
+          outdata.length() > local_conf()->osd_max_write_op_reply_len) {
+        // the justification of this limit is to not inflate the pg log.
+        // that's the reason why we don't worry about pure reads.
+        logger().error("outdata overflow due to .length()={}, limit={}",
+                       outdata.length(),
+                       local_conf()->osd_max_write_op_reply_len);
+        osd_op.rval = -EOVERFLOW;
+        return crimson::ct_error::value_too_large::make();
+      }
+      // for write calls we never return data except for errors or RETURNVEC.
+      // please refer to cls/cls_hello.cc for details.
+      if (!op_info.may_write() || op_info.allows_returnvec() || ret < 0) {
+        osd_op.op.extent.length = outdata.length();
+        osd_op.outdata.claim_append(outdata);
+      }
+      if (ret < 0) {
+        return crimson::stateful_ec{
+          std::error_code(-ret, std::generic_category()) };
+      } else {
+        return seastar::now();
+      }
+    }
+  );
+}
+
+static watch_info_t create_watch_info(const OSDOp& osd_op,
+                                      const MOSDOp& msg)
+{
+  using crimson::common::local_conf;
+  const uint32_t timeout =
+    osd_op.op.watch.timeout == 0 ?
local_conf()->osd_client_watch_timeout + : osd_op.op.watch.timeout; + return { + osd_op.op.watch.cookie, + timeout, + msg.get_connection()->get_peer_addr() + }; +} + +OpsExecuter::watch_errorator::future<> OpsExecuter::do_op_watch_subop_watch( + OSDOp& osd_op, + ObjectState& os, + ceph::os::Transaction& txn) +{ + struct connect_ctx_t { + ObjectContext::watch_key_t key; + crimson::net::ConnectionRef conn; + watch_info_t info; + + connect_ctx_t(const OSDOp& osd_op, const MOSDOp& msg) + : key(osd_op.op.watch.cookie, msg.get_reqid().name), + conn(msg.get_connection()), + info(create_watch_info(osd_op, msg)) { + } + }; + return with_effect_on_obc(connect_ctx_t{ osd_op, get_message() }, + [&] (auto& ctx) { + const auto& entity = ctx.key.second; + auto [it, emplaced] = + os.oi.watchers.try_emplace(ctx.key, std::move(ctx.info)); + if (emplaced) { + logger().info("registered new watch {} by {}", it->second, entity); + txn.nop(); + } else { + logger().info("found existing watch {} by {}", it->second, entity); + } + return seastar::now(); + }, + [] (auto&& ctx, ObjectContextRef obc) { + auto [it, emplaced] = obc->watchers.try_emplace(ctx.key, nullptr); + if (emplaced) { + const auto& [cookie, entity] = ctx.key; + it->second = crimson::osd::Watch::create(obc, ctx.info, entity); + logger().info("op_effect: added new watcher: {}", ctx.key); + } else { + logger().info("op_effect: found existing watcher: {}", ctx.key); + } + return it->second->connect(std::move(ctx.conn), true /* will_ping */); + }); +} + +OpsExecuter::watch_errorator::future<> OpsExecuter::do_op_watch_subop_reconnect( + OSDOp& osd_op, + ObjectState& os, + ceph::os::Transaction& txn) +{ + const entity_name_t& entity = get_message().get_reqid().name; + const auto& cookie = osd_op.op.watch.cookie; + if (!os.oi.watchers.count(std::make_pair(cookie, entity))) { + return crimson::ct_error::not_connected::make(); + } else { + logger().info("found existing watch by {}", entity); + return do_op_watch_subop_watch(osd_op, os, txn); + } +} + +OpsExecuter::watch_errorator::future<> OpsExecuter::do_op_watch_subop_unwatch( + OSDOp& osd_op, + ObjectState& os, + ceph::os::Transaction& txn) +{ + logger().info("{}", __func__); + + struct disconnect_ctx_t { + ObjectContext::watch_key_t key; + bool send_disconnect{ false }; + + disconnect_ctx_t(const OSDOp& osd_op, const MOSDOp& msg) + : key(osd_op.op.watch.cookie, msg.get_reqid().name) { + } + }; + return with_effect_on_obc(disconnect_ctx_t{ osd_op, get_message() }, + [&] (auto& ctx) { + const auto& entity = ctx.key.second; + if (auto nh = os.oi.watchers.extract(ctx.key); !nh.empty()) { + logger().info("removed watch {} by {}", nh.mapped(), entity); + txn.nop(); + } else { + logger().info("can't remove: no watch by {}", entity); + } + return seastar::now(); + }, + [] (auto&& ctx, ObjectContextRef obc) { + if (auto nh = obc->watchers.extract(ctx.key); !nh.empty()) { + return seastar::do_with(std::move(nh.mapped()), + [ctx](auto&& watcher) { + logger().info("op_effect: disconnect watcher {}", ctx.key); + return watcher->remove(ctx.send_disconnect); + }); + } else { + logger().info("op_effect: disconnect failed to find watcher {}", ctx.key); + return seastar::now(); + } + }); +} + +OpsExecuter::watch_errorator::future<> OpsExecuter::do_op_watch_subop_ping( + OSDOp& osd_op, + ObjectState& os, + ceph::os::Transaction& txn) +{ + const entity_name_t& entity = get_message().get_reqid().name; + const auto& cookie = osd_op.op.watch.cookie; + const auto key = std::make_pair(cookie, entity); + + // Note: WATCH with 
PING doesn't cause may_write() to return true, + // so if there is nothing else in the transaction, this is going + // to run do_osd_op_effects, but not write out a log entry */ + if (!os.oi.watchers.count(key)) { + return crimson::ct_error::not_connected::make(); + } + auto it = obc->watchers.find(key); + if (it == std::end(obc->watchers) || !it->second->is_connected()) { + return crimson::ct_error::timed_out::make(); + } + logger().info("found existing watch by {}", entity); + it->second->got_ping(ceph_clock_now()); + return seastar::now(); +} + +OpsExecuter::watch_errorator::future<> OpsExecuter::do_op_watch( + OSDOp& osd_op, + ObjectState& os, + ceph::os::Transaction& txn) +{ + logger().debug("{}", __func__); + if (!os.exists) { + return crimson::ct_error::enoent::make(); + } + switch (osd_op.op.watch.op) { + case CEPH_OSD_WATCH_OP_WATCH: + return do_op_watch_subop_watch(osd_op, os, txn); + case CEPH_OSD_WATCH_OP_RECONNECT: + return do_op_watch_subop_reconnect(osd_op, os, txn); + case CEPH_OSD_WATCH_OP_PING: + return do_op_watch_subop_ping(osd_op, os, txn); + case CEPH_OSD_WATCH_OP_UNWATCH: + return do_op_watch_subop_unwatch(osd_op, os, txn); + case CEPH_OSD_WATCH_OP_LEGACY_WATCH: + logger().warn("ignoring CEPH_OSD_WATCH_OP_LEGACY_WATCH"); + return crimson::ct_error::invarg::make(); + } + logger().warn("unrecognized WATCH subop: {}", osd_op.op.watch.op); + return crimson::ct_error::invarg::make(); +} + +static uint64_t get_next_notify_id(epoch_t e) +{ + // FIXME + static std::uint64_t next_notify_id = 0; + return (((uint64_t)e) << 32) | ((uint64_t)(next_notify_id++)); +} + +OpsExecuter::watch_errorator::future<> OpsExecuter::do_op_notify( + OSDOp& osd_op, + const ObjectState& os) +{ + logger().debug("{}, msg epoch: {}", __func__, get_message().get_map_epoch()); + + if (!os.exists) { + return crimson::ct_error::enoent::make(); + } + struct notify_ctx_t { + crimson::net::ConnectionRef conn; + notify_info_t ninfo; + const uint64_t client_gid; + const epoch_t epoch; + + notify_ctx_t(const MOSDOp& msg) + : conn(msg.get_connection()), + client_gid(msg.get_reqid().name.num()), + epoch(msg.get_map_epoch()) { + } + }; + return with_effect_on_obc(notify_ctx_t{ get_message() }, + [&] (auto& ctx) { + try { + auto bp = osd_op.indata.cbegin(); + uint32_t ver; // obsolete + ceph::decode(ver, bp); + ceph::decode(ctx.ninfo.timeout, bp); + ceph::decode(ctx.ninfo.bl, bp); + } catch (const buffer::error&) { + ctx.ninfo.timeout = 0; + } + if (!ctx.ninfo.timeout) { + using crimson::common::local_conf; + ctx.ninfo.timeout = local_conf()->osd_default_notify_timeout; + } + ctx.ninfo.notify_id = get_next_notify_id(ctx.epoch); + ctx.ninfo.cookie = osd_op.op.notify.cookie; + // return our unique notify id to the client + ceph::encode(ctx.ninfo.notify_id, osd_op.outdata); + return seastar::now(); + }, + [] (auto&& ctx, ObjectContextRef obc) { + auto alive_watchers = obc->watchers | boost::adaptors::map_values + | boost::adaptors::filtered( + [] (const auto& w) { + // FIXME: filter as for the `is_ping` in `Watch::start_notify` + return w->is_alive(); + }); + return crimson::osd::Notify::create_n_propagate( + std::begin(alive_watchers), + std::end(alive_watchers), + std::move(ctx.conn), + ctx.ninfo, + ctx.client_gid, + obc->obs.oi.user_version); + }); +} + +OpsExecuter::watch_errorator::future<> OpsExecuter::do_op_notify_ack( + OSDOp& osd_op, + const ObjectState& os) +{ + logger().debug("{}", __func__); + + struct notifyack_ctx_t { + const entity_name_t entity; + uint64_t watch_cookie; + uint64_t notify_id; + 
ceph::bufferlist reply_bl; + + notifyack_ctx_t(const MOSDOp& msg) : entity(msg.get_reqid().name) { + } + }; + return with_effect_on_obc(notifyack_ctx_t{ get_message() }, + [&] (auto& ctx) -> watch_errorator::future<> { + try { + auto bp = osd_op.indata.cbegin(); + ceph::decode(ctx.notify_id, bp); + ceph::decode(ctx.watch_cookie, bp); + if (!bp.end()) { + ceph::decode(ctx.reply_bl, bp); + } + } catch (const buffer::error&) { + // here we behave differently than ceph-osd. For historical reasons, + // it falls back to using `osd_op.op.watch.cookie` as `ctx.notify_id`. + // crimson just returns EINVAL if the data cannot be decoded. + return crimson::ct_error::invarg::make(); + } + return watch_errorator::now(); + }, + [] (auto&& ctx, ObjectContextRef obc) { + logger().info("notify_ack watch_cookie={}, notify_id={}", + ctx.watch_cookie, ctx.notify_id); + return seastar::do_for_each(obc->watchers, + [ctx=std::move(ctx)] (auto& kv) { + const auto& [key, watchp] = kv; + static_assert( + std::is_same_v<std::decay_t<decltype(watchp)>, + seastar::shared_ptr<crimson::osd::Watch>>); + auto& [cookie, entity] = key; + if (ctx.entity != entity) { + logger().debug("skipping watch {}; entity name {} != {}", + key, entity, ctx.entity); + return seastar::now(); + } + if (ctx.watch_cookie != cookie) { + logger().debug("skipping watch {}; cookie {} != {}", + key, ctx.watch_cookie, cookie); + return seastar::now(); + } + logger().info("acking notify on watch {}", key); + return watchp->notify_ack(ctx.notify_id, ctx.reply_bl); + }); + }); +} + +OpsExecuter::osd_op_errorator::future<> +OpsExecuter::execute_op(OSDOp& osd_op) +{ + // TODO: dispatch via call table? + // TODO: we might want to find a way to unify both input and output + // of each op. + logger().debug( + "handling op {} on object {}", + ceph_osd_op_name(osd_op.op.op), + get_target()); + switch (const ceph_osd_op& op = osd_op.op; op.op) { + case CEPH_OSD_OP_SYNC_READ: + [[fallthrough]]; + case CEPH_OSD_OP_READ: + return do_read_op([&osd_op] (auto& backend, const auto& os) { + return backend.read(os, osd_op); + }); + case CEPH_OSD_OP_SPARSE_READ: + return do_read_op([&osd_op] (auto& backend, const auto& os) { + return backend.sparse_read(os, osd_op); + }); + case CEPH_OSD_OP_CHECKSUM: + return do_read_op([&osd_op] (auto& backend, const auto& os) { + return backend.checksum(os, osd_op); + }); + case CEPH_OSD_OP_CMPEXT: + return do_read_op([&osd_op] (auto& backend, const auto& os) { + return backend.cmp_ext(os, osd_op); + }); + case CEPH_OSD_OP_GETXATTR: + return do_read_op([&osd_op] (auto& backend, const auto& os) { + return backend.getxattr(os, osd_op); + }); + case CEPH_OSD_OP_GETXATTRS: + return do_read_op([&osd_op] (auto& backend, const auto& os) { + return backend.get_xattrs(os, osd_op); + }); + case CEPH_OSD_OP_RMXATTR: + return do_write_op([&osd_op] (auto& backend, auto& os, auto& txn) { + return backend.rm_xattr(os, osd_op, txn); + }, true); + case CEPH_OSD_OP_CREATE: + return do_write_op([&osd_op] (auto& backend, auto& os, auto& txn) { + return backend.create(os, osd_op, txn); + }, true); + case CEPH_OSD_OP_WRITE: + return do_write_op([this, &osd_op] (auto& backend, auto& os, auto& txn) { + return backend.write(os, osd_op, txn, *osd_op_params); + }, true); + case CEPH_OSD_OP_WRITESAME: + return do_write_op([this, &osd_op] (auto& backend, auto& os, auto& txn) { + return backend.write_same(os, osd_op, txn, *osd_op_params); + }, true); + case CEPH_OSD_OP_WRITEFULL: + return do_write_op([this, &osd_op] (auto& backend, auto& os, auto& txn) { + return 
backend.writefull(os, osd_op, txn, *osd_op_params); + }, true); + case CEPH_OSD_OP_APPEND: + return do_write_op([this, &osd_op] (auto& backend, auto& os, auto& txn) { + return backend.append(os, osd_op, txn, *osd_op_params); + }, true); + case CEPH_OSD_OP_TRUNCATE: + return do_write_op([this, &osd_op] (auto& backend, auto& os, auto& txn) { + // FIXME: rework needed. Move this out to do_write_op(), introduce + // do_write_op_no_user_modify()... + return backend.truncate(os, osd_op, txn, *osd_op_params); + }, true); + case CEPH_OSD_OP_ZERO: + return do_write_op([this, &osd_op] (auto& backend, auto& os, auto& txn) { + return backend.zero(os, osd_op, txn, *osd_op_params); + }, true); + case CEPH_OSD_OP_SETALLOCHINT: + return osd_op_errorator::now(); + case CEPH_OSD_OP_SETXATTR: + return do_write_op([&osd_op] (auto& backend, auto& os, auto& txn) { + return backend.setxattr(os, osd_op, txn); + }, true); + case CEPH_OSD_OP_DELETE: + return do_write_op([] (auto& backend, auto& os, auto& txn) { + return backend.remove(os, txn); + }, true); + case CEPH_OSD_OP_CALL: + return this->do_op_call(osd_op); + case CEPH_OSD_OP_STAT: + // note: stat does not require RD + return do_const_op([&osd_op] (/* const */auto& backend, const auto& os) { + return backend.stat(os, osd_op); + }); + case CEPH_OSD_OP_TMAPUP: + // TODO: there was an effort to kill TMAP in ceph-osd. According to + // @dzafman this isn't possible yet. Maybe it could be accomplished + // before crimson's readiness and we'd luckily don't need to carry. + return dont_do_legacy_op(); + + // OMAP + case CEPH_OSD_OP_OMAPGETKEYS: + return do_read_op([&osd_op] (auto& backend, const auto& os) { + return backend.omap_get_keys(os, osd_op); + }); + case CEPH_OSD_OP_OMAPGETVALS: + return do_read_op([&osd_op] (auto& backend, const auto& os) { + return backend.omap_get_vals(os, osd_op); + }); + case CEPH_OSD_OP_OMAPGETHEADER: + return do_read_op([&osd_op] (auto& backend, const auto& os) { + return backend.omap_get_header(os, osd_op); + }); + case CEPH_OSD_OP_OMAPGETVALSBYKEYS: + return do_read_op([&osd_op] (auto& backend, const auto& os) { + return backend.omap_get_vals_by_keys(os, osd_op); + }); + case CEPH_OSD_OP_OMAPSETVALS: +#if 0 + if (!pg.get_pool().info.supports_omap()) { + return crimson::ct_error::operation_not_supported::make(); + } +#endif + return do_write_op([this, &osd_op] (auto& backend, auto& os, auto& txn) { + return backend.omap_set_vals(os, osd_op, txn, *osd_op_params); + }, true); + case CEPH_OSD_OP_OMAPSETHEADER: +#if 0 + if (!pg.get_pool().info.supports_omap()) { + return crimson::ct_error::operation_not_supported::make(); + } +#endif + return do_write_op([&osd_op] (auto& backend, auto& os, auto& txn) { + return backend.omap_set_header(os, osd_op, txn); + }, true); + case CEPH_OSD_OP_OMAPRMKEYRANGE: +#if 0 + if (!pg.get_pool().info.supports_omap()) { + return crimson::ct_error::operation_not_supported::make(); + } +#endif + return do_write_op([&osd_op] (auto& backend, auto& os, auto& txn) { + return backend.omap_remove_range(os, osd_op, txn); + }, true); + case CEPH_OSD_OP_OMAPCLEAR: + return do_write_op([this, &osd_op] (auto& backend, auto& os, auto& txn) { + return backend.omap_clear(os, osd_op, txn, *osd_op_params); + }, true); + + // watch/notify + case CEPH_OSD_OP_WATCH: + return do_write_op([this, &osd_op] (auto& backend, auto& os, auto& txn) { + return do_op_watch(osd_op, os, txn); + }, false); + case CEPH_OSD_OP_NOTIFY: + return do_read_op([this, &osd_op] (auto&, const auto& os) { + return do_op_notify(osd_op, os); + }); + case 
CEPH_OSD_OP_NOTIFY_ACK: + return do_read_op([this, &osd_op] (auto&, const auto& os) { + return do_op_notify_ack(osd_op, os); + }); + + default: + logger().warn("unknown op {}", ceph_osd_op_name(op.op)); + throw std::runtime_error( + fmt::format("op '{}' not supported", ceph_osd_op_name(op.op))); + } +} + +static inline std::unique_ptr<const PGLSFilter> get_pgls_filter( + const std::string& type, + bufferlist::const_iterator& iter) +{ + // storing non-const PGLSFilter for the sake of ::init() + std::unique_ptr<PGLSFilter> filter; + if (type.compare("plain") == 0) { + filter = std::make_unique<PGLSPlainFilter>(); + } else { + std::size_t dot = type.find("."); + if (dot == type.npos || dot == 0 || dot == type.size() - 1) { + throw crimson::osd::invalid_argument{}; + } + + const std::string class_name = type.substr(0, dot); + const std::string filter_name = type.substr(dot + 1); + ClassHandler::ClassData *cls = nullptr; + int r = ClassHandler::get_instance().open_class(class_name, &cls); + if (r != 0) { + logger().warn("can't open class {}: {}", class_name, cpp_strerror(r)); + if (r == -EPERM) { + // propogate permission error + throw crimson::osd::permission_denied{}; + } else { + throw crimson::osd::invalid_argument{}; + } + } else { + ceph_assert(cls); + } + + ClassHandler::ClassFilter * const class_filter = cls->get_filter(filter_name); + if (class_filter == nullptr) { + logger().warn("can't find filter {} in class {}", filter_name, class_name); + throw crimson::osd::invalid_argument{}; + } + + filter.reset(class_filter->fn()); + if (!filter) { + // Object classes are obliged to return us something, but let's + // give an error rather than asserting out. + logger().warn("buggy class {} failed to construct filter {}", + class_name, filter_name); + throw crimson::osd::invalid_argument{}; + } + } + + ceph_assert(filter); + int r = filter->init(iter); + if (r < 0) { + logger().warn("error initializing filter {}: {}", type, cpp_strerror(r)); + throw crimson::osd::invalid_argument{}; + } + + // successfully constructed and initialized, return it. + return filter; +} + +static seastar::future<hobject_t> pgls_filter( + const PGLSFilter& filter, + const PGBackend& backend, + const hobject_t& sobj) +{ + if (const auto xattr = filter.get_xattr(); !xattr.empty()) { + logger().debug("pgls_filter: filter is interested in xattr={} for obj={}", + xattr, sobj); + return backend.getxattr(sobj, xattr).safe_then( + [&filter, sobj] (ceph::bufferptr bp) { + logger().debug("pgls_filter: got xvalue for obj={}", sobj); + + ceph::bufferlist val; + val.push_back(std::move(bp)); + const bool filtered = filter.filter(sobj, val); + return seastar::make_ready_future<hobject_t>(filtered ? sobj : hobject_t{}); + }, PGBackend::get_attr_errorator::all_same_way([&filter, sobj] { + logger().debug("pgls_filter: got error for obj={}", sobj); + + if (filter.reject_empty_xattr()) { + return seastar::make_ready_future<hobject_t>(hobject_t{}); + } + ceph::bufferlist val; + const bool filtered = filter.filter(sobj, val); + return seastar::make_ready_future<hobject_t>(filtered ? sobj : hobject_t{}); + })); + } else { + ceph::bufferlist empty_lvalue_bl; + const bool filtered = filter.filter(sobj, empty_lvalue_bl); + return seastar::make_ready_future<hobject_t>(filtered ? 
sobj : hobject_t{}); + } +} + +static seastar::future<ceph::bufferlist> do_pgnls_common( + const hobject_t& pg_start, + const hobject_t& pg_end, + const PGBackend& backend, + const hobject_t& lower_bound, + const std::string& nspace, + const uint64_t limit, + const PGLSFilter* const filter) +{ + if (!(lower_bound.is_min() || + lower_bound.is_max() || + (lower_bound >= pg_start && lower_bound < pg_end))) { + // this should only happen with a buggy client. + throw std::invalid_argument("outside of PG bounds"); + } + + return backend.list_objects(lower_bound, limit).then( + [&backend, filter, nspace](auto&& ret) { + auto& [objects, next] = ret; + auto in_my_namespace = [&nspace](const hobject_t& obj) { + using crimson::common::local_conf; + if (obj.get_namespace() == local_conf()->osd_hit_set_namespace) { + return false; + } else if (nspace == librados::all_nspaces) { + return true; + } else { + return obj.get_namespace() == nspace; + } + }; + auto to_pglsed = [&backend, filter] (const hobject_t& obj) { + // this transformation looks costly. However, I don't have any + // reason to think PGLS* operations are critical for, let's say, + // general performance. + // + // from tchaikov: "another way is to use seastar::map_reduce(), + // to 1) save the effort to filter the already filtered objects + // 2) avoid the space to keep the tuple<bool, object> even if + // the object is filtered out". + if (filter) { + return pgls_filter(*filter, backend, obj); + } else { + return seastar::make_ready_future<hobject_t>(obj); + } + }; + + auto range = objects | boost::adaptors::filtered(in_my_namespace) + | boost::adaptors::transformed(to_pglsed); + logger().debug("do_pgnls_common: finishing the 1st stage of pgls"); + return seastar::when_all_succeed(std::begin(range), + std::end(range)).then( + [next=std::move(next)] (auto items) mutable { + // the sole purpose of this chaining is to pass `next` to 2nd + // stage altogether with items + logger().debug("do_pgnls_common: 1st done"); + return seastar::make_ready_future< + std::tuple<std::vector<hobject_t>, hobject_t>>( + std::make_tuple(std::move(items), std::move(next))); + }); + }).then( + [pg_end] (auto&& ret) { + auto& [items, next] = ret; + auto is_matched = [] (const auto& obj) { + return !obj.is_min(); + }; + auto to_entry = [] (const auto& obj) { + return librados::ListObjectImpl{ + obj.get_namespace(), obj.oid.name, obj.get_key() + }; + }; + + pg_nls_response_t response; + boost::push_back(response.entries, items | boost::adaptors::filtered(is_matched) + | boost::adaptors::transformed(to_entry)); + response.handle = next.is_max() ? 
pg_end : next; + ceph::bufferlist out; + encode(response, out); + logger().debug("{}: response.entries.size()={}", + __func__, response.entries.size()); + return seastar::make_ready_future<ceph::bufferlist>(std::move(out)); + }); +} + +static seastar::future<> do_pgnls( + const PG& pg, + const std::string& nspace, + OSDOp& osd_op) +{ + hobject_t lower_bound; + try { + ceph::decode(lower_bound, osd_op.indata); + } catch (const buffer::error&) { + throw std::invalid_argument("unable to decode PGNLS handle"); + } + const auto pg_start = pg.get_pgid().pgid.get_hobj_start(); + const auto pg_end = \ + pg.get_pgid().pgid.get_hobj_end(pg.get_pool().info.get_pg_num()); + return do_pgnls_common(pg_start, + pg_end, + pg.get_backend(), + lower_bound, + nspace, + osd_op.op.pgls.count, + nullptr /* no filter */) + .then([&osd_op](bufferlist bl) { + osd_op.outdata = std::move(bl); + return seastar::now(); + }); +} + +static seastar::future<> do_pgnls_filtered( + const PG& pg, + const std::string& nspace, + OSDOp& osd_op) +{ + std::string cname, mname, type; + auto bp = osd_op.indata.cbegin(); + try { + ceph::decode(cname, bp); + ceph::decode(mname, bp); + ceph::decode(type, bp); + } catch (const buffer::error&) { + throw crimson::osd::invalid_argument{}; + } + + auto filter = get_pgls_filter(type, bp); + + hobject_t lower_bound; + try { + lower_bound.decode(bp); + } catch (const buffer::error&) { + throw std::invalid_argument("unable to decode PGNLS_FILTER description"); + } + + logger().debug("{}: cname={}, mname={}, type={}, lower_bound={}, filter={}", + __func__, cname, mname, type, lower_bound, + static_cast<const void*>(filter.get())); + return seastar::do_with(std::move(filter), + [&, lower_bound=std::move(lower_bound)](auto&& filter) { + const auto pg_start = pg.get_pgid().pgid.get_hobj_start(); + const auto pg_end = pg.get_pgid().pgid.get_hobj_end(pg.get_pool().info.get_pg_num()); + return do_pgnls_common(pg_start, + pg_end, + pg.get_backend(), + lower_bound, + nspace, + osd_op.op.pgls.count, + filter.get()) + .then([&osd_op](bufferlist bl) { + osd_op.outdata = std::move(bl); + return seastar::now(); + }); + }); +} + +static seastar::future<ceph::bufferlist> do_pgls_common( + const hobject_t& pg_start, + const hobject_t& pg_end, + const PGBackend& backend, + const hobject_t& lower_bound, + const std::string& nspace, + const uint64_t limit, + const PGLSFilter* const filter) +{ + if (!(lower_bound.is_min() || + lower_bound.is_max() || + (lower_bound >= pg_start && lower_bound < pg_end))) { + // this should only happen with a buggy client.
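+    // the handle has to be MIN, MAX or a position inside this PG's
+    // [pg_start, pg_end) range; anything else indicates a malformed request.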
+ throw std::invalid_argument("outside of PG bounds"); + } + + using entries_t = decltype(pg_ls_response_t::entries); + return backend.list_objects(lower_bound, limit).then( + [&backend, filter, nspace](auto&& ret) { + auto& [objects, next] = ret; + return seastar::when_all( + seastar::map_reduce(std::move(objects), + [&backend, filter, nspace](const hobject_t& obj) { + if (obj.get_namespace() == nspace) { + if (filter) { + return pgls_filter(*filter, backend, obj); + } else { + return seastar::make_ready_future<hobject_t>(obj); + } + } else { + return seastar::make_ready_future<hobject_t>(hobject_t{}); + } + }, + entries_t{}, + [](entries_t entries, hobject_t obj) { + if (!obj.is_min()) { + entries.emplace_back(obj.oid, obj.get_key()); + } + return entries; + }), + seastar::make_ready_future<hobject_t>(next)); + }).then([pg_end](auto&& ret) { + auto entries = std::move(std::get<0>(ret).get0()); + auto next = std::move(std::get<1>(ret).get0()); + pg_ls_response_t response; + response.handle = next.is_max() ? pg_end : next; + response.entries = std::move(entries); + ceph::bufferlist out; + encode(response, out); + logger().debug("{}: response.entries.size()={}", + __func__, response.entries.size()); + return seastar::make_ready_future<ceph::bufferlist>(std::move(out)); + }); +} + +static seastar::future<> do_pgls( + const PG& pg, + const std::string& nspace, + OSDOp& osd_op) +{ + hobject_t lower_bound; + auto bp = osd_op.indata.cbegin(); + try { + lower_bound.decode(bp); + } catch (const buffer::error&) { + throw std::invalid_argument{"unable to decode PGLS handle"}; + } + const auto pg_start = pg.get_pgid().pgid.get_hobj_start(); + const auto pg_end = + pg.get_pgid().pgid.get_hobj_end(pg.get_pool().info.get_pg_num()); + return do_pgls_common(pg_start, + pg_end, + pg.get_backend(), + lower_bound, + nspace, + osd_op.op.pgls.count, + nullptr /* no filter */) + .then([&osd_op](bufferlist bl) { + osd_op.outdata = std::move(bl); + return seastar::now(); + }); +} + +static seastar::future<> do_pgls_filtered( + const PG& pg, + const std::string& nspace, + OSDOp& osd_op) +{ + std::string cname, mname, type; + auto bp = osd_op.indata.cbegin(); + try { + ceph::decode(cname, bp); + ceph::decode(mname, bp); + ceph::decode(type, bp); + } catch (const buffer::error&) { + throw crimson::osd::invalid_argument{}; + } + + auto filter = get_pgls_filter(type, bp); + + hobject_t lower_bound; + try { + lower_bound.decode(bp); + } catch (const buffer::error&) { + throw std::invalid_argument("unable to decode PGLS_FILTER description"); + } + + logger().debug("{}: cname={}, mname={}, type={}, lower_bound={}, filter={}", + __func__, cname, mname, type, lower_bound, + static_cast<const void*>(filter.get())); + return seastar::do_with(std::move(filter), + [&, lower_bound=std::move(lower_bound)](auto&& filter) { + const auto pg_start = pg.get_pgid().pgid.get_hobj_start(); + const auto pg_end = pg.get_pgid().pgid.get_hobj_end(pg.get_pool().info.get_pg_num()); + return do_pgls_common(pg_start, + pg_end, + pg.get_backend(), + lower_bound, + nspace, + osd_op.op.pgls.count, + filter.get()) + .then([&osd_op](bufferlist bl) { + osd_op.outdata = std::move(bl); + return seastar::now(); + }); + }); +} + +seastar::future<> +PgOpsExecuter::execute_op(OSDOp& osd_op) +{ + logger().warn("handling op {}", ceph_osd_op_name(osd_op.op.op)); + switch (const ceph_osd_op& op = osd_op.op; op.op) { + case CEPH_OSD_OP_PGLS: + return do_pgls(pg, nspace, osd_op); + case CEPH_OSD_OP_PGLS_FILTER: + return do_pgls_filtered(pg, nspace, osd_op); + case
CEPH_OSD_OP_PGNLS: + return do_pgnls(pg, nspace, osd_op); + case CEPH_OSD_OP_PGNLS_FILTER: + return do_pgnls_filtered(pg, nspace, osd_op); + default: + logger().warn("unknown op {}", ceph_osd_op_name(op.op)); + throw std::runtime_error( + fmt::format("op '{}' not supported", ceph_osd_op_name(op.op))); + } +} + +} // namespace crimson::osd diff --git a/src/crimson/osd/ops_executer.h b/src/crimson/osd/ops_executer.h new file mode 100644 index 000000000..42fcf61b8 --- /dev/null +++ b/src/crimson/osd/ops_executer.h @@ -0,0 +1,283 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <memory> +#include <type_traits> +#include <boost/intrusive_ptr.hpp> +#include <boost/smart_ptr/intrusive_ref_counter.hpp> +#include <boost/smart_ptr/local_shared_ptr.hpp> +#include <seastar/core/chunked_fifo.hh> +#include <seastar/core/future.hh> +#include <seastar/core/shared_future.hh> + +#include "common/dout.h" +#include "crimson/net/Fwd.h" +#include "os/Transaction.h" +#include "osd/osd_types.h" +#include "crimson/osd/object_context.h" + +#include "crimson/common/errorator.h" +#include "crimson/common/type_helpers.h" +#include "crimson/osd/osd_operations/client_request.h" +#include "crimson/osd/osd_operations/peering_event.h" +#include "crimson/osd/shard_services.h" +#include "crimson/osd/osdmap_gate.h" + +#include "crimson/osd/pg_backend.h" +#include "crimson/osd/exceptions.h" + +#include "messages/MOSDOp.h" + +class PG; +class PGLSFilter; +class OSDOp; + +namespace crimson::osd { + +// OpsExecuter -- a class for executing ops targeting a certain object. +class OpsExecuter { + using call_errorator = crimson::errorator< + crimson::stateful_ec, + crimson::ct_error::enoent, + crimson::ct_error::invarg, + crimson::ct_error::permission_denied, + crimson::ct_error::operation_not_supported, + crimson::ct_error::input_output_error, + crimson::ct_error::value_too_large>; + using read_errorator = PGBackend::read_errorator; + using write_ertr = PGBackend::write_ertr; + using get_attr_errorator = PGBackend::get_attr_errorator; + using watch_errorator = crimson::errorator< + crimson::ct_error::enoent, + crimson::ct_error::invarg, + crimson::ct_error::not_connected, + crimson::ct_error::timed_out>; + +public: + // because OpsExecuter is a pretty heavy-weight object, we want to ensure + // it's not copied nor even moved by accident. Performance is the sole + // reason for prohibiting that. + OpsExecuter(OpsExecuter&&) = delete; + OpsExecuter(const OpsExecuter&) = delete; + + using osd_op_errorator = crimson::compound_errorator_t< + call_errorator, + read_errorator, + write_ertr, + get_attr_errorator, + watch_errorator, + PGBackend::stat_errorator>; + +private: + // an operation can be divided into two stages: the main one and the + // effect-exposing one. The former is performed immediately on call to + // `do_osd_op()` while the latter on `submit_changes()` – after successfully + // processing the main stages of all involved operations. When any stage + // fails, none of the scheduled effect-exposing stages will be executed. + // When an operation requires this division, some variant of `with_effect()` + // should be used.
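+  // A typical user of this split (cf. do_op_notify_ack() in ops_executer.cc)
+  // looks roughly like:
+  //
+  //   return with_effect_on_obc(ctx_t{ get_message() },  // ctx_t: caller-defined
+  //     [&] (auto& ctx) {                    // main stage: decode and validate
+  //       return watch_errorator::now();
+  //     },
+  //     [] (auto&& ctx, ObjectContextRef obc) {  // effect stage: capture-less,
+  //       return seastar::now();                 // runs only after every main
+  //     });                                      // stage has succeeded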
+ struct effect_t { + virtual osd_op_errorator::future<> execute() = 0; + virtual ~effect_t() = default; + }; + + ObjectContextRef obc; + const OpInfo& op_info; + const pg_pool_t& pool_info; // for the sake of the ObjClass API + PGBackend& backend; + const MOSDOp& msg; + std::optional<osd_op_params_t> osd_op_params; + bool user_modify = false; + ceph::os::Transaction txn; + + size_t num_read = 0; ///< count read ops + size_t num_write = 0; ///< count update ops + + // this gizmo could be wrapped in std::optional for the sake of lazy + // initialization. we don't need it for ops that doesn't have effect + // TODO: verify the init overhead of chunked_fifo + seastar::chunked_fifo<std::unique_ptr<effect_t>> op_effects; + + template <class Context, class MainFunc, class EffectFunc> + auto with_effect_on_obc( + Context&& ctx, + MainFunc&& main_func, + EffectFunc&& effect_func); + + call_errorator::future<> do_op_call(class OSDOp& osd_op); + watch_errorator::future<> do_op_watch( + class OSDOp& osd_op, + class ObjectState& os, + ceph::os::Transaction& txn); + watch_errorator::future<> do_op_watch_subop_watch( + class OSDOp& osd_op, + class ObjectState& os, + ceph::os::Transaction& txn); + watch_errorator::future<> do_op_watch_subop_reconnect( + class OSDOp& osd_op, + class ObjectState& os, + ceph::os::Transaction& txn); + watch_errorator::future<> do_op_watch_subop_unwatch( + class OSDOp& osd_op, + class ObjectState& os, + ceph::os::Transaction& txn); + watch_errorator::future<> do_op_watch_subop_ping( + class OSDOp& osd_op, + class ObjectState& os, + ceph::os::Transaction& txn); + watch_errorator::future<> do_op_notify( + class OSDOp& osd_op, + const class ObjectState& os); + watch_errorator::future<> do_op_notify_ack( + class OSDOp& osd_op, + const class ObjectState& os); + + hobject_t &get_target() const { + return obc->obs.oi.soid; + } + + template <class Func> + auto do_const_op(Func&& f) { + // TODO: pass backend as read-only + return std::forward<Func>(f)(backend, std::as_const(obc->obs)); + } + + template <class Func> + auto do_read_op(Func&& f) { + ++num_read; + // TODO: pass backend as read-only + return do_const_op(std::forward<Func>(f)); + } + + template <class Func> + auto do_write_op(Func&& f, bool um) { + ++num_write; + if (!osd_op_params) { + osd_op_params.emplace(); + } + user_modify = um; + return std::forward<Func>(f)(backend, obc->obs, txn); + } + + decltype(auto) dont_do_legacy_op() { + return crimson::ct_error::operation_not_supported::make(); + } + +public: + OpsExecuter(ObjectContextRef obc, + const OpInfo& op_info, + const pg_pool_t& pool_info, + PGBackend& backend, + const MOSDOp& msg) + : obc(std::move(obc)), + op_info(op_info), + pool_info(pool_info), + backend(backend), + msg(msg) { + } + + osd_op_errorator::future<> execute_op(class OSDOp& osd_op); + + template <typename Func, typename MutFunc> + osd_op_errorator::future<> flush_changes(Func&& func, MutFunc&& mut_func) &&; + + const auto& get_message() const { + return msg; + } + + size_t get_processed_rw_ops_num() const { + return num_read + num_write; + } + + uint32_t get_pool_stripe_width() const { + return pool_info.get_stripe_width(); + } + + bool has_seen_write() const { + return num_write > 0; + } +}; + +template <class Context, class MainFunc, class EffectFunc> +auto OpsExecuter::with_effect_on_obc( + Context&& ctx, + MainFunc&& main_func, + EffectFunc&& effect_func) +{ + using context_t = std::decay_t<Context>; + // the language offers implicit conversion to pointer-to-function for + // lambda only when it's 
closureless. We enforce this restriction due + // the fact that `flush_changes()` std::moves many executer's parts. + using allowed_effect_func_t = + seastar::future<> (*)(context_t&&, ObjectContextRef); + static_assert(std::is_convertible_v<EffectFunc, allowed_effect_func_t>, + "with_effect function is not allowed to capture"); + struct task_t final : effect_t { + context_t ctx; + EffectFunc effect_func; + ObjectContextRef obc; + + task_t(Context&& ctx, EffectFunc&& effect_func, ObjectContextRef obc) + : ctx(std::move(ctx)), + effect_func(std::move(effect_func)), + obc(std::move(obc)) { + } + osd_op_errorator::future<> execute() final { + return std::move(effect_func)(std::move(ctx), std::move(obc)); + } + }; + auto task = + std::make_unique<task_t>(std::move(ctx), std::move(effect_func), obc); + auto& ctx_ref = task->ctx; + op_effects.emplace_back(std::move(task)); + return std::forward<MainFunc>(main_func)(ctx_ref); +} + +template <typename Func, + typename MutFunc> +OpsExecuter::osd_op_errorator::future<> OpsExecuter::flush_changes( + Func&& func, + MutFunc&& mut_func) && +{ + const bool want_mutate = !txn.empty(); + // osd_op_params are instantiated by every wr-like operation. + assert(osd_op_params || !want_mutate); + assert(obc); + if (__builtin_expect(op_effects.empty(), true)) { + return want_mutate ? std::forward<MutFunc>(mut_func)(std::move(txn), + std::move(obc), + std::move(*osd_op_params), + user_modify) + : std::forward<Func>(func)(std::move(obc)); + } else { + return (want_mutate ? std::forward<MutFunc>(mut_func)(std::move(txn), + std::move(obc), + std::move(*osd_op_params), + user_modify) + : std::forward<Func>(func)(std::move(obc)) + ).safe_then([this] { + // let's do the cleaning of `op_effects` in destructor + return crimson::do_for_each(op_effects, [] (auto& op_effect) { + return op_effect->execute(); + }); + }); + } +} + +// PgOpsExecuter -- a class for executing ops targeting a certain PG. 
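+// It currently covers the read-only, PG-wide listing ops (PGLS/PGNLS and
+// their *_FILTER variants) dispatched from PgOpsExecuter::execute_op().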
+class PgOpsExecuter { +public: + PgOpsExecuter(const PG& pg, const MOSDOp& msg) + : pg(pg), nspace(msg.get_hobj().nspace) { + } + + seastar::future<> execute_op(class OSDOp& osd_op); + +private: + const PG& pg; + const std::string& nspace; +}; + +} // namespace crimson::osd diff --git a/src/crimson/osd/osd.cc b/src/crimson/osd/osd.cc new file mode 100644 index 000000000..521cb9ba3 --- /dev/null +++ b/src/crimson/osd/osd.cc @@ -0,0 +1,1364 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "osd.h" + +#include <sys/utsname.h> + +#include <boost/iterator/counting_iterator.hpp> +#include <boost/range/join.hpp> +#include <boost/smart_ptr/make_local_shared.hpp> +#include <fmt/format.h> +#include <fmt/ostream.h> +#include <seastar/core/timer.hh> + +#include "common/pick_address.h" +#include "include/util.h" + +#include "messages/MCommand.h" +#include "messages/MOSDBeacon.h" +#include "messages/MOSDBoot.h" +#include "messages/MOSDMap.h" +#include "messages/MOSDMarkMeDown.h" +#include "messages/MOSDOp.h" +#include "messages/MOSDPGLog.h" +#include "messages/MOSDPGPull.h" +#include "messages/MOSDPGPush.h" +#include "messages/MOSDPGPushReply.h" +#include "messages/MOSDPGRecoveryDelete.h" +#include "messages/MOSDPGRecoveryDeleteReply.h" +#include "messages/MOSDRepOpReply.h" +#include "messages/MOSDScrub2.h" +#include "messages/MPGStats.h" + +#include "os/Transaction.h" +#include "osd/ClassHandler.h" +#include "osd/OSDCap.h" +#include "osd/PGPeeringEvent.h" +#include "osd/PeeringState.h" + +#include "crimson/admin/osd_admin.h" +#include "crimson/admin/pg_commands.h" +#include "crimson/common/exception.h" +#include "crimson/mon/MonClient.h" +#include "crimson/net/Connection.h" +#include "crimson/net/Messenger.h" +#include "crimson/os/futurized_collection.h" +#include "crimson/os/futurized_store.h" +#include "crimson/osd/heartbeat.h" +#include "crimson/osd/osd_meta.h" +#include "crimson/osd/pg.h" +#include "crimson/osd/pg_backend.h" +#include "crimson/osd/pg_meta.h" +#include "crimson/osd/osd_operations/client_request.h" +#include "crimson/osd/osd_operations/compound_peering_request.h" +#include "crimson/osd/osd_operations/peering_event.h" +#include "crimson/osd/osd_operations/pg_advance_map.h" +#include "crimson/osd/osd_operations/recovery_subrequest.h" +#include "crimson/osd/osd_operations/replicated_request.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_osd); + } + static constexpr int TICK_INTERVAL = 1; +} + +using crimson::common::local_conf; +using crimson::os::FuturizedStore; + +namespace crimson::osd { + +OSD::OSD(int id, uint32_t nonce, + crimson::net::MessengerRef cluster_msgr, + crimson::net::MessengerRef public_msgr, + crimson::net::MessengerRef hb_front_msgr, + crimson::net::MessengerRef hb_back_msgr) + : whoami{id}, + nonce{nonce}, + // do this in background + beacon_timer{[this] { (void)send_beacon(); }}, + cluster_msgr{cluster_msgr}, + public_msgr{public_msgr}, + monc{new crimson::mon::Client{*public_msgr, *this}}, + mgrc{new crimson::mgr::Client{*public_msgr, *this}}, + store{crimson::os::FuturizedStore::create( + local_conf().get_val<std::string>("osd_objectstore"), + local_conf().get_val<std::string>("osd_data"), + local_conf().get_config_values())}, + shard_services{*this, whoami, *cluster_msgr, *public_msgr, *monc, *mgrc, *store}, + heartbeat{new Heartbeat{whoami, shard_services, *monc, hb_front_msgr, hb_back_msgr}}, + // do this in background + tick_timer{[this] { + 
update_heartbeat_peers(); + update_stats(); + }}, + asok{seastar::make_lw_shared<crimson::admin::AdminSocket>()}, + osdmap_gate("OSD::osdmap_gate", std::make_optional(std::ref(shard_services))) +{ + osdmaps[0] = boost::make_local_shared<OSDMap>(); + for (auto msgr : {std::ref(cluster_msgr), std::ref(public_msgr), + std::ref(hb_front_msgr), std::ref(hb_back_msgr)}) { + msgr.get()->set_auth_server(monc.get()); + msgr.get()->set_auth_client(monc.get()); + } + + if (local_conf()->osd_open_classes_on_start) { + const int r = ClassHandler::get_instance().open_all_classes(); + if (r) { + logger().warn("{} warning: got an error loading one or more classes: {}", + __func__, cpp_strerror(r)); + } + } +} + +OSD::~OSD() = default; + +namespace { +// Initial features in new superblock. +// Features here are also automatically upgraded +CompatSet get_osd_initial_compat_set() +{ + CompatSet::FeatureSet ceph_osd_feature_compat; + CompatSet::FeatureSet ceph_osd_feature_ro_compat; + CompatSet::FeatureSet ceph_osd_feature_incompat; + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGINFO); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_OLOC); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEC); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BIGINFO); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HINTS); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGMETA); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_MISSING); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_FASTINFO); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_RECOVERY_DELETES); + return CompatSet(ceph_osd_feature_compat, + ceph_osd_feature_ro_compat, + ceph_osd_feature_incompat); +} +} + +seastar::future<> OSD::mkfs(uuid_d osd_uuid, uuid_d cluster_fsid) +{ + return store->start().then([this, osd_uuid] { + return store->mkfs(osd_uuid); + }).then([this] { + return store->mount(); + }).then([cluster_fsid, this] { + superblock.cluster_fsid = cluster_fsid; + superblock.osd_fsid = store->get_fsid(); + superblock.whoami = whoami; + superblock.compat_features = get_osd_initial_compat_set(); + + logger().info( + "{} writing superblock cluster_fsid {} osd_fsid {}", + __func__, + cluster_fsid, + superblock.osd_fsid); + return store->create_new_collection(coll_t::meta()); + }).then([this] (auto ch) { + meta_coll = make_unique<OSDMeta>(ch , store.get()); + ceph::os::Transaction t; + meta_coll->create(t); + meta_coll->store_superblock(t, superblock); + return store->do_transaction(meta_coll->collection(), std::move(t)); + }).then([cluster_fsid, this] { + return when_all_succeed( + store->write_meta("ceph_fsid", cluster_fsid.to_string()), + store->write_meta("whoami", std::to_string(whoami))); + }).then_unpack([cluster_fsid, this] { + fmt::print("created object store {} for osd.{} fsid {}\n", + local_conf().get_val<std::string>("osd_data"), + whoami, cluster_fsid); + return seastar::now(); + }); +} + +namespace { + entity_addrvec_t pick_addresses(int what) { + entity_addrvec_t addrs; + 
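+    // the classic ::pick_addresses() helper needs a CephContext, so a
+    // short-lived one is constructed just for this call.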
crimson::common::CephContext cct; + if (int r = ::pick_addresses(&cct, what, &addrs, -1); r < 0) { + throw std::runtime_error("failed to pick address"); + } + for (auto addr : addrs.v) { + logger().info("picked address {}", addr); + } + return addrs; + } + std::pair<entity_addrvec_t, bool> + replace_unknown_addrs(entity_addrvec_t maybe_unknowns, + const entity_addrvec_t& knowns) { + bool changed = false; + auto maybe_replace = [&](entity_addr_t addr) { + if (!addr.is_blank_ip()) { + return addr; + } + for (auto& b : knowns.v) { + if (addr.get_family() == b.get_family()) { + auto a = b; + a.set_nonce(addr.get_nonce()); + a.set_type(addr.get_type()); + a.set_port(addr.get_port()); + changed = true; + return a; + } + } + throw std::runtime_error("failed to replace unknown address"); + }; + entity_addrvec_t replaced; + std::transform(maybe_unknowns.v.begin(), + maybe_unknowns.v.end(), + std::back_inserter(replaced.v), + maybe_replace); + return {replaced, changed}; + } +} + +seastar::future<> OSD::start() +{ + logger().info("start"); + + startup_time = ceph::mono_clock::now(); + + return store->start().then([this] { + return store->mount(); + }).then([this] { + return store->open_collection(coll_t::meta()); + }).then([this](auto ch) { + meta_coll = make_unique<OSDMeta>(ch, store.get()); + return meta_coll->load_superblock(); + }).then([this](OSDSuperblock&& sb) { + superblock = std::move(sb); + return get_map(superblock.current_epoch); + }).then([this](cached_map_t&& map) { + shard_services.update_map(map); + osdmap_gate.got_map(map->get_epoch()); + osdmap = std::move(map); + return load_pgs(); + }).then([this] { + + uint64_t osd_required = + CEPH_FEATURE_UID | + CEPH_FEATURE_PGID64 | + CEPH_FEATURE_OSDENC; + using crimson::net::SocketPolicy; + + public_msgr->set_default_policy(SocketPolicy::stateless_server(0)); + public_msgr->set_policy(entity_name_t::TYPE_MON, + SocketPolicy::lossy_client(osd_required)); + public_msgr->set_policy(entity_name_t::TYPE_MGR, + SocketPolicy::lossy_client(osd_required)); + public_msgr->set_policy(entity_name_t::TYPE_OSD, + SocketPolicy::stateless_server(0)); + + cluster_msgr->set_default_policy(SocketPolicy::stateless_server(0)); + cluster_msgr->set_policy(entity_name_t::TYPE_MON, + SocketPolicy::lossy_client(0)); + cluster_msgr->set_policy(entity_name_t::TYPE_OSD, + SocketPolicy::lossless_peer(osd_required)); + cluster_msgr->set_policy(entity_name_t::TYPE_CLIENT, + SocketPolicy::stateless_server(0)); + + crimson::net::dispatchers_t dispatchers{this, monc.get(), mgrc.get()}; + return seastar::when_all_succeed( + cluster_msgr->try_bind(pick_addresses(CEPH_PICK_ADDRESS_CLUSTER), + local_conf()->ms_bind_port_min, + local_conf()->ms_bind_port_max) + .safe_then([this, dispatchers]() mutable { + return cluster_msgr->start(dispatchers); + }, crimson::net::Messenger::bind_ertr::all_same_way( + [] (const std::error_code& e) { + logger().error("cluster messenger try_bind(): address range is unavailable."); + ceph_abort(); + })), + public_msgr->try_bind(pick_addresses(CEPH_PICK_ADDRESS_PUBLIC), + local_conf()->ms_bind_port_min, + local_conf()->ms_bind_port_max) + .safe_then([this, dispatchers]() mutable { + return public_msgr->start(dispatchers); + }, crimson::net::Messenger::bind_ertr::all_same_way( + [] (const std::error_code& e) { + logger().error("public messenger try_bind(): address range is unavailable."); + ceph_abort(); + }))); + }).then_unpack([this] { + return seastar::when_all_succeed(monc->start(), + mgrc->start()); + }).then_unpack([this] { + return 
_add_me_to_crush(); + }).then([this] { + monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0); + monc->sub_want("mgrmap", 0, 0); + monc->sub_want("osdmap", 0, 0); + return monc->renew_subs(); + }).then([this] { + if (auto [addrs, changed] = + replace_unknown_addrs(cluster_msgr->get_myaddrs(), + public_msgr->get_myaddrs()); changed) { + return cluster_msgr->set_myaddrs(addrs); + } else { + return seastar::now(); + } + }).then([this] { + return heartbeat->start(public_msgr->get_myaddrs(), + cluster_msgr->get_myaddrs()); + }).then([this] { + // create the admin-socket server, and the objects that register + // to handle incoming commands + return start_asok_admin(); + }).then([this] { + return start_boot(); + }); +} + +seastar::future<> OSD::start_boot() +{ + state.set_preboot(); + return monc->get_version("osdmap").then([this](auto&& ret) { + auto [newest, oldest] = ret; + return _preboot(oldest, newest); + }); +} + +seastar::future<> OSD::_preboot(version_t oldest, version_t newest) +{ + logger().info("osd.{}: _preboot", whoami); + if (osdmap->get_epoch() == 0) { + logger().info("waiting for initial osdmap"); + } else if (osdmap->is_destroyed(whoami)) { + logger().warn("osdmap says I am destroyed"); + // provide a small margin so we don't livelock seeing if we + // un-destroyed ourselves. + if (osdmap->get_epoch() > newest - 1) { + throw std::runtime_error("i am destroyed"); + } + } else if (osdmap->is_noup(whoami)) { + logger().warn("osdmap NOUP flag is set, waiting for it to clear"); + } else if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE)) { + logger().error("osdmap SORTBITWISE OSDMap flag is NOT set; please set it"); + } else if (osdmap->require_osd_release < ceph_release_t::octopus) { + logger().error("osdmap require_osd_release < octopus; please upgrade to octopus"); + } else if (false) { + // TODO: update mon if current fullness state is different from osdmap + } else if (version_t n = local_conf()->osd_map_message_max; + osdmap->get_epoch() >= oldest - 1 && + osdmap->get_epoch() + n > newest) { + return _send_boot(); + } + // get all the latest maps + if (osdmap->get_epoch() + 1 >= oldest) { + return shard_services.osdmap_subscribe(osdmap->get_epoch() + 1, false); + } else { + return shard_services.osdmap_subscribe(oldest - 1, true); + } +} + +seastar::future<> OSD::_send_boot() +{ + state.set_booting(); + + logger().info("hb_back_msgr: {}", heartbeat->get_back_addrs()); + logger().info("hb_front_msgr: {}", heartbeat->get_front_addrs()); + logger().info("cluster_msgr: {}", cluster_msgr->get_myaddr()); + auto m = make_message<MOSDBoot>(superblock, + osdmap->get_epoch(), + osdmap->get_epoch(), + heartbeat->get_back_addrs(), + heartbeat->get_front_addrs(), + cluster_msgr->get_myaddrs(), + CEPH_FEATURES_ALL); + collect_sys_info(&m->metadata, NULL); + return monc->send_message(m); +} + +seastar::future<> OSD::_add_me_to_crush() +{ + if (!local_conf().get_val<bool>("osd_crush_update_on_start")) { + return seastar::now(); + } + auto get_weight = [this] { + if (auto w = local_conf().get_val<double>("osd_crush_initial_weight"); + w >= 0) { + return seastar::make_ready_future<double>(w); + } else { + return store->stat().then([](auto st) { + auto total = st.total; + return seastar::make_ready_future<double>( + std::max(.00001, + double(total) / double(1ull << 40))); // TB + }); + } + }; + return get_weight().then([this](auto weight) { + const crimson::crush::CrushLocation loc{make_unique<CephContext>().get()}; + logger().info("{} crush location is {}", __func__, loc); + string cmd = 
fmt::format(R"({{ + "prefix": "osd crush create-or-move", + "id": {}, + "weight": {:.4f}, + "args": [{}] + }})", whoami, weight, loc); + return monc->run_command({cmd}, {}); + }).then([](auto&& command_result) { + [[maybe_unused]] auto [code, message, out] = std::move(command_result); + if (code) { + logger().warn("fail to add to crush: {} ({})", message, code); + throw std::runtime_error("fail to add to crush"); + } else { + logger().info("added to crush: {}", message); + } + return seastar::now(); + }); +} + +seastar::future<> OSD::handle_command(crimson::net::ConnectionRef conn, + Ref<MCommand> m) +{ + return asok->handle_command(conn, std::move(m)); +} + +/* + The OSD's Admin Socket object created here has two servers (i.e. - blocks of commands + to handle) registered to it: + - OSD's specific commands are handled by the OSD object; + - there are some common commands registered to be directly handled by the AdminSocket object + itself. +*/ +seastar::future<> OSD::start_asok_admin() +{ + auto asok_path = local_conf().get_val<std::string>("admin_socket"); + using namespace crimson::admin; + return asok->start(asok_path).then([this] { + return seastar::when_all_succeed( + asok->register_admin_commands(), + asok->register_command(make_asok_hook<OsdStatusHook>(std::as_const(*this))), + asok->register_command(make_asok_hook<SendBeaconHook>(*this)), + asok->register_command(make_asok_hook<FlushPgStatsHook>(*this)), + asok->register_command(make_asok_hook<DumpPGStateHistory>(std::as_const(*this))), + asok->register_command(make_asok_hook<SeastarMetricsHook>()), + // PG commands + asok->register_command(make_asok_hook<pg::QueryCommand>(*this)), + asok->register_command(make_asok_hook<pg::MarkUnfoundLostCommand>(*this))); + }).then_unpack([] { + return seastar::now(); + }); +} + +seastar::future<> OSD::stop() +{ + logger().info("stop"); + // see also OSD::shutdown() + return prepare_to_stop().then([this] { + state.set_stopping(); + logger().debug("prepared to stop"); + public_msgr->stop(); + cluster_msgr->stop(); + auto gate_close_fut = gate.close(); + return asok->stop().then([this] { + return heartbeat->stop(); + }).then([this] { + return store->umount(); + }).then([this] { + return store->stop(); + }).then([this] { + return seastar::parallel_for_each(pg_map.get_pgs(), + [](auto& p) { + return p.second->stop(); + }); + }).then([this] { + return monc->stop(); + }).then([this] { + return mgrc->stop(); + }).then([fut=std::move(gate_close_fut)]() mutable { + return std::move(fut); + }).then([this] { + return when_all_succeed( + public_msgr->shutdown(), + cluster_msgr->shutdown()); + }).then_unpack([] { + return seastar::now(); + }).handle_exception([](auto ep) { + logger().error("error while stopping osd: {}", ep); + }); + }); +} + +void OSD::dump_status(Formatter* f) const +{ + f->dump_stream("cluster_fsid") << superblock.cluster_fsid; + f->dump_stream("osd_fsid") << superblock.osd_fsid; + f->dump_unsigned("whoami", superblock.whoami); + f->dump_string("state", state.to_string()); + f->dump_unsigned("oldest_map", superblock.oldest_map); + f->dump_unsigned("newest_map", superblock.newest_map); + f->dump_unsigned("num_pgs", pg_map.get_pgs().size()); +} + +void OSD::dump_pg_state_history(Formatter* f) const +{ + f->open_array_section("pgs"); + for (auto [pgid, pg] : pg_map.get_pgs()) { + f->open_object_section("pg"); + f->dump_stream("pg") << pgid; + const auto& peering_state = pg->get_peering_state(); + f->dump_string("currently", peering_state.get_current_state()); + peering_state.dump_history(f); 
+ f->close_section(); + } + f->close_section(); +} + +void OSD::print(std::ostream& out) const +{ + out << "{osd." << superblock.whoami << " " + << superblock.osd_fsid << " [" << superblock.oldest_map + << "," << superblock.newest_map << "] " << pg_map.get_pgs().size() + << " pgs}"; +} + +seastar::future<> OSD::load_pgs() +{ + return store->list_collections().then([this](auto colls) { + return seastar::parallel_for_each(colls, [this](auto coll) { + spg_t pgid; + if (coll.is_pg(&pgid)) { + return load_pg(pgid).then([pgid, this](auto&& pg) { + logger().info("load_pgs: loaded {}", pgid); + pg_map.pg_loaded(pgid, std::move(pg)); + shard_services.inc_pg_num(); + return seastar::now(); + }); + } else if (coll.is_temp(&pgid)) { + // TODO: remove the collection + return seastar::now(); + } else { + logger().warn("ignoring unrecognized collection: {}", coll); + return seastar::now(); + } + }); + }); +} + +seastar::future<Ref<PG>> OSD::make_pg(cached_map_t create_map, + spg_t pgid, + bool do_create) +{ + using ec_profile_t = map<string,string>; + auto get_pool_info = [create_map, pgid, this] { + if (create_map->have_pg_pool(pgid.pool())) { + pg_pool_t pi = *create_map->get_pg_pool(pgid.pool()); + string name = create_map->get_pool_name(pgid.pool()); + ec_profile_t ec_profile; + if (pi.is_erasure()) { + ec_profile = create_map->get_erasure_code_profile(pi.erasure_code_profile); + } + return seastar::make_ready_future<std::tuple<pg_pool_t, string, ec_profile_t>>( + std::make_tuple(std::move(pi), + std::move(name), + std::move(ec_profile))); + } else { + // pool was deleted; grab final pg_pool_t off disk. + return meta_coll->load_final_pool_info(pgid.pool()); + } + }; + auto get_collection = [pgid, do_create, this] { + const coll_t cid{pgid}; + if (do_create) { + return store->create_new_collection(cid); + } else { + return store->open_collection(cid); + } + }; + return seastar::when_all( + std::move(get_pool_info), + std::move(get_collection) + ).then([pgid, create_map, this] (auto&& ret) { + auto [pool, name, ec_profile] = std::move(std::get<0>(ret).get0()); + auto coll = std::move(std::get<1>(ret).get0()); + return seastar::make_ready_future<Ref<PG>>( + new PG{pgid, + pg_shard_t{whoami, pgid.shard}, + std::move(coll), + std::move(pool), + std::move(name), + create_map, + shard_services, + ec_profile}); + }); +} + +seastar::future<Ref<PG>> OSD::load_pg(spg_t pgid) +{ + logger().debug("{}: {}", __func__, pgid); + + return seastar::do_with(PGMeta(store.get(), pgid), [] (auto& pg_meta) { + return pg_meta.get_epoch(); + }).then([this](epoch_t e) { + return get_map(e); + }).then([pgid, this] (auto&& create_map) { + return make_pg(std::move(create_map), pgid, false); + }).then([this](Ref<PG> pg) { + return pg->read_state(store.get()).then([pg] { + return seastar::make_ready_future<Ref<PG>>(std::move(pg)); + }); + }).handle_exception([pgid](auto ep) { + logger().info("pg {} saw exception on load {}", pgid, ep); + ceph_abort("Could not load pg" == 0); + return seastar::make_exception_future<Ref<PG>>(ep); + }); +} + +std::optional<seastar::future<>> +OSD::ms_dispatch(crimson::net::ConnectionRef conn, MessageRef m) +{ + if (state.is_stopping()) { + return {}; + } + bool dispatched = true; + gate.dispatch_in_background(__func__, *this, [this, conn, &m, &dispatched] { + switch (m->get_type()) { + case CEPH_MSG_OSD_MAP: + return handle_osd_map(conn, boost::static_pointer_cast<MOSDMap>(m)); + case CEPH_MSG_OSD_OP: + return handle_osd_op(conn, boost::static_pointer_cast<MOSDOp>(m)); + case MSG_OSD_PG_CREATE2: + 
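+      // PG creations are wrapped in a CompoundPeeringRequest started in the
+      // background; dispatching the message itself completes immediately.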
shard_services.start_operation<CompoundPeeringRequest>( + *this, + conn, + m); + return seastar::now(); + case MSG_COMMAND: + return handle_command(conn, boost::static_pointer_cast<MCommand>(m)); + case MSG_OSD_MARK_ME_DOWN: + return handle_mark_me_down(conn, boost::static_pointer_cast<MOSDMarkMeDown>(m)); + case MSG_OSD_PG_PULL: + [[fallthrough]]; + case MSG_OSD_PG_PUSH: + [[fallthrough]]; + case MSG_OSD_PG_PUSH_REPLY: + [[fallthrough]]; + case MSG_OSD_PG_RECOVERY_DELETE: + [[fallthrough]]; + case MSG_OSD_PG_RECOVERY_DELETE_REPLY: + [[fallthrough]]; + case MSG_OSD_PG_SCAN: + [[fallthrough]]; + case MSG_OSD_PG_BACKFILL: + [[fallthrough]]; + case MSG_OSD_PG_BACKFILL_REMOVE: + return handle_recovery_subreq(conn, boost::static_pointer_cast<MOSDFastDispatchOp>(m)); + case MSG_OSD_PG_LEASE: + [[fallthrough]]; + case MSG_OSD_PG_LEASE_ACK: + [[fallthrough]]; + case MSG_OSD_PG_NOTIFY2: + [[fallthrough]]; + case MSG_OSD_PG_INFO2: + [[fallthrough]]; + case MSG_OSD_PG_QUERY2: + [[fallthrough]]; + case MSG_OSD_BACKFILL_RESERVE: + [[fallthrough]]; + case MSG_OSD_RECOVERY_RESERVE: + [[fallthrough]]; + case MSG_OSD_PG_LOG: + return handle_peering_op(conn, boost::static_pointer_cast<MOSDPeeringOp>(m)); + case MSG_OSD_REPOP: + return handle_rep_op(conn, boost::static_pointer_cast<MOSDRepOp>(m)); + case MSG_OSD_REPOPREPLY: + return handle_rep_op_reply(conn, boost::static_pointer_cast<MOSDRepOpReply>(m)); + case MSG_OSD_SCRUB2: + return handle_scrub(conn, boost::static_pointer_cast<MOSDScrub2>(m)); + default: + dispatched = false; + return seastar::now(); + } + }); + return (dispatched ? std::make_optional(seastar::now()) : std::nullopt); +} + +void OSD::ms_handle_reset(crimson::net::ConnectionRef conn, bool is_replace) +{ + // TODO: cleanup the session attached to this connection + logger().warn("ms_handle_reset"); +} + +void OSD::ms_handle_remote_reset(crimson::net::ConnectionRef conn) +{ + logger().warn("ms_handle_remote_reset"); +} + +void OSD::handle_authentication(const EntityName& name, + const AuthCapsInfo& caps_info) +{ + // TODO: store the parsed cap and associate it with the connection + if (caps_info.allow_all) { + logger().debug("{} {} has all caps", __func__, name); + return; + } + if (caps_info.caps.length() > 0) { + auto p = caps_info.caps.cbegin(); + string str; + try { + decode(str, p); + } catch (ceph::buffer::error& e) { + logger().warn("{} {} failed to decode caps string", __func__, name); + return; + } + OSDCap caps; + if (caps.parse(str)) { + logger().debug("{} {} has caps {}", __func__, name, str); + } else { + logger().warn("{} {} failed to parse caps {}", __func__, name, str); + } + } +} + +void OSD::update_stats() +{ + osd_stat_seq++; + osd_stat.up_from = get_up_epoch(); + osd_stat.hb_peers = heartbeat->get_peers(); + osd_stat.seq = (static_cast<uint64_t>(get_up_epoch()) << 32) | osd_stat_seq; + gate.dispatch_in_background("statfs", *this, [this] { + (void) store->stat().then([this](store_statfs_t&& st) { + osd_stat.statfs = st; + }); + }); +} + +MessageRef OSD::get_stats() const +{ + // todo: m-to-n: collect stats using map-reduce + // MPGStats::had_map_for is not used since PGMonitor was removed + auto m = make_message<MPGStats>(monc->get_fsid(), osdmap->get_epoch()); + m->osd_stat = osd_stat; + for (auto [pgid, pg] : pg_map.get_pgs()) { + if (pg->is_primary()) { + auto stats = pg->get_stats(); + // todo: update reported_epoch,reported_seq,last_fresh + stats.reported_epoch = osdmap->get_epoch(); + m->pg_stat.emplace(pgid.pgid, std::move(stats)); + } + } + return m; +} + +uint64_t 
OSD::send_pg_stats() +{ + // mgr client sends the report message in background + mgrc->report(); + return osd_stat.seq; +} + +OSD::cached_map_t OSD::get_map() const +{ + return osdmap; +} + +seastar::future<OSD::cached_map_t> OSD::get_map(epoch_t e) +{ + // TODO: use LRU cache for managing osdmap, fallback to disk if we have to + if (auto found = osdmaps.find(e); found) { + return seastar::make_ready_future<cached_map_t>(std::move(found)); + } else { + return load_map(e).then([e, this](unique_ptr<OSDMap> osdmap) { + return seastar::make_ready_future<cached_map_t>( + osdmaps.insert(e, std::move(osdmap))); + }); + } +} + +void OSD::store_map_bl(ceph::os::Transaction& t, + epoch_t e, bufferlist&& bl) +{ + meta_coll->store_map(t, e, bl); + map_bl_cache.insert(e, std::move(bl)); +} + +seastar::future<bufferlist> OSD::load_map_bl(epoch_t e) +{ + if (std::optional<bufferlist> found = map_bl_cache.find(e); found) { + return seastar::make_ready_future<bufferlist>(*found); + } else { + return meta_coll->load_map(e); + } +} + +seastar::future<std::map<epoch_t, bufferlist>> OSD::load_map_bls( + epoch_t first, + epoch_t last) +{ + return seastar::map_reduce(boost::make_counting_iterator<epoch_t>(first), + boost::make_counting_iterator<epoch_t>(last + 1), + [this](epoch_t e) { + return load_map_bl(e).then([e](auto&& bl) { + return seastar::make_ready_future<pair<epoch_t, bufferlist>>( + std::make_pair(e, std::move(bl))); + }); + }, + std::map<epoch_t, bufferlist>{}, + [](auto&& bls, auto&& epoch_bl) { + bls.emplace(std::move(epoch_bl)); + return std::move(bls); + }); +} + +seastar::future<std::unique_ptr<OSDMap>> OSD::load_map(epoch_t e) +{ + auto o = std::make_unique<OSDMap>(); + if (e > 0) { + return load_map_bl(e).then([o=std::move(o)](bufferlist bl) mutable { + o->decode(bl); + return seastar::make_ready_future<unique_ptr<OSDMap>>(std::move(o)); + }); + } else { + return seastar::make_ready_future<unique_ptr<OSDMap>>(std::move(o)); + } +} + +seastar::future<> OSD::store_maps(ceph::os::Transaction& t, + epoch_t start, Ref<MOSDMap> m) +{ + return seastar::do_for_each(boost::make_counting_iterator(start), + boost::make_counting_iterator(m->get_last() + 1), + [&t, m, this](epoch_t e) { + if (auto p = m->maps.find(e); p != m->maps.end()) { + auto o = std::make_unique<OSDMap>(); + o->decode(p->second); + logger().info("store_maps osdmap.{}", e); + store_map_bl(t, e, std::move(std::move(p->second))); + osdmaps.insert(e, std::move(o)); + return seastar::now(); + } else if (auto p = m->incremental_maps.find(e); + p != m->incremental_maps.end()) { + return load_map(e - 1).then([e, bl=p->second, &t, this](auto o) { + OSDMap::Incremental inc; + auto i = bl.cbegin(); + inc.decode(i); + o->apply_incremental(inc); + bufferlist fbl; + o->encode(fbl, inc.encode_features | CEPH_FEATURE_RESERVED); + store_map_bl(t, e, std::move(fbl)); + osdmaps.insert(e, std::move(o)); + return seastar::now(); + }); + } else { + logger().error("MOSDMap lied about what maps it had?"); + return seastar::now(); + } + }); +} + +bool OSD::require_mon_peer(crimson::net::Connection *conn, Ref<Message> m) +{ + if (!conn->peer_is_mon()) { + logger().info("{} received from non-mon {}, {}", + __func__, + conn->get_peer_addr(), + *m); + return false; + } + return true; +} + +seastar::future<Ref<PG>> OSD::handle_pg_create_info( + std::unique_ptr<PGCreateInfo> info) { + return seastar::do_with( + std::move(info), + [this](auto &info) -> seastar::future<Ref<PG>> { + return get_map(info->epoch).then( + [&info, this](cached_map_t startmap) -> + 
seastar::future<std::tuple<Ref<PG>, cached_map_t>> { + const spg_t &pgid = info->pgid; + if (info->by_mon) { + int64_t pool_id = pgid.pgid.pool(); + const pg_pool_t *pool = osdmap->get_pg_pool(pool_id); + if (!pool) { + logger().debug( + "{} ignoring pgid {}, pool dne", + __func__, + pgid); + return seastar::make_ready_future<std::tuple<Ref<PG>, cached_map_t>>( + std::make_tuple(Ref<PG>(), startmap)); + } + ceph_assert(osdmap->require_osd_release >= ceph_release_t::octopus); + if (!pool->has_flag(pg_pool_t::FLAG_CREATING)) { + // this ensures we do not process old creating messages after the + // pool's initial pgs have been created (and pg are subsequently + // allowed to split or merge). + logger().debug( + "{} dropping {} create, pool does not have CREATING flag set", + __func__, + pgid); + return seastar::make_ready_future<std::tuple<Ref<PG>, cached_map_t>>( + std::make_tuple(Ref<PG>(), startmap)); + } + } + return make_pg(startmap, pgid, true).then( + [startmap=std::move(startmap)](auto pg) mutable { + return seastar::make_ready_future<std::tuple<Ref<PG>, cached_map_t>>( + std::make_tuple(std::move(pg), std::move(startmap))); + }); + }).then([this, &info](auto&& ret) -> + seastar::future<Ref<PG>> { + auto [pg, startmap] = std::move(ret); + if (!pg) + return seastar::make_ready_future<Ref<PG>>(Ref<PG>()); + PeeringCtx rctx{ceph_release_t::octopus}; + const pg_pool_t* pp = startmap->get_pg_pool(info->pgid.pool()); + + int up_primary, acting_primary; + vector<int> up, acting; + startmap->pg_to_up_acting_osds( + info->pgid.pgid, &up, &up_primary, &acting, &acting_primary); + + int role = startmap->calc_pg_role(pg_shard_t(whoami, info->pgid.shard), + acting); + + create_pg_collection( + rctx.transaction, + info->pgid, + info->pgid.get_split_bits(pp->get_pg_num())); + init_pg_ondisk( + rctx.transaction, + info->pgid, + pp); + + pg->init( + role, + up, + up_primary, + acting, + acting_primary, + info->history, + info->past_intervals, + false, + rctx.transaction); + + return shard_services.start_operation<PGAdvanceMap>( + *this, pg, pg->get_osdmap_epoch(), + osdmap->get_epoch(), std::move(rctx), true).second.then([pg=pg] { + return seastar::make_ready_future<Ref<PG>>(pg); + }); + }); + }); +} + +seastar::future<> OSD::handle_osd_map(crimson::net::ConnectionRef conn, + Ref<MOSDMap> m) +{ + logger().info("handle_osd_map {}", *m); + if (m->fsid != superblock.cluster_fsid) { + logger().warn("fsid mismatched"); + return seastar::now(); + } + if (state.is_initializing()) { + logger().warn("i am still initializing"); + return seastar::now(); + } + + const auto first = m->get_first(); + const auto last = m->get_last(); + logger().info("handle_osd_map epochs [{}..{}], i have {}, src has [{}..{}]", + first, last, superblock.newest_map, m->oldest_map, m->newest_map); + // make sure there is something new, here, before we bother flushing + // the queues and such + if (last <= superblock.newest_map) { + return seastar::now(); + } + // missing some? + bool skip_maps = false; + epoch_t start = superblock.newest_map + 1; + if (first > start) { + logger().info("handle_osd_map message skips epochs {}..{}", + start, first - 1); + if (m->oldest_map <= start) { + return shard_services.osdmap_subscribe(start, false); + } + // always try to get the full range of maps--as many as we can. this + // 1- is good to have + // 2- is at present the only way to ensure that we get a *full* map as + // the first map! 
+ if (m->oldest_map < first) { + return shard_services.osdmap_subscribe(m->oldest_map - 1, true); + } + skip_maps = true; + start = first; + } + + return seastar::do_with(ceph::os::Transaction{}, + [=](auto& t) { + return store_maps(t, start, m).then([=, &t] { + // even if this map isn't from a mon, we may have satisfied our subscription + monc->sub_got("osdmap", last); + if (!superblock.oldest_map || skip_maps) { + superblock.oldest_map = first; + } + superblock.newest_map = last; + superblock.current_epoch = last; + + // note in the superblock that we were clean thru the prior epoch + if (boot_epoch && boot_epoch >= superblock.mounted) { + superblock.mounted = boot_epoch; + superblock.clean_thru = last; + } + meta_coll->store_superblock(t, superblock); + return store->do_transaction(meta_coll->collection(), std::move(t)); + }); + }).then([=] { + // TODO: write to superblock and commit the transaction + return committed_osd_maps(start, last, m); + }); +} + +seastar::future<> OSD::committed_osd_maps(version_t first, + version_t last, + Ref<MOSDMap> m) +{ + logger().info("osd.{}: committed_osd_maps({}, {})", whoami, first, last); + // advance through the new maps + return seastar::do_for_each(boost::make_counting_iterator(first), + boost::make_counting_iterator(last + 1), + [this](epoch_t cur) { + return get_map(cur).then([this](cached_map_t&& o) { + osdmap = std::move(o); + shard_services.update_map(osdmap); + if (up_epoch == 0 && + osdmap->is_up(whoami) && + osdmap->get_addrs(whoami) == public_msgr->get_myaddrs()) { + up_epoch = osdmap->get_epoch(); + if (!boot_epoch) { + boot_epoch = osdmap->get_epoch(); + } + } + }); + }).then([m, this] { + if (osdmap->is_up(whoami) && + osdmap->get_addrs(whoami) == public_msgr->get_myaddrs() && + bind_epoch < osdmap->get_up_from(whoami)) { + if (state.is_booting()) { + logger().info("osd.{}: activating...", whoami); + state.set_active(); + beacon_timer.arm_periodic( + std::chrono::seconds(local_conf()->osd_beacon_report_interval)); + tick_timer.arm_periodic( + std::chrono::seconds(TICK_INTERVAL)); + } + } else if (!osdmap->is_up(whoami)) { + if (state.is_prestop()) { + got_stop_ack(); + return seastar::now(); + } + } + check_osdmap_features(); + // yay! 
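+    // let every PG advance to the newly committed map; consume_map() also
+    // unblocks waiters on osdmap_gate for this epoch.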
+ return consume_map(osdmap->get_epoch()); + }).then([m, this] { + if (state.is_active()) { + logger().info("osd.{}: now active", whoami); + if (!osdmap->exists(whoami)) { + return shutdown(); + } + if (should_restart()) { + return restart(); + } else { + return seastar::now(); + } + } else if (state.is_preboot()) { + logger().info("osd.{}: now preboot", whoami); + + if (m->get_source().is_mon()) { + return _preboot(m->oldest_map, m->newest_map); + } else { + logger().info("osd.{}: start_boot", whoami); + return start_boot(); + } + } else { + logger().info("osd.{}: now {}", whoami, state); + // XXX + return seastar::now(); + } + }); +} + +seastar::future<> OSD::handle_osd_op(crimson::net::ConnectionRef conn, + Ref<MOSDOp> m) +{ + (void) shard_services.start_operation<ClientRequest>( + *this, + conn, + std::move(m)); + return seastar::now(); +} + +seastar::future<> OSD::send_incremental_map(crimson::net::ConnectionRef conn, + epoch_t first) +{ + if (first >= superblock.oldest_map) { + return load_map_bls(first, superblock.newest_map) + .then([this, conn, first](auto&& bls) { + auto m = make_message<MOSDMap>(monc->get_fsid(), + osdmap->get_encoding_features()); + m->oldest_map = first; + m->newest_map = superblock.newest_map; + m->maps = std::move(bls); + return conn->send(m); + }); + } else { + return load_map_bl(osdmap->get_epoch()) + .then([this, conn](auto&& bl) mutable { + auto m = make_message<MOSDMap>(monc->get_fsid(), + osdmap->get_encoding_features()); + m->oldest_map = superblock.oldest_map; + m->newest_map = superblock.newest_map; + m->maps.emplace(osdmap->get_epoch(), std::move(bl)); + return conn->send(m); + }); + } +} + +seastar::future<> OSD::handle_rep_op(crimson::net::ConnectionRef conn, + Ref<MOSDRepOp> m) +{ + m->finish_decode(); + (void) shard_services.start_operation<RepRequest>( + *this, + std::move(conn), + std::move(m)); + return seastar::now(); +} + +seastar::future<> OSD::handle_rep_op_reply(crimson::net::ConnectionRef conn, + Ref<MOSDRepOpReply> m) +{ + const auto& pgs = pg_map.get_pgs(); + if (auto pg = pgs.find(m->get_spg()); pg != pgs.end()) { + m->finish_decode(); + pg->second->handle_rep_op_reply(conn, *m); + } else { + logger().warn("stale reply: {}", *m); + } + return seastar::now(); +} + +seastar::future<> OSD::handle_scrub(crimson::net::ConnectionRef conn, + Ref<MOSDScrub2> m) +{ + if (m->fsid != superblock.cluster_fsid) { + logger().warn("fsid mismatched"); + return seastar::now(); + } + return seastar::parallel_for_each(std::move(m->scrub_pgs), + [m, conn, this](spg_t pgid) { + pg_shard_t from_shard{static_cast<int>(m->get_source().num()), + pgid.shard}; + PeeringState::RequestScrub scrub_request{m->deep, m->repair}; + return shard_services.start_operation<RemotePeeringEvent>( + *this, + conn, + shard_services, + from_shard, + pgid, + PGPeeringEvent{m->epoch, m->epoch, scrub_request}).second; + }); +} + +seastar::future<> OSD::handle_mark_me_down(crimson::net::ConnectionRef conn, + Ref<MOSDMarkMeDown> m) +{ + if (state.is_prestop()) { + got_stop_ack(); + } + return seastar::now(); +} + +seastar::future<> OSD::handle_recovery_subreq(crimson::net::ConnectionRef conn, + Ref<MOSDFastDispatchOp> m) +{ + (void) shard_services.start_operation<RecoverySubRequest>( + *this, + conn, + std::move(m)); + return seastar::now(); +} + +bool OSD::should_restart() const +{ + if (!osdmap->is_up(whoami)) { + logger().info("map e {} marked osd.{} down", + osdmap->get_epoch(), whoami); + return true; + } else if (osdmap->get_addrs(whoami) != public_msgr->get_myaddrs()) { + 
logger().error("map e {} had wrong client addr ({} != my {})", + osdmap->get_epoch(), + osdmap->get_addrs(whoami), + public_msgr->get_myaddrs()); + return true; + } else if (osdmap->get_cluster_addrs(whoami) != cluster_msgr->get_myaddrs()) { + logger().error("map e {} had wrong cluster addr ({} != my {})", + osdmap->get_epoch(), + osdmap->get_cluster_addrs(whoami), + cluster_msgr->get_myaddrs()); + return true; + } else { + return false; + } +} + +seastar::future<> OSD::restart() +{ + beacon_timer.cancel(); + tick_timer.cancel(); + up_epoch = 0; + bind_epoch = osdmap->get_epoch(); + // TODO: promote to shutdown if being marked down for multiple times + // rebind messengers + return start_boot(); +} + +seastar::future<> OSD::shutdown() +{ + // TODO + superblock.mounted = boot_epoch; + superblock.clean_thru = osdmap->get_epoch(); + return seastar::now(); +} + +seastar::future<> OSD::send_beacon() +{ + if (!state.is_active()) { + return seastar::now(); + } + // FIXME: min lec should be calculated from pg_stat + // and should set m->pgs + epoch_t min_last_epoch_clean = osdmap->get_epoch(); + auto m = make_message<MOSDBeacon>(osdmap->get_epoch(), + min_last_epoch_clean, + superblock.last_purged_snaps_scrub, + local_conf()->osd_beacon_report_interval); + return monc->send_message(m); +} + +void OSD::update_heartbeat_peers() +{ + if (!state.is_active()) { + return; + } + for (auto& pg : pg_map.get_pgs()) { + vector<int> up, acting; + osdmap->pg_to_up_acting_osds(pg.first.pgid, + &up, nullptr, + &acting, nullptr); + for (int osd : boost::join(up, acting)) { + if (osd == CRUSH_ITEM_NONE || osd == whoami) { + continue; + } else { + heartbeat->add_peer(osd, osdmap->get_epoch()); + } + } + } + heartbeat->update_peers(whoami); +} + +seastar::future<> OSD::handle_peering_op( + crimson::net::ConnectionRef conn, + Ref<MOSDPeeringOp> m) +{ + const int from = m->get_source().num(); + logger().debug("handle_peering_op on {} from {}", m->get_spg(), from); + std::unique_ptr<PGPeeringEvent> evt(m->get_event()); + (void) shard_services.start_operation<RemotePeeringEvent>( + *this, + conn, + shard_services, + pg_shard_t{from, m->get_spg().shard}, + m->get_spg(), + std::move(*evt)); + return seastar::now(); +} + +void OSD::check_osdmap_features() +{ + heartbeat->set_require_authorizer(true); +} + +seastar::future<> OSD::consume_map(epoch_t epoch) +{ + // todo: m-to-n: broadcast this news to all shards + auto &pgs = pg_map.get_pgs(); + return seastar::parallel_for_each(pgs.begin(), pgs.end(), [=](auto& pg) { + return shard_services.start_operation<PGAdvanceMap>( + *this, pg.second, pg.second->get_osdmap_epoch(), epoch, + PeeringCtx{ceph_release_t::octopus}, false).second; + }).then([epoch, this] { + osdmap_gate.got_map(epoch); + return seastar::make_ready_future(); + }); +} + + +blocking_future<Ref<PG>> +OSD::get_or_create_pg( + spg_t pgid, + epoch_t epoch, + std::unique_ptr<PGCreateInfo> info) +{ + if (info) { + auto [fut, creating] = pg_map.wait_for_pg(pgid); + if (!creating) { + pg_map.set_creating(pgid); + (void)handle_pg_create_info(std::move(info)); + } + return std::move(fut); + } else { + return make_ready_blocking_future<Ref<PG>>(pg_map.get_pg(pgid)); + } +} + +blocking_future<Ref<PG>> OSD::wait_for_pg( + spg_t pgid) +{ + return pg_map.wait_for_pg(pgid).first; +} + +Ref<PG> OSD::get_pg(spg_t pgid) +{ + return pg_map.get_pg(pgid); +} + +seastar::future<> OSD::prepare_to_stop() +{ + if (osdmap && osdmap->is_up(whoami)) { + state.set_prestop(); + const auto timeout = + 
std::chrono::duration_cast<std::chrono::milliseconds>( + std::chrono::duration<double>( + local_conf().get_val<double>("osd_mon_shutdown_timeout"))); + + return seastar::with_timeout( + seastar::timer<>::clock::now() + timeout, + monc->send_message( + make_message<MOSDMarkMeDown>( + monc->get_fsid(), + whoami, + osdmap->get_addrs(whoami), + osdmap->get_epoch(), + true)).then([this] { + return stop_acked.get_future(); + }) + ).handle_exception_type( + [](seastar::timed_out_error&) { + return seastar::now(); + }); + } + return seastar::now(); +} + +} diff --git a/src/crimson/osd/osd.h b/src/crimson/osd/osd.h new file mode 100644 index 000000000..889960ced --- /dev/null +++ b/src/crimson/osd/osd.h @@ -0,0 +1,250 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <seastar/core/future.hh> +#include <seastar/core/shared_future.hh> +#include <seastar/core/gate.hh> +#include <seastar/core/shared_ptr.hh> +#include <seastar/core/shared_future.hh> +#include <seastar/core/timer.hh> + +#include "crimson/common/type_helpers.h" +#include "crimson/common/auth_handler.h" +#include "crimson/common/gated.h" +#include "crimson/admin/admin_socket.h" +#include "crimson/common/simple_lru.h" +#include "crimson/common/shared_lru.h" +#include "crimson/mgr/client.h" +#include "crimson/net/Dispatcher.h" +#include "crimson/osd/osdmap_service.h" +#include "crimson/osd/state.h" +#include "crimson/osd/shard_services.h" +#include "crimson/osd/osdmap_gate.h" +#include "crimson/osd/pg_map.h" +#include "crimson/osd/osd_operations/peering_event.h" + +#include "messages/MOSDOp.h" +#include "osd/PeeringState.h" +#include "osd/osd_types.h" +#include "osd/osd_perf_counters.h" +#include "osd/PGPeeringEvent.h" + +class MCommand; +class MOSDMap; +class MOSDRepOpReply; +class MOSDRepOp; +class MOSDScrub2; +class OSDMap; +class OSDMeta; +class Heartbeat; + +namespace ceph::os { + class Transaction; +} + +namespace crimson::mon { + class Client; +} + +namespace crimson::net { + class Messenger; +} + +namespace crimson::os { + class FuturizedStore; +} + +namespace crimson::osd { +class PG; + +class OSD final : public crimson::net::Dispatcher, + private OSDMapService, + private crimson::common::AuthHandler, + private crimson::mgr::WithStats { + const int whoami; + const uint32_t nonce; + seastar::timer<seastar::lowres_clock> beacon_timer; + // talk with osd + crimson::net::MessengerRef cluster_msgr; + // talk with client/mon/mgr + crimson::net::MessengerRef public_msgr; + std::unique_ptr<crimson::mon::Client> monc; + std::unique_ptr<crimson::mgr::Client> mgrc; + + SharedLRU<epoch_t, OSDMap> osdmaps; + SimpleLRU<epoch_t, bufferlist, false> map_bl_cache; + cached_map_t osdmap; + // TODO: use a wrapper for ObjectStore + std::unique_ptr<crimson::os::FuturizedStore> store; + std::unique_ptr<OSDMeta> meta_coll; + + OSDState state; + + /// _first_ epoch we were marked up (after this process started) + epoch_t boot_epoch = 0; + /// _most_recent_ epoch we were marked up + epoch_t up_epoch = 0; + //< epoch we last did a bind to new ip:ports + epoch_t bind_epoch = 0; + //< since when there is no more pending pg creates from mon + epoch_t last_pg_create_epoch = 0; + + ceph::mono_time startup_time; + + OSDSuperblock superblock; + + // Dispatcher methods + std::optional<seastar::future<>> ms_dispatch(crimson::net::ConnectionRef, MessageRef) final; + void ms_handle_reset(crimson::net::ConnectionRef conn, bool is_replace) final; + void 
ms_handle_remote_reset(crimson::net::ConnectionRef conn) final; + + // mgr::WithStats methods + // pg statistics including osd ones + osd_stat_t osd_stat; + uint32_t osd_stat_seq = 0; + void update_stats(); + MessageRef get_stats() const final; + + // AuthHandler methods + void handle_authentication(const EntityName& name, + const AuthCapsInfo& caps) final; + + crimson::osd::ShardServices shard_services; + + std::unique_ptr<Heartbeat> heartbeat; + seastar::timer<seastar::lowres_clock> tick_timer; + + // admin-socket + seastar::lw_shared_ptr<crimson::admin::AdminSocket> asok; + +public: + OSD(int id, uint32_t nonce, + crimson::net::MessengerRef cluster_msgr, + crimson::net::MessengerRef client_msgr, + crimson::net::MessengerRef hb_front_msgr, + crimson::net::MessengerRef hb_back_msgr); + ~OSD() final; + + seastar::future<> mkfs(uuid_d osd_uuid, uuid_d cluster_fsid); + + seastar::future<> start(); + seastar::future<> stop(); + + void dump_status(Formatter*) const; + void dump_pg_state_history(Formatter*) const; + void print(std::ostream&) const; + + seastar::future<> send_incremental_map(crimson::net::ConnectionRef conn, + epoch_t first); + + /// @return the seq id of the pg stats being sent + uint64_t send_pg_stats(); + +private: + seastar::future<> start_boot(); + seastar::future<> _preboot(version_t oldest_osdmap, version_t newest_osdmap); + seastar::future<> _send_boot(); + seastar::future<> _add_me_to_crush(); + + seastar::future<Ref<PG>> make_pg(cached_map_t create_map, + spg_t pgid, + bool do_create); + seastar::future<Ref<PG>> load_pg(spg_t pgid); + seastar::future<> load_pgs(); + + // OSDMapService methods + epoch_t get_up_epoch() const final { + return up_epoch; + } + seastar::future<cached_map_t> get_map(epoch_t e) final; + cached_map_t get_map() const final; + seastar::future<std::unique_ptr<OSDMap>> load_map(epoch_t e); + seastar::future<bufferlist> load_map_bl(epoch_t e); + seastar::future<std::map<epoch_t, bufferlist>> + load_map_bls(epoch_t first, epoch_t last); + void store_map_bl(ceph::os::Transaction& t, + epoch_t e, bufferlist&& bl); + seastar::future<> store_maps(ceph::os::Transaction& t, + epoch_t start, Ref<MOSDMap> m); + seastar::future<> osdmap_subscribe(version_t epoch, bool force_request); + + void write_superblock(ceph::os::Transaction& t); + seastar::future<> read_superblock(); + + bool require_mon_peer(crimson::net::Connection *conn, Ref<Message> m); + + seastar::future<Ref<PG>> handle_pg_create_info( + std::unique_ptr<PGCreateInfo> info); + + seastar::future<> handle_osd_map(crimson::net::ConnectionRef conn, + Ref<MOSDMap> m); + seastar::future<> handle_osd_op(crimson::net::ConnectionRef conn, + Ref<MOSDOp> m); + seastar::future<> handle_rep_op(crimson::net::ConnectionRef conn, + Ref<MOSDRepOp> m); + seastar::future<> handle_rep_op_reply(crimson::net::ConnectionRef conn, + Ref<MOSDRepOpReply> m); + seastar::future<> handle_peering_op(crimson::net::ConnectionRef conn, + Ref<MOSDPeeringOp> m); + seastar::future<> handle_recovery_subreq(crimson::net::ConnectionRef conn, + Ref<MOSDFastDispatchOp> m); + seastar::future<> handle_scrub(crimson::net::ConnectionRef conn, + Ref<MOSDScrub2> m); + seastar::future<> handle_mark_me_down(crimson::net::ConnectionRef conn, + Ref<MOSDMarkMeDown> m); + + seastar::future<> committed_osd_maps(version_t first, + version_t last, + Ref<MOSDMap> m); + + void check_osdmap_features(); + + seastar::future<> handle_command(crimson::net::ConnectionRef conn, + Ref<MCommand> m); + seastar::future<> start_asok_admin(); + +public: + OSDMapGate 
osdmap_gate; + + ShardServices &get_shard_services() { + return shard_services; + } + + seastar::future<> consume_map(epoch_t epoch); + +private: + PGMap pg_map; + crimson::common::Gated gate; + + seastar::promise<> stop_acked; + void got_stop_ack() { + stop_acked.set_value(); + } + seastar::future<> prepare_to_stop(); +public: + blocking_future<Ref<PG>> get_or_create_pg( + spg_t pgid, + epoch_t epoch, + std::unique_ptr<PGCreateInfo> info); + blocking_future<Ref<PG>> wait_for_pg( + spg_t pgid); + Ref<PG> get_pg(spg_t pgid); + + bool should_restart() const; + seastar::future<> restart(); + seastar::future<> shutdown(); + + seastar::future<> send_beacon(); + void update_heartbeat_peers(); + + friend class PGAdvanceMap; +}; + +inline std::ostream& operator<<(std::ostream& out, const OSD& osd) { + osd.print(out); + return out; +} + +} diff --git a/src/crimson/osd/osd_connection_priv.h b/src/crimson/osd/osd_connection_priv.h new file mode 100644 index 000000000..a265bb432 --- /dev/null +++ b/src/crimson/osd/osd_connection_priv.h @@ -0,0 +1,27 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "crimson/net/Connection.h" +#include "crimson/osd/osd_operation.h" +#include "crimson/osd/osd_operations/client_request.h" +#include "crimson/osd/osd_operations/peering_event.h" +#include "crimson/osd/osd_operations/replicated_request.h" + +namespace crimson::osd { + +struct OSDConnectionPriv : public crimson::net::Connection::user_private_t { + ClientRequest::ConnectionPipeline client_request_conn_pipeline; + RemotePeeringEvent::ConnectionPipeline peering_request_conn_pipeline; + RepRequest::ConnectionPipeline replicated_request_conn_pipeline; +}; + +static OSDConnectionPriv &get_osd_priv(crimson::net::Connection *conn) { + if (!conn->has_user_private()) { + conn->set_user_private(std::make_unique<OSDConnectionPriv>()); + } + return static_cast<OSDConnectionPriv&>(conn->get_user_private()); +} + +} diff --git a/src/crimson/osd/osd_meta.cc b/src/crimson/osd/osd_meta.cc new file mode 100644 index 000000000..9b9215f5b --- /dev/null +++ b/src/crimson/osd/osd_meta.cc @@ -0,0 +1,99 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "osd_meta.h" + +#include <fmt/format.h> +#include <fmt/ostream.h> + +#include "crimson/os/futurized_collection.h" +#include "crimson/os/futurized_store.h" +#include "os/Transaction.h" + +using read_errorator = crimson::os::FuturizedStore::read_errorator; + +void OSDMeta::create(ceph::os::Transaction& t) +{ + t.create_collection(coll->get_cid(), 0); +} + +void OSDMeta::store_map(ceph::os::Transaction& t, + epoch_t e, const bufferlist& m) +{ + t.write(coll->get_cid(), osdmap_oid(e), 0, m.length(), m); +} + +seastar::future<bufferlist> OSDMeta::load_map(epoch_t e) +{ + return store->read(coll, + osdmap_oid(e), 0, 0, + CEPH_OSD_OP_FLAG_FADVISE_WILLNEED).handle_error( + read_errorator::all_same_way([e] { + throw std::runtime_error(fmt::format("read gave enoent on {}", + osdmap_oid(e))); + })); +} + +void OSDMeta::store_superblock(ceph::os::Transaction& t, + const OSDSuperblock& superblock) +{ + bufferlist bl; + encode(superblock, bl); + t.write(coll->get_cid(), superblock_oid(), 0, bl.length(), bl); +} + +seastar::future<OSDSuperblock> OSDMeta::load_superblock() +{ + return store->read(coll, superblock_oid(), 0, 0).safe_then( + [] (bufferlist&& bl) { + auto p = bl.cbegin(); + OSDSuperblock superblock; + decode(superblock, p); + return 
seastar::make_ready_future<OSDSuperblock>(std::move(superblock)); + }, read_errorator::all_same_way([] { + throw std::runtime_error(fmt::format("read gave enoent on {}", + superblock_oid())); + })); +} + +seastar::future<std::tuple<pg_pool_t, + std::string, + OSDMeta::ec_profile_t>> +OSDMeta::load_final_pool_info(int64_t pool) { + return store->read(coll, final_pool_info_oid(pool), + 0, 0).safe_then([] (bufferlist&& bl) { + auto p = bl.cbegin(); + pg_pool_t pi; + string name; + ec_profile_t ec_profile; + decode(pi, p); + decode(name, p); + decode(ec_profile, p); + return seastar::make_ready_future<std::tuple<pg_pool_t, + string, + ec_profile_t>>( + std::make_tuple(std::move(pi), + std::move(name), + std::move(ec_profile))); + },read_errorator::all_same_way([pool] { + throw std::runtime_error(fmt::format("read gave enoent on {}", + final_pool_info_oid(pool))); + })); +} + +ghobject_t OSDMeta::osdmap_oid(epoch_t epoch) +{ + string name = fmt::format("osdmap.{}", epoch); + return ghobject_t(hobject_t(sobject_t(object_t(name), 0))); +} + +ghobject_t OSDMeta::final_pool_info_oid(int64_t pool) +{ + string name = fmt::format("final_pool_{}", pool); + return ghobject_t(hobject_t(sobject_t(object_t(name), CEPH_NOSNAP))); +} + +ghobject_t OSDMeta::superblock_oid() +{ + return ghobject_t(hobject_t(sobject_t(object_t("osd_superblock"), 0))); +} diff --git a/src/crimson/osd/osd_meta.h b/src/crimson/osd/osd_meta.h new file mode 100644 index 000000000..841572087 --- /dev/null +++ b/src/crimson/osd/osd_meta.h @@ -0,0 +1,56 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <map> +#include <string> +#include <seastar/core/future.hh> +#include "osd/osd_types.h" +#include "crimson/os/futurized_collection.h" + +namespace ceph::os { + class Transaction; +} + +namespace crimson::os { + class FuturizedCollection; + class FuturizedStore; +} + +/// metadata shared across PGs, or put in another way, +/// metadata not specific to certain PGs. 
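///
/// A usage sketch (call shapes taken from OSD::handle_osd_map and the helpers
/// in osd_meta.cc above; the surrounding transaction plumbing is elided):
///
///   meta_coll->store_map(t, e, bl);               // writes object "osdmap.<e>"
///   meta_coll->store_superblock(t, superblock);   // writes "osd_superblock"
///   store->do_transaction(meta_coll->collection(), std::move(t));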
+class OSDMeta { + template<typename T> using Ref = boost::intrusive_ptr<T>; + + crimson::os::FuturizedStore* store; + Ref<crimson::os::FuturizedCollection> coll; + +public: + OSDMeta(Ref<crimson::os::FuturizedCollection> coll, + crimson::os::FuturizedStore* store) + : store{store}, coll{coll} + {} + + auto collection() { + return coll; + } + void create(ceph::os::Transaction& t); + + void store_map(ceph::os::Transaction& t, + epoch_t e, const bufferlist& m); + seastar::future<bufferlist> load_map(epoch_t e); + + void store_superblock(ceph::os::Transaction& t, + const OSDSuperblock& sb); + seastar::future<OSDSuperblock> load_superblock(); + + using ec_profile_t = std::map<std::string, std::string>; + seastar::future<std::tuple<pg_pool_t, + std::string, + ec_profile_t>> load_final_pool_info(int64_t pool); +private: + static ghobject_t osdmap_oid(epoch_t epoch); + static ghobject_t final_pool_info_oid(int64_t pool); + static ghobject_t superblock_oid(); +}; diff --git a/src/crimson/osd/osd_operation.cc b/src/crimson/osd/osd_operation.cc new file mode 100644 index 000000000..b5f3c3cbb --- /dev/null +++ b/src/crimson/osd/osd_operation.cc @@ -0,0 +1,159 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "osd_operation.h" +#include "common/Formatter.h" + +namespace crimson::osd { + +void Operation::dump(ceph::Formatter* f) +{ + f->open_object_section("operation"); + f->dump_string("type", get_type_name()); + f->dump_unsigned("id", id); + { + f->open_object_section("detail"); + dump_detail(f); + f->close_section(); + } + f->open_array_section("blockers"); + for (auto &blocker : blockers) { + blocker->dump(f); + } + f->close_section(); + f->close_section(); +} + +void Operation::dump_brief(ceph::Formatter* f) +{ + f->open_object_section("operation"); + f->dump_string("type", get_type_name()); + f->dump_unsigned("id", id); + f->close_section(); +} + +std::ostream &operator<<(std::ostream &lhs, const Operation &rhs) { + lhs << rhs.get_type_name() << "(id=" << rhs.get_id() << ", detail="; + rhs.print(lhs); + lhs << ")"; + return lhs; +} + +void Blocker::dump(ceph::Formatter* f) const +{ + f->open_object_section("blocker"); + f->dump_string("op_type", get_type_name()); + { + f->open_object_section("detail"); + dump_detail(f); + f->close_section(); + } + f->close_section(); +} + +void AggregateBlocker::dump_detail(ceph::Formatter *f) const +{ + f->open_array_section("parent_blockers"); + for (auto b : parent_blockers) { + f->open_object_section("parent_blocker"); + b->dump(f); + f->close_section(); + } + f->close_section(); +} + +OperationThrottler::OperationThrottler(ConfigProxy &conf) + : scheduler(crimson::osd::scheduler::make_scheduler(conf)) +{ + conf.add_observer(this); + update_from_config(conf); +} + +void OperationThrottler::wake() +{ + while ((!max_in_progress || in_progress < max_in_progress) && + !scheduler->empty()) { + auto item = scheduler->dequeue(); + item.wake.set_value(); + ++in_progress; + --pending; + } +} + +void OperationThrottler::release_throttle() +{ + ceph_assert(in_progress > 0); + --in_progress; + wake(); +} + +blocking_future<> OperationThrottler::acquire_throttle( + crimson::osd::scheduler::params_t params) +{ + crimson::osd::scheduler::item_t item{params, seastar::promise<>()}; + auto fut = item.wake.get_future(); + scheduler->enqueue(std::move(item)); + return make_blocking_future(std::move(fut)); +} + +void OperationThrottler::dump_detail(Formatter *f) const +{ + f->dump_unsigned("max_in_progress", 
max_in_progress); + f->dump_unsigned("in_progress", in_progress); + f->open_object_section("scheduler"); + { + scheduler->dump(*f); + } + f->close_section(); +} + +void OperationThrottler::update_from_config(const ConfigProxy &conf) +{ + max_in_progress = conf.get_val<uint64_t>("crimson_osd_scheduler_concurrency"); + wake(); +} + +const char** OperationThrottler::get_tracked_conf_keys() const +{ + static const char* KEYS[] = { + "crimson_osd_scheduler_concurrency", + NULL + }; + return KEYS; +} + +void OperationThrottler::handle_conf_change( + const ConfigProxy& conf, + const std::set<std::string> &changed) +{ + update_from_config(conf); +} + + +void OrderedPipelinePhase::Handle::exit() +{ + if (phase) { + phase->mutex.unlock(); + phase = nullptr; + } +} + +blocking_future<> OrderedPipelinePhase::Handle::enter( + OrderedPipelinePhase &new_phase) +{ + auto fut = new_phase.mutex.lock(); + exit(); + phase = &new_phase; + return new_phase.make_blocking_future(std::move(fut)); +} + +OrderedPipelinePhase::Handle::~Handle() +{ + exit(); +} + +void OrderedPipelinePhase::dump_detail(ceph::Formatter* f) const +{ +} + +} diff --git a/src/crimson/osd/osd_operation.h b/src/crimson/osd/osd_operation.h new file mode 100644 index 000000000..5178749b0 --- /dev/null +++ b/src/crimson/osd/osd_operation.h @@ -0,0 +1,427 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <algorithm> +#include <array> +#include <set> +#include <vector> +#include <boost/intrusive/list.hpp> +#include <boost/intrusive_ptr.hpp> +#include <boost/smart_ptr/intrusive_ref_counter.hpp> +#include <seastar/core/shared_mutex.hh> +#include <seastar/core/future.hh> +#include <seastar/core/timer.hh> +#include <seastar/core/lowres_clock.hh> + +#include "include/ceph_assert.h" +#include "crimson/osd/scheduler/scheduler.h" + +namespace ceph { + class Formatter; +} + +namespace crimson::osd { + +enum class OperationTypeCode { + client_request = 0, + peering_event, + compound_peering_request, + pg_advance_map, + pg_creation, + replicated_request, + background_recovery, + background_recovery_sub, + last_op +}; + +static constexpr const char* const OP_NAMES[] = { + "client_request", + "peering_event", + "compound_peering_request", + "pg_advance_map", + "pg_creation", + "replicated_request", + "background_recovery", + "background_recovery_sub", +}; + +// prevent the addition of OperationTypeCode-s with no matching OP_NAMES entry: +static_assert( + (sizeof(OP_NAMES)/sizeof(OP_NAMES[0])) == + static_cast<int>(OperationTypeCode::last_op)); + +class OperationRegistry; + +using registry_hook_t = boost::intrusive::list_member_hook< + boost::intrusive::link_mode<boost::intrusive::auto_unlink>>; + +class Operation; +class Blocker; + +/** + * Provides an abstraction for registering and unregistering a blocker + * for the duration of a future becoming available. 
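 *
 * A usage sketch (the blocker and op objects here are hypothetical; the
 * helpers are the ones defined below):
 *
 *   blocking_future<int> bf =
 *     some_blocker.make_blocking_future(seastar::make_ready_future<int>(42));
 *   // the owning Operation consumes it, recording some_blocker as the reason
 *   // it may stall until the wrapped future resolves:
 *   seastar::future<int> f = op->with_blocking_future(std::move(bf));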
+ */ +template <typename Fut> +class blocking_future_detail { + friend class Operation; + friend class Blocker; + Blocker *blocker; + Fut fut; + blocking_future_detail(Blocker *b, Fut &&f) + : blocker(b), fut(std::move(f)) {} + + template <typename V, typename U> + friend blocking_future_detail<seastar::future<V>> make_ready_blocking_future(U&& args); + template <typename V, typename Exception> + friend blocking_future_detail<seastar::future<V>> + make_exception_blocking_future(Exception&& e); + + template <typename U> + friend blocking_future_detail<seastar::future<>> join_blocking_futures(U &&u); + + template <typename U> + friend class blocking_future_detail; + +public: + template <typename F> + auto then(F &&f) && { + using result = decltype(std::declval<Fut>().then(f)); + return blocking_future_detail<seastar::futurize_t<result>>( + blocker, + std::move(fut).then(std::forward<F>(f))); + } +}; + +template <typename T=void> +using blocking_future = blocking_future_detail<seastar::future<T>>; + +template <typename V, typename U> +blocking_future_detail<seastar::future<V>> make_ready_blocking_future(U&& args) { + return blocking_future<V>( + nullptr, + seastar::make_ready_future<V>(std::forward<U>(args))); +} + +template <typename V, typename Exception> +blocking_future_detail<seastar::future<V>> +make_exception_blocking_future(Exception&& e) { + return blocking_future<V>( + nullptr, + seastar::make_exception_future<V>(e)); +} + +/** + * Provides an interface for dumping diagnostic information about + * why a particular op is not making progress. + */ +class Blocker { +public: + template <typename T> + blocking_future<T> make_blocking_future(seastar::future<T> &&f) { + return blocking_future<T>(this, std::move(f)); + } + void dump(ceph::Formatter *f) const; + virtual ~Blocker() = default; + +private: + virtual void dump_detail(ceph::Formatter *f) const = 0; + virtual const char *get_type_name() const = 0; +}; + +template <typename T> +class BlockerT : public Blocker { +public: + virtual ~BlockerT() = default; +private: + const char *get_type_name() const final { + return T::type_name; + } +}; + +class AggregateBlocker : public BlockerT<AggregateBlocker> { + vector<Blocker*> parent_blockers; +public: + AggregateBlocker(vector<Blocker*> &&parent_blockers) + : parent_blockers(std::move(parent_blockers)) {} + static constexpr const char *type_name = "AggregateBlocker"; +private: + void dump_detail(ceph::Formatter *f) const final; +}; + +template <typename T> +blocking_future<> join_blocking_futures(T &&t) { + vector<Blocker*> blockers; + blockers.reserve(t.size()); + for (auto &&bf: t) { + blockers.push_back(bf.blocker); + bf.blocker = nullptr; + } + auto agg = std::make_unique<AggregateBlocker>(std::move(blockers)); + return agg->make_blocking_future( + seastar::parallel_for_each( + std::forward<T>(t), + [](auto &&bf) { + return std::move(bf.fut); + }).then([agg=std::move(agg)] { + return seastar::make_ready_future<>(); + })); +} + + +/** + * Common base for all crimson-osd operations. Mainly provides + * an interface for registering ops in flight and dumping + * diagnostic information. 
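 *
 * A concrete op is expected to look roughly like the ones later in this
 * patch (ClientRequest, PeeringEvent, ...): it derives from OperationT<T>,
 * declares its OperationTypeCode, and is created through the registry so it
 * is tracked while in flight, e.g.:
 *
 *   auto op = registry.create_operation<ClientRequest>(osd, conn, std::move(m));
 *   return op->start();   // in practice via ShardServices::start_operation<T>()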
+ */ +class Operation : public boost::intrusive_ref_counter< + Operation, boost::thread_unsafe_counter> { + public: + uint64_t get_id() const { + return id; + } + + virtual OperationTypeCode get_type() const = 0; + virtual const char *get_type_name() const = 0; + virtual void print(std::ostream &) const = 0; + + template <typename T> + seastar::future<T> with_blocking_future(blocking_future<T> &&f) { + if (f.fut.available()) { + return std::move(f.fut); + } + assert(f.blocker); + add_blocker(f.blocker); + return std::move(f.fut).then_wrapped([this, blocker=f.blocker](auto &&arg) { + clear_blocker(blocker); + return std::move(arg); + }); + } + + void dump(ceph::Formatter *f); + void dump_brief(ceph::Formatter *f); + virtual ~Operation() = default; + + private: + virtual void dump_detail(ceph::Formatter *f) const = 0; + + private: + registry_hook_t registry_hook; + + std::vector<Blocker*> blockers; + uint64_t id = 0; + void set_id(uint64_t in_id) { + id = in_id; + } + + void add_blocker(Blocker *b) { + blockers.push_back(b); + } + + void clear_blocker(Blocker *b) { + auto iter = std::find(blockers.begin(), blockers.end(), b); + if (iter != blockers.end()) { + blockers.erase(iter); + } + } + + friend class OperationRegistry; +}; +using OperationRef = boost::intrusive_ptr<Operation>; + +std::ostream &operator<<(std::ostream &, const Operation &op); + +template <typename T> +class OperationT : public Operation { +public: + static constexpr const char *type_name = OP_NAMES[static_cast<int>(T::type)]; + using IRef = boost::intrusive_ptr<T>; + + OperationTypeCode get_type() const final { + return T::type; + } + + const char *get_type_name() const final { + return T::type_name; + } + + virtual ~OperationT() = default; + +private: + virtual void dump_detail(ceph::Formatter *f) const = 0; +}; + +/** + * Maintains a set of lists of all active ops. + */ +class OperationRegistry { + friend class Operation; + using op_list_member_option = boost::intrusive::member_hook< + Operation, + registry_hook_t, + &Operation::registry_hook + >; + using op_list = boost::intrusive::list< + Operation, + op_list_member_option, + boost::intrusive::constant_time_size<false>>; + + std::array< + op_list, + static_cast<int>(OperationTypeCode::last_op) + > registries; + + std::array< + uint64_t, + static_cast<int>(OperationTypeCode::last_op) + > op_id_counters = {}; + + seastar::timer<seastar::lowres_clock> shutdown_timer; + seastar::promise<> shutdown; +public: + template <typename T, typename... Args> + typename T::IRef create_operation(Args&&... args) { + typename T::IRef op = new T(std::forward<Args>(args)...); + registries[static_cast<int>(T::type)].push_back(*op); + op->set_id(op_id_counters[static_cast<int>(T::type)]++); + return op; + } + + seastar::future<> stop() { + shutdown_timer.set_callback([this] { + if (std::all_of(registries.begin(), + registries.end(), + [](auto& opl) { + return opl.empty(); + })) { + shutdown.set_value(); + shutdown_timer.cancel(); + } + }); + shutdown_timer.arm_periodic(std::chrono::milliseconds(100/*TODO: use option instead*/)); + return shutdown.get_future(); + } +}; + +/** + * Throttles set of currently running operations + * + * Very primitive currently, assumes all ops are equally + * expensive and simply limits the number that can be + * concurrently active. 
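 *
 * Used as in BackgroundRecovery::start() later in this patch: the callback
 * is retried while it resolves to true, and each round holds one unit of
 * the throttle (do_some_recovery is a hypothetical name):
 *
 *   return throttler.with_throttle_while(
 *     op, params, [this] { return do_some_recovery(); });  // seastar::future<bool>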
+ */ +class OperationThrottler : public Blocker, + private md_config_obs_t { +public: + OperationThrottler(ConfigProxy &conf); + + const char** get_tracked_conf_keys() const final; + void handle_conf_change(const ConfigProxy& conf, + const std::set<std::string> &changed) final; + void update_from_config(const ConfigProxy &conf); + + template <typename F> + auto with_throttle( + OperationRef op, + crimson::osd::scheduler::params_t params, + F &&f) { + if (!max_in_progress) return f(); + auto fut = acquire_throttle(params); + return op->with_blocking_future(std::move(fut)) + .then(std::forward<F>(f)) + .then([this](auto x) { + release_throttle(); + return x; + }); + } + + template <typename F> + seastar::future<> with_throttle_while( + OperationRef op, + crimson::osd::scheduler::params_t params, + F &&f) { + return with_throttle(op, params, f).then([this, params, op, f](bool cont) { + if (cont) + return with_throttle_while(op, params, f); + else + return seastar::make_ready_future<>(); + }); + } + +private: + void dump_detail(Formatter *f) const final; + const char *get_type_name() const final { + return "OperationThrottler"; + } + +private: + crimson::osd::scheduler::SchedulerRef scheduler; + + uint64_t max_in_progress = 0; + uint64_t in_progress = 0; + + uint64_t pending = 0; + + void wake(); + + blocking_future<> acquire_throttle( + crimson::osd::scheduler::params_t params); + + void release_throttle(); +}; + +/** + * Ensures that at most one op may consider itself in the phase at a time. + * Ops will see enter() unblock in the order in which they tried to enter + * the phase. entering (though not necessarily waiting for the future to + * resolve) a new phase prior to exiting the previous one will ensure that + * the op ordering is preserved. + */ +class OrderedPipelinePhase : public Blocker { +private: + void dump_detail(ceph::Formatter *f) const final; + const char *get_type_name() const final { + return name; + } + +public: + /** + * Used to encapsulate pipeline residency state. + */ + class Handle { + OrderedPipelinePhase *phase = nullptr; + + public: + Handle() = default; + + Handle(const Handle&) = delete; + Handle(Handle&&) = delete; + Handle &operator=(const Handle&) = delete; + Handle &operator=(Handle&&) = delete; + + /** + * Returns a future which unblocks when the handle has entered the passed + * OrderedPipelinePhase. If already in a phase, enter will also release + * that phase after placing itself in the queue for the next one to preserve + * ordering. + */ + blocking_future<> enter(OrderedPipelinePhase &phase); + + /** + * Releases the current phase if there is one. Called in ~Handle(). 
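     *
     * Note that enter() releases the previous phase internally after queueing
     * for the next one, which is what preserves op ordering across a pipeline;
     * a compressed example of the pattern, as in ClientRequest::start() later
     * in this patch:
     *
     *   with_blocking_future(handle.enter(cp().await_map))
     *     .then([this] { return with_blocking_future(handle.enter(cp().get_pg)); });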
+ */ + void exit(); + + ~Handle(); + }; + + OrderedPipelinePhase(const char *name) : name(name) {} + +private: + const char * name; + seastar::shared_mutex mutex; +}; + +} diff --git a/src/crimson/osd/osd_operations/background_recovery.cc b/src/crimson/osd/osd_operations/background_recovery.cc new file mode 100644 index 000000000..126e0e902 --- /dev/null +++ b/src/crimson/osd/osd_operations/background_recovery.cc @@ -0,0 +1,140 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <seastar/core/future.hh> + +#include "messages/MOSDOp.h" + +#include "crimson/osd/pg.h" +#include "crimson/osd/shard_services.h" +#include "common/Formatter.h" +#include "crimson/osd/osd_operations/background_recovery.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_osd); + } +} + +namespace crimson::osd { + +BackgroundRecovery::BackgroundRecovery( + Ref<PG> pg, + ShardServices &ss, + epoch_t epoch_started, + crimson::osd::scheduler::scheduler_class_t scheduler_class) + : pg(pg), + epoch_started(epoch_started), + ss(ss), + scheduler_class(scheduler_class) +{} + +void BackgroundRecovery::print(std::ostream &lhs) const +{ + lhs << "BackgroundRecovery(" << pg->get_pgid() << ")"; +} + +void BackgroundRecovery::dump_detail(Formatter *f) const +{ + f->dump_stream("pgid") << pg->get_pgid(); + f->open_object_section("recovery_detail"); + { + // TODO pg->dump_recovery_state(f); + } + f->close_section(); +} + +seastar::future<> BackgroundRecovery::start() +{ + logger().debug("{}: start", *this); + + IRef ref = this; + return ss.throttler.with_throttle_while( + this, get_scheduler_params(), [this] { + return do_recovery(); + }).handle_exception_type([ref, this](const std::system_error& err) { + if (err.code() == std::make_error_code(std::errc::interrupted)) { + logger().debug("{} recovery interruped: {}", *pg, err.what()); + return seastar::now(); + } + return seastar::make_exception_future<>(err); + }); +} + +seastar::future<bool> UrgentRecovery::do_recovery() +{ + if (!pg->has_reset_since(epoch_started)) { + return with_blocking_future( + pg->get_recovery_handler()->recover_missing(soid, need) + ).then([] { + return seastar::make_ready_future<bool>(false); + }); + } + return seastar::make_ready_future<bool>(false); +} + +void UrgentRecovery::print(std::ostream &lhs) const +{ + lhs << "UrgentRecovery(" << pg->get_pgid() << ", " + << soid << ", v" << need << ")"; +} + +void UrgentRecovery::dump_detail(Formatter *f) const +{ + f->dump_stream("pgid") << pg->get_pgid(); + f->open_object_section("recovery_detail"); + { + f->dump_stream("oid") << soid; + f->dump_stream("version") << need; + } + f->close_section(); +} + +PglogBasedRecovery::PglogBasedRecovery( + Ref<PG> pg, + ShardServices &ss, + const epoch_t epoch_started) + : BackgroundRecovery( + std::move(pg), + ss, + epoch_started, + crimson::osd::scheduler::scheduler_class_t::background_recovery) +{} + +seastar::future<bool> PglogBasedRecovery::do_recovery() +{ + if (pg->has_reset_since(epoch_started)) + return seastar::make_ready_future<bool>(false); + return with_blocking_future( + pg->get_recovery_handler()->start_recovery_ops( + crimson::common::local_conf()->osd_recovery_max_single_start)); +} + +BackfillRecovery::BackfillRecoveryPipeline &BackfillRecovery::bp(PG &pg) +{ + return pg.backfill_pipeline; +} + +seastar::future<bool> BackfillRecovery::do_recovery() +{ + logger().debug("{}", __func__); + + if (pg->has_reset_since(epoch_started)) { + logger().debug("{}: pg 
got reset since epoch_started={}", + __func__, epoch_started); + return seastar::make_ready_future<bool>(false); + } + // TODO: limits + return with_blocking_future( + // process_event() of our boost::statechart machine is non-reentrant. + // with the backfill_pipeline we protect it from a second entry from + // the implementation of BackfillListener. + // additionally, this stage serves to synchronize with PeeringEvent. + handle.enter(bp(*pg).process) + ).then([this] { + pg->get_recovery_handler()->dispatch_backfill_event(std::move(evt)); + return seastar::make_ready_future<bool>(false); + }); +} + +} // namespace crimson::osd diff --git a/src/crimson/osd/osd_operations/background_recovery.h b/src/crimson/osd/osd_operations/background_recovery.h new file mode 100644 index 000000000..37e46c588 --- /dev/null +++ b/src/crimson/osd/osd_operations/background_recovery.h @@ -0,0 +1,126 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <boost/statechart/event_base.hpp> + +#include "crimson/net/Connection.h" +#include "crimson/osd/osd_operation.h" +#include "crimson/common/type_helpers.h" + +#include "messages/MOSDOp.h" + +namespace crimson::osd { +class PG; +class ShardServices; + +class BackgroundRecovery : public OperationT<BackgroundRecovery> { +public: + static constexpr OperationTypeCode type = OperationTypeCode::background_recovery; + + BackgroundRecovery( + Ref<PG> pg, + ShardServices &ss, + epoch_t epoch_started, + crimson::osd::scheduler::scheduler_class_t scheduler_class); + + virtual void print(std::ostream &) const; + seastar::future<> start(); + +protected: + Ref<PG> pg; + const epoch_t epoch_started; + +private: + virtual void dump_detail(Formatter *f) const; + crimson::osd::scheduler::params_t get_scheduler_params() const { + return { + 1, // cost + 0, // owner + scheduler_class + }; + } + virtual seastar::future<bool> do_recovery() = 0; + ShardServices &ss; + const crimson::osd::scheduler::scheduler_class_t scheduler_class; +}; + +/// represent a recovery initiated for serving a client request +/// +/// unlike @c PglogBasedRecovery and @c BackfillRecovery, +/// @c UrgentRecovery is not throttled by the scheduler. and it +/// utilizes @c RecoveryBackend directly to recover the unreadable +/// object. 
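///
/// A sketch of how a blocked client op starts one; the call shape mirrors
/// ClientRequest::process_op() later in this patch:
///
///   auto [op, fut] = shard_services.start_operation<UrgentRecovery>(
///     soid, ver, pgref, shard_services, pg.get_osdmap_epoch());
///   return std::move(fut);   // resolves once the recovery op has finished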
+class UrgentRecovery final : public BackgroundRecovery { +public: + UrgentRecovery( + const hobject_t& soid, + const eversion_t& need, + Ref<PG> pg, + ShardServices& ss, + epoch_t epoch_started) + : BackgroundRecovery{pg, ss, epoch_started, + crimson::osd::scheduler::scheduler_class_t::immediate}, + soid{soid}, need(need) {} + void print(std::ostream&) const final; + +private: + void dump_detail(Formatter* f) const final; + seastar::future<bool> do_recovery() override; + const hobject_t soid; + const eversion_t need; +}; + +class PglogBasedRecovery final : public BackgroundRecovery { +public: + PglogBasedRecovery( + Ref<PG> pg, + ShardServices &ss, + epoch_t epoch_started); + +private: + seastar::future<bool> do_recovery() override; +}; + +class BackfillRecovery final : public BackgroundRecovery { +public: + class BackfillRecoveryPipeline { + OrderedPipelinePhase process = { + "BackfillRecovery::PGPipeline::process" + }; + friend class BackfillRecovery; + friend class PeeringEvent; + }; + + template <class EventT> + BackfillRecovery( + Ref<PG> pg, + ShardServices &ss, + epoch_t epoch_started, + const EventT& evt); + + static BackfillRecoveryPipeline &bp(PG &pg); + +private: + boost::intrusive_ptr<const boost::statechart::event_base> evt; + OrderedPipelinePhase::Handle handle; + seastar::future<bool> do_recovery() override; +}; + +template <class EventT> +BackfillRecovery::BackfillRecovery( + Ref<PG> pg, + ShardServices &ss, + const epoch_t epoch_started, + const EventT& evt) + : BackgroundRecovery( + std::move(pg), + ss, + epoch_started, + crimson::osd::scheduler::scheduler_class_t::background_best_effort), + evt(evt.intrusive_from_this()) +{} + + +} diff --git a/src/crimson/osd/osd_operations/client_request.cc b/src/crimson/osd/osd_operations/client_request.cc new file mode 100644 index 000000000..87b8fc788 --- /dev/null +++ b/src/crimson/osd/osd_operations/client_request.cc @@ -0,0 +1,201 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <seastar/core/future.hh> + +#include "messages/MOSDOp.h" +#include "messages/MOSDOpReply.h" + +#include "crimson/common/exception.h" +#include "crimson/osd/pg.h" +#include "crimson/osd/osd.h" +#include "common/Formatter.h" +#include "crimson/osd/osd_operations/client_request.h" +#include "crimson/osd/osd_connection_priv.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_osd); + } +} + +namespace crimson::osd { + +ClientRequest::ClientRequest( + OSD &osd, crimson::net::ConnectionRef conn, Ref<MOSDOp> &&m) + : osd(osd), conn(conn), m(m) +{} + +void ClientRequest::print(std::ostream &lhs) const +{ + lhs << *m; +} + +void ClientRequest::dump_detail(Formatter *f) const +{ +} + +ClientRequest::ConnectionPipeline &ClientRequest::cp() +{ + return get_osd_priv(conn.get()).client_request_conn_pipeline; +} + +ClientRequest::PGPipeline &ClientRequest::pp(PG &pg) +{ + return pg.client_request_pg_pipeline; +} + +bool ClientRequest::is_pg_op() const +{ + return std::any_of( + begin(m->ops), end(m->ops), + [](auto& op) { return ceph_osd_op_type_pg(op.op.op); }); +} + +seastar::future<> ClientRequest::start() +{ + logger().debug("{}: start", *this); + + IRef opref = this; + return crimson::common::handle_system_shutdown( + [this, opref=std::move(opref)]() mutable { + return seastar::repeat([this, opref]() mutable { + return with_blocking_future(handle.enter(cp().await_map)) + .then([this]() { + return 
with_blocking_future(osd.osdmap_gate.wait_for_map(m->get_min_epoch())); + }).then([this](epoch_t epoch) { + return with_blocking_future(handle.enter(cp().get_pg)); + }).then([this] { + return with_blocking_future(osd.wait_for_pg(m->get_spg())); + }).then([this, opref](Ref<PG> pgref) { + PG &pg = *pgref; + if (pg.can_discard_op(*m)) { + return osd.send_incremental_map(conn, m->get_map_epoch()); + } + return with_blocking_future( + handle.enter(pp(pg).await_map) + ).then([this, &pg]() mutable { + return with_blocking_future( + pg.osdmap_gate.wait_for_map(m->get_min_epoch())); + }).then([this, &pg](auto map) mutable { + return with_blocking_future( + handle.enter(pp(pg).wait_for_active)); + }).then([this, &pg]() mutable { + return with_blocking_future(pg.wait_for_active_blocker.wait()); + }).then([this, pgref=std::move(pgref)]() mutable { + if (m->finish_decode()) { + m->clear_payload(); + } + if (is_pg_op()) { + return process_pg_op(pgref); + } else { + return process_op(pgref); + } + }); + }).then([] { + return seastar::stop_iteration::yes; + }).handle_exception_type([](crimson::common::actingset_changed& e) { + if (e.is_primary()) { + logger().debug("operation restart, acting set changed"); + return seastar::stop_iteration::no; + } else { + logger().debug("operation abort, up primary changed"); + return seastar::stop_iteration::yes; + } + }); + }); + }); +} + +seastar::future<> ClientRequest::process_pg_op( + Ref<PG> &pg) +{ + return pg->do_pg_ops(m) + .then([this, pg=std::move(pg)](Ref<MOSDOpReply> reply) { + return conn->send(reply); + }); +} + +seastar::future<> ClientRequest::process_op( + Ref<PG> &pgref) +{ + PG& pg = *pgref; + return with_blocking_future( + handle.enter(pp(pg).recover_missing) + ).then([this, &pg, pgref] { + eversion_t ver; + const hobject_t& soid = m->get_hobj(); + logger().debug("{} check for recovery, {}", *this, soid); + if (pg.is_unreadable_object(soid, &ver) || + pg.is_degraded_or_backfilling_object(soid)) { + logger().debug("{} need to wait for recovery, {}", *this, soid); + if (pg.get_recovery_backend()->is_recovering(soid)) { + return pg.get_recovery_backend()->get_recovering(soid).wait_for_recovered(); + } else { + auto [op, fut] = osd.get_shard_services().start_operation<UrgentRecovery>( + soid, ver, pgref, osd.get_shard_services(), pg.get_osdmap_epoch()); + return std::move(fut); + } + } + return seastar::now(); + }).then([this, &pg] { + return with_blocking_future(handle.enter(pp(pg).get_obc)); + }).then([this, &pg]() -> PG::load_obc_ertr::future<> { + op_info.set_from_op(&*m, *pg.get_osdmap()); + return pg.with_locked_obc(m, op_info, this, [this, &pg](auto obc) { + return with_blocking_future( + handle.enter(pp(pg).process) + ).then([this, &pg, obc] { + if (!pg.is_primary()) { + // primary can handle both normal ops and balanced reads + if (is_misdirected(pg)) { + logger().trace("process_op: dropping misdirected op"); + return seastar::make_ready_future<Ref<MOSDOpReply>>(); + } else if (const hobject_t& hoid = m->get_hobj(); + !pg.get_peering_state().can_serve_replica_read(hoid)) { + auto reply = make_message<MOSDOpReply>( + m.get(), -EAGAIN, pg.get_osdmap_epoch(), + m->get_flags() & (CEPH_OSD_FLAG_ACK|CEPH_OSD_FLAG_ONDISK), + !m->has_flag(CEPH_OSD_FLAG_RETURNVEC)); + return seastar::make_ready_future<Ref<MOSDOpReply>>(std::move(reply)); + } + } + return pg.do_osd_ops(m, obc, op_info); + }).then([this](Ref<MOSDOpReply> reply) { + if (reply) { + return conn->send(std::move(reply)); + } else { + return seastar::now(); + } + }); + }); + 
}).safe_then([pgref=std::move(pgref)] { + return seastar::now(); + }, PG::load_obc_ertr::all_same_way([](auto &code) { + logger().error("ClientRequest saw error code {}", code); + return seastar::now(); + })); +} + +bool ClientRequest::is_misdirected(const PG& pg) const +{ + // otherwise take a closer look + if (const int flags = m->get_flags(); + flags & CEPH_OSD_FLAG_BALANCE_READS || + flags & CEPH_OSD_FLAG_LOCALIZE_READS) { + if (!op_info.may_read()) { + // no read found, so it can't be balanced read + return true; + } + if (op_info.may_write() || op_info.may_cache()) { + // write op, but i am not primary + return true; + } + // balanced reads; any replica will do + return pg.is_nonprimary(); + } + // neither balanced nor localize reads + return true; +} + +} diff --git a/src/crimson/osd/osd_operations/client_request.h b/src/crimson/osd/osd_operations/client_request.h new file mode 100644 index 000000000..ea3124a93 --- /dev/null +++ b/src/crimson/osd/osd_operations/client_request.h @@ -0,0 +1,76 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "osd/osd_op_util.h" +#include "crimson/net/Connection.h" +#include "crimson/osd/osd_operation.h" +#include "crimson/common/type_helpers.h" +#include "messages/MOSDOp.h" + +namespace crimson::osd { +class PG; +class OSD; + +class ClientRequest final : public OperationT<ClientRequest> { + OSD &osd; + crimson::net::ConnectionRef conn; + Ref<MOSDOp> m; + OpInfo op_info; + OrderedPipelinePhase::Handle handle; + +public: + class ConnectionPipeline { + OrderedPipelinePhase await_map = { + "ClientRequest::ConnectionPipeline::await_map" + }; + OrderedPipelinePhase get_pg = { + "ClientRequest::ConnectionPipeline::get_pg" + }; + friend class ClientRequest; + }; + class PGPipeline { + OrderedPipelinePhase await_map = { + "ClientRequest::PGPipeline::await_map" + }; + OrderedPipelinePhase wait_for_active = { + "ClientRequest::PGPipeline::wait_for_active" + }; + OrderedPipelinePhase recover_missing = { + "ClientRequest::PGPipeline::recover_missing" + }; + OrderedPipelinePhase get_obc = { + "ClientRequest::PGPipeline::get_obc" + }; + OrderedPipelinePhase process = { + "ClientRequest::PGPipeline::process" + }; + friend class ClientRequest; + }; + + static constexpr OperationTypeCode type = OperationTypeCode::client_request; + + ClientRequest(OSD &osd, crimson::net::ConnectionRef, Ref<MOSDOp> &&m); + + void print(std::ostream &) const final; + void dump_detail(Formatter *f) const final; + +public: + seastar::future<> start(); + +private: + seastar::future<> process_pg_op( + Ref<PG> &pg); + seastar::future<> process_op( + Ref<PG> &pg); + bool is_pg_op() const; + + ConnectionPipeline &cp(); + PGPipeline &pp(PG &pg); + +private: + bool is_misdirected(const PG& pg) const; +}; + +} diff --git a/src/crimson/osd/osd_operations/compound_peering_request.cc b/src/crimson/osd/osd_operations/compound_peering_request.cc new file mode 100644 index 000000000..e55760096 --- /dev/null +++ b/src/crimson/osd/osd_operations/compound_peering_request.cc @@ -0,0 +1,170 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <seastar/core/future.hh> + +#include "osd/PeeringState.h" + +#include "messages/MOSDPGQuery.h" +#include "messages/MOSDPGCreate2.h" + +#include "common/Formatter.h" + +#include "crimson/common/exception.h" +#include "crimson/osd/pg.h" +#include "crimson/osd/osd.h" +#include "crimson/osd/osd_operations/compound_peering_request.h" 
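
The notable piece of this file is compound_state below: every peering sub-event holds a shared reference to it, buffers its outgoing messages into it, and the aggregate is delivered from the destructor once the last reference is dropped. A dependency-free model of that idiom follows; the types and the deliver callback are illustrative stand-ins for the Seastar promise and BufferedRecoveryMessages used in the patch.

#include <functional>
#include <iostream>
#include <memory>
#include <string>
#include <vector>

// stands in for compound_state: the result is handed over when the last
// shared owner goes away, i.e. when every sub-event has finished with it
struct compound_state_model {
  std::vector<std::string> buffered;                      // ~ BufferedRecoveryMessages
  std::function<void(std::vector<std::string>)> deliver;  // ~ promise.set_value()
  ~compound_state_model() {
    if (deliver) {
      deliver(std::move(buffered));
    }
  }
};

int main() {
  auto state = std::make_shared<compound_state_model>();
  state->deliver = [](std::vector<std::string> msgs) {
    std::cout << "all sub-events done, " << msgs.size() << " buffered messages\n";
  };
  {
    // each sub-event (cf. PeeringSubEvent below) keeps its own reference and
    // contributes messages as it completes
    std::vector<std::shared_ptr<compound_state_model>> sub_events(3, state);
    int i = 0;
    for (auto& sub : sub_events) {
      sub->buffered.push_back("msg" + std::to_string(i++));
    }
  }               // sub-event references dropped here
  state.reset();  // last reference gone: the destructor delivers the aggregate
}
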
+ +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_osd); + } +} + +namespace { +using namespace crimson::osd; + +struct compound_state { + seastar::promise<BufferedRecoveryMessages> promise; + // assuming crimson-osd won't need to be compatible with pre-octopus + // releases + BufferedRecoveryMessages ctx{ceph_release_t::octopus}; + compound_state() = default; + ~compound_state() { + promise.set_value(std::move(ctx)); + } +}; +using compound_state_ref = seastar::lw_shared_ptr<compound_state>; + +class PeeringSubEvent : public RemotePeeringEvent { + compound_state_ref state; +public: + template <typename... Args> + PeeringSubEvent(compound_state_ref state, Args &&... args) : + RemotePeeringEvent(std::forward<Args>(args)...), state(state) {} + + seastar::future<> complete_rctx(Ref<crimson::osd::PG> pg) final { + logger().debug("{}: submitting ctx transaction", *this); + state->ctx.accept_buffered_messages(ctx); + state = {}; + if (!pg) { + ceph_assert(ctx.transaction.empty()); + return seastar::now(); + } else { + return osd.get_shard_services().dispatch_context_transaction( + pg->get_collection_ref(), ctx); + } + } +}; + +std::vector<OperationRef> handle_pg_create( + OSD &osd, + crimson::net::ConnectionRef conn, + compound_state_ref state, + Ref<MOSDPGCreate2> m) +{ + std::vector<OperationRef> ret; + for (auto& [pgid, when] : m->pgs) { + const auto &[created, created_stamp] = when; + auto q = m->pg_extra.find(pgid); + ceph_assert(q != m->pg_extra.end()); + auto& [history, pi] = q->second; + logger().debug( + "{}: {} e{} @{} " + "history {} pi {}", + __func__, pgid, created, created_stamp, + history, pi); + if (!pi.empty() && + m->epoch < pi.get_bounds().second) { + logger().error( + "got pg_create on {} epoch {} " + "unmatched past_intervals {} (history {})", + pgid, m->epoch, + pi, history); + } else { + auto op = osd.get_shard_services().start_operation<PeeringSubEvent>( + state, + osd, + conn, + osd.get_shard_services(), + pg_shard_t(), + pgid, + m->epoch, + m->epoch, + NullEvt(), + true, + new PGCreateInfo(pgid, m->epoch, history, pi, true)).first; + ret.push_back(op); + } + } + return ret; +} + +struct SubOpBlocker : BlockerT<SubOpBlocker> { + static constexpr const char * type_name = "CompoundOpBlocker"; + + std::vector<OperationRef> subops; + SubOpBlocker(std::vector<OperationRef> &&subops) : subops(subops) {} + + virtual void dump_detail(Formatter *f) const { + f->open_array_section("dependent_operations"); + { + for (auto &i : subops) { + i->dump_brief(f); + } + } + f->close_section(); + } +}; + +} // namespace + +namespace crimson::osd { + +CompoundPeeringRequest::CompoundPeeringRequest( + OSD &osd, crimson::net::ConnectionRef conn, Ref<Message> m) + : osd(osd), + conn(conn), + m(m) +{} + +void CompoundPeeringRequest::print(std::ostream &lhs) const +{ + lhs << *m; +} + +void CompoundPeeringRequest::dump_detail(Formatter *f) const +{ + f->dump_stream("message") << *m; +} + +seastar::future<> CompoundPeeringRequest::start() +{ + logger().info("{}: starting", *this); + auto state = seastar::make_lw_shared<compound_state>(); + auto blocker = std::make_unique<SubOpBlocker>( + [&] { + assert((m->get_type() == MSG_OSD_PG_CREATE2)); + return handle_pg_create( + osd, + conn, + state, + boost::static_pointer_cast<MOSDPGCreate2>(m)); + }()); + + IRef ref = this; + logger().info("{}: about to fork future", *this); + return crimson::common::handle_system_shutdown( + [this, ref, blocker=std::move(blocker), state]() mutable { + return with_blocking_future( + 
blocker->make_blocking_future(state->promise.get_future()) + ).then([this, blocker=std::move(blocker)](auto &&ctx) { + logger().info("{}: sub events complete", *this); + return osd.get_shard_services().dispatch_context_messages(std::move(ctx)); + }).then([this, ref=std::move(ref)] { + logger().info("{}: complete", *this); + }); + }); +} + +} // namespace crimson::osd diff --git a/src/crimson/osd/osd_operations/compound_peering_request.h b/src/crimson/osd/osd_operations/compound_peering_request.h new file mode 100644 index 000000000..495306d75 --- /dev/null +++ b/src/crimson/osd/osd_operations/compound_peering_request.h @@ -0,0 +1,40 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <iostream> +#include <seastar/core/future.hh> + +#include "msg/MessageRef.h" + +#include "crimson/net/Connection.h" +#include "crimson/osd/osd_operation.h" + +namespace crimson::osd { + +class OSD; +class PG; + +using osd_id_t = int; + +class CompoundPeeringRequest : public OperationT<CompoundPeeringRequest> { +public: + static constexpr OperationTypeCode type = + OperationTypeCode::compound_peering_request; + +private: + OSD &osd; + crimson::net::ConnectionRef conn; + Ref<Message> m; + +public: + CompoundPeeringRequest( + OSD &osd, crimson::net::ConnectionRef conn, Ref<Message> m); + + void print(std::ostream &) const final; + void dump_detail(Formatter *f) const final; + seastar::future<> start(); +}; + +} diff --git a/src/crimson/osd/osd_operations/osdop_params.h b/src/crimson/osd/osd_operations/osdop_params.h new file mode 100644 index 000000000..a0bd9dcbb --- /dev/null +++ b/src/crimson/osd/osd_operations/osdop_params.h @@ -0,0 +1,27 @@ +#pragma once + +#include "messages/MOSDOp.h" +#include "osd/osd_types.h" +#include "crimson/common/type_helpers.h" + +// The fields in this struct are parameters that may be needed in multiple +// level of processing. I inclosed all those parameters in this struct to +// avoid passing each of them as a method parameter. 
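
The struct below is the classic parameter-object pattern: one bundle threaded through the write path instead of a growing list of loose arguments. A minimal, dependency-free illustration with simplified stand-in types (not the real eversion_t or MOSDOp):

#include <cstdint>
#include <iostream>

struct eversion_model { uint64_t epoch = 0, version = 0; };  // stand-in for eversion_t

struct op_params_model {
  eversion_model at_version;
  eversion_model pg_trim_to;
  uint64_t user_at_version = 0;
  bool user_modify = false;
};

// each processing layer takes the one bundle rather than four (and growing)
// separate parameters
void prepare_log_entry(const op_params_model& p) {
  std::cout << "log entry at " << p.at_version.epoch << "'" << p.at_version.version
            << (p.user_modify ? " (user modify)" : "") << "\n";
}

int main() {
  op_params_model p;
  p.at_version = {42, 7};
  p.user_modify = true;
  prepare_log_entry(p);
}
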
+struct osd_op_params_t { + Ref<MOSDOp> req; + eversion_t at_version; + eversion_t pg_trim_to; + eversion_t min_last_complete_ondisk; + eversion_t last_complete; + version_t user_at_version = 0; + bool user_modify = false; + ObjectCleanRegions clean_regions; + + osd_op_params_t() = default; + osd_op_params_t(Ref<MOSDOp>&& req) : req(req) {} + osd_op_params_t(Ref<MOSDOp>&& req, eversion_t at_version, eversion_t pg_trim_to, + eversion_t mlcod, eversion_t lc, version_t user_at_version) : + req(req), at_version(at_version), pg_trim_to(pg_trim_to), + min_last_complete_ondisk(mlcod), last_complete(lc), + user_at_version(user_at_version) {} +}; diff --git a/src/crimson/osd/osd_operations/peering_event.cc b/src/crimson/osd/osd_operations/peering_event.cc new file mode 100644 index 000000000..d3c6ccf81 --- /dev/null +++ b/src/crimson/osd/osd_operations/peering_event.cc @@ -0,0 +1,173 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <seastar/core/future.hh> + +#include "messages/MOSDPGLog.h" + +#include "common/Formatter.h" +#include "crimson/osd/pg.h" +#include "crimson/osd/osd.h" +#include "crimson/osd/osd_operations/peering_event.h" +#include "crimson/osd/osd_connection_priv.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_osd); + } +} + +namespace crimson::osd { + +void PeeringEvent::print(std::ostream &lhs) const +{ + lhs << "PeeringEvent(" + << "from=" << from + << " pgid=" << pgid + << " sent=" << evt.get_epoch_sent() + << " requested=" << evt.get_epoch_requested() + << " evt=" << evt.get_desc() + << ")"; +} + +void PeeringEvent::dump_detail(Formatter *f) const +{ + f->open_object_section("PeeringEvent"); + f->dump_stream("from") << from; + f->dump_stream("pgid") << pgid; + f->dump_int("sent", evt.get_epoch_sent()); + f->dump_int("requested", evt.get_epoch_requested()); + f->dump_string("evt", evt.get_desc()); + f->close_section(); +} + + +PeeringEvent::PGPipeline &PeeringEvent::pp(PG &pg) +{ + return pg.peering_request_pg_pipeline; +} + +seastar::future<> PeeringEvent::start() +{ + + logger().debug("{}: start", *this); + + IRef ref = this; + return [this] { + if (delay) { + return seastar::sleep(std::chrono::milliseconds( + std::lround(delay*1000))); + } else { + return seastar::now(); + } + }().then([this] { + return get_pg(); + }).then([this](Ref<PG> pg) { + if (!pg) { + logger().warn("{}: pg absent, did not create", *this); + on_pg_absent(); + handle.exit(); + return complete_rctx(pg); + } else { + logger().debug("{}: pg present", *this); + return with_blocking_future(handle.enter(pp(*pg).await_map) + ).then([this, pg] { + return with_blocking_future( + pg->osdmap_gate.wait_for_map(evt.get_epoch_sent())); + }).then([this, pg](auto) { + return with_blocking_future(handle.enter(pp(*pg).process)); + }).then([this, pg] { + // TODO: likely we should synchronize also with the pg log-based + // recovery. + return with_blocking_future( + handle.enter(BackfillRecovery::bp(*pg).process)); + }).then([this, pg] { + pg->do_peering_event(evt, ctx); + handle.exit(); + return complete_rctx(pg); + }).then([this, pg] { + return pg->get_need_up_thru() ? 
shard_services.send_alive(pg->get_same_interval_since()) + : seastar::now(); + }); + } + }).then([this] { + return shard_services.send_pg_temp(); + }).then([this, ref=std::move(ref)] { + logger().debug("{}: complete", *this); + }); +} + +void PeeringEvent::on_pg_absent() +{ + logger().debug("{}: pg absent, dropping", *this); +} + +seastar::future<> PeeringEvent::complete_rctx(Ref<PG> pg) +{ + logger().debug("{}: submitting ctx", *this); + return shard_services.dispatch_context( + pg->get_collection_ref(), + std::move(ctx)); +} + +RemotePeeringEvent::ConnectionPipeline &RemotePeeringEvent::cp() +{ + return get_osd_priv(conn.get()).peering_request_conn_pipeline; +} + +void RemotePeeringEvent::on_pg_absent() +{ + if (auto& e = get_event().get_event(); + e.dynamic_type() == MQuery::static_type()) { + const auto map_epoch = + shard_services.get_osdmap_service().get_map()->get_epoch(); + const auto& q = static_cast<const MQuery&>(e); + const pg_info_t empty{spg_t{pgid.pgid, q.query.to}}; + if (q.query.type == q.query.LOG || + q.query.type == q.query.FULLLOG) { + auto m = ceph::make_message<MOSDPGLog>(q.query.from, q.query.to, + map_epoch, empty, + q.query.epoch_sent); + ctx.send_osd_message(q.from.osd, std::move(m)); + } else { + ctx.send_notify(q.from.osd, {q.query.from, q.query.to, + q.query.epoch_sent, + map_epoch, empty, + PastIntervals{}}); + } + } +} + +seastar::future<> RemotePeeringEvent::complete_rctx(Ref<PG> pg) +{ + if (pg) { + return PeeringEvent::complete_rctx(pg); + } else { + return shard_services.dispatch_context_messages(std::move(ctx)); + } +} + +seastar::future<Ref<PG>> RemotePeeringEvent::get_pg() +{ + return with_blocking_future( + handle.enter(cp().await_map) + ).then([this] { + return with_blocking_future( + osd.osdmap_gate.wait_for_map(evt.get_epoch_sent())); + }).then([this](auto epoch) { + logger().debug("{}: got map {}", *this, epoch); + return with_blocking_future(handle.enter(cp().get_pg)); + }).then([this] { + return with_blocking_future( + osd.get_or_create_pg( + pgid, evt.get_epoch_sent(), std::move(evt.create_info))); + }); +} + +seastar::future<Ref<PG>> LocalPeeringEvent::get_pg() { + return seastar::make_ready_future<Ref<PG>>(pg); +} + +LocalPeeringEvent::~LocalPeeringEvent() {} + +} diff --git a/src/crimson/osd/osd_operations/peering_event.h b/src/crimson/osd/osd_operations/peering_event.h new file mode 100644 index 000000000..3a6c0678c --- /dev/null +++ b/src/crimson/osd/osd_operations/peering_event.h @@ -0,0 +1,142 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <iostream> +#include <seastar/core/future.hh> + +#include "crimson/osd/osd_operation.h" +#include "osd/osd_types.h" +#include "osd/PGPeeringEvent.h" +#include "osd/PeeringState.h" + +namespace ceph { + class Formatter; +} + +namespace crimson::osd { + +class OSD; +class ShardServices; +class PG; + +class PeeringEvent : public OperationT<PeeringEvent> { +public: + static constexpr OperationTypeCode type = OperationTypeCode::peering_event; + + class PGPipeline { + OrderedPipelinePhase await_map = { + "PeeringEvent::PGPipeline::await_map" + }; + OrderedPipelinePhase process = { + "PeeringEvent::PGPipeline::process" + }; + friend class PeeringEvent; + friend class PGAdvanceMap; + }; + +protected: + OrderedPipelinePhase::Handle handle; + PGPipeline &pp(PG &pg); + + ShardServices &shard_services; + PeeringCtx ctx; + pg_shard_t from; + spg_t pgid; + float delay = 0; + PGPeeringEvent evt; + + const pg_shard_t get_from() const 
{ + return from; + } + + const spg_t get_pgid() const { + return pgid; + } + + const PGPeeringEvent &get_event() const { + return evt; + } + + virtual void on_pg_absent(); + virtual seastar::future<> complete_rctx(Ref<PG>); + virtual seastar::future<Ref<PG>> get_pg() = 0; + +public: + template <typename... Args> + PeeringEvent( + ShardServices &shard_services, const pg_shard_t &from, const spg_t &pgid, + Args&&... args) : + shard_services(shard_services), + ctx{ceph_release_t::octopus}, + from(from), + pgid(pgid), + evt(std::forward<Args>(args)...) + {} + template <typename... Args> + PeeringEvent( + ShardServices &shard_services, const pg_shard_t &from, const spg_t &pgid, + float delay, Args&&... args) : + shard_services(shard_services), + ctx{ceph_release_t::octopus}, + from(from), + pgid(pgid), + delay(delay), + evt(std::forward<Args>(args)...) + {} + + void print(std::ostream &) const final; + void dump_detail(ceph::Formatter* f) const final; + seastar::future<> start(); +}; + +class RemotePeeringEvent : public PeeringEvent { +protected: + OSD &osd; + crimson::net::ConnectionRef conn; + + void on_pg_absent() final; + seastar::future<> complete_rctx(Ref<PG> pg) override; + seastar::future<Ref<PG>> get_pg() final; + +public: + class ConnectionPipeline { + OrderedPipelinePhase await_map = { + "PeeringRequest::ConnectionPipeline::await_map" + }; + OrderedPipelinePhase get_pg = { + "PeeringRequest::ConnectionPipeline::get_pg" + }; + friend class RemotePeeringEvent; + }; + + template <typename... Args> + RemotePeeringEvent(OSD &osd, crimson::net::ConnectionRef conn, Args&&... args) : + PeeringEvent(std::forward<Args>(args)...), + osd(osd), + conn(conn) + {} + +private: + ConnectionPipeline &cp(); +}; + +class LocalPeeringEvent final : public PeeringEvent { +protected: + seastar::future<Ref<PG>> get_pg() final; + + Ref<PG> pg; + +public: + template <typename... Args> + LocalPeeringEvent(Ref<PG> pg, Args&&... 
args) : + PeeringEvent(std::forward<Args>(args)...), + pg(pg) + {} + + virtual ~LocalPeeringEvent(); +}; + + +} diff --git a/src/crimson/osd/osd_operations/pg_advance_map.cc b/src/crimson/osd/osd_operations/pg_advance_map.cc new file mode 100644 index 000000000..a96479d40 --- /dev/null +++ b/src/crimson/osd/osd_operations/pg_advance_map.cc @@ -0,0 +1,97 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "crimson/osd/osd_operations/pg_advance_map.h" + +#include <boost/smart_ptr/local_shared_ptr.hpp> +#include <seastar/core/future.hh> + +#include "include/types.h" +#include "common/Formatter.h" +#include "crimson/osd/pg.h" +#include "crimson/osd/osd.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_osd); + } +} + +namespace crimson::osd { + +PGAdvanceMap::PGAdvanceMap( + OSD &osd, Ref<PG> pg, epoch_t from, epoch_t to, + PeeringCtx &&rctx, bool do_init) + : osd(osd), pg(pg), from(from), to(to), + rctx(std::move(rctx)), do_init(do_init) {} + +PGAdvanceMap::~PGAdvanceMap() {} + +void PGAdvanceMap::print(std::ostream &lhs) const +{ + lhs << "PGAdvanceMap(" + << "pg=" << pg->get_pgid() + << " from=" << from + << " to=" << to; + if (do_init) { + lhs << " do_init"; + } + lhs << ")"; +} + +void PGAdvanceMap::dump_detail(Formatter *f) const +{ + f->open_object_section("PGAdvanceMap"); + f->dump_stream("pgid") << pg->get_pgid(); + f->dump_int("from", from); + f->dump_int("to", to); + f->dump_bool("do_init", do_init); + f->close_section(); +} + +seastar::future<> PGAdvanceMap::start() +{ + using cached_map_t = boost::local_shared_ptr<const OSDMap>; + + logger().debug("{}: start", *this); + + IRef ref = this; + return with_blocking_future( + handle.enter(pg->peering_request_pg_pipeline.process)) + .then([this] { + if (do_init) { + pg->handle_initialize(rctx); + pg->handle_activate_map(rctx); + } + return seastar::do_for_each( + boost::make_counting_iterator(from + 1), + boost::make_counting_iterator(to + 1), + [this](epoch_t next_epoch) { + return osd.get_map(next_epoch).then( + [this] (cached_map_t&& next_map) { + pg->handle_advance_map(next_map, rctx); + }); + }).then([this] { + pg->handle_activate_map(rctx); + handle.exit(); + if (do_init) { + osd.pg_map.pg_created(pg->get_pgid(), pg); + osd.shard_services.inc_pg_num(); + logger().info("PGAdvanceMap::start new pg {}", *pg); + } + return seastar::when_all_succeed( + pg->get_need_up_thru() \ + ? 
osd.shard_services.send_alive(pg->get_same_interval_since()) + : seastar::now(), + osd.shard_services.dispatch_context( + pg->get_collection_ref(), + std::move(rctx))); + }).then_unpack([this] { + return osd.shard_services.send_pg_temp(); + }); + }).then([this, ref=std::move(ref)] { + logger().debug("{}: complete", *this); + }); +} + +} diff --git a/src/crimson/osd/osd_operations/pg_advance_map.h b/src/crimson/osd/osd_operations/pg_advance_map.h new file mode 100644 index 000000000..1b27037eb --- /dev/null +++ b/src/crimson/osd/osd_operations/pg_advance_map.h @@ -0,0 +1,50 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <iostream> +#include <seastar/core/future.hh> + +#include "crimson/osd/osd_operation.h" +#include "osd/osd_types.h" +#include "crimson/common/type_helpers.h" +#include "osd/PeeringState.h" + +namespace ceph { + class Formatter; +} + +namespace crimson::osd { + +class OSD; +class PG; + +class PGAdvanceMap : public OperationT<PGAdvanceMap> { +public: + static constexpr OperationTypeCode type = OperationTypeCode::pg_advance_map; + +protected: + OrderedPipelinePhase::Handle handle; + + OSD &osd; + Ref<PG> pg; + + epoch_t from; + epoch_t to; + + PeeringCtx rctx; + const bool do_init; + +public: + PGAdvanceMap( + OSD &osd, Ref<PG> pg, epoch_t from, epoch_t to, + PeeringCtx &&rctx, bool do_init); + ~PGAdvanceMap(); + + void print(std::ostream &) const final; + void dump_detail(ceph::Formatter *f) const final; + seastar::future<> start(); +}; + +} diff --git a/src/crimson/osd/osd_operations/recovery_subrequest.cc b/src/crimson/osd/osd_operations/recovery_subrequest.cc new file mode 100644 index 000000000..820c7beab --- /dev/null +++ b/src/crimson/osd/osd_operations/recovery_subrequest.cc @@ -0,0 +1,29 @@ +#include <fmt/format.h> +#include <fmt/ostream.h> + +#include "crimson/osd/osd_operations/recovery_subrequest.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_osd); + } +} + +namespace crimson::osd { + +seastar::future<> RecoverySubRequest::start() { + logger().debug("{}: start", *this); + + IRef opref = this; + return with_blocking_future(osd.osdmap_gate.wait_for_map(m->get_min_epoch())) + .then([this] (epoch_t epoch) { + return with_blocking_future(osd.wait_for_pg(m->get_spg())); + }).then([this, opref=std::move(opref)] (Ref<PG> pgref) { + return seastar::do_with(std::move(pgref), std::move(opref), + [this](auto& pgref, auto& opref) { + return pgref->get_recovery_backend()->handle_recovery_op(m); + }); + }); +} + +} diff --git a/src/crimson/osd/osd_operations/recovery_subrequest.h b/src/crimson/osd/osd_operations/recovery_subrequest.h new file mode 100644 index 000000000..b151e5c1d --- /dev/null +++ b/src/crimson/osd/osd_operations/recovery_subrequest.h @@ -0,0 +1,45 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "osd/osd_op_util.h" +#include "crimson/net/Connection.h" +#include "crimson/osd/osd_operation.h" +#include "crimson/osd/osd.h" +#include "crimson/common/type_helpers.h" +#include "messages/MOSDPGPull.h" +#include "messages/MOSDPGPush.h" +#include "messages/MOSDPGPushReply.h" +#include "messages/MOSDPGRecoveryDelete.h" +#include "messages/MOSDPGRecoveryDeleteReply.h" + +namespace crimson::osd { + +class OSD; +class PG; + +class RecoverySubRequest final : public OperationT<RecoverySubRequest> { +public: + static constexpr OperationTypeCode type = 
OperationTypeCode::background_recovery_sub; + + RecoverySubRequest(OSD &osd, crimson::net::ConnectionRef conn, Ref<MOSDFastDispatchOp>&& m) + : osd(osd), conn(conn), m(m) {} + + void print(std::ostream& out) const final + { + out << *m; + } + + void dump_detail(Formatter *f) const final + { + } + + seastar::future<> start(); +private: + OSD& osd; + crimson::net::ConnectionRef conn; + Ref<MOSDFastDispatchOp> m; +}; + +} diff --git a/src/crimson/osd/osd_operations/replicated_request.cc b/src/crimson/osd/osd_operations/replicated_request.cc new file mode 100644 index 000000000..34487f9e4 --- /dev/null +++ b/src/crimson/osd/osd_operations/replicated_request.cc @@ -0,0 +1,74 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "replicated_request.h" + +#include "common/Formatter.h" +#include "messages/MOSDRepOp.h" + +#include "crimson/osd/osd.h" +#include "crimson/osd/osd_connection_priv.h" +#include "crimson/osd/pg.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_osd); + } +} + +namespace crimson::osd { + +RepRequest::RepRequest(OSD &osd, + crimson::net::ConnectionRef&& conn, + Ref<MOSDRepOp> &&req) + : osd{osd}, + conn{std::move(conn)}, + req{req} +{} + +void RepRequest::print(std::ostream& os) const +{ + os << "RepRequest(" + << "from=" << req->from + << " req=" << *req + << ")"; +} + +void RepRequest::dump_detail(Formatter *f) const +{ + f->open_object_section("RepRequest"); + f->dump_stream("reqid") << req->reqid; + f->dump_stream("pgid") << req->get_spg(); + f->dump_unsigned("map_epoch", req->get_map_epoch()); + f->dump_unsigned("min_epoch", req->get_min_epoch()); + f->dump_stream("oid") << req->poid; + f->dump_stream("from") << req->from; + f->close_section(); +} + +RepRequest::ConnectionPipeline &RepRequest::cp() +{ + return get_osd_priv(conn.get()).replicated_request_conn_pipeline; +} + +RepRequest::PGPipeline &RepRequest::pp(PG &pg) +{ + return pg.replicated_request_pg_pipeline; +} + +seastar::future<> RepRequest::start() +{ + logger().debug("{} start", *this); + IRef ref = this; + return with_blocking_future(handle.enter(cp().await_map)) + .then([this]() { + return with_blocking_future(osd.osdmap_gate.wait_for_map(req->get_min_epoch())); + }).then([this](epoch_t epoch) { + return with_blocking_future(handle.enter(cp().get_pg)); + }).then([this] { + return with_blocking_future(osd.wait_for_pg(req->get_spg())); + }).then([this, ref=std::move(ref)](Ref<PG> pg) { + return pg->handle_rep_op(std::move(req)); + }); +} +} diff --git a/src/crimson/osd/osd_operations/replicated_request.h b/src/crimson/osd/osd_operations/replicated_request.h new file mode 100644 index 000000000..8e9cfc9fe --- /dev/null +++ b/src/crimson/osd/osd_operations/replicated_request.h @@ -0,0 +1,58 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "crimson/net/Connection.h" +#include "crimson/osd/osd_operation.h" +#include "crimson/common/type_helpers.h" + +class MOSDRepOp; + +namespace ceph { + class Formatter; +} + +namespace crimson::osd { + +class OSD; +class PG; + +class RepRequest final : public OperationT<RepRequest> { +public: + class ConnectionPipeline { + OrderedPipelinePhase await_map = { + "RepRequest::ConnectionPipeline::await_map" + }; + OrderedPipelinePhase get_pg = { + "RepRequest::ConnectionPipeline::get_pg" + }; + friend RepRequest; + }; + class PGPipeline { + OrderedPipelinePhase await_map = { + 
"RepRequest::PGPipeline::await_map" + }; + OrderedPipelinePhase process = { + "RepRequest::PGPipeline::process" + }; + friend RepRequest; + }; + static constexpr OperationTypeCode type = OperationTypeCode::replicated_request; + RepRequest(OSD&, crimson::net::ConnectionRef&&, Ref<MOSDRepOp>&&); + + void print(std::ostream &) const final; + void dump_detail(ceph::Formatter* f) const final; + seastar::future<> start(); + +private: + ConnectionPipeline &cp(); + PGPipeline &pp(PG &pg); + + OSD &osd; + crimson::net::ConnectionRef conn; + Ref<MOSDRepOp> req; + OrderedPipelinePhase::Handle handle; +}; + +} diff --git a/src/crimson/osd/osdmap_gate.cc b/src/crimson/osd/osdmap_gate.cc new file mode 100644 index 000000000..90afc32b4 --- /dev/null +++ b/src/crimson/osd/osdmap_gate.cc @@ -0,0 +1,71 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "crimson/common/exception.h" +#include "crimson/osd/osdmap_gate.h" +#include "crimson/osd/shard_services.h" +#include "common/Formatter.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_osd); + } +} + +namespace crimson::osd { + +void OSDMapGate::OSDMapBlocker::dump_detail(Formatter *f) const +{ + f->open_object_section("OSDMapGate"); + f->dump_int("epoch", epoch); + f->close_section(); +} + +blocking_future<epoch_t> OSDMapGate::wait_for_map(epoch_t epoch) +{ + if (__builtin_expect(stopping, false)) { + return make_exception_blocking_future<epoch_t>( + crimson::common::system_shutdown_exception()); + } + if (current >= epoch) { + return make_ready_blocking_future<epoch_t>(current); + } else { + logger().info("evt epoch is {}, i have {}, will wait", epoch, current); + auto &blocker = waiting_peering.emplace( + epoch, make_pair(blocker_type, epoch)).first->second; + auto fut = blocker.promise.get_shared_future(); + if (shard_services) { + return blocker.make_blocking_future( + (*shard_services).get().osdmap_subscribe(current, true).then( + [fut=std::move(fut)]() mutable { + return std::move(fut); + })); + } else { + return blocker.make_blocking_future(std::move(fut)); + } + } +} + +void OSDMapGate::got_map(epoch_t epoch) { + current = epoch; + auto first = waiting_peering.begin(); + auto last = waiting_peering.upper_bound(epoch); + std::for_each(first, last, [epoch](auto& blocked_requests) { + blocked_requests.second.promise.set_value(epoch); + }); + waiting_peering.erase(first, last); +} + +seastar::future<> OSDMapGate::stop() { + logger().info("osdmap::stop"); + stopping = true; + auto first = waiting_peering.begin(); + auto last = waiting_peering.end(); + std::for_each(first, last, [](auto& blocked_requests) { + blocked_requests.second.promise.set_exception( + crimson::common::system_shutdown_exception()); + }); + return seastar::now(); +} + +} diff --git a/src/crimson/osd/osdmap_gate.h b/src/crimson/osd/osdmap_gate.h new file mode 100644 index 000000000..2b73d8959 --- /dev/null +++ b/src/crimson/osd/osdmap_gate.h @@ -0,0 +1,68 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <functional> +#include <map> +#include <optional> + +#include <seastar/core/future.hh> +#include <seastar/core/shared_future.hh> + +#include "include/types.h" +#include "crimson/osd/osd_operation.h" + +namespace ceph { + class Formatter; +} + +namespace crimson::osd { + +class ShardServices; + +class OSDMapGate { + struct OSDMapBlocker : public Blocker { + const char * type_name; + epoch_t epoch; + + 
OSDMapBlocker(std::pair<const char *, epoch_t> args) + : type_name(args.first), epoch(args.second) {} + + OSDMapBlocker(const OSDMapBlocker &) = delete; + OSDMapBlocker(OSDMapBlocker &&) = delete; + OSDMapBlocker &operator=(const OSDMapBlocker &) = delete; + OSDMapBlocker &operator=(OSDMapBlocker &&) = delete; + + seastar::shared_promise<epoch_t> promise; + + void dump_detail(Formatter *f) const final; + private: + const char *get_type_name() const final { + return type_name; + } + }; + + // order the promises in ascending order of the waited osdmap epoch, + // so we can access all the waiters expecting a map whose epoch is less + // than or equal to a given epoch + using waiting_peering_t = std::map<epoch_t, + OSDMapBlocker>; + const char *blocker_type; + waiting_peering_t waiting_peering; + epoch_t current = 0; + std::optional<std::reference_wrapper<ShardServices>> shard_services; + bool stopping = false; +public: + OSDMapGate( + const char *blocker_type, + std::optional<std::reference_wrapper<ShardServices>> shard_services) + : blocker_type(blocker_type), shard_services(shard_services) {} + + // wait for an osdmap whose epoch is greater or equal to given epoch + blocking_future<epoch_t> wait_for_map(epoch_t epoch); + void got_map(epoch_t epoch); + seastar::future<> stop(); +}; + +} diff --git a/src/crimson/osd/osdmap_service.h b/src/crimson/osd/osdmap_service.h new file mode 100644 index 000000000..effd45b79 --- /dev/null +++ b/src/crimson/osd/osdmap_service.h @@ -0,0 +1,20 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <boost/smart_ptr/local_shared_ptr.hpp> + +#include "include/types.h" + +class OSDMap; + +class OSDMapService { +public: + using cached_map_t = boost::local_shared_ptr<const OSDMap>; + virtual ~OSDMapService() = default; + virtual seastar::future<cached_map_t> get_map(epoch_t e) = 0; + /// get the latest map + virtual cached_map_t get_map() const = 0; + virtual epoch_t get_up_epoch() const = 0; +}; diff --git a/src/crimson/osd/pg.cc b/src/crimson/osd/pg.cc new file mode 100644 index 000000000..0f01c1607 --- /dev/null +++ b/src/crimson/osd/pg.cc @@ -0,0 +1,1102 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "pg.h" + +#include <functional> + +#include <boost/range/adaptor/filtered.hpp> +#include <boost/range/adaptor/map.hpp> +#include <boost/range/adaptor/transformed.hpp> +#include <boost/range/algorithm/copy.hpp> +#include <boost/range/algorithm/max_element.hpp> +#include <boost/range/numeric.hpp> +#include <fmt/format.h> +#include <fmt/ostream.h> + +#include "messages/MOSDOp.h" +#include "messages/MOSDOpReply.h" +#include "messages/MOSDRepOp.h" +#include "messages/MOSDRepOpReply.h" + +#include "osd/OSDMap.h" + +#include "os/Transaction.h" + +#include "crimson/common/exception.h" +#include "crimson/net/Connection.h" +#include "crimson/net/Messenger.h" +#include "crimson/os/cyanstore/cyan_store.h" +#include "crimson/os/futurized_collection.h" +#include "crimson/osd/exceptions.h" +#include "crimson/osd/pg_meta.h" +#include "crimson/osd/pg_backend.h" +#include "crimson/osd/ops_executer.h" +#include "crimson/osd/osd_operations/osdop_params.h" +#include "crimson/osd/osd_operations/peering_event.h" +#include "crimson/osd/pg_recovery.h" +#include "crimson/osd/replicated_recovery_backend.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_osd); + } +} + +namespace std::chrono { +std::ostream& 
operator<<(std::ostream& out, const signedspan& d) +{ + auto s = std::chrono::duration_cast<std::chrono::seconds>(d).count(); + auto ns = std::abs((d % 1s).count()); + fmt::print(out, "{}{}s", s, ns ? fmt::format(".{:0>9}", ns) : ""); + return out; +} +} + +namespace crimson::osd { + +using crimson::common::local_conf; + +class RecoverablePredicate : public IsPGRecoverablePredicate { +public: + bool operator()(const set<pg_shard_t> &have) const override { + return !have.empty(); + } +}; + +class ReadablePredicate: public IsPGReadablePredicate { + pg_shard_t whoami; +public: + explicit ReadablePredicate(pg_shard_t whoami) : whoami(whoami) {} + bool operator()(const set<pg_shard_t> &have) const override { + return have.count(whoami); + } +}; + +PG::PG( + spg_t pgid, + pg_shard_t pg_shard, + crimson::os::CollectionRef coll_ref, + pg_pool_t&& pool, + std::string&& name, + cached_map_t osdmap, + ShardServices &shard_services, + ec_profile_t profile) + : pgid{pgid}, + pg_whoami{pg_shard}, + coll_ref{coll_ref}, + pgmeta_oid{pgid.make_pgmeta_oid()}, + osdmap_gate("PG::osdmap_gate", std::nullopt), + shard_services{shard_services}, + osdmap{osdmap}, + backend( + PGBackend::create( + pgid.pgid, + pg_shard, + pool, + coll_ref, + shard_services, + profile)), + recovery_backend( + std::make_unique<ReplicatedRecoveryBackend>( + *this, shard_services, coll_ref, backend.get())), + recovery_handler( + std::make_unique<PGRecovery>(this)), + peering_state( + shard_services.get_cct(), + pg_shard, + pgid, + PGPool( + osdmap, + pgid.pool(), + pool, + name), + osdmap, + this, + this), + wait_for_active_blocker(this) +{ + peering_state.set_backend_predicates( + new ReadablePredicate(pg_whoami), + new RecoverablePredicate()); + osdmap_gate.got_map(osdmap->get_epoch()); +} + +PG::~PG() {} + +bool PG::try_flush_or_schedule_async() { + (void)shard_services.get_store().do_transaction( + coll_ref, + ObjectStore::Transaction()).then( + [this, epoch=get_osdmap_epoch()]() { + return shard_services.start_operation<LocalPeeringEvent>( + this, + shard_services, + pg_whoami, + pgid, + epoch, + epoch, + PeeringState::IntervalFlush()); + }); + return false; +} + +void PG::queue_check_readable(epoch_t last_peering_reset, ceph::timespan delay) +{ + // handle the peering event in the background + check_readable_timer.cancel(); + check_readable_timer.set_callback([last_peering_reset, this] { + (void) shard_services.start_operation<LocalPeeringEvent>( + this, + shard_services, + pg_whoami, + pgid, + last_peering_reset, + last_peering_reset, + PeeringState::CheckReadable{}); + }); + check_readable_timer.arm( + std::chrono::duration_cast<seastar::lowres_clock::duration>(delay)); +} + +void PG::recheck_readable() +{ + bool changed = false; + const auto mnow = shard_services.get_mnow(); + if (peering_state.state_test(PG_STATE_WAIT)) { + auto prior_readable_until_ub = peering_state.get_prior_readable_until_ub(); + if (mnow < prior_readable_until_ub) { + logger().info("{} will wait (mnow {} < prior_readable_until_ub {})", + __func__, mnow, prior_readable_until_ub); + } else { + logger().info("{} no longer wait (mnow {} >= prior_readable_until_ub {})", + __func__, mnow, prior_readable_until_ub); + peering_state.state_clear(PG_STATE_WAIT); + peering_state.clear_prior_readable_until_ub(); + changed = true; + } + } + if (peering_state.state_test(PG_STATE_LAGGY)) { + auto readable_until = peering_state.get_readable_until(); + if (readable_until == readable_until.zero()) { + logger().info("{} still laggy (mnow {}, readable_until zero)", + 
__func__, mnow); + } else if (mnow >= readable_until) { + logger().info("{} still laggy (mnow {} >= readable_until {})", + __func__, mnow, readable_until); + } else { + logger().info("{} no longer laggy (mnow {} < readable_until {})", + __func__, mnow, readable_until); + peering_state.state_clear(PG_STATE_LAGGY); + changed = true; + } + } + if (changed) { + publish_stats_to_osd(); + if (!peering_state.state_test(PG_STATE_WAIT) && + !peering_state.state_test(PG_STATE_LAGGY)) { + // TODO: requeue ops waiting for readable + } + } +} + +unsigned PG::get_target_pg_log_entries() const +{ + const unsigned num_pgs = shard_services.get_pg_num(); + const unsigned target = + local_conf().get_val<uint64_t>("osd_target_pg_log_entries_per_osd"); + const unsigned min_pg_log_entries = + local_conf().get_val<uint64_t>("osd_min_pg_log_entries"); + if (num_pgs > 0 && target > 0) { + // target an even spread of our budgeted log entries across all + // PGs. note that while we only get to control the entry count + // for primary PGs, we'll normally be responsible for a mix of + // primary and replica PGs (for the same pool(s) even), so this + // will work out. + const unsigned max_pg_log_entries = + local_conf().get_val<uint64_t>("osd_max_pg_log_entries"); + return std::clamp(target / num_pgs, + min_pg_log_entries, + max_pg_log_entries); + } else { + // fall back to a per-pg value. + return min_pg_log_entries; + } +} + +void PG::on_activate(interval_set<snapid_t>) +{ + projected_last_update = peering_state.get_info().last_update; +} + +void PG::on_activate_complete() +{ + wait_for_active_blocker.on_active(); + + if (peering_state.needs_recovery()) { + logger().info("{}: requesting recovery", + __func__); + (void) shard_services.start_operation<LocalPeeringEvent>( + this, + shard_services, + pg_whoami, + pgid, + get_osdmap_epoch(), + get_osdmap_epoch(), + PeeringState::DoRecovery{}); + } else if (peering_state.needs_backfill()) { + logger().info("{}: requesting backfill", + __func__); + (void) shard_services.start_operation<LocalPeeringEvent>( + this, + shard_services, + pg_whoami, + pgid, + get_osdmap_epoch(), + get_osdmap_epoch(), + PeeringState::RequestBackfill{}); + } else { + logger().debug("{}: no need to recover or backfill, AllReplicasRecovered", + " for pg: {}", __func__, pgid); + (void) shard_services.start_operation<LocalPeeringEvent>( + this, + shard_services, + pg_whoami, + pgid, + get_osdmap_epoch(), + get_osdmap_epoch(), + PeeringState::AllReplicasRecovered{}); + } + backend->on_activate_complete(); +} + +void PG::prepare_write(pg_info_t &info, + pg_info_t &last_written_info, + PastIntervals &past_intervals, + PGLog &pglog, + bool dirty_info, + bool dirty_big_info, + bool need_write_epoch, + ceph::os::Transaction &t) +{ + std::map<string,bufferlist> km; + std::string key_to_remove; + if (dirty_big_info || dirty_info) { + int ret = prepare_info_keymap( + shard_services.get_cct(), + &km, + &key_to_remove, + get_osdmap_epoch(), + info, + last_written_info, + past_intervals, + dirty_big_info, + need_write_epoch, + true, + nullptr, + this); + ceph_assert(ret == 0); + } + pglog.write_log_and_missing( + t, &km, coll_ref->get_cid(), pgmeta_oid, + peering_state.get_pool().info.require_rollback()); + if (!km.empty()) { + t.omap_setkeys(coll_ref->get_cid(), pgmeta_oid, km); + } + if (!key_to_remove.empty()) { + t.omap_rmkey(coll_ref->get_cid(), pgmeta_oid, key_to_remove); + } +} + +std::pair<ghobject_t, bool> +PG::do_delete_work(ceph::os::Transaction &t, ghobject_t _next) +{ + // TODO + 
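+  // (Worked example for get_target_pg_log_entries() above: if, e.g.,
+  // osd_target_pg_log_entries_per_osd is 300000 and this OSD currently hosts
+  // 100 PGs, each primary PG targets 300000 / 100 = 3000 log entries, clamped
+  // to the [osd_min_pg_log_entries, osd_max_pg_log_entries] range.)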
shard_services.dec_pg_num(); + return {_next, false}; +} + +void PG::scrub_requested(scrub_level_t scrub_level, scrub_type_t scrub_type) +{ + // TODO: should update the stats upon finishing the scrub + peering_state.update_stats([scrub_level, this](auto& history, auto& stats) { + const utime_t now = ceph_clock_now(); + history.last_scrub = peering_state.get_info().last_update; + history.last_scrub_stamp = now; + history.last_clean_scrub_stamp = now; + if (scrub_level == scrub_level_t::deep) { + history.last_deep_scrub = history.last_scrub; + history.last_deep_scrub_stamp = now; + } + // yes, please publish the stats + return true; + }); +} + +void PG::log_state_enter(const char *state) { + logger().info("Entering state: {}", state); +} + +void PG::log_state_exit( + const char *state_name, utime_t enter_time, + uint64_t events, utime_t event_dur) { + logger().info( + "Exiting state: {}, entered at {}, {} spent on {} events", + state_name, + enter_time, + event_dur, + events); +} + +ceph::signedspan PG::get_mnow() +{ + return shard_services.get_mnow(); +} + +HeartbeatStampsRef PG::get_hb_stamps(int peer) +{ + return shard_services.get_hb_stamps(peer); +} + +void PG::schedule_renew_lease(epoch_t last_peering_reset, ceph::timespan delay) +{ + // handle the peering event in the background + renew_lease_timer.cancel(); + renew_lease_timer.set_callback([last_peering_reset, this] { + (void) shard_services.start_operation<LocalPeeringEvent>( + this, + shard_services, + pg_whoami, + pgid, + last_peering_reset, + last_peering_reset, + RenewLease{}); + }); + renew_lease_timer.arm( + std::chrono::duration_cast<seastar::lowres_clock::duration>(delay)); +} + + +void PG::init( + int role, + const vector<int>& newup, int new_up_primary, + const vector<int>& newacting, int new_acting_primary, + const pg_history_t& history, + const PastIntervals& pi, + bool backfill, + ObjectStore::Transaction &t) +{ + peering_state.init( + role, newup, new_up_primary, newacting, + new_acting_primary, history, pi, backfill, t); +} + +seastar::future<> PG::read_state(crimson::os::FuturizedStore* store) +{ + if (__builtin_expect(stopping, false)) { + return seastar::make_exception_future<>( + crimson::common::system_shutdown_exception()); + } + + return seastar::do_with(PGMeta(store, pgid), [] (auto& pg_meta) { + return pg_meta.load(); + }).then([this, store](auto&& ret) { + auto [pg_info, past_intervals] = std::move(ret); + return peering_state.init_from_disk_state( + std::move(pg_info), + std::move(past_intervals), + [this, store] (PGLog &pglog) { + return pglog.read_log_and_missing_crimson( + *store, + coll_ref, + peering_state.get_info(), + pgmeta_oid); + }); + }).then([this]() { + int primary, up_primary; + vector<int> acting, up; + peering_state.get_osdmap()->pg_to_up_acting_osds( + pgid.pgid, &up, &up_primary, &acting, &primary); + peering_state.init_primary_up_acting( + up, + acting, + up_primary, + primary); + int rr = OSDMap::calc_pg_role(pg_whoami, acting); + peering_state.set_role(rr); + + epoch_t epoch = get_osdmap_epoch(); + (void) shard_services.start_operation<LocalPeeringEvent>( + this, + shard_services, + pg_whoami, + pgid, + epoch, + epoch, + PeeringState::Initialize()); + + return seastar::now(); + }); +} + +void PG::do_peering_event( + const boost::statechart::event_base &evt, + PeeringCtx &rctx) +{ + peering_state.handle_event( + evt, + &rctx); + peering_state.write_if_dirty(rctx.transaction); +} + +void PG::do_peering_event( + PGPeeringEvent& evt, PeeringCtx &rctx) +{ + if 
(!peering_state.pg_has_reset_since(evt.get_epoch_requested())) { + logger().debug("{} handling {} for pg: {}", __func__, evt.get_desc(), pgid); + do_peering_event(evt.get_event(), rctx); + } else { + logger().debug("{} ignoring {} -- pg has reset", __func__, evt.get_desc()); + } +} + +void PG::handle_advance_map( + cached_map_t next_map, PeeringCtx &rctx) +{ + vector<int> newup, newacting; + int up_primary, acting_primary; + next_map->pg_to_up_acting_osds( + pgid.pgid, + &newup, &up_primary, + &newacting, &acting_primary); + peering_state.advance_map( + next_map, + peering_state.get_osdmap(), + newup, + up_primary, + newacting, + acting_primary, + rctx); + osdmap_gate.got_map(next_map->get_epoch()); +} + +void PG::handle_activate_map(PeeringCtx &rctx) +{ + peering_state.activate_map(rctx); +} + +void PG::handle_initialize(PeeringCtx &rctx) +{ + PeeringState::Initialize evt; + peering_state.handle_event(evt, &rctx); +} + + +void PG::print(ostream& out) const +{ + out << peering_state << " "; +} + +void PG::dump_primary(Formatter* f) +{ + peering_state.dump_peering_state(f); + + f->open_array_section("recovery_state"); + PeeringState::QueryState q(f); + peering_state.handle_event(q, 0); + f->close_section(); + + // TODO: snap_trimq + // TODO: scrubber state + // TODO: agent state +} + +std::ostream& operator<<(std::ostream& os, const PG& pg) +{ + os << " pg_epoch " << pg.get_osdmap_epoch() << " "; + pg.print(os); + return os; +} + +void PG::WaitForActiveBlocker::dump_detail(Formatter *f) const +{ + f->dump_stream("pgid") << pg->pgid; +} + +void PG::WaitForActiveBlocker::on_active() +{ + p.set_value(); + p = {}; +} + +blocking_future<> PG::WaitForActiveBlocker::wait() +{ + if (pg->peering_state.is_active()) { + return make_blocking_future(seastar::now()); + } else { + return make_blocking_future(p.get_shared_future()); + } +} + +seastar::future<> PG::WaitForActiveBlocker::stop() +{ + p.set_exception(crimson::common::system_shutdown_exception()); + return seastar::now(); +} + +seastar::future<> PG::submit_transaction(const OpInfo& op_info, + const std::vector<OSDOp>& ops, + ObjectContextRef&& obc, + ceph::os::Transaction&& txn, + const osd_op_params_t& osd_op_p) +{ + if (__builtin_expect(stopping, false)) { + return seastar::make_exception_future<>( + crimson::common::system_shutdown_exception()); + } + + epoch_t map_epoch = get_osdmap_epoch(); + + if (__builtin_expect(osd_op_p.at_version.epoch != map_epoch, false)) { + throw crimson::common::actingset_changed(is_primary()); + } + + std::vector<pg_log_entry_t> log_entries; + log_entries.emplace_back(obc->obs.exists ? + pg_log_entry_t::MODIFY : pg_log_entry_t::DELETE, + obc->obs.oi.soid, osd_op_p.at_version, obc->obs.oi.version, + osd_op_p.user_modify ? osd_op_p.at_version.version : 0, + osd_op_p.req->get_reqid(), osd_op_p.req->get_mtime(), + op_info.allows_returnvec() && !ops.empty() ? 
ops.back().rval.code : 0); + // TODO: refactor the submit_transaction + if (op_info.allows_returnvec()) { + // also the per-op values are recorded in the pg log + log_entries.back().set_op_returns(ops); + logger().debug("{} op_returns: {}", + __func__, log_entries.back().op_returns); + } + log_entries.back().clean_regions = std::move(osd_op_p.clean_regions); + peering_state.pre_submit_op(obc->obs.oi.soid, log_entries, osd_op_p.at_version); + peering_state.append_log_with_trim_to_updated(std::move(log_entries), osd_op_p.at_version, + txn, true, false); + + return backend->mutate_object(peering_state.get_acting_recovery_backfill(), + std::move(obc), + std::move(txn), + std::move(osd_op_p), + peering_state.get_last_peering_reset(), + map_epoch, + std::move(log_entries)).then( + [this, last_complete=peering_state.get_info().last_complete, + at_version=osd_op_p.at_version](auto acked) { + for (const auto& peer : acked) { + peering_state.update_peer_last_complete_ondisk( + peer.shard, peer.last_complete_ondisk); + } + peering_state.complete_write(at_version, last_complete); + return seastar::now(); + }); +} + +osd_op_params_t&& PG::fill_op_params_bump_pg_version( + osd_op_params_t&& osd_op_p, + Ref<MOSDOp> m, + const bool user_modify) +{ + osd_op_p.req = std::move(m); + osd_op_p.at_version = next_version(); + osd_op_p.pg_trim_to = get_pg_trim_to(); + osd_op_p.min_last_complete_ondisk = get_min_last_complete_ondisk(); + osd_op_p.last_complete = get_info().last_complete; + if (user_modify) { + osd_op_p.user_at_version = osd_op_p.at_version.version; + } + return std::move(osd_op_p); +} + +seastar::future<Ref<MOSDOpReply>> PG::handle_failed_op( + const std::error_code& e, + ObjectContextRef obc, + const OpsExecuter& ox, + const MOSDOp& m) const +{ + // Oops, an operation had failed. do_osd_ops() altogether with + // OpsExecuter already dropped the ObjectStore::Transaction if + // there was any. However, this is not enough to completely + // rollback as we gave OpsExecuter the very single copy of `obc` + // we maintain and we did it for both reading and writing. + // Now all modifications must be reverted. + // + // Let's just reload from the store. Evicting from the shared + // LRU would be tricky as next MOSDOp (the one at `get_obc` + // phase) could actually already finished the lookup. Fortunately, + // this is supposed to live on cold paths, so performance is not + // a concern -- simplicity wins. + // + // The conditional's purpose is to efficiently handle hot errors + // which may appear as a result of e.g. CEPH_OSD_OP_CMPXATTR or + // CEPH_OSD_OP_OMAP_CMP. These are read-like ops and clients + // typically append them before any write. If OpsExecuter hasn't + // seen any modifying operation, `obc` is supposed to be kept + // unchanged. + assert(e.value() > 0); + const bool need_reload_obc = ox.has_seen_write(); + logger().debug( + "{}: {} - object {} got error code {}, {}; need_reload_obc {}", + __func__, + m, + obc->obs.oi.soid, + e.value(), + e.message(), + need_reload_obc); + return (need_reload_obc ? 
reload_obc(*obc) + : load_obc_ertr::now() + ).safe_then([&e, &m, obc = std::move(obc), this] { + auto reply = make_message<MOSDOpReply>( + &m, -e.value(), get_osdmap_epoch(), 0, false); + reply->set_enoent_reply_versions( + peering_state.get_info().last_update, + peering_state.get_info().last_user_version); + return seastar::make_ready_future<Ref<MOSDOpReply>>(std::move(reply)); + }, load_obc_ertr::assert_all{ "can't live with object state messed up" }); +} + +seastar::future<Ref<MOSDOpReply>> PG::do_osd_ops( + Ref<MOSDOp> m, + ObjectContextRef obc, + const OpInfo &op_info) +{ + if (__builtin_expect(stopping, false)) { + throw crimson::common::system_shutdown_exception(); + } + + using osd_op_errorator = OpsExecuter::osd_op_errorator; + const auto oid = m->get_snapid() == CEPH_SNAPDIR ? m->get_hobj().get_head() + : m->get_hobj(); + auto ox = std::make_unique<OpsExecuter>( + obc, op_info, get_pool().info, get_backend(), *m); + return crimson::do_for_each( + m->ops, [obc, m, ox = ox.get()](OSDOp& osd_op) { + logger().debug( + "do_osd_ops: {} - object {} - handling op {}", + *m, + obc->obs.oi.soid, + ceph_osd_op_name(osd_op.op.op)); + return ox->execute_op(osd_op); + }).safe_then([this, obc, m, ox = ox.get(), &op_info] { + logger().debug( + "do_osd_ops: {} - object {} all operations successful", + *m, + obc->obs.oi.soid); + return std::move(*ox).flush_changes( + [m] (auto&& obc) -> osd_op_errorator::future<> { + logger().debug( + "do_osd_ops: {} - object {} txn is empty, bypassing mutate", + *m, + obc->obs.oi.soid); + return osd_op_errorator::now(); + }, + [this, m, &op_info] (auto&& txn, + auto&& obc, + auto&& osd_op_p, + bool user_modify) -> osd_op_errorator::future<> { + logger().debug( + "do_osd_ops: {} - object {} submitting txn", + *m, + obc->obs.oi.soid); + auto filled_osd_op_p = fill_op_params_bump_pg_version( + std::move(osd_op_p), + std::move(m), + user_modify); + return submit_transaction( + op_info, + filled_osd_op_p.req->ops, + std::move(obc), + std::move(txn), + std::move(filled_osd_op_p)); + }); + }).safe_then([this, + m, + obc, + rvec = op_info.allows_returnvec()] { + // TODO: should stop at the first op which returns a negative retval, + // cmpext uses it for returning the index of first unmatched byte + int result = m->ops.empty() ? 0 : m->ops.back().rval.code; + if (result > 0 && !rvec) { + result = 0; + } + auto reply = make_message<MOSDOpReply>(m.get(), + result, + get_osdmap_epoch(), + 0, + false); + reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK); + logger().debug( + "do_osd_ops: {} - object {} sending reply", + *m, + obc->obs.oi.soid); + return seastar::make_ready_future<Ref<MOSDOpReply>>(std::move(reply)); + }, osd_op_errorator::all_same_way([ox = ox.get(), + m, + obc, + this] (const std::error_code& e) { + return handle_failed_op(e, std::move(obc), *ox, *m); + })).handle_exception_type([ox_deleter = std::move(ox), + m, + obc, + this] (const crimson::osd::error& e) { + // we need this handler because throwing path which aren't errorated yet. 
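+    // That is: any op handler that still reports failure by throwing a
+    // crimson::osd::error subclass, instead of returning through
+    // osd_op_errorator, lands in this handle_exception_type() continuation;
+    // both the errorated and the throwing paths converge on handle_failed_op().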
+ logger().debug("encountered the legacy error handling path!"); + return handle_failed_op(e.code(), std::move(obc), *ox_deleter, *m); + }); +} + +seastar::future<Ref<MOSDOpReply>> PG::do_pg_ops(Ref<MOSDOp> m) +{ + if (__builtin_expect(stopping, false)) { + throw crimson::common::system_shutdown_exception(); + } + + auto ox = std::make_unique<PgOpsExecuter>(std::as_const(*this), + std::as_const(*m)); + return seastar::do_for_each(m->ops, [ox = ox.get()](OSDOp& osd_op) { + logger().debug("will be handling pg op {}", ceph_osd_op_name(osd_op.op.op)); + return ox->execute_op(osd_op); + }).then([m, this, ox = std::move(ox)] { + auto reply = make_message<MOSDOpReply>(m.get(), 0, get_osdmap_epoch(), + CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK, + false); + return seastar::make_ready_future<Ref<MOSDOpReply>>(std::move(reply)); + }).handle_exception_type([=](const crimson::osd::error& e) { + auto reply = make_message<MOSDOpReply>( + m.get(), -e.code().value(), get_osdmap_epoch(), 0, false); + reply->set_enoent_reply_versions(peering_state.get_info().last_update, + peering_state.get_info().last_user_version); + return seastar::make_ready_future<Ref<MOSDOpReply>>(std::move(reply)); + }); +} + +hobject_t PG::get_oid(const MOSDOp &m) +{ + return (m.get_snapid() == CEPH_SNAPDIR ? + m.get_hobj().get_head() : + m.get_hobj()); +} + +RWState::State PG::get_lock_type(const OpInfo &op_info) +{ + + if (op_info.rwordered() && op_info.may_read()) { + return RWState::RWEXCL; + } else if (op_info.rwordered()) { + return RWState::RWWRITE; + } else { + ceph_assert(op_info.may_read()); + return RWState::RWREAD; + } +} + +std::optional<hobject_t> PG::resolve_oid( + const SnapSet &ss, + const hobject_t &oid) +{ + if (oid.snap > ss.seq) { + return oid.get_head(); + } else { + // which clone would it be? + auto clone = std::upper_bound( + begin(ss.clones), end(ss.clones), + oid.snap); + if (clone == end(ss.clones)) { + // Doesn't exist, > last clone, < ss.seq + return std::nullopt; + } + auto citer = ss.clone_snaps.find(*clone); + // TODO: how do we want to handle this kind of logic error? 
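+    // Worked example for the lookup above: with ss.seq = 8,
+    // ss.clones = [2, 5, 8] and oid.snap = 4, upper_bound() picks clone 5,
+    // the oldest clone that may still contain the requested snap; its
+    // clone_snaps entry then decides whether that clone is returned or the
+    // object is reported as nonexistent. With oid.snap = 7 and
+    // ss.clones = [2, 5], upper_bound() hits end() and nullopt is returned.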
+ ceph_assert(citer != ss.clone_snaps.end()); + + if (std::find( + citer->second.begin(), + citer->second.end(), + *clone) == citer->second.end()) { + return std::nullopt; + } else { + auto soid = oid; + soid.snap = *clone; + return std::optional<hobject_t>(soid); + } + } +} + +template<RWState::State State> +PG::load_obc_ertr::future<> +PG::with_head_obc(hobject_t oid, with_obc_func_t&& func) +{ + assert(oid.is_head()); + auto [obc, existed] = shard_services.obc_registry.get_cached_obc(oid); + return obc->with_lock<State>( + [oid=std::move(oid), existed=existed, obc=std::move(obc), + func=std::move(func), this] { + auto loaded = load_obc_ertr::make_ready_future<ObjectContextRef>(obc); + if (existed) { + logger().debug("with_head_obc: found {} in cache", oid); + } else { + logger().debug("with_head_obc: cache miss on {}", oid); + loaded = obc->with_promoted_lock<State>([this, obc] { + return load_head_obc(obc); + }); + } + return loaded.safe_then([func=std::move(func)](auto obc) { + return func(std::move(obc)); + }); + }); +} + +template<RWState::State State> +PG::load_obc_ertr::future<> +PG::with_clone_obc(hobject_t oid, with_obc_func_t&& func) +{ + assert(!oid.is_head()); + return with_head_obc<RWState::RWREAD>(oid.get_head(), + [oid, func=std::move(func), this](auto head) -> load_obc_ertr::future<> { + auto coid = resolve_oid(head->get_ro_ss(), oid); + if (!coid) { + // TODO: return crimson::ct_error::enoent::make(); + logger().error("with_clone_obc: {} clone not found", coid); + return load_obc_ertr::make_ready_future<>(); + } + auto [clone, existed] = shard_services.obc_registry.get_cached_obc(*coid); + return clone->template with_lock<State>( + [coid=*coid, existed=existed, + head=std::move(head), clone=std::move(clone), + func=std::move(func), this]() -> load_obc_ertr::future<> { + auto loaded = load_obc_ertr::make_ready_future<ObjectContextRef>(clone); + if (existed) { + logger().debug("with_clone_obc: found {} in cache", coid); + } else { + logger().debug("with_clone_obc: cache miss on {}", coid); + loaded = clone->template with_promoted_lock<State>( + [coid, clone, head, this] { + return backend->load_metadata(coid).safe_then( + [coid, clone=std::move(clone), head=std::move(head)](auto md) mutable { + clone->set_clone_state(std::move(md->os), std::move(head)); + return clone; + }); + }); + } + return loaded.safe_then([func=std::move(func)](auto clone) { + return func(std::move(clone)); + }); + }); + }); +} + +// explicitly instantiate the used instantiations +template PG::load_obc_ertr::future<> +PG::with_head_obc<RWState::RWNONE>(hobject_t, with_obc_func_t&&); + +PG::load_obc_ertr::future<crimson::osd::ObjectContextRef> +PG::load_head_obc(ObjectContextRef obc) +{ + hobject_t oid = obc->get_oid(); + return backend->load_metadata(oid).safe_then([obc=std::move(obc)](auto md) + -> load_obc_ertr::future<crimson::osd::ObjectContextRef> { + const hobject_t& oid = md->os.oi.soid; + logger().debug( + "load_head_obc: loaded obs {} for {}", md->os.oi, oid); + if (!md->ss) { + logger().error( + "load_head_obc: oid {} missing snapset", oid); + return crimson::ct_error::object_corrupted::make(); + } + obc->set_head_state(std::move(md->os), std::move(*(md->ss))); + logger().debug( + "load_head_obc: returning obc {} for {}", + obc->obs.oi, obc->obs.oi.soid); + return load_obc_ertr::make_ready_future< + crimson::osd::ObjectContextRef>(obc); + }); +} + +PG::load_obc_ertr::future<> +PG::reload_obc(crimson::osd::ObjectContext& obc) const +{ + assert(obc.is_head()); + return 
backend->load_metadata(obc.get_oid()).safe_then([&obc](auto md) + -> load_obc_ertr::future<> { + logger().debug( + "{}: reloaded obs {} for {}", + __func__, + md->os.oi, + obc.get_oid()); + if (!md->ss) { + logger().error( + "{}: oid {} missing snapset", + __func__, + obc.get_oid()); + return crimson::ct_error::object_corrupted::make(); + } + obc.set_head_state(std::move(md->os), std::move(*(md->ss))); + return load_obc_ertr::now(); + }); +} + +PG::load_obc_ertr::future<> +PG::with_locked_obc(Ref<MOSDOp> &m, const OpInfo &op_info, + Operation *op, PG::with_obc_func_t &&f) +{ + if (__builtin_expect(stopping, false)) { + throw crimson::common::system_shutdown_exception(); + } + const hobject_t oid = get_oid(*m); + switch (get_lock_type(op_info)) { + case RWState::RWREAD: + if (oid.is_head()) { + return with_head_obc<RWState::RWREAD>(oid, std::move(f)); + } else { + return with_clone_obc<RWState::RWREAD>(oid, std::move(f)); + } + case RWState::RWWRITE: + if (oid.is_head()) { + return with_head_obc<RWState::RWWRITE>(oid, std::move(f)); + } else { + return with_clone_obc<RWState::RWWRITE>(oid, std::move(f)); + } + case RWState::RWEXCL: + if (oid.is_head()) { + return with_head_obc<RWState::RWWRITE>(oid, std::move(f)); + } else { + return with_clone_obc<RWState::RWWRITE>(oid, std::move(f)); + } + default: + ceph_abort(); + }; +} + +seastar::future<> PG::handle_rep_op(Ref<MOSDRepOp> req) +{ + if (__builtin_expect(stopping, false)) { + return seastar::make_exception_future<>( + crimson::common::system_shutdown_exception()); + } + + if (can_discard_replica_op(*req)) { + return seastar::now(); + } + + ceph::os::Transaction txn; + auto encoded_txn = req->get_data().cbegin(); + decode(txn, encoded_txn); + auto p = req->logbl.cbegin(); + std::vector<pg_log_entry_t> log_entries; + decode(log_entries, p); + peering_state.append_log(std::move(log_entries), req->pg_trim_to, + req->version, req->min_last_complete_ondisk, txn, !txn.empty(), false); + return shard_services.get_store().do_transaction(coll_ref, std::move(txn)) + .then([req, lcod=peering_state.get_info().last_complete, this] { + peering_state.update_last_complete_ondisk(lcod); + const auto map_epoch = get_osdmap_epoch(); + auto reply = make_message<MOSDRepOpReply>( + req.get(), pg_whoami, 0, + map_epoch, req->get_min_epoch(), CEPH_OSD_FLAG_ONDISK); + reply->set_last_complete_ondisk(lcod); + return shard_services.send_to_osd(req->from.osd, reply, map_epoch); + }); +} + +void PG::handle_rep_op_reply(crimson::net::ConnectionRef conn, + const MOSDRepOpReply& m) +{ + if (!can_discard_replica_op(m)) { + backend->got_rep_op_reply(m); + } +} + +template <typename MsgType> +bool PG::can_discard_replica_op(const MsgType& m) const +{ + // if a repop is replied after a replica goes down in a new osdmap, and + // before the pg advances to this new osdmap, the repop replies before this + // repop can be discarded by that replica OSD, because the primary resets the + // connection to it when handling the new osdmap marking it down, and also + // resets the messenger sesssion when the replica reconnects. to avoid the + // out-of-order replies, the messages from that replica should be discarded. + const auto osdmap = peering_state.get_osdmap(); + const int from_osd = m.get_source().num(); + if (osdmap->is_down(from_osd)) { + return true; + } + // Mostly, this overlaps with the old_peering_msg + // condition. An important exception is pushes + // sent by replicas not in the acting set, since + // if such a replica goes down it does not cause + // a new interval. 
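+  // For example: if the current map records that the sending OSD went down at
+  // epoch 105 while the message in hand was sent from an interval computed at
+  // map_epoch 100, the check below (get_down_at >= map_epoch) treats the
+  // message as stale and discards it.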
+ if (osdmap->get_down_at(from_osd) >= m.map_epoch) { + return true; + } + // same pg? + // if pg changes *at all*, we reset and repeer! + if (epoch_t lpr = peering_state.get_last_peering_reset(); + lpr > m.map_epoch) { + logger().debug("{}: pg changed {} after {}, dropping", + __func__, get_info().history, m.map_epoch); + return true; + } + return false; +} + +seastar::future<> PG::stop() +{ + logger().info("PG {} {}", pgid, __func__); + stopping = true; + return osdmap_gate.stop().then([this] { + return wait_for_active_blocker.stop(); + }).then([this] { + return recovery_handler->stop(); + }).then([this] { + return recovery_backend->stop(); + }).then([this] { + return backend->stop(); + }); +} + +void PG::on_change(ceph::os::Transaction &t) { + recovery_backend->on_peering_interval_change(t); + backend->on_actingset_changed({ is_primary() }); +} + +bool PG::can_discard_op(const MOSDOp& m) const { + return __builtin_expect(m.get_map_epoch() + < peering_state.get_info().history.same_primary_since, false); +} + +bool PG::is_degraded_or_backfilling_object(const hobject_t& soid) const { + /* The conditions below may clear (on_local_recover, before we queue + * the transaction) before we actually requeue the degraded waiters + * in on_global_recover after the transaction completes. + */ + if (peering_state.get_pg_log().get_missing().get_items().count(soid)) + return true; + ceph_assert(!get_acting_recovery_backfill().empty()); + for (auto& peer : get_acting_recovery_backfill()) { + if (peer == get_primary()) continue; + auto peer_missing_entry = peering_state.get_peer_missing().find(peer); + // If an object is missing on an async_recovery_target, return false. + // This will not block the op and the object is async recovered later. + if (peer_missing_entry != peering_state.get_peer_missing().end() && + peer_missing_entry->second.get_items().count(soid)) { + return true; + } + // Object is degraded if after last_backfill AND + // we are backfilling it + if (is_backfill_target(peer) && + peering_state.get_peer_info(peer).last_backfill <= soid && + recovery_handler->backfill_state->get_last_backfill_started() >= soid && + recovery_backend->is_recovering(soid)) { + return true; + } + } + return false; +} + +} diff --git a/src/crimson/osd/pg.h b/src/crimson/osd/pg.h new file mode 100644 index 000000000..34676ee7a --- /dev/null +++ b/src/crimson/osd/pg.h @@ -0,0 +1,704 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <memory> +#include <optional> +#include <boost/intrusive_ptr.hpp> +#include <boost/smart_ptr/intrusive_ref_counter.hpp> +#include <boost/smart_ptr/local_shared_ptr.hpp> +#include <seastar/core/future.hh> +#include <seastar/core/shared_future.hh> +#include <seastar/core/sleep.hh> + +#include "common/dout.h" +#include "crimson/net/Fwd.h" +#include "messages/MOSDRepOpReply.h" +#include "messages/MOSDOpReply.h" +#include "os/Transaction.h" +#include "osd/osd_types.h" +#include "crimson/osd/object_context.h" +#include "osd/PeeringState.h" + +#include "crimson/common/type_helpers.h" +#include "crimson/os/futurized_collection.h" +#include "crimson/osd/backfill_state.h" +#include "crimson/osd/osd_operations/client_request.h" +#include "crimson/osd/osd_operations/peering_event.h" +#include "crimson/osd/osd_operations/replicated_request.h" +#include "crimson/osd/osd_operations/background_recovery.h" +#include "crimson/osd/shard_services.h" +#include "crimson/osd/osdmap_gate.h" +#include "crimson/osd/pg_recovery.h" 
+#include "crimson/osd/pg_recovery_listener.h" +#include "crimson/osd/recovery_backend.h" + +class MQuery; +class OSDMap; +class PGBackend; +class PGPeeringEvent; +class osd_op_params_t; + +namespace recovery { + class Context; +} + +namespace crimson::net { + class Messenger; +} + +namespace crimson::os { + class FuturizedStore; +} + +namespace crimson::osd { +class ClientRequest; +class OpsExecuter; + +class PG : public boost::intrusive_ref_counter< + PG, + boost::thread_unsafe_counter>, + public PGRecoveryListener, + PeeringState::PeeringListener, + DoutPrefixProvider +{ + using ec_profile_t = std::map<std::string,std::string>; + using cached_map_t = boost::local_shared_ptr<const OSDMap>; + + ClientRequest::PGPipeline client_request_pg_pipeline; + PeeringEvent::PGPipeline peering_request_pg_pipeline; + RepRequest::PGPipeline replicated_request_pg_pipeline; + + spg_t pgid; + pg_shard_t pg_whoami; + crimson::os::CollectionRef coll_ref; + ghobject_t pgmeta_oid; + + seastar::timer<seastar::lowres_clock> check_readable_timer; + seastar::timer<seastar::lowres_clock> renew_lease_timer; + +public: + PG(spg_t pgid, + pg_shard_t pg_shard, + crimson::os::CollectionRef coll_ref, + pg_pool_t&& pool, + std::string&& name, + cached_map_t osdmap, + ShardServices &shard_services, + ec_profile_t profile); + + ~PG(); + + const pg_shard_t& get_pg_whoami() const final { + return pg_whoami; + } + + const spg_t& get_pgid() const final { + return pgid; + } + + PGBackend& get_backend() { + return *backend; + } + const PGBackend& get_backend() const { + return *backend; + } + // EpochSource + epoch_t get_osdmap_epoch() const final { + return peering_state.get_osdmap_epoch(); + } + + eversion_t get_pg_trim_to() const { + return peering_state.get_pg_trim_to(); + } + + eversion_t get_min_last_complete_ondisk() const { + return peering_state.get_min_last_complete_ondisk(); + } + + const pg_info_t& get_info() const final { + return peering_state.get_info(); + } + + // DoutPrefixProvider + std::ostream& gen_prefix(std::ostream& out) const final { + return out << *this; + } + crimson::common::CephContext *get_cct() const final { + return shard_services.get_cct(); + } + unsigned get_subsys() const final { + return ceph_subsys_osd; + } + + crimson::os::CollectionRef get_collection_ref() { + return coll_ref; + } + + // PeeringListener + void prepare_write( + pg_info_t &info, + pg_info_t &last_written_info, + PastIntervals &past_intervals, + PGLog &pglog, + bool dirty_info, + bool dirty_big_info, + bool need_write_epoch, + ceph::os::Transaction &t) final; + + void on_info_history_change() final { + // Not needed yet -- mainly for scrub scheduling + } + + void scrub_requested(scrub_level_t scrub_level, scrub_type_t scrub_type) final; + + uint64_t get_snap_trimq_size() const final { + return 0; + } + + void send_cluster_message( + int osd, MessageRef m, + epoch_t epoch, bool share_map_update=false) final { + (void)shard_services.send_to_osd(osd, m, epoch); + } + + void send_pg_created(pg_t pgid) final { + (void)shard_services.send_pg_created(pgid); + } + + bool try_flush_or_schedule_async() final; + + void start_flush_on_transaction( + ceph::os::Transaction &t) final { + t.register_on_commit( + new LambdaContext([this](int r){ + peering_state.complete_flush(); + })); + } + + void on_flushed() final { + // will be needed for unblocking IO operations/peering + } + + template <typename T> + void start_peering_event_operation(T &&evt, float delay = 0) { + (void) shard_services.start_operation<LocalPeeringEvent>( + this, + 
shard_services, + pg_whoami, + pgid, + delay, + std::forward<T>(evt)); + } + + void schedule_event_after( + PGPeeringEventRef event, + float delay) final { + start_peering_event_operation(std::move(*event), delay); + } + std::vector<pg_shard_t> get_replica_recovery_order() const final { + return peering_state.get_replica_recovery_order(); + } + void request_local_background_io_reservation( + unsigned priority, + PGPeeringEventURef on_grant, + PGPeeringEventURef on_preempt) final { + shard_services.local_reserver.request_reservation( + pgid, + on_grant ? make_lambda_context([this, on_grant=std::move(on_grant)] (int) { + start_peering_event_operation(std::move(*on_grant)); + }) : nullptr, + priority, + on_preempt ? make_lambda_context( + [this, on_preempt=std::move(on_preempt)] (int) { + start_peering_event_operation(std::move(*on_preempt)); + }) : nullptr); + } + + void update_local_background_io_priority( + unsigned priority) final { + shard_services.local_reserver.update_priority( + pgid, + priority); + } + + void cancel_local_background_io_reservation() final { + shard_services.local_reserver.cancel_reservation( + pgid); + } + + void request_remote_recovery_reservation( + unsigned priority, + PGPeeringEventURef on_grant, + PGPeeringEventURef on_preempt) final { + shard_services.remote_reserver.request_reservation( + pgid, + on_grant ? make_lambda_context([this, on_grant=std::move(on_grant)] (int) { + start_peering_event_operation(std::move(*on_grant)); + }) : nullptr, + priority, + on_preempt ? make_lambda_context( + [this, on_preempt=std::move(on_preempt)] (int) { + start_peering_event_operation(std::move(*on_preempt)); + }) : nullptr); + } + + void cancel_remote_recovery_reservation() final { + shard_services.remote_reserver.cancel_reservation( + pgid); + } + + void schedule_event_on_commit( + ceph::os::Transaction &t, + PGPeeringEventRef on_commit) final { + t.register_on_commit( + make_lambda_context( + [this, on_commit=std::move(on_commit)](int) { + start_peering_event_operation(std::move(*on_commit)); + })); + } + + void update_heartbeat_peers(set<int> peers) final { + // Not needed yet + } + void set_probe_targets(const set<pg_shard_t> &probe_set) final { + // Not needed yet + } + void clear_probe_targets() final { + // Not needed yet + } + void queue_want_pg_temp(const std::vector<int> &wanted) final { + shard_services.queue_want_pg_temp(pgid.pgid, wanted); + } + void clear_want_pg_temp() final { + shard_services.remove_want_pg_temp(pgid.pgid); + } + void publish_stats_to_osd() final { + if (!is_primary()) + return; + + (void) peering_state.prepare_stats_for_publish( + false, + pg_stat_t(), + object_stat_collection_t()); + } + void clear_publish_stats() final { + // Not needed yet + } + void check_recovery_sources(const OSDMapRef& newmap) final { + // Not needed yet + } + void check_blocklisted_watchers() final { + // Not needed yet + } + void clear_primary_state() final { + // Not needed yet + } + + void queue_check_readable(epoch_t last_peering_reset, + ceph::timespan delay) final; + void recheck_readable() final; + + unsigned get_target_pg_log_entries() const final; + + void on_pool_change() final { + // Not needed yet + } + void on_role_change() final { + // Not needed yet + } + void on_change(ceph::os::Transaction &t) final; + void on_activate(interval_set<snapid_t> to_trim) final; + void on_activate_complete() final; + void on_new_interval() final { + // Not needed yet + } + Context *on_clean() final { + // Not needed yet (will be needed for IO unblocking) + return nullptr; 
+ } + void on_activate_committed() final { + // Not needed yet (will be needed for IO unblocking) + } + void on_active_exit() final { + // Not needed yet + } + + void on_removal(ceph::os::Transaction &t) final { + // TODO + } + std::pair<ghobject_t, bool> + do_delete_work(ceph::os::Transaction &t, ghobject_t _next) final; + + // merge/split not ready + void clear_ready_to_merge() final {} + void set_not_ready_to_merge_target(pg_t pgid, pg_t src) final {} + void set_not_ready_to_merge_source(pg_t pgid) final {} + void set_ready_to_merge_target(eversion_t lu, epoch_t les, epoch_t lec) final {} + void set_ready_to_merge_source(eversion_t lu) final {} + + void on_active_actmap() final { + // Not needed yet + } + void on_active_advmap(const OSDMapRef &osdmap) final { + // Not needed yet + } + epoch_t oldest_stored_osdmap() final { + // TODO + return 0; + } + + void on_backfill_reserved() final { + recovery_handler->on_backfill_reserved(); + } + void on_backfill_canceled() final { + ceph_assert(0 == "Not implemented"); + } + + void on_recovery_reserved() final { + recovery_handler->start_pglogbased_recovery(); + } + + + bool try_reserve_recovery_space( + int64_t primary_num_bytes, int64_t local_num_bytes) final { + // TODO + return true; + } + void unreserve_recovery_space() final {} + + struct PGLogEntryHandler : public PGLog::LogEntryHandler { + PG *pg; + ceph::os::Transaction *t; + PGLogEntryHandler(PG *pg, ceph::os::Transaction *t) : pg(pg), t(t) {} + + // LogEntryHandler + void remove(const hobject_t &hoid) override { + // TODO + } + void try_stash(const hobject_t &hoid, version_t v) override { + // TODO + } + void rollback(const pg_log_entry_t &entry) override { + // TODO + } + void rollforward(const pg_log_entry_t &entry) override { + // TODO + } + void trim(const pg_log_entry_t &entry) override { + // TODO + } + }; + PGLog::LogEntryHandlerRef get_log_handler( + ceph::os::Transaction &t) final { + return std::make_unique<PG::PGLogEntryHandler>(this, &t); + } + + void rebuild_missing_set_with_deletes(PGLog &pglog) final { + ceph_assert(0 == "Impossible for crimson"); + } + + PerfCounters &get_peering_perf() final { + return shard_services.get_recoverystate_perf_logger(); + } + PerfCounters &get_perf_logger() final { + return shard_services.get_perf_logger(); + } + + void log_state_enter(const char *state) final; + void log_state_exit( + const char *state_name, utime_t enter_time, + uint64_t events, utime_t event_dur) final; + + void dump_recovery_info(Formatter *f) const final { + } + + OstreamTemp get_clog_info() final { + // not needed yet: replace with not a stub (needs to be wired up to monc) + return OstreamTemp(CLOG_INFO, nullptr); + } + OstreamTemp get_clog_debug() final { + // not needed yet: replace with not a stub (needs to be wired up to monc) + return OstreamTemp(CLOG_DEBUG, nullptr); + } + OstreamTemp get_clog_error() final { + // not needed yet: replace with not a stub (needs to be wired up to monc) + return OstreamTemp(CLOG_ERROR, nullptr); + } + + ceph::signedspan get_mnow() final; + HeartbeatStampsRef get_hb_stamps(int peer) final; + void schedule_renew_lease(epoch_t plr, ceph::timespan delay) final; + + + // Utility + bool is_primary() const final { + return peering_state.is_primary(); + } + bool is_nonprimary() const { + return peering_state.is_nonprimary(); + } + bool is_peered() const final { + return peering_state.is_peered(); + } + bool is_recovering() const final { + return peering_state.is_recovering(); + } + bool is_backfilling() const final { + return 
peering_state.is_backfilling(); + } + pg_stat_t get_stats() { + auto stats = peering_state.prepare_stats_for_publish( + false, + pg_stat_t(), + object_stat_collection_t()); + ceph_assert(stats); + return *stats; + } + bool get_need_up_thru() const { + return peering_state.get_need_up_thru(); + } + epoch_t get_same_interval_since() const { + return get_info().history.same_interval_since; + } + + const auto& get_pool() const { + return peering_state.get_pool(); + } + pg_shard_t get_primary() const { + return peering_state.get_primary(); + } + + /// initialize created PG + void init( + int role, + const std::vector<int>& up, + int up_primary, + const std::vector<int>& acting, + int acting_primary, + const pg_history_t& history, + const PastIntervals& pim, + bool backfill, + ceph::os::Transaction &t); + + seastar::future<> read_state(crimson::os::FuturizedStore* store); + + void do_peering_event( + PGPeeringEvent& evt, PeeringCtx &rctx); + + void handle_advance_map(cached_map_t next_map, PeeringCtx &rctx); + void handle_activate_map(PeeringCtx &rctx); + void handle_initialize(PeeringCtx &rctx); + + static hobject_t get_oid(const MOSDOp &m); + static RWState::State get_lock_type(const OpInfo &op_info); + static std::optional<hobject_t> resolve_oid( + const SnapSet &snapset, + const hobject_t &oid); + + using load_obc_ertr = crimson::errorator< + crimson::ct_error::object_corrupted>; + + load_obc_ertr::future<crimson::osd::ObjectContextRef> + load_head_obc(ObjectContextRef obc); + + load_obc_ertr::future<> + reload_obc(crimson::osd::ObjectContext& obc) const; + +public: + using with_obc_func_t = + std::function<load_obc_ertr::future<> (ObjectContextRef)>; + + template<RWState::State State> + load_obc_ertr::future<> with_head_obc(hobject_t oid, with_obc_func_t&& func); + + load_obc_ertr::future<> with_locked_obc( + Ref<MOSDOp> &m, + const OpInfo &op_info, + Operation *op, + with_obc_func_t&& f); + + seastar::future<> handle_rep_op(Ref<MOSDRepOp> m); + void handle_rep_op_reply(crimson::net::ConnectionRef conn, + const MOSDRepOpReply& m); + + void print(std::ostream& os) const; + void dump_primary(Formatter*); + +private: + template<RWState::State State> + load_obc_ertr::future<> with_clone_obc(hobject_t oid, with_obc_func_t&& func); + + load_obc_ertr::future<ObjectContextRef> get_locked_obc( + Operation *op, + const hobject_t &oid, + RWState::State type); + + void do_peering_event( + const boost::statechart::event_base &evt, + PeeringCtx &rctx); + osd_op_params_t&& fill_op_params_bump_pg_version( + osd_op_params_t&& osd_op_p, + Ref<MOSDOp> m, + const bool user_modify); + seastar::future<Ref<MOSDOpReply>> handle_failed_op( + const std::error_code& e, + ObjectContextRef obc, + const OpsExecuter& ox, + const MOSDOp& m) const; + seastar::future<Ref<MOSDOpReply>> do_osd_ops( + Ref<MOSDOp> m, + ObjectContextRef obc, + const OpInfo &op_info); + seastar::future<Ref<MOSDOpReply>> do_pg_ops(Ref<MOSDOp> m); + seastar::future<> submit_transaction(const OpInfo& op_info, + const std::vector<OSDOp>& ops, + ObjectContextRef&& obc, + ceph::os::Transaction&& txn, + const osd_op_params_t& oop); + +private: + OSDMapGate osdmap_gate; + ShardServices &shard_services; + + cached_map_t osdmap; + +public: + cached_map_t get_osdmap() { return osdmap; } + eversion_t next_version() { + return eversion_t(get_osdmap_epoch(), + ++projected_last_update.version); + } + ShardServices& get_shard_services() final { + return shard_services; + } + seastar::future<> stop(); + +private: + std::unique_ptr<PGBackend> backend; + 
std::unique_ptr<RecoveryBackend> recovery_backend; + std::unique_ptr<PGRecovery> recovery_handler; + + PeeringState peering_state; + eversion_t projected_last_update; +public: + RecoveryBackend* get_recovery_backend() final { + return recovery_backend.get(); + } + PGRecovery* get_recovery_handler() final { + return recovery_handler.get(); + } + PeeringState& get_peering_state() final { + return peering_state; + } + bool has_reset_since(epoch_t epoch) const final { + return peering_state.pg_has_reset_since(epoch); + } + + const pg_missing_tracker_t& get_local_missing() const { + return peering_state.get_pg_log().get_missing(); + } + epoch_t get_last_peering_reset() const final { + return peering_state.get_last_peering_reset(); + } + const set<pg_shard_t> &get_acting_recovery_backfill() const { + return peering_state.get_acting_recovery_backfill(); + } + bool is_backfill_target(pg_shard_t osd) const { + return peering_state.is_backfill_target(osd); + } + void begin_peer_recover(pg_shard_t peer, const hobject_t oid) { + peering_state.begin_peer_recover(peer, oid); + } + uint64_t min_peer_features() const { + return peering_state.get_min_peer_features(); + } + const map<hobject_t, set<pg_shard_t>>& + get_missing_loc_shards() const { + return peering_state.get_missing_loc().get_missing_locs(); + } + const map<pg_shard_t, pg_missing_t> &get_shard_missing() const { + return peering_state.get_peer_missing(); + } + const pg_missing_const_i* get_shard_missing(pg_shard_t shard) const { + if (shard == pg_whoami) + return &get_local_missing(); + else { + auto it = peering_state.get_peer_missing().find(shard); + if (it == peering_state.get_peer_missing().end()) + return nullptr; + else + return &it->second; + } + } + int get_recovery_op_priority() const { + int64_t pri = 0; + get_pool().info.opts.get(pool_opts_t::RECOVERY_OP_PRIORITY, &pri); + return pri > 0 ? pri : crimson::common::local_conf()->osd_recovery_op_priority; + } + seastar::future<> mark_unfound_lost(int) { + // TODO: see PrimaryLogPG::mark_all_unfound_lost() + return seastar::now(); + } + +private: + // instead of seastar::gate, we use a boolean flag to indicate + // whether the system is shutting down, as we don't need to track + // continuations here. 
+ bool stopping = false; + + class WaitForActiveBlocker : public BlockerT<WaitForActiveBlocker> { + PG *pg; + + const spg_t pgid; + seastar::shared_promise<> p; + + protected: + void dump_detail(Formatter *f) const; + + public: + static constexpr const char *type_name = "WaitForActiveBlocker"; + + WaitForActiveBlocker(PG *pg) : pg(pg) {} + void on_active(); + blocking_future<> wait(); + seastar::future<> stop(); + } wait_for_active_blocker; + + friend std::ostream& operator<<(std::ostream&, const PG& pg); + friend class ClientRequest; + friend class PGAdvanceMap; + friend class PeeringEvent; + friend class RepRequest; + friend class BackfillRecovery; + friend struct PGFacade; +private: + seastar::future<bool> find_unfound() { + return seastar::make_ready_future<bool>(true); + } + + template <typename MsgType> + bool can_discard_replica_op(const MsgType& m) const; + bool can_discard_op(const MOSDOp& m) const; + bool is_missing_object(const hobject_t& soid) const { + return peering_state.get_pg_log().get_missing().get_items().count(soid); + } + bool is_unreadable_object(const hobject_t &oid, + eversion_t* v = 0) const final { + return is_missing_object(oid) || + !peering_state.get_missing_loc().readable_with_acting( + oid, get_actingset(), v); + } + bool is_degraded_or_backfilling_object(const hobject_t& soid) const; + const set<pg_shard_t> &get_actingset() const { + return peering_state.get_actingset(); + } + +private: + BackfillRecovery::BackfillRecoveryPipeline backfill_pipeline; +}; + +std::ostream& operator<<(std::ostream&, const PG& pg); + +} diff --git a/src/crimson/osd/pg_backend.cc b/src/crimson/osd/pg_backend.cc new file mode 100644 index 000000000..38dbdbf41 --- /dev/null +++ b/src/crimson/osd/pg_backend.cc @@ -0,0 +1,1171 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "pg_backend.h" + +#include <optional> +#include <boost/range/adaptor/filtered.hpp> +#include <boost/range/adaptor/transformed.hpp> +#include <boost/range/algorithm/copy.hpp> +#include <fmt/format.h> +#include <fmt/ostream.h> +#include <seastar/core/print.hh> + +#include "messages/MOSDOp.h" +#include "os/Transaction.h" +#include "common/Checksummer.h" +#include "common/Clock.h" + +#include "crimson/common/exception.h" +#include "crimson/os/futurized_collection.h" +#include "crimson/os/futurized_store.h" +#include "crimson/osd/osd_operation.h" +#include "replicated_backend.h" +#include "replicated_recovery_backend.h" +#include "ec_backend.h" +#include "exceptions.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_osd); + } +} + +using crimson::common::local_conf; + +std::unique_ptr<PGBackend> +PGBackend::create(pg_t pgid, + const pg_shard_t pg_shard, + const pg_pool_t& pool, + crimson::os::CollectionRef coll, + crimson::osd::ShardServices& shard_services, + const ec_profile_t& ec_profile) +{ + switch (pool.type) { + case pg_pool_t::TYPE_REPLICATED: + return std::make_unique<ReplicatedBackend>(pgid, pg_shard, + coll, shard_services); + case pg_pool_t::TYPE_ERASURE: + return std::make_unique<ECBackend>(pg_shard.shard, coll, shard_services, + std::move(ec_profile), + pool.stripe_width); + default: + throw runtime_error(seastar::format("unsupported pool type '{}'", + pool.type)); + } +} + +PGBackend::PGBackend(shard_id_t shard, + CollectionRef coll, + crimson::os::FuturizedStore* store) + : shard{shard}, + coll{coll}, + store{store} +{} + +PGBackend::load_metadata_ertr::future<PGBackend::loaded_object_md_t::ref> 
+PGBackend::load_metadata(const hobject_t& oid) +{ + if (__builtin_expect(stopping, false)) { + throw crimson::common::system_shutdown_exception(); + } + + return store->get_attrs( + coll, + ghobject_t{oid, ghobject_t::NO_GEN, shard}).safe_then( + [oid](auto &&attrs) -> load_metadata_ertr::future<loaded_object_md_t::ref>{ + loaded_object_md_t::ref ret(new loaded_object_md_t()); + if (auto oiiter = attrs.find(OI_ATTR); oiiter != attrs.end()) { + bufferlist bl; + bl.push_back(std::move(oiiter->second)); + ret->os = ObjectState( + object_info_t(bl), + true); + } else { + logger().error( + "load_metadata: object {} present but missing object info", + oid); + return crimson::ct_error::object_corrupted::make(); + } + + if (oid.is_head()) { + if (auto ssiter = attrs.find(SS_ATTR); ssiter != attrs.end()) { + bufferlist bl; + bl.push_back(std::move(ssiter->second)); + ret->ss = SnapSet(bl); + } else { + /* TODO: add support for writing out snapsets + logger().error( + "load_metadata: object {} present but missing snapset", + oid); + //return crimson::ct_error::object_corrupted::make(); + */ + ret->ss = SnapSet(); + } + } + + return load_metadata_ertr::make_ready_future<loaded_object_md_t::ref>( + std::move(ret)); + }, crimson::ct_error::enoent::handle([oid] { + logger().debug( + "load_metadata: object {} doesn't exist, returning empty metadata", + oid); + return load_metadata_ertr::make_ready_future<loaded_object_md_t::ref>( + new loaded_object_md_t{ + ObjectState( + object_info_t(oid), + false), + oid.is_head() ? std::optional<SnapSet>(SnapSet()) : std::nullopt + }); + })); +} + +seastar::future<crimson::osd::acked_peers_t> +PGBackend::mutate_object( + std::set<pg_shard_t> pg_shards, + crimson::osd::ObjectContextRef &&obc, + ceph::os::Transaction&& txn, + const osd_op_params_t& osd_op_p, + epoch_t min_epoch, + epoch_t map_epoch, + std::vector<pg_log_entry_t>&& log_entries) +{ + logger().trace("mutate_object: num_ops={}", txn.get_num_ops()); + if (obc->obs.exists) { +#if 0 + obc->obs.oi.version = ctx->at_version; + obc->obs.oi.prior_version = ctx->obs->oi.version; +#endif + + auto& m = osd_op_p.req; + obc->obs.oi.prior_version = obc->obs.oi.version; + obc->obs.oi.version = osd_op_p.at_version; + if (osd_op_p.user_at_version > obc->obs.oi.user_version) + obc->obs.oi.user_version = osd_op_p.user_at_version; + obc->obs.oi.last_reqid = m->get_reqid(); + obc->obs.oi.mtime = m->get_mtime(); + obc->obs.oi.local_mtime = ceph_clock_now(); + + // object_info_t + { + ceph::bufferlist osv; + encode(obc->obs.oi, osv, CEPH_FEATURES_ALL); + // TODO: get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr)); + txn.setattr(coll->get_cid(), ghobject_t{obc->obs.oi.soid}, OI_ATTR, osv); + } + } else { + // reset cached ObjectState without enforcing eviction + obc->obs.oi = object_info_t(obc->obs.oi.soid); + } + return _submit_transaction( + std::move(pg_shards), obc->obs.oi.soid, std::move(txn), + std::move(osd_op_p), min_epoch, map_epoch, std::move(log_entries)); +} + +static inline bool _read_verify_data( + const object_info_t& oi, + const ceph::bufferlist& data) +{ + if (oi.is_data_digest() && oi.size == data.length()) { + // whole object? can we verify the checksum? 
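+    // note: oi.data_digest is only comparable against a crc32c(-1) of the
+    // complete object contents, hence the full-length check above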
+ if (auto crc = data.crc32c(-1); crc != oi.data_digest) { + logger().error("full-object read crc {} != expected {} on {}", + crc, oi.data_digest, oi.soid); + // todo: mark soid missing, perform recovery, and retry + return false; + } + } + return true; +} + +PGBackend::read_errorator::future<> +PGBackend::read(const ObjectState& os, OSDOp& osd_op) +{ + const auto& oi = os.oi; + const ceph_osd_op& op = osd_op.op; + const uint64_t offset = op.extent.offset; + uint64_t length = op.extent.length; + logger().trace("read: {} {}~{}", oi.soid, offset, length); + + if (!os.exists || os.oi.is_whiteout()) { + logger().debug("{}: {} DNE", __func__, os.oi.soid); + return crimson::ct_error::enoent::make(); + } + // are we beyond truncate_size? + size_t size = oi.size; + if ((op.extent.truncate_seq > oi.truncate_seq) && + (op.extent.truncate_size < offset + length) && + (op.extent.truncate_size < size)) { + size = op.extent.truncate_size; + } + if (offset >= size) { + // read size was trimmed to zero and it is expected to do nothing, + return read_errorator::now(); + } + if (!length) { + // read the whole object if length is 0 + length = size; + } + return _read(oi.soid, offset, length, op.flags).safe_then( + [&oi, &osd_op](auto&& bl) -> read_errorator::future<> { + if (!_read_verify_data(oi, bl)) { + return crimson::ct_error::object_corrupted::make(); + } + logger().debug("read: data length: {}", bl.length()); + osd_op.rval = bl.length(); + osd_op.outdata = std::move(bl); + return read_errorator::now(); + }); +} + +PGBackend::read_errorator::future<> +PGBackend::sparse_read(const ObjectState& os, OSDOp& osd_op) +{ + const auto& op = osd_op.op; + logger().trace("sparse_read: {} {}~{}", + os.oi.soid, op.extent.offset, op.extent.length); + return store->fiemap(coll, ghobject_t{os.oi.soid}, + op.extent.offset, + op.extent.length).then([&os, &osd_op, this](auto&& m) { + return seastar::do_with(interval_set<uint64_t>{std::move(m)}, + [&os, &osd_op, this](auto&& extents) { + return store->readv(coll, ghobject_t{os.oi.soid}, + extents, osd_op.op.flags).safe_then( + [&os, &osd_op, &extents](auto&& bl) -> read_errorator::future<> { + if (_read_verify_data(os.oi, bl)) { + osd_op.op.extent.length = bl.length(); + // re-encode since it might be modified + ceph::encode(extents, osd_op.outdata); + encode_destructively(bl, osd_op.outdata); + logger().trace("sparse_read got {} bytes from object {}", + osd_op.op.extent.length, os.oi.soid); + return read_errorator::make_ready_future<>(); + } else { + // TODO: repair it if crc mismatches + return crimson::ct_error::object_corrupted::make(); + } + }); + }); + }); +} + +namespace { + + template<class CSum> + PGBackend::checksum_errorator::future<> + do_checksum(ceph::bufferlist& init_value_bl, + size_t chunk_size, + const ceph::bufferlist& buf, + ceph::bufferlist& result) + { + typename CSum::init_value_t init_value; + auto init_value_p = init_value_bl.cbegin(); + try { + decode(init_value, init_value_p); + // chop off the consumed part + init_value_bl.splice(0, init_value_p.get_off()); + } catch (const ceph::buffer::end_of_buffer&) { + logger().warn("{}: init value not provided", __func__); + return crimson::ct_error::invarg::make(); + } + const uint32_t chunk_count = buf.length() / chunk_size; + ceph::bufferptr csum_data{ + ceph::buffer::create(sizeof(typename CSum::value_t) * chunk_count)}; + Checksummer::calculate<CSum>( + init_value, chunk_size, 0, buf.length(), buf, &csum_data); + encode(chunk_count, result); + result.append(std::move(csum_data)); + return 
PGBackend::checksum_errorator::now(); + } +} + +PGBackend::checksum_errorator::future<> +PGBackend::checksum(const ObjectState& os, OSDOp& osd_op) +{ + // sanity tests and normalize the argments + auto& checksum = osd_op.op.checksum; + if (checksum.offset == 0 && checksum.length == 0) { + // zeroed offset+length implies checksum whole object + checksum.length = os.oi.size; + } else if (checksum.offset >= os.oi.size) { + // read size was trimmed to zero, do nothing, + // see PGBackend::read() + return checksum_errorator::now(); + } + if (checksum.chunk_size > 0) { + if (checksum.length == 0) { + logger().warn("{}: length required when chunk size provided", __func__); + return crimson::ct_error::invarg::make(); + } + if (checksum.length % checksum.chunk_size != 0) { + logger().warn("{}: length not aligned to chunk size", __func__); + return crimson::ct_error::invarg::make(); + } + } else { + checksum.chunk_size = checksum.length; + } + if (checksum.length == 0) { + uint32_t count = 0; + encode(count, osd_op.outdata); + return checksum_errorator::now(); + } + + // read the chunk to be checksum'ed + return _read(os.oi.soid, checksum.offset, checksum.length, osd_op.op.flags).safe_then( + [&osd_op](auto&& read_bl) mutable -> checksum_errorator::future<> { + auto& checksum = osd_op.op.checksum; + if (read_bl.length() != checksum.length) { + logger().warn("checksum: bytes read {} != {}", + read_bl.length(), checksum.length); + return crimson::ct_error::invarg::make(); + } + // calculate its checksum and put the result in outdata + switch (checksum.type) { + case CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH32: + return do_checksum<Checksummer::xxhash32>(osd_op.indata, + checksum.chunk_size, + read_bl, + osd_op.outdata); + case CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH64: + return do_checksum<Checksummer::xxhash64>(osd_op.indata, + checksum.chunk_size, + read_bl, + osd_op.outdata); + case CEPH_OSD_CHECKSUM_OP_TYPE_CRC32C: + return do_checksum<Checksummer::crc32c>(osd_op.indata, + checksum.chunk_size, + read_bl, + osd_op.outdata); + default: + logger().warn("checksum: unknown crc type ({})", + static_cast<uint32_t>(checksum.type)); + return crimson::ct_error::invarg::make(); + } + }); +} + +PGBackend::cmp_ext_errorator::future<> +PGBackend::cmp_ext(const ObjectState& os, OSDOp& osd_op) +{ + const ceph_osd_op& op = osd_op.op; + // return the index of the first unmatched byte in the payload, hence the + // strange limit and check + if (op.extent.length > MAX_ERRNO) { + return crimson::ct_error::invarg::make(); + } + uint64_t obj_size = os.oi.size; + if (os.oi.truncate_seq < op.extent.truncate_seq && + op.extent.offset + op.extent.length > op.extent.truncate_size) { + obj_size = op.extent.truncate_size; + } + uint64_t ext_len; + if (op.extent.offset >= obj_size) { + ext_len = 0; + } else if (op.extent.offset + op.extent.length > obj_size) { + ext_len = obj_size - op.extent.offset; + } else { + ext_len = op.extent.length; + } + auto read_ext = ll_read_errorator::make_ready_future<ceph::bufferlist>(); + if (ext_len == 0) { + logger().debug("{}: zero length extent", __func__); + } else if (!os.exists || os.oi.is_whiteout()) { + logger().debug("{}: {} DNE", __func__, os.oi.soid); + } else { + read_ext = _read(os.oi.soid, op.extent.offset, ext_len, 0); + } + return read_ext.safe_then([&osd_op](auto&& read_bl) { + int32_t retcode = 0; + for (unsigned index = 0; index < osd_op.indata.length(); index++) { + char byte_in_op = osd_op.indata[index]; + char byte_from_disk = (index < read_bl.length() ? 
read_bl[index] : 0); + if (byte_in_op != byte_from_disk) { + logger().debug("cmp_ext: mismatch at {}", index); + retcode = -MAX_ERRNO - index; + break; + } + } + logger().debug("cmp_ext: {}", retcode); + osd_op.rval = retcode; + }); +} + +PGBackend::stat_errorator::future<> PGBackend::stat( + const ObjectState& os, + OSDOp& osd_op) +{ + if (os.exists/* TODO: && !os.is_whiteout() */) { + logger().debug("stat os.oi.size={}, os.oi.mtime={}", os.oi.size, os.oi.mtime); + encode(os.oi.size, osd_op.outdata); + encode(os.oi.mtime, osd_op.outdata); + } else { + logger().debug("stat object does not exist"); + return crimson::ct_error::enoent::make(); + } + return stat_errorator::now(); + // TODO: ctx->delta_stats.num_rd++; +} + +bool PGBackend::maybe_create_new_object( + ObjectState& os, + ceph::os::Transaction& txn) +{ + if (!os.exists) { + ceph_assert(!os.oi.is_whiteout()); + os.exists = true; + os.oi.new_object(); + + txn.touch(coll->get_cid(), ghobject_t{os.oi.soid}); + // TODO: delta_stats.num_objects++ + return false; + } else if (os.oi.is_whiteout()) { + os.oi.clear_flag(object_info_t::FLAG_WHITEOUT); + // TODO: delta_stats.num_whiteouts-- + } + return true; +} + +static bool is_offset_and_length_valid( + const std::uint64_t offset, + const std::uint64_t length) +{ + if (const std::uint64_t max = local_conf()->osd_max_object_size; + offset >= max || length > max || offset + length > max) { + logger().debug("{} osd_max_object_size: {}, offset: {}, len: {}; " + "Hard limit of object size is 4GB", + __func__, max, offset, length); + return false; + } else { + return true; + } +} + +seastar::future<> PGBackend::write( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& txn, + osd_op_params_t& osd_op_params) +{ + const ceph_osd_op& op = osd_op.op; + uint64_t offset = op.extent.offset; + uint64_t length = op.extent.length; + bufferlist buf = osd_op.indata; + if (auto seq = os.oi.truncate_seq; + seq != 0 && op.extent.truncate_seq < seq) { + // old write, arrived after trimtrunc + if (offset + length > os.oi.size) { + // no-op + if (offset > os.oi.size) { + length = 0; + buf.clear(); + } else { + // truncate + auto len = os.oi.size - offset; + buf.splice(len, length); + length = len; + } + } + } else if (op.extent.truncate_seq > seq) { + // write arrives before trimtrunc + if (os.exists && !os.oi.is_whiteout()) { + txn.truncate(coll->get_cid(), + ghobject_t{os.oi.soid}, op.extent.truncate_size); + if (op.extent.truncate_size != os.oi.size) { + os.oi.size = length; + // TODO: truncate_update_size_and_usage() + if (op.extent.truncate_size > os.oi.size) { + osd_op_params.clean_regions.mark_data_region_dirty(os.oi.size, + op.extent.truncate_size - os.oi.size); + } else { + osd_op_params.clean_regions.mark_data_region_dirty(op.extent.truncate_size, + os.oi.size - op.extent.truncate_size); + } + } + } + os.oi.truncate_seq = op.extent.truncate_seq; + os.oi.truncate_size = op.extent.truncate_size; + } + maybe_create_new_object(os, txn); + if (length == 0) { + if (offset > os.oi.size) { + txn.truncate(coll->get_cid(), ghobject_t{os.oi.soid}, op.extent.offset); + } else { + txn.nop(); + } + } else { + txn.write(coll->get_cid(), ghobject_t{os.oi.soid}, + offset, length, std::move(buf), op.flags); + os.oi.size = std::max(offset + length, os.oi.size); + } + osd_op_params.clean_regions.mark_data_region_dirty(op.extent.offset, + op.extent.length); + + return seastar::now(); +} + +seastar::future<> PGBackend::write_same( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& txn, + 
osd_op_params_t& osd_op_params) +{ + const ceph_osd_op& op = osd_op.op; + const uint64_t len = op.writesame.length; + if (len == 0) { + return seastar::now(); + } + if (op.writesame.data_length == 0 || + len % op.writesame.data_length != 0 || + op.writesame.data_length != osd_op.indata.length()) { + throw crimson::osd::invalid_argument(); + } + ceph::bufferlist repeated_indata; + for (uint64_t size = 0; size < len; size += op.writesame.data_length) { + repeated_indata.append(osd_op.indata); + } + maybe_create_new_object(os, txn); + txn.write(coll->get_cid(), ghobject_t{os.oi.soid}, + op.writesame.offset, len, + std::move(repeated_indata), op.flags); + os.oi.size = len; + osd_op_params.clean_regions.mark_data_region_dirty(op.writesame.offset, len); + return seastar::now(); +} + +seastar::future<> PGBackend::writefull( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& txn, + osd_op_params_t& osd_op_params) +{ + const ceph_osd_op& op = osd_op.op; + if (op.extent.length != osd_op.indata.length()) { + throw crimson::osd::invalid_argument(); + } + + const bool existing = maybe_create_new_object(os, txn); + if (existing && op.extent.length < os.oi.size) { + txn.truncate(coll->get_cid(), ghobject_t{os.oi.soid}, op.extent.length); + osd_op_params.clean_regions.mark_data_region_dirty(op.extent.length, + os.oi.size - op.extent.length); + } + if (op.extent.length) { + txn.write(coll->get_cid(), ghobject_t{os.oi.soid}, 0, op.extent.length, + osd_op.indata, op.flags); + os.oi.size = op.extent.length; + osd_op_params.clean_regions.mark_data_region_dirty(0, + std::max((uint64_t) op.extent.length, os.oi.size)); + } + return seastar::now(); +} + +PGBackend::append_errorator::future<> PGBackend::append( + ObjectState& os, + OSDOp& osd_op, + ceph::os::Transaction& txn, + osd_op_params_t& osd_op_params) +{ + const ceph_osd_op& op = osd_op.op; + if (op.extent.length != osd_op.indata.length()) { + return crimson::ct_error::invarg::make(); + } + maybe_create_new_object(os, txn); + if (op.extent.length) { + txn.write(coll->get_cid(), ghobject_t{os.oi.soid}, + os.oi.size /* offset */, op.extent.length, + std::move(osd_op.indata), op.flags); + os.oi.size += op.extent.length; + osd_op_params.clean_regions.mark_data_region_dirty(os.oi.size, + op.extent.length); + } + return seastar::now(); +} + +PGBackend::write_ertr::future<> PGBackend::truncate( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& txn, + osd_op_params_t& osd_op_params) +{ + if (!os.exists || os.oi.is_whiteout()) { + logger().debug("{} object dne, truncate is a no-op", __func__); + return write_ertr::now(); + } + const ceph_osd_op& op = osd_op.op; + if (!is_offset_and_length_valid(op.extent.offset, op.extent.length)) { + return crimson::ct_error::file_too_large::make(); + } + if (op.extent.truncate_seq) { + assert(op.extent.offset == op.extent.truncate_size); + if (op.extent.truncate_seq <= os.oi.truncate_seq) { + logger().debug("{} truncate seq {} <= current {}, no-op", + __func__, op.extent.truncate_seq, os.oi.truncate_seq); + return write_ertr::make_ready_future<>(); + } else { + logger().debug("{} truncate seq {} > current {}, truncating", + __func__, op.extent.truncate_seq, os.oi.truncate_seq); + os.oi.truncate_seq = op.extent.truncate_seq; + os.oi.truncate_size = op.extent.truncate_size; + } + } + maybe_create_new_object(os, txn); + if (os.oi.size != op.extent.offset) { + txn.truncate(coll->get_cid(), + ghobject_t{os.oi.soid}, op.extent.offset); + if (os.oi.size > op.extent.offset) { + // TODO: 
modified_ranges.union_of(trim);
+      osd_op_params.clean_regions.mark_data_region_dirty(
+        op.extent.offset,
+        os.oi.size - op.extent.offset);
+    } else {
+      // os.oi.size < op.extent.offset
+      osd_op_params.clean_regions.mark_data_region_dirty(
+        os.oi.size,
+        op.extent.offset - os.oi.size);
+    }
+    os.oi.size = op.extent.offset;
+    os.oi.clear_data_digest();
+  }
+  // TODO: truncate_update_size_and_usage()
+  // TODO: ctx->delta_stats.num_wr++;
+  // ----
+  // do not set exists, or we will break above DELETE -> TRUNCATE munging.
+  return write_ertr::now();
+}
+
+PGBackend::write_ertr::future<> PGBackend::zero(
+  ObjectState& os,
+  const OSDOp& osd_op,
+  ceph::os::Transaction& txn,
+  osd_op_params_t& osd_op_params)
+{
+  if (!os.exists || os.oi.is_whiteout()) {
+    logger().debug("{} object dne, zero is a no-op", __func__);
+    return write_ertr::now();
+  }
+  const ceph_osd_op& op = osd_op.op;
+  if (!is_offset_and_length_valid(op.extent.offset, op.extent.length)) {
+    return crimson::ct_error::file_too_large::make();
+  }
+  assert(op.extent.length);
+  txn.zero(coll->get_cid(),
+           ghobject_t{os.oi.soid},
+           op.extent.offset,
+           op.extent.length);
+  // TODO: modified_ranges.union_of(zeroed);
+  osd_op_params.clean_regions.mark_data_region_dirty(op.extent.offset,
+                                                     op.extent.length);
+  // TODO: ctx->delta_stats.num_wr++;
+  os.oi.clear_data_digest();
+  return write_ertr::now();
+}
+
+seastar::future<> PGBackend::create(
+  ObjectState& os,
+  const OSDOp& osd_op,
+  ceph::os::Transaction& txn)
+{
+  if (os.exists && !os.oi.is_whiteout() &&
+      (osd_op.op.flags & CEPH_OSD_OP_FLAG_EXCL)) {
+    // this is an exclusive create
+    throw crimson::osd::make_error(-EEXIST);
+  }
+
+  if (osd_op.indata.length()) {
+    // handle the legacy `category` argument; it is no longer implemented.
+    try {
+      auto p = osd_op.indata.cbegin();
+      std::string category;
+      decode(category, p);
+    } catch (buffer::error&) {
+      throw crimson::osd::invalid_argument();
+    }
+  }
+  maybe_create_new_object(os, txn);
+  txn.nop();
+  return seastar::now();
+}
+
+seastar::future<> PGBackend::remove(ObjectState& os,
+                                    ceph::os::Transaction& txn)
+{
+  // todo: snapset
+  txn.remove(coll->get_cid(),
+             ghobject_t{os.oi.soid, ghobject_t::NO_GEN, shard});
+  os.oi.size = 0;
+  os.oi.new_object();
+  os.exists = false;
+  // todo: update watchers
+  if (os.oi.is_whiteout()) {
+    os.oi.clear_flag(object_info_t::FLAG_WHITEOUT);
+  }
+  return seastar::now();
+}
+
+seastar::future<std::tuple<std::vector<hobject_t>, hobject_t>>
+PGBackend::list_objects(const hobject_t& start, uint64_t limit) const
+{
+  if (__builtin_expect(stopping, false)) {
+    throw crimson::common::system_shutdown_exception();
+  }
+
+  auto gstart = start.is_min() ?
ghobject_t{} : ghobject_t{start, 0, shard}; + return store->list_objects(coll, + gstart, + ghobject_t::get_max(), + limit) + .then([](auto ret) { + auto& [gobjects, next] = ret; + std::vector<hobject_t> objects; + boost::copy(gobjects | + boost::adaptors::filtered([](const ghobject_t& o) { + if (o.is_pgmeta()) { + return false; + } else if (o.hobj.is_temp()) { + return false; + } else { + return o.is_no_gen(); + } + }) | + boost::adaptors::transformed([](const ghobject_t& o) { + return o.hobj; + }), + std::back_inserter(objects)); + return seastar::make_ready_future<std::tuple<std::vector<hobject_t>, hobject_t>>( + std::make_tuple(objects, next.hobj)); + }); +} + +seastar::future<> PGBackend::setxattr( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& txn) +{ + if (local_conf()->osd_max_attr_size > 0 && + osd_op.op.xattr.value_len > local_conf()->osd_max_attr_size) { + throw crimson::osd::make_error(-EFBIG); + } + + const auto max_name_len = std::min<uint64_t>( + store->get_max_attr_name_length(), local_conf()->osd_max_attr_name_len); + if (osd_op.op.xattr.name_len > max_name_len) { + throw crimson::osd::make_error(-ENAMETOOLONG); + } + + maybe_create_new_object(os, txn); + + std::string name{"_"}; + ceph::bufferlist val; + { + auto bp = osd_op.indata.cbegin(); + bp.copy(osd_op.op.xattr.name_len, name); + bp.copy(osd_op.op.xattr.value_len, val); + } + logger().debug("setxattr on obj={} for attr={}", os.oi.soid, name); + + txn.setattr(coll->get_cid(), ghobject_t{os.oi.soid}, name, val); + return seastar::now(); + //ctx->delta_stats.num_wr++; +} + +PGBackend::get_attr_errorator::future<> PGBackend::getxattr( + const ObjectState& os, + OSDOp& osd_op) const +{ + std::string name; + ceph::bufferlist val; + { + auto bp = osd_op.indata.cbegin(); + std::string aname; + bp.copy(osd_op.op.xattr.name_len, aname); + name = "_" + aname; + } + logger().debug("getxattr on obj={} for attr={}", os.oi.soid, name); + return getxattr(os.oi.soid, name).safe_then([&osd_op] (ceph::bufferptr val) { + osd_op.outdata.clear(); + osd_op.outdata.push_back(std::move(val)); + osd_op.op.xattr.value_len = osd_op.outdata.length(); + return get_attr_errorator::now(); + //ctx->delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10); + }); + //ctx->delta_stats.num_rd++; +} + +PGBackend::get_attr_errorator::future<ceph::bufferptr> PGBackend::getxattr( + const hobject_t& soid, + std::string_view key) const +{ + if (__builtin_expect(stopping, false)) { + throw crimson::common::system_shutdown_exception(); + } + + return store->get_attr(coll, ghobject_t{soid}, key); +} + +PGBackend::get_attr_errorator::future<> PGBackend::get_xattrs( + const ObjectState& os, + OSDOp& osd_op) const +{ + if (__builtin_expect(stopping, false)) { + throw crimson::common::system_shutdown_exception(); + } + return store->get_attrs(coll, ghobject_t{os.oi.soid}).safe_then( + [&osd_op](auto&& attrs) { + std::vector<std::pair<std::string, bufferlist>> user_xattrs; + for (auto& [key, val] : attrs) { + if (key.size() > 1 && key[0] == '_') { + ceph::bufferlist bl; + bl.append(std::move(val)); + user_xattrs.emplace_back(key.substr(1), std::move(bl)); + } + } + ceph::encode(user_xattrs, osd_op.outdata); + return get_attr_errorator::now(); + }); +} + +PGBackend::rm_xattr_ertr::future<> PGBackend::rm_xattr( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& txn) +{ + if (__builtin_expect(stopping, false)) { + throw crimson::common::system_shutdown_exception(); + } + if (!os.exists || os.oi.is_whiteout()) { + 
logger().debug("{}: {} DNE", __func__, os.oi.soid); + return crimson::ct_error::enoent::make(); + } + auto bp = osd_op.indata.cbegin(); + string attr_name{"_"}; + bp.copy(osd_op.op.xattr.name_len, attr_name); + txn.rmattr(coll->get_cid(), ghobject_t{os.oi.soid}, attr_name); + return rm_xattr_ertr::now(); +} + +using get_omap_ertr = + crimson::os::FuturizedStore::read_errorator::extend< + crimson::ct_error::enodata>; +static +get_omap_ertr::future< + crimson::os::FuturizedStore::omap_values_t> +maybe_get_omap_vals_by_keys( + crimson::os::FuturizedStore* store, + const crimson::os::CollectionRef& coll, + const object_info_t& oi, + const std::set<std::string>& keys_to_get) +{ + if (oi.is_omap()) { + return store->omap_get_values(coll, ghobject_t{oi.soid}, keys_to_get); + } else { + return crimson::ct_error::enodata::make(); + } +} + +static +get_omap_ertr::future< + std::tuple<bool, crimson::os::FuturizedStore::omap_values_t>> +maybe_get_omap_vals( + crimson::os::FuturizedStore* store, + const crimson::os::CollectionRef& coll, + const object_info_t& oi, + const std::string& start_after) +{ + if (oi.is_omap()) { + return store->omap_get_values(coll, ghobject_t{oi.soid}, start_after); + } else { + return crimson::ct_error::enodata::make(); + } +} + +PGBackend::ll_read_errorator::future<ceph::bufferlist> +PGBackend::omap_get_header( + const crimson::os::CollectionRef& c, + const ghobject_t& oid) const +{ + return store->omap_get_header(c, oid); +} + +PGBackend::ll_read_errorator::future<> +PGBackend::omap_get_header( + const ObjectState& os, + OSDOp& osd_op) const +{ + return omap_get_header(coll, ghobject_t{os.oi.soid}).safe_then( + [&osd_op] (ceph::bufferlist&& header) { + osd_op.outdata = std::move(header); + return seastar::now(); + }); +} + +PGBackend::ll_read_errorator::future<> +PGBackend::omap_get_keys( + const ObjectState& os, + OSDOp& osd_op) const +{ + if (__builtin_expect(stopping, false)) { + throw crimson::common::system_shutdown_exception(); + } + if (!os.exists || os.oi.is_whiteout()) { + logger().debug("{}: object does not exist: {}", os.oi.soid); + return crimson::ct_error::enoent::make(); + } + std::string start_after; + uint64_t max_return; + try { + auto p = osd_op.indata.cbegin(); + decode(start_after, p); + decode(max_return, p); + } catch (buffer::error&) { + throw crimson::osd::invalid_argument{}; + } + max_return = + std::min(max_return, local_conf()->osd_max_omap_entries_per_request); + + // TODO: truly chunk the reading + return maybe_get_omap_vals(store, coll, os.oi, start_after).safe_then( + [=, &osd_op](auto ret) { + ceph::bufferlist result; + bool truncated = false; + uint32_t num = 0; + for (auto &[key, val] : std::get<1>(ret)) { + if (num >= max_return || + result.length() >= local_conf()->osd_max_omap_bytes_per_request) { + truncated = true; + break; + } + encode(key, result); + ++num; + } + encode(num, osd_op.outdata); + osd_op.outdata.claim_append(result); + encode(truncated, osd_op.outdata); + return seastar::now(); + }).handle_error( + crimson::ct_error::enodata::handle([&osd_op] { + uint32_t num = 0; + bool truncated = false; + encode(num, osd_op.outdata); + encode(truncated, osd_op.outdata); + return seastar::now(); + }), + ll_read_errorator::pass_further{} + ); + // TODO: + //ctx->delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10); + //ctx->delta_stats.num_rd++; +} + +PGBackend::ll_read_errorator::future<> +PGBackend::omap_get_vals( + const ObjectState& os, + OSDOp& osd_op) const +{ + if (__builtin_expect(stopping, false)) { + throw 
crimson::common::system_shutdown_exception(); + } + + std::string start_after; + uint64_t max_return; + std::string filter_prefix; + try { + auto p = osd_op.indata.cbegin(); + decode(start_after, p); + decode(max_return, p); + decode(filter_prefix, p); + } catch (buffer::error&) { + throw crimson::osd::invalid_argument{}; + } + + max_return = \ + std::min(max_return, local_conf()->osd_max_omap_entries_per_request); + + // TODO: truly chunk the reading + return maybe_get_omap_vals(store, coll, os.oi, start_after).safe_then( + [=, &osd_op] (auto&& ret) { + auto [done, vals] = std::move(ret); + assert(done); + ceph::bufferlist result; + bool truncated = false; + uint32_t num = 0; + auto iter = filter_prefix > start_after ? vals.lower_bound(filter_prefix) + : std::begin(vals); + for (; iter != std::end(vals); ++iter) { + const auto& [key, value] = *iter; + if (key.substr(0, filter_prefix.size()) != filter_prefix) { + break; + } else if (num >= max_return || + result.length() >= local_conf()->osd_max_omap_bytes_per_request) { + truncated = true; + break; + } + encode(key, result); + encode(value, result); + ++num; + } + encode(num, osd_op.outdata); + osd_op.outdata.claim_append(result); + encode(truncated, osd_op.outdata); + return ll_read_errorator::now(); + }).handle_error( + crimson::ct_error::enodata::handle([&osd_op] { + encode(uint32_t{0} /* num */, osd_op.outdata); + encode(bool{false} /* truncated */, osd_op.outdata); + return ll_read_errorator::now(); + }), + ll_read_errorator::pass_further{} + ); + + // TODO: + //ctx->delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10); + //ctx->delta_stats.num_rd++; +} + +PGBackend::ll_read_errorator::future<> +PGBackend::omap_get_vals_by_keys( + const ObjectState& os, + OSDOp& osd_op) const +{ + if (__builtin_expect(stopping, false)) { + throw crimson::common::system_shutdown_exception(); + } + if (!os.exists || os.oi.is_whiteout()) { + logger().debug("{}: object does not exist: {}", os.oi.soid); + return crimson::ct_error::enoent::make(); + } + + std::set<std::string> keys_to_get; + try { + auto p = osd_op.indata.cbegin(); + decode(keys_to_get, p); + } catch (buffer::error&) { + throw crimson::osd::invalid_argument(); + } + return maybe_get_omap_vals_by_keys(store, coll, os.oi, keys_to_get).safe_then( + [&osd_op] (crimson::os::FuturizedStore::omap_values_t&& vals) { + encode(vals, osd_op.outdata); + return ll_read_errorator::now(); + }).handle_error( + crimson::ct_error::enodata::handle([&osd_op] { + uint32_t num = 0; + encode(num, osd_op.outdata); + return ll_read_errorator::now(); + }), + ll_read_errorator::pass_further{} + ); + + // TODO: + //ctx->delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10); + //ctx->delta_stats.num_rd++; +} + +seastar::future<> PGBackend::omap_set_vals( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& txn, + osd_op_params_t& osd_op_params) +{ + maybe_create_new_object(os, txn); + + ceph::bufferlist to_set_bl; + try { + auto p = osd_op.indata.cbegin(); + decode_str_str_map_to_bl(p, &to_set_bl); + } catch (buffer::error&) { + throw crimson::osd::invalid_argument{}; + } + + txn.omap_setkeys(coll->get_cid(), ghobject_t{os.oi.soid}, to_set_bl); + + // TODO: + //ctx->clean_regions.mark_omap_dirty(); + + // TODO: + //ctx->delta_stats.num_wr++; + //ctx->delta_stats.num_wr_kb += shift_round_up(to_set_bl.length(), 10); + os.oi.set_flag(object_info_t::FLAG_OMAP); + os.oi.clear_omap_digest(); + osd_op_params.clean_regions.mark_omap_dirty(); + return seastar::now(); +} + 
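+// set the object's omap header to the raw payload carried in osd_op.indata,
+// creating the object first if it does not exist yet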
+seastar::future<> PGBackend::omap_set_header( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& txn) +{ + maybe_create_new_object(os, txn); + txn.omap_setheader(coll->get_cid(), ghobject_t{os.oi.soid}, osd_op.indata); + //TODO: + //ctx->clean_regions.mark_omap_dirty(); + //ctx->delta_stats.num_wr++; + os.oi.set_flag(object_info_t::FLAG_OMAP); + os.oi.clear_omap_digest(); + return seastar::now(); +} + +seastar::future<> PGBackend::omap_remove_range( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& txn) +{ + std::string key_begin, key_end; + try { + auto p = osd_op.indata.cbegin(); + decode(key_begin, p); + decode(key_end, p); + } catch (buffer::error& e) { + throw crimson::osd::invalid_argument{}; + } + txn.omap_rmkeyrange(coll->get_cid(), ghobject_t{os.oi.soid}, key_begin, key_end); + //TODO: + //ctx->delta_stats.num_wr++; + os.oi.clear_omap_digest(); + return seastar::now(); +} + +PGBackend::omap_clear_ertr::future<> +PGBackend::omap_clear( + ObjectState& os, + OSDOp& osd_op, + ceph::os::Transaction& txn, + osd_op_params_t& osd_op_params) +{ + if (__builtin_expect(stopping, false)) { + throw crimson::common::system_shutdown_exception(); + } + if (!os.exists || os.oi.is_whiteout()) { + logger().debug("{}: object does not exist: {}", os.oi.soid); + return crimson::ct_error::enoent::make(); + } + if (!os.oi.is_omap()) { + return omap_clear_ertr::now(); + } + txn.omap_clear(coll->get_cid(), ghobject_t{os.oi.soid}); + osd_op_params.clean_regions.mark_omap_dirty(); + os.oi.clear_omap_digest(); + os.oi.clear_flag(object_info_t::FLAG_OMAP); + return omap_clear_ertr::now(); +} + +seastar::future<struct stat> PGBackend::stat( + CollectionRef c, + const ghobject_t& oid) const +{ + return store->stat(c, oid); +} + +seastar::future<std::map<uint64_t, uint64_t>> +PGBackend::fiemap( + CollectionRef c, + const ghobject_t& oid, + uint64_t off, + uint64_t len) +{ + return store->fiemap(c, oid, off, len); +} + +void PGBackend::on_activate_complete() { + peering.reset(); +} + diff --git a/src/crimson/osd/pg_backend.h b/src/crimson/osd/pg_backend.h new file mode 100644 index 000000000..d8fa8b2ac --- /dev/null +++ b/src/crimson/osd/pg_backend.h @@ -0,0 +1,235 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <map> +#include <memory> +#include <string> +#include <boost/smart_ptr/local_shared_ptr.hpp> +#include <boost/container/flat_set.hpp> + +#include "crimson/os/futurized_store.h" +#include "crimson/os/futurized_collection.h" +#include "crimson/osd/acked_peers.h" +#include "crimson/osd/pg.h" +#include "crimson/common/shared_lru.h" +#include "messages/MOSDOp.h" +#include "messages/MOSDOpReply.h" +#include "os/Transaction.h" +#include "osd/osd_types.h" +#include "crimson/osd/object_context.h" +#include "crimson/osd/osd_operation.h" +#include "crimson/osd/osd_operations/osdop_params.h" + +struct hobject_t; + +namespace ceph::os { + class Transaction; +} + +namespace crimson::osd { + class ShardServices; +} + +class PGBackend +{ +protected: + using CollectionRef = crimson::os::CollectionRef; + using ec_profile_t = std::map<std::string, std::string>; + // low-level read errorator + using ll_read_errorator = crimson::os::FuturizedStore::read_errorator; + +public: + using load_metadata_ertr = crimson::errorator< + crimson::ct_error::object_corrupted>; + PGBackend(shard_id_t shard, CollectionRef coll, crimson::os::FuturizedStore* store); + virtual ~PGBackend() = default; + static 
std::unique_ptr<PGBackend> create(pg_t pgid, + const pg_shard_t pg_shard, + const pg_pool_t& pool, + crimson::os::CollectionRef coll, + crimson::osd::ShardServices& shard_services, + const ec_profile_t& ec_profile); + using attrs_t = + std::map<std::string, ceph::bufferptr, std::less<>>; + using read_errorator = ll_read_errorator::extend< + crimson::ct_error::object_corrupted>; + read_errorator::future<> read( + const ObjectState& os, + OSDOp& osd_op); + read_errorator::future<> sparse_read( + const ObjectState& os, + OSDOp& osd_op); + using checksum_errorator = ll_read_errorator::extend< + crimson::ct_error::object_corrupted, + crimson::ct_error::invarg>; + checksum_errorator::future<> checksum( + const ObjectState& os, + OSDOp& osd_op); + using cmp_ext_errorator = ll_read_errorator::extend< + crimson::ct_error::invarg>; + cmp_ext_errorator::future<> cmp_ext( + const ObjectState& os, + OSDOp& osd_op); + using stat_errorator = crimson::errorator<crimson::ct_error::enoent>; + stat_errorator::future<> stat( + const ObjectState& os, + OSDOp& osd_op); + + // TODO: switch the entire write family to errorator. + using write_ertr = crimson::errorator< + crimson::ct_error::file_too_large>; + seastar::future<> create( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& trans); + seastar::future<> remove( + ObjectState& os, + ceph::os::Transaction& txn); + seastar::future<> write( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& trans, + osd_op_params_t& osd_op_params); + seastar::future<> write_same( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& trans, + osd_op_params_t& osd_op_params); + seastar::future<> writefull( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& trans, + osd_op_params_t& osd_op_params); + using append_errorator = crimson::errorator< + crimson::ct_error::invarg>; + append_errorator::future<> append( + ObjectState& os, + OSDOp& osd_op, + ceph::os::Transaction& trans, + osd_op_params_t& osd_op_params); + write_ertr::future<> truncate( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& trans, + osd_op_params_t& osd_op_params); + write_ertr::future<> zero( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& trans, + osd_op_params_t& osd_op_params); + seastar::future<crimson::osd::acked_peers_t> mutate_object( + std::set<pg_shard_t> pg_shards, + crimson::osd::ObjectContextRef &&obc, + ceph::os::Transaction&& txn, + const osd_op_params_t& osd_op_p, + epoch_t min_epoch, + epoch_t map_epoch, + std::vector<pg_log_entry_t>&& log_entries); + seastar::future<std::tuple<std::vector<hobject_t>, hobject_t>> list_objects( + const hobject_t& start, + uint64_t limit) const; + seastar::future<> setxattr( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& trans); + using get_attr_errorator = crimson::os::FuturizedStore::get_attr_errorator; + get_attr_errorator::future<> getxattr( + const ObjectState& os, + OSDOp& osd_op) const; + get_attr_errorator::future<ceph::bufferptr> getxattr( + const hobject_t& soid, + std::string_view key) const; + get_attr_errorator::future<> get_xattrs( + const ObjectState& os, + OSDOp& osd_op) const; + using rm_xattr_ertr = crimson::errorator<crimson::ct_error::enoent>; + rm_xattr_ertr::future<> rm_xattr( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& trans); + seastar::future<struct stat> stat( + CollectionRef c, + const ghobject_t& oid) const; + seastar::future<std::map<uint64_t, uint64_t>> fiemap( + CollectionRef c, + const 
ghobject_t& oid, + uint64_t off, + uint64_t len); + + // OMAP + ll_read_errorator::future<> omap_get_keys( + const ObjectState& os, + OSDOp& osd_op) const; + ll_read_errorator::future<> omap_get_vals( + const ObjectState& os, + OSDOp& osd_op) const; + ll_read_errorator::future<> omap_get_vals_by_keys( + const ObjectState& os, + OSDOp& osd_op) const; + seastar::future<> omap_set_vals( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& trans, + osd_op_params_t& osd_op_params); + ll_read_errorator::future<ceph::bufferlist> omap_get_header( + const crimson::os::CollectionRef& c, + const ghobject_t& oid) const; + ll_read_errorator::future<> omap_get_header( + const ObjectState& os, + OSDOp& osd_op) const; + seastar::future<> omap_set_header( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& trans); + seastar::future<> omap_remove_range( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& trans); + using omap_clear_ertr = crimson::errorator<crimson::ct_error::enoent>; + omap_clear_ertr::future<> omap_clear( + ObjectState& os, + OSDOp& osd_op, + ceph::os::Transaction& trans, + osd_op_params_t& osd_op_params); + + virtual void got_rep_op_reply(const MOSDRepOpReply&) {} + virtual seastar::future<> stop() = 0; + struct peering_info_t { + bool is_primary; + }; + virtual void on_actingset_changed(peering_info_t pi) = 0; + virtual void on_activate_complete(); +protected: + const shard_id_t shard; + CollectionRef coll; + crimson::os::FuturizedStore* store; + bool stopping = false; + std::optional<peering_info_t> peering; +public: + struct loaded_object_md_t { + ObjectState os; + std::optional<SnapSet> ss; + using ref = std::unique_ptr<loaded_object_md_t>; + }; + load_metadata_ertr::future<loaded_object_md_t::ref> load_metadata( + const hobject_t &oid); + +private: + virtual ll_read_errorator::future<ceph::bufferlist> _read( + const hobject_t& hoid, + size_t offset, + size_t length, + uint32_t flags) = 0; + + bool maybe_create_new_object(ObjectState& os, ceph::os::Transaction& txn); + virtual seastar::future<crimson::osd::acked_peers_t> + _submit_transaction(std::set<pg_shard_t>&& pg_shards, + const hobject_t& hoid, + ceph::os::Transaction&& txn, + const osd_op_params_t& osd_op_p, + epoch_t min_epoch, epoch_t max_epoch, + std::vector<pg_log_entry_t>&& log_entries) = 0; + friend class ReplicatedRecoveryBackend; +}; diff --git a/src/crimson/osd/pg_map.cc b/src/crimson/osd/pg_map.cc new file mode 100644 index 000000000..08071f260 --- /dev/null +++ b/src/crimson/osd/pg_map.cc @@ -0,0 +1,77 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "crimson/osd/pg_map.h" + +#include "crimson/osd/pg.h" +#include "common/Formatter.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_osd); + } +} + +namespace crimson::osd { + +PGMap::PGCreationState::PGCreationState(spg_t pgid) : pgid(pgid) {} +PGMap::PGCreationState::~PGCreationState() {} + +void PGMap::PGCreationState::dump_detail(Formatter *f) const +{ + f->dump_stream("pgid") << pgid; + f->dump_bool("creating", creating); +} + +std::pair<blocking_future<Ref<PG>>, bool> PGMap::wait_for_pg(spg_t pgid) +{ + if (auto pg = get_pg(pgid)) { + return make_pair(make_ready_blocking_future<Ref<PG>>(pg), true); + } else { + auto &state = pgs_creating.emplace(pgid, pgid).first->second; + return make_pair( + state.make_blocking_future(state.promise.get_shared_future()), + state.creating); + } +} + +Ref<PG> PGMap::get_pg(spg_t pgid) +{ 
+ if (auto pg = pgs.find(pgid); pg != pgs.end()) { + return pg->second; + } else { + return nullptr; + } +} + +void PGMap::set_creating(spg_t pgid) +{ + logger().debug("Creating {}", pgid); + ceph_assert(pgs.count(pgid) == 0); + auto pg = pgs_creating.find(pgid); + ceph_assert(pg != pgs_creating.end()); + ceph_assert(pg->second.creating == false); + pg->second.creating = true; +} + +void PGMap::pg_created(spg_t pgid, Ref<PG> pg) +{ + logger().debug("Created {}", pgid); + ceph_assert(!pgs.count(pgid)); + pgs.emplace(pgid, pg); + + auto state = pgs_creating.find(pgid); + ceph_assert(state != pgs_creating.end()); + state->second.promise.set_value(pg); + pgs_creating.erase(pgid); +} + +void PGMap::pg_loaded(spg_t pgid, Ref<PG> pg) +{ + ceph_assert(!pgs.count(pgid)); + pgs.emplace(pgid, pg); +} + +PGMap::~PGMap() {} + +} diff --git a/src/crimson/osd/pg_map.h b/src/crimson/osd/pg_map.h new file mode 100644 index 000000000..b3fe4b562 --- /dev/null +++ b/src/crimson/osd/pg_map.h @@ -0,0 +1,76 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <map> + +#include <seastar/core/future.hh> +#include <seastar/core/shared_future.hh> + +#include "include/types.h" +#include "crimson/common/type_helpers.h" +#include "crimson/osd/osd_operation.h" +#include "crimson/osd/pg.h" +#include "osd/osd_types.h" + +namespace crimson::osd { +class PG; + +class PGMap { + struct PGCreationState : BlockerT<PGCreationState> { + static constexpr const char * type_name = "PGCreation"; + + void dump_detail(Formatter *f) const final; + + spg_t pgid; + seastar::shared_promise<Ref<PG>> promise; + bool creating = false; + PGCreationState(spg_t pgid); + + PGCreationState(const PGCreationState &) = delete; + PGCreationState(PGCreationState &&) = delete; + PGCreationState &operator=(const PGCreationState &) = delete; + PGCreationState &operator=(PGCreationState &&) = delete; + + ~PGCreationState(); + }; + + std::map<spg_t, PGCreationState> pgs_creating; + using pgs_t = std::map<spg_t, Ref<PG>>; + pgs_t pgs; + +public: + /** + * Get future for pg with a bool indicating whether it's already being + * created. 
+ */ + std::pair<blocking_future<Ref<PG>>, bool> wait_for_pg(spg_t pgid); + + /** + * get PG in non-blocking manner + */ + Ref<PG> get_pg(spg_t pgid); + + /** + * Set creating + */ + void set_creating(spg_t pgid); + + /** + * Set newly created pg + */ + void pg_created(spg_t pgid, Ref<PG> pg); + + /** + * Add newly loaded pg + */ + void pg_loaded(spg_t pgid, Ref<PG> pg); + + pgs_t& get_pgs() { return pgs; } + const pgs_t& get_pgs() const { return pgs; } + PGMap() = default; + ~PGMap(); +}; + +} diff --git a/src/crimson/osd/pg_meta.cc b/src/crimson/osd/pg_meta.cc new file mode 100644 index 000000000..ad5385963 --- /dev/null +++ b/src/crimson/osd/pg_meta.cc @@ -0,0 +1,108 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "pg_meta.h" + +#include <string_view> + +#include "crimson/os/futurized_collection.h" +#include "crimson/os/futurized_store.h" + +// prefix pgmeta_oid keys with _ so that PGLog::read_log_and_missing() can +// easily skip them +using crimson::os::FuturizedStore; + +PGMeta::PGMeta(FuturizedStore* store, spg_t pgid) + : store{store}, + pgid{pgid} +{} + +namespace { + template<typename T> + std::optional<T> find_value(const FuturizedStore::omap_values_t& values, + string_view key) + { + auto found = values.find(key); + if (found == values.end()) { + return {}; + } + auto p = found->second.cbegin(); + T value; + decode(value, p); + return std::make_optional(std::move(value)); + } +} + +seastar::future<epoch_t> PGMeta::get_epoch() +{ + return store->open_collection(coll_t{pgid}).then([this](auto ch) { + return store->omap_get_values(ch, + pgid.make_pgmeta_oid(), + {string{infover_key}, + string{epoch_key}}).safe_then( + [](auto&& values) { + { + // sanity check + auto infover = find_value<__u8>(values, infover_key); + assert(infover); + if (*infover < 10) { + throw std::runtime_error("incompatible pg meta"); + } + } + { + auto epoch = find_value<epoch_t>(values, epoch_key); + assert(epoch); + return seastar::make_ready_future<epoch_t>(*epoch); + } + }, + FuturizedStore::read_errorator::assert_all{ + "PGMeta::get_epoch: unable to read pgmeta" + }); + }); +} + +seastar::future<std::tuple<pg_info_t, PastIntervals>> PGMeta::load() +{ + return store->open_collection(coll_t{pgid}).then([this](auto ch) { + return store->omap_get_values(ch, + pgid.make_pgmeta_oid(), + {string{infover_key}, + string{info_key}, + string{biginfo_key}, + string{fastinfo_key}}); + }).safe_then([](auto&& values) { + { + // sanity check + auto infover = find_value<__u8>(values, infover_key); + assert(infover); + if (infover < 10) { + throw std::runtime_error("incompatible pg meta"); + } + } + pg_info_t info; + { + auto found = find_value<pg_info_t>(values, info_key); + assert(found); + info = *std::move(found); + } + PastIntervals past_intervals; + { + using biginfo_t = std::pair<PastIntervals, decltype(info.purged_snaps)>; + auto big_info = find_value<biginfo_t>(values, biginfo_key); + assert(big_info); + past_intervals = std::move(big_info->first); + info.purged_snaps = std::move(big_info->second); + } + { + auto fast_info = find_value<pg_fast_info_t>(values, fastinfo_key); + if (fast_info) { + fast_info->try_apply_to(&info); + } + } + return seastar::make_ready_future<std::tuple<pg_info_t, PastIntervals>>( + std::make_tuple(std::move(info), std::move(past_intervals))); + }, + FuturizedStore::read_errorator::assert_all{ + "PGMeta::load: unable to read pgmeta" + }); +} diff --git a/src/crimson/osd/pg_meta.h b/src/crimson/osd/pg_meta.h new file 
mode 100644 index 000000000..e0aa02716 --- /dev/null +++ b/src/crimson/osd/pg_meta.h @@ -0,0 +1,23 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <tuple> +#include <seastar/core/future.hh> +#include "osd/osd_types.h" + +namespace crimson::os { + class FuturizedStore; +} + +/// PG related metadata +class PGMeta +{ + crimson::os::FuturizedStore* store; + const spg_t pgid; +public: + PGMeta(crimson::os::FuturizedStore *store, spg_t pgid); + seastar::future<epoch_t> get_epoch(); + seastar::future<std::tuple<pg_info_t, PastIntervals>> load(); +}; diff --git a/src/crimson/osd/pg_recovery.cc b/src/crimson/osd/pg_recovery.cc new file mode 100644 index 000000000..7d70b5e8f --- /dev/null +++ b/src/crimson/osd/pg_recovery.cc @@ -0,0 +1,550 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <fmt/format.h> +#include <fmt/ostream.h> + +#include "crimson/common/type_helpers.h" +#include "crimson/osd/backfill_facades.h" +#include "crimson/osd/osd_operations/background_recovery.h" +#include "crimson/osd/osd_operations/peering_event.h" +#include "crimson/osd/pg.h" +#include "crimson/osd/pg_backend.h" +#include "crimson/osd/pg_recovery.h" + +#include "messages/MOSDPGPull.h" +#include "messages/MOSDPGPush.h" +#include "messages/MOSDPGPushReply.h" +#include "messages/MOSDPGRecoveryDelete.h" +#include "messages/MOSDPGRecoveryDeleteReply.h" + +#include "osd/osd_types.h" +#include "osd/PeeringState.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_osd); + } +} + +void PGRecovery::start_pglogbased_recovery() +{ + using PglogBasedRecovery = crimson::osd::PglogBasedRecovery; + (void) pg->get_shard_services().start_operation<PglogBasedRecovery>( + static_cast<crimson::osd::PG*>(pg), + pg->get_shard_services(), + pg->get_osdmap_epoch()); +} + +crimson::osd::blocking_future<bool> +PGRecovery::start_recovery_ops(size_t max_to_start) +{ + assert(pg->is_primary()); + assert(pg->is_peered()); + assert(pg->is_recovering()); + // in ceph-osd the do_recovery() path handles both the pg log-based + // recovery and the backfill, albeit they are separated at the layer + // of PeeringState. In crimson-osd backfill has been cut from it, so + // and do_recovery() is actually solely for pg log-based recovery. + // At the time of writing it's considered to move it to FSM and fix + // the naming as well. 
+ assert(!pg->is_backfilling()); + assert(!pg->get_peering_state().is_deleting()); + + std::vector<crimson::osd::blocking_future<>> started; + started.reserve(max_to_start); + max_to_start -= start_primary_recovery_ops(max_to_start, &started); + if (max_to_start > 0) { + max_to_start -= start_replica_recovery_ops(max_to_start, &started); + } + return crimson::osd::join_blocking_futures(std::move(started)).then( + [this] { + bool done = !pg->get_peering_state().needs_recovery(); + if (done) { + logger().debug("start_recovery_ops: AllReplicasRecovered for pg: {}", + pg->get_pgid()); + using LocalPeeringEvent = crimson::osd::LocalPeeringEvent; + if (!pg->get_peering_state().needs_backfill()) { + logger().debug("start_recovery_ops: AllReplicasRecovered for pg: {}", + pg->get_pgid()); + (void) pg->get_shard_services().start_operation<LocalPeeringEvent>( + static_cast<crimson::osd::PG*>(pg), + pg->get_shard_services(), + pg->get_pg_whoami(), + pg->get_pgid(), + pg->get_osdmap_epoch(), + pg->get_osdmap_epoch(), + PeeringState::AllReplicasRecovered{}); + } else { + logger().debug("start_recovery_ops: RequestBackfill for pg: {}", + pg->get_pgid()); + (void) pg->get_shard_services().start_operation<LocalPeeringEvent>( + static_cast<crimson::osd::PG*>(pg), + pg->get_shard_services(), + pg->get_pg_whoami(), + pg->get_pgid(), + pg->get_osdmap_epoch(), + pg->get_osdmap_epoch(), + PeeringState::RequestBackfill{}); + } + } + return seastar::make_ready_future<bool>(!done); + }); +} + +size_t PGRecovery::start_primary_recovery_ops( + size_t max_to_start, + std::vector<crimson::osd::blocking_future<>> *out) +{ + if (!pg->is_recovering()) { + return 0; + } + + if (!pg->get_peering_state().have_missing()) { + pg->get_peering_state().local_recovery_complete(); + return 0; + } + + const auto &missing = pg->get_peering_state().get_pg_log().get_missing(); + + logger().info("{} recovering {} in pg {}, missing {}", __func__, + pg->get_recovery_backend()->total_recovering(), + *static_cast<crimson::osd::PG*>(pg), + missing); + + unsigned started = 0; + int skipped = 0; + + map<version_t, hobject_t>::const_iterator p = + missing.get_rmissing().lower_bound(pg->get_peering_state().get_pg_log().get_log().last_requested); + while (started < max_to_start && p != missing.get_rmissing().end()) { + // TODO: chain futures here to enable yielding to scheduler? + hobject_t soid; + version_t v = p->first; + + auto it_objects = pg->get_peering_state().get_pg_log().get_log().objects.find(p->second); + if (it_objects != pg->get_peering_state().get_pg_log().get_log().objects.end()) { + // look at log! + pg_log_entry_t *latest = it_objects->second; + assert(latest->is_update() || latest->is_delete()); + soid = latest->soid; + } else { + soid = p->second; + } + const pg_missing_item& item = missing.get_items().find(p->second)->second; + ++p; + + hobject_t head = soid.get_head(); + + logger().info( + "{} {} item.need {} {} {} {} {}", + __func__, + soid, + item.need, + missing.is_missing(soid) ? " (missing)":"", + missing.is_missing(head) ? " (missing head)":"", + pg->get_recovery_backend()->is_recovering(soid) ? " (recovering)":"", + pg->get_recovery_backend()->is_recovering(head) ? 
" (recovering head)":""); + + // TODO: handle lost/unfound + if (pg->get_recovery_backend()->is_recovering(soid)) { + auto& recovery_waiter = pg->get_recovery_backend()->get_recovering(soid); + out->push_back(recovery_waiter.wait_for_recovered_blocking()); + ++started; + } else if (pg->get_recovery_backend()->is_recovering(head)) { + ++skipped; + } else { + out->push_back(recover_missing(soid, item.need)); + ++started; + } + + if (!skipped) + pg->get_peering_state().set_last_requested(v); + } + + logger().info("{} started {} skipped {}", __func__, started, skipped); + + return started; +} + +size_t PGRecovery::start_replica_recovery_ops( + size_t max_to_start, + std::vector<crimson::osd::blocking_future<>> *out) +{ + if (!pg->is_recovering()) { + return 0; + } + uint64_t started = 0; + + assert(!pg->get_peering_state().get_acting_recovery_backfill().empty()); + + auto recovery_order = get_replica_recovery_order(); + for (auto &peer : recovery_order) { + assert(peer != pg->get_peering_state().get_primary()); + const auto& pm = pg->get_peering_state().get_peer_missing(peer); + + logger().debug("{}: peer osd.{} missing {} objects", __func__, + peer, pm.num_missing()); + logger().trace("{}: peer osd.{} missing {}", __func__, + peer, pm.get_items()); + + // recover oldest first + for (auto p = pm.get_rmissing().begin(); + p != pm.get_rmissing().end() && started < max_to_start; + ++p) { + const auto &soid = p->second; + + if (pg->get_peering_state().get_missing_loc().is_unfound(soid)) { + logger().debug("{}: object {} still unfound", __func__, soid); + continue; + } + + const pg_info_t &pi = pg->get_peering_state().get_peer_info(peer); + if (soid > pi.last_backfill) { + if (!pg->get_recovery_backend()->is_recovering(soid)) { + logger().error( + "{}: object {} in missing set for backfill (last_backfill {})" + " but not in recovering", + __func__, + soid, + pi.last_backfill); + ceph_abort(); + } + continue; + } + + if (pg->get_recovery_backend()->is_recovering(soid)) { + logger().debug("{}: already recovering object {}", __func__, soid); + auto& recovery_waiter = pg->get_recovery_backend()->get_recovering(soid); + out->push_back(recovery_waiter.wait_for_recovered_blocking()); + started++; + continue; + } + + if (pg->get_peering_state().get_missing_loc().is_deleted(soid)) { + logger().debug("{}: soid {} is a delete, removing", __func__, soid); + map<hobject_t,pg_missing_item>::const_iterator r = + pm.get_items().find(soid); + started += prep_object_replica_deletes( + soid, r->second.need, out); + continue; + } + + if (soid.is_snap() && + pg->get_peering_state().get_pg_log().get_missing().is_missing( + soid.get_head())) { + logger().debug("{}: head {} still missing on primary", __func__, + soid.get_head()); + continue; + } + + if (pg->get_peering_state().get_pg_log().get_missing().is_missing(soid)) { + logger().debug("{}: soid {} still missing on primary", __func__, soid); + continue; + } + + logger().debug("{}: recover_object_replicas({})", __func__,soid); + map<hobject_t,pg_missing_item>::const_iterator r = pm.get_items().find( + soid); + started += prep_object_replica_pushes( + soid, r->second.need, out); + } + } + + return started; +} + +crimson::osd::blocking_future<> PGRecovery::recover_missing( + const hobject_t &soid, eversion_t need) +{ + if (pg->get_peering_state().get_missing_loc().is_deleted(soid)) { + return pg->get_recovery_backend()->add_recovering(soid).make_blocking_future( + pg->get_recovery_backend()->recover_delete(soid, need)); + } else { + return 
pg->get_recovery_backend()->add_recovering(soid).make_blocking_future( + pg->get_recovery_backend()->recover_object(soid, need).handle_exception( + [=, soid = std::move(soid)] (auto e) { + on_failed_recover({ pg->get_pg_whoami() }, soid, need); + return seastar::make_ready_future<>(); + }) + ); + } +} + +size_t PGRecovery::prep_object_replica_deletes( + const hobject_t& soid, + eversion_t need, + std::vector<crimson::osd::blocking_future<>> *in_progress) +{ + in_progress->push_back( + pg->get_recovery_backend()->add_recovering(soid).make_blocking_future( + pg->get_recovery_backend()->push_delete(soid, need).then([=] { + object_stat_sum_t stat_diff; + stat_diff.num_objects_recovered = 1; + on_global_recover(soid, stat_diff, true); + return seastar::make_ready_future<>(); + }) + ) + ); + return 1; +} + +size_t PGRecovery::prep_object_replica_pushes( + const hobject_t& soid, + eversion_t need, + std::vector<crimson::osd::blocking_future<>> *in_progress) +{ + in_progress->push_back( + pg->get_recovery_backend()->add_recovering(soid).make_blocking_future( + pg->get_recovery_backend()->recover_object(soid, need).handle_exception( + [=, soid = std::move(soid)] (auto e) { + on_failed_recover({ pg->get_pg_whoami() }, soid, need); + return seastar::make_ready_future<>(); + }) + ) + ); + return 1; +} + +void PGRecovery::on_local_recover( + const hobject_t& soid, + const ObjectRecoveryInfo& recovery_info, + const bool is_delete, + ceph::os::Transaction& t) +{ + pg->get_peering_state().recover_got(soid, + recovery_info.version, is_delete, t); + + if (pg->is_primary()) { + if (!is_delete) { + auto& obc = pg->get_recovery_backend()->get_recovering(soid).obc; //TODO: move to pg backend? + obc->obs.exists = true; + obc->obs.oi = recovery_info.oi; + } + if (!pg->is_unreadable_object(soid)) { + pg->get_recovery_backend()->get_recovering(soid).set_readable(); + } + pg->publish_stats_to_osd(); + } +} + +void PGRecovery::on_global_recover ( + const hobject_t& soid, + const object_stat_sum_t& stat_diff, + const bool is_delete) +{ + logger().info("{} {}", __func__, soid); + pg->get_peering_state().object_recovered(soid, stat_diff); + pg->publish_stats_to_osd(); + auto& recovery_waiter = pg->get_recovery_backend()->get_recovering(soid); + if (!is_delete) + recovery_waiter.obc->drop_recovery_read(); + recovery_waiter.set_recovered(); + pg->get_recovery_backend()->remove_recovering(soid); +} + +void PGRecovery::on_failed_recover( + const set<pg_shard_t>& from, + const hobject_t& soid, + const eversion_t& v) +{ + for (auto pg_shard : from) { + if (pg_shard != pg->get_pg_whoami()) { + pg->get_peering_state().force_object_missing(pg_shard, soid, v); + } + } +} + +void PGRecovery::on_peer_recover( + pg_shard_t peer, + const hobject_t &oid, + const ObjectRecoveryInfo &recovery_info) +{ + crimson::get_logger(ceph_subsys_osd).debug( + "{}: {}, {} on {}", __func__, oid, + recovery_info.version, peer); + pg->get_peering_state().on_peer_recover(peer, oid, recovery_info.version); +} + +void PGRecovery::_committed_pushed_object(epoch_t epoch, + eversion_t last_complete) +{ + if (!pg->has_reset_since(epoch)) { + pg->get_peering_state().recovery_committed_to(last_complete); + } else { + crimson::get_logger(ceph_subsys_osd).debug( + "{} pg has changed, not touching last_complete_ondisk", + __func__); + } +} + +template <class EventT> +void PGRecovery::start_backfill_recovery(const EventT& evt) +{ + using BackfillRecovery = crimson::osd::BackfillRecovery; + std::ignore = pg->get_shard_services().start_operation<BackfillRecovery>( + 
static_cast<crimson::osd::PG*>(pg), + pg->get_shard_services(), + pg->get_osdmap_epoch(), + evt); +} + +void PGRecovery::request_replica_scan( + const pg_shard_t& target, + const hobject_t& begin, + const hobject_t& end) +{ + logger().debug("{}: target.osd={}", __func__, target.osd); + auto msg = make_message<MOSDPGScan>( + MOSDPGScan::OP_SCAN_GET_DIGEST, + pg->get_pg_whoami(), + pg->get_osdmap_epoch(), + pg->get_last_peering_reset(), + spg_t(pg->get_pgid().pgid, target.shard), + begin, + end); + std::ignore = pg->get_shard_services().send_to_osd( + target.osd, + std::move(msg), + pg->get_osdmap_epoch()); +} + +void PGRecovery::request_primary_scan( + const hobject_t& begin) +{ + logger().debug("{}", __func__); + using crimson::common::local_conf; + std::ignore = pg->get_recovery_backend()->scan_for_backfill( + begin, + local_conf()->osd_backfill_scan_min, + local_conf()->osd_backfill_scan_max + ).then([this] (BackfillInterval bi) { + logger().debug("request_primary_scan:{}", __func__); + using BackfillState = crimson::osd::BackfillState; + start_backfill_recovery(BackfillState::PrimaryScanned{ std::move(bi) }); + }); +} + +void PGRecovery::enqueue_push( + const hobject_t& obj, + const eversion_t& v) +{ + logger().debug("{}: obj={} v={}", + __func__, obj, v); + pg->get_recovery_backend()->add_recovering(obj); + std::ignore = pg->get_recovery_backend()->recover_object(obj, v).\ + handle_exception([] (auto) { + ceph_abort_msg("got exception on backfill's push"); + return seastar::make_ready_future<>(); + }).then([this, obj] { + logger().debug("enqueue_push:{}", __func__); + using BackfillState = crimson::osd::BackfillState; + start_backfill_recovery(BackfillState::ObjectPushed(std::move(obj))); + }); +} + +void PGRecovery::enqueue_drop( + const pg_shard_t& target, + const hobject_t& obj, + const eversion_t& v) +{ + // allocate a pair if target is seen for the first time + auto& req = backfill_drop_requests[target]; + if (!req) { + req = ceph::make_message<MOSDPGBackfillRemove>( + spg_t(pg->get_pgid().pgid, target.shard), pg->get_osdmap_epoch()); + } + req->ls.emplace_back(obj, v); +} + +void PGRecovery::maybe_flush() +{ + for (auto& [target, req] : backfill_drop_requests) { + std::ignore = pg->get_shard_services().send_to_osd( + target.osd, + std::move(req), + pg->get_osdmap_epoch()); + } + backfill_drop_requests.clear(); +} + +void PGRecovery::update_peers_last_backfill( + const hobject_t& new_last_backfill) +{ + logger().debug("{}: new_last_backfill={}", + __func__, new_last_backfill); + // If new_last_backfill == MAX, then we will send OP_BACKFILL_FINISH to + // all the backfill targets. Otherwise, we will move last_backfill up on + // those targets need it and send OP_BACKFILL_PROGRESS to them. + for (const auto& bt : pg->get_peering_state().get_backfill_targets()) { + if (const pg_info_t& pinfo = pg->get_peering_state().get_peer_info(bt); + new_last_backfill > pinfo.last_backfill) { + pg->get_peering_state().update_peer_last_backfill(bt, new_last_backfill); + auto m = make_message<MOSDPGBackfill>( + pinfo.last_backfill.is_max() ? 
MOSDPGBackfill::OP_BACKFILL_FINISH
+        : MOSDPGBackfill::OP_BACKFILL_PROGRESS,
+        pg->get_osdmap_epoch(),
+        pg->get_last_peering_reset(),
+        spg_t(pg->get_pgid().pgid, bt.shard));
+      // Use default priority here, must match sub_op priority
+      // TODO: if pinfo.last_backfill.is_max(), then
+      //       start_recovery_op(hobject_t::get_max());
+      m->last_backfill = pinfo.last_backfill;
+      m->stats = pinfo.stats;
+      std::ignore = pg->get_shard_services().send_to_osd(
+        bt.osd, std::move(m), pg->get_osdmap_epoch());
+      logger().info("{}: peer {} num_objects now {} / {}",
+                    __func__,
+                    bt,
+                    pinfo.stats.stats.sum.num_objects,
+                    pg->get_info().stats.stats.sum.num_objects);
+    }
+  }
+}
+
+bool PGRecovery::budget_available() const
+{
+  // TODO: the limits!
+  return true;
+}
+
+void PGRecovery::backfilled()
+{
+  using LocalPeeringEvent = crimson::osd::LocalPeeringEvent;
+  std::ignore = pg->get_shard_services().start_operation<LocalPeeringEvent>(
+    static_cast<crimson::osd::PG*>(pg),
+    pg->get_shard_services(),
+    pg->get_pg_whoami(),
+    pg->get_pgid(),
+    pg->get_osdmap_epoch(),
+    pg->get_osdmap_epoch(),
+    PeeringState::Backfilled{});
+}
+
+void PGRecovery::dispatch_backfill_event(
+  boost::intrusive_ptr<const boost::statechart::event_base> evt)
+{
+  logger().debug("{}", __func__);
+  backfill_state->process_event(evt);
+}
+
+void PGRecovery::on_backfill_reserved()
+{
+  logger().debug("{}", __func__);
+  // PIMPL and dependency injection for the sake of unit-testability;
+  // the extra indirection is not a performance concern here.
+  using BackfillState = crimson::osd::BackfillState;
+  backfill_state = std::make_unique<BackfillState>(
+    *this,
+    std::make_unique<crimson::osd::PeeringFacade>(pg->get_peering_state()),
+    std::make_unique<crimson::osd::PGFacade>(
+      *static_cast<crimson::osd::PG*>(pg)));
+  // yes, it's **not** backfilling yet. The PG_STATE_BACKFILLING
+  // will be set after on_backfill_reserved() returns.
+  // Backfill needs to take this into consideration when scheduling
+  // events -- they must be mutually exclusive with PeeringEvent
+  // instances. Otherwise the execution might begin without having
+  // the state updated.
+ ceph_assert(!pg->get_peering_state().is_backfilling()); + start_backfill_recovery(BackfillState::Triggered{}); +} diff --git a/src/crimson/osd/pg_recovery.h b/src/crimson/osd/pg_recovery.h new file mode 100644 index 000000000..86f259de5 --- /dev/null +++ b/src/crimson/osd/pg_recovery.h @@ -0,0 +1,113 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <seastar/core/future.hh> + +#include "crimson/osd/backfill_state.h" +#include "crimson/osd/osd_operation.h" +#include "crimson/osd/pg_recovery_listener.h" +#include "crimson/osd/scheduler/scheduler.h" +#include "crimson/osd/shard_services.h" + +#include "osd/object_state.h" + +class MOSDPGBackfillRemove; +class PGBackend; + +class PGRecovery : public crimson::osd::BackfillState::BackfillListener { +public: + PGRecovery(PGRecoveryListener* pg) : pg(pg) {} + virtual ~PGRecovery() {} + void start_pglogbased_recovery(); + + crimson::osd::blocking_future<bool> start_recovery_ops(size_t max_to_start); + void on_backfill_reserved(); + void dispatch_backfill_event( + boost::intrusive_ptr<const boost::statechart::event_base> evt); + + seastar::future<> stop() { return seastar::now(); } +private: + PGRecoveryListener* pg; + size_t start_primary_recovery_ops( + size_t max_to_start, + std::vector<crimson::osd::blocking_future<>> *out); + size_t start_replica_recovery_ops( + size_t max_to_start, + std::vector<crimson::osd::blocking_future<>> *out); + + std::vector<pg_shard_t> get_replica_recovery_order() const { + return pg->get_replica_recovery_order(); + } + crimson::osd::blocking_future<> recover_missing( + const hobject_t &soid, eversion_t need); + size_t prep_object_replica_deletes( + const hobject_t& soid, + eversion_t need, + std::vector<crimson::osd::blocking_future<>> *in_progress); + size_t prep_object_replica_pushes( + const hobject_t& soid, + eversion_t need, + std::vector<crimson::osd::blocking_future<>> *in_progress); + + void on_local_recover( + const hobject_t& soid, + const ObjectRecoveryInfo& recovery_info, + bool is_delete, + ceph::os::Transaction& t); + void on_global_recover ( + const hobject_t& soid, + const object_stat_sum_t& stat_diff, + bool is_delete); + void on_failed_recover( + const set<pg_shard_t>& from, + const hobject_t& soid, + const eversion_t& v); + void on_peer_recover( + pg_shard_t peer, + const hobject_t &oid, + const ObjectRecoveryInfo &recovery_info); + void _committed_pushed_object(epoch_t epoch, + eversion_t last_complete); + friend class ReplicatedRecoveryBackend; + friend class crimson::osd::UrgentRecovery; + seastar::future<> handle_pull(Ref<MOSDPGPull> m); + seastar::future<> handle_push(Ref<MOSDPGPush> m); + seastar::future<> handle_push_reply(Ref<MOSDPGPushReply> m); + seastar::future<> handle_recovery_delete(Ref<MOSDPGRecoveryDelete> m); + seastar::future<> handle_recovery_delete_reply( + Ref<MOSDPGRecoveryDeleteReply> m); + seastar::future<> handle_pull_response(Ref<MOSDPGPush> m); + seastar::future<> handle_scan(MOSDPGScan& m); + + // backfill begin + std::unique_ptr<crimson::osd::BackfillState> backfill_state; + std::map<pg_shard_t, + ceph::ref_t<MOSDPGBackfillRemove>> backfill_drop_requests; + + template <class EventT> + void start_backfill_recovery( + const EventT& evt); + void request_replica_scan( + const pg_shard_t& target, + const hobject_t& begin, + const hobject_t& end) final; + void request_primary_scan( + const hobject_t& begin) final; + void enqueue_push( + const hobject_t& obj, + const eversion_t& v) final; + 
void enqueue_drop( + const pg_shard_t& target, + const hobject_t& obj, + const eversion_t& v) final; + void maybe_flush() final; + void update_peers_last_backfill( + const hobject_t& new_last_backfill) final; + bool budget_available() const final; + void backfilled() final; + friend crimson::osd::BackfillState::PGFacade; + friend crimson::osd::PG; + // backfill end +}; diff --git a/src/crimson/osd/pg_recovery_listener.h b/src/crimson/osd/pg_recovery_listener.h new file mode 100644 index 000000000..c922b9956 --- /dev/null +++ b/src/crimson/osd/pg_recovery_listener.h @@ -0,0 +1,39 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <seastar/core/future.hh> + +#include "common/hobject.h" +#include "include/types.h" +#include "osd/osd_types.h" + +namespace crimson::osd { + class ShardServices; +}; + +class RecoveryBackend; +class PGRecovery; + +class PGRecoveryListener { +public: + virtual crimson::osd::ShardServices& get_shard_services() = 0; + virtual PGRecovery* get_recovery_handler() = 0; + virtual epoch_t get_osdmap_epoch() const = 0; + virtual bool is_primary() const = 0; + virtual bool is_peered() const = 0; + virtual bool is_recovering() const = 0; + virtual bool is_backfilling() const = 0; + virtual PeeringState& get_peering_state() = 0; + virtual const pg_shard_t& get_pg_whoami() const = 0; + virtual const spg_t& get_pgid() const = 0; + virtual RecoveryBackend* get_recovery_backend() = 0; + virtual bool is_unreadable_object(const hobject_t&, eversion_t* v = 0) const = 0; + virtual bool has_reset_since(epoch_t) const = 0; + virtual std::vector<pg_shard_t> get_replica_recovery_order() const = 0; + virtual epoch_t get_last_peering_reset() const = 0; + virtual const pg_info_t& get_info() const= 0; + virtual seastar::future<> stop() = 0; + virtual void publish_stats_to_osd() = 0; +}; diff --git a/src/crimson/osd/recovery_backend.cc b/src/crimson/osd/recovery_backend.cc new file mode 100644 index 000000000..aeec0d14b --- /dev/null +++ b/src/crimson/osd/recovery_backend.cc @@ -0,0 +1,298 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <fmt/format.h> + +#include "crimson/common/exception.h" +#include "crimson/osd/recovery_backend.h" +#include "crimson/osd/pg.h" +#include "crimson/osd/pg_backend.h" + +#include "messages/MOSDFastDispatchOp.h" +#include "osd/osd_types.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_osd); + } +} + +hobject_t RecoveryBackend::get_temp_recovery_object( + const hobject_t& target, + eversion_t version) const +{ + hobject_t hoid = + target.make_temp_hobject(fmt::format("temp_recovering_{}_{}_{}_{}", + pg.get_info().pgid, + version, + pg.get_info().history.same_interval_since, + target.snap)); + logger().debug("{} {}", __func__, hoid); + return hoid; +} + +void RecoveryBackend::clean_up(ceph::os::Transaction& t, + std::string_view why) +{ + for (auto& soid : temp_contents) { + t.remove(pg.get_collection_ref()->get_cid(), + ghobject_t(soid, ghobject_t::NO_GEN, pg.get_pg_whoami().shard)); + } + temp_contents.clear(); + + for (auto& [soid, recovery_waiter] : recovering) { + if ((recovery_waiter.pi && recovery_waiter.pi->is_complete()) + || (!recovery_waiter.pi + && recovery_waiter.obc && recovery_waiter.obc->obs.exists)) { + recovery_waiter.obc->interrupt( + ::crimson::common::actingset_changed( + pg.is_primary())); + recovery_waiter.interrupt(why); + } + } + recovering.clear(); +} + 
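+// fail every outstanding promise with a shutdown error so that no recovery
+// waiter hangs once the OSD is shutting down.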
+void RecoveryBackend::WaitForObjectRecovery::stop() { + readable.set_exception( + crimson::common::system_shutdown_exception()); + recovered.set_exception( + crimson::common::system_shutdown_exception()); + pulled.set_exception( + crimson::common::system_shutdown_exception()); + for (auto& [pg_shard, pr] : pushes) { + pr.set_exception( + crimson::common::system_shutdown_exception()); + } +} + +void RecoveryBackend::handle_backfill_finish( + MOSDPGBackfill& m) +{ + logger().debug("{}", __func__); + ceph_assert(!pg.is_primary()); + ceph_assert(crimson::common::local_conf()->osd_kill_backfill_at != 1); + auto reply = make_message<MOSDPGBackfill>( + MOSDPGBackfill::OP_BACKFILL_FINISH_ACK, + pg.get_osdmap_epoch(), + m.query_epoch, + spg_t(pg.get_pgid().pgid, pg.get_primary().shard)); + reply->set_priority(pg.get_recovery_op_priority()); + std::ignore = m.get_connection()->send(std::move(reply)); + shard_services.start_operation<crimson::osd::LocalPeeringEvent>( + static_cast<crimson::osd::PG*>(&pg), + shard_services, + pg.get_pg_whoami(), + pg.get_pgid(), + pg.get_osdmap_epoch(), + pg.get_osdmap_epoch(), + RecoveryDone{}); +} + +seastar::future<> RecoveryBackend::handle_backfill_progress( + MOSDPGBackfill& m) +{ + logger().debug("{}", __func__); + ceph_assert(!pg.is_primary()); + ceph_assert(crimson::common::local_conf()->osd_kill_backfill_at != 2); + + ObjectStore::Transaction t; + pg.get_peering_state().update_backfill_progress( + m.last_backfill, + m.stats, + m.op == MOSDPGBackfill::OP_BACKFILL_PROGRESS, + t); + return shard_services.get_store().do_transaction( + pg.get_collection_ref(), std::move(t) + ).or_terminate(); +} + +seastar::future<> RecoveryBackend::handle_backfill_finish_ack( + MOSDPGBackfill& m) +{ + logger().debug("{}", __func__); + ceph_assert(pg.is_primary()); + ceph_assert(crimson::common::local_conf()->osd_kill_backfill_at != 3); + // TODO: + // finish_recovery_op(hobject_t::get_max()); + return seastar::now(); +} + +seastar::future<> RecoveryBackend::handle_backfill( + MOSDPGBackfill& m) +{ + logger().debug("{}", __func__); + switch (m.op) { + case MOSDPGBackfill::OP_BACKFILL_FINISH: + handle_backfill_finish(m); + [[fallthrough]]; + case MOSDPGBackfill::OP_BACKFILL_PROGRESS: + return handle_backfill_progress(m); + case MOSDPGBackfill::OP_BACKFILL_FINISH_ACK: + return handle_backfill_finish_ack(m); + default: + ceph_assert("unknown op type for pg backfill"); + return seastar::now(); + } +} + +seastar::future<> RecoveryBackend::handle_backfill_remove( + MOSDPGBackfillRemove& m) +{ + logger().debug("{} m.ls={}", __func__, m.ls); + assert(m.get_type() == MSG_OSD_PG_BACKFILL_REMOVE); + + ObjectStore::Transaction t; + for ([[maybe_unused]] const auto& [soid, ver] : m.ls) { + // TODO: the reserved space management. PG::try_reserve_recovery_space(). 
+ t.remove(pg.get_collection_ref()->get_cid(), + ghobject_t(soid, ghobject_t::NO_GEN, pg.get_pg_whoami().shard)); + } + return shard_services.get_store().do_transaction( + pg.get_collection_ref(), std::move(t) + ).or_terminate(); +} + +seastar::future<BackfillInterval> RecoveryBackend::scan_for_backfill( + const hobject_t& start, + [[maybe_unused]] const std::int64_t min, + const std::int64_t max) +{ + logger().debug("{} starting from {}", __func__, start); + auto version_map = seastar::make_lw_shared<std::map<hobject_t, eversion_t>>(); + return backend->list_objects(start, max).then( + [this, start, version_map] (auto&& ret) { + auto&& [objects, next] = std::move(ret); + return seastar::parallel_for_each(std::move(objects), + [this, version_map] (const hobject_t& object) { + crimson::osd::ObjectContextRef obc; + if (pg.is_primary()) { + obc = shard_services.obc_registry.maybe_get_cached_obc(object); + } + if (obc) { + if (obc->obs.exists) { + logger().debug("scan_for_backfill found (primary): {} {}", + object, obc->obs.oi.version); + version_map->emplace(object, obc->obs.oi.version); + } else { + // if the object does not exist here, it must have been removed + // between the collection_list_partial and here. This can happen + // for the first item in the range, which is usually last_backfill. + } + return seastar::now(); + } else { + return backend->load_metadata(object).safe_then( + [version_map, object] (auto md) { + if (md->os.exists) { + logger().debug("scan_for_backfill found: {} {}", + object, md->os.oi.version); + version_map->emplace(object, md->os.oi.version); + } + return seastar::now(); + }, PGBackend::load_metadata_ertr::assert_all{}); + } + }).then([version_map, start=std::move(start), next=std::move(next), this] { + BackfillInterval bi; + bi.begin = std::move(start); + bi.end = std::move(next); + bi.version = pg.get_info().last_update; + bi.objects = std::move(*version_map); + logger().debug("{} BackfillInterval filled, leaving", + "scan_for_backfill"); + return seastar::make_ready_future<BackfillInterval>(std::move(bi)); + }); + }); +} + +seastar::future<> RecoveryBackend::handle_scan_get_digest( + MOSDPGScan& m) +{ + logger().debug("{}", __func__); + if (false /* FIXME: check for backfill too full */) { + std::ignore = shard_services.start_operation<crimson::osd::LocalPeeringEvent>( + // TODO: abstract start_background_recovery + static_cast<crimson::osd::PG*>(&pg), + shard_services, + pg.get_pg_whoami(), + pg.get_pgid(), + pg.get_osdmap_epoch(), + pg.get_osdmap_epoch(), + PeeringState::BackfillTooFull()); + return seastar::now(); + } + return scan_for_backfill( + std::move(m.begin), + crimson::common::local_conf().get_val<std::int64_t>("osd_backfill_scan_min"), + crimson::common::local_conf().get_val<std::int64_t>("osd_backfill_scan_max") + ).then([this, + query_epoch=m.query_epoch, + conn=m.get_connection()] (auto backfill_interval) { + auto reply = make_message<MOSDPGScan>( + MOSDPGScan::OP_SCAN_DIGEST, + pg.get_pg_whoami(), + pg.get_osdmap_epoch(), + query_epoch, + spg_t(pg.get_info().pgid.pgid, pg.get_primary().shard), + backfill_interval.begin, + backfill_interval.end); + encode(backfill_interval.objects, reply->get_data()); + return conn->send(std::move(reply)); + }); +} + +seastar::future<> RecoveryBackend::handle_scan_digest( + MOSDPGScan& m) +{ + logger().debug("{}", __func__); + // Check that from is in backfill_targets vector + ceph_assert(pg.is_backfill_target(m.from)); + + BackfillInterval bi; + bi.begin = m.begin; + bi.end = m.end; + { + auto p = 
m.get_data().cbegin(); + // take care to preserve ordering! + bi.clear_objects(); + ::decode_noclear(bi.objects, p); + } + shard_services.start_operation<crimson::osd::BackfillRecovery>( + static_cast<crimson::osd::PG*>(&pg), + shard_services, + pg.get_osdmap_epoch(), + crimson::osd::BackfillState::ReplicaScanned{ m.from, std::move(bi) }); + return seastar::now(); +} + +seastar::future<> RecoveryBackend::handle_scan( + MOSDPGScan& m) +{ + logger().debug("{}", __func__); + switch (m.op) { + case MOSDPGScan::OP_SCAN_GET_DIGEST: + return handle_scan_get_digest(m); + case MOSDPGScan::OP_SCAN_DIGEST: + return handle_scan_digest(m); + default: + // FIXME: move to errorator + ceph_assert("unknown op type for pg scan"); + return seastar::now(); + } +} + +seastar::future<> RecoveryBackend::handle_recovery_op( + Ref<MOSDFastDispatchOp> m) +{ + switch (m->get_header().type) { + case MSG_OSD_PG_BACKFILL: + return handle_backfill(*boost::static_pointer_cast<MOSDPGBackfill>(m)); + case MSG_OSD_PG_BACKFILL_REMOVE: + return handle_backfill_remove(*boost::static_pointer_cast<MOSDPGBackfillRemove>(m)); + case MSG_OSD_PG_SCAN: + return handle_scan(*boost::static_pointer_cast<MOSDPGScan>(m)); + default: + return seastar::make_exception_future<>( + std::invalid_argument(fmt::format("invalid request type: {}", + m->get_header().type))); + } +} diff --git a/src/crimson/osd/recovery_backend.h b/src/crimson/osd/recovery_backend.h new file mode 100644 index 000000000..cb0ae9f20 --- /dev/null +++ b/src/crimson/osd/recovery_backend.h @@ -0,0 +1,203 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <seastar/core/future.hh> + +#include "crimson/common/type_helpers.h" +#include "crimson/os/futurized_store.h" +#include "crimson/os/futurized_collection.h" +#include "crimson/osd/object_context.h" +#include "crimson/osd/shard_services.h" + +#include "messages/MOSDPGBackfill.h" +#include "messages/MOSDPGBackfillRemove.h" +#include "messages/MOSDPGScan.h" +#include "osd/recovery_types.h" +#include "osd/osd_types.h" + +namespace crimson::osd{ + class PG; +} + +class PGBackend; + +class RecoveryBackend { + void handle_backfill_finish( + MOSDPGBackfill& m); + seastar::future<> handle_backfill_progress( + MOSDPGBackfill& m); + seastar::future<> handle_backfill_finish_ack( + MOSDPGBackfill& m); + seastar::future<> handle_backfill(MOSDPGBackfill& m); + + seastar::future<> handle_backfill_remove(MOSDPGBackfillRemove& m); + + seastar::future<> handle_scan_get_digest( + MOSDPGScan& m); + seastar::future<> handle_scan_digest( + MOSDPGScan& m); + seastar::future<> handle_scan( + MOSDPGScan& m); +protected: + class WaitForObjectRecovery; +public: + RecoveryBackend(crimson::osd::PG& pg, + crimson::osd::ShardServices& shard_services, + crimson::os::CollectionRef coll, + PGBackend* backend) + : pg{pg}, + shard_services{shard_services}, + store{&shard_services.get_store()}, + coll{coll}, + backend{backend} {} + virtual ~RecoveryBackend() {} + WaitForObjectRecovery& add_recovering(const hobject_t& soid) { + auto [it, added] = recovering.emplace(soid, WaitForObjectRecovery{}); + assert(added); + return it->second; + } + WaitForObjectRecovery& get_recovering(const hobject_t& soid) { + assert(is_recovering(soid)); + return recovering.at(soid); + } + void remove_recovering(const hobject_t& soid) { + recovering.erase(soid); + } + bool is_recovering(const hobject_t& soid) const { + return recovering.count(soid) != 0; + } + uint64_t total_recovering() const { + return 
recovering.size(); + } + + virtual seastar::future<> handle_recovery_op( + Ref<MOSDFastDispatchOp> m); + + virtual seastar::future<> recover_object( + const hobject_t& soid, + eversion_t need) = 0; + virtual seastar::future<> recover_delete( + const hobject_t& soid, + eversion_t need) = 0; + virtual seastar::future<> push_delete( + const hobject_t& soid, + eversion_t need) = 0; + + seastar::future<BackfillInterval> scan_for_backfill( + const hobject_t& from, + std::int64_t min, + std::int64_t max); + + void on_peering_interval_change(ceph::os::Transaction& t) { + clean_up(t, "new peering interval"); + } + + seastar::future<> stop() { + for (auto& [soid, recovery_waiter] : recovering) { + recovery_waiter.stop(); + } + return on_stop(); + } +protected: + crimson::osd::PG& pg; + crimson::osd::ShardServices& shard_services; + crimson::os::FuturizedStore* store; + crimson::os::CollectionRef coll; + PGBackend* backend; + + struct PullInfo { + pg_shard_t from; + hobject_t soid; + ObjectRecoveryProgress recovery_progress; + ObjectRecoveryInfo recovery_info; + crimson::osd::ObjectContextRef head_ctx; + crimson::osd::ObjectContextRef obc; + object_stat_sum_t stat; + bool is_complete() const { + return recovery_progress.is_complete(recovery_info); + } + }; + + struct PushInfo { + ObjectRecoveryProgress recovery_progress; + ObjectRecoveryInfo recovery_info; + crimson::osd::ObjectContextRef obc; + object_stat_sum_t stat; + }; + + class WaitForObjectRecovery : public crimson::osd::BlockerT<WaitForObjectRecovery> { + seastar::shared_promise<> readable, recovered, pulled; + std::map<pg_shard_t, seastar::shared_promise<>> pushes; + public: + static constexpr const char* type_name = "WaitForObjectRecovery"; + + crimson::osd::ObjectContextRef obc; + std::optional<PullInfo> pi; + std::map<pg_shard_t, PushInfo> pushing; + + seastar::future<> wait_for_readable() { + return readable.get_shared_future(); + } + seastar::future<> wait_for_pushes(pg_shard_t shard) { + return pushes[shard].get_shared_future(); + } + seastar::future<> wait_for_recovered() { + return recovered.get_shared_future(); + } + crimson::osd::blocking_future<> + wait_for_recovered_blocking() { + return make_blocking_future( + recovered.get_shared_future()); + } + seastar::future<> wait_for_pull() { + return pulled.get_shared_future(); + } + void set_readable() { + readable.set_value(); + } + void set_recovered() { + recovered.set_value(); + } + void set_pushed(pg_shard_t shard) { + pushes[shard].set_value(); + } + void set_pulled() { + pulled.set_value(); + } + void set_push_failed(pg_shard_t shard, std::exception_ptr e) { + pushes.at(shard).set_exception(e); + } + void interrupt(std::string_view why) { + readable.set_exception(std::system_error( + std::make_error_code(std::errc::interrupted), why.data())); + recovered.set_exception(std::system_error( + std::make_error_code(std::errc::interrupted), why.data())); + pulled.set_exception(std::system_error( + std::make_error_code(std::errc::interrupted), why.data())); + for (auto& [pg_shard, pr] : pushes) { + pr.set_exception(std::system_error( + std::make_error_code(std::errc::interrupted), why.data())); + } + } + void stop(); + void dump_detail(Formatter* f) const { + } + }; + std::map<hobject_t, WaitForObjectRecovery> recovering; + hobject_t get_temp_recovery_object( + const hobject_t& target, + eversion_t version) const; + + boost::container::flat_set<hobject_t> temp_contents; + + void add_temp_obj(const hobject_t &oid) { + temp_contents.insert(oid); + } + void clear_temp_obj(const hobject_t 
&oid) { + temp_contents.erase(oid); + } + void clean_up(ceph::os::Transaction& t, std::string_view why); + virtual seastar::future<> on_stop() = 0; +}; diff --git a/src/crimson/osd/replicated_backend.cc b/src/crimson/osd/replicated_backend.cc new file mode 100644 index 000000000..3a131278b --- /dev/null +++ b/src/crimson/osd/replicated_backend.cc @@ -0,0 +1,144 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "replicated_backend.h" + +#include "messages/MOSDRepOpReply.h" + +#include "crimson/common/exception.h" +#include "crimson/common/log.h" +#include "crimson/os/futurized_store.h" +#include "crimson/osd/shard_services.h" +#include "osd/PeeringState.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_osd); + } +} + +ReplicatedBackend::ReplicatedBackend(pg_t pgid, + pg_shard_t whoami, + ReplicatedBackend::CollectionRef coll, + crimson::osd::ShardServices& shard_services) + : PGBackend{whoami.shard, coll, &shard_services.get_store()}, + pgid{pgid}, + whoami{whoami}, + shard_services{shard_services} +{} + +ReplicatedBackend::ll_read_errorator::future<ceph::bufferlist> +ReplicatedBackend::_read(const hobject_t& hoid, + const uint64_t off, + const uint64_t len, + const uint32_t flags) +{ + if (__builtin_expect(stopping, false)) { + throw crimson::common::system_shutdown_exception(); + } + return store->read(coll, ghobject_t{hoid}, off, len, flags); +} + +seastar::future<crimson::osd::acked_peers_t> +ReplicatedBackend::_submit_transaction(std::set<pg_shard_t>&& pg_shards, + const hobject_t& hoid, + ceph::os::Transaction&& txn, + const osd_op_params_t& osd_op_p, + epoch_t min_epoch, epoch_t map_epoch, + std::vector<pg_log_entry_t>&& log_entries) +{ + if (__builtin_expect(stopping, false)) { + throw crimson::common::system_shutdown_exception(); + } + if (__builtin_expect((bool)peering, false)) { + throw crimson::common::actingset_changed(peering->is_primary); + } + + const ceph_tid_t tid = next_txn_id++; + auto req_id = osd_op_p.req->get_reqid(); + auto pending_txn = + pending_trans.emplace(tid, pg_shards.size()).first; + bufferlist encoded_txn; + encode(txn, encoded_txn); + + return seastar::parallel_for_each(std::move(pg_shards), + [=, encoded_txn=std::move(encoded_txn), txn=std::move(txn)] + (auto pg_shard) mutable { + if (pg_shard == whoami) { + return shard_services.get_store().do_transaction(coll,std::move(txn)); + } else { + auto m = make_message<MOSDRepOp>(req_id, whoami, + spg_t{pgid, pg_shard.shard}, hoid, + CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK, + map_epoch, min_epoch, + tid, osd_op_p.at_version); + m->set_data(encoded_txn); + pending_txn->second.acked_peers.push_back({pg_shard, eversion_t{}}); + encode(log_entries, m->logbl); + m->pg_trim_to = osd_op_p.pg_trim_to; + m->min_last_complete_ondisk = osd_op_p.min_last_complete_ondisk; + m->set_rollback_to(osd_op_p.at_version); + // TODO: set more stuff. 
e.g., pg_states + return shard_services.send_to_osd(pg_shard.osd, std::move(m), map_epoch); + } + }).then([this, peers=pending_txn->second.weak_from_this()] { + if (!peers) { + // for now, only actingset_changed can cause peers + // to be nullptr + assert(peering); + throw crimson::common::actingset_changed(peering->is_primary); + } + if (--peers->pending == 0) { + peers->all_committed.set_value(); + peers->all_committed = {}; + return seastar::now(); + } + return peers->all_committed.get_future(); + }).then([pending_txn, this] { + auto acked_peers = std::move(pending_txn->second.acked_peers); + pending_trans.erase(pending_txn); + return seastar::make_ready_future<crimson::osd::acked_peers_t>(std::move(acked_peers)); + }); +} + +void ReplicatedBackend::on_actingset_changed(peering_info_t pi) +{ + peering.emplace(pi); + crimson::common::actingset_changed e_actingset_changed{peering->is_primary}; + for (auto& [tid, pending_txn] : pending_trans) { + pending_txn.all_committed.set_exception(e_actingset_changed); + } + pending_trans.clear(); +} + +void ReplicatedBackend::got_rep_op_reply(const MOSDRepOpReply& reply) +{ + auto found = pending_trans.find(reply.get_tid()); + if (found == pending_trans.end()) { + logger().warn("{}: no matched pending rep op: {}", __func__, reply); + return; + } + auto& peers = found->second; + for (auto& peer : peers.acked_peers) { + if (peer.shard == reply.from) { + peer.last_complete_ondisk = reply.get_last_complete_ondisk(); + if (--peers.pending == 0) { + peers.all_committed.set_value(); + peers.all_committed = {}; + } + return; + } + } +} + +seastar::future<> ReplicatedBackend::stop() +{ + logger().info("ReplicatedBackend::stop {}", coll->get_cid()); + stopping = true; + for (auto& [tid, pending_on] : pending_trans) { + pending_on.all_committed.set_exception( + crimson::common::system_shutdown_exception()); + } + pending_trans.clear(); + return seastar::now(); +} diff --git a/src/crimson/osd/replicated_backend.h b/src/crimson/osd/replicated_backend.h new file mode 100644 index 000000000..01c0bba64 --- /dev/null +++ b/src/crimson/osd/replicated_backend.h @@ -0,0 +1,55 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <boost/intrusive_ptr.hpp> +#include <seastar/core/future.hh> +#include <seastar/core/weak_ptr.hh> +#include "include/buffer_fwd.h" +#include "osd/osd_types.h" + +#include "acked_peers.h" +#include "pg_backend.h" + +namespace crimson::osd { + class ShardServices; +} + +class ReplicatedBackend : public PGBackend +{ +public: + ReplicatedBackend(pg_t pgid, pg_shard_t whoami, + CollectionRef coll, + crimson::osd::ShardServices& shard_services); + void got_rep_op_reply(const MOSDRepOpReply& reply) final; + seastar::future<> stop() final; + void on_actingset_changed(peering_info_t pi) final; +private: + ll_read_errorator::future<ceph::bufferlist> _read(const hobject_t& hoid, + uint64_t off, + uint64_t len, + uint32_t flags) override; + seastar::future<crimson::osd::acked_peers_t> + _submit_transaction(std::set<pg_shard_t>&& pg_shards, + const hobject_t& hoid, + ceph::os::Transaction&& txn, + const osd_op_params_t& osd_op_p, + epoch_t min_epoch, epoch_t max_epoch, + std::vector<pg_log_entry_t>&& log_entries) final; + const pg_t pgid; + const pg_shard_t whoami; + crimson::osd::ShardServices& shard_services; + ceph_tid_t next_txn_id = 0; + class pending_on_t : public seastar::weakly_referencable<pending_on_t> { + public: + pending_on_t(size_t pending) + : 
pending{static_cast<unsigned>(pending)} + {} + unsigned pending; + crimson::osd::acked_peers_t acked_peers; + seastar::promise<> all_committed; + }; + using pending_transactions_t = std::map<ceph_tid_t, pending_on_t>; + pending_transactions_t pending_trans; +}; diff --git a/src/crimson/osd/replicated_recovery_backend.cc b/src/crimson/osd/replicated_recovery_backend.cc new file mode 100644 index 000000000..0812003bb --- /dev/null +++ b/src/crimson/osd/replicated_recovery_backend.cc @@ -0,0 +1,1076 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <fmt/format.h> +#include <fmt/ostream.h> +#include <seastar/core/future.hh> +#include <seastar/core/do_with.hh> + +#include "crimson/osd/pg.h" +#include "crimson/osd/pg_backend.h" +#include "replicated_recovery_backend.h" + +#include "msg/Message.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_osd); + } +} + +seastar::future<> ReplicatedRecoveryBackend::recover_object( + const hobject_t& soid, + eversion_t need) +{ + logger().debug("{}: {}, {}", __func__, soid, need); + // always add_recovering(soid) before recover_object(soid) + assert(is_recovering(soid)); + // start tracking the recovery of soid + return maybe_pull_missing_obj(soid, need).then([this, soid, need] { + logger().debug("recover_object: loading obc: {}", soid); + return pg.with_head_obc<RWState::RWREAD>(soid, + [this, soid, need](auto obc) { + logger().debug("recover_object: loaded obc: {}", obc->obs.oi.soid); + auto& recovery_waiter = recovering.at(soid); + recovery_waiter.obc = obc; + recovery_waiter.obc->wait_recovery_read(); + return maybe_push_shards(soid, need); + }).handle_error( + crimson::osd::PG::load_obc_ertr::all_same_way([soid](auto& code) { + // TODO: may need eio handling? 
+ logger().error("recover_object saw error code {}, ignoring object {}", + code, soid); + })); + }); +} + +seastar::future<> +ReplicatedRecoveryBackend::maybe_push_shards( + const hobject_t& soid, + eversion_t need) +{ + return seastar::parallel_for_each(get_shards_to_push(soid), + [this, need, soid](auto shard) { + return prep_push(soid, need, shard).then([this, soid, shard](auto push) { + auto msg = make_message<MOSDPGPush>(); + msg->from = pg.get_pg_whoami(); + msg->pgid = pg.get_pgid(); + msg->map_epoch = pg.get_osdmap_epoch(); + msg->min_epoch = pg.get_last_peering_reset(); + msg->pushes.push_back(std::move(push)); + msg->set_priority(pg.get_recovery_op_priority()); + return shard_services.send_to_osd(shard.osd, + std::move(msg), + pg.get_osdmap_epoch()).then( + [this, soid, shard] { + return recovering.at(soid).wait_for_pushes(shard); + }); + }); + }).then([this, soid] { + auto &recovery = recovering.at(soid); + auto push_info = recovery.pushing.begin(); + object_stat_sum_t stat = {}; + if (push_info != recovery.pushing.end()) { + stat = push_info->second.stat; + } else { + // no push happened, take pull_info's stat + assert(recovery.pi); + stat = recovery.pi->stat; + } + pg.get_recovery_handler()->on_global_recover(soid, stat, false); + return seastar::make_ready_future<>(); + }).handle_exception([this, soid](auto e) { + auto &recovery = recovering.at(soid); + if (recovery.obc) { + recovery.obc->drop_recovery_read(); + } + recovering.erase(soid); + return seastar::make_exception_future<>(e); + }); +} + +seastar::future<> +ReplicatedRecoveryBackend::maybe_pull_missing_obj( + const hobject_t& soid, + eversion_t need) +{ + pg_missing_tracker_t local_missing = pg.get_local_missing(); + if (!local_missing.is_missing(soid)) { + return seastar::make_ready_future<>(); + } + PullOp po; + auto& recovery_waiter = recovering.at(soid); + recovery_waiter.pi = std::make_optional<RecoveryBackend::PullInfo>(); + auto& pi = *recovery_waiter.pi; + prepare_pull(po, pi, soid, need); + auto msg = make_message<MOSDPGPull>(); + msg->from = pg.get_pg_whoami(); + msg->set_priority(pg.get_recovery_op_priority()); + msg->pgid = pg.get_pgid(); + msg->map_epoch = pg.get_osdmap_epoch(); + msg->min_epoch = pg.get_last_peering_reset(); + msg->set_pulls({std::move(po)}); + return shard_services.send_to_osd( + pi.from.osd, + std::move(msg), + pg.get_osdmap_epoch() + ).then([&recovery_waiter] { + return recovery_waiter.wait_for_pull(); + }); +} + +seastar::future<> ReplicatedRecoveryBackend::push_delete( + const hobject_t& soid, + eversion_t need) +{ + logger().debug("{}: {}, {}", __func__, soid, need); + recovering[soid]; + epoch_t min_epoch = pg.get_last_peering_reset(); + + assert(pg.get_acting_recovery_backfill().size() > 0); + return seastar::parallel_for_each(pg.get_acting_recovery_backfill(), + [this, soid, need, min_epoch](pg_shard_t shard) { + if (shard == pg.get_pg_whoami()) + return seastar::make_ready_future<>(); + auto iter = pg.get_shard_missing().find(shard); + if (iter == pg.get_shard_missing().end()) + return seastar::make_ready_future<>(); + if (iter->second.is_missing(soid)) { + logger().debug("push_delete: will remove {} from {}", soid, shard); + pg.begin_peer_recover(shard, soid); + spg_t target_pg(pg.get_info().pgid.pgid, shard.shard); + auto msg = make_message<MOSDPGRecoveryDelete>( + pg.get_pg_whoami(), target_pg, pg.get_osdmap_epoch(), min_epoch); + msg->set_priority(pg.get_recovery_op_priority()); + msg->objects.push_back(std::make_pair(soid, need)); + return 
shard_services.send_to_osd(shard.osd, std::move(msg), + pg.get_osdmap_epoch()).then( + [this, soid, shard] { + return recovering.at(soid).wait_for_pushes(shard); + }); + } + return seastar::make_ready_future<>(); + }); +} + +seastar::future<> ReplicatedRecoveryBackend::handle_recovery_delete( + Ref<MOSDPGRecoveryDelete> m) +{ + logger().debug("{}: {}", __func__, *m); + + auto& p = m->objects.front(); //TODO: only one delete per message for now. + return local_recover_delete(p.first, p.second, pg.get_osdmap_epoch()).then( + [this, m] { + auto reply = make_message<MOSDPGRecoveryDeleteReply>(); + reply->from = pg.get_pg_whoami(); + reply->set_priority(m->get_priority()); + reply->pgid = spg_t(pg.get_info().pgid.pgid, m->from.shard); + reply->map_epoch = m->map_epoch; + reply->min_epoch = m->min_epoch; + reply->objects = m->objects; + return shard_services.send_to_osd(m->from.osd, std::move(reply), pg.get_osdmap_epoch()); + }); +} + +seastar::future<> ReplicatedRecoveryBackend::on_local_recover_persist( + const hobject_t& soid, + const ObjectRecoveryInfo& _recovery_info, + bool is_delete, + epoch_t epoch_frozen) +{ + logger().debug("{}", __func__); + ceph::os::Transaction t; + pg.get_recovery_handler()->on_local_recover(soid, _recovery_info, is_delete, t); + return shard_services.get_store().do_transaction(coll, std::move(t)).then( + [this, epoch_frozen, last_complete = pg.get_info().last_complete] { + pg.get_recovery_handler()->_committed_pushed_object(epoch_frozen, last_complete); + return seastar::make_ready_future<>(); + }); +} + +seastar::future<> ReplicatedRecoveryBackend::local_recover_delete( + const hobject_t& soid, + eversion_t need, + epoch_t epoch_to_freeze) +{ + logger().debug("{}: {}, {}", __func__, soid, need); + return backend->load_metadata(soid).safe_then([this] + (auto lomt) { + if (lomt->os.exists) { + return seastar::do_with(ceph::os::Transaction(), + [this, lomt = std::move(lomt)](auto& txn) { + return backend->remove(lomt->os, txn).then([this, &txn]() mutable { + return shard_services.get_store().do_transaction(coll, + std::move(txn)); + }); + }); + } + return seastar::make_ready_future<>(); + }).safe_then([this, soid, epoch_to_freeze, need] { + ObjectRecoveryInfo recovery_info; + recovery_info.soid = soid; + recovery_info.version = need; + return on_local_recover_persist(soid, recovery_info, + true, epoch_to_freeze); + }, PGBackend::load_metadata_ertr::all_same_way( + [this, soid, epoch_to_freeze, need] (auto e) { + ObjectRecoveryInfo recovery_info; + recovery_info.soid = soid; + recovery_info.version = need; + return on_local_recover_persist(soid, recovery_info, + true, epoch_to_freeze); + }) + ); +} + +seastar::future<> ReplicatedRecoveryBackend::recover_delete( + const hobject_t &soid, eversion_t need) +{ + logger().debug("{}: {}, {}", __func__, soid, need); + + epoch_t cur_epoch = pg.get_osdmap_epoch(); + return seastar::do_with(object_stat_sum_t(), + [this, soid, need, cur_epoch](auto& stat_diff) { + return local_recover_delete(soid, need, cur_epoch).then( + [this, &stat_diff, cur_epoch, soid, need] { + if (!pg.has_reset_since(cur_epoch)) { + bool object_missing = false; + for (const auto& shard : pg.get_acting_recovery_backfill()) { + if (shard == pg.get_pg_whoami()) + continue; + if (pg.get_shard_missing(shard)->is_missing(soid)) { + logger().debug("recover_delete: soid {} needs to deleted from replca {}", + soid, shard); + object_missing = true; + break; + } + } + + if (!object_missing) { + stat_diff.num_objects_recovered = 1; + return 
seastar::make_ready_future<>(); + } else { + return push_delete(soid, need); + } + } + return seastar::make_ready_future<>(); + }).then([this, soid, &stat_diff] { + pg.get_recovery_handler()->on_global_recover(soid, stat_diff, true); + return seastar::make_ready_future<>(); + }); + }); +} + +seastar::future<PushOp> +ReplicatedRecoveryBackend::prep_push( + const hobject_t& soid, + eversion_t need, + pg_shard_t pg_shard) +{ + logger().debug("{}: {}, {}", __func__, soid, need); + + auto& recovery_waiter = recovering.at(soid); + auto& obc = recovery_waiter.obc; + interval_set<uint64_t> data_subset; + if (obc->obs.oi.size) { + data_subset.insert(0, obc->obs.oi.size); + } + const auto& missing = pg.get_shard_missing().find(pg_shard)->second; + if (HAVE_FEATURE(pg.min_peer_features(), SERVER_OCTOPUS)) { + const auto it = missing.get_items().find(soid); + assert(it != missing.get_items().end()); + data_subset.intersection_of(it->second.clean_regions.get_dirty_regions()); + logger().debug("prep_push: {} data_subset {}", soid, data_subset); + } + + logger().debug("prep_push: {} to {}", soid, pg_shard); + auto& pi = recovery_waiter.pushing[pg_shard]; + pg.begin_peer_recover(pg_shard, soid); + const auto pmissing_iter = pg.get_shard_missing().find(pg_shard); + const auto missing_iter = pmissing_iter->second.get_items().find(soid); + assert(missing_iter != pmissing_iter->second.get_items().end()); + + pi.obc = obc; + pi.recovery_info.size = obc->obs.oi.size; + pi.recovery_info.copy_subset = data_subset; + pi.recovery_info.soid = soid; + pi.recovery_info.oi = obc->obs.oi; + pi.recovery_info.version = obc->obs.oi.version; + pi.recovery_info.object_exist = + missing_iter->second.clean_regions.object_is_exist(); + pi.recovery_progress.omap_complete = + (!missing_iter->second.clean_regions.omap_is_dirty() && + HAVE_FEATURE(pg.min_peer_features(), SERVER_OCTOPUS)); + + return build_push_op(pi.recovery_info, pi.recovery_progress, &pi.stat).then( + [this, soid, pg_shard](auto pop) { + auto& recovery_waiter = recovering.at(soid); + auto& pi = recovery_waiter.pushing[pg_shard]; + pi.recovery_progress = pop.after_progress; + return pop; + }); +} + +void ReplicatedRecoveryBackend::prepare_pull(PullOp& po, PullInfo& pi, + const hobject_t& soid, + eversion_t need) { + logger().debug("{}: {}, {}", __func__, soid, need); + + pg_missing_tracker_t local_missing = pg.get_local_missing(); + const auto missing_iter = local_missing.get_items().find(soid); + auto m = pg.get_missing_loc_shards(); + pg_shard_t fromshard = *(m[soid].begin()); + + //TODO: skipped snap objects case for now + po.recovery_info.copy_subset.insert(0, (uint64_t) -1); + if (HAVE_FEATURE(pg.min_peer_features(), SERVER_OCTOPUS)) + po.recovery_info.copy_subset.intersection_of( + missing_iter->second.clean_regions.get_dirty_regions()); + po.recovery_info.size = ((uint64_t) -1); + po.recovery_info.object_exist = + missing_iter->second.clean_regions.object_is_exist(); + po.recovery_info.soid = soid; + po.soid = soid; + po.recovery_progress.data_complete = false; + po.recovery_progress.omap_complete = + !missing_iter->second.clean_regions.omap_is_dirty() && + HAVE_FEATURE(pg.min_peer_features(), SERVER_OCTOPUS); + po.recovery_progress.data_recovered_to = 0; + po.recovery_progress.first = true; + + pi.from = fromshard; + pi.soid = soid; + pi.recovery_info = po.recovery_info; + pi.recovery_progress = po.recovery_progress; +} + +seastar::future<PushOp> ReplicatedRecoveryBackend::build_push_op( + const ObjectRecoveryInfo& recovery_info, + const 
ObjectRecoveryProgress& progress, + object_stat_sum_t* stat) +{ + logger().debug("{} {} @{}", + __func__, recovery_info.soid, recovery_info.version); + return seastar::do_with(ObjectRecoveryProgress(progress), + uint64_t(crimson::common::local_conf() + ->osd_recovery_max_chunk), + recovery_info.version, + PushOp(), + [this, &recovery_info, &progress, stat] + (auto new_progress, auto available, auto v, auto pop) { + return read_metadata_for_push_op(recovery_info.soid, + progress, new_progress, + v, &pop).then([&](eversion_t local_ver) mutable { + // If requestor didn't know the version, use ours + if (v == eversion_t()) { + v = local_ver; + } else if (v != local_ver) { + logger().error("build_push_op: {} push {} v{} failed because local copy is {}", + pg.get_pgid(), recovery_info.soid, recovery_info.version, local_ver); + // TODO: bail out + } + return read_omap_for_push_op(recovery_info.soid, + progress, + new_progress, + &available, &pop); + }).then([this, &recovery_info, &progress, &available, &pop]() mutable { + logger().debug("build_push_op: available: {}, copy_subset: {}", + available, recovery_info.copy_subset); + return read_object_for_push_op(recovery_info.soid, + recovery_info.copy_subset, + progress.data_recovered_to, + available, &pop); + }).then([&recovery_info, &v, &progress, &new_progress, stat, &pop] + (uint64_t recovered_to) mutable { + new_progress.data_recovered_to = recovered_to; + if (new_progress.is_complete(recovery_info)) { + new_progress.data_complete = true; + if (stat) + stat->num_objects_recovered++; + } else if (progress.first && progress.omap_complete) { + // If omap is not changed, we need recovery omap + // when recovery cannot be completed once + new_progress.omap_complete = false; + } + if (stat) { + stat->num_keys_recovered += pop.omap_entries.size(); + stat->num_bytes_recovered += pop.data.length(); + } + pop.version = v; + pop.soid = recovery_info.soid; + pop.recovery_info = recovery_info; + pop.after_progress = new_progress; + pop.before_progress = progress; + logger().debug("build_push_op: pop version: {}, pop data length: {}", + pop.version, pop.data.length()); + return seastar::make_ready_future<PushOp>(std::move(pop)); + }); + }); +} + +seastar::future<eversion_t> +ReplicatedRecoveryBackend::read_metadata_for_push_op( + const hobject_t& oid, + const ObjectRecoveryProgress& progress, + ObjectRecoveryProgress& new_progress, + eversion_t ver, + PushOp* push_op) +{ + if (!progress.first) { + return seastar::make_ready_future<eversion_t>(ver); + } + return seastar::when_all_succeed( + backend->omap_get_header(coll, ghobject_t(oid)).handle_error( + crimson::os::FuturizedStore::read_errorator::all_same_way( + [] (const std::error_code& e) { + return seastar::make_ready_future<bufferlist>(); + })), + store->get_attrs(coll, ghobject_t(oid)).handle_error( + crimson::os::FuturizedStore::get_attrs_ertr::all_same_way( + [] (const std::error_code& e) { + return seastar::make_ready_future<crimson::os::FuturizedStore::attrs_t>(); + })) + ).then_unpack([&new_progress, push_op](auto bl, auto attrs) { + if (bl.length() == 0) { + logger().error("read_metadata_for_push_op: fail to read omap header"); + return eversion_t{}; + } else if (attrs.empty()) { + logger().error("read_metadata_for_push_op: fail to read attrs"); + return eversion_t{}; + } + push_op->omap_header.claim_append(std::move(bl)); + for (auto&& [key, val] : std::move(attrs)) { + push_op->attrset[key].push_back(val); + } + logger().debug("read_metadata_for_push_op: {}", push_op->attrset[OI_ATTR]); + 
object_info_t oi; + oi.decode(push_op->attrset[OI_ATTR]); + new_progress.first = false; + return oi.version; + }); +} + +seastar::future<uint64_t> +ReplicatedRecoveryBackend::read_object_for_push_op( + const hobject_t& oid, + const interval_set<uint64_t>& copy_subset, + uint64_t offset, + uint64_t max_len, + PushOp* push_op) +{ + if (max_len == 0 || copy_subset.empty()) { + push_op->data_included.clear(); + return seastar::make_ready_future<uint64_t>(offset); + } + // 1. get the extents in the interested range + return backend->fiemap(coll, ghobject_t{oid}, + 0, copy_subset.range_end()).then_wrapped( + [=](auto&& fiemap_included) mutable { + interval_set<uint64_t> extents; + try { + extents.intersection_of(copy_subset, fiemap_included.get0()); + } catch (std::exception &) { + // if fiemap() fails, we will read nothing, as the intersection of + // copy_subset and an empty interval_set would be empty anyway + extents.clear(); + } + // 2. we can read up to "max_len" bytes from "offset", so truncate the + // extents down to this quota. no need to return the number of consumed + // bytes, as this is the last consumer of this quota + push_op->data_included.span_of(extents, offset, max_len); + // 3. read the truncated extents + // TODO: check if the returned extents are pruned + return store->readv(coll, ghobject_t{oid}, push_op->data_included, 0); + }).safe_then([push_op, range_end=copy_subset.range_end()](auto &&bl) { + push_op->data.claim_append(std::move(bl)); + uint64_t recovered_to = 0; + if (push_op->data_included.empty()) { + // zero filled section, skip to end! + recovered_to = range_end; + } else { + // note down the progress, we will start from there next time + recovered_to = push_op->data_included.range_end(); + } + return seastar::make_ready_future<uint64_t>(recovered_to); + }, PGBackend::read_errorator::all_same_way([](auto e) { + logger().debug("build_push_op: read exception"); + return seastar::make_exception_future<uint64_t>(e); + })); +} + +seastar::future<> +ReplicatedRecoveryBackend::read_omap_for_push_op( + const hobject_t& oid, + const ObjectRecoveryProgress& progress, + ObjectRecoveryProgress& new_progress, + uint64_t* max_len, + PushOp* push_op) +{ + if (progress.omap_complete) { + return seastar::make_ready_future<>(); + } + return shard_services.get_store().get_omap_iterator(coll, ghobject_t{oid}) + .then([&progress, &new_progress, max_len, push_op](auto omap_iter) { + return omap_iter->lower_bound(progress.omap_recovered_to).then( + [omap_iter, &new_progress, max_len, push_op] { + return seastar::do_until([omap_iter, &new_progress, max_len, push_op] { + if (!omap_iter->valid()) { + new_progress.omap_complete = true; + return true; + } + if (push_op->omap_entries.empty()) { + return false; + } + if (const uint64_t entries_per_chunk = + crimson::common::local_conf()->osd_recovery_max_omap_entries_per_chunk; + entries_per_chunk > 0 && + push_op->omap_entries.size() >= entries_per_chunk) { + new_progress.omap_recovered_to = omap_iter->key(); + return true; + } + if (omap_iter->key().size() + omap_iter->value().length() > *max_len) { + new_progress.omap_recovered_to = omap_iter->key(); + return true; + } + return false; + }, + [omap_iter, max_len, push_op] { + push_op->omap_entries.emplace(omap_iter->key(), omap_iter->value()); + if (const uint64_t entry_size = + omap_iter->key().size() + omap_iter->value().length(); + entry_size > *max_len) { + *max_len -= entry_size; + } else { + *max_len = 0; + } + return omap_iter->next(); + }); + }); + }); +} + +std::vector<pg_shard_t> 
+ReplicatedRecoveryBackend::get_shards_to_push(const hobject_t& soid) const +{ + std::vector<pg_shard_t> shards; + assert(pg.get_acting_recovery_backfill().size() > 0); + for (const auto& peer : pg.get_acting_recovery_backfill()) { + if (peer == pg.get_pg_whoami()) + continue; + auto shard_missing = + pg.get_shard_missing().find(peer); + assert(shard_missing != pg.get_shard_missing().end()); + if (shard_missing->second.is_missing(soid)) { + shards.push_back(shard_missing->first); + } + } + return shards; +} + +seastar::future<> ReplicatedRecoveryBackend::handle_pull(Ref<MOSDPGPull> m) +{ + logger().debug("{}: {}", __func__, *m); + return seastar::parallel_for_each(m->take_pulls(), + [this, from=m->from](auto& pull_op) { + const hobject_t& soid = pull_op.soid; + logger().debug("handle_pull: {}", soid); + return backend->stat(coll, ghobject_t(soid)).then( + [this, &pull_op](auto st) { + ObjectRecoveryInfo &recovery_info = pull_op.recovery_info; + ObjectRecoveryProgress &progress = pull_op.recovery_progress; + if (progress.first && recovery_info.size == ((uint64_t) -1)) { + // Adjust size and copy_subset + recovery_info.size = st.st_size; + if (st.st_size) { + interval_set<uint64_t> object_range; + object_range.insert(0, st.st_size); + recovery_info.copy_subset.intersection_of(object_range); + } else { + recovery_info.copy_subset.clear(); + } + assert(recovery_info.clone_subset.empty()); + } + return build_push_op(recovery_info, progress, 0); + }).then([this, from](auto pop) { + auto msg = make_message<MOSDPGPush>(); + msg->from = pg.get_pg_whoami(); + msg->pgid = pg.get_pgid(); + msg->map_epoch = pg.get_osdmap_epoch(); + msg->min_epoch = pg.get_last_peering_reset(); + msg->set_priority(pg.get_recovery_op_priority()); + msg->pushes.push_back(std::move(pop)); + return shard_services.send_to_osd(from.osd, std::move(msg), + pg.get_osdmap_epoch()); + }); + }); +} + +seastar::future<bool> ReplicatedRecoveryBackend::_handle_pull_response( + pg_shard_t from, + const PushOp& pop, + PullOp* response, + ceph::os::Transaction* t) +{ + logger().debug("handle_pull_response {} {} data.size() is {} data_included: {}", + pop.recovery_info, pop.after_progress, pop.data.length(), pop.data_included); + + const hobject_t &hoid = pop.soid; + auto& recovery_waiter = recovering.at(hoid); + auto& pi = *recovery_waiter.pi; + if (pi.recovery_info.size == (uint64_t(-1))) { + pi.recovery_info.size = pop.recovery_info.size; + pi.recovery_info.copy_subset.intersection_of( + pop.recovery_info.copy_subset); + } + + // If primary doesn't have object info and didn't know version + if (pi.recovery_info.version == eversion_t()) + pi.recovery_info.version = pop.version; + + auto prepare_waiter = seastar::make_ready_future<>(); + if (pi.recovery_progress.first) { + prepare_waiter = pg.with_head_obc<RWState::RWNONE>( + pi.recovery_info.soid, [&pi, &recovery_waiter, &pop](auto obc) { + pi.obc = obc; + recovery_waiter.obc = obc; + obc->obs.oi.decode(pop.attrset.at(OI_ATTR)); + pi.recovery_info.oi = obc->obs.oi; + return crimson::osd::PG::load_obc_ertr::now(); + }).handle_error(crimson::ct_error::assert_all{}); + }; + return prepare_waiter.then([this, &pi, &pop, t, response]() mutable { + const bool first = pi.recovery_progress.first; + pi.recovery_progress = pop.after_progress; + logger().debug("new recovery_info {}, new progress {}", + pi.recovery_info, pi.recovery_progress); + interval_set<uint64_t> data_zeros; + { + uint64_t offset = pop.before_progress.data_recovered_to; + uint64_t length = (pop.after_progress.data_recovered_to 
- + pop.before_progress.data_recovered_to); + if (length) { + data_zeros.insert(offset, length); + } + } + auto [usable_intervals, data] = + trim_pushed_data(pi.recovery_info.copy_subset, + pop.data_included, pop.data); + bool complete = pi.is_complete(); + bool clear_omap = !pop.before_progress.omap_complete; + return submit_push_data(pi.recovery_info, first, complete, clear_omap, + std::move(data_zeros), usable_intervals, data, pop.omap_header, + pop.attrset, pop.omap_entries, t).then( + [this, response, &pi, &pop, complete, t, bytes_recovered=data.length()] { + pi.stat.num_keys_recovered += pop.omap_entries.size(); + pi.stat.num_bytes_recovered += bytes_recovered; + + if (complete) { + pi.stat.num_objects_recovered++; + pg.get_recovery_handler()->on_local_recover( + pop.soid, recovering.at(pop.soid).pi->recovery_info, + false, *t); + return true; + } else { + response->soid = pop.soid; + response->recovery_info = pi.recovery_info; + response->recovery_progress = pi.recovery_progress; + return false; + } + }); + }); +} + +seastar::future<> ReplicatedRecoveryBackend::handle_pull_response( + Ref<MOSDPGPush> m) +{ + const PushOp& pop = m->pushes[0]; //TODO: only one push per message for now. + if (pop.version == eversion_t()) { + // replica doesn't have it! + pg.get_recovery_handler()->on_failed_recover({ m->from }, pop.soid, + get_recovering(pop.soid).pi->recovery_info.version); + return seastar::make_exception_future<>( + std::runtime_error(fmt::format( + "Error on pushing side {} when pulling obj {}", + m->from, pop.soid))); + } + + logger().debug("{}: {}", __func__, *m); + return seastar::do_with(PullOp(), [this, m](auto& response) { + return seastar::do_with(ceph::os::Transaction(), m.get(), + [this, &response](auto& t, auto& m) { + pg_shard_t from = m->from; + PushOp& pop = m->pushes[0]; // only one push per message for now + return _handle_pull_response(from, pop, &response, &t).then( + [this, &t](bool complete) { + epoch_t epoch_frozen = pg.get_osdmap_epoch(); + return shard_services.get_store().do_transaction(coll, std::move(t)) + .then([this, epoch_frozen, complete, + last_complete = pg.get_info().last_complete] { + pg.get_recovery_handler()->_committed_pushed_object(epoch_frozen, last_complete); + return seastar::make_ready_future<bool>(complete); + }); + }); + }).then([this, m, &response](bool complete) { + if (complete) { + auto& pop = m->pushes[0]; + recovering.at(pop.soid).set_pulled(); + return seastar::make_ready_future<>(); + } else { + auto reply = make_message<MOSDPGPull>(); + reply->from = pg.get_pg_whoami(); + reply->set_priority(m->get_priority()); + reply->pgid = pg.get_info().pgid; + reply->map_epoch = m->map_epoch; + reply->min_epoch = m->min_epoch; + reply->set_pulls({std::move(response)}); + return shard_services.send_to_osd(m->from.osd, std::move(reply), pg.get_osdmap_epoch()); + } + }); + }); +} + +seastar::future<> ReplicatedRecoveryBackend::_handle_push( + pg_shard_t from, + const PushOp &pop, + PushReplyOp *response, + ceph::os::Transaction *t) +{ + logger().debug("{}", __func__); + + bool first = pop.before_progress.first; + interval_set<uint64_t> data_zeros; + { + uint64_t offset = pop.before_progress.data_recovered_to; + uint64_t length = (pop.after_progress.data_recovered_to - + pop.before_progress.data_recovered_to); + if (length) { + data_zeros.insert(offset, length); + } + } + bool complete = (pop.after_progress.data_complete && + pop.after_progress.omap_complete); + bool clear_omap = !pop.before_progress.omap_complete; + response->soid = 
pop.recovery_info.soid; + + return submit_push_data(pop.recovery_info, first, complete, clear_omap, + std::move(data_zeros), pop.data_included, pop.data, pop.omap_header, + pop.attrset, pop.omap_entries, t).then([this, complete, &pop, t] { + if (complete) { + pg.get_recovery_handler()->on_local_recover( + pop.recovery_info.soid, pop.recovery_info, + false, *t); + } + }); +} + +seastar::future<> ReplicatedRecoveryBackend::handle_push( + Ref<MOSDPGPush> m) +{ + if (pg.is_primary()) { + return handle_pull_response(m); + } + + logger().debug("{}: {}", __func__, *m); + return seastar::do_with(PushReplyOp(), [this, m](auto& response) { + const PushOp& pop = m->pushes[0]; //TODO: only one push per message for now + return seastar::do_with(ceph::os::Transaction(), + [this, m, &pop, &response](auto& t) { + return _handle_push(m->from, pop, &response, &t).then( + [this, &t] { + epoch_t epoch_frozen = pg.get_osdmap_epoch(); + return shard_services.get_store().do_transaction(coll, std::move(t)).then( + [this, epoch_frozen, last_complete = pg.get_info().last_complete] { + //TODO: this should be grouped with pg.on_local_recover somehow. + pg.get_recovery_handler()->_committed_pushed_object(epoch_frozen, last_complete); + }); + }); + }).then([this, m, &response]() mutable { + auto reply = make_message<MOSDPGPushReply>(); + reply->from = pg.get_pg_whoami(); + reply->set_priority(m->get_priority()); + reply->pgid = pg.get_info().pgid; + reply->map_epoch = m->map_epoch; + reply->min_epoch = m->min_epoch; + std::vector<PushReplyOp> replies = { std::move(response) }; + reply->replies.swap(replies); + return shard_services.send_to_osd(m->from.osd, + std::move(reply), pg.get_osdmap_epoch()); + }); + }); +} + +seastar::future<std::optional<PushOp>> +ReplicatedRecoveryBackend::_handle_push_reply( + pg_shard_t peer, + const PushReplyOp &op) +{ + const hobject_t& soid = op.soid; + logger().debug("{}, soid {}, from {}", __func__, soid, peer); + auto recovering_iter = recovering.find(soid); + if (recovering_iter == recovering.end() + || !recovering_iter->second.pushing.count(peer)) { + logger().debug("huh, i wasn't pushing {} to osd.{}", soid, peer); + return seastar::make_ready_future<std::optional<PushOp>>(); + } else { + auto& pi = recovering_iter->second.pushing[peer]; + bool error = pi.recovery_progress.error; + if (!pi.recovery_progress.data_complete && !error) { + return build_push_op(pi.recovery_info, pi.recovery_progress, + &pi.stat).then([&pi] (auto pop) { + pi.recovery_progress = pop.after_progress; + return seastar::make_ready_future<std::optional<PushOp>>(std::move(pop)); + }).handle_exception([recovering_iter, &pi, peer] (auto e) { + pi.recovery_progress.error = true; + recovering_iter->second.set_push_failed(peer, e); + return seastar::make_ready_future<std::optional<PushOp>>(); + }); + } + if (!error) { + pg.get_recovery_handler()->on_peer_recover(peer, soid, pi.recovery_info); + } + recovering_iter->second.set_pushed(peer); + return seastar::make_ready_future<std::optional<PushOp>>(); + } +} + +seastar::future<> ReplicatedRecoveryBackend::handle_push_reply( + Ref<MOSDPGPushReply> m) +{ + logger().debug("{}: {}", __func__, *m); + auto from = m->from; + auto& push_reply = m->replies[0]; //TODO: only one reply per message + + return _handle_push_reply(from, push_reply).then( + [this, from](std::optional<PushOp> push_op) { + if (push_op) { + auto msg = make_message<MOSDPGPush>(); + msg->from = pg.get_pg_whoami(); + msg->pgid = pg.get_pgid(); + msg->map_epoch = pg.get_osdmap_epoch(); + msg->min_epoch = 
pg.get_last_peering_reset(); + msg->set_priority(pg.get_recovery_op_priority()); + msg->pushes.push_back(std::move(*push_op)); + return shard_services.send_to_osd(from.osd, + std::move(msg), + pg.get_osdmap_epoch()); + } else { + return seastar::make_ready_future<>(); + } + }); +} + +std::pair<interval_set<uint64_t>, + bufferlist> +ReplicatedRecoveryBackend::trim_pushed_data( + const interval_set<uint64_t> ©_subset, + const interval_set<uint64_t> &intervals_received, + ceph::bufferlist data_received) +{ + logger().debug("{}", __func__); + // what i have is only a subset of what i want + if (intervals_received.subset_of(copy_subset)) { + return {intervals_received, data_received}; + } + // only collect the extents included by copy_subset and intervals_received + interval_set<uint64_t> intervals_usable; + bufferlist data_usable; + intervals_usable.intersection_of(copy_subset, intervals_received); + uint64_t have_off = 0; + for (auto [have_start, have_len] : intervals_received) { + interval_set<uint64_t> want; + want.insert(have_start, have_len); + want.intersection_of(copy_subset); + for (auto [want_start, want_len] : want) { + bufferlist sub; + uint64_t data_off = have_off + (want_start - have_start); + sub.substr_of(data_received, data_off, want_len); + data_usable.claim_append(sub); + } + have_off += have_len; + } + return {intervals_usable, data_usable}; +} + +seastar::future<> ReplicatedRecoveryBackend::submit_push_data( + const ObjectRecoveryInfo &recovery_info, + bool first, + bool complete, + bool clear_omap, + interval_set<uint64_t> data_zeros, + const interval_set<uint64_t> &intervals_included, + bufferlist data_included, + bufferlist omap_header, + const map<string, bufferlist> &attrs, + const map<string, bufferlist> &omap_entries, + ObjectStore::Transaction *t) +{ + logger().debug("{}", __func__); + hobject_t target_oid; + if (first && complete) { + target_oid = recovery_info.soid; + } else { + target_oid = get_temp_recovery_object(recovery_info.soid, + recovery_info.version); + if (first) { + logger().debug("{}: Adding oid {} in the temp collection", + __func__, target_oid); + add_temp_obj(target_oid); + } + } + + return [this, &recovery_info, first, complete, t, + &omap_header, &attrs, target_oid, clear_omap] { + if (first) { + if (!complete) { + t->remove(coll->get_cid(), ghobject_t(target_oid)); + t->touch(coll->get_cid(), ghobject_t(target_oid)); + bufferlist bv = attrs.at(OI_ATTR); + object_info_t oi(bv); + t->set_alloc_hint(coll->get_cid(), ghobject_t(target_oid), + oi.expected_object_size, + oi.expected_write_size, + oi.alloc_hint_flags); + } else { + if (!recovery_info.object_exist) { + t->remove(coll->get_cid(), ghobject_t(target_oid)); + t->touch(coll->get_cid(), ghobject_t(target_oid)); + bufferlist bv = attrs.at(OI_ATTR); + object_info_t oi(bv); + t->set_alloc_hint(coll->get_cid(), ghobject_t(target_oid), + oi.expected_object_size, + oi.expected_write_size, + oi.alloc_hint_flags); + } + //remove xattr and update later if overwrite on original object + t->rmattrs(coll->get_cid(), ghobject_t(target_oid)); + //if need update omap, clear the previous content first + if (clear_omap) + t->omap_clear(coll->get_cid(), ghobject_t(target_oid)); + } + + t->truncate(coll->get_cid(), ghobject_t(target_oid), recovery_info.size); + if (omap_header.length()) + t->omap_setheader(coll->get_cid(), ghobject_t(target_oid), omap_header); + + return store->stat(coll, ghobject_t(recovery_info.soid)).then( + [this, &recovery_info, complete, t, target_oid, + omap_header = 
std::move(omap_header)] (auto st) { + //TODO: pg num bytes counting + if (!complete) { + //clone overlap content in local object + if (recovery_info.object_exist) { + uint64_t local_size = std::min(recovery_info.size, (uint64_t)st.st_size); + interval_set<uint64_t> local_intervals_included, local_intervals_excluded; + if (local_size) { + local_intervals_included.insert(0, local_size); + local_intervals_excluded.intersection_of(local_intervals_included, recovery_info.copy_subset); + local_intervals_included.subtract(local_intervals_excluded); + } + for (auto [off, len] : local_intervals_included) { + logger().debug(" clone_range {} {}~{}", + recovery_info.soid, off, len); + t->clone_range(coll->get_cid(), ghobject_t(recovery_info.soid), + ghobject_t(target_oid), off, len, off); + } + } + } + return seastar::make_ready_future<>(); + }); + } + return seastar::make_ready_future<>(); + }().then([this, data_zeros=std::move(data_zeros), + &recovery_info, &intervals_included, t, target_oid, + &omap_entries, &attrs, data_included, complete, first]() mutable { + uint32_t fadvise_flags = CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL; + // Punch zeros for data, if fiemap indicates nothing but it is marked dirty + if (!data_zeros.empty()) { + data_zeros.intersection_of(recovery_info.copy_subset); + assert(intervals_included.subset_of(data_zeros)); + data_zeros.subtract(intervals_included); + + logger().debug("submit_push_data recovering object {} copy_subset: {} " + "intervals_included: {} data_zeros: {}", + recovery_info.soid, recovery_info.copy_subset, + intervals_included, data_zeros); + + for (auto [start, len] : data_zeros) { + t->zero(coll->get_cid(), ghobject_t(target_oid), start, len); + } + } + uint64_t off = 0; + for (auto [start, len] : intervals_included) { + bufferlist bit; + bit.substr_of(data_included, off, len); + t->write(coll->get_cid(), ghobject_t(target_oid), + start, len, bit, fadvise_flags); + off += len; + } + + if (!omap_entries.empty()) + t->omap_setkeys(coll->get_cid(), ghobject_t(target_oid), omap_entries); + if (!attrs.empty()) + t->setattrs(coll->get_cid(), ghobject_t(target_oid), attrs); + + if (complete) { + if (!first) { + logger().debug("submit_push_data: Removing oid {} from the temp collection", + target_oid); + clear_temp_obj(target_oid); + t->remove(coll->get_cid(), ghobject_t(recovery_info.soid)); + t->collection_move_rename(coll->get_cid(), ghobject_t(target_oid), + coll->get_cid(), ghobject_t(recovery_info.soid)); + } + submit_push_complete(recovery_info, t); + } + logger().debug("submit_push_data: done"); + return seastar::make_ready_future<>(); + }); +} + +void ReplicatedRecoveryBackend::submit_push_complete( + const ObjectRecoveryInfo &recovery_info, + ObjectStore::Transaction *t) +{ + for (const auto& [oid, extents] : recovery_info.clone_subset) { + for (const auto [off, len] : extents) { + logger().debug(" clone_range {} {}~{}", oid, off, len); + t->clone_range(coll->get_cid(), ghobject_t(oid), ghobject_t(recovery_info.soid), + off, len, off); + } + } +} + +seastar::future<> ReplicatedRecoveryBackend::handle_recovery_delete_reply( + Ref<MOSDPGRecoveryDeleteReply> m) +{ + auto& p = m->objects.front(); + hobject_t soid = p.first; + ObjectRecoveryInfo recovery_info; + recovery_info.version = p.second; + pg.get_recovery_handler()->on_peer_recover(m->from, soid, recovery_info); + get_recovering(soid).set_pushed(m->from); + return seastar::now(); +} + +seastar::future<> ReplicatedRecoveryBackend::handle_recovery_op(Ref<MOSDFastDispatchOp> m) +{ + switch 
(m->get_header().type) { + case MSG_OSD_PG_PULL: + return handle_pull(boost::static_pointer_cast<MOSDPGPull>(m)); + case MSG_OSD_PG_PUSH: + return handle_push(boost::static_pointer_cast<MOSDPGPush>(m)); + case MSG_OSD_PG_PUSH_REPLY: + return handle_push_reply( + boost::static_pointer_cast<MOSDPGPushReply>(m)); + case MSG_OSD_PG_RECOVERY_DELETE: + return handle_recovery_delete( + boost::static_pointer_cast<MOSDPGRecoveryDelete>(m)); + case MSG_OSD_PG_RECOVERY_DELETE_REPLY: + return handle_recovery_delete_reply( + boost::static_pointer_cast<MOSDPGRecoveryDeleteReply>(m)); + default: + // delegate to parent class for handling backend-agnostic recovery ops. + return RecoveryBackend::handle_recovery_op(std::move(m)); + } +} + diff --git a/src/crimson/osd/replicated_recovery_backend.h b/src/crimson/osd/replicated_recovery_backend.h new file mode 100644 index 000000000..d99538a75 --- /dev/null +++ b/src/crimson/osd/replicated_recovery_backend.h @@ -0,0 +1,151 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "crimson/osd/recovery_backend.h" + +#include "messages/MOSDPGPull.h" +#include "messages/MOSDPGPush.h" +#include "messages/MOSDPGPushReply.h" +#include "messages/MOSDPGRecoveryDelete.h" +#include "messages/MOSDPGRecoveryDeleteReply.h" +#include "os/ObjectStore.h" + +class ReplicatedRecoveryBackend : public RecoveryBackend { +public: + ReplicatedRecoveryBackend(crimson::osd::PG& pg, + crimson::osd::ShardServices& shard_services, + crimson::os::CollectionRef coll, + PGBackend* backend) + : RecoveryBackend(pg, shard_services, coll, backend) {} + seastar::future<> handle_recovery_op( + Ref<MOSDFastDispatchOp> m) final; + + seastar::future<> recover_object( + const hobject_t& soid, + eversion_t need) final; + seastar::future<> recover_delete( + const hobject_t& soid, + eversion_t need) final; + seastar::future<> push_delete( + const hobject_t& soid, + eversion_t need) final; +protected: + seastar::future<> handle_pull( + Ref<MOSDPGPull> m); + seastar::future<> handle_pull_response( + Ref<MOSDPGPush> m); + seastar::future<> handle_push( + Ref<MOSDPGPush> m); + seastar::future<> handle_push_reply( + Ref<MOSDPGPushReply> m); + seastar::future<> handle_recovery_delete( + Ref<MOSDPGRecoveryDelete> m); + seastar::future<> handle_recovery_delete_reply( + Ref<MOSDPGRecoveryDeleteReply> m); + seastar::future<PushOp> prep_push( + const hobject_t& soid, + eversion_t need, + pg_shard_t pg_shard); + void prepare_pull( + PullOp& po, + PullInfo& pi, + const hobject_t& soid, + eversion_t need); + std::vector<pg_shard_t> get_shards_to_push( + const hobject_t& soid) const; + seastar::future<PushOp> build_push_op( + const ObjectRecoveryInfo& recovery_info, + const ObjectRecoveryProgress& progress, + object_stat_sum_t* stat); + /// @returns true if this push op is the last push op for + /// recovery @c pop.soid + seastar::future<bool> _handle_pull_response( + pg_shard_t from, + const PushOp& pop, + PullOp* response, + ceph::os::Transaction* t); + std::pair<interval_set<uint64_t>, ceph::bufferlist> trim_pushed_data( + const interval_set<uint64_t> ©_subset, + const interval_set<uint64_t> &intervals_received, + ceph::bufferlist data_received); + seastar::future<> submit_push_data( + const ObjectRecoveryInfo &recovery_info, + bool first, + bool complete, + bool clear_omap, + interval_set<uint64_t> data_zeros, + const interval_set<uint64_t> &intervals_included, + ceph::bufferlist data_included, + ceph::bufferlist omap_header, + const 
std::map<string, bufferlist> &attrs, + const std::map<string, bufferlist> &omap_entries, + ceph::os::Transaction *t); + void submit_push_complete( + const ObjectRecoveryInfo &recovery_info, + ObjectStore::Transaction *t); + seastar::future<> _handle_push( + pg_shard_t from, + const PushOp &pop, + PushReplyOp *response, + ceph::os::Transaction *t); + seastar::future<std::optional<PushOp>> _handle_push_reply( + pg_shard_t peer, + const PushReplyOp &op); + seastar::future<> on_local_recover_persist( + const hobject_t& soid, + const ObjectRecoveryInfo& _recovery_info, + bool is_delete, + epoch_t epoch_to_freeze); + seastar::future<> local_recover_delete( + const hobject_t& soid, + eversion_t need, + epoch_t epoch_frozen); + seastar::future<> on_stop() final { + return seastar::now(); + } +private: + /// pull missing object from peer + seastar::future<> maybe_pull_missing_obj( + const hobject_t& soid, + eversion_t need); + + /// load object context for recovery if it is not ready yet + using load_obc_ertr = crimson::errorator< + crimson::ct_error::object_corrupted>; + + seastar::future<> maybe_push_shards( + const hobject_t& soid, + eversion_t need); + + /// read the data attached to given object. the size of them is supposed to + /// be relatively small. + /// + /// @return @c oi.version + seastar::future<eversion_t> read_metadata_for_push_op( + const hobject_t& oid, + const ObjectRecoveryProgress& progress, + ObjectRecoveryProgress& new_progress, + eversion_t ver, + PushOp* push_op); + /// read the remaining extents of object to be recovered and fill push_op + /// with them + /// + /// @param oid object being recovered + /// @param copy_subset extents we want + /// @param offset the offset in object from where we should read + /// @return the new offset + seastar::future<uint64_t> read_object_for_push_op( + const hobject_t& oid, + const interval_set<uint64_t>& copy_subset, + uint64_t offset, + uint64_t max_len, + PushOp* push_op); + seastar::future<> read_omap_for_push_op( + const hobject_t& oid, + const ObjectRecoveryProgress& progress, + ObjectRecoveryProgress& new_progress, + uint64_t* max_len, + PushOp* push_op); +}; diff --git a/src/crimson/osd/scheduler/mclock_scheduler.cc b/src/crimson/osd/scheduler/mclock_scheduler.cc new file mode 100644 index 000000000..195ea8dd8 --- /dev/null +++ b/src/crimson/osd/scheduler/mclock_scheduler.cc @@ -0,0 +1,165 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 Red Hat Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#include <memory> +#include <functional> + +#include "crimson/osd/scheduler/mclock_scheduler.h" +#include "common/dout.h" + +namespace dmc = crimson::dmclock; +using namespace std::placeholders; + +#define dout_context cct +#define dout_subsys ceph_subsys_osd +#undef dout_prefix +#define dout_prefix *_dout + + +namespace crimson::osd::scheduler { + +mClockScheduler::mClockScheduler(ConfigProxy &conf) : + scheduler( + std::bind(&mClockScheduler::ClientRegistry::get_info, + &client_registry, + _1), + dmc::AtLimit::Allow, + conf.get_val<double>("osd_mclock_scheduler_anticipation_timeout")) +{ + conf.add_observer(this); + client_registry.update_from_config(conf); +} + +void mClockScheduler::ClientRegistry::update_from_config(const ConfigProxy &conf) +{ + default_external_client_info.update( + conf.get_val<uint64_t>("osd_mclock_scheduler_client_res"), + conf.get_val<uint64_t>("osd_mclock_scheduler_client_wgt"), + conf.get_val<uint64_t>("osd_mclock_scheduler_client_lim")); + + internal_client_infos[ + static_cast<size_t>(scheduler_class_t::background_recovery)].update( + conf.get_val<uint64_t>("osd_mclock_scheduler_background_recovery_res"), + conf.get_val<uint64_t>("osd_mclock_scheduler_background_recovery_wgt"), + conf.get_val<uint64_t>("osd_mclock_scheduler_background_recovery_lim")); + + internal_client_infos[ + static_cast<size_t>(scheduler_class_t::background_best_effort)].update( + conf.get_val<uint64_t>("osd_mclock_scheduler_background_best_effort_res"), + conf.get_val<uint64_t>("osd_mclock_scheduler_background_best_effort_wgt"), + conf.get_val<uint64_t>("osd_mclock_scheduler_background_best_effort_lim")); +} + +const dmc::ClientInfo *mClockScheduler::ClientRegistry::get_external_client( + const client_profile_id_t &client) const +{ + auto ret = external_client_infos.find(client); + if (ret == external_client_infos.end()) + return &default_external_client_info; + else + return &(ret->second); +} + +const dmc::ClientInfo *mClockScheduler::ClientRegistry::get_info( + const scheduler_id_t &id) const { + switch (id.class_id) { + case scheduler_class_t::immediate: + ceph_assert(0 == "Cannot schedule immediate"); + return (dmc::ClientInfo*)nullptr; + case scheduler_class_t::repop: + case scheduler_class_t::client: + return get_external_client(id.client_profile_id); + default: + ceph_assert(static_cast<size_t>(id.class_id) < internal_client_infos.size()); + return &internal_client_infos[static_cast<size_t>(id.class_id)]; + } +} + +void mClockScheduler::dump(ceph::Formatter &f) const +{ +} + +void mClockScheduler::enqueue(item_t&& item) +{ + auto id = get_scheduler_id(item); + auto cost = item.params.cost; + + if (scheduler_class_t::immediate == item.params.klass) { + immediate.push_front(std::move(item)); + } else { + scheduler.add_request( + std::move(item), + id, + cost); + } +} + +void mClockScheduler::enqueue_front(item_t&& item) +{ + immediate.push_back(std::move(item)); + // TODO: item may not be immediate, update mclock machinery to permit + // putting the item back in the queue +} + +item_t mClockScheduler::dequeue() +{ + if (!immediate.empty()) { + auto ret = std::move(immediate.back()); + immediate.pop_back(); + return ret; + } else { + mclock_queue_t::PullReq result = scheduler.pull_request(); + if (result.is_future()) { + ceph_assert( + 0 == "Not implemented, user would have to be able to be woken up"); + return std::move(*(item_t*)nullptr); + } else if (result.is_none()) { + ceph_assert( + 0 == "Impossible, must have checked empty() first"); + return 
std::move(*(item_t*)nullptr); + } else { + ceph_assert(result.is_retn()); + + auto &retn = result.get_retn(); + return std::move(*retn.request); + } + } +} + +const char** mClockScheduler::get_tracked_conf_keys() const +{ + static const char* KEYS[] = { + "osd_mclock_scheduler_client_res", + "osd_mclock_scheduler_client_wgt", + "osd_mclock_scheduler_client_lim", + "osd_mclock_scheduler_background_recovery_res", + "osd_mclock_scheduler_background_recovery_wgt", + "osd_mclock_scheduler_background_recovery_lim", + "osd_mclock_scheduler_background_best_effort_res", + "osd_mclock_scheduler_background_best_effort_wgt", + "osd_mclock_scheduler_background_best_effort_lim", + NULL + }; + return KEYS; +} + +void mClockScheduler::handle_conf_change( + const ConfigProxy& conf, + const std::set<std::string> &changed) +{ + client_registry.update_from_config(conf); +} + +} diff --git a/src/crimson/osd/scheduler/mclock_scheduler.h b/src/crimson/osd/scheduler/mclock_scheduler.h new file mode 100644 index 000000000..c3edbe729 --- /dev/null +++ b/src/crimson/osd/scheduler/mclock_scheduler.h @@ -0,0 +1,130 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 Red Hat Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#pragma once + +#include <ostream> +#include <map> +#include <vector> + +#include "boost/variant.hpp" + +#include "dmclock/src/dmclock_server.h" + +#include "crimson/osd/scheduler/scheduler.h" +#include "common/config.h" +#include "include/cmp.h" +#include "common/ceph_context.h" + + +namespace crimson::osd::scheduler { + +using client_id_t = uint64_t; +using profile_id_t = uint64_t; + +struct client_profile_id_t { + client_id_t client_id; + profile_id_t profile_id; +}; + +WRITE_EQ_OPERATORS_2(client_profile_id_t, client_id, profile_id) +WRITE_CMP_OPERATORS_2(client_profile_id_t, client_id, profile_id) + + +struct scheduler_id_t { + scheduler_class_t class_id; + client_profile_id_t client_profile_id; +}; + +WRITE_EQ_OPERATORS_2(scheduler_id_t, class_id, client_profile_id) +WRITE_CMP_OPERATORS_2(scheduler_id_t, class_id, client_profile_id) + +/** + * Scheduler implementation based on mclock. 
+ * + * TODO: explain configs + */ +class mClockScheduler : public Scheduler, md_config_obs_t { + + class ClientRegistry { + std::array< + crimson::dmclock::ClientInfo, + static_cast<size_t>(scheduler_class_t::client) + > internal_client_infos = { + // Placeholder, gets replaced with configured values + crimson::dmclock::ClientInfo(1, 1, 1), + crimson::dmclock::ClientInfo(1, 1, 1) + }; + + crimson::dmclock::ClientInfo default_external_client_info = {1, 1, 1}; + std::map<client_profile_id_t, + crimson::dmclock::ClientInfo> external_client_infos; + const crimson::dmclock::ClientInfo *get_external_client( + const client_profile_id_t &client) const; + public: + void update_from_config(const ConfigProxy &conf); + const crimson::dmclock::ClientInfo *get_info( + const scheduler_id_t &id) const; + } client_registry; + + using mclock_queue_t = crimson::dmclock::PullPriorityQueue< + scheduler_id_t, + item_t, + true, + true, + 2>; + mclock_queue_t scheduler; + std::list<item_t> immediate; + + static scheduler_id_t get_scheduler_id(const item_t &item) { + return scheduler_id_t{ + item.params.klass, + client_profile_id_t{ + item.params.owner, + 0 + } + }; + } + +public: + mClockScheduler(ConfigProxy &conf); + + // Enqueue op in the back of the regular queue + void enqueue(item_t &&item) final; + + // Enqueue the op in the front of the regular queue + void enqueue_front(item_t &&item) final; + + // Return an op to be dispatch + item_t dequeue() final; + + // Returns if the queue is empty + bool empty() const final { + return immediate.empty() && scheduler.empty(); + } + + // Formatted output of the queue + void dump(ceph::Formatter &f) const final; + + void print(std::ostream &ostream) const final { + ostream << "mClockScheduler"; + } + + const char** get_tracked_conf_keys() const final; + void handle_conf_change(const ConfigProxy& conf, + const std::set<std::string> &changed) final; +}; + +} diff --git a/src/crimson/osd/scheduler/scheduler.cc b/src/crimson/osd/scheduler/scheduler.cc new file mode 100644 index 000000000..c85cb388e --- /dev/null +++ b/src/crimson/osd/scheduler/scheduler.cc @@ -0,0 +1,181 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2019 Red Hat Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include <ostream> + +#include <seastar/core/print.hh> + +#include "crimson/osd/scheduler/scheduler.h" +#include "crimson/osd/scheduler/mclock_scheduler.h" +#include "common/WeightedPriorityQueue.h" + +namespace crimson::osd::scheduler { + +std::ostream &operator<<(std::ostream &lhs, const scheduler_class_t &c) +{ + switch (c) { + case scheduler_class_t::background_best_effort: + return lhs << "background_best_effort"; + case scheduler_class_t::background_recovery: + return lhs << "background_recovery"; + case scheduler_class_t::client: + return lhs << "client"; + case scheduler_class_t::repop: + return lhs << "repop"; + case scheduler_class_t::immediate: + return lhs << "immediate"; + default: + return lhs; + } +} + +/** + * Implements Scheduler in terms of OpQueue + * + * Templated on queue type to avoid dynamic dispatch, T should implement + * OpQueue<Scheduleritem_t, client_t>. 
This adapter is mainly responsible for + * the boilerplate priority cutoff/strict concept which is needed for + * OpQueue based implementations. + */ +template <typename T> +class ClassedOpQueueScheduler final : public Scheduler { + const scheduler_class_t cutoff; + T queue; + + using priority_t = uint64_t; + std::array< + priority_t, + static_cast<size_t>(scheduler_class_t::immediate) + > priority_map = { + // Placeholder, gets replaced with configured values + 0, 0, 0 + }; + + static scheduler_class_t get_io_prio_cut(ConfigProxy &conf) { + if (conf.get_val<std::string>("osd_op_queue_cut_off") == "debug_random") { + srand(time(NULL)); + return (rand() % 2 < 1) ? + scheduler_class_t::repop : scheduler_class_t::immediate; + } else if (conf.get_val<std::string>("osd_op_queue_cut_off") == "high") { + return scheduler_class_t::immediate; + } else { + return scheduler_class_t::repop; + } + } + + bool use_strict(scheduler_class_t kl) const { + return static_cast<uint8_t>(kl) >= static_cast<uint8_t>(cutoff); + } + + priority_t get_priority(scheduler_class_t kl) const { + ceph_assert(static_cast<size_t>(kl) < + static_cast<size_t>(scheduler_class_t::immediate)); + return priority_map[static_cast<size_t>(kl)]; + } + +public: + template <typename... Args> + ClassedOpQueueScheduler(ConfigProxy &conf, Args&&... args) : + cutoff(get_io_prio_cut(conf)), + queue(std::forward<Args>(args)...) + { + priority_map[ + static_cast<size_t>(scheduler_class_t::background_best_effort) + ] = conf.get_val<uint64_t>("osd_scrub_priority"); + priority_map[ + static_cast<size_t>(scheduler_class_t::background_recovery) + ] = conf.get_val<uint64_t>("osd_recovery_op_priority"); + priority_map[ + static_cast<size_t>(scheduler_class_t::client) + ] = conf.get_val<uint64_t>("osd_client_op_priority"); + priority_map[ + static_cast<size_t>(scheduler_class_t::repop) + ] = conf.get_val<uint64_t>("osd_client_op_priority"); + } + + void enqueue(item_t &&item) final { + if (use_strict(item.params.klass)) + queue.enqueue_strict( + item.params.owner, get_priority(item.params.klass), std::move(item)); + else + queue.enqueue( + item.params.owner, get_priority(item.params.klass), + item.params.cost, std::move(item)); + } + + void enqueue_front(item_t &&item) final { + if (use_strict(item.params.klass)) + queue.enqueue_strict_front( + item.params.owner, get_priority(item.params.klass), std::move(item)); + else + queue.enqueue_front( + item.params.owner, get_priority(item.params.klass), + item.params.cost, std::move(item)); + } + + bool empty() const final { + return queue.empty(); + } + + item_t dequeue() final { + return queue.dequeue(); + } + + void dump(ceph::Formatter &f) const final { + return queue.dump(&f); + } + + void print(std::ostream &out) const final { + out << "ClassedOpQueueScheduler(queue="; + queue.print(out); + out << ", cutoff=" << cutoff << ")"; + } + + ~ClassedOpQueueScheduler() final {}; +}; + +SchedulerRef make_scheduler(ConfigProxy &conf) +{ + const std::string _type = conf.get_val<std::string>("osd_op_queue"); + const std::string *type = &_type; + if (*type == "debug_random") { + static const std::string index_lookup[] = { "mclock_scheduler", + "wpq" }; + srand(time(NULL)); + unsigned which = rand() % (sizeof(index_lookup) / sizeof(index_lookup[0])); + type = &index_lookup[which]; + } + + if (*type == "wpq" ) { + // default is 'wpq' + return std::make_unique< + ClassedOpQueueScheduler<WeightedPriorityQueue<item_t, client_t>>>( + conf, + conf.get_val<uint64_t>("osd_op_pq_max_tokens_per_priority"), + 
conf->osd_op_pq_min_cost + ); + } else if (*type == "mclock_scheduler") { + return std::make_unique<mClockScheduler>(conf); + } else { + ceph_assert("Invalid choice of wq" == 0); + return std::unique_ptr<mClockScheduler>(); + } +} + +std::ostream &operator<<(std::ostream &lhs, const Scheduler &rhs) { + rhs.print(lhs); + return lhs; +} + +} diff --git a/src/crimson/osd/scheduler/scheduler.h b/src/crimson/osd/scheduler/scheduler.h new file mode 100644 index 000000000..a014991ab --- /dev/null +++ b/src/crimson/osd/scheduler/scheduler.h @@ -0,0 +1,82 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2019 Red Hat Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#pragma once + +#include <seastar/core/future.hh> +#include <ostream> + +#include "crimson/common/config_proxy.h" + +namespace crimson::osd::scheduler { + +enum class scheduler_class_t : uint8_t { + background_best_effort = 0, + background_recovery, + client, + repop, + immediate, +}; + +std::ostream &operator<<(std::ostream &, const scheduler_class_t &); + +using client_t = uint64_t; +using cost_t = uint64_t; + +struct params_t { + cost_t cost = 1; + client_t owner; + scheduler_class_t klass; +}; + +struct item_t { + params_t params; + seastar::promise<> wake; +}; + +/** + * Base interface for classes responsible for choosing + * op processing order in the OSD. + */ +class Scheduler { +public: + // Enqueue op for scheduling + virtual void enqueue(item_t &&item) = 0; + + // Enqueue op for processing as though it were enqueued prior + // to other items already scheduled. 
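+  // (e.g. to put an op back at the head of the queue when it could not
+  // be processed yet)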
+ virtual void enqueue_front(item_t &&item) = 0; + + // Returns true iff there are no ops scheduled + virtual bool empty() const = 0; + + // Return next op to be processed + virtual item_t dequeue() = 0; + + // Dump formatted representation for the queue + virtual void dump(ceph::Formatter &f) const = 0; + + // Print human readable brief description with relevant parameters + virtual void print(std::ostream &out) const = 0; + + // Destructor + virtual ~Scheduler() {}; +}; + +std::ostream &operator<<(std::ostream &lhs, const Scheduler &); +using SchedulerRef = std::unique_ptr<Scheduler>; + +SchedulerRef make_scheduler(ConfigProxy &); + +} diff --git a/src/crimson/osd/shard_services.cc b/src/crimson/osd/shard_services.cc new file mode 100644 index 000000000..8c2cfc415 --- /dev/null +++ b/src/crimson/osd/shard_services.cc @@ -0,0 +1,311 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "crimson/osd/shard_services.h" + +#include "messages/MOSDAlive.h" + +#include "osd/osd_perf_counters.h" +#include "osd/PeeringState.h" +#include "crimson/common/config_proxy.h" +#include "crimson/mgr/client.h" +#include "crimson/mon/MonClient.h" +#include "crimson/net/Messenger.h" +#include "crimson/net/Connection.h" +#include "crimson/os/cyanstore/cyan_store.h" +#include "crimson/osd/osdmap_service.h" +#include "messages/MOSDPGTemp.h" +#include "messages/MOSDPGCreated.h" +#include "messages/MOSDPGNotify.h" +#include "messages/MOSDPGInfo.h" +#include "messages/MOSDPGQuery.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_osd); + } +} + +namespace crimson::osd { + +ShardServices::ShardServices( + OSDMapService &osdmap_service, + const int whoami, + crimson::net::Messenger &cluster_msgr, + crimson::net::Messenger &public_msgr, + crimson::mon::Client &monc, + crimson::mgr::Client &mgrc, + crimson::os::FuturizedStore &store) + : osdmap_service(osdmap_service), + whoami(whoami), + cluster_msgr(cluster_msgr), + public_msgr(public_msgr), + monc(monc), + mgrc(mgrc), + store(store), + throttler(crimson::common::local_conf()), + obc_registry(crimson::common::local_conf()), + local_reserver( + &cct, + &finisher, + crimson::common::local_conf()->osd_max_backfills, + crimson::common::local_conf()->osd_min_recovery_priority), + remote_reserver( + &cct, + &finisher, + crimson::common::local_conf()->osd_max_backfills, + crimson::common::local_conf()->osd_min_recovery_priority) +{ + perf = build_osd_logger(&cct); + cct.get_perfcounters_collection()->add(perf); + + recoverystate_perf = build_recoverystate_perf(&cct); + cct.get_perfcounters_collection()->add(recoverystate_perf); + + crimson::common::local_conf().add_observer(this); +} + +const char** ShardServices::get_tracked_conf_keys() const +{ + static const char* KEYS[] = { + "osd_max_backfills", + "osd_min_recovery_priority", + nullptr + }; + return KEYS; +} + +void ShardServices::handle_conf_change(const ConfigProxy& conf, + const std::set <std::string> &changed) +{ + if (changed.count("osd_max_backfills")) { + local_reserver.set_max(conf->osd_max_backfills); + remote_reserver.set_max(conf->osd_max_backfills); + } + if (changed.count("osd_min_recovery_priority")) { + local_reserver.set_min_priority(conf->osd_min_recovery_priority); + remote_reserver.set_min_priority(conf->osd_min_recovery_priority); + } +} + +seastar::future<> ShardServices::send_to_osd( + int peer, Ref<Message> m, epoch_t from_epoch) { + if (osdmap->is_down(peer)) { + logger().info("{}: osd.{} is_down", 
__func__, peer); + return seastar::now(); + } else if (osdmap->get_info(peer).up_from > from_epoch) { + logger().info("{}: osd.{} {} > {}", __func__, peer, + osdmap->get_info(peer).up_from, from_epoch); + return seastar::now(); + } else { + auto conn = cluster_msgr.connect( + osdmap->get_cluster_addrs(peer).front(), CEPH_ENTITY_TYPE_OSD); + return conn->send(m); + } +} + +seastar::future<> ShardServices::dispatch_context_transaction( + crimson::os::CollectionRef col, PeeringCtx &ctx) { + auto ret = store.do_transaction( + col, + std::move(ctx.transaction)); + ctx.reset_transaction(); + return ret; +} + +seastar::future<> ShardServices::dispatch_context_messages( + BufferedRecoveryMessages &&ctx) +{ + auto ret = seastar::parallel_for_each(std::move(ctx.message_map), + [this](auto& osd_messages) { + auto& [peer, messages] = osd_messages; + logger().debug("dispatch_context_messages sending messages to {}", peer); + return seastar::parallel_for_each( + std::move(messages), [=, peer=peer](auto& m) { + return send_to_osd(peer, m, osdmap->get_epoch()); + }); + }); + ctx.message_map.clear(); + return ret; +} + +seastar::future<> ShardServices::dispatch_context( + crimson::os::CollectionRef col, + PeeringCtx &&ctx) +{ + ceph_assert(col || ctx.transaction.empty()); + return seastar::when_all_succeed( + dispatch_context_messages( + BufferedRecoveryMessages{ceph_release_t::octopus, ctx}), + col ? dispatch_context_transaction(col, ctx) : seastar::now() + ).then_unpack([] { + return seastar::now(); + }); +} + +void ShardServices::queue_want_pg_temp(pg_t pgid, + const vector<int>& want, + bool forced) +{ + auto p = pg_temp_pending.find(pgid); + if (p == pg_temp_pending.end() || + p->second.acting != want || + forced) { + pg_temp_wanted[pgid] = {want, forced}; + } +} + +void ShardServices::remove_want_pg_temp(pg_t pgid) +{ + pg_temp_wanted.erase(pgid); + pg_temp_pending.erase(pgid); +} + +void ShardServices::requeue_pg_temp() +{ + unsigned old_wanted = pg_temp_wanted.size(); + unsigned old_pending = pg_temp_pending.size(); + pg_temp_wanted.merge(pg_temp_pending); + pg_temp_pending.clear(); + logger().debug( + "{}: {} + {} -> {}", + __func__ , + old_wanted, + old_pending, + pg_temp_wanted.size()); +} + +std::ostream& operator<<( + std::ostream& out, + const ShardServices::pg_temp_t& pg_temp) +{ + out << pg_temp.acting; + if (pg_temp.forced) { + out << " (forced)"; + } + return out; +} + +seastar::future<> ShardServices::send_pg_temp() +{ + if (pg_temp_wanted.empty()) + return seastar::now(); + logger().debug("{}: {}", __func__, pg_temp_wanted); + boost::intrusive_ptr<MOSDPGTemp> ms[2] = {nullptr, nullptr}; + for (auto& [pgid, pg_temp] : pg_temp_wanted) { + auto& m = ms[pg_temp.forced]; + if (!m) { + m = make_message<MOSDPGTemp>(osdmap->get_epoch()); + m->forced = pg_temp.forced; + } + m->pg_temp.emplace(pgid, pg_temp.acting); + } + pg_temp_pending.merge(pg_temp_wanted); + pg_temp_wanted.clear(); + return seastar::parallel_for_each(std::begin(ms), std::end(ms), + [this](auto m) { + if (m) { + return monc.send_message(m); + } else { + return seastar::now(); + } + }); +} + +void ShardServices::update_map(cached_map_t new_osdmap) +{ + osdmap = std::move(new_osdmap); +} + +ShardServices::cached_map_t &ShardServices::get_osdmap() +{ + return osdmap; +} + +seastar::future<> ShardServices::send_pg_created(pg_t pgid) +{ + logger().debug(__func__); + auto o = get_osdmap(); + ceph_assert(o->require_osd_release >= ceph_release_t::luminous); + pg_created.insert(pgid); + return 
monc.send_message(make_message<MOSDPGCreated>(pgid)); +} + +seastar::future<> ShardServices::send_pg_created() +{ + logger().debug(__func__); + auto o = get_osdmap(); + ceph_assert(o->require_osd_release >= ceph_release_t::luminous); + return seastar::parallel_for_each(pg_created, + [this](auto &pgid) { + return monc.send_message(make_message<MOSDPGCreated>(pgid)); + }); +} + +void ShardServices::prune_pg_created() +{ + logger().debug(__func__); + auto o = get_osdmap(); + auto i = pg_created.begin(); + while (i != pg_created.end()) { + auto p = o->get_pg_pool(i->pool()); + if (!p || !p->has_flag(pg_pool_t::FLAG_CREATING)) { + logger().debug("{} pruning {}", __func__, *i); + i = pg_created.erase(i); + } else { + logger().debug(" keeping {}", __func__, *i); + ++i; + } + } +} + +seastar::future<> ShardServices::osdmap_subscribe(version_t epoch, bool force_request) +{ + logger().info("{}({})", __func__, epoch); + if (monc.sub_want_increment("osdmap", epoch, CEPH_SUBSCRIBE_ONETIME) || + force_request) { + return monc.renew_subs(); + } else { + return seastar::now(); + } +} + +HeartbeatStampsRef ShardServices::get_hb_stamps(int peer) +{ + auto [stamps, added] = heartbeat_stamps.try_emplace(peer); + if (added) { + stamps->second = ceph::make_ref<HeartbeatStamps>(peer); + } + return stamps->second; +} + +seastar::future<> ShardServices::send_alive(const epoch_t want) +{ + logger().info( + "{} want={} up_thru_wanted={}", + __func__, + want, + up_thru_wanted); + + if (want > up_thru_wanted) { + up_thru_wanted = want; + } else { + logger().debug("{} want={} <= up_thru_wanted={}; skipping", + __func__, want, up_thru_wanted); + return seastar::now(); + } + if (!osdmap->exists(whoami)) { + logger().warn("{} DNE", __func__); + return seastar::now(); + } if (const epoch_t up_thru = osdmap->get_up_thru(whoami); + up_thru_wanted > up_thru) { + logger().debug("{} up_thru_wanted={} up_thru={}", __func__, want, up_thru); + return monc.send_message( + make_message<MOSDAlive>(osdmap->get_epoch(), want)); + } else { + logger().debug("{} {} <= {}", __func__, want, osdmap->get_up_thru(whoami)); + return seastar::now(); + } +} + +}; diff --git a/src/crimson/osd/shard_services.h b/src/crimson/osd/shard_services.h new file mode 100644 index 000000000..2957639c6 --- /dev/null +++ b/src/crimson/osd/shard_services.h @@ -0,0 +1,215 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <boost/intrusive_ptr.hpp> +#include <seastar/core/future.hh> + +#include "include/common_fwd.h" +#include "osd_operation.h" +#include "msg/MessageRef.h" +#include "crimson/common/exception.h" +#include "crimson/os/futurized_collection.h" +#include "osd/PeeringState.h" +#include "crimson/osd/osdmap_service.h" +#include "crimson/osd/object_context.h" +#include "common/AsyncReserver.h" + +namespace crimson::net { + class Messenger; +} + +namespace crimson::mgr { + class Client; +} + +namespace crimson::mon { + class Client; +} + +namespace crimson::os { + class FuturizedStore; +} + +class OSDMap; +class PeeringCtx; +class BufferedRecoveryMessages; + +namespace crimson::osd { + +/** + * Represents services available to each PG + */ +class ShardServices : public md_config_obs_t { + using cached_map_t = boost::local_shared_ptr<const OSDMap>; + OSDMapService &osdmap_service; + const int whoami; + crimson::net::Messenger &cluster_msgr; + crimson::net::Messenger &public_msgr; + crimson::mon::Client &monc; + crimson::mgr::Client &mgrc; + crimson::os::FuturizedStore &store; + 
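// Shard-local CephContext: in this class it backs the perf counter collections registered in the constructor and the AsyncReservers declared further down, and is exposed via get_cct().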
+ crimson::common::CephContext cct; + + PerfCounters *perf = nullptr; + PerfCounters *recoverystate_perf = nullptr; + + const char** get_tracked_conf_keys() const final; + void handle_conf_change(const ConfigProxy& conf, + const std::set <std::string> &changed) final; +public: + ShardServices( + OSDMapService &osdmap_service, + const int whoami, + crimson::net::Messenger &cluster_msgr, + crimson::net::Messenger &public_msgr, + crimson::mon::Client &monc, + crimson::mgr::Client &mgrc, + crimson::os::FuturizedStore &store); + + seastar::future<> send_to_osd( + int peer, + MessageRef m, + epoch_t from_epoch); + + crimson::os::FuturizedStore &get_store() { + return store; + } + + crimson::common::CephContext *get_cct() { + return &cct; + } + + // OSDMapService + const OSDMapService &get_osdmap_service() const { + return osdmap_service; + } + + // Op Management + OperationRegistry registry; + OperationThrottler throttler; + + template <typename T, typename... Args> + auto start_operation(Args&&... args) { + if (__builtin_expect(stopping, false)) { + throw crimson::common::system_shutdown_exception(); + } + auto op = registry.create_operation<T>(std::forward<Args>(args)...); + return std::make_pair(op, op->start()); + } + + seastar::future<> stop() { + stopping = true; + return registry.stop(); + } + + // Loggers + PerfCounters &get_recoverystate_perf_logger() { + return *recoverystate_perf; + } + PerfCounters &get_perf_logger() { + return *perf; + } + + /// Dispatch and reset ctx transaction + seastar::future<> dispatch_context_transaction( + crimson::os::CollectionRef col, PeeringCtx &ctx); + + /// Dispatch and reset ctx messages + seastar::future<> dispatch_context_messages( + BufferedRecoveryMessages &&ctx); + + /// Dispatch ctx and dispose of context + seastar::future<> dispatch_context( + crimson::os::CollectionRef col, + PeeringCtx &&ctx); + + /// Dispatch ctx and dispose of ctx, transaction must be empty + seastar::future<> dispatch_context( + PeeringCtx &&ctx) { + return dispatch_context({}, std::move(ctx)); + } + + // PG Temp State +private: + // TODO: hook into map processing and some kind of heartbeat/peering + // message processing + struct pg_temp_t { + std::vector<int> acting; + bool forced = false; + }; + map<pg_t, pg_temp_t> pg_temp_wanted; + map<pg_t, pg_temp_t> pg_temp_pending; + friend std::ostream& operator<<(std::ostream&, const pg_temp_t&); +public: + void queue_want_pg_temp(pg_t pgid, const vector<int>& want, + bool forced = false); + void remove_want_pg_temp(pg_t pgid); + void requeue_pg_temp(); + seastar::future<> send_pg_temp(); + + // Shard-local OSDMap +private: + cached_map_t osdmap; +public: + void update_map(cached_map_t new_osdmap); + cached_map_t &get_osdmap(); + + // PG Created State +private: + set<pg_t> pg_created; +public: + seastar::future<> send_pg_created(pg_t pgid); + seastar::future<> send_pg_created(); + void prune_pg_created(); + + unsigned get_pg_num() const { + return num_pgs; + } + void inc_pg_num() { + ++num_pgs; + } + void dec_pg_num() { + --num_pgs; + } + + seastar::future<> osdmap_subscribe(version_t epoch, bool force_request); + + // Time state + ceph::mono_time startup_time = ceph::mono_clock::now(); + ceph::signedspan get_mnow() const { + return ceph::mono_clock::now() - startup_time; + } + HeartbeatStampsRef get_hb_stamps(int peer); + std::map<int, HeartbeatStampsRef> heartbeat_stamps; + + crimson::osd::ObjectContextRegistry obc_registry; + + // Async Reservers +private: + unsigned num_pgs = 0; + + struct DirectFinisher { + void 
queue(Context *c) { + c->complete(0); + } + } finisher; + // prevent creating new osd operations when system is shutting down, + // this is necessary because there are chances that a new operation + // is created, after the interruption of all ongoing operations, and + // creats and waits on a new and may-never-resolve future, in which + // case the shutdown may never succeed. + bool stopping = false; +public: + AsyncReserver<spg_t, DirectFinisher> local_reserver; + AsyncReserver<spg_t, DirectFinisher> remote_reserver; + +private: + epoch_t up_thru_wanted = 0; +public: + seastar::future<> send_alive(epoch_t want); +}; + +} diff --git a/src/crimson/osd/state.h b/src/crimson/osd/state.h new file mode 100644 index 000000000..ba48cd36f --- /dev/null +++ b/src/crimson/osd/state.h @@ -0,0 +1,79 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <string_view> +#include <ostream> + +class OSDMap; + +class OSDState { + + enum class State { + INITIALIZING, + PREBOOT, + BOOTING, + ACTIVE, + PRESTOP, + STOPPING, + WAITING_FOR_HEALTHY, + }; + + State state = State::INITIALIZING; + +public: + bool is_initializing() const { + return state == State::INITIALIZING; + } + bool is_preboot() const { + return state == State::PREBOOT; + } + bool is_booting() const { + return state == State::BOOTING; + } + bool is_active() const { + return state == State::ACTIVE; + } + bool is_prestop() const { + return state == State::PRESTOP; + } + bool is_stopping() const { + return state == State::STOPPING; + } + bool is_waiting_for_healthy() const { + return state == State::WAITING_FOR_HEALTHY; + } + void set_preboot() { + state = State::PREBOOT; + } + void set_booting() { + state = State::BOOTING; + } + void set_active() { + state = State::ACTIVE; + } + void set_prestop() { + state = State::PRESTOP; + } + void set_stopping() { + state = State::STOPPING; + } + std::string_view to_string() const { + switch (state) { + case State::INITIALIZING: return "initializing"; + case State::PREBOOT: return "preboot"; + case State::BOOTING: return "booting"; + case State::ACTIVE: return "active"; + case State::PRESTOP: return "prestop"; + case State::STOPPING: return "stopping"; + case State::WAITING_FOR_HEALTHY: return "waiting_for_healthy"; + default: return "???"; + } + } +}; + +inline std::ostream& +operator<<(std::ostream& os, const OSDState& s) { + return os << s.to_string(); +} diff --git a/src/crimson/osd/watch.cc b/src/crimson/osd/watch.cc new file mode 100644 index 000000000..a7a3311aa --- /dev/null +++ b/src/crimson/osd/watch.cc @@ -0,0 +1,169 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "crimson/osd/watch.h" +#include "messages/MWatchNotify.h" + + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_osd); + } +} + +namespace crimson::osd { + +bool Watch::NotifyCmp::operator()(NotifyRef lhs, NotifyRef rhs) const +{ + ceph_assert(lhs); + ceph_assert(rhs); + return lhs->get_id() < rhs->get_id(); +} + +seastar::future<> Watch::connect(crimson::net::ConnectionRef conn, bool) +{ + if (this->conn == conn) { + logger().debug("conn={} already connected", conn); + } + + this->conn = std::move(conn); + return seastar::now(); +} + +seastar::future<> Watch::send_notify_msg(NotifyRef notify) +{ + logger().info("{} for notify(id={})", __func__, notify->ninfo.notify_id); + return conn->send(make_message<MWatchNotify>( + winfo.cookie, + notify->user_version, + 
notify->ninfo.notify_id, + CEPH_WATCH_EVENT_NOTIFY, + notify->ninfo.bl, + notify->client_gid)); +} + +seastar::future<> Watch::start_notify(NotifyRef notify) +{ + logger().info("{} adding notify(id={})", __func__, notify->ninfo.notify_id); + auto [ it, emplaced ] = in_progress_notifies.emplace(std::move(notify)); + ceph_assert(emplaced); + ceph_assert(is_alive()); + return is_connected() ? send_notify_msg(*it) : seastar::now(); +} + +seastar::future<> Watch::notify_ack( + const uint64_t notify_id, + const ceph::bufferlist& reply_bl) +{ + logger().info("{}", __func__); + return seastar::do_for_each(in_progress_notifies, + [this_shared=shared_from_this(), &reply_bl] (auto notify) { + return notify->complete_watcher(this_shared, reply_bl); + } + ).then([this] { + in_progress_notifies.clear(); + return seastar::now(); + }); +} + +seastar::future<> Watch::send_disconnect_msg() +{ + if (!is_connected()) { + return seastar::now(); + } + ceph::bufferlist empty; + return conn->send(make_message<MWatchNotify>( + winfo.cookie, + 0, + 0, + CEPH_WATCH_EVENT_DISCONNECT, + empty)); +} + +void Watch::discard_state() +{ + ceph_assert(obc); + in_progress_notifies.clear(); +} + +seastar::future<> Watch::remove(const bool send_disconnect) +{ + logger().info("{}", __func__); + auto disconnected = send_disconnect ? send_disconnect_msg() + : seastar::now(); + return std::move(disconnected).then([this] { + return seastar::do_for_each(in_progress_notifies, + [this_shared=shared_from_this()] (auto notify) { + return notify->remove_watcher(this_shared); + }).then([this] { + discard_state(); + return seastar::now(); + }); + }); +} + +bool notify_reply_t::operator<(const notify_reply_t& rhs) const +{ + // comparing std::pairs to emphasize our legacy. ceph-osd stores + // notify_replies as std::multimap<std::pair<gid, cookie>, bl>. + // unfortunately, what seems to be an implementation detail, got + // exposed as part of our public API (the `reply_buffer` parameter + // of the `rados_notify` family). 
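// e.g. (hypothetical values) a reply with (gid=1, cookie=5) orders before (gid=1, cookie=7) and before (gid=2, cookie=0): lexicographic on (watcher_gid, watcher_cookie), the same key order as that multimap.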
+ const auto lhsp = std::make_pair(watcher_gid, watcher_cookie); + const auto rhsp = std::make_pair(rhs.watcher_gid, rhs.watcher_cookie); + return lhsp < rhsp; +} + +seastar::future<> Notify::remove_watcher(WatchRef watch) +{ + if (discarded || complete) { + return seastar::now(); + } + [[maybe_unused]] const auto num_removed = watchers.erase(watch); + assert(num_removed > 0); + return maybe_send_completion(); +} + + +seastar::future<> Notify::complete_watcher( + WatchRef watch, + const ceph::bufferlist& reply_bl) +{ + if (discarded || complete) { + return seastar::now(); + } + notify_replies.emplace(notify_reply_t{ + watch->get_watcher_gid(), + watch->get_cookie(), + reply_bl}); + return remove_watcher(std::move(watch)); +} + +seastar::future<> Notify::maybe_send_completion() +{ + logger().info("{} -- {} in progress watchers", __func__, watchers.size()); + if (watchers.empty()) { + // prepare reply + ceph::bufferlist bl; + encode(notify_replies, bl); + // FIXME: this is just a stub + std::list<std::pair<uint64_t,uint64_t>> missed; + encode(missed, bl); + + complete = true; + + ceph::bufferlist empty; + auto reply = make_message<MWatchNotify>( + ninfo.cookie, + user_version, + ninfo.notify_id, + CEPH_WATCH_EVENT_NOTIFY_COMPLETE, + empty, + client_gid); + reply->set_data(bl); + return conn->send(std::move(reply)); + } + return seastar::now(); +} + +} // namespace crimson::osd diff --git a/src/crimson/osd/watch.h b/src/crimson/osd/watch.h new file mode 100644 index 000000000..6049e16cf --- /dev/null +++ b/src/crimson/osd/watch.h @@ -0,0 +1,194 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <iterator> +#include <map> +#include <set> + +#include <seastar/core/shared_ptr.hh> + +#include "crimson/net/Connection.h" +#include "crimson/osd/object_context.h" +#include "include/denc.h" + +namespace crimson::osd { + +class Notify; +using NotifyRef = seastar::shared_ptr<Notify>; + +// NOTE: really need to have this public. Otherwise `shared_from_this()` +// will abort. According to cppreference.com: +// +// "The constructors of std::shared_ptr detect the presence +// of an unambiguous and accessible (ie. public inheritance +// is mandatory) (since C++17) enable_shared_from_this base". +// +// I expect the `seastar::shared_ptr` shares this behaviour. +class Watch : public seastar::enable_shared_from_this<Watch> { + // this is a private tag for the public constructor that turns it into + // de facto private one. The motivation behind the hack is make_shared + // used by create(). 
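// seastar::make_shared has to go through a public constructor, but only code that can name the private tag type (Watch itself and its friends) can construct the tag argument, so outside callers are still forced through create().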
+ struct private_ctag_t{}; + + struct NotifyCmp { + inline bool operator()(NotifyRef lhs, NotifyRef rhs) const; + }; + std::set<NotifyRef, NotifyCmp> in_progress_notifies; + crimson::net::ConnectionRef conn; + crimson::osd::ObjectContextRef obc; + + watch_info_t winfo; + entity_name_t entity_name; + + seastar::future<> start_notify(NotifyRef); + seastar::future<> send_notify_msg(NotifyRef); + seastar::future<> send_disconnect_msg(); + void discard_state(); + + friend Notify; + +public: + Watch(private_ctag_t, + crimson::osd::ObjectContextRef obc, + const watch_info_t& winfo, + const entity_name_t& entity_name) + : obc(std::move(obc)), + winfo(winfo), + entity_name(entity_name) { + } + + seastar::future<> connect(crimson::net::ConnectionRef, bool); + bool is_alive() const { + return true; + } + bool is_connected() const { + return static_cast<bool>(conn); + } + void got_ping(utime_t) { + // NOP + } + + seastar::future<> remove(bool send_disconnect); + + /// Call when notify_ack received on notify_id + seastar::future<> notify_ack( + uint64_t notify_id, ///< [in] id of acked notify + const ceph::bufferlist& reply_bl); ///< [in] notify reply buffer + + template <class... Args> + static seastar::shared_ptr<Watch> create(Args&&... args) { + return seastar::make_shared<Watch>(private_ctag_t{}, + std::forward<Args>(args)...); + }; + + uint64_t get_watcher_gid() const { + return entity_name.num(); + } + uint64_t get_cookie() const { + return winfo.cookie; + } +}; + +using WatchRef = seastar::shared_ptr<Watch>; + +struct notify_reply_t { + uint64_t watcher_gid; + uint64_t watcher_cookie; + ceph::bufferlist bl; + + bool operator<(const notify_reply_t& rhs) const; + DENC(notify_reply_t, v, p) { + DENC_START(1, 1, p); + denc(v.watcher_gid, p); + denc(v.watcher_cookie, p); + denc(v.bl, p); + DENC_FINISH(p); + } +}; + +class Notify { + std::set<WatchRef> watchers; + notify_info_t ninfo; + crimson::net::ConnectionRef conn; + uint64_t client_gid; + uint64_t user_version; + bool complete = false; + bool discarded = false; + + /// (gid,cookie) -> reply_bl for everyone who acked the notify + std::multiset<notify_reply_t> notify_replies; + + uint64_t get_id() const { return ninfo.notify_id; } + seastar::future<> maybe_send_completion(); + + template <class WatchIteratorT> + Notify(WatchIteratorT begin, + WatchIteratorT end, + crimson::net::ConnectionRef conn, + const notify_info_t& ninfo, + const uint64_t client_gid, + const uint64_t user_version); + // this is a private tag for the public constructor that turns it into + // de facto private one. The motivation behind the hack is make_shared + // used by create_n_propagate factory. + struct private_ctag_t{}; + + friend Watch; + +public: + template <class... Args> + Notify(private_ctag_t, Args&&... args) : Notify(std::forward<Args>(args)...) { + } + + template <class WatchIteratorT, class... Args> + static seastar::future<> create_n_propagate( + WatchIteratorT begin, + WatchIteratorT end, + Args&&... args); + + seastar::future<> remove_watcher(WatchRef watch); + seastar::future<> complete_watcher(WatchRef watch, + const ceph::bufferlist& reply_bl); +}; + + +template <class WatchIteratorT> +Notify::Notify(WatchIteratorT begin, + WatchIteratorT end, + crimson::net::ConnectionRef conn, + const notify_info_t& ninfo, + const uint64_t client_gid, + const uint64_t user_version) + : watchers(begin, end), + ninfo(ninfo), + conn(std::move(conn)), + client_gid(client_gid), + user_version(user_version) { +} + +template <class WatchIteratorT, class... 
Args> +seastar::future<> Notify::create_n_propagate( + WatchIteratorT begin, + WatchIteratorT end, + Args&&... args) +{ + static_assert( + std::is_same_v<typename std::iterator_traits<WatchIteratorT>::value_type, + crimson::osd::WatchRef>); + auto notify = seastar::make_shared<Notify>( + private_ctag_t{}, + begin, + end, + std::forward<Args>(args)...); + return seastar::do_for_each(begin, end, [=] (auto& watchref) { + return watchref->start_notify(notify); + }).then([notify = std::move(notify)] { + return notify->maybe_send_completion(); + }); +} + +} // namespace crimson::osd + +WRITE_CLASS_DENC(crimson::osd::notify_reply_t) diff --git a/src/crimson/tools/CMakeLists.txt b/src/crimson/tools/CMakeLists.txt new file mode 100644 index 000000000..1a59a9a11 --- /dev/null +++ b/src/crimson/tools/CMakeLists.txt @@ -0,0 +1,6 @@ +add_executable(crimson-store-nbd + store-nbd.cc + ) +target_link_libraries(crimson-store-nbd + crimson-seastore) +install(TARGETS crimson-store-nbd DESTINATION bin) diff --git a/src/crimson/tools/store-nbd.cc b/src/crimson/tools/store-nbd.cc new file mode 100644 index 000000000..cdf853d15 --- /dev/null +++ b/src/crimson/tools/store-nbd.cc @@ -0,0 +1,621 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- + +/** + * crimson-store-nbd + * + * This tool exposes crimson object store internals as an nbd server + * for use with fio in basic benchmarking. + * + * Example usage: + * + * $ ./bin/crimson-store-nbd --device-path /dev/nvme1n1 -c 1 --total-device-size=107374182400 --mkfs true --uds-path /tmp/store_nbd_socket.sock + * + * $ cat nbd.fio + * [global] + * ioengine=nbd + * uri=nbd+unix:///?socket=/tmp/store_nbd_socket.sock + * rw=randrw + * time_based + * runtime=120 + * group_reporting + * iodepth=1 + * size=500G + * + * [job0] + * offset=0 + * + * $ fio nbd.fio + */ + +#include <random> + +#include <boost/program_options/variables_map.hpp> +#include <boost/program_options/parsers.hpp> + +#include <linux/nbd.h> +#include <linux/fs.h> + +#include <seastar/core/byteorder.hh> + +#include "crimson/os/seastore/cache.h" +#include "crimson/os/seastore/segment_cleaner.h" +#include "crimson/os/seastore/segment_manager.h" +#include "crimson/os/seastore/segment_manager/block.h" +#include "crimson/os/seastore/transaction_manager.h" + +#include "test/crimson/seastar_runner.h" +#include "test/crimson/seastore/test_block.h" + +namespace po = boost::program_options; + +using namespace ceph; +using namespace crimson; +using namespace crimson::os; +using namespace crimson::os::seastore; +using namespace crimson::os::seastore::segment_manager::block; + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_test); + } +} + +/** + * BlockDriver + * + * Simple interface to enable throughput test to compare raw disk to + * transaction_manager, etc + */ +class BlockDriver { +public: + struct config_t { + std::string type; + bool mkfs = false; + std::optional<std::string> path; + size_t segment_size; + size_t total_device_size; + + void populate_options( + po::options_description &desc) + { + desc.add_options() + ("type", + po::value<std::string>() + ->default_value("transaction_manager") + ->notifier([this](auto s) { type = s; }), + "Backend to use, options are transaction_manager" + ) + ("segment-size", + po::value<size_t>() + ->default_value(16ul << 20 /* 16MB */) + ->notifier([this](auto s) { segment_size = s; }), + "Total working set size" + ) + ("total-device-size", + po::value<size_t>() + ->default_value(10ul << 30 /* 10G */) + 
->notifier([this](auto s) { total_device_size = s; }), + "Total size of the backing device" + ) + ("device-path", + po::value<std::string>() + ->required() + ->notifier([this](auto s) { path = s; }), + "Path to the backing device" + ) + ("mkfs", + po::value<bool>() + ->default_value(false) + ->notifier([this](auto s) { mkfs = s; }), + "Do mkfs first" + ); + } + }; + + virtual bufferptr get_buffer(size_t size) = 0; + + virtual seastar::future<> write( + off_t offset, + bufferptr ptr) = 0; + + virtual seastar::future<bufferlist> read( + off_t offset, + size_t size) = 0; + + virtual size_t get_size() const = 0; + + virtual seastar::future<> mount() = 0; + virtual seastar::future<> close() = 0; + + virtual ~BlockDriver() {} +}; +using BlockDriverRef = std::unique_ptr<BlockDriver>; + +BlockDriverRef get_backend(BlockDriver::config_t config); + +struct request_context_t { + uint32_t magic = 0; + uint32_t type = 0; + + char handle[8] = {0}; + + uint64_t from = 0; + uint32_t len = 0; + + unsigned err = 0; + std::optional<bufferptr> in_buffer; + std::optional<bufferlist> out_buffer; + + bool check_magic() const { + // todo + return true; + } + + uint32_t get_command() const { + return type & 0xff; + } + + bool has_input_buffer() const { + return get_command() == NBD_CMD_WRITE; + } + + seastar::future<> read_request(seastar::input_stream<char> &in) { + return in.read_exactly(sizeof(struct nbd_request) + ).then([this, &in](auto buf) { + auto p = buf.get(); + magic = seastar::consume_be<uint32_t>(p); + type = seastar::consume_be<uint32_t>(p); + memcpy(handle, p, sizeof(handle)); + p += sizeof(handle); + from = seastar::consume_be<uint64_t>(p); + len = seastar::consume_be<uint32_t>(p); + logger().debug( + "Got request, magic {}, type {}, from {}, len {}", + magic, type, from, len); + + if (has_input_buffer()) { + return in.read_exactly(len).then([this](auto buf) { + in_buffer = ceph::buffer::create_page_aligned(len); + in_buffer->copy_in(0, len, buf.get()); + return seastar::now(); + }); + } else { + return seastar::now(); + } + }); + } + + seastar::future<> write_reply(seastar::output_stream<char> &out) { + seastar::temporary_buffer<char> buffer{sizeof(struct nbd_reply)}; + auto p = buffer.get_write(); + seastar::produce_be<uint32_t>(p, NBD_REPLY_MAGIC); + seastar::produce_be<uint32_t>(p, err); + memcpy(p, handle, sizeof(handle)); + return out.write(std::move(buffer)).then([this, &out] { + if (out_buffer) { + return seastar::do_for_each( + out_buffer->mut_buffers(), + [&out](bufferptr &ptr) { + return out.write( + seastar::temporary_buffer<char>( + ptr.c_str(), + ptr.length(), + seastar::make_deleter([ptr](){})) + ); + }); + } else { + return seastar::now(); + } + }).then([&out] { + return out.flush(); + }); + } +}; + +/** + * NBDHandler + * + * Simple single-threaded NBD server used to benchmark a + * BlockDriver backend (e.g. with fio). 
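 * In practice run() listens on the unix domain socket given by --uds-path, performs the oldstyle NBD negotiation, and then serves NBD_CMD_READ / NBD_CMD_WRITE requests against the supplied BlockDriver backend.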
+ */ +class NBDHandler { + BlockDriver &backend; + std::string uds_path; +public: + struct config_t { + std::string uds_path; + + void populate_options( + po::options_description &desc) + { + desc.add_options() + ("uds-path", + po::value<std::string>() + ->default_value("/tmp/store_nbd_socket.sock") + ->notifier([this](auto s) { + uds_path = s; + }), + "Path to domain socket for nbd" + ); + } + }; + + NBDHandler( + BlockDriver &backend, + config_t config) : + backend(backend), + uds_path(config.uds_path) + {} + + seastar::future<> run(); +}; + +int main(int argc, char** argv) +{ + po::options_description desc{"Allowed options"}; + bool debug = false; + desc.add_options() + ("help,h", "show help message") + ("debug", po::value<bool>(&debug)->default_value(false), + "enable debugging"); + + po::options_description nbd_pattern_options{"NBD Pattern Options"}; + NBDHandler::config_t nbd_config; + nbd_config.populate_options(nbd_pattern_options); + desc.add(nbd_pattern_options); + + po::options_description backend_pattern_options{"Backend Options"}; + BlockDriver::config_t backend_config; + backend_config.populate_options(backend_pattern_options); + desc.add(backend_pattern_options); + + po::variables_map vm; + std::vector<std::string> unrecognized_options; + try { + auto parsed = po::command_line_parser(argc, argv) + .options(desc) + .allow_unregistered() + .run(); + po::store(parsed, vm); + if (vm.count("help")) { + std::cout << desc << std::endl; + return 0; + } + + po::notify(vm); + unrecognized_options = + po::collect_unrecognized(parsed.options, po::include_positional); + } catch(const po::error& e) { + std::cerr << "error: " << e.what() << std::endl; + return 1; + } + std::vector<const char*> args(argv, argv + argc); + + seastar::app_template app; + + std::vector<char*> av{argv[0]}; + std::transform(begin(unrecognized_options), + end(unrecognized_options), + std::back_inserter(av), + [](auto& s) { + return const_cast<char*>(s.c_str()); + }); + + SeastarRunner sc; + sc.init(av.size(), av.data()); + + if (debug) { + seastar::global_logger_registry().set_all_loggers_level( + seastar::log_level::debug + ); + } + + sc.run([=] { + auto backend = get_backend(backend_config); + return seastar::do_with( + NBDHandler(*backend, nbd_config), + std::move(backend), + [](auto &nbd, auto &backend) { + return backend->mount( + ).then([&] { + logger().debug("Running nbd server..."); + return nbd.run(); + }).then([&] { + return backend->close(); + }); + }); + }); + sc.stop(); +} + +class nbd_oldstyle_negotiation_t { + uint64_t magic = seastar::cpu_to_be(0x4e42444d41474943); // "NBDMAGIC" + uint64_t magic2 = seastar::cpu_to_be(0x00420281861253); // "IHAVEOPT" + uint64_t size = 0; + uint32_t flags = seastar::cpu_to_be(0); + char reserved[124] = {0}; + +public: + nbd_oldstyle_negotiation_t(uint64_t size, uint32_t flags) + : size(seastar::cpu_to_be(size)), flags(seastar::cpu_to_be(flags)) {} +} __attribute__((packed)); + +seastar::future<> send_negotiation( + size_t size, + seastar::output_stream<char>& out) +{ + seastar::temporary_buffer<char> buf{sizeof(nbd_oldstyle_negotiation_t)}; + new (buf.get_write()) nbd_oldstyle_negotiation_t(size, 1); + return out.write(std::move(buf) + ).then([&out] { + return out.flush(); + }); +} + +seastar::future<> handle_command( + BlockDriver &backend, + request_context_t &context, + seastar::output_stream<char> &out) +{ + logger().debug("got command {}", context.get_command()); + return ([&] { + switch (context.get_command()) { + case NBD_CMD_WRITE: + return backend.write( + 
context.from, + *context.in_buffer); + case NBD_CMD_READ: + return backend.read( + context.from, + context.len).then([&context] (auto buffer) { + context.out_buffer = buffer; + }); + case NBD_CMD_DISC: + throw std::system_error(std::make_error_code(std::errc::bad_message)); + case NBD_CMD_TRIM: + throw std::system_error(std::make_error_code(std::errc::bad_message)); + default: + throw std::system_error(std::make_error_code(std::errc::bad_message)); + } + })().then([&] { + logger().debug("Writing reply"); + return context.write_reply(out); + }); +} + + +seastar::future<> handle_commands( + BlockDriver &backend, + seastar::input_stream<char>& in, + seastar::output_stream<char>& out) +{ + logger().debug("handle_commands"); + return seastar::keep_doing( + [&] { + logger().debug("waiting for command"); + auto request_ref = std::make_unique<request_context_t>(); + auto &request = *request_ref; + return request.read_request(in + ).then([&] { + return handle_command(backend, request, out); + }).then([req=std::move(request_ref)] { + logger().debug("complete"); + }); + }); +} + +seastar::future<> NBDHandler::run() +{ + logger().debug("About to listen on {}", uds_path); + return seastar::do_with( + seastar::engine().listen( + seastar::socket_address{ + seastar::unix_domain_addr{uds_path}}), + [=](auto &socket) { + return seastar::keep_doing( + [this, &socket] { + return socket.accept().then([this](auto acc) { + logger().debug("Accepted"); + return seastar::do_with( + std::move(acc.connection), + [this](auto &conn) { + return seastar::do_with( + conn.input(), + conn.output(), + [&, this](auto &input, auto &output) { + return send_negotiation( + backend.get_size(), + output + ).then([&, this] { + return handle_commands(backend, input, output); + }).finally([&] { + return input.close(); + }).finally([&] { + return output.close(); + }).handle_exception([](auto e) { + return seastar::now(); + }); + }); + }); + }); + }); + }); +} + +class TMDriver final : public BlockDriver { + const config_t config; + std::unique_ptr<segment_manager::block::BlockSegmentManager> segment_manager; + std::unique_ptr<SegmentCleaner> segment_cleaner; + std::unique_ptr<Journal> journal; + std::unique_ptr<Cache> cache; + LBAManagerRef lba_manager; + std::unique_ptr<TransactionManager> tm; + +public: + TMDriver(config_t config) : config(config) {} + ~TMDriver() final {} + + bufferptr get_buffer(size_t size) final { + return ceph::buffer::create_page_aligned(size); + } + + seastar::future<> write( + off_t offset, + bufferptr ptr) final { + logger().debug("Writing offset {}", offset); + assert(offset % segment_manager->get_block_size() == 0); + assert(ptr.length() == (size_t)segment_manager->get_block_size()); + return seastar::do_with( + tm->create_transaction(), + std::move(ptr), + [this, offset](auto &t, auto &ptr) { + return tm->dec_ref( + *t, + offset + ).safe_then([](auto){}).handle_error( + crimson::ct_error::enoent::handle([](auto) { return seastar::now(); }), + crimson::ct_error::pass_further_all{} + ).safe_then([=, &t, &ptr] { + logger().debug("dec_ref complete"); + return tm->alloc_extent<TestBlock>( + *t, + offset, + ptr.length()); + }).safe_then([=, &t, &ptr](auto ext) mutable { + assert(ext->get_laddr() == (size_t)offset); + assert(ext->get_bptr().length() == ptr.length()); + ext->get_bptr().swap(ptr); + logger().debug("submitting transaction"); + return tm->submit_transaction(std::move(t)); + }); + }).handle_error( + crimson::ct_error::assert_all{} + ); + } + + seastar::future<bufferlist> read( + off_t offset, + 
size_t size) final { + logger().debug("Reading offset {}", offset); + assert(offset % segment_manager->get_block_size() == 0); + assert(size % (size_t)segment_manager->get_block_size() == 0); + return seastar::do_with( + tm->create_transaction(), + [this, offset, size](auto &t) { + return tm->read_extents<TestBlock>(*t, offset, size + ).safe_then([=](auto ext_list) mutable { + size_t cur = offset; + bufferlist bl; + for (auto &i: ext_list) { + if (cur != i.first) { + assert(cur < i.first); + bl.append_zero(i.first - cur); + cur = i.first; + } + bl.append(i.second->get_bptr()); + cur += i.second->get_bptr().length(); + } + if (bl.length() != size) { + assert(bl.length() < size); + bl.append_zero(size - bl.length()); + } + return seastar::make_ready_future<bufferlist>(std::move(bl)); + }); + }).handle_error( + crimson::ct_error::assert_all{} + ); + } + + void init() { + segment_cleaner = std::make_unique<SegmentCleaner>( + SegmentCleaner::config_t::default_from_segment_manager( + *segment_manager), + true); + journal = std::make_unique<Journal>(*segment_manager); + cache = std::make_unique<Cache>(*segment_manager); + lba_manager = lba_manager::create_lba_manager(*segment_manager, *cache); + tm = std::make_unique<TransactionManager>( + *segment_manager, *segment_cleaner, *journal, *cache, *lba_manager); + journal->set_segment_provider(&*segment_cleaner); + segment_cleaner->set_extent_callback(&*tm); + } + + void clear() { + tm.reset(); + lba_manager.reset(); + cache.reset(); + journal.reset(); + segment_cleaner.reset(); + } + + size_t get_size() const final { + return segment_manager->get_size() * .5; + } + + seastar::future<> mkfs() { + assert(config.path); + segment_manager = std::make_unique< + segment_manager::block::BlockSegmentManager + >(); + logger().debug("mkfs"); + return segment_manager->mkfs( + { *config.path, config.segment_size, config.total_device_size } + ).safe_then([this] { + logger().debug(""); + return segment_manager->mount({ *config.path }); + }).safe_then([this] { + init(); + logger().debug("tm mkfs"); + return tm->mkfs(); + }).safe_then([this] { + logger().debug("tm close"); + return tm->close(); + }).safe_then([this] { + logger().debug("sm close"); + return segment_manager->close(); + }).safe_then([this] { + clear(); + logger().debug("mkfs complete"); + return TransactionManager::mkfs_ertr::now(); + }).handle_error( + crimson::ct_error::assert_all{} + ); + } + + seastar::future<> mount() final { + return (config.mkfs ? mkfs() : seastar::now() + ).then([this] { + segment_manager = std::make_unique< + segment_manager::block::BlockSegmentManager + >(); + return segment_manager->mount({ *config.path }); + }).safe_then([this] { + init(); + return tm->mount(); + }).handle_error( + crimson::ct_error::assert_all{} + ); + }; + + seastar::future<> close() final { + return segment_manager->close( + ).safe_then([this] { + return tm->close(); + }).safe_then([this] { + clear(); + return seastar::now(); + }).handle_error( + crimson::ct_error::assert_all{} + ); + } +}; + +BlockDriverRef get_backend(BlockDriver::config_t config) +{ + if (config.type == "transaction_manager") { + return std::make_unique<TMDriver>(config); + } else { + ceph_assert(0 == "invalid option"); + return BlockDriverRef(); + } +} |
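
TMDriver is the only real backend above, which can make the BlockDriver contract easy to miss among the seastore plumbing. The sketch below is purely illustrative: MemoryDriver and its constructor argument are hypothetical names, it assumes the BlockDriver/bufferptr/bufferlist declarations from store-nbd.cc above, and it is not part of the tool.

#include <cassert>
#include <cstring>
#include <vector>

// Hypothetical RAM-backed BlockDriver, only to illustrate the interface
// contract that TMDriver fulfils via seastore.
class MemoryDriver final : public BlockDriver {
  std::vector<char> data;  // zero-filled backing storage
public:
  explicit MemoryDriver(size_t size) : data(size, 0) {}

  bufferptr get_buffer(size_t size) final {
    return ceph::buffer::create_page_aligned(size);
  }

  seastar::future<> write(off_t offset, bufferptr ptr) final {
    // the sketch just asserts bounds; a real backend would report an error
    assert(static_cast<size_t>(offset) + ptr.length() <= data.size());
    std::memcpy(data.data() + offset, ptr.c_str(), ptr.length());
    return seastar::now();
  }

  seastar::future<bufferlist> read(off_t offset, size_t size) final {
    assert(static_cast<size_t>(offset) + size <= data.size());
    bufferlist bl;
    bl.append(data.data() + offset, size);
    return seastar::make_ready_future<bufferlist>(std::move(bl));
  }

  size_t get_size() const final { return data.size(); }

  // nothing to set up or tear down for a RAM-only backend
  seastar::future<> mount() final { return seastar::now(); }
  seastar::future<> close() final { return seastar::now(); }
};

Hooking such a backend in would only require get_backend() to return it for some additional, hypothetical value of --type.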