| field | value | date |
|---|---|---|
| author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-21 11:54:28 +0000 |
| committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-21 11:54:28 +0000 |
| commit | e6918187568dbd01842d8d1d2c808ce16a894239 (patch) | |
| tree | 64f88b554b444a49f656b6c656111a145cbbaa28 /src/crimson/osd | |
| parent | Initial commit. (diff) | |
| download | ceph-e6918187568dbd01842d8d1d2c808ce16a894239.tar.xz, ceph-e6918187568dbd01842d8d1d2c808ce16a894239.zip | |
Adding upstream version 18.2.2. (tag: upstream/18.2.2)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/crimson/osd')
89 files changed, 23950 insertions, 0 deletions
diff --git a/src/crimson/osd/CMakeLists.txt b/src/crimson/osd/CMakeLists.txt new file mode 100644 index 000000000..f521e0244 --- /dev/null +++ b/src/crimson/osd/CMakeLists.txt @@ -0,0 +1,72 @@ +add_executable(crimson-osd + backfill_state.cc + ec_backend.cc + heartbeat.cc + lsan_suppressions.cc + main.cc + main_config_bootstrap_helpers.cc + osd.cc + osd_meta.cc + pg.cc + pg_backend.cc + pg_meta.cc + replicated_backend.cc + shard_services.cc + pg_shard_manager.cc + object_context.cc + object_context_loader.cc + ops_executer.cc + osd_operation.cc + osd_operations/client_request.cc + osd_operations/client_request_common.cc + osd_operations/internal_client_request.cc + osd_operations/peering_event.cc + osd_operations/pg_advance_map.cc + osd_operations/replicated_request.cc + osd_operations/logmissing_request.cc + osd_operations/logmissing_request_reply.cc + osd_operations/background_recovery.cc + osd_operations/recovery_subrequest.cc + osd_operations/snaptrim_event.cc + pg_recovery.cc + recovery_backend.cc + replicated_recovery_backend.cc + scheduler/scheduler.cc + scheduler/mclock_scheduler.cc + osdmap_gate.cc + pg_activation_blocker.cc + pg_map.cc + pg_interval_interrupt_condition.cc + objclass.cc + ${PROJECT_SOURCE_DIR}/src/objclass/class_api.cc + ${PROJECT_SOURCE_DIR}/src/osd/ClassHandler.cc + ${PROJECT_SOURCE_DIR}/src/osd/osd_op_util.cc + ${PROJECT_SOURCE_DIR}/src/osd/OSDCap.cc + ${PROJECT_SOURCE_DIR}/src/osd/PeeringState.cc + ${PROJECT_SOURCE_DIR}/src/osd/PGPeeringEvent.cc + ${PROJECT_SOURCE_DIR}/src/osd/PGStateUtils.cc + ${PROJECT_SOURCE_DIR}/src/osd/MissingLoc.cc + ${PROJECT_SOURCE_DIR}/src/osd/PGLog.cc + ${PROJECT_SOURCE_DIR}/src/osd/SnapMapper.cc + ${PROJECT_SOURCE_DIR}/src/osd/recovery_types.cc + ${PROJECT_SOURCE_DIR}/src/osd/osd_perf_counters.cc + watch.cc + ) +if(HAS_VTA) + set_source_files_properties(main.cc + PROPERTIES COMPILE_FLAGS -fno-var-tracking-assignments) +endif() +target_link_libraries(crimson-osd + crimson-admin + crimson-common + crimson-os + crimson + fmt::fmt + Boost::MPL + dmclock::dmclock) +set_target_properties(crimson-osd PROPERTIES + POSITION_INDEPENDENT_CODE ${EXE_LINKER_USE_PIE}) +install(TARGETS crimson-osd DESTINATION bin) +if(WITH_TESTS) + add_dependencies(tests crimson-osd) +endif() diff --git a/src/crimson/osd/acked_peers.h b/src/crimson/osd/acked_peers.h new file mode 100644 index 000000000..b2f2562c0 --- /dev/null +++ b/src/crimson/osd/acked_peers.h @@ -0,0 +1,14 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <vector> + +namespace crimson::osd { + struct peer_shard_t { + pg_shard_t shard; + eversion_t last_complete_ondisk; + }; + using acked_peers_t = std::vector<peer_shard_t>; +} diff --git a/src/crimson/osd/backfill_facades.h b/src/crimson/osd/backfill_facades.h new file mode 100644 index 000000000..683dc6ea6 --- /dev/null +++ b/src/crimson/osd/backfill_facades.h @@ -0,0 +1,73 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "crimson/osd/backfill_state.h" +#include "crimson/osd/pg.h" +#include "osd/PeeringState.h" + +namespace crimson::osd { + +// PeeringFacade -- main implementation of the BackfillState::PeeringFacade +// interface. We have the abstraction to decuple BackfillState from Peering +// State, and thus cut depedencies in unit testing. The second implemention +// is BackfillFixture::PeeringFacade and sits in test_backfill.cc. 
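To make the decoupling described in that comment concrete, here is a minimal sketch of what a test-side facade could look like, in the spirit of the `BackfillFixture::PeeringFacade` mentioned above. This is an editorial illustration, not part of the commit: `MockPeeringFacade`, its members, and the fixed values it returns are hypothetical; only the virtual interface is taken from `backfill_state.h` in this diff.

```cpp
// Hypothetical test-side facade (sketch only, not part of this commit).
// It implements BackfillState::PeeringFacade without touching PeeringState,
// which is exactly the dependency cut the comment above is describing.
#include <set>
#include "crimson/osd/backfill_state.h"

namespace test {

struct MockPeeringFacade final
  : crimson::osd::BackfillState::PeeringFacade {
  std::set<pg_shard_t> targets;   // populated by the unit test
  hobject_t peer_last_backfill;   // what replicas claim to already have
  eversion_t last_update;
  eversion_t log_tail;

  hobject_t earliest_backfill() const override {
    return peer_last_backfill;
  }
  const std::set<pg_shard_t>& get_backfill_targets() const override {
    return targets;
  }
  const hobject_t& get_peer_last_backfill(pg_shard_t) const override {
    return peer_last_backfill;
  }
  const eversion_t& get_last_update() const override {
    return last_update;
  }
  const eversion_t& get_log_tail() const override {
    return log_tail;
  }
  void scan_log_after(eversion_t, scan_log_func_t) const override {
    // the mock has no PG log entries to replay
  }
  bool is_backfill_target(pg_shard_t peer) const override {
    return targets.count(peer) > 0;
  }
  void update_complete_backfill_object_stats(const hobject_t&,
                                             const pg_stat_t&) override {
    // stats are ignored in this mock
  }
  bool is_backfilling() const override {
    return true;
  }
};

} // namespace test
```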
+struct PeeringFacade final : BackfillState::PeeringFacade { + PeeringState& peering_state; + + hobject_t earliest_backfill() const override { + return peering_state.earliest_backfill(); + } + + const std::set<pg_shard_t>& get_backfill_targets() const override { + return peering_state.get_backfill_targets(); + } + + const hobject_t& get_peer_last_backfill(pg_shard_t peer) const override { + return peering_state.get_peer_info(peer).last_backfill; + } + + const eversion_t& get_last_update() const override { + return peering_state.get_info().last_update; + } + + const eversion_t& get_log_tail() const override { + return peering_state.get_info().log_tail; + } + + void scan_log_after(eversion_t v, scan_log_func_t f) const override { + peering_state.get_pg_log().get_log().scan_log_after(v, std::move(f)); + } + + bool is_backfill_target(pg_shard_t peer) const override { + return peering_state.is_backfill_target(peer); + } + void update_complete_backfill_object_stats(const hobject_t &hoid, + const pg_stat_t &stats) override { + peering_state.update_complete_backfill_object_stats(hoid, stats); + } + + bool is_backfilling() const override { + return peering_state.is_backfilling(); + } + + PeeringFacade(PeeringState& peering_state) + : peering_state(peering_state) { + } +}; + +// PGFacade -- a facade (in the GoF-defined meaning) simplifying the huge +// interface of crimson's PG class. The motivation is to have an inventory +// of behaviour that must be provided by a unit test's mock. +struct PGFacade final : BackfillState::PGFacade { + PG& pg; + + const eversion_t& get_projected_last_update() const override { + return pg.projected_last_update; + } + + PGFacade(PG& pg) : pg(pg) {} +}; + +} // namespace crimson::osd diff --git a/src/crimson/osd/backfill_state.cc b/src/crimson/osd/backfill_state.cc new file mode 100644 index 000000000..46a270ffe --- /dev/null +++ b/src/crimson/osd/backfill_state.cc @@ -0,0 +1,558 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <algorithm> +#include <boost/type_index.hpp> +#include <fmt/ranges.h> +#include "common/hobject_fmt.h" +#include "crimson/osd/backfill_state.h" +#include "osd/osd_types_fmt.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_osd); + } +} + +namespace crimson::osd { + +BackfillState::BackfillState( + BackfillState::BackfillListener& backfill_listener, + std::unique_ptr<BackfillState::PeeringFacade> peering_state, + std::unique_ptr<BackfillState::PGFacade> pg) + : backfill_machine(*this, + backfill_listener, + std::move(peering_state), + std::move(pg)), + progress_tracker( + std::make_unique<BackfillState::ProgressTracker>(backfill_machine)) +{ + logger().debug("{}:{}", __func__, __LINE__); + backfill_machine.initiate(); +} + +template <class S> +BackfillState::StateHelper<S>::StateHelper() +{ + logger().debug("enter {}", + boost::typeindex::type_id<S>().pretty_name()); +} + +template <class S> +BackfillState::StateHelper<S>::~StateHelper() +{ + logger().debug("exit {}", + boost::typeindex::type_id<S>().pretty_name()); +} + +BackfillState::~BackfillState() = default; + +BackfillState::BackfillMachine::BackfillMachine( + BackfillState& backfill_state, + BackfillState::BackfillListener& backfill_listener, + std::unique_ptr<BackfillState::PeeringFacade> peering_state, + std::unique_ptr<BackfillState::PGFacade> pg) + : backfill_state(backfill_state), + backfill_listener(backfill_listener), + peering_state(std::move(peering_state)), + 
pg(std::move(pg)) +{} + +BackfillState::BackfillMachine::~BackfillMachine() = default; + +BackfillState::Initial::Initial(my_context ctx) + : my_base(ctx) +{ + backfill_state().last_backfill_started = peering_state().earliest_backfill(); + logger().debug("{}: bft={} from {}", + __func__, peering_state().get_backfill_targets(), + backfill_state().last_backfill_started); + for (const auto& bt : peering_state().get_backfill_targets()) { + logger().debug("{}: target shard {} from {}", + __func__, bt, peering_state().get_peer_last_backfill(bt)); + } + ceph_assert(peering_state().get_backfill_targets().size()); + ceph_assert(!backfill_state().last_backfill_started.is_max()); +} + +boost::statechart::result +BackfillState::Initial::react(const BackfillState::Triggered& evt) +{ + logger().debug("{}: backfill triggered", __func__); + ceph_assert(backfill_state().last_backfill_started == \ + peering_state().earliest_backfill()); + ceph_assert(peering_state().is_backfilling()); + // initialize BackfillIntervals + for (const auto& bt : peering_state().get_backfill_targets()) { + backfill_state().peer_backfill_info[bt].reset( + peering_state().get_peer_last_backfill(bt)); + } + backfill_state().backfill_info.reset(backfill_state().last_backfill_started); + if (Enqueuing::all_enqueued(peering_state(), + backfill_state().backfill_info, + backfill_state().peer_backfill_info)) { + logger().debug("{}: switching to Done state", __func__); + return transit<BackfillState::Done>(); + } else { + logger().debug("{}: switching to Enqueuing state", __func__); + return transit<BackfillState::Enqueuing>(); + } +} + + +// -- Enqueuing +void BackfillState::Enqueuing::maybe_update_range() +{ + if (auto& primary_bi = backfill_state().backfill_info; + primary_bi.version >= pg().get_projected_last_update()) { + logger().info("{}: bi is current", __func__); + ceph_assert(primary_bi.version == pg().get_projected_last_update()); + } else if (primary_bi.version >= peering_state().get_log_tail()) { +#if 0 + if (peering_state().get_pg_log().get_log().empty() && + pg().get_projected_log().empty()) { + /* Because we don't move log_tail on split, the log might be + * empty even if log_tail != last_update. However, the only + * way to get here with an empty log is if log_tail is actually + * eversion_t(), because otherwise the entry which changed + * last_update since the last scan would have to be present. 
+ */ + ceph_assert(primary_bi.version == eversion_t()); + return; + } +#endif + logger().debug("{}: bi is old, ({}) can be updated with log to {}", + __func__, + primary_bi.version, + pg().get_projected_last_update()); + logger().debug("{}: scanning pg log first", __func__); + peering_state().scan_log_after(primary_bi.version, + [&](const pg_log_entry_t& e) { + logger().debug("maybe_update_range(lambda): updating from version {}", + e.version); + if (e.soid >= primary_bi.begin && e.soid < primary_bi.end) { + if (e.is_update()) { + logger().debug("maybe_update_range(lambda): {} updated to ver {}", + e.soid, e.version); + primary_bi.objects.erase(e.soid); + primary_bi.objects.insert(std::make_pair(e.soid, + e.version)); + } else if (e.is_delete()) { + logger().debug("maybe_update_range(lambda): {} removed", + e.soid); + primary_bi.objects.erase(e.soid); + } + } + }); + primary_bi.version = pg().get_projected_last_update(); + } else { + ceph_abort_msg( + "scan_range should have raised primary_bi.version past log_tail"); + } +} + +void BackfillState::Enqueuing::trim_backfill_infos() +{ + for (const auto& bt : peering_state().get_backfill_targets()) { + backfill_state().peer_backfill_info[bt].trim_to( + std::max(peering_state().get_peer_last_backfill(bt), + backfill_state().last_backfill_started)); + } + backfill_state().backfill_info.trim_to( + backfill_state().last_backfill_started); +} + +/* static */ bool BackfillState::Enqueuing::all_enqueued( + const PeeringFacade& peering_state, + const BackfillInterval& backfill_info, + const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info) +{ + const bool all_local_enqueued = \ + backfill_info.extends_to_end() && backfill_info.empty(); + const bool all_peer_enqueued = std::all_of( + std::begin(peer_backfill_info), + std::end(peer_backfill_info), + [] (const auto& kv) { + [[maybe_unused]] const auto& [ shard, peer_backfill_info ] = kv; + return peer_backfill_info.extends_to_end() && peer_backfill_info.empty(); + }); + return all_local_enqueued && all_peer_enqueued; +} + +hobject_t BackfillState::Enqueuing::earliest_peer_backfill( + const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info) const +{ + hobject_t e = hobject_t::get_max(); + for (const pg_shard_t& bt : peering_state().get_backfill_targets()) { + const auto iter = peer_backfill_info.find(bt); + ceph_assert(iter != peer_backfill_info.end()); + e = std::min(e, iter->second.begin); + } + return e; +} + +bool BackfillState::Enqueuing::should_rescan_replicas( + const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info, + const BackfillInterval& backfill_info) const +{ + const auto& targets = peering_state().get_backfill_targets(); + return std::any_of(std::begin(targets), std::end(targets), + [&] (const auto& bt) { + return ReplicasScanning::replica_needs_scan(peer_backfill_info.at(bt), + backfill_info); + }); +} + +bool BackfillState::Enqueuing::should_rescan_primary( + const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info, + const BackfillInterval& backfill_info) const +{ + return backfill_info.begin <= earliest_peer_backfill(peer_backfill_info) && + !backfill_info.extends_to_end(); +} + +void BackfillState::Enqueuing::trim_backfilled_object_from_intervals( + BackfillState::Enqueuing::result_t&& result, + hobject_t& last_backfill_started, + std::map<pg_shard_t, BackfillInterval>& peer_backfill_info) +{ + std::for_each(std::begin(result.pbi_targets), std::end(result.pbi_targets), + [&peer_backfill_info] (const auto& bt) { + 
peer_backfill_info.at(bt).pop_front(); + }); + last_backfill_started = std::move(result.new_last_backfill_started); +} + +BackfillState::Enqueuing::result_t +BackfillState::Enqueuing::remove_on_peers(const hobject_t& check) +{ + // set `new_last_backfill_started` to `check` + result_t result { {}, check }; + for (const auto& bt : peering_state().get_backfill_targets()) { + const auto& pbi = backfill_state().peer_backfill_info.at(bt); + if (pbi.begin == check) { + result.pbi_targets.insert(bt); + const auto& version = pbi.objects.begin()->second; + backfill_state().progress_tracker->enqueue_drop(pbi.begin); + backfill_listener().enqueue_drop(bt, pbi.begin, version); + } + } + logger().debug("{}: BACKFILL removing {} from peers {}", + __func__, check, result.pbi_targets); + ceph_assert(!result.pbi_targets.empty()); + return result; +} + +BackfillState::Enqueuing::result_t +BackfillState::Enqueuing::update_on_peers(const hobject_t& check) +{ + logger().debug("{}: check={}", __func__, check); + const auto& primary_bi = backfill_state().backfill_info; + result_t result { {}, primary_bi.begin }; + + for (const auto& bt : peering_state().get_backfill_targets()) { + const auto& peer_bi = backfill_state().peer_backfill_info.at(bt); + + // Find all check peers that have the wrong version + if (const eversion_t& obj_v = primary_bi.objects.begin()->second; + check == primary_bi.begin && check == peer_bi.begin) { + if(peer_bi.objects.begin()->second != obj_v && + backfill_state().progress_tracker->enqueue_push(primary_bi.begin)) { + backfill_listener().enqueue_push(primary_bi.begin, obj_v); + } else { + // it's fine, keep it! OR already recovering + } + result.pbi_targets.insert(bt); + } else { + // Only include peers that we've caught up to their backfill line + // otherwise, they only appear to be missing this object + // because their peer_bi.begin > backfill_info.begin. + if (primary_bi.begin > peering_state().get_peer_last_backfill(bt) && + backfill_state().progress_tracker->enqueue_push(primary_bi.begin)) { + backfill_listener().enqueue_push(primary_bi.begin, obj_v); + } + } + } + return result; +} + +bool BackfillState::Enqueuing::Enqueuing::all_emptied( + const BackfillInterval& local_backfill_info, + const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info) const +{ + const auto& targets = peering_state().get_backfill_targets(); + const auto replicas_emptied = + std::all_of(std::begin(targets), std::end(targets), + [&] (const auto& bt) { + return peer_backfill_info.at(bt).empty(); + }); + return local_backfill_info.empty() && replicas_emptied; +} + +BackfillState::Enqueuing::Enqueuing(my_context ctx) + : my_base(ctx) +{ + auto& primary_bi = backfill_state().backfill_info; + + // update our local interval to cope with recent changes + primary_bi.begin = backfill_state().last_backfill_started; + if (primary_bi.version < peering_state().get_log_tail()) { + // it might be that the OSD is so flooded with modifying operations + // that backfill will be spinning here over and over. For the sake + // of performance and complexity we don't synchronize with entire PG. + // similar can happen in classical OSD. 
+ logger().warn("{}: bi is old, rescanning of local backfill_info", + __func__); + post_event(RequestPrimaryScanning{}); + return; + } else { + maybe_update_range(); + } + trim_backfill_infos(); + + while (!all_emptied(primary_bi, backfill_state().peer_backfill_info)) { + if (!backfill_listener().budget_available()) { + post_event(RequestWaiting{}); + return; + } else if (should_rescan_replicas(backfill_state().peer_backfill_info, + primary_bi)) { + // Count simultaneous scans as a single op and let those complete + post_event(RequestReplicasScanning{}); + return; + } + // Get object within set of peers to operate on and the set of targets + // for which that object applies. + if (const hobject_t check = \ + earliest_peer_backfill(backfill_state().peer_backfill_info); + check < primary_bi.begin) { + // Don't increment ops here because deletions + // are cheap and not replied to unlike real recovery_ops, + // and we can't increment ops without requeueing ourself + // for recovery. + auto result = remove_on_peers(check); + trim_backfilled_object_from_intervals(std::move(result), + backfill_state().last_backfill_started, + backfill_state().peer_backfill_info); + } else { + auto result = update_on_peers(check); + trim_backfilled_object_from_intervals(std::move(result), + backfill_state().last_backfill_started, + backfill_state().peer_backfill_info); + primary_bi.pop_front(); + } + backfill_listener().maybe_flush(); + } + + if (should_rescan_primary(backfill_state().peer_backfill_info, + primary_bi)) { + // need to grab one another chunk of the object namespace and restart + // the queueing. + logger().debug("{}: reached end for current local chunk", + __func__); + post_event(RequestPrimaryScanning{}); + } else if (backfill_state().progress_tracker->tracked_objects_completed()) { + post_event(RequestDone{}); + } else { + logger().debug("{}: reached end for both local and all peers " + "but still has in-flight operations", __func__); + post_event(RequestWaiting{}); + } +} + +// -- PrimaryScanning +BackfillState::PrimaryScanning::PrimaryScanning(my_context ctx) + : my_base(ctx) +{ + backfill_state().backfill_info.version = peering_state().get_last_update(); + backfill_listener().request_primary_scan( + backfill_state().backfill_info.begin); +} + +boost::statechart::result +BackfillState::PrimaryScanning::react(PrimaryScanned evt) +{ + logger().debug("{}", __func__); + backfill_state().backfill_info = std::move(evt.result); + return transit<Enqueuing>(); +} + +boost::statechart::result +BackfillState::PrimaryScanning::react(ObjectPushed evt) +{ + logger().debug("PrimaryScanning::react() on ObjectPushed; evt.object={}", + evt.object); + backfill_state().progress_tracker->complete_to(evt.object, evt.stat); + return discard_event(); +} + +// -- ReplicasScanning +bool BackfillState::ReplicasScanning::replica_needs_scan( + const BackfillInterval& replica_backfill_info, + const BackfillInterval& local_backfill_info) +{ + return replica_backfill_info.empty() && \ + replica_backfill_info.begin <= local_backfill_info.begin && \ + !replica_backfill_info.extends_to_end(); +} + +BackfillState::ReplicasScanning::ReplicasScanning(my_context ctx) + : my_base(ctx) +{ + for (const auto& bt : peering_state().get_backfill_targets()) { + if (const auto& pbi = backfill_state().peer_backfill_info.at(bt); + replica_needs_scan(pbi, backfill_state().backfill_info)) { + logger().debug("{}: scanning peer osd.{} from {}", + __func__, bt, pbi.end); + backfill_listener().request_replica_scan(bt, pbi.end, hobject_t{}); + + 
ceph_assert(waiting_on_backfill.find(bt) == \ + waiting_on_backfill.end()); + waiting_on_backfill.insert(bt); + } + } + ceph_assert(!waiting_on_backfill.empty()); + // TODO: start_recovery_op(hobject_t::get_max()); // XXX: was pbi.end +} + +#if 0 +BackfillState::ReplicasScanning::~ReplicasScanning() +{ + // TODO: finish_recovery_op(hobject_t::get_max()); +} +#endif + +boost::statechart::result +BackfillState::ReplicasScanning::react(ReplicaScanned evt) +{ + logger().debug("{}: got scan result from osd={}, result={}", + __func__, evt.from, evt.result); + // TODO: maybe we'll be able to move waiting_on_backfill from + // the machine to the state. + ceph_assert(peering_state().is_backfill_target(evt.from)); + if (waiting_on_backfill.erase(evt.from)) { + backfill_state().peer_backfill_info[evt.from] = std::move(evt.result); + if (waiting_on_backfill.empty()) { + ceph_assert(backfill_state().peer_backfill_info.size() == \ + peering_state().get_backfill_targets().size()); + return transit<Enqueuing>(); + } + } else { + // we canceled backfill for a while due to a too full, and this + // is an extra response from a non-too-full peer + logger().debug("{}: canceled backfill (too full?)", __func__); + } + return discard_event(); +} + +boost::statechart::result +BackfillState::ReplicasScanning::react(ObjectPushed evt) +{ + logger().debug("ReplicasScanning::react() on ObjectPushed; evt.object={}", + evt.object); + backfill_state().progress_tracker->complete_to(evt.object, evt.stat); + return discard_event(); +} + + +// -- Waiting +BackfillState::Waiting::Waiting(my_context ctx) + : my_base(ctx) +{ +} + +boost::statechart::result +BackfillState::Waiting::react(ObjectPushed evt) +{ + logger().debug("Waiting::react() on ObjectPushed; evt.object={}", + evt.object); + backfill_state().progress_tracker->complete_to(evt.object, evt.stat); + if (!Enqueuing::all_enqueued(peering_state(), + backfill_state().backfill_info, + backfill_state().peer_backfill_info)) { + return transit<Enqueuing>(); + } else if (backfill_state().progress_tracker->tracked_objects_completed()) { + return transit<Done>(); + } else { + // we still have something to wait on + logger().debug("Waiting::react() on ObjectPushed; still waiting"); + return discard_event(); + } +} + +// -- Done +BackfillState::Done::Done(my_context ctx) + : my_base(ctx) +{ + logger().info("{}: backfill is done", __func__); + backfill_listener().backfilled(); +} + +// -- Crashed +BackfillState::Crashed::Crashed() +{ + ceph_abort_msg("{}: this should not happen"); +} + +// ProgressTracker is an intermediary between the BackfillListener and +// BackfillMachine + its states. All requests to push or drop an object +// are directed through it. The same happens with notifications about +// completing given operations which are generated by BackfillListener +// and dispatched as i.e. ObjectPushed events. +// This allows ProgressTacker to track the list of in-flight operations +// which is essential to make the decision whether the entire machine +// should switch from Waiting to Done keep in Waiting. +// ProgressTracker also coordinates .last_backfill_started and stats +// updates. 
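The gating condition spelled out in that comment can be reduced to a single predicate over two inputs. The snippet below is an editorial sketch, not part of the commit: `machine_may_finish` is a made-up helper standing in for the combination of `Enqueuing::all_enqueued()` and `ProgressTracker::tracked_objects_completed()` that `Waiting::react(ObjectPushed)` evaluates.

```cpp
// Sketch of the Waiting -> Done decision rule described above (hypothetical
// helper name; the real code evaluates the two functions directly).
constexpr bool machine_may_finish(bool all_enqueued,
                                  bool tracked_objects_completed) {
  // Done only when nothing is left to enqueue *and* nothing is in flight.
  return all_enqueued && tracked_objects_completed;
}

static_assert(!machine_may_finish(true,  false)); // pushes/drops still in flight
static_assert(!machine_may_finish(false, true));  // more objects to enqueue
static_assert( machine_may_finish(true,  true));  // safe to call backfilled()
```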
+bool BackfillState::ProgressTracker::tracked_objects_completed() const +{ + return registry.empty(); +} + +bool BackfillState::ProgressTracker::enqueue_push(const hobject_t& obj) +{ + [[maybe_unused]] const auto [it, first_seen] = registry.try_emplace( + obj, registry_item_t{op_stage_t::enqueued_push, std::nullopt}); + return first_seen; +} + +void BackfillState::ProgressTracker::enqueue_drop(const hobject_t& obj) +{ + registry.try_emplace( + obj, registry_item_t{op_stage_t::enqueued_drop, pg_stat_t{}}); +} + +void BackfillState::ProgressTracker::complete_to( + const hobject_t& obj, + const pg_stat_t& stats) +{ + logger().debug("{}: obj={}", + __func__, obj); + if (auto completion_iter = registry.find(obj); + completion_iter != std::end(registry)) { + completion_iter->second = \ + registry_item_t{ op_stage_t::completed_push, stats }; + } else { + ceph_abort_msg("completing untracked object shall not happen"); + } + for (auto it = std::begin(registry); + it != std::end(registry) && + it->second.stage != op_stage_t::enqueued_push; + it = registry.erase(it)) { + auto& [soid, item] = *it; + assert(item.stats); + peering_state().update_complete_backfill_object_stats( + soid, + *item.stats); + } + if (Enqueuing::all_enqueued(peering_state(), + backfill_state().backfill_info, + backfill_state().peer_backfill_info) && + tracked_objects_completed()) { + backfill_state().last_backfill_started = hobject_t::get_max(); + backfill_listener().update_peers_last_backfill(hobject_t::get_max()); + } else { + backfill_listener().update_peers_last_backfill(obj); + } +} + +} // namespace crimson::osd diff --git a/src/crimson/osd/backfill_state.h b/src/crimson/osd/backfill_state.h new file mode 100644 index 000000000..4bd2991fb --- /dev/null +++ b/src/crimson/osd/backfill_state.h @@ -0,0 +1,382 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <optional> + +#include <boost/statechart/custom_reaction.hpp> +#include <boost/statechart/event.hpp> +#include <boost/statechart/event_base.hpp> +#include <boost/statechart/simple_state.hpp> +#include <boost/statechart/state.hpp> +#include <boost/statechart/state_machine.hpp> +#include <boost/statechart/transition.hpp> + +#include "osd/recovery_types.h" + +namespace crimson::osd { + +namespace sc = boost::statechart; + +struct BackfillState { + struct BackfillListener; + struct PeeringFacade; + struct PGFacade; + + // events comes first + struct PrimaryScanned : sc::event<PrimaryScanned> { + BackfillInterval result; + PrimaryScanned(BackfillInterval&& result) + : result(std::move(result)) { + } + }; + + struct ReplicaScanned : sc::event<ReplicaScanned> { + pg_shard_t from; + BackfillInterval result; + ReplicaScanned(pg_shard_t from, BackfillInterval&& result) + : from(std::move(from)), + result(std::move(result)) { + } + }; + + struct ObjectPushed : sc::event<ObjectPushed> { + // TODO: implement replica management; I don't want to follow + // current convention where the backend layer is responsible + // for tracking replicas. 
+ hobject_t object; + pg_stat_t stat; + ObjectPushed(hobject_t object) + : object(std::move(object)) { + } + }; + + struct Triggered : sc::event<Triggered> { + }; + +private: + // internal events + struct RequestPrimaryScanning : sc::event<RequestPrimaryScanning> { + }; + + struct RequestReplicasScanning : sc::event<RequestReplicasScanning> { + }; + + struct RequestWaiting : sc::event<RequestWaiting> { + }; + + struct RequestDone : sc::event<RequestDone> { + }; + + class ProgressTracker; + +public: + + struct Initial; + struct Enqueuing; + struct PrimaryScanning; + struct ReplicasScanning; + struct Waiting; + struct Done; + + struct BackfillMachine : sc::state_machine<BackfillMachine, Initial> { + BackfillMachine(BackfillState& backfill_state, + BackfillListener& backfill_listener, + std::unique_ptr<PeeringFacade> peering_state, + std::unique_ptr<PGFacade> pg); + ~BackfillMachine(); + BackfillState& backfill_state; + BackfillListener& backfill_listener; + std::unique_ptr<PeeringFacade> peering_state; + std::unique_ptr<PGFacade> pg; + }; + +private: + template <class S> + struct StateHelper { + StateHelper(); + ~StateHelper(); + + BackfillState& backfill_state() { + return static_cast<S*>(this) \ + ->template context<BackfillMachine>().backfill_state; + } + BackfillListener& backfill_listener() { + return static_cast<S*>(this) \ + ->template context<BackfillMachine>().backfill_listener; + } + PeeringFacade& peering_state() { + return *static_cast<S*>(this) \ + ->template context<BackfillMachine>().peering_state; + } + PGFacade& pg() { + return *static_cast<S*>(this)->template context<BackfillMachine>().pg; + } + + const PeeringFacade& peering_state() const { + return *static_cast<const S*>(this) \ + ->template context<BackfillMachine>().peering_state; + } + const BackfillState& backfill_state() const { + return static_cast<const S*>(this) \ + ->template context<BackfillMachine>().backfill_state; + } + }; + +public: + + // states + struct Crashed : sc::simple_state<Crashed, BackfillMachine>, + StateHelper<Crashed> { + explicit Crashed(); + }; + + struct Initial : sc::state<Initial, BackfillMachine>, + StateHelper<Initial> { + using reactions = boost::mpl::list< + sc::custom_reaction<Triggered>, + sc::transition<sc::event_base, Crashed>>; + explicit Initial(my_context); + // initialize after triggering backfill by on_activate_complete(). + // transit to Enqueuing. + sc::result react(const Triggered&); + }; + + struct Enqueuing : sc::state<Enqueuing, BackfillMachine>, + StateHelper<Enqueuing> { + using reactions = boost::mpl::list< + sc::transition<RequestPrimaryScanning, PrimaryScanning>, + sc::transition<RequestReplicasScanning, ReplicasScanning>, + sc::transition<RequestWaiting, Waiting>, + sc::transition<RequestDone, Done>, + sc::transition<sc::event_base, Crashed>>; + explicit Enqueuing(my_context); + + // indicate whether there is any remaining work to do when it comes + // to comparing the hobject_t namespace between primary and replicas. + // true doesn't necessarily mean backfill is done -- there could be + // in-flight pushes or drops which had been enqueued but aren't + // completed yet. + static bool all_enqueued( + const PeeringFacade& peering_state, + const BackfillInterval& backfill_info, + const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info); + + private: + void maybe_update_range(); + void trim_backfill_infos(); + + // these methods take BackfillIntervals instead of extracting them from + // the state to emphasize the relationships across the main loop. 
+ bool all_emptied( + const BackfillInterval& local_backfill_info, + const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info) const; + hobject_t earliest_peer_backfill( + const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info) const; + bool should_rescan_replicas( + const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info, + const BackfillInterval& backfill_info) const; + // indicate whether a particular acting primary needs to scanned again + // to process next piece of the hobject_t's namespace. + // the logic is per analogy to replica_needs_scan(). See comments there. + bool should_rescan_primary( + const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info, + const BackfillInterval& backfill_info) const; + + // the result_t is intermediary between {remove,update}_on_peers() and + // updating BackfillIntervals in trim_backfilled_object_from_intervals. + // This step is important because it affects the main loop's condition, + // and thus deserves to be exposed instead of being called deeply from + // {remove,update}_on_peers(). + struct [[nodiscard]] result_t { + std::set<pg_shard_t> pbi_targets; + hobject_t new_last_backfill_started; + }; + void trim_backfilled_object_from_intervals( + result_t&&, + hobject_t& last_backfill_started, + std::map<pg_shard_t, BackfillInterval>& peer_backfill_info); + result_t remove_on_peers(const hobject_t& check); + result_t update_on_peers(const hobject_t& check); + }; + + struct PrimaryScanning : sc::state<PrimaryScanning, BackfillMachine>, + StateHelper<PrimaryScanning> { + using reactions = boost::mpl::list< + sc::custom_reaction<ObjectPushed>, + sc::custom_reaction<PrimaryScanned>, + sc::transition<sc::event_base, Crashed>>; + explicit PrimaryScanning(my_context); + sc::result react(ObjectPushed); + // collect scanning result and transit to Enqueuing. + sc::result react(PrimaryScanned); + }; + + struct ReplicasScanning : sc::state<ReplicasScanning, BackfillMachine>, + StateHelper<ReplicasScanning> { + using reactions = boost::mpl::list< + sc::custom_reaction<ObjectPushed>, + sc::custom_reaction<ReplicaScanned>, + sc::transition<sc::event_base, Crashed>>; + explicit ReplicasScanning(my_context); + // collect scanning result; if all results are collected, transition + // to Enqueuing will happen. + sc::result react(ObjectPushed); + sc::result react(ReplicaScanned); + + // indicate whether a particular peer should be scanned to retrieve + // BackfillInterval for new range of hobject_t namespace. + // true when bi.objects is exhausted, replica bi's end is not MAX, + // and primary bi'begin is further than the replica's one. 
+ static bool replica_needs_scan( + const BackfillInterval& replica_backfill_info, + const BackfillInterval& local_backfill_info); + + private: + std::set<pg_shard_t> waiting_on_backfill; + }; + + struct Waiting : sc::state<Waiting, BackfillMachine>, + StateHelper<Waiting> { + using reactions = boost::mpl::list< + sc::custom_reaction<ObjectPushed>, + sc::transition<sc::event_base, Crashed>>; + explicit Waiting(my_context); + sc::result react(ObjectPushed); + }; + + struct Done : sc::state<Done, BackfillMachine>, + StateHelper<Done> { + using reactions = boost::mpl::list< + sc::transition<sc::event_base, Crashed>>; + explicit Done(my_context); + }; + + BackfillState(BackfillListener& backfill_listener, + std::unique_ptr<PeeringFacade> peering_state, + std::unique_ptr<PGFacade> pg); + ~BackfillState(); + + void process_event( + boost::intrusive_ptr<const sc::event_base> evt) { + backfill_machine.process_event(*std::move(evt)); + } + + hobject_t get_last_backfill_started() const { + return last_backfill_started; + } +private: + hobject_t last_backfill_started; + BackfillInterval backfill_info; + std::map<pg_shard_t, BackfillInterval> peer_backfill_info; + BackfillMachine backfill_machine; + std::unique_ptr<ProgressTracker> progress_tracker; +}; + +// BackfillListener -- an interface used by the backfill FSM to request +// low-level services like issueing `MOSDPGPush` or `MOSDPGBackfillRemove`. +// The goals behind the interface are: 1) unittestability; 2) possibility +// to retrofit classical OSD with BackfillState. For the second reason we +// never use `seastar::future` -- instead responses to the requests are +// conveyed as events; see ObjectPushed as an example. +struct BackfillState::BackfillListener { + virtual void request_replica_scan( + const pg_shard_t& target, + const hobject_t& begin, + const hobject_t& end) = 0; + + virtual void request_primary_scan( + const hobject_t& begin) = 0; + + virtual void enqueue_push( + const hobject_t& obj, + const eversion_t& v) = 0; + + virtual void enqueue_drop( + const pg_shard_t& target, + const hobject_t& obj, + const eversion_t& v) = 0; + + virtual void maybe_flush() = 0; + + virtual void update_peers_last_backfill( + const hobject_t& new_last_backfill) = 0; + + virtual bool budget_available() const = 0; + + virtual void backfilled() = 0; + + virtual ~BackfillListener() = default; +}; + +// PeeringFacade -- a facade (in the GoF-defined meaning) simplifying +// the interface of PeeringState. The motivation is to have an inventory +// of behaviour that must be provided by a unit test's mock. +struct BackfillState::PeeringFacade { + virtual hobject_t earliest_backfill() const = 0; + virtual const std::set<pg_shard_t>& get_backfill_targets() const = 0; + virtual const hobject_t& get_peer_last_backfill(pg_shard_t peer) const = 0; + virtual const eversion_t& get_last_update() const = 0; + virtual const eversion_t& get_log_tail() const = 0; + + // the performance impact of `std::function` has not been considered yet. + // If there is any proof (from e.g. profiling) about its significance, we + // can switch back to the template variant. 
+ using scan_log_func_t = std::function<void(const pg_log_entry_t&)>; + virtual void scan_log_after(eversion_t, scan_log_func_t) const = 0; + + virtual bool is_backfill_target(pg_shard_t peer) const = 0; + virtual void update_complete_backfill_object_stats(const hobject_t &hoid, + const pg_stat_t &stats) = 0; + virtual bool is_backfilling() const = 0; + virtual ~PeeringFacade() {} +}; + +// PGFacade -- a facade (in the GoF-defined meaning) simplifying the huge +// interface of crimson's PG class. The motivation is to have an inventory +// of behaviour that must be provided by a unit test's mock. +struct BackfillState::PGFacade { + virtual const eversion_t& get_projected_last_update() const = 0; + virtual ~PGFacade() {} +}; + +class BackfillState::ProgressTracker { + // TODO: apply_stat, + enum class op_stage_t { + enqueued_push, + enqueued_drop, + completed_push, + }; + + struct registry_item_t { + op_stage_t stage; + std::optional<pg_stat_t> stats; + }; + + BackfillMachine& backfill_machine; + std::map<hobject_t, registry_item_t> registry; + + BackfillState& backfill_state() { + return backfill_machine.backfill_state; + } + PeeringFacade& peering_state() { + return *backfill_machine.peering_state; + } + BackfillListener& backfill_listener() { + return backfill_machine.backfill_listener; + } + +public: + ProgressTracker(BackfillMachine& backfill_machine) + : backfill_machine(backfill_machine) { + } + + bool tracked_objects_completed() const; + + bool enqueue_push(const hobject_t&); + void enqueue_drop(const hobject_t&); + void complete_to(const hobject_t&, const pg_stat_t&); +}; + +} // namespace crimson::osd diff --git a/src/crimson/osd/ec_backend.cc b/src/crimson/osd/ec_backend.cc new file mode 100644 index 000000000..d555d6cdc --- /dev/null +++ b/src/crimson/osd/ec_backend.cc @@ -0,0 +1,37 @@ +#include "ec_backend.h" + +#include "crimson/osd/shard_services.h" + +ECBackend::ECBackend(shard_id_t shard, + ECBackend::CollectionRef coll, + crimson::osd::ShardServices& shard_services, + const ec_profile_t&, + uint64_t, + DoutPrefixProvider &dpp) + : PGBackend{shard, coll, shard_services, dpp} +{ + // todo +} + +ECBackend::ll_read_ierrorator::future<ceph::bufferlist> +ECBackend::_read(const hobject_t& hoid, + const uint64_t off, + const uint64_t len, + const uint32_t flags) +{ + // todo + return seastar::make_ready_future<bufferlist>(); +} + +ECBackend::rep_op_fut_t +ECBackend::_submit_transaction(std::set<pg_shard_t>&& pg_shards, + const hobject_t& hoid, + ceph::os::Transaction&& txn, + osd_op_params_t&& osd_op_p, + epoch_t min_epoch, epoch_t max_epoch, + std::vector<pg_log_entry_t>&& log_entries) +{ + // todo + return {seastar::now(), + seastar::make_ready_future<crimson::osd::acked_peers_t>()}; +} diff --git a/src/crimson/osd/ec_backend.h b/src/crimson/osd/ec_backend.h new file mode 100644 index 000000000..3dbcc4def --- /dev/null +++ b/src/crimson/osd/ec_backend.h @@ -0,0 +1,41 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <boost/intrusive_ptr.hpp> +#include <seastar/core/future.hh> +#include "include/buffer_fwd.h" +#include "osd/osd_types.h" +#include "pg_backend.h" + +class ECBackend : public PGBackend +{ +public: + ECBackend(shard_id_t shard, + CollectionRef coll, + crimson::osd::ShardServices& shard_services, + const ec_profile_t& ec_profile, + uint64_t stripe_width, + DoutPrefixProvider &dpp); + seastar::future<> stop() final { + return seastar::now(); + } + void on_actingset_changed(bool same_primary) 
final {} +private: + ll_read_ierrorator::future<ceph::bufferlist> + _read(const hobject_t& hoid, uint64_t off, uint64_t len, uint32_t flags) override; + rep_op_fut_t + _submit_transaction(std::set<pg_shard_t>&& pg_shards, + const hobject_t& hoid, + ceph::os::Transaction&& txn, + osd_op_params_t&& req, + epoch_t min_epoch, epoch_t max_epoch, + std::vector<pg_log_entry_t>&& log_entries) final; + CollectionRef coll; + crimson::os::FuturizedStore::Shard* store; + seastar::future<> request_committed(const osd_reqid_t& reqid, + const eversion_t& version) final { + return seastar::now(); + } +}; diff --git a/src/crimson/osd/exceptions.h b/src/crimson/osd/exceptions.h new file mode 100644 index 000000000..2783ed252 --- /dev/null +++ b/src/crimson/osd/exceptions.h @@ -0,0 +1,46 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <exception> +#include <system_error> + +#include "crimson/common/errorator.h" + +namespace crimson::osd { +class error : private std::system_error { +public: + error(const std::errc ec) + : system_error(std::make_error_code(ec)) { + } + + using system_error::code; + using system_error::what; + + friend error make_error(int ret); + +private: + error(const int ret) noexcept + : system_error(ret, std::system_category()) { + } +}; + +inline error make_error(const int ret) { + return error{ret}; +} + +struct object_not_found : public error { + object_not_found() : error(std::errc::no_such_file_or_directory) {} +}; + +struct invalid_argument : public error { + invalid_argument() : error(std::errc::invalid_argument) {} +}; + +// FIXME: error handling +struct permission_denied : public error { + permission_denied() : error(std::errc::operation_not_permitted) {} +}; + +} // namespace crimson::osd diff --git a/src/crimson/osd/heartbeat.cc b/src/crimson/osd/heartbeat.cc new file mode 100644 index 000000000..266e56533 --- /dev/null +++ b/src/crimson/osd/heartbeat.cc @@ -0,0 +1,819 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "heartbeat.h" + +#include <boost/range/join.hpp> +#include <fmt/chrono.h> +#include <fmt/os.h> + +#include "messages/MOSDPing.h" +#include "messages/MOSDFailure.h" + +#include "crimson/common/config_proxy.h" +#include "crimson/common/formatter.h" +#include "crimson/net/Connection.h" +#include "crimson/net/Messenger.h" +#include "crimson/osd/shard_services.h" +#include "crimson/mon/MonClient.h" + +#include "osd/OSDMap.h" + +using std::set; +using std::string; +using crimson::common::local_conf; + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_osd); + } +} + +Heartbeat::Heartbeat(osd_id_t whoami, + crimson::osd::ShardServices& service, + crimson::mon::Client& monc, + crimson::net::Messenger &front_msgr, + crimson::net::Messenger &back_msgr) + : whoami{whoami}, + service{service}, + monc{monc}, + front_msgr{front_msgr}, + back_msgr{back_msgr}, + // do this in background + timer{[this] { + heartbeat_check(); + (void)send_heartbeats(); + }}, + failing_peers{*this} +{} + +seastar::future<> Heartbeat::start(entity_addrvec_t front_addrs, + entity_addrvec_t back_addrs) +{ + logger().info("heartbeat: start front_addrs={}, back_addrs={}", + front_addrs, back_addrs); + // i only care about the address, so any unused port would work + for (auto& addr : boost::join(front_addrs.v, back_addrs.v)) { + addr.set_port(0); + } + + using crimson::net::SocketPolicy; + 
front_msgr.set_policy(entity_name_t::TYPE_OSD, + SocketPolicy::lossy_client(0)); + back_msgr.set_policy(entity_name_t::TYPE_OSD, + SocketPolicy::lossy_client(0)); + return seastar::when_all_succeed(start_messenger(front_msgr, + front_addrs), + start_messenger(back_msgr, + back_addrs)) + .then_unpack([this] { + timer.arm_periodic( + std::chrono::seconds(local_conf()->osd_heartbeat_interval)); + }); +} + +seastar::future<> +Heartbeat::start_messenger(crimson::net::Messenger& msgr, + const entity_addrvec_t& addrs) +{ + return msgr.bind(addrs).safe_then([this, &msgr]() mutable { + return msgr.start({this}); + }, crimson::net::Messenger::bind_ertr::all_same_way( + [addrs] (const std::error_code& e) { + logger().error("heartbeat messenger bind({}): {}", addrs, e); + ceph_abort(); + })); +} + +seastar::future<> Heartbeat::stop() +{ + logger().info("{}", __func__); + timer.cancel(); + front_msgr.stop(); + back_msgr.stop(); + return gate.close().then([this] { + return seastar::when_all_succeed(front_msgr.shutdown(), + back_msgr.shutdown()); + }).then_unpack([] { + return seastar::now(); + }); +} + +const entity_addrvec_t& Heartbeat::get_front_addrs() const +{ + return front_msgr.get_myaddrs(); +} + +const entity_addrvec_t& Heartbeat::get_back_addrs() const +{ + return back_msgr.get_myaddrs(); +} + +crimson::net::Messenger& Heartbeat::get_front_msgr() const +{ + return front_msgr; +} + +crimson::net::Messenger& Heartbeat::get_back_msgr() const +{ + return back_msgr; +} + +void Heartbeat::add_peer(osd_id_t _peer, epoch_t epoch) +{ + assert(whoami != _peer); + auto [iter, added] = peers.try_emplace(_peer, *this, _peer); + auto& peer = iter->second; + peer.set_epoch_added(epoch); +} + +Heartbeat::osds_t Heartbeat::remove_down_peers() +{ + osds_t old_osds; // osds not added in this epoch + for (auto i = peers.begin(); i != peers.end(); ) { + auto osdmap = service.get_map(); + const auto& [osd, peer] = *i; + if (!osdmap->is_up(osd)) { + i = peers.erase(i); + } else { + if (peer.get_epoch_added() < osdmap->get_epoch()) { + old_osds.push_back(osd); + } + ++i; + } + } + return old_osds; +} + +void Heartbeat::add_reporter_peers(int whoami) +{ + auto osdmap = service.get_map(); + // include next and previous up osds to ensure we have a fully-connected set + set<int> want; + if (auto next = osdmap->get_next_up_osd_after(whoami); next >= 0) { + want.insert(next); + } + if (auto prev = osdmap->get_previous_up_osd_before(whoami); prev >= 0) { + want.insert(prev); + } + // make sure we have at least **min_down** osds coming from different + // subtree level (e.g., hosts) for fast failure detection. + auto min_down = local_conf().get_val<uint64_t>("mon_osd_min_down_reporters"); + auto subtree = local_conf().get_val<string>("mon_osd_reporter_subtree_level"); + osdmap->get_random_up_osds_by_subtree( + whoami, subtree, min_down, want, &want); + auto epoch = osdmap->get_epoch(); + for (int osd : want) { + add_peer(osd, epoch); + }; +} + +void Heartbeat::update_peers(int whoami) +{ + const auto min_peers = static_cast<size_t>( + local_conf().get_val<int64_t>("osd_heartbeat_min_peers")); + add_reporter_peers(whoami); + auto extra = remove_down_peers(); + // too many? + for (auto& osd : extra) { + if (peers.size() <= min_peers) { + break; + } + remove_peer(osd); + } + // or too few? 
+ auto osdmap = service.get_map(); + auto epoch = osdmap->get_epoch(); + for (auto next = osdmap->get_next_up_osd_after(whoami); + peers.size() < min_peers && next >= 0 && next != whoami; + next = osdmap->get_next_up_osd_after(next)) { + add_peer(next, epoch); + } +} + +Heartbeat::osds_t Heartbeat::get_peers() const +{ + osds_t osds; + osds.reserve(peers.size()); + for (auto& peer : peers) { + osds.push_back(peer.first); + } + return osds; +} + +void Heartbeat::remove_peer(osd_id_t peer) +{ + assert(peers.count(peer) == 1); + peers.erase(peer); +} + +std::optional<seastar::future<>> +Heartbeat::ms_dispatch(crimson::net::ConnectionRef conn, MessageRef m) +{ + bool dispatched = true; + gate.dispatch_in_background(__func__, *this, [this, conn, &m, &dispatched] { + switch (m->get_type()) { + case MSG_OSD_PING: + return handle_osd_ping(conn, boost::static_pointer_cast<MOSDPing>(m)); + default: + dispatched = false; + return seastar::now(); + } + }); + return (dispatched ? std::make_optional(seastar::now()) : std::nullopt); +} + +void Heartbeat::ms_handle_reset(crimson::net::ConnectionRef conn, bool is_replace) +{ + auto peer = conn->get_peer_id(); + if (conn->get_peer_type() != entity_name_t::TYPE_OSD || + peer == entity_name_t::NEW) { + return; + } + if (auto found = peers.find(peer); + found != peers.end()) { + found->second.handle_reset(conn, is_replace); + } +} + +void Heartbeat::ms_handle_connect( + crimson::net::ConnectionRef conn, + seastar::shard_id prv_shard) +{ + ceph_assert_always(seastar::this_shard_id() == prv_shard); + auto peer = conn->get_peer_id(); + if (conn->get_peer_type() != entity_name_t::TYPE_OSD || + peer == entity_name_t::NEW) { + return; + } + if (auto found = peers.find(peer); + found != peers.end()) { + found->second.handle_connect(conn); + } +} + +void Heartbeat::ms_handle_accept( + crimson::net::ConnectionRef conn, + seastar::shard_id prv_shard, + bool is_replace) +{ + ceph_assert_always(seastar::this_shard_id() == prv_shard); + auto peer = conn->get_peer_id(); + if (conn->get_peer_type() != entity_name_t::TYPE_OSD || + peer == entity_name_t::NEW) { + return; + } + if (auto found = peers.find(peer); + found != peers.end()) { + found->second.handle_accept(conn, is_replace); + } +} + +seastar::future<> Heartbeat::handle_osd_ping(crimson::net::ConnectionRef conn, + Ref<MOSDPing> m) +{ + switch (m->op) { + case MOSDPing::PING: + return handle_ping(conn, m); + case MOSDPing::PING_REPLY: + return handle_reply(conn, m); + case MOSDPing::YOU_DIED: + return handle_you_died(); + default: + return seastar::now(); + } +} + +seastar::future<> Heartbeat::handle_ping(crimson::net::ConnectionRef conn, + Ref<MOSDPing> m) +{ + auto min_message = static_cast<uint32_t>( + local_conf()->osd_heartbeat_min_size); + auto reply = + crimson::make_message<MOSDPing>( + m->fsid, + service.get_map()->get_epoch(), + MOSDPing::PING_REPLY, + m->ping_stamp, + m->mono_ping_stamp, + service.get_mnow(), + service.get_up_epoch(), + min_message); + return conn->send(std::move(reply) + ).then([this, m, conn] { + return maybe_share_osdmap(conn, m); + }); +} + +seastar::future<> Heartbeat::maybe_share_osdmap( + crimson::net::ConnectionRef conn, + Ref<MOSDPing> m) +{ + const osd_id_t from = m->get_source().num(); + const epoch_t current_osdmap_epoch = service.get_map()->get_epoch(); + auto found = peers.find(from); + if (found == peers.end()) { + return seastar::now(); + } + auto& peer = found->second; + + if (m->map_epoch > peer.get_projected_epoch()) { + logger().debug("{} updating peer {} session's 
projected_epoch" + "from {} to ping map epoch of {}", + __func__, from, peer.get_projected_epoch(), + m->map_epoch); + peer.set_projected_epoch(m->map_epoch); + } + + if (current_osdmap_epoch <= peer.get_projected_epoch()) { + logger().debug("{} peer {} projected_epoch {} is already later " + "than our osdmap epoch of {}", + __func__ , from, peer.get_projected_epoch(), + current_osdmap_epoch); + return seastar::now(); + } + + const epoch_t send_from = peer.get_projected_epoch(); + logger().debug("{} sending peer {} peer maps from projected epoch {} through " + "local osdmap epoch {}", + __func__, + from, + send_from, + current_osdmap_epoch); + peer.set_projected_epoch(current_osdmap_epoch); + return service.send_incremental_map_to_osd(from, send_from); +} + +seastar::future<> Heartbeat::handle_reply(crimson::net::ConnectionRef conn, + Ref<MOSDPing> m) +{ + const osd_id_t from = m->get_source().num(); + auto found = peers.find(from); + if (found == peers.end()) { + // stale reply + return seastar::now(); + } + auto& peer = found->second; + return peer.handle_reply(conn, m + ).then([this, conn, m] { + return maybe_share_osdmap(conn, m); + }); +} + +seastar::future<> Heartbeat::handle_you_died() +{ + // TODO: ask for newer osdmap + return seastar::now(); +} + +void Heartbeat::heartbeat_check() +{ + failure_queue_t failure_queue; + const auto now = clock::now(); + for (const auto& [osd, peer] : peers) { + auto failed_since = peer.failed_since(now); + if (!clock::is_zero(failed_since)) { + failure_queue.emplace(osd, failed_since); + } + } + if (!failure_queue.empty()) { + // send_failures can run in background, because + // 1. After the execution of send_failures, no msg is actually + // sent, which means the sending operation is not done, + // which further seems to involve problems risks that when + // osd shuts down, the left part of the sending operation + // may reference OSD and Heartbeat instances that are already + // deleted. However, remaining work of that sending operation + // involves no reference back to OSD or Heartbeat instances, + // which means it wouldn't involve the above risks. + // 2. messages are sent in order, if later checks find out + // the previous "failed" peers to be healthy, that "still + // alive" messages would be sent after the previous "osd + // failure" messages which is totally safe. 
+ (void)send_failures(std::move(failure_queue)); + } +} + +seastar::future<> Heartbeat::send_heartbeats() +{ + const auto mnow = service.get_mnow(); + const auto now = clock::now(); + + std::vector<seastar::future<>> futures; + for (auto& [osd, peer] : peers) { + peer.send_heartbeat(now, mnow, futures); + } + return seastar::when_all_succeed(futures.begin(), futures.end()); +} + +seastar::future<> Heartbeat::send_failures(failure_queue_t&& failure_queue) +{ + std::vector<seastar::future<>> futures; + const auto now = clock::now(); + for (auto [osd, failed_since] : failure_queue) { + failing_peers.add_pending(osd, failed_since, now, futures); + } + + return seastar::when_all_succeed(futures.begin(), futures.end()); +} + +void Heartbeat::print(std::ostream& out) const +{ + out << "heartbeat"; +} + +Heartbeat::Connection::~Connection() +{ + if (conn) { + conn->mark_down(); + } +} + +bool Heartbeat::Connection::matches(crimson::net::ConnectionRef _conn) const +{ + return (conn && conn == _conn); +} + +bool Heartbeat::Connection::accepted( + crimson::net::ConnectionRef accepted_conn, + bool is_replace) +{ + ceph_assert(accepted_conn); + ceph_assert(accepted_conn != conn); + if (accepted_conn->get_peer_addr() != listener.get_peer_addr(type)) { + return false; + } + + if (is_replace) { + logger().info("Heartbeat::Connection::accepted(): " + "{} racing", *this); + racing_detected = true; + } + if (conn) { + // there is no assumption about the ordering of the reset and accept + // events for the 2 racing connections. + if (is_connected) { + logger().warn("Heartbeat::Connection::accepted(): " + "{} is accepted while connected, is_replace={}", + *this, is_replace); + conn->mark_down(); + set_unconnected(); + } + } + conn = accepted_conn; + set_connected(); + return true; +} + +void Heartbeat::Connection::reset(bool is_replace) +{ + if (is_replace) { + logger().info("Heartbeat::Connection::reset(): " + "{} racing, waiting for the replacing accept", + *this); + racing_detected = true; + } + + if (is_connected) { + set_unconnected(); + } else { + conn = nullptr; + } + + if (is_replace) { + // waiting for the replacing accept event + } else if (!racing_detected || is_winner_side) { + connect(); + } else { // racing_detected && !is_winner_side + logger().info("Heartbeat::Connection::reset(): " + "{} racing detected and lose, " + "waiting for peer connect me", *this); + } +} + +seastar::future<> Heartbeat::Connection::send(MessageURef msg) +{ + assert(is_connected); + return conn->send(std::move(msg)); +} + +void Heartbeat::Connection::validate() +{ + assert(is_connected); + auto peer_addr = listener.get_peer_addr(type); + if (conn->get_peer_addr() != peer_addr) { + logger().info("Heartbeat::Connection::validate(): " + "{} has new address {} over {}, reset", + *this, peer_addr, conn->get_peer_addr()); + conn->mark_down(); + racing_detected = false; + reset(); + } +} + +void Heartbeat::Connection::retry() +{ + racing_detected = false; + if (!is_connected) { + if (conn) { + conn->mark_down(); + reset(); + } else { + connect(); + } + } +} + +void Heartbeat::Connection::set_connected() +{ + assert(conn); + assert(!is_connected); + ceph_assert(conn->is_connected()); + is_connected = true; + listener.increase_connected(); +} + +void Heartbeat::Connection::set_unconnected() +{ + assert(conn); + assert(is_connected); + conn = nullptr; + is_connected = false; + listener.decrease_connected(); +} + +void Heartbeat::Connection::connect() +{ + assert(!conn); + auto addr = listener.get_peer_addr(type); + conn = 
msgr.connect(addr, entity_name_t(CEPH_ENTITY_TYPE_OSD, peer)); + if (conn->is_connected()) { + set_connected(); + } +} + +Heartbeat::clock::time_point +Heartbeat::Session::failed_since(Heartbeat::clock::time_point now) const +{ + if (do_health_screen(now) == health_state::UNHEALTHY) { + auto oldest_deadline = ping_history.begin()->second.deadline; + auto failed_since = std::min(last_rx_back, last_rx_front); + if (clock::is_zero(failed_since)) { + logger().error("Heartbeat::Session::failed_since(): no reply from osd.{} " + "ever on either front or back, first ping sent {} " + "(oldest deadline {})", + peer, first_tx, oldest_deadline); + failed_since = first_tx; + } else { + logger().error("Heartbeat::Session::failed_since(): no reply from osd.{} " + "since back {} front {} (oldest deadline {})", + peer, last_rx_back, last_rx_front, oldest_deadline); + } + return failed_since; + } else { + return clock::zero(); + } +} + +void Heartbeat::Session::set_inactive_history(clock::time_point now) +{ + assert(!connected); + if (ping_history.empty()) { + const utime_t sent_stamp{now}; + const auto deadline = + now + std::chrono::seconds(local_conf()->osd_heartbeat_grace); + ping_history.emplace(sent_stamp, reply_t{deadline, 0}); + } else { // the entry is already added + assert(ping_history.size() == 1); + } +} + +Heartbeat::Peer::Peer(Heartbeat& heartbeat, osd_id_t peer) + : ConnectionListener(2), heartbeat{heartbeat}, peer{peer}, session{peer}, + con_front(peer, heartbeat.whoami > peer, Connection::type_t::front, + heartbeat.front_msgr, *this), + con_back(peer, heartbeat.whoami > peer, Connection::type_t::back, + heartbeat.back_msgr, *this) +{ + logger().info("Heartbeat::Peer: osd.{} added", peer); +} + +Heartbeat::Peer::~Peer() +{ + logger().info("Heartbeat::Peer: osd.{} removed", peer); +} + +void Heartbeat::Peer::send_heartbeat( + clock::time_point now, ceph::signedspan mnow, + std::vector<seastar::future<>>& futures) +{ + session.set_tx(now); + if (session.is_started()) { + do_send_heartbeat(now, mnow, &futures); + for_each_conn([] (auto& conn) { + conn.validate(); + }); + } else { + // we should send MOSDPing but still cannot at this moment + if (pending_send) { + // we have already pending for a entire heartbeat interval + logger().warn("Heartbeat::Peer::send_heartbeat(): " + "heartbeat to osd.{} is still pending...", peer); + for_each_conn([] (auto& conn) { + conn.retry(); + }); + } else { + logger().info("Heartbeat::Peer::send_heartbeat(): " + "heartbeat to osd.{} is pending send...", peer); + session.set_inactive_history(now); + pending_send = true; + } + } +} + +void Heartbeat::Peer::handle_reset( + crimson::net::ConnectionRef conn, bool is_replace) +{ + int cnt = 0; + for_each_conn([&] (auto& _conn) { + if (_conn.matches(conn)) { + ++cnt; + _conn.reset(is_replace); + } + }); + + if (cnt == 0) { + logger().info("Heartbeat::Peer::handle_reset(): {} ignores conn, is_replace={} -- {}", + *this, is_replace, *conn); + } else if (cnt > 1) { + logger().error("Heartbeat::Peer::handle_reset(): {} handles conn {} times -- {}", + *this, cnt, *conn); + } +} + +void Heartbeat::Peer::handle_connect(crimson::net::ConnectionRef conn) +{ + int cnt = 0; + for_each_conn([&] (auto& _conn) { + if (_conn.matches(conn)) { + ++cnt; + _conn.connected(); + } + }); + + if (cnt == 0) { + logger().error("Heartbeat::Peer::handle_connect(): {} ignores conn -- {}", + *this, *conn); + conn->mark_down(); + } else if (cnt > 1) { + logger().error("Heartbeat::Peer::handle_connect(): {} handles conn {} times -- {}", + *this, 
cnt, *conn); + } +} + +void Heartbeat::Peer::handle_accept(crimson::net::ConnectionRef conn, bool is_replace) +{ + int cnt = 0; + for_each_conn([&] (auto& _conn) { + if (_conn.accepted(conn, is_replace)) { + ++cnt; + } + }); + + if (cnt == 0) { + logger().warn("Heartbeat::Peer::handle_accept(): {} ignores conn -- {}", + *this, *conn); + } else if (cnt > 1) { + logger().error("Heartbeat::Peer::handle_accept(): {} handles conn {} times -- {}", + *this, cnt, *conn); + } +} + +seastar::future<> Heartbeat::Peer::handle_reply( + crimson::net::ConnectionRef conn, Ref<MOSDPing> m) +{ + if (!session.is_started()) { + // we haven't sent any ping yet + return seastar::now(); + } + type_t type; + if (con_front.matches(conn)) { + type = type_t::front; + } else if (con_back.matches(conn)) { + type = type_t::back; + } else { + return seastar::now(); + } + const auto now = clock::now(); + if (session.on_pong(m->ping_stamp, type, now)) { + if (session.do_health_screen(now) == Session::health_state::HEALTHY) { + return heartbeat.failing_peers.cancel_one(peer); + } + } + return seastar::now(); +} + +entity_addr_t Heartbeat::Peer::get_peer_addr(type_t type) +{ + const auto osdmap = heartbeat.service.get_map(); + if (type == type_t::front) { + return osdmap->get_hb_front_addrs(peer).front(); + } else { + return osdmap->get_hb_back_addrs(peer).front(); + } +} + +void Heartbeat::Peer::on_connected() +{ + logger().info("Heartbeat::Peer: osd.{} connected (send={})", + peer, pending_send); + session.on_connected(); + if (pending_send) { + pending_send = false; + do_send_heartbeat(clock::now(), heartbeat.service.get_mnow(), nullptr); + } +} + +void Heartbeat::Peer::on_disconnected() +{ + logger().info("Heartbeat::Peer: osd.{} disconnected", peer); + session.on_disconnected(); +} + +void Heartbeat::Peer::do_send_heartbeat( + Heartbeat::clock::time_point now, + ceph::signedspan mnow, + std::vector<seastar::future<>>* futures) +{ + const utime_t sent_stamp{now}; + const auto deadline = + now + std::chrono::seconds(local_conf()->osd_heartbeat_grace); + session.on_ping(sent_stamp, deadline); + for_each_conn([&, this] (auto& conn) { + auto min_message = static_cast<uint32_t>( + local_conf()->osd_heartbeat_min_size); + auto ping = crimson::make_message<MOSDPing>( + heartbeat.monc.get_fsid(), + heartbeat.service.get_map()->get_epoch(), + MOSDPing::PING, + sent_stamp, + mnow, + mnow, + heartbeat.service.get_up_epoch(), + min_message); + if (futures) { + futures->push_back(conn.send(std::move(ping))); + } + }); +} + +bool Heartbeat::FailingPeers::add_pending( + osd_id_t peer, + clock::time_point failed_since, + clock::time_point now, + std::vector<seastar::future<>>& futures) +{ + if (failure_pending.count(peer)) { + return false; + } + auto failed_for = std::chrono::duration_cast<std::chrono::seconds>( + now - failed_since).count(); + auto osdmap = heartbeat.service.get_map(); + auto failure_report = + crimson::make_message<MOSDFailure>(heartbeat.monc.get_fsid(), + peer, + osdmap->get_addrs(peer), + static_cast<int>(failed_for), + osdmap->get_epoch()); + failure_pending.emplace(peer, failure_info_t{failed_since, + osdmap->get_addrs(peer)}); + futures.push_back(heartbeat.monc.send_message(std::move(failure_report))); + logger().info("{}: osd.{} failed for {}", __func__, peer, failed_for); + return true; +} + +seastar::future<> Heartbeat::FailingPeers::cancel_one(osd_id_t peer) +{ + if (auto pending = failure_pending.find(peer); + pending != failure_pending.end()) { + auto fut = send_still_alive(peer, pending->second.addrs); + 
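+ // Note: the MOSDFailure(FLAG_ALIVE) report is built (and its future
+ // captured) before the pending entry is erased below, so the cached
+ // address vector is still valid while the message is constructed.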
failure_pending.erase(peer); + return fut; + } + return seastar::now(); +} + +seastar::future<> +Heartbeat::FailingPeers::send_still_alive( + osd_id_t osd, const entity_addrvec_t& addrs) +{ + auto still_alive = crimson::make_message<MOSDFailure>( + heartbeat.monc.get_fsid(), + osd, + addrs, + 0, + heartbeat.service.get_map()->get_epoch(), + MOSDFailure::FLAG_ALIVE); + logger().info("{}: osd.{}", __func__, osd); + return heartbeat.monc.send_message(std::move(still_alive)); +} diff --git a/src/crimson/osd/heartbeat.h b/src/crimson/osd/heartbeat.h new file mode 100644 index 000000000..f5da45118 --- /dev/null +++ b/src/crimson/osd/heartbeat.h @@ -0,0 +1,461 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <cstdint> +#include <seastar/core/future.hh> +#include "common/ceph_time.h" +#include "crimson/common/gated.h" +#include "crimson/net/Dispatcher.h" +#include "crimson/net/Fwd.h" + +class MOSDPing; + +namespace crimson::osd { + class ShardServices; +} + +namespace crimson::mon { + class Client; +} + +template<typename Message> using Ref = boost::intrusive_ptr<Message>; + +class Heartbeat : public crimson::net::Dispatcher { +public: + using osd_id_t = int; + + Heartbeat(osd_id_t whoami, + crimson::osd::ShardServices& service, + crimson::mon::Client& monc, + crimson::net::Messenger &front_msgr, + crimson::net::Messenger &back_msgr); + + seastar::future<> start(entity_addrvec_t front, + entity_addrvec_t back); + seastar::future<> stop(); + + using osds_t = std::vector<osd_id_t>; + void add_peer(osd_id_t peer, epoch_t epoch); + void update_peers(int whoami); + void remove_peer(osd_id_t peer); + osds_t get_peers() const; + + const entity_addrvec_t& get_front_addrs() const; + const entity_addrvec_t& get_back_addrs() const; + + crimson::net::Messenger &get_front_msgr() const; + crimson::net::Messenger &get_back_msgr() const; + + // Dispatcher methods + std::optional<seastar::future<>> ms_dispatch( + crimson::net::ConnectionRef conn, MessageRef m) override; + void ms_handle_reset(crimson::net::ConnectionRef conn, bool is_replace) override; + void ms_handle_connect(crimson::net::ConnectionRef conn, seastar::shard_id) override; + void ms_handle_accept(crimson::net::ConnectionRef conn, seastar::shard_id, bool is_replace) override; + + void print(std::ostream&) const; +private: + seastar::future<> handle_osd_ping(crimson::net::ConnectionRef conn, + Ref<MOSDPing> m); + seastar::future<> handle_ping(crimson::net::ConnectionRef conn, + Ref<MOSDPing> m); + seastar::future<> handle_reply(crimson::net::ConnectionRef conn, + Ref<MOSDPing> m); + seastar::future<> handle_you_died(); + + /// remove down OSDs + /// @return peers not added in this epoch + osds_t remove_down_peers(); + /// add enough reporters for fast failure detection + void add_reporter_peers(int whoami); + + seastar::future<> start_messenger(crimson::net::Messenger& msgr, + const entity_addrvec_t& addrs); + seastar::future<> maybe_share_osdmap(crimson::net::ConnectionRef, + Ref<MOSDPing> m); +private: + const osd_id_t whoami; + crimson::osd::ShardServices& service; + crimson::mon::Client& monc; + crimson::net::Messenger &front_msgr; + crimson::net::Messenger &back_msgr; + + seastar::timer<seastar::lowres_clock> timer; + // use real_clock so it can be converted to utime_t + using clock = ceph::coarse_real_clock; + + class ConnectionListener; + class Connection; + class Session; + class Peer; + using peers_map_t = std::map<osd_id_t, Peer>; + peers_map_t peers; + + // 
osds which are considered failed + // osd_id => when was the last time that both front and back pings were acked + // or sent. + // use for calculating how long the OSD has been unresponsive + using failure_queue_t = std::map<osd_id_t, clock::time_point>; + seastar::future<> send_failures(failure_queue_t&& failure_queue); + seastar::future<> send_heartbeats(); + void heartbeat_check(); + + // osds we've reported to monior as failed ones, but they are not marked down + // yet + crimson::common::Gated gate; + + class FailingPeers { + public: + FailingPeers(Heartbeat& heartbeat) : heartbeat(heartbeat) {} + bool add_pending(osd_id_t peer, + clock::time_point failed_since, + clock::time_point now, + std::vector<seastar::future<>>& futures); + seastar::future<> cancel_one(osd_id_t peer); + + private: + seastar::future<> send_still_alive(osd_id_t, const entity_addrvec_t&); + + Heartbeat& heartbeat; + + struct failure_info_t { + clock::time_point failed_since; + entity_addrvec_t addrs; + }; + std::map<osd_id_t, failure_info_t> failure_pending; + } failing_peers; +}; + +inline std::ostream& operator<<(std::ostream& out, const Heartbeat& hb) { + hb.print(out); + return out; +} + +/* + * Event driven interface for Heartbeat::Peer to be notified when both hb_front + * and hb_back are connected, or connection is lost. + */ +class Heartbeat::ConnectionListener { + public: + ConnectionListener(size_t connections) : connections{connections} {} + + void increase_connected() { + assert(connected < connections); + ++connected; + if (connected == connections) { + on_connected(); + } + } + void decrease_connected() { + assert(connected > 0); + if (connected == connections) { + on_disconnected(); + } + --connected; + } + enum class type_t { front, back }; + virtual entity_addr_t get_peer_addr(type_t) = 0; + + protected: + virtual void on_connected() = 0; + virtual void on_disconnected() = 0; + + private: + const size_t connections; + size_t connected = 0; +}; + +class Heartbeat::Connection { + public: + using type_t = ConnectionListener::type_t; + Connection(osd_id_t peer, bool is_winner_side, type_t type, + crimson::net::Messenger& msgr, + ConnectionListener& listener) + : peer{peer}, type{type}, + msgr{msgr}, listener{listener}, + is_winner_side{is_winner_side} { + connect(); + } + Connection(const Connection&) = delete; + Connection(Connection&&) = delete; + Connection& operator=(const Connection&) = delete; + Connection& operator=(Connection&&) = delete; + + ~Connection(); + + bool matches(crimson::net::ConnectionRef _conn) const; + void connected() { + set_connected(); + } + bool accepted(crimson::net::ConnectionRef, bool is_replace); + void reset(bool is_replace=false); + seastar::future<> send(MessageURef msg); + void validate(); + // retry connection if still pending + void retry(); + + private: + void set_connected(); + void set_unconnected(); + void connect(); + + const osd_id_t peer; + const type_t type; + crimson::net::Messenger& msgr; + ConnectionListener& listener; + +/* + * Resolve the following racing when both me and peer are trying to connect + * each other symmetrically, under SocketPolicy::lossy_client: + * + * OSD.A OSD.B + * - - + * |-[1]----> <----[2]-| + * \ / + * \ / + * delay.. X delay.. + * / \ + * |-[1]x> / \ <x[2]-| + * |<-[2]--- ---[1]->| + * |(reset#1) (reset#2)| + * |(reconnectB) (reconnectA)| + * |-[2]---> <---[1]-| + * delay.. delay.. + * (remote close populated) + * |-[2]x> <x[1]-| + * |(reset#2) (reset#1)| + * | ... ... | + * (dead loop!) 
+ * + * Our solution is to remember if such racing was happened recently, and + * establish connection asymmetrically only from the winner side whose osd-id + * is larger. + */ + const bool is_winner_side; + bool racing_detected = false; + + crimson::net::ConnectionRef conn; + bool is_connected = false; + + friend std::ostream& operator<<(std::ostream& os, const Connection& c) { + if (c.type == type_t::front) { + return os << "con_front(osd." << c.peer << ")"; + } else { + return os << "con_back(osd." << c.peer << ")"; + } + } +}; + +/* + * Track the ping history and ping reply (the pong) from the same session, clean up + * history once hb_front or hb_back loses connection and restart the session once + * both connections are connected again. + * + * We cannot simply remove the entire Heartbeat::Peer once hb_front or hb_back + * loses connection, because we would end up with the following deadloop: + * + * OSD.A OSD.B + * - - + * hb_front reset <--(network)--- hb_front close + * | ^ + * | | + * remove Peer B (dead loop!) remove Peer A + * | | + * V | + * hb_back close ----(network)---> hb_back reset + */ +class Heartbeat::Session { + public: + Session(osd_id_t peer) : peer{peer} {} + + void set_epoch_added(epoch_t epoch_) { epoch = epoch_; } + epoch_t get_epoch_added() const { return epoch; } + + void set_projected_epoch(epoch_t epoch_) { projected_epoch = epoch_; } + epoch_t get_projected_epoch() const { return projected_epoch; } + + bool is_started() const { return connected; } + bool pinged() const { + if (clock::is_zero(first_tx)) { + // i can never receive a pong without sending any ping message first. + assert(clock::is_zero(last_rx_front) && + clock::is_zero(last_rx_back)); + return false; + } else { + return true; + } + } + + enum class health_state { + UNKNOWN, + UNHEALTHY, + HEALTHY, + }; + health_state do_health_screen(clock::time_point now) const { + if (!pinged()) { + // we are not healty nor unhealty because we haven't sent anything yet + return health_state::UNKNOWN; + } else if (!ping_history.empty() && ping_history.begin()->second.deadline < now) { + return health_state::UNHEALTHY; + } else if (!clock::is_zero(last_rx_front) && + !clock::is_zero(last_rx_back)) { + // only declare to be healthy until we have received the first + // replies from both front/back connections + return health_state::HEALTHY; + } else { + return health_state::UNKNOWN; + } + } + + clock::time_point failed_since(clock::time_point now) const; + + void set_tx(clock::time_point now) { + if (!pinged()) { + first_tx = now; + } + last_tx = now; + } + + void on_connected() { + assert(!connected); + connected = true; + ping_history.clear(); + } + + void on_ping(const utime_t& sent_stamp, + const clock::time_point& deadline) { + assert(connected); + [[maybe_unused]] auto [reply, added] = + ping_history.emplace(sent_stamp, reply_t{deadline, 2}); + } + + bool on_pong(const utime_t& ping_stamp, + Connection::type_t type, + clock::time_point now) { + assert(connected); + auto ping = ping_history.find(ping_stamp); + if (ping == ping_history.end()) { + // old replies, deprecated by newly sent pings. 
+ return false; + } + auto& unacked = ping->second.unacknowledged; + assert(unacked); + if (type == Connection::type_t::front) { + last_rx_front = now; + unacked--; + } else { + last_rx_back = now; + unacked--; + } + if (unacked == 0) { + ping_history.erase(ping_history.begin(), ++ping); + } + return true; + } + + void on_disconnected() { + assert(connected); + connected = false; + if (!ping_history.empty()) { + // we lost our ping_history of the last session, but still need to keep + // the oldest deadline for unhealthy check. + auto oldest = ping_history.begin(); + auto sent_stamp = oldest->first; + auto deadline = oldest->second.deadline; + ping_history.clear(); + ping_history.emplace(sent_stamp, reply_t{deadline, 0}); + } + } + + // maintain an entry in ping_history for unhealthy check + void set_inactive_history(clock::time_point); + + private: + const osd_id_t peer; + bool connected = false; + // time we sent our first ping request + clock::time_point first_tx; + // last time we sent a ping request + clock::time_point last_tx; + // last time we got a ping reply on the front side + clock::time_point last_rx_front; + // last time we got a ping reply on the back side + clock::time_point last_rx_back; + // most recent epoch we wanted this peer + epoch_t epoch; // rename me to epoch_added + // epoch we expect peer to be at once our sent incrementals are processed + epoch_t projected_epoch = 0; + + struct reply_t { + clock::time_point deadline; + // one sent over front conn, another sent over back conn + uint8_t unacknowledged = 0; + }; + // history of inflight pings, arranging by timestamp we sent + std::map<utime_t, reply_t> ping_history; +}; + +class Heartbeat::Peer final : private Heartbeat::ConnectionListener { + public: + Peer(Heartbeat&, osd_id_t); + ~Peer(); + Peer(Peer&&) = delete; + Peer(const Peer&) = delete; + Peer& operator=(Peer&&) = delete; + Peer& operator=(const Peer&) = delete; + + // set/get the epoch at which the peer was added + void set_epoch_added(epoch_t epoch) { session.set_epoch_added(epoch); } + epoch_t get_epoch_added() const { return session.get_epoch_added(); } + + void set_projected_epoch(epoch_t epoch) { session.set_projected_epoch(epoch); } + epoch_t get_projected_epoch() const { return session.get_projected_epoch(); } + + // if failure, return time_point since last active + // else, return clock::zero() + clock::time_point failed_since(clock::time_point now) const { + return session.failed_since(now); + } + void send_heartbeat( + clock::time_point, ceph::signedspan, std::vector<seastar::future<>>&); + seastar::future<> handle_reply(crimson::net::ConnectionRef, Ref<MOSDPing>); + + void handle_reset(crimson::net::ConnectionRef conn, bool is_replace); + + void handle_connect(crimson::net::ConnectionRef conn); + + void handle_accept(crimson::net::ConnectionRef conn, bool is_replace); + + private: + entity_addr_t get_peer_addr(type_t type) override; + void on_connected() override; + void on_disconnected() override; + void do_send_heartbeat( + clock::time_point, ceph::signedspan, std::vector<seastar::future<>>*); + + template <typename Func> + void for_each_conn(Func&& f) { + f(con_front); + f(con_back); + } + + Heartbeat& heartbeat; + const osd_id_t peer; + Session session; + // if need to send heartbeat when session connected + bool pending_send = false; + Connection con_front; + Connection con_back; + + friend std::ostream& operator<<(std::ostream& os, const Peer& p) { + return os << "peer(osd." 
<< p.peer << ")"; + } +}; + +#if FMT_VERSION >= 90000 +template <> struct fmt::formatter<Heartbeat> : fmt::ostream_formatter {}; +template <> struct fmt::formatter<Heartbeat::Connection> : fmt::ostream_formatter {}; +template <> struct fmt::formatter<Heartbeat::Peer> : fmt::ostream_formatter {}; +#endif diff --git a/src/crimson/osd/lsan_suppressions.cc b/src/crimson/osd/lsan_suppressions.cc new file mode 100644 index 000000000..53b7eb630 --- /dev/null +++ b/src/crimson/osd/lsan_suppressions.cc @@ -0,0 +1,20 @@ +#ifndef _NDEBUG +// The callbacks we define here will be called from the sanitizer runtime, but +// aren't referenced from the Chrome executable. We must ensure that those +// callbacks are not sanitizer-instrumented, and that they aren't stripped by +// the linker. +#define SANITIZER_HOOK_ATTRIBUTE \ + extern "C" \ + __attribute__((no_sanitize("address", "thread", "undefined"))) \ + __attribute__((visibility("default"))) \ + __attribute__((used)) + +static char kLSanDefaultSuppressions[] = + "leak:InitModule\n" + "leak:MallocExtension::Initialize\n" + "leak:MallocExtension::Register\n"; + +SANITIZER_HOOK_ATTRIBUTE const char *__lsan_default_suppressions() { + return kLSanDefaultSuppressions; +} +#endif // ! _NDEBUG diff --git a/src/crimson/osd/main.cc b/src/crimson/osd/main.cc new file mode 100644 index 000000000..1e817415d --- /dev/null +++ b/src/crimson/osd/main.cc @@ -0,0 +1,259 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#include <sys/types.h> +#include <unistd.h> + +#include <iostream> +#include <fstream> +#include <random> + +#include <seastar/core/app-template.hh> +#include <seastar/core/print.hh> +#include <seastar/core/prometheus.hh> +#include <seastar/core/thread.hh> +#include <seastar/http/httpd.hh> +#include <seastar/net/inet_address.hh> +#include <seastar/util/closeable.hh> +#include <seastar/util/defer.hh> +#include <seastar/util/std-compat.hh> + +#include "auth/KeyRing.h" +#include "common/ceph_argparse.h" +#include "common/config_tracker.h" +#include "crimson/common/buffer_io.h" +#include "crimson/common/config_proxy.h" +#include "crimson/common/fatal_signal.h" +#include "crimson/mon/MonClient.h" +#include "crimson/net/Messenger.h" +#include "crimson/osd/stop_signal.h" +#include "crimson/osd/main_config_bootstrap_helpers.h" +#include "global/pidfile.h" +#include "osd.h" + +using namespace std::literals; +namespace bpo = boost::program_options; +using crimson::common::local_conf; +using crimson::common::sharded_conf; +using crimson::common::sharded_perf_coll; + +static seastar::logger& logger() +{ + return crimson::get_logger(ceph_subsys_osd); +} + +seastar::future<> make_keyring() +{ + const auto path = local_conf().get_val<std::string>("keyring"); + return seastar::file_exists(path).then([path](bool exists) { + KeyRing keyring; + EntityName name{local_conf()->name}; + EntityAuth auth; + if (exists && + keyring.load(nullptr, path) == 0 && + keyring.get_auth(name, auth)) { + fmt::print(std::cerr, "already have key in keyring: {}\n", path); + return seastar::now(); + } else { + CephContext temp_cct{}; + auth.key.create(&temp_cct, CEPH_CRYPTO_AES); + keyring.add(name, auth); + bufferlist bl; + keyring.encode_plaintext(bl); + const auto permissions = (seastar::file_permissions::user_read | + seastar::file_permissions::user_write); + return crimson::write_file(std::move(bl), path, permissions); + } + }).handle_exception_type([path](const std::filesystem::filesystem_error& e) { + fmt::print(std::cerr, "FATAL: 
writing new keyring to {}: {}\n", path, e.what()); + throw e; + }); +} + +static std::ofstream maybe_set_logger() +{ + std::ofstream log_file_stream; + if (auto log_file = local_conf()->log_file; !log_file.empty()) { + log_file_stream.open(log_file, std::ios::app | std::ios::out); + try { + seastar::throw_system_error_on(log_file_stream.fail()); + } catch (const std::system_error& e) { + ceph_abort_msg(fmt::format("unable to open log file: {}", e.what())); + } + logger().set_ostream(log_file_stream); + } + return log_file_stream; +} + +int main(int argc, const char* argv[]) +{ + auto early_config_result = crimson::osd::get_early_config(argc, argv); + if (!early_config_result.has_value()) { + int r = early_config_result.error(); + std::cerr << "do_early_config returned error: " << r << std::endl; + return r; + } + auto &early_config = early_config_result.value(); + + auto seastar_n_early_args = early_config.get_early_args(); + auto config_proxy_args = early_config.get_ceph_args(); + + seastar::app_template::config app_cfg; + app_cfg.name = "Crimson"; + app_cfg.auto_handle_sigint_sigterm = false; + seastar::app_template app(std::move(app_cfg)); + app.add_options() + ("mkkey", "generate a new secret key. " + "This is normally used in combination with --mkfs") + ("mkfs", "create a [new] data directory") + ("debug", "enable debug output on all loggers") + ("trace", "enable trace output on all loggers") + ("osdspec-affinity", bpo::value<std::string>()->default_value(std::string{}), + "set affinity to an osdspec") + ("prometheus_port", bpo::value<uint16_t>()->default_value(0), + "Prometheus port. Set to zero to disable") + ("prometheus_address", bpo::value<std::string>()->default_value("0.0.0.0"), + "Prometheus listening address") + ("prometheus_prefix", bpo::value<std::string>()->default_value("osd"), + "Prometheus metrics prefix"); + + try { + return app.run( + seastar_n_early_args.size(), + const_cast<char**>(seastar_n_early_args.data()), + [&] { + auto& config = app.configuration(); + return seastar::async([&] { + try { + FatalSignal fatal_signal; + seastar_apps_lib::stop_signal should_stop; + if (config.count("debug")) { + seastar::global_logger_registry().set_all_loggers_level( + seastar::log_level::debug + ); + } + if (config.count("trace")) { + seastar::global_logger_registry().set_all_loggers_level( + seastar::log_level::trace + ); + } + sharded_conf().start( + early_config.init_params.name, early_config.cluster_name).get(); + local_conf().start().get(); + auto stop_conf = seastar::deferred_stop(sharded_conf()); + sharded_perf_coll().start().get(); + auto stop_perf_coll = seastar::deferred_stop(sharded_perf_coll()); + local_conf().parse_config_files(early_config.conf_file_list).get(); + local_conf().parse_env().get(); + local_conf().parse_argv(config_proxy_args).get(); + auto log_file_stream = maybe_set_logger(); + auto reset_logger = seastar::defer([] { + logger().set_ostream(std::cerr); + }); + if (const auto ret = pidfile_write(local_conf()->pid_file); + ret == -EACCES || ret == -EAGAIN) { + ceph_abort_msg( + "likely there is another crimson-osd instance with the same id"); + } else if (ret < 0) { + ceph_abort_msg(fmt::format("pidfile_write failed with {} {}", + ret, cpp_strerror(-ret))); + } + // just ignore SIGHUP, we don't reread settings. keep in mind signals + // handled by S* must be blocked for alien threads (see AlienStore). 
+ seastar::engine().handle_signal(SIGHUP, [] {}); + + // start prometheus API server + seastar::httpd::http_server_control prom_server; + std::any stop_prometheus; + if (uint16_t prom_port = config["prometheus_port"].as<uint16_t>(); + prom_port != 0) { + prom_server.start("prometheus").get(); + stop_prometheus = seastar::make_shared(seastar::deferred_stop(prom_server)); + + seastar::prometheus::config prom_config; + prom_config.prefix = config["prometheus_prefix"].as<std::string>(); + seastar::prometheus::start(prom_server, prom_config).get(); + seastar::net::inet_address prom_addr(config["prometheus_address"].as<std::string>()); + prom_server.listen(seastar::socket_address{prom_addr, prom_port}) + .handle_exception([=] (auto ep) { + std::cerr << seastar::format("Could not start Prometheus API server on {}:{}: {}\n", + prom_addr, prom_port, ep); + return seastar::make_exception_future(ep); + }).get(); + } + + const int whoami = std::stoi(local_conf()->name.get_id()); + const auto nonce = crimson::osd::get_nonce(); + crimson::net::MessengerRef cluster_msgr, client_msgr; + crimson::net::MessengerRef hb_front_msgr, hb_back_msgr; + for (auto [msgr, name] : {make_pair(std::ref(cluster_msgr), "cluster"s), + make_pair(std::ref(client_msgr), "client"s), + make_pair(std::ref(hb_front_msgr), "hb_front"s), + make_pair(std::ref(hb_back_msgr), "hb_back"s)}) { + msgr = crimson::net::Messenger::create(entity_name_t::OSD(whoami), + name, + nonce, + true); + } + auto store = crimson::os::FuturizedStore::create( + local_conf().get_val<std::string>("osd_objectstore"), + local_conf().get_val<std::string>("osd_data"), + local_conf().get_config_values()); + + crimson::osd::OSD osd( + whoami, nonce, std::ref(should_stop.abort_source()), + std::ref(*store), cluster_msgr, client_msgr, + hb_front_msgr, hb_back_msgr); + + if (config.count("mkkey")) { + make_keyring().get(); + } + if (local_conf()->no_mon_config) { + logger().info("bypassing the config fetch due to --no-mon-config"); + } else { + crimson::osd::populate_config_from_mon().get(); + } + if (config.count("mkfs")) { + auto osd_uuid = local_conf().get_val<uuid_d>("osd_uuid"); + if (osd_uuid.is_zero()) { + // use a random osd uuid if not specified + osd_uuid.generate_random(); + } + osd.mkfs( + *store, + whoami, + osd_uuid, + local_conf().get_val<uuid_d>("fsid"), + config["osdspec-affinity"].as<std::string>()).get(); + } + if (config.count("mkkey") || config.count("mkfs")) { + return EXIT_SUCCESS; + } else { + osd.start().get(); + } + logger().info("crimson startup completed"); + should_stop.wait().get(); + logger().info("crimson shutting down"); + osd.stop().get(); + // stop()s registered using defer() are called here + } catch (...) { + logger().error("startup failed: {}", std::current_exception()); + return EXIT_FAILURE; + } + logger().info("crimson shutdown complete"); + return EXIT_SUCCESS; + }); + }); + } catch (...) 
{ + fmt::print(std::cerr, "FATAL: Exception during startup, aborting: {}\n", std::current_exception()); + return EXIT_FAILURE; + } +} + +/* + * Local Variables: + * compile-command: "make -j4 \ + * -C ../../../build \ + * crimson-osd" + * End: + */ diff --git a/src/crimson/osd/main_config_bootstrap_helpers.cc b/src/crimson/osd/main_config_bootstrap_helpers.cc new file mode 100644 index 000000000..807fd1591 --- /dev/null +++ b/src/crimson/osd/main_config_bootstrap_helpers.cc @@ -0,0 +1,265 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#include "crimson/osd/main_config_bootstrap_helpers.h" + +#include <seastar/core/print.hh> +#include <seastar/core/prometheus.hh> +#include <seastar/core/thread.hh> +#include <seastar/http/httpd.hh> +#include <seastar/net/inet_address.hh> +#include <seastar/util/closeable.hh> +#include <seastar/util/defer.hh> +#include <seastar/util/std-compat.hh> + +#include "common/ceph_argparse.h" +#include "common/config_tracker.h" +#include "crimson/common/buffer_io.h" +#include "crimson/common/config_proxy.h" +#include "crimson/common/fatal_signal.h" +#include "crimson/mon/MonClient.h" +#include "crimson/net/Messenger.h" +#include "crimson/osd/main_config_bootstrap_helpers.h" + +using namespace std::literals; +using crimson::common::local_conf; +using crimson::common::sharded_conf; +using crimson::common::sharded_perf_coll; + +static seastar::logger& logger() +{ + return crimson::get_logger(ceph_subsys_osd); +} + +namespace crimson::osd { + +void usage(const char* prog) +{ + std::cout << "usage: " << prog << std::endl; + generic_server_usage(); +} + + +seastar::future<> populate_config_from_mon() +{ + logger().info("populating config from monitor"); + // i don't have any client before joining the cluster, so no need to have + // a proper auth handler + class DummyAuthHandler : public crimson::common::AuthHandler { + public: + void handle_authentication(const EntityName& name, + const AuthCapsInfo& caps) + {} + }; + return seastar::async([] { + auto auth_handler = std::make_unique<DummyAuthHandler>(); + auto msgr = crimson::net::Messenger::create(entity_name_t::CLIENT(), + "temp_mon_client", + get_nonce(), + true); + crimson::mon::Client monc{*msgr, *auth_handler}; + msgr->set_auth_client(&monc); + msgr->start({&monc}).get(); + auto stop_msgr = seastar::defer([&] { + msgr->stop(); + msgr->shutdown().get(); + }); + monc.start().handle_exception([] (auto ep) { + fmt::print(std::cerr, "FATAL: unable to connect to cluster: {}\n", ep); + return seastar::make_exception_future<>(ep); + }).get(); + auto stop_monc = seastar::defer([&] { + monc.stop().get(); + }); + monc.sub_want("config", 0, 0); + monc.renew_subs().get(); + // wait for monmap and config + monc.wait_for_config().get(); + auto fsid = monc.get_fsid().to_string(); + local_conf().set_val("fsid", fsid).get(); + logger().debug("{}: got config from monitor, fsid {}", __func__, fsid); + }); +} + +static tl::expected<early_config_t, int> +_get_early_config(int argc, const char *argv[]) +{ + early_config_t ret; + + // pull off ceph configs the stuff from early_args + std::vector<const char *> early_args; + early_args.insert( + std::end(early_args), + argv, argv + argc); + + ret.init_params = ceph_argparse_early_args( + early_args, + CEPH_ENTITY_TYPE_OSD, + &ret.cluster_name, + &ret.conf_file_list); + + if (ceph_argparse_need_usage(early_args)) { + usage(argv[0]); + exit(0); + } + + seastar::app_template::config app_cfg; + app_cfg.name = "Crimson-startup"; + 
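+ // This bootstrap app runs a minimal, single-shard reactor ("--smp 1" below)
+ // just long enough to parse configs and, unless --no-mon-config is given,
+ // fetch them from the monitors; the fully configured reactor is started
+ // later in main().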
app_cfg.auto_handle_sigint_sigterm = false; + seastar::app_template app(std::move(app_cfg)); + const char *bootstrap_args[] = { argv[0], "--smp", "1" }; + int r = app.run( + sizeof(bootstrap_args) / sizeof(bootstrap_args[0]), + const_cast<char**>(bootstrap_args), + [argc, argv, &ret, &early_args] { + return seastar::async([argc, argv, &ret, &early_args] { + seastar::global_logger_registry().set_all_loggers_level( + seastar::log_level::debug); + sharded_conf().start( + ret.init_params.name, ret.cluster_name).get(); + local_conf().start().get(); + auto stop_conf = seastar::deferred_stop(sharded_conf()); + + sharded_perf_coll().start().get(); + auto stop_perf_coll = seastar::deferred_stop(sharded_perf_coll()); + + local_conf().parse_env().get(); + local_conf().parse_argv(early_args).get(); + local_conf().parse_config_files(ret.conf_file_list).get(); + + if (local_conf()->no_mon_config) { + logger().info("bypassing the config fetch due to --no-mon-config"); + } else { + populate_config_from_mon().get(); + } + + // get ceph configs + std::set_difference( + argv, argv + argc, + std::begin(early_args), + std::end(early_args), + std::back_inserter(ret.ceph_args)); + + ret.early_args.insert( + std::end(ret.early_args), + std::begin(early_args), + std::end(early_args)); + + if (auto found = std::find_if( + std::begin(early_args), + std::end(early_args), + [](auto* arg) { return "--smp"sv == arg; }); + found == std::end(early_args)) { + + // Set --smp based on crimson_seastar_smp config option + ret.early_args.emplace_back("--smp"); + + auto smp_config = local_conf().get_val<uint64_t>( + "crimson_seastar_smp"); + + ret.early_args.emplace_back(fmt::format("{}", smp_config)); + logger().info("get_early_config: set --smp {}", smp_config); + } + return 0; + }); + }); + if (r < 0) { + return tl::unexpected(r); + } + return ret; +} + +/* get_early_config handles obtaining config parameters required prior + * to reactor startup. Most deployment mechanisms (cephadm for one) + * rely on pulling configs from the monitor rather than shipping around + * config files, so this process needs to support pulling config options + * from the monitors. + * + * Of particular interest are config params related to the seastar + * reactor itself which can't be modified after the reactor has been + * started -- like the number of cores to use (smp::count). Contacting + * the monitors, however, requires a MonClient, which in turn needs a + * running reactor. + * + * Unfortunately, seastar doesn't clean up thread local state + * associated with seastar::smp task queues etc, so we can't + * start a reactor, stop it, and restart it in the same thread + * without an impractical amount of cleanup in seastar. + * + * More unfortunately, starting a reactor in a seperate thread + * and then joining the thread still doesn't avoid all global state, + * I observed tasks from the previous reactor incarnation nevertheless + * continuing to run in the new one resulting in a crash as they access + * freed memory. + * + * The approach taken here, therefore, is to actually fork, start a + * reactor in the child process, encode the resulting early_config_t, + * and send it back to the parent process. 
+ */ +tl::expected<early_config_t, int> +get_early_config(int argc, const char *argv[]) +{ + int pipes[2]; + int r = pipe2(pipes, 0); + if (r < 0) { + std::cerr << "get_early_config: failed to create pipes: " + << -errno << std::endl; + return tl::unexpected(-errno); + } + + pid_t worker = fork(); + if (worker < 0) { + close(pipes[0]); + close(pipes[1]); + std::cerr << "get_early_config: failed to fork: " + << -errno << std::endl; + return tl::unexpected(-errno); + } else if (worker == 0) { // child + close(pipes[0]); + auto ret = _get_early_config(argc, argv); + if (ret.has_value()) { + bufferlist bl; + ::encode(ret.value(), bl); + r = bl.write_fd(pipes[1]); + close(pipes[1]); + if (r < 0) { + std::cerr << "get_early_config: child failed to write_fd: " + << r << std::endl; + exit(-r); + } else { + exit(0); + } + } else { + std::cerr << "get_early_config: child failed: " + << -ret.error() << std::endl; + exit(-ret.error()); + } + return tl::unexpected(-1); + } else { // parent + close(pipes[1]); + + bufferlist bl; + early_config_t ret; + while ((r = bl.read_fd(pipes[0], 1024)) > 0); + close(pipes[0]); + + // ignore error, we'll propogate error based on read and decode + waitpid(worker, nullptr, 0); + + if (r < 0) { + std::cerr << "get_early_config: parent failed to read from pipe: " + << r << std::endl; + return tl::unexpected(r); + } + try { + auto bliter = bl.cbegin(); + ::decode(ret, bliter); + return ret; + } catch (...) { + std::cerr << "get_early_config: parent failed to decode" << std::endl; + return tl::unexpected(-EINVAL); + } + } +} + +} diff --git a/src/crimson/osd/main_config_bootstrap_helpers.h b/src/crimson/osd/main_config_bootstrap_helpers.h new file mode 100644 index 000000000..7c6131d17 --- /dev/null +++ b/src/crimson/osd/main_config_bootstrap_helpers.h @@ -0,0 +1,99 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <sys/types.h> +#include <unistd.h> + +#include <iostream> +#include <fstream> +#include <random> + +#include <seastar/core/future.hh> + +#include "common/ceph_argparse.h" +#include "include/expected.hpp" + +namespace crimson::osd { + +void usage(const char* prog); + +inline uint64_t get_nonce() +{ + if (auto pid = getpid(); pid == 1 || std::getenv("CEPH_USE_RANDOM_NONCE")) { + // we're running in a container; use a random number instead! 
+ std::random_device rd; + std::default_random_engine rng{rd()}; + return std::uniform_int_distribution<uint64_t>{}(rng); + } else { + return pid; + } +} + +seastar::future<> populate_config_from_mon(); + +struct early_config_t { + std::vector<std::string> early_args; + std::vector<std::string> ceph_args; + + std::string cluster_name{"ceph"}; + std::string conf_file_list; + CephInitParameters init_params{CEPH_ENTITY_TYPE_OSD}; + + /// Returned vector must not outlive in + auto to_ptr_vector(const std::vector<std::string> &in) { + std::vector<const char *> ret; + ret.reserve(in.size()); + std::transform( + std::begin(in), std::end(in), + std::back_inserter(ret), + [](const auto &str) { return str.c_str(); }); + return ret; + } + + std::vector<const char *> get_early_args() { + return to_ptr_vector(early_args); + } + + std::vector<const char *> get_ceph_args() { + return to_ptr_vector(ceph_args); + } + + void encode(ceph::buffer::list& bl) const { + ENCODE_START(1, 1, bl); + encode(early_args, bl); + encode(ceph_args, bl); + encode(cluster_name, bl); + encode(conf_file_list, bl); + encode(init_params, bl); + ENCODE_FINISH(bl); + } + + void decode(ceph::buffer::list::const_iterator& bl) { + DECODE_START(1, bl); + decode(early_args, bl); + decode(ceph_args, bl); + decode(cluster_name, bl); + decode(conf_file_list, bl); + decode(init_params, bl); + DECODE_FINISH(bl); + } +}; + +/** + * get_early_config + * + * Compile initial configuration information from command line arguments, + * config files, and monitors. + * + * This implementation forks off a worker process to do this work and must + * therefore be called very early in main(). (See implementation for an + * explanation). + */ +tl::expected<early_config_t, int> +get_early_config(int argc, const char *argv[]); + +} + +WRITE_CLASS_ENCODER(crimson::osd::early_config_t) diff --git a/src/crimson/osd/objclass.cc b/src/crimson/osd/objclass.cc new file mode 100644 index 000000000..4cc9d7336 --- /dev/null +++ b/src/crimson/osd/objclass.cc @@ -0,0 +1,584 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <cstdarg> +#include <cstring> +#include <boost/container/small_vector.hpp> +#include "common/ceph_context.h" +#include "common/ceph_releases.h" +#include "common/config.h" +#include "crimson/common/config_proxy.h" +#include "common/debug.h" + +#include "crimson/osd/exceptions.h" +#include "crimson/osd/ops_executer.h" +#include "crimson/osd/pg_backend.h" + +#include "objclass/objclass.h" +#include "osd/ClassHandler.h" + +#include "auth/Crypto.h" +#include "common/armor.h" + +using std::map; +using std::string; + +#define dout_context ClassHandler::get_instance().cct + +static constexpr int dout_subsys = ceph_subsys_objclass; + +static inline int execute_osd_op(cls_method_context_t hctx, OSDOp& op) +{ + // we can expect the memory under `ret` will be still fine after + // executing the osd op as we're running inside `seastar::thread` + // created for us by `seastar::async` in `::do_op_call()`. + int ret = 0; + using osd_op_errorator = crimson::osd::OpsExecuter::osd_op_errorator; + reinterpret_cast<crimson::osd::OpsExecuter*>(hctx)->execute_op(op) + .handle_error_interruptible( + osd_op_errorator::all_same_way([&ret] (const std::error_code& err) { + assert(err.value() > 0); + ret = -err.value(); + return seastar::now(); + })).get(); // we're blocking here which requires `seastar::thread`. 
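+ // Any error surfaced by execute_op() has been folded into `ret` above,
+ // following the objclass convention: 0 on success and a negative errno
+ // (e.g. -ENOENT) on failure, which the cls_* wrappers below propagate
+ // to their callers.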
+ return ret; +} + +int cls_call(cls_method_context_t hctx, const char *cls, const char *method, + char *indata, int datalen, + char **outdata, int *outdatalen) +{ +// FIXME, HACK: this is for testing only. Let's use dynamic linker to verify +// our depedencies + return 0; +} + +int cls_getxattr(cls_method_context_t hctx, + const char *name, + char **outdata, + int *outdatalen) +{ + return 0; +} + +int cls_setxattr(cls_method_context_t hctx, + const char *name, + const char *value, + int val_len) +{ + return 0; +} + +int cls_read(cls_method_context_t hctx, + int ofs, int len, + char **outdata, + int *outdatalen) +{ + return 0; +} + +int cls_get_request_origin(cls_method_context_t hctx, entity_inst_t *origin) +{ + assert(origin); + + try { + const auto& message = \ + reinterpret_cast<crimson::osd::OpsExecuter*>(hctx)->get_message(); + *origin = message.get_orig_source_inst(); + return 0; + } catch (crimson::osd::error& e) { + return -e.code().value(); + } +} + +int cls_cxx_create(cls_method_context_t hctx, const bool exclusive) +{ + OSDOp op{CEPH_OSD_OP_CREATE}; + op.op.flags = (exclusive ? CEPH_OSD_OP_FLAG_EXCL : 0); + return execute_osd_op(hctx, op); +} + +int cls_cxx_remove(cls_method_context_t hctx) +{ + OSDOp op{CEPH_OSD_OP_DELETE}; + return execute_osd_op(hctx, op); +} + +int cls_cxx_stat(cls_method_context_t hctx, uint64_t *size, time_t *mtime) +{ + OSDOp op{CEPH_OSD_OP_STAT}; + if (const auto ret = execute_osd_op(hctx, op); ret < 0) { + return ret; + } + utime_t ut; + uint64_t s; + try { + auto iter = op.outdata.cbegin(); + decode(s, iter); + decode(ut, iter); + } catch (buffer::error& err) { + return -EIO; + } + if (size) { + *size = s; + } + if (mtime) { + *mtime = ut.sec(); + } + return 0; +} + +int cls_cxx_stat2(cls_method_context_t hctx, + uint64_t *size, + ceph::real_time *mtime) +{ + OSDOp op{CEPH_OSD_OP_STAT}; + if (const int ret = execute_osd_op(hctx, op); ret < 0) { + return ret; + } + uint64_t dummy_size; + real_time dummy_mtime; + uint64_t& out_size = size ? *size : dummy_size; + real_time& out_mtime = mtime ? 
*mtime : dummy_mtime; + try { + auto iter = op.outdata.cbegin(); + decode(out_size, iter); + decode(out_mtime, iter); + return 0; + } catch (buffer::error& err) { + return -EIO; + } +} + +int cls_cxx_read2(cls_method_context_t hctx, + int ofs, + int len, + bufferlist *outbl, + uint32_t op_flags) +{ + OSDOp op{CEPH_OSD_OP_SYNC_READ}; + op.op.extent.offset = ofs; + op.op.extent.length = len; + op.op.flags = op_flags; + if (const auto ret = execute_osd_op(hctx, op); ret < 0) { + return ret; + } + *outbl = std::move(op.outdata); + return outbl->length(); +} + +int cls_cxx_write2(cls_method_context_t hctx, + int ofs, + int len, + bufferlist *inbl, + uint32_t op_flags) +{ + OSDOp op{CEPH_OSD_OP_WRITE}; + op.op.extent.offset = ofs; + op.op.extent.length = len; + op.op.flags = op_flags; + op.indata = *inbl; + return execute_osd_op(hctx, op); +} + +int cls_cxx_write_full(cls_method_context_t hctx, bufferlist * const inbl) +{ + OSDOp op{CEPH_OSD_OP_WRITEFULL}; + op.op.extent.offset = 0; + op.op.extent.length = inbl->length(); + op.indata = *inbl; + return execute_osd_op(hctx, op); +} + +int cls_cxx_replace(cls_method_context_t hctx, + int ofs, + int len, + bufferlist *inbl) +{ + { + OSDOp top{CEPH_OSD_OP_TRUNCATE}; + top.op.extent.offset = 0; + top.op.extent.length = 0; + if (const auto ret = execute_osd_op(hctx, top); ret < 0) { + return ret; + } + } + + { + OSDOp wop{CEPH_OSD_OP_WRITE}; + wop.op.extent.offset = ofs; + wop.op.extent.length = len; + wop.indata = *inbl; + if (const auto ret = execute_osd_op(hctx, wop); ret < 0) { + return ret; + } + } + return 0; +} + +int cls_cxx_truncate(cls_method_context_t hctx, int ofs) +{ + OSDOp op{CEPH_OSD_OP_TRUNCATE}; + op.op.extent.offset = ofs; + op.op.extent.length = 0; + return execute_osd_op(hctx, op); +} + +int cls_cxx_write_zero(cls_method_context_t hctx, int offset, int len) +{ + OSDOp op{CEPH_OSD_OP_ZERO}; + op.op.extent.offset = offset; + op.op.extent.length = len; + return execute_osd_op(hctx, op); +} + +int cls_cxx_getxattr(cls_method_context_t hctx, + const char *name, + bufferlist *outbl) +{ + OSDOp op{CEPH_OSD_OP_GETXATTR}; + op.op.xattr.name_len = strlen(name); + op.indata.append(name, op.op.xattr.name_len); + if (const auto ret = execute_osd_op(hctx, op); ret < 0) { + return ret; + } + *outbl = std::move(op.outdata); + return outbl->length(); +} + +int cls_cxx_getxattrs(cls_method_context_t hctx, + map<string, bufferlist> *attrset) +{ + OSDOp op{CEPH_OSD_OP_GETXATTRS}; + if (const int ret = execute_osd_op(hctx, op); ret < 0) { + return ret; + } + try { + auto iter = op.outdata.cbegin(); + decode(*attrset, iter); + } catch (buffer::error& err) { + return -EIO; + } + return 0; +} + +int cls_cxx_setxattr(cls_method_context_t hctx, + const char *name, + bufferlist *inbl) +{ + OSDOp op{CEPH_OSD_OP_SETXATTR}; + op.op.xattr.name_len = std::strlen(name); + op.op.xattr.value_len = inbl->length(); + op.indata.append(name, op.op.xattr.name_len); + op.indata.append(*inbl); + return execute_osd_op(hctx, op); +} + +int cls_cxx_snap_revert(cls_method_context_t hctx, snapid_t snapid) +{ + OSDOp op{CEPH_OSD_OP_ROLLBACK}; + op.op.snap.snapid = snapid; + return execute_osd_op(hctx, op); +} + +int cls_cxx_map_get_all_vals(cls_method_context_t hctx, + map<string, bufferlist>* vals, + bool *more) +{ + return 0; +} + +int cls_cxx_map_get_keys(cls_method_context_t hctx, + const std::string& start_obj, + const uint64_t max_to_get, + std::set<std::string>* const keys, + bool* const more) +{ + OSDOp op{CEPH_OSD_OP_OMAPGETKEYS}; + encode(start_obj, op.indata); + 
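+ // The OMAPGETKEYS payload is the start key followed by the maximum
+ // number of keys to return (encoded next); the reply decodes into the
+ // key set plus a `more` flag, and on success the call returns the
+ // number of keys fetched.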
encode(max_to_get, op.indata); + if (const auto ret = execute_osd_op(hctx, op); ret < 0) { + return ret; + } + try { + auto iter = op.outdata.cbegin(); + decode(*keys, iter); + decode(*more, iter); + } catch (buffer::error&) { + return -EIO; + } + return keys->size(); +} + +int cls_cxx_map_get_vals(cls_method_context_t hctx, + const std::string& start_obj, + const std::string& filter_prefix, + const uint64_t max_to_get, + std::map<std::string, ceph::bufferlist> *vals, + bool* const more) +{ + OSDOp op{CEPH_OSD_OP_OMAPGETVALS}; + encode(start_obj, op.indata); + encode(max_to_get, op.indata); + encode(filter_prefix, op.indata); + if (const auto ret = execute_osd_op(hctx, op); ret < 0) { + return ret; + } + try { + auto iter = op.outdata.cbegin(); + decode(*vals, iter); + decode(*more, iter); + } catch (buffer::error&) { + return -EIO; + } + return vals->size(); +} + +int cls_cxx_map_get_vals_by_keys(cls_method_context_t hctx, + const std::set<std::string> &keys, + std::map<std::string, ceph::bufferlist> *vals) +{ + OSDOp op{CEPH_OSD_OP_OMAPGETVALSBYKEYS}; + encode(keys, op.indata); + if (const auto ret = execute_osd_op(hctx, op); ret < 0) { + return ret; + } + try { + auto iter = op.outdata.cbegin(); + decode(*vals, iter); + } catch (buffer::error&) { + return -EIO; + } + return 0; +} + +int cls_cxx_map_read_header(cls_method_context_t hctx, bufferlist *outbl) +{ + OSDOp op{CEPH_OSD_OP_OMAPGETHEADER}; + if (const auto ret = execute_osd_op(hctx, op); ret < 0) { + return ret; + } + *outbl = std::move(op.outdata); + return 0; +} + +int cls_cxx_map_get_val(cls_method_context_t hctx, + const string &key, + bufferlist *outbl) +{ + OSDOp op{CEPH_OSD_OP_OMAPGETVALSBYKEYS}; + { + std::set<std::string> k{key}; + encode(k, op.indata); + } + if (const auto ret = execute_osd_op(hctx, op); ret < 0) { + return ret; + } + std::map<std::string, ceph::bufferlist> m; + try { + auto iter = op.outdata.cbegin(); + decode(m, iter); + } catch (buffer::error&) { + return -EIO; + } + if (auto iter = std::begin(m); iter != std::end(m)) { + *outbl = std::move(iter->second); + return 0; + } else { + return -ENOENT; + } +} + +int cls_cxx_map_set_val(cls_method_context_t hctx, + const string &key, + bufferlist *inbl) +{ + OSDOp op{CEPH_OSD_OP_OMAPSETVALS}; + { + std::map<std::string, ceph::bufferlist> m; + m[key] = *inbl; + encode(m, op.indata); + } + return execute_osd_op(hctx, op); +} + +int cls_cxx_map_set_vals(cls_method_context_t hctx, + const std::map<string, ceph::bufferlist> *map) +{ + OSDOp op{CEPH_OSD_OP_OMAPSETVALS}; + encode(*map, op.indata); + return execute_osd_op(hctx, op); +} + +int cls_cxx_map_clear(cls_method_context_t hctx) +{ + OSDOp op{CEPH_OSD_OP_OMAPCLEAR}; + return execute_osd_op(hctx, op); +} + +int cls_cxx_map_write_header(cls_method_context_t hctx, bufferlist *inbl) +{ + OSDOp op{CEPH_OSD_OP_OMAPSETHEADER}; + op.indata = std::move(*inbl); + return execute_osd_op(hctx, op); +} + +int cls_cxx_map_remove_range(cls_method_context_t hctx, + const std::string& key_begin, + const std::string& key_end) +{ + OSDOp op{CEPH_OSD_OP_OMAPRMKEYRANGE}; + encode(key_begin, op.indata); + encode(key_end, op.indata); + return execute_osd_op(hctx, op); +} + +int cls_cxx_map_remove_key(cls_method_context_t hctx, const string &key) +{ + OSDOp op{CEPH_OSD_OP_OMAPRMKEYS}; + std::vector<string> to_rm{key}; + encode(to_rm, op.indata); + return execute_osd_op(hctx, op); +} + +int cls_cxx_list_watchers(cls_method_context_t hctx, + obj_list_watch_response_t *watchers) +{ + OSDOp op{CEPH_OSD_OP_LIST_WATCHERS}; + if (const 
auto ret = execute_osd_op(hctx, op); ret < 0) { + return ret; + } + + try { + auto iter = op.outdata.cbegin(); + decode(*watchers, iter); + } catch (buffer::error&) { + return -EIO; + } + return 0; +} + +uint64_t cls_current_version(cls_method_context_t hctx) +{ + auto* ox = reinterpret_cast<crimson::osd::OpsExecuter*>(hctx); + return ox->get_last_user_version(); +} + + +int cls_current_subop_num(cls_method_context_t hctx) +{ + auto* ox = reinterpret_cast<crimson::osd::OpsExecuter*>(hctx); + // in contrast to classical OSD, crimson doesn't count OP_CALL and + // OP_STAT which seems fine regarding how the plugins we take care + // about use this part of API. + return ox->get_processed_rw_ops_num(); +} + +uint64_t cls_get_features(cls_method_context_t hctx) +{ + return 0; +} + +uint64_t cls_get_client_features(cls_method_context_t hctx) +{ + try { + const auto& message = \ + reinterpret_cast<crimson::osd::OpsExecuter*>(hctx)->get_message(); + return message.get_features(); + } catch (crimson::osd::error& e) { + return -e.code().value(); + } +} + +uint64_t cls_get_pool_stripe_width(cls_method_context_t hctx) +{ + auto* ox = reinterpret_cast<crimson::osd::OpsExecuter*>(hctx); + return ox->get_pool_stripe_width(); +} + +ceph_release_t cls_get_required_osd_release(cls_method_context_t hctx) +{ + // FIXME + return ceph_release_t::nautilus; +} + +ceph_release_t cls_get_min_compatible_client(cls_method_context_t hctx) +{ + // FIXME + return ceph_release_t::nautilus; +} + +const ConfigProxy& cls_get_config(cls_method_context_t hctx) +{ + return crimson::common::local_conf(); +} + +const object_info_t& cls_get_object_info(cls_method_context_t hctx) +{ + return reinterpret_cast<crimson::osd::OpsExecuter*>(hctx)->get_object_info(); +} + +int cls_get_snapset_seq(cls_method_context_t hctx, uint64_t *snap_seq) +{ + auto* ox = reinterpret_cast<crimson::osd::OpsExecuter*>(hctx); + auto obc = ox->get_obc(); + if (!obc->obs.exists || + (obc->obs.oi.is_whiteout() && + obc->ssc->snapset.clones.empty())) { + return -ENOENT; + } + *snap_seq = obc->ssc->snapset.seq; + return 0; +} + +int cls_cxx_chunk_write_and_set(cls_method_context_t hctx, + int ofs, + int len, + bufferlist *write_inbl, + uint32_t op_flags, + bufferlist *set_inbl, + int set_len) +{ + return 0; +} + +int cls_get_manifest_ref_count(cls_method_context_t hctx, string fp_oid) +{ + return 0; +} + +uint64_t cls_get_osd_min_alloc_size(cls_method_context_t hctx) { + // FIXME + return 4096; +} + +int cls_cxx_gather(cls_method_context_t hctx, const std::set<std::string> &src_objs, const std::string& pool, + const char *cls, const char *method, bufferlist& inbl) +{ + return 0; +} + +int cls_cxx_get_gathered_data(cls_method_context_t hctx, std::map<std::string, bufferlist> *results) +{ + return 0; +} + +// although at first glance the implementation looks the same as in +// the classical OSD, it's different b/c of how the dout macro expands. +int cls_log(int level, const char *format, ...) 
+{ + size_t size = 256; + va_list ap; + while (1) { + boost::container::small_vector<char, 256> buf(size); + va_start(ap, format); + int n = vsnprintf(buf.data(), size, format, ap); + va_end(ap); +#define MAX_SIZE 8196UL + if ((n > -1 && static_cast<size_t>(n) < size) || size > MAX_SIZE) { + dout(ceph::dout::need_dynamic(level)) << buf.data() << dendl; + return n; + } + size *= 2; + } +} diff --git a/src/crimson/osd/object_context.cc b/src/crimson/osd/object_context.cc new file mode 100644 index 000000000..1ea701c22 --- /dev/null +++ b/src/crimson/osd/object_context.cc @@ -0,0 +1,85 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "crimson/osd/object_context.h" + +#include <fmt/ranges.h> + +#include "common/Formatter.h" +#include "crimson/common/config_proxy.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_osd); + } +} + +namespace crimson::osd { + +ObjectContextRegistry::ObjectContextRegistry(crimson::common::ConfigProxy &conf) +{ + obc_lru.set_target_size(conf.get_val<uint64_t>("crimson_osd_obc_lru_size")); + conf.add_observer(this); +} + +ObjectContextRegistry::~ObjectContextRegistry() +{ + // purge the cache to avoid leaks and complains from LSan + obc_lru.set_target_size(0UL); +} + +const char** ObjectContextRegistry::get_tracked_conf_keys() const +{ + static const char* KEYS[] = { + "crimson_osd_obc_lru_size", + nullptr + }; + return KEYS; +} + +void ObjectContextRegistry::handle_conf_change( + const crimson::common::ConfigProxy& conf, + const std::set <std::string> &changed) +{ + obc_lru.set_target_size(conf.get_val<uint64_t>("crimson_osd_obc_lru_size")); +} + +std::optional<hobject_t> resolve_oid( + const SnapSet &ss, + const hobject_t &oid) +{ + logger().debug("{} oid.snap={},head snapset.seq={}", + __func__, oid.snap, ss.seq); + if (oid.snap > ss.seq) { + // Because oid.snap > ss.seq, we are trying to read from a snapshot + // taken after the most recent write to this object. Read from head. + return oid.get_head(); + } else { + // which clone would it be? + auto clone = std::lower_bound( + begin(ss.clones), end(ss.clones), + oid.snap); + if (clone == end(ss.clones)) { + // Doesn't exist, > last clone, < ss.seq + return std::nullopt; + } + auto citer = ss.clone_snaps.find(*clone); + // TODO: how do we want to handle this kind of logic error? 
+ ceph_assert(citer != ss.clone_snaps.end()); + + if (std::find( + citer->second.begin(), + citer->second.end(), + oid.snap) == citer->second.end()) { + logger().debug("{} {} does not contain {} -- DNE", + __func__, ss.clone_snaps, oid.snap); + return std::nullopt; + } else { + auto soid = oid; + soid.snap = *clone; + return std::optional<hobject_t>(soid); + } + } +} + +} diff --git a/src/crimson/osd/object_context.h b/src/crimson/osd/object_context.h new file mode 100644 index 000000000..8abf6d3f7 --- /dev/null +++ b/src/crimson/osd/object_context.h @@ -0,0 +1,276 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <map> +#include <optional> +#include <utility> +#include <seastar/core/shared_future.hh> +#include <seastar/core/shared_ptr.hh> + +#include "common/intrusive_lru.h" +#include "osd/object_state.h" +#include "crimson/common/exception.h" +#include "crimson/common/tri_mutex.h" +#include "crimson/osd/osd_operation.h" + +namespace ceph { + class Formatter; +} + +namespace crimson::common { + class ConfigProxy; +} + +namespace crimson::osd { + +class Watch; +struct SnapSetContext; +using SnapSetContextRef = boost::intrusive_ptr<SnapSetContext>; + +template <typename OBC> +struct obc_to_hoid { + using type = hobject_t; + const type &operator()(const OBC &obc) { + return obc.obs.oi.soid; + } +}; + +struct SnapSetContext : + public boost::intrusive_ref_counter<SnapSetContext, + boost::thread_unsafe_counter> +{ + hobject_t oid; + SnapSet snapset; + bool exists = false; + /** + * exists + * + * Because ObjectContext's are cached, we need to be able to express the case + * where the object to which a cached ObjectContext refers does not exist. + * ObjectContext's for yet-to-be-created objects are initialized with exists=false. + * The ObjectContext for a deleted object will have exists set to false until it falls + * out of cache (or another write recreates the object). + */ + explicit SnapSetContext(const hobject_t& o) : + oid(o), exists(false) {} +}; + +class ObjectContext : public ceph::common::intrusive_lru_base< + ceph::common::intrusive_lru_config< + hobject_t, ObjectContext, obc_to_hoid<ObjectContext>>> +{ +public: + ObjectState obs; + SnapSetContextRef ssc; + // the watch / notify machinery rather stays away from the hot and + // frequented paths. std::map is used mostly because of developer's + // convenience. 
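+ // A watcher is identified by its (cookie, client entity name) pair,
+ // the same keying the classical OSD uses for its watch state.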
+ using watch_key_t = std::pair<uint64_t, entity_name_t>; + std::map<watch_key_t, seastar::shared_ptr<crimson::osd::Watch>> watchers; + + ObjectContext(hobject_t hoid) : obs(std::move(hoid)) {} + + const hobject_t &get_oid() const { + return obs.oi.soid; + } + + bool is_head() const { + return get_oid().is_head(); + } + + hobject_t get_head_oid() const { + return get_oid().get_head(); + } + + const SnapSet &get_head_ss() const { + ceph_assert(is_head()); + ceph_assert(ssc); + return ssc->snapset; + } + + void set_head_state(ObjectState &&_obs, SnapSetContextRef &&_ssc) { + ceph_assert(is_head()); + obs = std::move(_obs); + ssc = std::move(_ssc); + } + + void set_clone_state(ObjectState &&_obs) { + ceph_assert(!is_head()); + obs = std::move(_obs); + } + + /// pass the provided exception to any waiting consumers of this ObjectContext + template<typename Exception> + void interrupt(Exception ex) { + lock.abort(std::move(ex)); + if (recovery_read_marker) { + drop_recovery_read(); + } + } + +private: + tri_mutex lock; + bool recovery_read_marker = false; + + template <typename Lock, typename Func> + auto _with_lock(Lock&& lock, Func&& func) { + Ref obc = this; + return lock.lock().then([&lock, func = std::forward<Func>(func), obc]() mutable { + return seastar::futurize_invoke(func).finally([&lock, obc] { + lock.unlock(); + }); + }); + } + + boost::intrusive::list_member_hook<> list_hook; + uint64_t list_link_cnt = 0; + +public: + + template <typename ListType> + void append_to(ListType& list) { + if (list_link_cnt++ == 0) { + list.push_back(*this); + } + } + + template <typename ListType> + void remove_from(ListType&& list) { + assert(list_link_cnt > 0); + if (--list_link_cnt == 0) { + list.erase(std::decay_t<ListType>::s_iterator_to(*this)); + } + } + + using obc_accessing_option_t = boost::intrusive::member_hook< + ObjectContext, + boost::intrusive::list_member_hook<>, + &ObjectContext::list_hook>; + + template<RWState::State Type, typename InterruptCond = void, typename Func> + auto with_lock(Func&& func) { + if constexpr (!std::is_void_v<InterruptCond>) { + auto wrapper = ::crimson::interruptible::interruptor<InterruptCond>::wrap_function(std::forward<Func>(func)); + switch (Type) { + case RWState::RWWRITE: + return _with_lock(lock.for_write(), std::move(wrapper)); + case RWState::RWREAD: + return _with_lock(lock.for_read(), std::move(wrapper)); + case RWState::RWEXCL: + return _with_lock(lock.for_excl(), std::move(wrapper)); + case RWState::RWNONE: + return seastar::futurize_invoke(std::move(wrapper)); + default: + assert(0 == "noop"); + } + } else { + switch (Type) { + case RWState::RWWRITE: + return _with_lock(lock.for_write(), std::forward<Func>(func)); + case RWState::RWREAD: + return _with_lock(lock.for_read(), std::forward<Func>(func)); + case RWState::RWEXCL: + return _with_lock(lock.for_excl(), std::forward<Func>(func)); + case RWState::RWNONE: + return seastar::futurize_invoke(std::forward<Func>(func)); + default: + assert(0 == "noop"); + } + } + } + template<RWState::State Type, typename InterruptCond = void, typename Func> + auto with_promoted_lock(Func&& func) { + if constexpr (!std::is_void_v<InterruptCond>) { + auto wrapper = ::crimson::interruptible::interruptor<InterruptCond>::wrap_function(std::forward<Func>(func)); + switch (Type) { + case RWState::RWWRITE: + return _with_lock(lock.excl_from_write(), std::move(wrapper)); + case RWState::RWREAD: + return _with_lock(lock.excl_from_read(), std::move(wrapper)); + case RWState::RWEXCL: + return 
_with_lock(lock.excl_from_excl(), std::move(wrapper)); + case RWState::RWNONE: + return _with_lock(lock.for_excl(), std::move(wrapper)); + default: + assert(0 == "noop"); + } + } else { + switch (Type) { + case RWState::RWWRITE: + return _with_lock(lock.excl_from_write(), std::forward<Func>(func)); + case RWState::RWREAD: + return _with_lock(lock.excl_from_read(), std::forward<Func>(func)); + case RWState::RWEXCL: + return _with_lock(lock.excl_from_excl(), std::forward<Func>(func)); + case RWState::RWNONE: + return _with_lock(lock.for_excl(), std::forward<Func>(func)); + default: + assert(0 == "noop"); + } + } + } + + bool empty() const { + return !lock.is_acquired(); + } + bool is_request_pending() const { + return lock.is_acquired(); + } + + bool get_recovery_read() { + if (lock.try_lock_for_read()) { + recovery_read_marker = true; + return true; + } else { + return false; + } + } + void wait_recovery_read() { + assert(lock.get_readers() > 0); + recovery_read_marker = true; + } + void drop_recovery_read() { + assert(recovery_read_marker); + recovery_read_marker = false; + } + bool maybe_get_excl() { + return lock.try_lock_for_excl(); + } +}; +using ObjectContextRef = ObjectContext::Ref; + +class ObjectContextRegistry : public md_config_obs_t { + ObjectContext::lru_t obc_lru; + +public: + ObjectContextRegistry(crimson::common::ConfigProxy &conf); + ~ObjectContextRegistry(); + + std::pair<ObjectContextRef, bool> get_cached_obc(const hobject_t &hoid) { + return obc_lru.get_or_create(hoid); + } + ObjectContextRef maybe_get_cached_obc(const hobject_t &hoid) { + return obc_lru.get(hoid); + } + + void clear_range(const hobject_t &from, + const hobject_t &to) { + obc_lru.clear_range(from, to); + } + + template <class F> + void for_each(F&& f) { + obc_lru.for_each(std::forward<F>(f)); + } + + const char** get_tracked_conf_keys() const final; + void handle_conf_change(const crimson::common::ConfigProxy& conf, + const std::set <std::string> &changed) final; +}; + +std::optional<hobject_t> resolve_oid(const SnapSet &ss, + const hobject_t &oid); + +} // namespace crimson::osd diff --git a/src/crimson/osd/object_context_loader.cc b/src/crimson/osd/object_context_loader.cc new file mode 100644 index 000000000..0a4d74c0d --- /dev/null +++ b/src/crimson/osd/object_context_loader.cc @@ -0,0 +1,232 @@ +#include "crimson/osd/object_context_loader.h" +#include "osd/osd_types_fmt.h" + +SET_SUBSYS(osd); + +namespace crimson::osd { + +using crimson::common::local_conf; + + template<RWState::State State> + ObjectContextLoader::load_obc_iertr::future<> + ObjectContextLoader::with_head_obc(ObjectContextRef obc, + bool existed, + with_obc_func_t&& func) + { + LOG_PREFIX(ObjectContextLoader::with_head_obc); + DEBUGDPP("object {}", dpp, obc->get_oid()); + assert(obc->is_head()); + obc->append_to(obc_set_accessing); + return obc->with_lock<State, IOInterruptCondition>( + [existed=existed, obc=obc, func=std::move(func), this] { + return get_or_load_obc<State>(obc, existed) + .safe_then_interruptible( + [func = std::move(func)](auto obc) { + return std::move(func)(std::move(obc)); + }); + }).finally([FNAME, this, obc=std::move(obc)] { + DEBUGDPP("released object {}", dpp, obc->get_oid()); + obc->remove_from(obc_set_accessing); + }); + } + + template<RWState::State State> + ObjectContextLoader::load_obc_iertr::future<> + ObjectContextLoader::with_clone_obc(hobject_t oid, + with_obc_func_t&& func) + { + LOG_PREFIX(ObjectContextLoader::with_clone_obc); + assert(!oid.is_head()); + return with_obc<RWState::RWREAD>( + 
oid.get_head(), + [FNAME, oid, func=std::move(func), this](auto head) mutable + -> load_obc_iertr::future<> { + if (!head->obs.exists) { + ERRORDPP("head doesn't exist for object {}", dpp, head->obs.oi.soid); + return load_obc_iertr::future<>{ + crimson::ct_error::enoent::make() + }; + } + return this->with_clone_obc_only<State>(std::move(head), + oid, + std::move(func)); + }); + } + + template<RWState::State State> + ObjectContextLoader::load_obc_iertr::future<> + ObjectContextLoader::with_clone_obc_only(ObjectContextRef head, + hobject_t oid, + with_obc_func_t&& func) + { + LOG_PREFIX(ObjectContextLoader::with_clone_obc_only); + auto coid = resolve_oid(head->get_head_ss(), oid); + if (!coid) { + ERRORDPP("clone {} not found", dpp, oid); + return load_obc_iertr::future<>{ + crimson::ct_error::enoent::make() + }; + } + auto [clone, existed] = obc_registry.get_cached_obc(*coid); + return clone->template with_lock<State, IOInterruptCondition>( + [existed=existed, clone=std::move(clone), + func=std::move(func), head=std::move(head), this]() + -> load_obc_iertr::future<> { + auto loaded = get_or_load_obc<State>(clone, existed); + return loaded.safe_then_interruptible( + [func = std::move(func)](auto clone) { + return std::move(func)(std::move(clone)); + }); + }); + } + + template<RWState::State State> + ObjectContextLoader::load_obc_iertr::future<> + ObjectContextLoader::with_clone_obc_direct( + hobject_t oid, + with_both_obc_func_t&& func) + { + LOG_PREFIX(ObjectContextLoader::with_clone_obc_direct); + assert(!oid.is_head()); + return with_obc<RWState::RWREAD>( + oid.get_head(), + [FNAME, oid, func=std::move(func), this](auto head) mutable + -> load_obc_iertr::future<> { + if (!head->obs.exists) { + ERRORDPP("head doesn't exist for object {}", dpp, head->obs.oi.soid); + return load_obc_iertr::future<>{ + crimson::ct_error::enoent::make() + }; + } +#ifndef NDEBUG + auto &ss = head->get_head_ss(); + auto cit = std::find( + std::begin(ss.clones), std::end(ss.clones), oid.snap); + assert(cit != std::end(ss.clones)); +#endif + auto [clone, existed] = obc_registry.get_cached_obc(oid); + return clone->template with_lock<State, IOInterruptCondition>( + [existed=existed, clone=std::move(clone), + func=std::move(func), head=std::move(head), this]() + -> load_obc_iertr::future<> { + auto loaded = get_or_load_obc<State>(clone, existed); + return loaded.safe_then_interruptible( + [func = std::move(func), head=std::move(head)](auto clone) { + return std::move(func)(std::move(head), std::move(clone)); + }); + }); + }); + } + + template<RWState::State State> + ObjectContextLoader::load_obc_iertr::future<> + ObjectContextLoader::with_obc(hobject_t oid, + with_obc_func_t&& func) + { + if (oid.is_head()) { + auto [obc, existed] = + obc_registry.get_cached_obc(std::move(oid)); + return with_head_obc<State>(std::move(obc), + existed, + std::move(func)); + } else { + return with_clone_obc<State>(oid, std::move(func)); + } + } + + ObjectContextLoader::load_obc_iertr::future<ObjectContextRef> + ObjectContextLoader::load_obc(ObjectContextRef obc) + { + LOG_PREFIX(ObjectContextLoader::load_obc); + return backend.load_metadata(obc->get_oid()) + .safe_then_interruptible( + [FNAME, this, obc=std::move(obc)](auto md) + -> load_obc_ertr::future<ObjectContextRef> { + const hobject_t& oid = md->os.oi.soid; + DEBUGDPP("loaded obs {} for {}", dpp, md->os.oi, oid); + if (oid.is_head()) { + if (!md->ssc) { + ERRORDPP("oid {} missing snapsetcontext", dpp, oid); + return crimson::ct_error::object_corrupted::make(); + } + 
obc->set_head_state(std::move(md->os), + std::move(md->ssc)); + } else { + obc->set_clone_state(std::move(md->os)); + } + DEBUGDPP("returning obc {} for {}", dpp, obc->obs.oi, obc->obs.oi.soid); + return load_obc_ertr::make_ready_future<ObjectContextRef>(obc); + }); + } + + template<RWState::State State> + ObjectContextLoader::load_obc_iertr::future<ObjectContextRef> + ObjectContextLoader::get_or_load_obc(ObjectContextRef obc, + bool existed) + { + LOG_PREFIX(ObjectContextLoader::get_or_load_obc); + auto loaded = + load_obc_iertr::make_ready_future<ObjectContextRef>(obc); + if (existed) { + DEBUGDPP("cache hit on {}", dpp, obc->get_oid()); + } else { + DEBUGDPP("cache miss on {}", dpp, obc->get_oid()); + loaded = + obc->template with_promoted_lock<State, IOInterruptCondition>( + [obc, this] { + return load_obc(obc); + }); + } + return loaded; + } + + ObjectContextLoader::load_obc_iertr::future<> + ObjectContextLoader::reload_obc(ObjectContext& obc) const + { + LOG_PREFIX(ObjectContextLoader::reload_obc); + assert(obc.is_head()); + return backend.load_metadata(obc.get_oid()) + .safe_then_interruptible<false>( + [FNAME, this, &obc](auto md)-> load_obc_ertr::future<> { + DEBUGDPP("reloaded obs {} for {}", dpp, md->os.oi, obc.get_oid()); + if (!md->ssc) { + ERRORDPP("oid {} missing snapsetcontext", dpp, obc.get_oid()); + return crimson::ct_error::object_corrupted::make(); + } + obc.set_head_state(std::move(md->os), std::move(md->ssc)); + return load_obc_ertr::now(); + }); + } + + void ObjectContextLoader::notify_on_change(bool is_primary) + { + LOG_PREFIX(ObjectContextLoader::notify_on_change); + DEBUGDPP("is_primary: {}", dpp, is_primary); + for (auto& obc : obc_set_accessing) { + DEBUGDPP("interrupting obc: {}", dpp, obc.get_oid()); + obc.interrupt(::crimson::common::actingset_changed(is_primary)); + } + } + + // explicitly instantiate the used instantiations + template ObjectContextLoader::load_obc_iertr::future<> + ObjectContextLoader::with_obc<RWState::RWNONE>(hobject_t, + with_obc_func_t&&); + + template ObjectContextLoader::load_obc_iertr::future<> + ObjectContextLoader::with_obc<RWState::RWREAD>(hobject_t, + with_obc_func_t&&); + + template ObjectContextLoader::load_obc_iertr::future<> + ObjectContextLoader::with_obc<RWState::RWWRITE>(hobject_t, + with_obc_func_t&&); + + template ObjectContextLoader::load_obc_iertr::future<> + ObjectContextLoader::with_obc<RWState::RWEXCL>(hobject_t, + with_obc_func_t&&); + + template ObjectContextLoader::load_obc_iertr::future<> + ObjectContextLoader::with_clone_obc_direct<RWState::RWWRITE>( + hobject_t, + with_both_obc_func_t&&); +} diff --git a/src/crimson/osd/object_context_loader.h b/src/crimson/osd/object_context_loader.h new file mode 100644 index 000000000..3ab7f6ad8 --- /dev/null +++ b/src/crimson/osd/object_context_loader.h @@ -0,0 +1,87 @@ +#pragma once + +#include <seastar/core/future.hh> +#include "crimson/common/errorator.h" +#include "crimson/osd/object_context.h" +#include "crimson/osd/pg_backend.h" + +namespace crimson::osd { +class ObjectContextLoader { +public: + using obc_accessing_list_t = boost::intrusive::list< + ObjectContext, + ObjectContext::obc_accessing_option_t>; + + ObjectContextLoader( + ObjectContextRegistry& _obc_services, + PGBackend& _backend, + DoutPrefixProvider& dpp) + : obc_registry{_obc_services}, + backend{_backend}, + dpp{dpp} + {} + + using load_obc_ertr = crimson::errorator< + crimson::ct_error::enoent, + crimson::ct_error::object_corrupted>; + using load_obc_iertr = + 
::crimson::interruptible::interruptible_errorator< + ::crimson::osd::IOInterruptCondition, + load_obc_ertr>; + + using with_obc_func_t = + std::function<load_obc_iertr::future<> (ObjectContextRef)>; + + using with_both_obc_func_t = + std::function<load_obc_iertr::future<> (ObjectContextRef, ObjectContextRef)>; + + // Use this variant by default + template<RWState::State State> + load_obc_iertr::future<> with_obc(hobject_t oid, + with_obc_func_t&& func); + + // Use this variant in the case where the head object + // obc is already locked and only the clone obc is needed. + // Avoid nesting with_head_obc() calls by using with_clone_obc() + // with an already locked head. + template<RWState::State State> + load_obc_iertr::future<> with_clone_obc_only(ObjectContextRef head, + hobject_t oid, + with_obc_func_t&& func); + + // Use this variant in the case where both the head + // object *and* the matching clone object are being used + // in func. + template<RWState::State State> + load_obc_iertr::future<> with_clone_obc_direct( + hobject_t oid, + with_both_obc_func_t&& func); + + load_obc_iertr::future<> reload_obc(ObjectContext& obc) const; + + void notify_on_change(bool is_primary); + +private: + ObjectContextRegistry& obc_registry; + PGBackend& backend; + DoutPrefixProvider& dpp; + obc_accessing_list_t obc_set_accessing; + + template<RWState::State State> + load_obc_iertr::future<> with_clone_obc(hobject_t oid, + with_obc_func_t&& func); + + template<RWState::State State> + load_obc_iertr::future<> with_head_obc(ObjectContextRef obc, + bool existed, + with_obc_func_t&& func); + + template<RWState::State State> + load_obc_iertr::future<ObjectContextRef> + get_or_load_obc(ObjectContextRef obc, + bool existed); + + load_obc_iertr::future<ObjectContextRef> + load_obc(ObjectContextRef obc); +}; +} diff --git a/src/crimson/osd/ops_executer.cc b/src/crimson/osd/ops_executer.cc new file mode 100644 index 000000000..040870203 --- /dev/null +++ b/src/crimson/osd/ops_executer.cc @@ -0,0 +1,1461 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "ops_executer.h" + +#include <boost/range/adaptor/filtered.hpp> +#include <boost/range/adaptor/map.hpp> +#include <boost/range/adaptor/transformed.hpp> +#include <boost/range/algorithm_ext/push_back.hpp> +#include <boost/range/algorithm/max_element.hpp> +#include <boost/range/numeric.hpp> + +#include <fmt/format.h> +#include <fmt/ostream.h> + +#include <seastar/core/thread.hh> + +#include "crimson/osd/exceptions.h" +#include "crimson/osd/pg.h" +#include "crimson/osd/watch.h" +#include "osd/ClassHandler.h" +#include "osd/SnapMapper.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_osd); + } +} + +namespace crimson::osd { + +OpsExecuter::call_ierrorator::future<> OpsExecuter::do_op_call(OSDOp& osd_op) +{ + std::string cname, mname; + ceph::bufferlist indata; + try { + auto bp = std::begin(osd_op.indata); + bp.copy(osd_op.op.cls.class_len, cname); + bp.copy(osd_op.op.cls.method_len, mname); + bp.copy(osd_op.op.cls.indata_len, indata); + } catch (buffer::error&) { + logger().warn("call unable to decode class + method + indata"); + return crimson::ct_error::invarg::make(); + } + + // NOTE: opening a class can actually result in dlopen(), and thus + // blocking the entire reactor. Thankfully to ClassHandler's cache + // this is supposed to be extremely infrequent. 
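  //
  // The potentially long-running class method itself is not run inline:
  // further down, method->exec() is wrapped in interruptor::async(), which
  // (like plain seastar::async()) runs the callable inside a seastar thread
  // so blocking work does not stall the reactor. A minimal sketch of that
  // pattern, with a hypothetical blocking_call():
  //
  //   return seastar::async([] {
  //     // blocking or CPU-heavy work is acceptable inside a seastar thread
  //     return blocking_call();
  //   });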
+ ClassHandler::ClassData* cls; + int r = ClassHandler::get_instance().open_class(cname, &cls); + if (r) { + logger().warn("class {} open got {}", cname, cpp_strerror(r)); + if (r == -ENOENT) { + return crimson::ct_error::operation_not_supported::make(); + } else if (r == -EPERM) { + // propagate permission errors + return crimson::ct_error::permission_denied::make(); + } + return crimson::ct_error::input_output_error::make(); + } + + ClassHandler::ClassMethod* method = cls->get_method(mname); + if (!method) { + logger().warn("call method {}.{} does not exist", cname, mname); + return crimson::ct_error::operation_not_supported::make(); + } + + const auto flags = method->get_flags(); + if (!obc->obs.exists && (flags & CLS_METHOD_WR) == 0) { + return crimson::ct_error::enoent::make(); + } + +#if 0 + if (flags & CLS_METHOD_WR) { + ctx->user_modify = true; + } +#endif + + logger().debug("calling method {}.{}, num_read={}, num_write={}", + cname, mname, num_read, num_write); + const auto prev_rd = num_read; + const auto prev_wr = num_write; + return interruptor::async( + [this, method, indata=std::move(indata)]() mutable { + ceph::bufferlist outdata; + auto cls_context = reinterpret_cast<cls_method_context_t>(this); + const auto ret = method->exec(cls_context, indata, outdata); + return std::make_pair(ret, std::move(outdata)); + } + ).then_interruptible( + [this, prev_rd, prev_wr, &osd_op, flags] + (auto outcome) -> call_errorator::future<> { + auto& [ret, outdata] = outcome; + osd_op.rval = ret; + + logger().debug("do_op_call: method returned ret={}, outdata.length()={}" + " while num_read={}, num_write={}", + ret, outdata.length(), num_read, num_write); + if (num_read > prev_rd && !(flags & CLS_METHOD_RD)) { + logger().error("method tried to read object but is not marked RD"); + osd_op.rval = -EIO; + return crimson::ct_error::input_output_error::make(); + } + if (num_write > prev_wr && !(flags & CLS_METHOD_WR)) { + logger().error("method tried to update object but is not marked WR"); + osd_op.rval = -EIO; + return crimson::ct_error::input_output_error::make(); + } + // ceph-osd has this implemented in `PrimaryLogPG::execute_ctx`, + // grep for `ignore_out_data`. + using crimson::common::local_conf; + if (op_info.allows_returnvec() && + op_info.may_write() && + ret >= 0 && + outdata.length() > local_conf()->osd_max_write_op_reply_len) { + // the justification of this limit it to not inflate the pg log. + // that's the reason why we don't worry about pure reads. + logger().error("outdata overflow due to .length()={}, limit={}", + outdata.length(), + local_conf()->osd_max_write_op_reply_len); + osd_op.rval = -EOVERFLOW; + return crimson::ct_error::value_too_large::make(); + } + // for write calls we never return data expect errors or RETURNVEC. + // please refer cls/cls_hello.cc to details. + if (!op_info.may_write() || op_info.allows_returnvec() || ret < 0) { + osd_op.op.extent.length = outdata.length(); + osd_op.outdata.claim_append(outdata); + } + if (ret < 0) { + return crimson::stateful_ec{ + std::error_code(-ret, std::generic_category()) }; + } else { + return seastar::now(); + } + } + ); +} + +static watch_info_t create_watch_info(const OSDOp& osd_op, + const OpsExecuter::ExecutableMessage& msg, + entity_addr_t peer_addr) +{ + using crimson::common::local_conf; + const uint32_t timeout = + osd_op.op.watch.timeout == 0 ? 
local_conf()->osd_client_watch_timeout + : osd_op.op.watch.timeout; + return { + osd_op.op.watch.cookie, + timeout, + peer_addr + }; +} + +OpsExecuter::watch_ierrorator::future<> OpsExecuter::do_op_watch_subop_watch( + OSDOp& osd_op, + ObjectState& os, + ceph::os::Transaction& txn) +{ + logger().debug("{}", __func__); + struct connect_ctx_t { + ObjectContext::watch_key_t key; + crimson::net::ConnectionRef conn; + watch_info_t info; + + connect_ctx_t( + const OSDOp& osd_op, + const ExecutableMessage& msg, + crimson::net::ConnectionRef conn) + : key(osd_op.op.watch.cookie, msg.get_reqid().name), + conn(conn), + info(create_watch_info(osd_op, msg, conn->get_peer_addr())) { + } + }; + + return with_effect_on_obc( + connect_ctx_t{ osd_op, get_message(), conn }, + [&](auto& ctx) { + const auto& entity = ctx.key.second; + auto [it, emplaced] = + os.oi.watchers.try_emplace(ctx.key, std::move(ctx.info)); + if (emplaced) { + logger().info("registered new watch {} by {}", it->second, entity); + txn.nop(); + } else { + logger().info("found existing watch {} by {}", it->second, entity); + } + return seastar::now(); + }, + [](auto&& ctx, ObjectContextRef obc, Ref<PG> pg) { + assert(pg); + auto [it, emplaced] = obc->watchers.try_emplace(ctx.key, nullptr); + if (emplaced) { + const auto& [cookie, entity] = ctx.key; + it->second = crimson::osd::Watch::create( + obc, ctx.info, entity, std::move(pg)); + logger().info("op_effect: added new watcher: {}", ctx.key); + } else { + logger().info("op_effect: found existing watcher: {}", ctx.key); + } + return it->second->connect(std::move(ctx.conn), true /* will_ping */); + } + ); +} + +OpsExecuter::watch_ierrorator::future<> OpsExecuter::do_op_watch_subop_reconnect( + OSDOp& osd_op, + ObjectState& os, + ceph::os::Transaction& txn) +{ + const entity_name_t& entity = get_message().get_reqid().name; + const auto& cookie = osd_op.op.watch.cookie; + if (!os.oi.watchers.count(std::make_pair(cookie, entity))) { + return crimson::ct_error::not_connected::make(); + } else { + logger().info("found existing watch by {}", entity); + return do_op_watch_subop_watch(osd_op, os, txn); + } +} + +OpsExecuter::watch_ierrorator::future<> OpsExecuter::do_op_watch_subop_unwatch( + OSDOp& osd_op, + ObjectState& os, + ceph::os::Transaction& txn) +{ + logger().info("{}", __func__); + + struct disconnect_ctx_t { + ObjectContext::watch_key_t key; + disconnect_ctx_t(const OSDOp& osd_op, const ExecutableMessage& msg) + : key(osd_op.op.watch.cookie, msg.get_reqid().name) { + } + }; + return with_effect_on_obc(disconnect_ctx_t{ osd_op, get_message() }, + [&] (auto& ctx) { + const auto& entity = ctx.key.second; + if (auto nh = os.oi.watchers.extract(ctx.key); !nh.empty()) { + logger().info("removed watch {} by {}", nh.mapped(), entity); + txn.nop(); + } else { + logger().info("can't remove: no watch by {}", entity); + } + return seastar::now(); + }, + [] (auto&& ctx, ObjectContextRef obc, Ref<PG>) { + if (auto nh = obc->watchers.extract(ctx.key); !nh.empty()) { + return seastar::do_with(std::move(nh.mapped()), + [ctx](auto&& watcher) { + logger().info("op_effect: disconnect watcher {}", ctx.key); + return watcher->remove(); + }); + } else { + logger().info("op_effect: disconnect failed to find watcher {}", ctx.key); + return seastar::now(); + } + }); +} + +OpsExecuter::watch_ierrorator::future<> OpsExecuter::do_op_watch_subop_ping( + OSDOp& osd_op, + ObjectState& os, + ceph::os::Transaction& txn) +{ + const entity_name_t& entity = get_message().get_reqid().name; + const auto& cookie = 
osd_op.op.watch.cookie; + const auto key = std::make_pair(cookie, entity); + + // Note: WATCH with PING doesn't cause may_write() to return true, + // so if there is nothing else in the transaction, this is going + // to run do_osd_op_effects, but not write out a log entry */ + if (!os.oi.watchers.count(key)) { + return crimson::ct_error::not_connected::make(); + } + auto it = obc->watchers.find(key); + if (it == std::end(obc->watchers) || !it->second->is_connected()) { + return crimson::ct_error::timed_out::make(); + } + logger().info("found existing watch by {}", entity); + it->second->got_ping(ceph_clock_now()); + return seastar::now(); +} + +OpsExecuter::watch_ierrorator::future<> OpsExecuter::do_op_watch( + OSDOp& osd_op, + ObjectState& os, + ceph::os::Transaction& txn) +{ + logger().debug("{}", __func__); + if (!os.exists) { + return crimson::ct_error::enoent::make(); + } + switch (osd_op.op.watch.op) { + case CEPH_OSD_WATCH_OP_WATCH: + return do_op_watch_subop_watch(osd_op, os, txn); + case CEPH_OSD_WATCH_OP_RECONNECT: + return do_op_watch_subop_reconnect(osd_op, os, txn); + case CEPH_OSD_WATCH_OP_PING: + return do_op_watch_subop_ping(osd_op, os, txn); + case CEPH_OSD_WATCH_OP_UNWATCH: + return do_op_watch_subop_unwatch(osd_op, os, txn); + case CEPH_OSD_WATCH_OP_LEGACY_WATCH: + logger().warn("ignoring CEPH_OSD_WATCH_OP_LEGACY_WATCH"); + return crimson::ct_error::invarg::make(); + } + logger().warn("unrecognized WATCH subop: {}", osd_op.op.watch.op); + return crimson::ct_error::invarg::make(); +} + +static uint64_t get_next_notify_id(epoch_t e) +{ + // FIXME + static std::uint64_t next_notify_id = 0; + return (((uint64_t)e) << 32) | ((uint64_t)(next_notify_id++)); +} + +OpsExecuter::watch_ierrorator::future<> OpsExecuter::do_op_notify( + OSDOp& osd_op, + const ObjectState& os) +{ + logger().debug("{}, msg epoch: {}", __func__, get_message().get_map_epoch()); + + if (!os.exists) { + return crimson::ct_error::enoent::make(); + } + struct notify_ctx_t { + crimson::net::ConnectionRef conn; + notify_info_t ninfo; + const uint64_t client_gid; + const epoch_t epoch; + + notify_ctx_t(const ExecutableMessage& msg, + crimson::net::ConnectionRef conn) + : conn(conn), + client_gid(msg.get_reqid().name.num()), + epoch(msg.get_map_epoch()) { + } + }; + return with_effect_on_obc( + notify_ctx_t{ get_message(), conn }, + [&](auto& ctx) { + try { + auto bp = osd_op.indata.cbegin(); + uint32_t ver; // obsolete + ceph::decode(ver, bp); + ceph::decode(ctx.ninfo.timeout, bp); + ceph::decode(ctx.ninfo.bl, bp); + } catch (const buffer::error&) { + ctx.ninfo.timeout = 0; + } + if (!ctx.ninfo.timeout) { + using crimson::common::local_conf; + ctx.ninfo.timeout = local_conf()->osd_default_notify_timeout; + } + ctx.ninfo.notify_id = get_next_notify_id(ctx.epoch); + ctx.ninfo.cookie = osd_op.op.notify.cookie; + // return our unique notify id to the client + ceph::encode(ctx.ninfo.notify_id, osd_op.outdata); + return seastar::now(); + }, + [](auto&& ctx, ObjectContextRef obc, Ref<PG>) { + auto alive_watchers = obc->watchers | boost::adaptors::map_values + | boost::adaptors::filtered( + [] (const auto& w) { + // FIXME: filter as for the `is_ping` in `Watch::start_notify` + return w->is_alive(); + }); + return crimson::osd::Notify::create_n_propagate( + std::begin(alive_watchers), + std::end(alive_watchers), + std::move(ctx.conn), + ctx.ninfo, + ctx.client_gid, + obc->obs.oi.user_version); + } + ); +} + +OpsExecuter::watch_ierrorator::future<> OpsExecuter::do_op_list_watchers( + OSDOp& osd_op, + const ObjectState& 
os) +{ + logger().debug("{}", __func__); + + obj_list_watch_response_t response; + for (const auto& [key, info] : os.oi.watchers) { + logger().debug("{}: key cookie={}, entity={}", + __func__, key.first, key.second); + assert(key.first == info.cookie); + assert(key.second.is_client()); + response.entries.emplace_back(watch_item_t{ + key.second, info.cookie, info.timeout_seconds, info.addr}); + } + response.encode(osd_op.outdata, get_message().get_features()); + return watch_ierrorator::now(); +} + +OpsExecuter::watch_ierrorator::future<> OpsExecuter::do_op_notify_ack( + OSDOp& osd_op, + const ObjectState& os) +{ + logger().debug("{}", __func__); + + struct notifyack_ctx_t { + const entity_name_t entity; + uint64_t watch_cookie; + uint64_t notify_id; + ceph::bufferlist reply_bl; + + notifyack_ctx_t(const ExecutableMessage& msg) + : entity(msg.get_reqid().name) { + } + }; + return with_effect_on_obc(notifyack_ctx_t{ get_message() }, + [&] (auto& ctx) -> watch_errorator::future<> { + try { + auto bp = osd_op.indata.cbegin(); + ceph::decode(ctx.notify_id, bp); + ceph::decode(ctx.watch_cookie, bp); + if (!bp.end()) { + ceph::decode(ctx.reply_bl, bp); + } + } catch (const buffer::error&) { + // here we behave differently than ceph-osd. For historical reasons, + // it falls back to using `osd_op.op.watch.cookie` as `ctx.notify_id`. + // crimson just returns EINVAL if the data cannot be decoded. + return crimson::ct_error::invarg::make(); + } + return watch_errorator::now(); + }, + [] (auto&& ctx, ObjectContextRef obc, Ref<PG>) { + logger().info("notify_ack watch_cookie={}, notify_id={}", + ctx.watch_cookie, ctx.notify_id); + return seastar::do_for_each(obc->watchers, + [ctx=std::move(ctx)] (auto& kv) { + const auto& [key, watchp] = kv; + static_assert( + std::is_same_v<std::decay_t<decltype(watchp)>, + seastar::shared_ptr<crimson::osd::Watch>>); + auto& [cookie, entity] = key; + if (ctx.entity != entity) { + logger().debug("skipping watch {}; entity name {} != {}", + key, entity, ctx.entity); + return seastar::now(); + } + if (ctx.watch_cookie != cookie) { + logger().debug("skipping watch {}; cookie {} != {}", + key, ctx.watch_cookie, cookie); + return seastar::now(); + } + logger().info("acking notify on watch {}", key); + return watchp->notify_ack(ctx.notify_id, ctx.reply_bl); + }); + }); +} + +// Defined here because there is a circular dependency between OpsExecuter and PG +template <class Func> +auto OpsExecuter::do_const_op(Func&& f) { + // TODO: pass backend as read-only + return std::forward<Func>(f)(pg->get_backend(), std::as_const(obc->obs)); +} + +// Defined here because there is a circular dependency between OpsExecuter and PG +template <class Func> +auto OpsExecuter::do_write_op(Func&& f, OpsExecuter::modified_by m) { + ++num_write; + if (!osd_op_params) { + osd_op_params.emplace(); + fill_op_params_bump_pg_version(); + } + user_modify = (m == modified_by::user); + return std::forward<Func>(f)(pg->get_backend(), obc->obs, txn); +} +OpsExecuter::call_errorator::future<> OpsExecuter::do_assert_ver( + OSDOp& osd_op, + const ObjectState& os) +{ + if (!osd_op.op.assert_ver.ver) { + return crimson::ct_error::invarg::make(); + } else if (osd_op.op.assert_ver.ver < os.oi.user_version) { + return crimson::ct_error::erange::make(); + } else if (osd_op.op.assert_ver.ver > os.oi.user_version) { + return crimson::ct_error::value_too_large::make(); + } + return seastar::now(); +} + +OpsExecuter::list_snaps_iertr::future<> OpsExecuter::do_list_snaps( + OSDOp& osd_op, + const ObjectState& os, + 
const SnapSet& ss) +{ + obj_list_snap_response_t resp; + resp.clones.reserve(ss.clones.size() + 1); + for (auto &clone: ss.clones) { + clone_info ci; + ci.cloneid = clone; + + { + auto p = ss.clone_snaps.find(clone); + if (p == ss.clone_snaps.end()) { + logger().error( + "OpsExecutor::do_list_snaps: {} has inconsistent " + "clone_snaps, missing clone {}", + os.oi.soid, + clone); + return crimson::ct_error::invarg::make(); + } + ci.snaps.reserve(p->second.size()); + ci.snaps.insert(ci.snaps.end(), p->second.rbegin(), p->second.rend()); + } + + { + auto p = ss.clone_overlap.find(clone); + if (p == ss.clone_overlap.end()) { + logger().error( + "OpsExecutor::do_list_snaps: {} has inconsistent " + "clone_overlap, missing clone {}", + os.oi.soid, + clone); + return crimson::ct_error::invarg::make(); + } + ci.overlap.reserve(p->second.num_intervals()); + ci.overlap.insert(ci.overlap.end(), p->second.begin(), p->second.end()); + } + + { + auto p = ss.clone_size.find(clone); + if (p == ss.clone_size.end()) { + logger().error( + "OpsExecutor::do_list_snaps: {} has inconsistent " + "clone_size, missing clone {}", + os.oi.soid, + clone); + return crimson::ct_error::invarg::make(); + } + ci.size = p->second; + } + resp.clones.push_back(std::move(ci)); + } + + if (!os.oi.is_whiteout()) { + clone_info ci; + ci.cloneid = CEPH_NOSNAP; + ci.size = os.oi.size; + resp.clones.push_back(std::move(ci)); + } + resp.seq = ss.seq; + logger().error( + "OpsExecutor::do_list_snaps: {}, resp.clones.size(): {}", + os.oi.soid, + resp.clones.size()); + resp.encode(osd_op.outdata); + return read_ierrorator::now(); +} + +OpsExecuter::interruptible_errorated_future<OpsExecuter::osd_op_errorator> +OpsExecuter::execute_op(OSDOp& osd_op) +{ + return do_execute_op(osd_op).handle_error_interruptible( + osd_op_errorator::all_same_way([&osd_op](auto e, auto&& e_raw) + -> OpsExecuter::osd_op_errorator::future<> { + // All ops except for CMPEXT should have rval set to -e.value(), + // CMPEXT sets rval itself and shouldn't be overridden. + if (e.value() != ct_error::cmp_fail_error_value) { + osd_op.rval = -e.value(); + } + if ((osd_op.op.flags & CEPH_OSD_OP_FLAG_FAILOK) && + e.value() != EAGAIN && e.value() != EINPROGRESS) { + return osd_op_errorator::now(); + } else { + return std::move(e_raw); + } + })); +} + +OpsExecuter::interruptible_errorated_future<OpsExecuter::osd_op_errorator> +OpsExecuter::do_execute_op(OSDOp& osd_op) +{ + // TODO: dispatch via call table? + // TODO: we might want to find a way to unify both input and output + // of each op. 
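  //
  // A rough sketch of the call-table idea from the TODO above (the handler
  // type and the table are assumptions, not existing code). It also shows why
  // the second TODO matters: the per-op handlers only fit a single table once
  // their input/output types are unified.
  //
  //   using op_handler_t =
  //     interruptible_errorated_future<osd_op_errorator> (OpsExecuter::*)(OSDOp&);
  //   static const std::map<int, op_handler_t> op_table{
  //     {CEPH_OSD_OP_CALL, &OpsExecuter::do_op_call},  // assumes unified signatures
  //     // ... one entry per supported op code
  //   };
  //   if (auto found = op_table.find(osd_op.op.op); found != op_table.end()) {
  //     return (this->*found->second)(osd_op);
  //   }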
+ logger().debug( + "handling op {} on object {}", + ceph_osd_op_name(osd_op.op.op), + get_target()); + switch (const ceph_osd_op& op = osd_op.op; op.op) { + case CEPH_OSD_OP_SYNC_READ: + [[fallthrough]]; + case CEPH_OSD_OP_READ: + return do_read_op([this, &osd_op](auto& backend, const auto& os) { + return backend.read(os, osd_op, delta_stats); + }); + case CEPH_OSD_OP_SPARSE_READ: + return do_read_op([this, &osd_op](auto& backend, const auto& os) { + return backend.sparse_read(os, osd_op, delta_stats); + }); + case CEPH_OSD_OP_CHECKSUM: + return do_read_op([&osd_op](auto& backend, const auto& os) { + return backend.checksum(os, osd_op); + }); + case CEPH_OSD_OP_CMPEXT: + return do_read_op([&osd_op](auto& backend, const auto& os) { + return backend.cmp_ext(os, osd_op); + }); + case CEPH_OSD_OP_GETXATTR: + return do_read_op([this, &osd_op](auto& backend, const auto& os) { + return backend.getxattr(os, osd_op, delta_stats); + }); + case CEPH_OSD_OP_GETXATTRS: + return do_read_op([this, &osd_op](auto& backend, const auto& os) { + return backend.get_xattrs(os, osd_op, delta_stats); + }); + case CEPH_OSD_OP_CMPXATTR: + return do_read_op([this, &osd_op](auto& backend, const auto& os) { + return backend.cmp_xattr(os, osd_op, delta_stats); + }); + case CEPH_OSD_OP_RMXATTR: + return do_write_op([&osd_op](auto& backend, auto& os, auto& txn) { + return backend.rm_xattr(os, osd_op, txn); + }); + case CEPH_OSD_OP_CREATE: + return do_write_op([this, &osd_op](auto& backend, auto& os, auto& txn) { + return backend.create(os, osd_op, txn, delta_stats); + }); + case CEPH_OSD_OP_WRITE: + return do_write_op([this, &osd_op](auto& backend, auto& os, auto& txn) { + return backend.write(os, osd_op, txn, *osd_op_params, delta_stats); + }); + case CEPH_OSD_OP_WRITESAME: + return do_write_op([this, &osd_op](auto& backend, auto& os, auto& txn) { + return backend.write_same(os, osd_op, txn, *osd_op_params, delta_stats); + }); + case CEPH_OSD_OP_WRITEFULL: + return do_write_op([this, &osd_op](auto& backend, auto& os, auto& txn) { + return backend.writefull(os, osd_op, txn, *osd_op_params, delta_stats); + }); + case CEPH_OSD_OP_ROLLBACK: + return do_write_op([this, &head=obc, + &osd_op](auto& backend, auto& os, auto& txn) { + return backend.rollback(os, osd_op, txn, *osd_op_params, delta_stats, + head, pg->obc_loader); + }); + case CEPH_OSD_OP_APPEND: + return do_write_op([this, &osd_op](auto& backend, auto& os, auto& txn) { + return backend.append(os, osd_op, txn, *osd_op_params, delta_stats); + }); + case CEPH_OSD_OP_TRUNCATE: + return do_write_op([this, &osd_op](auto& backend, auto& os, auto& txn) { + // FIXME: rework needed. Move this out to do_write_op(), introduce + // do_write_op_no_user_modify()... 
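      // One reading of that FIXME, hedged: do_write_op() already accepts a
      // modified_by argument (the WATCH op below passes modified_by::sys), so
      // the suggested do_write_op_no_user_modify() would presumably be a thin
      // wrapper along these lines (hypothetical, not existing code):
      //
      //   template <class Func>
      //   auto do_write_op_no_user_modify(Func&& f) {
      //     return do_write_op(std::forward<Func>(f), modified_by::sys);
      //   }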
+ return backend.truncate(os, osd_op, txn, *osd_op_params, delta_stats); + }); + case CEPH_OSD_OP_ZERO: + return do_write_op([this, &osd_op](auto& backend, auto& os, auto& txn) { + return backend.zero(os, osd_op, txn, *osd_op_params, delta_stats); + }); + case CEPH_OSD_OP_SETALLOCHINT: + return do_write_op([this, &osd_op](auto& backend, auto& os, auto& txn) { + return backend.set_allochint(os, osd_op, txn, delta_stats); + }); + case CEPH_OSD_OP_SETXATTR: + return do_write_op([this, &osd_op](auto& backend, auto& os, auto& txn) { + return backend.setxattr(os, osd_op, txn, delta_stats); + }); + case CEPH_OSD_OP_DELETE: + { + bool whiteout = false; + if (!obc->ssc->snapset.clones.empty() || + (snapc.snaps.size() && // there are snaps + snapc.snaps[0] > obc->ssc->snapset.seq)) { // existing obj is old + logger().debug("{} has or will have clones, will whiteout {}", + __func__, obc->obs.oi.soid); + whiteout = true; + } + return do_write_op([this, whiteout](auto& backend, auto& os, auto& txn) { + return backend.remove(os, txn, delta_stats, whiteout); + }); + } + case CEPH_OSD_OP_CALL: + return this->do_op_call(osd_op); + case CEPH_OSD_OP_STAT: + // note: stat does not require RD + return do_const_op([this, &osd_op] (/* const */auto& backend, const auto& os) { + return backend.stat(os, osd_op, delta_stats); + }); + + case CEPH_OSD_OP_TMAPPUT: + return do_write_op([this, &osd_op](auto& backend, auto& os, auto& txn) { + return backend.tmapput(os, osd_op, txn, delta_stats, *osd_op_params); + }); + case CEPH_OSD_OP_TMAPUP: + return do_write_op([this, &osd_op](auto& backend, auto& os, auto &txn) { + return backend.tmapup(os, osd_op, txn, delta_stats, *osd_op_params); + }); + case CEPH_OSD_OP_TMAPGET: + return do_read_op([this, &osd_op](auto& backend, const auto& os) { + return backend.tmapget(os, osd_op, delta_stats); + }); + + // OMAP + case CEPH_OSD_OP_OMAPGETKEYS: + return do_read_op([this, &osd_op](auto& backend, const auto& os) { + return backend.omap_get_keys(os, osd_op, delta_stats); + }); + case CEPH_OSD_OP_OMAPGETVALS: + return do_read_op([this, &osd_op](auto& backend, const auto& os) { + return backend.omap_get_vals(os, osd_op, delta_stats); + }); + case CEPH_OSD_OP_OMAP_CMP: + return do_read_op([this, &osd_op](auto& backend, const auto& os) { + return backend.omap_cmp(os, osd_op, delta_stats); + }); + case CEPH_OSD_OP_OMAPGETHEADER: + return do_read_op([this, &osd_op](auto& backend, const auto& os) { + return backend.omap_get_header(os, osd_op, delta_stats); + }); + case CEPH_OSD_OP_OMAPGETVALSBYKEYS: + return do_read_op([this, &osd_op](auto& backend, const auto& os) { + return backend.omap_get_vals_by_keys(os, osd_op, delta_stats); + }); + case CEPH_OSD_OP_OMAPSETVALS: +#if 0 + if (!pg.get_pgpool().info.supports_omap()) { + return crimson::ct_error::operation_not_supported::make(); + } +#endif + return do_write_op([this, &osd_op](auto& backend, auto& os, auto& txn) { + return backend.omap_set_vals(os, osd_op, txn, *osd_op_params, delta_stats); + }); + case CEPH_OSD_OP_OMAPSETHEADER: +#if 0 + if (!pg.get_pgpool().info.supports_omap()) { + return crimson::ct_error::operation_not_supported::make(); + } +#endif + return do_write_op([this, &osd_op](auto& backend, auto& os, auto& txn) { + return backend.omap_set_header(os, osd_op, txn, *osd_op_params, + delta_stats); + }); + case CEPH_OSD_OP_OMAPRMKEYRANGE: +#if 0 + if (!pg.get_pgpool().info.supports_omap()) { + return crimson::ct_error::operation_not_supported::make(); + } +#endif + return do_write_op([this, &osd_op](auto& backend, auto& os, 
auto& txn) { + return backend.omap_remove_range(os, osd_op, txn, delta_stats); + }); + case CEPH_OSD_OP_OMAPRMKEYS: + /** TODO: Implement supports_omap() + if (!pg.get_pgpool().info.supports_omap()) { + return crimson::ct_error::operation_not_supported::make(); + }*/ + return do_write_op([&osd_op](auto& backend, auto& os, auto& txn) { + return backend.omap_remove_key(os, osd_op, txn); + }); + case CEPH_OSD_OP_OMAPCLEAR: + return do_write_op([this, &osd_op](auto& backend, auto& os, auto& txn) { + return backend.omap_clear(os, osd_op, txn, *osd_op_params, delta_stats); + }); + + // watch/notify + case CEPH_OSD_OP_WATCH: + return do_write_op([this, &osd_op](auto& backend, auto& os, auto& txn) { + return do_op_watch(osd_op, os, txn); + }, modified_by::sys); + case CEPH_OSD_OP_LIST_WATCHERS: + return do_read_op([this, &osd_op](auto&, const auto& os) { + return do_op_list_watchers(osd_op, os); + }); + case CEPH_OSD_OP_NOTIFY: + return do_read_op([this, &osd_op](auto&, const auto& os) { + return do_op_notify(osd_op, os); + }); + case CEPH_OSD_OP_NOTIFY_ACK: + return do_read_op([this, &osd_op](auto&, const auto& os) { + return do_op_notify_ack(osd_op, os); + }); + case CEPH_OSD_OP_ASSERT_VER: + return do_read_op([this, &osd_op](auto&, const auto& os) { + return do_assert_ver(osd_op, os); + }); + case CEPH_OSD_OP_LIST_SNAPS: + return do_snapset_op([this, &osd_op](const auto &os, const auto &ss) { + return do_list_snaps(osd_op, os, ss); + }); + + default: + logger().warn("unknown op {}", ceph_osd_op_name(op.op)); + throw std::runtime_error( + fmt::format("op '{}' not supported", ceph_osd_op_name(op.op))); + } +} + +void OpsExecuter::fill_op_params_bump_pg_version() +{ + osd_op_params->req_id = msg->get_reqid(); + osd_op_params->mtime = msg->get_mtime(); + osd_op_params->at_version = pg->next_version(); + osd_op_params->pg_trim_to = pg->get_pg_trim_to(); + osd_op_params->min_last_complete_ondisk = pg->get_min_last_complete_ondisk(); + osd_op_params->last_complete = pg->get_info().last_complete; +} + +std::vector<pg_log_entry_t> OpsExecuter::prepare_transaction( + const std::vector<OSDOp>& ops) +{ + // let's ensure we don't need to inform SnapMapper about this particular + // entry. + assert(obc->obs.oi.soid.snap >= CEPH_MAXSNAP); + std::vector<pg_log_entry_t> log_entries; + log_entries.emplace_back( + obc->obs.exists ? + pg_log_entry_t::MODIFY : pg_log_entry_t::DELETE, + obc->obs.oi.soid, + osd_op_params->at_version, + obc->obs.oi.version, + osd_op_params->user_modify ? osd_op_params->at_version.version : 0, + osd_op_params->req_id, + osd_op_params->mtime, + op_info.allows_returnvec() && !ops.empty() ? 
ops.back().rval.code : 0); + if (op_info.allows_returnvec()) { + // also the per-op values are recorded in the pg log + log_entries.back().set_op_returns(ops); + logger().debug("{} op_returns: {}", + __func__, log_entries.back().op_returns); + } + log_entries.back().clean_regions = std::move(osd_op_params->clean_regions); + return log_entries; +} + +OpsExecuter::interruptible_future<> OpsExecuter::snap_map_remove( + const hobject_t& soid, + SnapMapper& snap_mapper, + OSDriver& osdriver, + ceph::os::Transaction& txn) +{ + logger().debug("{}: soid {}", __func__, soid); + return interruptor::async([soid, &snap_mapper, + _t=osdriver.get_transaction(&txn)]() mutable { + const auto r = snap_mapper.remove_oid(soid, &_t); + if (r) { + logger().error("{}: remove_oid {} failed with {}", + __func__, soid, r); + } + // On removal tolerate missing key corruption + assert(r == 0 || r == -ENOENT); + }); +} + +OpsExecuter::interruptible_future<> OpsExecuter::snap_map_modify( + const hobject_t& soid, + const std::set<snapid_t>& snaps, + SnapMapper& snap_mapper, + OSDriver& osdriver, + ceph::os::Transaction& txn) +{ + logger().debug("{}: soid {}, snaps {}", __func__, soid, snaps); + return interruptor::async([soid, snaps, &snap_mapper, + _t=osdriver.get_transaction(&txn)]() mutable { + assert(std::size(snaps) > 0); + [[maybe_unused]] const auto r = snap_mapper.update_snaps( + soid, snaps, 0, &_t); + assert(r == 0); + }); +} + +OpsExecuter::interruptible_future<> OpsExecuter::snap_map_clone( + const hobject_t& soid, + const std::set<snapid_t>& snaps, + SnapMapper& snap_mapper, + OSDriver& osdriver, + ceph::os::Transaction& txn) +{ + logger().debug("{}: soid {}, snaps {}", __func__, soid, snaps); + return interruptor::async([soid, snaps, &snap_mapper, + _t=osdriver.get_transaction(&txn)]() mutable { + assert(std::size(snaps) > 0); + snap_mapper.add_oid(soid, snaps, &_t); + }); +} + +// Defined here because there is a circular dependency between OpsExecuter and PG +uint32_t OpsExecuter::get_pool_stripe_width() const { + return pg->get_pgpool().info.get_stripe_width(); +} + +// Defined here because there is a circular dependency between OpsExecuter and PG +version_t OpsExecuter::get_last_user_version() const +{ + return pg->get_last_user_version(); +} + +std::unique_ptr<OpsExecuter::CloningContext> OpsExecuter::execute_clone( + const SnapContext& snapc, + const ObjectState& initial_obs, + const SnapSet& initial_snapset, + PGBackend& backend, + ceph::os::Transaction& txn) +{ + const hobject_t& soid = initial_obs.oi.soid; + logger().debug("{} {} snapset={} snapc={}", + __func__, soid, + initial_snapset, snapc); + + auto cloning_ctx = std::make_unique<CloningContext>(); + cloning_ctx->new_snapset = initial_snapset; + + // clone object, the snap field is set to the seq of the SnapContext + // at its creation. 
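  //
  // Worked example (values invented for illustration): with
  // snapc = {seq=5, snaps=[5,4,2]} and initial_snapset.seq = 2, the clone id
  // below becomes coid.snap = 5, and cloned_snaps collects every snap newer
  // than the snapset's seq -- i.e. [5,4] -- since snapc.snaps is sorted in
  // descending order and the find_if() stops at the first snap <= seq.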
+ hobject_t coid = soid; + coid.snap = snapc.seq; + + // existing snaps are stored in descending order in snapc, + // cloned_snaps vector will hold all the snaps stored until snapset.seq + const std::vector<snapid_t> cloned_snaps = [&] { + auto last = std::find_if( + std::begin(snapc.snaps), std::end(snapc.snaps), + [&](snapid_t snap_id) { return snap_id <= initial_snapset.seq; }); + return std::vector<snapid_t>{std::begin(snapc.snaps), last}; + }(); + + auto [snap_oi, clone_obc] = prepare_clone(coid); + // make clone + backend.clone(snap_oi, initial_obs, clone_obc->obs, txn); + + delta_stats.num_objects++; + if (snap_oi.is_omap()) { + delta_stats.num_objects_omap++; + } + delta_stats.num_object_clones++; + // newsnapset is obc's ssc + cloning_ctx->new_snapset.clones.push_back(coid.snap); + cloning_ctx->new_snapset.clone_size[coid.snap] = initial_obs.oi.size; + cloning_ctx->new_snapset.clone_snaps[coid.snap] = cloned_snaps; + + // clone_overlap should contain an entry for each clone + // (an empty interval_set if there is no overlap) + auto &overlap = cloning_ctx->new_snapset.clone_overlap[coid.snap]; + if (initial_obs.oi.size) { + overlap.insert(0, initial_obs.oi.size); + } + + // log clone + logger().debug("cloning v {} to {} v {} snaps={} snapset={}", + initial_obs.oi.version, coid, + osd_op_params->at_version, cloned_snaps, cloning_ctx->new_snapset); + + cloning_ctx->log_entry = { + pg_log_entry_t::CLONE, + coid, + snap_oi.version, + initial_obs.oi.version, + initial_obs.oi.user_version, + osd_reqid_t(), + initial_obs.oi.mtime, // will be replaced in `apply_to()` + 0 + }; + encode(cloned_snaps, cloning_ctx->log_entry.snaps); + + // TODO: update most recent clone_overlap and usage stats + return cloning_ctx; +} + +void OpsExecuter::CloningContext::apply_to( + std::vector<pg_log_entry_t>& log_entries, + ObjectContext& processed_obc) && +{ + log_entry.mtime = processed_obc.obs.oi.mtime; + log_entries.emplace_back(std::move(log_entry)); + processed_obc.ssc->snapset = std::move(new_snapset); +} + +OpsExecuter::interruptible_future<std::vector<pg_log_entry_t>> +OpsExecuter::flush_clone_metadata( + std::vector<pg_log_entry_t>&& log_entries, + SnapMapper& snap_mapper, + OSDriver& osdriver, + ceph::os::Transaction& txn) +{ + assert(!txn.empty()); + auto maybe_snap_mapped = interruptor::now(); + if (cloning_ctx) { + std::move(*cloning_ctx).apply_to(log_entries, *obc); + const auto& coid = log_entries.back().soid; + const auto& cloned_snaps = obc->ssc->snapset.clone_snaps[coid.snap]; + maybe_snap_mapped = snap_map_clone( + coid, + std::set<snapid_t>{std::begin(cloned_snaps), std::end(cloned_snaps)}, + snap_mapper, + osdriver, + txn); + } + if (snapc.seq > obc->ssc->snapset.seq) { + // update snapset with latest snap context + obc->ssc->snapset.seq = snapc.seq; + obc->ssc->snapset.snaps.clear(); + } + logger().debug("{} done, initial snapset={}, new snapset={}", + __func__, obc->obs.oi.soid, obc->ssc->snapset); + return std::move( + maybe_snap_mapped + ).then_interruptible([log_entries=std::move(log_entries)]() mutable { + return interruptor::make_ready_future<std::vector<pg_log_entry_t>>( + std::move(log_entries)); + }); +} + +// TODO: make this static +std::pair<object_info_t, ObjectContextRef> OpsExecuter::prepare_clone( + const hobject_t& coid) +{ + object_info_t static_snap_oi(coid); + static_snap_oi.version = pg->next_version(); + static_snap_oi.prior_version = obc->obs.oi.version; + static_snap_oi.copy_user_bits(obc->obs.oi); + if (static_snap_oi.is_whiteout()) { + // clone shouldn't be 
marked as whiteout + static_snap_oi.clear_flag(object_info_t::FLAG_WHITEOUT); + } + + ObjectContextRef clone_obc; + if (pg->is_primary()) { + // lookup_or_create + auto [c_obc, existed] = + pg->obc_registry.get_cached_obc(std::move(coid)); + assert(!existed); + c_obc->obs.oi = static_snap_oi; + c_obc->obs.exists = true; + c_obc->ssc = obc->ssc; + logger().debug("clone_obc: {}", c_obc->obs.oi); + clone_obc = std::move(c_obc); + } + return std::make_pair(std::move(static_snap_oi), std::move(clone_obc)); +} + +void OpsExecuter::apply_stats() +{ + pg->get_peering_state().apply_op_stats(get_target(), delta_stats); + pg->publish_stats_to_osd(); +} + +OpsExecuter::OpsExecuter(Ref<PG> pg, + ObjectContextRef _obc, + const OpInfo& op_info, + abstracted_msg_t&& msg, + crimson::net::ConnectionRef conn, + const SnapContext& _snapc) + : pg(std::move(pg)), + obc(std::move(_obc)), + op_info(op_info), + msg(std::move(msg)), + conn(conn), + snapc(_snapc) +{ + if (op_info.may_write() && should_clone(*obc, snapc)) { + do_write_op([this](auto& backend, auto& os, auto& txn) { + cloning_ctx = execute_clone(std::as_const(snapc), + std::as_const(obc->obs), + std::as_const(obc->ssc->snapset), + backend, + txn); + }); + } +} + +static inline std::unique_ptr<const PGLSFilter> get_pgls_filter( + const std::string& type, + bufferlist::const_iterator& iter) +{ + // storing non-const PGLSFilter for the sake of ::init() + std::unique_ptr<PGLSFilter> filter; + if (type.compare("plain") == 0) { + filter = std::make_unique<PGLSPlainFilter>(); + } else { + std::size_t dot = type.find("."); + if (dot == type.npos || dot == 0 || dot == type.size() - 1) { + throw crimson::osd::invalid_argument{}; + } + + const std::string class_name = type.substr(0, dot); + const std::string filter_name = type.substr(dot + 1); + ClassHandler::ClassData *cls = nullptr; + int r = ClassHandler::get_instance().open_class(class_name, &cls); + if (r != 0) { + logger().warn("can't open class {}: {}", class_name, cpp_strerror(r)); + if (r == -EPERM) { + // propogate permission error + throw crimson::osd::permission_denied{}; + } else { + throw crimson::osd::invalid_argument{}; + } + } else { + ceph_assert(cls); + } + + ClassHandler::ClassFilter * const class_filter = cls->get_filter(filter_name); + if (class_filter == nullptr) { + logger().warn("can't find filter {} in class {}", filter_name, class_name); + throw crimson::osd::invalid_argument{}; + } + + filter.reset(class_filter->fn()); + if (!filter) { + // Object classes are obliged to return us something, but let's + // give an error rather than asserting out. + logger().warn("buggy class {} failed to construct filter {}", + class_name, filter_name); + throw crimson::osd::invalid_argument{}; + } + } + + ceph_assert(filter); + int r = filter->init(iter); + if (r < 0) { + logger().warn("error initializing filter {}: {}", type, cpp_strerror(r)); + throw crimson::osd::invalid_argument{}; + } + + // successfully constructed and initialized, return it. 
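  //
  // Summary of the `type` strings handled above, with hypothetical example
  // names (only "plain" is built in; anything else must be a
  // "<class>.<filter>" pair resolvable through ClassHandler):
  //
  //   get_pgls_filter("plain", iter);          // built-in PGLSPlainFilter
  //   get_pgls_filter("hello.example", iter);  // filter "example" from class
  //                                            //   "hello" (names hypothetical)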
+ return filter; +} + +static PG::interruptible_future<hobject_t> pgls_filter( + const PGLSFilter& filter, + const PGBackend& backend, + const hobject_t& sobj) +{ + if (const auto xattr = filter.get_xattr(); !xattr.empty()) { + logger().debug("pgls_filter: filter is interested in xattr={} for obj={}", + xattr, sobj); + return backend.getxattr(sobj, std::move(xattr)).safe_then_interruptible( + [&filter, sobj] (ceph::bufferlist val) { + logger().debug("pgls_filter: got xvalue for obj={}", sobj); + + const bool filtered = filter.filter(sobj, val); + return seastar::make_ready_future<hobject_t>(filtered ? sobj : hobject_t{}); + }, PGBackend::get_attr_errorator::all_same_way([&filter, sobj] { + logger().debug("pgls_filter: got error for obj={}", sobj); + + if (filter.reject_empty_xattr()) { + return seastar::make_ready_future<hobject_t>(); + } + ceph::bufferlist val; + const bool filtered = filter.filter(sobj, val); + return seastar::make_ready_future<hobject_t>(filtered ? sobj : hobject_t{}); + })); + } else { + ceph::bufferlist empty_lvalue_bl; + const bool filtered = filter.filter(sobj, empty_lvalue_bl); + return seastar::make_ready_future<hobject_t>(filtered ? sobj : hobject_t{}); + } +} + +static PG::interruptible_future<ceph::bufferlist> do_pgnls_common( + const hobject_t& pg_start, + const hobject_t& pg_end, + const PGBackend& backend, + const hobject_t& lower_bound, + const std::string& nspace, + const uint64_t limit, + const PGLSFilter* const filter) +{ + if (!(lower_bound.is_min() || + lower_bound.is_max() || + (lower_bound >= pg_start && lower_bound < pg_end))) { + // this should only happen with a buggy client. + throw std::invalid_argument("outside of PG bounds"); + } + + return backend.list_objects(lower_bound, limit).then_interruptible( + [&backend, filter, nspace](auto&& ret) + -> PG::interruptible_future<std::tuple<std::vector<hobject_t>, hobject_t>> { + auto& [objects, next] = ret; + auto in_my_namespace = [&nspace](const hobject_t& obj) { + using crimson::common::local_conf; + if (obj.get_namespace() == local_conf()->osd_hit_set_namespace) { + return false; + } else if (nspace == librados::all_nspaces) { + return true; + } else { + return obj.get_namespace() == nspace; + } + }; + auto to_pglsed = [&backend, filter] (const hobject_t& obj) + -> PG::interruptible_future<hobject_t> { + // this transformation looks costly. However, I don't have any + // reason to think PGLS* operations are critical for, let's say, + // general performance. + // + // from tchaikov: "another way is to use seastar::map_reduce(), + // to 1) save the effort to filter the already filtered objects + // 2) avoid the space to keep the tuple<bool, object> even if + // the object is filtered out". 
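      //
      // That map_reduce() approach is not hypothetical elsewhere in this
      // file: do_pgls_common() further down already uses
      // PG::interruptor::map_reduce() with a mapper, an initial accumulator
      // and a reducer, roughly:
      //
      //   interruptor::map_reduce(std::move(objects),
      //     [&](const hobject_t& obj) { /* filter or pass through */ },
      //     entries_t{},
      //     [](entries_t acc, hobject_t obj) {
      //       if (!obj.is_min()) acc.emplace_back(obj.oid, obj.get_key());
      //       return acc;
      //     });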
+ if (filter) { + return pgls_filter(*filter, backend, obj); + } else { + return seastar::make_ready_future<hobject_t>(obj); + } + }; + + auto range = objects | boost::adaptors::filtered(in_my_namespace) + | boost::adaptors::transformed(to_pglsed); + logger().debug("do_pgnls_common: finishing the 1st stage of pgls"); + return seastar::when_all_succeed(std::begin(range), + std::end(range)).then( + [next=std::move(next)] (auto items) mutable { + // the sole purpose of this chaining is to pass `next` to 2nd + // stage altogether with items + logger().debug("do_pgnls_common: 1st done"); + return seastar::make_ready_future< + std::tuple<std::vector<hobject_t>, hobject_t>>( + std::move(items), std::move(next)); + }); + }).then_interruptible( + [pg_end] (auto&& ret) { + auto& [items, next] = ret; + auto is_matched = [] (const auto& obj) { + return !obj.is_min(); + }; + auto to_entry = [] (const auto& obj) { + return librados::ListObjectImpl{ + obj.get_namespace(), obj.oid.name, obj.get_key() + }; + }; + + pg_nls_response_t response; + boost::push_back(response.entries, items | boost::adaptors::filtered(is_matched) + | boost::adaptors::transformed(to_entry)); + response.handle = next.is_max() ? pg_end : next; + ceph::bufferlist out; + encode(response, out); + logger().debug("do_pgnls_common: response.entries.size()= {}", + response.entries.size()); + return seastar::make_ready_future<ceph::bufferlist>(std::move(out)); + }); +} + +static PG::interruptible_future<> do_pgnls( + const PG& pg, + const std::string& nspace, + OSDOp& osd_op) +{ + hobject_t lower_bound; + try { + ceph::decode(lower_bound, osd_op.indata); + } catch (const buffer::error&) { + throw std::invalid_argument("unable to decode PGNLS handle"); + } + const auto pg_start = pg.get_pgid().pgid.get_hobj_start(); + const auto pg_end = \ + pg.get_pgid().pgid.get_hobj_end(pg.get_pgpool().info.get_pg_num()); + return do_pgnls_common(pg_start, + pg_end, + pg.get_backend(), + lower_bound, + nspace, + osd_op.op.pgls.count, + nullptr /* no filter */) + .then_interruptible([&osd_op](bufferlist bl) { + osd_op.outdata = std::move(bl); + return seastar::now(); + }); +} + +static PG::interruptible_future<> do_pgnls_filtered( + const PG& pg, + const std::string& nspace, + OSDOp& osd_op) +{ + std::string cname, mname, type; + auto bp = osd_op.indata.cbegin(); + try { + ceph::decode(cname, bp); + ceph::decode(mname, bp); + ceph::decode(type, bp); + } catch (const buffer::error&) { + throw crimson::osd::invalid_argument{}; + } + + auto filter = get_pgls_filter(type, bp); + + hobject_t lower_bound; + try { + lower_bound.decode(bp); + } catch (const buffer::error&) { + throw std::invalid_argument("unable to decode PGNLS_FILTER description"); + } + + logger().debug("{}: cname={}, mname={}, type={}, lower_bound={}, filter={}", + __func__, cname, mname, type, lower_bound, + static_cast<const void*>(filter.get())); + return seastar::do_with(std::move(filter), + [&, lower_bound=std::move(lower_bound)](auto&& filter) { + const auto pg_start = pg.get_pgid().pgid.get_hobj_start(); + const auto pg_end = pg.get_pgid().pgid.get_hobj_end(pg.get_pgpool().info.get_pg_num()); + return do_pgnls_common(pg_start, + pg_end, + pg.get_backend(), + lower_bound, + nspace, + osd_op.op.pgls.count, + filter.get()) + .then_interruptible([&osd_op](bufferlist bl) { + osd_op.outdata = std::move(bl); + return seastar::now(); + }); + }); +} + +static PG::interruptible_future<ceph::bufferlist> do_pgls_common( + const hobject_t& pg_start, + const hobject_t& pg_end, + const PGBackend& 
backend, + const hobject_t& lower_bound, + const std::string& nspace, + const uint64_t limit, + const PGLSFilter* const filter) +{ + if (!(lower_bound.is_min() || + lower_bound.is_max() || + (lower_bound >= pg_start && lower_bound < pg_end))) { + // this should only happen with a buggy client. + throw std::invalid_argument("outside of PG bounds"); + } + + using entries_t = decltype(pg_ls_response_t::entries); + return backend.list_objects(lower_bound, limit).then_interruptible( + [&backend, filter, nspace](auto&& ret) { + auto& [objects, next] = ret; + return PG::interruptor::when_all( + PG::interruptor::map_reduce(std::move(objects), + [&backend, filter, nspace](const hobject_t& obj) + -> PG::interruptible_future<hobject_t>{ + if (obj.get_namespace() == nspace) { + if (filter) { + return pgls_filter(*filter, backend, obj); + } else { + return seastar::make_ready_future<hobject_t>(obj); + } + } else { + return seastar::make_ready_future<hobject_t>(); + } + }, + entries_t{}, + [](entries_t entries, hobject_t obj) { + if (!obj.is_min()) { + entries.emplace_back(obj.oid, obj.get_key()); + } + return entries; + }), + seastar::make_ready_future<hobject_t>(next)); + }).then_interruptible([pg_end](auto&& ret) { + auto entries = std::move(std::get<0>(ret).get0()); + auto next = std::move(std::get<1>(ret).get0()); + pg_ls_response_t response; + response.handle = next.is_max() ? pg_end : next; + response.entries = std::move(entries); + ceph::bufferlist out; + encode(response, out); + logger().debug("{}: response.entries.size()=", + __func__, response.entries.size()); + return seastar::make_ready_future<ceph::bufferlist>(std::move(out)); + }); +} + +static PG::interruptible_future<> do_pgls( + const PG& pg, + const std::string& nspace, + OSDOp& osd_op) +{ + hobject_t lower_bound; + auto bp = osd_op.indata.cbegin(); + try { + lower_bound.decode(bp); + } catch (const buffer::error&) { + throw std::invalid_argument{"unable to decode PGLS handle"}; + } + const auto pg_start = pg.get_pgid().pgid.get_hobj_start(); + const auto pg_end = + pg.get_pgid().pgid.get_hobj_end(pg.get_pgpool().info.get_pg_num()); + return do_pgls_common(pg_start, + pg_end, + pg.get_backend(), + lower_bound, + nspace, + osd_op.op.pgls.count, + nullptr /* no filter */) + .then_interruptible([&osd_op](bufferlist bl) { + osd_op.outdata = std::move(bl); + return seastar::now(); + }); +} + +static PG::interruptible_future<> do_pgls_filtered( + const PG& pg, + const std::string& nspace, + OSDOp& osd_op) +{ + std::string cname, mname, type; + auto bp = osd_op.indata.cbegin(); + try { + ceph::decode(cname, bp); + ceph::decode(mname, bp); + ceph::decode(type, bp); + } catch (const buffer::error&) { + throw crimson::osd::invalid_argument{}; + } + + auto filter = get_pgls_filter(type, bp); + + hobject_t lower_bound; + try { + lower_bound.decode(bp); + } catch (const buffer::error&) { + throw std::invalid_argument("unable to decode PGLS_FILTER description"); + } + + logger().debug("{}: cname={}, mname={}, type={}, lower_bound={}, filter={}", + __func__, cname, mname, type, lower_bound, + static_cast<const void*>(filter.get())); + return seastar::do_with(std::move(filter), + [&, lower_bound=std::move(lower_bound)](auto&& filter) { + const auto pg_start = pg.get_pgid().pgid.get_hobj_start(); + const auto pg_end = pg.get_pgid().pgid.get_hobj_end(pg.get_pgpool().info.get_pg_num()); + return do_pgls_common(pg_start, + pg_end, + pg.get_backend(), + lower_bound, + nspace, + osd_op.op.pgls.count, + filter.get()) + 
.then_interruptible([&osd_op](bufferlist bl) { + osd_op.outdata = std::move(bl); + return seastar::now(); + }); + }); +} + +PgOpsExecuter::interruptible_future<> +PgOpsExecuter::execute_op(OSDOp& osd_op) +{ + logger().warn("handling op {}", ceph_osd_op_name(osd_op.op.op)); + switch (const ceph_osd_op& op = osd_op.op; op.op) { + case CEPH_OSD_OP_PGLS: + return do_pgls(pg, nspace, osd_op); + case CEPH_OSD_OP_PGLS_FILTER: + return do_pgls_filtered(pg, nspace, osd_op); + case CEPH_OSD_OP_PGNLS: + return do_pgnls(pg, nspace, osd_op); + case CEPH_OSD_OP_PGNLS_FILTER: + return do_pgnls_filtered(pg, nspace, osd_op); + default: + logger().warn("unknown op {}", ceph_osd_op_name(op.op)); + throw std::runtime_error( + fmt::format("op '{}' not supported", ceph_osd_op_name(op.op))); + } +} + +} // namespace crimson::osd diff --git a/src/crimson/osd/ops_executer.h b/src/crimson/osd/ops_executer.h new file mode 100644 index 000000000..1230b1c5a --- /dev/null +++ b/src/crimson/osd/ops_executer.h @@ -0,0 +1,629 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <memory> +#include <type_traits> +#include <utility> +#include <boost/intrusive_ptr.hpp> +#include <boost/smart_ptr/intrusive_ref_counter.hpp> +#include <fmt/os.h> +#include <seastar/core/chunked_fifo.hh> +#include <seastar/core/future.hh> +#include <seastar/core/shared_future.hh> +#include <seastar/core/shared_ptr.hh> + +#include "common/dout.h" +#include "common/map_cacher.hpp" +#include "common/static_ptr.h" +#include "messages/MOSDOp.h" +#include "os/Transaction.h" +#include "osd/osd_types.h" + +#include "crimson/common/errorator.h" +#include "crimson/common/interruptible_future.h" +#include "crimson/common/type_helpers.h" +#include "crimson/osd/osd_operations/client_request.h" +#include "crimson/osd/osd_operations/peering_event.h" +#include "crimson/osd/pg_backend.h" +#include "crimson/osd/pg_interval_interrupt_condition.h" +#include "crimson/osd/shard_services.h" + +struct ObjectState; +struct OSDOp; +class OSDriver; +class SnapMapper; + +namespace crimson::osd { +class PG; + +// OpsExecuter -- a class for executing ops targeting a certain object. 
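The class is used in a two-stage pattern: execute_op() runs once per OSDOp and performs the "main" stage (reads, extending the pending ceph::os::Transaction, queueing effects), while the rvalue-qualified flush_changes_n_do_ops_effects() is invoked once afterwards to submit the transaction and run the queued effects. The sketch below illustrates that call pattern only; the real driver is PG (see do_osd_ops) and additionally handles errors, interruption and rollback, and `pg`, `obc`, `op_info`, `m`, `conn`, `snapc`, `ops`, `snap_mapper`, `osdriver` and `submit_fn` are stand-ins assumed to be supplied by the caller:

  // illustrative sketch, not compilable outside the tree
  auto ox = seastar::make_lw_shared<OpsExecuter>(
      pg, obc, op_info, *m, conn, snapc);
  return interruptor::do_for_each(ops, [ox](OSDOp& osd_op) {
    // main stage: may read the object, extend the pending transaction,
    // and queue effect-exposing work in op_effects
    return ox->execute_op(osd_op);
  }).safe_then_interruptible([ox, &ops, &snap_mapper, &osdriver,
                              submit_fn=std::move(submit_fn)]() mutable {
    // effect-exposing stage: prepare the log entries, hand the transaction
    // to submit_fn for replication, then run the queued effects on the PG
    return std::move(*ox).flush_changes_n_do_ops_effects(
        ops, snap_mapper, osdriver, std::move(submit_fn));
  });
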
+class OpsExecuter : public seastar::enable_lw_shared_from_this<OpsExecuter> { + friend class SnapTrimObjSubEvent; + + using call_errorator = crimson::errorator< + crimson::stateful_ec, + crimson::ct_error::enoent, + crimson::ct_error::eexist, + crimson::ct_error::enospc, + crimson::ct_error::edquot, + crimson::ct_error::cmp_fail, + crimson::ct_error::eagain, + crimson::ct_error::invarg, + crimson::ct_error::erange, + crimson::ct_error::ecanceled, + crimson::ct_error::enametoolong, + crimson::ct_error::permission_denied, + crimson::ct_error::operation_not_supported, + crimson::ct_error::input_output_error, + crimson::ct_error::value_too_large, + crimson::ct_error::file_too_large>; + using read_errorator = PGBackend::read_errorator; + using write_ertr = PGBackend::write_ertr; + using get_attr_errorator = PGBackend::get_attr_errorator; + using watch_errorator = crimson::errorator< + crimson::ct_error::enoent, + crimson::ct_error::invarg, + crimson::ct_error::not_connected, + crimson::ct_error::timed_out>; + + using call_ierrorator = + ::crimson::interruptible::interruptible_errorator< + IOInterruptCondition, call_errorator>; + using read_ierrorator = + ::crimson::interruptible::interruptible_errorator< + IOInterruptCondition, read_errorator>; + using write_iertr = + ::crimson::interruptible::interruptible_errorator< + IOInterruptCondition, write_ertr>; + using get_attr_ierrorator = + ::crimson::interruptible::interruptible_errorator< + IOInterruptCondition, get_attr_errorator>; + using watch_ierrorator = + ::crimson::interruptible::interruptible_errorator< + IOInterruptCondition, watch_errorator>; + + template <typename Errorator, typename T = void> + using interruptible_errorated_future = + ::crimson::interruptible::interruptible_errorated_future< + IOInterruptCondition, Errorator, T>; + using interruptor = + ::crimson::interruptible::interruptor<IOInterruptCondition>; + template <typename T = void> + using interruptible_future = + ::crimson::interruptible::interruptible_future< + IOInterruptCondition, T>; + +public: + // ExecutableMessage -- an interface class to allow using OpsExecuter + // with other message types than just the `MOSDOp`. The type erasure + // happens in the ctor of `OpsExecuter`. + struct ExecutableMessage { + virtual osd_reqid_t get_reqid() const = 0; + virtual utime_t get_mtime() const = 0; + virtual epoch_t get_map_epoch() const = 0; + virtual entity_inst_t get_orig_source_inst() const = 0; + virtual uint64_t get_features() const = 0; + virtual bool has_flag(uint32_t flag) const = 0; + virtual entity_name_t get_source() const = 0; + }; + + template <class ImplT> + class ExecutableMessagePimpl final : ExecutableMessage { + const ImplT* pimpl; + // In crimson, conn is independently maintained outside Message. + const crimson::net::ConnectionRef conn; + public: + ExecutableMessagePimpl(const ImplT* pimpl, + const crimson::net::ConnectionRef conn) + : pimpl(pimpl), conn(conn) { + } + + osd_reqid_t get_reqid() const final { + return pimpl->get_reqid(); + } + bool has_flag(uint32_t flag) const final { + return pimpl->has_flag(flag); + } + utime_t get_mtime() const final { + return pimpl->get_mtime(); + }; + epoch_t get_map_epoch() const final { + return pimpl->get_map_epoch(); + } + entity_inst_t get_orig_source_inst() const final { + // We can't get the origin source address from the message + // since (In Crimson) the connection is maintained + // outside of the Message. 
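+      // Instead, rebuild it from the message's source name and the peer
+      // address of the connection the message arrived on.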
+ return entity_inst_t(get_source(), conn->get_peer_addr()); + } + entity_name_t get_source() const final { + return pimpl->get_source(); + } + uint64_t get_features() const final { + return pimpl->get_features(); + } + }; + + // because OpsExecuter is pretty heavy-weight object we want to ensure + // it's not copied nor even moved by accident. Performance is the sole + // reason for prohibiting that. + OpsExecuter(OpsExecuter&&) = delete; + OpsExecuter(const OpsExecuter&) = delete; + + using osd_op_errorator = crimson::compound_errorator_t< + call_errorator, + read_errorator, + write_ertr, + get_attr_errorator, + watch_errorator, + PGBackend::stat_errorator>; + using osd_op_ierrorator = + ::crimson::interruptible::interruptible_errorator< + IOInterruptCondition, osd_op_errorator>; + + object_stat_sum_t delta_stats; +private: + // an operation can be divided into two stages: main and effect-exposing + // one. The former is performed immediately on call to `do_osd_op()` while + // the later on `submit_changes()` – after successfully processing main + // stages of all involved operations. When any stage fails, none of all + // scheduled effect-exposing stages will be executed. + // when operation requires this division, some variant of `with_effect()` + // should be used. + struct effect_t { + // an effect can affect PG, i.e. create a watch timeout + virtual osd_op_errorator::future<> execute(Ref<PG> pg) = 0; + virtual ~effect_t() = default; + }; + + Ref<PG> pg; // for the sake of object class + ObjectContextRef obc; + const OpInfo& op_info; + using abstracted_msg_t = + ceph::static_ptr<ExecutableMessage, + sizeof(ExecutableMessagePimpl<void>)>; + abstracted_msg_t msg; + crimson::net::ConnectionRef conn; + std::optional<osd_op_params_t> osd_op_params; + bool user_modify = false; + ceph::os::Transaction txn; + + size_t num_read = 0; ///< count read ops + size_t num_write = 0; ///< count update ops + + SnapContext snapc; // writer snap context + struct CloningContext { + SnapSet new_snapset; + pg_log_entry_t log_entry; + + void apply_to( + std::vector<pg_log_entry_t>& log_entries, + ObjectContext& processed_obc) &&; + }; + std::unique_ptr<CloningContext> cloning_ctx; + + + /** + * execute_clone + * + * If snapc contains a snap which occurred logically after the last write + * seen by this object (see OpsExecutor::should_clone()), we first need + * make a clone of the object at its current state. execute_clone primes + * txn with that clone operation and returns an + * OpsExecutor::CloningContext which will allow us to fill in the corresponding + * metadata and log_entries once the operations have been processed. + * + * Note that this strategy differs from classic, which instead performs this + * work at the end and reorders the transaction. 
See + * PrimaryLogPG::make_writeable + * + * @param snapc [in] snapc for this operation (from the client if from the + * client, from the pool otherwise) + * @param initial_obs [in] objectstate for the object at operation start + * @param initial_snapset [in] snapset for the object at operation start + * @param backend [in,out] interface for generating mutations + * @param txn [out] transaction for the operation + */ + std::unique_ptr<CloningContext> execute_clone( + const SnapContext& snapc, + const ObjectState& initial_obs, + const SnapSet& initial_snapset, + PGBackend& backend, + ceph::os::Transaction& txn); + + + /** + * should_clone + * + * Predicate returning whether a user write with snap context snapc + * contains a snap which occurred prior to the most recent write + * on the object reflected in initial_obc. + * + * @param initial_obc [in] obc for object to be mutated + * @param snapc [in] snapc for this operation (from the client if from the + * client, from the pool otherwise) + */ + static bool should_clone( + const ObjectContext& initial_obc, + const SnapContext& snapc) { + // clone? + return initial_obc.obs.exists // both nominally and... + && !initial_obc.obs.oi.is_whiteout() // ... logically exists + && snapc.snaps.size() // there are snaps + && snapc.snaps[0] > initial_obc.ssc->snapset.seq; // existing obj is old + } + + interruptible_future<std::vector<pg_log_entry_t>> flush_clone_metadata( + std::vector<pg_log_entry_t>&& log_entries, + SnapMapper& snap_mapper, + OSDriver& osdriver, + ceph::os::Transaction& txn); + + static interruptible_future<> snap_map_remove( + const hobject_t& soid, + SnapMapper& snap_mapper, + OSDriver& osdriver, + ceph::os::Transaction& txn); + static interruptible_future<> snap_map_modify( + const hobject_t& soid, + const std::set<snapid_t>& snaps, + SnapMapper& snap_mapper, + OSDriver& osdriver, + ceph::os::Transaction& txn); + static interruptible_future<> snap_map_clone( + const hobject_t& soid, + const std::set<snapid_t>& snaps, + SnapMapper& snap_mapper, + OSDriver& osdriver, + ceph::os::Transaction& txn); + + // this gizmo could be wrapped in std::optional for the sake of lazy + // initialization. 
we don't need it for ops that doesn't have effect + // TODO: verify the init overhead of chunked_fifo + seastar::chunked_fifo<std::unique_ptr<effect_t>> op_effects; + + template <class Context, class MainFunc, class EffectFunc> + auto with_effect_on_obc( + Context&& ctx, + MainFunc&& main_func, + EffectFunc&& effect_func); + + call_ierrorator::future<> do_op_call(OSDOp& osd_op); + watch_ierrorator::future<> do_op_watch( + OSDOp& osd_op, + ObjectState& os, + ceph::os::Transaction& txn); + watch_ierrorator::future<> do_op_watch_subop_watch( + OSDOp& osd_op, + ObjectState& os, + ceph::os::Transaction& txn); + watch_ierrorator::future<> do_op_watch_subop_reconnect( + OSDOp& osd_op, + ObjectState& os, + ceph::os::Transaction& txn); + watch_ierrorator::future<> do_op_watch_subop_unwatch( + OSDOp& osd_op, + ObjectState& os, + ceph::os::Transaction& txn); + watch_ierrorator::future<> do_op_watch_subop_ping( + OSDOp& osd_op, + ObjectState& os, + ceph::os::Transaction& txn); + watch_ierrorator::future<> do_op_list_watchers( + OSDOp& osd_op, + const ObjectState& os); + watch_ierrorator::future<> do_op_notify( + OSDOp& osd_op, + const ObjectState& os); + watch_ierrorator::future<> do_op_notify_ack( + OSDOp& osd_op, + const ObjectState& os); + call_errorator::future<> do_assert_ver( + OSDOp& osd_op, + const ObjectState& os); + + using list_snaps_ertr = read_errorator::extend< + crimson::ct_error::invarg>; + using list_snaps_iertr = ::crimson::interruptible::interruptible_errorator< + ::crimson::osd::IOInterruptCondition, + list_snaps_ertr>; + list_snaps_iertr::future<> do_list_snaps( + OSDOp& osd_op, + const ObjectState& os, + const SnapSet& ss); + + template <class Func> + auto do_const_op(Func&& f); + + template <class Func> + auto do_read_op(Func&& f) { + ++num_read; + // TODO: pass backend as read-only + return do_const_op(std::forward<Func>(f)); + } + + template <class Func> + auto do_snapset_op(Func&& f) { + ++num_read; + return std::invoke( + std::forward<Func>(f), + std::as_const(obc->obs), + std::as_const(obc->ssc->snapset)); + } + + enum class modified_by { + user, + sys, + }; + + template <class Func> + auto do_write_op(Func&& f, modified_by m = modified_by::user); + + decltype(auto) dont_do_legacy_op() { + return crimson::ct_error::operation_not_supported::make(); + } + + interruptible_errorated_future<osd_op_errorator> + do_execute_op(OSDOp& osd_op); + + OpsExecuter(Ref<PG> pg, + ObjectContextRef obc, + const OpInfo& op_info, + abstracted_msg_t&& msg, + crimson::net::ConnectionRef conn, + const SnapContext& snapc); + +public: + template <class MsgT> + OpsExecuter(Ref<PG> pg, + ObjectContextRef obc, + const OpInfo& op_info, + const MsgT& msg, + crimson::net::ConnectionRef conn, + const SnapContext& snapc) + : OpsExecuter( + std::move(pg), + std::move(obc), + op_info, + abstracted_msg_t{ + std::in_place_type_t<ExecutableMessagePimpl<MsgT>>{}, + &msg, + conn}, + conn, + snapc) { + } + + template <class Func> + struct RollbackHelper; + + template <class Func> + RollbackHelper<Func> create_rollbacker(Func&& func); + + interruptible_errorated_future<osd_op_errorator> + execute_op(OSDOp& osd_op); + + using rep_op_fut_tuple = + std::tuple<interruptible_future<>, osd_op_ierrorator::future<>>; + using rep_op_fut_t = + interruptible_future<rep_op_fut_tuple>; + template <typename MutFunc> + rep_op_fut_t flush_changes_n_do_ops_effects( + const std::vector<OSDOp>& ops, + SnapMapper& snap_mapper, + OSDriver& osdriver, + MutFunc&& mut_func) &&; + std::vector<pg_log_entry_t> prepare_transaction( + const 
std::vector<OSDOp>& ops); + void fill_op_params_bump_pg_version(); + + ObjectContextRef get_obc() const { + return obc; + } + + const object_info_t &get_object_info() const { + return obc->obs.oi; + } + const hobject_t &get_target() const { + return get_object_info().soid; + } + + const auto& get_message() const { + return *msg; + } + + size_t get_processed_rw_ops_num() const { + return num_read + num_write; + } + + uint32_t get_pool_stripe_width() const; + + bool has_seen_write() const { + return num_write > 0; + } + + object_stat_sum_t& get_stats(){ + return delta_stats; + } + + version_t get_last_user_version() const; + + std::pair<object_info_t, ObjectContextRef> prepare_clone( + const hobject_t& coid); + + void apply_stats(); +}; + +template <class Context, class MainFunc, class EffectFunc> +auto OpsExecuter::with_effect_on_obc( + Context&& ctx, + MainFunc&& main_func, + EffectFunc&& effect_func) +{ + using context_t = std::decay_t<Context>; + // the language offers implicit conversion to pointer-to-function for + // lambda only when it's closureless. We enforce this restriction due + // the fact that `flush_changes()` std::moves many executer's parts. + using allowed_effect_func_t = + seastar::future<> (*)(context_t&&, ObjectContextRef, Ref<PG>); + static_assert(std::is_convertible_v<EffectFunc, allowed_effect_func_t>, + "with_effect function is not allowed to capture"); + struct task_t final : effect_t { + context_t ctx; + EffectFunc effect_func; + ObjectContextRef obc; + + task_t(Context&& ctx, EffectFunc&& effect_func, ObjectContextRef obc) + : ctx(std::move(ctx)), + effect_func(std::move(effect_func)), + obc(std::move(obc)) { + } + osd_op_errorator::future<> execute(Ref<PG> pg) final { + return std::move(effect_func)(std::move(ctx), + std::move(obc), + std::move(pg)); + } + }; + auto task = + std::make_unique<task_t>(std::move(ctx), std::move(effect_func), obc); + auto& ctx_ref = task->ctx; + op_effects.emplace_back(std::move(task)); + return std::forward<MainFunc>(main_func)(ctx_ref); +} + +template <typename MutFunc> +OpsExecuter::rep_op_fut_t +OpsExecuter::flush_changes_n_do_ops_effects( + const std::vector<OSDOp>& ops, + SnapMapper& snap_mapper, + OSDriver& osdriver, + MutFunc&& mut_func) && +{ + const bool want_mutate = !txn.empty(); + // osd_op_params are instantiated by every wr-like operation. 
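+  // Hence a non-empty transaction (want_mutate) implies osd_op_params is set.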
+ assert(osd_op_params || !want_mutate); + assert(obc); + rep_op_fut_t maybe_mutated = + interruptor::make_ready_future<rep_op_fut_tuple>( + seastar::now(), + interruptor::make_interruptible(osd_op_errorator::now())); + if (cloning_ctx) { + ceph_assert(want_mutate); + } + if (want_mutate) { + if (user_modify) { + osd_op_params->user_at_version = osd_op_params->at_version.version; + } + maybe_mutated = flush_clone_metadata( + prepare_transaction(ops), + snap_mapper, + osdriver, + txn + ).then_interruptible([mut_func=std::move(mut_func), + this](auto&& log_entries) mutable { + auto [submitted, all_completed] = + std::forward<MutFunc>(mut_func)(std::move(txn), + std::move(obc), + std::move(*osd_op_params), + std::move(log_entries)); + return interruptor::make_ready_future<rep_op_fut_tuple>( + std::move(submitted), + osd_op_ierrorator::future<>(std::move(all_completed))); + }); + } + apply_stats(); + + if (__builtin_expect(op_effects.empty(), true)) { + return maybe_mutated; + } else { + return maybe_mutated.then_unpack_interruptible( + // need extra ref pg due to apply_stats() which can be executed after + // informing snap mapper + [this, pg=this->pg](auto&& submitted, auto&& all_completed) mutable { + return interruptor::make_ready_future<rep_op_fut_tuple>( + std::move(submitted), + all_completed.safe_then_interruptible([this, pg=std::move(pg)] { + // let's do the cleaning of `op_effects` in destructor + return interruptor::do_for_each(op_effects, + [pg=std::move(pg)](auto& op_effect) { + return op_effect->execute(pg); + }); + })); + }); + } +} + +template <class Func> +struct OpsExecuter::RollbackHelper { + interruptible_future<> rollback_obc_if_modified(const std::error_code& e); + ObjectContextRef get_obc() const { + assert(ox); + return ox->obc; + } + seastar::lw_shared_ptr<OpsExecuter> ox; + Func func; +}; + +template <class Func> +inline OpsExecuter::RollbackHelper<Func> +OpsExecuter::create_rollbacker(Func&& func) { + return {shared_from_this(), std::forward<Func>(func)}; +} + + +template <class Func> +OpsExecuter::interruptible_future<> +OpsExecuter::RollbackHelper<Func>::rollback_obc_if_modified( + const std::error_code& e) +{ + // Oops, an operation had failed. do_osd_ops() altogether with + // OpsExecuter already dropped the ObjectStore::Transaction if + // there was any. However, this is not enough to completely + // rollback as we gave OpsExecuter the very single copy of `obc` + // we maintain and we did it for both reading and writing. + // Now all modifications must be reverted. + // + // Let's just reload from the store. Evicting from the shared + // LRU would be tricky as next MOSDOp (the one at `get_obc` + // phase) could actually already finished the lookup. Fortunately, + // this is supposed to live on cold paths, so performance is not + // a concern -- simplicity wins. + // + // The conditional's purpose is to efficiently handle hot errors + // which may appear as a result of e.g. CEPH_OSD_OP_CMPXATTR or + // CEPH_OSD_OP_OMAP_CMP. These are read-like ops and clients + // typically append them before any write. If OpsExecuter hasn't + // seen any modifying operation, `obc` is supposed to be kept + // unchanged. + assert(ox); + const auto need_rollback = ox->has_seen_write(); + crimson::get_logger(ceph_subsys_osd).debug( + "{}: object {} got error {}, need_rollback={}", + __func__, + ox->obc->get_oid(), + e, + need_rollback); + return need_rollback ? func(*ox->obc) : interruptor::now(); +} + +// PgOpsExecuter -- a class for executing ops targeting a certain PG. 
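Those PG-targeted ops are the PGLS/PGNLS family implemented earlier in this file: the client supplies an hobject_t cursor, gets back at most `count` entries plus the next cursor, and the server returns the PG's end bound once the listing is exhausted. A hedged sketch of the resulting client-side paging loop follows -- this is not the real librados API; `list_one_page()` is a hypothetical wrapper that sends one CEPH_OSD_OP_PGNLS with the cursor encoded in indata and decodes the pg_nls_response_t from outdata, and `pgid`/`pg_num` are assumed to be known from the osdmap:

  // illustrative client-side paging over a single PG
  hobject_t cursor;                                   // default hobject_t is the PG's lower bound
  const hobject_t pg_end = pgid.get_hobj_end(pg_num); // PG's upper bound
  std::vector<librados::ListObjectImpl> all;
  for (;;) {
    auto [entries, next] = list_one_page(cursor, max_entries_per_call);
    all.insert(all.end(), entries.begin(), entries.end());
    if (next == pg_end) {
      break;              // the server maps an exhausted listing to pg_end
    }
    cursor = next;        // resume exactly where the previous page stopped
  }
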
+class PgOpsExecuter { + template <typename T = void> + using interruptible_future = + ::crimson::interruptible::interruptible_future< + IOInterruptCondition, T>; + +public: + PgOpsExecuter(const PG& pg, const MOSDOp& msg) + : pg(pg), nspace(msg.get_hobj().nspace) { + } + + interruptible_future<> execute_op(OSDOp& osd_op); + +private: + const PG& pg; + const std::string& nspace; +}; + +} // namespace crimson::osd diff --git a/src/crimson/osd/osd.cc b/src/crimson/osd/osd.cc new file mode 100644 index 000000000..cfe4f54ab --- /dev/null +++ b/src/crimson/osd/osd.cc @@ -0,0 +1,1357 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "osd.h" + +#include <sys/utsname.h> + +#include <boost/iterator/counting_iterator.hpp> +#include <boost/range/join.hpp> +#include <fmt/format.h> +#include <fmt/os.h> +#include <fmt/ostream.h> +#include <seastar/core/timer.hh> + +#include "common/pick_address.h" +#include "include/util.h" + +#include "messages/MCommand.h" +#include "messages/MOSDBeacon.h" +#include "messages/MOSDBoot.h" +#include "messages/MOSDMap.h" +#include "messages/MOSDMarkMeDown.h" +#include "messages/MOSDOp.h" +#include "messages/MOSDPeeringOp.h" +#include "messages/MOSDPGCreate2.h" +#include "messages/MOSDPGUpdateLogMissing.h" +#include "messages/MOSDPGUpdateLogMissingReply.h" +#include "messages/MOSDRepOpReply.h" +#include "messages/MOSDScrub2.h" +#include "messages/MPGStats.h" + +#include "os/Transaction.h" +#include "osd/ClassHandler.h" +#include "osd/OSDCap.h" +#include "osd/PGPeeringEvent.h" +#include "osd/PeeringState.h" + +#include "crimson/admin/osd_admin.h" +#include "crimson/admin/pg_commands.h" +#include "crimson/common/buffer_io.h" +#include "crimson/common/exception.h" +#include "crimson/mon/MonClient.h" +#include "crimson/net/Connection.h" +#include "crimson/net/Messenger.h" +#include "crimson/os/futurized_collection.h" +#include "crimson/os/futurized_store.h" +#include "crimson/osd/heartbeat.h" +#include "crimson/osd/osd_meta.h" +#include "crimson/osd/pg.h" +#include "crimson/osd/pg_backend.h" +#include "crimson/osd/pg_meta.h" +#include "crimson/osd/osd_operations/client_request.h" +#include "crimson/osd/osd_operations/peering_event.h" +#include "crimson/osd/osd_operations/pg_advance_map.h" +#include "crimson/osd/osd_operations/recovery_subrequest.h" +#include "crimson/osd/osd_operations/replicated_request.h" +#include "crimson/osd/osd_operation_external_tracking.h" +#include "crimson/crush/CrushLocation.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_osd); + } + static constexpr int TICK_INTERVAL = 1; +} + +using std::make_unique; +using std::map; +using std::pair; +using std::string; +using std::unique_ptr; +using std::vector; + +using crimson::common::local_conf; +using crimson::os::FuturizedStore; + +namespace crimson::osd { + +OSD::OSD(int id, uint32_t nonce, + seastar::abort_source& abort_source, + crimson::os::FuturizedStore& store, + crimson::net::MessengerRef cluster_msgr, + crimson::net::MessengerRef public_msgr, + crimson::net::MessengerRef hb_front_msgr, + crimson::net::MessengerRef hb_back_msgr) + : whoami{id}, + nonce{nonce}, + abort_source{abort_source}, + // do this in background + beacon_timer{[this] { (void)send_beacon(); }}, + cluster_msgr{cluster_msgr}, + public_msgr{public_msgr}, + hb_front_msgr{hb_front_msgr}, + hb_back_msgr{hb_back_msgr}, + monc{new crimson::mon::Client{*public_msgr, *this}}, + mgrc{new crimson::mgr::Client{*public_msgr, *this}}, + 
store{store}, + pg_shard_manager{osd_singleton_state, + shard_services, + pg_to_shard_mappings}, + // do this in background -- continuation rearms timer when complete + tick_timer{[this] { + std::ignore = update_heartbeat_peers( + ).then([this] { + update_stats(); + tick_timer.arm( + std::chrono::seconds(TICK_INTERVAL)); + }); + }}, + asok{seastar::make_lw_shared<crimson::admin::AdminSocket>()}, + log_client(cluster_msgr.get(), LogClient::NO_FLAGS), + clog(log_client.create_channel()) +{ + ceph_assert(seastar::this_shard_id() == PRIMARY_CORE); + for (auto msgr : {std::ref(cluster_msgr), std::ref(public_msgr), + std::ref(hb_front_msgr), std::ref(hb_back_msgr)}) { + msgr.get()->set_auth_server(monc.get()); + msgr.get()->set_auth_client(monc.get()); + } + + if (local_conf()->osd_open_classes_on_start) { + const int r = ClassHandler::get_instance().open_all_classes(); + if (r) { + logger().warn("{} warning: got an error loading one or more classes: {}", + __func__, cpp_strerror(r)); + } + } + logger().info("{}: nonce is {}", __func__, nonce); + monc->set_log_client(&log_client); + clog->set_log_to_monitors(true); +} + +OSD::~OSD() = default; + +namespace { +// Initial features in new superblock. +// Features here are also automatically upgraded +CompatSet get_osd_initial_compat_set() +{ + CompatSet::FeatureSet ceph_osd_feature_compat; + CompatSet::FeatureSet ceph_osd_feature_ro_compat; + CompatSet::FeatureSet ceph_osd_feature_incompat; + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGINFO); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_OLOC); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEC); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BIGINFO); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HINTS); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGMETA); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_MISSING); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_FASTINFO); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_RECOVERY_DELETES); + return CompatSet(ceph_osd_feature_compat, + ceph_osd_feature_ro_compat, + ceph_osd_feature_incompat); +} +} + +seastar::future<> OSD::open_meta_coll() +{ + ceph_assert(seastar::this_shard_id() == PRIMARY_CORE); + return store.get_sharded_store().open_collection( + coll_t::meta() + ).then([this](auto ch) { + pg_shard_manager.init_meta_coll(ch, store.get_sharded_store()); + return seastar::now(); + }); +} + +seastar::future<OSDMeta> OSD::open_or_create_meta_coll(FuturizedStore &store) +{ + return store.get_sharded_store().open_collection(coll_t::meta()).then([&store](auto ch) { + if (!ch) { + return store.get_sharded_store().create_new_collection( + coll_t::meta() + ).then([&store](auto ch) { + return OSDMeta(ch, store.get_sharded_store()); + }); + } else { + return seastar::make_ready_future<OSDMeta>(ch, store.get_sharded_store()); + } + }); +} + +seastar::future<> OSD::mkfs( + FuturizedStore &store, + unsigned whoami, + uuid_d osd_uuid, + uuid_d cluster_fsid, + std::string osdspec_affinity) +{ + return 
store.start().then([&store, osd_uuid] { + return store.mkfs(osd_uuid).handle_error( + crimson::stateful_ec::handle([] (const auto& ec) { + logger().error("error creating empty object store in {}: ({}) {}", + local_conf().get_val<std::string>("osd_data"), + ec.value(), ec.message()); + std::exit(EXIT_FAILURE); + })); + }).then([&store] { + return store.mount().handle_error( + crimson::stateful_ec::handle([](const auto& ec) { + logger().error("error mounting object store in {}: ({}) {}", + local_conf().get_val<std::string>("osd_data"), + ec.value(), ec.message()); + std::exit(EXIT_FAILURE); + })); + }).then([&store] { + return open_or_create_meta_coll(store); + }).then([&store, whoami, cluster_fsid](auto meta_coll) { + OSDSuperblock superblock; + superblock.cluster_fsid = cluster_fsid; + superblock.osd_fsid = store.get_fsid(); + superblock.whoami = whoami; + superblock.compat_features = get_osd_initial_compat_set(); + return _write_superblock( + store, std::move(meta_coll), std::move(superblock)); + }).then([&store, cluster_fsid] { + return store.write_meta("ceph_fsid", cluster_fsid.to_string()); + }).then([&store] { + return store.write_meta("magic", CEPH_OSD_ONDISK_MAGIC); + }).then([&store, whoami] { + return store.write_meta("whoami", std::to_string(whoami)); + }).then([&store] { + return _write_key_meta(store); + }).then([&store, osdspec_affinity=std::move(osdspec_affinity)] { + return store.write_meta("osdspec_affinity", osdspec_affinity); + }).then([&store] { + return store.write_meta("ready", "ready"); + }).then([&store, whoami, cluster_fsid] { + fmt::print("created object store {} for osd.{} fsid {}\n", + local_conf().get_val<std::string>("osd_data"), + whoami, cluster_fsid); + return store.umount(); + }).then([&store] { + return store.stop(); + }); +} + +seastar::future<> OSD::_write_superblock( + FuturizedStore &store, + OSDMeta meta_coll, + OSDSuperblock superblock) +{ + return seastar::do_with( + std::move(meta_coll), + std::move(superblock), + [&store](auto &meta_coll, auto &superblock) { + return meta_coll.load_superblock( + ).safe_then([&superblock](OSDSuperblock&& sb) { + if (sb.cluster_fsid != superblock.cluster_fsid) { + logger().error("provided cluster fsid {} != superblock's {}", + sb.cluster_fsid, superblock.cluster_fsid); + throw std::invalid_argument("mismatched fsid"); + } + if (sb.whoami != superblock.whoami) { + logger().error("provided osd id {} != superblock's {}", + sb.whoami, superblock.whoami); + throw std::invalid_argument("mismatched osd id"); + } + }).handle_error( + crimson::ct_error::enoent::handle([&store, &meta_coll, &superblock] { + // meta collection does not yet, create superblock + logger().info( + "{} writing superblock cluster_fsid {} osd_fsid {}", + "_write_superblock", + superblock.cluster_fsid, + superblock.osd_fsid); + ceph::os::Transaction t; + meta_coll.create(t); + meta_coll.store_superblock(t, superblock); + logger().debug("OSD::_write_superblock: do_transaction..."); + return store.get_sharded_store().do_transaction( + meta_coll.collection(), + std::move(t)); + }), + crimson::ct_error::assert_all("_write_superbock error") + ); + }); +} + +// this `to_string` sits in the `crimson::osd` namespace, so we don't brake +// the language rule on not overloading in `std::`. 
+static std::string to_string(const seastar::temporary_buffer<char>& temp_buf) +{ + return {temp_buf.get(), temp_buf.size()}; +} + +seastar::future<> OSD::_write_key_meta(FuturizedStore &store) +{ + + if (auto key = local_conf().get_val<std::string>("key"); !std::empty(key)) { + return store.write_meta("osd_key", key); + } else if (auto keyfile = local_conf().get_val<std::string>("keyfile"); + !std::empty(keyfile)) { + return read_file(keyfile).then([&store](const auto& temp_buf) { + // it's on a truly cold path, so don't worry about memcpy. + return store.write_meta("osd_key", to_string(temp_buf)); + }).handle_exception([keyfile] (auto ep) { + logger().error("_write_key_meta: failed to handle keyfile {}: {}", + keyfile, ep); + ceph_abort(); + }); + } else { + return seastar::now(); + } +} + +namespace { + entity_addrvec_t pick_addresses(int what) { + entity_addrvec_t addrs; + crimson::common::CephContext cct; + // we're interested solely in v2; crimson doesn't do v1 + const auto flags = what | CEPH_PICK_ADDRESS_MSGR2; + if (int r = ::pick_addresses(&cct, flags, &addrs, -1); r < 0) { + throw std::runtime_error("failed to pick address"); + } + for (auto addr : addrs.v) { + logger().info("picked address {}", addr); + } + return addrs; + } + std::pair<entity_addrvec_t, bool> + replace_unknown_addrs(entity_addrvec_t maybe_unknowns, + const entity_addrvec_t& knowns) { + bool changed = false; + auto maybe_replace = [&](entity_addr_t addr) { + if (!addr.is_blank_ip()) { + return addr; + } + for (auto& b : knowns.v) { + if (addr.get_family() == b.get_family()) { + auto a = b; + a.set_nonce(addr.get_nonce()); + a.set_type(addr.get_type()); + a.set_port(addr.get_port()); + changed = true; + return a; + } + } + throw std::runtime_error("failed to replace unknown address"); + }; + entity_addrvec_t replaced; + std::transform(maybe_unknowns.v.begin(), + maybe_unknowns.v.end(), + std::back_inserter(replaced.v), + maybe_replace); + return {replaced, changed}; + } +} + +seastar::future<> OSD::start() +{ + logger().info("start"); + + startup_time = ceph::mono_clock::now(); + ceph_assert(seastar::this_shard_id() == PRIMARY_CORE); + return store.start().then([this] { + return pg_to_shard_mappings.start(0, seastar::smp::count + ).then([this] { + return osd_singleton_state.start_single( + whoami, std::ref(*cluster_msgr), std::ref(*public_msgr), + std::ref(*monc), std::ref(*mgrc)); + }).then([this] { + return osd_states.start(); + }).then([this] { + ceph::mono_time startup_time = ceph::mono_clock::now(); + return shard_services.start( + std::ref(osd_singleton_state), + std::ref(pg_to_shard_mappings), + whoami, + startup_time, + osd_singleton_state.local().perf, + osd_singleton_state.local().recoverystate_perf, + std::ref(store), + std::ref(osd_states)); + }); + }).then([this] { + heartbeat.reset(new Heartbeat{ + whoami, get_shard_services(), + *monc, *hb_front_msgr, *hb_back_msgr}); + return store.mount().handle_error( + crimson::stateful_ec::handle([] (const auto& ec) { + logger().error("error mounting object store in {}: ({}) {}", + local_conf().get_val<std::string>("osd_data"), + ec.value(), ec.message()); + std::exit(EXIT_FAILURE); + })); + }).then([this] { + return open_meta_coll(); + }).then([this] { + return pg_shard_manager.get_meta_coll().load_superblock( + ).handle_error( + crimson::ct_error::assert_all("open_meta_coll error") + ); + }).then([this](OSDSuperblock&& sb) { + superblock = std::move(sb); + pg_shard_manager.set_superblock(superblock); + return 
pg_shard_manager.get_local_map(superblock.current_epoch); + }).then([this](OSDMapService::local_cached_map_t&& map) { + osdmap = make_local_shared_foreign(OSDMapService::local_cached_map_t(map)); + return pg_shard_manager.update_map(std::move(map)); + }).then([this] { + return shard_services.invoke_on_all([this](auto &local_service) { + local_service.local_state.osdmap_gate.got_map(osdmap->get_epoch()); + }); + }).then([this] { + bind_epoch = osdmap->get_epoch(); + return pg_shard_manager.load_pgs(store); + }).then([this] { + uint64_t osd_required = + CEPH_FEATURE_UID | + CEPH_FEATURE_PGID64 | + CEPH_FEATURE_OSDENC; + using crimson::net::SocketPolicy; + + public_msgr->set_default_policy(SocketPolicy::stateless_server(0)); + public_msgr->set_policy(entity_name_t::TYPE_MON, + SocketPolicy::lossy_client(osd_required)); + public_msgr->set_policy(entity_name_t::TYPE_MGR, + SocketPolicy::lossy_client(osd_required)); + public_msgr->set_policy(entity_name_t::TYPE_OSD, + SocketPolicy::stateless_server(0)); + + cluster_msgr->set_default_policy(SocketPolicy::stateless_server(0)); + cluster_msgr->set_policy(entity_name_t::TYPE_MON, + SocketPolicy::lossy_client(0)); + cluster_msgr->set_policy(entity_name_t::TYPE_OSD, + SocketPolicy::lossless_peer(osd_required)); + cluster_msgr->set_policy(entity_name_t::TYPE_CLIENT, + SocketPolicy::stateless_server(0)); + + crimson::net::dispatchers_t dispatchers{this, monc.get(), mgrc.get()}; + return seastar::when_all_succeed( + cluster_msgr->bind(pick_addresses(CEPH_PICK_ADDRESS_CLUSTER)) + .safe_then([this, dispatchers]() mutable { + return cluster_msgr->start(dispatchers); + }, crimson::net::Messenger::bind_ertr::all_same_way( + [] (const std::error_code& e) { + logger().error("cluster messenger bind(): {}", e); + ceph_abort(); + })), + public_msgr->bind(pick_addresses(CEPH_PICK_ADDRESS_PUBLIC)) + .safe_then([this, dispatchers]() mutable { + return public_msgr->start(dispatchers); + }, crimson::net::Messenger::bind_ertr::all_same_way( + [] (const std::error_code& e) { + logger().error("public messenger bind(): {}", e); + ceph_abort(); + }))); + }).then_unpack([this] { + return seastar::when_all_succeed(monc->start(), + mgrc->start()); + }).then_unpack([this] { + return _add_me_to_crush(); + }).then([this] { + monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0); + monc->sub_want("mgrmap", 0, 0); + monc->sub_want("osdmap", 0, 0); + return monc->renew_subs(); + }).then([this] { + if (auto [addrs, changed] = + replace_unknown_addrs(cluster_msgr->get_myaddrs(), + public_msgr->get_myaddrs()); changed) { + logger().debug("replacing unkwnown addrs of cluster messenger"); + cluster_msgr->set_myaddrs(addrs); + } + return heartbeat->start(pick_addresses(CEPH_PICK_ADDRESS_PUBLIC), + pick_addresses(CEPH_PICK_ADDRESS_CLUSTER)); + }).then([this] { + // create the admin-socket server, and the objects that register + // to handle incoming commands + return start_asok_admin(); + }).then([this] { + return log_client.set_fsid(monc->get_fsid()); + }).then([this] { + return start_boot(); + }); +} + +seastar::future<> OSD::start_boot() +{ + pg_shard_manager.set_preboot(); + return monc->get_version("osdmap").then([this](auto&& ret) { + auto [newest, oldest] = ret; + return _preboot(oldest, newest); + }); +} + +seastar::future<> OSD::_preboot(version_t oldest, version_t newest) +{ + logger().info("osd.{}: _preboot", whoami); + if (osdmap->get_epoch() == 0) { + logger().info("waiting for initial osdmap"); + } else if (osdmap->is_destroyed(whoami)) { + logger().warn("osdmap says I am 
destroyed"); + // provide a small margin so we don't livelock seeing if we + // un-destroyed ourselves. + if (osdmap->get_epoch() > newest - 1) { + throw std::runtime_error("i am destroyed"); + } + } else if (osdmap->is_noup(whoami)) { + logger().warn("osdmap NOUP flag is set, waiting for it to clear"); + } else if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE)) { + logger().error("osdmap SORTBITWISE OSDMap flag is NOT set; please set it"); + } else if (osdmap->require_osd_release < ceph_release_t::octopus) { + logger().error("osdmap require_osd_release < octopus; please upgrade to octopus"); + } else if (false) { + // TODO: update mon if current fullness state is different from osdmap + } else if (version_t n = local_conf()->osd_map_message_max; + osdmap->get_epoch() >= oldest - 1 && + osdmap->get_epoch() + n > newest) { + return _send_boot(); + } + // get all the latest maps + if (osdmap->get_epoch() + 1 >= oldest) { + return get_shard_services().osdmap_subscribe(osdmap->get_epoch() + 1, false); + } else { + return get_shard_services().osdmap_subscribe(oldest - 1, true); + } +} + +seastar::future<> OSD::_send_boot() +{ + pg_shard_manager.set_booting(); + + entity_addrvec_t public_addrs = public_msgr->get_myaddrs(); + entity_addrvec_t cluster_addrs = cluster_msgr->get_myaddrs(); + entity_addrvec_t hb_back_addrs = heartbeat->get_back_addrs(); + entity_addrvec_t hb_front_addrs = heartbeat->get_front_addrs(); + if (cluster_msgr->set_addr_unknowns(public_addrs)) { + cluster_addrs = cluster_msgr->get_myaddrs(); + } + if (heartbeat->get_back_msgr().set_addr_unknowns(cluster_addrs)) { + hb_back_addrs = heartbeat->get_back_addrs(); + } + if (heartbeat->get_front_msgr().set_addr_unknowns(public_addrs)) { + hb_front_addrs = heartbeat->get_front_addrs(); + } + logger().info("hb_back_msgr: {}", hb_back_addrs); + logger().info("hb_front_msgr: {}", hb_front_addrs); + logger().info("cluster_msgr: {}", cluster_addrs); + + auto m = crimson::make_message<MOSDBoot>(superblock, + osdmap->get_epoch(), + boot_epoch, + hb_back_addrs, + hb_front_addrs, + cluster_addrs, + CEPH_FEATURES_ALL); + collect_sys_info(&m->metadata, NULL); + + // See OSDMonitor::preprocess_boot, prevents boot without allow_crimson + // OSDMap flag + m->metadata["osd_type"] = "crimson"; + return monc->send_message(std::move(m)); +} + +seastar::future<> OSD::_add_me_to_crush() +{ + if (!local_conf().get_val<bool>("osd_crush_update_on_start")) { + return seastar::now(); + } + auto get_weight = [this] { + if (auto w = local_conf().get_val<double>("osd_crush_initial_weight"); + w >= 0) { + return seastar::make_ready_future<double>(w); + } else { + return store.stat().then([](auto st) { + auto total = st.total; + return seastar::make_ready_future<double>( + std::max(.00001, + double(total) / double(1ull << 40))); // TB + }); + } + }; + return get_weight().then([this](auto weight) { + const crimson::crush::CrushLocation loc; + return seastar::do_with( + std::move(loc), + [this, weight] (crimson::crush::CrushLocation& loc) { + return loc.init_on_startup().then([this, weight, &loc]() { + logger().info("crush location is {}", loc); + string cmd = fmt::format(R"({{ + "prefix": "osd crush create-or-move", + "id": {}, + "weight": {:.4f}, + "args": [{}] + }})", whoami, weight, loc); + return monc->run_command(std::move(cmd), {}); + }); + }); + }).then([](auto&& command_result) { + [[maybe_unused]] auto [code, message, out] = std::move(command_result); + if (code) { + logger().warn("fail to add to crush: {} ({})", message, code); + throw 
std::runtime_error("fail to add to crush"); + } else { + logger().info("added to crush: {}", message); + } + return seastar::now(); + }); +} + +seastar::future<> OSD::handle_command( + crimson::net::ConnectionRef conn, + Ref<MCommand> m) +{ + ceph_assert(seastar::this_shard_id() == PRIMARY_CORE); + return asok->handle_command(conn, std::move(m)); +} + +/* + The OSD's Admin Socket object created here has two servers (i.e. - blocks of commands + to handle) registered to it: + - OSD's specific commands are handled by the OSD object; + - there are some common commands registered to be directly handled by the AdminSocket object + itself. +*/ +seastar::future<> OSD::start_asok_admin() +{ + auto asok_path = local_conf().get_val<std::string>("admin_socket"); + using namespace crimson::admin; + return asok->start(asok_path).then([this] { + asok->register_admin_commands(); + asok->register_command(make_asok_hook<OsdStatusHook>(std::as_const(*this))); + asok->register_command(make_asok_hook<SendBeaconHook>(*this)); + asok->register_command(make_asok_hook<FlushPgStatsHook>(*this)); + asok->register_command( + make_asok_hook<DumpPGStateHistory>(std::as_const(pg_shard_manager))); + asok->register_command(make_asok_hook<DumpMetricsHook>()); + asok->register_command(make_asok_hook<DumpPerfCountersHook>()); + asok->register_command(make_asok_hook<InjectDataErrorHook>(get_shard_services())); + asok->register_command(make_asok_hook<InjectMDataErrorHook>(get_shard_services())); + // PG commands + asok->register_command(make_asok_hook<pg::QueryCommand>(*this)); + asok->register_command(make_asok_hook<pg::MarkUnfoundLostCommand>(*this)); + // ops commands + asok->register_command( + make_asok_hook<DumpInFlightOpsHook>( + std::as_const(pg_shard_manager))); + asok->register_command( + make_asok_hook<DumpHistoricOpsHook>( + std::as_const(get_shard_services().get_registry()))); + asok->register_command( + make_asok_hook<DumpSlowestHistoricOpsHook>( + std::as_const(get_shard_services().get_registry()))); + asok->register_command( + make_asok_hook<DumpRecoveryReservationsHook>(get_shard_services())); + }); +} + +seastar::future<> OSD::stop() +{ + logger().info("stop"); + beacon_timer.cancel(); + tick_timer.cancel(); + // see also OSD::shutdown() + return prepare_to_stop().then([this] { + return pg_shard_manager.set_stopping(); + }).then([this] { + logger().debug("prepared to stop"); + public_msgr->stop(); + cluster_msgr->stop(); + auto gate_close_fut = gate.close(); + return asok->stop().then([this] { + return heartbeat->stop(); + }).then([this] { + return pg_shard_manager.stop_registries(); + }).then([this] { + return store.umount(); + }).then([this] { + return store.stop(); + }).then([this] { + return pg_shard_manager.stop_pgs(); + }).then([this] { + return monc->stop(); + }).then([this] { + return mgrc->stop(); + }).then([this] { + return shard_services.stop(); + }).then([this] { + return osd_states.stop(); + }).then([this] { + return osd_singleton_state.stop(); + }).then([this] { + return pg_to_shard_mappings.stop(); + }).then([fut=std::move(gate_close_fut)]() mutable { + return std::move(fut); + }).then([this] { + return when_all_succeed( + public_msgr->shutdown(), + cluster_msgr->shutdown()).discard_result(); + }).handle_exception([](auto ep) { + logger().error("error while stopping osd: {}", ep); + }); + }); +} + +void OSD::dump_status(Formatter* f) const +{ + f->dump_stream("cluster_fsid") << superblock.cluster_fsid; + f->dump_stream("osd_fsid") << superblock.osd_fsid; + f->dump_unsigned("whoami", 
superblock.whoami); + f->dump_string("state", pg_shard_manager.get_osd_state_string()); + f->dump_unsigned("oldest_map", superblock.oldest_map); + f->dump_unsigned("cluster_osdmap_trim_lower_bound", + superblock.cluster_osdmap_trim_lower_bound); + f->dump_unsigned("newest_map", superblock.newest_map); + f->dump_unsigned("num_pgs", pg_shard_manager.get_num_pgs()); +} + +void OSD::print(std::ostream& out) const +{ + out << "{osd." << superblock.whoami << " " + << superblock.osd_fsid << " [" << superblock.oldest_map + << "," << superblock.newest_map << "] " + << "tlb:" << superblock.cluster_osdmap_trim_lower_bound + << " pgs:" << pg_shard_manager.get_num_pgs() + << "}"; +} + +std::optional<seastar::future<>> +OSD::ms_dispatch(crimson::net::ConnectionRef conn, MessageRef m) +{ + if (pg_shard_manager.is_stopping()) { + return seastar::now(); + } + auto maybe_ret = do_ms_dispatch(conn, std::move(m)); + if (!maybe_ret.has_value()) { + return std::nullopt; + } + + gate.dispatch_in_background( + __func__, *this, [ret=std::move(maybe_ret.value())]() mutable { + return std::move(ret); + }); + return seastar::now(); +} + +std::optional<seastar::future<>> +OSD::do_ms_dispatch( + crimson::net::ConnectionRef conn, + MessageRef m) +{ + if (seastar::this_shard_id() != PRIMARY_CORE) { + switch (m->get_type()) { + case CEPH_MSG_OSD_MAP: + case MSG_COMMAND: + case MSG_OSD_MARK_ME_DOWN: + // FIXME: order is not guaranteed in this path + return conn.get_foreign( + ).then([this, m=std::move(m)](auto f_conn) { + return seastar::smp::submit_to(PRIMARY_CORE, + [f_conn=std::move(f_conn), m=std::move(m), this]() mutable { + auto conn = make_local_shared_foreign(std::move(f_conn)); + auto ret = do_ms_dispatch(conn, std::move(m)); + assert(ret.has_value()); + return std::move(ret.value()); + }); + }); + } + } + + switch (m->get_type()) { + case CEPH_MSG_OSD_MAP: + return handle_osd_map(boost::static_pointer_cast<MOSDMap>(m)); + case CEPH_MSG_OSD_OP: + return handle_osd_op(conn, boost::static_pointer_cast<MOSDOp>(m)); + case MSG_OSD_PG_CREATE2: + return handle_pg_create( + conn, boost::static_pointer_cast<MOSDPGCreate2>(m)); + return seastar::now(); + case MSG_COMMAND: + return handle_command(conn, boost::static_pointer_cast<MCommand>(m)); + case MSG_OSD_MARK_ME_DOWN: + return handle_mark_me_down(conn, boost::static_pointer_cast<MOSDMarkMeDown>(m)); + case MSG_OSD_PG_PULL: + [[fallthrough]]; + case MSG_OSD_PG_PUSH: + [[fallthrough]]; + case MSG_OSD_PG_PUSH_REPLY: + [[fallthrough]]; + case MSG_OSD_PG_RECOVERY_DELETE: + [[fallthrough]]; + case MSG_OSD_PG_RECOVERY_DELETE_REPLY: + [[fallthrough]]; + case MSG_OSD_PG_SCAN: + [[fallthrough]]; + case MSG_OSD_PG_BACKFILL: + [[fallthrough]]; + case MSG_OSD_PG_BACKFILL_REMOVE: + return handle_recovery_subreq(conn, boost::static_pointer_cast<MOSDFastDispatchOp>(m)); + case MSG_OSD_PG_LEASE: + [[fallthrough]]; + case MSG_OSD_PG_LEASE_ACK: + [[fallthrough]]; + case MSG_OSD_PG_NOTIFY2: + [[fallthrough]]; + case MSG_OSD_PG_INFO2: + [[fallthrough]]; + case MSG_OSD_PG_QUERY2: + [[fallthrough]]; + case MSG_OSD_BACKFILL_RESERVE: + [[fallthrough]]; + case MSG_OSD_RECOVERY_RESERVE: + [[fallthrough]]; + case MSG_OSD_PG_LOG: + return handle_peering_op(conn, boost::static_pointer_cast<MOSDPeeringOp>(m)); + case MSG_OSD_REPOP: + return handle_rep_op(conn, boost::static_pointer_cast<MOSDRepOp>(m)); + case MSG_OSD_REPOPREPLY: + return handle_rep_op_reply(conn, boost::static_pointer_cast<MOSDRepOpReply>(m)); + case MSG_OSD_SCRUB2: + return handle_scrub(conn, 
boost::static_pointer_cast<MOSDScrub2>(m)); + case MSG_OSD_PG_UPDATE_LOG_MISSING: + return handle_update_log_missing(conn, boost::static_pointer_cast< + MOSDPGUpdateLogMissing>(m)); + case MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY: + return handle_update_log_missing_reply(conn, boost::static_pointer_cast< + MOSDPGUpdateLogMissingReply>(m)); + default: + return std::nullopt; + } +} + +void OSD::ms_handle_reset(crimson::net::ConnectionRef conn, bool is_replace) +{ + // TODO: cleanup the session attached to this connection + logger().warn("ms_handle_reset"); +} + +void OSD::ms_handle_remote_reset(crimson::net::ConnectionRef conn) +{ + logger().warn("ms_handle_remote_reset"); +} + +void OSD::handle_authentication(const EntityName& name, + const AuthCapsInfo& caps_info) +{ + // TODO: store the parsed cap and associate it with the connection + if (caps_info.allow_all) { + logger().debug("{} {} has all caps", __func__, name); + return; + } + if (caps_info.caps.length() > 0) { + auto p = caps_info.caps.cbegin(); + string str; + try { + decode(str, p); + } catch (ceph::buffer::error& e) { + logger().warn("{} {} failed to decode caps string", __func__, name); + return; + } + OSDCap caps; + if (caps.parse(str)) { + logger().debug("{} {} has caps {}", __func__, name, str); + } else { + logger().warn("{} {} failed to parse caps {}", __func__, name, str); + } + } +} + +void OSD::update_stats() +{ + osd_stat_seq++; + osd_stat.up_from = get_shard_services().get_up_epoch(); + osd_stat.hb_peers = heartbeat->get_peers(); + osd_stat.seq = ( + static_cast<uint64_t>(get_shard_services().get_up_epoch()) << 32 + ) | osd_stat_seq; + gate.dispatch_in_background("statfs", *this, [this] { + (void) store.stat().then([this](store_statfs_t&& st) { + osd_stat.statfs = st; + }); + }); +} + +seastar::future<MessageURef> OSD::get_stats() const +{ + // MPGStats::had_map_for is not used since PGMonitor was removed + auto m = crimson::make_message<MPGStats>(monc->get_fsid(), osdmap->get_epoch()); + m->osd_stat = osd_stat; + return pg_shard_manager.get_pg_stats( + ).then([m=std::move(m)](auto &&stats) mutable { + m->pg_stat = std::move(stats); + return seastar::make_ready_future<MessageURef>(std::move(m)); + }); +} + +uint64_t OSD::send_pg_stats() +{ + // mgr client sends the report message in background + mgrc->report(); + return osd_stat.seq; +} + +seastar::future<> OSD::handle_osd_map(Ref<MOSDMap> m) +{ + /* Ensure that only one MOSDMap is processed at a time. Allowing concurrent + * processing may eventually be worthwhile, but such an implementation would + * need to ensure (among other things) + * 1. any particular map is only processed once + * 2. PGAdvanceMap operations are processed in order for each PG + * As map handling is not presently a bottleneck, we stick to this + * simpler invariant for now. 
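+ * (handle_osd_map_lock, taken just below, is what provides that
+ * serialization.)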
+ * See https://tracker.ceph.com/issues/59165 + */ + ceph_assert(seastar::this_shard_id() == PRIMARY_CORE); + return handle_osd_map_lock.lock().then([this, m] { + return _handle_osd_map(m); + }).finally([this] { + return handle_osd_map_lock.unlock(); + }); +} + +seastar::future<> OSD::_handle_osd_map(Ref<MOSDMap> m) +{ + logger().info("handle_osd_map {}", *m); + if (m->fsid != superblock.cluster_fsid) { + logger().warn("fsid mismatched"); + return seastar::now(); + } + if (pg_shard_manager.is_initializing()) { + logger().warn("i am still initializing"); + return seastar::now(); + } + + const auto first = m->get_first(); + const auto last = m->get_last(); + logger().info("handle_osd_map epochs [{}..{}], i have {}, src has [{}..{}]", + first, last, superblock.newest_map, + m->cluster_osdmap_trim_lower_bound, m->newest_map); + // make sure there is something new, here, before we bother flushing + // the queues and such + if (last <= superblock.newest_map) { + return seastar::now(); + } + // missing some? + bool skip_maps = false; + epoch_t start = superblock.newest_map + 1; + if (first > start) { + logger().info("handle_osd_map message skips epochs {}..{}", + start, first - 1); + if (m->cluster_osdmap_trim_lower_bound <= start) { + return get_shard_services().osdmap_subscribe(start, false); + } + // always try to get the full range of maps--as many as we can. this + // 1- is good to have + // 2- is at present the only way to ensure that we get a *full* map as + // the first map! + if (m->cluster_osdmap_trim_lower_bound < first) { + return get_shard_services().osdmap_subscribe( + m->cluster_osdmap_trim_lower_bound - 1, true); + } + skip_maps = true; + start = first; + } + + return seastar::do_with(ceph::os::Transaction{}, + [=, this](auto& t) { + return pg_shard_manager.store_maps(t, start, m).then([=, this, &t] { + // even if this map isn't from a mon, we may have satisfied our subscription + monc->sub_got("osdmap", last); + if (!superblock.oldest_map || skip_maps) { + superblock.oldest_map = first; + } + superblock.newest_map = last; + superblock.current_epoch = last; + + // note in the superblock that we were clean thru the prior epoch + if (boot_epoch && boot_epoch >= superblock.mounted) { + superblock.mounted = boot_epoch; + superblock.clean_thru = last; + } + pg_shard_manager.get_meta_coll().store_superblock(t, superblock); + pg_shard_manager.set_superblock(superblock); + logger().debug("OSD::handle_osd_map: do_transaction..."); + return store.get_sharded_store().do_transaction( + pg_shard_manager.get_meta_coll().collection(), + std::move(t)); + }); + }).then([=, this] { + // TODO: write to superblock and commit the transaction + return committed_osd_maps(start, last, m); + }); +} + +seastar::future<> OSD::committed_osd_maps( + version_t first, + version_t last, + Ref<MOSDMap> m) +{ + ceph_assert(seastar::this_shard_id() == PRIMARY_CORE); + logger().info("osd.{}: committed_osd_maps({}, {})", whoami, first, last); + // advance through the new maps + return seastar::do_for_each(boost::make_counting_iterator(first), + boost::make_counting_iterator(last + 1), + [this](epoch_t cur) { + return pg_shard_manager.get_local_map( + cur + ).then([this](OSDMapService::local_cached_map_t&& o) { + osdmap = make_local_shared_foreign(OSDMapService::local_cached_map_t(o)); + return pg_shard_manager.update_map(std::move(o)); + }).then([this] { + if (get_shard_services().get_up_epoch() == 0 && + osdmap->is_up(whoami) && + osdmap->get_addrs(whoami) == public_msgr->get_myaddrs()) { + return 
pg_shard_manager.set_up_epoch( + osdmap->get_epoch() + ).then([this] { + if (!boot_epoch) { + boot_epoch = osdmap->get_epoch(); + } + }); + } else { + return seastar::now(); + } + }); + }).then([m, this] { + auto fut = seastar::now(); + if (osdmap->is_up(whoami)) { + const auto up_from = osdmap->get_up_from(whoami); + logger().info("osd.{}: map e {} marked me up: up_from {}, bind_epoch {}, state {}", + whoami, osdmap->get_epoch(), up_from, bind_epoch, + pg_shard_manager.get_osd_state_string()); + if (bind_epoch < up_from && + osdmap->get_addrs(whoami) == public_msgr->get_myaddrs() && + pg_shard_manager.is_booting()) { + logger().info("osd.{}: activating...", whoami); + fut = pg_shard_manager.set_active().then([this] { + beacon_timer.arm_periodic( + std::chrono::seconds(local_conf()->osd_beacon_report_interval)); + // timer continuation rearms when complete + tick_timer.arm( + std::chrono::seconds(TICK_INTERVAL)); + }); + } + } else { + if (pg_shard_manager.is_prestop()) { + got_stop_ack(); + return seastar::now(); + } + } + return fut.then([this] { + return check_osdmap_features().then([this] { + // yay! + logger().info("osd.{}: committed_osd_maps: broadcasting osdmaps up" + " to {} epoch to pgs", whoami, osdmap->get_epoch()); + return pg_shard_manager.broadcast_map_to_pgs(osdmap->get_epoch()); + }); + }); + }).then([m, this] { + if (pg_shard_manager.is_active()) { + logger().info("osd.{}: now active", whoami); + if (!osdmap->exists(whoami) || + osdmap->is_stop(whoami)) { + return shutdown(); + } + if (should_restart()) { + return restart(); + } else { + return seastar::now(); + } + } else if (pg_shard_manager.is_preboot()) { + logger().info("osd.{}: now preboot", whoami); + + if (m->get_source().is_mon()) { + return _preboot( + m->cluster_osdmap_trim_lower_bound, m->newest_map); + } else { + logger().info("osd.{}: start_boot", whoami); + return start_boot(); + } + } else { + logger().info("osd.{}: now {}", whoami, + pg_shard_manager.get_osd_state_string()); + // XXX + return seastar::now(); + } + }); +} + +seastar::future<> OSD::handle_osd_op( + crimson::net::ConnectionRef conn, + Ref<MOSDOp> m) +{ + return pg_shard_manager.start_pg_operation<ClientRequest>( + get_shard_services(), + conn, + std::move(m)).second; +} + +seastar::future<> OSD::handle_pg_create( + crimson::net::ConnectionRef conn, + Ref<MOSDPGCreate2> m) +{ + return seastar::do_for_each(m->pgs, [this, conn, m](auto& pg) { + auto& [pgid, when] = pg; + const auto &[created, created_stamp] = when; + auto q = m->pg_extra.find(pgid); + ceph_assert(q != m->pg_extra.end()); + auto& [history, pi] = q->second; + logger().debug( + "{}: {} e{} @{} " + "history {} pi {}", + __func__, pgid, created, created_stamp, + history, pi); + if (!pi.empty() && + m->epoch < pi.get_bounds().second) { + logger().error( + "got pg_create on {} epoch {} " + "unmatched past_intervals {} (history {})", + pgid, m->epoch, + pi, history); + return seastar::now(); + } else { + return pg_shard_manager.start_pg_operation<RemotePeeringEvent>( + conn, + pg_shard_t(), + pgid, + m->epoch, + m->epoch, + NullEvt(), + true, + new PGCreateInfo(pgid, m->epoch, history, pi, true)).second; + } + }); +} + +seastar::future<> OSD::handle_update_log_missing( + crimson::net::ConnectionRef conn, + Ref<MOSDPGUpdateLogMissing> m) +{ + m->decode_payload(); + return pg_shard_manager.start_pg_operation<LogMissingRequest>( + std::move(conn), + std::move(m)).second; +} + +seastar::future<> OSD::handle_update_log_missing_reply( + crimson::net::ConnectionRef conn, + 
Ref<MOSDPGUpdateLogMissingReply> m) +{ + m->decode_payload(); + return pg_shard_manager.start_pg_operation<LogMissingRequestReply>( + std::move(conn), + std::move(m)).second; +} + +seastar::future<> OSD::handle_rep_op( + crimson::net::ConnectionRef conn, + Ref<MOSDRepOp> m) +{ + m->finish_decode(); + return pg_shard_manager.start_pg_operation<RepRequest>( + std::move(conn), + std::move(m)).second; +} + +seastar::future<> OSD::handle_rep_op_reply( + crimson::net::ConnectionRef conn, + Ref<MOSDRepOpReply> m) +{ + spg_t pgid = m->get_spg(); + return pg_shard_manager.with_pg( + pgid, + [m=std::move(m)](auto &&pg) { + if (pg) { + m->finish_decode(); + pg->handle_rep_op_reply(*m); + } else { + logger().warn("stale reply: {}", *m); + } + return seastar::now(); + }); +} + +seastar::future<> OSD::handle_scrub( + crimson::net::ConnectionRef conn, + Ref<MOSDScrub2> m) +{ + if (m->fsid != superblock.cluster_fsid) { + logger().warn("fsid mismatched"); + return seastar::now(); + } + return seastar::parallel_for_each(std::move(m->scrub_pgs), + [m, conn, this](spg_t pgid) { + pg_shard_t from_shard{static_cast<int>(m->get_source().num()), + pgid.shard}; + PeeringState::RequestScrub scrub_request{m->deep, m->repair}; + return pg_shard_manager.start_pg_operation<RemotePeeringEvent>( + conn, + from_shard, + pgid, + PGPeeringEvent{m->epoch, m->epoch, scrub_request}).second; + }); +} + +seastar::future<> OSD::handle_mark_me_down( + crimson::net::ConnectionRef conn, + Ref<MOSDMarkMeDown> m) +{ + ceph_assert(seastar::this_shard_id() == PRIMARY_CORE); + if (pg_shard_manager.is_prestop()) { + got_stop_ack(); + } + return seastar::now(); +} + +seastar::future<> OSD::handle_recovery_subreq( + crimson::net::ConnectionRef conn, + Ref<MOSDFastDispatchOp> m) +{ + return pg_shard_manager.start_pg_operation<RecoverySubRequest>( + conn, std::move(m)).second; +} + +bool OSD::should_restart() const +{ + if (!osdmap->is_up(whoami)) { + logger().info("map e {} marked osd.{} down", + osdmap->get_epoch(), whoami); + return true; + } else if (osdmap->get_addrs(whoami) != public_msgr->get_myaddrs()) { + logger().error("map e {} had wrong client addr ({} != my {})", + osdmap->get_epoch(), + osdmap->get_addrs(whoami), + public_msgr->get_myaddrs()); + return true; + } else if (osdmap->get_cluster_addrs(whoami) != cluster_msgr->get_myaddrs()) { + logger().error("map e {} had wrong cluster addr ({} != my {})", + osdmap->get_epoch(), + osdmap->get_cluster_addrs(whoami), + cluster_msgr->get_myaddrs()); + return true; + } else { + return false; + } +} + +seastar::future<> OSD::restart() +{ + beacon_timer.cancel(); + tick_timer.cancel(); + return pg_shard_manager.set_up_epoch( + 0 + ).then([this] { + bind_epoch = osdmap->get_epoch(); + // TODO: promote to shutdown if being marked down for multiple times + // rebind messengers + return start_boot(); + }); +} + +seastar::future<> OSD::shutdown() +{ + logger().info("shutting down per osdmap"); + abort_source.request_abort(); + return seastar::now(); +} + +seastar::future<> OSD::send_beacon() +{ + if (!pg_shard_manager.is_active()) { + return seastar::now(); + } + // FIXME: min lec should be calculated from pg_stat + // and should set m->pgs + epoch_t min_last_epoch_clean = osdmap->get_epoch(); + auto m = crimson::make_message<MOSDBeacon>(osdmap->get_epoch(), + min_last_epoch_clean, + superblock.last_purged_snaps_scrub, + local_conf()->osd_beacon_report_interval); + return monc->send_message(std::move(m)); +} + +seastar::future<> OSD::update_heartbeat_peers() +{ + if 
(!pg_shard_manager.is_active()) { + return seastar::now();; + } + + pg_shard_manager.for_each_pgid([this](auto &pgid) { + vector<int> up, acting; + osdmap->pg_to_up_acting_osds(pgid.pgid, + &up, nullptr, + &acting, nullptr); + for (int osd : boost::join(up, acting)) { + if (osd == CRUSH_ITEM_NONE || osd == whoami) { + continue; + } else { + heartbeat->add_peer(osd, osdmap->get_epoch()); + } + } + }); + heartbeat->update_peers(whoami); + return seastar::now(); +} + +seastar::future<> OSD::handle_peering_op( + crimson::net::ConnectionRef conn, + Ref<MOSDPeeringOp> m) +{ + const int from = m->get_source().num(); + logger().debug("handle_peering_op on {} from {}", m->get_spg(), from); + m->set_features(conn->get_features()); + std::unique_ptr<PGPeeringEvent> evt(m->get_event()); + return pg_shard_manager.start_pg_operation<RemotePeeringEvent>( + conn, + pg_shard_t{from, m->get_spg().shard}, + m->get_spg(), + std::move(*evt)).second; +} + +seastar::future<> OSD::check_osdmap_features() +{ + assert(seastar::this_shard_id() == PRIMARY_CORE); + return store.write_meta( + "require_osd_release", + stringify((int)osdmap->require_osd_release)); +} + +seastar::future<> OSD::prepare_to_stop() +{ + if (osdmap && osdmap->is_up(whoami)) { + pg_shard_manager.set_prestop(); + const auto timeout = + std::chrono::duration_cast<std::chrono::milliseconds>( + std::chrono::duration<double>( + local_conf().get_val<double>("osd_mon_shutdown_timeout"))); + + return seastar::with_timeout( + seastar::timer<>::clock::now() + timeout, + monc->send_message( + crimson::make_message<MOSDMarkMeDown>( + monc->get_fsid(), + whoami, + osdmap->get_addrs(whoami), + osdmap->get_epoch(), + true)).then([this] { + return stop_acked.get_future(); + }) + ).handle_exception_type( + [](seastar::timed_out_error&) { + return seastar::now(); + }); + } + return seastar::now(); +} + +} diff --git a/src/crimson/osd/osd.h b/src/crimson/osd/osd.h new file mode 100644 index 000000000..10ff60d47 --- /dev/null +++ b/src/crimson/osd/osd.h @@ -0,0 +1,251 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <seastar/core/abort_source.hh> +#include <seastar/core/future.hh> +#include <seastar/core/shared_future.hh> +#include <seastar/core/gate.hh> +#include <seastar/core/shared_ptr.hh> +#include <seastar/core/shared_future.hh> +#include <seastar/core/timer.hh> + +#include "crimson/common/logclient.h" +#include "crimson/common/type_helpers.h" +#include "crimson/common/auth_handler.h" +#include "crimson/common/gated.h" +#include "crimson/admin/admin_socket.h" +#include "crimson/common/simple_lru.h" +#include "crimson/mgr/client.h" +#include "crimson/net/Dispatcher.h" +#include "crimson/osd/osdmap_service.h" +#include "crimson/osd/pg_shard_manager.h" +#include "crimson/osd/osdmap_gate.h" +#include "crimson/osd/pg_map.h" +#include "crimson/osd/osd_operations/peering_event.h" +#include "crimson/osd/state.h" + +#include "messages/MOSDOp.h" +#include "osd/PeeringState.h" +#include "osd/osd_types.h" +#include "osd/osd_perf_counters.h" +#include "osd/PGPeeringEvent.h" + +class MCommand; +class MOSDMap; +class MOSDRepOpReply; +class MOSDRepOp; +class MOSDScrub2; +class OSDMeta; +class Heartbeat; + +namespace ceph::os { + class Transaction; +} + +namespace crimson::mon { + class Client; +} + +namespace crimson::net { + class Messenger; +} + +namespace crimson::os { + class FuturizedStore; +} + +namespace crimson::osd { +class PG; + +class OSD final : public crimson::net::Dispatcher, + 
private crimson::common::AuthHandler, + private crimson::mgr::WithStats { + const int whoami; + const uint32_t nonce; + seastar::abort_source& abort_source; + seastar::timer<seastar::lowres_clock> beacon_timer; + // talk with osd + crimson::net::MessengerRef cluster_msgr; + // talk with client/mon/mgr + crimson::net::MessengerRef public_msgr; + + // HB Messengers + crimson::net::MessengerRef hb_front_msgr; + crimson::net::MessengerRef hb_back_msgr; + + std::unique_ptr<crimson::mon::Client> monc; + std::unique_ptr<crimson::mgr::Client> mgrc; + + // TODO: use a wrapper for ObjectStore + OSDMapService::cached_map_t osdmap; + crimson::os::FuturizedStore& store; + + /// _first_ epoch we were marked up (after this process started) + epoch_t boot_epoch = 0; + //< epoch we last did a bind to new ip:ports + epoch_t bind_epoch = 0; + //< since when there is no more pending pg creates from mon + epoch_t last_pg_create_epoch = 0; + + ceph::mono_time startup_time; + + seastar::shared_mutex handle_osd_map_lock; + + OSDSuperblock superblock; + + // Dispatcher methods + std::optional<seastar::future<>> ms_dispatch(crimson::net::ConnectionRef, MessageRef) final; + void ms_handle_reset(crimson::net::ConnectionRef conn, bool is_replace) final; + void ms_handle_remote_reset(crimson::net::ConnectionRef conn) final; + + std::optional<seastar::future<>> do_ms_dispatch(crimson::net::ConnectionRef, MessageRef); + + // mgr::WithStats methods + // pg statistics including osd ones + osd_stat_t osd_stat; + uint32_t osd_stat_seq = 0; + void update_stats(); + seastar::future<MessageURef> get_stats() const final; + + // AuthHandler methods + void handle_authentication(const EntityName& name, + const AuthCapsInfo& caps) final; + + seastar::sharded<PGShardMapping> pg_to_shard_mappings; + seastar::sharded<OSDSingletonState> osd_singleton_state; + seastar::sharded<OSDState> osd_states; + seastar::sharded<ShardServices> shard_services; + + crimson::osd::PGShardManager pg_shard_manager; + + std::unique_ptr<Heartbeat> heartbeat; + seastar::timer<seastar::lowres_clock> tick_timer; + + // admin-socket + seastar::lw_shared_ptr<crimson::admin::AdminSocket> asok; + +public: + OSD(int id, uint32_t nonce, + seastar::abort_source& abort_source, + crimson::os::FuturizedStore& store, + crimson::net::MessengerRef cluster_msgr, + crimson::net::MessengerRef client_msgr, + crimson::net::MessengerRef hb_front_msgr, + crimson::net::MessengerRef hb_back_msgr); + ~OSD() final; + + auto &get_pg_shard_manager() { + return pg_shard_manager; + } + + seastar::future<> open_meta_coll(); + static seastar::future<OSDMeta> open_or_create_meta_coll( + crimson::os::FuturizedStore &store + ); + static seastar::future<> mkfs( + crimson::os::FuturizedStore &store, + unsigned whoami, + uuid_d osd_uuid, + uuid_d cluster_fsid, + std::string osdspec_affinity); + + seastar::future<> start(); + seastar::future<> stop(); + + void dump_status(Formatter*) const; + void print(std::ostream&) const; + + /// @return the seq id of the pg stats being sent + uint64_t send_pg_stats(); + + auto &get_shard_services() { + return shard_services.local(); + } + +private: + static seastar::future<> _write_superblock( + crimson::os::FuturizedStore &store, + OSDMeta meta, + OSDSuperblock superblock); + static seastar::future<> _write_key_meta( + crimson::os::FuturizedStore &store + ); + seastar::future<> start_boot(); + seastar::future<> _preboot(version_t oldest_osdmap, version_t newest_osdmap); + seastar::future<> _send_boot(); + seastar::future<> _add_me_to_crush(); + + 
seastar::future<> osdmap_subscribe(version_t epoch, bool force_request); + + seastar::future<> start_asok_admin(); + + void write_superblock(ceph::os::Transaction& t); + seastar::future<> read_superblock(); + + seastar::future<> handle_osd_map(Ref<MOSDMap> m); + seastar::future<> _handle_osd_map(Ref<MOSDMap> m); + seastar::future<> handle_pg_create(crimson::net::ConnectionRef conn, + Ref<MOSDPGCreate2> m); + seastar::future<> handle_osd_op(crimson::net::ConnectionRef conn, + Ref<MOSDOp> m); + seastar::future<> handle_rep_op(crimson::net::ConnectionRef conn, + Ref<MOSDRepOp> m); + seastar::future<> handle_rep_op_reply(crimson::net::ConnectionRef conn, + Ref<MOSDRepOpReply> m); + seastar::future<> handle_peering_op(crimson::net::ConnectionRef conn, + Ref<MOSDPeeringOp> m); + seastar::future<> handle_recovery_subreq(crimson::net::ConnectionRef conn, + Ref<MOSDFastDispatchOp> m); + seastar::future<> handle_scrub(crimson::net::ConnectionRef conn, + Ref<MOSDScrub2> m); + seastar::future<> handle_mark_me_down(crimson::net::ConnectionRef conn, + Ref<MOSDMarkMeDown> m); + + seastar::future<> committed_osd_maps(version_t first, + version_t last, + Ref<MOSDMap> m); + + seastar::future<> check_osdmap_features(); + + seastar::future<> handle_command(crimson::net::ConnectionRef conn, + Ref<MCommand> m); + seastar::future<> handle_update_log_missing(crimson::net::ConnectionRef conn, + Ref<MOSDPGUpdateLogMissing> m); + seastar::future<> handle_update_log_missing_reply( + crimson::net::ConnectionRef conn, + Ref<MOSDPGUpdateLogMissingReply> m); + +private: + crimson::common::Gated gate; + + seastar::promise<> stop_acked; + void got_stop_ack() { + stop_acked.set_value(); + } + seastar::future<> prepare_to_stop(); + bool should_restart() const; + seastar::future<> restart(); + seastar::future<> shutdown(); + seastar::future<> update_heartbeat_peers(); + friend class PGAdvanceMap; + +public: + seastar::future<> send_beacon(); + +private: + LogClient log_client; + LogChannelRef clog; +}; + +inline std::ostream& operator<<(std::ostream& out, const OSD& osd) { + osd.print(out); + return out; +} + +} + +#if FMT_VERSION >= 90000 +template <> struct fmt::formatter<crimson::osd::OSD> : fmt::ostream_formatter {}; +#endif diff --git a/src/crimson/osd/osd_connection_priv.h b/src/crimson/osd/osd_connection_priv.h new file mode 100644 index 000000000..69edf94b8 --- /dev/null +++ b/src/crimson/osd/osd_connection_priv.h @@ -0,0 +1,27 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "crimson/net/Connection.h" +#include "crimson/osd/osd_operation.h" +#include "crimson/osd/osd_operations/client_request.h" +#include "crimson/osd/osd_operations/peering_event.h" +#include "crimson/osd/osd_operations/replicated_request.h" + +namespace crimson::osd { + +struct OSDConnectionPriv : public crimson::net::Connection::user_private_t { + ConnectionPipeline client_request_conn_pipeline; + ConnectionPipeline peering_request_conn_pipeline; + ConnectionPipeline replicated_request_conn_pipeline; +}; + +static OSDConnectionPriv &get_osd_priv(crimson::net::Connection *conn) { + if (!conn->has_user_private()) { + conn->set_user_private(std::make_unique<OSDConnectionPriv>()); + } + return static_cast<OSDConnectionPriv&>(conn->get_user_private()); +} + +} diff --git a/src/crimson/osd/osd_meta.cc b/src/crimson/osd/osd_meta.cc new file mode 100644 index 000000000..e40b2b246 --- /dev/null +++ b/src/crimson/osd/osd_meta.cc @@ -0,0 +1,98 @@ +// -*- mode:C++; tab-width:8; 
c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "osd_meta.h" + +#include <fmt/format.h> +#include <fmt/ostream.h> + +#include "crimson/os/futurized_collection.h" +#include "crimson/os/futurized_store.h" +#include "os/Transaction.h" + +using std::string; +using read_errorator = crimson::os::FuturizedStore::Shard::read_errorator; + +void OSDMeta::create(ceph::os::Transaction& t) +{ + t.create_collection(coll->get_cid(), 0); +} + +void OSDMeta::store_map(ceph::os::Transaction& t, + epoch_t e, const bufferlist& m) +{ + t.write(coll->get_cid(), osdmap_oid(e), 0, m.length(), m); +} + +seastar::future<bufferlist> OSDMeta::load_map(epoch_t e) +{ + return store.read(coll, + osdmap_oid(e), 0, 0, + CEPH_OSD_OP_FLAG_FADVISE_WILLNEED).handle_error( + read_errorator::all_same_way([e] { + ceph_abort_msg(fmt::format("{} read gave enoent on {}", + __func__, osdmap_oid(e))); + })); +} + +void OSDMeta::store_superblock(ceph::os::Transaction& t, + const OSDSuperblock& superblock) +{ + bufferlist bl; + encode(superblock, bl); + t.write(coll->get_cid(), superblock_oid(), 0, bl.length(), bl); +} + +OSDMeta::load_superblock_ret OSDMeta::load_superblock() +{ + return store.read( + coll, superblock_oid(), 0, 0 + ).safe_then([] (bufferlist&& bl) { + auto p = bl.cbegin(); + OSDSuperblock superblock; + decode(superblock, p); + return seastar::make_ready_future<OSDSuperblock>(std::move(superblock)); + }); +} + +seastar::future<std::tuple<pg_pool_t, + std::string, + OSDMeta::ec_profile_t>> +OSDMeta::load_final_pool_info(int64_t pool) { + return store.read(coll, final_pool_info_oid(pool), + 0, 0).safe_then([] (bufferlist&& bl) { + auto p = bl.cbegin(); + pg_pool_t pi; + string name; + ec_profile_t ec_profile; + decode(pi, p); + decode(name, p); + decode(ec_profile, p); + return seastar::make_ready_future<std::tuple<pg_pool_t, + string, + ec_profile_t>>( + std::make_tuple(std::move(pi), + std::move(name), + std::move(ec_profile))); + },read_errorator::all_same_way([pool] { + throw std::runtime_error(fmt::format("read gave enoent on {}", + final_pool_info_oid(pool))); + })); +} + +ghobject_t OSDMeta::osdmap_oid(epoch_t epoch) +{ + string name = fmt::format("osdmap.{}", epoch); + return ghobject_t(hobject_t(sobject_t(object_t(name), 0))); +} + +ghobject_t OSDMeta::final_pool_info_oid(int64_t pool) +{ + string name = fmt::format("final_pool_{}", pool); + return ghobject_t(hobject_t(sobject_t(object_t(name), CEPH_NOSNAP))); +} + +ghobject_t OSDMeta::superblock_oid() +{ + return ghobject_t(hobject_t(sobject_t(object_t("osd_superblock"), 0))); +} diff --git a/src/crimson/osd/osd_meta.h b/src/crimson/osd/osd_meta.h new file mode 100644 index 000000000..652266d9e --- /dev/null +++ b/src/crimson/osd/osd_meta.h @@ -0,0 +1,60 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <map> +#include <string> +#include <seastar/core/future.hh> +#include "osd/osd_types.h" +#include "crimson/os/futurized_collection.h" +#include "crimson/os/futurized_store.h" + +namespace ceph::os { + class Transaction; +} + +namespace crimson::os { + class FuturizedCollection; + class FuturizedStore; +} + +/// metadata shared across PGs, or put in another way, +/// metadata not specific to certain PGs. 
+class OSDMeta { + template<typename T> using Ref = boost::intrusive_ptr<T>; + + crimson::os::FuturizedStore::Shard& store; + Ref<crimson::os::FuturizedCollection> coll; + +public: + OSDMeta(Ref<crimson::os::FuturizedCollection> coll, + crimson::os::FuturizedStore::Shard& store) + : store{store}, coll{coll} + {} + + auto collection() { + return coll; + } + void create(ceph::os::Transaction& t); + + void store_map(ceph::os::Transaction& t, + epoch_t e, const bufferlist& m); + seastar::future<bufferlist> load_map(epoch_t e); + + void store_superblock(ceph::os::Transaction& t, + const OSDSuperblock& sb); + + using load_superblock_ertr = crimson::os::FuturizedStore::Shard::read_errorator; + using load_superblock_ret = load_superblock_ertr::future<OSDSuperblock>; + load_superblock_ret load_superblock(); + + using ec_profile_t = std::map<std::string, std::string>; + seastar::future<std::tuple<pg_pool_t, + std::string, + ec_profile_t>> load_final_pool_info(int64_t pool); +private: + static ghobject_t osdmap_oid(epoch_t epoch); + static ghobject_t final_pool_info_oid(int64_t pool); + static ghobject_t superblock_oid(); +}; diff --git a/src/crimson/osd/osd_operation.cc b/src/crimson/osd/osd_operation.cc new file mode 100644 index 000000000..920fdc114 --- /dev/null +++ b/src/crimson/osd/osd_operation.cc @@ -0,0 +1,227 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "osd_operation.h" +#include "common/Formatter.h" +#include "crimson/common/log.h" +#include "crimson/osd/osd_operations/client_request.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_osd); + } +} + +namespace crimson::osd { + +void OSDOperationRegistry::do_stop() +{ + logger().info("OSDOperationRegistry::{}", __func__); + // we need to decouple visiting the registry from destructing + // ops because of the auto-unlink feature of boost::intrusive. + // the list shouldn't change while iterating due to constrains + // on iterator's validity. + constexpr auto historic_reg_index = + static_cast<size_t>(OperationTypeCode::historic_client_request); + auto& historic_registry = get_registry<historic_reg_index>(); + std::vector<ClientRequest::ICRef> to_ref_down; + std::transform(std::begin(historic_registry), std::end(historic_registry), + std::back_inserter(to_ref_down), + [] (const Operation& op) { + return ClientRequest::ICRef{ + static_cast<const ClientRequest*>(&op), + /* add_ref= */ false + }; + }); + last_of_recents = std::end(historic_registry); + // to_ref_down is going off +} + +OSDOperationRegistry::OSDOperationRegistry() + : OperationRegistryT(seastar::this_shard_id()) +{ + constexpr auto historic_reg_index = + static_cast<size_t>(OperationTypeCode::historic_client_request); + auto& historic_registry = get_registry<historic_reg_index>(); + last_of_recents = std::begin(historic_registry); +} + +static auto get_duration(const ClientRequest& client_request) +{ + // TODO: consider enhancing `CompletionEvent` with computing duration + // once -- when it's enetered. + return client_request.get_completed() - client_request.get_started(); +} + +void OSDOperationRegistry::put_historic(const ClientRequest& op) +{ + // unlink the op from the client request registry. this is a part of + // the re-link procedure. finally it will be in historic registry. 
+ constexpr auto client_reg_index = + static_cast<size_t>(OperationTypeCode::client_request); + constexpr auto historic_reg_index = + static_cast<size_t>(OperationTypeCode::historic_client_request); + auto& client_registry = get_registry<client_reg_index>(); + auto& historic_registry = get_registry<historic_reg_index>(); + historic_registry.splice(std::end(historic_registry), + client_registry, + client_registry.iterator_to(op)); + ClientRequest::ICRef( + &op, /* add_ref= */true + ).detach(); // yes, "leak" it for now! + + // check whether the history size limit is not exceeded; if so, then + // purge the oldest op. + // NOTE: Operation uses the auto-unlink feature of boost::intrusive. + // NOTE: the cleaning happens in OSDOperationRegistry::do_stop() + using crimson::common::local_conf; + if (num_recent_ops >= local_conf()->osd_op_history_size) { + ++last_of_recents; + ++num_slow_ops; + } else { + ++num_recent_ops; + } + if (num_slow_ops > local_conf()->osd_op_history_slow_op_size) { + // we're interested in keeping slowest ops. if the slow op history + // is disabled, the list will have only one element, so the full-blown + // search will boil down into `.front()`. + const auto fastest_historic_iter = std::min_element( + std::cbegin(historic_registry), last_of_recents, + [] (const auto& lop, const auto& rop) { + const auto& lclient_request = static_cast<const ClientRequest&>(lop); + const auto& rclient_request = static_cast<const ClientRequest&>(rop); + return get_duration(lclient_request) < get_duration(rclient_request); + }); + assert(fastest_historic_iter != std::end(historic_registry)); + const auto& fastest_historic_op = + static_cast<const ClientRequest&>(*fastest_historic_iter); + historic_registry.erase(fastest_historic_iter); + // clear a previously "leaked" op + ClientRequest::ICRef(&fastest_historic_op, /* add_ref= */false); + --num_slow_ops; + } +} + +size_t OSDOperationRegistry::dump_historic_client_requests(ceph::Formatter* f) const +{ + const auto& historic_client_registry = + get_registry<static_cast<size_t>(OperationTypeCode::historic_client_request)>(); //ClientRequest::type)>(); + f->open_object_section("op_history"); + f->dump_int("size", historic_client_registry.size()); + // TODO: f->dump_int("duration", history_duration.load()); + // the intrusive list is configured to not store the size + size_t ops_count = 0; + { + f->open_array_section("ops"); + for (const auto& op : historic_client_registry) { + op.dump(f); + ++ops_count; + } + f->close_section(); + } + f->close_section(); + return ops_count; +} + +size_t OSDOperationRegistry::dump_slowest_historic_client_requests(ceph::Formatter* f) const +{ + const auto& historic_client_registry = + get_registry<static_cast<size_t>(OperationTypeCode::historic_client_request)>(); //ClientRequest::type)>(); + f->open_object_section("op_history"); + f->dump_int("size", historic_client_registry.size()); + // TODO: f->dump_int("duration", history_duration.load()); + // the intrusive list is configured to not store the size + std::multimap<utime_t, + const ClientRequest*, + std::greater<utime_t>> sorted_slowest_ops; + // iterating over the entire registry as a slow op could be also + // in the "recently added" part. 
+ std::transform(std::begin(historic_client_registry), + std::end(historic_client_registry), + std::inserter(sorted_slowest_ops, std::end(sorted_slowest_ops)), + [] (const Operation& op) { + const auto& cop = static_cast<const ClientRequest&>(op); + return std::make_pair(get_duration(cop), &cop); + }); + f->open_array_section("ops"); + using crimson::common::local_conf; + size_t ops_count = 0; + for (auto it = std::begin(sorted_slowest_ops); + ops_count < local_conf()->osd_op_history_slow_op_size + && it != std::end(sorted_slowest_ops); + ++it, ++ops_count) + { + it->second->dump(f); + } + f->close_section(); + return ops_count; +} + +OperationThrottler::OperationThrottler(ConfigProxy &conf) + : scheduler(crimson::osd::scheduler::make_scheduler(conf)) +{ + conf.add_observer(this); + update_from_config(conf); +} + +void OperationThrottler::wake() +{ + while ((!max_in_progress || in_progress < max_in_progress) && + !scheduler->empty()) { + auto item = scheduler->dequeue(); + item.wake.set_value(); + ++in_progress; + --pending; + } +} + +void OperationThrottler::release_throttle() +{ + ceph_assert(in_progress > 0); + --in_progress; + wake(); +} + +seastar::future<> OperationThrottler::acquire_throttle( + crimson::osd::scheduler::params_t params) +{ + crimson::osd::scheduler::item_t item{params, seastar::promise<>()}; + auto fut = item.wake.get_future(); + scheduler->enqueue(std::move(item)); + return fut; +} + +void OperationThrottler::dump_detail(Formatter *f) const +{ + f->dump_unsigned("max_in_progress", max_in_progress); + f->dump_unsigned("in_progress", in_progress); + f->open_object_section("scheduler"); + { + scheduler->dump(*f); + } + f->close_section(); +} + +void OperationThrottler::update_from_config(const ConfigProxy &conf) +{ + max_in_progress = conf.get_val<uint64_t>("crimson_osd_scheduler_concurrency"); + wake(); +} + +const char** OperationThrottler::get_tracked_conf_keys() const +{ + static const char* KEYS[] = { + "crimson_osd_scheduler_concurrency", + NULL + }; + return KEYS; +} + +void OperationThrottler::handle_conf_change( + const ConfigProxy& conf, + const std::set<std::string> &changed) +{ + update_from_config(conf); +} + +} diff --git a/src/crimson/osd/osd_operation.h b/src/crimson/osd/osd_operation.h new file mode 100644 index 000000000..8ef44ee9e --- /dev/null +++ b/src/crimson/osd/osd_operation.h @@ -0,0 +1,281 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "crimson/common/operation.h" +#include "crimson/osd/pg_interval_interrupt_condition.h" +#include "crimson/osd/scheduler/scheduler.h" +#include "osd/osd_types.h" + +namespace crimson::os::seastore { + template<class OpT> + class OperationProxyT; +} + +namespace crimson::osd { + +/// Ordering stages for a class of operations ordered by PG. 
+struct ConnectionPipeline { + struct AwaitActive : OrderedExclusivePhaseT<AwaitActive> { + static constexpr auto type_name = + "ConnectionPipeline::await_active"; + } await_active; + + struct AwaitMap : OrderedExclusivePhaseT<AwaitMap> { + static constexpr auto type_name = + "ConnectionPipeline::await_map"; + } await_map; + + struct GetPG : OrderedExclusivePhaseT<GetPG> { + static constexpr auto type_name = + "ConnectionPipeline::get_pg"; + } get_pg; +}; + +enum class OperationTypeCode { + client_request = 0, + peering_event, + pg_advance_map, + pg_creation, + replicated_request, + background_recovery, + background_recovery_sub, + internal_client_request, + historic_client_request, + logmissing_request, + logmissing_request_reply, + snaptrim_event, + snaptrimobj_subevent, + last_op +}; + +static constexpr const char* const OP_NAMES[] = { + "client_request", + "peering_event", + "pg_advance_map", + "pg_creation", + "replicated_request", + "background_recovery", + "background_recovery_sub", + "internal_client_request", + "historic_client_request", + "logmissing_request", + "logmissing_request_reply", + "snaptrim_event", + "snaptrimobj_subevent", +}; + +// prevent the addition of OperationTypeCode-s with no matching OP_NAMES entry: +static_assert( + (sizeof(OP_NAMES)/sizeof(OP_NAMES[0])) == + static_cast<int>(OperationTypeCode::last_op)); + +struct InterruptibleOperation : Operation { + template <typename ValuesT = void> + using interruptible_future = + ::crimson::interruptible::interruptible_future< + ::crimson::osd::IOInterruptCondition, ValuesT>; + using interruptor = + ::crimson::interruptible::interruptor< + ::crimson::osd::IOInterruptCondition>; +}; + +template <typename T> +struct OperationT : InterruptibleOperation { + static constexpr const char *type_name = OP_NAMES[static_cast<int>(T::type)]; + using IRef = boost::intrusive_ptr<T>; + using ICRef = boost::intrusive_ptr<const T>; + + unsigned get_type() const final { + return static_cast<unsigned>(T::type); + } + + const char *get_type_name() const final { + return T::type_name; + } + + virtual ~OperationT() = default; + +private: + virtual void dump_detail(ceph::Formatter *f) const = 0; +}; + +template <class T> +class TrackableOperationT : public OperationT<T> { + T* that() { + return static_cast<T*>(this); + } + const T* that() const { + return static_cast<const T*>(this); + } + +protected: + template<class EventT> + decltype(auto) get_event() { + // all out derivates are supposed to define the list of tracking + // events accessible via `std::get`. This will usually boil down + // into an instance of `std::tuple`. + return std::get<EventT>(that()->tracking_events); + } + + template<class EventT> + decltype(auto) get_event() const { + return std::get<EventT>(that()->tracking_events); + } + + using OperationT<T>::OperationT; + + struct StartEvent : TimeEvent<StartEvent> {}; + struct CompletionEvent : TimeEvent<CompletionEvent> {}; + + template <class EventT, class... Args> + void track_event(Args&&... 
args) { + // the idea is to have a visitor-like interface that allows to double + // dispatch (backend, blocker type) + get_event<EventT>().trigger(*that(), std::forward<Args>(args)...); + } + + template <class BlockingEventT, class InterruptorT=void, class F> + auto with_blocking_event(F&& f) { + auto ret = std::forward<F>(f)(typename BlockingEventT::template Trigger<T>{ + get_event<BlockingEventT>(), *that() + }); + if constexpr (std::is_same_v<InterruptorT, void>) { + return ret; + } else { + using ret_t = decltype(ret); + return typename InterruptorT::template futurize_t<ret_t>{std::move(ret)}; + } + } + +public: + static constexpr bool is_trackable = true; +}; + +template <class T> +class PhasedOperationT : public TrackableOperationT<T> { + using base_t = TrackableOperationT<T>; + + T* that() { + return static_cast<T*>(this); + } + const T* that() const { + return static_cast<const T*>(this); + } + +protected: + using TrackableOperationT<T>::TrackableOperationT; + + template <class InterruptorT=void, class StageT> + auto enter_stage(StageT& stage) { + return this->template with_blocking_event<typename StageT::BlockingEvent, + InterruptorT>( + [&stage, this] (auto&& trigger) { + // delegated storing the pipeline handle to let childs to match + // the lifetime of pipeline with e.g. ConnectedSocket (important + // for ConnectionPipeline). + return that()->get_handle().template enter<T>(stage, std::move(trigger)); + }); + } + + template <class OpT> + friend class crimson::os::seastore::OperationProxyT; + + // PGShardManager::start_pg_operation needs access to enter_stage, we can make this + // more sophisticated later on + friend class PGShardManager; +}; + +/** + * Maintains a set of lists of all active ops. + */ +struct OSDOperationRegistry : OperationRegistryT< + static_cast<size_t>(OperationTypeCode::last_op) +> { + OSDOperationRegistry(); + + void do_stop() override; + + void put_historic(const class ClientRequest& op); + + size_t dump_historic_client_requests(ceph::Formatter* f) const; + size_t dump_slowest_historic_client_requests(ceph::Formatter* f) const; + +private: + op_list::const_iterator last_of_recents; + size_t num_recent_ops = 0; + size_t num_slow_ops = 0; +}; +/** + * Throttles set of currently running operations + * + * Very primitive currently, assumes all ops are equally + * expensive and simply limits the number that can be + * concurrently active. + */ +class OperationThrottler : public BlockerT<OperationThrottler>, + private md_config_obs_t { + friend BlockerT<OperationThrottler>; + static constexpr const char* type_name = "OperationThrottler"; + + template <typename OperationT, typename F> + auto with_throttle( + OperationT* op, + crimson::osd::scheduler::params_t params, + F &&f) { + if (!max_in_progress) return f(); + return acquire_throttle(params) + .then(std::forward<F>(f)) + .then([this](auto x) { + release_throttle(); + return x; + }); + } + + template <typename OperationT, typename F> + seastar::future<> with_throttle_while( + OperationT* op, + crimson::osd::scheduler::params_t params, + F &&f) { + return with_throttle(op, params, f).then([this, params, op, f](bool cont) { + return cont ? with_throttle_while(op, params, f) : seastar::now(); + }); + } + + +public: + OperationThrottler(ConfigProxy &conf); + + const char** get_tracked_conf_keys() const final; + void handle_conf_change(const ConfigProxy& conf, + const std::set<std::string> &changed) final; + void update_from_config(const ConfigProxy &conf); + + template <class OpT, class... 
Args> + seastar::future<> with_throttle_while( + BlockingEvent::Trigger<OpT>&& trigger, + Args&&... args) { + return trigger.maybe_record_blocking( + with_throttle_while(std::forward<Args>(args)...), *this); + } + +private: + void dump_detail(Formatter *f) const final; + + crimson::osd::scheduler::SchedulerRef scheduler; + + uint64_t max_in_progress = 0; + uint64_t in_progress = 0; + + uint64_t pending = 0; + + void wake(); + + seastar::future<> acquire_throttle( + crimson::osd::scheduler::params_t params); + + void release_throttle(); +}; + +} diff --git a/src/crimson/osd/osd_operation_external_tracking.h b/src/crimson/osd/osd_operation_external_tracking.h new file mode 100644 index 000000000..4b6dbf4b7 --- /dev/null +++ b/src/crimson/osd/osd_operation_external_tracking.h @@ -0,0 +1,307 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "crimson/osd/osd.h" +#include "crimson/osd/osdmap_gate.h" +#include "crimson/osd/osd_operations/background_recovery.h" +#include "crimson/osd/osd_operations/client_request.h" +#include "crimson/osd/osd_operations/peering_event.h" +#include "crimson/osd/osd_operations/pg_advance_map.h" +#include "crimson/osd/osd_operations/recovery_subrequest.h" +#include "crimson/osd/osd_operations/replicated_request.h" +#include "crimson/osd/osd_operations/snaptrim_event.h" +#include "crimson/osd/pg_activation_blocker.h" +#include "crimson/osd/pg_map.h" + +namespace crimson::osd { + +// Just the boilerplate currently. Implementing +struct LttngBackend + : ClientRequest::StartEvent::Backend, + ConnectionPipeline::AwaitActive::BlockingEvent::Backend, + ConnectionPipeline::AwaitMap::BlockingEvent::Backend, + ConnectionPipeline::GetPG::BlockingEvent::Backend, + OSD_OSDMapGate::OSDMapBlocker::BlockingEvent::Backend, + PGMap::PGCreationBlockingEvent::Backend, + ClientRequest::PGPipeline::AwaitMap::BlockingEvent::Backend, + PG_OSDMapGate::OSDMapBlocker::BlockingEvent::Backend, + ClientRequest::PGPipeline::WaitForActive::BlockingEvent::Backend, + PGActivationBlocker::BlockingEvent::Backend, + ClientRequest::PGPipeline::RecoverMissing::BlockingEvent::Backend, + ClientRequest::PGPipeline::GetOBC::BlockingEvent::Backend, + ClientRequest::PGPipeline::Process::BlockingEvent::Backend, + ClientRequest::PGPipeline::WaitRepop::BlockingEvent::Backend, + ClientRequest::PGPipeline::WaitRepop::BlockingEvent::ExitBarrierEvent::Backend, + ClientRequest::PGPipeline::SendReply::BlockingEvent::Backend, + ClientRequest::CompletionEvent::Backend +{ + void handle(ClientRequest::StartEvent&, + const Operation&) override {} + + void handle(ConnectionPipeline::AwaitActive::BlockingEvent& ev, + const Operation& op, + const ConnectionPipeline::AwaitActive& blocker) override { + } + + void handle(ConnectionPipeline::AwaitMap::BlockingEvent& ev, + const Operation& op, + const ConnectionPipeline::AwaitMap& blocker) override { + } + + void handle(OSD_OSDMapGate::OSDMapBlocker::BlockingEvent&, + const Operation&, + const OSD_OSDMapGate::OSDMapBlocker&) override { + } + + void handle(ConnectionPipeline::GetPG::BlockingEvent& ev, + const Operation& op, + const ConnectionPipeline::GetPG& blocker) override { + } + + void handle(PGMap::PGCreationBlockingEvent&, + const Operation&, + const PGMap::PGCreationBlocker&) override { + } + + void handle(ClientRequest::PGPipeline::AwaitMap::BlockingEvent& ev, + const Operation& op, + const ClientRequest::PGPipeline::AwaitMap& blocker) override { + } + + void 
handle(PG_OSDMapGate::OSDMapBlocker::BlockingEvent&, + const Operation&, + const PG_OSDMapGate::OSDMapBlocker&) override { + } + + void handle(ClientRequest::PGPipeline::WaitForActive::BlockingEvent& ev, + const Operation& op, + const ClientRequest::PGPipeline::WaitForActive& blocker) override { + } + + void handle(PGActivationBlocker::BlockingEvent& ev, + const Operation& op, + const PGActivationBlocker& blocker) override { + } + + void handle(ClientRequest::PGPipeline::RecoverMissing::BlockingEvent& ev, + const Operation& op, + const ClientRequest::PGPipeline::RecoverMissing& blocker) override { + } + + void handle(ClientRequest::PGPipeline::GetOBC::BlockingEvent& ev, + const Operation& op, + const ClientRequest::PGPipeline::GetOBC& blocker) override { + } + + void handle(ClientRequest::PGPipeline::Process::BlockingEvent& ev, + const Operation& op, + const ClientRequest::PGPipeline::Process& blocker) override { + } + + void handle(ClientRequest::PGPipeline::WaitRepop::BlockingEvent& ev, + const Operation& op, + const ClientRequest::PGPipeline::WaitRepop& blocker) override { + } + + void handle(ClientRequest::PGPipeline::WaitRepop::BlockingEvent::ExitBarrierEvent& ev, + const Operation& op) override { + } + + void handle(ClientRequest::PGPipeline::SendReply::BlockingEvent& ev, + const Operation& op, + const ClientRequest::PGPipeline::SendReply& blocker) override { + } + + void handle(ClientRequest::CompletionEvent&, + const Operation&) override {} +}; + +struct HistoricBackend + : ClientRequest::StartEvent::Backend, + ConnectionPipeline::AwaitActive::BlockingEvent::Backend, + ConnectionPipeline::AwaitMap::BlockingEvent::Backend, + ConnectionPipeline::GetPG::BlockingEvent::Backend, + OSD_OSDMapGate::OSDMapBlocker::BlockingEvent::Backend, + PGMap::PGCreationBlockingEvent::Backend, + ClientRequest::PGPipeline::AwaitMap::BlockingEvent::Backend, + PG_OSDMapGate::OSDMapBlocker::BlockingEvent::Backend, + ClientRequest::PGPipeline::WaitForActive::BlockingEvent::Backend, + PGActivationBlocker::BlockingEvent::Backend, + ClientRequest::PGPipeline::RecoverMissing::BlockingEvent::Backend, + ClientRequest::PGPipeline::GetOBC::BlockingEvent::Backend, + ClientRequest::PGPipeline::Process::BlockingEvent::Backend, + ClientRequest::PGPipeline::WaitRepop::BlockingEvent::Backend, + ClientRequest::PGPipeline::WaitRepop::BlockingEvent::ExitBarrierEvent::Backend, + ClientRequest::PGPipeline::SendReply::BlockingEvent::Backend, + ClientRequest::CompletionEvent::Backend +{ + void handle(ClientRequest::StartEvent&, + const Operation&) override {} + + void handle(ConnectionPipeline::AwaitActive::BlockingEvent& ev, + const Operation& op, + const ConnectionPipeline::AwaitActive& blocker) override { + } + + void handle(ConnectionPipeline::AwaitMap::BlockingEvent& ev, + const Operation& op, + const ConnectionPipeline::AwaitMap& blocker) override { + } + + void handle(OSD_OSDMapGate::OSDMapBlocker::BlockingEvent&, + const Operation&, + const OSD_OSDMapGate::OSDMapBlocker&) override { + } + + void handle(ConnectionPipeline::GetPG::BlockingEvent& ev, + const Operation& op, + const ConnectionPipeline::GetPG& blocker) override { + } + + void handle(PGMap::PGCreationBlockingEvent&, + const Operation&, + const PGMap::PGCreationBlocker&) override { + } + + void handle(ClientRequest::PGPipeline::AwaitMap::BlockingEvent& ev, + const Operation& op, + const ClientRequest::PGPipeline::AwaitMap& blocker) override { + } + + void handle(PG_OSDMapGate::OSDMapBlocker::BlockingEvent&, + const Operation&, + const 
PG_OSDMapGate::OSDMapBlocker&) override { + } + + void handle(ClientRequest::PGPipeline::WaitForActive::BlockingEvent& ev, + const Operation& op, + const ClientRequest::PGPipeline::WaitForActive& blocker) override { + } + + void handle(PGActivationBlocker::BlockingEvent& ev, + const Operation& op, + const PGActivationBlocker& blocker) override { + } + + void handle(ClientRequest::PGPipeline::RecoverMissing::BlockingEvent& ev, + const Operation& op, + const ClientRequest::PGPipeline::RecoverMissing& blocker) override { + } + + void handle(ClientRequest::PGPipeline::GetOBC::BlockingEvent& ev, + const Operation& op, + const ClientRequest::PGPipeline::GetOBC& blocker) override { + } + + void handle(ClientRequest::PGPipeline::Process::BlockingEvent& ev, + const Operation& op, + const ClientRequest::PGPipeline::Process& blocker) override { + } + + void handle(ClientRequest::PGPipeline::WaitRepop::BlockingEvent& ev, + const Operation& op, + const ClientRequest::PGPipeline::WaitRepop& blocker) override { + } + + void handle(ClientRequest::PGPipeline::WaitRepop::BlockingEvent::ExitBarrierEvent& ev, + const Operation& op) override { + } + + void handle(ClientRequest::PGPipeline::SendReply::BlockingEvent& ev, + const Operation& op, + const ClientRequest::PGPipeline::SendReply& blocker) override { + } + + static const ClientRequest& to_client_request(const Operation& op) { +#ifdef NDEBUG + return static_cast<const ClientRequest&>(op); +#else + return dynamic_cast<const ClientRequest&>(op); +#endif + } + + void handle(ClientRequest::CompletionEvent&, const Operation& op) override { + if (crimson::common::local_conf()->osd_op_history_size) { + to_client_request(op).put_historic(); + } + } +}; + +} // namespace crimson::osd + +namespace crimson { + +template <> +struct EventBackendRegistry<osd::ClientRequest> { + static std::tuple<osd::LttngBackend, osd::HistoricBackend> get_backends() { + return { {}, {} }; + } +}; + +template <> +struct EventBackendRegistry<osd::RemotePeeringEvent> { + static std::tuple<> get_backends() { + return {/* no extenral backends */}; + } +}; + +template <> +struct EventBackendRegistry<osd::LocalPeeringEvent> { + static std::tuple<> get_backends() { + return {/* no extenral backends */}; + } +}; + +template <> +struct EventBackendRegistry<osd::RepRequest> { + static std::tuple<> get_backends() { + return {/* no extenral backends */}; + } +}; + + +template <> +struct EventBackendRegistry<osd::LogMissingRequest> { + static std::tuple<> get_backends() { + return {/* no extenral backends */}; + } +}; + +template <> +struct EventBackendRegistry<osd::LogMissingRequestReply> { + static std::tuple<> get_backends() { + return {/* no extenral backends */}; + } +}; + +template <> +struct EventBackendRegistry<osd::RecoverySubRequest> { + static std::tuple<> get_backends() { + return {/* no extenral backends */}; + } +}; + +template <> +struct EventBackendRegistry<osd::BackfillRecovery> { + static std::tuple<> get_backends() { + return {}; + } +}; + +template <> +struct EventBackendRegistry<osd::PGAdvanceMap> { + static std::tuple<> get_backends() { + return {}; + } +}; + +template <> +struct EventBackendRegistry<osd::SnapTrimObjSubEvent> { + static std::tuple<> get_backends() { + return {}; + } +}; + +} // namespace crimson diff --git a/src/crimson/osd/osd_operations/background_recovery.cc b/src/crimson/osd/osd_operations/background_recovery.cc new file mode 100644 index 000000000..953ec9595 --- /dev/null +++ b/src/crimson/osd/osd_operations/background_recovery.cc @@ -0,0 +1,207 @@ +// 
-*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <seastar/core/future.hh> +#include <seastar/core/sleep.hh> + +#include "messages/MOSDOp.h" + +#include "crimson/osd/pg.h" +#include "crimson/osd/shard_services.h" +#include "common/Formatter.h" +#include "crimson/osd/osd_operation_external_tracking.h" +#include "crimson/osd/osd_operations/background_recovery.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_osd); + } +} + +namespace crimson { + template <> + struct EventBackendRegistry<osd::UrgentRecovery> { + static std::tuple<> get_backends() { + return {}; + } + }; + + template <> + struct EventBackendRegistry<osd::PglogBasedRecovery> { + static std::tuple<> get_backends() { + return {}; + } + }; +} + +namespace crimson::osd { + +template <class T> +BackgroundRecoveryT<T>::BackgroundRecoveryT( + Ref<PG> pg, + ShardServices &ss, + epoch_t epoch_started, + crimson::osd::scheduler::scheduler_class_t scheduler_class, + float delay) + : pg(pg), + epoch_started(epoch_started), + delay(delay), + ss(ss), + scheduler_class(scheduler_class) +{} + +template <class T> +void BackgroundRecoveryT<T>::print(std::ostream &lhs) const +{ + lhs << "BackgroundRecovery(" << pg->get_pgid() << ")"; +} + +template <class T> +void BackgroundRecoveryT<T>::dump_detail(Formatter *f) const +{ + f->dump_stream("pgid") << pg->get_pgid(); + f->open_object_section("recovery_detail"); + { + // TODO pg->dump_recovery_state(f); + } + f->close_section(); +} + +template <class T> +seastar::future<> BackgroundRecoveryT<T>::start() +{ + logger().debug("{}: start", *this); + + typename T::IRef ref = static_cast<T*>(this); + auto maybe_delay = seastar::now(); + if (delay) { + maybe_delay = seastar::sleep( + std::chrono::milliseconds(std::lround(delay * 1000))); + } + return maybe_delay.then([ref, this] { + return this->template with_blocking_event<OperationThrottler::BlockingEvent>( + [ref, this] (auto&& trigger) { + return ss.with_throttle_while( + std::move(trigger), + this, get_scheduler_params(), [this] { + return T::interruptor::with_interruption([this] { + return do_recovery(); + }, [](std::exception_ptr) { + return seastar::make_ready_future<bool>(false); + }, pg); + }).handle_exception_type([ref, this](const std::system_error& err) { + if (err.code() == std::make_error_code(std::errc::interrupted)) { + logger().debug("{} recovery interruped: {}", *pg, err.what()); + return seastar::now(); + } + return seastar::make_exception_future<>(err); + }); + }); + }); +} + +UrgentRecovery::UrgentRecovery( + const hobject_t& soid, + const eversion_t& need, + Ref<PG> pg, + ShardServices& ss, + epoch_t epoch_started) + : BackgroundRecoveryT{pg, ss, epoch_started, + crimson::osd::scheduler::scheduler_class_t::immediate}, + soid{soid}, need(need) +{ +} + +UrgentRecovery::interruptible_future<bool> +UrgentRecovery::do_recovery() +{ + logger().debug("{}: {}", __func__, *this); + if (!pg->has_reset_since(epoch_started)) { + return with_blocking_event<RecoveryBackend::RecoveryBlockingEvent, + interruptor>([this] (auto&& trigger) { + return pg->get_recovery_handler()->recover_missing(trigger, soid, need); + }).then_interruptible([] { + return seastar::make_ready_future<bool>(false); + }); + } + return seastar::make_ready_future<bool>(false); +} + +void UrgentRecovery::print(std::ostream &lhs) const +{ + lhs << "UrgentRecovery(" << pg->get_pgid() << ", " + << soid << ", v" << need << ", epoch_started: " + << epoch_started << ")"; +} + +void 
UrgentRecovery::dump_detail(Formatter *f) const +{ + f->dump_stream("pgid") << pg->get_pgid(); + f->open_object_section("recovery_detail"); + { + f->dump_stream("oid") << soid; + f->dump_stream("version") << need; + } + f->close_section(); +} + +PglogBasedRecovery::PglogBasedRecovery( + Ref<PG> pg, + ShardServices &ss, + const epoch_t epoch_started, + float delay) + : BackgroundRecoveryT( + std::move(pg), + ss, + epoch_started, + crimson::osd::scheduler::scheduler_class_t::background_recovery, + delay) +{} + +PglogBasedRecovery::interruptible_future<bool> +PglogBasedRecovery::do_recovery() +{ + if (pg->has_reset_since(epoch_started)) { + return seastar::make_ready_future<bool>(false); + } + return with_blocking_event<RecoveryBackend::RecoveryBlockingEvent, + interruptor>([this] (auto&& trigger) { + return pg->get_recovery_handler()->start_recovery_ops( + trigger, + crimson::common::local_conf()->osd_recovery_max_single_start); + }); +} + +PGPeeringPipeline &BackfillRecovery::peering_pp(PG &pg) +{ + return pg.peering_request_pg_pipeline; +} + +BackfillRecovery::interruptible_future<bool> +BackfillRecovery::do_recovery() +{ + logger().debug("{}", __func__); + + if (pg->has_reset_since(epoch_started)) { + logger().debug("{}: pg got reset since epoch_started={}", + __func__, epoch_started); + return seastar::make_ready_future<bool>(false); + } + // TODO: limits + return enter_stage<interruptor>( + // process_event() of our boost::statechart machine is non-reentrant. + // with the backfill_pipeline we protect it from a second entry from + // the implementation of BackfillListener. + // additionally, this stage serves to synchronize with PeeringEvent. + peering_pp(*pg).process + ).then_interruptible([this] { + pg->get_recovery_handler()->dispatch_backfill_event(std::move(evt)); + return seastar::make_ready_future<bool>(false); + }); +} + +template class BackgroundRecoveryT<UrgentRecovery>; +template class BackgroundRecoveryT<PglogBasedRecovery>; +template class BackgroundRecoveryT<BackfillRecovery>; + +} // namespace crimson::osd diff --git a/src/crimson/osd/osd_operations/background_recovery.h b/src/crimson/osd/osd_operations/background_recovery.h new file mode 100644 index 000000000..17f2cd57a --- /dev/null +++ b/src/crimson/osd/osd_operations/background_recovery.h @@ -0,0 +1,144 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <boost/statechart/event_base.hpp> + +#include "crimson/net/Connection.h" +#include "crimson/osd/osd_operation.h" +#include "crimson/osd/recovery_backend.h" +#include "crimson/common/type_helpers.h" +#include "crimson/osd/osd_operations/peering_event.h" +#include "crimson/osd/pg.h" + +namespace crimson::osd { +class PG; +class ShardServices; + +template <class T> +class BackgroundRecoveryT : public PhasedOperationT<T> { +public: + static constexpr OperationTypeCode type = OperationTypeCode::background_recovery; + + BackgroundRecoveryT( + Ref<PG> pg, + ShardServices &ss, + epoch_t epoch_started, + crimson::osd::scheduler::scheduler_class_t scheduler_class, float delay = 0); + + virtual void print(std::ostream &) const; + seastar::future<> start(); + +protected: + Ref<PG> pg; + const epoch_t epoch_started; + float delay = 0; + +private: + virtual void dump_detail(Formatter *f) const; + crimson::osd::scheduler::params_t get_scheduler_params() const { + return { + 1, // cost + 0, // owner + scheduler_class + }; + } + using do_recovery_ret_t = typename PhasedOperationT<T>::template 
interruptible_future<bool>; + virtual do_recovery_ret_t do_recovery() = 0; + ShardServices &ss; + const crimson::osd::scheduler::scheduler_class_t scheduler_class; +}; + +/// represent a recovery initiated for serving a client request +/// +/// unlike @c PglogBasedRecovery and @c BackfillRecovery, +/// @c UrgentRecovery is not throttled by the scheduler. and it +/// utilizes @c RecoveryBackend directly to recover the unreadable +/// object. +class UrgentRecovery final : public BackgroundRecoveryT<UrgentRecovery> { +public: + UrgentRecovery( + const hobject_t& soid, + const eversion_t& need, + Ref<PG> pg, + ShardServices& ss, + epoch_t epoch_started); + void print(std::ostream&) const final; + + std::tuple< + OperationThrottler::BlockingEvent, + RecoveryBackend::RecoveryBlockingEvent + > tracking_events; + +private: + void dump_detail(Formatter* f) const final; + interruptible_future<bool> do_recovery() override; + const hobject_t soid; + const eversion_t need; +}; + +class PglogBasedRecovery final : public BackgroundRecoveryT<PglogBasedRecovery> { +public: + PglogBasedRecovery( + Ref<PG> pg, + ShardServices &ss, + epoch_t epoch_started, + float delay = 0); + + std::tuple< + OperationThrottler::BlockingEvent, + RecoveryBackend::RecoveryBlockingEvent + > tracking_events; + +private: + interruptible_future<bool> do_recovery() override; +}; + +class BackfillRecovery final : public BackgroundRecoveryT<BackfillRecovery> { +public: + + template <class EventT> + BackfillRecovery( + Ref<PG> pg, + ShardServices &ss, + epoch_t epoch_started, + const EventT& evt); + + PipelineHandle& get_handle() { return handle; } + + std::tuple< + OperationThrottler::BlockingEvent, + PGPeeringPipeline::Process::BlockingEvent + > tracking_events; + +private: + boost::intrusive_ptr<const boost::statechart::event_base> evt; + PipelineHandle handle; + + static PGPeeringPipeline &peering_pp(PG &pg); + interruptible_future<bool> do_recovery() override; +}; + +template <class EventT> +BackfillRecovery::BackfillRecovery( + Ref<PG> pg, + ShardServices &ss, + const epoch_t epoch_started, + const EventT& evt) + : BackgroundRecoveryT( + std::move(pg), + ss, + epoch_started, + crimson::osd::scheduler::scheduler_class_t::background_best_effort), + evt(evt.intrusive_from_this()) +{} + +} + +#if FMT_VERSION >= 90000 +template <> struct fmt::formatter<crimson::osd::BackfillRecovery> : fmt::ostream_formatter {}; +template <> struct fmt::formatter<crimson::osd::PglogBasedRecovery> : fmt::ostream_formatter {}; +template <> struct fmt::formatter<crimson::osd::UrgentRecovery> : fmt::ostream_formatter {}; +template <class T> struct fmt::formatter<crimson::osd::BackgroundRecoveryT<T>> : fmt::ostream_formatter {}; +#endif diff --git a/src/crimson/osd/osd_operations/client_request.cc b/src/crimson/osd/osd_operations/client_request.cc new file mode 100644 index 000000000..9374fbde2 --- /dev/null +++ b/src/crimson/osd/osd_operations/client_request.cc @@ -0,0 +1,388 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab expandtab + +#include "messages/MOSDOp.h" +#include "messages/MOSDOpReply.h" + +#include "crimson/common/exception.h" +#include "crimson/osd/pg.h" +#include "crimson/osd/osd.h" +#include "common/Formatter.h" +#include "crimson/osd/osd_operation_external_tracking.h" +#include "crimson/osd/osd_operations/client_request.h" +#include "crimson/osd/osd_connection_priv.h" +#include "osd/object_state_fmt.h" + +namespace { + seastar::logger& logger() { + return 
crimson::get_logger(ceph_subsys_osd); + } +} + +namespace crimson::osd { + + +void ClientRequest::Orderer::requeue( + ShardServices &shard_services, Ref<PG> pg) +{ + for (auto &req: list) { + logger().debug("{}: {} requeueing {}", __func__, *pg, req); + req.reset_instance_handle(); + std::ignore = req.with_pg_int(shard_services, pg); + } +} + +void ClientRequest::Orderer::clear_and_cancel() +{ + for (auto i = list.begin(); i != list.end(); ) { + logger().debug( + "ClientRequest::Orderer::clear_and_cancel: {}", + *i); + i->complete_request(); + remove_request(*(i++)); + } +} + +void ClientRequest::complete_request() +{ + track_event<CompletionEvent>(); + on_complete.set_value(); +} + +ClientRequest::ClientRequest( + ShardServices &shard_services, crimson::net::ConnectionRef conn, + Ref<MOSDOp> &&m) + : put_historic_shard_services(&shard_services), + conn(std::move(conn)), + m(std::move(m)), + instance_handle(new instance_handle_t) +{} + +ClientRequest::~ClientRequest() +{ + logger().debug("{}: destroying", *this); +} + +void ClientRequest::print(std::ostream &lhs) const +{ + lhs << "m=[" << *m << "]"; +} + +void ClientRequest::dump_detail(Formatter *f) const +{ + logger().debug("{}: dumping", *this); + std::apply([f] (auto... event) { + (..., event.dump(f)); + }, tracking_events); +} + +ConnectionPipeline &ClientRequest::get_connection_pipeline() +{ + return get_osd_priv(conn.get()).client_request_conn_pipeline; +} + +ClientRequest::PGPipeline &ClientRequest::client_pp(PG &pg) +{ + return pg.request_pg_pipeline; +} + +bool ClientRequest::is_pg_op() const +{ + return std::any_of( + begin(m->ops), end(m->ops), + [](auto& op) { return ceph_osd_op_type_pg(op.op.op); }); +} + +seastar::future<> ClientRequest::with_pg_int( + ShardServices &shard_services, Ref<PG> pgref) +{ + epoch_t same_interval_since = pgref->get_interval_start_epoch(); + logger().debug("{} same_interval_since: {}", *this, same_interval_since); + if (m->finish_decode()) { + m->clear_payload(); + } + const auto this_instance_id = instance_id++; + OperationRef opref{this}; + auto instance_handle = get_instance_handle(); + auto &ihref = *instance_handle; + return interruptor::with_interruption( + [this, pgref, this_instance_id, &ihref, &shard_services]() mutable { + PG &pg = *pgref; + if (pg.can_discard_op(*m)) { + return shard_services.send_incremental_map( + std::ref(*conn), m->get_map_epoch() + ).then([this, this_instance_id, pgref] { + logger().debug("{}.{}: discarding", *this, this_instance_id); + pgref->client_request_orderer.remove_request(*this); + complete_request(); + return interruptor::now(); + }); + } + return ihref.enter_stage<interruptor>(client_pp(pg).await_map, *this + ).then_interruptible([this, this_instance_id, &pg, &ihref] { + logger().debug("{}.{}: after await_map stage", *this, this_instance_id); + return ihref.enter_blocker( + *this, pg.osdmap_gate, &decltype(pg.osdmap_gate)::wait_for_map, + m->get_min_epoch(), nullptr); + }).then_interruptible([this, this_instance_id, &pg, &ihref](auto map) { + logger().debug("{}.{}: after wait_for_map", *this, this_instance_id); + return ihref.enter_stage<interruptor>(client_pp(pg).wait_for_active, *this); + }).then_interruptible([this, this_instance_id, &pg, &ihref]() { + logger().debug( + "{}.{}: after wait_for_active stage", *this, this_instance_id); + return ihref.enter_blocker( + *this, + pg.wait_for_active_blocker, + &decltype(pg.wait_for_active_blocker)::wait); + }).then_interruptible([this, pgref, this_instance_id, &ihref]() mutable + -> interruptible_future<> { + 
logger().debug( + "{}.{}: after wait_for_active", *this, this_instance_id); + if (is_pg_op()) { + return process_pg_op(pgref); + } else { + return process_op(ihref, pgref); + } + }).then_interruptible([this, this_instance_id, pgref] { + logger().debug("{}.{}: after process*", *this, this_instance_id); + pgref->client_request_orderer.remove_request(*this); + complete_request(); + }); + }, [this, this_instance_id, pgref](std::exception_ptr eptr) { + // TODO: better debug output + logger().debug("{}.{}: interrupted {}", *this, this_instance_id, eptr); + }, pgref).finally( + [opref=std::move(opref), pgref=std::move(pgref), + instance_handle=std::move(instance_handle), &ihref] { + ihref.handle.exit(); + }); +} + +seastar::future<> ClientRequest::with_pg( + ShardServices &shard_services, Ref<PG> pgref) +{ + put_historic_shard_services = &shard_services; + pgref->client_request_orderer.add_request(*this); + auto ret = on_complete.get_future(); + std::ignore = with_pg_int( + shard_services, std::move(pgref) + ); + return ret; +} + +ClientRequest::interruptible_future<> +ClientRequest::process_pg_op( + Ref<PG> &pg) +{ + return pg->do_pg_ops( + m + ).then_interruptible([this, pg=std::move(pg)](MURef<MOSDOpReply> reply) { + return conn->send(std::move(reply)); + }); +} + +auto ClientRequest::reply_op_error(const Ref<PG>& pg, int err) +{ + logger().debug("{}: replying with error {}", *this, err); + auto reply = crimson::make_message<MOSDOpReply>( + m.get(), err, pg->get_osdmap_epoch(), + m->get_flags() & (CEPH_OSD_FLAG_ACK|CEPH_OSD_FLAG_ONDISK), + !m->has_flag(CEPH_OSD_FLAG_RETURNVEC)); + reply->set_reply_versions(eversion_t(), 0); + reply->set_op_returns(std::vector<pg_log_op_return_item_t>{}); + return conn->send(std::move(reply)); +} + +ClientRequest::interruptible_future<> +ClientRequest::process_op(instance_handle_t &ihref, Ref<PG> &pg) +{ + return ihref.enter_stage<interruptor>( + client_pp(*pg).recover_missing, + *this + ).then_interruptible( + [this, pg]() mutable { + if (pg->is_primary()) { + return do_recover_missing(pg, m->get_hobj()); + } else { + logger().debug("process_op: Skipping do_recover_missing" + "on non primary pg"); + return interruptor::now(); + } + }).then_interruptible([this, pg, &ihref]() mutable { + return pg->already_complete(m->get_reqid()).then_interruptible( + [this, pg, &ihref](auto completed) mutable + -> PG::load_obc_iertr::future<> { + if (completed) { + auto reply = crimson::make_message<MOSDOpReply>( + m.get(), completed->err, pg->get_osdmap_epoch(), + CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK, false); + reply->set_reply_versions(completed->version, completed->user_version); + return conn->send(std::move(reply)); + } else { + return ihref.enter_stage<interruptor>(client_pp(*pg).get_obc, *this + ).then_interruptible( + [this, pg, &ihref]() mutable -> PG::load_obc_iertr::future<> { + logger().debug("{}: in get_obc stage", *this); + op_info.set_from_op(&*m, *pg->get_osdmap()); + return pg->with_locked_obc( + m->get_hobj(), op_info, + [this, pg, &ihref](auto obc) mutable { + logger().debug("{}: got obc {}", *this, obc->obs); + return ihref.enter_stage<interruptor>( + client_pp(*pg).process, *this + ).then_interruptible([this, pg, obc, &ihref]() mutable { + return do_process(ihref, pg, obc); + }); + }); + }); + } + }); + }).handle_error_interruptible( + PG::load_obc_ertr::all_same_way([this, pg=std::move(pg)](const auto &code) { + logger().error("ClientRequest saw error code {}", code); + assert(code.value() > 0); + return reply_op_error(pg, -code.value()); + })); +} + 
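Editor's note: process_op() above consults pg->already_complete(m->get_reqid()) before taking the object context lock, so a client retransmission of an op whose result is already durable can be answered from the recorded completion instead of being re-executed. The standalone sketch below illustrates only that dedup idea; reqid_t, completion_t and CompletedOps are hypothetical stand-ins, not crimson's actual types or API.

// Illustrative sketch only: hypothetical stand-ins for the dedup check
// performed via pg->already_complete() in process_op().
#include <cstdint>
#include <iostream>
#include <map>
#include <optional>
#include <tuple>

struct reqid_t {
  uint64_t client;
  uint64_t tid;
  bool operator<(const reqid_t &o) const {
    return std::tie(client, tid) < std::tie(o.client, o.tid);
  }
};

struct completion_t {
  int err;
  uint64_t version;
  uint64_t user_version;
};

class CompletedOps {
  std::map<reqid_t, completion_t> log;  // the real PG log is bounded and pruned
public:
  void record(const reqid_t &id, const completion_t &c) { log[id] = c; }
  std::optional<completion_t> already_complete(const reqid_t &id) const {
    if (auto it = log.find(id); it != log.end()) {
      return it->second;
    }
    return std::nullopt;
  }
};

int main() {
  CompletedOps ops;
  ops.record({1, 42}, {0, 10, 3});
  // A resent op with the same reqid gets its saved result replayed,
  // mirroring the "completed" branch of process_op() above.
  if (auto done = ops.already_complete({1, 42})) {
    std::cout << "replay reply: err=" << done->err
              << " v=" << done->version
              << " uv=" << done->user_version << "\n";
  }
  return 0;
}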
+ClientRequest::interruptible_future<>
+ClientRequest::do_process(
+  instance_handle_t &ihref,
+  Ref<PG>& pg, crimson::osd::ObjectContextRef obc)
+{
+  if (m->has_flag(CEPH_OSD_FLAG_PARALLELEXEC)) {
+    return reply_op_error(pg, -EINVAL);
+  }
+  const pg_pool_t pool = pg->get_pgpool().info;
+  if (pool.has_flag(pg_pool_t::FLAG_EIO)) {
+    // drop op on the floor; the client will handle returning EIO
+    if (m->has_flag(CEPH_OSD_FLAG_SUPPORTSPOOLEIO)) {
+      logger().debug("discarding op due to pool EIO flag");
+      return seastar::now();
+    } else {
+      logger().debug("replying EIO due to pool EIO flag");
+      return reply_op_error(pg, -EIO);
+    }
+  }
+  if (m->get_oid().name.size()
+    > crimson::common::local_conf()->osd_max_object_name_len) {
+    return reply_op_error(pg, -ENAMETOOLONG);
+  } else if (m->get_hobj().get_key().size()
+    > crimson::common::local_conf()->osd_max_object_name_len) {
+    return reply_op_error(pg, -ENAMETOOLONG);
+  } else if (m->get_hobj().nspace.size()
+    > crimson::common::local_conf()->osd_max_object_namespace_len) {
+    return reply_op_error(pg, -ENAMETOOLONG);
+  } else if (m->get_hobj().oid.name.empty()) {
+    return reply_op_error(pg, -EINVAL);
+  } else if (pg->get_osdmap()->is_blocklisted(conn->get_peer_addr())) {
+    logger().info("{} is blocklisted", conn->get_peer_addr());
+    return reply_op_error(pg, -EBLOCKLISTED);
+  }
+
+  if (!obc->obs.exists && !op_info.may_write()) {
+    return reply_op_error(pg, -ENOENT);
+  }
+
+  SnapContext snapc = get_snapc(pg, obc);
+
+  if ((m->has_flag(CEPH_OSD_FLAG_ORDERSNAP)) &&
+      snapc.seq < obc->ssc->snapset.seq) {
+    logger().debug("{} ORDERSNAP flag set and snapc seq {}"
+                   " < snapset seq {} on {}",
+                   __func__, snapc.seq, obc->ssc->snapset.seq,
+                   obc->obs.oi.soid);
+    return reply_op_error(pg, -EOLDSNAPC);
+  }
+
+  if (!pg->is_primary()) {
+    // primary can handle both normal ops and balanced reads
+    if (is_misdirected(*pg)) {
+      logger().trace("do_process: dropping misdirected op");
+      return seastar::now();
+    } else if (const hobject_t& hoid = m->get_hobj();
+               !pg->get_peering_state().can_serve_replica_read(hoid)) {
+      logger().debug("{}: unstable write on replica, "
+                     "bouncing to primary",
+                     __func__);
+      return reply_op_error(pg, -EAGAIN);
+    } else {
+      logger().debug("{}: serving replica read on oid {}",
+                     __func__, m->get_hobj());
+    }
+  }
+  return pg->do_osd_ops(m, conn, obc, op_info, snapc).safe_then_unpack_interruptible(
+    [this, pg, &ihref](auto submitted, auto all_completed) mutable {
+      return submitted.then_interruptible([this, pg, &ihref] {
+        return ihref.enter_stage<interruptor>(client_pp(*pg).wait_repop, *this);
+      }).then_interruptible(
+        [this, pg, all_completed=std::move(all_completed), &ihref]() mutable {
+          return all_completed.safe_then_interruptible(
+            [this, pg, &ihref](MURef<MOSDOpReply> reply) {
+              return ihref.enter_stage<interruptor>(client_pp(*pg).send_reply, *this
+              ).then_interruptible(
+                [this, reply=std::move(reply)]() mutable {
+                  logger().debug("{}: sending response", *this);
+                  return conn->send(std::move(reply));
+                });
+            }, crimson::ct_error::eagain::handle([this, pg, &ihref]() mutable {
+              return process_op(ihref, pg);
+            }));
+        });
+    }, crimson::ct_error::eagain::handle([this, pg, &ihref]() mutable {
+      return process_op(ihref, pg);
+    }));
+}
+
+bool ClientRequest::is_misdirected(const PG& pg) const
+{
+  // otherwise take a closer look
+  if (const int flags = m->get_flags();
+      flags & CEPH_OSD_FLAG_BALANCE_READS ||
+      flags & CEPH_OSD_FLAG_LOCALIZE_READS) {
+    if (!op_info.may_read()) {
+      // no read found, so it can't be balanced read
return true; + } + if (op_info.may_write() || op_info.may_cache()) { + // write op, but i am not primary + return true; + } + // balanced reads; any replica will do + return false; + } + // neither balanced nor localize reads + return true; +} + +void ClientRequest::put_historic() const +{ + ceph_assert_always(put_historic_shard_services); + put_historic_shard_services->get_registry().put_historic(*this); +} + +const SnapContext ClientRequest::get_snapc( + Ref<PG>& pg, + crimson::osd::ObjectContextRef obc) const +{ + SnapContext snapc; + if (op_info.may_write() || op_info.may_cache()) { + // snap + if (pg->get_pgpool().info.is_pool_snaps_mode()) { + // use pool's snapc + snapc = pg->get_pgpool().snapc; + logger().debug("{} using pool's snapc snaps={}", + __func__, snapc.snaps); + + } else { + // client specified snapc + snapc.seq = m->get_snap_seq(); + snapc.snaps = m->get_snaps(); + logger().debug("{} client specified snapc seq={} snaps={}", + __func__, snapc.seq, snapc.snaps); + } + } + return snapc; +} + +} diff --git a/src/crimson/osd/osd_operations/client_request.h b/src/crimson/osd/osd_operations/client_request.h new file mode 100644 index 000000000..b2dce1e87 --- /dev/null +++ b/src/crimson/osd/osd_operations/client_request.h @@ -0,0 +1,281 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <seastar/core/future.hh> + +#include <boost/intrusive/list.hpp> +#include <boost/intrusive_ptr.hpp> + +#include "osd/osd_op_util.h" +#include "crimson/net/Connection.h" +#include "crimson/osd/object_context.h" +#include "crimson/osd/osdmap_gate.h" +#include "crimson/osd/osd_operation.h" +#include "crimson/osd/osd_operations/client_request_common.h" +#include "crimson/osd/osd_operations/common/pg_pipeline.h" +#include "crimson/osd/pg_activation_blocker.h" +#include "crimson/osd/pg_map.h" +#include "crimson/common/type_helpers.h" +#include "crimson/common/utility.h" +#include "messages/MOSDOp.h" + +namespace crimson::osd { +class PG; +class OSD; +class ShardServices; + +class ClientRequest final : public PhasedOperationT<ClientRequest>, + private CommonClientRequest { + // Initially set to primary core, updated to pg core after move, + // used by put_historic + ShardServices *put_historic_shard_services = nullptr; + + crimson::net::ConnectionRef conn; + // must be after conn due to ConnectionPipeline's life-time + Ref<MOSDOp> m; + OpInfo op_info; + seastar::promise<> on_complete; + unsigned instance_id = 0; + +public: + class PGPipeline : public CommonPGPipeline { + public: + struct AwaitMap : OrderedExclusivePhaseT<AwaitMap> { + static constexpr auto type_name = "ClientRequest::PGPipeline::await_map"; + } await_map; + struct WaitRepop : OrderedConcurrentPhaseT<WaitRepop> { + static constexpr auto type_name = "ClientRequest::PGPipeline::wait_repop"; + } wait_repop; + struct SendReply : OrderedExclusivePhaseT<SendReply> { + static constexpr auto type_name = "ClientRequest::PGPipeline::send_reply"; + } send_reply; + friend class ClientRequest; + friend class LttngBackend; + friend class HistoricBackend; + friend class ReqRequest; + friend class LogMissingRequest; + friend class LogMissingRequestReply; + }; + + /** + * instance_handle_t + * + * Client request is, at present, the only Operation which can be requeued. + * This is, mostly, fine. 
However, reusing the PipelineHandle or
+ * BlockingEvent structures before proving that the prior instance has stopped
+ * can create hangs or crashes due to violations of the BlockerT and
+ * PipelineHandle invariants.
+ *
+ * To solve this, we create an instance_handle_t which contains the events
+ * for the portion of execution that can be rerun as well as the
+ * PipelineHandle. ClientRequest::with_pg_int grabs a reference to the current
+ * instance_handle_t and releases its PipelineHandle in the finally block.
+ * On requeue, we create a new instance_handle_t with a fresh PipelineHandle
+ * and events tuple and use it for the next invocation of
+ * with_pg_int.
+ */
+  std::tuple<
+    StartEvent,
+    ConnectionPipeline::AwaitActive::BlockingEvent,
+    ConnectionPipeline::AwaitMap::BlockingEvent,
+    OSD_OSDMapGate::OSDMapBlocker::BlockingEvent,
+    ConnectionPipeline::GetPG::BlockingEvent,
+    PGMap::PGCreationBlockingEvent,
+    CompletionEvent
+  > tracking_events;
+
+  class instance_handle_t : public boost::intrusive_ref_counter<
+    instance_handle_t, boost::thread_unsafe_counter> {
+  public:
+    // intrusive_ptr because seastar::lw_shared_ptr includes a cpu debug check
+    // that we will fail since the core on which we allocate the request may not
+    // be the core on which we perform with_pg_int. This is harmless, since we
+    // don't leave any references on the source core, so we just bypass it by using
+    // intrusive_ptr instead.
+    using ref_t = boost::intrusive_ptr<instance_handle_t>;
+    PipelineHandle handle;
+
+    std::tuple<
+      PGPipeline::AwaitMap::BlockingEvent,
+      PG_OSDMapGate::OSDMapBlocker::BlockingEvent,
+      PGPipeline::WaitForActive::BlockingEvent,
+      PGActivationBlocker::BlockingEvent,
+      PGPipeline::RecoverMissing::BlockingEvent,
+      PGPipeline::GetOBC::BlockingEvent,
+      PGPipeline::Process::BlockingEvent,
+      PGPipeline::WaitRepop::BlockingEvent,
+      PGPipeline::SendReply::BlockingEvent,
+      CompletionEvent
+      > pg_tracking_events;
+
+    template <typename BlockingEventT, typename InterruptorT=void, typename F>
+    auto with_blocking_event(F &&f, ClientRequest &op) {
+      auto ret = std::forward<F>(f)(
+        typename BlockingEventT::template Trigger<ClientRequest>{
+          std::get<BlockingEventT>(pg_tracking_events), op
+        });
+      if constexpr (std::is_same_v<InterruptorT, void>) {
+        return ret;
+      } else {
+        using ret_t = decltype(ret);
+        return typename InterruptorT::template futurize_t<ret_t>{std::move(ret)};
+      }
+    }
+
+    template <typename InterruptorT=void, typename StageT>
+    auto enter_stage(StageT &stage, ClientRequest &op) {
+      return this->template with_blocking_event<
+        typename StageT::BlockingEvent,
+        InterruptorT>(
+          [&stage, this](auto &&trigger) {
+            return handle.template enter<ClientRequest>(
+              stage, std::move(trigger));
+          }, op);
+    }
+
+    template <
+      typename InterruptorT=void, typename BlockingObj, typename Method,
+      typename... Args>
+    auto enter_blocker(
+      ClientRequest &op, BlockingObj &obj, Method method, Args&&...
args) { + return this->template with_blocking_event< + typename BlockingObj::Blocker::BlockingEvent, + InterruptorT>( + [&obj, method, + args=std::forward_as_tuple(std::move(args)...)](auto &&trigger) mutable { + return apply_method_to_tuple( + obj, method, + std::tuple_cat( + std::forward_as_tuple(std::move(trigger)), + std::move(args)) + ); + }, op); + } + }; + instance_handle_t::ref_t instance_handle; + void reset_instance_handle() { + instance_handle = new instance_handle_t; + } + auto get_instance_handle() { return instance_handle; } + + using ordering_hook_t = boost::intrusive::list_member_hook<>; + ordering_hook_t ordering_hook; + class Orderer { + using list_t = boost::intrusive::list< + ClientRequest, + boost::intrusive::member_hook< + ClientRequest, + typename ClientRequest::ordering_hook_t, + &ClientRequest::ordering_hook> + >; + list_t list; + + public: + void add_request(ClientRequest &request) { + assert(!request.ordering_hook.is_linked()); + intrusive_ptr_add_ref(&request); + list.push_back(request); + } + void remove_request(ClientRequest &request) { + assert(request.ordering_hook.is_linked()); + list.erase(list_t::s_iterator_to(request)); + intrusive_ptr_release(&request); + } + void requeue(ShardServices &shard_services, Ref<PG> pg); + void clear_and_cancel(); + }; + void complete_request(); + + static constexpr OperationTypeCode type = OperationTypeCode::client_request; + + ClientRequest( + ShardServices &shard_services, + crimson::net::ConnectionRef, Ref<MOSDOp> &&m); + ~ClientRequest(); + + void print(std::ostream &) const final; + void dump_detail(Formatter *f) const final; + + static constexpr bool can_create() { return false; } + spg_t get_pgid() const { + return m->get_spg(); + } + PipelineHandle &get_handle() { return instance_handle->handle; } + epoch_t get_epoch() const { return m->get_min_epoch(); } + + ConnectionPipeline &get_connection_pipeline(); + seastar::future<crimson::net::ConnectionFRef> prepare_remote_submission() { + assert(conn); + return conn.get_foreign( + ).then([this](auto f_conn) { + conn.reset(); + return f_conn; + }); + } + void finish_remote_submission(crimson::net::ConnectionFRef _conn) { + assert(!conn); + conn = make_local_shared_foreign(std::move(_conn)); + } + + seastar::future<> with_pg_int( + ShardServices &shard_services, Ref<PG> pg); + +public: + seastar::future<> with_pg( + ShardServices &shard_services, Ref<PG> pgref); + +private: + template <typename FuncT> + interruptible_future<> with_sequencer(FuncT&& func); + auto reply_op_error(const Ref<PG>& pg, int err); + + interruptible_future<> do_process( + instance_handle_t &ihref, + Ref<PG>& pg, + crimson::osd::ObjectContextRef obc); + ::crimson::interruptible::interruptible_future< + ::crimson::osd::IOInterruptCondition> process_pg_op( + Ref<PG> &pg); + ::crimson::interruptible::interruptible_future< + ::crimson::osd::IOInterruptCondition> process_op( + instance_handle_t &ihref, + Ref<PG> &pg); + bool is_pg_op() const; + + PGPipeline &client_pp(PG &pg); + + template <typename Errorator> + using interruptible_errorator = + ::crimson::interruptible::interruptible_errorator< + ::crimson::osd::IOInterruptCondition, + Errorator>; + + bool is_misdirected(const PG& pg) const; + + const SnapContext get_snapc( + Ref<PG>& pg, + crimson::osd::ObjectContextRef obc) const; + +public: + + friend class LttngBackend; + friend class HistoricBackend; + + auto get_started() const { + return get_event<StartEvent>().get_timestamp(); + }; + + auto get_completed() const { + return 
get_event<CompletionEvent>().get_timestamp(); + }; + + void put_historic() const; +}; + +} + +#if FMT_VERSION >= 90000 +template <> struct fmt::formatter<crimson::osd::ClientRequest> : fmt::ostream_formatter {}; +#endif diff --git a/src/crimson/osd/osd_operations/client_request_common.cc b/src/crimson/osd/osd_operations/client_request_common.cc new file mode 100644 index 000000000..cfd22c774 --- /dev/null +++ b/src/crimson/osd/osd_operations/client_request_common.cc @@ -0,0 +1,64 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab expandtab + +#include "crimson/osd/osd_operations/client_request_common.h" +#include "crimson/osd/pg.h" +#include "crimson/osd/osd_operations/background_recovery.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_osd); + } +} + +namespace crimson::osd { + +typename InterruptibleOperation::template interruptible_future<> +CommonClientRequest::do_recover_missing( + Ref<PG>& pg, const hobject_t& soid) +{ + eversion_t ver; + assert(pg->is_primary()); + logger().debug("{} check for recovery, {}", __func__, soid); + if (!pg->is_unreadable_object(soid, &ver) && + !pg->is_degraded_or_backfilling_object(soid)) { + return seastar::now(); + } + logger().debug("{} need to wait for recovery, {}", __func__, soid); + if (pg->get_recovery_backend()->is_recovering(soid)) { + return pg->get_recovery_backend()->get_recovering(soid).wait_for_recovered(); + } else { + auto [op, fut] = + pg->get_shard_services().start_operation<UrgentRecovery>( + soid, ver, pg, pg->get_shard_services(), pg->get_osdmap_epoch()); + return std::move(fut); + } +} + +bool CommonClientRequest::should_abort_request( + const Operation& op, + std::exception_ptr eptr) +{ + if (*eptr.__cxa_exception_type() == + typeid(::crimson::common::actingset_changed)) { + try { + std::rethrow_exception(eptr); + } catch(::crimson::common::actingset_changed& e) { + if (e.is_primary()) { + logger().debug("{} {} operation restart, acting set changed", __func__, op); + return false; + } else { + logger().debug("{} {} operation abort, up primary changed", __func__, op); + return true; + } + } + } else { + assert(*eptr.__cxa_exception_type() == + typeid(crimson::common::system_shutdown_exception)); + crimson::get_logger(ceph_subsys_osd).debug( + "{} {} operation skipped, system shutdown", __func__, op); + return true; + } +} + +} // namespace crimson::osd diff --git a/src/crimson/osd/osd_operations/client_request_common.h b/src/crimson/osd/osd_operations/client_request_common.h new file mode 100644 index 000000000..6a8a78966 --- /dev/null +++ b/src/crimson/osd/osd_operations/client_request_common.h @@ -0,0 +1,20 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "crimson/common/operation.h" +#include "crimson/common/type_helpers.h" +#include "crimson/osd/osd_operation.h" + +namespace crimson::osd { + +struct CommonClientRequest { + static InterruptibleOperation::template interruptible_future<> + do_recover_missing(Ref<PG>& pg, const hobject_t& soid); + + static bool should_abort_request( + const crimson::Operation& op, std::exception_ptr eptr); +}; + +} // namespace crimson::osd diff --git a/src/crimson/osd/osd_operations/common/pg_pipeline.h b/src/crimson/osd/osd_operations/common/pg_pipeline.h new file mode 100644 index 000000000..58fa07b8b --- /dev/null +++ b/src/crimson/osd/osd_operations/common/pg_pipeline.h @@ -0,0 +1,31 @@ +// -*- mode:C++; tab-width:8; 
c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "osd/osd_op_util.h" +#include "crimson/osd/osd_operation.h" + +namespace crimson::osd { + +class CommonPGPipeline { +protected: + friend class InternalClientRequest; + friend class SnapTrimEvent; + friend class SnapTrimObjSubEvent; + + struct WaitForActive : OrderedExclusivePhaseT<WaitForActive> { + static constexpr auto type_name = "CommonPGPipeline:::wait_for_active"; + } wait_for_active; + struct RecoverMissing : OrderedExclusivePhaseT<RecoverMissing> { + static constexpr auto type_name = "CommonPGPipeline::recover_missing"; + } recover_missing; + struct GetOBC : OrderedExclusivePhaseT<GetOBC> { + static constexpr auto type_name = "CommonPGPipeline::get_obc"; + } get_obc; + struct Process : OrderedExclusivePhaseT<Process> { + static constexpr auto type_name = "CommonPGPipeline::process"; + } process; +}; + +} // namespace crimson::osd diff --git a/src/crimson/osd/osd_operations/internal_client_request.cc b/src/crimson/osd/osd_operations/internal_client_request.cc new file mode 100644 index 000000000..1e9b842b2 --- /dev/null +++ b/src/crimson/osd/osd_operations/internal_client_request.cc @@ -0,0 +1,130 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab expandtab + +#include <seastar/core/future.hh> + +#include "crimson/osd/osd_operations/internal_client_request.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_osd); + } +} + +namespace crimson { + template <> + struct EventBackendRegistry<osd::InternalClientRequest> { + static std::tuple<> get_backends() { + return {}; + } + }; +} + + +namespace crimson::osd { + +InternalClientRequest::InternalClientRequest(Ref<PG> pg) + : pg(std::move(pg)) +{ + assert(bool(this->pg)); + assert(this->pg->is_primary()); +} + +InternalClientRequest::~InternalClientRequest() +{ + logger().debug("{}: destroying", *this); +} + +void InternalClientRequest::print(std::ostream &) const +{ +} + +void InternalClientRequest::dump_detail(Formatter *f) const +{ +} + +CommonPGPipeline& InternalClientRequest::client_pp() +{ + return pg->request_pg_pipeline; +} + +seastar::future<> InternalClientRequest::start() +{ + track_event<StartEvent>(); + return crimson::common::handle_system_shutdown([this] { + return seastar::repeat([this] { + logger().debug("{}: in repeat", *this); + return interruptor::with_interruption([this]() mutable { + return enter_stage<interruptor>( + client_pp().wait_for_active + ).then_interruptible([this] { + return with_blocking_event<PGActivationBlocker::BlockingEvent, + interruptor>([this] (auto&& trigger) { + return pg->wait_for_active_blocker.wait(std::move(trigger)); + }); + }).then_interruptible([this] { + return enter_stage<interruptor>( + client_pp().recover_missing); + }).then_interruptible([this] { + return do_recover_missing(pg, get_target_oid()); + }).then_interruptible([this] { + return enter_stage<interruptor>( + client_pp().get_obc); + }).then_interruptible([this] () -> PG::load_obc_iertr::future<> { + logger().debug("{}: getting obc lock", *this); + return seastar::do_with(create_osd_ops(), + [this](auto& osd_ops) mutable { + logger().debug("InternalClientRequest: got {} OSDOps to execute", + std::size(osd_ops)); + [[maybe_unused]] const int ret = op_info.set_from_op( + std::as_const(osd_ops), pg->get_pgid().pgid, *pg->get_osdmap()); + assert(ret == 0); + return pg->with_locked_obc(get_target_oid(), op_info, + [&osd_ops, this](auto obc) { + 
return enter_stage<interruptor>(client_pp().process + ).then_interruptible( + [obc=std::move(obc), &osd_ops, this] { + return pg->do_osd_ops( + std::move(obc), + osd_ops, + std::as_const(op_info), + get_do_osd_ops_params(), + [] { + return PG::do_osd_ops_iertr::now(); + }, + [] (const std::error_code& e) { + return PG::do_osd_ops_iertr::now(); + } + ).safe_then_unpack_interruptible( + [](auto submitted, auto all_completed) { + return all_completed.handle_error_interruptible( + crimson::ct_error::eagain::handle([] { + return seastar::now(); + })); + }, crimson::ct_error::eagain::handle([] { + return interruptor::now(); + }) + ); + }); + }); + }); + }).handle_error_interruptible(PG::load_obc_ertr::all_same_way([] { + return seastar::now(); + })).then_interruptible([] { + return seastar::stop_iteration::yes; + }); + }, [this](std::exception_ptr eptr) { + if (should_abort_request(*this, std::move(eptr))) { + return seastar::stop_iteration::yes; + } else { + return seastar::stop_iteration::no; + } + }, pg); + }).then([this] { + track_event<CompletionEvent>(); + }); + }); +} + +} // namespace crimson::osd + diff --git a/src/crimson/osd/osd_operations/internal_client_request.h b/src/crimson/osd/osd_operations/internal_client_request.h new file mode 100644 index 000000000..8eed12e05 --- /dev/null +++ b/src/crimson/osd/osd_operations/internal_client_request.h @@ -0,0 +1,68 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "crimson/common/type_helpers.h" +#include "crimson/osd/osd_operation.h" +#include "crimson/osd/osd_operations/client_request_common.h" +#include "crimson/osd/osd_operations/common/pg_pipeline.h" +#include "crimson/osd/pg.h" +#include "crimson/osd/pg_activation_blocker.h" + +namespace crimson::osd { + +class InternalClientRequest : public PhasedOperationT<InternalClientRequest>, + private CommonClientRequest { +public: + explicit InternalClientRequest(Ref<PG> pg); + ~InternalClientRequest(); + + // imposed by `ShardService::start_operation<T>(...)`. 
+ seastar::future<> start(); + +protected: + virtual const hobject_t& get_target_oid() const = 0; + virtual PG::do_osd_ops_params_t get_do_osd_ops_params() const = 0; + virtual std::vector<OSDOp> create_osd_ops() = 0; + + const PG& get_pg() const { + return *pg; + } + +private: + friend OperationT<InternalClientRequest>; + + static constexpr OperationTypeCode type = + OperationTypeCode::internal_client_request; + + void print(std::ostream &) const final; + void dump_detail(Formatter *f) const final; + + CommonPGPipeline& client_pp(); + + seastar::future<> do_process(); + + Ref<PG> pg; + OpInfo op_info; + PipelineHandle handle; + +public: + PipelineHandle& get_handle() { return handle; } + + std::tuple< + StartEvent, + CommonPGPipeline::WaitForActive::BlockingEvent, + PGActivationBlocker::BlockingEvent, + CommonPGPipeline::RecoverMissing::BlockingEvent, + CommonPGPipeline::GetOBC::BlockingEvent, + CommonPGPipeline::Process::BlockingEvent, + CompletionEvent + > tracking_events; +}; + +} // namespace crimson::osd + +#if FMT_VERSION >= 90000 +template <> struct fmt::formatter<crimson::osd::InternalClientRequest> : fmt::ostream_formatter {}; +#endif diff --git a/src/crimson/osd/osd_operations/logmissing_request.cc b/src/crimson/osd/osd_operations/logmissing_request.cc new file mode 100644 index 000000000..739b46406 --- /dev/null +++ b/src/crimson/osd/osd_operations/logmissing_request.cc @@ -0,0 +1,79 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "logmissing_request.h" + +#include "common/Formatter.h" + +#include "crimson/osd/osd.h" +#include "crimson/osd/osd_connection_priv.h" +#include "crimson/osd/osd_operation_external_tracking.h" +#include "crimson/osd/pg.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_osd); + } +} + +namespace crimson::osd { + +LogMissingRequest::LogMissingRequest(crimson::net::ConnectionRef&& conn, + Ref<MOSDPGUpdateLogMissing> &&req) + : conn{std::move(conn)}, + req{std::move(req)} +{} + +void LogMissingRequest::print(std::ostream& os) const +{ + os << "LogMissingRequest(" + << "from=" << req->from + << " req=" << *req + << ")"; +} + +void LogMissingRequest::dump_detail(Formatter *f) const +{ + f->open_object_section("LogMissingRequest"); + f->dump_stream("req_tid") << req->get_tid(); + f->dump_stream("pgid") << req->get_spg(); + f->dump_unsigned("map_epoch", req->get_map_epoch()); + f->dump_unsigned("min_epoch", req->get_min_epoch()); + f->dump_stream("entries") << req->entries; + f->dump_stream("from") << req->from; + f->close_section(); +} + +ConnectionPipeline &LogMissingRequest::get_connection_pipeline() +{ + return get_osd_priv(conn.get()).replicated_request_conn_pipeline; +} + +ClientRequest::PGPipeline &LogMissingRequest::client_pp(PG &pg) +{ + return pg.request_pg_pipeline; +} + +seastar::future<> LogMissingRequest::with_pg( + ShardServices &shard_services, Ref<PG> pg) +{ + logger().debug("{}: LogMissingRequest::with_pg", *this); + + IRef ref = this; + return interruptor::with_interruption([this, pg] { + logger().debug("{}: pg present", *this); + return this->template enter_stage<interruptor>(client_pp(*pg).await_map + ).then_interruptible([this, pg] { + return this->template with_blocking_event< + PG_OSDMapGate::OSDMapBlocker::BlockingEvent + >([this, pg](auto &&trigger) { + return pg->osdmap_gate.wait_for_map( + std::move(trigger), req->min_epoch); + }); + }).then_interruptible([this, pg](auto) { + return pg->do_update_log_missing(req, conn); + }); + 
}, [ref](std::exception_ptr) { return seastar::now(); }, pg); +} + +} diff --git a/src/crimson/osd/osd_operations/logmissing_request.h b/src/crimson/osd/osd_operations/logmissing_request.h new file mode 100644 index 000000000..71d0816fd --- /dev/null +++ b/src/crimson/osd/osd_operations/logmissing_request.h @@ -0,0 +1,81 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "crimson/net/Connection.h" +#include "crimson/osd/osdmap_gate.h" +#include "crimson/osd/osd_operation.h" +#include "crimson/osd/osd_operations/client_request.h" +#include "crimson/osd/pg_map.h" +#include "crimson/common/type_helpers.h" +#include "messages/MOSDPGUpdateLogMissing.h" + +namespace ceph { + class Formatter; +} + +namespace crimson::osd { + +class ShardServices; + +class OSD; +class PG; + +class LogMissingRequest final : public PhasedOperationT<LogMissingRequest> { +public: + static constexpr OperationTypeCode type = OperationTypeCode::logmissing_request; + LogMissingRequest(crimson::net::ConnectionRef&&, Ref<MOSDPGUpdateLogMissing>&&); + + void print(std::ostream &) const final; + void dump_detail(ceph::Formatter* f) const final; + + static constexpr bool can_create() { return false; } + spg_t get_pgid() const { + return req->get_spg(); + } + PipelineHandle &get_handle() { return handle; } + epoch_t get_epoch() const { return req->get_min_epoch(); } + + ConnectionPipeline &get_connection_pipeline(); + seastar::future<crimson::net::ConnectionFRef> prepare_remote_submission() { + assert(conn); + return conn.get_foreign( + ).then([this](auto f_conn) { + conn.reset(); + return f_conn; + }); + } + void finish_remote_submission(crimson::net::ConnectionFRef _conn) { + assert(!conn); + conn = make_local_shared_foreign(std::move(_conn)); + } + + seastar::future<> with_pg( + ShardServices &shard_services, Ref<PG> pg); + + std::tuple< + StartEvent, + ConnectionPipeline::AwaitActive::BlockingEvent, + ConnectionPipeline::AwaitMap::BlockingEvent, + ConnectionPipeline::GetPG::BlockingEvent, + ClientRequest::PGPipeline::AwaitMap::BlockingEvent, + PG_OSDMapGate::OSDMapBlocker::BlockingEvent, + PGMap::PGCreationBlockingEvent, + OSD_OSDMapGate::OSDMapBlocker::BlockingEvent + > tracking_events; + +private: + ClientRequest::PGPipeline &client_pp(PG &pg); + + crimson::net::ConnectionRef conn; + // must be after `conn` to ensure the ConnectionPipeline's is alive + PipelineHandle handle; + Ref<MOSDPGUpdateLogMissing> req; +}; + +} + +#if FMT_VERSION >= 90000 +template <> struct fmt::formatter<crimson::osd::LogMissingRequest> : fmt::ostream_formatter {}; +#endif diff --git a/src/crimson/osd/osd_operations/logmissing_request_reply.cc b/src/crimson/osd/osd_operations/logmissing_request_reply.cc new file mode 100644 index 000000000..b4bf2938e --- /dev/null +++ b/src/crimson/osd/osd_operations/logmissing_request_reply.cc @@ -0,0 +1,68 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "logmissing_request_reply.h" + +#include "common/Formatter.h" + +#include "crimson/osd/osd.h" +#include "crimson/osd/osd_connection_priv.h" +#include "crimson/osd/osd_operation_external_tracking.h" +#include "crimson/osd/pg.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_osd); + } +} + +namespace crimson::osd { + +LogMissingRequestReply::LogMissingRequestReply( + crimson::net::ConnectionRef&& conn, + Ref<MOSDPGUpdateLogMissingReply> &&req) + : conn{std::move(conn)}, + 
req{std::move(req)} +{} + +void LogMissingRequestReply::print(std::ostream& os) const +{ + os << "LogMissingRequestReply(" + << "from=" << req->from + << " req=" << *req + << ")"; +} + +void LogMissingRequestReply::dump_detail(Formatter *f) const +{ + f->open_object_section("LogMissingRequestReply"); + f->dump_stream("rep_tid") << req->get_tid(); + f->dump_stream("pgid") << req->get_spg(); + f->dump_unsigned("map_epoch", req->get_map_epoch()); + f->dump_unsigned("min_epoch", req->get_min_epoch()); + f->dump_stream("from") << req->from; + f->close_section(); +} + +ConnectionPipeline &LogMissingRequestReply::get_connection_pipeline() +{ + return get_osd_priv(conn.get()).replicated_request_conn_pipeline; +} + +ClientRequest::PGPipeline &LogMissingRequestReply::client_pp(PG &pg) +{ + return pg.request_pg_pipeline; +} + +seastar::future<> LogMissingRequestReply::with_pg( + ShardServices &shard_services, Ref<PG> pg) +{ + logger().debug("{}: LogMissingRequestReply::with_pg", *this); + + IRef ref = this; + return interruptor::with_interruption([this, pg] { + return pg->do_update_log_missing_reply(std::move(req)); + }, [ref](std::exception_ptr) { return seastar::now(); }, pg); +} + +} diff --git a/src/crimson/osd/osd_operations/logmissing_request_reply.h b/src/crimson/osd/osd_operations/logmissing_request_reply.h new file mode 100644 index 000000000..c89131fec --- /dev/null +++ b/src/crimson/osd/osd_operations/logmissing_request_reply.h @@ -0,0 +1,79 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "crimson/net/Connection.h" +#include "crimson/osd/osdmap_gate.h" +#include "crimson/osd/osd_operation.h" +#include "crimson/osd/osd_operations/client_request.h" +#include "crimson/osd/pg_map.h" +#include "crimson/common/type_helpers.h" +#include "messages/MOSDPGUpdateLogMissingReply.h" + +namespace ceph { + class Formatter; +} + +namespace crimson::osd { + +class ShardServices; + +class OSD; +class PG; + +class LogMissingRequestReply final : public PhasedOperationT<LogMissingRequestReply> { +public: + static constexpr OperationTypeCode type = OperationTypeCode::logmissing_request_reply; + LogMissingRequestReply(crimson::net::ConnectionRef&&, Ref<MOSDPGUpdateLogMissingReply>&&); + + void print(std::ostream &) const final; + void dump_detail(ceph::Formatter* f) const final; + + static constexpr bool can_create() { return false; } + spg_t get_pgid() const { + return req->get_spg(); + } + PipelineHandle &get_handle() { return handle; } + epoch_t get_epoch() const { return req->get_min_epoch(); } + + ConnectionPipeline &get_connection_pipeline(); + seastar::future<crimson::net::ConnectionFRef> prepare_remote_submission() { + assert(conn); + return conn.get_foreign( + ).then([this](auto f_conn) { + conn.reset(); + return f_conn; + }); + } + void finish_remote_submission(crimson::net::ConnectionFRef _conn) { + assert(!conn); + conn = make_local_shared_foreign(std::move(_conn)); + } + + seastar::future<> with_pg( + ShardServices &shard_services, Ref<PG> pg); + + std::tuple< + StartEvent, + ConnectionPipeline::AwaitActive::BlockingEvent, + ConnectionPipeline::AwaitMap::BlockingEvent, + ConnectionPipeline::GetPG::BlockingEvent, + PGMap::PGCreationBlockingEvent, + OSD_OSDMapGate::OSDMapBlocker::BlockingEvent + > tracking_events; + +private: + ClientRequest::PGPipeline &client_pp(PG &pg); + + crimson::net::ConnectionRef conn; + // must be after `conn` to ensure the ConnectionPipeline's is alive + PipelineHandle handle; + 
Ref<MOSDPGUpdateLogMissingReply> req;
+};
+
+}
+
+#if FMT_VERSION >= 90000
+template <> struct fmt::formatter<crimson::osd::LogMissingRequestReply> : fmt::ostream_formatter {};
+#endif
diff --git a/src/crimson/osd/osd_operations/osdop_params.h b/src/crimson/osd/osd_operations/osdop_params.h
new file mode 100644
index 000000000..c7b81e765
--- /dev/null
+++ b/src/crimson/osd/osd_operations/osdop_params.h
@@ -0,0 +1,22 @@
+#pragma once
+
+#include "messages/MOSDOp.h"
+#include "osd/osd_types.h"
+#include "crimson/common/type_helpers.h"
+
+// The fields in this struct are parameters that may be needed at multiple
+// levels of processing. I enclosed all those parameters in this struct to
+// avoid passing each of them as a method parameter.
+struct osd_op_params_t {
+  osd_reqid_t req_id;
+  utime_t mtime;
+  eversion_t at_version;
+  eversion_t pg_trim_to;
+  eversion_t min_last_complete_ondisk;
+  eversion_t last_complete;
+  version_t user_at_version = 0;
+  bool user_modify = false;
+  ObjectCleanRegions clean_regions;
+
+  osd_op_params_t() = default;
+};
diff --git a/src/crimson/osd/osd_operations/peering_event.cc b/src/crimson/osd/osd_operations/peering_event.cc
new file mode 100644
index 000000000..ea4662bd0
--- /dev/null
+++ b/src/crimson/osd/osd_operations/peering_event.cc
@@ -0,0 +1,190 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <seastar/core/future.hh>
+#include <seastar/core/sleep.hh>
+
+#include "messages/MOSDPGLog.h"
+
+#include "common/Formatter.h"
+#include "crimson/osd/pg.h"
+#include "crimson/osd/osd.h"
+#include "crimson/osd/osd_operation_external_tracking.h"
+#include "crimson/osd/osd_operations/peering_event.h"
+#include "crimson/osd/osd_connection_priv.h"
+
+namespace {
+  seastar::logger& logger() {
+    return crimson::get_logger(ceph_subsys_osd);
+  }
+}
+
+namespace crimson::osd {
+
+template <class T>
+void PeeringEvent<T>::print(std::ostream &lhs) const
+{
+  lhs << "PeeringEvent("
+      << "from=" << from
+      << " pgid=" << pgid
+      << " sent=" << evt.get_epoch_sent()
+      << " requested=" << evt.get_epoch_requested()
+      << " evt=" << evt.get_desc()
+      << ")";
+}
+
+template <class T>
+void PeeringEvent<T>::dump_detail(Formatter *f) const
+{
+  f->open_object_section("PeeringEvent");
+  f->dump_stream("from") << from;
+  f->dump_stream("pgid") << pgid;
+  f->dump_int("sent", evt.get_epoch_sent());
+  f->dump_int("requested", evt.get_epoch_requested());
+  f->dump_string("evt", evt.get_desc());
+  f->open_array_section("events");
+  {
+    std::apply([f](auto&...
events) { + (..., events.dump(f)); + }, static_cast<const T*>(this)->tracking_events); + } + f->close_section(); + f->close_section(); +} + + +template <class T> +PGPeeringPipeline &PeeringEvent<T>::peering_pp(PG &pg) +{ + return pg.peering_request_pg_pipeline; +} + +template <class T> +seastar::future<> PeeringEvent<T>::with_pg( + ShardServices &shard_services, Ref<PG> pg) +{ + if (!pg) { + logger().warn("{}: pg absent, did not create", *this); + on_pg_absent(shard_services); + that()->get_handle().exit(); + return complete_rctx_no_pg(shard_services); + } + + using interruptor = typename T::interruptor; + return interruptor::with_interruption([this, pg, &shard_services] { + logger().debug("{}: pg present", *this); + return this->template enter_stage<interruptor>(peering_pp(*pg).await_map + ).then_interruptible([this, pg] { + return this->template with_blocking_event< + PG_OSDMapGate::OSDMapBlocker::BlockingEvent + >([this, pg](auto &&trigger) { + return pg->osdmap_gate.wait_for_map( + std::move(trigger), evt.get_epoch_sent()); + }); + }).then_interruptible([this, pg](auto) { + return this->template enter_stage<interruptor>(peering_pp(*pg).process); + }).then_interruptible([this, pg, &shard_services] { + return pg->do_peering_event(evt, ctx + ).then_interruptible([this, pg, &shard_services] { + that()->get_handle().exit(); + return complete_rctx(shard_services, pg); + }); + }).then_interruptible([pg, &shard_services]() + -> typename T::template interruptible_future<> { + if (!pg->get_need_up_thru()) { + return seastar::now(); + } + return shard_services.send_alive(pg->get_same_interval_since()); + }).then_interruptible([&shard_services] { + return shard_services.send_pg_temp(); + }); + }, [this](std::exception_ptr ep) { + logger().debug("{}: interrupted with {}", *this, ep); + }, pg); +} + +template <class T> +void PeeringEvent<T>::on_pg_absent(ShardServices &) +{ + logger().debug("{}: pg absent, dropping", *this); +} + +template <class T> +typename PeeringEvent<T>::template interruptible_future<> +PeeringEvent<T>::complete_rctx(ShardServices &shard_services, Ref<PG> pg) +{ + logger().debug("{}: submitting ctx", *this); + return shard_services.dispatch_context( + pg->get_collection_ref(), + std::move(ctx)); +} + +ConnectionPipeline &RemotePeeringEvent::get_connection_pipeline() +{ + return get_osd_priv(conn.get()).peering_request_conn_pipeline; +} + +void RemotePeeringEvent::on_pg_absent(ShardServices &shard_services) +{ + if (auto& e = get_event().get_event(); + e.dynamic_type() == MQuery::static_type()) { + const auto map_epoch = + shard_services.get_map()->get_epoch(); + const auto& q = static_cast<const MQuery&>(e); + const pg_info_t empty{spg_t{pgid.pgid, q.query.to}}; + if (q.query.type == q.query.LOG || + q.query.type == q.query.FULLLOG) { + auto m = crimson::make_message<MOSDPGLog>(q.query.from, q.query.to, + map_epoch, empty, + q.query.epoch_sent); + ctx.send_osd_message(q.from.osd, std::move(m)); + } else { + ctx.send_notify(q.from.osd, {q.query.from, q.query.to, + q.query.epoch_sent, + map_epoch, empty, + PastIntervals{}}); + } + } +} + +RemotePeeringEvent::interruptible_future<> RemotePeeringEvent::complete_rctx( + ShardServices &shard_services, + Ref<PG> pg) +{ + if (pg) { + return PeeringEvent::complete_rctx(shard_services, pg); + } else { + return shard_services.dispatch_context_messages(std::move(ctx)); + } +} + +seastar::future<> RemotePeeringEvent::complete_rctx_no_pg( + ShardServices &shard_services) +{ + return shard_services.dispatch_context_messages(std::move(ctx)); 
+} + +seastar::future<> LocalPeeringEvent::start() +{ + logger().debug("{}: start", *this); + + IRef ref = this; + auto maybe_delay = seastar::now(); + if (delay) { + maybe_delay = seastar::sleep( + std::chrono::milliseconds(std::lround(delay * 1000))); + } + return maybe_delay.then([this] { + return with_pg(pg->get_shard_services(), pg); + }).finally([ref=std::move(ref)] { + logger().debug("{}: complete", *ref); + }); +} + + +LocalPeeringEvent::~LocalPeeringEvent() {} + +template class PeeringEvent<RemotePeeringEvent>; +template class PeeringEvent<LocalPeeringEvent>; + +} diff --git a/src/crimson/osd/osd_operations/peering_event.h b/src/crimson/osd/osd_operations/peering_event.h new file mode 100644 index 000000000..e94caead1 --- /dev/null +++ b/src/crimson/osd/osd_operations/peering_event.h @@ -0,0 +1,207 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <iostream> +#include <seastar/core/future.hh> + +#include "crimson/osd/osdmap_gate.h" +#include "crimson/osd/osd_operation.h" +#include "osd/osd_types.h" +#include "osd/PGPeeringEvent.h" +#include "osd/PeeringState.h" + +namespace ceph { + class Formatter; +} + +namespace crimson::osd { + +class OSD; +class ShardServices; +class PG; +class BackfillRecovery; + + class PGPeeringPipeline { + struct AwaitMap : OrderedExclusivePhaseT<AwaitMap> { + static constexpr auto type_name = "PeeringEvent::PGPipeline::await_map"; + } await_map; + struct Process : OrderedExclusivePhaseT<Process> { + static constexpr auto type_name = "PeeringEvent::PGPipeline::process"; + } process; + template <class T> + friend class PeeringEvent; + friend class LocalPeeringEvent; + friend class RemotePeeringEvent; + friend class PGAdvanceMap; + friend class BackfillRecovery; + }; + +template <class T> +class PeeringEvent : public PhasedOperationT<T> { + T* that() { + return static_cast<T*>(this); + } + const T* that() const { + return static_cast<const T*>(this); + } + +public: + static constexpr OperationTypeCode type = OperationTypeCode::peering_event; + +protected: + PGPeeringPipeline &peering_pp(PG &pg); + + PeeringCtx ctx; + pg_shard_t from; + spg_t pgid; + float delay = 0; + PGPeeringEvent evt; + + const pg_shard_t get_from() const { + return from; + } + + const spg_t get_pgid() const { + return pgid; + } + + const PGPeeringEvent &get_event() const { + return evt; + } + + virtual void on_pg_absent(ShardServices &); + + virtual typename PeeringEvent::template interruptible_future<> + complete_rctx(ShardServices &, Ref<PG>); + + virtual seastar::future<> complete_rctx_no_pg( + ShardServices &shard_services + ) { return seastar::now();} + +public: + template <typename... Args> + PeeringEvent( + const pg_shard_t &from, const spg_t &pgid, + Args&&... args) : + from(from), + pgid(pgid), + evt(std::forward<Args>(args)...) + {} + template <typename... Args> + PeeringEvent( + const pg_shard_t &from, const spg_t &pgid, + float delay, Args&&... args) : + from(from), + pgid(pgid), + delay(delay), + evt(std::forward<Args>(args)...) 
+ {} + + void print(std::ostream &) const final; + void dump_detail(ceph::Formatter* f) const final; + seastar::future<> with_pg( + ShardServices &shard_services, Ref<PG> pg); +}; + +class RemotePeeringEvent : public PeeringEvent<RemotePeeringEvent> { +protected: + crimson::net::ConnectionRef conn; + // must be after conn due to ConnectionPipeline's life-time + PipelineHandle handle; + + void on_pg_absent(ShardServices &) final; + PeeringEvent::interruptible_future<> complete_rctx( + ShardServices &shard_services, + Ref<PG> pg) override; + seastar::future<> complete_rctx_no_pg( + ShardServices &shard_services + ) override; + +public: + class OSDPipeline { + struct AwaitActive : OrderedExclusivePhaseT<AwaitActive> { + static constexpr auto type_name = + "PeeringRequest::OSDPipeline::await_active"; + } await_active; + friend class RemotePeeringEvent; + }; + + template <typename... Args> + RemotePeeringEvent(crimson::net::ConnectionRef conn, Args&&... args) : + PeeringEvent(std::forward<Args>(args)...), + conn(conn) + {} + + std::tuple< + StartEvent, + ConnectionPipeline::AwaitActive::BlockingEvent, + ConnectionPipeline::AwaitMap::BlockingEvent, + OSD_OSDMapGate::OSDMapBlocker::BlockingEvent, + ConnectionPipeline::GetPG::BlockingEvent, + PGMap::PGCreationBlockingEvent, + PGPeeringPipeline::AwaitMap::BlockingEvent, + PG_OSDMapGate::OSDMapBlocker::BlockingEvent, + PGPeeringPipeline::Process::BlockingEvent, + OSDPipeline::AwaitActive::BlockingEvent, + CompletionEvent + > tracking_events; + + static constexpr bool can_create() { return true; } + auto get_create_info() { return std::move(evt.create_info); } + spg_t get_pgid() const { + return pgid; + } + PipelineHandle &get_handle() { return handle; } + epoch_t get_epoch() const { return evt.get_epoch_sent(); } + + ConnectionPipeline &get_connection_pipeline(); + seastar::future<crimson::net::ConnectionFRef> prepare_remote_submission() { + assert(conn); + return conn.get_foreign( + ).then([this](auto f_conn) { + conn.reset(); + return f_conn; + }); + } + void finish_remote_submission(crimson::net::ConnectionFRef _conn) { + assert(!conn); + conn = make_local_shared_foreign(std::move(_conn)); + } +}; + +class LocalPeeringEvent final : public PeeringEvent<LocalPeeringEvent> { +protected: + Ref<PG> pg; + PipelineHandle handle; + +public: + template <typename... Args> + LocalPeeringEvent(Ref<PG> pg, Args&&... 
args) :
+    PeeringEvent(std::forward<Args>(args)...),
+    pg(pg)
+  {}
+
+  seastar::future<> start();
+  virtual ~LocalPeeringEvent();
+
+  PipelineHandle &get_handle() { return handle; }
+
+  std::tuple<
+    StartEvent,
+    PGPeeringPipeline::AwaitMap::BlockingEvent,
+    PG_OSDMapGate::OSDMapBlocker::BlockingEvent,
+    PGPeeringPipeline::Process::BlockingEvent,
+    CompletionEvent
+  > tracking_events;
+};
+
+
+}
+
+#if FMT_VERSION >= 90000
+template <> struct fmt::formatter<crimson::osd::LocalPeeringEvent> : fmt::ostream_formatter {};
+template <> struct fmt::formatter<crimson::osd::RemotePeeringEvent> : fmt::ostream_formatter {};
+template <class T> struct fmt::formatter<crimson::osd::PeeringEvent<T>> : fmt::ostream_formatter {};
+#endif
diff --git a/src/crimson/osd/osd_operations/pg_advance_map.cc b/src/crimson/osd/osd_operations/pg_advance_map.cc
new file mode 100644
index 000000000..3706af810
--- /dev/null
+++ b/src/crimson/osd/osd_operations/pg_advance_map.cc
@@ -0,0 +1,130 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <seastar/core/future.hh>
+
+#include "include/types.h"
+#include "common/Formatter.h"
+#include "crimson/osd/pg.h"
+#include "crimson/osd/osdmap_service.h"
+#include "crimson/osd/shard_services.h"
+#include "crimson/osd/osd_operations/pg_advance_map.h"
+#include "crimson/osd/osd_operation_external_tracking.h"
+#include "osd/PeeringState.h"
+
+namespace {
+  seastar::logger& logger() {
+    return crimson::get_logger(ceph_subsys_osd);
+  }
+}
+
+namespace crimson::osd {
+
+PGAdvanceMap::PGAdvanceMap(
+  ShardServices &shard_services, Ref<PG> pg, epoch_t to,
+  PeeringCtx &&rctx, bool do_init)
+  : shard_services(shard_services), pg(pg), to(to),
+    rctx(std::move(rctx)), do_init(do_init)
+{
+  logger().debug("{}: created", *this);
+}
+
+PGAdvanceMap::~PGAdvanceMap() {}
+
+void PGAdvanceMap::print(std::ostream &lhs) const
+{
+  lhs << "PGAdvanceMap("
+      << "pg=" << pg->get_pgid()
+      << " from=" << (from ? *from : -1)
+      << " to=" << to;
+  if (do_init) {
+    lhs << " do_init";
+  }
+  lhs << ")";
+}
+
+void PGAdvanceMap::dump_detail(Formatter *f) const
+{
+  f->open_object_section("PGAdvanceMap");
+  f->dump_stream("pgid") << pg->get_pgid();
+  if (from) {
+    f->dump_int("from", *from);
+  }
+  f->dump_int("to", to);
+  f->dump_bool("do_init", do_init);
+  f->close_section();
+}
+
+PGPeeringPipeline &PGAdvanceMap::peering_pp(PG &pg)
+{
+  return pg.peering_request_pg_pipeline;
+}
+
+seastar::future<> PGAdvanceMap::start()
+{
+  using cached_map_t = OSDMapService::cached_map_t;
+
+  logger().debug("{}: start", *this);
+
+  IRef ref = this;
+  return enter_stage<>(
+    peering_pp(*pg).process
+  ).then([this] {
+    /*
+     * PGAdvanceMap is scheduled at pg creation and when
+     * broadcasting new osdmaps to pgs. We are not able to serialize
+     * between the two different PGAdvanceMap callers since a new pg
+     * will get advanced to the latest osdmap at its creation.
+     * As a result, we may need to adjust the PGAdvanceMap operation's
+     * 'from' epoch.
+ * See: https://tracker.ceph.com/issues/61744 + */ + from = pg->get_osdmap_epoch(); + auto fut = seastar::now(); + if (do_init) { + fut = pg->handle_initialize(rctx + ).then([this] { + return pg->handle_activate_map(rctx); + }); + } + return fut.then([this] { + ceph_assert(std::cmp_less_equal(*from, to)); + return seastar::do_for_each( + boost::make_counting_iterator(*from + 1), + boost::make_counting_iterator(to + 1), + [this](epoch_t next_epoch) { + logger().debug("{}: start: getting map {}", + *this, next_epoch); + return shard_services.get_map(next_epoch).then( + [this] (cached_map_t&& next_map) { + logger().debug("{}: advancing map to {}", + *this, next_map->get_epoch()); + return pg->handle_advance_map(next_map, rctx); + }); + }).then([this] { + return pg->handle_activate_map(rctx).then([this] { + logger().debug("{}: map activated", *this); + if (do_init) { + shard_services.pg_created(pg->get_pgid(), pg); + logger().info("PGAdvanceMap::start new pg {}", *pg); + } + return seastar::when_all_succeed( + pg->get_need_up_thru() + ? shard_services.send_alive( + pg->get_same_interval_since()) + : seastar::now(), + shard_services.dispatch_context( + pg->get_collection_ref(), + std::move(rctx))); + }); + }).then_unpack([this] { + logger().debug("{}: sending pg temp", *this); + return shard_services.send_pg_temp(); + }); + }); + }).then([this, ref=std::move(ref)] { + logger().debug("{}: complete", *this); + }); +} + +} diff --git a/src/crimson/osd/osd_operations/pg_advance_map.h b/src/crimson/osd/osd_operations/pg_advance_map.h new file mode 100644 index 000000000..b712cc12e --- /dev/null +++ b/src/crimson/osd/osd_operations/pg_advance_map.h @@ -0,0 +1,61 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <iostream> +#include <seastar/core/future.hh> + +#include "crimson/osd/osd_operation.h" +#include "crimson/osd/osd_operations/peering_event.h" +#include "osd/osd_types.h" +#include "crimson/common/type_helpers.h" + +namespace ceph { + class Formatter; +} + +namespace crimson::osd { + +class ShardServices; +class PG; + +class PGAdvanceMap : public PhasedOperationT<PGAdvanceMap> { +public: + static constexpr OperationTypeCode type = OperationTypeCode::pg_advance_map; + +protected: + ShardServices &shard_services; + Ref<PG> pg; + PipelineHandle handle; + + std::optional<epoch_t> from; + epoch_t to; + + PeeringCtx rctx; + const bool do_init; + +public: + PGAdvanceMap( + ShardServices &shard_services, Ref<PG> pg, epoch_t to, + PeeringCtx &&rctx, bool do_init); + ~PGAdvanceMap(); + + void print(std::ostream &) const final; + void dump_detail(ceph::Formatter *f) const final; + seastar::future<> start(); + PipelineHandle &get_handle() { return handle; } + + std::tuple< + PGPeeringPipeline::Process::BlockingEvent + > tracking_events; + +private: + PGPeeringPipeline &peering_pp(PG &pg); +}; + +} + +#if FMT_VERSION >= 90000 +template <> struct fmt::formatter<crimson::osd::PGAdvanceMap> : fmt::ostream_formatter {}; +#endif diff --git a/src/crimson/osd/osd_operations/recovery_subrequest.cc b/src/crimson/osd/osd_operations/recovery_subrequest.cc new file mode 100644 index 000000000..68655b8da --- /dev/null +++ b/src/crimson/osd/osd_operations/recovery_subrequest.cc @@ -0,0 +1,46 @@ +#include <fmt/format.h> +#include <fmt/ostream.h> + +#include "crimson/osd/osd_operations/recovery_subrequest.h" +#include "crimson/osd/pg.h" +#include "crimson/osd/osd_connection_priv.h" + +namespace { + seastar::logger& logger() { + return 
crimson::get_logger(ceph_subsys_osd); + } +} + +namespace crimson { + template <> + struct EventBackendRegistry<osd::RecoverySubRequest> { + static std::tuple<> get_backends() { + return {}; + } + }; +} + +namespace crimson::osd { + +seastar::future<> RecoverySubRequest::with_pg( + ShardServices &shard_services, Ref<PG> pgref) +{ + logger().debug("{}: {}", "RecoverySubRequest::with_pg", *this); + + track_event<StartEvent>(); + IRef opref = this; + return interruptor::with_interruption([this, pgref] { + return pgref->get_recovery_backend()->handle_recovery_op(m, conn); + }, [](std::exception_ptr) { + return seastar::now(); + }, pgref).finally([this, opref, pgref] { + track_event<CompletionEvent>(); + }); +} + +ConnectionPipeline &RecoverySubRequest::get_connection_pipeline() +{ + return get_osd_priv(conn.get()).peering_request_conn_pipeline; +} + +} diff --git a/src/crimson/osd/osd_operations/recovery_subrequest.h b/src/crimson/osd/osd_operations/recovery_subrequest.h new file mode 100644 index 000000000..07c7c95b5 --- /dev/null +++ b/src/crimson/osd/osd_operations/recovery_subrequest.h @@ -0,0 +1,81 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "osd/osd_op_util.h" +#include "crimson/net/Connection.h" +#include "crimson/osd/osd_operation.h" +#include "crimson/osd/pg.h" +#include "crimson/common/type_helpers.h" +#include "messages/MOSDFastDispatchOp.h" + +namespace crimson::osd { + +class PG; + +class RecoverySubRequest final : public PhasedOperationT<RecoverySubRequest> { +public: + static constexpr OperationTypeCode type = + OperationTypeCode::background_recovery_sub; + + RecoverySubRequest( + crimson::net::ConnectionRef conn, + Ref<MOSDFastDispatchOp>&& m) + : conn(conn), m(m) {} + + void print(std::ostream& out) const final + { + out << *m; + } + + void dump_detail(Formatter *f) const final + { + } + + static constexpr bool can_create() { return false; } + spg_t get_pgid() const { + return m->get_spg(); + } + PipelineHandle &get_handle() { return handle; } + epoch_t get_epoch() const { return m->get_min_epoch(); } + + ConnectionPipeline &get_connection_pipeline(); + seastar::future<crimson::net::ConnectionFRef> prepare_remote_submission() { + assert(conn); + return conn.get_foreign( + ).then([this](auto f_conn) { + conn.reset(); + return f_conn; + }); + } + void finish_remote_submission(crimson::net::ConnectionFRef _conn) { + assert(!conn); + conn = make_local_shared_foreign(std::move(_conn)); + } + + seastar::future<> with_pg( + ShardServices &shard_services, Ref<PG> pg); + + std::tuple< + StartEvent, + ConnectionPipeline::AwaitActive::BlockingEvent, + ConnectionPipeline::AwaitMap::BlockingEvent, + ConnectionPipeline::GetPG::BlockingEvent, + PGMap::PGCreationBlockingEvent, + OSD_OSDMapGate::OSDMapBlocker::BlockingEvent, + CompletionEvent + > tracking_events; + +private: + crimson::net::ConnectionRef conn; + // must be after `conn` to ensure the ConnectionPipeline's is alive + PipelineHandle handle; + Ref<MOSDFastDispatchOp> m; +}; + +} + +#if FMT_VERSION >= 90000 +template <> struct fmt::formatter<crimson::osd::RecoverySubRequest> : fmt::ostream_formatter {}; +#endif diff --git a/src/crimson/osd/osd_operations/replicated_request.cc b/src/crimson/osd/osd_operations/replicated_request.cc new file mode 100644 index 000000000..09217575c --- /dev/null +++ b/src/crimson/osd/osd_operations/replicated_request.cc @@ -0,0 +1,80 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: 
ts=8 sw=2 smarttab + +#include "replicated_request.h" + +#include "common/Formatter.h" + +#include "crimson/osd/osd.h" +#include "crimson/osd/osd_connection_priv.h" +#include "crimson/osd/osd_operation_external_tracking.h" +#include "crimson/osd/pg.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_osd); + } +} + +namespace crimson::osd { + +RepRequest::RepRequest(crimson::net::ConnectionRef&& conn, + Ref<MOSDRepOp> &&req) + : conn{std::move(conn)}, + req{std::move(req)} +{} + +void RepRequest::print(std::ostream& os) const +{ + os << "RepRequest(" + << "from=" << req->from + << " req=" << *req + << ")"; +} + +void RepRequest::dump_detail(Formatter *f) const +{ + f->open_object_section("RepRequest"); + f->dump_stream("reqid") << req->reqid; + f->dump_stream("pgid") << req->get_spg(); + f->dump_unsigned("map_epoch", req->get_map_epoch()); + f->dump_unsigned("min_epoch", req->get_min_epoch()); + f->dump_stream("oid") << req->poid; + f->dump_stream("from") << req->from; + f->close_section(); +} + +ConnectionPipeline &RepRequest::get_connection_pipeline() +{ + return get_osd_priv(conn.get()).replicated_request_conn_pipeline; +} + +ClientRequest::PGPipeline &RepRequest::client_pp(PG &pg) +{ + return pg.request_pg_pipeline; +} + +seastar::future<> RepRequest::with_pg( + ShardServices &shard_services, Ref<PG> pg) +{ + logger().debug("{}: RepRequest::with_pg", *this); + IRef ref = this; + return interruptor::with_interruption([this, pg] { + logger().debug("{}: pg present", *this); + return this->template enter_stage<interruptor>(client_pp(*pg).await_map + ).then_interruptible([this, pg] { + return this->template with_blocking_event< + PG_OSDMapGate::OSDMapBlocker::BlockingEvent + >([this, pg](auto &&trigger) { + return pg->osdmap_gate.wait_for_map( + std::move(trigger), req->min_epoch); + }); + }).then_interruptible([this, pg] (auto) { + return pg->handle_rep_op(req); + }); + }, [ref](std::exception_ptr) { + return seastar::now(); + }, pg); +} + +} diff --git a/src/crimson/osd/osd_operations/replicated_request.h b/src/crimson/osd/osd_operations/replicated_request.h new file mode 100644 index 000000000..c742888d9 --- /dev/null +++ b/src/crimson/osd/osd_operations/replicated_request.h @@ -0,0 +1,80 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "crimson/net/Connection.h" +#include "crimson/osd/osdmap_gate.h" +#include "crimson/osd/osd_operation.h" +#include "crimson/osd/pg_map.h" +#include "crimson/osd/osd_operations/client_request.h" +#include "crimson/common/type_helpers.h" +#include "messages/MOSDRepOp.h" + +namespace ceph { + class Formatter; +} + +namespace crimson::osd { + +class ShardServices; + +class OSD; +class PG; + +class RepRequest final : public PhasedOperationT<RepRequest> { +public: + static constexpr OperationTypeCode type = OperationTypeCode::replicated_request; + RepRequest(crimson::net::ConnectionRef&&, Ref<MOSDRepOp>&&); + + void print(std::ostream &) const final; + void dump_detail(ceph::Formatter* f) const final; + + static constexpr bool can_create() { return false; } + spg_t get_pgid() const { + return req->get_spg(); + } + PipelineHandle &get_handle() { return handle; } + epoch_t get_epoch() const { return req->get_min_epoch(); } + + ConnectionPipeline &get_connection_pipeline(); + seastar::future<crimson::net::ConnectionFRef> prepare_remote_submission() { + assert(conn); + return conn.get_foreign( + ).then([this](auto f_conn) { + conn.reset(); + return 
f_conn; + }); + } + void finish_remote_submission(crimson::net::ConnectionFRef _conn) { + assert(!conn); + conn = make_local_shared_foreign(std::move(_conn)); + } + + seastar::future<> with_pg( + ShardServices &shard_services, Ref<PG> pg); + + std::tuple< + StartEvent, + ConnectionPipeline::AwaitActive::BlockingEvent, + ConnectionPipeline::AwaitMap::BlockingEvent, + ConnectionPipeline::GetPG::BlockingEvent, + ClientRequest::PGPipeline::AwaitMap::BlockingEvent, + PG_OSDMapGate::OSDMapBlocker::BlockingEvent, + PGMap::PGCreationBlockingEvent, + OSD_OSDMapGate::OSDMapBlocker::BlockingEvent + > tracking_events; + +private: + ClientRequest::PGPipeline &client_pp(PG &pg); + + crimson::net::ConnectionRef conn; + PipelineHandle handle; + Ref<MOSDRepOp> req; +}; + +} + +#if FMT_VERSION >= 90000 +template <> struct fmt::formatter<crimson::osd::RepRequest> : fmt::ostream_formatter {}; +#endif diff --git a/src/crimson/osd/osd_operations/snaptrim_event.cc b/src/crimson/osd/osd_operations/snaptrim_event.cc new file mode 100644 index 000000000..e4a1b04df --- /dev/null +++ b/src/crimson/osd/osd_operations/snaptrim_event.cc @@ -0,0 +1,569 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "crimson/osd/osd_operations/snaptrim_event.h" +#include "crimson/osd/ops_executer.h" +#include "crimson/osd/pg.h" +#include <seastar/core/sleep.hh> + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_osd); + } +} + +namespace crimson { + template <> + struct EventBackendRegistry<osd::SnapTrimEvent> { + static std::tuple<> get_backends() { + return {}; + } + }; + + template <> + struct EventBackendRegistry<osd::SnapTrimObjSubEvent> { + static std::tuple<> get_backends() { + return {}; + } + }; +} + +namespace crimson::osd { + +PG::interruptible_future<> +PG::SnapTrimMutex::lock(SnapTrimEvent &st_event) noexcept +{ + return st_event.enter_stage<interruptor>(wait_pg + ).then_interruptible([this] { + return mutex.lock(); + }); +} + +void SnapTrimEvent::SubOpBlocker::dump_detail(Formatter *f) const +{ + f->open_array_section("dependent_operations"); + { + for (const auto &kv : subops) { + f->dump_unsigned("op_id", kv.first); + } + } + f->close_section(); +} + +template <class... Args> +void SnapTrimEvent::SubOpBlocker::emplace_back(Args&&... 
args) +{ + subops.emplace_back(std::forward<Args>(args)...); +}; + +SnapTrimEvent::remove_or_update_iertr::future<> +SnapTrimEvent::SubOpBlocker::wait_completion() +{ + return interruptor::do_for_each(subops, [](auto&& kv) { + return std::move(kv.second); + }); +} + +void SnapTrimEvent::print(std::ostream &lhs) const +{ + lhs << "SnapTrimEvent(" + << "pgid=" << pg->get_pgid() + << " snapid=" << snapid + << " needs_pause=" << needs_pause + << ")"; +} + +void SnapTrimEvent::dump_detail(Formatter *f) const +{ + f->open_object_section("SnapTrimEvent"); + f->dump_stream("pgid") << pg->get_pgid(); + f->close_section(); +} + +SnapTrimEvent::snap_trim_ertr::future<seastar::stop_iteration> +SnapTrimEvent::start() +{ + logger().debug("{}: {}", *this, __func__); + return with_pg( + pg->get_shard_services(), pg + ).finally([ref=IRef{this}, this] { + logger().debug("{}: complete", *ref); + return handle.complete(); + }); +} + +CommonPGPipeline& SnapTrimEvent::client_pp() +{ + return pg->request_pg_pipeline; +} + +SnapTrimEvent::snap_trim_ertr::future<seastar::stop_iteration> +SnapTrimEvent::with_pg( + ShardServices &shard_services, Ref<PG> _pg) +{ + return interruptor::with_interruption([&shard_services, this] { + return enter_stage<interruptor>( + client_pp().wait_for_active + ).then_interruptible([this] { + return with_blocking_event<PGActivationBlocker::BlockingEvent, + interruptor>([this] (auto&& trigger) { + return pg->wait_for_active_blocker.wait(std::move(trigger)); + }); + }).then_interruptible([this] { + return enter_stage<interruptor>( + client_pp().recover_missing); + }).then_interruptible([] { + //return do_recover_missing(pg, get_target_oid()); + return seastar::now(); + }).then_interruptible([this] { + return enter_stage<interruptor>( + client_pp().get_obc); + }).then_interruptible([this] { + return pg->snaptrim_mutex.lock(*this); + }).then_interruptible([this] { + return enter_stage<interruptor>( + client_pp().process); + }).then_interruptible([&shard_services, this] { + return interruptor::async([this] { + std::vector<hobject_t> to_trim; + using crimson::common::local_conf; + const auto max = + local_conf().get_val<uint64_t>("osd_pg_max_concurrent_snap_trims"); + // we need to look for at least 1 snaptrim, otherwise we'll misinterpret + // the ENOENT below and erase snapid. + int r = snap_mapper.get_next_objects_to_trim( + snapid, + max, + &to_trim); + if (r == -ENOENT) { + to_trim.clear(); // paranoia + return to_trim; + } else if (r != 0) { + logger().error("{}: get_next_objects_to_trim returned {}", + *this, cpp_strerror(r)); + ceph_abort_msg("get_next_objects_to_trim returned an invalid code"); + } else { + assert(!to_trim.empty()); + } + logger().debug("{}: async almost done line {}", *this, __LINE__); + return to_trim; + }).then_interruptible([&shard_services, this] (const auto& to_trim) { + if (to_trim.empty()) { + // the legit ENOENT -> done + logger().debug("{}: to_trim is empty! 
Stopping iteration", *this); + pg->snaptrim_mutex.unlock(); + return snap_trim_iertr::make_ready_future<seastar::stop_iteration>( + seastar::stop_iteration::yes); + } + return [&shard_services, this](const auto &to_trim) { + for (const auto& object : to_trim) { + logger().debug("{}: trimming {}", *this, object); + auto [op, fut] = shard_services.start_operation_may_interrupt< + interruptor, SnapTrimObjSubEvent>( + pg, + object, + snapid); + subop_blocker.emplace_back( + op->get_id(), + std::move(fut) + ); + } + return interruptor::now(); + }(to_trim).then_interruptible([this] { + return enter_stage<interruptor>(wait_subop); + }).then_interruptible([this] { + logger().debug("{}: awaiting completion", *this); + return subop_blocker.wait_completion(); + }).finally([this] { + pg->snaptrim_mutex.unlock(); + }).safe_then_interruptible([this] { + if (!needs_pause) { + return interruptor::now(); + } + // let's know operators we're waiting + return enter_stage<interruptor>( + wait_trim_timer + ).then_interruptible([this] { + using crimson::common::local_conf; + const auto time_to_sleep = + local_conf().template get_val<double>("osd_snap_trim_sleep"); + logger().debug("{}: time_to_sleep {}", *this, time_to_sleep); + // TODO: this logic should be more sophisticated and distinguish + // between SSDs, HDDs and the hybrid case + return seastar::sleep( + std::chrono::milliseconds(std::lround(time_to_sleep * 1000))); + }); + }).safe_then_interruptible([this] { + logger().debug("{}: all completed", *this); + return snap_trim_iertr::make_ready_future<seastar::stop_iteration>( + seastar::stop_iteration::no); + }); + }); + }); + }, [this](std::exception_ptr eptr) -> snap_trim_ertr::future<seastar::stop_iteration> { + logger().debug("{}: interrupted {}", *this, eptr); + return crimson::ct_error::eagain::make(); + }, pg); +} + + +CommonPGPipeline& SnapTrimObjSubEvent::client_pp() +{ + return pg->request_pg_pipeline; +} + +SnapTrimObjSubEvent::remove_or_update_iertr::future<> +SnapTrimObjSubEvent::start() +{ + logger().debug("{}: start", *this); + return with_pg( + pg->get_shard_services(), pg + ).finally([ref=IRef{this}, this] { + logger().debug("{}: complete", *ref); + return handle.complete(); + }); +} + +SnapTrimObjSubEvent::remove_or_update_iertr::future<> +SnapTrimObjSubEvent::remove_clone( + ObjectContextRef obc, + ObjectContextRef head_obc, + ceph::os::Transaction& txn, + std::vector<pg_log_entry_t>& log_entries +) { + const auto p = std::find( + head_obc->ssc->snapset.clones.begin(), + head_obc->ssc->snapset.clones.end(), + coid.snap); + if (p == head_obc->ssc->snapset.clones.end()) { + logger().error("{}: Snap {} not in clones", + *this, coid.snap); + return crimson::ct_error::enoent::make(); + } + assert(p != head_obc->ssc->snapset.clones.end()); + snapid_t last = coid.snap; + delta_stats.num_bytes -= head_obc->ssc->snapset.get_clone_bytes(last); + + if (p != head_obc->ssc->snapset.clones.begin()) { + // not the oldest... merge overlap into next older clone + std::vector<snapid_t>::iterator n = p - 1; + hobject_t prev_coid = coid; + prev_coid.snap = *n; + + // does the classical OSD really need is_present_clone(prev_coid)? 
+ delta_stats.num_bytes -= head_obc->ssc->snapset.get_clone_bytes(*n); + head_obc->ssc->snapset.clone_overlap[*n].intersection_of( + head_obc->ssc->snapset.clone_overlap[*p]); + delta_stats.num_bytes += head_obc->ssc->snapset.get_clone_bytes(*n); + } + delta_stats.num_objects--; + if (obc->obs.oi.is_dirty()) { + delta_stats.num_objects_dirty--; + } + if (obc->obs.oi.is_omap()) { + delta_stats.num_objects_omap--; + } + if (obc->obs.oi.is_whiteout()) { + logger().debug("{}: trimming whiteout on {}", + *this, coid); + delta_stats.num_whiteouts--; + } + delta_stats.num_object_clones--; + + obc->obs.exists = false; + head_obc->ssc->snapset.clones.erase(p); + head_obc->ssc->snapset.clone_overlap.erase(last); + head_obc->ssc->snapset.clone_size.erase(last); + head_obc->ssc->snapset.clone_snaps.erase(last); + + log_entries.emplace_back( + pg_log_entry_t{ + pg_log_entry_t::DELETE, + coid, + osd_op_p.at_version, + obc->obs.oi.version, + 0, + osd_reqid_t(), + obc->obs.oi.mtime, // will be replaced in `apply_to()` + 0} + ); + txn.remove( + pg->get_collection_ref()->get_cid(), + ghobject_t{coid, ghobject_t::NO_GEN, shard_id_t::NO_SHARD}); + obc->obs.oi = object_info_t(coid); + return OpsExecuter::snap_map_remove(coid, pg->snap_mapper, pg->osdriver, txn); +} + +void SnapTrimObjSubEvent::remove_head_whiteout( + ObjectContextRef obc, + ObjectContextRef head_obc, + ceph::os::Transaction& txn, + std::vector<pg_log_entry_t>& log_entries +) { + // NOTE: this arguably constitutes minor interference with the + // tiering agent if this is a cache tier since a snap trim event + // is effectively evicting a whiteout we might otherwise want to + // keep around. + const auto head_oid = coid.get_head(); + logger().info("{}: {} removing {}", + *this, coid, head_oid); + log_entries.emplace_back( + pg_log_entry_t{ + pg_log_entry_t::DELETE, + head_oid, + osd_op_p.at_version, + head_obc->obs.oi.version, + 0, + osd_reqid_t(), + obc->obs.oi.mtime, // will be replaced in `apply_to()` + 0} + ); + logger().info("{}: remove snap head", *this); + object_info_t& oi = head_obc->obs.oi; + delta_stats.num_objects--; + if (oi.is_dirty()) { + delta_stats.num_objects_dirty--; + } + if (oi.is_omap()) { + delta_stats.num_objects_omap--; + } + if (oi.is_whiteout()) { + logger().debug("{}: trimming whiteout on {}", + *this, oi.soid); + delta_stats.num_whiteouts--; + } + head_obc->obs.exists = false; + head_obc->obs.oi = object_info_t(head_oid); + txn.remove(pg->get_collection_ref()->get_cid(), + ghobject_t{head_oid, ghobject_t::NO_GEN, shard_id_t::NO_SHARD}); +} + +SnapTrimObjSubEvent::interruptible_future<> +SnapTrimObjSubEvent::adjust_snaps( + ObjectContextRef obc, + ObjectContextRef head_obc, + const std::set<snapid_t>& new_snaps, + ceph::os::Transaction& txn, + std::vector<pg_log_entry_t>& log_entries +) { + head_obc->ssc->snapset.clone_snaps[coid.snap] = + std::vector<snapid_t>(new_snaps.rbegin(), new_snaps.rend()); + + // we still do a 'modify' event on this object just to trigger a + // snapmapper.update ... 
:( + obc->obs.oi.prior_version = obc->obs.oi.version; + obc->obs.oi.version = osd_op_p.at_version; + ceph::bufferlist bl; + encode(obc->obs.oi, + bl, + pg->get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr)); + txn.setattr( + pg->get_collection_ref()->get_cid(), + ghobject_t{coid, ghobject_t::NO_GEN, shard_id_t::NO_SHARD}, + OI_ATTR, + bl); + log_entries.emplace_back( + pg_log_entry_t{ + pg_log_entry_t::MODIFY, + coid, + obc->obs.oi.version, + obc->obs.oi.prior_version, + 0, + osd_reqid_t(), + obc->obs.oi.mtime, + 0} + ); + return OpsExecuter::snap_map_modify( + coid, new_snaps, pg->snap_mapper, pg->osdriver, txn); +} + +void SnapTrimObjSubEvent::update_head( + ObjectContextRef obc, + ObjectContextRef head_obc, + ceph::os::Transaction& txn, + std::vector<pg_log_entry_t>& log_entries +) { + const auto head_oid = coid.get_head(); + logger().info("{}: writing updated snapset on {}, snapset is {}", + *this, head_oid, head_obc->ssc->snapset); + log_entries.emplace_back( + pg_log_entry_t{ + pg_log_entry_t::MODIFY, + head_oid, + osd_op_p.at_version, + head_obc->obs.oi.version, + 0, + osd_reqid_t(), + obc->obs.oi.mtime, + 0} + ); + + head_obc->obs.oi.prior_version = head_obc->obs.oi.version; + head_obc->obs.oi.version = osd_op_p.at_version; + + std::map<std::string, ceph::bufferlist, std::less<>> attrs; + ceph::bufferlist bl; + encode(head_obc->ssc->snapset, bl); + attrs[SS_ATTR] = std::move(bl); + + bl.clear(); + head_obc->obs.oi.encode_no_oid(bl, + pg->get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr)); + attrs[OI_ATTR] = std::move(bl); + txn.setattrs( + pg->get_collection_ref()->get_cid(), + ghobject_t{head_oid, ghobject_t::NO_GEN, shard_id_t::NO_SHARD}, + attrs); +} + +SnapTrimObjSubEvent::remove_or_update_iertr::future< + SnapTrimObjSubEvent::remove_or_update_ret_t> +SnapTrimObjSubEvent::remove_or_update( + ObjectContextRef obc, + ObjectContextRef head_obc) +{ + auto citer = head_obc->ssc->snapset.clone_snaps.find(coid.snap); + if (citer == head_obc->ssc->snapset.clone_snaps.end()) { + logger().error("{}: No clone_snaps in snapset {} for object {}", + *this, head_obc->ssc->snapset, coid); + return crimson::ct_error::enoent::make(); + } + const auto& old_snaps = citer->second; + if (old_snaps.empty()) { + logger().error("{}: no object info snaps for object {}", + *this, coid); + return crimson::ct_error::enoent::make(); + } + if (head_obc->ssc->snapset.seq == 0) { + logger().error("{}: no snapset.seq for object {}", + *this, coid); + return crimson::ct_error::enoent::make(); + } + const OSDMapRef& osdmap = pg->get_osdmap(); + std::set<snapid_t> new_snaps; + for (const auto& old_snap : old_snaps) { + if (!osdmap->in_removed_snaps_queue(pg->get_info().pgid.pgid.pool(), + old_snap) + && old_snap != snap_to_trim) { + new_snaps.insert(old_snap); + } + } + + return seastar::do_with(ceph::os::Transaction{}, [=, this](auto &txn) { + std::vector<pg_log_entry_t> log_entries{}; + + int64_t num_objects_before_trim = delta_stats.num_objects; + osd_op_p.at_version = pg->next_version(); + auto ret = remove_or_update_iertr::now(); + if (new_snaps.empty()) { + // remove clone from snapset + logger().info("{}: {} snaps {} -> {} ... 
deleting", + *this, coid, old_snaps, new_snaps); + ret = remove_clone(obc, head_obc, txn, log_entries); + } else { + // save adjusted snaps for this object + logger().info("{}: {} snaps {} -> {}", + *this, coid, old_snaps, new_snaps); + ret = adjust_snaps(obc, head_obc, new_snaps, txn, log_entries); + } + return std::move(ret).safe_then_interruptible( + [&txn, obc, num_objects_before_trim, log_entries=std::move(log_entries), head_obc=std::move(head_obc), this]() mutable { + osd_op_p.at_version = pg->next_version(); + + // save head snapset + logger().debug("{}: {} new snapset {} on {}", + *this, coid, head_obc->ssc->snapset, head_obc->obs.oi); + if (head_obc->ssc->snapset.clones.empty() && head_obc->obs.oi.is_whiteout()) { + remove_head_whiteout(obc, head_obc, txn, log_entries); + } else { + update_head(obc, head_obc, txn, log_entries); + } + // Stats reporting - Set number of objects trimmed + if (num_objects_before_trim > delta_stats.num_objects) { + //int64_t num_objects_trimmed = + // num_objects_before_trim - delta_stats.num_objects; + //add_objects_trimmed_count(num_objects_trimmed); + } + }).safe_then_interruptible( + [&txn, log_entries=std::move(log_entries)] () mutable { + return remove_or_update_iertr::make_ready_future<remove_or_update_ret_t>( + std::make_pair(std::move(txn), std::move(log_entries))); + }); + }); +} + +SnapTrimObjSubEvent::remove_or_update_iertr::future<> +SnapTrimObjSubEvent::with_pg( + ShardServices &shard_services, Ref<PG> _pg) +{ + return enter_stage<interruptor>( + client_pp().wait_for_active + ).then_interruptible([this] { + return with_blocking_event<PGActivationBlocker::BlockingEvent, + interruptor>([this] (auto&& trigger) { + return pg->wait_for_active_blocker.wait(std::move(trigger)); + }); + }).then_interruptible([this] { + return enter_stage<interruptor>( + client_pp().recover_missing); + }).then_interruptible([] { + //return do_recover_missing(pg, get_target_oid()); + return seastar::now(); + }).then_interruptible([this] { + return enter_stage<interruptor>( + client_pp().get_obc); + }).then_interruptible([this] { + logger().debug("{}: getting obc for {}", *this, coid); + // end of commonality + // with_clone_obc_direct lock both clone's and head's obcs + return pg->obc_loader.with_clone_obc_direct<RWState::RWWRITE>( + coid, + [this](auto head_obc, auto clone_obc) { + logger().debug("{}: got clone_obc={}", *this, clone_obc->get_oid()); + return enter_stage<interruptor>( + client_pp().process + ).then_interruptible( + [this,clone_obc=std::move(clone_obc), head_obc=std::move(head_obc)]() mutable { + logger().debug("{}: processing clone_obc={}", *this, clone_obc->get_oid()); + return remove_or_update( + clone_obc, head_obc + ).safe_then_unpack_interruptible([clone_obc, this] + (auto&& txn, auto&& log_entries) mutable { + auto [submitted, all_completed] = pg->submit_transaction( + std::move(clone_obc), + std::move(txn), + std::move(osd_op_p), + std::move(log_entries)); + return submitted.then_interruptible( + [all_completed=std::move(all_completed), this] () mutable { + return enter_stage<interruptor>( + wait_repop + ).then_interruptible([all_completed=std::move(all_completed)] () mutable { + return std::move(all_completed); + }); + }); + }); + }); + }).handle_error_interruptible( + remove_or_update_iertr::pass_further{}, + crimson::ct_error::assert_all{"unexpected error in SnapTrimObjSubEvent"} + ); + }); +} + +void SnapTrimObjSubEvent::print(std::ostream &lhs) const +{ + lhs << "SnapTrimObjSubEvent(" + << "coid=" << coid + << " snapid=" << snap_to_trim 
+ << ")"; +} + +void SnapTrimObjSubEvent::dump_detail(Formatter *f) const +{ + f->open_object_section("SnapTrimObjSubEvent"); + f->dump_stream("coid") << coid; + f->close_section(); +} + +} // namespace crimson::osd diff --git a/src/crimson/osd/osd_operations/snaptrim_event.h b/src/crimson/osd/osd_operations/snaptrim_event.h new file mode 100644 index 000000000..a3a970a04 --- /dev/null +++ b/src/crimson/osd/osd_operations/snaptrim_event.h @@ -0,0 +1,210 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <iostream> +#include <seastar/core/future.hh> + +#include "crimson/osd/osdmap_gate.h" +#include "crimson/osd/osd_operation.h" +#include "crimson/osd/osd_operations/common/pg_pipeline.h" +#include "crimson/osd/pg.h" +#include "crimson/osd/pg_activation_blocker.h" +#include "osd/osd_types.h" +#include "osd/PGPeeringEvent.h" +#include "osd/PeeringState.h" + +namespace ceph { + class Formatter; +} + +class SnapMapper; + +namespace crimson::osd { + +class OSD; +class ShardServices; + +// trim up to `max` objects for snapshot `snapid +class SnapTrimEvent final : public PhasedOperationT<SnapTrimEvent> { +public: + using remove_or_update_ertr = + crimson::errorator<crimson::ct_error::enoent>; + using remove_or_update_iertr = + crimson::interruptible::interruptible_errorator< + IOInterruptCondition, remove_or_update_ertr>; + using snap_trim_ertr = remove_or_update_ertr::extend< + crimson::ct_error::eagain>; + using snap_trim_iertr = remove_or_update_iertr::extend< + crimson::ct_error::eagain>; + + static constexpr OperationTypeCode type = OperationTypeCode::snaptrim_event; + + SnapTrimEvent(Ref<PG> pg, + SnapMapper& snap_mapper, + const snapid_t snapid, + const bool needs_pause) + : pg(std::move(pg)), + snap_mapper(snap_mapper), + snapid(snapid), + needs_pause(needs_pause) {} + + void print(std::ostream &) const final; + void dump_detail(ceph::Formatter* f) const final; + snap_trim_ertr::future<seastar::stop_iteration> start(); + snap_trim_ertr::future<seastar::stop_iteration> with_pg( + ShardServices &shard_services, Ref<PG> pg); + +private: + CommonPGPipeline& client_pp(); + + // bases on 998cb8c141bb89aafae298a9d5e130fbd78fe5f2 + struct SubOpBlocker : crimson::BlockerT<SubOpBlocker> { + static constexpr const char* type_name = "CompoundOpBlocker"; + + using id_done_t = std::pair<crimson::Operation::id_t, + remove_or_update_iertr::future<>>; + + void dump_detail(Formatter *f) const final; + + template <class... Args> + void emplace_back(Args&&... args); + + remove_or_update_iertr::future<> wait_completion(); + private: + std::vector<id_done_t> subops; + } subop_blocker; + + // we don't need to synchronize with other instances of SnapTrimEvent; + // it's here for the sake of op tracking. + struct WaitSubop : OrderedConcurrentPhaseT<WaitSubop> { + static constexpr auto type_name = "SnapTrimEvent::wait_subop"; + } wait_subop; + + // an instantiator can instruct us to go over this stage and then + // wait for the future to implement throttling. It is implemented + // that way to for the sake of tracking ops. 
+ struct WaitTrimTimer : OrderedExclusivePhaseT<WaitTrimTimer> { + static constexpr auto type_name = "SnapTrimEvent::wait_trim_timer"; + } wait_trim_timer; + + PipelineHandle handle; + Ref<PG> pg; + SnapMapper& snap_mapper; + const snapid_t snapid; + const bool needs_pause; + +public: + PipelineHandle& get_handle() { return handle; } + + std::tuple< + StartEvent, + CommonPGPipeline::WaitForActive::BlockingEvent, + PGActivationBlocker::BlockingEvent, + CommonPGPipeline::RecoverMissing::BlockingEvent, + CommonPGPipeline::GetOBC::BlockingEvent, + CommonPGPipeline::Process::BlockingEvent, + WaitSubop::BlockingEvent, + PG::SnapTrimMutex::WaitPG::BlockingEvent, + WaitTrimTimer::BlockingEvent, + CompletionEvent + > tracking_events; + + friend class PG::SnapTrimMutex; +}; + +// remove single object. a SnapTrimEvent can create multiple subrequests. +// the division of labour is needed because of the restriction that an Op +// cannot revisite a pipeline's stage it already saw. +class SnapTrimObjSubEvent : public PhasedOperationT<SnapTrimObjSubEvent> { +public: + using remove_or_update_ertr = + crimson::errorator<crimson::ct_error::enoent>; + using remove_or_update_iertr = + crimson::interruptible::interruptible_errorator< + IOInterruptCondition, remove_or_update_ertr>; + + static constexpr OperationTypeCode type = + OperationTypeCode::snaptrimobj_subevent; + + SnapTrimObjSubEvent( + Ref<PG> pg, + const hobject_t& coid, + snapid_t snap_to_trim) + : pg(std::move(pg)), + coid(coid), + snap_to_trim(snap_to_trim) { + } + + void print(std::ostream &) const final; + void dump_detail(ceph::Formatter* f) const final; + remove_or_update_iertr::future<> start(); + remove_or_update_iertr::future<> with_pg( + ShardServices &shard_services, Ref<PG> pg); + + CommonPGPipeline& client_pp(); + +private: + object_stat_sum_t delta_stats; + + remove_or_update_iertr::future<> remove_clone( + ObjectContextRef obc, + ObjectContextRef head_obc, + ceph::os::Transaction& txn, + std::vector<pg_log_entry_t>& log_entries); + void remove_head_whiteout( + ObjectContextRef obc, + ObjectContextRef head_obc, + ceph::os::Transaction& txn, + std::vector<pg_log_entry_t>& log_entries); + interruptible_future<> adjust_snaps( + ObjectContextRef obc, + ObjectContextRef head_obc, + const std::set<snapid_t>& new_snaps, + ceph::os::Transaction& txn, + std::vector<pg_log_entry_t>& log_entries); + void update_head( + ObjectContextRef obc, + ObjectContextRef head_obc, + ceph::os::Transaction& txn, + std::vector<pg_log_entry_t>& log_entries); + + using remove_or_update_ret_t = + std::pair<ceph::os::Transaction, std::vector<pg_log_entry_t>>; + remove_or_update_iertr::future<remove_or_update_ret_t> + remove_or_update(ObjectContextRef obc, ObjectContextRef head_obc); + + // we don't need to synchronize with other instances started by + // SnapTrimEvent; it's here for the sake of op tracking. 
+ struct WaitRepop : OrderedConcurrentPhaseT<WaitRepop> { + static constexpr auto type_name = "SnapTrimObjSubEvent::wait_repop"; + } wait_repop; + + Ref<PG> pg; + PipelineHandle handle; + osd_op_params_t osd_op_p; + const hobject_t coid; + const snapid_t snap_to_trim; + +public: + PipelineHandle& get_handle() { return handle; } + + std::tuple< + StartEvent, + CommonPGPipeline::WaitForActive::BlockingEvent, + PGActivationBlocker::BlockingEvent, + CommonPGPipeline::RecoverMissing::BlockingEvent, + CommonPGPipeline::GetOBC::BlockingEvent, + CommonPGPipeline::Process::BlockingEvent, + WaitRepop::BlockingEvent, + CompletionEvent + > tracking_events; +}; + +} // namespace crimson::osd + +#if FMT_VERSION >= 90000 +template <> struct fmt::formatter<crimson::osd::SnapTrimEvent> : fmt::ostream_formatter {}; +template <> struct fmt::formatter<crimson::osd::SnapTrimObjSubEvent> : fmt::ostream_formatter {}; +#endif diff --git a/src/crimson/osd/osdmap_gate.cc b/src/crimson/osd/osdmap_gate.cc new file mode 100644 index 000000000..171ec436d --- /dev/null +++ b/src/crimson/osd/osdmap_gate.cc @@ -0,0 +1,86 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "crimson/common/exception.h" +#include "crimson/osd/osdmap_gate.h" +#include "crimson/osd/shard_services.h" +#include "common/Formatter.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_osd); + } +} + +namespace crimson::osd { + +template <OSDMapGateType OSDMapGateTypeV> +void OSDMapGate<OSDMapGateTypeV>::OSDMapBlocker::dump_detail(Formatter *f) const +{ + f->open_object_section("OSDMapGate"); + f->dump_int("epoch", epoch); + f->close_section(); +} + +template <OSDMapGateType OSDMapGateTypeV> +seastar::future<epoch_t> OSDMapGate<OSDMapGateTypeV>::wait_for_map( + typename OSDMapBlocker::BlockingEvent::TriggerI&& trigger, + epoch_t epoch, + ShardServices *shard_services) +{ + if (__builtin_expect(stopping, false)) { + return seastar::make_exception_future<epoch_t>( + crimson::common::system_shutdown_exception()); + } + if (current >= epoch) { + return seastar::make_ready_future<epoch_t>(current); + } else { + logger().info("evt epoch is {}, i have {}, will wait", epoch, current); + auto &blocker = waiting_peering.emplace( + epoch, std::make_pair(blocker_type, epoch)).first->second; + auto fut = blocker.promise.get_shared_future(); + if (shard_services) { + return trigger.maybe_record_blocking( + shard_services->osdmap_subscribe(current, true).then( + [fut=std::move(fut)]() mutable { + return std::move(fut); + }), + blocker); + } else { + return trigger.maybe_record_blocking(std::move(fut), blocker); + } + } +} + +template <OSDMapGateType OSDMapGateTypeV> +void OSDMapGate<OSDMapGateTypeV>::got_map(epoch_t epoch) { + if (epoch == 0) { + return; + } + ceph_assert(epoch > current); + current = epoch; + auto first = waiting_peering.begin(); + auto last = waiting_peering.upper_bound(epoch); + std::for_each(first, last, [epoch](auto& blocked_requests) { + blocked_requests.second.promise.set_value(epoch); + }); + waiting_peering.erase(first, last); +} + +template <OSDMapGateType OSDMapGateTypeV> +seastar::future<> OSDMapGate<OSDMapGateTypeV>::stop() { + logger().info("osdmap::stop"); + stopping = true; + auto first = waiting_peering.begin(); + auto last = waiting_peering.end(); + std::for_each(first, last, [](auto& blocked_requests) { + blocked_requests.second.promise.set_exception( + crimson::common::system_shutdown_exception()); + }); + return seastar::now(); +} 
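The gate above is a small but central pattern: an operation that needs osdmap epoch `e` either proceeds immediately when `current >= e`, or parks on a per-epoch blocker whose shared promise `got_map()` later fulfils for every waiter at or below the new epoch (the real code additionally records the blocking event for op tracking and, when given shard_services, subscribes for the missing map). A minimal sketch of the same idea, assuming plain C++ with callbacks standing in for seastar shared promises; `epoch_gate` and its members are illustrative names, not Crimson API:

// sketch only: epoch-gate with callback waiters instead of seastar futures
#include <cstdint>
#include <functional>
#include <iostream>
#include <map>
#include <vector>

using epoch_t = std::uint32_t;

class epoch_gate {
  epoch_t current = 0;
  // waiters keyed by the epoch they need, in ascending order
  std::map<epoch_t, std::vector<std::function<void(epoch_t)>>> waiting;

public:
  // run cb immediately if the gate already passed `wanted`,
  // otherwise park it until got_map() advances far enough
  void wait_for_map(epoch_t wanted, std::function<void(epoch_t)> cb) {
    if (current >= wanted) {
      cb(current);
    } else {
      waiting[wanted].push_back(std::move(cb));
    }
  }

  // advance the gate and wake every waiter expecting an epoch <= `epoch`
  void got_map(epoch_t epoch) {
    if (epoch == 0) {
      return;
    }
    current = epoch;
    const auto last = waiting.upper_bound(epoch);
    for (auto it = waiting.begin(); it != last; ++it) {
      for (auto& cb : it->second) {
        cb(epoch);
      }
    }
    waiting.erase(waiting.begin(), last);
  }
};

int main() {
  epoch_gate gate;
  gate.wait_for_map(5, [](epoch_t e) { std::cout << "woken at epoch " << e << "\n"; });
  gate.got_map(3);  // not far enough yet, the waiter stays parked
  gate.got_map(7);  // prints "woken at epoch 7"
}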
+ +template class OSDMapGate<OSDMapGateType::PG>; +template class OSDMapGate<OSDMapGateType::OSD>; + +} // namespace crimson::osd diff --git a/src/crimson/osd/osdmap_gate.h b/src/crimson/osd/osdmap_gate.h new file mode 100644 index 000000000..d76c4b82f --- /dev/null +++ b/src/crimson/osd/osdmap_gate.h @@ -0,0 +1,83 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <functional> +#include <map> +#include <optional> + +#include <seastar/core/future.hh> +#include <seastar/core/shared_future.hh> + +#include "include/types.h" +#include "crimson/osd/osd_operation.h" + +namespace ceph { + class Formatter; +} + +namespace crimson::osd { + +class ShardServices; + +enum class OSDMapGateType { + OSD, + PG, +}; + +template <OSDMapGateType OSDMapGateTypeV> +class OSDMapGate { +public: + struct OSDMapBlocker : BlockerT<OSDMapBlocker> { + const char * type_name; + epoch_t epoch; + + OSDMapBlocker(std::pair<const char *, epoch_t> args) + : type_name(args.first), epoch(args.second) {} + + OSDMapBlocker(const OSDMapBlocker &) = delete; + OSDMapBlocker(OSDMapBlocker &&) = delete; + OSDMapBlocker &operator=(const OSDMapBlocker &) = delete; + OSDMapBlocker &operator=(OSDMapBlocker &&) = delete; + + seastar::shared_promise<epoch_t> promise; + + void dump_detail(Formatter *f) const final; + }; + using Blocker = OSDMapBlocker; + +private: + // order the promises in ascending order of the waited osdmap epoch, + // so we can access all the waiters expecting a map whose epoch is less + // than or equal to a given epoch + using waiting_peering_t = std::map<epoch_t, + OSDMapBlocker>; + const char *blocker_type; + waiting_peering_t waiting_peering; + epoch_t current = 0; + bool stopping = false; +public: + OSDMapGate(const char *blocker_type) + : blocker_type(blocker_type) {} + + /** + * wait_for_map + * + * Wait for an osdmap whose epoch is greater or equal to given epoch. + * If shard_services is non-null, request map if not present. 
+ */ + seastar::future<epoch_t> + wait_for_map( + typename OSDMapBlocker::BlockingEvent::TriggerI&& trigger, + epoch_t epoch, + ShardServices *shard_services=nullptr + ); + void got_map(epoch_t epoch); + seastar::future<> stop(); +}; + +using OSD_OSDMapGate = OSDMapGate<OSDMapGateType::OSD>; +using PG_OSDMapGate = OSDMapGate<OSDMapGateType::PG>; + +} diff --git a/src/crimson/osd/osdmap_service.h b/src/crimson/osd/osdmap_service.h new file mode 100644 index 000000000..017303536 --- /dev/null +++ b/src/crimson/osd/osdmap_service.h @@ -0,0 +1,21 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "include/types.h" +#include "osd/OSDMap.h" + +class OSDMap; + +class OSDMapService { +public: + using cached_map_t = OSDMapRef; + using local_cached_map_t = LocalOSDMapRef; + + virtual ~OSDMapService() = default; + virtual seastar::future<cached_map_t> get_map(epoch_t e) = 0; + /// get the latest map + virtual cached_map_t get_map() const = 0; + virtual epoch_t get_up_epoch() const = 0; +}; diff --git a/src/crimson/osd/pg.cc b/src/crimson/osd/pg.cc new file mode 100644 index 000000000..7cf3b158c --- /dev/null +++ b/src/crimson/osd/pg.cc @@ -0,0 +1,1544 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab expandtab + +#include "pg.h" + +#include <functional> + +#include <boost/range/adaptor/filtered.hpp> +#include <boost/range/adaptor/map.hpp> +#include <boost/range/adaptor/transformed.hpp> +#include <boost/range/algorithm/copy.hpp> +#include <boost/range/algorithm/max_element.hpp> +#include <boost/range/numeric.hpp> +#include <fmt/format.h> +#include <fmt/ostream.h> + +#include "common/hobject_fmt.h" + +#include "messages/MOSDOp.h" +#include "messages/MOSDOpReply.h" +#include "messages/MOSDRepOp.h" +#include "messages/MOSDRepOpReply.h" + +#include "osd/OSDMap.h" +#include "osd/osd_types_fmt.h" + +#include "os/Transaction.h" + +#include "crimson/common/exception.h" +#include "crimson/net/Connection.h" +#include "crimson/net/Messenger.h" +#include "crimson/os/cyanstore/cyan_store.h" +#include "crimson/os/futurized_collection.h" +#include "crimson/osd/exceptions.h" +#include "crimson/osd/pg_meta.h" +#include "crimson/osd/pg_backend.h" +#include "crimson/osd/ops_executer.h" +#include "crimson/osd/osd_operations/osdop_params.h" +#include "crimson/osd/osd_operations/peering_event.h" +#include "crimson/osd/osd_operations/background_recovery.h" +#include "crimson/osd/osd_operations/snaptrim_event.h" +#include "crimson/osd/pg_recovery.h" +#include "crimson/osd/replicated_recovery_backend.h" +#include "crimson/osd/watch.h" + +using std::ostream; +using std::set; +using std::string; +using std::vector; + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_osd); + } +} + +namespace std::chrono { +std::ostream& operator<<(std::ostream& out, const signedspan& d) +{ + auto s = std::chrono::duration_cast<std::chrono::seconds>(d).count(); + auto ns = std::abs((d % 1s).count()); + fmt::print(out, "{}{}s", s, ns ? 
fmt::format(".{:0>9}", ns) : ""); + return out; +} +} + +template <typename T> +struct fmt::formatter<std::optional<T>> : fmt::formatter<T> { + template <typename FormatContext> + auto format(const std::optional<T>& v, FormatContext& ctx) const { + if (v.has_value()) { + return fmt::formatter<T>::format(*v, ctx); + } + return fmt::format_to(ctx.out(), "<null>"); + } +}; + +namespace crimson::osd { + +using crimson::common::local_conf; + +class RecoverablePredicate : public IsPGRecoverablePredicate { +public: + bool operator()(const set<pg_shard_t> &have) const override { + return !have.empty(); + } +}; + +class ReadablePredicate: public IsPGReadablePredicate { + pg_shard_t whoami; +public: + explicit ReadablePredicate(pg_shard_t whoami) : whoami(whoami) {} + bool operator()(const set<pg_shard_t> &have) const override { + return have.count(whoami); + } +}; + +PG::PG( + spg_t pgid, + pg_shard_t pg_shard, + crimson::os::CollectionRef coll_ref, + pg_pool_t&& pool, + std::string&& name, + cached_map_t osdmap, + ShardServices &shard_services, + ec_profile_t profile) + : pgid{pgid}, + pg_whoami{pg_shard}, + coll_ref{coll_ref}, + pgmeta_oid{pgid.make_pgmeta_oid()}, + osdmap_gate("PG::osdmap_gate"), + shard_services{shard_services}, + backend( + PGBackend::create( + pgid.pgid, + pg_shard, + pool, + coll_ref, + shard_services, + profile, + *this)), + recovery_backend( + std::make_unique<ReplicatedRecoveryBackend>( + *this, shard_services, coll_ref, backend.get())), + recovery_handler( + std::make_unique<PGRecovery>(this)), + peering_state( + shard_services.get_cct(), + pg_shard, + pgid, + PGPool( + osdmap, + pgid.pool(), + pool, + name), + osdmap, + this, + this), + obc_registry{ + local_conf()}, + obc_loader{ + obc_registry, + *backend.get(), + *this}, + osdriver( + &shard_services.get_store(), + coll_ref, + pgid.make_pgmeta_oid()), + snap_mapper( + this->shard_services.get_cct(), + &osdriver, + pgid.ps(), + pgid.get_split_bits(pool.get_pg_num()), + pgid.pool(), + pgid.shard), + wait_for_active_blocker(this) +{ + peering_state.set_backend_predicates( + new ReadablePredicate(pg_whoami), + new RecoverablePredicate()); + osdmap_gate.got_map(osdmap->get_epoch()); +} + +PG::~PG() {} + +void PG::check_blocklisted_watchers() +{ + logger().debug("{}", __func__); + obc_registry.for_each([this](ObjectContextRef obc) { + assert(obc); + for (const auto& [key, watch] : obc->watchers) { + assert(watch->get_pg() == this); + const auto& ea = watch->get_peer_addr(); + logger().debug("watch: Found {} cookie {}. 
Checking entity_add_t {}", + watch->get_entity(), watch->get_cookie(), ea); + if (get_osdmap()->is_blocklisted(ea)) { + logger().info("watch: Found blocklisted watcher for {}", ea); + watch->do_watch_timeout(); + } + } + }); +} + +bool PG::try_flush_or_schedule_async() { + logger().debug("PG::try_flush_or_schedule_async: flush ..."); + (void)shard_services.get_store().flush( + coll_ref + ).then( + [this, epoch=get_osdmap_epoch()]() { + return shard_services.start_operation<LocalPeeringEvent>( + this, + pg_whoami, + pgid, + epoch, + epoch, + PeeringState::IntervalFlush()); + }); + return false; +} + +void PG::publish_stats_to_osd() +{ + if (!is_primary()) + return; + if (auto new_pg_stats = peering_state.prepare_stats_for_publish( + pg_stats, + object_stat_collection_t()); + new_pg_stats.has_value()) { + pg_stats = std::move(new_pg_stats); + } +} + +void PG::clear_publish_stats() +{ + pg_stats.reset(); +} + +pg_stat_t PG::get_stats() const +{ + return pg_stats.value_or(pg_stat_t{}); +} + +void PG::queue_check_readable(epoch_t last_peering_reset, ceph::timespan delay) +{ + // handle the peering event in the background + logger().debug( + "{}: PG::queue_check_readable lpr: {}, delay: {}", + *this, last_peering_reset, delay); + check_readable_timer.cancel(); + check_readable_timer.set_callback([last_peering_reset, this] { + logger().debug( + "{}: PG::queue_check_readable callback lpr: {}", + *this, last_peering_reset); + (void) shard_services.start_operation<LocalPeeringEvent>( + this, + pg_whoami, + pgid, + last_peering_reset, + last_peering_reset, + PeeringState::CheckReadable{}); + }); + check_readable_timer.arm( + std::chrono::duration_cast<seastar::lowres_clock::duration>(delay)); +} + +void PG::recheck_readable() +{ + bool changed = false; + const auto mnow = shard_services.get_mnow(); + if (peering_state.state_test(PG_STATE_WAIT)) { + auto prior_readable_until_ub = peering_state.get_prior_readable_until_ub(); + if (mnow < prior_readable_until_ub) { + logger().info( + "{}: {} will wait (mnow {} < prior_readable_until_ub {})", + *this, __func__, mnow, prior_readable_until_ub); + queue_check_readable( + peering_state.get_last_peering_reset(), + prior_readable_until_ub - mnow); + } else { + logger().info( + "{}:{} no longer wait (mnow {} >= prior_readable_until_ub {})", + *this, __func__, mnow, prior_readable_until_ub); + peering_state.state_clear(PG_STATE_WAIT); + peering_state.clear_prior_readable_until_ub(); + changed = true; + } + } + if (peering_state.state_test(PG_STATE_LAGGY)) { + auto readable_until = peering_state.get_readable_until(); + if (readable_until == readable_until.zero()) { + logger().info( + "{}:{} still laggy (mnow {}, readable_until zero)", + *this, __func__, mnow); + } else if (mnow >= readable_until) { + logger().info( + "{}:{} still laggy (mnow {} >= readable_until {})", + *this, __func__, mnow, readable_until); + } else { + logger().info( + "{}:{} no longer laggy (mnow {} < readable_until {})", + *this, __func__, mnow, readable_until); + peering_state.state_clear(PG_STATE_LAGGY); + changed = true; + } + } + if (changed) { + publish_stats_to_osd(); + if (!peering_state.state_test(PG_STATE_WAIT) && + !peering_state.state_test(PG_STATE_LAGGY)) { + // TODO: requeue ops waiting for readable + } + } +} + +unsigned PG::get_target_pg_log_entries() const +{ + const unsigned local_num_pgs = shard_services.get_num_local_pgs(); + const unsigned local_target = + local_conf().get_val<uint64_t>("osd_target_pg_log_entries_per_osd") / + seastar::smp::count; + const unsigned 
min_pg_log_entries = + local_conf().get_val<uint64_t>("osd_min_pg_log_entries"); + if (local_num_pgs > 0 && local_target > 0) { + // target an even spread of our budgeted log entries across all + // PGs. note that while we only get to control the entry count + // for primary PGs, we'll normally be responsible for a mix of + // primary and replica PGs (for the same pool(s) even), so this + // will work out. + const unsigned max_pg_log_entries = + local_conf().get_val<uint64_t>("osd_max_pg_log_entries"); + return std::clamp(local_target / local_num_pgs, + min_pg_log_entries, + max_pg_log_entries); + } else { + // fall back to a per-pg value. + return min_pg_log_entries; + } +} + +void PG::on_removal(ceph::os::Transaction &t) { + t.register_on_commit( + new LambdaContext( + [this](int r) { + ceph_assert(r == 0); + (void)shard_services.start_operation<LocalPeeringEvent>( + this, pg_whoami, pgid, float(0.001), get_osdmap_epoch(), + get_osdmap_epoch(), PeeringState::DeleteSome()); + })); +} + +void PG::on_activate(interval_set<snapid_t> snaps) +{ + logger().debug("{}: {} snaps={}", *this, __func__, snaps); + snap_trimq = std::move(snaps); + projected_last_update = peering_state.get_info().last_update; +} + +void PG::on_activate_complete() +{ + wait_for_active_blocker.unblock(); + + if (peering_state.needs_recovery()) { + logger().info("{}: requesting recovery", + __func__); + (void) shard_services.start_operation<LocalPeeringEvent>( + this, + pg_whoami, + pgid, + float(0.001), + get_osdmap_epoch(), + get_osdmap_epoch(), + PeeringState::DoRecovery{}); + } else if (peering_state.needs_backfill()) { + logger().info("{}: requesting backfill", + __func__); + (void) shard_services.start_operation<LocalPeeringEvent>( + this, + pg_whoami, + pgid, + float(0.001), + get_osdmap_epoch(), + get_osdmap_epoch(), + PeeringState::RequestBackfill{}); + } else { + logger().debug("{}: no need to recover or backfill, AllReplicasRecovered", + " for pg: {}", __func__, pgid); + (void) shard_services.start_operation<LocalPeeringEvent>( + this, + pg_whoami, + pgid, + float(0.001), + get_osdmap_epoch(), + get_osdmap_epoch(), + PeeringState::AllReplicasRecovered{}); + } + publish_stats_to_osd(); +} + +void PG::prepare_write(pg_info_t &info, + pg_info_t &last_written_info, + PastIntervals &past_intervals, + PGLog &pglog, + bool dirty_info, + bool dirty_big_info, + bool need_write_epoch, + ceph::os::Transaction &t) +{ + std::map<string,bufferlist> km; + std::string key_to_remove; + if (dirty_big_info || dirty_info) { + int ret = prepare_info_keymap( + shard_services.get_cct(), + &km, + &key_to_remove, + get_osdmap_epoch(), + info, + last_written_info, + past_intervals, + dirty_big_info, + need_write_epoch, + true, + nullptr, + this); + ceph_assert(ret == 0); + } + pglog.write_log_and_missing( + t, &km, coll_ref->get_cid(), pgmeta_oid, + peering_state.get_pgpool().info.require_rollback()); + if (!km.empty()) { + t.omap_setkeys(coll_ref->get_cid(), pgmeta_oid, km); + } + if (!key_to_remove.empty()) { + t.omap_rmkey(coll_ref->get_cid(), pgmeta_oid, key_to_remove); + } +} + +std::pair<ghobject_t, bool> +PG::do_delete_work(ceph::os::Transaction &t, ghobject_t _next) +{ + logger().info("removing pg {}", pgid); + auto fut = interruptor::make_interruptible( + shard_services.get_store().list_objects( + coll_ref, + _next, + ghobject_t::get_max(), + local_conf()->osd_target_transaction_size)); + + auto [objs_to_rm, next] = fut.get(); + if (objs_to_rm.empty()) { + logger().info("all objs removed, removing coll for {}", pgid); + 
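+    // all user objects are gone (pgmeta_oid is skipped in the listing branch
+    // below); remove the pgmeta object and the collection itself, then
+    // deregister the PG from the shard once the transaction commits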
t.remove(coll_ref->get_cid(), pgmeta_oid); + t.remove_collection(coll_ref->get_cid()); + (void) shard_services.get_store().do_transaction( + coll_ref, std::move(t)).then([this] { + return shard_services.remove_pg(pgid); + }); + return {next, false}; + } else { + for (auto &obj : objs_to_rm) { + if (obj == pgmeta_oid) { + continue; + } + logger().trace("pg {}, removing obj {}", pgid, obj); + t.remove(coll_ref->get_cid(), obj); + } + t.register_on_commit( + new LambdaContext([this](int r) { + ceph_assert(r == 0); + logger().trace("triggering more pg delete {}", pgid); + (void) shard_services.start_operation<LocalPeeringEvent>( + this, + pg_whoami, + pgid, + float(0.001), + get_osdmap_epoch(), + get_osdmap_epoch(), + PeeringState::DeleteSome{}); + })); + return {next, true}; + } +} + +Context *PG::on_clean() +{ + // Not needed yet (will be needed for IO unblocking) + return nullptr; +} + +void PG::on_active_actmap() +{ + logger().debug("{}: {} snap_trimq={}", *this, __func__, snap_trimq); + peering_state.state_clear(PG_STATE_SNAPTRIM_ERROR); + // loops until snap_trimq is empty or SNAPTRIM_ERROR. + std::ignore = seastar::do_until( + [this] { return snap_trimq.empty() + || peering_state.state_test(PG_STATE_SNAPTRIM_ERROR); + }, + [this] { + peering_state.state_set(PG_STATE_SNAPTRIM); + publish_stats_to_osd(); + const auto to_trim = snap_trimq.range_start(); + snap_trimq.erase(to_trim); + const auto needs_pause = !snap_trimq.empty(); + return seastar::repeat([to_trim, needs_pause, this] { + logger().debug("{}: going to start SnapTrimEvent, to_trim={}", + *this, to_trim); + return shard_services.start_operation<SnapTrimEvent>( + this, + snap_mapper, + to_trim, + needs_pause + ).second.handle_error( + crimson::ct_error::enoent::handle([this] { + logger().error("{}: ENOENT saw, trimming stopped", *this); + peering_state.state_set(PG_STATE_SNAPTRIM_ERROR); + publish_stats_to_osd(); + return seastar::make_ready_future<seastar::stop_iteration>( + seastar::stop_iteration::yes); + }), crimson::ct_error::eagain::handle([this] { + logger().info("{}: EAGAIN saw, trimming restarted", *this); + return seastar::make_ready_future<seastar::stop_iteration>( + seastar::stop_iteration::no); + }) + ); + }).then([this, trimmed=to_trim] { + logger().debug("{}: trimmed snap={}", *this, trimmed); + }); + }).finally([this] { + logger().debug("{}: PG::on_active_actmap() finished trimming", + *this); + peering_state.state_clear(PG_STATE_SNAPTRIM); + peering_state.state_clear(PG_STATE_SNAPTRIM_ERROR); + publish_stats_to_osd(); + }); +} + +void PG::on_active_advmap(const OSDMapRef &osdmap) +{ + const auto new_removed_snaps = osdmap->get_new_removed_snaps(); + if (auto it = new_removed_snaps.find(get_pgid().pool()); + it != new_removed_snaps.end()) { + bool bad = false; + for (auto j : it->second) { + if (snap_trimq.intersects(j.first, j.second)) { + decltype(snap_trimq) added, overlap; + added.insert(j.first, j.second); + overlap.intersection_of(snap_trimq, added); + logger().error("{}: {} removed_snaps already contains {}", + *this, __func__, overlap); + bad = true; + snap_trimq.union_of(added); + } else { + snap_trimq.insert(j.first, j.second); + } + } + logger().info("{}: {} new removed snaps {}, snap_trimq now{}", + *this, __func__, it->second, snap_trimq); + assert(!bad || !local_conf().get_val<bool>("osd_debug_verify_cached_snaps")); + } +} + +void PG::scrub_requested(scrub_level_t scrub_level, scrub_type_t scrub_type) +{ + // TODO: should update the stats upon finishing the scrub + 
peering_state.update_stats([scrub_level, this](auto& history, auto& stats) { + const utime_t now = ceph_clock_now(); + history.last_scrub = peering_state.get_info().last_update; + history.last_scrub_stamp = now; + history.last_clean_scrub_stamp = now; + if (scrub_level == scrub_level_t::deep) { + history.last_deep_scrub = history.last_scrub; + history.last_deep_scrub_stamp = now; + } + // yes, please publish the stats + return true; + }); +} + +void PG::log_state_enter(const char *state) { + logger().info("Entering state: {}", state); +} + +void PG::log_state_exit( + const char *state_name, utime_t enter_time, + uint64_t events, utime_t event_dur) { + logger().info( + "Exiting state: {}, entered at {}, {} spent on {} events", + state_name, + enter_time, + event_dur, + events); +} + +ceph::signedspan PG::get_mnow() const +{ + return shard_services.get_mnow(); +} + +HeartbeatStampsRef PG::get_hb_stamps(int peer) +{ + return shard_services.get_hb_stamps(peer); +} + +void PG::schedule_renew_lease(epoch_t last_peering_reset, ceph::timespan delay) +{ + // handle the peering event in the background + renew_lease_timer.cancel(); + renew_lease_timer.set_callback([last_peering_reset, this] { + (void) shard_services.start_operation<LocalPeeringEvent>( + this, + pg_whoami, + pgid, + last_peering_reset, + last_peering_reset, + RenewLease{}); + }); + renew_lease_timer.arm( + std::chrono::duration_cast<seastar::lowres_clock::duration>(delay)); +} + + +void PG::init( + int role, + const vector<int>& newup, int new_up_primary, + const vector<int>& newacting, int new_acting_primary, + const pg_history_t& history, + const PastIntervals& pi, + ObjectStore::Transaction &t) +{ + peering_state.init( + role, newup, new_up_primary, newacting, + new_acting_primary, history, pi, t); +} + +seastar::future<> PG::read_state(crimson::os::FuturizedStore::Shard* store) +{ + if (__builtin_expect(stopping, false)) { + return seastar::make_exception_future<>( + crimson::common::system_shutdown_exception()); + } + + return seastar::do_with(PGMeta(*store, pgid), [] (auto& pg_meta) { + return pg_meta.load(); + }).then([this, store](auto&& ret) { + auto [pg_info, past_intervals] = std::move(ret); + return peering_state.init_from_disk_state( + std::move(pg_info), + std::move(past_intervals), + [this, store] (PGLog &pglog) { + return pglog.read_log_and_missing_crimson( + *store, + coll_ref, + peering_state.get_info(), + pgmeta_oid); + }); + }).then([this]() { + int primary, up_primary; + vector<int> acting, up; + peering_state.get_osdmap()->pg_to_up_acting_osds( + pgid.pgid, &up, &up_primary, &acting, &primary); + peering_state.init_primary_up_acting( + up, + acting, + up_primary, + primary); + int rr = OSDMap::calc_pg_role(pg_whoami, acting); + peering_state.set_role(rr); + + epoch_t epoch = get_osdmap_epoch(); + (void) shard_services.start_operation<LocalPeeringEvent>( + this, + pg_whoami, + pgid, + epoch, + epoch, + PeeringState::Initialize()); + + return seastar::now(); + }); +} + +PG::interruptible_future<> PG::do_peering_event( + PGPeeringEvent& evt, PeeringCtx &rctx) +{ + if (peering_state.pg_has_reset_since(evt.get_epoch_requested()) || + peering_state.pg_has_reset_since(evt.get_epoch_sent())) { + logger().debug("{} ignoring {} -- pg has reset", __func__, evt.get_desc()); + return interruptor::now(); + } else { + logger().debug("{} handling {} for pg: {}", __func__, evt.get_desc(), pgid); + // all peering event handling needs to be run in a dedicated seastar::thread, + // so that event processing can involve I/O reqs 
freely, for example: PG::on_removal, + // PG::on_new_interval + return interruptor::async([this, &evt, &rctx] { + peering_state.handle_event( + evt.get_event(), + &rctx); + peering_state.write_if_dirty(rctx.transaction); + }); + } +} + +seastar::future<> PG::handle_advance_map( + cached_map_t next_map, PeeringCtx &rctx) +{ + return seastar::async([this, next_map=std::move(next_map), &rctx] { + vector<int> newup, newacting; + int up_primary, acting_primary; + next_map->pg_to_up_acting_osds( + pgid.pgid, + &newup, &up_primary, + &newacting, &acting_primary); + peering_state.advance_map( + next_map, + peering_state.get_osdmap(), + newup, + up_primary, + newacting, + acting_primary, + rctx); + osdmap_gate.got_map(next_map->get_epoch()); + }); +} + +seastar::future<> PG::handle_activate_map(PeeringCtx &rctx) +{ + return seastar::async([this, &rctx] { + peering_state.activate_map(rctx); + }); +} + +seastar::future<> PG::handle_initialize(PeeringCtx &rctx) +{ + return seastar::async([this, &rctx] { + peering_state.handle_event(PeeringState::Initialize{}, &rctx); + }); +} + + +void PG::print(ostream& out) const +{ + out << peering_state << " "; +} + +void PG::dump_primary(Formatter* f) +{ + peering_state.dump_peering_state(f); + + f->open_array_section("recovery_state"); + PeeringState::QueryState q(f); + peering_state.handle_event(q, 0); + f->close_section(); + + // TODO: snap_trimq + // TODO: scrubber state + // TODO: agent state +} + +std::ostream& operator<<(std::ostream& os, const PG& pg) +{ + os << " pg_epoch " << pg.get_osdmap_epoch() << " "; + pg.print(os); + return os; +} + +std::tuple<PG::interruptible_future<>, + PG::interruptible_future<>> +PG::submit_transaction( + ObjectContextRef&& obc, + ceph::os::Transaction&& txn, + osd_op_params_t&& osd_op_p, + std::vector<pg_log_entry_t>&& log_entries) +{ + if (__builtin_expect(stopping, false)) { + return {seastar::make_exception_future<>( + crimson::common::system_shutdown_exception()), + seastar::now()}; + } + + epoch_t map_epoch = get_osdmap_epoch(); + ceph_assert(!has_reset_since(osd_op_p.at_version.epoch)); + + peering_state.pre_submit_op(obc->obs.oi.soid, log_entries, osd_op_p.at_version); + peering_state.append_log_with_trim_to_updated(std::move(log_entries), osd_op_p.at_version, + txn, true, false); + + auto [submitted, all_completed] = backend->mutate_object( + peering_state.get_acting_recovery_backfill(), + std::move(obc), + std::move(txn), + std::move(osd_op_p), + peering_state.get_last_peering_reset(), + map_epoch, + std::move(log_entries)); + return std::make_tuple(std::move(submitted), all_completed.then_interruptible( + [this, last_complete=peering_state.get_info().last_complete, + at_version=osd_op_p.at_version](auto acked) { + for (const auto& peer : acked) { + peering_state.update_peer_last_complete_ondisk( + peer.shard, peer.last_complete_ondisk); + } + peering_state.complete_write(at_version, last_complete); + return seastar::now(); + })); +} + +PG::interruptible_future<> PG::repair_object( + const hobject_t& oid, + eversion_t& v) +{ + // see also PrimaryLogPG::rep_repair_primary_object() + assert(is_primary()); + logger().debug("{}: {} peers osd.{}", __func__, oid, get_acting_recovery_backfill()); + // Add object to PG's missing set if it isn't there already + assert(!get_local_missing().is_missing(oid)); + peering_state.force_object_missing(pg_whoami, oid, v); + auto [op, fut] = get_shard_services().start_operation<UrgentRecovery>( + oid, v, this, get_shard_services(), get_osdmap_epoch()); + return std::move(fut); +} + 
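submit_transaction() above hands back two futures rather than one: `submitted` resolves once the mutation has been sequenced locally, so the next op can be dispatched in order, while `all_completed` resolves only after every acting replica has acknowledged and the peer last_complete bookkeeping is done. A minimal sketch of that shape, assuming plain std::thread/std::promise in place of seastar futures; `mutation_result` and `mutate_object` are illustrative names, not Crimson API:

// sketch only: the "two futures per mutation" shape used by submit_transaction()
#include <chrono>
#include <future>
#include <iostream>
#include <thread>
#include <utility>

struct mutation_result {
  std::future<void> submitted;      // local ordering point reached
  std::future<void> all_completed;  // every replica acknowledged
};

mutation_result mutate_object(int num_replicas) {
  std::promise<void> submitted_p, completed_p;
  mutation_result r{submitted_p.get_future(), completed_p.get_future()};
  std::thread([sp = std::move(submitted_p), cp = std::move(completed_p),
               num_replicas]() mutable {
    sp.set_value();                 // the op is now sequenced locally
    for (int i = 0; i < num_replicas; ++i) {
      // stand-in for one replica round trip
      std::this_thread::sleep_for(std::chrono::milliseconds(10));
    }
    cp.set_value();                 // last ack arrived
  }).detach();
  return r;
}

int main() {
  auto res = mutate_object(2);
  res.submitted.get();              // ok to dispatch the next op in order
  std::cout << "submitted\n";
  res.all_completed.get();          // ok to reply to the client
  std::cout << "all replicas acked\n";
}

do_osd_ops_execute() below relies on exactly this split: it forwards the submitted future so ordering is preserved, and chains the client reply (or error handling) onto the all_completed future.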
+template <class Ret, class SuccessFunc, class FailureFunc> +PG::do_osd_ops_iertr::future<PG::pg_rep_op_fut_t<Ret>> +PG::do_osd_ops_execute( + seastar::lw_shared_ptr<OpsExecuter> ox, + std::vector<OSDOp>& ops, + SuccessFunc&& success_func, + FailureFunc&& failure_func) +{ + assert(ox); + auto rollbacker = ox->create_rollbacker([this] (auto& obc) { + return obc_loader.reload_obc(obc).handle_error_interruptible( + load_obc_ertr::assert_all{"can't live with object state messed up"}); + }); + auto failure_func_ptr = seastar::make_lw_shared(std::move(failure_func)); + return interruptor::do_for_each(ops, [ox](OSDOp& osd_op) { + logger().debug( + "do_osd_ops_execute: object {} - handling op {}", + ox->get_target(), + ceph_osd_op_name(osd_op.op.op)); + return ox->execute_op(osd_op); + }).safe_then_interruptible([this, ox, &ops] { + logger().debug( + "do_osd_ops_execute: object {} all operations successful", + ox->get_target()); + // check for full + if ((ox->delta_stats.num_bytes > 0 || + ox->delta_stats.num_objects > 0) && + get_pgpool().info.has_flag(pg_pool_t::FLAG_FULL)) { + const auto& m = ox->get_message(); + if (m.get_reqid().name.is_mds() || // FIXME: ignore MDS for now + m.has_flag(CEPH_OSD_FLAG_FULL_FORCE)) { + logger().info(" full, but proceeding due to FULL_FORCE or MDS"); + } else if (m.has_flag(CEPH_OSD_FLAG_FULL_TRY)) { + // they tried, they failed. + logger().info(" full, replying to FULL_TRY op"); + if (get_pgpool().info.has_flag(pg_pool_t::FLAG_FULL_QUOTA)) + return interruptor::make_ready_future<OpsExecuter::rep_op_fut_tuple>( + seastar::now(), + OpsExecuter::osd_op_ierrorator::future<>( + crimson::ct_error::edquot::make())); + else + return interruptor::make_ready_future<OpsExecuter::rep_op_fut_tuple>( + seastar::now(), + OpsExecuter::osd_op_ierrorator::future<>( + crimson::ct_error::enospc::make())); + } else { + // drop request + logger().info(" full, dropping request (bad client)"); + return interruptor::make_ready_future<OpsExecuter::rep_op_fut_tuple>( + seastar::now(), + OpsExecuter::osd_op_ierrorator::future<>( + crimson::ct_error::eagain::make())); + } + } + return std::move(*ox).flush_changes_n_do_ops_effects( + ops, + snap_mapper, + osdriver, + [this] (auto&& txn, + auto&& obc, + auto&& osd_op_p, + auto&& log_entries) { + logger().debug( + "do_osd_ops_execute: object {} submitting txn", + obc->get_oid()); + return submit_transaction( + std::move(obc), + std::move(txn), + std::move(osd_op_p), + std::move(log_entries)); + }); + }).safe_then_unpack_interruptible( + [success_func=std::move(success_func), rollbacker, this, failure_func_ptr] + (auto submitted_fut, auto all_completed_fut) mutable { + return PG::do_osd_ops_iertr::make_ready_future<pg_rep_op_fut_t<Ret>>( + std::move(submitted_fut), + all_completed_fut.safe_then_interruptible_tuple( + std::move(success_func), + crimson::ct_error::object_corrupted::handle( + [rollbacker, this] (const std::error_code& e) mutable { + // this is a path for EIO. it's special because we want to fix the obejct + // and try again. that is, the layer above `PG::do_osd_ops` is supposed to + // restart the execution. 
+ return rollbacker.rollback_obc_if_modified(e).then_interruptible( + [obc=rollbacker.get_obc(), this] { + return repair_object(obc->obs.oi.soid, + obc->obs.oi.version).then_interruptible([] { + return do_osd_ops_iertr::future<Ret>{crimson::ct_error::eagain::make()}; + }); + }); + }), OpsExecuter::osd_op_errorator::all_same_way( + [rollbacker, failure_func_ptr] + (const std::error_code& e) mutable { + return rollbacker.rollback_obc_if_modified(e).then_interruptible( + [e, failure_func_ptr] { + return (*failure_func_ptr)(e); + }); + }) + ) + ); + }, OpsExecuter::osd_op_errorator::all_same_way( + [rollbacker, failure_func_ptr] + (const std::error_code& e) mutable { + return PG::do_osd_ops_iertr::make_ready_future<pg_rep_op_fut_t<Ret>>( + seastar::now(), + e.value() == ENOENT ? (*failure_func_ptr)(e) : + rollbacker.rollback_obc_if_modified(e).then_interruptible( + [e, failure_func_ptr] { + return (*failure_func_ptr)(e); + })); + })); +} +seastar::future<> PG::submit_error_log( + Ref<MOSDOp> m, + const OpInfo &op_info, + ObjectContextRef obc, + const std::error_code e, + ceph_tid_t rep_tid, + eversion_t &version) +{ + const osd_reqid_t &reqid = m->get_reqid(); + mempool::osd_pglog::list<pg_log_entry_t> log_entries; + log_entries.push_back(pg_log_entry_t(pg_log_entry_t::ERROR, + obc->obs.oi.soid, + next_version(), + eversion_t(), 0, + reqid, utime_t(), + -e.value())); + if (op_info.allows_returnvec()) { + log_entries.back().set_op_returns(m->ops); + } + ceph_assert(is_primary()); + if (!log_entries.empty()) { + ceph_assert(log_entries.rbegin()->version >= projected_last_update); + version = projected_last_update = log_entries.rbegin()->version; + } + ceph::os::Transaction t; + peering_state.merge_new_log_entries( + log_entries, t, peering_state.get_pg_trim_to(), + peering_state.get_min_last_complete_ondisk()); + + set<pg_shard_t> waiting_on; + for (auto &i : get_acting_recovery_backfill()) { + pg_shard_t peer(i); + if (peer == pg_whoami) continue; + ceph_assert(peering_state.get_peer_missing().count(peer)); + ceph_assert(peering_state.has_peer_info(peer)); + auto log_m = crimson::make_message<MOSDPGUpdateLogMissing>( + log_entries, + spg_t(peering_state.get_info().pgid.pgid, i.shard), + pg_whoami.shard, + get_osdmap_epoch(), + get_last_peering_reset(), + rep_tid, + peering_state.get_pg_trim_to(), + peering_state.get_min_last_complete_ondisk()); + send_cluster_message(peer.osd, std::move(log_m), get_osdmap_epoch()); + waiting_on.insert(peer); + } + waiting_on.insert(pg_whoami); + log_entry_update_waiting_on.insert( + std::make_pair(rep_tid, log_update_t{std::move(waiting_on)})); + return shard_services.get_store().do_transaction( + get_collection_ref(), std::move(t)) + .then([this] { + peering_state.update_trim_to(); + return seastar::now(); + }); +} + +PG::do_osd_ops_iertr::future<PG::pg_rep_op_fut_t<MURef<MOSDOpReply>>> +PG::do_osd_ops( + Ref<MOSDOp> m, + crimson::net::ConnectionRef conn, + ObjectContextRef obc, + const OpInfo &op_info, + const SnapContext& snapc) +{ + if (__builtin_expect(stopping, false)) { + throw crimson::common::system_shutdown_exception(); + } + return do_osd_ops_execute<MURef<MOSDOpReply>>( + seastar::make_lw_shared<OpsExecuter>( + Ref<PG>{this}, obc, op_info, *m, conn, snapc), + m->ops, + [this, m, obc, may_write = op_info.may_write(), + may_read = op_info.may_read(), rvec = op_info.allows_returnvec()] { + // TODO: should stop at the first op which returns a negative retval, + // cmpext uses it for returning the index of first unmatched byte + int result = 
m->ops.empty() ? 0 : m->ops.back().rval.code; + if (may_read && result >= 0) { + for (auto &osdop : m->ops) { + if (osdop.rval < 0 && !(osdop.op.flags & CEPH_OSD_OP_FLAG_FAILOK)) { + result = osdop.rval.code; + break; + } + } + } else if (result > 0 && may_write && !rvec) { + result = 0; + } else if (result < 0 && (m->ops.empty() ? + 0 : m->ops.back().op.flags & CEPH_OSD_OP_FLAG_FAILOK)) { + result = 0; + } + auto reply = crimson::make_message<MOSDOpReply>(m.get(), + result, + get_osdmap_epoch(), + 0, + false); + reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK); + logger().debug( + "do_osd_ops: {} - object {} sending reply", + *m, + m->get_hobj()); + if (obc->obs.exists) { + reply->set_reply_versions(peering_state.get_info().last_update, + obc->obs.oi.user_version); + } else { + reply->set_reply_versions(peering_state.get_info().last_update, + peering_state.get_info().last_user_version); + } + return do_osd_ops_iertr::make_ready_future<MURef<MOSDOpReply>>( + std::move(reply)); + }, + [m, &op_info, obc, this] (const std::error_code& e) { + return seastar::do_with(eversion_t(), [m, &op_info, obc, e, this](auto &version) { + auto fut = seastar::now(); + epoch_t epoch = get_osdmap_epoch(); + ceph_tid_t rep_tid = shard_services.get_tid(); + auto last_complete = peering_state.get_info().last_complete; + if (op_info.may_write()) { + fut = submit_error_log(m, op_info, obc, e, rep_tid, version); + } + return fut.then([m, e, epoch, &op_info, rep_tid, &version, last_complete, this] { + auto log_reply = [m, e, this] { + auto reply = crimson::make_message<MOSDOpReply>( + m.get(), -e.value(), get_osdmap_epoch(), 0, false); + if (m->ops.empty() ? 0 : + m->ops.back().op.flags & CEPH_OSD_OP_FLAG_FAILOK) { + reply->set_result(0); + } + // For all ops except for CMPEXT, the correct error value is encoded + // in e.value(). For CMPEXT, osdop.rval has the actual error value. 
+ if (e.value() == ct_error::cmp_fail_error_value) { + assert(!m->ops.empty()); + for (auto &osdop : m->ops) { + if (osdop.rval < 0) { + reply->set_result(osdop.rval); + break; + } + } + } + reply->set_enoent_reply_versions( + peering_state.get_info().last_update, + peering_state.get_info().last_user_version); + reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK); + return do_osd_ops_iertr::make_ready_future<MURef<MOSDOpReply>>( + std::move(reply)); + }; + + if (!peering_state.pg_has_reset_since(epoch) && op_info.may_write()) { + auto it = log_entry_update_waiting_on.find(rep_tid); + ceph_assert(it != log_entry_update_waiting_on.end()); + auto it2 = it->second.waiting_on.find(pg_whoami); + ceph_assert(it2 != it->second.waiting_on.end()); + it->second.waiting_on.erase(it2); + + if (it->second.waiting_on.empty()) { + log_entry_update_waiting_on.erase(it); + if (version != eversion_t()) { + peering_state.complete_write(version, last_complete); + } + return log_reply(); + } else { + return it->second.all_committed.get_shared_future() + .then([this, &version, last_complete, log_reply = std::move(log_reply)] { + if (version != eversion_t()) { + peering_state.complete_write(version, last_complete); + } + return log_reply(); + }); + } + } else { + return log_reply(); + } + }); + }); + }); +} + +PG::do_osd_ops_iertr::future<PG::pg_rep_op_fut_t<>> +PG::do_osd_ops( + ObjectContextRef obc, + std::vector<OSDOp>& ops, + const OpInfo &op_info, + const do_osd_ops_params_t &&msg_params, + do_osd_ops_success_func_t success_func, + do_osd_ops_failure_func_t failure_func) +{ + // This overload is generally used for internal client requests, + // use an empty SnapContext. + return seastar::do_with( + std::move(msg_params), + [=, this, &ops, &op_info](auto &msg_params) { + return do_osd_ops_execute<void>( + seastar::make_lw_shared<OpsExecuter>( + Ref<PG>{this}, + std::move(obc), + op_info, + msg_params, + msg_params.get_connection(), + SnapContext{} + ), + ops, + std::move(success_func), + std::move(failure_func)); + }); +} + +PG::interruptible_future<MURef<MOSDOpReply>> PG::do_pg_ops(Ref<MOSDOp> m) +{ + if (__builtin_expect(stopping, false)) { + throw crimson::common::system_shutdown_exception(); + } + + auto ox = std::make_unique<PgOpsExecuter>(std::as_const(*this), + std::as_const(*m)); + return interruptor::do_for_each(m->ops, [ox = ox.get()](OSDOp& osd_op) { + logger().debug("will be handling pg op {}", ceph_osd_op_name(osd_op.op.op)); + return ox->execute_op(osd_op); + }).then_interruptible([m, this, ox = std::move(ox)] { + auto reply = crimson::make_message<MOSDOpReply>(m.get(), 0, get_osdmap_epoch(), + CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK, + false); + reply->claim_op_out_data(m->ops); + reply->set_reply_versions(peering_state.get_info().last_update, + peering_state.get_info().last_user_version); + return seastar::make_ready_future<MURef<MOSDOpReply>>(std::move(reply)); + }).handle_exception_type_interruptible([=, this](const crimson::osd::error& e) { + auto reply = crimson::make_message<MOSDOpReply>( + m.get(), -e.code().value(), get_osdmap_epoch(), 0, false); + reply->set_enoent_reply_versions(peering_state.get_info().last_update, + peering_state.get_info().last_user_version); + return seastar::make_ready_future<MURef<MOSDOpReply>>(std::move(reply)); + }); +} + +hobject_t PG::get_oid(const hobject_t& hobj) +{ + return hobj.snap == CEPH_SNAPDIR ? 
hobj.get_head() : hobj; +} + +RWState::State PG::get_lock_type(const OpInfo &op_info) +{ + + if (op_info.rwordered() && op_info.may_read()) { + return RWState::RWEXCL; + } else if (op_info.rwordered()) { + return RWState::RWWRITE; + } else { + ceph_assert(op_info.may_read()); + return RWState::RWREAD; + } +} + +void PG::check_blocklisted_obc_watchers( + ObjectContextRef &obc) +{ + if (obc->watchers.empty()) { + for (auto &[src, winfo] : obc->obs.oi.watchers) { + auto watch = crimson::osd::Watch::create( + obc, winfo, src.second, this); + watch->disconnect(); + auto [it, emplaced] = obc->watchers.emplace(src, std::move(watch)); + assert(emplaced); + logger().debug("added watch for obj {}, client {}", + obc->get_oid(), src.second); + } + } +} + +PG::load_obc_iertr::future<> +PG::with_locked_obc(const hobject_t &hobj, + const OpInfo &op_info, + with_obc_func_t &&f) +{ + if (__builtin_expect(stopping, false)) { + throw crimson::common::system_shutdown_exception(); + } + const hobject_t oid = get_oid(hobj); + auto wrapper = [f=std::move(f), this](auto obc) { + check_blocklisted_obc_watchers(obc); + return f(obc); + }; + switch (get_lock_type(op_info)) { + case RWState::RWREAD: + return obc_loader.with_obc<RWState::RWREAD>(oid, std::move(wrapper)); + case RWState::RWWRITE: + return obc_loader.with_obc<RWState::RWWRITE>(oid, std::move(wrapper)); + case RWState::RWEXCL: + return obc_loader.with_obc<RWState::RWEXCL>(oid, std::move(wrapper)); + default: + ceph_abort(); + }; +} + +PG::interruptible_future<> PG::handle_rep_op(Ref<MOSDRepOp> req) +{ + if (__builtin_expect(stopping, false)) { + return seastar::make_exception_future<>( + crimson::common::system_shutdown_exception()); + } + + logger().debug("{}: {}", __func__, *req); + if (can_discard_replica_op(*req)) { + return seastar::now(); + } + + ceph::os::Transaction txn; + auto encoded_txn = req->get_data().cbegin(); + decode(txn, encoded_txn); + auto p = req->logbl.cbegin(); + std::vector<pg_log_entry_t> log_entries; + decode(log_entries, p); + log_operation(std::move(log_entries), + req->pg_trim_to, + req->version, + req->min_last_complete_ondisk, + !txn.empty(), + txn, + false); + logger().debug("PG::handle_rep_op: do_transaction..."); + return interruptor::make_interruptible(shard_services.get_store().do_transaction( + coll_ref, std::move(txn))).then_interruptible( + [req, lcod=peering_state.get_info().last_complete, this] { + peering_state.update_last_complete_ondisk(lcod); + const auto map_epoch = get_osdmap_epoch(); + auto reply = crimson::make_message<MOSDRepOpReply>( + req.get(), pg_whoami, 0, + map_epoch, req->get_min_epoch(), CEPH_OSD_FLAG_ONDISK); + reply->set_last_complete_ondisk(lcod); + return shard_services.send_to_osd(req->from.osd, std::move(reply), map_epoch); + }); +} + +void PG::log_operation( + std::vector<pg_log_entry_t>&& logv, + const eversion_t &trim_to, + const eversion_t &roll_forward_to, + const eversion_t &min_last_complete_ondisk, + bool transaction_applied, + ObjectStore::Transaction &txn, + bool async) { + logger().debug("{}", __func__); + if (is_primary()) { + ceph_assert(trim_to <= peering_state.get_last_update_ondisk()); + } + /* TODO: when we add snap mapper and projected log support, + * we'll likely want to update them here. + * + * See src/osd/PrimaryLogPG.h:log_operation for how classic + * handles these cases. 
+ */ +#if 0 + if (transaction_applied) { + //TODO: + //update_snap_map(logv, t); + } + auto last = logv.rbegin(); + if (is_primary() && last != logv.rend()) { + projected_log.skip_can_rollback_to_to_head(); + projected_log.trim(cct, last->version, nullptr, nullptr, nullptr); + } +#endif + if (!is_primary()) { // && !is_ec_pg() + replica_clear_repop_obc(logv); + } + peering_state.append_log(std::move(logv), + trim_to, + roll_forward_to, + min_last_complete_ondisk, + txn, + !txn.empty(), + false); +} + +void PG::replica_clear_repop_obc( + const std::vector<pg_log_entry_t> &logv) { + logger().debug("{} clearing {} entries", __func__, logv.size()); + for (auto &&e: logv) { + logger().debug(" {} get_object_boundary(from): {} " + " head version(to): {}", + e.soid, + e.soid.get_object_boundary(), + e.soid.get_head()); + /* Have to blast all clones, they share a snapset */ + obc_registry.clear_range( + e.soid.get_object_boundary(), e.soid.get_head()); + } +} + +void PG::handle_rep_op_reply(const MOSDRepOpReply& m) +{ + if (!can_discard_replica_op(m)) { + backend->got_rep_op_reply(m); + } +} + +PG::interruptible_future<> PG::do_update_log_missing( + Ref<MOSDPGUpdateLogMissing> m, + crimson::net::ConnectionRef conn) +{ + if (__builtin_expect(stopping, false)) { + return seastar::make_exception_future<>( + crimson::common::system_shutdown_exception()); + } + + ceph_assert(m->get_type() == MSG_OSD_PG_UPDATE_LOG_MISSING); + ObjectStore::Transaction t; + std::optional<eversion_t> op_trim_to, op_roll_forward_to; + if (m->pg_trim_to != eversion_t()) + op_trim_to = m->pg_trim_to; + if (m->pg_roll_forward_to != eversion_t()) + op_roll_forward_to = m->pg_roll_forward_to; + logger().debug("op_trim_to = {}, op_roll_forward_to = {}", + op_trim_to, op_roll_forward_to); + + peering_state.append_log_entries_update_missing( + m->entries, t, op_trim_to, op_roll_forward_to); + + return interruptor::make_interruptible(shard_services.get_store().do_transaction( + coll_ref, std::move(t))).then_interruptible( + [m, conn, lcod=peering_state.get_info().last_complete, this] { + if (!peering_state.pg_has_reset_since(m->get_epoch())) { + peering_state.update_last_complete_ondisk(lcod); + auto reply = + crimson::make_message<MOSDPGUpdateLogMissingReply>( + spg_t(peering_state.get_info().pgid.pgid, get_primary().shard), + pg_whoami.shard, + m->get_epoch(), + m->min_epoch, + m->get_tid(), + lcod); + reply->set_priority(CEPH_MSG_PRIO_HIGH); + return conn->send(std::move(reply)); + } + return seastar::now(); + }); +} + + +PG::interruptible_future<> PG::do_update_log_missing_reply( + Ref<MOSDPGUpdateLogMissingReply> m) +{ + logger().debug("{}: got reply from {}", __func__, m->get_from()); + + auto it = log_entry_update_waiting_on.find(m->get_tid()); + if (it != log_entry_update_waiting_on.end()) { + if (it->second.waiting_on.count(m->get_from())) { + it->second.waiting_on.erase(m->get_from()); + if (m->last_complete_ondisk != eversion_t()) { + peering_state.update_peer_last_complete_ondisk( + m->get_from(), m->last_complete_ondisk); + } + } else { + logger().error("{} : {} got reply {} from shard we are not waiting for ", + __func__, peering_state.get_info().pgid, *m, m->get_from()); + } + + if (it->second.waiting_on.empty()) { + it->second.all_committed.set_value(); + it->second.all_committed = {}; + log_entry_update_waiting_on.erase(it); + } + } else { + logger().error("{} : {} got reply {} on unknown tid {}", + __func__, peering_state.get_info().pgid, *m, m->get_tid()); + } + return seastar::now(); +} + +bool 
PG::old_peering_msg( + const epoch_t reply_epoch, + const epoch_t query_epoch) const +{ + if (const epoch_t lpr = peering_state.get_last_peering_reset(); + lpr > reply_epoch || lpr > query_epoch) { + logger().debug("{}: pg changed {} lpr {}, reply_epoch {}, query_epoch {}", + __func__, get_info().history, lpr, reply_epoch, query_epoch); + return true; + } + return false; +} + +bool PG::can_discard_replica_op(const Message& m, epoch_t m_map_epoch) const +{ + // if a repop is replied after a replica goes down in a new osdmap, and + // before the pg advances to this new osdmap, the repop replies before this + // repop can be discarded by that replica OSD, because the primary resets the + // connection to it when handling the new osdmap marking it down, and also + // resets the messenger session when the replica reconnects. to avoid the + // out-of-order replies, the messages from that replica should be discarded. + const auto osdmap = peering_state.get_osdmap(); + const int from_osd = m.get_source().num(); + if (osdmap->is_down(from_osd)) { + return true; + } + // Mostly, this overlaps with the old_peering_msg + // condition. An important exception is pushes + // sent by replicas not in the acting set, since + // if such a replica goes down it does not cause + // a new interval. + if (osdmap->get_down_at(from_osd) >= m_map_epoch) { + return true; + } + // same pg? + // if pg changes *at all*, we reset and repeer! + return old_peering_msg(m_map_epoch, m_map_epoch); +} + +seastar::future<> PG::stop() +{ + logger().info("PG {} {}", pgid, __func__); + stopping = true; + cancel_local_background_io_reservation(); + cancel_remote_recovery_reservation(); + check_readable_timer.cancel(); + renew_lease_timer.cancel(); + return osdmap_gate.stop().then([this] { + return wait_for_active_blocker.stop(); + }).then([this] { + return recovery_handler->stop(); + }).then([this] { + return recovery_backend->stop(); + }).then([this] { + return backend->stop(); + }); +} + +void PG::on_change(ceph::os::Transaction &t) { + logger().debug("{} {}:", *this, __func__); + context_registry_on_change(); + obc_loader.notify_on_change(is_primary()); + recovery_backend->on_peering_interval_change(t); + backend->on_actingset_changed(is_primary()); + wait_for_active_blocker.unblock(); + if (is_primary()) { + logger().debug("{} {}: requeueing", *this, __func__); + client_request_orderer.requeue(shard_services, this); + } else { + logger().debug("{} {}: dropping requests", *this, __func__); + client_request_orderer.clear_and_cancel(); + } +} + +void PG::context_registry_on_change() { + obc_registry.for_each([](ObjectContextRef obc) { + assert(obc); + for (auto j = obc->watchers.begin(); + j != obc->watchers.end(); + j = obc->watchers.erase(j)) { + j->second->discard_state(); + } + }); +} + +bool PG::can_discard_op(const MOSDOp& m) const { + if (m.get_map_epoch() < + peering_state.get_info().history.same_primary_since) { + logger().debug("{} changed after {} dropping {} ", + __func__ , m.get_map_epoch(), m); + return true; + } + + if ((m.get_flags() & (CEPH_OSD_FLAG_BALANCE_READS | + CEPH_OSD_FLAG_LOCALIZE_READS)) + && !is_primary() + && (m.get_map_epoch() < + peering_state.get_info().history.same_interval_since)) + { + // Note: the Objecter will resend on interval change without the primary + // changing if it actually sent to a replica. If the primary hasn't + // changed since the send epoch, we got it, and we're primary, it won't + // have resent even if the interval did change as it sent it to the primary + // (us). 
+ return true; + } + return __builtin_expect(m.get_map_epoch() + < peering_state.get_info().history.same_primary_since, false); +} + +bool PG::is_degraded_or_backfilling_object(const hobject_t& soid) const { + /* The conditions below may clear (on_local_recover, before we queue + * the transaction) before we actually requeue the degraded waiters + * in on_global_recover after the transaction completes. + */ + if (peering_state.get_pg_log().get_missing().get_items().count(soid)) + return true; + ceph_assert(!get_acting_recovery_backfill().empty()); + for (auto& peer : get_acting_recovery_backfill()) { + if (peer == get_primary()) continue; + auto peer_missing_entry = peering_state.get_peer_missing().find(peer); + // If an object is missing on an async_recovery_target, return false. + // This will not block the op and the object is async recovered later. + if (peer_missing_entry != peering_state.get_peer_missing().end() && + peer_missing_entry->second.get_items().count(soid)) { + return true; + } + // Object is degraded if after last_backfill AND + // we are backfilling it + if (is_backfill_target(peer) && + peering_state.get_peer_info(peer).last_backfill <= soid && + recovery_handler->backfill_state && + recovery_handler->backfill_state->get_last_backfill_started() >= soid && + recovery_backend->is_recovering(soid)) { + return true; + } + } + return false; +} + +PG::interruptible_future<std::optional<PG::complete_op_t>> +PG::already_complete(const osd_reqid_t& reqid) +{ + eversion_t version; + version_t user_version; + int ret; + std::vector<pg_log_op_return_item_t> op_returns; + + if (peering_state.get_pg_log().get_log().get_request( + reqid, &version, &user_version, &ret, &op_returns)) { + complete_op_t dupinfo{ + user_version, + version, + ret}; + return backend->request_committed(reqid, version).then([dupinfo] { + return seastar::make_ready_future<std::optional<complete_op_t>>(dupinfo); + }); + } else { + return seastar::make_ready_future<std::optional<complete_op_t>>(std::nullopt); + } +} + +} diff --git a/src/crimson/osd/pg.h b/src/crimson/osd/pg.h new file mode 100644 index 000000000..d96db2e20 --- /dev/null +++ b/src/crimson/osd/pg.h @@ -0,0 +1,833 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab expandtab + +#pragma once + +#include <memory> +#include <optional> +#include <boost/smart_ptr/intrusive_ref_counter.hpp> +#include <seastar/core/future.hh> +#include <seastar/core/shared_future.hh> + +#include "common/dout.h" +#include "include/interval_set.h" +#include "crimson/net/Fwd.h" +#include "messages/MOSDRepOpReply.h" +#include "messages/MOSDOpReply.h" +#include "os/Transaction.h" +#include "osd/osd_types.h" +#include "osd/osd_types_fmt.h" +#include "crimson/osd/object_context.h" +#include "osd/PeeringState.h" +#include "osd/SnapMapper.h" + +#include "crimson/common/interruptible_future.h" +#include "crimson/common/type_helpers.h" +#include "crimson/os/futurized_collection.h" +#include "crimson/osd/backfill_state.h" +#include "crimson/osd/pg_interval_interrupt_condition.h" +#include "crimson/osd/ops_executer.h" +#include "crimson/osd/osd_operations/client_request.h" +#include "crimson/osd/osd_operations/logmissing_request.h" +#include "crimson/osd/osd_operations/logmissing_request_reply.h" +#include "crimson/osd/osd_operations/peering_event.h" +#include "crimson/osd/osd_operations/replicated_request.h" +#include "crimson/osd/shard_services.h" +#include "crimson/osd/osdmap_gate.h" +#include 
"crimson/osd/pg_activation_blocker.h" +#include "crimson/osd/pg_recovery.h" +#include "crimson/osd/pg_recovery_listener.h" +#include "crimson/osd/recovery_backend.h" +#include "crimson/osd/object_context_loader.h" + +class MQuery; +class OSDMap; +class PGBackend; +class PGPeeringEvent; +class osd_op_params_t; + +namespace recovery { + class Context; +} + +namespace crimson::net { + class Messenger; +} + +namespace crimson::os { + class FuturizedStore; +} + +namespace crimson::osd { +class OpsExecuter; +class BackfillRecovery; +class SnapTrimEvent; + +class PG : public boost::intrusive_ref_counter< + PG, + boost::thread_unsafe_counter>, + public PGRecoveryListener, + PeeringState::PeeringListener, + DoutPrefixProvider +{ + using ec_profile_t = std::map<std::string,std::string>; + using cached_map_t = OSDMapService::cached_map_t; + + ClientRequest::PGPipeline request_pg_pipeline; + PGPeeringPipeline peering_request_pg_pipeline; + + ClientRequest::Orderer client_request_orderer; + + spg_t pgid; + pg_shard_t pg_whoami; + crimson::os::CollectionRef coll_ref; + ghobject_t pgmeta_oid; + + seastar::timer<seastar::lowres_clock> check_readable_timer; + seastar::timer<seastar::lowres_clock> renew_lease_timer; + +public: + template <typename T = void> + using interruptible_future = + ::crimson::interruptible::interruptible_future< + ::crimson::osd::IOInterruptCondition, T>; + + PG(spg_t pgid, + pg_shard_t pg_shard, + crimson::os::CollectionRef coll_ref, + pg_pool_t&& pool, + std::string&& name, + cached_map_t osdmap, + ShardServices &shard_services, + ec_profile_t profile); + + ~PG(); + + const pg_shard_t& get_pg_whoami() const final { + return pg_whoami; + } + + const spg_t& get_pgid() const final { + return pgid; + } + + PGBackend& get_backend() { + return *backend; + } + const PGBackend& get_backend() const { + return *backend; + } + // EpochSource + epoch_t get_osdmap_epoch() const final { + return peering_state.get_osdmap_epoch(); + } + + eversion_t get_pg_trim_to() const { + return peering_state.get_pg_trim_to(); + } + + eversion_t get_min_last_complete_ondisk() const { + return peering_state.get_min_last_complete_ondisk(); + } + + const pg_info_t& get_info() const final { + return peering_state.get_info(); + } + + // DoutPrefixProvider + std::ostream& gen_prefix(std::ostream& out) const final { + return out << *this; + } + crimson::common::CephContext *get_cct() const final { + return shard_services.get_cct(); + } + unsigned get_subsys() const final { + return ceph_subsys_osd; + } + + crimson::os::CollectionRef get_collection_ref() { + return coll_ref; + } + + // PeeringListener + void prepare_write( + pg_info_t &info, + pg_info_t &last_written_info, + PastIntervals &past_intervals, + PGLog &pglog, + bool dirty_info, + bool dirty_big_info, + bool need_write_epoch, + ceph::os::Transaction &t) final; + + void on_info_history_change() final { + // Not needed yet -- mainly for scrub scheduling + } + + /// Notify PG that Primary/Replica status has changed (to update scrub registration) + void on_primary_status_change(bool was_primary, bool now_primary) final { + } + + /// Need to reschedule next scrub. 
Assuming no change in role + void reschedule_scrub() final { + } + + void scrub_requested(scrub_level_t scrub_level, scrub_type_t scrub_type) final; + + uint64_t get_snap_trimq_size() const final { + return std::size(snap_trimq); + } + + void send_cluster_message( + int osd, MessageURef m, + epoch_t epoch, bool share_map_update=false) final { + (void)shard_services.send_to_osd(osd, std::move(m), epoch); + } + + void send_pg_created(pg_t pgid) final { + (void)shard_services.send_pg_created(pgid); + } + + bool try_flush_or_schedule_async() final; + + void start_flush_on_transaction( + ceph::os::Transaction &t) final { + t.register_on_commit( + new LambdaContext([this](int r){ + peering_state.complete_flush(); + })); + } + + void on_flushed() final { + // will be needed for unblocking IO operations/peering + } + + template <typename T> + void start_peering_event_operation(T &&evt, float delay = 0) { + (void) shard_services.start_operation<LocalPeeringEvent>( + this, + pg_whoami, + pgid, + delay, + std::forward<T>(evt)); + } + + void schedule_event_after( + PGPeeringEventRef event, + float delay) final { + start_peering_event_operation(std::move(*event), delay); + } + std::vector<pg_shard_t> get_replica_recovery_order() const final { + return peering_state.get_replica_recovery_order(); + } + void request_local_background_io_reservation( + unsigned priority, + PGPeeringEventURef on_grant, + PGPeeringEventURef on_preempt) final { + // TODO -- we probably want to add a mechanism for blocking on this + // after handling the peering event + std::ignore = shard_services.local_request_reservation( + pgid, + on_grant ? make_lambda_context([this, on_grant=std::move(on_grant)] (int) { + start_peering_event_operation(std::move(*on_grant)); + }) : nullptr, + priority, + on_preempt ? make_lambda_context( + [this, on_preempt=std::move(on_preempt)] (int) { + start_peering_event_operation(std::move(*on_preempt)); + }) : nullptr); + } + + void update_local_background_io_priority( + unsigned priority) final { + // TODO -- we probably want to add a mechanism for blocking on this + // after handling the peering event + std::ignore = shard_services.local_update_priority( + pgid, + priority); + } + + void cancel_local_background_io_reservation() final { + // TODO -- we probably want to add a mechanism for blocking on this + // after handling the peering event + std::ignore = shard_services.local_cancel_reservation( + pgid); + } + + void request_remote_recovery_reservation( + unsigned priority, + PGPeeringEventURef on_grant, + PGPeeringEventURef on_preempt) final { + // TODO -- we probably want to add a mechanism for blocking on this + // after handling the peering event + std::ignore = shard_services.remote_request_reservation( + pgid, + on_grant ? make_lambda_context([this, on_grant=std::move(on_grant)] (int) { + start_peering_event_operation(std::move(*on_grant)); + }) : nullptr, + priority, + on_preempt ? 
make_lambda_context( + [this, on_preempt=std::move(on_preempt)] (int) { + start_peering_event_operation(std::move(*on_preempt)); + }) : nullptr); + } + + void cancel_remote_recovery_reservation() final { + // TODO -- we probably want to add a mechanism for blocking on this + // after handling the peering event + std::ignore = shard_services.remote_cancel_reservation( + pgid); + } + + void schedule_event_on_commit( + ceph::os::Transaction &t, + PGPeeringEventRef on_commit) final { + t.register_on_commit( + make_lambda_context( + [this, on_commit=std::move(on_commit)](int) { + start_peering_event_operation(std::move(*on_commit)); + })); + } + + void update_heartbeat_peers(std::set<int> peers) final { + // Not needed yet + } + void set_probe_targets(const std::set<pg_shard_t> &probe_set) final { + // Not needed yet + } + void clear_probe_targets() final { + // Not needed yet + } + void queue_want_pg_temp(const std::vector<int> &wanted) final { + // TODO -- we probably want to add a mechanism for blocking on this + // after handling the peering event + std::ignore = shard_services.queue_want_pg_temp(pgid.pgid, wanted); + } + void clear_want_pg_temp() final { + // TODO -- we probably want to add a mechanism for blocking on this + // after handling the peering event + std::ignore = shard_services.remove_want_pg_temp(pgid.pgid); + } + void check_recovery_sources(const OSDMapRef& newmap) final { + // Not needed yet + } + void check_blocklisted_watchers() final; + void clear_primary_state() final { + // Not needed yet + } + + void queue_check_readable(epoch_t last_peering_reset, + ceph::timespan delay) final; + void recheck_readable() final; + + unsigned get_target_pg_log_entries() const final; + + void on_pool_change() final { + // Not needed yet + } + void on_role_change() final { + // Not needed yet + } + void on_change(ceph::os::Transaction &t) final; + void on_activate(interval_set<snapid_t> to_trim) final; + void on_activate_complete() final; + void on_new_interval() final { + // Not needed yet + } + Context *on_clean() final; + void on_activate_committed() final { + // Not needed yet (will be needed for IO unblocking) + } + void on_active_exit() final { + // Not needed yet + } + + void on_removal(ceph::os::Transaction &t) final; + + std::pair<ghobject_t, bool> + do_delete_work(ceph::os::Transaction &t, ghobject_t _next) final; + + // merge/split not ready + void clear_ready_to_merge() final {} + void set_not_ready_to_merge_target(pg_t pgid, pg_t src) final {} + void set_not_ready_to_merge_source(pg_t pgid) final {} + void set_ready_to_merge_target(eversion_t lu, epoch_t les, epoch_t lec) final {} + void set_ready_to_merge_source(eversion_t lu) final {} + + void on_active_actmap() final; + void on_active_advmap(const OSDMapRef &osdmap) final; + + epoch_t cluster_osdmap_trim_lower_bound() final { + // TODO + return 0; + } + + void on_backfill_reserved() final { + recovery_handler->on_backfill_reserved(); + } + void on_backfill_canceled() final { + ceph_assert(0 == "Not implemented"); + } + + void on_recovery_reserved() final { + recovery_handler->start_pglogbased_recovery(); + } + + + bool try_reserve_recovery_space( + int64_t primary_num_bytes, int64_t local_num_bytes) final { + // TODO + return true; + } + void unreserve_recovery_space() final {} + + struct PGLogEntryHandler : public PGLog::LogEntryHandler { + PG *pg; + ceph::os::Transaction *t; + PGLogEntryHandler(PG *pg, ceph::os::Transaction *t) : pg(pg), t(t) {} + + // LogEntryHandler + void remove(const hobject_t &hoid) override { + // 
TODO + } + void try_stash(const hobject_t &hoid, version_t v) override { + // TODO + } + void rollback(const pg_log_entry_t &entry) override { + // TODO + } + void rollforward(const pg_log_entry_t &entry) override { + // TODO + } + void trim(const pg_log_entry_t &entry) override { + // TODO + } + }; + PGLog::LogEntryHandlerRef get_log_handler( + ceph::os::Transaction &t) final { + return std::make_unique<PG::PGLogEntryHandler>(this, &t); + } + + void rebuild_missing_set_with_deletes(PGLog &pglog) final { + pglog.rebuild_missing_set_with_deletes_crimson( + shard_services.get_store(), + coll_ref, + peering_state.get_info()).get(); + } + + PerfCounters &get_peering_perf() final { + return shard_services.get_recoverystate_perf_logger(); + } + PerfCounters &get_perf_logger() final { + return shard_services.get_perf_logger(); + } + + void log_state_enter(const char *state) final; + void log_state_exit( + const char *state_name, utime_t enter_time, + uint64_t events, utime_t event_dur) final; + + void dump_recovery_info(Formatter *f) const final { + } + + OstreamTemp get_clog_info() final { + // not needed yet: replace with not a stub (needs to be wired up to monc) + return OstreamTemp(CLOG_INFO, nullptr); + } + OstreamTemp get_clog_debug() final { + // not needed yet: replace with not a stub (needs to be wired up to monc) + return OstreamTemp(CLOG_DEBUG, nullptr); + } + OstreamTemp get_clog_error() final { + // not needed yet: replace with not a stub (needs to be wired up to monc) + return OstreamTemp(CLOG_ERROR, nullptr); + } + + ceph::signedspan get_mnow() const final; + HeartbeatStampsRef get_hb_stamps(int peer) final; + void schedule_renew_lease(epoch_t plr, ceph::timespan delay) final; + + + // Utility + bool is_primary() const final { + return peering_state.is_primary(); + } + bool is_nonprimary() const { + return peering_state.is_nonprimary(); + } + bool is_peered() const final { + return peering_state.is_peered(); + } + bool is_recovering() const final { + return peering_state.is_recovering(); + } + bool is_backfilling() const final { + return peering_state.is_backfilling(); + } + uint64_t get_last_user_version() const { + return get_info().last_user_version; + } + bool get_need_up_thru() const { + return peering_state.get_need_up_thru(); + } + epoch_t get_same_interval_since() const { + return get_info().history.same_interval_since; + } + + const auto& get_pgpool() const { + return peering_state.get_pgpool(); + } + pg_shard_t get_primary() const { + return peering_state.get_primary(); + } + + /// initialize created PG + void init( + int role, + const std::vector<int>& up, + int up_primary, + const std::vector<int>& acting, + int acting_primary, + const pg_history_t& history, + const PastIntervals& pim, + ceph::os::Transaction &t); + + seastar::future<> read_state(crimson::os::FuturizedStore::Shard* store); + + interruptible_future<> do_peering_event( + PGPeeringEvent& evt, PeeringCtx &rctx); + + seastar::future<> handle_advance_map(cached_map_t next_map, PeeringCtx &rctx); + seastar::future<> handle_activate_map(PeeringCtx &rctx); + seastar::future<> handle_initialize(PeeringCtx &rctx); + + static hobject_t get_oid(const hobject_t& hobj); + static RWState::State get_lock_type(const OpInfo &op_info); + + using load_obc_ertr = crimson::errorator< + crimson::ct_error::enoent, + crimson::ct_error::object_corrupted>; + using load_obc_iertr = + ::crimson::interruptible::interruptible_errorator< + ::crimson::osd::IOInterruptCondition, + load_obc_ertr>; + using interruptor = 
::crimson::interruptible::interruptor< + ::crimson::osd::IOInterruptCondition>; + +public: + using with_obc_func_t = + std::function<load_obc_iertr::future<> (ObjectContextRef)>; + + load_obc_iertr::future<> with_locked_obc( + const hobject_t &hobj, + const OpInfo &op_info, + with_obc_func_t&& f); + + interruptible_future<> handle_rep_op(Ref<MOSDRepOp> m); + void log_operation( + std::vector<pg_log_entry_t>&& logv, + const eversion_t &trim_to, + const eversion_t &roll_forward_to, + const eversion_t &min_last_complete_ondisk, + bool transaction_applied, + ObjectStore::Transaction &txn, + bool async = false); + void replica_clear_repop_obc( + const std::vector<pg_log_entry_t> &logv); + void handle_rep_op_reply(const MOSDRepOpReply& m); + interruptible_future<> do_update_log_missing( + Ref<MOSDPGUpdateLogMissing> m, + crimson::net::ConnectionRef conn); + interruptible_future<> do_update_log_missing_reply( + Ref<MOSDPGUpdateLogMissingReply> m); + + + void print(std::ostream& os) const; + void dump_primary(Formatter*); + seastar::future<> submit_error_log( + Ref<MOSDOp> m, + const OpInfo &op_info, + ObjectContextRef obc, + const std::error_code e, + ceph_tid_t rep_tid, + eversion_t &version); + +private: + + struct SnapTrimMutex { + struct WaitPG : OrderedConcurrentPhaseT<WaitPG> { + static constexpr auto type_name = "SnapTrimEvent::wait_pg"; + } wait_pg; + seastar::shared_mutex mutex; + + interruptible_future<> lock(SnapTrimEvent &st_event) noexcept; + + void unlock() noexcept { + mutex.unlock(); + } + } snaptrim_mutex; + + using do_osd_ops_ertr = crimson::errorator< + crimson::ct_error::eagain>; + using do_osd_ops_iertr = + ::crimson::interruptible::interruptible_errorator< + ::crimson::osd::IOInterruptCondition, + ::crimson::errorator<crimson::ct_error::eagain>>; + template <typename Ret = void> + using pg_rep_op_fut_t = + std::tuple<interruptible_future<>, + do_osd_ops_iertr::future<Ret>>; + do_osd_ops_iertr::future<pg_rep_op_fut_t<MURef<MOSDOpReply>>> do_osd_ops( + Ref<MOSDOp> m, + crimson::net::ConnectionRef conn, + ObjectContextRef obc, + const OpInfo &op_info, + const SnapContext& snapc); + using do_osd_ops_success_func_t = + std::function<do_osd_ops_iertr::future<>()>; + using do_osd_ops_failure_func_t = + std::function<do_osd_ops_iertr::future<>(const std::error_code&)>; + struct do_osd_ops_params_t; + do_osd_ops_iertr::future<pg_rep_op_fut_t<>> do_osd_ops( + ObjectContextRef obc, + std::vector<OSDOp>& ops, + const OpInfo &op_info, + const do_osd_ops_params_t &¶ms, + do_osd_ops_success_func_t success_func, + do_osd_ops_failure_func_t failure_func); + template <class Ret, class SuccessFunc, class FailureFunc> + do_osd_ops_iertr::future<pg_rep_op_fut_t<Ret>> do_osd_ops_execute( + seastar::lw_shared_ptr<OpsExecuter> ox, + std::vector<OSDOp>& ops, + SuccessFunc&& success_func, + FailureFunc&& failure_func); + interruptible_future<MURef<MOSDOpReply>> do_pg_ops(Ref<MOSDOp> m); + std::tuple<interruptible_future<>, interruptible_future<>> + submit_transaction( + ObjectContextRef&& obc, + ceph::os::Transaction&& txn, + osd_op_params_t&& oop, + std::vector<pg_log_entry_t>&& log_entries); + interruptible_future<> repair_object( + const hobject_t& oid, + eversion_t& v); + void check_blocklisted_obc_watchers(ObjectContextRef &obc); + +private: + PG_OSDMapGate osdmap_gate; + ShardServices &shard_services; + + +public: + cached_map_t get_osdmap() { return peering_state.get_osdmap(); } + eversion_t next_version() { + return eversion_t(get_osdmap_epoch(), + ++projected_last_update.version); + } + 
ShardServices& get_shard_services() final { + return shard_services; + } + seastar::future<> stop(); +private: + std::unique_ptr<PGBackend> backend; + std::unique_ptr<RecoveryBackend> recovery_backend; + std::unique_ptr<PGRecovery> recovery_handler; + + PeeringState peering_state; + eversion_t projected_last_update; + +public: + ObjectContextRegistry obc_registry; + ObjectContextLoader obc_loader; + +private: + OSDriver osdriver; + SnapMapper snap_mapper; + +public: + // PeeringListener + void publish_stats_to_osd() final; + void clear_publish_stats() final; + pg_stat_t get_stats() const; +private: + std::optional<pg_stat_t> pg_stats; + +public: + RecoveryBackend* get_recovery_backend() final { + return recovery_backend.get(); + } + PGRecovery* get_recovery_handler() final { + return recovery_handler.get(); + } + PeeringState& get_peering_state() final { + return peering_state; + } + bool has_reset_since(epoch_t epoch) const final { + return peering_state.pg_has_reset_since(epoch); + } + + const pg_missing_tracker_t& get_local_missing() const { + return peering_state.get_pg_log().get_missing(); + } + epoch_t get_last_peering_reset() const final { + return peering_state.get_last_peering_reset(); + } + const std::set<pg_shard_t> &get_acting_recovery_backfill() const { + return peering_state.get_acting_recovery_backfill(); + } + bool is_backfill_target(pg_shard_t osd) const { + return peering_state.is_backfill_target(osd); + } + void begin_peer_recover(pg_shard_t peer, const hobject_t oid) { + peering_state.begin_peer_recover(peer, oid); + } + uint64_t min_peer_features() const { + return peering_state.get_min_peer_features(); + } + const std::map<hobject_t, std::set<pg_shard_t>>& + get_missing_loc_shards() const { + return peering_state.get_missing_loc().get_missing_locs(); + } + const std::map<pg_shard_t, pg_missing_t> &get_shard_missing() const { + return peering_state.get_peer_missing(); + } + epoch_t get_interval_start_epoch() const { + return get_info().history.same_interval_since; + } + const pg_missing_const_i* get_shard_missing(pg_shard_t shard) const { + if (shard == pg_whoami) + return &get_local_missing(); + else { + auto it = peering_state.get_peer_missing().find(shard); + if (it == peering_state.get_peer_missing().end()) + return nullptr; + else + return &it->second; + } + } + + struct complete_op_t { + const version_t user_version; + const eversion_t version; + const int err; + }; + interruptible_future<std::optional<complete_op_t>> + already_complete(const osd_reqid_t& reqid); + int get_recovery_op_priority() const { + int64_t pri = 0; + get_pgpool().info.opts.get(pool_opts_t::RECOVERY_OP_PRIORITY, &pri); + return pri > 0 ? pri : crimson::common::local_conf()->osd_recovery_op_priority; + } + seastar::future<> mark_unfound_lost(int) { + // TODO: see PrimaryLogPG::mark_all_unfound_lost() + return seastar::now(); + } + + bool old_peering_msg(epoch_t reply_epoch, epoch_t query_epoch) const; + + template <typename MsgType> + bool can_discard_replica_op(const MsgType& m) const { + return can_discard_replica_op(m, m.map_epoch); + } + +private: + // instead of seastar::gate, we use a boolean flag to indicate + // whether the system is shutting down, as we don't need to track + // continuations here. 
+ bool stopping = false; + + PGActivationBlocker wait_for_active_blocker; + + friend std::ostream& operator<<(std::ostream&, const PG& pg); + friend class ClientRequest; + friend struct CommonClientRequest; + friend class PGAdvanceMap; + template <class T> + friend class PeeringEvent; + friend class RepRequest; + friend class LogMissingRequest; + friend class LogMissingRequestReply; + friend class BackfillRecovery; + friend struct PGFacade; + friend class InternalClientRequest; + friend class WatchTimeoutRequest; + friend class SnapTrimEvent; + friend class SnapTrimObjSubEvent; +private: + seastar::future<bool> find_unfound() { + return seastar::make_ready_future<bool>(true); + } + + bool can_discard_replica_op(const Message& m, epoch_t m_map_epoch) const; + bool can_discard_op(const MOSDOp& m) const; + void context_registry_on_change(); + bool is_missing_object(const hobject_t& soid) const { + return peering_state.get_pg_log().get_missing().get_items().count(soid); + } + bool is_unreadable_object(const hobject_t &oid, + eversion_t* v = 0) const final { + return is_missing_object(oid) || + !peering_state.get_missing_loc().readable_with_acting( + oid, get_actingset(), v); + } + bool is_degraded_or_backfilling_object(const hobject_t& soid) const; + const std::set<pg_shard_t> &get_actingset() const { + return peering_state.get_actingset(); + } + +private: + friend class IOInterruptCondition; + struct log_update_t { + std::set<pg_shard_t> waiting_on; + seastar::shared_promise<> all_committed; + }; + + std::map<ceph_tid_t, log_update_t> log_entry_update_waiting_on; + // snap trimming + interval_set<snapid_t> snap_trimq; +}; + +struct PG::do_osd_ops_params_t { + crimson::net::ConnectionRef &get_connection() const { + return conn; + } + osd_reqid_t get_reqid() const { + return reqid; + } + utime_t get_mtime() const { + return mtime; + }; + epoch_t get_map_epoch() const { + return map_epoch; + } + entity_inst_t get_orig_source_inst() const { + return orig_source_inst; + } + uint64_t get_features() const { + return features; + } + // Only used by InternalClientRequest, no op flags + bool has_flag(uint32_t flag) const { + return false; + } + + // Only used by ExecutableMessagePimpl + entity_name_t get_source() const { + return orig_source_inst.name; + } + + crimson::net::ConnectionRef &conn; + osd_reqid_t reqid; + utime_t mtime; + epoch_t map_epoch; + entity_inst_t orig_source_inst; + uint64_t features; +}; + +std::ostream& operator<<(std::ostream&, const PG& pg); + +} + +#if FMT_VERSION >= 90000 +template <> struct fmt::formatter<crimson::osd::PG> : fmt::ostream_formatter {}; +#endif diff --git a/src/crimson/osd/pg_activation_blocker.cc b/src/crimson/osd/pg_activation_blocker.cc new file mode 100644 index 000000000..48ffe3f84 --- /dev/null +++ b/src/crimson/osd/pg_activation_blocker.cc @@ -0,0 +1,36 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab expandtab + +#include "crimson/osd/pg.h" +#include "crimson/osd/pg_activation_blocker.h" + +namespace crimson::osd { + +void PGActivationBlocker::dump_detail(Formatter *f) const +{ + f->dump_stream("pgid") << pg->get_pgid(); +} + +void PGActivationBlocker::unblock() +{ + p.set_value(); + p = {}; +} + +seastar::future<> +PGActivationBlocker::wait(PGActivationBlocker::BlockingEvent::TriggerI&& trigger) +{ + if (pg->get_peering_state().is_active()) { + return seastar::now(); + } else { + return trigger.maybe_record_blocking(p.get_shared_future(), *this); + } +} + +seastar::future<> 
PGActivationBlocker::stop() +{ + p.set_exception(crimson::common::system_shutdown_exception()); + return seastar::now(); +} + +} // namespace crimson::osd diff --git a/src/crimson/osd/pg_activation_blocker.h b/src/crimson/osd/pg_activation_blocker.h new file mode 100644 index 000000000..fff8219d1 --- /dev/null +++ b/src/crimson/osd/pg_activation_blocker.h @@ -0,0 +1,35 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab expandtab + +#pragma once + +#include <seastar/core/future.hh> +#include <seastar/core/shared_future.hh> + +#include "crimson/common/operation.h" +#include "crimson/osd/osd_operation.h" + +namespace crimson::osd { + +class PG; + +class PGActivationBlocker : public crimson::BlockerT<PGActivationBlocker> { + PG *pg; + + const spg_t pgid; + seastar::shared_promise<> p; + +protected: + void dump_detail(Formatter *f) const; + +public: + static constexpr const char *type_name = "PGActivationBlocker"; + using Blocker = PGActivationBlocker; + + PGActivationBlocker(PG *pg) : pg(pg) {} + void unblock(); + seastar::future<> wait(PGActivationBlocker::BlockingEvent::TriggerI&&); + seastar::future<> stop(); +}; + +} // namespace crimson::osd diff --git a/src/crimson/osd/pg_backend.cc b/src/crimson/osd/pg_backend.cc new file mode 100644 index 000000000..02acb9a55 --- /dev/null +++ b/src/crimson/osd/pg_backend.cc @@ -0,0 +1,1811 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "pg_backend.h" + +#include <charconv> +#include <optional> +#include <boost/range/adaptor/filtered.hpp> +#include <boost/range/adaptor/transformed.hpp> +#include <boost/range/algorithm/copy.hpp> +#include <fmt/format.h> +#include <fmt/ostream.h> +#include <seastar/core/print.hh> + +#include "messages/MOSDOp.h" +#include "os/Transaction.h" +#include "common/Checksummer.h" +#include "common/Clock.h" + +#include "crimson/common/exception.h" +#include "crimson/common/tmap_helpers.h" +#include "crimson/os/futurized_collection.h" +#include "crimson/os/futurized_store.h" +#include "crimson/osd/osd_operation.h" +#include "crimson/osd/object_context_loader.h" +#include "replicated_backend.h" +#include "replicated_recovery_backend.h" +#include "ec_backend.h" +#include "exceptions.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_osd); + } +} + +using std::runtime_error; +using std::string; +using std::string_view; +using crimson::common::local_conf; + +std::unique_ptr<PGBackend> +PGBackend::create(pg_t pgid, + const pg_shard_t pg_shard, + const pg_pool_t& pool, + crimson::os::CollectionRef coll, + crimson::osd::ShardServices& shard_services, + const ec_profile_t& ec_profile, + DoutPrefixProvider &dpp) +{ + switch (pool.type) { + case pg_pool_t::TYPE_REPLICATED: + return std::make_unique<ReplicatedBackend>(pgid, pg_shard, + coll, shard_services, + dpp); + case pg_pool_t::TYPE_ERASURE: + return std::make_unique<ECBackend>(pg_shard.shard, coll, shard_services, + std::move(ec_profile), + pool.stripe_width, + dpp); + default: + throw runtime_error(seastar::format("unsupported pool type '{}'", + pool.type)); + } +} + +PGBackend::PGBackend(shard_id_t shard, + CollectionRef coll, + crimson::osd::ShardServices &shard_services, + DoutPrefixProvider &dpp) + : shard{shard}, + coll{coll}, + shard_services{shard_services}, + dpp{dpp}, + store{&shard_services.get_store()} +{} + +PGBackend::load_metadata_iertr::future + <PGBackend::loaded_object_md_t::ref> +PGBackend::load_metadata(const 
hobject_t& oid) +{ + return interruptor::make_interruptible(store->get_attrs( + coll, + ghobject_t{oid, ghobject_t::NO_GEN, shard})).safe_then_interruptible( + [oid](auto &&attrs) -> load_metadata_ertr::future<loaded_object_md_t::ref>{ + loaded_object_md_t::ref ret(new loaded_object_md_t()); + if (auto oiiter = attrs.find(OI_ATTR); oiiter != attrs.end()) { + bufferlist bl = std::move(oiiter->second); + try { + ret->os = ObjectState( + object_info_t(bl, oid), + true); + } catch (const buffer::error&) { + logger().warn("unable to decode ObjectState"); + throw crimson::osd::invalid_argument(); + } + } else { + logger().error( + "load_metadata: object {} present but missing object info", + oid); + return crimson::ct_error::object_corrupted::make(); + } + + if (oid.is_head()) { + // Return object_corrupted when the object exists and the + // SnapSet is either not found or empty. + bool object_corrupted = true; + if (auto ssiter = attrs.find(SS_ATTR); ssiter != attrs.end()) { + object_corrupted = false; + bufferlist bl = std::move(ssiter->second); + if (bl.length()) { + ret->ssc = new crimson::osd::SnapSetContext(oid.get_snapdir()); + try { + ret->ssc->snapset = SnapSet(bl); + ret->ssc->exists = true; + logger().debug( + "load_metadata: object {} and snapset {} present", + oid, ret->ssc->snapset); + } catch (const buffer::error&) { + logger().warn("unable to decode SnapSet"); + throw crimson::osd::invalid_argument(); + } + } else { + object_corrupted = true; + } + } + if (object_corrupted) { + logger().error( + "load_metadata: object {} present but missing snapset", + oid); + return crimson::ct_error::object_corrupted::make(); + } + } + + return load_metadata_ertr::make_ready_future<loaded_object_md_t::ref>( + std::move(ret)); + }, crimson::ct_error::enoent::handle([oid] { + logger().debug( + "load_metadata: object {} doesn't exist, returning empty metadata", + oid); + return load_metadata_ertr::make_ready_future<loaded_object_md_t::ref>( + new loaded_object_md_t{ + ObjectState( + object_info_t(oid), + false), + oid.is_head() ? 
(new crimson::osd::SnapSetContext(oid)) : nullptr + }); + })); +} + +PGBackend::rep_op_fut_t +PGBackend::mutate_object( + std::set<pg_shard_t> pg_shards, + crimson::osd::ObjectContextRef &&obc, + ceph::os::Transaction&& txn, + osd_op_params_t&& osd_op_p, + epoch_t min_epoch, + epoch_t map_epoch, + std::vector<pg_log_entry_t>&& log_entries) +{ + logger().trace("mutate_object: num_ops={}", txn.get_num_ops()); + if (obc->obs.exists) { +#if 0 + obc->obs.oi.version = ctx->at_version; + obc->obs.oi.prior_version = ctx->obs->oi.version; +#endif + + obc->obs.oi.prior_version = obc->obs.oi.version; + obc->obs.oi.version = osd_op_p.at_version; + if (osd_op_p.user_at_version > obc->obs.oi.user_version) + obc->obs.oi.user_version = osd_op_p.user_at_version; + obc->obs.oi.last_reqid = osd_op_p.req_id; + obc->obs.oi.mtime = osd_op_p.mtime; + obc->obs.oi.local_mtime = ceph_clock_now(); + + // object_info_t + { + ceph::bufferlist osv; + obc->obs.oi.encode_no_oid(osv, CEPH_FEATURES_ALL); + // TODO: get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr)); + txn.setattr(coll->get_cid(), ghobject_t{obc->obs.oi.soid}, OI_ATTR, osv); + } + + // snapset + if (obc->obs.oi.soid.snap == CEPH_NOSNAP) { + logger().debug("final snapset {} in {}", + obc->ssc->snapset, obc->obs.oi.soid); + ceph::bufferlist bss; + encode(obc->ssc->snapset, bss); + txn.setattr(coll->get_cid(), ghobject_t{obc->obs.oi.soid}, SS_ATTR, bss); + obc->ssc->exists = true; + } else { + logger().debug("no snapset (this is a clone)"); + } + } else { + // reset cached ObjectState without enforcing eviction + obc->obs.oi = object_info_t(obc->obs.oi.soid); + } + return _submit_transaction( + std::move(pg_shards), obc->obs.oi.soid, std::move(txn), + std::move(osd_op_p), min_epoch, map_epoch, std::move(log_entries)); +} + +static inline bool _read_verify_data( + const object_info_t& oi, + const ceph::bufferlist& data) +{ + if (oi.is_data_digest() && oi.size == data.length()) { + // whole object? can we verify the checksum? + if (auto crc = data.crc32c(-1); crc != oi.data_digest) { + logger().error("full-object read crc {} != expected {} on {}", + crc, oi.data_digest, oi.soid); + // todo: mark soid missing, perform recovery, and retry + return false; + } + } + return true; +} + +PGBackend::read_ierrorator::future<> +PGBackend::read(const ObjectState& os, OSDOp& osd_op, + object_stat_sum_t& delta_stats) +{ + const auto& oi = os.oi; + const ceph_osd_op& op = osd_op.op; + const uint64_t offset = op.extent.offset; + uint64_t length = op.extent.length; + logger().trace("read: {} {}~{}", oi.soid, offset, length); + + if (!os.exists || os.oi.is_whiteout()) { + logger().debug("{}: {} DNE", __func__, os.oi.soid); + return crimson::ct_error::enoent::make(); + } + // are we beyond truncate_size? 
+ size_t size = oi.size; + if ((op.extent.truncate_seq > oi.truncate_seq) && + (op.extent.truncate_size < offset + length) && + (op.extent.truncate_size < size)) { + size = op.extent.truncate_size; + } + if (offset >= size) { + // read size was trimmed to zero and it is expected to do nothing, + return read_errorator::now(); + } + if (!length) { + // read the whole object if length is 0 + length = size; + } + return _read(oi.soid, offset, length, op.flags).safe_then_interruptible_tuple( + [&delta_stats, &oi, &osd_op](auto&& bl) -> read_errorator::future<> { + if (!_read_verify_data(oi, bl)) { + // crc mismatches + return crimson::ct_error::object_corrupted::make(); + } + logger().debug("read: data length: {}", bl.length()); + osd_op.op.extent.length = bl.length(); + osd_op.rval = 0; + delta_stats.num_rd++; + delta_stats.num_rd_kb += shift_round_up(bl.length(), 10); + osd_op.outdata = std::move(bl); + return read_errorator::now(); + }, crimson::ct_error::input_output_error::handle([] { + return read_errorator::future<>{crimson::ct_error::object_corrupted::make()}; + }), + read_errorator::pass_further{}); +} + +PGBackend::read_ierrorator::future<> +PGBackend::sparse_read(const ObjectState& os, OSDOp& osd_op, + object_stat_sum_t& delta_stats) +{ + if (!os.exists || os.oi.is_whiteout()) { + logger().debug("{}: {} DNE", __func__, os.oi.soid); + return crimson::ct_error::enoent::make(); + } + + const auto& op = osd_op.op; + /* clients (particularly cephfs) may send truncate operations out of order + * w.r.t. reads. op.extent.truncate_seq and op.extent.truncate_size allow + * the OSD to determine whether the client submitted read needs to be + * adjusted to compensate for a truncate the OSD hasn't seen yet. + */ + uint64_t adjusted_size = os.oi.size; + const uint64_t offset = op.extent.offset; + uint64_t adjusted_length = op.extent.length; + if ((os.oi.truncate_seq < op.extent.truncate_seq) && + (op.extent.offset + op.extent.length > op.extent.truncate_size) && + (adjusted_size > op.extent.truncate_size)) { + adjusted_size = op.extent.truncate_size; + } + if (offset > adjusted_size) { + adjusted_length = 0; + } else if (offset + adjusted_length > adjusted_size) { + adjusted_length = adjusted_size - offset; + } + logger().trace("sparse_read: {} {}~{}", + os.oi.soid, op.extent.offset, op.extent.length); + return interruptor::make_interruptible(store->fiemap(coll, ghobject_t{os.oi.soid}, + offset, adjusted_length)).safe_then_interruptible( + [&delta_stats, &os, &osd_op, this](auto&& m) { + return seastar::do_with(interval_set<uint64_t>{std::move(m)}, + [&delta_stats, &os, &osd_op, this](auto&& extents) { + return interruptor::make_interruptible(store->readv(coll, ghobject_t{os.oi.soid}, + extents, osd_op.op.flags)).safe_then_interruptible_tuple( + [&delta_stats, &os, &osd_op, &extents](auto&& bl) -> read_errorator::future<> { + if (_read_verify_data(os.oi, bl)) { + osd_op.op.extent.length = bl.length(); + // re-encode since it might be modified + ceph::encode(extents, osd_op.outdata); + encode_destructively(bl, osd_op.outdata); + logger().trace("sparse_read got {} bytes from object {}", + osd_op.op.extent.length, os.oi.soid); + delta_stats.num_rd++; + delta_stats.num_rd_kb += shift_round_up(osd_op.op.extent.length, 10); + return read_errorator::make_ready_future<>(); + } else { + // crc mismatches + return crimson::ct_error::object_corrupted::make(); + } + }, crimson::ct_error::input_output_error::handle([] { + return read_errorator::future<>{crimson::ct_error::object_corrupted::make()}; + }), + 
read_errorator::pass_further{}); + }); + }); +} + +namespace { + + template<class CSum> + PGBackend::checksum_errorator::future<> + do_checksum(ceph::bufferlist& init_value_bl, + size_t chunk_size, + const ceph::bufferlist& buf, + ceph::bufferlist& result) + { + typename CSum::init_value_t init_value; + auto init_value_p = init_value_bl.cbegin(); + try { + decode(init_value, init_value_p); + // chop off the consumed part + init_value_bl.splice(0, init_value_p.get_off()); + } catch (const ceph::buffer::end_of_buffer&) { + logger().warn("{}: init value not provided", __func__); + return crimson::ct_error::invarg::make(); + } + const uint32_t chunk_count = buf.length() / chunk_size; + ceph::bufferptr csum_data{ + ceph::buffer::create(sizeof(typename CSum::value_t) * chunk_count)}; + Checksummer::calculate<CSum>( + init_value, chunk_size, 0, buf.length(), buf, &csum_data); + encode(chunk_count, result); + result.append(std::move(csum_data)); + return PGBackend::checksum_errorator::now(); + } +} + +PGBackend::checksum_ierrorator::future<> +PGBackend::checksum(const ObjectState& os, OSDOp& osd_op) +{ + // sanity tests and normalize the argments + auto& checksum = osd_op.op.checksum; + if (checksum.offset == 0 && checksum.length == 0) { + // zeroed offset+length implies checksum whole object + checksum.length = os.oi.size; + } else if (checksum.offset >= os.oi.size) { + // read size was trimmed to zero, do nothing, + // see PGBackend::read() + return checksum_errorator::now(); + } + if (checksum.chunk_size > 0) { + if (checksum.length == 0) { + logger().warn("{}: length required when chunk size provided", __func__); + return crimson::ct_error::invarg::make(); + } + if (checksum.length % checksum.chunk_size != 0) { + logger().warn("{}: length not aligned to chunk size", __func__); + return crimson::ct_error::invarg::make(); + } + } else { + checksum.chunk_size = checksum.length; + } + if (checksum.length == 0) { + uint32_t count = 0; + encode(count, osd_op.outdata); + return checksum_errorator::now(); + } + + // read the chunk to be checksum'ed + return _read(os.oi.soid, checksum.offset, checksum.length, osd_op.op.flags) + .safe_then_interruptible( + [&osd_op](auto&& read_bl) mutable -> checksum_errorator::future<> { + auto& checksum = osd_op.op.checksum; + if (read_bl.length() != checksum.length) { + logger().warn("checksum: bytes read {} != {}", + read_bl.length(), checksum.length); + return crimson::ct_error::invarg::make(); + } + // calculate its checksum and put the result in outdata + switch (checksum.type) { + case CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH32: + return do_checksum<Checksummer::xxhash32>(osd_op.indata, + checksum.chunk_size, + read_bl, + osd_op.outdata); + case CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH64: + return do_checksum<Checksummer::xxhash64>(osd_op.indata, + checksum.chunk_size, + read_bl, + osd_op.outdata); + case CEPH_OSD_CHECKSUM_OP_TYPE_CRC32C: + return do_checksum<Checksummer::crc32c>(osd_op.indata, + checksum.chunk_size, + read_bl, + osd_op.outdata); + default: + logger().warn("checksum: unknown crc type ({})", + static_cast<uint32_t>(checksum.type)); + return crimson::ct_error::invarg::make(); + } + }); +} + +PGBackend::cmp_ext_ierrorator::future<> +PGBackend::cmp_ext(const ObjectState& os, OSDOp& osd_op) +{ + const ceph_osd_op& op = osd_op.op; + uint64_t obj_size = os.oi.size; + if (os.oi.truncate_seq < op.extent.truncate_seq && + op.extent.offset + op.extent.length > op.extent.truncate_size) { + obj_size = op.extent.truncate_size; + } + uint64_t ext_len; + if (op.extent.offset 
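/* The comparison extent is first clipped to the object size (itself adjusted for a
 * client-side truncate the OSD has not applied yet); any client bytes beyond the
 * on-disk data compare against zeroes in the loop further down. */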
>= obj_size) { + ext_len = 0; + } else if (op.extent.offset + op.extent.length > obj_size) { + ext_len = obj_size - op.extent.offset; + } else { + ext_len = op.extent.length; + } + auto read_ext = ll_read_ierrorator::make_ready_future<ceph::bufferlist>(); + if (ext_len == 0) { + logger().debug("{}: zero length extent", __func__); + } else if (!os.exists || os.oi.is_whiteout()) { + logger().debug("{}: {} DNE", __func__, os.oi.soid); + } else { + read_ext = _read(os.oi.soid, op.extent.offset, ext_len, 0); + } + return read_ext.safe_then_interruptible([&osd_op](auto&& read_bl) + -> cmp_ext_errorator::future<> { + for (unsigned index = 0; index < osd_op.indata.length(); index++) { + char byte_in_op = osd_op.indata[index]; + char byte_from_disk = (index < read_bl.length() ? read_bl[index] : 0); + if (byte_in_op != byte_from_disk) { + logger().debug("cmp_ext: mismatch at {}", index); + // Unlike other ops, we set osd_op.rval here and return a different + // error code via ct_error::cmp_fail. + osd_op.rval = -MAX_ERRNO - index; + return crimson::ct_error::cmp_fail::make(); + } + } + osd_op.rval = 0; + return cmp_ext_errorator::make_ready_future<>(); + }); +} + +PGBackend::stat_ierrorator::future<> +PGBackend::stat( + const ObjectState& os, + OSDOp& osd_op, + object_stat_sum_t& delta_stats) +{ + if (os.exists/* TODO: && !os.is_whiteout() */) { + logger().debug("stat os.oi.size={}, os.oi.mtime={}", os.oi.size, os.oi.mtime); + encode(os.oi.size, osd_op.outdata); + encode(os.oi.mtime, osd_op.outdata); + } else { + logger().debug("stat object does not exist"); + return crimson::ct_error::enoent::make(); + } + delta_stats.num_rd++; + return stat_errorator::now(); +} + +PGBackend::write_iertr::future<> PGBackend::_writefull( + ObjectState& os, + off_t truncate_size, + const bufferlist& bl, + ceph::os::Transaction& txn, + osd_op_params_t& osd_op_params, + object_stat_sum_t& delta_stats, + unsigned flags) +{ + const bool existing = maybe_create_new_object(os, txn, delta_stats); + if (existing && bl.length() < os.oi.size) { + + txn.truncate(coll->get_cid(), ghobject_t{os.oi.soid}, bl.length()); + truncate_update_size_and_usage(delta_stats, os.oi, truncate_size); + + osd_op_params.clean_regions.mark_data_region_dirty( + bl.length(), + os.oi.size - bl.length()); + } + if (bl.length()) { + txn.write( + coll->get_cid(), ghobject_t{os.oi.soid}, 0, bl.length(), + bl, flags); + update_size_and_usage( + delta_stats, os.oi, 0, + bl.length(), true); + osd_op_params.clean_regions.mark_data_region_dirty( + 0, + std::max((uint64_t)bl.length(), os.oi.size)); + } + return seastar::now(); +} + +PGBackend::write_iertr::future<> PGBackend::_truncate( + ObjectState& os, + ceph::os::Transaction& txn, + osd_op_params_t& osd_op_params, + object_stat_sum_t& delta_stats, + size_t offset, + size_t truncate_size, + uint32_t truncate_seq) +{ + if (truncate_seq) { + assert(offset == truncate_size); + if (truncate_seq <= os.oi.truncate_seq) { + logger().debug("{} truncate seq {} <= current {}, no-op", + __func__, truncate_seq, os.oi.truncate_seq); + return write_ertr::make_ready_future<>(); + } else { + logger().debug("{} truncate seq {} > current {}, truncating", + __func__, truncate_seq, os.oi.truncate_seq); + os.oi.truncate_seq = truncate_seq; + os.oi.truncate_size = truncate_size; + } + } + maybe_create_new_object(os, txn, delta_stats); + if (os.oi.size != offset) { + txn.truncate( + coll->get_cid(), + ghobject_t{os.oi.soid}, offset); + if (os.oi.size > offset) { + // TODO: modified_ranges.union_of(trim); + 
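// Mark the affected byte range dirty so recovery knows which part of the object
// changed: shrinking dirties [offset, old_size), growing dirties [old_size, offset).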
osd_op_params.clean_regions.mark_data_region_dirty( + offset, + os.oi.size - offset); + } else { + // os.oi.size < offset + osd_op_params.clean_regions.mark_data_region_dirty( + os.oi.size, + offset - os.oi.size); + } + truncate_update_size_and_usage(delta_stats, os.oi, offset); + os.oi.clear_data_digest(); + } + delta_stats.num_wr++; + return write_ertr::now(); +} + +bool PGBackend::maybe_create_new_object( + ObjectState& os, + ceph::os::Transaction& txn, + object_stat_sum_t& delta_stats) +{ + if (!os.exists) { + ceph_assert(!os.oi.is_whiteout()); + os.exists = true; + os.oi.new_object(); + + txn.touch(coll->get_cid(), ghobject_t{os.oi.soid}); + delta_stats.num_objects++; + return false; + } else if (os.oi.is_whiteout()) { + os.oi.clear_flag(object_info_t::FLAG_WHITEOUT); + delta_stats.num_whiteouts--; + } + return true; +} + +void PGBackend::update_size_and_usage(object_stat_sum_t& delta_stats, + object_info_t& oi, uint64_t offset, + uint64_t length, bool write_full) +{ + if (write_full || + (offset + length > oi.size && length)) { + uint64_t new_size = offset + length; + delta_stats.num_bytes -= oi.size; + delta_stats.num_bytes += new_size; + oi.size = new_size; + } + delta_stats.num_wr++; + delta_stats.num_wr_kb += shift_round_up(length, 10); +} + +void PGBackend::truncate_update_size_and_usage(object_stat_sum_t& delta_stats, + object_info_t& oi, + uint64_t truncate_size) +{ + if (oi.size != truncate_size) { + delta_stats.num_bytes -= oi.size; + delta_stats.num_bytes += truncate_size; + oi.size = truncate_size; + } +} + +static bool is_offset_and_length_valid( + const std::uint64_t offset, + const std::uint64_t length) +{ + if (const std::uint64_t max = local_conf()->osd_max_object_size; + offset >= max || length > max || offset + length > max) { + logger().debug("{} osd_max_object_size: {}, offset: {}, len: {}; " + "Hard limit of object size is 4GB", + __func__, max, offset, length); + return false; + } else { + return true; + } +} + +PGBackend::interruptible_future<> PGBackend::set_allochint( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& txn, + object_stat_sum_t& delta_stats) +{ + maybe_create_new_object(os, txn, delta_stats); + + os.oi.expected_object_size = osd_op.op.alloc_hint.expected_object_size; + os.oi.expected_write_size = osd_op.op.alloc_hint.expected_write_size; + os.oi.alloc_hint_flags = osd_op.op.alloc_hint.flags; + txn.set_alloc_hint(coll->get_cid(), + ghobject_t{os.oi.soid}, + os.oi.expected_object_size, + os.oi.expected_write_size, + os.oi.alloc_hint_flags); + return seastar::now(); +} + +PGBackend::write_iertr::future<> PGBackend::write( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& txn, + osd_op_params_t& osd_op_params, + object_stat_sum_t& delta_stats) +{ + const ceph_osd_op& op = osd_op.op; + uint64_t offset = op.extent.offset; + uint64_t length = op.extent.length; + bufferlist buf = osd_op.indata; + if (op.extent.length != osd_op.indata.length()) { + return crimson::ct_error::invarg::make(); + } + + if (!is_offset_and_length_valid(op.extent.offset, op.extent.length)) { + return crimson::ct_error::file_too_large::make(); + } + + if (auto seq = os.oi.truncate_seq; + seq != 0 && op.extent.truncate_seq < seq) { + // old write, arrived after trimtrunc + if (offset + length > os.oi.size) { + // no-op + if (offset > os.oi.size) { + length = 0; + buf.clear(); + } else { + // truncate + auto len = os.oi.size - offset; + buf.splice(len, length); + length = len; + } + } + } else if (op.extent.truncate_seq > seq) { + // write arrives 
before trimtrunc + if (os.exists && !os.oi.is_whiteout()) { + txn.truncate(coll->get_cid(), + ghobject_t{os.oi.soid}, op.extent.truncate_size); + if (op.extent.truncate_size != os.oi.size) { + os.oi.size = length; + if (op.extent.truncate_size > os.oi.size) { + osd_op_params.clean_regions.mark_data_region_dirty(os.oi.size, + op.extent.truncate_size - os.oi.size); + } else { + osd_op_params.clean_regions.mark_data_region_dirty(op.extent.truncate_size, + os.oi.size - op.extent.truncate_size); + } + } + truncate_update_size_and_usage(delta_stats, os.oi, op.extent.truncate_size); + } + os.oi.truncate_seq = op.extent.truncate_seq; + os.oi.truncate_size = op.extent.truncate_size; + } + maybe_create_new_object(os, txn, delta_stats); + if (length == 0) { + if (offset > os.oi.size) { + txn.truncate(coll->get_cid(), ghobject_t{os.oi.soid}, op.extent.offset); + truncate_update_size_and_usage(delta_stats, os.oi, op.extent.offset); + } else { + txn.nop(); + } + } else { + txn.write(coll->get_cid(), ghobject_t{os.oi.soid}, + offset, length, std::move(buf), op.flags); + update_size_and_usage(delta_stats, os.oi, offset, length); + } + osd_op_params.clean_regions.mark_data_region_dirty(op.extent.offset, + op.extent.length); + + return seastar::now(); +} + +PGBackend::interruptible_future<> PGBackend::write_same( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& txn, + osd_op_params_t& osd_op_params, + object_stat_sum_t& delta_stats) +{ + const ceph_osd_op& op = osd_op.op; + const uint64_t len = op.writesame.length; + if (len == 0) { + return seastar::now(); + } + if (op.writesame.data_length == 0 || + len % op.writesame.data_length != 0 || + op.writesame.data_length != osd_op.indata.length()) { + throw crimson::osd::invalid_argument(); + } + ceph::bufferlist repeated_indata; + for (uint64_t size = 0; size < len; size += op.writesame.data_length) { + repeated_indata.append(osd_op.indata); + } + maybe_create_new_object(os, txn, delta_stats); + txn.write(coll->get_cid(), ghobject_t{os.oi.soid}, + op.writesame.offset, len, + std::move(repeated_indata), op.flags); + update_size_and_usage(delta_stats, os.oi, op.writesame.offset, len); + osd_op_params.clean_regions.mark_data_region_dirty(op.writesame.offset, len); + return seastar::now(); +} + +PGBackend::write_iertr::future<> PGBackend::writefull( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& txn, + osd_op_params_t& osd_op_params, + object_stat_sum_t& delta_stats) +{ + const ceph_osd_op& op = osd_op.op; + if (op.extent.length != osd_op.indata.length()) { + return crimson::ct_error::invarg::make(); + } + if (!is_offset_and_length_valid(op.extent.offset, op.extent.length)) { + return crimson::ct_error::file_too_large::make(); + } + + return _writefull( + os, + op.extent.truncate_size, + osd_op.indata, + txn, + osd_op_params, + delta_stats, + op.flags); +} + +PGBackend::rollback_iertr::future<> PGBackend::rollback( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& txn, + osd_op_params_t& osd_op_params, + object_stat_sum_t& delta_stats, + crimson::osd::ObjectContextRef head, + crimson::osd::ObjectContextLoader& obc_loader) +{ + const ceph_osd_op& op = osd_op.op; + snapid_t snapid = (uint64_t)op.snap.snapid; + assert(os.oi.soid.is_head()); + logger().debug("{} deleting {} and rolling back to old snap {}", + __func__, os.oi.soid ,snapid); + hobject_t target_coid = os.oi.soid; + target_coid.snap = snapid; + return obc_loader.with_clone_obc_only<RWState::RWWRITE>( + head, target_coid, + [this, &os, &txn, 
&delta_stats, &osd_op_params] + (auto resolved_obc) { + if (resolved_obc->obs.oi.soid.is_head()) { + // no-op: The resolved oid returned the head object + logger().debug("PGBackend::rollback: loaded head_obc: {}" + " do nothing", + resolved_obc->obs.oi.soid); + return rollback_iertr::now(); + } + /* TODO: https://tracker.ceph.com/issues/59114 This implementation will not + * behave correctly for a rados operation consisting of a mutation followed + * by a rollback to a snapshot since the last mutation of the object. + * The correct behavior would be for the rollback to undo the mutation + * earlier in the operation by resolving to the clone created at the start + * of the operation (see resolve_oid). + * Instead, it will select HEAD leaving that mutation intact since the SnapSet won't + * yet contain that clone. This behavior exists in classic as well. + */ + logger().debug("PGBackend::rollback: loaded clone_obc: {}", + resolved_obc->obs.oi.soid); + // 1) Delete current head + if (os.exists) { + txn.remove(coll->get_cid(), ghobject_t{os.oi.soid, + ghobject_t::NO_GEN, shard}); + } + // 2) Clone correct snapshot into head + txn.clone(coll->get_cid(), ghobject_t{resolved_obc->obs.oi.soid}, + ghobject_t{os.oi.soid}); + // Copy clone obc.os.oi to os.oi + os.oi.clear_flag(object_info_t::FLAG_WHITEOUT); + os.oi.copy_user_bits(resolved_obc->obs.oi); + delta_stats.num_bytes -= os.oi.size; + delta_stats.num_bytes += resolved_obc->obs.oi.size; + osd_op_params.clean_regions.mark_data_region_dirty(0, + std::max(os.oi.size, resolved_obc->obs.oi.size)); + osd_op_params.clean_regions.mark_omap_dirty(); + // TODO: 3) Calculate clone_overlaps by following overlaps + // forward from rollback snapshot + // https://tracker.ceph.com/issues/58263 + return rollback_iertr::now(); + }).safe_then_interruptible([] { + logger().debug("PGBackend::rollback succeeded"); + return rollback_iertr::now(); + },// there's no snapshot here, or there's no object. + // if there's no snapshot, we delete the object; + // otherwise, do nothing.
+ crimson::ct_error::enoent::handle( + [this, &os, &snapid, &txn, &delta_stats] { + logger().debug("PGBackend::rollback: deleting head on {}" + " with snap_id of {}" + " because got ENOENT|whiteout on obc lookup", + os.oi.soid, snapid); + return remove(os, txn, delta_stats, false); + }), + rollback_ertr::pass_further{}, + crimson::ct_error::assert_all{"unexpected error in rollback"} + ); +} + +PGBackend::append_ierrorator::future<> PGBackend::append( + ObjectState& os, + OSDOp& osd_op, + ceph::os::Transaction& txn, + osd_op_params_t& osd_op_params, + object_stat_sum_t& delta_stats) +{ + const ceph_osd_op& op = osd_op.op; + if (op.extent.length != osd_op.indata.length()) { + return crimson::ct_error::invarg::make(); + } + maybe_create_new_object(os, txn, delta_stats); + if (op.extent.length) { + txn.write(coll->get_cid(), ghobject_t{os.oi.soid}, + os.oi.size /* offset */, op.extent.length, + std::move(osd_op.indata), op.flags); + update_size_and_usage(delta_stats, os.oi, os.oi.size, + op.extent.length); + osd_op_params.clean_regions.mark_data_region_dirty(os.oi.size, + op.extent.length); + } + return seastar::now(); +} + +PGBackend::write_iertr::future<> PGBackend::truncate( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& txn, + osd_op_params_t& osd_op_params, + object_stat_sum_t& delta_stats) +{ + if (!os.exists || os.oi.is_whiteout()) { + logger().debug("{} object dne, truncate is a no-op", __func__); + return write_ertr::now(); + } + const ceph_osd_op& op = osd_op.op; + if (!is_offset_and_length_valid(op.extent.offset, op.extent.length)) { + return crimson::ct_error::file_too_large::make(); + } + return _truncate( + os, txn, osd_op_params, delta_stats, + op.extent.offset, op.extent.truncate_size, op.extent.truncate_seq); +} + +PGBackend::write_iertr::future<> PGBackend::zero( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& txn, + osd_op_params_t& osd_op_params, + object_stat_sum_t& delta_stats) +{ + if (!os.exists || os.oi.is_whiteout()) { + logger().debug("{} object dne, zero is a no-op", __func__); + return write_ertr::now(); + } + const ceph_osd_op& op = osd_op.op; + if (!is_offset_and_length_valid(op.extent.offset, op.extent.length)) { + return crimson::ct_error::file_too_large::make(); + } + + if (op.extent.offset >= os.oi.size || op.extent.length == 0) { + return write_iertr::now(); // noop + } + + if (op.extent.offset + op.extent.length >= os.oi.size) { + return _truncate( + os, txn, osd_op_params, delta_stats, + op.extent.offset, op.extent.truncate_size, op.extent.truncate_seq); + } + + txn.zero(coll->get_cid(), + ghobject_t{os.oi.soid}, + op.extent.offset, + op.extent.length); + // TODO: modified_ranges.union_of(zeroed); + osd_op_params.clean_regions.mark_data_region_dirty(op.extent.offset, + op.extent.length); + delta_stats.num_wr++; + os.oi.clear_data_digest(); + return write_ertr::now(); +} + +PGBackend::create_iertr::future<> PGBackend::create( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& txn, + object_stat_sum_t& delta_stats) +{ + if (os.exists && !os.oi.is_whiteout() && + (osd_op.op.flags & CEPH_OSD_OP_FLAG_EXCL)) { + // this is an exclusive create + return crimson::ct_error::eexist::make(); + } + + if (osd_op.indata.length()) { + // handle the legacy. `category` is no longer implemented. 
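// When a payload is present it must still decode as a single string (the old
// `category` argument); the value is read and discarded, and a malformed payload
// fails the create with EINVAL.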
+ try { + auto p = osd_op.indata.cbegin(); + std::string category; + decode(category, p); + } catch (buffer::error&) { + return crimson::ct_error::invarg::make(); + } + } + maybe_create_new_object(os, txn, delta_stats); + txn.create(coll->get_cid(), + ghobject_t{os.oi.soid, ghobject_t::NO_GEN, shard}); + return seastar::now(); +} + +PGBackend::interruptible_future<> +PGBackend::remove(ObjectState& os, ceph::os::Transaction& txn) +{ + // todo: snapset + txn.remove(coll->get_cid(), + ghobject_t{os.oi.soid, ghobject_t::NO_GEN, shard}); + os.oi.size = 0; + os.oi.new_object(); + os.exists = false; + // todo: update watchers + if (os.oi.is_whiteout()) { + os.oi.clear_flag(object_info_t::FLAG_WHITEOUT); + } + return seastar::now(); +} + +PGBackend::remove_iertr::future<> +PGBackend::remove(ObjectState& os, ceph::os::Transaction& txn, + object_stat_sum_t& delta_stats, bool whiteout) +{ + if (!os.exists) { + return crimson::ct_error::enoent::make(); + } + + if (!os.exists) { + logger().debug("{} {} does not exist",__func__, os.oi.soid); + return seastar::now(); + } + if (whiteout && os.oi.is_whiteout()) { + logger().debug("{} whiteout set on {} ",__func__, os.oi.soid); + return seastar::now(); + } + txn.remove(coll->get_cid(), + ghobject_t{os.oi.soid, ghobject_t::NO_GEN, shard}); + delta_stats.num_bytes -= os.oi.size; + os.oi.size = 0; + os.oi.new_object(); + + // todo: clone_overlap + if (whiteout) { + logger().debug("{} setting whiteout on {} ",__func__, os.oi.soid); + os.oi.set_flag(object_info_t::FLAG_WHITEOUT); + delta_stats.num_whiteouts++; + txn.create(coll->get_cid(), + ghobject_t{os.oi.soid, ghobject_t::NO_GEN, shard}); + return seastar::now(); + } + // todo: update watchers + if (os.oi.is_whiteout()) { + os.oi.clear_flag(object_info_t::FLAG_WHITEOUT); + delta_stats.num_whiteouts--; + } + delta_stats.num_objects--; + os.exists = false; + return seastar::now(); +} + +PGBackend::interruptible_future<std::tuple<std::vector<hobject_t>, hobject_t>> +PGBackend::list_objects(const hobject_t& start, uint64_t limit) const +{ + auto gstart = start.is_min() ? 
ghobject_t{} : ghobject_t{start, 0, shard}; + return interruptor::make_interruptible(store->list_objects(coll, + gstart, + ghobject_t::get_max(), + limit)) + .then_interruptible([](auto ret) { + auto& [gobjects, next] = ret; + std::vector<hobject_t> objects; + boost::copy(gobjects | + boost::adaptors::filtered([](const ghobject_t& o) { + if (o.is_pgmeta()) { + return false; + } else if (o.hobj.is_temp()) { + return false; + } else { + return o.is_no_gen(); + } + }) | + boost::adaptors::transformed([](const ghobject_t& o) { + return o.hobj; + }), + std::back_inserter(objects)); + return seastar::make_ready_future<std::tuple<std::vector<hobject_t>, hobject_t>>( + std::make_tuple(objects, next.hobj)); + }); +} + +PGBackend::setxattr_ierrorator::future<> PGBackend::setxattr( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& txn, + object_stat_sum_t& delta_stats) +{ + if (local_conf()->osd_max_attr_size > 0 && + osd_op.op.xattr.value_len > local_conf()->osd_max_attr_size) { + return crimson::ct_error::file_too_large::make(); + } + + const auto max_name_len = std::min<uint64_t>( + store->get_max_attr_name_length(), local_conf()->osd_max_attr_name_len); + if (osd_op.op.xattr.name_len > max_name_len) { + return crimson::ct_error::enametoolong::make(); + } + + maybe_create_new_object(os, txn, delta_stats); + + std::string name{"_"}; + ceph::bufferlist val; + { + auto bp = osd_op.indata.cbegin(); + bp.copy(osd_op.op.xattr.name_len, name); + bp.copy(osd_op.op.xattr.value_len, val); + } + logger().debug("setxattr on obj={} for attr={}", os.oi.soid, name); + txn.setattr(coll->get_cid(), ghobject_t{os.oi.soid}, name, val); + delta_stats.num_wr++; + return seastar::now(); +} + +PGBackend::get_attr_ierrorator::future<> PGBackend::getxattr( + const ObjectState& os, + OSDOp& osd_op, + object_stat_sum_t& delta_stats) const +{ + std::string name; + ceph::bufferlist val; + { + auto bp = osd_op.indata.cbegin(); + std::string aname; + bp.copy(osd_op.op.xattr.name_len, aname); + name = "_" + aname; + } + logger().debug("getxattr on obj={} for attr={}", os.oi.soid, name); + return getxattr(os.oi.soid, std::move(name)).safe_then_interruptible( + [&delta_stats, &osd_op] (ceph::bufferlist&& val) { + osd_op.outdata = std::move(val); + osd_op.op.xattr.value_len = osd_op.outdata.length(); + delta_stats.num_rd++; + delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10); + return get_attr_errorator::now(); + }); +} + +PGBackend::get_attr_ierrorator::future<ceph::bufferlist> +PGBackend::getxattr( + const hobject_t& soid, + std::string_view key) const +{ + return store->get_attr(coll, ghobject_t{soid}, key); +} + +PGBackend::get_attr_ierrorator::future<ceph::bufferlist> +PGBackend::getxattr( + const hobject_t& soid, + std::string&& key) const +{ + return seastar::do_with(key, [this, &soid](auto &key) { + return store->get_attr(coll, ghobject_t{soid}, key); + }); +} + +PGBackend::get_attr_ierrorator::future<> PGBackend::get_xattrs( + const ObjectState& os, + OSDOp& osd_op, + object_stat_sum_t& delta_stats) const +{ + return store->get_attrs(coll, ghobject_t{os.oi.soid}).safe_then( + [&delta_stats, &osd_op](auto&& attrs) { + std::vector<std::pair<std::string, bufferlist>> user_xattrs; + ceph::bufferlist bl; + for (auto& [key, val] : attrs) { + if (key.size() > 1 && key[0] == '_') { + bl.append(std::move(val)); + user_xattrs.emplace_back(key.substr(1), std::move(bl)); + } + } + ceph::encode(user_xattrs, osd_op.outdata); + delta_stats.num_rd++; + delta_stats.num_rd_kb += 
shift_round_up(bl.length(), 10); + return get_attr_errorator::now(); + }); +} + +namespace { + +template<typename U, typename V> +int do_cmp_xattr(int op, const U& lhs, const V& rhs) +{ + switch (op) { + case CEPH_OSD_CMPXATTR_OP_EQ: + return lhs == rhs; + case CEPH_OSD_CMPXATTR_OP_NE: + return lhs != rhs; + case CEPH_OSD_CMPXATTR_OP_GT: + return lhs > rhs; + case CEPH_OSD_CMPXATTR_OP_GTE: + return lhs >= rhs; + case CEPH_OSD_CMPXATTR_OP_LT: + return lhs < rhs; + case CEPH_OSD_CMPXATTR_OP_LTE: + return lhs <= rhs; + default: + return -EINVAL; + } +} + +} // anonymous namespace + +static int do_xattr_cmp_u64(int op, uint64_t lhs, bufferlist& rhs_xattr) +{ + uint64_t rhs; + + if (rhs_xattr.length() > 0) { + const char* first = rhs_xattr.c_str(); + if (auto [p, ec] = std::from_chars(first, first + rhs_xattr.length(), rhs); + ec != std::errc()) { + return -EINVAL; + } + } else { + rhs = 0; + } + logger().debug("do_xattr_cmp_u64 '{}' vs '{}' op {}", lhs, rhs, op); + return do_cmp_xattr(op, lhs, rhs); +} + +PGBackend::cmp_xattr_ierrorator::future<> PGBackend::cmp_xattr( + const ObjectState& os, + OSDOp& osd_op, + object_stat_sum_t& delta_stats) const +{ + std::string name{"_"}; + auto bp = osd_op.indata.cbegin(); + bp.copy(osd_op.op.xattr.name_len, name); + + logger().debug("cmpxattr on obj={} for attr={}", os.oi.soid, name); + return getxattr(os.oi.soid, std::move(name)).safe_then_interruptible( + [&delta_stats, &osd_op] (auto &&xattr) -> cmp_xattr_ierrorator::future<> { + delta_stats.num_rd++; + delta_stats.num_rd_kb += shift_round_up(osd_op.op.xattr.value_len, 10); + + int result = 0; + auto bp = osd_op.indata.cbegin(); + bp += osd_op.op.xattr.name_len; + + switch (osd_op.op.xattr.cmp_mode) { + case CEPH_OSD_CMPXATTR_MODE_STRING: + { + string lhs; + bp.copy(osd_op.op.xattr.value_len, lhs); + string_view rhs(xattr.c_str(), xattr.length()); + result = do_cmp_xattr(osd_op.op.xattr.cmp_op, lhs, rhs); + logger().debug("cmpxattr lhs={}, rhs={}", lhs, rhs); + } + break; + case CEPH_OSD_CMPXATTR_MODE_U64: + { + uint64_t lhs; + try { + decode(lhs, bp); + } catch (ceph::buffer::error& e) { + logger().info("cmp_xattr: buffer error expection"); + result = -EINVAL; + break; + } + result = do_xattr_cmp_u64(osd_op.op.xattr.cmp_op, lhs, xattr); + } + break; + default: + logger().info("bad cmp mode {}", osd_op.op.xattr.cmp_mode); + result = -EINVAL; + } + if (result == 0) { + logger().info("cmp_xattr: comparison returned false"); + return crimson::ct_error::ecanceled::make(); + } else if (result == -EINVAL) { + return crimson::ct_error::invarg::make(); + } else { + osd_op.rval = 1; + return cmp_xattr_ierrorator::now(); + } + }).handle_error_interruptible( + crimson::ct_error::enodata::handle([&delta_stats, &osd_op] () + ->cmp_xattr_errorator::future<> { + delta_stats.num_rd++; + delta_stats.num_rd_kb += shift_round_up(osd_op.op.xattr.value_len, 10); + return crimson::ct_error::ecanceled::make(); + }), + cmp_xattr_errorator::pass_further{} + ); +} + +PGBackend::rm_xattr_iertr::future<> +PGBackend::rm_xattr( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& txn) +{ + if (!os.exists || os.oi.is_whiteout()) { + logger().debug("{}: {} DNE", __func__, os.oi.soid); + return crimson::ct_error::enoent::make(); + } + auto bp = osd_op.indata.cbegin(); + string attr_name{"_"}; + bp.copy(osd_op.op.xattr.name_len, attr_name); + txn.rmattr(coll->get_cid(), ghobject_t{os.oi.soid}, attr_name); + return rm_xattr_iertr::now(); +} + +void PGBackend::clone( + /* const */object_info_t& snap_oi, + const 
ObjectState& os, + const ObjectState& d_os, + ceph::os::Transaction& txn) +{ + // See OpsExecutor::execute_clone documentation + txn.clone(coll->get_cid(), ghobject_t{os.oi.soid}, ghobject_t{d_os.oi.soid}); + { + ceph::bufferlist bv; + snap_oi.encode_no_oid(bv, CEPH_FEATURES_ALL); + txn.setattr(coll->get_cid(), ghobject_t{d_os.oi.soid}, OI_ATTR, bv); + } + txn.rmattr(coll->get_cid(), ghobject_t{d_os.oi.soid}, SS_ATTR); +} + +using get_omap_ertr = + crimson::os::FuturizedStore::Shard::read_errorator::extend< + crimson::ct_error::enodata>; +using get_omap_iertr = + ::crimson::interruptible::interruptible_errorator< + ::crimson::osd::IOInterruptCondition, + get_omap_ertr>; +static +get_omap_iertr::future< + crimson::os::FuturizedStore::Shard::omap_values_t> +maybe_get_omap_vals_by_keys( + crimson::os::FuturizedStore::Shard* store, + const crimson::os::CollectionRef& coll, + const object_info_t& oi, + const std::set<std::string>& keys_to_get) +{ + if (oi.is_omap()) { + return store->omap_get_values(coll, ghobject_t{oi.soid}, keys_to_get); + } else { + return crimson::ct_error::enodata::make(); + } +} + +static +get_omap_iertr::future< + std::tuple<bool, crimson::os::FuturizedStore::Shard::omap_values_t>> +maybe_get_omap_vals( + crimson::os::FuturizedStore::Shard* store, + const crimson::os::CollectionRef& coll, + const object_info_t& oi, + const std::string& start_after) +{ + if (oi.is_omap()) { + return store->omap_get_values(coll, ghobject_t{oi.soid}, start_after); + } else { + return crimson::ct_error::enodata::make(); + } +} + +PGBackend::ll_read_ierrorator::future<ceph::bufferlist> +PGBackend::omap_get_header( + const crimson::os::CollectionRef& c, + const ghobject_t& oid) const +{ + return store->omap_get_header(c, oid) + .handle_error( + crimson::ct_error::enodata::handle([] { + return seastar::make_ready_future<bufferlist>(); + }), + ll_read_errorator::pass_further{} + ); +} + +PGBackend::ll_read_ierrorator::future<> +PGBackend::omap_get_header( + const ObjectState& os, + OSDOp& osd_op, + object_stat_sum_t& delta_stats) const +{ + if (os.oi.is_omap()) { + return omap_get_header(coll, ghobject_t{os.oi.soid}).safe_then_interruptible( + [&delta_stats, &osd_op] (ceph::bufferlist&& header) { + osd_op.outdata = std::move(header); + delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10); + delta_stats.num_rd++; + return seastar::now(); + }); + } else { + // no omap? return empty data but not ENOENT. This is imporant for + // the case when the object is being creating due to to may_write(). 
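// The same convention is used by maybe_get_omap_vals*() above, which report ENODATA
// for objects without FLAG_OMAP so their callers can encode an empty result rather
// than fail the read.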
+ return seastar::now(); + } +} + +PGBackend::ll_read_ierrorator::future<> +PGBackend::omap_get_keys( + const ObjectState& os, + OSDOp& osd_op, + object_stat_sum_t& delta_stats) const +{ + if (!os.exists || os.oi.is_whiteout()) { + logger().debug("{}: object does not exist: {}", os.oi.soid); + return crimson::ct_error::enoent::make(); + } + std::string start_after; + uint64_t max_return; + try { + auto p = osd_op.indata.cbegin(); + decode(start_after, p); + decode(max_return, p); + } catch (buffer::error&) { + throw crimson::osd::invalid_argument{}; + } + max_return = + std::min(max_return, local_conf()->osd_max_omap_entries_per_request); + + + // TODO: truly chunk the reading + return maybe_get_omap_vals(store, coll, os.oi, start_after).safe_then_interruptible( + [=,&delta_stats, &osd_op](auto ret) { + ceph::bufferlist result; + bool truncated = false; + uint32_t num = 0; + for (auto &[key, val] : std::get<1>(ret)) { + if (num >= max_return || + result.length() >= local_conf()->osd_max_omap_bytes_per_request) { + truncated = true; + break; + } + encode(key, result); + ++num; + } + encode(num, osd_op.outdata); + osd_op.outdata.claim_append(result); + encode(truncated, osd_op.outdata); + delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10); + delta_stats.num_rd++; + return seastar::now(); + }).handle_error_interruptible( + crimson::ct_error::enodata::handle([&osd_op] { + uint32_t num = 0; + bool truncated = false; + encode(num, osd_op.outdata); + encode(truncated, osd_op.outdata); + osd_op.rval = 0; + return seastar::now(); + }), + ll_read_errorator::pass_further{} + ); +} +static +PGBackend::omap_cmp_ertr::future<> do_omap_val_cmp( + std::map<std::string, bufferlist, std::less<>> out, + std::map<std::string, std::pair<bufferlist, int>> assertions) +{ + bufferlist empty; + for (const auto &[akey, avalue] : assertions) { + const auto [abl, aflag] = avalue; + auto out_entry = out.find(akey); + bufferlist &bl = (out_entry != out.end()) ? 
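/* a key missing from the on-disk map compares as an empty bufferlist */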
out_entry->second : empty; + switch (aflag) { + case CEPH_OSD_CMPXATTR_OP_EQ: + if (!(bl == abl)) { + return crimson::ct_error::ecanceled::make(); + } + break; + case CEPH_OSD_CMPXATTR_OP_LT: + if (!(bl < abl)) { + return crimson::ct_error::ecanceled::make(); + } + break; + case CEPH_OSD_CMPXATTR_OP_GT: + if (!(bl > abl)) { + return crimson::ct_error::ecanceled::make(); + } + break; + default: + return crimson::ct_error::invarg::make(); + } + } + return PGBackend::omap_cmp_ertr::now(); +} +PGBackend::omap_cmp_iertr::future<> +PGBackend::omap_cmp( + const ObjectState& os, + OSDOp& osd_op, + object_stat_sum_t& delta_stats) const +{ + if (!os.exists || os.oi.is_whiteout()) { + logger().debug("{}: object does not exist: {}", os.oi.soid); + return crimson::ct_error::enoent::make(); + } + + auto bp = osd_op.indata.cbegin(); + std::map<std::string, std::pair<bufferlist, int> > assertions; + try { + decode(assertions, bp); + } catch (buffer::error&) { + return crimson::ct_error::invarg::make(); + } + + delta_stats.num_rd++; + if (os.oi.is_omap()) { + std::set<std::string> to_get; + for (auto &i: assertions) { + to_get.insert(i.first); + } + return store->omap_get_values(coll, ghobject_t{os.oi.soid}, to_get) + .safe_then([=, &osd_op] (auto&& out) -> omap_cmp_iertr::future<> { + osd_op.rval = 0; + return do_omap_val_cmp(out, assertions); + }); + } else { + return crimson::ct_error::ecanceled::make(); + } +} +PGBackend::ll_read_ierrorator::future<> +PGBackend::omap_get_vals( + const ObjectState& os, + OSDOp& osd_op, + object_stat_sum_t& delta_stats) const +{ + if (!os.exists || os.oi.is_whiteout()) { + logger().debug("{}: object does not exist: {}", os.oi.soid); + return crimson::ct_error::enoent::make(); + } + std::string start_after; + uint64_t max_return; + std::string filter_prefix; + try { + auto p = osd_op.indata.cbegin(); + decode(start_after, p); + decode(max_return, p); + decode(filter_prefix, p); + } catch (buffer::error&) { + throw crimson::osd::invalid_argument{}; + } + + max_return = \ + std::min(max_return, local_conf()->osd_max_omap_entries_per_request); + delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10); + delta_stats.num_rd++; + + // TODO: truly chunk the reading + return maybe_get_omap_vals(store, coll, os.oi, start_after) + .safe_then_interruptible( + [=, &osd_op] (auto&& ret) { + auto [done, vals] = std::move(ret); + assert(done); + ceph::bufferlist result; + bool truncated = false; + uint32_t num = 0; + auto iter = filter_prefix > start_after ? 
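/* maybe_get_omap_vals() already started listing after start_after; when filter_prefix
 * sorts later than start_after we can jump straight to lower_bound(filter_prefix),
 * otherwise we scan from the beginning of the returned map. */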
vals.lower_bound(filter_prefix) + : std::begin(vals); + for (; iter != std::end(vals); ++iter) { + const auto& [key, value] = *iter; + if (key.substr(0, filter_prefix.size()) != filter_prefix) { + break; + } else if (num >= max_return || + result.length() >= local_conf()->osd_max_omap_bytes_per_request) { + truncated = true; + break; + } + encode(key, result); + encode(value, result); + ++num; + } + encode(num, osd_op.outdata); + osd_op.outdata.claim_append(result); + encode(truncated, osd_op.outdata); + return ll_read_errorator::now(); + }).handle_error_interruptible( + crimson::ct_error::enodata::handle([&osd_op] { + encode(uint32_t{0} /* num */, osd_op.outdata); + encode(bool{false} /* truncated */, osd_op.outdata); + osd_op.rval = 0; + return ll_read_errorator::now(); + }), + ll_read_errorator::pass_further{} + ); +} + +PGBackend::ll_read_ierrorator::future<> +PGBackend::omap_get_vals_by_keys( + const ObjectState& os, + OSDOp& osd_op, + object_stat_sum_t& delta_stats) const +{ + if (!os.exists || os.oi.is_whiteout()) { + logger().debug("{}: object does not exist: {}", __func__, os.oi.soid); + return crimson::ct_error::enoent::make(); + } + + std::set<std::string> keys_to_get; + try { + auto p = osd_op.indata.cbegin(); + decode(keys_to_get, p); + } catch (buffer::error&) { + throw crimson::osd::invalid_argument(); + } + delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10); + delta_stats.num_rd++; + return maybe_get_omap_vals_by_keys(store, coll, os.oi, keys_to_get) + .safe_then_interruptible( + [&osd_op] (crimson::os::FuturizedStore::Shard::omap_values_t&& vals) { + encode(vals, osd_op.outdata); + return ll_read_errorator::now(); + }).handle_error_interruptible( + crimson::ct_error::enodata::handle([&osd_op] { + uint32_t num = 0; + encode(num, osd_op.outdata); + osd_op.rval = 0; + return ll_read_errorator::now(); + }), + ll_read_errorator::pass_further{} + ); +} + +PGBackend::interruptible_future<> +PGBackend::omap_set_vals( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& txn, + osd_op_params_t& osd_op_params, + object_stat_sum_t& delta_stats) +{ + maybe_create_new_object(os, txn, delta_stats); + + ceph::bufferlist to_set_bl; + try { + auto p = osd_op.indata.cbegin(); + decode_str_str_map_to_bl(p, &to_set_bl); + } catch (buffer::error&) { + throw crimson::osd::invalid_argument{}; + } + + txn.omap_setkeys(coll->get_cid(), ghobject_t{os.oi.soid}, to_set_bl); + osd_op_params.clean_regions.mark_omap_dirty(); + delta_stats.num_wr++; + delta_stats.num_wr_kb += shift_round_up(to_set_bl.length(), 10); + os.oi.set_flag(object_info_t::FLAG_OMAP); + os.oi.clear_omap_digest(); + return seastar::now(); +} + +PGBackend::interruptible_future<> +PGBackend::omap_set_header( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& txn, + osd_op_params_t& osd_op_params, + object_stat_sum_t& delta_stats) +{ + maybe_create_new_object(os, txn, delta_stats); + txn.omap_setheader(coll->get_cid(), ghobject_t{os.oi.soid}, osd_op.indata); + osd_op_params.clean_regions.mark_omap_dirty(); + delta_stats.num_wr++; + os.oi.set_flag(object_info_t::FLAG_OMAP); + os.oi.clear_omap_digest(); + return seastar::now(); +} + +PGBackend::interruptible_future<> PGBackend::omap_remove_range( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& txn, + object_stat_sum_t& delta_stats) +{ + std::string key_begin, key_end; + try { + auto p = osd_op.indata.cbegin(); + decode(key_begin, p); + decode(key_end, p); + } catch (buffer::error& e) { + throw 
crimson::osd::invalid_argument{}; + } + txn.omap_rmkeyrange(coll->get_cid(), ghobject_t{os.oi.soid}, key_begin, key_end); + delta_stats.num_wr++; + os.oi.clear_omap_digest(); + return seastar::now(); +} + +PGBackend::interruptible_future<> PGBackend::omap_remove_key( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& txn) +{ + ceph::bufferlist to_rm_bl; + try { + auto p = osd_op.indata.cbegin(); + decode_str_set_to_bl(p, &to_rm_bl); + } catch (buffer::error& e) { + throw crimson::osd::invalid_argument{}; + } + txn.omap_rmkeys(coll->get_cid(), ghobject_t{os.oi.soid}, to_rm_bl); + // TODO: + // ctx->clean_regions.mark_omap_dirty(); + // ctx->delta_stats.num_wr++; + os.oi.clear_omap_digest(); + return seastar::now(); +} + +PGBackend::omap_clear_iertr::future<> +PGBackend::omap_clear( + ObjectState& os, + OSDOp& osd_op, + ceph::os::Transaction& txn, + osd_op_params_t& osd_op_params, + object_stat_sum_t& delta_stats) +{ + if (!os.exists || os.oi.is_whiteout()) { + logger().debug("{}: object does not exist: {}", os.oi.soid); + return crimson::ct_error::enoent::make(); + } + if (!os.oi.is_omap()) { + return omap_clear_ertr::now(); + } + txn.omap_clear(coll->get_cid(), ghobject_t{os.oi.soid}); + osd_op_params.clean_regions.mark_omap_dirty(); + delta_stats.num_wr++; + os.oi.clear_omap_digest(); + os.oi.clear_flag(object_info_t::FLAG_OMAP); + return omap_clear_ertr::now(); +} + +PGBackend::interruptible_future<struct stat> +PGBackend::stat( + CollectionRef c, + const ghobject_t& oid) const +{ + return store->stat(c, oid); +} + +PGBackend::read_errorator::future<std::map<uint64_t, uint64_t>> +PGBackend::fiemap( + CollectionRef c, + const ghobject_t& oid, + uint64_t off, + uint64_t len) +{ + return store->fiemap(c, oid, off, len); +} + +PGBackend::write_iertr::future<> PGBackend::tmapput( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& txn, + object_stat_sum_t& delta_stats, + osd_op_params_t& osd_op_params) +{ + logger().debug("PGBackend::tmapput: {}", os.oi.soid); + auto ret = crimson::common::do_tmap_put(osd_op.indata.cbegin()); + if (!ret.has_value()) { + logger().debug("PGBackend::tmapup: {}, ret={}", os.oi.soid, ret.error()); + ceph_assert(ret.error() == -EINVAL); + return crimson::ct_error::invarg::make(); + } else { + auto bl = std::move(ret.value()); + return _writefull( + os, + bl.length(), + std::move(bl), + txn, + osd_op_params, + delta_stats, + 0); + } +} + +PGBackend::tmapup_iertr::future<> PGBackend::tmapup( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& txn, + object_stat_sum_t& delta_stats, + osd_op_params_t& osd_op_params) +{ + logger().debug("PGBackend::tmapup: {}", os.oi.soid); + return PGBackend::write_iertr::now( + ).si_then([this, &os] { + return _read(os.oi.soid, 0, os.oi.size, 0); + }).handle_error_interruptible( + crimson::ct_error::enoent::handle([](auto &) { + return seastar::make_ready_future<bufferlist>(); + }), + PGBackend::write_iertr::pass_further{}, + crimson::ct_error::assert_all{"read error in mutate_object_contents"} + ).si_then([this, &os, &osd_op, &txn, + &delta_stats, &osd_op_params] + (auto &&bl) mutable -> PGBackend::tmapup_iertr::future<> { + auto result = crimson::common::do_tmap_up( + osd_op.indata.cbegin(), + std::move(bl)); + if (!result.has_value()) { + int ret = result.error(); + logger().debug("PGBackend::tmapup: {}, ret={}", os.oi.soid, ret); + switch (ret) { + case -EEXIST: + return crimson::ct_error::eexist::make(); + case -ENOENT: + return crimson::ct_error::enoent::make(); + case -EINVAL: + 
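// EINVAL: do_tmap_up could not parse or apply the TMAP update stream; any other
// value is unexpected, hence the assert in the default branch below.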
return crimson::ct_error::invarg::make(); + default: + ceph_assert(0 == "impossible error"); + return crimson::ct_error::invarg::make(); + } + } + + logger().debug( + "PGBackend::tmapup: {}, result.value.length()={}, ret=0", + os.oi.soid, result.value().length()); + return _writefull( + os, + result.value().length(), + result.value(), + txn, + osd_op_params, + delta_stats, + 0); + }); +} + +PGBackend::read_ierrorator::future<> PGBackend::tmapget( + const ObjectState& os, + OSDOp& osd_op, + object_stat_sum_t& delta_stats) +{ + logger().debug("PGBackend::tmapget: {}", os.oi.soid); + const auto& oi = os.oi; + logger().debug("PGBackend::tmapget: read {} 0~{}", oi.soid, oi.size); + if (!os.exists || os.oi.is_whiteout()) { + logger().debug("PGBackend::tmapget: {} DNE", os.oi.soid); + return crimson::ct_error::enoent::make(); + } + + return _read(oi.soid, 0, oi.size, 0).safe_then_interruptible_tuple( + [&delta_stats, &osd_op](auto&& bl) -> read_errorator::future<> { + logger().debug("PGBackend::tmapget: data length: {}", bl.length()); + osd_op.op.extent.length = bl.length(); + osd_op.rval = 0; + delta_stats.num_rd++; + delta_stats.num_rd_kb += shift_round_up(bl.length(), 10); + osd_op.outdata = std::move(bl); + return read_errorator::now(); + }, crimson::ct_error::input_output_error::handle([] { + return read_errorator::future<>{crimson::ct_error::object_corrupted::make()}; + }), + read_errorator::pass_further{}); +} + diff --git a/src/crimson/osd/pg_backend.h b/src/crimson/osd/pg_backend.h new file mode 100644 index 000000000..fbad37d4c --- /dev/null +++ b/src/crimson/osd/pg_backend.h @@ -0,0 +1,448 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <map> +#include <memory> +#include <string> +#include <boost/container/flat_set.hpp> + +#include "include/rados.h" + +#include "crimson/os/futurized_store.h" +#include "crimson/os/futurized_collection.h" +#include "crimson/osd/acked_peers.h" +#include "crimson/common/shared_lru.h" +#include "messages/MOSDOp.h" +#include "messages/MOSDOpReply.h" +#include "os/Transaction.h" +#include "osd/osd_types.h" +#include "crimson/osd/object_context.h" +#include "crimson/osd/osd_operation.h" +#include "crimson/osd/osd_operations/osdop_params.h" + +struct hobject_t; + +namespace ceph::os { + class Transaction; +} + +namespace crimson::osd { + class ShardServices; + class PG; + class ObjectContextLoader; +} + +class PGBackend +{ +protected: + using CollectionRef = crimson::os::CollectionRef; + using ec_profile_t = std::map<std::string, std::string>; + // low-level read errorator + using ll_read_errorator = crimson::os::FuturizedStore::Shard::read_errorator; + using ll_read_ierrorator = + ::crimson::interruptible::interruptible_errorator< + ::crimson::osd::IOInterruptCondition, + ll_read_errorator>; + +public: + using load_metadata_ertr = crimson::errorator< + crimson::ct_error::object_corrupted>; + using load_metadata_iertr = + ::crimson::interruptible::interruptible_errorator< + ::crimson::osd::IOInterruptCondition, + load_metadata_ertr>; + using interruptor = + ::crimson::interruptible::interruptor< + ::crimson::osd::IOInterruptCondition>; + template <typename T = void> + using interruptible_future = + ::crimson::interruptible::interruptible_future< + ::crimson::osd::IOInterruptCondition, T>; + using rep_op_fut_t = + std::tuple<interruptible_future<>, + interruptible_future<crimson::osd::acked_peers_t>>; + PGBackend(shard_id_t shard, CollectionRef coll, + crimson::osd::ShardServices 
&shard_services, + DoutPrefixProvider &dpp); + virtual ~PGBackend() = default; + static std::unique_ptr<PGBackend> create(pg_t pgid, + const pg_shard_t pg_shard, + const pg_pool_t& pool, + crimson::os::CollectionRef coll, + crimson::osd::ShardServices& shard_services, + const ec_profile_t& ec_profile, + DoutPrefixProvider &dpp); + using attrs_t = + std::map<std::string, ceph::bufferptr, std::less<>>; + using read_errorator = ll_read_errorator::extend< + crimson::ct_error::object_corrupted>; + using read_ierrorator = + ::crimson::interruptible::interruptible_errorator< + ::crimson::osd::IOInterruptCondition, + read_errorator>; + read_ierrorator::future<> read( + const ObjectState& os, + OSDOp& osd_op, + object_stat_sum_t& delta_stats); + read_ierrorator::future<> sparse_read( + const ObjectState& os, + OSDOp& osd_op, + object_stat_sum_t& delta_stats); + using checksum_errorator = ll_read_errorator::extend< + crimson::ct_error::object_corrupted, + crimson::ct_error::invarg>; + using checksum_ierrorator = + ::crimson::interruptible::interruptible_errorator< + ::crimson::osd::IOInterruptCondition, + checksum_errorator>; + checksum_ierrorator::future<> checksum( + const ObjectState& os, + OSDOp& osd_op); + using cmp_ext_errorator = ll_read_errorator::extend< + crimson::ct_error::invarg, + crimson::ct_error::cmp_fail>; + using cmp_ext_ierrorator = + ::crimson::interruptible::interruptible_errorator< + ::crimson::osd::IOInterruptCondition, + cmp_ext_errorator>; + cmp_ext_ierrorator::future<> cmp_ext( + const ObjectState& os, + OSDOp& osd_op); + using stat_errorator = crimson::errorator<crimson::ct_error::enoent>; + using stat_ierrorator = + ::crimson::interruptible::interruptible_errorator< + ::crimson::osd::IOInterruptCondition, + stat_errorator>; + stat_ierrorator::future<> stat( + const ObjectState& os, + OSDOp& osd_op, + object_stat_sum_t& delta_stats); + + // TODO: switch the entire write family to errorator. 
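// Naming convention in this header: `X_ertr` is a plain errorator listing the errors
// an operation may yield, and `X_iertr` wraps the same set in an
// interruptible_errorator keyed on IOInterruptCondition, so the future chain is
// aborted when the PG enters a new interval or the OSD stops.  A caller typically
// handles the errors it cares about and forwards the rest; minimal sketch only, all
// variable names (backend, os, osd_op, txn, params, delta_stats) are illustrative:
//
//   backend->truncate(os, osd_op, txn, params, delta_stats)
//     .handle_error_interruptible(
//       crimson::ct_error::file_too_large::handle([] {
//         // for the purpose of this sketch, just swallow the error
//         return seastar::now();
//       }),
//       PGBackend::write_ertr::pass_further{});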
+ using write_ertr = crimson::errorator< + crimson::ct_error::file_too_large, + crimson::ct_error::invarg>; + using write_iertr = + ::crimson::interruptible::interruptible_errorator< + ::crimson::osd::IOInterruptCondition, + write_ertr>; + using create_ertr = crimson::errorator< + crimson::ct_error::invarg, + crimson::ct_error::eexist>; + using create_iertr = + ::crimson::interruptible::interruptible_errorator< + ::crimson::osd::IOInterruptCondition, + create_ertr>; + create_iertr::future<> create( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& trans, + object_stat_sum_t& delta_stats); + using remove_ertr = crimson::errorator< + crimson::ct_error::enoent>; + using remove_iertr = + ::crimson::interruptible::interruptible_errorator< + ::crimson::osd::IOInterruptCondition, + remove_ertr>; + remove_iertr::future<> remove( + ObjectState& os, + ceph::os::Transaction& txn, + object_stat_sum_t& delta_stats, + bool whiteout); + interruptible_future<> remove( + ObjectState& os, + ceph::os::Transaction& txn); + interruptible_future<> set_allochint( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& trans, + object_stat_sum_t& delta_stats); + write_iertr::future<> write( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& trans, + osd_op_params_t& osd_op_params, + object_stat_sum_t& delta_stats); + interruptible_future<> write_same( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& trans, + osd_op_params_t& osd_op_params, + object_stat_sum_t& delta_stats); + write_iertr::future<> writefull( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& trans, + osd_op_params_t& osd_op_params, + object_stat_sum_t& delta_stats); + using append_errorator = crimson::errorator< + crimson::ct_error::invarg>; + using append_ierrorator = + ::crimson::interruptible::interruptible_errorator< + ::crimson::osd::IOInterruptCondition, + append_errorator>; + append_ierrorator::future<> append( + ObjectState& os, + OSDOp& osd_op, + ceph::os::Transaction& trans, + osd_op_params_t& osd_op_params, + object_stat_sum_t& delta_stats); + using rollback_ertr = crimson::errorator< + crimson::ct_error::enoent>; + using rollback_iertr = + ::crimson::interruptible::interruptible_errorator< + ::crimson::osd::IOInterruptCondition, + rollback_ertr>; + rollback_iertr::future<> rollback( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& txn, + osd_op_params_t& osd_op_params, + object_stat_sum_t& delta_stats, + crimson::osd::ObjectContextRef head, + crimson::osd::ObjectContextLoader& obc_loader); + write_iertr::future<> truncate( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& trans, + osd_op_params_t& osd_op_params, + object_stat_sum_t& delta_stats); + write_iertr::future<> zero( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& trans, + osd_op_params_t& osd_op_params, + object_stat_sum_t& delta_stats); + rep_op_fut_t mutate_object( + std::set<pg_shard_t> pg_shards, + crimson::osd::ObjectContextRef &&obc, + ceph::os::Transaction&& txn, + osd_op_params_t&& osd_op_p, + epoch_t min_epoch, + epoch_t map_epoch, + std::vector<pg_log_entry_t>&& log_entries); + interruptible_future<std::tuple<std::vector<hobject_t>, hobject_t>> list_objects( + const hobject_t& start, + uint64_t limit) const; + using setxattr_errorator = crimson::errorator< + crimson::ct_error::file_too_large, + crimson::ct_error::enametoolong>; + using setxattr_ierrorator = + ::crimson::interruptible::interruptible_errorator< + 
::crimson::osd::IOInterruptCondition, + setxattr_errorator>; + setxattr_ierrorator::future<> setxattr( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& trans, + object_stat_sum_t& delta_stats); + using get_attr_errorator = crimson::os::FuturizedStore::Shard::get_attr_errorator; + using get_attr_ierrorator = + ::crimson::interruptible::interruptible_errorator< + ::crimson::osd::IOInterruptCondition, + get_attr_errorator>; + get_attr_ierrorator::future<> getxattr( + const ObjectState& os, + OSDOp& osd_op, + object_stat_sum_t& delta_stats) const; + get_attr_ierrorator::future<ceph::bufferlist> getxattr( + const hobject_t& soid, + std::string_view key) const; + get_attr_ierrorator::future<ceph::bufferlist> getxattr( + const hobject_t& soid, + std::string&& key) const; + get_attr_ierrorator::future<> get_xattrs( + const ObjectState& os, + OSDOp& osd_op, + object_stat_sum_t& delta_stats) const; + using cmp_xattr_errorator = get_attr_errorator::extend< + crimson::ct_error::ecanceled, + crimson::ct_error::invarg>; + using cmp_xattr_ierrorator = + ::crimson::interruptible::interruptible_errorator< + ::crimson::osd::IOInterruptCondition, + cmp_xattr_errorator>; + cmp_xattr_ierrorator::future<> cmp_xattr( + const ObjectState& os, + OSDOp& osd_op, + object_stat_sum_t& delta_stats) const; + using rm_xattr_ertr = crimson::errorator<crimson::ct_error::enoent>; + using rm_xattr_iertr = + ::crimson::interruptible::interruptible_errorator< + ::crimson::osd::IOInterruptCondition, + rm_xattr_ertr>; + rm_xattr_iertr::future<> rm_xattr( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& trans); + void clone( + /* const */object_info_t& snap_oi, + const ObjectState& os, + const ObjectState& d_os, + ceph::os::Transaction& trans); + interruptible_future<struct stat> stat( + CollectionRef c, + const ghobject_t& oid) const; + read_errorator::future<std::map<uint64_t, uint64_t>> fiemap( + CollectionRef c, + const ghobject_t& oid, + uint64_t off, + uint64_t len); + + write_iertr::future<> tmapput( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& trans, + object_stat_sum_t& delta_stats, + osd_op_params_t& osd_op_params); + + using tmapup_ertr = write_ertr::extend< + crimson::ct_error::enoent, + crimson::ct_error::eexist>; + using tmapup_iertr = ::crimson::interruptible::interruptible_errorator< + ::crimson::osd::IOInterruptCondition, + tmapup_ertr>; + tmapup_iertr::future<> tmapup( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& trans, + object_stat_sum_t& delta_stats, + osd_op_params_t& osd_op_params); + + read_ierrorator::future<> tmapget( + const ObjectState& os, + OSDOp& osd_op, + object_stat_sum_t& delta_stats); + + // OMAP + ll_read_ierrorator::future<> omap_get_keys( + const ObjectState& os, + OSDOp& osd_op, + object_stat_sum_t& delta_stats) const; + using omap_cmp_ertr = + crimson::os::FuturizedStore::Shard::read_errorator::extend< + crimson::ct_error::ecanceled, + crimson::ct_error::invarg>; + using omap_cmp_iertr = + ::crimson::interruptible::interruptible_errorator< + ::crimson::osd::IOInterruptCondition, + omap_cmp_ertr>; + omap_cmp_iertr::future<> omap_cmp( + const ObjectState& os, + OSDOp& osd_op, + object_stat_sum_t& delta_stats) const; + ll_read_ierrorator::future<> omap_get_vals( + const ObjectState& os, + OSDOp& osd_op, + object_stat_sum_t& delta_stats) const; + ll_read_ierrorator::future<> omap_get_vals_by_keys( + const ObjectState& os, + OSDOp& osd_op, + object_stat_sum_t& delta_stats) const; + interruptible_future<> 
omap_set_vals( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& trans, + osd_op_params_t& osd_op_params, + object_stat_sum_t& delta_stats); + ll_read_ierrorator::future<ceph::bufferlist> omap_get_header( + const crimson::os::CollectionRef& c, + const ghobject_t& oid) const; + ll_read_ierrorator::future<> omap_get_header( + const ObjectState& os, + OSDOp& osd_op, + object_stat_sum_t& delta_stats) const; + interruptible_future<> omap_set_header( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& trans, + osd_op_params_t& osd_op_params, + object_stat_sum_t& delta_stats); + interruptible_future<> omap_remove_range( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& trans, + object_stat_sum_t& delta_stats); + interruptible_future<> omap_remove_key( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& trans); + using omap_clear_ertr = crimson::errorator<crimson::ct_error::enoent>; + using omap_clear_iertr = + ::crimson::interruptible::interruptible_errorator< + ::crimson::osd::IOInterruptCondition, + omap_clear_ertr>; + omap_clear_iertr::future<> omap_clear( + ObjectState& os, + OSDOp& osd_op, + ceph::os::Transaction& trans, + osd_op_params_t& osd_op_params, + object_stat_sum_t& delta_stats); + + virtual void got_rep_op_reply(const MOSDRepOpReply&) {} + virtual seastar::future<> stop() = 0; + virtual void on_actingset_changed(bool same_primary) = 0; +protected: + const shard_id_t shard; + CollectionRef coll; + crimson::osd::ShardServices &shard_services; + DoutPrefixProvider &dpp; ///< provides log prefix context + crimson::os::FuturizedStore::Shard* store; + virtual seastar::future<> request_committed( + const osd_reqid_t& reqid, + const eversion_t& at_version) = 0; +public: + struct loaded_object_md_t { + ObjectState os; + crimson::osd::SnapSetContextRef ssc; + using ref = std::unique_ptr<loaded_object_md_t>; + }; + load_metadata_iertr::future<loaded_object_md_t::ref> + load_metadata( + const hobject_t &oid); + +private: + virtual ll_read_ierrorator::future<ceph::bufferlist> _read( + const hobject_t& hoid, + size_t offset, + size_t length, + uint32_t flags) = 0; + write_iertr::future<> _writefull( + ObjectState& os, + off_t truncate_size, + const bufferlist& bl, + ceph::os::Transaction& txn, + osd_op_params_t& osd_op_params, + object_stat_sum_t& delta_stats, + unsigned flags); + write_iertr::future<> _truncate( + ObjectState& os, + ceph::os::Transaction& txn, + osd_op_params_t& osd_op_params, + object_stat_sum_t& delta_stats, + size_t offset, + size_t truncate_size, + uint32_t truncate_seq); + + bool maybe_create_new_object(ObjectState& os, + ceph::os::Transaction& txn, + object_stat_sum_t& delta_stats); + void update_size_and_usage(object_stat_sum_t& delta_stats, + object_info_t& oi, uint64_t offset, + uint64_t length, bool write_full = false); + void truncate_update_size_and_usage( + object_stat_sum_t& delta_stats, + object_info_t& oi, + uint64_t truncate_size); + virtual rep_op_fut_t + _submit_transaction(std::set<pg_shard_t>&& pg_shards, + const hobject_t& hoid, + ceph::os::Transaction&& txn, + osd_op_params_t&& osd_op_p, + epoch_t min_epoch, epoch_t max_epoch, + std::vector<pg_log_entry_t>&& log_entries) = 0; + friend class ReplicatedRecoveryBackend; + friend class ::crimson::osd::PG; +}; diff --git a/src/crimson/osd/pg_interval_interrupt_condition.cc b/src/crimson/osd/pg_interval_interrupt_condition.cc new file mode 100644 index 000000000..36243b825 --- /dev/null +++ b/src/crimson/osd/pg_interval_interrupt_condition.cc @@ 
-0,0 +1,43 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "pg_interval_interrupt_condition.h" +#include "pg.h" + +#include "crimson/common/log.h" + +SET_SUBSYS(osd); + +namespace crimson::osd { + +IOInterruptCondition::IOInterruptCondition(Ref<PG>& pg) + : pg(pg), e(pg->get_osdmap_epoch()) {} + +IOInterruptCondition::~IOInterruptCondition() { + // for the sake of forward declaring PG (which is a detivate of + // intrusive_ref_counter<...>) +} + +bool IOInterruptCondition::new_interval_created() { + LOG_PREFIX(IOInterruptCondition::new_interval_created); + const epoch_t interval_start = pg->get_interval_start_epoch(); + bool ret = e < interval_start; + if (ret) { + DEBUGDPP("stored interval e{} < interval_start e{}", *pg, e, interval_start); + } + return ret; +} + +bool IOInterruptCondition::is_stopping() { + LOG_PREFIX(IOInterruptCondition::is_stopping); + if (pg->stopping) { + DEBUGDPP("pg stopping", *pg); + } + return pg->stopping; +} + +bool IOInterruptCondition::is_primary() { + return pg->is_primary(); +} + +} // namespace crimson::osd diff --git a/src/crimson/osd/pg_interval_interrupt_condition.h b/src/crimson/osd/pg_interval_interrupt_condition.h new file mode 100644 index 000000000..a3a0a1edb --- /dev/null +++ b/src/crimson/osd/pg_interval_interrupt_condition.h @@ -0,0 +1,56 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab expandtab + +#pragma once + +#include "include/types.h" +#include "crimson/common/errorator.h" +#include "crimson/common/exception.h" +#include "crimson/common/type_helpers.h" + +namespace crimson::osd { + +class PG; + +class IOInterruptCondition { +public: + IOInterruptCondition(Ref<PG>& pg); + ~IOInterruptCondition(); + + bool new_interval_created(); + + bool is_stopping(); + + bool is_primary(); + + template <typename Fut> + std::optional<Fut> may_interrupt() { + if (new_interval_created()) { + return seastar::futurize<Fut>::make_exception_future( + ::crimson::common::actingset_changed(is_primary())); + } + if (is_stopping()) { + return seastar::futurize<Fut>::make_exception_future( + ::crimson::common::system_shutdown_exception()); + } + return std::optional<Fut>(); + } + + template <typename T> + static constexpr bool is_interruption_v = + std::is_same_v<T, ::crimson::common::actingset_changed> + || std::is_same_v<T, ::crimson::common::system_shutdown_exception>; + + static bool is_interruption(std::exception_ptr& eptr) { + return (*eptr.__cxa_exception_type() == + typeid(::crimson::common::actingset_changed) || + *eptr.__cxa_exception_type() == + typeid(::crimson::common::system_shutdown_exception)); + } + +private: + Ref<PG> pg; + epoch_t e; +}; + +} // namespace crimson::osd diff --git a/src/crimson/osd/pg_map.cc b/src/crimson/osd/pg_map.cc new file mode 100644 index 000000000..193781250 --- /dev/null +++ b/src/crimson/osd/pg_map.cc @@ -0,0 +1,102 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "crimson/osd/pg_map.h" + +#include "crimson/osd/pg.h" +#include "common/Formatter.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_osd); + } +} + +using std::make_pair; + +namespace crimson::osd { + +PGMap::PGCreationState::PGCreationState(spg_t pgid) : pgid(pgid) {} +PGMap::PGCreationState::~PGCreationState() {} + +void PGMap::PGCreationState::dump_detail(Formatter *f) const +{ + f->dump_stream("pgid") << pgid; + f->dump_bool("creating", 
creating); +} + +PGMap::wait_for_pg_ret +PGMap::wait_for_pg(PGCreationBlockingEvent::TriggerI&& trigger, spg_t pgid) +{ + if (auto pg = get_pg(pgid)) { + return make_pair( + wait_for_pg_fut(wait_for_pg_ertr::ready_future_marker{}, pg), + true); + } else { + auto &state = pgs_creating.emplace(pgid, pgid).first->second; + return make_pair( + wait_for_pg_fut( + trigger.maybe_record_blocking(state.promise.get_shared_future(), state) + ), state.creating); + } +} + +void PGMap::remove_pg(spg_t pgid) { + ceph_assert(pgs.erase(pgid) == 1); +} + +Ref<PG> PGMap::get_pg(spg_t pgid) +{ + if (auto pg = pgs.find(pgid); pg != pgs.end()) { + return pg->second; + } else { + return nullptr; + } +} + +void PGMap::set_creating(spg_t pgid) +{ + logger().debug("Creating {}", pgid); + ceph_assert(pgs.count(pgid) == 0); + auto pg = pgs_creating.find(pgid); + ceph_assert(pg != pgs_creating.end()); + ceph_assert(pg->second.creating == false); + pg->second.creating = true; +} + +void PGMap::pg_created(spg_t pgid, Ref<PG> pg) +{ + logger().debug("Created {}", pgid); + ceph_assert(!pgs.count(pgid)); + pgs.emplace(pgid, pg); + + auto creating_iter = pgs_creating.find(pgid); + ceph_assert(creating_iter != pgs_creating.end()); + auto promise = std::move(creating_iter->second.promise); + pgs_creating.erase(creating_iter); + promise.set_value(pg); +} + +void PGMap::pg_loaded(spg_t pgid, Ref<PG> pg) +{ + ceph_assert(!pgs.count(pgid)); + pgs.emplace(pgid, pg); +} + +void PGMap::pg_creation_canceled(spg_t pgid) +{ + logger().debug("PGMap::pg_creation_canceled: {}", pgid); + ceph_assert(!pgs.count(pgid)); + + auto creating_iter = pgs_creating.find(pgid); + ceph_assert(creating_iter != pgs_creating.end()); + auto promise = std::move(creating_iter->second.promise); + pgs_creating.erase(creating_iter); + promise.set_exception( + crimson::ct_error::ecanceled::exception_ptr() + ); +} + +PGMap::~PGMap() {} + +} diff --git a/src/crimson/osd/pg_map.h b/src/crimson/osd/pg_map.h new file mode 100644 index 000000000..3269de434 --- /dev/null +++ b/src/crimson/osd/pg_map.h @@ -0,0 +1,201 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <map> +#include <algorithm> + +#include <seastar/core/future.hh> +#include <seastar/core/shared_future.hh> + +#include "include/types.h" +#include "crimson/common/type_helpers.h" +#include "crimson/common/smp_helpers.h" +#include "crimson/osd/osd_operation.h" +#include "osd/osd_types.h" + +namespace crimson::osd { +class PG; + +/** + * PGShardMapping + * + * Maintains a mapping from spg_t to the core containing that PG. Internally, each + * core has a local copy of the mapping to enable core-local lookups. Updates + * are proxied to core 0, and the back out to all other cores -- see maybe_create_pg. + */ +class PGShardMapping : public seastar::peering_sharded_service<PGShardMapping> { +public: + /// Returns mapping if present, NULL_CORE otherwise + core_id_t get_pg_mapping(spg_t pgid) { + auto iter = pg_to_core.find(pgid); + ceph_assert_always(iter == pg_to_core.end() || iter->second != NULL_CORE); + return iter == pg_to_core.end() ? 
NULL_CORE : iter->second; + } + + /// Returns mapping for pgid, creates new one if it doesn't already exist + seastar::future<core_id_t> maybe_create_pg( + spg_t pgid, + core_id_t core = NULL_CORE) { + auto find_iter = pg_to_core.find(pgid); + if (find_iter != pg_to_core.end()) { + ceph_assert_always(find_iter->second != NULL_CORE); + if (core != NULL_CORE) { + ceph_assert_always(find_iter->second == core); + } + return seastar::make_ready_future<core_id_t>(find_iter->second); + } else { + return container().invoke_on(0,[pgid, core] + (auto &primary_mapping) { + auto [insert_iter, inserted] = primary_mapping.pg_to_core.emplace(pgid, core); + ceph_assert_always(inserted); + ceph_assert_always(primary_mapping.core_to_num_pgs.size() > 0); + std::map<core_id_t, unsigned>::iterator core_iter; + if (core == NULL_CORE) { + core_iter = std::min_element( + primary_mapping.core_to_num_pgs.begin(), + primary_mapping.core_to_num_pgs.end(), + [](const auto &left, const auto &right) { + return left.second < right.second; + }); + } else { + core_iter = primary_mapping.core_to_num_pgs.find(core); + } + ceph_assert_always(primary_mapping.core_to_num_pgs.end() != core_iter); + insert_iter->second = core_iter->first; + core_iter->second++; + return primary_mapping.container().invoke_on_others( + [pgid = insert_iter->first, core = insert_iter->second] + (auto &other_mapping) { + ceph_assert_always(core != NULL_CORE); + auto [insert_iter, inserted] = other_mapping.pg_to_core.emplace(pgid, core); + ceph_assert_always(inserted); + }); + }).then([this, pgid] { + auto find_iter = pg_to_core.find(pgid); + return seastar::make_ready_future<core_id_t>(find_iter->second); + }); + } + } + + /// Remove pgid + seastar::future<> remove_pg(spg_t pgid) { + return container().invoke_on(0, [pgid](auto &primary_mapping) { + auto iter = primary_mapping.pg_to_core.find(pgid); + ceph_assert_always(iter != primary_mapping.pg_to_core.end()); + ceph_assert_always(iter->second != NULL_CORE); + auto count_iter = primary_mapping.core_to_num_pgs.find(iter->second); + ceph_assert_always(count_iter != primary_mapping.core_to_num_pgs.end()); + ceph_assert_always(count_iter->second > 0); + --(count_iter->second); + primary_mapping.pg_to_core.erase(iter); + return primary_mapping.container().invoke_on_others( + [pgid](auto &other_mapping) { + auto iter = other_mapping.pg_to_core.find(pgid); + ceph_assert_always(iter != other_mapping.pg_to_core.end()); + ceph_assert_always(iter->second != NULL_CORE); + other_mapping.pg_to_core.erase(iter); + }); + }); + } + + size_t get_num_pgs() const { return pg_to_core.size(); } + + /// Map to cores in [min_core_mapping, core_mapping_limit) + PGShardMapping(core_id_t min_core_mapping, core_id_t core_mapping_limit) { + ceph_assert_always(min_core_mapping < core_mapping_limit); + for (auto i = min_core_mapping; i != core_mapping_limit; ++i) { + core_to_num_pgs.emplace(i, 0); + } + } + + template <typename F> + void for_each_pgid(F &&f) const { + for (const auto &i: pg_to_core) { + std::invoke(f, i.first); + } + } + +private: + std::map<core_id_t, unsigned> core_to_num_pgs; + std::map<spg_t, core_id_t> pg_to_core; +}; + +/** + * PGMap + * + * Maps spg_t to PG instance within a shard. Handles dealing with waiting + * on pg creation. 
+ */ +class PGMap { + struct PGCreationState : BlockerT<PGCreationState> { + static constexpr const char * type_name = "PGCreation"; + + void dump_detail(Formatter *f) const final; + + spg_t pgid; + seastar::shared_promise<Ref<PG>> promise; + bool creating = false; + PGCreationState(spg_t pgid); + + PGCreationState(const PGCreationState &) = delete; + PGCreationState(PGCreationState &&) = delete; + PGCreationState &operator=(const PGCreationState &) = delete; + PGCreationState &operator=(PGCreationState &&) = delete; + + ~PGCreationState(); + }; + + std::map<spg_t, PGCreationState> pgs_creating; + using pgs_t = std::map<spg_t, Ref<PG>>; + pgs_t pgs; + +public: + using PGCreationBlocker = PGCreationState; + using PGCreationBlockingEvent = PGCreationBlocker::BlockingEvent; + /** + * Get future for pg with a bool indicating whether it's already being + * created. + */ + using wait_for_pg_ertr = crimson::errorator< + crimson::ct_error::ecanceled>; + using wait_for_pg_fut = wait_for_pg_ertr::future<Ref<PG>>; + using wait_for_pg_ret = std::pair<wait_for_pg_fut, bool>; + wait_for_pg_ret wait_for_pg(PGCreationBlockingEvent::TriggerI&&, spg_t pgid); + + /** + * get PG in non-blocking manner + */ + Ref<PG> get_pg(spg_t pgid); + + /** + * Set creating + */ + void set_creating(spg_t pgid); + + /** + * Set newly created pg + */ + void pg_created(spg_t pgid, Ref<PG> pg); + + /** + * Add newly loaded pg + */ + void pg_loaded(spg_t pgid, Ref<PG> pg); + + /** + * Cancel pending creation of pgid. + */ + void pg_creation_canceled(spg_t pgid); + + void remove_pg(spg_t pgid); + + pgs_t& get_pgs() { return pgs; } + const pgs_t& get_pgs() const { return pgs; } + auto get_pg_count() const { return pgs.size(); } + PGMap() = default; + ~PGMap(); +}; + +} diff --git a/src/crimson/osd/pg_meta.cc b/src/crimson/osd/pg_meta.cc new file mode 100644 index 000000000..288ee52a0 --- /dev/null +++ b/src/crimson/osd/pg_meta.cc @@ -0,0 +1,110 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "pg_meta.h" + +#include <string_view> + +#include "crimson/os/futurized_collection.h" +#include "crimson/os/futurized_store.h" + +using std::string; +using std::string_view; +// prefix pgmeta_oid keys with _ so that PGLog::read_log_and_missing() can +// easily skip them +using crimson::os::FuturizedStore; + +PGMeta::PGMeta(FuturizedStore::Shard& store, spg_t pgid) + : store{store}, + pgid{pgid} +{} + +namespace { + template<typename T> + std::optional<T> find_value(const FuturizedStore::Shard::omap_values_t& values, + string_view key) + { + auto found = values.find(key); + if (found == values.end()) { + return {}; + } + auto p = found->second.cbegin(); + T value; + decode(value, p); + return std::make_optional(std::move(value)); + } +} + +seastar::future<epoch_t> PGMeta::get_epoch() +{ + return store.open_collection(coll_t{pgid}).then([this](auto ch) { + return store.omap_get_values(ch, + pgid.make_pgmeta_oid(), + {string{infover_key}, + string{epoch_key}}).safe_then( + [](auto&& values) { + { + // sanity check + auto infover = find_value<__u8>(values, infover_key); + assert(infover); + if (*infover < 10) { + throw std::runtime_error("incompatible pg meta"); + } + } + { + auto epoch = find_value<epoch_t>(values, epoch_key); + assert(epoch); + return seastar::make_ready_future<epoch_t>(*epoch); + } + }, + FuturizedStore::Shard::read_errorator::assert_all{ + "PGMeta::get_epoch: unable to read pgmeta" + }); + }); +} + +seastar::future<std::tuple<pg_info_t, PastIntervals>> 
PGMeta::load() +{ + return store.open_collection(coll_t{pgid}).then([this](auto ch) { + return store.omap_get_values(ch, + pgid.make_pgmeta_oid(), + {string{infover_key}, + string{info_key}, + string{biginfo_key}, + string{fastinfo_key}}); + }).safe_then([](auto&& values) { + { + // sanity check + auto infover = find_value<__u8>(values, infover_key); + assert(infover); + if (infover < 10) { + throw std::runtime_error("incompatible pg meta"); + } + } + pg_info_t info; + { + auto found = find_value<pg_info_t>(values, info_key); + assert(found); + info = *std::move(found); + } + PastIntervals past_intervals; + { + using biginfo_t = std::pair<PastIntervals, decltype(info.purged_snaps)>; + auto big_info = find_value<biginfo_t>(values, biginfo_key); + assert(big_info); + past_intervals = std::move(big_info->first); + info.purged_snaps = std::move(big_info->second); + } + { + auto fast_info = find_value<pg_fast_info_t>(values, fastinfo_key); + if (fast_info) { + fast_info->try_apply_to(&info); + } + } + return seastar::make_ready_future<std::tuple<pg_info_t, PastIntervals>>( + std::make_tuple(std::move(info), std::move(past_intervals))); + }, + FuturizedStore::Shard::read_errorator::assert_all{ + "PGMeta::load: unable to read pgmeta" + }); +} diff --git a/src/crimson/osd/pg_meta.h b/src/crimson/osd/pg_meta.h new file mode 100644 index 000000000..21c2bb373 --- /dev/null +++ b/src/crimson/osd/pg_meta.h @@ -0,0 +1,20 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <tuple> +#include <seastar/core/future.hh> +#include "osd/osd_types.h" +#include "crimson/os/futurized_store.h" + +/// PG related metadata +class PGMeta +{ + crimson::os::FuturizedStore::Shard& store; + const spg_t pgid; +public: + PGMeta(crimson::os::FuturizedStore::Shard& store, spg_t pgid); + seastar::future<epoch_t> get_epoch(); + seastar::future<std::tuple<pg_info_t, PastIntervals>> load(); +}; diff --git a/src/crimson/osd/pg_recovery.cc b/src/crimson/osd/pg_recovery.cc new file mode 100644 index 000000000..09b45779e --- /dev/null +++ b/src/crimson/osd/pg_recovery.cc @@ -0,0 +1,569 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <fmt/format.h> +#include <fmt/ostream.h> +#include <fmt/ranges.h> + +#include "crimson/common/type_helpers.h" +#include "crimson/osd/backfill_facades.h" +#include "crimson/osd/osd_operations/background_recovery.h" +#include "crimson/osd/osd_operations/peering_event.h" +#include "crimson/osd/pg.h" +#include "crimson/osd/pg_backend.h" +#include "crimson/osd/pg_recovery.h" + +#include "osd/osd_types.h" +#include "osd/PeeringState.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_osd); + } +} + +using std::map; +using std::set; + +void PGRecovery::start_pglogbased_recovery() +{ + using PglogBasedRecovery = crimson::osd::PglogBasedRecovery; + (void) pg->get_shard_services().start_operation<PglogBasedRecovery>( + static_cast<crimson::osd::PG*>(pg), + pg->get_shard_services(), + pg->get_osdmap_epoch(), + float(0.001)); +} + +PGRecovery::interruptible_future<bool> +PGRecovery::start_recovery_ops( + RecoveryBackend::RecoveryBlockingEvent::TriggerI& trigger, + size_t max_to_start) +{ + assert(pg->is_primary()); + assert(pg->is_peered()); + assert(pg->is_recovering()); + // in ceph-osd the do_recovery() path handles both the pg log-based + // recovery and the backfill, albeit they are separated at the layer + // of PeeringState. 
In crimson-osd backfill has been cut out of it, so + do_recovery() here handles pg log-based recovery only. + At the time of writing, moving it into the FSM and fixing + the naming is still under consideration. + assert(!pg->is_backfilling()); + assert(!pg->get_peering_state().is_deleting()); + + std::vector<interruptible_future<>> started; + started.reserve(max_to_start); + max_to_start -= start_primary_recovery_ops(trigger, max_to_start, &started); + if (max_to_start > 0) { + max_to_start -= start_replica_recovery_ops(trigger, max_to_start, &started); + } + using interruptor = + crimson::interruptible::interruptor<crimson::osd::IOInterruptCondition>; + return interruptor::parallel_for_each(started, + [] (auto&& ifut) { + return std::move(ifut); + }).then_interruptible([this] { + bool done = !pg->get_peering_state().needs_recovery(); + if (done) { + logger().debug("start_recovery_ops: AllReplicasRecovered for pg: {}", + pg->get_pgid()); + using LocalPeeringEvent = crimson::osd::LocalPeeringEvent; + if (!pg->get_peering_state().needs_backfill()) { + logger().debug("start_recovery_ops: AllReplicasRecovered for pg: {}", + pg->get_pgid()); + (void) pg->get_shard_services().start_operation<LocalPeeringEvent>( + static_cast<crimson::osd::PG*>(pg), + pg->get_pg_whoami(), + pg->get_pgid(), + pg->get_osdmap_epoch(), + pg->get_osdmap_epoch(), + PeeringState::AllReplicasRecovered{}); + } else { + logger().debug("start_recovery_ops: RequestBackfill for pg: {}", + pg->get_pgid()); + (void) pg->get_shard_services().start_operation<LocalPeeringEvent>( + static_cast<crimson::osd::PG*>(pg), + pg->get_pg_whoami(), + pg->get_pgid(), + pg->get_osdmap_epoch(), + pg->get_osdmap_epoch(), + PeeringState::RequestBackfill{}); + } + } + return seastar::make_ready_future<bool>(!done); + }); +} + +size_t PGRecovery::start_primary_recovery_ops( + RecoveryBackend::RecoveryBlockingEvent::TriggerI& trigger, + size_t max_to_start, + std::vector<PGRecovery::interruptible_future<>> *out) +{ + if (!pg->is_recovering()) { + return 0; + } + + if (!pg->get_peering_state().have_missing()) { + pg->get_peering_state().local_recovery_complete(); + return 0; + } + + const auto &missing = pg->get_peering_state().get_pg_log().get_missing(); + + logger().info("{} recovering {} in pg {}, missing {}", __func__, + pg->get_recovery_backend()->total_recovering(), + *static_cast<crimson::osd::PG*>(pg), + missing); + + unsigned started = 0; + int skipped = 0; + + map<version_t, hobject_t>::const_iterator p = + missing.get_rmissing().lower_bound(pg->get_peering_state().get_pg_log().get_log().last_requested); + while (started < max_to_start && p != missing.get_rmissing().end()) { + // TODO: chain futures here to enable yielding to scheduler? + hobject_t soid; + version_t v = p->first; + + auto it_objects = pg->get_peering_state().get_pg_log().get_log().objects.find(p->second); + if (it_objects != pg->get_peering_state().get_pg_log().get_log().objects.end()) { + // look at log! + pg_log_entry_t *latest = it_objects->second; + assert(latest->is_update() || latest->is_delete()); + soid = latest->soid; + } else { + soid = p->second; + } + const pg_missing_item& item = missing.get_items().find(p->second)->second; + ++p; + + hobject_t head = soid.get_head(); + + logger().info( + "{} {} item.need {} {} {} {} {}", + __func__, + soid, + item.need, + missing.is_missing(soid) ? " (missing)":"", + missing.is_missing(head) ? " (missing head)":"", + pg->get_recovery_backend()->is_recovering(soid) ? 
" (recovering)":"", + pg->get_recovery_backend()->is_recovering(head) ? " (recovering head)":""); + + // TODO: handle lost/unfound + if (pg->get_recovery_backend()->is_recovering(soid)) { + auto& recovery_waiter = pg->get_recovery_backend()->get_recovering(soid); + out->emplace_back(recovery_waiter.wait_for_recovered(trigger)); + ++started; + } else if (pg->get_recovery_backend()->is_recovering(head)) { + ++skipped; + } else { + out->emplace_back(recover_missing(trigger, soid, item.need)); + ++started; + } + + if (!skipped) + pg->get_peering_state().set_last_requested(v); + } + + logger().info("{} started {} skipped {}", __func__, started, skipped); + + return started; +} + +size_t PGRecovery::start_replica_recovery_ops( + RecoveryBackend::RecoveryBlockingEvent::TriggerI& trigger, + size_t max_to_start, + std::vector<PGRecovery::interruptible_future<>> *out) +{ + if (!pg->is_recovering()) { + return 0; + } + uint64_t started = 0; + + assert(!pg->get_peering_state().get_acting_recovery_backfill().empty()); + + auto recovery_order = get_replica_recovery_order(); + for (auto &peer : recovery_order) { + assert(peer != pg->get_peering_state().get_primary()); + const auto& pm = pg->get_peering_state().get_peer_missing(peer); + + logger().debug("{}: peer osd.{} missing {} objects", __func__, + peer, pm.num_missing()); + logger().trace("{}: peer osd.{} missing {}", __func__, + peer, pm.get_items()); + + // recover oldest first + for (auto p = pm.get_rmissing().begin(); + p != pm.get_rmissing().end() && started < max_to_start; + ++p) { + const auto &soid = p->second; + + if (pg->get_peering_state().get_missing_loc().is_unfound(soid)) { + logger().debug("{}: object {} still unfound", __func__, soid); + continue; + } + + const pg_info_t &pi = pg->get_peering_state().get_peer_info(peer); + if (soid > pi.last_backfill) { + if (!pg->get_recovery_backend()->is_recovering(soid)) { + logger().error( + "{}: object {} in missing set for backfill (last_backfill {})" + " but not in recovering", + __func__, + soid, + pi.last_backfill); + ceph_abort(); + } + continue; + } + + if (pg->get_recovery_backend()->is_recovering(soid)) { + logger().debug("{}: already recovering object {}", __func__, soid); + auto& recovery_waiter = pg->get_recovery_backend()->get_recovering(soid); + out->emplace_back(recovery_waiter.wait_for_recovered(trigger)); + started++; + continue; + } + + if (pg->get_peering_state().get_missing_loc().is_deleted(soid)) { + logger().debug("{}: soid {} is a delete, removing", __func__, soid); + map<hobject_t,pg_missing_item>::const_iterator r = + pm.get_items().find(soid); + started++; + out->emplace_back( + prep_object_replica_deletes(trigger, soid, r->second.need)); + continue; + } + + if (soid.is_snap() && + pg->get_peering_state().get_pg_log().get_missing().is_missing( + soid.get_head())) { + logger().debug("{}: head {} still missing on primary", __func__, + soid.get_head()); + continue; + } + + if (pg->get_peering_state().get_pg_log().get_missing().is_missing(soid)) { + logger().debug("{}: soid {} still missing on primary", __func__, soid); + continue; + } + + logger().debug("{}: recover_object_replicas({})", __func__,soid); + map<hobject_t,pg_missing_item>::const_iterator r = pm.get_items().find( + soid); + started++; + out->emplace_back( + prep_object_replica_pushes(trigger, soid, r->second.need)); + } + } + + return started; +} + +PGRecovery::interruptible_future<> +PGRecovery::recover_missing( + RecoveryBackend::RecoveryBlockingEvent::TriggerI& trigger, + const hobject_t &soid, eversion_t 
need) +{ + if (pg->get_peering_state().get_missing_loc().is_deleted(soid)) { + return pg->get_recovery_backend()->add_recovering(soid).wait_track_blocking( + trigger, + pg->get_recovery_backend()->recover_delete(soid, need)); + } else { + return pg->get_recovery_backend()->add_recovering(soid).wait_track_blocking( + trigger, + pg->get_recovery_backend()->recover_object(soid, need) + .handle_exception_interruptible( + [=, this, soid = std::move(soid)] (auto e) { + on_failed_recover({ pg->get_pg_whoami() }, soid, need); + return seastar::make_ready_future<>(); + }) + ); + } +} + +RecoveryBackend::interruptible_future<> PGRecovery::prep_object_replica_deletes( + RecoveryBackend::RecoveryBlockingEvent::TriggerI& trigger, + const hobject_t& soid, + eversion_t need) +{ + return pg->get_recovery_backend()->add_recovering(soid).wait_track_blocking( + trigger, + pg->get_recovery_backend()->push_delete(soid, need).then_interruptible( + [=, this] { + object_stat_sum_t stat_diff; + stat_diff.num_objects_recovered = 1; + on_global_recover(soid, stat_diff, true); + return seastar::make_ready_future<>(); + }) + ); +} + +RecoveryBackend::interruptible_future<> PGRecovery::prep_object_replica_pushes( + RecoveryBackend::RecoveryBlockingEvent::TriggerI& trigger, + const hobject_t& soid, + eversion_t need) +{ + return pg->get_recovery_backend()->add_recovering(soid).wait_track_blocking( + trigger, + pg->get_recovery_backend()->recover_object(soid, need) + .handle_exception_interruptible( + [=, this, soid = std::move(soid)] (auto e) { + on_failed_recover({ pg->get_pg_whoami() }, soid, need); + return seastar::make_ready_future<>(); + }) + ); +} + +void PGRecovery::on_local_recover( + const hobject_t& soid, + const ObjectRecoveryInfo& recovery_info, + const bool is_delete, + ceph::os::Transaction& t) +{ + if (const auto &log = pg->get_peering_state().get_pg_log(); + !is_delete && + log.get_missing().is_missing(recovery_info.soid) && + log.get_missing().get_items().find(recovery_info.soid)->second.need > recovery_info.version) { + assert(pg->is_primary()); + if (const auto* latest = log.get_log().objects.find(recovery_info.soid)->second; + latest->op == pg_log_entry_t::LOST_REVERT) { + ceph_abort("mark_unfound_lost (LOST_REVERT) is not implemented yet"); + } + } + pg->get_peering_state().recover_got(soid, + recovery_info.version, is_delete, t); + + if (pg->is_primary()) { + if (!is_delete) { + auto& obc = pg->get_recovery_backend()->get_recovering(soid).obc; //TODO: move to pg backend? 
+ obc->obs.exists = true; + obc->obs.oi = recovery_info.oi; + } + if (!pg->is_unreadable_object(soid)) { + pg->get_recovery_backend()->get_recovering(soid).set_readable(); + } + pg->publish_stats_to_osd(); + } +} + +void PGRecovery::on_global_recover ( + const hobject_t& soid, + const object_stat_sum_t& stat_diff, + const bool is_delete) +{ + logger().info("{} {}", __func__, soid); + pg->get_peering_state().object_recovered(soid, stat_diff); + pg->publish_stats_to_osd(); + auto& recovery_waiter = pg->get_recovery_backend()->get_recovering(soid); + if (!is_delete) + recovery_waiter.obc->drop_recovery_read(); + recovery_waiter.set_recovered(); + pg->get_recovery_backend()->remove_recovering(soid); +} + +void PGRecovery::on_failed_recover( + const set<pg_shard_t>& from, + const hobject_t& soid, + const eversion_t& v) +{ + for (auto pg_shard : from) { + if (pg_shard != pg->get_pg_whoami()) { + pg->get_peering_state().force_object_missing(pg_shard, soid, v); + } + } +} + +void PGRecovery::on_peer_recover( + pg_shard_t peer, + const hobject_t &oid, + const ObjectRecoveryInfo &recovery_info) +{ + crimson::get_logger(ceph_subsys_osd).debug( + "{}: {}, {} on {}", __func__, oid, + recovery_info.version, peer); + pg->get_peering_state().on_peer_recover(peer, oid, recovery_info.version); +} + +void PGRecovery::_committed_pushed_object(epoch_t epoch, + eversion_t last_complete) +{ + if (!pg->has_reset_since(epoch)) { + pg->get_peering_state().recovery_committed_to(last_complete); + } else { + crimson::get_logger(ceph_subsys_osd).debug( + "{} pg has changed, not touching last_complete_ondisk", + __func__); + } +} + +template <class EventT> +void PGRecovery::start_backfill_recovery(const EventT& evt) +{ + using BackfillRecovery = crimson::osd::BackfillRecovery; + std::ignore = pg->get_shard_services().start_operation<BackfillRecovery>( + static_cast<crimson::osd::PG*>(pg), + pg->get_shard_services(), + pg->get_osdmap_epoch(), + evt); +} + +void PGRecovery::request_replica_scan( + const pg_shard_t& target, + const hobject_t& begin, + const hobject_t& end) +{ + logger().debug("{}: target.osd={}", __func__, target.osd); + auto msg = crimson::make_message<MOSDPGScan>( + MOSDPGScan::OP_SCAN_GET_DIGEST, + pg->get_pg_whoami(), + pg->get_osdmap_epoch(), + pg->get_last_peering_reset(), + spg_t(pg->get_pgid().pgid, target.shard), + begin, + end); + std::ignore = pg->get_shard_services().send_to_osd( + target.osd, + std::move(msg), + pg->get_osdmap_epoch()); +} + +void PGRecovery::request_primary_scan( + const hobject_t& begin) +{ + logger().debug("{}", __func__); + using crimson::common::local_conf; + std::ignore = pg->get_recovery_backend()->scan_for_backfill( + begin, + local_conf()->osd_backfill_scan_min, + local_conf()->osd_backfill_scan_max + ).then_interruptible([this] (BackfillInterval bi) { + logger().debug("request_primary_scan:{}", __func__); + using BackfillState = crimson::osd::BackfillState; + start_backfill_recovery(BackfillState::PrimaryScanned{ std::move(bi) }); + }); +} + +void PGRecovery::enqueue_push( + const hobject_t& obj, + const eversion_t& v) +{ + logger().debug("{}: obj={} v={}", + __func__, obj, v); + pg->get_recovery_backend()->add_recovering(obj); + std::ignore = pg->get_recovery_backend()->recover_object(obj, v).\ + handle_exception_interruptible([] (auto) { + ceph_abort_msg("got exception on backfill's push"); + return seastar::make_ready_future<>(); + }).then_interruptible([this, obj] { + logger().debug("enqueue_push:{}", __func__); + using BackfillState = 
crimson::osd::BackfillState; + start_backfill_recovery(BackfillState::ObjectPushed(std::move(obj))); + }); +} + +void PGRecovery::enqueue_drop( + const pg_shard_t& target, + const hobject_t& obj, + const eversion_t& v) +{ + // allocate a pair if target is seen for the first time + auto& req = backfill_drop_requests[target]; + if (!req) { + req = crimson::make_message<MOSDPGBackfillRemove>( + spg_t(pg->get_pgid().pgid, target.shard), pg->get_osdmap_epoch()); + } + req->ls.emplace_back(obj, v); +} + +void PGRecovery::maybe_flush() +{ + for (auto& [target, req] : backfill_drop_requests) { + std::ignore = pg->get_shard_services().send_to_osd( + target.osd, + std::move(req), + pg->get_osdmap_epoch()); + } + backfill_drop_requests.clear(); +} + +void PGRecovery::update_peers_last_backfill( + const hobject_t& new_last_backfill) +{ + logger().debug("{}: new_last_backfill={}", + __func__, new_last_backfill); + // If new_last_backfill == MAX, then we will send OP_BACKFILL_FINISH to + // all the backfill targets. Otherwise, we will move last_backfill up on + // those targets that need it and send OP_BACKFILL_PROGRESS to them. + for (const auto& bt : pg->get_peering_state().get_backfill_targets()) { + if (const pg_info_t& pinfo = pg->get_peering_state().get_peer_info(bt); + new_last_backfill > pinfo.last_backfill) { + pg->get_peering_state().update_peer_last_backfill(bt, new_last_backfill); + auto m = crimson::make_message<MOSDPGBackfill>( + pinfo.last_backfill.is_max() ? MOSDPGBackfill::OP_BACKFILL_FINISH + : MOSDPGBackfill::OP_BACKFILL_PROGRESS, + pg->get_osdmap_epoch(), + pg->get_last_peering_reset(), + spg_t(pg->get_pgid().pgid, bt.shard)); + // Use default priority here, must match sub_op priority + // TODO: if pinfo.last_backfill.is_max(), then + // start_recovery_op(hobject_t::get_max()); + m->last_backfill = pinfo.last_backfill; + m->stats = pinfo.stats; + std::ignore = pg->get_shard_services().send_to_osd( + bt.osd, std::move(m), pg->get_osdmap_epoch()); + logger().info("{}: peer {} num_objects now {} / {}", + __func__, + bt, + pinfo.stats.stats.sum.num_objects, + pg->get_info().stats.stats.sum.num_objects); + } + } +} + +bool PGRecovery::budget_available() const +{ + // TODO: the limits! + return true; +} + +void PGRecovery::backfilled() +{ + using LocalPeeringEvent = crimson::osd::LocalPeeringEvent; + std::ignore = pg->get_shard_services().start_operation<LocalPeeringEvent>( + static_cast<crimson::osd::PG*>(pg), + pg->get_pg_whoami(), + pg->get_pgid(), + pg->get_osdmap_epoch(), + pg->get_osdmap_epoch(), + PeeringState::Backfilled{}); +} + +void PGRecovery::dispatch_backfill_event( + boost::intrusive_ptr<const boost::statechart::event_base> evt) +{ + logger().debug("{}", __func__); + backfill_state->process_event(evt); +} + +void PGRecovery::on_backfill_reserved() +{ + logger().debug("{}", __func__); + // PIMPL and dependency injection for the sake of unit-testability. + // I'm not worried about the performance here. + using BackfillState = crimson::osd::BackfillState; + backfill_state = std::make_unique<BackfillState>( + *this, + std::make_unique<crimson::osd::PeeringFacade>(pg->get_peering_state()), + std::make_unique<crimson::osd::PGFacade>( + *static_cast<crimson::osd::PG*>(pg))); + // yes, it's **not** backfilling yet. The PG_STATE_BACKFILLING + // will be set after on_backfill_reserved() returns. + // Backfill needs to take this into consideration when scheduling + // events -- they must be mutually exclusive with PeeringEvent + // instances. 
Otherwise the execution might begin without having + // the state updated. + ceph_assert(!pg->get_peering_state().is_backfilling()); + start_backfill_recovery(BackfillState::Triggered{}); +} diff --git a/src/crimson/osd/pg_recovery.h b/src/crimson/osd/pg_recovery.h new file mode 100644 index 000000000..719d0ad2d --- /dev/null +++ b/src/crimson/osd/pg_recovery.h @@ -0,0 +1,118 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <seastar/core/future.hh> + +#include "crimson/osd/backfill_state.h" +#include "crimson/osd/pg_interval_interrupt_condition.h" +#include "crimson/osd/osd_operation.h" +#include "crimson/osd/pg_recovery_listener.h" +#include "crimson/osd/scheduler/scheduler.h" +#include "crimson/osd/shard_services.h" +#include "crimson/osd/recovery_backend.h" + +#include "osd/object_state.h" + +namespace crimson::osd { +class UrgentRecovery; +} + +class MOSDPGBackfillRemove; +class PGBackend; + +class PGRecovery : public crimson::osd::BackfillState::BackfillListener { +public: + template <typename T = void> + using interruptible_future = RecoveryBackend::interruptible_future<T>; + PGRecovery(PGRecoveryListener* pg) : pg(pg) {} + virtual ~PGRecovery() {} + void start_pglogbased_recovery(); + + interruptible_future<bool> start_recovery_ops( + RecoveryBackend::RecoveryBlockingEvent::TriggerI&, + size_t max_to_start); + void on_backfill_reserved(); + void dispatch_backfill_event( + boost::intrusive_ptr<const boost::statechart::event_base> evt); + + seastar::future<> stop() { return seastar::now(); } +private: + PGRecoveryListener* pg; + size_t start_primary_recovery_ops( + RecoveryBackend::RecoveryBlockingEvent::TriggerI&, + size_t max_to_start, + std::vector<interruptible_future<>> *out); + size_t start_replica_recovery_ops( + RecoveryBackend::RecoveryBlockingEvent::TriggerI&, + size_t max_to_start, + std::vector<interruptible_future<>> *out); + + std::vector<pg_shard_t> get_replica_recovery_order() const { + return pg->get_replica_recovery_order(); + } + RecoveryBackend::interruptible_future<> recover_missing( + RecoveryBackend::RecoveryBlockingEvent::TriggerI&, + const hobject_t &soid, eversion_t need); + RecoveryBackend::interruptible_future<> prep_object_replica_deletes( + RecoveryBackend::RecoveryBlockingEvent::TriggerI& trigger, + const hobject_t& soid, + eversion_t need); + RecoveryBackend::interruptible_future<> prep_object_replica_pushes( + RecoveryBackend::RecoveryBlockingEvent::TriggerI& trigger, + const hobject_t& soid, + eversion_t need); + + void on_local_recover( + const hobject_t& soid, + const ObjectRecoveryInfo& recovery_info, + bool is_delete, + ceph::os::Transaction& t); + void on_global_recover ( + const hobject_t& soid, + const object_stat_sum_t& stat_diff, + bool is_delete); + void on_failed_recover( + const std::set<pg_shard_t>& from, + const hobject_t& soid, + const eversion_t& v); + void on_peer_recover( + pg_shard_t peer, + const hobject_t &oid, + const ObjectRecoveryInfo &recovery_info); + void _committed_pushed_object(epoch_t epoch, + eversion_t last_complete); + friend class ReplicatedRecoveryBackend; + friend class crimson::osd::UrgentRecovery; + + // backfill begin + std::unique_ptr<crimson::osd::BackfillState> backfill_state; + std::map<pg_shard_t, + MURef<MOSDPGBackfillRemove>> backfill_drop_requests; + + template <class EventT> + void start_backfill_recovery( + const EventT& evt); + void request_replica_scan( + const pg_shard_t& target, + const hobject_t& begin, + const 
hobject_t& end) final; + void request_primary_scan( + const hobject_t& begin) final; + void enqueue_push( + const hobject_t& obj, + const eversion_t& v) final; + void enqueue_drop( + const pg_shard_t& target, + const hobject_t& obj, + const eversion_t& v) final; + void maybe_flush() final; + void update_peers_last_backfill( + const hobject_t& new_last_backfill) final; + bool budget_available() const final; + void backfilled() final; + friend crimson::osd::BackfillState::PGFacade; + friend crimson::osd::PG; + // backfill end +}; diff --git a/src/crimson/osd/pg_recovery_listener.h b/src/crimson/osd/pg_recovery_listener.h new file mode 100644 index 000000000..c922b9956 --- /dev/null +++ b/src/crimson/osd/pg_recovery_listener.h @@ -0,0 +1,39 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <seastar/core/future.hh> + +#include "common/hobject.h" +#include "include/types.h" +#include "osd/osd_types.h" + +namespace crimson::osd { + class ShardServices; +}; + +class RecoveryBackend; +class PGRecovery; + +class PGRecoveryListener { +public: + virtual crimson::osd::ShardServices& get_shard_services() = 0; + virtual PGRecovery* get_recovery_handler() = 0; + virtual epoch_t get_osdmap_epoch() const = 0; + virtual bool is_primary() const = 0; + virtual bool is_peered() const = 0; + virtual bool is_recovering() const = 0; + virtual bool is_backfilling() const = 0; + virtual PeeringState& get_peering_state() = 0; + virtual const pg_shard_t& get_pg_whoami() const = 0; + virtual const spg_t& get_pgid() const = 0; + virtual RecoveryBackend* get_recovery_backend() = 0; + virtual bool is_unreadable_object(const hobject_t&, eversion_t* v = 0) const = 0; + virtual bool has_reset_since(epoch_t) const = 0; + virtual std::vector<pg_shard_t> get_replica_recovery_order() const = 0; + virtual epoch_t get_last_peering_reset() const = 0; + virtual const pg_info_t& get_info() const= 0; + virtual seastar::future<> stop() = 0; + virtual void publish_stats_to_osd() = 0; +}; diff --git a/src/crimson/osd/pg_shard_manager.cc b/src/crimson/osd/pg_shard_manager.cc new file mode 100644 index 000000000..6061c856b --- /dev/null +++ b/src/crimson/osd/pg_shard_manager.cc @@ -0,0 +1,108 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "crimson/osd/pg_shard_manager.h" +#include "crimson/osd/pg.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_osd); + } +} + +namespace crimson::osd { + +seastar::future<> PGShardManager::load_pgs(crimson::os::FuturizedStore& store) +{ + ceph_assert(seastar::this_shard_id() == PRIMARY_CORE); + return store.list_collections( + ).then([this](auto colls_cores) { + return seastar::parallel_for_each( + colls_cores, + [this](auto coll_core) { + auto[coll, shard_core] = coll_core; + spg_t pgid; + if (coll.is_pg(&pgid)) { + return get_pg_to_shard_mapping().maybe_create_pg( + pgid, shard_core + ).then([this, pgid] (auto core) { + return this->template with_remote_shard_state( + core, + [pgid]( + PerShardState &per_shard_state, + ShardServices &shard_services) { + return shard_services.load_pg( + pgid + ).then([pgid, &per_shard_state](auto &&pg) { + logger().info("load_pgs: loaded {}", pgid); + per_shard_state.pg_map.pg_loaded(pgid, std::move(pg)); + return seastar::now(); + }); + }); + }); + } else if (coll.is_temp(&pgid)) { + logger().warn( + "found temp collection on crimson osd, should be impossible: {}", + coll); + 
ceph_assert(0 == "temp collection on crimson osd, should be impossible"); + return seastar::now(); + } else { + logger().warn("ignoring unrecognized collection: {}", coll); + return seastar::now(); + } + }); + }); +} + +seastar::future<> PGShardManager::stop_pgs() +{ + ceph_assert(seastar::this_shard_id() == PRIMARY_CORE); + return shard_services.invoke_on_all([](auto &local_service) { + return local_service.local_state.stop_pgs(); + }); +} + +seastar::future<std::map<pg_t, pg_stat_t>> +PGShardManager::get_pg_stats() const +{ + ceph_assert(seastar::this_shard_id() == PRIMARY_CORE); + return shard_services.map_reduce0( + [](auto &local) { + return local.local_state.get_pg_stats(); + }, + std::map<pg_t, pg_stat_t>(), + [](auto &&left, auto &&right) { + left.merge(std::move(right)); + return std::move(left); + }); +} + +seastar::future<> PGShardManager::broadcast_map_to_pgs(epoch_t epoch) +{ + ceph_assert(seastar::this_shard_id() == PRIMARY_CORE); + return shard_services.invoke_on_all([epoch](auto &local_service) { + return local_service.local_state.broadcast_map_to_pgs( + local_service, epoch + ); + }).then([this, epoch] { + logger().debug("PGShardManager::broadcast_map_to_pgs " + "broadcasted up to {}", + epoch); + return shard_services.invoke_on_all([epoch](auto &local_service) { + local_service.local_state.osdmap_gate.got_map(epoch); + return seastar::now(); + }); + }); +} + +seastar::future<> PGShardManager::set_up_epoch(epoch_t e) { + ceph_assert(seastar::this_shard_id() == PRIMARY_CORE); + return shard_services.invoke_on_all( + seastar::smp_submit_to_options{}, + [e](auto &local_service) { + local_service.local_state.set_up_epoch(e); + return seastar::now(); + }); +} + +} diff --git a/src/crimson/osd/pg_shard_manager.h b/src/crimson/osd/pg_shard_manager.h new file mode 100644 index 000000000..2f3a3015d --- /dev/null +++ b/src/crimson/osd/pg_shard_manager.h @@ -0,0 +1,390 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <seastar/core/future.hh> +#include <seastar/core/shared_future.hh> +#include <seastar/core/sharded.hh> + +#include "crimson/osd/shard_services.h" +#include "crimson/osd/pg_map.h" + +namespace crimson::os { + class FuturizedStore; +} + +namespace crimson::osd { +/** + * PGShardManager + * + * Manages all state required to partition PGs over seastar reactors + * as well as state required to route messages to pgs. Mediates access to + * shared resources required by PGs (objectstore, messenger, monclient, + * etc) + */ +class PGShardManager { + seastar::sharded<OSDSingletonState> &osd_singleton_state; + seastar::sharded<ShardServices> &shard_services; + seastar::sharded<PGShardMapping> &pg_to_shard_mapping; + +#define FORWARD_CONST(FROM_METHOD, TO_METHOD, TARGET) \ + template <typename... Args> \ + auto FROM_METHOD(Args&&... args) const { \ + return TARGET.TO_METHOD(std::forward<Args>(args)...); \ + } + +#define FORWARD(FROM_METHOD, TO_METHOD, TARGET) \ + template <typename... Args> \ + auto FROM_METHOD(Args&&... 
args) { \ + return TARGET.TO_METHOD(std::forward<Args>(args)...); \ + } + +#define FORWARD_TO_OSD_SINGLETON(METHOD) \ + FORWARD(METHOD, METHOD, get_osd_singleton_state()) + +public: + using cached_map_t = OSDMapService::cached_map_t; + using local_cached_map_t = OSDMapService::local_cached_map_t; + + PGShardManager( + seastar::sharded<OSDSingletonState> &osd_singleton_state, + seastar::sharded<ShardServices> &shard_services, + seastar::sharded<PGShardMapping> &pg_to_shard_mapping) + : osd_singleton_state(osd_singleton_state), + shard_services(shard_services), + pg_to_shard_mapping(pg_to_shard_mapping) {} + + auto &get_osd_singleton_state() { + ceph_assert(seastar::this_shard_id() == PRIMARY_CORE); + return osd_singleton_state.local(); + } + auto &get_osd_singleton_state() const { + ceph_assert(seastar::this_shard_id() == PRIMARY_CORE); + return osd_singleton_state.local(); + } + auto &get_shard_services() { + return shard_services.local(); + } + auto &get_shard_services() const { + return shard_services.local(); + } + auto &get_local_state() { return get_shard_services().local_state; } + auto &get_local_state() const { return get_shard_services().local_state; } + auto &get_pg_to_shard_mapping() { return pg_to_shard_mapping.local(); } + auto &get_pg_to_shard_mapping() const { return pg_to_shard_mapping.local(); } + + seastar::future<> update_map(local_cached_map_t &&map) { + get_osd_singleton_state().update_map( + make_local_shared_foreign(local_cached_map_t(map)) + ); + /* We need each core to get its own foreign_ptr<local_cached_map_t>. + * foreign_ptr can't be cheaply copied, so we make one for each core + * up front. */ + return seastar::do_with( + std::vector<seastar::foreign_ptr<local_cached_map_t>>(), + [this, map](auto &fmaps) { + fmaps.resize(seastar::smp::count); + for (auto &i: fmaps) { + i = seastar::foreign_ptr(map); + } + return shard_services.invoke_on_all( + [&fmaps](auto &local) mutable { + local.local_state.update_map( + make_local_shared_foreign( + std::move(fmaps[seastar::this_shard_id()]) + )); + }); + }); + } + + seastar::future<> stop_registries() { + return shard_services.invoke_on_all([](auto &local) { + return local.local_state.stop_registry(); + }); + } + + FORWARD_TO_OSD_SINGLETON(send_pg_created) + + // osd state forwards + FORWARD(is_active, is_active, get_shard_services().local_state.osd_state) + FORWARD(is_preboot, is_preboot, get_shard_services().local_state.osd_state) + FORWARD(is_booting, is_booting, get_shard_services().local_state.osd_state) + FORWARD(is_stopping, is_stopping, get_shard_services().local_state.osd_state) + FORWARD(is_prestop, is_prestop, get_shard_services().local_state.osd_state) + FORWARD(is_initializing, is_initializing, get_shard_services().local_state.osd_state) + FORWARD(set_prestop, set_prestop, get_shard_services().local_state.osd_state) + FORWARD(set_preboot, set_preboot, get_shard_services().local_state.osd_state) + FORWARD(set_booting, set_booting, get_shard_services().local_state.osd_state) + FORWARD(set_stopping, set_stopping, get_shard_services().local_state.osd_state) + FORWARD(set_active, set_active, get_shard_services().local_state.osd_state) + FORWARD(when_active, when_active, get_shard_services().local_state.osd_state) + FORWARD_CONST(get_osd_state_string, to_string, get_shard_services().local_state.osd_state) + + FORWARD(got_map, got_map, get_shard_services().local_state.osdmap_gate) + FORWARD(wait_for_map, wait_for_map, get_shard_services().local_state.osdmap_gate) + + // Metacoll + 
FORWARD_TO_OSD_SINGLETON(init_meta_coll) + FORWARD_TO_OSD_SINGLETON(get_meta_coll) + + FORWARD_TO_OSD_SINGLETON(set_superblock) + + // Core OSDMap methods + FORWARD_TO_OSD_SINGLETON(get_local_map) + FORWARD_TO_OSD_SINGLETON(load_map_bl) + FORWARD_TO_OSD_SINGLETON(load_map_bls) + FORWARD_TO_OSD_SINGLETON(store_maps) + + seastar::future<> set_up_epoch(epoch_t e); + + template <typename F> + auto with_remote_shard_state(core_id_t core, F &&f) { + return shard_services.invoke_on( + core, [f=std::move(f)](auto &target_shard_services) mutable { + return std::invoke( + std::move(f), target_shard_services.local_state, + target_shard_services); + }); + } + + template <typename T, typename F> + auto with_remote_shard_state_and_op( + core_id_t core, + typename T::IRef &&op, + F &&f) { + if (seastar::this_shard_id() == core) { + auto &target_shard_services = shard_services.local(); + return std::invoke( + std::move(f), + target_shard_services.local_state, + target_shard_services, + std::move(op)); + } + return op->prepare_remote_submission( + ).then([op=std::move(op), f=std::move(f), this, core + ](auto f_conn) mutable { + return shard_services.invoke_on( + core, + [f=std::move(f), op=std::move(op), f_conn=std::move(f_conn) + ](auto &target_shard_services) mutable { + op->finish_remote_submission(std::move(f_conn)); + return std::invoke( + std::move(f), + target_shard_services.local_state, + target_shard_services, + std::move(op)); + }); + }); + } + + /// Runs opref on the appropriate core, creating the pg as necessary. + template <typename T> + seastar::future<> run_with_pg_maybe_create( + typename T::IRef op + ) { + ceph_assert(op->use_count() == 1); + auto &logger = crimson::get_logger(ceph_subsys_osd); + static_assert(T::can_create()); + logger.debug("{}: can_create", *op); + + get_local_state().registry.remove_from_registry(*op); + return get_pg_to_shard_mapping().maybe_create_pg( + op->get_pgid() + ).then([this, op = std::move(op)](auto core) mutable { + return this->template with_remote_shard_state_and_op<T>( + core, std::move(op), + [](PerShardState &per_shard_state, + ShardServices &shard_services, + typename T::IRef op) { + per_shard_state.registry.add_to_registry(*op); + auto &logger = crimson::get_logger(ceph_subsys_osd); + auto &opref = *op; + return opref.template with_blocking_event< + PGMap::PGCreationBlockingEvent + >([&shard_services, &opref]( + auto &&trigger) { + return shard_services.get_or_create_pg( + std::move(trigger), + opref.get_pgid(), + std::move(opref.get_create_info()) + ); + }).safe_then([&logger, &shard_services, &opref](Ref<PG> pgref) { + logger.debug("{}: have_pg", opref); + return opref.with_pg(shard_services, pgref); + }).handle_error( + crimson::ct_error::ecanceled::handle([&logger, &opref](auto) { + logger.debug("{}: pg creation canceled, dropping", opref); + return seastar::now(); + }) + ).then([op=std::move(op)] {}); + }); + }); + } + + /// Runs opref on the appropriate core, waiting for pg as necessary + template <typename T> + seastar::future<> run_with_pg_maybe_wait( + typename T::IRef op + ) { + ceph_assert(op->use_count() == 1); + auto &logger = crimson::get_logger(ceph_subsys_osd); + static_assert(!T::can_create()); + logger.debug("{}: !can_create", *op); + + get_local_state().registry.remove_from_registry(*op); + return get_pg_to_shard_mapping().maybe_create_pg( + op->get_pgid() + ).then([this, op = std::move(op)](auto core) mutable { + return this->template with_remote_shard_state_and_op<T>( + core, std::move(op), + [](PerShardState &per_shard_state, + 
ShardServices &shard_services, + typename T::IRef op) { + per_shard_state.registry.add_to_registry(*op); + auto &logger = crimson::get_logger(ceph_subsys_osd); + auto &opref = *op; + return opref.template with_blocking_event< + PGMap::PGCreationBlockingEvent + >([&shard_services, &opref]( + auto &&trigger) { + return shard_services.wait_for_pg( + std::move(trigger), opref.get_pgid()); + }).safe_then([&logger, &shard_services, &opref](Ref<PG> pgref) { + logger.debug("{}: have_pg", opref); + return opref.with_pg(shard_services, pgref); + }).handle_error( + crimson::ct_error::ecanceled::handle([&logger, &opref](auto) { + logger.debug("{}: pg creation canceled, dropping", opref); + return seastar::now(); + }) + ).then([op=std::move(op)] {}); + }); + }); + } + + seastar::future<> load_pgs(crimson::os::FuturizedStore& store); + seastar::future<> stop_pgs(); + + seastar::future<std::map<pg_t, pg_stat_t>> get_pg_stats() const; + + /** + * invoke_on_each_shard_seq + * + * Invokes f on the ShardServices of each shard sequentially. + */ + template <typename F, typename... Args> + seastar::future<> invoke_on_each_shard_seq( + F &&f) const { + return sharded_map_seq( + shard_services, + [f=std::forward<F>(f)](const ShardServices &shard_services) mutable { + return std::invoke( + f, + shard_services); + }); + } + + /** + * for_each_pg + * + * Invokes f on each pg sequentially. Caller may rely on f not being + * invoked concurrently on multiple cores. + */ + template <typename F> + seastar::future<> for_each_pg(F &&f) const { + return invoke_on_each_shard_seq( + [f=std::move(f)](const auto &local_service) mutable { + for (auto &pg: local_service.local_state.pg_map.get_pgs()) { + std::apply(f, pg); + } + return seastar::now(); + }); + } + + /** + * for_each_pgid + * + * Synchronously invokes f on each pgid. + */ + template <typename F> + void for_each_pgid(F &&f) const { + return get_pg_to_shard_mapping().for_each_pgid( + std::forward<F>(f)); + } + + auto get_num_pgs() const { + return get_pg_to_shard_mapping().get_num_pgs(); + } + + seastar::future<> broadcast_map_to_pgs(epoch_t epoch); + + template <typename F> + auto with_pg(spg_t pgid, F &&f) { + core_id_t core = get_pg_to_shard_mapping().get_pg_mapping(pgid); + return with_remote_shard_state( + core, + [pgid, f=std::move(f)](auto &local_state, auto &local_service) mutable { + return std::invoke( + std::move(f), + local_state.pg_map.get_pg(pgid)); + }); + } + + template <typename T, typename... Args> + auto start_pg_operation(Args&&... 
args) { + auto op = get_local_state().registry.create_operation<T>( + std::forward<Args>(args)...); + auto &logger = crimson::get_logger(ceph_subsys_osd); + logger.debug("{}: starting {}", *op, __func__); + + auto &opref = *op; + auto id = op->get_id(); + if constexpr (T::is_trackable) { + op->template track_event<typename T::StartEvent>(); + } + auto fut = opref.template enter_stage<>( + opref.get_connection_pipeline().await_active + ).then([this, &opref, &logger] { + logger.debug("{}: start_pg_operation in await_active stage", opref); + return get_shard_services().local_state.osd_state.when_active(); + }).then([&logger, &opref] { + logger.debug("{}: start_pg_operation active, entering await_map", opref); + return opref.template enter_stage<>( + opref.get_connection_pipeline().await_map); + }).then([this, &logger, &opref] { + logger.debug("{}: start_pg_operation await_map stage", opref); + using OSDMapBlockingEvent = + OSD_OSDMapGate::OSDMapBlocker::BlockingEvent; + return opref.template with_blocking_event<OSDMapBlockingEvent>( + [this, &opref](auto &&trigger) { + std::ignore = this; + return get_shard_services().local_state.osdmap_gate.wait_for_map( + std::move(trigger), + opref.get_epoch(), + &get_shard_services()); + }); + }).then([&logger, &opref](auto epoch) { + logger.debug("{}: got map {}, entering get_pg", opref, epoch); + return opref.template enter_stage<>( + opref.get_connection_pipeline().get_pg); + }).then([this, &logger, &opref, op=std::move(op)]() mutable { + logger.debug("{}: in get_pg core {}", opref, seastar::this_shard_id()); + logger.debug("{}: in get_pg", opref); + if constexpr (T::can_create()) { + logger.debug("{}: can_create", opref); + return run_with_pg_maybe_create<T>(std::move(op)); + } else { + logger.debug("{}: !can_create", opref); + return run_with_pg_maybe_wait<T>(std::move(op)); + } + }); + return std::make_pair(id, std::move(fut)); + } + +#undef FORWARD +#undef FORWARD_CONST +#undef FORWARD_TO_OSD_SINGLETON +}; + +} diff --git a/src/crimson/osd/recovery_backend.cc b/src/crimson/osd/recovery_backend.cc new file mode 100644 index 000000000..b5394bfdc --- /dev/null +++ b/src/crimson/osd/recovery_backend.cc @@ -0,0 +1,328 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <fmt/format.h> + +#include "crimson/common/exception.h" +#include "crimson/osd/recovery_backend.h" +#include "crimson/osd/pg.h" +#include "crimson/osd/pg_backend.h" +#include "crimson/osd/osd_operations/background_recovery.h" + +#include "messages/MOSDFastDispatchOp.h" +#include "osd/osd_types.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_osd); + } +} + +hobject_t RecoveryBackend::get_temp_recovery_object( + const hobject_t& target, + eversion_t version) const +{ + hobject_t hoid = + target.make_temp_hobject(fmt::format("temp_recovering_{}_{}_{}_{}", + pg.get_info().pgid, + version, + pg.get_info().history.same_interval_since, + target.snap)); + logger().debug("{} {}", __func__, hoid); + return hoid; +} + +void RecoveryBackend::clean_up(ceph::os::Transaction& t, + std::string_view why) +{ + for (auto& soid : temp_contents) { + t.remove(pg.get_collection_ref()->get_cid(), + ghobject_t(soid, ghobject_t::NO_GEN, pg.get_pg_whoami().shard)); + } + temp_contents.clear(); + + for (auto& [soid, recovery_waiter] : recovering) { + if ((recovery_waiter->pull_info + && recovery_waiter->pull_info->is_complete()) + || (!recovery_waiter->pull_info + && recovery_waiter->obc && 
recovery_waiter->obc->obs.exists)) { + recovery_waiter->obc->interrupt( + ::crimson::common::actingset_changed( + pg.is_primary())); + recovery_waiter->interrupt(why); + } + } + recovering.clear(); +} + +void RecoveryBackend::WaitForObjectRecovery::stop() { + readable.set_exception( + crimson::common::system_shutdown_exception()); + recovered.set_exception( + crimson::common::system_shutdown_exception()); + pulled.set_exception( + crimson::common::system_shutdown_exception()); + for (auto& [pg_shard, pr] : pushes) { + pr.set_exception( + crimson::common::system_shutdown_exception()); + } +} + +void RecoveryBackend::handle_backfill_finish( + MOSDPGBackfill& m, + crimson::net::ConnectionRef conn) +{ + logger().debug("{}", __func__); + ceph_assert(!pg.is_primary()); + ceph_assert(crimson::common::local_conf()->osd_kill_backfill_at != 1); + auto reply = crimson::make_message<MOSDPGBackfill>( + MOSDPGBackfill::OP_BACKFILL_FINISH_ACK, + pg.get_osdmap_epoch(), + m.query_epoch, + spg_t(pg.get_pgid().pgid, pg.get_primary().shard)); + reply->set_priority(pg.get_recovery_op_priority()); + std::ignore = conn->send(std::move(reply)); + shard_services.start_operation<crimson::osd::LocalPeeringEvent>( + static_cast<crimson::osd::PG*>(&pg), + pg.get_pg_whoami(), + pg.get_pgid(), + pg.get_osdmap_epoch(), + pg.get_osdmap_epoch(), + RecoveryDone{}); +} + +RecoveryBackend::interruptible_future<> +RecoveryBackend::handle_backfill_progress( + MOSDPGBackfill& m) +{ + logger().debug("{}", __func__); + ceph_assert(!pg.is_primary()); + ceph_assert(crimson::common::local_conf()->osd_kill_backfill_at != 2); + + ObjectStore::Transaction t; + pg.get_peering_state().update_backfill_progress( + m.last_backfill, + m.stats, + m.op == MOSDPGBackfill::OP_BACKFILL_PROGRESS, + t); + logger().debug("RecoveryBackend::handle_backfill_progress: do_transaction..."); + return shard_services.get_store().do_transaction( + pg.get_collection_ref(), std::move(t)).or_terminate(); +} + +RecoveryBackend::interruptible_future<> +RecoveryBackend::handle_backfill_finish_ack( + MOSDPGBackfill& m) +{ + logger().debug("{}", __func__); + ceph_assert(pg.is_primary()); + ceph_assert(crimson::common::local_conf()->osd_kill_backfill_at != 3); + // TODO: + // finish_recovery_op(hobject_t::get_max()); + return seastar::now(); +} + +RecoveryBackend::interruptible_future<> +RecoveryBackend::handle_backfill( + MOSDPGBackfill& m, + crimson::net::ConnectionRef conn) +{ + logger().debug("{}", __func__); + if (pg.old_peering_msg(m.map_epoch, m.query_epoch)) { + logger().debug("{}: discarding {}", __func__, m); + return seastar::now(); + } + switch (m.op) { + case MOSDPGBackfill::OP_BACKFILL_FINISH: + handle_backfill_finish(m, conn); + [[fallthrough]]; + case MOSDPGBackfill::OP_BACKFILL_PROGRESS: + return handle_backfill_progress(m); + case MOSDPGBackfill::OP_BACKFILL_FINISH_ACK: + return handle_backfill_finish_ack(m); + default: + ceph_assert("unknown op type for pg backfill"); + return seastar::now(); + } +} + +RecoveryBackend::interruptible_future<> +RecoveryBackend::handle_backfill_remove( + MOSDPGBackfillRemove& m) +{ + logger().debug("{} m.ls={}", __func__, m.ls); + assert(m.get_type() == MSG_OSD_PG_BACKFILL_REMOVE); + if (pg.can_discard_replica_op(m)) { + logger().debug("{}: discarding {}", __func__, m); + return seastar::now(); + } + ObjectStore::Transaction t; + for ([[maybe_unused]] const auto& [soid, ver] : m.ls) { + // TODO: the reserved space management. PG::try_reserve_recovery_space(). 
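+      // Each (soid, version) entry in m.ls names an object this OSD should
+      // drop; every remove is queued into t and all of them are applied by
+      // the single do_transaction() call below.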
+ t.remove(pg.get_collection_ref()->get_cid(), + ghobject_t(soid, ghobject_t::NO_GEN, pg.get_pg_whoami().shard)); + } + logger().debug("RecoveryBackend::handle_backfill_remove: do_transaction..."); + return shard_services.get_store().do_transaction( + pg.get_collection_ref(), std::move(t)).or_terminate(); +} + +RecoveryBackend::interruptible_future<BackfillInterval> +RecoveryBackend::scan_for_backfill( + const hobject_t& start, + [[maybe_unused]] const std::int64_t min, + const std::int64_t max) +{ + logger().debug("{} starting from {}", __func__, start); + auto version_map = seastar::make_lw_shared<std::map<hobject_t, eversion_t>>(); + return backend->list_objects(start, max).then_interruptible( + [this, start, version_map] (auto&& ret) { + auto&& [objects, next] = std::move(ret); + return seastar::do_with( + std::move(objects), + [this, version_map](auto &objects) { + return interruptor::parallel_for_each(objects, + [this, version_map] (const hobject_t& object) + -> interruptible_future<> { + crimson::osd::ObjectContextRef obc; + if (pg.is_primary()) { + obc = pg.obc_registry.maybe_get_cached_obc(object); + } + if (obc) { + if (obc->obs.exists) { + logger().debug("scan_for_backfill found (primary): {} {}", + object, obc->obs.oi.version); + version_map->emplace(object, obc->obs.oi.version); + } else { + // if the object does not exist here, it must have been removed + // between the collection_list_partial and here. This can happen + // for the first item in the range, which is usually last_backfill. + } + return seastar::now(); + } else { + return backend->load_metadata(object).safe_then_interruptible( + [version_map, object] (auto md) { + if (md->os.exists) { + logger().debug("scan_for_backfill found: {} {}", + object, md->os.oi.version); + version_map->emplace(object, md->os.oi.version); + } + return seastar::now(); + }, PGBackend::load_metadata_ertr::assert_all{}); + } + }); + }).then_interruptible([version_map, start=std::move(start), next=std::move(next), this] { + BackfillInterval bi; + bi.begin = std::move(start); + bi.end = std::move(next); + bi.version = pg.get_info().last_update; + bi.objects = std::move(*version_map); + logger().debug("{} BackfillInterval filled, leaving", + "scan_for_backfill"); + return seastar::make_ready_future<BackfillInterval>(std::move(bi)); + }); + }); +} + +RecoveryBackend::interruptible_future<> +RecoveryBackend::handle_scan_get_digest( + MOSDPGScan& m, + crimson::net::ConnectionRef conn) +{ + logger().debug("{}", __func__); + if (false /* FIXME: check for backfill too full */) { + std::ignore = shard_services.start_operation<crimson::osd::LocalPeeringEvent>( + // TODO: abstract start_background_recovery + static_cast<crimson::osd::PG*>(&pg), + pg.get_pg_whoami(), + pg.get_pgid(), + pg.get_osdmap_epoch(), + pg.get_osdmap_epoch(), + PeeringState::BackfillTooFull()); + return seastar::now(); + } + return scan_for_backfill( + std::move(m.begin), + crimson::common::local_conf().get_val<std::int64_t>("osd_backfill_scan_min"), + crimson::common::local_conf().get_val<std::int64_t>("osd_backfill_scan_max") + ).then_interruptible( + [this, query_epoch=m.query_epoch, conn + ](auto backfill_interval) { + auto reply = crimson::make_message<MOSDPGScan>( + MOSDPGScan::OP_SCAN_DIGEST, + pg.get_pg_whoami(), + pg.get_osdmap_epoch(), + query_epoch, + spg_t(pg.get_info().pgid.pgid, pg.get_primary().shard), + backfill_interval.begin, + backfill_interval.end); + encode(backfill_interval.objects, reply->get_data()); + return conn->send(std::move(reply)); + }); +} + 
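For readers new to backfill, the scan/digest exchange above is easier to follow stripped of the Seastar and messaging plumbing. The sketch below is a simplified, self-contained illustration only: object_id, version_t, scan_digest and list_objects are hypothetical stand-ins for hobject_t, eversion_t, BackfillInterval and the backend's object listing, and the real code does all of this asynchronously while reading each object's version from its metadata (or from a cached object context on the primary).

#include <cstdint>
#include <iostream>
#include <map>
#include <string>
#include <utility>
#include <vector>

// Hypothetical stand-ins for hobject_t / eversion_t; the real types carry
// far more state (snapshots, hash, pool, epoch bookkeeping, ...).
using object_id = std::string;
struct version_t { std::uint64_t epoch = 0; std::uint64_t v = 0; };

// Simplified analogue of BackfillInterval: a half-open range of the
// (hash-ordered) object namespace plus the local version of every object
// found inside it.
struct scan_digest {
  object_id begin;
  object_id end;                           // first object *not* included
  std::map<object_id, version_t> objects;  // object -> local version
};

// Plays the role of the backend listing call: return up to max_objects
// starting at `start`, plus the cursor where the next scan should resume.
std::pair<std::vector<object_id>, object_id>
list_objects(const std::map<object_id, version_t>& store,
             const object_id& start, std::size_t max_objects)
{
  std::vector<object_id> out;
  auto it = store.lower_bound(start);
  for (; it != store.end() && out.size() < max_objects; ++it) {
    out.push_back(it->first);
  }
  return {out, it == store.end() ? object_id{"<max>"} : it->first};
}

scan_digest scan_for_backfill(const std::map<object_id, version_t>& store,
                              const object_id& start, std::size_t max_objects)
{
  scan_digest digest;
  digest.begin = start;
  auto [objects, next] = list_objects(store, start, max_objects);
  for (const auto& oid : objects) {
    digest.objects.emplace(oid, store.at(oid));  // record the local version
  }
  digest.end = next;
  return digest;
}

int main()
{
  const std::map<object_id, version_t> store{
    {"obj.0001", {12, 3}}, {"obj.0002", {12, 7}}, {"obj.0003", {13, 1}},
  };
  const auto digest = scan_for_backfill(store, "obj.0001", 2);
  std::cout << "interval [" << digest.begin << ", " << digest.end << ") with "
            << digest.objects.size() << " objects\n";  // 2 objects, end obj.0003
}

A digest of this shape is what OP_SCAN_DIGEST carries back to the primary, which compares it against its own scan of the same range to decide which objects the backfill target still needs.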
+RecoveryBackend::interruptible_future<> +RecoveryBackend::handle_scan_digest( + MOSDPGScan& m) +{ + logger().debug("{}", __func__); + // Check that from is in backfill_targets vector + ceph_assert(pg.is_backfill_target(m.from)); + + BackfillInterval bi; + bi.begin = m.begin; + bi.end = m.end; + { + auto p = m.get_data().cbegin(); + // take care to preserve ordering! + bi.clear_objects(); + ::decode_noclear(bi.objects, p); + } + shard_services.start_operation<crimson::osd::BackfillRecovery>( + static_cast<crimson::osd::PG*>(&pg), + shard_services, + pg.get_osdmap_epoch(), + crimson::osd::BackfillState::ReplicaScanned{ m.from, std::move(bi) }); + return seastar::now(); +} + +RecoveryBackend::interruptible_future<> +RecoveryBackend::handle_scan( + MOSDPGScan& m, + crimson::net::ConnectionRef conn) +{ + logger().debug("{}", __func__); + if (pg.old_peering_msg(m.map_epoch, m.query_epoch)) { + logger().debug("{}: discarding {}", __func__, m); + return seastar::now(); + } + switch (m.op) { + case MOSDPGScan::OP_SCAN_GET_DIGEST: + return handle_scan_get_digest(m, conn); + case MOSDPGScan::OP_SCAN_DIGEST: + return handle_scan_digest(m); + default: + // FIXME: move to errorator + ceph_assert("unknown op type for pg scan"); + return seastar::now(); + } +} + +RecoveryBackend::interruptible_future<> +RecoveryBackend::handle_recovery_op( + Ref<MOSDFastDispatchOp> m, + crimson::net::ConnectionRef conn) +{ + switch (m->get_header().type) { + case MSG_OSD_PG_BACKFILL: + return handle_backfill(*boost::static_pointer_cast<MOSDPGBackfill>(m), conn); + case MSG_OSD_PG_BACKFILL_REMOVE: + return handle_backfill_remove(*boost::static_pointer_cast<MOSDPGBackfillRemove>(m)); + case MSG_OSD_PG_SCAN: + return handle_scan(*boost::static_pointer_cast<MOSDPGScan>(m), conn); + default: + return seastar::make_exception_future<>( + std::invalid_argument(fmt::format("invalid request type: {}", + m->get_header().type))); + } +} diff --git a/src/crimson/osd/recovery_backend.h b/src/crimson/osd/recovery_backend.h new file mode 100644 index 000000000..65e9bb01f --- /dev/null +++ b/src/crimson/osd/recovery_backend.h @@ -0,0 +1,233 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <seastar/core/future.hh> + +#include "crimson/common/type_helpers.h" +#include "crimson/os/futurized_store.h" +#include "crimson/os/futurized_collection.h" +#include "crimson/osd/pg_interval_interrupt_condition.h" +#include "crimson/osd/object_context.h" +#include "crimson/osd/shard_services.h" + +#include "messages/MOSDPGBackfill.h" +#include "messages/MOSDPGBackfillRemove.h" +#include "messages/MOSDPGScan.h" +#include "osd/recovery_types.h" +#include "osd/osd_types.h" + +namespace crimson::osd{ + class PG; +} + +class PGBackend; + +class RecoveryBackend { +public: + class WaitForObjectRecovery; +public: + template <typename T = void> + using interruptible_future = + ::crimson::interruptible::interruptible_future< + ::crimson::osd::IOInterruptCondition, T>; + using interruptor = + ::crimson::interruptible::interruptor< + ::crimson::osd::IOInterruptCondition>; + RecoveryBackend(crimson::osd::PG& pg, + crimson::osd::ShardServices& shard_services, + crimson::os::CollectionRef coll, + PGBackend* backend) + : pg{pg}, + shard_services{shard_services}, + store{&shard_services.get_store()}, + coll{coll}, + backend{backend} {} + virtual ~RecoveryBackend() {} + WaitForObjectRecovery& add_recovering(const hobject_t& soid) { + auto [it, added] = recovering.emplace(soid, new 
WaitForObjectRecovery{}); + assert(added); + return *(it->second); + } + WaitForObjectRecovery& get_recovering(const hobject_t& soid) { + assert(is_recovering(soid)); + return *(recovering.at(soid)); + } + void remove_recovering(const hobject_t& soid) { + recovering.erase(soid); + } + bool is_recovering(const hobject_t& soid) const { + return recovering.count(soid) != 0; + } + uint64_t total_recovering() const { + return recovering.size(); + } + + virtual interruptible_future<> handle_recovery_op( + Ref<MOSDFastDispatchOp> m, + crimson::net::ConnectionRef conn); + + virtual interruptible_future<> recover_object( + const hobject_t& soid, + eversion_t need) = 0; + virtual interruptible_future<> recover_delete( + const hobject_t& soid, + eversion_t need) = 0; + virtual interruptible_future<> push_delete( + const hobject_t& soid, + eversion_t need) = 0; + + interruptible_future<BackfillInterval> scan_for_backfill( + const hobject_t& from, + std::int64_t min, + std::int64_t max); + + void on_peering_interval_change(ceph::os::Transaction& t) { + clean_up(t, "new peering interval"); + } + + seastar::future<> stop() { + for (auto& [soid, recovery_waiter] : recovering) { + recovery_waiter->stop(); + } + return on_stop(); + } +protected: + crimson::osd::PG& pg; + crimson::osd::ShardServices& shard_services; + crimson::os::FuturizedStore::Shard* store; + crimson::os::CollectionRef coll; + PGBackend* backend; + + struct pull_info_t { + pg_shard_t from; + hobject_t soid; + ObjectRecoveryProgress recovery_progress; + ObjectRecoveryInfo recovery_info; + crimson::osd::ObjectContextRef head_ctx; + crimson::osd::ObjectContextRef obc; + object_stat_sum_t stat; + bool is_complete() const { + return recovery_progress.is_complete(recovery_info); + } + }; + + struct push_info_t { + ObjectRecoveryProgress recovery_progress; + ObjectRecoveryInfo recovery_info; + crimson::osd::ObjectContextRef obc; + object_stat_sum_t stat; + }; + +public: + class WaitForObjectRecovery : + public boost::intrusive_ref_counter< + WaitForObjectRecovery, boost::thread_unsafe_counter>, + public crimson::BlockerT<WaitForObjectRecovery> { + seastar::shared_promise<> readable, recovered, pulled; + std::map<pg_shard_t, seastar::shared_promise<>> pushes; + public: + static constexpr const char* type_name = "WaitForObjectRecovery"; + + crimson::osd::ObjectContextRef obc; + std::optional<pull_info_t> pull_info; + std::map<pg_shard_t, push_info_t> pushing; + + seastar::future<> wait_for_readable() { + return readable.get_shared_future(); + } + seastar::future<> wait_for_pushes(pg_shard_t shard) { + return pushes[shard].get_shared_future(); + } + seastar::future<> wait_for_recovered() { + return recovered.get_shared_future(); + } + template <typename T, typename F> + auto wait_track_blocking(T &trigger, F &&fut) { + WaitForObjectRecoveryRef ref = this; + return track_blocking( + trigger, + std::forward<F>(fut) + ).finally([ref] {}); + } + template <typename T> + seastar::future<> wait_for_recovered(T &trigger) { + WaitForObjectRecoveryRef ref = this; + return wait_track_blocking(trigger, recovered.get_shared_future()); + } + seastar::future<> wait_for_pull() { + return pulled.get_shared_future(); + } + void set_readable() { + readable.set_value(); + } + void set_recovered() { + recovered.set_value(); + } + void set_pushed(pg_shard_t shard) { + pushes[shard].set_value(); + } + void set_pulled() { + pulled.set_value(); + } + void set_push_failed(pg_shard_t shard, std::exception_ptr e) { + pushes.at(shard).set_exception(e); + } + void 
interrupt(std::string_view why) { + readable.set_exception(std::system_error( + std::make_error_code(std::errc::interrupted), why.data())); + recovered.set_exception(std::system_error( + std::make_error_code(std::errc::interrupted), why.data())); + pulled.set_exception(std::system_error( + std::make_error_code(std::errc::interrupted), why.data())); + for (auto& [pg_shard, pr] : pushes) { + pr.set_exception(std::system_error( + std::make_error_code(std::errc::interrupted), why.data())); + } + } + void stop(); + void dump_detail(Formatter* f) const { + } + }; + using RecoveryBlockingEvent = + crimson::AggregateBlockingEvent<WaitForObjectRecovery::BlockingEvent>; + using WaitForObjectRecoveryRef = boost::intrusive_ptr<WaitForObjectRecovery>; +protected: + std::map<hobject_t, WaitForObjectRecoveryRef> recovering; + hobject_t get_temp_recovery_object( + const hobject_t& target, + eversion_t version) const; + + boost::container::flat_set<hobject_t> temp_contents; + + void add_temp_obj(const hobject_t &oid) { + temp_contents.insert(oid); + } + void clear_temp_obj(const hobject_t &oid) { + temp_contents.erase(oid); + } + void clean_up(ceph::os::Transaction& t, std::string_view why); + virtual seastar::future<> on_stop() = 0; +private: + void handle_backfill_finish( + MOSDPGBackfill& m, + crimson::net::ConnectionRef conn); + interruptible_future<> handle_backfill_progress( + MOSDPGBackfill& m); + interruptible_future<> handle_backfill_finish_ack( + MOSDPGBackfill& m); + interruptible_future<> handle_backfill( + MOSDPGBackfill& m, + crimson::net::ConnectionRef conn); + + interruptible_future<> handle_scan_get_digest( + MOSDPGScan& m, + crimson::net::ConnectionRef conn); + interruptible_future<> handle_scan_digest( + MOSDPGScan& m); + interruptible_future<> handle_scan( + MOSDPGScan& m, + crimson::net::ConnectionRef conn); + interruptible_future<> handle_backfill_remove(MOSDPGBackfillRemove& m); +}; diff --git a/src/crimson/osd/replicated_backend.cc b/src/crimson/osd/replicated_backend.cc new file mode 100644 index 000000000..0ff4ad573 --- /dev/null +++ b/src/crimson/osd/replicated_backend.cc @@ -0,0 +1,174 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "replicated_backend.h" + +#include "messages/MOSDRepOpReply.h" + +#include "crimson/common/exception.h" +#include "crimson/common/log.h" +#include "crimson/os/futurized_store.h" +#include "crimson/osd/shard_services.h" +#include "osd/PeeringState.h" + +SET_SUBSYS(osd); + +ReplicatedBackend::ReplicatedBackend(pg_t pgid, + pg_shard_t whoami, + ReplicatedBackend::CollectionRef coll, + crimson::osd::ShardServices& shard_services, + DoutPrefixProvider &dpp) + : PGBackend{whoami.shard, coll, shard_services, dpp}, + pgid{pgid}, + whoami{whoami} +{} + +ReplicatedBackend::ll_read_ierrorator::future<ceph::bufferlist> +ReplicatedBackend::_read(const hobject_t& hoid, + const uint64_t off, + const uint64_t len, + const uint32_t flags) +{ + return store->read(coll, ghobject_t{hoid}, off, len, flags); +} + +ReplicatedBackend::rep_op_fut_t +ReplicatedBackend::_submit_transaction(std::set<pg_shard_t>&& pg_shards, + const hobject_t& hoid, + ceph::os::Transaction&& txn, + osd_op_params_t&& osd_op_p, + epoch_t min_epoch, epoch_t map_epoch, + std::vector<pg_log_entry_t>&& log_entries) +{ + LOG_PREFIX(ReplicatedBackend::_submit_transaction); + + const ceph_tid_t tid = shard_services.get_tid(); + auto pending_txn = + pending_trans.try_emplace(tid, pg_shards.size(), osd_op_p.at_version).first; + bufferlist 
encoded_txn; + encode(txn, encoded_txn); + + DEBUGDPP("object {}", dpp, hoid); + auto all_completed = interruptor::make_interruptible( + shard_services.get_store().do_transaction(coll, std::move(txn)) + ).then_interruptible([FNAME, this, + peers=pending_txn->second.weak_from_this()] { + if (!peers) { + // for now, only actingset_changed can cause peers + // to be nullptr + ERRORDPP("peers is null, this should be impossible", dpp); + assert(0 == "impossible"); + } + if (--peers->pending == 0) { + peers->all_committed.set_value(); + peers->all_committed = {}; + return seastar::now(); + } + return peers->all_committed.get_shared_future(); + }).then_interruptible([pending_txn, this] { + auto acked_peers = std::move(pending_txn->second.acked_peers); + pending_trans.erase(pending_txn); + return seastar::make_ready_future<crimson::osd::acked_peers_t>(std::move(acked_peers)); + }); + + auto sends = std::make_unique<std::vector<seastar::future<>>>(); + for (auto pg_shard : pg_shards) { + if (pg_shard != whoami) { + auto m = crimson::make_message<MOSDRepOp>( + osd_op_p.req_id, + whoami, + spg_t{pgid, pg_shard.shard}, + hoid, + CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK, + map_epoch, + min_epoch, + tid, + osd_op_p.at_version); + m->set_data(encoded_txn); + pending_txn->second.acked_peers.push_back({pg_shard, eversion_t{}}); + encode(log_entries, m->logbl); + m->pg_trim_to = osd_op_p.pg_trim_to; + m->min_last_complete_ondisk = osd_op_p.min_last_complete_ondisk; + m->set_rollback_to(osd_op_p.at_version); + // TODO: set more stuff. e.g., pg_states + sends->emplace_back(shard_services.send_to_osd(pg_shard.osd, std::move(m), map_epoch)); + } + } + auto sends_complete = seastar::when_all_succeed( + sends->begin(), sends->end() + ).finally([sends=std::move(sends)] {}); + return {std::move(sends_complete), std::move(all_completed)}; +} + +void ReplicatedBackend::on_actingset_changed(bool same_primary) +{ + crimson::common::actingset_changed e_actingset_changed{same_primary}; + for (auto& [tid, pending_txn] : pending_trans) { + pending_txn.all_committed.set_exception(e_actingset_changed); + } + pending_trans.clear(); +} + +void ReplicatedBackend::got_rep_op_reply(const MOSDRepOpReply& reply) +{ + LOG_PREFIX(ReplicatedBackend::got_rep_op_reply); + auto found = pending_trans.find(reply.get_tid()); + if (found == pending_trans.end()) { + WARNDPP("cannot find rep op for message {}", dpp, reply); + return; + } + auto& peers = found->second; + for (auto& peer : peers.acked_peers) { + if (peer.shard == reply.from) { + peer.last_complete_ondisk = reply.get_last_complete_ondisk(); + if (--peers.pending == 0) { + peers.all_committed.set_value(); + peers.all_committed = {}; + } + return; + } + } +} + +seastar::future<> ReplicatedBackend::stop() +{ + LOG_PREFIX(ReplicatedBackend::stop); + INFODPP("cid {}", coll->get_cid()); + for (auto& [tid, pending_on] : pending_trans) { + pending_on.all_committed.set_exception( + crimson::common::system_shutdown_exception()); + } + pending_trans.clear(); + return seastar::now(); +} + +seastar::future<> +ReplicatedBackend::request_committed(const osd_reqid_t& reqid, + const eversion_t& at_version) +{ + if (std::empty(pending_trans)) { + return seastar::now(); + } + auto iter = pending_trans.begin(); + auto& pending_txn = iter->second; + if (pending_txn.at_version > at_version) { + return seastar::now(); + } + for (; iter->second.at_version < at_version; ++iter); + // As for now, the previous client_request with the same reqid + // mustn't have finished, as that would mean later 
client_requests + // has finished before earlier ones. + // + // The following line of code should be "assert(pending_txn.at_version == at_version)", + // as there can be only one transaction at any time in pending_trans due to + // PG::request_pg_pipeline. But there's a high possibility that we will + // improve the parallelism here in the future, which means there may be multiple + // client requests in flight, so we loosed the restriction to as follows. Correct + // me if I'm wrong:-) + assert(iter != pending_trans.end() && iter->second.at_version == at_version); + if (iter->second.pending) { + return iter->second.all_committed.get_shared_future(); + } else { + return seastar::now(); + } +} diff --git a/src/crimson/osd/replicated_backend.h b/src/crimson/osd/replicated_backend.h new file mode 100644 index 000000000..f789a35ea --- /dev/null +++ b/src/crimson/osd/replicated_backend.h @@ -0,0 +1,61 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <boost/intrusive_ptr.hpp> +#include <seastar/core/future.hh> +#include <seastar/core/weak_ptr.hh> +#include "include/buffer_fwd.h" +#include "osd/osd_types.h" + +#include "acked_peers.h" +#include "pg_backend.h" + +namespace crimson::osd { + class ShardServices; +} + +class ReplicatedBackend : public PGBackend +{ +public: + ReplicatedBackend(pg_t pgid, pg_shard_t whoami, + CollectionRef coll, + crimson::osd::ShardServices& shard_services, + DoutPrefixProvider &dpp); + void got_rep_op_reply(const MOSDRepOpReply& reply) final; + seastar::future<> stop() final; + void on_actingset_changed(bool same_primary) final; +private: + ll_read_ierrorator::future<ceph::bufferlist> + _read(const hobject_t& hoid, uint64_t off, + uint64_t len, uint32_t flags) override; + rep_op_fut_t _submit_transaction(std::set<pg_shard_t>&& pg_shards, + const hobject_t& hoid, + ceph::os::Transaction&& txn, + osd_op_params_t&& osd_op_p, + epoch_t min_epoch, epoch_t max_epoch, + std::vector<pg_log_entry_t>&& log_entries) final; + const pg_t pgid; + const pg_shard_t whoami; + class pending_on_t : public seastar::weakly_referencable<pending_on_t> { + public: + pending_on_t(size_t pending, const eversion_t& at_version) + : pending{static_cast<unsigned>(pending)}, at_version(at_version) + {} + unsigned pending; + // The order of pending_txns' at_version must be the same as their + // corresponding ceph_tid_t, as we rely on this condition for checking + // whether a client request is already completed. To put it another + // way, client requests at_version must be updated synchorously/simultaneously + // with ceph_tid_t. 
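+    // Illustrative example (not part of the original comment): if tid 17 is
+    // assigned at_version 32'5 and tid 18 gets 32'6, then walking
+    // pending_trans in tid order also walks at_version in order, which is
+    // exactly what request_committed() relies on when it scans for a given
+    // version.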
+ const eversion_t at_version; + crimson::osd::acked_peers_t acked_peers; + seastar::shared_promise<> all_committed; + }; + using pending_transactions_t = std::map<ceph_tid_t, pending_on_t>; + pending_transactions_t pending_trans; + + seastar::future<> request_committed( + const osd_reqid_t& reqid, const eversion_t& at_version) final; +}; diff --git a/src/crimson/osd/replicated_recovery_backend.cc b/src/crimson/osd/replicated_recovery_backend.cc new file mode 100644 index 000000000..bd301cc2b --- /dev/null +++ b/src/crimson/osd/replicated_recovery_backend.cc @@ -0,0 +1,1182 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab expandtab + +#include <fmt/format.h> +#include <fmt/ostream.h> +#include <seastar/core/future.hh> +#include <seastar/core/do_with.hh> + +#include "crimson/osd/pg.h" +#include "crimson/osd/pg_backend.h" +#include "osd/osd_types_fmt.h" +#include "replicated_recovery_backend.h" +#include "msg/Message.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_osd); + } +} + +using std::less; +using std::map; +using std::string; + +RecoveryBackend::interruptible_future<> +ReplicatedRecoveryBackend::recover_object( + const hobject_t& soid, + eversion_t need) +{ + logger().debug("{}: {}, {}", __func__, soid, need); + // always add_recovering(soid) before recover_object(soid) + assert(is_recovering(soid)); + // start tracking the recovery of soid + return maybe_pull_missing_obj(soid, need).then_interruptible([this, soid, need] { + logger().debug("recover_object: loading obc: {}", soid); + return pg.obc_loader.with_obc<RWState::RWREAD>(soid, + [this, soid, need](auto obc) { + logger().debug("recover_object: loaded obc: {}", obc->obs.oi.soid); + auto& recovery_waiter = get_recovering(soid); + recovery_waiter.obc = obc; + recovery_waiter.obc->wait_recovery_read(); + return maybe_push_shards(soid, need); + }).handle_error_interruptible( + crimson::osd::PG::load_obc_ertr::all_same_way([soid](auto& code) { + // TODO: may need eio handling? 
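+        // This handler only logs the failure and returns normally, so a
+        // single object whose context cannot be loaded does not fail the
+        // whole recover_object() call.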
+ logger().error("recover_object saw error code {}, ignoring object {}", + code, soid); + })); + }); +} + +RecoveryBackend::interruptible_future<> +ReplicatedRecoveryBackend::maybe_push_shards( + const hobject_t& soid, + eversion_t need) +{ + return seastar::do_with( + get_shards_to_push(soid), + [this, need, soid](auto &shards) { + return interruptor::parallel_for_each( + shards, + [this, need, soid](auto shard) { + return prep_push(soid, need, shard).then_interruptible([this, soid, shard](auto push) { + auto msg = crimson::make_message<MOSDPGPush>(); + msg->from = pg.get_pg_whoami(); + msg->pgid = pg.get_pgid(); + msg->map_epoch = pg.get_osdmap_epoch(); + msg->min_epoch = pg.get_last_peering_reset(); + msg->pushes.push_back(std::move(push)); + msg->set_priority(pg.get_recovery_op_priority()); + return interruptor::make_interruptible( + shard_services.send_to_osd(shard.osd, + std::move(msg), + pg.get_osdmap_epoch())) + .then_interruptible( + [this, soid, shard] { + return get_recovering(soid).wait_for_pushes(shard); + }); + }); + }); + }).then_interruptible([this, soid] { + auto &recovery = get_recovering(soid); + if (auto push_info = recovery.pushing.begin(); + push_info != recovery.pushing.end()) { + pg.get_recovery_handler()->on_global_recover(soid, + push_info->second.stat, + false); + } else if (recovery.pull_info) { + // no push happened (empty get_shards_to_push()) but pull actually did + pg.get_recovery_handler()->on_global_recover(soid, + recovery.pull_info->stat, + false); + } else { + // no pulls, no pushes + } + return seastar::make_ready_future<>(); + }).handle_exception_interruptible([this, soid](auto e) { + auto &recovery = get_recovering(soid); + if (recovery.obc) { + recovery.obc->drop_recovery_read(); + } + recovering.erase(soid); + return seastar::make_exception_future<>(e); + }); +} + +RecoveryBackend::interruptible_future<> +ReplicatedRecoveryBackend::maybe_pull_missing_obj( + const hobject_t& soid, + eversion_t need) +{ + pg_missing_tracker_t local_missing = pg.get_local_missing(); + if (!local_missing.is_missing(soid)) { + return seastar::make_ready_future<>(); + } + PullOp pull_op; + auto& recovery_waiter = get_recovering(soid); + recovery_waiter.pull_info = + std::make_optional<RecoveryBackend::pull_info_t>(); + auto& pull_info = *recovery_waiter.pull_info; + prepare_pull(pull_op, pull_info, soid, need); + auto msg = crimson::make_message<MOSDPGPull>(); + msg->from = pg.get_pg_whoami(); + msg->set_priority(pg.get_recovery_op_priority()); + msg->pgid = pg.get_pgid(); + msg->map_epoch = pg.get_osdmap_epoch(); + msg->min_epoch = pg.get_last_peering_reset(); + msg->set_pulls({std::move(pull_op)}); + return interruptor::make_interruptible( + shard_services.send_to_osd( + pull_info.from.osd, + std::move(msg), + pg.get_osdmap_epoch() + )).then_interruptible([&recovery_waiter] { + return recovery_waiter.wait_for_pull(); + }); +} + +RecoveryBackend::interruptible_future<> +ReplicatedRecoveryBackend::push_delete( + const hobject_t& soid, + eversion_t need) +{ + logger().debug("{}: {}, {}", __func__, soid, need); + epoch_t min_epoch = pg.get_last_peering_reset(); + + assert(pg.get_acting_recovery_backfill().size() > 0); + return interruptor::parallel_for_each(pg.get_acting_recovery_backfill(), + [this, soid, need, min_epoch](pg_shard_t shard) + -> interruptible_future<> { + if (shard == pg.get_pg_whoami()) + return seastar::make_ready_future<>(); + auto iter = pg.get_shard_missing().find(shard); + if (iter == pg.get_shard_missing().end()) + return 
seastar::make_ready_future<>(); + if (iter->second.is_missing(soid)) { + logger().debug("push_delete: will remove {} from {}", soid, shard); + pg.begin_peer_recover(shard, soid); + spg_t target_pg(pg.get_info().pgid.pgid, shard.shard); + auto msg = crimson::make_message<MOSDPGRecoveryDelete>( + pg.get_pg_whoami(), target_pg, pg.get_osdmap_epoch(), min_epoch); + msg->set_priority(pg.get_recovery_op_priority()); + msg->objects.push_back(std::make_pair(soid, need)); + return interruptor::make_interruptible( + shard_services.send_to_osd(shard.osd, std::move(msg), + pg.get_osdmap_epoch())).then_interruptible( + [this, soid, shard] { + return get_recovering(soid).wait_for_pushes(shard); + }); + } + return seastar::make_ready_future<>(); + }); +} + +RecoveryBackend::interruptible_future<> +ReplicatedRecoveryBackend::handle_recovery_delete( + Ref<MOSDPGRecoveryDelete> m) +{ + logger().debug("{}: {}", __func__, *m); + + auto& p = m->objects.front(); //TODO: only one delete per message for now. + return local_recover_delete(p.first, p.second, pg.get_osdmap_epoch()) + .then_interruptible( + [this, m] { + auto reply = crimson::make_message<MOSDPGRecoveryDeleteReply>(); + reply->from = pg.get_pg_whoami(); + reply->set_priority(m->get_priority()); + reply->pgid = spg_t(pg.get_info().pgid.pgid, m->from.shard); + reply->map_epoch = m->map_epoch; + reply->min_epoch = m->min_epoch; + reply->objects = m->objects; + return shard_services.send_to_osd(m->from.osd, std::move(reply), pg.get_osdmap_epoch()); + }); +} + +RecoveryBackend::interruptible_future<> +ReplicatedRecoveryBackend::on_local_recover_persist( + const hobject_t& soid, + const ObjectRecoveryInfo& _recovery_info, + bool is_delete, + epoch_t epoch_frozen) +{ + logger().debug("{}", __func__); + ceph::os::Transaction t; + pg.get_recovery_handler()->on_local_recover(soid, _recovery_info, is_delete, t); + logger().debug("ReplicatedRecoveryBackend::on_local_recover_persist: do_transaction..."); + return interruptor::make_interruptible( + shard_services.get_store().do_transaction(coll, std::move(t))) + .then_interruptible( + [this, epoch_frozen, last_complete = pg.get_info().last_complete] { + pg.get_recovery_handler()->_committed_pushed_object(epoch_frozen, last_complete); + return seastar::make_ready_future<>(); + }); +} + +RecoveryBackend::interruptible_future<> +ReplicatedRecoveryBackend::local_recover_delete( + const hobject_t& soid, + eversion_t need, + epoch_t epoch_to_freeze) +{ + logger().debug("{}: {}, {}", __func__, soid, need); + return backend->load_metadata(soid).safe_then_interruptible([this] + (auto lomt) -> interruptible_future<> { + if (lomt->os.exists) { + return seastar::do_with(ceph::os::Transaction(), + [this, lomt = std::move(lomt)](auto& txn) { + return backend->remove(lomt->os, txn).then_interruptible( + [this, &txn]() mutable { + logger().debug("ReplicatedRecoveryBackend::local_recover_delete: do_transaction..."); + return shard_services.get_store().do_transaction(coll, + std::move(txn)); + }); + }); + } + return seastar::make_ready_future<>(); + }).safe_then_interruptible([this, soid, epoch_to_freeze, need] { + ObjectRecoveryInfo recovery_info; + recovery_info.soid = soid; + recovery_info.version = need; + return on_local_recover_persist(soid, recovery_info, + true, epoch_to_freeze); + }, PGBackend::load_metadata_ertr::all_same_way( + [this, soid, epoch_to_freeze, need] (auto e) { + ObjectRecoveryInfo recovery_info; + recovery_info.soid = soid; + recovery_info.version = need; + return on_local_recover_persist(soid, 
recovery_info, + true, epoch_to_freeze); + }) + ); +} + +RecoveryBackend::interruptible_future<> +ReplicatedRecoveryBackend::recover_delete( + const hobject_t &soid, eversion_t need) +{ + logger().debug("{}: {}, {}", __func__, soid, need); + + epoch_t cur_epoch = pg.get_osdmap_epoch(); + return seastar::do_with(object_stat_sum_t(), + [this, soid, need, cur_epoch](auto& stat_diff) { + return local_recover_delete(soid, need, cur_epoch).then_interruptible( + [this, &stat_diff, cur_epoch, soid, need]() + -> interruptible_future<> { + if (!pg.has_reset_since(cur_epoch)) { + bool object_missing = false; + for (const auto& shard : pg.get_acting_recovery_backfill()) { + if (shard == pg.get_pg_whoami()) + continue; + if (pg.get_shard_missing(shard)->is_missing(soid)) { + logger().debug("recover_delete: soid {} needs to deleted from replca {}", + soid, shard); + object_missing = true; + break; + } + } + + if (!object_missing) { + stat_diff.num_objects_recovered = 1; + return seastar::make_ready_future<>(); + } else { + return push_delete(soid, need); + } + } + return seastar::make_ready_future<>(); + }).then_interruptible([this, soid, &stat_diff] { + pg.get_recovery_handler()->on_global_recover(soid, stat_diff, true); + return seastar::make_ready_future<>(); + }); + }); +} + +RecoveryBackend::interruptible_future<PushOp> +ReplicatedRecoveryBackend::prep_push( + const hobject_t& soid, + eversion_t need, + pg_shard_t pg_shard) +{ + logger().debug("{}: {}, {}", __func__, soid, need); + + auto& recovery_waiter = get_recovering(soid); + auto& obc = recovery_waiter.obc; + interval_set<uint64_t> data_subset; + if (obc->obs.oi.size) { + data_subset.insert(0, obc->obs.oi.size); + } + const auto& missing = pg.get_shard_missing().find(pg_shard)->second; + const auto it = missing.get_items().find(soid); + assert(it != missing.get_items().end()); + data_subset.intersection_of(it->second.clean_regions.get_dirty_regions()); + logger().debug("prep_push: {} data_subset {} to {}", + soid, data_subset, pg_shard); + + auto& push_info = recovery_waiter.pushing[pg_shard]; + pg.begin_peer_recover(pg_shard, soid); + const auto pmissing_iter = pg.get_shard_missing().find(pg_shard); + const auto missing_iter = pmissing_iter->second.get_items().find(soid); + assert(missing_iter != pmissing_iter->second.get_items().end()); + + push_info.obc = obc; + push_info.recovery_info.size = obc->obs.oi.size; + push_info.recovery_info.copy_subset = data_subset; + push_info.recovery_info.soid = soid; + push_info.recovery_info.oi = obc->obs.oi; + push_info.recovery_info.version = obc->obs.oi.version; + push_info.recovery_info.object_exist = + missing_iter->second.clean_regions.object_is_exist(); + push_info.recovery_progress.omap_complete = + !missing_iter->second.clean_regions.omap_is_dirty(); + + return build_push_op(push_info.recovery_info, + push_info.recovery_progress, + &push_info.stat).then_interruptible( + [this, soid, pg_shard](auto push_op) { + auto& recovery_waiter = get_recovering(soid); + auto& push_info = recovery_waiter.pushing[pg_shard]; + push_info.recovery_progress = push_op.after_progress; + return push_op; + }); +} + +void ReplicatedRecoveryBackend::prepare_pull(PullOp& pull_op, + pull_info_t& pull_info, + const hobject_t& soid, + eversion_t need) { + logger().debug("{}: {}, {}", __func__, soid, need); + + pg_missing_tracker_t local_missing = pg.get_local_missing(); + const auto missing_iter = local_missing.get_items().find(soid); + auto m = pg.get_missing_loc_shards(); + pg_shard_t fromshard = *(m[soid].begin()); + + 
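+  // The pull source is simply the first recorded location of the missing
+  // object; copy_subset below starts as the full object range and is then
+  // intersected with the regions recorded as dirty in clean_regions.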
//TODO: skipped snap objects case for now + pull_op.recovery_info.copy_subset.insert(0, (uint64_t) -1); + pull_op.recovery_info.copy_subset.intersection_of( + missing_iter->second.clean_regions.get_dirty_regions()); + pull_op.recovery_info.size = ((uint64_t) -1); + pull_op.recovery_info.object_exist = + missing_iter->second.clean_regions.object_is_exist(); + pull_op.recovery_info.soid = soid; + pull_op.soid = soid; + pull_op.recovery_progress.data_complete = false; + pull_op.recovery_progress.omap_complete = + !missing_iter->second.clean_regions.omap_is_dirty(); + pull_op.recovery_progress.data_recovered_to = 0; + pull_op.recovery_progress.first = true; + + pull_info.from = fromshard; + pull_info.soid = soid; + pull_info.recovery_info = pull_op.recovery_info; + pull_info.recovery_progress = pull_op.recovery_progress; +} + +RecoveryBackend::interruptible_future<PushOp> +ReplicatedRecoveryBackend::build_push_op( + const ObjectRecoveryInfo& recovery_info, + const ObjectRecoveryProgress& progress, + object_stat_sum_t* stat) +{ + logger().debug("{} {} @{}", + __func__, recovery_info.soid, recovery_info.version); + return seastar::do_with(ObjectRecoveryProgress(progress), + uint64_t(crimson::common::local_conf() + ->osd_recovery_max_chunk), + recovery_info.version, + PushOp(), + [this, &recovery_info, &progress, stat] + (auto& new_progress, auto& available, auto& v, auto& push_op) { + return read_metadata_for_push_op(recovery_info.soid, + progress, new_progress, + v, &push_op + ).then_interruptible([&](eversion_t local_ver) mutable { + // If requestor didn't know the version, use ours + if (v == eversion_t()) { + v = local_ver; + } else if (v != local_ver) { + logger().error("build_push_op: {} push {} v{} failed because local copy is {}", + pg.get_pgid(), recovery_info.soid, recovery_info.version, local_ver); + // TODO: bail out + } + return read_omap_for_push_op(recovery_info.soid, + progress, + new_progress, + available, &push_op); + }).then_interruptible([this, &recovery_info, &progress, + &available, &push_op]() mutable { + logger().debug("build_push_op: available: {}, copy_subset: {}", + available, recovery_info.copy_subset); + return read_object_for_push_op(recovery_info.soid, + recovery_info.copy_subset, + progress.data_recovered_to, + available, &push_op); + }).then_interruptible([&recovery_info, &v, &progress, + &new_progress, stat, &push_op] + (uint64_t recovered_to) mutable { + new_progress.data_recovered_to = recovered_to; + if (new_progress.is_complete(recovery_info)) { + new_progress.data_complete = true; + if (stat) + stat->num_objects_recovered++; + } else if (progress.first && progress.omap_complete) { + // If omap is not changed, we need recovery omap + // when recovery cannot be completed once + new_progress.omap_complete = false; + } + if (stat) { + stat->num_keys_recovered += push_op.omap_entries.size(); + stat->num_bytes_recovered += push_op.data.length(); + } + push_op.version = v; + push_op.soid = recovery_info.soid; + push_op.recovery_info = recovery_info; + push_op.after_progress = new_progress; + push_op.before_progress = progress; + logger().debug("build_push_op: push_op version:" + " {}, push_op data length: {}", + push_op.version, push_op.data.length()); + return seastar::make_ready_future<PushOp>(std::move(push_op)); + }); + }); +} + +RecoveryBackend::interruptible_future<eversion_t> +ReplicatedRecoveryBackend::read_metadata_for_push_op( + const hobject_t& oid, + const ObjectRecoveryProgress& progress, + ObjectRecoveryProgress& new_progress, + eversion_t ver, + 
PushOp* push_op) +{ + logger().debug("{}, {}", __func__, oid); + if (!progress.first) { + return seastar::make_ready_future<eversion_t>(ver); + } + return interruptor::make_interruptible(interruptor::when_all_succeed( + backend->omap_get_header(coll, ghobject_t(oid)).handle_error_interruptible<false>( + crimson::os::FuturizedStore::Shard::read_errorator::all_same_way( + [oid] (const std::error_code& e) { + logger().debug("read_metadata_for_push_op, error {} when getting omap header: {}", e, oid); + return seastar::make_ready_future<bufferlist>(); + })), + interruptor::make_interruptible(store->get_attrs(coll, ghobject_t(oid))) + .handle_error_interruptible<false>( + crimson::os::FuturizedStore::Shard::get_attrs_ertr::all_same_way( + [oid] (const std::error_code& e) { + logger().debug("read_metadata_for_push_op, error {} when getting attrs: {}", e, oid); + return seastar::make_ready_future<crimson::os::FuturizedStore::Shard::attrs_t>(); + })) + )).then_unpack_interruptible([&new_progress, push_op](auto bl, auto attrs) { + if (bl.length() == 0) { + logger().warn("read_metadata_for_push_op: fail to read omap header"); + } else if (attrs.empty()) { + logger().error("read_metadata_for_push_op: fail to read attrs"); + return eversion_t{}; + } + push_op->omap_header.claim_append(std::move(bl)); + for (auto&& [key, val] : attrs) { + push_op->attrset.emplace(std::move(key), std::move(val)); + } + logger().debug("read_metadata_for_push_op: {}", push_op->attrset[OI_ATTR]); + object_info_t oi; + oi.decode_no_oid(push_op->attrset[OI_ATTR]); + new_progress.first = false; + return oi.version; + }); +} + +RecoveryBackend::interruptible_future<uint64_t> +ReplicatedRecoveryBackend::read_object_for_push_op( + const hobject_t& oid, + const interval_set<uint64_t>& copy_subset, + uint64_t offset, + uint64_t max_len, + PushOp* push_op) +{ + if (max_len == 0 || copy_subset.empty()) { + push_op->data_included.clear(); + return seastar::make_ready_future<uint64_t>(offset); + } + // 1. get the extents in the interested range + return interruptor::make_interruptible(backend->fiemap(coll, ghobject_t{oid}, + 0, copy_subset.range_end())).safe_then_interruptible( + [=, this](auto&& fiemap_included) mutable { + interval_set<uint64_t> extents; + try { + extents.intersection_of(copy_subset, std::move(fiemap_included)); + } catch (std::exception &) { + // if fiemap() fails, we will read nothing, as the intersection of + // copy_subset and an empty interval_set would be empty anyway + extents.clear(); + } + // 2. we can read up to "max_len" bytes from "offset", so truncate the + // extents down to this quota. no need to return the number of consumed + // bytes, as this is the last consumer of this quota + push_op->data_included.span_of(extents, offset, max_len); + // 3. read the truncated extents + // TODO: check if the returned extents are pruned + return interruptor::make_interruptible(store->readv(coll, ghobject_t{oid}, + push_op->data_included, 0)); + }).safe_then_interruptible([push_op, range_end=copy_subset.range_end()](auto &&bl) { + push_op->data.claim_append(std::move(bl)); + uint64_t recovered_to = 0; + if (push_op->data_included.empty()) { + // zero filled section, skip to end! 
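+      // No allocated extents survived the intersection for this window, so
+      // (as the comment above says) the remainder is treated as zero-filled
+      // and the recovery cursor jumps straight to the end of copy_subset.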
+ recovered_to = range_end; + } else { + // note down the progress, we will start from there next time + recovered_to = push_op->data_included.range_end(); + } + return seastar::make_ready_future<uint64_t>(recovered_to); + }, PGBackend::read_errorator::all_same_way([](auto e) { + logger().debug("build_push_op: read exception"); + return seastar::make_exception_future<uint64_t>(e); + })); +} + +static std::optional<std::string> nullopt_if_empty(const std::string& s) +{ + return s.empty() ? std::nullopt : std::make_optional(s); +} + +static bool is_too_many_entries_per_chunk(const PushOp* push_op) +{ + const uint64_t entries_per_chunk = + crimson::common::local_conf()->osd_recovery_max_omap_entries_per_chunk; + if (!entries_per_chunk) { + // the limit is disabled + return false; + } + return push_op->omap_entries.size() >= entries_per_chunk; +} + +RecoveryBackend::interruptible_future<> +ReplicatedRecoveryBackend::read_omap_for_push_op( + const hobject_t& oid, + const ObjectRecoveryProgress& progress, + ObjectRecoveryProgress& new_progress, + uint64_t& max_len, + PushOp* push_op) +{ + if (progress.omap_complete) { + return seastar::make_ready_future<>(); + } + return seastar::repeat([&new_progress, &max_len, push_op, &oid, this] { + return shard_services.get_store().omap_get_values( + coll, ghobject_t{oid}, nullopt_if_empty(new_progress.omap_recovered_to) + ).safe_then([&new_progress, &max_len, push_op](const auto& ret) { + const auto& [done, kvs] = ret; + bool stop = done; + // assuming "values.empty() only if done" holds here! + for (const auto& [key, value] : kvs) { + if (is_too_many_entries_per_chunk(push_op)) { + stop = true; + break; + } + if (const uint64_t entry_size = key.size() + value.length(); + entry_size > max_len) { + stop = true; + break; + } else { + max_len -= std::min(max_len, entry_size); + } + push_op->omap_entries.emplace(key, value); + } + if (!push_op->omap_entries.empty()) { + // we iterate in order + new_progress.omap_recovered_to = std::rbegin(push_op->omap_entries)->first; + } + if (done) { + new_progress.omap_complete = true; + } + return seastar::make_ready_future<seastar::stop_iteration>( + stop ? 
seastar::stop_iteration::yes : seastar::stop_iteration::no + ); + }, crimson::os::FuturizedStore::Shard::read_errorator::assert_all{}); + }); +} + +std::vector<pg_shard_t> +ReplicatedRecoveryBackend::get_shards_to_push(const hobject_t& soid) const +{ + std::vector<pg_shard_t> shards; + assert(pg.get_acting_recovery_backfill().size() > 0); + for (const auto& peer : pg.get_acting_recovery_backfill()) { + if (peer == pg.get_pg_whoami()) + continue; + auto shard_missing = + pg.get_shard_missing().find(peer); + assert(shard_missing != pg.get_shard_missing().end()); + if (shard_missing->second.is_missing(soid)) { + shards.push_back(shard_missing->first); + } + } + return shards; +} + +RecoveryBackend::interruptible_future<> +ReplicatedRecoveryBackend::handle_pull(Ref<MOSDPGPull> m) +{ + logger().debug("{}: {}", __func__, *m); + if (pg.can_discard_replica_op(*m)) { + logger().debug("{}: discarding {}", __func__, *m); + return seastar::now(); + } + return seastar::do_with(m->take_pulls(), [this, from=m->from](auto& pulls) { + return interruptor::parallel_for_each(pulls, + [this, from](auto& pull_op) { + const hobject_t& soid = pull_op.soid; + logger().debug("handle_pull: {}", soid); + return backend->stat(coll, ghobject_t(soid)).then_interruptible( + [this, &pull_op](auto st) { + ObjectRecoveryInfo &recovery_info = pull_op.recovery_info; + ObjectRecoveryProgress &progress = pull_op.recovery_progress; + if (progress.first && recovery_info.size == ((uint64_t) -1)) { + // Adjust size and copy_subset + recovery_info.size = st.st_size; + if (st.st_size) { + interval_set<uint64_t> object_range; + object_range.insert(0, st.st_size); + recovery_info.copy_subset.intersection_of(object_range); + } else { + recovery_info.copy_subset.clear(); + } + assert(recovery_info.clone_subset.empty()); + } + return build_push_op(recovery_info, progress, 0); + }).then_interruptible([this, from](auto push_op) { + auto msg = crimson::make_message<MOSDPGPush>(); + msg->from = pg.get_pg_whoami(); + msg->pgid = pg.get_pgid(); + msg->map_epoch = pg.get_osdmap_epoch(); + msg->min_epoch = pg.get_last_peering_reset(); + msg->set_priority(pg.get_recovery_op_priority()); + msg->pushes.push_back(std::move(push_op)); + return shard_services.send_to_osd(from.osd, std::move(msg), + pg.get_osdmap_epoch()); + }); + }); + }); +} + +RecoveryBackend::interruptible_future<bool> +ReplicatedRecoveryBackend::_handle_pull_response( + pg_shard_t from, + PushOp& push_op, + PullOp* response, + ceph::os::Transaction* t) +{ + logger().debug("handle_pull_response {} {} data.size() is {} data_included: {}", + push_op.recovery_info, push_op.after_progress, + push_op.data.length(), push_op.data_included); + + const hobject_t &hoid = push_op.soid; + auto& recovery_waiter = get_recovering(hoid); + auto& pull_info = *recovery_waiter.pull_info; + if (pull_info.recovery_info.size == (uint64_t(-1))) { + pull_info.recovery_info.size = push_op.recovery_info.size; + pull_info.recovery_info.copy_subset.intersection_of( + push_op.recovery_info.copy_subset); + } + + // If primary doesn't have object info and didn't know version + if (pull_info.recovery_info.version == eversion_t()) + pull_info.recovery_info.version = push_op.version; + + auto prepare_waiter = interruptor::make_interruptible( + seastar::make_ready_future<>()); + if (pull_info.recovery_progress.first) { + prepare_waiter = pg.obc_loader.with_obc<RWState::RWNONE>( + pull_info.recovery_info.soid, + [&pull_info, &recovery_waiter, &push_op](auto obc) { + pull_info.obc = obc; + recovery_waiter.obc = obc; + 
obc->obs.oi.decode_no_oid(push_op.attrset.at(OI_ATTR), push_op.soid); + pull_info.recovery_info.oi = obc->obs.oi; + return crimson::osd::PG::load_obc_ertr::now(); + }).handle_error_interruptible(crimson::ct_error::assert_all{}); + }; + return prepare_waiter.then_interruptible( + [this, &pull_info, &push_op, t, response]() mutable { + const bool first = pull_info.recovery_progress.first; + pull_info.recovery_progress = push_op.after_progress; + logger().debug("new recovery_info {}, new progress {}", + pull_info.recovery_info, pull_info.recovery_progress); + interval_set<uint64_t> data_zeros; + { + uint64_t offset = push_op.before_progress.data_recovered_to; + uint64_t length = (push_op.after_progress.data_recovered_to - + push_op.before_progress.data_recovered_to); + if (length) { + data_zeros.insert(offset, length); + } + } + auto [usable_intervals, data] = + trim_pushed_data(pull_info.recovery_info.copy_subset, + push_op.data_included, push_op.data); + bool complete = pull_info.is_complete(); + bool clear_omap = !push_op.before_progress.omap_complete; + return submit_push_data(pull_info.recovery_info, + first, complete, clear_omap, + std::move(data_zeros), std::move(usable_intervals), + std::move(data), std::move(push_op.omap_header), + push_op.attrset, std::move(push_op.omap_entries), t) + .then_interruptible( + [this, response, &pull_info, &push_op, complete, + t, bytes_recovered=data.length()] { + pull_info.stat.num_keys_recovered += push_op.omap_entries.size(); + pull_info.stat.num_bytes_recovered += bytes_recovered; + + if (complete) { + pull_info.stat.num_objects_recovered++; + pg.get_recovery_handler()->on_local_recover( + push_op.soid, get_recovering(push_op.soid).pull_info->recovery_info, + false, *t); + return true; + } else { + response->soid = push_op.soid; + response->recovery_info = pull_info.recovery_info; + response->recovery_progress = pull_info.recovery_progress; + return false; + } + }); + }); +} + +RecoveryBackend::interruptible_future<> +ReplicatedRecoveryBackend::handle_pull_response( + Ref<MOSDPGPush> m) +{ + if (pg.can_discard_replica_op(*m)) { + logger().debug("{}: discarding {}", __func__, *m); + return seastar::now(); + } + const PushOp& push_op = m->pushes[0]; //TODO: only one push per message for now. + if (push_op.version == eversion_t()) { + // replica doesn't have it! 
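+    // A default-constructed version in the reply is the pushing side's way
+    // of signalling that it could not supply the object; record the failed
+    // source and surface an error instead of applying the push.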
+ pg.get_recovery_handler()->on_failed_recover({ m->from }, push_op.soid, + get_recovering(push_op.soid).pull_info->recovery_info.version); + return seastar::make_exception_future<>( + std::runtime_error(fmt::format( + "Error on pushing side {} when pulling obj {}", + m->from, push_op.soid))); + } + + logger().debug("{}: {}", __func__, *m); + return seastar::do_with(PullOp(), [this, m](auto& response) { + return seastar::do_with(ceph::os::Transaction(), m.get(), + [this, &response](auto& t, auto& m) { + pg_shard_t from = m->from; + PushOp& push_op = m->pushes[0]; // only one push per message for now + return _handle_pull_response(from, push_op, &response, &t + ).then_interruptible( + [this, &t](bool complete) { + epoch_t epoch_frozen = pg.get_osdmap_epoch(); + logger().debug("ReplicatedRecoveryBackend::handle_pull_response: do_transaction..."); + return shard_services.get_store().do_transaction(coll, std::move(t)) + .then([this, epoch_frozen, complete, + last_complete = pg.get_info().last_complete] { + pg.get_recovery_handler()->_committed_pushed_object(epoch_frozen, last_complete); + return seastar::make_ready_future<bool>(complete); + }); + }); + }).then_interruptible([this, m, &response](bool complete) { + if (complete) { + auto& push_op = m->pushes[0]; + get_recovering(push_op.soid).set_pulled(); + return seastar::make_ready_future<>(); + } else { + auto reply = crimson::make_message<MOSDPGPull>(); + reply->from = pg.get_pg_whoami(); + reply->set_priority(m->get_priority()); + reply->pgid = pg.get_info().pgid; + reply->map_epoch = m->map_epoch; + reply->min_epoch = m->min_epoch; + reply->set_pulls({std::move(response)}); + return shard_services.send_to_osd(m->from.osd, std::move(reply), pg.get_osdmap_epoch()); + } + }); + }); +} + +RecoveryBackend::interruptible_future<> +ReplicatedRecoveryBackend::_handle_push( + pg_shard_t from, + PushOp &push_op, + PushReplyOp *response, + ceph::os::Transaction *t) +{ + logger().debug("{}", __func__); + + bool first = push_op.before_progress.first; + interval_set<uint64_t> data_zeros; + { + uint64_t offset = push_op.before_progress.data_recovered_to; + uint64_t length = (push_op.after_progress.data_recovered_to - + push_op.before_progress.data_recovered_to); + if (length) { + data_zeros.insert(offset, length); + } + } + bool complete = (push_op.after_progress.data_complete && + push_op.after_progress.omap_complete); + bool clear_omap = !push_op.before_progress.omap_complete; + response->soid = push_op.recovery_info.soid; + + return submit_push_data(push_op.recovery_info, first, complete, clear_omap, + std::move(data_zeros), + std::move(push_op.data_included), + std::move(push_op.data), + std::move(push_op.omap_header), + push_op.attrset, + std::move(push_op.omap_entries), t) + .then_interruptible( + [this, complete, &push_op, t] { + if (complete) { + pg.get_recovery_handler()->on_local_recover( + push_op.recovery_info.soid, push_op.recovery_info, + false, *t); + } + }); +} + +RecoveryBackend::interruptible_future<> +ReplicatedRecoveryBackend::handle_push( + Ref<MOSDPGPush> m) +{ + if (pg.can_discard_replica_op(*m)) { + logger().debug("{}: discarding {}", __func__, *m); + return seastar::now(); + } + if (pg.is_primary()) { + return handle_pull_response(m); + } + + logger().debug("{}: {}", __func__, *m); + return seastar::do_with(PushReplyOp(), [this, m](auto& response) { + PushOp& push_op = m->pushes[0]; // TODO: only one push per message for now + return seastar::do_with(ceph::os::Transaction(), + [this, m, &push_op, &response](auto& t) { + return 
_handle_push(m->from, push_op, &response, &t).then_interruptible( + [this, &t] { + epoch_t epoch_frozen = pg.get_osdmap_epoch(); + logger().debug("ReplicatedRecoveryBackend::handle_push: do_transaction..."); + return interruptor::make_interruptible( + shard_services.get_store().do_transaction(coll, std::move(t))).then_interruptible( + [this, epoch_frozen, last_complete = pg.get_info().last_complete] { + //TODO: this should be grouped with pg.on_local_recover somehow. + pg.get_recovery_handler()->_committed_pushed_object(epoch_frozen, last_complete); + }); + }); + }).then_interruptible([this, m, &response]() mutable { + auto reply = crimson::make_message<MOSDPGPushReply>(); + reply->from = pg.get_pg_whoami(); + reply->set_priority(m->get_priority()); + reply->pgid = pg.get_info().pgid; + reply->map_epoch = m->map_epoch; + reply->min_epoch = m->min_epoch; + std::vector<PushReplyOp> replies = { std::move(response) }; + reply->replies.swap(replies); + return shard_services.send_to_osd(m->from.osd, + std::move(reply), pg.get_osdmap_epoch()); + }); + }); +} + +RecoveryBackend::interruptible_future<std::optional<PushOp>> +ReplicatedRecoveryBackend::_handle_push_reply( + pg_shard_t peer, + const PushReplyOp &op) +{ + const hobject_t& soid = op.soid; + logger().debug("{}, soid {}, from {}", __func__, soid, peer); + auto recovering_iter = recovering.find(soid); + if (recovering_iter == recovering.end() + || !recovering_iter->second->pushing.count(peer)) { + logger().debug("huh, i wasn't pushing {} to osd.{}", soid, peer); + return seastar::make_ready_future<std::optional<PushOp>>(); + } else { + auto& push_info = recovering_iter->second->pushing[peer]; + bool error = push_info.recovery_progress.error; + if (!push_info.recovery_progress.data_complete && !error) { + return build_push_op(push_info.recovery_info, push_info.recovery_progress, + &push_info.stat + ).then_interruptible([&push_info] (auto push_op) { + push_info.recovery_progress = push_op.after_progress; + return seastar::make_ready_future<std::optional<PushOp>>( + std::move(push_op)); + }).handle_exception_interruptible( + [recovering_iter, &push_info, peer] (auto e) { + push_info.recovery_progress.error = true; + recovering_iter->second->set_push_failed(peer, e); + return seastar::make_ready_future<std::optional<PushOp>>(); + }); + } + if (!error) { + pg.get_recovery_handler()->on_peer_recover(peer, + soid, + push_info.recovery_info); + } + recovering_iter->second->set_pushed(peer); + return seastar::make_ready_future<std::optional<PushOp>>(); + } +} + +RecoveryBackend::interruptible_future<> +ReplicatedRecoveryBackend::handle_push_reply( + Ref<MOSDPGPushReply> m) +{ + logger().debug("{}: {}", __func__, *m); + auto from = m->from; + auto& push_reply = m->replies[0]; //TODO: only one reply per message + + return _handle_push_reply(from, push_reply).then_interruptible( + [this, from](std::optional<PushOp> push_op) { + if (push_op) { + auto msg = crimson::make_message<MOSDPGPush>(); + msg->from = pg.get_pg_whoami(); + msg->pgid = pg.get_pgid(); + msg->map_epoch = pg.get_osdmap_epoch(); + msg->min_epoch = pg.get_last_peering_reset(); + msg->set_priority(pg.get_recovery_op_priority()); + msg->pushes.push_back(std::move(*push_op)); + return shard_services.send_to_osd(from.osd, + std::move(msg), + pg.get_osdmap_epoch()); + } else { + return seastar::make_ready_future<>(); + } + }); +} + +std::pair<interval_set<uint64_t>, + bufferlist> +ReplicatedRecoveryBackend::trim_pushed_data( + const interval_set<uint64_t> ©_subset, + const 
interval_set<uint64_t> &intervals_received, + ceph::bufferlist data_received) +{ + logger().debug("{}", __func__); + // what i have is only a subset of what i want + if (intervals_received.subset_of(copy_subset)) { + return {intervals_received, data_received}; + } + // only collect the extents included by copy_subset and intervals_received + interval_set<uint64_t> intervals_usable; + bufferlist data_usable; + intervals_usable.intersection_of(copy_subset, intervals_received); + uint64_t have_off = 0; + for (auto [have_start, have_len] : intervals_received) { + interval_set<uint64_t> want; + want.insert(have_start, have_len); + want.intersection_of(copy_subset); + for (auto [want_start, want_len] : want) { + bufferlist sub; + uint64_t data_off = have_off + (want_start - have_start); + sub.substr_of(data_received, data_off, want_len); + data_usable.claim_append(sub); + } + have_off += have_len; + } + return {intervals_usable, data_usable}; +} + +RecoveryBackend::interruptible_future<hobject_t> +ReplicatedRecoveryBackend::prep_push_target( + const ObjectRecoveryInfo& recovery_info, + bool first, + bool complete, + bool clear_omap, + ObjectStore::Transaction* t, + const map<string, bufferlist, less<>>& attrs, + bufferlist&& omap_header) +{ + if (!first) { + return seastar::make_ready_future<hobject_t>( + get_temp_recovery_object(recovery_info.soid, + recovery_info.version)); + } + + ghobject_t target_oid; + if (complete) { + // overwrite the original object + target_oid = ghobject_t(recovery_info.soid); + } else { + target_oid = ghobject_t(get_temp_recovery_object(recovery_info.soid, + recovery_info.version)); + logger().debug("{}: Adding oid {} in the temp collection", + __func__, target_oid); + add_temp_obj(target_oid.hobj); + } + // create a new object + if (!complete || !recovery_info.object_exist) { + t->remove(coll->get_cid(), target_oid); + t->touch(coll->get_cid(), target_oid); + object_info_t oi; + oi.decode_no_oid(attrs.at(OI_ATTR)); + t->set_alloc_hint(coll->get_cid(), target_oid, + oi.expected_object_size, + oi.expected_write_size, + oi.alloc_hint_flags); + } + if (complete) { + // remove xattr and update later if overwrite on original object + t->rmattrs(coll->get_cid(), target_oid); + // if need update omap, clear the previous content first + if (clear_omap) { + t->omap_clear(coll->get_cid(), target_oid); + } + } + t->truncate(coll->get_cid(), target_oid, recovery_info.size); + if (omap_header.length()) { + t->omap_setheader(coll->get_cid(), target_oid, omap_header); + } + if (complete || !recovery_info.object_exist) { + return seastar::make_ready_future<hobject_t>(target_oid.hobj); + } + // clone overlap content in local object if using a new object + return interruptor::make_interruptible(store->stat(coll, ghobject_t(recovery_info.soid))) + .then_interruptible( + [this, &recovery_info, t, target_oid] (auto st) { + // TODO: pg num bytes counting + uint64_t local_size = std::min(recovery_info.size, (uint64_t)st.st_size); + interval_set<uint64_t> local_intervals_included, local_intervals_excluded; + if (local_size) { + local_intervals_included.insert(0, local_size); + local_intervals_excluded.intersection_of(local_intervals_included, recovery_info.copy_subset); + local_intervals_included.subtract(local_intervals_excluded); + } + for (auto [off, len] : local_intervals_included) { + logger().debug(" clone_range {} {}~{}", + recovery_info.soid, off, len); + t->clone_range(coll->get_cid(), ghobject_t(recovery_info.soid), + target_oid, off, len, off); + } + return 
seastar::make_ready_future<hobject_t>(target_oid.hobj); + }); +} +RecoveryBackend::interruptible_future<> +ReplicatedRecoveryBackend::submit_push_data( + const ObjectRecoveryInfo &recovery_info, + bool first, + bool complete, + bool clear_omap, + interval_set<uint64_t>&& data_zeros, + interval_set<uint64_t>&& intervals_included, + bufferlist&& data_included, + bufferlist&& omap_header, + const map<string, bufferlist, less<>> &attrs, + map<string, bufferlist>&& omap_entries, + ObjectStore::Transaction *t) +{ + logger().debug("{}", __func__); + return prep_push_target(recovery_info, first, complete, + clear_omap, t, attrs, + std::move(omap_header)).then_interruptible( + [this, + &recovery_info, t, + first, complete, + data_zeros=std::move(data_zeros), + intervals_included=std::move(intervals_included), + data_included=std::move(data_included), + omap_entries=std::move(omap_entries), + &attrs](auto target_oid) mutable { + + uint32_t fadvise_flags = CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL; + // Punch zeros for data, if fiemap indicates nothing but it is marked dirty + if (!data_zeros.empty()) { + data_zeros.intersection_of(recovery_info.copy_subset); + assert(intervals_included.subset_of(data_zeros)); + data_zeros.subtract(intervals_included); + + logger().debug("submit_push_data recovering object {} copy_subset: {} " + "intervals_included: {} data_zeros: {}", + recovery_info.soid, recovery_info.copy_subset, + intervals_included, data_zeros); + + for (auto [start, len] : data_zeros) { + t->zero(coll->get_cid(), ghobject_t(target_oid), start, len); + } + } + uint64_t off = 0; + for (auto [start, len] : intervals_included) { + bufferlist bit; + bit.substr_of(data_included, off, len); + t->write(coll->get_cid(), ghobject_t(target_oid), + start, len, bit, fadvise_flags); + off += len; + } + + if (!omap_entries.empty()) + t->omap_setkeys(coll->get_cid(), ghobject_t(target_oid), omap_entries); + if (!attrs.empty()) + t->setattrs(coll->get_cid(), ghobject_t(target_oid), attrs); + + if (complete) { + if (!first) { + logger().debug("submit_push_data: Removing oid {} from the temp collection", + target_oid); + clear_temp_obj(target_oid); + t->remove(coll->get_cid(), ghobject_t(recovery_info.soid)); + t->collection_move_rename(coll->get_cid(), ghobject_t(target_oid), + coll->get_cid(), ghobject_t(recovery_info.soid)); + } + submit_push_complete(recovery_info, t); + } + logger().debug("submit_push_data: done"); + return seastar::make_ready_future<>(); + }); +} + +void ReplicatedRecoveryBackend::submit_push_complete( + const ObjectRecoveryInfo &recovery_info, + ObjectStore::Transaction *t) +{ + for (const auto& [oid, extents] : recovery_info.clone_subset) { + for (const auto& [off, len] : extents) { + logger().debug(" clone_range {} {}~{}", oid, off, len); + t->clone_range(coll->get_cid(), ghobject_t(oid), ghobject_t(recovery_info.soid), + off, len, off); + } + } +} + +RecoveryBackend::interruptible_future<> +ReplicatedRecoveryBackend::handle_recovery_delete_reply( + Ref<MOSDPGRecoveryDeleteReply> m) +{ + auto& p = m->objects.front(); + hobject_t soid = p.first; + ObjectRecoveryInfo recovery_info; + recovery_info.version = p.second; + pg.get_recovery_handler()->on_peer_recover(m->from, soid, recovery_info); + get_recovering(soid).set_pushed(m->from); + return seastar::now(); +} + +RecoveryBackend::interruptible_future<> +ReplicatedRecoveryBackend::handle_recovery_op( + Ref<MOSDFastDispatchOp> m, + crimson::net::ConnectionRef conn) +{ + switch (m->get_header().type) { + case MSG_OSD_PG_PULL: + return 
handle_pull(boost::static_pointer_cast<MOSDPGPull>(m)); + case MSG_OSD_PG_PUSH: + return handle_push(boost::static_pointer_cast<MOSDPGPush>(m)); + case MSG_OSD_PG_PUSH_REPLY: + return handle_push_reply( + boost::static_pointer_cast<MOSDPGPushReply>(m)); + case MSG_OSD_PG_RECOVERY_DELETE: + return handle_recovery_delete( + boost::static_pointer_cast<MOSDPGRecoveryDelete>(m)); + case MSG_OSD_PG_RECOVERY_DELETE_REPLY: + return handle_recovery_delete_reply( + boost::static_pointer_cast<MOSDPGRecoveryDeleteReply>(m)); + default: + // delegate to parent class for handling backend-agnostic recovery ops. + return RecoveryBackend::handle_recovery_op(std::move(m), conn); + } +} + diff --git a/src/crimson/osd/replicated_recovery_backend.h b/src/crimson/osd/replicated_recovery_backend.h new file mode 100644 index 000000000..b023b7417 --- /dev/null +++ b/src/crimson/osd/replicated_recovery_backend.h @@ -0,0 +1,169 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "crimson/common/interruptible_future.h" +#include "crimson/osd/pg_interval_interrupt_condition.h" +#include "crimson/osd/recovery_backend.h" + +#include "messages/MOSDPGPull.h" +#include "messages/MOSDPGPush.h" +#include "messages/MOSDPGPushReply.h" +#include "messages/MOSDPGRecoveryDelete.h" +#include "messages/MOSDPGRecoveryDeleteReply.h" +#include "os/ObjectStore.h" + +class ReplicatedRecoveryBackend : public RecoveryBackend { +public: + ReplicatedRecoveryBackend(crimson::osd::PG& pg, + crimson::osd::ShardServices& shard_services, + crimson::os::CollectionRef coll, + PGBackend* backend) + : RecoveryBackend(pg, shard_services, coll, backend) + {} + interruptible_future<> handle_recovery_op( + Ref<MOSDFastDispatchOp> m, + crimson::net::ConnectionRef conn) final; + + interruptible_future<> recover_object( + const hobject_t& soid, + eversion_t need) final; + interruptible_future<> recover_delete( + const hobject_t& soid, + eversion_t need) final; + interruptible_future<> push_delete( + const hobject_t& soid, + eversion_t need) final; +protected: + interruptible_future<> handle_pull( + Ref<MOSDPGPull> m); + interruptible_future<> handle_pull_response( + Ref<MOSDPGPush> m); + interruptible_future<> handle_push( + Ref<MOSDPGPush> m); + interruptible_future<> handle_push_reply( + Ref<MOSDPGPushReply> m); + interruptible_future<> handle_recovery_delete( + Ref<MOSDPGRecoveryDelete> m); + interruptible_future<> handle_recovery_delete_reply( + Ref<MOSDPGRecoveryDeleteReply> m); + interruptible_future<PushOp> prep_push( + const hobject_t& soid, + eversion_t need, + pg_shard_t pg_shard); + void prepare_pull( + PullOp& pull_op, + pull_info_t& pull_info, + const hobject_t& soid, + eversion_t need); + std::vector<pg_shard_t> get_shards_to_push( + const hobject_t& soid) const; + interruptible_future<PushOp> build_push_op( + const ObjectRecoveryInfo& recovery_info, + const ObjectRecoveryProgress& progress, + object_stat_sum_t* stat); + /// @returns true if this push op is the last push op for + /// recovery @c pop.soid + interruptible_future<bool> _handle_pull_response( + pg_shard_t from, + PushOp& push_op, + PullOp* response, + ceph::os::Transaction* t); + std::pair<interval_set<uint64_t>, ceph::bufferlist> trim_pushed_data( + const interval_set<uint64_t> ©_subset, + const interval_set<uint64_t> &intervals_received, + ceph::bufferlist data_received); + interruptible_future<> submit_push_data( + const ObjectRecoveryInfo &recovery_info, + bool first, + bool complete, + bool 
clear_omap, + interval_set<uint64_t>&& data_zeros, + interval_set<uint64_t>&& intervals_included, + ceph::bufferlist&& data_included, + ceph::bufferlist&& omap_header, + const std::map<std::string, bufferlist, std::less<>> &attrs, + std::map<std::string, bufferlist>&& omap_entries, + ceph::os::Transaction *t); + void submit_push_complete( + const ObjectRecoveryInfo &recovery_info, + ObjectStore::Transaction *t); + interruptible_future<> _handle_push( + pg_shard_t from, + PushOp& push_op, + PushReplyOp *response, + ceph::os::Transaction *t); + interruptible_future<std::optional<PushOp>> _handle_push_reply( + pg_shard_t peer, + const PushReplyOp &op); + interruptible_future<> on_local_recover_persist( + const hobject_t& soid, + const ObjectRecoveryInfo& _recovery_info, + bool is_delete, + epoch_t epoch_to_freeze); + interruptible_future<> local_recover_delete( + const hobject_t& soid, + eversion_t need, + epoch_t epoch_frozen); + seastar::future<> on_stop() final { + return seastar::now(); + } +private: + /// pull missing object from peer + interruptible_future<> maybe_pull_missing_obj( + const hobject_t& soid, + eversion_t need); + + /// load object context for recovery if it is not ready yet + using load_obc_ertr = crimson::errorator< + crimson::ct_error::object_corrupted>; + using load_obc_iertr = + ::crimson::interruptible::interruptible_errorator< + ::crimson::osd::IOInterruptCondition, + load_obc_ertr>; + + interruptible_future<> maybe_push_shards( + const hobject_t& soid, + eversion_t need); + + /// read the data attached to given object. the size of them is supposed to + /// be relatively small. + /// + /// @return @c oi.version + interruptible_future<eversion_t> read_metadata_for_push_op( + const hobject_t& oid, + const ObjectRecoveryProgress& progress, + ObjectRecoveryProgress& new_progress, + eversion_t ver, + PushOp* push_op); + /// read the remaining extents of object to be recovered and fill push_op + /// with them + /// + /// @param oid object being recovered + /// @param copy_subset extents we want + /// @param offset the offset in object from where we should read + /// @return the new offset + interruptible_future<uint64_t> read_object_for_push_op( + const hobject_t& oid, + const interval_set<uint64_t>& copy_subset, + uint64_t offset, + uint64_t max_len, + PushOp* push_op); + interruptible_future<> read_omap_for_push_op( + const hobject_t& oid, + const ObjectRecoveryProgress& progress, + ObjectRecoveryProgress& new_progress, + uint64_t& max_len, + PushOp* push_op); + interruptible_future<hobject_t> prep_push_target( + const ObjectRecoveryInfo &recovery_info, + bool first, + bool complete, + bool clear_omap, + ObjectStore::Transaction* t, + const std::map<std::string, bufferlist, std::less<>> &attrs, + bufferlist&& omap_header); + using interruptor = crimson::interruptible::interruptor< + crimson::osd::IOInterruptCondition>; +}; diff --git a/src/crimson/osd/scheduler/mclock_scheduler.cc b/src/crimson/osd/scheduler/mclock_scheduler.cc new file mode 100644 index 000000000..006e4816c --- /dev/null +++ b/src/crimson/osd/scheduler/mclock_scheduler.cc @@ -0,0 +1,165 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 Red Hat Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#include <memory> +#include <functional> + +#include "crimson/osd/scheduler/mclock_scheduler.h" +#include "common/dout.h" + +namespace dmc = crimson::dmclock; +using namespace std::placeholders; + +#define dout_context cct +#define dout_subsys ceph_subsys_osd +#undef dout_prefix +#define dout_prefix *_dout + + +namespace crimson::osd::scheduler { + +mClockScheduler::mClockScheduler(ConfigProxy &conf) : + scheduler( + std::bind(&mClockScheduler::ClientRegistry::get_info, + &client_registry, + _1), + dmc::AtLimit::Allow, + conf.get_val<double>("osd_mclock_scheduler_anticipation_timeout")) +{ + conf.add_observer(this); + client_registry.update_from_config(conf); +} + +void mClockScheduler::ClientRegistry::update_from_config(const ConfigProxy &conf) +{ + default_external_client_info.update( + conf.get_val<double>("osd_mclock_scheduler_client_res"), + conf.get_val<uint64_t>("osd_mclock_scheduler_client_wgt"), + conf.get_val<double>("osd_mclock_scheduler_client_lim")); + + internal_client_infos[ + static_cast<size_t>(scheduler_class_t::background_recovery)].update( + conf.get_val<double>("osd_mclock_scheduler_background_recovery_res"), + conf.get_val<uint64_t>("osd_mclock_scheduler_background_recovery_wgt"), + conf.get_val<double>("osd_mclock_scheduler_background_recovery_lim")); + + internal_client_infos[ + static_cast<size_t>(scheduler_class_t::background_best_effort)].update( + conf.get_val<double>("osd_mclock_scheduler_background_best_effort_res"), + conf.get_val<uint64_t>("osd_mclock_scheduler_background_best_effort_wgt"), + conf.get_val<double>("osd_mclock_scheduler_background_best_effort_lim")); +} + +const dmc::ClientInfo *mClockScheduler::ClientRegistry::get_external_client( + const client_profile_id_t &client) const +{ + auto ret = external_client_infos.find(client); + if (ret == external_client_infos.end()) + return &default_external_client_info; + else + return &(ret->second); +} + +const dmc::ClientInfo *mClockScheduler::ClientRegistry::get_info( + const scheduler_id_t &id) const { + switch (id.class_id) { + case scheduler_class_t::immediate: + ceph_assert(0 == "Cannot schedule immediate"); + return (dmc::ClientInfo*)nullptr; + case scheduler_class_t::repop: + case scheduler_class_t::client: + return get_external_client(id.client_profile_id); + default: + ceph_assert(static_cast<size_t>(id.class_id) < internal_client_infos.size()); + return &internal_client_infos[static_cast<size_t>(id.class_id)]; + } +} + +void mClockScheduler::dump(ceph::Formatter &f) const +{ +} + +void mClockScheduler::enqueue(item_t&& item) +{ + auto id = get_scheduler_id(item); + auto cost = item.params.cost; + + if (scheduler_class_t::immediate == item.params.klass) { + immediate.push_front(std::move(item)); + } else { + scheduler.add_request( + std::move(item), + id, + cost); + } +} + +void mClockScheduler::enqueue_front(item_t&& item) +{ + immediate.push_back(std::move(item)); + // TODO: item may not be immediate, update mclock machinery to permit + // putting the item back in the queue +} + +item_t mClockScheduler::dequeue() +{ + if (!immediate.empty()) { + auto ret = std::move(immediate.back()); + immediate.pop_back(); + return ret; + } else { + mclock_queue_t::PullReq result = scheduler.pull_request(); + if (result.is_future()) { + ceph_assert( + 0 == "Not implemented, user would have to be able to be woken up"); + return std::move(*(item_t*)nullptr); + } else if (result.is_none()) { + ceph_assert( + 0 == "Impossible, must have checked empty() first"); + return std::move(*(item_t*)nullptr); 
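+	// (Editor note, descriptive only: neither assert branch above can fall
+	// through in practice -- ceph_assert() aborts -- so the moves from a null
+	// item_t are never executed; they exist only to satisfy the return type.)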
+ } else { + ceph_assert(result.is_retn()); + + auto &retn = result.get_retn(); + return std::move(*retn.request); + } + } +} + +const char** mClockScheduler::get_tracked_conf_keys() const +{ + static const char* KEYS[] = { + "osd_mclock_scheduler_client_res", + "osd_mclock_scheduler_client_wgt", + "osd_mclock_scheduler_client_lim", + "osd_mclock_scheduler_background_recovery_res", + "osd_mclock_scheduler_background_recovery_wgt", + "osd_mclock_scheduler_background_recovery_lim", + "osd_mclock_scheduler_background_best_effort_res", + "osd_mclock_scheduler_background_best_effort_wgt", + "osd_mclock_scheduler_background_best_effort_lim", + NULL + }; + return KEYS; +} + +void mClockScheduler::handle_conf_change( + const ConfigProxy& conf, + const std::set<std::string> &changed) +{ + client_registry.update_from_config(conf); +} + +} diff --git a/src/crimson/osd/scheduler/mclock_scheduler.h b/src/crimson/osd/scheduler/mclock_scheduler.h new file mode 100644 index 000000000..153fc758b --- /dev/null +++ b/src/crimson/osd/scheduler/mclock_scheduler.h @@ -0,0 +1,125 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 Red Hat Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#pragma once + +#include <ostream> +#include <map> +#include <vector> + +#include "boost/variant.hpp" + +#include "dmclock/src/dmclock_server.h" + +#include "crimson/osd/scheduler/scheduler.h" +#include "common/config.h" +#include "common/ceph_context.h" + + +namespace crimson::osd::scheduler { + +using client_id_t = uint64_t; +using profile_id_t = uint64_t; + +struct client_profile_id_t { + client_id_t client_id; + profile_id_t profile_id; + auto operator<=>(const client_profile_id_t&) const = default; +}; + + +struct scheduler_id_t { + scheduler_class_t class_id; + client_profile_id_t client_profile_id; + auto operator<=>(const scheduler_id_t&) const = default; +}; + +/** + * Scheduler implementation based on mclock. 
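+ *
+ * A short sketch pending the TODO below: ClientRegistry::update_from_config()
+ * feeds the osd_mclock_scheduler_{client,background_recovery,
+ * background_best_effort}_{res,wgt,lim} options into per-class dmclock
+ * ClientInfo entries, i.e. the reservation, weight and limit each class of
+ * requests gets from the mclock queue.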
+ * + * TODO: explain configs + */ +class mClockScheduler : public Scheduler, md_config_obs_t { + + class ClientRegistry { + std::array< + crimson::dmclock::ClientInfo, + static_cast<size_t>(scheduler_class_t::client) + > internal_client_infos = { + // Placeholder, gets replaced with configured values + crimson::dmclock::ClientInfo(1, 1, 1), + crimson::dmclock::ClientInfo(1, 1, 1) + }; + + crimson::dmclock::ClientInfo default_external_client_info = {1, 1, 1}; + std::map<client_profile_id_t, + crimson::dmclock::ClientInfo> external_client_infos; + const crimson::dmclock::ClientInfo *get_external_client( + const client_profile_id_t &client) const; + public: + void update_from_config(const ConfigProxy &conf); + const crimson::dmclock::ClientInfo *get_info( + const scheduler_id_t &id) const; + } client_registry; + + using mclock_queue_t = crimson::dmclock::PullPriorityQueue< + scheduler_id_t, + item_t, + true, + true, + 2>; + mclock_queue_t scheduler; + std::list<item_t> immediate; + + static scheduler_id_t get_scheduler_id(const item_t &item) { + return scheduler_id_t{ + item.params.klass, + client_profile_id_t{ + item.params.owner, + 0 + } + }; + } + +public: + mClockScheduler(ConfigProxy &conf); + + // Enqueue op in the back of the regular queue + void enqueue(item_t &&item) final; + + // Enqueue the op in the front of the regular queue + void enqueue_front(item_t &&item) final; + + // Return an op to be dispatch + item_t dequeue() final; + + // Returns if the queue is empty + bool empty() const final { + return immediate.empty() && scheduler.empty(); + } + + // Formatted output of the queue + void dump(ceph::Formatter &f) const final; + + void print(std::ostream &ostream) const final { + ostream << "mClockScheduler"; + } + + const char** get_tracked_conf_keys() const final; + void handle_conf_change(const ConfigProxy& conf, + const std::set<std::string> &changed) final; +}; + +} diff --git a/src/crimson/osd/scheduler/scheduler.cc b/src/crimson/osd/scheduler/scheduler.cc new file mode 100644 index 000000000..c85cb388e --- /dev/null +++ b/src/crimson/osd/scheduler/scheduler.cc @@ -0,0 +1,181 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2019 Red Hat Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include <ostream> + +#include <seastar/core/print.hh> + +#include "crimson/osd/scheduler/scheduler.h" +#include "crimson/osd/scheduler/mclock_scheduler.h" +#include "common/WeightedPriorityQueue.h" + +namespace crimson::osd::scheduler { + +std::ostream &operator<<(std::ostream &lhs, const scheduler_class_t &c) +{ + switch (c) { + case scheduler_class_t::background_best_effort: + return lhs << "background_best_effort"; + case scheduler_class_t::background_recovery: + return lhs << "background_recovery"; + case scheduler_class_t::client: + return lhs << "client"; + case scheduler_class_t::repop: + return lhs << "repop"; + case scheduler_class_t::immediate: + return lhs << "immediate"; + default: + return lhs; + } +} + +/** + * Implements Scheduler in terms of OpQueue + * + * Templated on queue type to avoid dynamic dispatch, T should implement + * OpQueue<Scheduleritem_t, client_t>. 
This adapter is mainly responsible for + * the boilerplate priority cutoff/strict concept which is needed for + * OpQueue based implementations. + */ +template <typename T> +class ClassedOpQueueScheduler final : public Scheduler { + const scheduler_class_t cutoff; + T queue; + + using priority_t = uint64_t; + std::array< + priority_t, + static_cast<size_t>(scheduler_class_t::immediate) + > priority_map = { + // Placeholder, gets replaced with configured values + 0, 0, 0 + }; + + static scheduler_class_t get_io_prio_cut(ConfigProxy &conf) { + if (conf.get_val<std::string>("osd_op_queue_cut_off") == "debug_random") { + srand(time(NULL)); + return (rand() % 2 < 1) ? + scheduler_class_t::repop : scheduler_class_t::immediate; + } else if (conf.get_val<std::string>("osd_op_queue_cut_off") == "high") { + return scheduler_class_t::immediate; + } else { + return scheduler_class_t::repop; + } + } + + bool use_strict(scheduler_class_t kl) const { + return static_cast<uint8_t>(kl) >= static_cast<uint8_t>(cutoff); + } + + priority_t get_priority(scheduler_class_t kl) const { + ceph_assert(static_cast<size_t>(kl) < + static_cast<size_t>(scheduler_class_t::immediate)); + return priority_map[static_cast<size_t>(kl)]; + } + +public: + template <typename... Args> + ClassedOpQueueScheduler(ConfigProxy &conf, Args&&... args) : + cutoff(get_io_prio_cut(conf)), + queue(std::forward<Args>(args)...) + { + priority_map[ + static_cast<size_t>(scheduler_class_t::background_best_effort) + ] = conf.get_val<uint64_t>("osd_scrub_priority"); + priority_map[ + static_cast<size_t>(scheduler_class_t::background_recovery) + ] = conf.get_val<uint64_t>("osd_recovery_op_priority"); + priority_map[ + static_cast<size_t>(scheduler_class_t::client) + ] = conf.get_val<uint64_t>("osd_client_op_priority"); + priority_map[ + static_cast<size_t>(scheduler_class_t::repop) + ] = conf.get_val<uint64_t>("osd_client_op_priority"); + } + + void enqueue(item_t &&item) final { + if (use_strict(item.params.klass)) + queue.enqueue_strict( + item.params.owner, get_priority(item.params.klass), std::move(item)); + else + queue.enqueue( + item.params.owner, get_priority(item.params.klass), + item.params.cost, std::move(item)); + } + + void enqueue_front(item_t &&item) final { + if (use_strict(item.params.klass)) + queue.enqueue_strict_front( + item.params.owner, get_priority(item.params.klass), std::move(item)); + else + queue.enqueue_front( + item.params.owner, get_priority(item.params.klass), + item.params.cost, std::move(item)); + } + + bool empty() const final { + return queue.empty(); + } + + item_t dequeue() final { + return queue.dequeue(); + } + + void dump(ceph::Formatter &f) const final { + return queue.dump(&f); + } + + void print(std::ostream &out) const final { + out << "ClassedOpQueueScheduler(queue="; + queue.print(out); + out << ", cutoff=" << cutoff << ")"; + } + + ~ClassedOpQueueScheduler() final {}; +}; + +SchedulerRef make_scheduler(ConfigProxy &conf) +{ + const std::string _type = conf.get_val<std::string>("osd_op_queue"); + const std::string *type = &_type; + if (*type == "debug_random") { + static const std::string index_lookup[] = { "mclock_scheduler", + "wpq" }; + srand(time(NULL)); + unsigned which = rand() % (sizeof(index_lookup) / sizeof(index_lookup[0])); + type = &index_lookup[which]; + } + + if (*type == "wpq" ) { + // default is 'wpq' + return std::make_unique< + ClassedOpQueueScheduler<WeightedPriorityQueue<item_t, client_t>>>( + conf, + conf.get_val<uint64_t>("osd_op_pq_max_tokens_per_priority"), + 
conf->osd_op_pq_min_cost + ); + } else if (*type == "mclock_scheduler") { + return std::make_unique<mClockScheduler>(conf); + } else { + ceph_assert("Invalid choice of wq" == 0); + return std::unique_ptr<mClockScheduler>(); + } +} + +std::ostream &operator<<(std::ostream &lhs, const Scheduler &rhs) { + rhs.print(lhs); + return lhs; +} + +} diff --git a/src/crimson/osd/scheduler/scheduler.h b/src/crimson/osd/scheduler/scheduler.h new file mode 100644 index 000000000..a014991ab --- /dev/null +++ b/src/crimson/osd/scheduler/scheduler.h @@ -0,0 +1,82 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2019 Red Hat Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#pragma once + +#include <seastar/core/future.hh> +#include <ostream> + +#include "crimson/common/config_proxy.h" + +namespace crimson::osd::scheduler { + +enum class scheduler_class_t : uint8_t { + background_best_effort = 0, + background_recovery, + client, + repop, + immediate, +}; + +std::ostream &operator<<(std::ostream &, const scheduler_class_t &); + +using client_t = uint64_t; +using cost_t = uint64_t; + +struct params_t { + cost_t cost = 1; + client_t owner; + scheduler_class_t klass; +}; + +struct item_t { + params_t params; + seastar::promise<> wake; +}; + +/** + * Base interface for classes responsible for choosing + * op processing order in the OSD. + */ +class Scheduler { +public: + // Enqueue op for scheduling + virtual void enqueue(item_t &&item) = 0; + + // Enqueue op for processing as though it were enqueued prior + // to other items already scheduled. 
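+  // (In this tree, mClockScheduler::enqueue_front() currently approximates
+  // this by parking the item on its `immediate` list; see the TODO there.)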
+ virtual void enqueue_front(item_t &&item) = 0; + + // Returns true iff there are no ops scheduled + virtual bool empty() const = 0; + + // Return next op to be processed + virtual item_t dequeue() = 0; + + // Dump formatted representation for the queue + virtual void dump(ceph::Formatter &f) const = 0; + + // Print human readable brief description with relevant parameters + virtual void print(std::ostream &out) const = 0; + + // Destructor + virtual ~Scheduler() {}; +}; + +std::ostream &operator<<(std::ostream &lhs, const Scheduler &); +using SchedulerRef = std::unique_ptr<Scheduler>; + +SchedulerRef make_scheduler(ConfigProxy &); + +} diff --git a/src/crimson/osd/shard_services.cc b/src/crimson/osd/shard_services.cc new file mode 100644 index 000000000..a6431305d --- /dev/null +++ b/src/crimson/osd/shard_services.cc @@ -0,0 +1,761 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <boost/smart_ptr/make_local_shared.hpp> + +#include "crimson/osd/shard_services.h" + +#include "messages/MOSDAlive.h" +#include "messages/MOSDMap.h" +#include "messages/MOSDPGCreated.h" +#include "messages/MOSDPGTemp.h" + +#include "osd/osd_perf_counters.h" +#include "osd/PeeringState.h" +#include "crimson/common/config_proxy.h" +#include "crimson/mgr/client.h" +#include "crimson/mon/MonClient.h" +#include "crimson/net/Messenger.h" +#include "crimson/net/Connection.h" +#include "crimson/os/cyanstore/cyan_store.h" +#include "crimson/osd/osdmap_service.h" +#include "crimson/osd/osd_operations/pg_advance_map.h" +#include "crimson/osd/pg.h" +#include "crimson/osd/pg_meta.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_osd); + } +} + +using std::vector; + +namespace crimson::osd { + +PerShardState::PerShardState( + int whoami, + ceph::mono_time startup_time, + PerfCounters *perf, + PerfCounters *recoverystate_perf, + crimson::os::FuturizedStore &store, + OSDState &osd_state) + : whoami(whoami), + store(store.get_sharded_store()), + osd_state(osd_state), + osdmap_gate("PerShardState::osdmap_gate"), + perf(perf), recoverystate_perf(recoverystate_perf), + throttler(crimson::common::local_conf()), + next_tid( + static_cast<ceph_tid_t>(seastar::this_shard_id()) << + (std::numeric_limits<ceph_tid_t>::digits - 8)), + startup_time(startup_time) +{} + +seastar::future<> PerShardState::dump_ops_in_flight(Formatter *f) const +{ + registry.for_each_op([f](const auto &op) { + op.dump(f); + }); + return seastar::now(); +} + +seastar::future<> PerShardState::stop_pgs() +{ + assert_core(); + return seastar::parallel_for_each( + pg_map.get_pgs(), + [](auto& p) { + return p.second->stop(); + }); +} + +std::map<pg_t, pg_stat_t> PerShardState::get_pg_stats() const +{ + assert_core(); + std::map<pg_t, pg_stat_t> ret; + for (auto [pgid, pg] : pg_map.get_pgs()) { + if (pg->is_primary()) { + auto stats = pg->get_stats(); + // todo: update reported_epoch,reported_seq,last_fresh + stats.reported_epoch = osdmap->get_epoch(); + ret.emplace(pgid.pgid, std::move(stats)); + } + } + return ret; +} + +seastar::future<> PerShardState::broadcast_map_to_pgs( + ShardServices &shard_services, + epoch_t epoch) +{ + assert_core(); + auto &pgs = pg_map.get_pgs(); + return seastar::parallel_for_each( + pgs.begin(), pgs.end(), + [=, &shard_services](auto& pg) { + return shard_services.start_operation<PGAdvanceMap>( + shard_services, + pg.second, epoch, + PeeringCtx{}, false).second; + }); +} + +Ref<PG> PerShardState::get_pg(spg_t pgid) +{ + 
assert_core(); + return pg_map.get_pg(pgid); +} + +HeartbeatStampsRef PerShardState::get_hb_stamps(int peer) +{ + assert_core(); + auto [stamps, added] = heartbeat_stamps.try_emplace(peer); + if (added) { + stamps->second = ceph::make_ref<HeartbeatStamps>(peer); + } + return stamps->second; +} + +OSDSingletonState::OSDSingletonState( + int whoami, + crimson::net::Messenger &cluster_msgr, + crimson::net::Messenger &public_msgr, + crimson::mon::Client &monc, + crimson::mgr::Client &mgrc) + : whoami(whoami), + cluster_msgr(cluster_msgr), + public_msgr(public_msgr), + monc(monc), + mgrc(mgrc), + local_reserver( + &cct, + &finisher, + crimson::common::local_conf()->osd_max_backfills, + crimson::common::local_conf()->osd_min_recovery_priority), + remote_reserver( + &cct, + &finisher, + crimson::common::local_conf()->osd_max_backfills, + crimson::common::local_conf()->osd_min_recovery_priority), + snap_reserver( + &cct, + &finisher, + crimson::common::local_conf()->osd_max_trimming_pgs) +{ + crimson::common::local_conf().add_observer(this); + osdmaps[0] = boost::make_local_shared<OSDMap>(); + + perf = build_osd_logger(&cct); + cct.get_perfcounters_collection()->add(perf); + + recoverystate_perf = build_recoverystate_perf(&cct); + cct.get_perfcounters_collection()->add(recoverystate_perf); +} + +seastar::future<> OSDSingletonState::send_to_osd( + int peer, MessageURef m, epoch_t from_epoch) +{ + if (osdmap->is_down(peer)) { + logger().info("{}: osd.{} is_down", __func__, peer); + return seastar::now(); + } else if (osdmap->get_info(peer).up_from > from_epoch) { + logger().info("{}: osd.{} {} > {}", __func__, peer, + osdmap->get_info(peer).up_from, from_epoch); + return seastar::now(); + } else { + auto conn = cluster_msgr.connect( + osdmap->get_cluster_addrs(peer).front(), CEPH_ENTITY_TYPE_OSD); + return conn->send(std::move(m)); + } +} + +seastar::future<> OSDSingletonState::osdmap_subscribe( + version_t epoch, bool force_request) +{ + logger().info("{}({})", __func__, epoch); + if (monc.sub_want_increment("osdmap", epoch, CEPH_SUBSCRIBE_ONETIME) || + force_request) { + return monc.renew_subs(); + } else { + return seastar::now(); + } +} + +void OSDSingletonState::queue_want_pg_temp( + pg_t pgid, + const vector<int>& want, + bool forced) +{ + auto p = pg_temp_pending.find(pgid); + if (p == pg_temp_pending.end() || + p->second.acting != want || + forced) { + pg_temp_wanted[pgid] = {want, forced}; + } +} + +void OSDSingletonState::remove_want_pg_temp(pg_t pgid) +{ + pg_temp_wanted.erase(pgid); + pg_temp_pending.erase(pgid); +} + +void OSDSingletonState::requeue_pg_temp() +{ + unsigned old_wanted = pg_temp_wanted.size(); + unsigned old_pending = pg_temp_pending.size(); + pg_temp_wanted.merge(pg_temp_pending); + pg_temp_pending.clear(); + logger().debug( + "{}: {} + {} -> {}", + __func__ , + old_wanted, + old_pending, + pg_temp_wanted.size()); +} + +seastar::future<> OSDSingletonState::send_pg_temp() +{ + if (pg_temp_wanted.empty()) + return seastar::now(); + logger().debug("{}: {}", __func__, pg_temp_wanted); + MURef<MOSDPGTemp> ms[2] = {nullptr, nullptr}; + for (auto& [pgid, pg_temp] : pg_temp_wanted) { + auto& m = ms[pg_temp.forced]; + if (!m) { + m = crimson::make_message<MOSDPGTemp>(osdmap->get_epoch()); + m->forced = pg_temp.forced; + } + m->pg_temp.emplace(pgid, pg_temp.acting); + } + pg_temp_pending.merge(pg_temp_wanted); + pg_temp_wanted.clear(); + return seastar::parallel_for_each(std::begin(ms), std::end(ms), + [this](auto& m) { + if (m) { + return monc.send_message(std::move(m)); + } else 
{ + return seastar::now(); + } + }); +} + +std::ostream& operator<<( + std::ostream& out, + const OSDSingletonState::pg_temp_t& pg_temp) +{ + out << pg_temp.acting; + if (pg_temp.forced) { + out << " (forced)"; + } + return out; +} + +seastar::future<> OSDSingletonState::send_pg_created(pg_t pgid) +{ + logger().debug(__func__); + auto o = get_osdmap(); + ceph_assert(o->require_osd_release >= ceph_release_t::luminous); + pg_created.insert(pgid); + return monc.send_message(crimson::make_message<MOSDPGCreated>(pgid)); +} + +seastar::future<> OSDSingletonState::send_pg_created() +{ + logger().debug(__func__); + auto o = get_osdmap(); + ceph_assert(o->require_osd_release >= ceph_release_t::luminous); + return seastar::parallel_for_each(pg_created, + [this](auto &pgid) { + return monc.send_message(crimson::make_message<MOSDPGCreated>(pgid)); + }); +} + +void OSDSingletonState::prune_pg_created() +{ + logger().debug(__func__); + auto o = get_osdmap(); + auto i = pg_created.begin(); + while (i != pg_created.end()) { + auto p = o->get_pg_pool(i->pool()); + if (!p || !p->has_flag(pg_pool_t::FLAG_CREATING)) { + logger().debug("{} pruning {}", __func__, *i); + i = pg_created.erase(i); + } else { + logger().debug(" keeping {}", __func__, *i); + ++i; + } + } +} + +seastar::future<> OSDSingletonState::send_alive(const epoch_t want) +{ + logger().info( + "{} want={} up_thru_wanted={}", + __func__, + want, + up_thru_wanted); + + if (want > up_thru_wanted) { + up_thru_wanted = want; + } else { + logger().debug("{} want={} <= up_thru_wanted={}; skipping", + __func__, want, up_thru_wanted); + return seastar::now(); + } + if (!osdmap->exists(whoami)) { + logger().warn("{} DNE", __func__); + return seastar::now(); + } if (const epoch_t up_thru = osdmap->get_up_thru(whoami); + up_thru_wanted > up_thru) { + logger().debug("{} up_thru_wanted={} up_thru={}", __func__, want, up_thru); + return monc.send_message( + crimson::make_message<MOSDAlive>(osdmap->get_epoch(), want)); + } else { + logger().debug("{} {} <= {}", __func__, want, osdmap->get_up_thru(whoami)); + return seastar::now(); + } +} + +const char** OSDSingletonState::get_tracked_conf_keys() const +{ + static const char* KEYS[] = { + "osd_max_backfills", + "osd_min_recovery_priority", + "osd_max_trimming_pgs", + nullptr + }; + return KEYS; +} + +void OSDSingletonState::handle_conf_change( + const ConfigProxy& conf, + const std::set <std::string> &changed) +{ + if (changed.count("osd_max_backfills")) { + local_reserver.set_max(conf->osd_max_backfills); + remote_reserver.set_max(conf->osd_max_backfills); + } + if (changed.count("osd_min_recovery_priority")) { + local_reserver.set_min_priority(conf->osd_min_recovery_priority); + remote_reserver.set_min_priority(conf->osd_min_recovery_priority); + } + if (changed.count("osd_max_trimming_pgs")) { + snap_reserver.set_max(conf->osd_max_trimming_pgs); + } +} + +seastar::future<OSDSingletonState::local_cached_map_t> +OSDSingletonState::get_local_map(epoch_t e) +{ + // TODO: use LRU cache for managing osdmap, fallback to disk if we have to + if (auto found = osdmaps.find(e); found) { + logger().debug("{} osdmap.{} found in cache", __func__, e); + return seastar::make_ready_future<local_cached_map_t>(std::move(found)); + } else { + logger().debug("{} loading osdmap.{} from disk", __func__, e); + return load_map(e).then([e, this](std::unique_ptr<OSDMap> osdmap) { + return seastar::make_ready_future<local_cached_map_t>( + osdmaps.insert(e, std::move(osdmap))); + }); + } +} + +void OSDSingletonState::store_map_bl( + 
ceph::os::Transaction& t, + epoch_t e, bufferlist&& bl) +{ + meta_coll->store_map(t, e, bl); + map_bl_cache.insert(e, std::move(bl)); +} + +seastar::future<bufferlist> OSDSingletonState::load_map_bl( + epoch_t e) +{ + if (std::optional<bufferlist> found = map_bl_cache.find(e); found) { + logger().debug("{} osdmap.{} found in cache", __func__, e); + return seastar::make_ready_future<bufferlist>(*found); + } else { + logger().debug("{} loading osdmap.{} from disk", __func__, e); + return meta_coll->load_map(e); + } +} + +seastar::future<std::map<epoch_t, bufferlist>> OSDSingletonState::load_map_bls( + epoch_t first, + epoch_t last) +{ + logger().debug("{} loading maps [{},{}]", + __func__, first, last); + ceph_assert(first <= last); + return seastar::map_reduce(boost::make_counting_iterator<epoch_t>(first), + boost::make_counting_iterator<epoch_t>(last + 1), + [this](epoch_t e) { + return load_map_bl(e).then([e](auto&& bl) { + return seastar::make_ready_future<std::pair<epoch_t, bufferlist>>( + std::make_pair(e, std::move(bl))); + }); + }, + std::map<epoch_t, bufferlist>{}, + [](auto&& bls, auto&& epoch_bl) { + bls.emplace(std::move(epoch_bl)); + return std::move(bls); + }); +} + +seastar::future<std::unique_ptr<OSDMap>> OSDSingletonState::load_map(epoch_t e) +{ + auto o = std::make_unique<OSDMap>(); + logger().info("{} osdmap.{}", __func__, e); + if (e == 0) { + return seastar::make_ready_future<std::unique_ptr<OSDMap>>(std::move(o)); + } + return load_map_bl(e).then([o=std::move(o)](bufferlist bl) mutable { + o->decode(bl); + return seastar::make_ready_future<std::unique_ptr<OSDMap>>(std::move(o)); + }); +} + +seastar::future<> OSDSingletonState::store_maps(ceph::os::Transaction& t, + epoch_t start, Ref<MOSDMap> m) +{ + return seastar::do_for_each( + boost::make_counting_iterator(start), + boost::make_counting_iterator(m->get_last() + 1), + [&t, m, this](epoch_t e) { + if (auto p = m->maps.find(e); p != m->maps.end()) { + auto o = std::make_unique<OSDMap>(); + o->decode(p->second); + logger().info("store_maps storing osdmap.{}", e); + store_map_bl(t, e, std::move(std::move(p->second))); + osdmaps.insert(e, std::move(o)); + return seastar::now(); + } else if (auto p = m->incremental_maps.find(e); + p != m->incremental_maps.end()) { + logger().info("store_maps found osdmap.{} incremental map, " + "loading osdmap.{}", e, e - 1); + ceph_assert(std::cmp_greater(e, 0u)); + return load_map(e - 1).then([e, bl=p->second, &t, this](auto o) { + OSDMap::Incremental inc; + auto i = bl.cbegin(); + inc.decode(i); + o->apply_incremental(inc); + bufferlist fbl; + o->encode(fbl, inc.encode_features | CEPH_FEATURE_RESERVED); + logger().info("store_maps storing osdmap.{}", o->get_epoch()); + store_map_bl(t, e, std::move(fbl)); + osdmaps.insert(e, std::move(o)); + return seastar::now(); + }); + } else { + logger().error("MOSDMap lied about what maps it had?"); + return seastar::now(); + } + }); +} + +seastar::future<Ref<PG>> ShardServices::make_pg( + OSDMapService::cached_map_t create_map, + spg_t pgid, + bool do_create) +{ + using ec_profile_t = std::map<std::string, std::string>; + auto get_pool_info_for_pg = [create_map, pgid, this] { + if (create_map->have_pg_pool(pgid.pool())) { + pg_pool_t pi = *create_map->get_pg_pool(pgid.pool()); + std::string name = create_map->get_pool_name(pgid.pool()); + ec_profile_t ec_profile; + if (pi.is_erasure()) { + ec_profile = create_map->get_erasure_code_profile( + pi.erasure_code_profile); + } + return seastar::make_ready_future< + std::tuple<pg_pool_t,std::string, 
ec_profile_t> + >(std::make_tuple( + std::move(pi), + std::move(name), + std::move(ec_profile))); + } else { + // pool was deleted; grab final pg_pool_t off disk. + return get_pool_info(pgid.pool()); + } + }; + auto get_collection = [pgid, do_create, this] { + const coll_t cid{pgid}; + if (do_create) { + return get_store().create_new_collection(cid); + } else { + return get_store().open_collection(cid); + } + }; + return seastar::when_all( + std::move(get_pool_info_for_pg), + std::move(get_collection) + ).then([pgid, create_map, this](auto &&ret) { + auto [pool, name, ec_profile] = std::move(std::get<0>(ret).get0()); + auto coll = std::move(std::get<1>(ret).get0()); + return seastar::make_ready_future<Ref<PG>>( + new PG{ + pgid, + pg_shard_t{local_state.whoami, pgid.shard}, + std::move(coll), + std::move(pool), + std::move(name), + create_map, + *this, + ec_profile}); + }); +} + +seastar::future<Ref<PG>> ShardServices::handle_pg_create_info( + std::unique_ptr<PGCreateInfo> info) { + return seastar::do_with( + std::move(info), + [this](auto &info) + -> seastar::future<Ref<PG>> { + return get_map(info->epoch).then( + [&info, this](cached_map_t startmap) + -> seastar::future<std::tuple<Ref<PG>, cached_map_t>> { + const spg_t &pgid = info->pgid; + if (info->by_mon) { + int64_t pool_id = pgid.pgid.pool(); + const pg_pool_t *pool = get_map()->get_pg_pool(pool_id); + if (!pool) { + logger().debug( + "{} ignoring pgid {}, pool dne", + __func__, + pgid); + local_state.pg_map.pg_creation_canceled(pgid); + return seastar::make_ready_future< + std::tuple<Ref<PG>, OSDMapService::cached_map_t> + >(std::make_tuple(Ref<PG>(), startmap)); + } else if (!pool->is_crimson()) { + logger().debug( + "{} ignoring pgid {}, pool lacks crimson flag", + __func__, + pgid); + local_state.pg_map.pg_creation_canceled(pgid); + return seastar::make_ready_future< + std::tuple<Ref<PG>, OSDMapService::cached_map_t> + >(std::make_tuple(Ref<PG>(), startmap)); + } + ceph_assert(get_map()->require_osd_release >= + ceph_release_t::octopus); + if (!pool->has_flag(pg_pool_t::FLAG_CREATING)) { + // this ensures we do not process old creating messages after the + // pool's initial pgs have been created (and pg are subsequently + // allowed to split or merge). 
+ logger().debug( + "{} dropping {} create, pool does not have CREATING flag set", + __func__, + pgid); + local_state.pg_map.pg_creation_canceled(pgid); + return seastar::make_ready_future< + std::tuple<Ref<PG>, OSDMapService::cached_map_t> + >(std::make_tuple(Ref<PG>(), startmap)); + } + } + return make_pg( + startmap, pgid, true + ).then([startmap=std::move(startmap)](auto pg) mutable { + return seastar::make_ready_future< + std::tuple<Ref<PG>, OSDMapService::cached_map_t> + >(std::make_tuple(std::move(pg), std::move(startmap))); + }); + }).then([this, &info](auto &&ret) + ->seastar::future<Ref<PG>> { + auto [pg, startmap] = std::move(ret); + if (!pg) + return seastar::make_ready_future<Ref<PG>>(Ref<PG>()); + const pg_pool_t* pp = startmap->get_pg_pool(info->pgid.pool()); + + int up_primary, acting_primary; + vector<int> up, acting; + startmap->pg_to_up_acting_osds( + info->pgid.pgid, &up, &up_primary, &acting, &acting_primary); + + int role = startmap->calc_pg_role( + pg_shard_t(local_state.whoami, info->pgid.shard), + acting); + + PeeringCtx rctx; + create_pg_collection( + rctx.transaction, + info->pgid, + info->pgid.get_split_bits(pp->get_pg_num())); + init_pg_ondisk( + rctx.transaction, + info->pgid, + pp); + + pg->init( + role, + up, + up_primary, + acting, + acting_primary, + info->history, + info->past_intervals, + rctx.transaction); + + return start_operation<PGAdvanceMap>( + *this, pg, get_map()->get_epoch(), std::move(rctx), true + ).second.then([pg=pg] { + return seastar::make_ready_future<Ref<PG>>(pg); + }); + }); + }); +} + + +ShardServices::get_or_create_pg_ret +ShardServices::get_or_create_pg( + PGMap::PGCreationBlockingEvent::TriggerI&& trigger, + spg_t pgid, + std::unique_ptr<PGCreateInfo> info) +{ + if (info) { + auto [fut, creating] = local_state.pg_map.wait_for_pg( + std::move(trigger), pgid); + if (!creating) { + local_state.pg_map.set_creating(pgid); + (void)handle_pg_create_info( + std::move(info)); + } + return std::move(fut); + } else { + return get_or_create_pg_ret( + get_or_create_pg_ertr::ready_future_marker{}, + local_state.pg_map.get_pg(pgid)); + } +} + +ShardServices::wait_for_pg_ret +ShardServices::wait_for_pg( + PGMap::PGCreationBlockingEvent::TriggerI&& trigger, spg_t pgid) +{ + return local_state.pg_map.wait_for_pg(std::move(trigger), pgid).first; +} + +seastar::future<Ref<PG>> ShardServices::load_pg(spg_t pgid) + +{ + logger().debug("{}: {}", __func__, pgid); + + return seastar::do_with(PGMeta(get_store(), pgid), [](auto& pg_meta) { + return pg_meta.get_epoch(); + }).then([this](epoch_t e) { + return get_map(e); + }).then([pgid, this](auto&& create_map) { + return make_pg(std::move(create_map), pgid, false); + }).then([this](Ref<PG> pg) { + return pg->read_state(&get_store()).then([pg] { + return seastar::make_ready_future<Ref<PG>>(std::move(pg)); + }); + }).handle_exception([pgid](auto ep) { + logger().info("pg {} saw exception on load {}", pgid, ep); + ceph_abort("Could not load pg" == 0); + return seastar::make_exception_future<Ref<PG>>(ep); + }); +} + +seastar::future<> ShardServices::dispatch_context_transaction( + crimson::os::CollectionRef col, PeeringCtx &ctx) { + if (ctx.transaction.empty()) { + logger().debug("ShardServices::dispatch_context_transaction: empty transaction"); + return seastar::now(); + } + + logger().debug("ShardServices::dispatch_context_transaction: do_transaction ..."); + auto ret = get_store().do_transaction( + col, + std::move(ctx.transaction)); + ctx.reset_transaction(); + return ret; +} + +seastar::future<> 
ShardServices::dispatch_context_messages( + BufferedRecoveryMessages &&ctx) +{ + auto ret = seastar::parallel_for_each(std::move(ctx.message_map), + [this](auto& osd_messages) { + auto& [peer, messages] = osd_messages; + logger().debug("dispatch_context_messages sending messages to {}", peer); + return seastar::parallel_for_each( + std::move(messages), [=, peer=peer, this](auto& m) { + return send_to_osd(peer, std::move(m), local_state.osdmap->get_epoch()); + }); + }); + ctx.message_map.clear(); + return ret; +} + +seastar::future<> ShardServices::dispatch_context( + crimson::os::CollectionRef col, + PeeringCtx &&ctx) +{ + ceph_assert(col || ctx.transaction.empty()); + return seastar::when_all_succeed( + dispatch_context_messages( + BufferedRecoveryMessages{ctx}), + col ? dispatch_context_transaction(col, ctx) : seastar::now() + ).then_unpack([] { + return seastar::now(); + }); +} + +seastar::future<> OSDSingletonState::send_incremental_map( + crimson::net::Connection &conn, + epoch_t first) +{ + logger().info("{}: first osdmap: {} " + "superblock's oldest map: {}", + __func__, first, superblock.oldest_map); + if (first >= superblock.oldest_map) { + return load_map_bls( + first, superblock.newest_map + ).then([this, &conn, first](auto&& bls) { + auto m = crimson::make_message<MOSDMap>( + monc.get_fsid(), + osdmap->get_encoding_features()); + m->cluster_osdmap_trim_lower_bound = first; + m->newest_map = superblock.newest_map; + m->maps = std::move(bls); + return conn.send(std::move(m)); + }); + } else { + return load_map_bl(osdmap->get_epoch() + ).then([this, &conn](auto&& bl) mutable { + auto m = crimson::make_message<MOSDMap>( + monc.get_fsid(), + osdmap->get_encoding_features()); + /* TODO: once we support the tracking of superblock's + * cluster_osdmap_trim_lower_bound, the MOSDMap should + * be populated with this value instead of the oldest_map. + * See: OSD::handle_osd_map for how classic updates the + * cluster's trim lower bound. 
+ */ + m->cluster_osdmap_trim_lower_bound = superblock.oldest_map; + m->newest_map = superblock.newest_map; + m->maps.emplace(osdmap->get_epoch(), std::move(bl)); + return conn.send(std::move(m)); + }); + } +} + +seastar::future<> OSDSingletonState::send_incremental_map_to_osd( + int osd, + epoch_t first) +{ + if (osdmap->is_down(osd)) { + logger().info("{}: osd.{} is_down", __func__, osd); + return seastar::now(); + } else { + auto conn = cluster_msgr.connect( + osdmap->get_cluster_addrs(osd).front(), CEPH_ENTITY_TYPE_OSD); + return send_incremental_map(*conn, first); + } +} + +}; diff --git a/src/crimson/osd/shard_services.h b/src/crimson/osd/shard_services.h new file mode 100644 index 000000000..9b7553e7b --- /dev/null +++ b/src/crimson/osd/shard_services.h @@ -0,0 +1,589 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <memory> + +#include <boost/intrusive_ptr.hpp> +#include <seastar/core/future.hh> + +#include "include/common_fwd.h" +#include "osd_operation.h" +#include "msg/MessageRef.h" +#include "crimson/common/exception.h" +#include "crimson/common/shared_lru.h" +#include "crimson/os/futurized_collection.h" +#include "osd/PeeringState.h" +#include "crimson/osd/osdmap_service.h" +#include "crimson/osd/osdmap_gate.h" +#include "crimson/osd/osd_meta.h" +#include "crimson/osd/object_context.h" +#include "crimson/osd/pg_map.h" +#include "crimson/osd/state.h" +#include "common/AsyncReserver.h" + +namespace crimson::net { + class Messenger; +} + +namespace crimson::mgr { + class Client; +} + +namespace crimson::mon { + class Client; +} + +namespace crimson::os { + class FuturizedStore; +} + +class OSDMap; +class PeeringCtx; +class BufferedRecoveryMessages; + +namespace crimson::osd { + +class PGShardManager; + +/** + * PerShardState + * + * Per-shard state holding instances local to each shard. + */ +class PerShardState { + friend class ShardServices; + friend class PGShardManager; + friend class OSD; + using cached_map_t = OSDMapService::cached_map_t; + using local_cached_map_t = OSDMapService::local_cached_map_t; + + const core_id_t core = seastar::this_shard_id(); +#define assert_core() ceph_assert(seastar::this_shard_id() == core); + + const int whoami; + crimson::os::FuturizedStore::Shard &store; + crimson::common::CephContext cct; + + OSDState &osd_state; + OSD_OSDMapGate osdmap_gate; + + PerfCounters *perf = nullptr; + PerfCounters *recoverystate_perf = nullptr; + + // Op Management + OSDOperationRegistry registry; + OperationThrottler throttler; + + seastar::future<> dump_ops_in_flight(Formatter *f) const; + + epoch_t up_epoch = 0; + OSDMapService::cached_map_t osdmap; + const auto &get_osdmap() const { + assert_core(); + return osdmap; + } + void update_map(OSDMapService::cached_map_t new_osdmap) { + assert_core(); + osdmap = std::move(new_osdmap); + } + void set_up_epoch(epoch_t epoch) { + assert_core(); + up_epoch = epoch; + } + + // prevent creating new osd operations when system is shutting down, + // this is necessary because there are chances that a new operation + // is created, after the interruption of all ongoing operations, and + // creats and waits on a new and may-never-resolve future, in which + // case the shutdown may never succeed. 
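+  // start_operation() and start_operation_may_interrupt() below check this
+  // flag and throw crimson::common::system_shutdown_exception() instead of
+  // registering a new operation once it is set.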
+ bool stopping = false; + seastar::future<> stop_registry() { + assert_core(); + crimson::get_logger(ceph_subsys_osd).info("PerShardState::{}", __func__); + stopping = true; + return registry.stop(); + } + + // PGMap state + PGMap pg_map; + + seastar::future<> stop_pgs(); + std::map<pg_t, pg_stat_t> get_pg_stats() const; + seastar::future<> broadcast_map_to_pgs( + ShardServices &shard_services, + epoch_t epoch); + + Ref<PG> get_pg(spg_t pgid); + template <typename F> + void for_each_pg(F &&f) const { + assert_core(); + for (auto &pg : pg_map.get_pgs()) { + std::invoke(f, pg.first, pg.second); + } + } + + template <typename T, typename... Args> + auto start_operation(Args&&... args) { + assert_core(); + if (__builtin_expect(stopping, false)) { + throw crimson::common::system_shutdown_exception(); + } + auto op = registry.create_operation<T>(std::forward<Args>(args)...); + crimson::get_logger(ceph_subsys_osd).info( + "PerShardState::{}, {}", __func__, *op); + auto fut = seastar::yield().then([op] { + return op->start().finally([op /* by copy */] { + // ensure the op's lifetime is appropriate. It is not enough to + // guarantee it's alive at the scheduling stages (i.e. `then()` + // calling) but also during the actual execution (i.e. when passed + // lambdas are actually run). + }); + }); + return std::make_pair(std::move(op), std::move(fut)); + } + + template <typename InterruptorT, typename T, typename... Args> + auto start_operation_may_interrupt(Args&&... args) { + assert_core(); + if (__builtin_expect(stopping, false)) { + throw crimson::common::system_shutdown_exception(); + } + auto op = registry.create_operation<T>(std::forward<Args>(args)...); + crimson::get_logger(ceph_subsys_osd).info( + "PerShardState::{}, {}", __func__, *op); + auto fut = InterruptorT::make_interruptible( + seastar::yield() + ).then_interruptible([op] { + return op->start().finally([op /* by copy */] { + // ensure the op's lifetime is appropriate. It is not enough to + // guarantee it's alive at the scheduling stages (i.e. `then()` + // calling) but also during the actual execution (i.e. when passed + // lambdas are actually run). + }); + }); + return std::make_pair(std::move(op), std::move(fut)); + } + + // tids for ops i issue, prefixed with core id to ensure uniqueness + ceph_tid_t next_tid; + ceph_tid_t get_tid() { + assert_core(); + return next_tid++; + } + + HeartbeatStampsRef get_hb_stamps(int peer); + std::map<int, HeartbeatStampsRef> heartbeat_stamps; + + // Time state + const ceph::mono_time startup_time; + ceph::signedspan get_mnow() const { + assert_core(); + return ceph::mono_clock::now() - startup_time; + } + +public: + PerShardState( + int whoami, + ceph::mono_time startup_time, + PerfCounters *perf, + PerfCounters *recoverystate_perf, + crimson::os::FuturizedStore &store, + OSDState& osd_state); +}; + +/** + * OSDSingletonState + * + * OSD-wide singleton holding instances that need to be accessible + * from all PGs. 
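+ *
+ * Concretely (see the members below): the OSDMap and map bufferlist caches,
+ * pg_temp and pg_created bookkeeping, the mon/mgr clients, and the
+ * local/remote/snap AsyncReservers. ShardServices reaches it through
+ * with_singleton(), i.e. an invoke_on(PRIMARY_CORE, ...) call.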
+ */ +class OSDSingletonState : public md_config_obs_t { + friend class ShardServices; + friend class PGShardManager; + friend class OSD; + using cached_map_t = OSDMapService::cached_map_t; + using local_cached_map_t = OSDMapService::local_cached_map_t; + +public: + OSDSingletonState( + int whoami, + crimson::net::Messenger &cluster_msgr, + crimson::net::Messenger &public_msgr, + crimson::mon::Client &monc, + crimson::mgr::Client &mgrc); + +private: + const int whoami; + + crimson::common::CephContext cct; + PerfCounters *perf = nullptr; + PerfCounters *recoverystate_perf = nullptr; + + SharedLRU<epoch_t, OSDMap> osdmaps; + SimpleLRU<epoch_t, bufferlist, false> map_bl_cache; + + cached_map_t osdmap; + cached_map_t &get_osdmap() { return osdmap; } + void update_map(cached_map_t new_osdmap) { + osdmap = std::move(new_osdmap); + } + + crimson::net::Messenger &cluster_msgr; + crimson::net::Messenger &public_msgr; + + seastar::future<> send_to_osd(int peer, MessageURef m, epoch_t from_epoch); + + crimson::mon::Client &monc; + seastar::future<> osdmap_subscribe(version_t epoch, bool force_request); + + crimson::mgr::Client &mgrc; + + std::unique_ptr<OSDMeta> meta_coll; + template <typename... Args> + void init_meta_coll(Args&&... args) { + meta_coll = std::make_unique<OSDMeta>(std::forward<Args>(args)...); + } + OSDMeta &get_meta_coll() { + assert(meta_coll); + return *meta_coll; + } + + OSDSuperblock superblock; + void set_superblock(OSDSuperblock _superblock) { + superblock = std::move(_superblock); + } + + seastar::future<> send_incremental_map( + crimson::net::Connection &conn, + epoch_t first); + + seastar::future<> send_incremental_map_to_osd(int osd, epoch_t first); + + auto get_pool_info(int64_t poolid) { + return get_meta_coll().load_final_pool_info(poolid); + } + + // global pg temp state + struct pg_temp_t { + std::vector<int> acting; + bool forced = false; + }; + std::map<pg_t, pg_temp_t> pg_temp_wanted; + std::map<pg_t, pg_temp_t> pg_temp_pending; + friend std::ostream& operator<<(std::ostream&, const pg_temp_t&); + + void queue_want_pg_temp(pg_t pgid, const std::vector<int>& want, + bool forced = false); + void remove_want_pg_temp(pg_t pgid); + void requeue_pg_temp(); + seastar::future<> send_pg_temp(); + + std::set<pg_t> pg_created; + seastar::future<> send_pg_created(pg_t pgid); + seastar::future<> send_pg_created(); + void prune_pg_created(); + + struct DirectFinisher { + void queue(Context *c) { + c->complete(0); + } + } finisher; + AsyncReserver<spg_t, DirectFinisher> local_reserver; + AsyncReserver<spg_t, DirectFinisher> remote_reserver; + AsyncReserver<spg_t, DirectFinisher> snap_reserver; + + epoch_t up_thru_wanted = 0; + seastar::future<> send_alive(epoch_t want); + + const char** get_tracked_conf_keys() const final; + void handle_conf_change( + const ConfigProxy& conf, + const std::set <std::string> &changed) final; + + seastar::future<local_cached_map_t> get_local_map(epoch_t e); + seastar::future<std::unique_ptr<OSDMap>> load_map(epoch_t e); + seastar::future<bufferlist> load_map_bl(epoch_t e); + seastar::future<std::map<epoch_t, bufferlist>> + load_map_bls(epoch_t first, epoch_t last); + void store_map_bl(ceph::os::Transaction& t, + epoch_t e, bufferlist&& bl); + seastar::future<> store_maps(ceph::os::Transaction& t, + epoch_t start, Ref<MOSDMap> m); +}; + +/** + * Represents services available to each PG + */ +class ShardServices : public OSDMapService { + friend class PGShardManager; + friend class OSD; + using cached_map_t = OSDMapService::cached_map_t; + using 
local_cached_map_t = OSDMapService::local_cached_map_t; + + PerShardState local_state; + seastar::sharded<OSDSingletonState> &osd_singleton_state; + PGShardMapping& pg_to_shard_mapping; + + template <typename F, typename... Args> + auto with_singleton(F &&f, Args&&... args) { + return osd_singleton_state.invoke_on( + PRIMARY_CORE, + std::forward<F>(f), + std::forward<Args>(args)... + ); + } + +#define FORWARD_CONST(FROM_METHOD, TO_METHOD, TARGET) \ + template <typename... Args> \ + auto FROM_METHOD(Args&&... args) const { \ + return TARGET.TO_METHOD(std::forward<Args>(args)...); \ + } + +#define FORWARD(FROM_METHOD, TO_METHOD, TARGET) \ + template <typename... Args> \ + auto FROM_METHOD(Args&&... args) { \ + return TARGET.TO_METHOD(std::forward<Args>(args)...); \ + } + +#define FORWARD_TO_LOCAL(METHOD) FORWARD(METHOD, METHOD, local_state) +#define FORWARD_TO_LOCAL_CONST(METHOD) FORWARD_CONST( \ + METHOD, METHOD, local_state) \ + +#define FORWARD_TO_OSD_SINGLETON_TARGET(METHOD, TARGET) \ + template <typename... Args> \ + auto METHOD(Args&&... args) { \ + return with_singleton( \ + [](auto &local_state, auto&&... args) { \ + return local_state.TARGET( \ + std::forward<decltype(args)>(args)...); \ + }, std::forward<Args>(args)...); \ + } +#define FORWARD_TO_OSD_SINGLETON(METHOD) \ + FORWARD_TO_OSD_SINGLETON_TARGET(METHOD, METHOD) + +public: + template <typename... PSSArgs> + ShardServices( + seastar::sharded<OSDSingletonState> &osd_singleton_state, + PGShardMapping& pg_to_shard_mapping, + PSSArgs&&... args) + : local_state(std::forward<PSSArgs>(args)...), + osd_singleton_state(osd_singleton_state), + pg_to_shard_mapping(pg_to_shard_mapping) {} + + FORWARD_TO_OSD_SINGLETON(send_to_osd) + + crimson::os::FuturizedStore::Shard &get_store() { + return local_state.store; + } + + auto remove_pg(spg_t pgid) { + local_state.pg_map.remove_pg(pgid); + return pg_to_shard_mapping.remove_pg(pgid); + } + + crimson::common::CephContext *get_cct() { + return &(local_state.cct); + } + + template <typename T, typename... Args> + auto start_operation(Args&&... args) { + return local_state.start_operation<T>(std::forward<Args>(args)...); + } + + template <typename InterruptorT, typename T, typename... Args> + auto start_operation_may_interrupt(Args&&... 
args) { + return local_state.start_operation_may_interrupt< + InterruptorT, T>(std::forward<Args>(args)...); + } + + auto &get_registry() { return local_state.registry; } + + // Loggers + PerfCounters &get_recoverystate_perf_logger() { + return *local_state.recoverystate_perf; + } + PerfCounters &get_perf_logger() { + return *local_state.perf; + } + + // Diagnostics + FORWARD_TO_LOCAL_CONST(dump_ops_in_flight); + + // Local PG Management + seastar::future<Ref<PG>> make_pg( + cached_map_t create_map, + spg_t pgid, + bool do_create); + seastar::future<Ref<PG>> handle_pg_create_info( + std::unique_ptr<PGCreateInfo> info); + + using get_or_create_pg_ertr = PGMap::wait_for_pg_ertr; + using get_or_create_pg_ret = get_or_create_pg_ertr::future<Ref<PG>>; + get_or_create_pg_ret get_or_create_pg( + PGMap::PGCreationBlockingEvent::TriggerI&&, + spg_t pgid, + std::unique_ptr<PGCreateInfo> info); + + using wait_for_pg_ertr = PGMap::wait_for_pg_ertr; + using wait_for_pg_ret = wait_for_pg_ertr::future<Ref<PG>>; + wait_for_pg_ret wait_for_pg( + PGMap::PGCreationBlockingEvent::TriggerI&&, spg_t pgid); + seastar::future<Ref<PG>> load_pg(spg_t pgid); + + /// Dispatch and reset ctx transaction + seastar::future<> dispatch_context_transaction( + crimson::os::CollectionRef col, PeeringCtx &ctx); + + /// Dispatch and reset ctx messages + seastar::future<> dispatch_context_messages( + BufferedRecoveryMessages &&ctx); + + /// Dispatch ctx and dispose of context + seastar::future<> dispatch_context( + crimson::os::CollectionRef col, + PeeringCtx &&ctx); + + /// Dispatch ctx and dispose of ctx, transaction must be empty + seastar::future<> dispatch_context( + PeeringCtx &&ctx) { + return dispatch_context({}, std::move(ctx)); + } + + /// Return per-core tid + ceph_tid_t get_tid() { return local_state.get_tid(); } + + /// Return core-local pg count * number of cores + unsigned get_num_local_pgs() const { + return local_state.pg_map.get_pg_count(); + } + + // OSDMapService + cached_map_t get_map() const final { return local_state.get_osdmap(); } + epoch_t get_up_epoch() const final { return local_state.up_epoch; } + seastar::future<cached_map_t> get_map(epoch_t e) final { + return with_singleton( + [](auto &sstate, epoch_t e) { + return sstate.get_local_map( + e + ).then([](auto lmap) { + return seastar::foreign_ptr<local_cached_map_t>(lmap); + }); + }, e).then([](auto fmap) { + return make_local_shared_foreign(std::move(fmap)); + }); + } + + FORWARD_TO_OSD_SINGLETON(get_pool_info) + FORWARD(with_throttle_while, with_throttle_while, local_state.throttler) + + FORWARD_TO_OSD_SINGLETON(send_incremental_map) + FORWARD_TO_OSD_SINGLETON(send_incremental_map_to_osd) + + FORWARD_TO_OSD_SINGLETON(osdmap_subscribe) + FORWARD_TO_OSD_SINGLETON(queue_want_pg_temp) + FORWARD_TO_OSD_SINGLETON(remove_want_pg_temp) + FORWARD_TO_OSD_SINGLETON(requeue_pg_temp) + FORWARD_TO_OSD_SINGLETON(send_pg_created) + FORWARD_TO_OSD_SINGLETON(send_alive) + FORWARD_TO_OSD_SINGLETON(send_pg_temp) + FORWARD_TO_LOCAL_CONST(get_mnow) + FORWARD_TO_LOCAL(get_hb_stamps) + + FORWARD(pg_created, pg_created, local_state.pg_map) + + FORWARD_TO_OSD_SINGLETON_TARGET( + local_update_priority, + local_reserver.update_priority) + FORWARD_TO_OSD_SINGLETON_TARGET( + local_cancel_reservation, + local_reserver.cancel_reservation) + FORWARD_TO_OSD_SINGLETON_TARGET( + local_dump_reservations, + local_reserver.dump) + FORWARD_TO_OSD_SINGLETON_TARGET( + remote_cancel_reservation, + remote_reserver.cancel_reservation) + FORWARD_TO_OSD_SINGLETON_TARGET( + 
remote_dump_reservations, + remote_reserver.dump) + FORWARD_TO_OSD_SINGLETON_TARGET( + snap_cancel_reservation, + snap_reserver.cancel_reservation) + FORWARD_TO_OSD_SINGLETON_TARGET( + snap_dump_reservations, + snap_reserver.dump) + + Context *invoke_context_on_core(core_id_t core, Context *c) { + if (!c) return nullptr; + return new LambdaContext([core, c](int code) { + std::ignore = seastar::smp::submit_to( + core, + [c, code] { + c->complete(code); + }); + }); + } + seastar::future<> local_request_reservation( + spg_t item, + Context *on_reserved, + unsigned prio, + Context *on_preempt) { + return with_singleton( + [item, prio](OSDSingletonState &singleton, + Context *wrapped_on_reserved, Context *wrapped_on_preempt) { + return singleton.local_reserver.request_reservation( + item, + wrapped_on_reserved, + prio, + wrapped_on_preempt); + }, + invoke_context_on_core(seastar::this_shard_id(), on_reserved), + invoke_context_on_core(seastar::this_shard_id(), on_preempt)); + } + seastar::future<> remote_request_reservation( + spg_t item, + Context *on_reserved, + unsigned prio, + Context *on_preempt) { + return with_singleton( + [item, prio](OSDSingletonState &singleton, + Context *wrapped_on_reserved, Context *wrapped_on_preempt) { + return singleton.remote_reserver.request_reservation( + item, + wrapped_on_reserved, + prio, + wrapped_on_preempt); + }, + invoke_context_on_core(seastar::this_shard_id(), on_reserved), + invoke_context_on_core(seastar::this_shard_id(), on_preempt)); + } + seastar::future<> snap_request_reservation( + spg_t item, + Context *on_reserved, + unsigned prio) { + return with_singleton( + [item, prio](OSDSingletonState &singleton, + Context *wrapped_on_reserved) { + return singleton.snap_reserver.request_reservation( + item, + wrapped_on_reserved, + prio); + }, + invoke_context_on_core(seastar::this_shard_id(), on_reserved)); + } + +#undef FORWARD_CONST +#undef FORWARD +#undef FORWARD_TO_OSD_SINGLETON +#undef FORWARD_TO_LOCAL +#undef FORWARD_TO_LOCAL_CONST +}; + +} + +#if FMT_VERSION >= 90000 +template <> struct fmt::formatter<crimson::osd::OSDSingletonState::pg_temp_t> : fmt::ostream_formatter {}; +#endif diff --git a/src/crimson/osd/state.h b/src/crimson/osd/state.h new file mode 100644 index 000000000..f0676a4ec --- /dev/null +++ b/src/crimson/osd/state.h @@ -0,0 +1,130 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <string_view> +#include <ostream> + +#include <seastar/core/shared_future.hh> + +class OSDMap; + +namespace crimson::osd { + +// seastar::sharded puts start_single on core 0 +constexpr core_id_t PRIMARY_CORE = 0; + +/** + * OSDState + * + * Maintains state representing the OSD's progress from booting through + * shutdown. + * + * Shards other than PRIMARY_CORE may use their local instance to check + * on ACTIVE and STOPPING. All other methods are restricted to + * PRIMARY_CORE (such methods start with an assert to this effect). 
+ */ +class OSDState : public seastar::peering_sharded_service<OSDState> { + + enum class State { + INITIALIZING, + PREBOOT, + BOOTING, + ACTIVE, + PRESTOP, + STOPPING, + WAITING_FOR_HEALTHY, + }; + + State state = State::INITIALIZING; + mutable seastar::shared_promise<> wait_for_active; + + /// Sets local instance state to active, called from set_active + void _set_active() { + state = State::ACTIVE; + wait_for_active.set_value(); + wait_for_active = {}; + } + /// Sets local instance state to stopping, called from set_stopping + void _set_stopping() { + state = State::STOPPING; + wait_for_active.set_exception(crimson::common::system_shutdown_exception{}); + wait_for_active = {}; + } +public: + bool is_initializing() const { + ceph_assert(seastar::this_shard_id() == PRIMARY_CORE); + return state == State::INITIALIZING; + } + bool is_preboot() const { + ceph_assert(seastar::this_shard_id() == PRIMARY_CORE); + return state == State::PREBOOT; + } + bool is_booting() const { + ceph_assert(seastar::this_shard_id() == PRIMARY_CORE); + return state == State::BOOTING; + } + bool is_active() const { + return state == State::ACTIVE; + } + seastar::future<> when_active() const { + return is_active() ? seastar::now() + : wait_for_active.get_shared_future(); + }; + bool is_prestop() const { + ceph_assert(seastar::this_shard_id() == PRIMARY_CORE); + return state == State::PRESTOP; + } + bool is_stopping() const { + return state == State::STOPPING; + } + bool is_waiting_for_healthy() const { + ceph_assert(seastar::this_shard_id() == PRIMARY_CORE); + return state == State::WAITING_FOR_HEALTHY; + } + void set_preboot() { + ceph_assert(seastar::this_shard_id() == PRIMARY_CORE); + state = State::PREBOOT; + } + void set_booting() { + ceph_assert(seastar::this_shard_id() == PRIMARY_CORE); + state = State::BOOTING; + } + /// Sets all shards to active + seastar::future<> set_active() { + ceph_assert(seastar::this_shard_id() == PRIMARY_CORE); + return container().invoke_on_all([](auto& osd_state) { + osd_state._set_active(); + }); + } + void set_prestop() { + ceph_assert(seastar::this_shard_id() == PRIMARY_CORE); + state = State::PRESTOP; + } + /// Sets all shards to stopping + seastar::future<> set_stopping() { + ceph_assert(seastar::this_shard_id() == PRIMARY_CORE); + return container().invoke_on_all([](auto& osd_state) { + osd_state._set_stopping(); + }); + } + std::string_view to_string() const { + switch (state) { + case State::INITIALIZING: return "initializing"; + case State::PREBOOT: return "preboot"; + case State::BOOTING: return "booting"; + case State::ACTIVE: return "active"; + case State::PRESTOP: return "prestop"; + case State::STOPPING: return "stopping"; + case State::WAITING_FOR_HEALTHY: return "waiting_for_healthy"; + default: return "???"; + } + } +}; + +inline std::ostream& +operator<<(std::ostream& os, const OSDState& s) { + return os << s.to_string(); +} +} diff --git a/src/crimson/osd/stop_signal.h b/src/crimson/osd/stop_signal.h new file mode 100644 index 000000000..951f8d4b7 --- /dev/null +++ b/src/crimson/osd/stop_signal.h @@ -0,0 +1,83 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. 
+ * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (C) 2020 Cloudius Systems, Ltd. + */ + +#pragma once + +#include <seastar/core/abort_source.hh> +#include <seastar/core/reactor.hh> +#include <seastar/core/condition-variable.hh> + +/// Seastar apps lib namespace + +namespace seastar_apps_lib { + + +/// \brief Futurized SIGINT/SIGTERM signals handler class +/// +/// Seastar-style helper class that allows easy waiting for SIGINT/SIGTERM signals +/// from your app. +/// +/// Example: +/// \code +/// #include <seastar/apps/lib/stop_signal.hh> +/// ... +/// int main() { +/// ... +/// seastar::thread th([] { +/// seastar_apps_lib::stop_signal stop_signal; +/// <some code> +/// stop_signal.wait().get(); // this will wait till we receive SIGINT or SIGTERM signal +/// }); +/// \endcode +class stop_signal { + seastar::condition_variable _cond; + seastar::abort_source _abort_source; + +private: + void on_signal() { + if (stopping()) { + return; + } + _abort_source.request_abort(); + _cond.broadcast(); + } +public: + stop_signal() { + seastar::engine().handle_signal(SIGINT, [this] { on_signal(); }); + seastar::engine().handle_signal(SIGTERM, [this] { on_signal(); }); + } + ~stop_signal() { + // There's no way to unregister a handler yet, so register a no-op handler instead. + seastar::engine().handle_signal(SIGINT, [] {}); + seastar::engine().handle_signal(SIGTERM, [] {}); + } + seastar::future<> wait() { + return _cond.wait([this] { return _abort_source.abort_requested(); }); + } + bool stopping() const { + return _abort_source.abort_requested(); + } + auto& abort_source() { + return _abort_source; + } +}; +} diff --git a/src/crimson/osd/watch.cc b/src/crimson/osd/watch.cc new file mode 100644 index 000000000..4573333c3 --- /dev/null +++ b/src/crimson/osd/watch.cc @@ -0,0 +1,354 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <algorithm> + +#include <boost/range/adaptor/transformed.hpp> +#include <boost/range/algorithm_ext/insert.hpp> + +#include "crimson/osd/watch.h" +#include "crimson/osd/osd_operations/internal_client_request.h" + +#include "messages/MWatchNotify.h" + + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_osd); + } +} + +namespace crimson::osd { + +// a watcher can remove itself if it has not seen a notification after a period of time. +// in the case, we need to drop it also from the persisted `ObjectState` instance. +// this operation resembles a bit the `_UNWATCH` subop. 
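+// WatchTimeoutRequest below implements that cleanup as an internal client +// request: it targets the watched object and submits a CEPH_OSD_WATCH_OP_UNWATCH +// op on behalf of the timed-out watcher (see Watch::do_watch_timeout()).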
+class WatchTimeoutRequest final : public InternalClientRequest { +public: + WatchTimeoutRequest(WatchRef watch, Ref<PG> pg) + : InternalClientRequest(std::move(pg)), + watch(std::move(watch)) { + } + + const hobject_t& get_target_oid() const final; + PG::do_osd_ops_params_t get_do_osd_ops_params() const final; + std::vector<OSDOp> create_osd_ops() final; + +private: + WatchRef watch; +}; + +const hobject_t& WatchTimeoutRequest::get_target_oid() const +{ + assert(watch->obc); + return watch->obc->get_oid(); +} + +PG::do_osd_ops_params_t +WatchTimeoutRequest::get_do_osd_ops_params() const +{ + osd_reqid_t reqid; + reqid.name = watch->entity_name; + PG::do_osd_ops_params_t params{ + watch->conn, + reqid, + ceph_clock_now(), + get_pg().get_osdmap_epoch(), + entity_inst_t{ watch->entity_name, watch->winfo.addr }, + 0 + }; + logger().debug("{}: params.reqid={}", __func__, params.reqid); + return params; +} + +std::vector<OSDOp> WatchTimeoutRequest::create_osd_ops() +{ + logger().debug("{}", __func__); + assert(watch); + OSDOp osd_op; + osd_op.op.op = CEPH_OSD_OP_WATCH; + osd_op.op.flags = 0; + osd_op.op.watch.op = CEPH_OSD_WATCH_OP_UNWATCH; + osd_op.op.watch.cookie = watch->winfo.cookie; + return std::vector{std::move(osd_op)}; +} + +Watch::~Watch() +{ + logger().debug("{} gid={} cookie={}", __func__, get_watcher_gid(), get_cookie()); +} + +seastar::future<> Watch::connect(crimson::net::ConnectionRef conn, bool) +{ + if (this->conn == conn) { + logger().debug("conn={} already connected", conn); + return seastar::now(); + } + timeout_timer.cancel(); + timeout_timer.arm(std::chrono::seconds{winfo.timeout_seconds}); + this->conn = std::move(conn); + return seastar::now(); +} + +void Watch::disconnect() +{ + ceph_assert(!conn); + timeout_timer.cancel(); + timeout_timer.arm(std::chrono::seconds{winfo.timeout_seconds}); +} + +seastar::future<> Watch::send_notify_msg(NotifyRef notify) +{ + logger().info("{} for notify(id={})", __func__, notify->ninfo.notify_id); + return conn->send(crimson::make_message<MWatchNotify>( + winfo.cookie, + notify->user_version, + notify->ninfo.notify_id, + CEPH_WATCH_EVENT_NOTIFY, + notify->ninfo.bl, + notify->client_gid)); +} + +seastar::future<> Watch::start_notify(NotifyRef notify) +{ + logger().debug("{} gid={} cookie={} starting notify(id={})", + __func__, get_watcher_gid(), get_cookie(), + notify->ninfo.notify_id); + auto [ it, emplaced ] = in_progress_notifies.emplace(std::move(notify)); + ceph_assert(emplaced); + ceph_assert(is_alive()); + return is_connected() ? send_notify_msg(*it) : seastar::now(); +} + +seastar::future<> Watch::notify_ack( + const uint64_t notify_id, + const ceph::bufferlist& reply_bl) +{ + logger().debug("{} gid={} cookie={} notify_id={}", + __func__, get_watcher_gid(), get_cookie(), notify_id); + const auto it = in_progress_notifies.find(notify_id); + if (it == std::end(in_progress_notifies)) { + logger().error("{} notify_id={} not found on the in-progress list."
+ " Supressing but this should not happen.", + __func__, notify_id); + return seastar::now(); + } + auto notify = *it; + logger().debug("Watch::notify_ack gid={} cookie={} found notify(id={})", + get_watcher_gid(), + get_cookie(), + notify->get_id()); + // let's ensure we're extending the life-time till end of this method + static_assert(std::is_same_v<decltype(notify), NotifyRef>); + in_progress_notifies.erase(it); + return notify->complete_watcher(shared_from_this(), reply_bl); +} + +seastar::future<> Watch::send_disconnect_msg() +{ + if (!is_connected()) { + return seastar::now(); + } + ceph::bufferlist empty; + return conn->send(crimson::make_message<MWatchNotify>( + winfo.cookie, + 0, + 0, + CEPH_WATCH_EVENT_DISCONNECT, + empty)); +} + +void Watch::discard_state() +{ + logger().debug("{} gid={} cookie={}", __func__, get_watcher_gid(), get_cookie()); + ceph_assert(obc); + in_progress_notifies.clear(); + timeout_timer.cancel(); +} + +void Watch::got_ping(utime_t) +{ + if (is_connected()) { + // using cancel() + arm() as rearm() has no overload for time delta. + timeout_timer.cancel(); + timeout_timer.arm(std::chrono::seconds{winfo.timeout_seconds}); + } +} + +seastar::future<> Watch::remove() +{ + logger().debug("{} gid={} cookie={}", __func__, get_watcher_gid(), get_cookie()); + // in contrast to ceph-osd crimson sends CEPH_WATCH_EVENT_DISCONNECT directly + // from the timeout handler and _after_ CEPH_WATCH_EVENT_NOTIFY_COMPLETE. + // this simplifies the Watch::remove() interface as callers aren't obliged + // anymore to decide whether EVENT_DISCONNECT needs to be send or not -- it + // becomes an implementation detail of Watch. + return seastar::do_for_each(in_progress_notifies, + [this_shared=shared_from_this()] (auto notify) { + logger().debug("Watch::remove gid={} cookie={} notify(id={})", + this_shared->get_watcher_gid(), + this_shared->get_cookie(), + notify->ninfo.notify_id); + return notify->remove_watcher(this_shared); + }).then([this] { + discard_state(); + return seastar::now(); + }); +} + +void Watch::cancel_notify(const uint64_t notify_id) +{ + logger().debug("{} gid={} cookie={} notify(id={})", + __func__, get_watcher_gid(), get_cookie(), + notify_id); + const auto it = in_progress_notifies.find(notify_id); + assert(it != std::end(in_progress_notifies)); + in_progress_notifies.erase(it); +} + +void Watch::do_watch_timeout() +{ + assert(pg); + auto [op, fut] = pg->get_shard_services().start_operation<WatchTimeoutRequest>( + shared_from_this(), pg); + std::ignore = std::move(fut).then([op=std::move(op), this] { + return send_disconnect_msg(); + }); +} + +bool notify_reply_t::operator<(const notify_reply_t& rhs) const +{ + // comparing std::pairs to emphasize our legacy. ceph-osd stores + // notify_replies as std::multimap<std::pair<gid, cookie>, bl>. + // unfortunately, what seems to be an implementation detail, got + // exposed as part of our public API (the `reply_buffer` parameter + // of the `rados_notify` family). 
+ const auto lhsp = std::make_pair(watcher_gid, watcher_cookie); + const auto rhsp = std::make_pair(rhs.watcher_gid, rhs.watcher_cookie); + return lhsp < rhsp; +} + +std::ostream &operator<<(std::ostream &out, const notify_reply_t &rhs) +{ + out << "notify_reply_t{watcher_gid=" << rhs.watcher_gid + << ", watcher_cookie=" << rhs.watcher_cookie << "}"; + return out; +} + +Notify::Notify(crimson::net::ConnectionRef conn, + const notify_info_t& ninfo, + const uint64_t client_gid, + const uint64_t user_version) + : ninfo(ninfo), + conn(std::move(conn)), + client_gid(client_gid), + user_version(user_version) +{} + +Notify::~Notify() +{ + logger().debug("{} for notify(id={})", __func__, ninfo.notify_id); +} + +seastar::future<> Notify::remove_watcher(WatchRef watch) +{ + logger().debug("{} for notify(id={})", __func__, ninfo.notify_id); + + if (discarded || complete) { + logger().debug("{} for notify(id={}) discarded/complete already" + " discarded: {} complete: {}", __func__, + ninfo.notify_id, discarded ,complete); + return seastar::now(); + } + [[maybe_unused]] const auto num_removed = watchers.erase(watch); + assert(num_removed > 0); + if (watchers.empty()) { + complete = true; + [[maybe_unused]] bool was_armed = timeout_timer.cancel(); + assert(was_armed); + return send_completion(); + } else { + return seastar::now(); + } +} + + +seastar::future<> Notify::complete_watcher( + WatchRef watch, + const ceph::bufferlist& reply_bl) +{ + logger().debug("{} for notify(id={})", __func__, ninfo.notify_id); + + if (discarded || complete) { + logger().debug("{} for notify(id={}) discarded/complete already" + " discarded: {} complete: {}", __func__, + ninfo.notify_id, discarded ,complete); + return seastar::now(); + } + notify_replies.emplace(notify_reply_t{ + watch->get_watcher_gid(), + watch->get_cookie(), + reply_bl}); + return remove_watcher(std::move(watch)); +} + +seastar::future<> Notify::send_completion( + std::set<WatchRef> timedout_watchers) +{ + logger().info("{} -- {} in progress watchers, timedout watchers {}", + __func__, watchers.size(), timedout_watchers.size()); + logger().debug("{} sending notify replies: {}", __func__, notify_replies); + + ceph::bufferlist empty; + auto reply = crimson::make_message<MWatchNotify>( + ninfo.cookie, + user_version, + ninfo.notify_id, + CEPH_WATCH_EVENT_NOTIFY_COMPLETE, + empty, + client_gid); + ceph::bufferlist reply_bl; + { + std::vector<std::pair<uint64_t,uint64_t>> missed; + missed.reserve(std::size(timedout_watchers)); + boost::insert( + missed, std::begin(missed), + timedout_watchers | boost::adaptors::transformed([] (auto w) { + return std::make_pair(w->get_watcher_gid(), w->get_cookie()); + })); + ceph::encode(notify_replies, reply_bl); + ceph::encode(missed, reply_bl); + } + reply->set_data(std::move(reply_bl)); + if (!timedout_watchers.empty()) { + reply->return_code = -ETIMEDOUT; + } + return conn->send(std::move(reply)); +} + +void Notify::do_notify_timeout() +{ + logger().debug("{} complete={}", __func__, complete); + if (complete) { + return; + } + // it might be that `this` is kept alive only because of the reference + // a watcher stores and which is being removed by `cancel_notify()`. + // to avoid use-after-free we bump up the ref counter with `guard_ptr`. 
+ [[maybe_unused]] auto guard_ptr = shared_from_this(); + for (auto& watcher : watchers) { + logger().debug("canceling watcher cookie={} gid={} use_count={}", + watcher->get_cookie(), + watcher->get_watcher_gid(), + watcher->use_count()); + watcher->cancel_notify(ninfo.notify_id); + } + std::ignore = send_completion(std::move(watchers)); + watchers.clear(); +} + +} // namespace crimson::osd + +#if FMT_VERSION >= 90000 +template <> struct fmt::formatter<crimson::osd::WatchTimeoutRequest> : fmt::ostream_formatter {}; +#endif diff --git a/src/crimson/osd/watch.h b/src/crimson/osd/watch.h new file mode 100644 index 000000000..b3982141d --- /dev/null +++ b/src/crimson/osd/watch.h @@ -0,0 +1,256 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <iterator> +#include <map> +#include <set> + +#include <seastar/core/shared_ptr.hh> + +#include "crimson/net/Connection.h" +#include "crimson/osd/object_context.h" +#include "crimson/osd/pg.h" +#include "include/denc.h" + +namespace crimson::osd { + +class Notify; +using NotifyRef = seastar::shared_ptr<Notify>; + +// NOTE: really need to have this public. Otherwise `shared_from_this()` +// will abort. According to cppreference.com: +// +// "The constructors of std::shared_ptr detect the presence +// of an unambiguous and accessible (ie. public inheritance +// is mandatory) (since C++17) enable_shared_from_this base". +// +// I expect the `seastar::shared_ptr` shares this behaviour. +class Watch : public seastar::enable_shared_from_this<Watch> { + // this is a private tag for the public constructor that turns it into + // de facto private one. The motivation behind the hack is make_shared + // used by create(). + struct private_ctag_t{}; + + std::set<NotifyRef, std::less<>> in_progress_notifies; + crimson::net::ConnectionRef conn; + crimson::osd::ObjectContextRef obc; + + watch_info_t winfo; + entity_name_t entity_name; + Ref<PG> pg; + + seastar::timer<seastar::lowres_clock> timeout_timer; + + seastar::future<> start_notify(NotifyRef); + seastar::future<> send_notify_msg(NotifyRef); + seastar::future<> send_disconnect_msg(); + + friend Notify; + friend class WatchTimeoutRequest; + +public: + Watch(private_ctag_t, + crimson::osd::ObjectContextRef obc, + const watch_info_t& winfo, + const entity_name_t& entity_name, + Ref<PG> pg) + : obc(std::move(obc)), + winfo(winfo), + entity_name(entity_name), + pg(std::move(pg)), + timeout_timer([this] { + return do_watch_timeout(); + }) { + assert(this->pg); + } + ~Watch(); + + seastar::future<> connect(crimson::net::ConnectionRef, bool); + void disconnect(); + bool is_alive() const { + return true; + } + bool is_connected() const { + return static_cast<bool>(conn); + } + void got_ping(utime_t); + + void discard_state(); + + seastar::future<> remove(); + + /// Call when notify_ack received on notify_id + seastar::future<> notify_ack( + uint64_t notify_id, ///< [in] id of acked notify + const ceph::bufferlist& reply_bl); ///< [in] notify reply buffer + + template <class... Args> + static seastar::shared_ptr<Watch> create(Args&&... 
args) { + return seastar::make_shared<Watch>(private_ctag_t{}, + std::forward<Args>(args)...); + }; + + uint64_t get_watcher_gid() const { + return entity_name.num(); + } + auto get_pg() const { + return pg; + } + auto& get_entity() const { + return entity_name; + } + auto& get_cookie() const { + return winfo.cookie; + } + auto& get_peer_addr() const { + return winfo.addr; + } + void cancel_notify(const uint64_t notify_id); + void do_watch_timeout(); +}; + +using WatchRef = seastar::shared_ptr<Watch>; + +struct notify_reply_t { + uint64_t watcher_gid; + uint64_t watcher_cookie; + ceph::bufferlist bl; + + bool operator<(const notify_reply_t& rhs) const; + DENC(notify_reply_t, v, p) { + // there is no versioning / preamble + denc(v.watcher_gid, p); + denc(v.watcher_cookie, p); + denc(v.bl, p); + } +}; +std::ostream &operator<<(std::ostream &out, const notify_reply_t &rhs); + +class Notify : public seastar::enable_shared_from_this<Notify> { + std::set<WatchRef> watchers; + const notify_info_t ninfo; + crimson::net::ConnectionRef conn; + const uint64_t client_gid; + const uint64_t user_version; + bool complete{false}; + bool discarded{false}; + seastar::timer<seastar::lowres_clock> timeout_timer{ + [this] { do_notify_timeout(); } + }; + + ~Notify(); + + /// (gid,cookie) -> reply_bl for everyone who acked the notify + std::multiset<notify_reply_t> notify_replies; + + uint64_t get_id() const { return ninfo.notify_id; } + + /// Sends notify completion if watchers.empty() or timeout + seastar::future<> send_completion( + std::set<WatchRef> timedout_watchers = {}); + + /// Called on Notify timeout + void do_notify_timeout(); + + Notify(crimson::net::ConnectionRef conn, + const notify_info_t& ninfo, + const uint64_t client_gid, + const uint64_t user_version); + template <class WatchIteratorT> + Notify(WatchIteratorT begin, + WatchIteratorT end, + crimson::net::ConnectionRef conn, + const notify_info_t& ninfo, + const uint64_t client_gid, + const uint64_t user_version); + // this is a private tag for the public constructor that turns it into + // de facto private one. The motivation behind the hack is make_shared + // used by create_n_propagate factory. + struct private_ctag_t{}; + + using ptr_t = seastar::shared_ptr<Notify>; + friend bool operator<(const ptr_t& lhs, const ptr_t& rhs) { + assert(lhs); + assert(rhs); + return lhs->get_id() < rhs->get_id(); + } + friend bool operator<(const ptr_t& ptr, const uint64_t id) { + assert(ptr); + return ptr->get_id() < id; + } + friend bool operator<(const uint64_t id, const ptr_t& ptr) { + assert(ptr); + return id < ptr->get_id(); + } + + friend Watch; + +public: + template <class... Args> + Notify(private_ctag_t, Args&&... args) : Notify(std::forward<Args>(args)...) { + } + + template <class WatchIteratorT, class... Args> + static seastar::future<> create_n_propagate( + WatchIteratorT begin, + WatchIteratorT end, + Args&&... 
args); + + seastar::future<> remove_watcher(WatchRef watch); + seastar::future<> complete_watcher(WatchRef watch, + const ceph::bufferlist& reply_bl); +}; + + +template <class WatchIteratorT> +Notify::Notify(WatchIteratorT begin, + WatchIteratorT end, + crimson::net::ConnectionRef conn, + const notify_info_t& ninfo, + const uint64_t client_gid, + const uint64_t user_version) + : watchers(begin, end), + ninfo(ninfo), + conn(std::move(conn)), + client_gid(client_gid), + user_version(user_version) { + assert(!std::empty(watchers)); + if (ninfo.timeout) { + timeout_timer.arm(std::chrono::seconds{ninfo.timeout}); + } +} + +template <class WatchIteratorT, class... Args> +seastar::future<> Notify::create_n_propagate( + WatchIteratorT begin, + WatchIteratorT end, + Args&&... args) +{ + static_assert( + std::is_same_v<typename std::iterator_traits<WatchIteratorT>::value_type, + crimson::osd::WatchRef>); + if (begin == end) { + auto notify = seastar::make_shared<Notify>( + private_ctag_t{}, + std::forward<Args>(args)...); + return notify->send_completion(); + } else { + auto notify = seastar::make_shared<Notify>( + private_ctag_t{}, + begin, end, + std::forward<Args>(args)...); + return seastar::do_for_each(begin, end, [=] (auto& watchref) { + return watchref->start_notify(notify); + }); + } +} + +} // namespace crimson::osd + +WRITE_CLASS_DENC(crimson::osd::notify_reply_t) + +#if FMT_VERSION >= 90000 +template <> struct fmt::formatter<crimson::osd::notify_reply_t> : fmt::ostream_formatter {}; +#endif
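Both Watch and Notify above pair public inheritance from seastar::enable_shared_from_this with a private constructor tag, so that the create()/create_n_propagate() factories can keep using seastar::make_shared while direct construction from outside remains impossible. The following standalone sketch only illustrates that idiom: the Widget class is hypothetical and plain std::shared_ptr stands in for seastar's shared pointers.

#include <memory>
#include <utility>

class Widget : public std::enable_shared_from_this<Widget> {
  // Private tag type: only members (and friends) can name it, so only
  // create() can supply the argument the public constructor requires.
  struct private_ctag_t {};
  int value;

public:
  // Public so std::make_shared can call it, yet unusable from outside
  // because private_ctag_t cannot be named there.
  Widget(private_ctag_t, int value) : value(value) {}

  template <class... Args>
  static std::shared_ptr<Widget> create(Args&&... args) {
    return std::make_shared<Widget>(private_ctag_t{},
                                    std::forward<Args>(args)...);
  }

  // Works because the enable_shared_from_this base is public; with a
  // non-public base, shared_from_this() would throw std::bad_weak_ptr.
  std::shared_ptr<Widget> self() { return shared_from_this(); }

  int get() const { return value; }
};

int main() {
  auto w = Widget::create(42);              // goes through the factory
  // Widget{Widget::private_ctag_t{}, 42};  // does not compile: the tag is private
  return w->self()->get() == 42 ? 0 : 1;
}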