Diffstat (limited to 'src/crimson/osd')
67 files changed, 16176 insertions, 0 deletions
diff --git a/src/crimson/osd/CMakeLists.txt b/src/crimson/osd/CMakeLists.txt new file mode 100644 index 000000000..898f70c42 --- /dev/null +++ b/src/crimson/osd/CMakeLists.txt @@ -0,0 +1,57 @@ +add_executable(crimson-osd + backfill_state.cc + ec_backend.cc + heartbeat.cc + main.cc + osd.cc + osd_meta.cc + pg.cc + pg_backend.cc + pg_meta.cc + replicated_backend.cc + shard_services.cc + object_context.cc + ops_executer.cc + osd_operation.cc + osd_operations/client_request.cc + osd_operations/compound_peering_request.cc + osd_operations/peering_event.cc + osd_operations/pg_advance_map.cc + osd_operations/replicated_request.cc + osd_operations/background_recovery.cc + osd_operations/recovery_subrequest.cc + pg_recovery.cc + recovery_backend.cc + replicated_recovery_backend.cc + scheduler/scheduler.cc + scheduler/mclock_scheduler.cc + osdmap_gate.cc + pg_map.cc + objclass.cc + ${PROJECT_SOURCE_DIR}/src/objclass/class_api.cc + ${PROJECT_SOURCE_DIR}/src/osd/ClassHandler.cc + ${PROJECT_SOURCE_DIR}/src/osd/osd_op_util.cc + ${PROJECT_SOURCE_DIR}/src/osd/OSDCap.cc + ${PROJECT_SOURCE_DIR}/src/osd/PeeringState.cc + ${PROJECT_SOURCE_DIR}/src/osd/PGPeeringEvent.cc + ${PROJECT_SOURCE_DIR}/src/osd/PGStateUtils.cc + ${PROJECT_SOURCE_DIR}/src/osd/MissingLoc.cc + ${PROJECT_SOURCE_DIR}/src/osd/PGLog.cc + ${PROJECT_SOURCE_DIR}/src/osd/recovery_types.cc + ${PROJECT_SOURCE_DIR}/src/osd/osd_perf_counters.cc + watch.cc + ) +target_link_libraries(crimson-osd + crimson-admin + crimson-common + crimson-os + crimson + fmt::fmt + Boost::MPL + dmclock::dmclock) +set_target_properties(crimson-osd PROPERTIES + POSITION_INDEPENDENT_CODE ${EXE_LINKER_USE_PIE}) +install(TARGETS crimson-osd DESTINATION bin) +if(WITH_TESTS) + add_dependencies(tests crimson-osd) +endif() diff --git a/src/crimson/osd/acked_peers.h b/src/crimson/osd/acked_peers.h new file mode 100644 index 000000000..b2f2562c0 --- /dev/null +++ b/src/crimson/osd/acked_peers.h @@ -0,0 +1,14 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <vector> + +namespace crimson::osd { + struct peer_shard_t { + pg_shard_t shard; + eversion_t last_complete_ondisk; + }; + using acked_peers_t = std::vector<peer_shard_t>; +} diff --git a/src/crimson/osd/backfill_facades.h b/src/crimson/osd/backfill_facades.h new file mode 100644 index 000000000..683dc6ea6 --- /dev/null +++ b/src/crimson/osd/backfill_facades.h @@ -0,0 +1,73 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "crimson/osd/backfill_state.h" +#include "crimson/osd/pg.h" +#include "osd/PeeringState.h" + +namespace crimson::osd { + +// PeeringFacade -- main implementation of the BackfillState::PeeringFacade +// interface. We have the abstraction to decuple BackfillState from Peering +// State, and thus cut depedencies in unit testing. The second implemention +// is BackfillFixture::PeeringFacade and sits in test_backfill.cc. 
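The test-side implementation mentioned above is not part of this change; as a rough sketch (assuming the same includes as this header, and with the data members invented purely for illustration), a unit-test mock only has to stub the calls BackfillState actually makes:

  struct MockPeeringFacade final : BackfillState::PeeringFacade {
    // canned state a test would populate; these members are hypothetical
    std::set<pg_shard_t> targets;
    std::map<pg_shard_t, hobject_t> peer_last_backfill;
    eversion_t last_update;
    eversion_t log_tail;

    hobject_t earliest_backfill() const override { return hobject_t{}; }
    const std::set<pg_shard_t>& get_backfill_targets() const override {
      return targets;
    }
    const hobject_t& get_peer_last_backfill(pg_shard_t peer) const override {
      return peer_last_backfill.at(peer);
    }
    const eversion_t& get_last_update() const override { return last_update; }
    const eversion_t& get_log_tail() const override { return log_tail; }
    void scan_log_after(eversion_t, scan_log_func_t) const override {}
    bool is_backfill_target(pg_shard_t peer) const override {
      return targets.count(peer) > 0;
    }
    void update_complete_backfill_object_stats(const hobject_t&,
                                               const pg_stat_t&) override {}
    bool is_backfilling() const override { return true; }
  };

The real mock, BackfillFixture::PeeringFacade in test_backfill.cc, may differ in shape; the point is only that nothing in the backfill FSM reaches past this narrow interface.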
+struct PeeringFacade final : BackfillState::PeeringFacade { + PeeringState& peering_state; + + hobject_t earliest_backfill() const override { + return peering_state.earliest_backfill(); + } + + const std::set<pg_shard_t>& get_backfill_targets() const override { + return peering_state.get_backfill_targets(); + } + + const hobject_t& get_peer_last_backfill(pg_shard_t peer) const override { + return peering_state.get_peer_info(peer).last_backfill; + } + + const eversion_t& get_last_update() const override { + return peering_state.get_info().last_update; + } + + const eversion_t& get_log_tail() const override { + return peering_state.get_info().log_tail; + } + + void scan_log_after(eversion_t v, scan_log_func_t f) const override { + peering_state.get_pg_log().get_log().scan_log_after(v, std::move(f)); + } + + bool is_backfill_target(pg_shard_t peer) const override { + return peering_state.is_backfill_target(peer); + } + void update_complete_backfill_object_stats(const hobject_t &hoid, + const pg_stat_t &stats) override { + peering_state.update_complete_backfill_object_stats(hoid, stats); + } + + bool is_backfilling() const override { + return peering_state.is_backfilling(); + } + + PeeringFacade(PeeringState& peering_state) + : peering_state(peering_state) { + } +}; + +// PGFacade -- a facade (in the GoF-defined meaning) simplifying the huge +// interface of crimson's PG class. The motivation is to have an inventory +// of behaviour that must be provided by a unit test's mock. +struct PGFacade final : BackfillState::PGFacade { + PG& pg; + + const eversion_t& get_projected_last_update() const override { + return pg.projected_last_update; + } + + PGFacade(PG& pg) : pg(pg) {} +}; + +} // namespace crimson::osd diff --git a/src/crimson/osd/backfill_state.cc b/src/crimson/osd/backfill_state.cc new file mode 100644 index 000000000..57f845f92 --- /dev/null +++ b/src/crimson/osd/backfill_state.cc @@ -0,0 +1,556 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <algorithm> +#include <boost/type_index.hpp> + +#include "crimson/osd/backfill_state.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_osd); + } +} + +namespace crimson::osd { + +BackfillState::BackfillState( + BackfillState::BackfillListener& backfill_listener, + std::unique_ptr<BackfillState::PeeringFacade> peering_state, + std::unique_ptr<BackfillState::PGFacade> pg) + : backfill_machine(*this, + backfill_listener, + std::move(peering_state), + std::move(pg)), + progress_tracker( + std::make_unique<BackfillState::ProgressTracker>(backfill_machine)) +{ + logger().debug("{}:{}", __func__, __LINE__); + backfill_machine.initiate(); +} + +template <class S> +BackfillState::StateHelper<S>::StateHelper() +{ + logger().debug("enter {}", + boost::typeindex::type_id<S>().pretty_name()); +} + +template <class S> +BackfillState::StateHelper<S>::~StateHelper() +{ + logger().debug("exit {}", + boost::typeindex::type_id<S>().pretty_name()); +} + +BackfillState::~BackfillState() = default; + +BackfillState::BackfillMachine::BackfillMachine( + BackfillState& backfill_state, + BackfillState::BackfillListener& backfill_listener, + std::unique_ptr<BackfillState::PeeringFacade> peering_state, + std::unique_ptr<BackfillState::PGFacade> pg) + : backfill_state(backfill_state), + backfill_listener(backfill_listener), + peering_state(std::move(peering_state)), + pg(std::move(pg)) +{} + +BackfillState::BackfillMachine::~BackfillMachine() = default; + 
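For orientation, a hypothetical call site (the listener, peering_state and pg variables below are assumptions, not names taken from this patch) wires the machine up roughly like this; the BackfillState constructor above already calls initiate(), so the caller only has to deliver Triggered once peering reports that backfill may begin:

  auto backfill = std::make_unique<BackfillState>(
    listener,                                        // some BackfillListener
    std::make_unique<PeeringFacade>(peering_state),  // adapters from
    std::make_unique<PGFacade>(pg));                 // backfill_facades.h
  backfill->process_event(
    boost::intrusive_ptr<const boost::statechart::event_base>(
      new BackfillState::Triggered{}));              // Initial -> Enqueuing/Done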
+BackfillState::Initial::Initial(my_context ctx) + : my_base(ctx) +{ + backfill_state().last_backfill_started = peering_state().earliest_backfill(); + logger().debug("{}: bft={} from {}", + __func__, peering_state().get_backfill_targets(), + backfill_state().last_backfill_started); + for (const auto& bt : peering_state().get_backfill_targets()) { + logger().debug("{}: target shard {} from {}", + __func__, bt, peering_state().get_peer_last_backfill(bt)); + } + ceph_assert(peering_state().get_backfill_targets().size()); + ceph_assert(!backfill_state().last_backfill_started.is_max()); +} + +boost::statechart::result +BackfillState::Initial::react(const BackfillState::Triggered& evt) +{ + logger().debug("{}: backfill triggered", __func__); + ceph_assert(backfill_state().last_backfill_started == \ + peering_state().earliest_backfill()); + ceph_assert(peering_state().is_backfilling()); + // initialize BackfillIntervals + for (const auto& bt : peering_state().get_backfill_targets()) { + backfill_state().peer_backfill_info[bt].reset( + peering_state().get_peer_last_backfill(bt)); + } + backfill_state().backfill_info.reset(backfill_state().last_backfill_started); + if (Enqueuing::all_enqueued(peering_state(), + backfill_state().backfill_info, + backfill_state().peer_backfill_info)) { + logger().debug("{}: switching to Done state", __func__); + return transit<BackfillState::Done>(); + } else { + logger().debug("{}: switching to Enqueuing state", __func__); + return transit<BackfillState::Enqueuing>(); + } +} + + +// -- Enqueuing +void BackfillState::Enqueuing::maybe_update_range() +{ + if (auto& primary_bi = backfill_state().backfill_info; + primary_bi.version >= pg().get_projected_last_update()) { + logger().info("{}: bi is current", __func__); + ceph_assert(primary_bi.version == pg().get_projected_last_update()); + } else if (primary_bi.version >= peering_state().get_log_tail()) { +#if 0 + if (peering_state().get_pg_log().get_log().empty() && + pg().get_projected_log().empty()) { + /* Because we don't move log_tail on split, the log might be + * empty even if log_tail != last_update. However, the only + * way to get here with an empty log is if log_tail is actually + * eversion_t(), because otherwise the entry which changed + * last_update since the last scan would have to be present. 
+ */ + ceph_assert(primary_bi.version == eversion_t()); + return; + } +#endif + logger().debug("{}: bi is old, ({}) can be updated with log to {}", + __func__, + primary_bi.version, + pg().get_projected_last_update()); + logger().debug("{}: scanning pg log first", __func__); + peering_state().scan_log_after(primary_bi.version, + [&](const pg_log_entry_t& e) { + logger().debug("maybe_update_range(lambda): updating from version {}", + e.version); + if (e.soid >= primary_bi.begin && e.soid < primary_bi.end) { + if (e.is_update()) { + logger().debug("maybe_update_range(lambda): {} updated to ver {}", + e.soid, e.version); + primary_bi.objects.erase(e.soid); + primary_bi.objects.insert(std::make_pair(e.soid, + e.version)); + } else if (e.is_delete()) { + logger().debug("maybe_update_range(lambda): {} removed", + e.soid); + primary_bi.objects.erase(e.soid); + } + } + }); + primary_bi.version = pg().get_projected_last_update(); + } else { + ceph_abort_msg( + "scan_range should have raised primary_bi.version past log_tail"); + } +} + +void BackfillState::Enqueuing::trim_backfill_infos() +{ + for (const auto& bt : peering_state().get_backfill_targets()) { + backfill_state().peer_backfill_info[bt].trim_to( + std::max(peering_state().get_peer_last_backfill(bt), + backfill_state().last_backfill_started)); + } + backfill_state().backfill_info.trim_to( + backfill_state().last_backfill_started); +} + +/* static */ bool BackfillState::Enqueuing::all_enqueued( + const PeeringFacade& peering_state, + const BackfillInterval& backfill_info, + const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info) +{ + const bool all_local_enqueued = \ + backfill_info.extends_to_end() && backfill_info.empty(); + const bool all_peer_enqueued = std::all_of( + std::begin(peer_backfill_info), + std::end(peer_backfill_info), + [] (const auto& kv) { + [[maybe_unused]] const auto& [ shard, peer_backfill_info ] = kv; + return peer_backfill_info.extends_to_end() && peer_backfill_info.empty(); + }); + return all_local_enqueued && all_peer_enqueued; +} + +hobject_t BackfillState::Enqueuing::earliest_peer_backfill( + const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info) const +{ + hobject_t e = hobject_t::get_max(); + for (const pg_shard_t& bt : peering_state().get_backfill_targets()) { + const auto iter = peer_backfill_info.find(bt); + ceph_assert(iter != peer_backfill_info.end()); + e = std::min(e, iter->second.begin); + } + return e; +} + +bool BackfillState::Enqueuing::should_rescan_replicas( + const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info, + const BackfillInterval& backfill_info) const +{ + const auto& targets = peering_state().get_backfill_targets(); + return std::any_of(std::begin(targets), std::end(targets), + [&] (const auto& bt) { + return ReplicasScanning::replica_needs_scan(peer_backfill_info.at(bt), + backfill_info); + }); +} + +bool BackfillState::Enqueuing::should_rescan_primary( + const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info, + const BackfillInterval& backfill_info) const +{ + return backfill_info.begin <= earliest_peer_backfill(peer_backfill_info) && + !backfill_info.extends_to_end(); +} + +void BackfillState::Enqueuing::trim_backfilled_object_from_intervals( + BackfillState::Enqueuing::result_t&& result, + hobject_t& last_backfill_started, + std::map<pg_shard_t, BackfillInterval>& peer_backfill_info) +{ + std::for_each(std::begin(result.pbi_targets), std::end(result.pbi_targets), + [&peer_backfill_info] (const auto& bt) { + 
peer_backfill_info.at(bt).pop_front(); + }); + last_backfill_started = std::move(result.new_last_backfill_started); +} + +BackfillState::Enqueuing::result_t +BackfillState::Enqueuing::remove_on_peers(const hobject_t& check) +{ + // set `new_last_backfill_started` to `check` + result_t result { {}, check }; + for (const auto& bt : peering_state().get_backfill_targets()) { + const auto& pbi = backfill_state().peer_backfill_info.at(bt); + if (pbi.begin == check) { + result.pbi_targets.insert(bt); + const auto& version = pbi.objects.begin()->second; + backfill_state().progress_tracker->enqueue_drop(pbi.begin); + backfill_listener().enqueue_drop(bt, pbi.begin, version); + } + } + logger().debug("{}: BACKFILL removing {} from peers {}", + __func__, check, result.pbi_targets); + ceph_assert(!result.pbi_targets.empty()); + return result; +} + +BackfillState::Enqueuing::result_t +BackfillState::Enqueuing::update_on_peers(const hobject_t& check) +{ + logger().debug("{}: check={}", __func__, check); + const auto& primary_bi = backfill_state().backfill_info; + result_t result { {}, primary_bi.begin }; + + for (const auto& bt : peering_state().get_backfill_targets()) { + const auto& peer_bi = backfill_state().peer_backfill_info.at(bt); + + // Find all check peers that have the wrong version + if (const eversion_t& obj_v = primary_bi.objects.begin()->second; + check == primary_bi.begin && check == peer_bi.begin) { + if(peer_bi.objects.begin()->second != obj_v && + backfill_state().progress_tracker->enqueue_push(primary_bi.begin)) { + backfill_listener().enqueue_push(primary_bi.begin, obj_v); + } else { + // it's fine, keep it! OR already recovering + } + result.pbi_targets.insert(bt); + } else { + // Only include peers that we've caught up to their backfill line + // otherwise, they only appear to be missing this object + // because their peer_bi.begin > backfill_info.begin. + if (primary_bi.begin > peering_state().get_peer_last_backfill(bt) && + backfill_state().progress_tracker->enqueue_push(primary_bi.begin)) { + backfill_listener().enqueue_push(primary_bi.begin, obj_v); + } + } + } + return result; +} + +bool BackfillState::Enqueuing::Enqueuing::all_emptied( + const BackfillInterval& local_backfill_info, + const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info) const +{ + const auto& targets = peering_state().get_backfill_targets(); + const auto replicas_emptied = + std::all_of(std::begin(targets), std::end(targets), + [&] (const auto& bt) { + return peer_backfill_info.at(bt).empty(); + }); + return local_backfill_info.empty() && replicas_emptied; +} + +BackfillState::Enqueuing::Enqueuing(my_context ctx) + : my_base(ctx) +{ + auto& primary_bi = backfill_state().backfill_info; + + // update our local interval to cope with recent changes + primary_bi.begin = backfill_state().last_backfill_started; + if (primary_bi.version < peering_state().get_log_tail()) { + // it might be that the OSD is so flooded with modifying operations + // that backfill will be spinning here over and over. For the sake + // of performance and complexity we don't synchronize with entire PG. + // similar can happen in classical OSD. 
+ logger().warn("{}: bi is old, rescanning of local backfill_info", + __func__); + post_event(RequestPrimaryScanning{}); + return; + } else { + maybe_update_range(); + } + trim_backfill_infos(); + + while (!all_emptied(primary_bi, backfill_state().peer_backfill_info)) { + if (!backfill_listener().budget_available()) { + post_event(RequestWaiting{}); + return; + } else if (should_rescan_replicas(backfill_state().peer_backfill_info, + primary_bi)) { + // Count simultaneous scans as a single op and let those complete + post_event(RequestReplicasScanning{}); + return; + } + // Get object within set of peers to operate on and the set of targets + // for which that object applies. + if (const hobject_t check = \ + earliest_peer_backfill(backfill_state().peer_backfill_info); + check < primary_bi.begin) { + // Don't increment ops here because deletions + // are cheap and not replied to unlike real recovery_ops, + // and we can't increment ops without requeueing ourself + // for recovery. + auto result = remove_on_peers(check); + trim_backfilled_object_from_intervals(std::move(result), + backfill_state().last_backfill_started, + backfill_state().peer_backfill_info); + } else { + auto result = update_on_peers(check); + trim_backfilled_object_from_intervals(std::move(result), + backfill_state().last_backfill_started, + backfill_state().peer_backfill_info); + primary_bi.pop_front(); + } + backfill_listener().maybe_flush(); + } + + if (should_rescan_primary(backfill_state().peer_backfill_info, + primary_bi)) { + // need to grab one another chunk of the object namespace and restart + // the queueing. + logger().debug("{}: reached end for current local chunk", + __func__); + post_event(RequestPrimaryScanning{}); + } else if (backfill_state().progress_tracker->tracked_objects_completed()) { + post_event(RequestDone{}); + } else { + logger().debug("{}: reached end for both local and all peers " + "but still has in-flight operations", __func__); + post_event(RequestWaiting{}); + } +} + +// -- PrimaryScanning +BackfillState::PrimaryScanning::PrimaryScanning(my_context ctx) + : my_base(ctx) +{ + backfill_state().backfill_info.version = peering_state().get_last_update(); + backfill_listener().request_primary_scan( + backfill_state().backfill_info.begin); +} + +boost::statechart::result +BackfillState::PrimaryScanning::react(PrimaryScanned evt) +{ + logger().debug("{}", __func__); + backfill_state().backfill_info = std::move(evt.result); + return transit<Enqueuing>(); +} + +boost::statechart::result +BackfillState::PrimaryScanning::react(ObjectPushed evt) +{ + logger().debug("PrimaryScanning::react() on ObjectPushed; evt.object={}", + evt.object); + backfill_state().progress_tracker->complete_to(evt.object, evt.stat); + return discard_event(); +} + +// -- ReplicasScanning +bool BackfillState::ReplicasScanning::replica_needs_scan( + const BackfillInterval& replica_backfill_info, + const BackfillInterval& local_backfill_info) +{ + return replica_backfill_info.empty() && \ + replica_backfill_info.begin <= local_backfill_info.begin && \ + !replica_backfill_info.extends_to_end(); +} + +BackfillState::ReplicasScanning::ReplicasScanning(my_context ctx) + : my_base(ctx) +{ + for (const auto& bt : peering_state().get_backfill_targets()) { + if (const auto& pbi = backfill_state().peer_backfill_info.at(bt); + replica_needs_scan(pbi, backfill_state().backfill_info)) { + logger().debug("{}: scanning peer osd.{} from {}", + __func__, bt, pbi.end); + backfill_listener().request_replica_scan(bt, pbi.end, hobject_t{}); + + 
ceph_assert(waiting_on_backfill.find(bt) == \ + waiting_on_backfill.end()); + waiting_on_backfill.insert(bt); + } + } + ceph_assert(!waiting_on_backfill.empty()); + // TODO: start_recovery_op(hobject_t::get_max()); // XXX: was pbi.end +} + +#if 0 +BackfillState::ReplicasScanning::~ReplicasScanning() +{ + // TODO: finish_recovery_op(hobject_t::get_max()); +} +#endif + +boost::statechart::result +BackfillState::ReplicasScanning::react(ReplicaScanned evt) +{ + logger().debug("{}: got scan result from osd={}, result={}", + __func__, evt.from, evt.result); + // TODO: maybe we'll be able to move waiting_on_backfill from + // the machine to the state. + ceph_assert(peering_state().is_backfill_target(evt.from)); + if (waiting_on_backfill.erase(evt.from)) { + backfill_state().peer_backfill_info[evt.from] = std::move(evt.result); + if (waiting_on_backfill.empty()) { + ceph_assert(backfill_state().peer_backfill_info.size() == \ + peering_state().get_backfill_targets().size()); + return transit<Enqueuing>(); + } + } else { + // we canceled backfill for a while due to a too full, and this + // is an extra response from a non-too-full peer + logger().debug("{}: canceled backfill (too full?)", __func__); + } + return discard_event(); +} + +boost::statechart::result +BackfillState::ReplicasScanning::react(ObjectPushed evt) +{ + logger().debug("ReplicasScanning::react() on ObjectPushed; evt.object={}", + evt.object); + backfill_state().progress_tracker->complete_to(evt.object, evt.stat); + return discard_event(); +} + + +// -- Waiting +BackfillState::Waiting::Waiting(my_context ctx) + : my_base(ctx) +{ +} + +boost::statechart::result +BackfillState::Waiting::react(ObjectPushed evt) +{ + logger().debug("Waiting::react() on ObjectPushed; evt.object={}", + evt.object); + backfill_state().progress_tracker->complete_to(evt.object, evt.stat); + if (!Enqueuing::all_enqueued(peering_state(), + backfill_state().backfill_info, + backfill_state().peer_backfill_info)) { + return transit<Enqueuing>(); + } else if (backfill_state().progress_tracker->tracked_objects_completed()) { + return transit<Done>(); + } else { + // we still have something to wait on + logger().debug("Waiting::react() on ObjectPushed; still waiting"); + return discard_event(); + } +} + +// -- Done +BackfillState::Done::Done(my_context ctx) + : my_base(ctx) +{ + logger().info("{}: backfill is done", __func__); + backfill_listener().backfilled(); +} + +// -- Crashed +BackfillState::Crashed::Crashed() +{ + ceph_abort_msg("{}: this should not happen"); +} + +// ProgressTracker is an intermediary between the BackfillListener and +// BackfillMachine + its states. All requests to push or drop an object +// are directed through it. The same happens with notifications about +// completing given operations which are generated by BackfillListener +// and dispatched as i.e. ObjectPushed events. +// This allows ProgressTacker to track the list of in-flight operations +// which is essential to make the decision whether the entire machine +// should switch from Waiting to Done keep in Waiting. +// ProgressTracker also coordinates .last_backfill_started and stats +// updates. 
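To make the "completions are dispatched as events" convention above concrete, a completion hook on the listener side might look like the following sketch (ListenerImpl, its method name and its backfill_state member are hypothetical; only ObjectPushed and process_event() come from this patch):

  void ListenerImpl::on_local_push_completed(const hobject_t& obj,
                                             const pg_stat_t& stat) {
    auto* evt = new BackfillState::ObjectPushed(obj);
    evt->stat = stat;  // carried with the event, consumed by complete_to()
    backfill_state.process_event(
      boost::intrusive_ptr<const boost::statechart::event_base>(evt));
  }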
+bool BackfillState::ProgressTracker::tracked_objects_completed() const +{ + return registry.empty(); +} + +bool BackfillState::ProgressTracker::enqueue_push(const hobject_t& obj) +{ + [[maybe_unused]] const auto [it, first_seen] = registry.try_emplace( + obj, registry_item_t{op_stage_t::enqueued_push, std::nullopt}); + return first_seen; +} + +void BackfillState::ProgressTracker::enqueue_drop(const hobject_t& obj) +{ + registry.try_emplace( + obj, registry_item_t{op_stage_t::enqueued_drop, pg_stat_t{}}); +} + +void BackfillState::ProgressTracker::complete_to( + const hobject_t& obj, + const pg_stat_t& stats) +{ + logger().debug("{}: obj={}", + __func__, obj); + if (auto completion_iter = registry.find(obj); + completion_iter != std::end(registry)) { + completion_iter->second = \ + registry_item_t{ op_stage_t::completed_push, stats }; + } else { + ceph_abort_msg("completing untracked object shall not happen"); + } + for (auto it = std::begin(registry); + it != std::end(registry) && + it->second.stage != op_stage_t::enqueued_push; + it = registry.erase(it)) { + auto& [soid, item] = *it; + assert(item.stats); + peering_state().update_complete_backfill_object_stats( + soid, + *item.stats); + } + if (Enqueuing::all_enqueued(peering_state(), + backfill_state().backfill_info, + backfill_state().peer_backfill_info) && + tracked_objects_completed()) { + backfill_state().last_backfill_started = hobject_t::get_max(); + backfill_listener().update_peers_last_backfill(hobject_t::get_max()); + } else { + backfill_listener().update_peers_last_backfill(obj); + } +} + +} // namespace crimson::osd diff --git a/src/crimson/osd/backfill_state.h b/src/crimson/osd/backfill_state.h new file mode 100644 index 000000000..4bd2991fb --- /dev/null +++ b/src/crimson/osd/backfill_state.h @@ -0,0 +1,382 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <optional> + +#include <boost/statechart/custom_reaction.hpp> +#include <boost/statechart/event.hpp> +#include <boost/statechart/event_base.hpp> +#include <boost/statechart/simple_state.hpp> +#include <boost/statechart/state.hpp> +#include <boost/statechart/state_machine.hpp> +#include <boost/statechart/transition.hpp> + +#include "osd/recovery_types.h" + +namespace crimson::osd { + +namespace sc = boost::statechart; + +struct BackfillState { + struct BackfillListener; + struct PeeringFacade; + struct PGFacade; + + // events comes first + struct PrimaryScanned : sc::event<PrimaryScanned> { + BackfillInterval result; + PrimaryScanned(BackfillInterval&& result) + : result(std::move(result)) { + } + }; + + struct ReplicaScanned : sc::event<ReplicaScanned> { + pg_shard_t from; + BackfillInterval result; + ReplicaScanned(pg_shard_t from, BackfillInterval&& result) + : from(std::move(from)), + result(std::move(result)) { + } + }; + + struct ObjectPushed : sc::event<ObjectPushed> { + // TODO: implement replica management; I don't want to follow + // current convention where the backend layer is responsible + // for tracking replicas. 
+ hobject_t object; + pg_stat_t stat; + ObjectPushed(hobject_t object) + : object(std::move(object)) { + } + }; + + struct Triggered : sc::event<Triggered> { + }; + +private: + // internal events + struct RequestPrimaryScanning : sc::event<RequestPrimaryScanning> { + }; + + struct RequestReplicasScanning : sc::event<RequestReplicasScanning> { + }; + + struct RequestWaiting : sc::event<RequestWaiting> { + }; + + struct RequestDone : sc::event<RequestDone> { + }; + + class ProgressTracker; + +public: + + struct Initial; + struct Enqueuing; + struct PrimaryScanning; + struct ReplicasScanning; + struct Waiting; + struct Done; + + struct BackfillMachine : sc::state_machine<BackfillMachine, Initial> { + BackfillMachine(BackfillState& backfill_state, + BackfillListener& backfill_listener, + std::unique_ptr<PeeringFacade> peering_state, + std::unique_ptr<PGFacade> pg); + ~BackfillMachine(); + BackfillState& backfill_state; + BackfillListener& backfill_listener; + std::unique_ptr<PeeringFacade> peering_state; + std::unique_ptr<PGFacade> pg; + }; + +private: + template <class S> + struct StateHelper { + StateHelper(); + ~StateHelper(); + + BackfillState& backfill_state() { + return static_cast<S*>(this) \ + ->template context<BackfillMachine>().backfill_state; + } + BackfillListener& backfill_listener() { + return static_cast<S*>(this) \ + ->template context<BackfillMachine>().backfill_listener; + } + PeeringFacade& peering_state() { + return *static_cast<S*>(this) \ + ->template context<BackfillMachine>().peering_state; + } + PGFacade& pg() { + return *static_cast<S*>(this)->template context<BackfillMachine>().pg; + } + + const PeeringFacade& peering_state() const { + return *static_cast<const S*>(this) \ + ->template context<BackfillMachine>().peering_state; + } + const BackfillState& backfill_state() const { + return static_cast<const S*>(this) \ + ->template context<BackfillMachine>().backfill_state; + } + }; + +public: + + // states + struct Crashed : sc::simple_state<Crashed, BackfillMachine>, + StateHelper<Crashed> { + explicit Crashed(); + }; + + struct Initial : sc::state<Initial, BackfillMachine>, + StateHelper<Initial> { + using reactions = boost::mpl::list< + sc::custom_reaction<Triggered>, + sc::transition<sc::event_base, Crashed>>; + explicit Initial(my_context); + // initialize after triggering backfill by on_activate_complete(). + // transit to Enqueuing. + sc::result react(const Triggered&); + }; + + struct Enqueuing : sc::state<Enqueuing, BackfillMachine>, + StateHelper<Enqueuing> { + using reactions = boost::mpl::list< + sc::transition<RequestPrimaryScanning, PrimaryScanning>, + sc::transition<RequestReplicasScanning, ReplicasScanning>, + sc::transition<RequestWaiting, Waiting>, + sc::transition<RequestDone, Done>, + sc::transition<sc::event_base, Crashed>>; + explicit Enqueuing(my_context); + + // indicate whether there is any remaining work to do when it comes + // to comparing the hobject_t namespace between primary and replicas. + // true doesn't necessarily mean backfill is done -- there could be + // in-flight pushes or drops which had been enqueued but aren't + // completed yet. + static bool all_enqueued( + const PeeringFacade& peering_state, + const BackfillInterval& backfill_info, + const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info); + + private: + void maybe_update_range(); + void trim_backfill_infos(); + + // these methods take BackfillIntervals instead of extracting them from + // the state to emphasize the relationships across the main loop. 
+ bool all_emptied( + const BackfillInterval& local_backfill_info, + const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info) const; + hobject_t earliest_peer_backfill( + const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info) const; + bool should_rescan_replicas( + const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info, + const BackfillInterval& backfill_info) const; + // indicate whether a particular acting primary needs to scanned again + // to process next piece of the hobject_t's namespace. + // the logic is per analogy to replica_needs_scan(). See comments there. + bool should_rescan_primary( + const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info, + const BackfillInterval& backfill_info) const; + + // the result_t is intermediary between {remove,update}_on_peers() and + // updating BackfillIntervals in trim_backfilled_object_from_intervals. + // This step is important because it affects the main loop's condition, + // and thus deserves to be exposed instead of being called deeply from + // {remove,update}_on_peers(). + struct [[nodiscard]] result_t { + std::set<pg_shard_t> pbi_targets; + hobject_t new_last_backfill_started; + }; + void trim_backfilled_object_from_intervals( + result_t&&, + hobject_t& last_backfill_started, + std::map<pg_shard_t, BackfillInterval>& peer_backfill_info); + result_t remove_on_peers(const hobject_t& check); + result_t update_on_peers(const hobject_t& check); + }; + + struct PrimaryScanning : sc::state<PrimaryScanning, BackfillMachine>, + StateHelper<PrimaryScanning> { + using reactions = boost::mpl::list< + sc::custom_reaction<ObjectPushed>, + sc::custom_reaction<PrimaryScanned>, + sc::transition<sc::event_base, Crashed>>; + explicit PrimaryScanning(my_context); + sc::result react(ObjectPushed); + // collect scanning result and transit to Enqueuing. + sc::result react(PrimaryScanned); + }; + + struct ReplicasScanning : sc::state<ReplicasScanning, BackfillMachine>, + StateHelper<ReplicasScanning> { + using reactions = boost::mpl::list< + sc::custom_reaction<ObjectPushed>, + sc::custom_reaction<ReplicaScanned>, + sc::transition<sc::event_base, Crashed>>; + explicit ReplicasScanning(my_context); + // collect scanning result; if all results are collected, transition + // to Enqueuing will happen. + sc::result react(ObjectPushed); + sc::result react(ReplicaScanned); + + // indicate whether a particular peer should be scanned to retrieve + // BackfillInterval for new range of hobject_t namespace. + // true when bi.objects is exhausted, replica bi's end is not MAX, + // and primary bi'begin is further than the replica's one. 
+ static bool replica_needs_scan( + const BackfillInterval& replica_backfill_info, + const BackfillInterval& local_backfill_info); + + private: + std::set<pg_shard_t> waiting_on_backfill; + }; + + struct Waiting : sc::state<Waiting, BackfillMachine>, + StateHelper<Waiting> { + using reactions = boost::mpl::list< + sc::custom_reaction<ObjectPushed>, + sc::transition<sc::event_base, Crashed>>; + explicit Waiting(my_context); + sc::result react(ObjectPushed); + }; + + struct Done : sc::state<Done, BackfillMachine>, + StateHelper<Done> { + using reactions = boost::mpl::list< + sc::transition<sc::event_base, Crashed>>; + explicit Done(my_context); + }; + + BackfillState(BackfillListener& backfill_listener, + std::unique_ptr<PeeringFacade> peering_state, + std::unique_ptr<PGFacade> pg); + ~BackfillState(); + + void process_event( + boost::intrusive_ptr<const sc::event_base> evt) { + backfill_machine.process_event(*std::move(evt)); + } + + hobject_t get_last_backfill_started() const { + return last_backfill_started; + } +private: + hobject_t last_backfill_started; + BackfillInterval backfill_info; + std::map<pg_shard_t, BackfillInterval> peer_backfill_info; + BackfillMachine backfill_machine; + std::unique_ptr<ProgressTracker> progress_tracker; +}; + +// BackfillListener -- an interface used by the backfill FSM to request +// low-level services like issueing `MOSDPGPush` or `MOSDPGBackfillRemove`. +// The goals behind the interface are: 1) unittestability; 2) possibility +// to retrofit classical OSD with BackfillState. For the second reason we +// never use `seastar::future` -- instead responses to the requests are +// conveyed as events; see ObjectPushed as an example. +struct BackfillState::BackfillListener { + virtual void request_replica_scan( + const pg_shard_t& target, + const hobject_t& begin, + const hobject_t& end) = 0; + + virtual void request_primary_scan( + const hobject_t& begin) = 0; + + virtual void enqueue_push( + const hobject_t& obj, + const eversion_t& v) = 0; + + virtual void enqueue_drop( + const pg_shard_t& target, + const hobject_t& obj, + const eversion_t& v) = 0; + + virtual void maybe_flush() = 0; + + virtual void update_peers_last_backfill( + const hobject_t& new_last_backfill) = 0; + + virtual bool budget_available() const = 0; + + virtual void backfilled() = 0; + + virtual ~BackfillListener() = default; +}; + +// PeeringFacade -- a facade (in the GoF-defined meaning) simplifying +// the interface of PeeringState. The motivation is to have an inventory +// of behaviour that must be provided by a unit test's mock. +struct BackfillState::PeeringFacade { + virtual hobject_t earliest_backfill() const = 0; + virtual const std::set<pg_shard_t>& get_backfill_targets() const = 0; + virtual const hobject_t& get_peer_last_backfill(pg_shard_t peer) const = 0; + virtual const eversion_t& get_last_update() const = 0; + virtual const eversion_t& get_log_tail() const = 0; + + // the performance impact of `std::function` has not been considered yet. + // If there is any proof (from e.g. profiling) about its significance, we + // can switch back to the template variant. 
+ using scan_log_func_t = std::function<void(const pg_log_entry_t&)>; + virtual void scan_log_after(eversion_t, scan_log_func_t) const = 0; + + virtual bool is_backfill_target(pg_shard_t peer) const = 0; + virtual void update_complete_backfill_object_stats(const hobject_t &hoid, + const pg_stat_t &stats) = 0; + virtual bool is_backfilling() const = 0; + virtual ~PeeringFacade() {} +}; + +// PGFacade -- a facade (in the GoF-defined meaning) simplifying the huge +// interface of crimson's PG class. The motivation is to have an inventory +// of behaviour that must be provided by a unit test's mock. +struct BackfillState::PGFacade { + virtual const eversion_t& get_projected_last_update() const = 0; + virtual ~PGFacade() {} +}; + +class BackfillState::ProgressTracker { + // TODO: apply_stat, + enum class op_stage_t { + enqueued_push, + enqueued_drop, + completed_push, + }; + + struct registry_item_t { + op_stage_t stage; + std::optional<pg_stat_t> stats; + }; + + BackfillMachine& backfill_machine; + std::map<hobject_t, registry_item_t> registry; + + BackfillState& backfill_state() { + return backfill_machine.backfill_state; + } + PeeringFacade& peering_state() { + return *backfill_machine.peering_state; + } + BackfillListener& backfill_listener() { + return backfill_machine.backfill_listener; + } + +public: + ProgressTracker(BackfillMachine& backfill_machine) + : backfill_machine(backfill_machine) { + } + + bool tracked_objects_completed() const; + + bool enqueue_push(const hobject_t&); + void enqueue_drop(const hobject_t&); + void complete_to(const hobject_t&, const pg_stat_t&); +}; + +} // namespace crimson::osd diff --git a/src/crimson/osd/ec_backend.cc b/src/crimson/osd/ec_backend.cc new file mode 100644 index 000000000..c6516d50a --- /dev/null +++ b/src/crimson/osd/ec_backend.cc @@ -0,0 +1,35 @@ +#include "ec_backend.h" + +#include "crimson/osd/shard_services.h" + +ECBackend::ECBackend(shard_id_t shard, + ECBackend::CollectionRef coll, + crimson::osd::ShardServices& shard_services, + const ec_profile_t&, + uint64_t) + : PGBackend{shard, coll, &shard_services.get_store()} +{ + // todo +} + +ECBackend::ll_read_errorator::future<ceph::bufferlist> +ECBackend::_read(const hobject_t& hoid, + const uint64_t off, + const uint64_t len, + const uint32_t flags) +{ + // todo + return seastar::make_ready_future<bufferlist>(); +} + +seastar::future<crimson::osd::acked_peers_t> +ECBackend::_submit_transaction(std::set<pg_shard_t>&& pg_shards, + const hobject_t& hoid, + ceph::os::Transaction&& txn, + const osd_op_params_t& osd_op_p, + epoch_t min_epoch, epoch_t max_epoch, + std::vector<pg_log_entry_t>&& log_entries) +{ + // todo + return seastar::make_ready_future<crimson::osd::acked_peers_t>(); +} diff --git a/src/crimson/osd/ec_backend.h b/src/crimson/osd/ec_backend.h new file mode 100644 index 000000000..e15b19970 --- /dev/null +++ b/src/crimson/osd/ec_backend.h @@ -0,0 +1,38 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <boost/intrusive_ptr.hpp> +#include <seastar/core/future.hh> +#include "include/buffer_fwd.h" +#include "osd/osd_types.h" +#include "pg_backend.h" + +class ECBackend : public PGBackend +{ +public: + ECBackend(shard_id_t shard, + CollectionRef coll, + crimson::osd::ShardServices& shard_services, + const ec_profile_t& ec_profile, + uint64_t stripe_width); + seastar::future<> stop() final { + return seastar::now(); + } + void on_actingset_changed(peering_info_t pi) final {} +private: + 
ll_read_errorator::future<ceph::bufferlist> _read(const hobject_t& hoid, + uint64_t off, + uint64_t len, + uint32_t flags) override; + seastar::future<crimson::osd::acked_peers_t> + _submit_transaction(std::set<pg_shard_t>&& pg_shards, + const hobject_t& hoid, + ceph::os::Transaction&& txn, + const osd_op_params_t& req, + epoch_t min_epoch, epoch_t max_epoch, + std::vector<pg_log_entry_t>&& log_entries) final; + CollectionRef coll; + crimson::os::FuturizedStore* store; +}; diff --git a/src/crimson/osd/exceptions.h b/src/crimson/osd/exceptions.h new file mode 100644 index 000000000..2783ed252 --- /dev/null +++ b/src/crimson/osd/exceptions.h @@ -0,0 +1,46 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <exception> +#include <system_error> + +#include "crimson/common/errorator.h" + +namespace crimson::osd { +class error : private std::system_error { +public: + error(const std::errc ec) + : system_error(std::make_error_code(ec)) { + } + + using system_error::code; + using system_error::what; + + friend error make_error(int ret); + +private: + error(const int ret) noexcept + : system_error(ret, std::system_category()) { + } +}; + +inline error make_error(const int ret) { + return error{ret}; +} + +struct object_not_found : public error { + object_not_found() : error(std::errc::no_such_file_or_directory) {} +}; + +struct invalid_argument : public error { + invalid_argument() : error(std::errc::invalid_argument) {} +}; + +// FIXME: error handling +struct permission_denied : public error { + permission_denied() : error(std::errc::operation_not_permitted) {} +}; + +} // namespace crimson::osd diff --git a/src/crimson/osd/heartbeat.cc b/src/crimson/osd/heartbeat.cc new file mode 100644 index 000000000..81ec06ecd --- /dev/null +++ b/src/crimson/osd/heartbeat.cc @@ -0,0 +1,680 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "heartbeat.h" + +#include <boost/range/join.hpp> + +#include "messages/MOSDPing.h" +#include "messages/MOSDFailure.h" + +#include "crimson/common/config_proxy.h" +#include "crimson/common/formatter.h" +#include "crimson/net/Connection.h" +#include "crimson/net/Messenger.h" +#include "crimson/osd/shard_services.h" +#include "crimson/mon/MonClient.h" + +#include "osd/OSDMap.h" + +using crimson::common::local_conf; + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_osd); + } +} + +Heartbeat::Heartbeat(osd_id_t whoami, + const crimson::osd::ShardServices& service, + crimson::mon::Client& monc, + crimson::net::MessengerRef front_msgr, + crimson::net::MessengerRef back_msgr) + : whoami{whoami}, + service{service}, + monc{monc}, + front_msgr{front_msgr}, + back_msgr{back_msgr}, + // do this in background + timer{[this] { + heartbeat_check(); + (void)send_heartbeats(); + }}, + failing_peers{*this} +{} + +seastar::future<> Heartbeat::start(entity_addrvec_t front_addrs, + entity_addrvec_t back_addrs) +{ + logger().info("heartbeat: start"); + // i only care about the address, so any unused port would work + for (auto& addr : boost::join(front_addrs.v, back_addrs.v)) { + addr.set_port(0); + } + + using crimson::net::SocketPolicy; + front_msgr->set_policy(entity_name_t::TYPE_OSD, + SocketPolicy::lossy_client(0)); + back_msgr->set_policy(entity_name_t::TYPE_OSD, + SocketPolicy::lossy_client(0)); + return seastar::when_all_succeed(start_messenger(*front_msgr, + front_addrs), + start_messenger(*back_msgr, + 
back_addrs)) + .then_unpack([this] { + timer.arm_periodic( + std::chrono::seconds(local_conf()->osd_heartbeat_interval)); + }); +} + +seastar::future<> +Heartbeat::start_messenger(crimson::net::Messenger& msgr, + const entity_addrvec_t& addrs) +{ + return msgr.try_bind(addrs, + local_conf()->ms_bind_port_min, + local_conf()->ms_bind_port_max) + .safe_then([this, &msgr]() mutable { + return msgr.start({this}); + }, crimson::net::Messenger::bind_ertr::all_same_way( + [] (const std::error_code& e) { + logger().error("heartbeat messenger try_bind(): address range is unavailable."); + ceph_abort(); + })); +} + +seastar::future<> Heartbeat::stop() +{ + logger().info("{}", __func__); + timer.cancel(); + front_msgr->stop(); + back_msgr->stop(); + return gate.close().then([this] { + return seastar::when_all_succeed(front_msgr->shutdown(), + back_msgr->shutdown()); + }).then_unpack([] { + return seastar::now(); + }); +} + +const entity_addrvec_t& Heartbeat::get_front_addrs() const +{ + return front_msgr->get_myaddrs(); +} + +const entity_addrvec_t& Heartbeat::get_back_addrs() const +{ + return back_msgr->get_myaddrs(); +} + +void Heartbeat::set_require_authorizer(bool require_authorizer) +{ + if (front_msgr->get_require_authorizer() != require_authorizer) { + front_msgr->set_require_authorizer(require_authorizer); + back_msgr->set_require_authorizer(require_authorizer); + } +} + +void Heartbeat::add_peer(osd_id_t _peer, epoch_t epoch) +{ + assert(whoami != _peer); + auto [iter, added] = peers.try_emplace(_peer, *this, _peer); + auto& peer = iter->second; + peer.set_epoch(epoch); +} + +Heartbeat::osds_t Heartbeat::remove_down_peers() +{ + osds_t old_osds; // osds not added in this epoch + for (auto i = peers.begin(); i != peers.end(); ) { + auto osdmap = service.get_osdmap_service().get_map(); + const auto& [osd, peer] = *i; + if (!osdmap->is_up(osd)) { + i = peers.erase(i); + } else { + if (peer.get_epoch() < osdmap->get_epoch()) { + old_osds.push_back(osd); + } + ++i; + } + } + return old_osds; +} + +void Heartbeat::add_reporter_peers(int whoami) +{ + auto osdmap = service.get_osdmap_service().get_map(); + // include next and previous up osds to ensure we have a fully-connected set + set<int> want; + if (auto next = osdmap->get_next_up_osd_after(whoami); next >= 0) { + want.insert(next); + } + if (auto prev = osdmap->get_previous_up_osd_before(whoami); prev >= 0) { + want.insert(prev); + } + // make sure we have at least **min_down** osds coming from different + // subtree level (e.g., hosts) for fast failure detection. + auto min_down = local_conf().get_val<uint64_t>("mon_osd_min_down_reporters"); + auto subtree = local_conf().get_val<string>("mon_osd_reporter_subtree_level"); + osdmap->get_random_up_osds_by_subtree( + whoami, subtree, min_down, want, &want); + auto epoch = osdmap->get_epoch(); + for (int osd : want) { + add_peer(osd, epoch); + }; +} + +void Heartbeat::update_peers(int whoami) +{ + const auto min_peers = static_cast<size_t>( + local_conf().get_val<int64_t>("osd_heartbeat_min_peers")); + add_reporter_peers(whoami); + auto extra = remove_down_peers(); + // too many? + for (auto& osd : extra) { + if (peers.size() <= min_peers) { + break; + } + remove_peer(osd); + } + // or too few? 
+ auto osdmap = service.get_osdmap_service().get_map(); + auto epoch = osdmap->get_epoch(); + for (auto next = osdmap->get_next_up_osd_after(whoami); + peers.size() < min_peers && next >= 0 && next != whoami; + next = osdmap->get_next_up_osd_after(next)) { + add_peer(next, epoch); + } +} + +Heartbeat::osds_t Heartbeat::get_peers() const +{ + osds_t osds; + osds.reserve(peers.size()); + for (auto& peer : peers) { + osds.push_back(peer.first); + } + return osds; +} + +void Heartbeat::remove_peer(osd_id_t peer) +{ + assert(peers.count(peer) == 1); + peers.erase(peer); +} + +std::optional<seastar::future<>> +Heartbeat::ms_dispatch(crimson::net::ConnectionRef conn, MessageRef m) +{ + bool dispatched = true; + gate.dispatch_in_background(__func__, *this, [this, conn, &m, &dispatched] { + switch (m->get_type()) { + case MSG_OSD_PING: + return handle_osd_ping(conn, boost::static_pointer_cast<MOSDPing>(m)); + default: + dispatched = false; + return seastar::now(); + } + }); + return (dispatched ? std::make_optional(seastar::now()) : std::nullopt); +} + +void Heartbeat::ms_handle_reset(crimson::net::ConnectionRef conn, bool is_replace) +{ + auto peer = conn->get_peer_id(); + if (conn->get_peer_type() != entity_name_t::TYPE_OSD || + peer == entity_name_t::NEW) { + return; + } + if (auto found = peers.find(peer); + found != peers.end()) { + found->second.handle_reset(conn, is_replace); + } +} + +void Heartbeat::ms_handle_connect(crimson::net::ConnectionRef conn) +{ + auto peer = conn->get_peer_id(); + if (conn->get_peer_type() != entity_name_t::TYPE_OSD || + peer == entity_name_t::NEW) { + return; + } + if (auto found = peers.find(peer); + found != peers.end()) { + found->second.handle_connect(conn); + } +} + +void Heartbeat::ms_handle_accept(crimson::net::ConnectionRef conn) +{ + auto peer = conn->get_peer_id(); + if (conn->get_peer_type() != entity_name_t::TYPE_OSD || + peer == entity_name_t::NEW) { + return; + } + if (auto found = peers.find(peer); + found != peers.end()) { + found->second.handle_accept(conn); + } +} + +seastar::future<> Heartbeat::handle_osd_ping(crimson::net::ConnectionRef conn, + Ref<MOSDPing> m) +{ + switch (m->op) { + case MOSDPing::PING: + return handle_ping(conn, m); + case MOSDPing::PING_REPLY: + return handle_reply(conn, m); + case MOSDPing::YOU_DIED: + return handle_you_died(); + default: + return seastar::now(); + } +} + +seastar::future<> Heartbeat::handle_ping(crimson::net::ConnectionRef conn, + Ref<MOSDPing> m) +{ + auto min_message = static_cast<uint32_t>( + local_conf()->osd_heartbeat_min_size); + auto reply = + make_message<MOSDPing>( + m->fsid, + service.get_osdmap_service().get_map()->get_epoch(), + MOSDPing::PING_REPLY, + m->ping_stamp, + m->mono_ping_stamp, + service.get_mnow(), + service.get_osdmap_service().get_up_epoch(), + min_message); + return conn->send(reply); +} + +seastar::future<> Heartbeat::handle_reply(crimson::net::ConnectionRef conn, + Ref<MOSDPing> m) +{ + const osd_id_t from = m->get_source().num(); + auto found = peers.find(from); + if (found == peers.end()) { + // stale reply + return seastar::now(); + } + auto& peer = found->second; + return peer.handle_reply(conn, m); +} + +seastar::future<> Heartbeat::handle_you_died() +{ + // TODO: ask for newer osdmap + return seastar::now(); +} + +void Heartbeat::heartbeat_check() +{ + failure_queue_t failure_queue; + const auto now = clock::now(); + for (const auto& [osd, peer] : peers) { + auto failed_since = peer.failed_since(now); + if (!clock::is_zero(failed_since)) { + failure_queue.emplace(osd, 
failed_since); + } + } + if (!failure_queue.empty()) { + // send_failures can run in background, because + // 1. After the execution of send_failures, no msg is actually + // sent, which means the sending operation is not done, + // which further seems to involve problems risks that when + // osd shuts down, the left part of the sending operation + // may reference OSD and Heartbeat instances that are already + // deleted. However, remaining work of that sending operation + // involves no reference back to OSD or Heartbeat instances, + // which means it wouldn't involve the above risks. + // 2. messages are sent in order, if later checks find out + // the previous "failed" peers to be healthy, that "still + // alive" messages would be sent after the previous "osd + // failure" messages which is totally safe. + (void)send_failures(std::move(failure_queue)); + } +} + +seastar::future<> Heartbeat::send_heartbeats() +{ + const auto mnow = service.get_mnow(); + const auto now = clock::now(); + + std::vector<seastar::future<>> futures; + for (auto& [osd, peer] : peers) { + peer.send_heartbeat(now, mnow, futures); + } + return seastar::when_all_succeed(futures.begin(), futures.end()); +} + +seastar::future<> Heartbeat::send_failures(failure_queue_t&& failure_queue) +{ + std::vector<seastar::future<>> futures; + const auto now = clock::now(); + for (auto [osd, failed_since] : failure_queue) { + failing_peers.add_pending(osd, failed_since, now, futures); + } + + return seastar::when_all_succeed(futures.begin(), futures.end()); +} + +void Heartbeat::print(std::ostream& out) const +{ + out << "heartbeat"; +} + +Heartbeat::Connection::~Connection() +{ + if (conn) { + conn->mark_down(); + } +} + +bool Heartbeat::Connection::matches(crimson::net::ConnectionRef _conn) const +{ + return (conn && conn == _conn); +} + +void Heartbeat::Connection::accepted(crimson::net::ConnectionRef accepted_conn) +{ + if (!conn) { + if (accepted_conn->get_peer_addr() == listener.get_peer_addr(type)) { + logger().info("Heartbeat::Connection::accepted(): " + "{} racing resolved", *this); + conn = accepted_conn; + set_connected(); + } + } else if (conn == accepted_conn) { + set_connected(); + } +} + +void Heartbeat::Connection::replaced() +{ + assert(!is_connected); + auto replaced_conn = conn; + // set the racing connection, will be handled by handle_accept() + conn = msgr.connect(replaced_conn->get_peer_addr(), + replaced_conn->get_peer_name()); + racing_detected = true; + logger().warn("Heartbeat::Connection::replaced(): {} racing", *this); + assert(conn != replaced_conn); + assert(conn->is_connected()); +} + +void Heartbeat::Connection::reset() +{ + conn = nullptr; + if (is_connected) { + is_connected = false; + listener.decrease_connected(); + } + if (!racing_detected || is_winner_side) { + connect(); + } else { + logger().info("Heartbeat::Connection::reset(): " + "{} racing detected and lose, " + "waiting for peer connect me", *this); + } +} + +seastar::future<> Heartbeat::Connection::send(MessageRef msg) +{ + assert(is_connected); + return conn->send(msg); +} + +void Heartbeat::Connection::validate() +{ + assert(is_connected); + auto peer_addr = listener.get_peer_addr(type); + if (conn->get_peer_addr() != peer_addr) { + logger().info("Heartbeat::Connection::validate(): " + "{} has new address {} over {}, reset", + *this, peer_addr, conn->get_peer_addr()); + conn->mark_down(); + racing_detected = false; + reset(); + } +} + +void Heartbeat::Connection::retry() +{ + racing_detected = false; + if (!is_connected) { + if 
(conn) { + conn->mark_down(); + reset(); + } else { + connect(); + } + } +} + +void Heartbeat::Connection::set_connected() +{ + assert(!is_connected); + is_connected = true; + listener.increase_connected(); +} + +void Heartbeat::Connection::connect() +{ + assert(!conn); + auto addr = listener.get_peer_addr(type); + conn = msgr.connect(addr, entity_name_t(CEPH_ENTITY_TYPE_OSD, peer)); + if (conn->is_connected()) { + set_connected(); + } +} + +Heartbeat::clock::time_point +Heartbeat::Session::failed_since(Heartbeat::clock::time_point now) const +{ + if (do_health_screen(now) == health_state::UNHEALTHY) { + auto oldest_deadline = ping_history.begin()->second.deadline; + auto failed_since = std::min(last_rx_back, last_rx_front); + if (clock::is_zero(failed_since)) { + logger().error("Heartbeat::Session::failed_since(): no reply from osd.{} " + "ever on either front or back, first ping sent {} " + "(oldest deadline {})", + peer, first_tx, oldest_deadline); + failed_since = first_tx; + } else { + logger().error("Heartbeat::Session::failed_since(): no reply from osd.{} " + "since back {} front {} (oldest deadline {})", + peer, last_rx_back, last_rx_front, oldest_deadline); + } + return failed_since; + } else { + return clock::zero(); + } +} + +void Heartbeat::Session::set_inactive_history(clock::time_point now) +{ + assert(!connected); + if (ping_history.empty()) { + const utime_t sent_stamp{now}; + const auto deadline = + now + std::chrono::seconds(local_conf()->osd_heartbeat_grace); + ping_history.emplace(sent_stamp, reply_t{deadline, 0}); + } else { // the entry is already added + assert(ping_history.size() == 1); + } +} + +Heartbeat::Peer::Peer(Heartbeat& heartbeat, osd_id_t peer) + : ConnectionListener(2), heartbeat{heartbeat}, peer{peer}, session{peer}, + con_front(peer, heartbeat.whoami > peer, Connection::type_t::front, + *heartbeat.front_msgr, *this), + con_back(peer, heartbeat.whoami > peer, Connection::type_t::back, + *heartbeat.back_msgr, *this) +{ + logger().info("Heartbeat::Peer: osd.{} added", peer); +} + +Heartbeat::Peer::~Peer() +{ + logger().info("Heartbeat::Peer: osd.{} removed", peer); +} + +void Heartbeat::Peer::send_heartbeat( + clock::time_point now, ceph::signedspan mnow, + std::vector<seastar::future<>>& futures) +{ + session.set_tx(now); + if (session.is_started()) { + do_send_heartbeat(now, mnow, &futures); + for_each_conn([] (auto& conn) { + conn.validate(); + }); + } else { + // we should send MOSDPing but still cannot at this moment + if (pending_send) { + // we have already pending for a entire heartbeat interval + logger().warn("Heartbeat::Peer::send_heartbeat(): " + "heartbeat to osd.{} is still pending...", peer); + for_each_conn([] (auto& conn) { + conn.retry(); + }); + } else { + logger().info("Heartbeat::Peer::send_heartbeat(): " + "heartbeat to osd.{} is pending send...", peer); + session.set_inactive_history(now); + pending_send = true; + } + } +} + +seastar::future<> Heartbeat::Peer::handle_reply( + crimson::net::ConnectionRef conn, Ref<MOSDPing> m) +{ + if (!session.is_started()) { + // we haven't sent any ping yet + return seastar::now(); + } + type_t type; + if (con_front.matches(conn)) { + type = type_t::front; + } else if (con_back.matches(conn)) { + type = type_t::back; + } else { + return seastar::now(); + } + const auto now = clock::now(); + if (session.on_pong(m->ping_stamp, type, now)) { + if (session.do_health_screen(now) == Session::health_state::HEALTHY) { + return heartbeat.failing_peers.cancel_one(peer); + } + } + return seastar::now(); +} + 
+entity_addr_t Heartbeat::Peer::get_peer_addr(type_t type) +{ + const auto osdmap = heartbeat.service.get_osdmap_service().get_map(); + if (type == type_t::front) { + return osdmap->get_hb_front_addrs(peer).front(); + } else { + return osdmap->get_hb_back_addrs(peer).front(); + } +} + +void Heartbeat::Peer::on_connected() +{ + logger().info("Heartbeat::Peer: osd.{} connected (send={})", + peer, pending_send); + session.on_connected(); + if (pending_send) { + pending_send = false; + do_send_heartbeat(clock::now(), heartbeat.service.get_mnow(), nullptr); + } +} + +void Heartbeat::Peer::on_disconnected() +{ + logger().info("Heartbeat::Peer: osd.{} disconnected", peer); + session.on_disconnected(); +} + +void Heartbeat::Peer::do_send_heartbeat( + Heartbeat::clock::time_point now, + ceph::signedspan mnow, + std::vector<seastar::future<>>* futures) +{ + const utime_t sent_stamp{now}; + const auto deadline = + now + std::chrono::seconds(local_conf()->osd_heartbeat_grace); + session.on_ping(sent_stamp, deadline); + for_each_conn([&, this] (auto& conn) { + auto min_message = static_cast<uint32_t>( + local_conf()->osd_heartbeat_min_size); + auto ping = make_message<MOSDPing>( + heartbeat.monc.get_fsid(), + heartbeat.service.get_osdmap_service().get_map()->get_epoch(), + MOSDPing::PING, + sent_stamp, + mnow, + mnow, + heartbeat.service.get_osdmap_service().get_up_epoch(), + min_message); + if (futures) { + futures->push_back(conn.send(std::move(ping))); + } + }); +} + +bool Heartbeat::FailingPeers::add_pending( + osd_id_t peer, + clock::time_point failed_since, + clock::time_point now, + std::vector<seastar::future<>>& futures) +{ + if (failure_pending.count(peer)) { + return false; + } + auto failed_for = chrono::duration_cast<chrono::seconds>( + now - failed_since).count(); + auto osdmap = heartbeat.service.get_osdmap_service().get_map(); + auto failure_report = + make_message<MOSDFailure>(heartbeat.monc.get_fsid(), + peer, + osdmap->get_addrs(peer), + static_cast<int>(failed_for), + osdmap->get_epoch()); + failure_pending.emplace(peer, failure_info_t{failed_since, + osdmap->get_addrs(peer)}); + futures.push_back(heartbeat.monc.send_message(failure_report)); + logger().info("{}: osd.{} failed for {}", __func__, peer, failed_for); + return true; +} + +seastar::future<> Heartbeat::FailingPeers::cancel_one(osd_id_t peer) +{ + if (auto pending = failure_pending.find(peer); + pending != failure_pending.end()) { + auto fut = send_still_alive(peer, pending->second.addrs); + failure_pending.erase(peer); + return fut; + } + return seastar::now(); +} + +seastar::future<> +Heartbeat::FailingPeers::send_still_alive( + osd_id_t osd, const entity_addrvec_t& addrs) +{ + auto still_alive = make_message<MOSDFailure>( + heartbeat.monc.get_fsid(), + osd, + addrs, + 0, + heartbeat.service.get_osdmap_service().get_map()->get_epoch(), + MOSDFailure::FLAG_ALIVE); + logger().info("{}: osd.{}", __func__, osd); + return heartbeat.monc.send_message(still_alive); +} diff --git a/src/crimson/osd/heartbeat.h b/src/crimson/osd/heartbeat.h new file mode 100644 index 000000000..4947e871f --- /dev/null +++ b/src/crimson/osd/heartbeat.h @@ -0,0 +1,455 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <cstdint> +#include <seastar/core/future.hh> +#include "common/ceph_time.h" +#include "crimson/common/gated.h" +#include "crimson/net/Dispatcher.h" +#include "crimson/net/Fwd.h" + +class MOSDPing; + +namespace crimson::osd { + class ShardServices; +} + 
+namespace crimson::mon { + class Client; +} + +template<typename Message> using Ref = boost::intrusive_ptr<Message>; + +class Heartbeat : public crimson::net::Dispatcher { +public: + using osd_id_t = int; + + Heartbeat(osd_id_t whoami, + const crimson::osd::ShardServices& service, + crimson::mon::Client& monc, + crimson::net::MessengerRef front_msgr, + crimson::net::MessengerRef back_msgr); + + seastar::future<> start(entity_addrvec_t front, + entity_addrvec_t back); + seastar::future<> stop(); + + using osds_t = std::vector<osd_id_t>; + void add_peer(osd_id_t peer, epoch_t epoch); + void update_peers(int whoami); + void remove_peer(osd_id_t peer); + osds_t get_peers() const; + + const entity_addrvec_t& get_front_addrs() const; + const entity_addrvec_t& get_back_addrs() const; + + void set_require_authorizer(bool); + + // Dispatcher methods + std::optional<seastar::future<>> ms_dispatch( + crimson::net::ConnectionRef conn, MessageRef m) override; + void ms_handle_reset(crimson::net::ConnectionRef conn, bool is_replace) override; + void ms_handle_connect(crimson::net::ConnectionRef conn) override; + void ms_handle_accept(crimson::net::ConnectionRef conn) override; + + void print(std::ostream&) const; +private: + seastar::future<> handle_osd_ping(crimson::net::ConnectionRef conn, + Ref<MOSDPing> m); + seastar::future<> handle_ping(crimson::net::ConnectionRef conn, + Ref<MOSDPing> m); + seastar::future<> handle_reply(crimson::net::ConnectionRef conn, + Ref<MOSDPing> m); + seastar::future<> handle_you_died(); + + /// remove down OSDs + /// @return peers not added in this epoch + osds_t remove_down_peers(); + /// add enough reporters for fast failure detection + void add_reporter_peers(int whoami); + + seastar::future<> start_messenger(crimson::net::Messenger& msgr, + const entity_addrvec_t& addrs); +private: + const osd_id_t whoami; + const crimson::osd::ShardServices& service; + crimson::mon::Client& monc; + crimson::net::MessengerRef front_msgr; + crimson::net::MessengerRef back_msgr; + + seastar::timer<seastar::lowres_clock> timer; + // use real_clock so it can be converted to utime_t + using clock = ceph::coarse_real_clock; + + class ConnectionListener; + class Connection; + class Session; + class Peer; + using peers_map_t = std::map<osd_id_t, Peer>; + peers_map_t peers; + + // osds which are considered failed + // osd_id => when was the last time that both front and back pings were acked + // or sent. 
+ // use for calculating how long the OSD has been unresponsive + using failure_queue_t = std::map<osd_id_t, clock::time_point>; + seastar::future<> send_failures(failure_queue_t&& failure_queue); + seastar::future<> send_heartbeats(); + void heartbeat_check(); + + // osds we've reported to monior as failed ones, but they are not marked down + // yet + crimson::common::Gated gate; + + class FailingPeers { + public: + FailingPeers(Heartbeat& heartbeat) : heartbeat(heartbeat) {} + bool add_pending(osd_id_t peer, + clock::time_point failed_since, + clock::time_point now, + std::vector<seastar::future<>>& futures); + seastar::future<> cancel_one(osd_id_t peer); + + private: + seastar::future<> send_still_alive(osd_id_t, const entity_addrvec_t&); + + Heartbeat& heartbeat; + + struct failure_info_t { + clock::time_point failed_since; + entity_addrvec_t addrs; + }; + std::map<osd_id_t, failure_info_t> failure_pending; + } failing_peers; +}; + +inline std::ostream& operator<<(std::ostream& out, const Heartbeat& hb) { + hb.print(out); + return out; +} + +/* + * Event driven interface for Heartbeat::Peer to be notified when both hb_front + * and hb_back are connected, or connection is lost. + */ +class Heartbeat::ConnectionListener { + public: + ConnectionListener(size_t connections) : connections{connections} {} + + void increase_connected() { + assert(connected < connections); + ++connected; + if (connected == connections) { + on_connected(); + } + } + void decrease_connected() { + assert(connected > 0); + if (connected == connections) { + on_disconnected(); + } + --connected; + } + enum class type_t { front, back }; + virtual entity_addr_t get_peer_addr(type_t) = 0; + + protected: + virtual void on_connected() = 0; + virtual void on_disconnected() = 0; + + private: + const size_t connections; + size_t connected = 0; +}; + +class Heartbeat::Connection { + public: + using type_t = ConnectionListener::type_t; + Connection(osd_id_t peer, bool is_winner_side, type_t type, + crimson::net::Messenger& msgr, + ConnectionListener& listener) + : peer{peer}, type{type}, + msgr{msgr}, listener{listener}, + is_winner_side{is_winner_side} { + connect(); + } + Connection(const Connection&) = delete; + Connection(Connection&&) = delete; + Connection& operator=(const Connection&) = delete; + Connection& operator=(Connection&&) = delete; + + ~Connection(); + + bool matches(crimson::net::ConnectionRef _conn) const; + void connected() { + set_connected(); + } + void accepted(crimson::net::ConnectionRef); + void replaced(); + void reset(); + seastar::future<> send(MessageRef msg); + void validate(); + // retry connection if still pending + void retry(); + + private: + void set_connected(); + void connect(); + + const osd_id_t peer; + const type_t type; + crimson::net::Messenger& msgr; + ConnectionListener& listener; + +/* + * Resolve the following racing when both me and peer are trying to connect + * each other symmetrically, under SocketPolicy::lossy_client: + * + * OSD.A OSD.B + * - - + * |-[1]----> <----[2]-| + * \ / + * \ / + * delay.. X delay.. + * / \ + * |-[1]x> / \ <x[2]-| + * |<-[2]--- ---[1]->| + * |(reset#1) (reset#2)| + * |(reconnectB) (reconnectA)| + * |-[2]---> <---[1]-| + * delay.. delay.. + * (remote close populated) + * |-[2]x> <x[1]-| + * |(reset#2) (reset#1)| + * | ... ... | + * (dead loop!) + * + * Our solution is to remember if such racing was happened recently, and + * establish connection asymmetrically only from the winner side whose osd-id + * is larger. 
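+ *
+ * For example, if osd.3 and osd.5 race as above, only osd.5 (whoami > peer,
+ * i.e. the winner side) re-establishes its connections; osd.3 just waits to
+ * accept them, so the mutual reset/reconnect loop cannot repeat.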
+ */ + const bool is_winner_side; + bool racing_detected = false; + + crimson::net::ConnectionRef conn; + bool is_connected = false; + + friend std::ostream& operator<<(std::ostream& os, const Connection& c) { + if (c.type == type_t::front) { + return os << "con_front(osd." << c.peer << ")"; + } else { + return os << "con_back(osd." << c.peer << ")"; + } + } +}; + +/* + * Track the ping history and ping reply (the pong) from the same session, clean up + * history once hb_front or hb_back loses connection and restart the session once + * both connections are connected again. + * + * We cannot simply remove the entire Heartbeat::Peer once hb_front or hb_back + * loses connection, because we would end up with the following deadloop: + * + * OSD.A OSD.B + * - - + * hb_front reset <--(network)--- hb_front close + * | ^ + * | | + * remove Peer B (dead loop!) remove Peer A + * | | + * V | + * hb_back close ----(network)---> hb_back reset + */ +class Heartbeat::Session { + public: + Session(osd_id_t peer) : peer{peer} {} + + void set_epoch(epoch_t epoch_) { epoch = epoch_; } + epoch_t get_epoch() const { return epoch; } + bool is_started() const { return connected; } + bool pinged() const { + if (clock::is_zero(first_tx)) { + // i can never receive a pong without sending any ping message first. + assert(clock::is_zero(last_rx_front) && + clock::is_zero(last_rx_back)); + return false; + } else { + return true; + } + } + + enum class health_state { + UNKNOWN, + UNHEALTHY, + HEALTHY, + }; + health_state do_health_screen(clock::time_point now) const { + if (!pinged()) { + // we are not healty nor unhealty because we haven't sent anything yet + return health_state::UNKNOWN; + } else if (!ping_history.empty() && ping_history.begin()->second.deadline < now) { + return health_state::UNHEALTHY; + } else if (!clock::is_zero(last_rx_front) && + !clock::is_zero(last_rx_back)) { + // only declare to be healthy until we have received the first + // replies from both front/back connections + return health_state::HEALTHY; + } else { + return health_state::UNKNOWN; + } + } + + clock::time_point failed_since(clock::time_point now) const; + + void set_tx(clock::time_point now) { + if (!pinged()) { + first_tx = now; + } + last_tx = now; + } + + void on_connected() { + assert(!connected); + connected = true; + ping_history.clear(); + } + + void on_ping(const utime_t& sent_stamp, + const clock::time_point& deadline) { + assert(connected); + [[maybe_unused]] auto [reply, added] = + ping_history.emplace(sent_stamp, reply_t{deadline, 2}); + } + + bool on_pong(const utime_t& ping_stamp, + Connection::type_t type, + clock::time_point now) { + assert(connected); + auto ping = ping_history.find(ping_stamp); + if (ping == ping_history.end()) { + // old replies, deprecated by newly sent pings. + return false; + } + auto& unacked = ping->second.unacknowledged; + assert(unacked); + if (type == Connection::type_t::front) { + last_rx_front = now; + unacked--; + } else { + last_rx_back = now; + unacked--; + } + if (unacked == 0) { + ping_history.erase(ping_history.begin(), ++ping); + } + return true; + } + + void on_disconnected() { + assert(connected); + connected = false; + if (!ping_history.empty()) { + // we lost our ping_history of the last session, but still need to keep + // the oldest deadline for unhealthy check. 
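+ // (e.g. if a ping sent at t0 was still unacknowledged when the connection
+ // dropped, the t0 deadline is preserved below, so do_health_screen() can
+ // still turn UNHEALTHY once that deadline passes.)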
+ auto oldest = ping_history.begin(); + auto sent_stamp = oldest->first; + auto deadline = oldest->second.deadline; + ping_history.clear(); + ping_history.emplace(sent_stamp, reply_t{deadline, 0}); + } + } + + // maintain an entry in ping_history for unhealthy check + void set_inactive_history(clock::time_point); + + private: + const osd_id_t peer; + bool connected = false; + // time we sent our first ping request + clock::time_point first_tx; + // last time we sent a ping request + clock::time_point last_tx; + // last time we got a ping reply on the front side + clock::time_point last_rx_front; + // last time we got a ping reply on the back side + clock::time_point last_rx_back; + // most recent epoch we wanted this peer + epoch_t epoch; + + struct reply_t { + clock::time_point deadline; + // one sent over front conn, another sent over back conn + uint8_t unacknowledged = 0; + }; + // history of inflight pings, arranging by timestamp we sent + std::map<utime_t, reply_t> ping_history; +}; + +class Heartbeat::Peer final : private Heartbeat::ConnectionListener { + public: + Peer(Heartbeat&, osd_id_t); + ~Peer(); + Peer(Peer&&) = delete; + Peer(const Peer&) = delete; + Peer& operator=(Peer&&) = delete; + Peer& operator=(const Peer&) = delete; + + void set_epoch(epoch_t epoch) { session.set_epoch(epoch); } + epoch_t get_epoch() const { return session.get_epoch(); } + + // if failure, return time_point since last active + // else, return clock::zero() + clock::time_point failed_since(clock::time_point now) const { + return session.failed_since(now); + } + void send_heartbeat( + clock::time_point, ceph::signedspan, std::vector<seastar::future<>>&); + seastar::future<> handle_reply(crimson::net::ConnectionRef, Ref<MOSDPing>); + void handle_reset(crimson::net::ConnectionRef conn, bool is_replace) { + for_each_conn([&] (auto& _conn) { + if (_conn.matches(conn)) { + if (is_replace) { + _conn.replaced(); + } else { + _conn.reset(); + } + } + }); + } + void handle_connect(crimson::net::ConnectionRef conn) { + for_each_conn([&] (auto& _conn) { + if (_conn.matches(conn)) { + _conn.connected(); + } + }); + } + void handle_accept(crimson::net::ConnectionRef conn) { + for_each_conn([&] (auto& _conn) { + _conn.accepted(conn); + }); + } + + private: + entity_addr_t get_peer_addr(type_t type) override; + void on_connected() override; + void on_disconnected() override; + void do_send_heartbeat( + clock::time_point, ceph::signedspan, std::vector<seastar::future<>>*); + + template <typename Func> + void for_each_conn(Func&& f) { + f(con_front); + f(con_back); + } + + Heartbeat& heartbeat; + const osd_id_t peer; + Session session; + // if need to send heartbeat when session connected + bool pending_send = false; + Connection con_front; + Connection con_back; +}; diff --git a/src/crimson/osd/main.cc b/src/crimson/osd/main.cc new file mode 100644 index 000000000..a90903e72 --- /dev/null +++ b/src/crimson/osd/main.cc @@ -0,0 +1,230 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#include <sys/types.h> +#include <unistd.h> + +#include <iostream> +#include <random> + +#include <seastar/core/app-template.hh> +#include <seastar/core/print.hh> +#include <seastar/core/thread.hh> +#include <seastar/util/std-compat.hh> + +#include "auth/KeyRing.h" +#include "common/ceph_argparse.h" +#include "crimson/common/buffer_io.h" +#include "crimson/common/config_proxy.h" +#include "crimson/net/Messenger.h" +#include "global/pidfile.h" + +#include "osd.h" + +using config_t = 
crimson::common::ConfigProxy; + +void usage(const char* prog) { + std::cout << "usage: " << prog << " -i <ID>\n" + << " --help-seastar show Seastar help messages\n"; + generic_server_usage(); +} + +auto partition_args(seastar::app_template& app, char** argv_begin, char** argv_end) +{ + namespace bpo = boost::program_options; + // collect all options consumed by seastar::app_template + auto parsed = bpo::command_line_parser(std::distance(argv_begin, argv_end), + argv_begin) + .options(app.get_options_description()).allow_unregistered().run(); + auto unknown_args = bpo::collect_unrecognized(parsed.options, + bpo::include_positional); + std::vector<const char*> ceph_args, app_args; + // ceph_argparse_early_args() and + // seastar::smp::get_options_description() use "-c" for different + // options. and ceph wins + auto consume_conf_arg = [&](char** argv) { + if (std::strcmp(*argv, "-c") == 0) { + ceph_args.push_back(*argv++); + if (argv != argv_end) { + ceph_args.push_back(*argv++); + } + } + return argv; + }; + auto unknown = unknown_args.begin(); + auto consume_unknown_arg = [&](char** argv) { + for (; unknown != unknown_args.end() && + argv != argv_end && + *unknown == *argv; ++argv, ++unknown) { + if (std::strcmp(*argv, "--help-seastar") == 0) { + app_args.push_back("--help"); + } else { + ceph_args.push_back(*argv); + } + } + return argv; + }; + for (auto argv = argv_begin; argv != argv_end;) { + if (auto next_arg = consume_conf_arg(argv); next_arg != argv) { + argv = next_arg; + } else if (auto next_arg = consume_unknown_arg(argv); next_arg != argv) { + argv = next_arg; + } else { + app_args.push_back(*argv++); + } + } + return make_pair(std::move(ceph_args), std::move(app_args)); +} + +using crimson::common::local_conf; + +seastar::future<> make_keyring() +{ + const auto path = local_conf().get_val<string>("keyring"); + return seastar::file_exists(path).then([path](bool exists) { + KeyRing keyring; + EntityName name{local_conf()->name}; + EntityAuth auth; + if (exists && + keyring.load(nullptr, path) == 0 && + keyring.get_auth(name, auth)) { + seastar::fprint(std::cerr, "already have key in keyring: %s\n", path); + return seastar::now(); + } else { + auth.key.create(std::make_unique<CephContext>().get(), CEPH_CRYPTO_AES); + keyring.add(name, auth); + bufferlist bl; + keyring.encode_plaintext(bl); + const auto permissions = (seastar::file_permissions::user_read | + seastar::file_permissions::user_write); + return crimson::write_file(std::move(bl), path, permissions); + } + }).handle_exception_type([path](const std::filesystem::filesystem_error& e) { + seastar::fprint(std::cerr, "FATAL: writing new keyring to %s: %s\n", path, e.what()); + throw e; + }); +} + +uint64_t get_nonce() +{ + if (auto pid = getpid(); pid != 1) { + return pid; + } else { + // we're running in a container; use a random number instead! + std::random_device rd; + std::default_random_engine rng{rd()}; + return std::uniform_int_distribution<uint64_t>{}(rng); + } +} + +int main(int argc, char* argv[]) +{ + seastar::app_template app; + app.add_options() + ("mkkey", "generate a new secret key. 
" + "This is normally used in combination with --mkfs") + ("mkfs", "create a [new] data directory") + ("debug", "enable debug output on all loggers"); + + auto [ceph_args, app_args] = partition_args(app, argv, argv + argc); + if (ceph_argparse_need_usage(ceph_args) && + std::find(app_args.begin(), app_args.end(), "--help") == app_args.end()) { + usage(argv[0]); + return EXIT_SUCCESS; + } + std::string cluster_name{"ceph"}; + std::string conf_file_list; + // ceph_argparse_early_args() could _exit(), while local_conf() won't ready + // until it's started. so do the boilerplate-settings parsing here. + auto init_params = ceph_argparse_early_args(ceph_args, + CEPH_ENTITY_TYPE_OSD, + &cluster_name, + &conf_file_list); + seastar::sharded<crimson::osd::OSD> osd; + using crimson::common::sharded_conf; + using crimson::common::sharded_perf_coll; + try { + return app.run_deprecated(app_args.size(), const_cast<char**>(app_args.data()), + [&, &ceph_args=ceph_args] { + auto& config = app.configuration(); + return seastar::async([&] { + if (config.count("debug")) { + seastar::global_logger_registry().set_all_loggers_level( + seastar::log_level::debug + ); + } + sharded_conf().start(init_params.name, cluster_name).get(); + seastar::engine().at_exit([] { + return sharded_conf().stop(); + }); + sharded_perf_coll().start().get(); + seastar::engine().at_exit([] { + return sharded_perf_coll().stop(); + }); + local_conf().parse_config_files(conf_file_list).get(); + local_conf().parse_argv(ceph_args).get(); + if (const auto ret = pidfile_write(local_conf()->pid_file); + ret == -EACCES || ret == -EAGAIN) { + ceph_abort_msg( + "likely there is another crimson-osd instance with the same id"); + } else if (ret < 0) { + ceph_abort_msg(fmt::format("pidfile_write failed with {} {}", + ret, cpp_strerror(-ret))); + } + // just ignore SIGHUP, we don't reread settings + seastar::engine().handle_signal(SIGHUP, [] {}); + const int whoami = std::stoi(local_conf()->name.get_id()); + const auto nonce = get_nonce(); + crimson::net::MessengerRef cluster_msgr, client_msgr; + crimson::net::MessengerRef hb_front_msgr, hb_back_msgr; + for (auto [msgr, name] : {make_pair(std::ref(cluster_msgr), "cluster"s), + make_pair(std::ref(client_msgr), "client"s), + make_pair(std::ref(hb_front_msgr), "hb_front"s), + make_pair(std::ref(hb_back_msgr), "hb_back"s)}) { + msgr = crimson::net::Messenger::create(entity_name_t::OSD(whoami), name, + nonce); + if (local_conf()->ms_crc_data) { + msgr->set_crc_data(); + } + if (local_conf()->ms_crc_header) { + msgr->set_crc_header(); + } + } + osd.start_single(whoami, nonce, + cluster_msgr, client_msgr, + hb_front_msgr, hb_back_msgr).get(); + if (config.count("mkkey")) { + make_keyring().handle_exception([](std::exception_ptr) { + seastar::engine().exit(1); + }).get(); + } + if (config.count("mkfs")) { + osd.invoke_on( + 0, + &crimson::osd::OSD::mkfs, + local_conf().get_val<uuid_d>("osd_uuid"), + local_conf().get_val<uuid_d>("fsid")).get(); + } + seastar::engine().at_exit([&] { + return osd.stop(); + }); + if (config.count("mkkey") || config.count("mkfs")) { + seastar::engine().exit(0); + } else { + osd.invoke_on(0, &crimson::osd::OSD::start).get(); + } + }); + }); + } catch (...) 
{ + seastar::fprint(std::cerr, "FATAL: Exception during startup, aborting: %s\n", std::current_exception()); + return EXIT_FAILURE; + } +} + +/* + * Local Variables: + * compile-command: "make -j4 \ + * -C ../../../build \ + * crimson-osd" + * End: + */ diff --git a/src/crimson/osd/objclass.cc b/src/crimson/osd/objclass.cc new file mode 100644 index 000000000..bc3284e26 --- /dev/null +++ b/src/crimson/osd/objclass.cc @@ -0,0 +1,484 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <cstdarg> +#include <cstring> +#include "common/ceph_context.h" +#include "common/ceph_releases.h" +#include "common/config.h" +#include "common/debug.h" + +#include "crimson/osd/exceptions.h" +#include "crimson/osd/ops_executer.h" +#include "crimson/osd/pg_backend.h" + +#include "objclass/objclass.h" +#include "osd/ClassHandler.h" + +#include "auth/Crypto.h" +#include "common/armor.h" + +static inline int execute_osd_op(cls_method_context_t hctx, OSDOp& op) +{ + // we can expect the memory under `ret` will be still fine after + // executing the osd op as we're running inside `seastar::thread` + // created for us by `seastar::async` in `::do_op_call()`. + int ret = 0; + using osd_op_errorator = crimson::osd::OpsExecuter::osd_op_errorator; + reinterpret_cast<crimson::osd::OpsExecuter*>(hctx)->execute_op(op).handle_error( + osd_op_errorator::all_same_way([&ret] (const std::error_code& err) { + assert(err.value() > 0); + ret = -err.value(); + return seastar::now(); + })).get(); // we're blocking here which requires `seastar::thread`. + return ret; +} + +int cls_call(cls_method_context_t hctx, const char *cls, const char *method, + char *indata, int datalen, + char **outdata, int *outdatalen) +{ +// FIXME, HACK: this is for testing only. Let's use dynamic linker to verify +// our depedencies + return 0; +} + +int cls_getxattr(cls_method_context_t hctx, + const char *name, + char **outdata, + int *outdatalen) +{ + return 0; +} + +int cls_setxattr(cls_method_context_t hctx, + const char *name, + const char *value, + int val_len) +{ + return 0; +} + +int cls_read(cls_method_context_t hctx, + int ofs, int len, + char **outdata, + int *outdatalen) +{ + return 0; +} + +int cls_get_request_origin(cls_method_context_t hctx, entity_inst_t *origin) +{ + assert(origin); + + try { + const auto& message = \ + reinterpret_cast<crimson::osd::OpsExecuter*>(hctx)->get_message(); + *origin = message.get_orig_source_inst(); + return 0; + } catch (crimson::osd::error& e) { + return -e.code().value(); + } +} + +int cls_cxx_create(cls_method_context_t hctx, const bool exclusive) +{ + OSDOp op{CEPH_OSD_OP_CREATE}; + op.op.flags = (exclusive ? 
CEPH_OSD_OP_FLAG_EXCL : 0); + return execute_osd_op(hctx, op); +} + +int cls_cxx_remove(cls_method_context_t hctx) +{ + OSDOp op{CEPH_OSD_OP_DELETE}; + return execute_osd_op(hctx, op); +} + +int cls_cxx_stat(cls_method_context_t hctx, uint64_t *size, time_t *mtime) +{ + OSDOp op{CEPH_OSD_OP_STAT}; + if (const auto ret = execute_osd_op(hctx, op); ret < 0) { + return ret; + } + utime_t ut; + uint64_t s; + try { + auto iter = op.outdata.cbegin(); + decode(s, iter); + decode(ut, iter); + } catch (buffer::error& err) { + return -EIO; + } + if (size) { + *size = s; + } + if (mtime) { + *mtime = ut.sec(); + } + return 0; +} + +int cls_cxx_stat2(cls_method_context_t hctx, + uint64_t *size, + ceph::real_time *mtime) +{ + return 0; +} + +int cls_cxx_read2(cls_method_context_t hctx, + int ofs, + int len, + bufferlist *outbl, + uint32_t op_flags) +{ + OSDOp op{CEPH_OSD_OP_SYNC_READ}; + op.op.extent.offset = ofs; + op.op.extent.length = len; + op.op.flags = op_flags; + if (const auto ret = execute_osd_op(hctx, op); ret < 0) { + return ret; + } + *outbl = std::move(op.outdata); + return outbl->length(); +} + +int cls_cxx_write2(cls_method_context_t hctx, + int ofs, + int len, + bufferlist *inbl, + uint32_t op_flags) +{ + OSDOp op{CEPH_OSD_OP_WRITE}; + op.op.extent.offset = ofs; + op.op.extent.length = len; + op.op.flags = op_flags; + op.indata = *inbl; + return execute_osd_op(hctx, op); +} + +int cls_cxx_write_full(cls_method_context_t hctx, bufferlist * const inbl) +{ + OSDOp op{CEPH_OSD_OP_WRITEFULL}; + op.op.extent.offset = 0; + op.op.extent.length = inbl->length(); + op.indata = *inbl; + return execute_osd_op(hctx, op); +} + +int cls_cxx_replace(cls_method_context_t hctx, + int ofs, + int len, + bufferlist *inbl) +{ + { + OSDOp top{CEPH_OSD_OP_TRUNCATE}; + top.op.extent.offset = 0; + top.op.extent.length = 0; + if (const auto ret = execute_osd_op(hctx, top); ret < 0) { + return ret; + } + } + + { + OSDOp wop{CEPH_OSD_OP_WRITE}; + wop.op.extent.offset = ofs; + wop.op.extent.length = len; + wop.indata = *inbl; + if (const auto ret = execute_osd_op(hctx, wop); ret < 0) { + return ret; + } + } + return 0; +} + +int cls_cxx_truncate(cls_method_context_t hctx, int ofs) +{ + OSDOp op{CEPH_OSD_OP_TRUNCATE}; + op.op.extent.offset = ofs; + op.op.extent.length = 0; + return execute_osd_op(hctx, op); +} + +int cls_cxx_write_zero(cls_method_context_t hctx, int offset, int len) +{ + OSDOp op{CEPH_OSD_OP_ZERO}; + op.op.extent.offset = offset; + op.op.extent.length = len; + return execute_osd_op(hctx, op); +} + +int cls_cxx_getxattr(cls_method_context_t hctx, + const char *name, + bufferlist *outbl) +{ + OSDOp op{CEPH_OSD_OP_GETXATTR}; + op.op.xattr.name_len = strlen(name); + op.indata.append(name, op.op.xattr.name_len); + if (const auto ret = execute_osd_op(hctx, op); ret < 0) { + return ret; + } + *outbl = std::move(op.outdata); + return outbl->length(); +} + +int cls_cxx_getxattrs(cls_method_context_t hctx, + map<string, bufferlist> *attrset) +{ + return 0; +} + +int cls_cxx_setxattr(cls_method_context_t hctx, + const char *name, + bufferlist *inbl) +{ + OSDOp op{CEPH_OSD_OP_SETXATTR}; + op.op.xattr.name_len = std::strlen(name); + op.op.xattr.value_len = inbl->length(); + op.indata.append(name, op.op.xattr.name_len); + op.indata.append(*inbl); + return execute_osd_op(hctx, op); +} + +int cls_cxx_snap_revert(cls_method_context_t hctx, snapid_t snapid) +{ + OSDOp op{op = CEPH_OSD_OP_ROLLBACK}; + op.op.snap.snapid = snapid; + return execute_osd_op(hctx, op); +} + +int cls_cxx_map_get_all_vals(cls_method_context_t 
hctx, + map<string, bufferlist>* vals, + bool *more) +{ + return 0; +} + +int cls_cxx_map_get_keys(cls_method_context_t hctx, + const std::string& start_obj, + const uint64_t max_to_get, + std::set<std::string>* const keys, + bool* const more) +{ + OSDOp op{CEPH_OSD_OP_OMAPGETKEYS}; + encode(start_obj, op.indata); + encode(max_to_get, op.indata); + if (const auto ret = execute_osd_op(hctx, op); ret < 0) { + return ret; + } + try { + auto iter = op.outdata.cbegin(); + decode(*keys, iter); + decode(*more, iter); + } catch (buffer::error&) { + return -EIO; + } + return keys->size(); +} + +int cls_cxx_map_get_vals(cls_method_context_t hctx, + const std::string& start_obj, + const std::string& filter_prefix, + const uint64_t max_to_get, + std::map<std::string, ceph::bufferlist> *vals, + bool* const more) +{ + OSDOp op{CEPH_OSD_OP_OMAPGETVALS}; + encode(start_obj, op.indata); + encode(max_to_get, op.indata); + encode(filter_prefix, op.indata); + if (const auto ret = execute_osd_op(hctx, op); ret < 0) { + return ret; + } + try { + auto iter = op.outdata.cbegin(); + decode(*vals, iter); + decode(*more, iter); + } catch (buffer::error&) { + return -EIO; + } + return vals->size(); +} + +int cls_cxx_map_get_vals_by_keys(cls_method_context_t hctx, + const std::set<std::string> &keys, + std::map<std::string, ceph::bufferlist> *vals) +{ + OSDOp op{CEPH_OSD_OP_OMAPGETVALSBYKEYS}; + encode(keys, op.indata); + if (const auto ret = execute_osd_op(hctx, op); ret < 0) { + return ret; + } + try { + auto iter = op.outdata.cbegin(); + decode(*vals, iter); + } catch (buffer::error&) { + return -EIO; + } + return 0; +} + +int cls_cxx_map_read_header(cls_method_context_t hctx, bufferlist *outbl) +{ + OSDOp op{CEPH_OSD_OP_OMAPGETHEADER}; + if (const auto ret = execute_osd_op(hctx, op); ret < 0) { + return ret; + } + *outbl = std::move(op.outdata); + return 0; +} + +int cls_cxx_map_get_val(cls_method_context_t hctx, + const string &key, + bufferlist *outbl) +{ + OSDOp op{CEPH_OSD_OP_OMAPGETVALSBYKEYS}; + { + std::set<std::string> k{key}; + encode(k, op.indata); + } + if (const auto ret = execute_osd_op(hctx, op); ret < 0) { + return ret; + } + std::map<std::string, ceph::bufferlist> m; + try { + auto iter = op.outdata.cbegin(); + decode(m, iter); + } catch (buffer::error&) { + return -EIO; + } + if (auto iter = std::begin(m); iter != std::end(m)) { + *outbl = std::move(iter->second); + return 0; + } else { + return -ENOENT; + } +} + +int cls_cxx_map_set_val(cls_method_context_t hctx, + const string &key, + bufferlist *inbl) +{ + OSDOp op{CEPH_OSD_OP_OMAPSETVALS}; + { + std::map<std::string, ceph::bufferlist> m; + m[key] = *inbl; + encode(m, op.indata); + } + return execute_osd_op(hctx, op); +} + +int cls_cxx_map_set_vals(cls_method_context_t hctx, + const std::map<string, ceph::bufferlist> *map) +{ + OSDOp op{CEPH_OSD_OP_OMAPSETVALS}; + encode(*map, op.indata); + return execute_osd_op(hctx, op); +} + +int cls_cxx_map_clear(cls_method_context_t hctx) +{ + return 0; +} + +int cls_cxx_map_write_header(cls_method_context_t hctx, bufferlist *inbl) +{ + OSDOp op{CEPH_OSD_OP_OMAPSETHEADER}; + op.indata = std::move(*inbl); + return execute_osd_op(hctx, op); +} + +int cls_cxx_map_remove_range(cls_method_context_t hctx, + const std::string& key_begin, + const std::string& key_end) +{ + OSDOp op{CEPH_OSD_OP_OMAPRMKEYRANGE}; + encode(key_begin, op.indata); + encode(key_end, op.indata); + return execute_osd_op(hctx, op); +} + +int cls_cxx_map_remove_key(cls_method_context_t hctx, const string &key) +{ + return 0; +} + +int 
cls_cxx_list_watchers(cls_method_context_t hctx, + obj_list_watch_response_t *watchers) +{ + return 0; +} + +uint64_t cls_current_version(cls_method_context_t hctx) +{ + return 0; +} + + +int cls_current_subop_num(cls_method_context_t hctx) +{ + auto* ox = reinterpret_cast<crimson::osd::OpsExecuter*>(hctx); + // in contrast to classical OSD, crimson doesn't count OP_CALL and + // OP_STAT which seems fine regarding how the plugins we take care + // about use this part of API. + return ox->get_processed_rw_ops_num(); +} + +uint64_t cls_get_features(cls_method_context_t hctx) +{ + return 0; +} + +uint64_t cls_get_client_features(cls_method_context_t hctx) +{ + try { + const auto& message = \ + reinterpret_cast<crimson::osd::OpsExecuter*>(hctx)->get_message(); + return message.get_features(); + } catch (crimson::osd::error& e) { + return -e.code().value(); + } +} + +uint64_t cls_get_pool_stripe_width(cls_method_context_t hctx) +{ + auto* ox = reinterpret_cast<crimson::osd::OpsExecuter*>(hctx); + return ox->get_pool_stripe_width(); +} + +ceph_release_t cls_get_required_osd_release(cls_method_context_t hctx) +{ + // FIXME + return ceph_release_t::nautilus; +} + +ceph_release_t cls_get_min_compatible_client(cls_method_context_t hctx) +{ + // FIXME + return ceph_release_t::nautilus; +} + +int cls_get_snapset_seq(cls_method_context_t hctx, uint64_t *snap_seq) +{ + return 0; +} + +int cls_cxx_chunk_write_and_set(cls_method_context_t hctx, + int ofs, + int len, + bufferlist *write_inbl, + uint32_t op_flags, + bufferlist *set_inbl, + int set_len) +{ + return 0; +} + +int cls_get_manifest_ref_count(cls_method_context_t hctx, string fp_oid) +{ + return 0; +} + +uint64_t cls_get_osd_min_alloc_size(cls_method_context_t hctx) { + // FIXME + return 4096; +} diff --git a/src/crimson/osd/object_context.cc b/src/crimson/osd/object_context.cc new file mode 100644 index 000000000..bbc71d3f9 --- /dev/null +++ b/src/crimson/osd/object_context.cc @@ -0,0 +1,34 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "crimson/osd/object_context.h" + +#include "common/Formatter.h" +#include "crimson/common/config_proxy.h" + +namespace crimson::osd { + +ObjectContextRegistry::ObjectContextRegistry(crimson::common::ConfigProxy &conf) +{ + obc_lru.set_target_size(conf.get_val<uint64_t>("crimson_osd_obc_lru_size")); + conf.add_observer(this); +} + +const char** ObjectContextRegistry::get_tracked_conf_keys() const +{ + static const char* KEYS[] = { + "crimson_osd_obc_lru_size", + nullptr + }; + return KEYS; +} + +void ObjectContextRegistry::handle_conf_change( + const crimson::common::ConfigProxy& conf, + const std::set <std::string> &changed) +{ + obc_lru.set_target_size(conf.get_val<uint64_t>("crimson_osd_obc_lru_size")); +} + + +} diff --git a/src/crimson/osd/object_context.h b/src/crimson/osd/object_context.h new file mode 100644 index 000000000..be238851e --- /dev/null +++ b/src/crimson/osd/object_context.h @@ -0,0 +1,189 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <map> +#include <optional> +#include <utility> +#include <seastar/core/shared_future.hh> +#include <seastar/core/shared_ptr.hh> + +#include "common/intrusive_lru.h" +#include "osd/object_state.h" +#include "crimson/common/exception.h" +#include "crimson/common/tri_mutex.h" +#include "crimson/osd/osd_operation.h" + +namespace ceph { + class Formatter; +} + +namespace crimson::common { + class ConfigProxy; +} + 
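+// Rough usage sketch (the surrounding object names are assumptions; the calls
+// match the declarations below): an obc is looked up in the registry and all
+// access to it happens under one of the RWState-typed locks.
+//
+//   crimson::osd::ObjectContextRegistry registry{conf};
+//   auto [obc, existed] = registry.get_cached_obc(oid);
+//   return obc->with_lock<RWState::RWREAD>([obc] {
+//     // read-side work; the lock is dropped when the returned future resolves
+//     return seastar::now();
+//   });
+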
+namespace crimson::osd { + +class Watch; + +template <typename OBC> +struct obc_to_hoid { + using type = hobject_t; + const type &operator()(const OBC &obc) { + return obc.obs.oi.soid; + } +}; + +class ObjectContext : public ceph::common::intrusive_lru_base< + ceph::common::intrusive_lru_config< + hobject_t, ObjectContext, obc_to_hoid<ObjectContext>>> +{ +public: + Ref head; // Ref defined as part of ceph::common::intrusive_lru_base + ObjectState obs; + std::optional<SnapSet> ss; + bool loaded : 1; + // the watch / notify machinery rather stays away from the hot and + // frequented paths. std::map is used mostly because of developer's + // convenience. + using watch_key_t = std::pair<uint64_t, entity_name_t>; + std::map<watch_key_t, seastar::shared_ptr<crimson::osd::Watch>> watchers; + + ObjectContext(const hobject_t &hoid) : obs(hoid), loaded(false) {} + + const hobject_t &get_oid() const { + return obs.oi.soid; + } + + bool is_head() const { + return get_oid().is_head(); + } + + const SnapSet &get_ro_ss() const { + if (is_head()) { + ceph_assert(ss); + return *ss; + } else { + ceph_assert(head); + return head->get_ro_ss(); + } + } + + void set_head_state(ObjectState &&_obs, SnapSet &&_ss) { + ceph_assert(is_head()); + obs = std::move(_obs); + ss = std::move(_ss); + loaded = true; + } + + void set_clone_state(ObjectState &&_obs, Ref &&_head) { + ceph_assert(!is_head()); + obs = std::move(_obs); + head = _head; + loaded = true; + } + + /// pass the provided exception to any waiting consumers of this ObjectContext + template<typename Exception> + void interrupt(Exception ex) { + lock.abort(std::move(ex)); + if (recovery_read_marker) { + drop_recovery_read(); + } + } + +private: + tri_mutex lock; + bool recovery_read_marker = false; + + template <typename Lock, typename Func> + auto _with_lock(Lock&& lock, Func&& func) { + Ref obc = this; + return lock.lock().then([&lock, func = std::forward<Func>(func), obc]() mutable { + return seastar::futurize_invoke(func).finally([&lock, obc] { + lock.unlock(); + }); + }); + } + +public: + template<RWState::State Type, typename Func> + auto with_lock(Func&& func) { + switch (Type) { + case RWState::RWWRITE: + return _with_lock(lock.for_write(), std::forward<Func>(func)); + case RWState::RWREAD: + return _with_lock(lock.for_read(), std::forward<Func>(func)); + case RWState::RWEXCL: + return _with_lock(lock.for_excl(), std::forward<Func>(func)); + case RWState::RWNONE: + return seastar::futurize_invoke(std::forward<Func>(func)); + default: + assert(0 == "noop"); + } + } + template<RWState::State Type, typename Func> + auto with_promoted_lock(Func&& func) { + switch (Type) { + case RWState::RWWRITE: + return _with_lock(lock.excl_from_write(), std::forward<Func>(func)); + case RWState::RWREAD: + return _with_lock(lock.excl_from_read(), std::forward<Func>(func)); + case RWState::RWEXCL: + return _with_lock(lock.excl_from_excl(), std::forward<Func>(func)); + case RWState::RWNONE: + return _with_lock(lock.for_excl(), std::forward<Func>(func)); + default: + assert(0 == "noop"); + } + } + + bool empty() const { + return !lock.is_acquired(); + } + bool is_request_pending() const { + return lock.is_acquired(); + } + + bool get_recovery_read() { + if (lock.try_lock_for_read()) { + recovery_read_marker = true; + return true; + } else { + return false; + } + } + void wait_recovery_read() { + assert(lock.get_readers() > 0); + recovery_read_marker = true; + } + void drop_recovery_read() { + assert(recovery_read_marker); + recovery_read_marker = false; + } + bool 
maybe_get_excl() { + return lock.try_lock_for_excl(); + } +}; +using ObjectContextRef = ObjectContext::Ref; + +class ObjectContextRegistry : public md_config_obs_t { + ObjectContext::lru_t obc_lru; + +public: + ObjectContextRegistry(crimson::common::ConfigProxy &conf); + + std::pair<ObjectContextRef, bool> get_cached_obc(const hobject_t &hoid) { + return obc_lru.get_or_create(hoid); + } + ObjectContextRef maybe_get_cached_obc(const hobject_t &hoid) { + return obc_lru.get(hoid); + } + + const char** get_tracked_conf_keys() const final; + void handle_conf_change(const crimson::common::ConfigProxy& conf, + const std::set <std::string> &changed) final; +}; + +} // namespace crimson::osd diff --git a/src/crimson/osd/ops_executer.cc b/src/crimson/osd/ops_executer.cc new file mode 100644 index 000000000..6b6614e93 --- /dev/null +++ b/src/crimson/osd/ops_executer.cc @@ -0,0 +1,980 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "ops_executer.h" + +#include <boost/range/adaptor/filtered.hpp> +#include <boost/range/adaptor/map.hpp> +#include <boost/range/adaptor/transformed.hpp> +#include <boost/range/algorithm_ext/push_back.hpp> +#include <boost/range/algorithm/max_element.hpp> +#include <boost/range/numeric.hpp> + +#include <fmt/format.h> +#include <fmt/ostream.h> + +#include <seastar/core/thread.hh> + +#include "crimson/osd/exceptions.h" +#include "crimson/osd/pg.h" +#include "crimson/osd/watch.h" +#include "osd/ClassHandler.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_osd); + } +} + +namespace crimson::osd { + +OpsExecuter::call_errorator::future<> OpsExecuter::do_op_call(OSDOp& osd_op) +{ + std::string cname, mname; + ceph::bufferlist indata; + try { + auto bp = std::begin(osd_op.indata); + bp.copy(osd_op.op.cls.class_len, cname); + bp.copy(osd_op.op.cls.method_len, mname); + bp.copy(osd_op.op.cls.indata_len, indata); + } catch (buffer::error&) { + logger().warn("call unable to decode class + method + indata"); + return crimson::ct_error::invarg::make(); + } + + // NOTE: opening a class can actually result in dlopen(), and thus + // blocking the entire reactor. Thankfully to ClassHandler's cache + // this is supposed to be extremely infrequent. 
+ ClassHandler::ClassData* cls; + int r = ClassHandler::get_instance().open_class(cname, &cls); + if (r) { + logger().warn("class {} open got {}", cname, cpp_strerror(r)); + if (r == -ENOENT) { + return crimson::ct_error::operation_not_supported::make(); + } else if (r == -EPERM) { + // propagate permission errors + return crimson::ct_error::permission_denied::make(); + } + return crimson::ct_error::input_output_error::make(); + } + + ClassHandler::ClassMethod* method = cls->get_method(mname); + if (!method) { + logger().warn("call method {}.{} does not exist", cname, mname); + return crimson::ct_error::operation_not_supported::make(); + } + + const auto flags = method->get_flags(); + if (!obc->obs.exists && (flags & CLS_METHOD_WR) == 0) { + return crimson::ct_error::enoent::make(); + } + +#if 0 + if (flags & CLS_METHOD_WR) { + ctx->user_modify = true; + } +#endif + + logger().debug("calling method {}.{}, num_read={}, num_write={}", + cname, mname, num_read, num_write); + const auto prev_rd = num_read; + const auto prev_wr = num_write; + return seastar::async( + [this, method, indata=std::move(indata)]() mutable { + ceph::bufferlist outdata; + auto cls_context = reinterpret_cast<cls_method_context_t>(this); + const auto ret = method->exec(cls_context, indata, outdata); + return std::make_pair(ret, std::move(outdata)); + } + ).then( + [this, prev_rd, prev_wr, &osd_op, flags] + (auto outcome) -> call_errorator::future<> { + auto& [ret, outdata] = outcome; + osd_op.rval = ret; + + logger().debug("do_op_call: method returned ret={}, outdata.length()={}" + " while num_read={}, num_write={}", + ret, outdata.length(), num_read, num_write); + if (num_read > prev_rd && !(flags & CLS_METHOD_RD)) { + logger().error("method tried to read object but is not marked RD"); + osd_op.rval = -EIO; + return crimson::ct_error::input_output_error::make(); + } + if (num_write > prev_wr && !(flags & CLS_METHOD_WR)) { + logger().error("method tried to update object but is not marked WR"); + osd_op.rval = -EIO; + return crimson::ct_error::input_output_error::make(); + } + // ceph-osd has this implemented in `PrimaryLogPG::execute_ctx`, + // grep for `ignore_out_data`. + using crimson::common::local_conf; + if (op_info.allows_returnvec() && + op_info.may_write() && + ret >= 0 && + outdata.length() > local_conf()->osd_max_write_op_reply_len) { + // the justification of this limit it to not inflate the pg log. + // that's the reason why we don't worry about pure reads. + logger().error("outdata overflow due to .length()={}, limit={}", + outdata.length(), + local_conf()->osd_max_write_op_reply_len); + osd_op.rval = -EOVERFLOW; + return crimson::ct_error::value_too_large::make(); + } + // for write calls we never return data expect errors or RETURNVEC. + // please refer cls/cls_hello.cc to details. + if (!op_info.may_write() || op_info.allows_returnvec() || ret < 0) { + osd_op.op.extent.length = outdata.length(); + osd_op.outdata.claim_append(outdata); + } + if (ret < 0) { + return crimson::stateful_ec{ + std::error_code(-ret, std::generic_category()) }; + } else { + return seastar::now(); + } + } + ); +} + +static watch_info_t create_watch_info(const OSDOp& osd_op, + const MOSDOp& msg) +{ + using crimson::common::local_conf; + const uint32_t timeout = + osd_op.op.watch.timeout == 0 ? 
local_conf()->osd_client_watch_timeout + : osd_op.op.watch.timeout; + return { + osd_op.op.watch.cookie, + timeout, + msg.get_connection()->get_peer_addr() + }; +} + +OpsExecuter::watch_errorator::future<> OpsExecuter::do_op_watch_subop_watch( + OSDOp& osd_op, + ObjectState& os, + ceph::os::Transaction& txn) +{ + struct connect_ctx_t { + ObjectContext::watch_key_t key; + crimson::net::ConnectionRef conn; + watch_info_t info; + + connect_ctx_t(const OSDOp& osd_op, const MOSDOp& msg) + : key(osd_op.op.watch.cookie, msg.get_reqid().name), + conn(msg.get_connection()), + info(create_watch_info(osd_op, msg)) { + } + }; + return with_effect_on_obc(connect_ctx_t{ osd_op, get_message() }, + [&] (auto& ctx) { + const auto& entity = ctx.key.second; + auto [it, emplaced] = + os.oi.watchers.try_emplace(ctx.key, std::move(ctx.info)); + if (emplaced) { + logger().info("registered new watch {} by {}", it->second, entity); + txn.nop(); + } else { + logger().info("found existing watch {} by {}", it->second, entity); + } + return seastar::now(); + }, + [] (auto&& ctx, ObjectContextRef obc) { + auto [it, emplaced] = obc->watchers.try_emplace(ctx.key, nullptr); + if (emplaced) { + const auto& [cookie, entity] = ctx.key; + it->second = crimson::osd::Watch::create(obc, ctx.info, entity); + logger().info("op_effect: added new watcher: {}", ctx.key); + } else { + logger().info("op_effect: found existing watcher: {}", ctx.key); + } + return it->second->connect(std::move(ctx.conn), true /* will_ping */); + }); +} + +OpsExecuter::watch_errorator::future<> OpsExecuter::do_op_watch_subop_reconnect( + OSDOp& osd_op, + ObjectState& os, + ceph::os::Transaction& txn) +{ + const entity_name_t& entity = get_message().get_reqid().name; + const auto& cookie = osd_op.op.watch.cookie; + if (!os.oi.watchers.count(std::make_pair(cookie, entity))) { + return crimson::ct_error::not_connected::make(); + } else { + logger().info("found existing watch by {}", entity); + return do_op_watch_subop_watch(osd_op, os, txn); + } +} + +OpsExecuter::watch_errorator::future<> OpsExecuter::do_op_watch_subop_unwatch( + OSDOp& osd_op, + ObjectState& os, + ceph::os::Transaction& txn) +{ + logger().info("{}", __func__); + + struct disconnect_ctx_t { + ObjectContext::watch_key_t key; + bool send_disconnect{ false }; + + disconnect_ctx_t(const OSDOp& osd_op, const MOSDOp& msg) + : key(osd_op.op.watch.cookie, msg.get_reqid().name) { + } + }; + return with_effect_on_obc(disconnect_ctx_t{ osd_op, get_message() }, + [&] (auto& ctx) { + const auto& entity = ctx.key.second; + if (auto nh = os.oi.watchers.extract(ctx.key); !nh.empty()) { + logger().info("removed watch {} by {}", nh.mapped(), entity); + txn.nop(); + } else { + logger().info("can't remove: no watch by {}", entity); + } + return seastar::now(); + }, + [] (auto&& ctx, ObjectContextRef obc) { + if (auto nh = obc->watchers.extract(ctx.key); !nh.empty()) { + return seastar::do_with(std::move(nh.mapped()), + [ctx](auto&& watcher) { + logger().info("op_effect: disconnect watcher {}", ctx.key); + return watcher->remove(ctx.send_disconnect); + }); + } else { + logger().info("op_effect: disconnect failed to find watcher {}", ctx.key); + return seastar::now(); + } + }); +} + +OpsExecuter::watch_errorator::future<> OpsExecuter::do_op_watch_subop_ping( + OSDOp& osd_op, + ObjectState& os, + ceph::os::Transaction& txn) +{ + const entity_name_t& entity = get_message().get_reqid().name; + const auto& cookie = osd_op.op.watch.cookie; + const auto key = std::make_pair(cookie, entity); + + // Note: WATCH with 
PING doesn't cause may_write() to return true, + // so if there is nothing else in the transaction, this is going + // to run do_osd_op_effects, but not write out a log entry */ + if (!os.oi.watchers.count(key)) { + return crimson::ct_error::not_connected::make(); + } + auto it = obc->watchers.find(key); + if (it == std::end(obc->watchers) || !it->second->is_connected()) { + return crimson::ct_error::timed_out::make(); + } + logger().info("found existing watch by {}", entity); + it->second->got_ping(ceph_clock_now()); + return seastar::now(); +} + +OpsExecuter::watch_errorator::future<> OpsExecuter::do_op_watch( + OSDOp& osd_op, + ObjectState& os, + ceph::os::Transaction& txn) +{ + logger().debug("{}", __func__); + if (!os.exists) { + return crimson::ct_error::enoent::make(); + } + switch (osd_op.op.watch.op) { + case CEPH_OSD_WATCH_OP_WATCH: + return do_op_watch_subop_watch(osd_op, os, txn); + case CEPH_OSD_WATCH_OP_RECONNECT: + return do_op_watch_subop_reconnect(osd_op, os, txn); + case CEPH_OSD_WATCH_OP_PING: + return do_op_watch_subop_ping(osd_op, os, txn); + case CEPH_OSD_WATCH_OP_UNWATCH: + return do_op_watch_subop_unwatch(osd_op, os, txn); + case CEPH_OSD_WATCH_OP_LEGACY_WATCH: + logger().warn("ignoring CEPH_OSD_WATCH_OP_LEGACY_WATCH"); + return crimson::ct_error::invarg::make(); + } + logger().warn("unrecognized WATCH subop: {}", osd_op.op.watch.op); + return crimson::ct_error::invarg::make(); +} + +static uint64_t get_next_notify_id(epoch_t e) +{ + // FIXME + static std::uint64_t next_notify_id = 0; + return (((uint64_t)e) << 32) | ((uint64_t)(next_notify_id++)); +} + +OpsExecuter::watch_errorator::future<> OpsExecuter::do_op_notify( + OSDOp& osd_op, + const ObjectState& os) +{ + logger().debug("{}, msg epoch: {}", __func__, get_message().get_map_epoch()); + + if (!os.exists) { + return crimson::ct_error::enoent::make(); + } + struct notify_ctx_t { + crimson::net::ConnectionRef conn; + notify_info_t ninfo; + const uint64_t client_gid; + const epoch_t epoch; + + notify_ctx_t(const MOSDOp& msg) + : conn(msg.get_connection()), + client_gid(msg.get_reqid().name.num()), + epoch(msg.get_map_epoch()) { + } + }; + return with_effect_on_obc(notify_ctx_t{ get_message() }, + [&] (auto& ctx) { + try { + auto bp = osd_op.indata.cbegin(); + uint32_t ver; // obsolete + ceph::decode(ver, bp); + ceph::decode(ctx.ninfo.timeout, bp); + ceph::decode(ctx.ninfo.bl, bp); + } catch (const buffer::error&) { + ctx.ninfo.timeout = 0; + } + if (!ctx.ninfo.timeout) { + using crimson::common::local_conf; + ctx.ninfo.timeout = local_conf()->osd_default_notify_timeout; + } + ctx.ninfo.notify_id = get_next_notify_id(ctx.epoch); + ctx.ninfo.cookie = osd_op.op.notify.cookie; + // return our unique notify id to the client + ceph::encode(ctx.ninfo.notify_id, osd_op.outdata); + return seastar::now(); + }, + [] (auto&& ctx, ObjectContextRef obc) { + auto alive_watchers = obc->watchers | boost::adaptors::map_values + | boost::adaptors::filtered( + [] (const auto& w) { + // FIXME: filter as for the `is_ping` in `Watch::start_notify` + return w->is_alive(); + }); + return crimson::osd::Notify::create_n_propagate( + std::begin(alive_watchers), + std::end(alive_watchers), + std::move(ctx.conn), + ctx.ninfo, + ctx.client_gid, + obc->obs.oi.user_version); + }); +} + +OpsExecuter::watch_errorator::future<> OpsExecuter::do_op_notify_ack( + OSDOp& osd_op, + const ObjectState& os) +{ + logger().debug("{}", __func__); + + struct notifyack_ctx_t { + const entity_name_t entity; + uint64_t watch_cookie; + uint64_t notify_id; + 
ceph::bufferlist reply_bl; + + notifyack_ctx_t(const MOSDOp& msg) : entity(msg.get_reqid().name) { + } + }; + return with_effect_on_obc(notifyack_ctx_t{ get_message() }, + [&] (auto& ctx) -> watch_errorator::future<> { + try { + auto bp = osd_op.indata.cbegin(); + ceph::decode(ctx.notify_id, bp); + ceph::decode(ctx.watch_cookie, bp); + if (!bp.end()) { + ceph::decode(ctx.reply_bl, bp); + } + } catch (const buffer::error&) { + // here we behave differently than ceph-osd. For historical reasons, + // it falls back to using `osd_op.op.watch.cookie` as `ctx.notify_id`. + // crimson just returns EINVAL if the data cannot be decoded. + return crimson::ct_error::invarg::make(); + } + return watch_errorator::now(); + }, + [] (auto&& ctx, ObjectContextRef obc) { + logger().info("notify_ack watch_cookie={}, notify_id={}", + ctx.watch_cookie, ctx.notify_id); + return seastar::do_for_each(obc->watchers, + [ctx=std::move(ctx)] (auto& kv) { + const auto& [key, watchp] = kv; + static_assert( + std::is_same_v<std::decay_t<decltype(watchp)>, + seastar::shared_ptr<crimson::osd::Watch>>); + auto& [cookie, entity] = key; + if (ctx.entity != entity) { + logger().debug("skipping watch {}; entity name {} != {}", + key, entity, ctx.entity); + return seastar::now(); + } + if (ctx.watch_cookie != cookie) { + logger().debug("skipping watch {}; cookie {} != {}", + key, ctx.watch_cookie, cookie); + return seastar::now(); + } + logger().info("acking notify on watch {}", key); + return watchp->notify_ack(ctx.notify_id, ctx.reply_bl); + }); + }); +} + +OpsExecuter::osd_op_errorator::future<> +OpsExecuter::execute_op(OSDOp& osd_op) +{ + // TODO: dispatch via call table? + // TODO: we might want to find a way to unify both input and output + // of each op. + logger().debug( + "handling op {} on object {}", + ceph_osd_op_name(osd_op.op.op), + get_target()); + switch (const ceph_osd_op& op = osd_op.op; op.op) { + case CEPH_OSD_OP_SYNC_READ: + [[fallthrough]]; + case CEPH_OSD_OP_READ: + return do_read_op([&osd_op] (auto& backend, const auto& os) { + return backend.read(os, osd_op); + }); + case CEPH_OSD_OP_SPARSE_READ: + return do_read_op([&osd_op] (auto& backend, const auto& os) { + return backend.sparse_read(os, osd_op); + }); + case CEPH_OSD_OP_CHECKSUM: + return do_read_op([&osd_op] (auto& backend, const auto& os) { + return backend.checksum(os, osd_op); + }); + case CEPH_OSD_OP_CMPEXT: + return do_read_op([&osd_op] (auto& backend, const auto& os) { + return backend.cmp_ext(os, osd_op); + }); + case CEPH_OSD_OP_GETXATTR: + return do_read_op([&osd_op] (auto& backend, const auto& os) { + return backend.getxattr(os, osd_op); + }); + case CEPH_OSD_OP_GETXATTRS: + return do_read_op([&osd_op] (auto& backend, const auto& os) { + return backend.get_xattrs(os, osd_op); + }); + case CEPH_OSD_OP_RMXATTR: + return do_write_op([&osd_op] (auto& backend, auto& os, auto& txn) { + return backend.rm_xattr(os, osd_op, txn); + }, true); + case CEPH_OSD_OP_CREATE: + return do_write_op([&osd_op] (auto& backend, auto& os, auto& txn) { + return backend.create(os, osd_op, txn); + }, true); + case CEPH_OSD_OP_WRITE: + return do_write_op([this, &osd_op] (auto& backend, auto& os, auto& txn) { + return backend.write(os, osd_op, txn, *osd_op_params); + }, true); + case CEPH_OSD_OP_WRITESAME: + return do_write_op([this, &osd_op] (auto& backend, auto& os, auto& txn) { + return backend.write_same(os, osd_op, txn, *osd_op_params); + }, true); + case CEPH_OSD_OP_WRITEFULL: + return do_write_op([this, &osd_op] (auto& backend, auto& os, auto& txn) { + return 
backend.writefull(os, osd_op, txn, *osd_op_params); + }, true); + case CEPH_OSD_OP_APPEND: + return do_write_op([this, &osd_op] (auto& backend, auto& os, auto& txn) { + return backend.append(os, osd_op, txn, *osd_op_params); + }, true); + case CEPH_OSD_OP_TRUNCATE: + return do_write_op([this, &osd_op] (auto& backend, auto& os, auto& txn) { + // FIXME: rework needed. Move this out to do_write_op(), introduce + // do_write_op_no_user_modify()... + return backend.truncate(os, osd_op, txn, *osd_op_params); + }, true); + case CEPH_OSD_OP_ZERO: + return do_write_op([this, &osd_op] (auto& backend, auto& os, auto& txn) { + return backend.zero(os, osd_op, txn, *osd_op_params); + }, true); + case CEPH_OSD_OP_SETALLOCHINT: + return osd_op_errorator::now(); + case CEPH_OSD_OP_SETXATTR: + return do_write_op([&osd_op] (auto& backend, auto& os, auto& txn) { + return backend.setxattr(os, osd_op, txn); + }, true); + case CEPH_OSD_OP_DELETE: + return do_write_op([] (auto& backend, auto& os, auto& txn) { + return backend.remove(os, txn); + }, true); + case CEPH_OSD_OP_CALL: + return this->do_op_call(osd_op); + case CEPH_OSD_OP_STAT: + // note: stat does not require RD + return do_const_op([&osd_op] (/* const */auto& backend, const auto& os) { + return backend.stat(os, osd_op); + }); + case CEPH_OSD_OP_TMAPUP: + // TODO: there was an effort to kill TMAP in ceph-osd. According to + // @dzafman this isn't possible yet. Maybe it could be accomplished + // before crimson's readiness and we'd luckily don't need to carry. + return dont_do_legacy_op(); + + // OMAP + case CEPH_OSD_OP_OMAPGETKEYS: + return do_read_op([&osd_op] (auto& backend, const auto& os) { + return backend.omap_get_keys(os, osd_op); + }); + case CEPH_OSD_OP_OMAPGETVALS: + return do_read_op([&osd_op] (auto& backend, const auto& os) { + return backend.omap_get_vals(os, osd_op); + }); + case CEPH_OSD_OP_OMAPGETHEADER: + return do_read_op([&osd_op] (auto& backend, const auto& os) { + return backend.omap_get_header(os, osd_op); + }); + case CEPH_OSD_OP_OMAPGETVALSBYKEYS: + return do_read_op([&osd_op] (auto& backend, const auto& os) { + return backend.omap_get_vals_by_keys(os, osd_op); + }); + case CEPH_OSD_OP_OMAPSETVALS: +#if 0 + if (!pg.get_pool().info.supports_omap()) { + return crimson::ct_error::operation_not_supported::make(); + } +#endif + return do_write_op([this, &osd_op] (auto& backend, auto& os, auto& txn) { + return backend.omap_set_vals(os, osd_op, txn, *osd_op_params); + }, true); + case CEPH_OSD_OP_OMAPSETHEADER: +#if 0 + if (!pg.get_pool().info.supports_omap()) { + return crimson::ct_error::operation_not_supported::make(); + } +#endif + return do_write_op([&osd_op] (auto& backend, auto& os, auto& txn) { + return backend.omap_set_header(os, osd_op, txn); + }, true); + case CEPH_OSD_OP_OMAPRMKEYRANGE: +#if 0 + if (!pg.get_pool().info.supports_omap()) { + return crimson::ct_error::operation_not_supported::make(); + } +#endif + return do_write_op([&osd_op] (auto& backend, auto& os, auto& txn) { + return backend.omap_remove_range(os, osd_op, txn); + }, true); + case CEPH_OSD_OP_OMAPCLEAR: + return do_write_op([this, &osd_op] (auto& backend, auto& os, auto& txn) { + return backend.omap_clear(os, osd_op, txn, *osd_op_params); + }, true); + + // watch/notify + case CEPH_OSD_OP_WATCH: + return do_write_op([this, &osd_op] (auto& backend, auto& os, auto& txn) { + return do_op_watch(osd_op, os, txn); + }, false); + case CEPH_OSD_OP_NOTIFY: + return do_read_op([this, &osd_op] (auto&, const auto& os) { + return do_op_notify(osd_op, os); + }); + case 
CEPH_OSD_OP_NOTIFY_ACK: + return do_read_op([this, &osd_op] (auto&, const auto& os) { + return do_op_notify_ack(osd_op, os); + }); + + default: + logger().warn("unknown op {}", ceph_osd_op_name(op.op)); + throw std::runtime_error( + fmt::format("op '{}' not supported", ceph_osd_op_name(op.op))); + } +} + +static inline std::unique_ptr<const PGLSFilter> get_pgls_filter( + const std::string& type, + bufferlist::const_iterator& iter) +{ + // storing non-const PGLSFilter for the sake of ::init() + std::unique_ptr<PGLSFilter> filter; + if (type.compare("plain") == 0) { + filter = std::make_unique<PGLSPlainFilter>(); + } else { + std::size_t dot = type.find("."); + if (dot == type.npos || dot == 0 || dot == type.size() - 1) { + throw crimson::osd::invalid_argument{}; + } + + const std::string class_name = type.substr(0, dot); + const std::string filter_name = type.substr(dot + 1); + ClassHandler::ClassData *cls = nullptr; + int r = ClassHandler::get_instance().open_class(class_name, &cls); + if (r != 0) { + logger().warn("can't open class {}: {}", class_name, cpp_strerror(r)); + if (r == -EPERM) { + // propogate permission error + throw crimson::osd::permission_denied{}; + } else { + throw crimson::osd::invalid_argument{}; + } + } else { + ceph_assert(cls); + } + + ClassHandler::ClassFilter * const class_filter = cls->get_filter(filter_name); + if (class_filter == nullptr) { + logger().warn("can't find filter {} in class {}", filter_name, class_name); + throw crimson::osd::invalid_argument{}; + } + + filter.reset(class_filter->fn()); + if (!filter) { + // Object classes are obliged to return us something, but let's + // give an error rather than asserting out. + logger().warn("buggy class {} failed to construct filter {}", + class_name, filter_name); + throw crimson::osd::invalid_argument{}; + } + } + + ceph_assert(filter); + int r = filter->init(iter); + if (r < 0) { + logger().warn("error initializing filter {}: {}", type, cpp_strerror(r)); + throw crimson::osd::invalid_argument{}; + } + + // successfully constructed and initialized, return it. + return filter; +} + +static seastar::future<hobject_t> pgls_filter( + const PGLSFilter& filter, + const PGBackend& backend, + const hobject_t& sobj) +{ + if (const auto xattr = filter.get_xattr(); !xattr.empty()) { + logger().debug("pgls_filter: filter is interested in xattr={} for obj={}", + xattr, sobj); + return backend.getxattr(sobj, xattr).safe_then( + [&filter, sobj] (ceph::bufferptr bp) { + logger().debug("pgls_filter: got xvalue for obj={}", sobj); + + ceph::bufferlist val; + val.push_back(std::move(bp)); + const bool filtered = filter.filter(sobj, val); + return seastar::make_ready_future<hobject_t>(filtered ? sobj : hobject_t{}); + }, PGBackend::get_attr_errorator::all_same_way([&filter, sobj] { + logger().debug("pgls_filter: got error for obj={}", sobj); + + if (filter.reject_empty_xattr()) { + return seastar::make_ready_future<hobject_t>(hobject_t{}); + } + ceph::bufferlist val; + const bool filtered = filter.filter(sobj, val); + return seastar::make_ready_future<hobject_t>(filtered ? sobj : hobject_t{}); + })); + } else { + ceph::bufferlist empty_lvalue_bl; + const bool filtered = filter.filter(sobj, empty_lvalue_bl); + return seastar::make_ready_future<hobject_t>(filtered ? 
sobj : hobject_t{}); + } +} + +static seastar::future<ceph::bufferlist> do_pgnls_common( + const hobject_t& pg_start, + const hobject_t& pg_end, + const PGBackend& backend, + const hobject_t& lower_bound, + const std::string& nspace, + const uint64_t limit, + const PGLSFilter* const filter) +{ + if (!(lower_bound.is_min() || + lower_bound.is_max() || + (lower_bound >= pg_start && lower_bound < pg_end))) { + // this should only happen with a buggy client. + throw std::invalid_argument("outside of PG bounds"); + } + + return backend.list_objects(lower_bound, limit).then( + [&backend, filter, nspace](auto&& ret) { + auto& [objects, next] = ret; + auto in_my_namespace = [&nspace](const hobject_t& obj) { + using crimson::common::local_conf; + if (obj.get_namespace() == local_conf()->osd_hit_set_namespace) { + return false; + } else if (nspace == librados::all_nspaces) { + return true; + } else { + return obj.get_namespace() == nspace; + } + }; + auto to_pglsed = [&backend, filter] (const hobject_t& obj) { + // this transformation looks costly. However, I don't have any + // reason to think PGLS* operations are critical for, let's say, + // general performance. + // + // from tchaikov: "another way is to use seastar::map_reduce(), + // to 1) save the effort to filter the already filtered objects + // 2) avoid the space to keep the tuple<bool, object> even if + // the object is filtered out". + if (filter) { + return pgls_filter(*filter, backend, obj); + } else { + return seastar::make_ready_future<hobject_t>(obj); + } + }; + + auto range = objects | boost::adaptors::filtered(in_my_namespace) + | boost::adaptors::transformed(to_pglsed); + logger().debug("do_pgnls_common: finishing the 1st stage of pgls"); + return seastar::when_all_succeed(std::begin(range), + std::end(range)).then( + [next=std::move(next)] (auto items) mutable { + // the sole purpose of this chaining is to pass `next` to 2nd + // stage altogether with items + logger().debug("do_pgnls_common: 1st done"); + return seastar::make_ready_future< + std::tuple<std::vector<hobject_t>, hobject_t>>( + std::make_tuple(std::move(items), std::move(next))); + }); + }).then( + [pg_end] (auto&& ret) { + auto& [items, next] = ret; + auto is_matched = [] (const auto& obj) { + return !obj.is_min(); + }; + auto to_entry = [] (const auto& obj) { + return librados::ListObjectImpl{ + obj.get_namespace(), obj.oid.name, obj.get_key() + }; + }; + + pg_nls_response_t response; + boost::push_back(response.entries, items | boost::adaptors::filtered(is_matched) + | boost::adaptors::transformed(to_entry)); + response.handle = next.is_max() ? 
pg_end : next;
+    ceph::bufferlist out;
+    encode(response, out);
+    logger().debug("{}: response.entries.size()={}",
+                   __func__, response.entries.size());
+    return seastar::make_ready_future<ceph::bufferlist>(std::move(out));
+  });
+}
+
+static seastar::future<> do_pgnls(
+  const PG& pg,
+  const std::string& nspace,
+  OSDOp& osd_op)
+{
+  hobject_t lower_bound;
+  try {
+    ceph::decode(lower_bound, osd_op.indata);
+  } catch (const buffer::error&) {
+    throw std::invalid_argument("unable to decode PGNLS handle");
+  }
+  const auto pg_start = pg.get_pgid().pgid.get_hobj_start();
+  const auto pg_end = \
+    pg.get_pgid().pgid.get_hobj_end(pg.get_pool().info.get_pg_num());
+  return do_pgnls_common(pg_start,
+                         pg_end,
+                         pg.get_backend(),
+                         lower_bound,
+                         nspace,
+                         osd_op.op.pgls.count,
+                         nullptr /* no filter */)
+    .then([&osd_op](bufferlist bl) {
+      osd_op.outdata = std::move(bl);
+      return seastar::now();
+    });
+}
+
+static seastar::future<> do_pgnls_filtered(
+  const PG& pg,
+  const std::string& nspace,
+  OSDOp& osd_op)
+{
+  std::string cname, mname, type;
+  auto bp = osd_op.indata.cbegin();
+  try {
+    ceph::decode(cname, bp);
+    ceph::decode(mname, bp);
+    ceph::decode(type, bp);
+  } catch (const buffer::error&) {
+    throw crimson::osd::invalid_argument{};
+  }
+
+  auto filter = get_pgls_filter(type, bp);
+
+  hobject_t lower_bound;
+  try {
+    lower_bound.decode(bp);
+  } catch (const buffer::error&) {
+    throw std::invalid_argument("unable to decode PGNLS_FILTER description");
+  }
+
+  logger().debug("{}: cname={}, mname={}, type={}, lower_bound={}, filter={}",
+                 __func__, cname, mname, type, lower_bound,
+                 static_cast<const void*>(filter.get()));
+  return seastar::do_with(std::move(filter),
+    [&, lower_bound=std::move(lower_bound)](auto&& filter) {
+      const auto pg_start = pg.get_pgid().pgid.get_hobj_start();
+      const auto pg_end = pg.get_pgid().pgid.get_hobj_end(pg.get_pool().info.get_pg_num());
+      return do_pgnls_common(pg_start,
+                             pg_end,
+                             pg.get_backend(),
+                             lower_bound,
+                             nspace,
+                             osd_op.op.pgls.count,
+                             filter.get())
+        .then([&osd_op](bufferlist bl) {
+          osd_op.outdata = std::move(bl);
+          return seastar::now();
+        });
+    });
+}
+
+static seastar::future<ceph::bufferlist> do_pgls_common(
+  const hobject_t& pg_start,
+  const hobject_t& pg_end,
+  const PGBackend& backend,
+  const hobject_t& lower_bound,
+  const std::string& nspace,
+  const uint64_t limit,
+  const PGLSFilter* const filter)
+{
+  if (!(lower_bound.is_min() ||
+        lower_bound.is_max() ||
+        (lower_bound >= pg_start && lower_bound < pg_end))) {
+    // this should only happen with a buggy client.
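+    // i.e. the client-supplied listing cursor must be the min/max sentinel
+    // object or fall within [pg_start, pg_end); anything else indicates a
+    // malformed request.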
+    throw std::invalid_argument("outside of PG bounds");
+  }
+
+  using entries_t = decltype(pg_ls_response_t::entries);
+  return backend.list_objects(lower_bound, limit).then(
+    [&backend, filter, nspace](auto&& ret) {
+      auto& [objects, next] = ret;
+      return seastar::when_all(
+        seastar::map_reduce(std::move(objects),
+          [&backend, filter, nspace](const hobject_t& obj) {
+            if (obj.get_namespace() == nspace) {
+              if (filter) {
+                return pgls_filter(*filter, backend, obj);
+              } else {
+                return seastar::make_ready_future<hobject_t>(obj);
+              }
+            } else {
+              return seastar::make_ready_future<hobject_t>(hobject_t{});
+            }
+          },
+          entries_t{},
+          [](entries_t entries, hobject_t obj) {
+            if (!obj.is_min()) {
+              entries.emplace_back(obj.oid, obj.get_key());
+            }
+            return entries;
+          }),
+        seastar::make_ready_future<hobject_t>(next));
+    }).then([pg_end](auto&& ret) {
+      auto entries = std::move(std::get<0>(ret).get0());
+      auto next = std::move(std::get<1>(ret).get0());
+      pg_ls_response_t response;
+      response.handle = next.is_max() ? pg_end : next;
+      response.entries = std::move(entries);
+      ceph::bufferlist out;
+      encode(response, out);
+      logger().debug("{}: response.entries.size()={}",
+                     __func__, response.entries.size());
+      return seastar::make_ready_future<ceph::bufferlist>(std::move(out));
+    });
+}
+
+static seastar::future<> do_pgls(
+  const PG& pg,
+  const std::string& nspace,
+  OSDOp& osd_op)
+{
+  hobject_t lower_bound;
+  auto bp = osd_op.indata.cbegin();
+  try {
+    lower_bound.decode(bp);
+  } catch (const buffer::error&) {
+    throw std::invalid_argument{"unable to decode PGLS handle"};
+  }
+  const auto pg_start = pg.get_pgid().pgid.get_hobj_start();
+  const auto pg_end =
+    pg.get_pgid().pgid.get_hobj_end(pg.get_pool().info.get_pg_num());
+  return do_pgls_common(pg_start,
+                        pg_end,
+                        pg.get_backend(),
+                        lower_bound,
+                        nspace,
+                        osd_op.op.pgls.count,
+                        nullptr /* no filter */)
+    .then([&osd_op](bufferlist bl) {
+      osd_op.outdata = std::move(bl);
+      return seastar::now();
+    });
+}
+
+static seastar::future<> do_pgls_filtered(
+  const PG& pg,
+  const std::string& nspace,
+  OSDOp& osd_op)
+{
+  std::string cname, mname, type;
+  auto bp = osd_op.indata.cbegin();
+  try {
+    ceph::decode(cname, bp);
+    ceph::decode(mname, bp);
+    ceph::decode(type, bp);
+  } catch (const buffer::error&) {
+    throw crimson::osd::invalid_argument{};
+  }
+
+  auto filter = get_pgls_filter(type, bp);
+
+  hobject_t lower_bound;
+  try {
+    lower_bound.decode(bp);
+  } catch (const buffer::error&) {
+    throw std::invalid_argument("unable to decode PGLS_FILTER description");
+  }
+
+  logger().debug("{}: cname={}, mname={}, type={}, lower_bound={}, filter={}",
+                 __func__, cname, mname, type, lower_bound,
+                 static_cast<const void*>(filter.get()));
+  return seastar::do_with(std::move(filter),
+    [&, lower_bound=std::move(lower_bound)](auto&& filter) {
+      const auto pg_start = pg.get_pgid().pgid.get_hobj_start();
+      const auto pg_end = pg.get_pgid().pgid.get_hobj_end(pg.get_pool().info.get_pg_num());
+      return do_pgls_common(pg_start,
+                            pg_end,
+                            pg.get_backend(),
+                            lower_bound,
+                            nspace,
+                            osd_op.op.pgls.count,
+                            filter.get())
+        .then([&osd_op](bufferlist bl) {
+          osd_op.outdata = std::move(bl);
+          return seastar::now();
+        });
+    });
+}
+
+seastar::future<>
+PgOpsExecuter::execute_op(OSDOp& osd_op)
+{
+  logger().warn("handling op {}", ceph_osd_op_name(osd_op.op.op));
+  switch (const ceph_osd_op& op = osd_op.op; op.op) {
+  case CEPH_OSD_OP_PGLS:
+    return do_pgls(pg, nspace, osd_op);
+  case CEPH_OSD_OP_PGLS_FILTER:
+    return do_pgls_filtered(pg, nspace, osd_op);
+  case
CEPH_OSD_OP_PGNLS:
+    return do_pgnls(pg, nspace, osd_op);
+  case CEPH_OSD_OP_PGNLS_FILTER:
+    return do_pgnls_filtered(pg, nspace, osd_op);
+  default:
+    logger().warn("unknown op {}", ceph_osd_op_name(op.op));
+    throw std::runtime_error(
+      fmt::format("op '{}' not supported", ceph_osd_op_name(op.op)));
+  }
+}
+
+} // namespace crimson::osd
diff --git a/src/crimson/osd/ops_executer.h b/src/crimson/osd/ops_executer.h
new file mode 100644
index 000000000..42fcf61b8
--- /dev/null
+++ b/src/crimson/osd/ops_executer.h
@@ -0,0 +1,283 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <memory>
+#include <type_traits>
+#include <boost/intrusive_ptr.hpp>
+#include <boost/smart_ptr/intrusive_ref_counter.hpp>
+#include <boost/smart_ptr/local_shared_ptr.hpp>
+#include <seastar/core/chunked_fifo.hh>
+#include <seastar/core/future.hh>
+#include <seastar/core/shared_future.hh>
+
+#include "common/dout.h"
+#include "crimson/net/Fwd.h"
+#include "os/Transaction.h"
+#include "osd/osd_types.h"
+#include "crimson/osd/object_context.h"
+
+#include "crimson/common/errorator.h"
+#include "crimson/common/type_helpers.h"
+#include "crimson/osd/osd_operations/client_request.h"
+#include "crimson/osd/osd_operations/peering_event.h"
+#include "crimson/osd/shard_services.h"
+#include "crimson/osd/osdmap_gate.h"
+
+#include "crimson/osd/pg_backend.h"
+#include "crimson/osd/exceptions.h"
+
+#include "messages/MOSDOp.h"
+
+class PG;
+class PGLSFilter;
+class OSDOp;
+
+namespace crimson::osd {
+
+// OpsExecuter -- a class for executing ops targeting a certain object.
+class OpsExecuter {
+  using call_errorator = crimson::errorator<
+    crimson::stateful_ec,
+    crimson::ct_error::enoent,
+    crimson::ct_error::invarg,
+    crimson::ct_error::permission_denied,
+    crimson::ct_error::operation_not_supported,
+    crimson::ct_error::input_output_error,
+    crimson::ct_error::value_too_large>;
+  using read_errorator = PGBackend::read_errorator;
+  using write_ertr = PGBackend::write_ertr;
+  using get_attr_errorator = PGBackend::get_attr_errorator;
+  using watch_errorator = crimson::errorator<
+    crimson::ct_error::enoent,
+    crimson::ct_error::invarg,
+    crimson::ct_error::not_connected,
+    crimson::ct_error::timed_out>;
+
+public:
+  // because OpsExecuter is a pretty heavyweight object, we want to ensure
+  // it's neither copied nor even moved by accident. Performance is the sole
+  // reason for prohibiting that.
+  OpsExecuter(OpsExecuter&&) = delete;
+  OpsExecuter(const OpsExecuter&) = delete;
+
+  using osd_op_errorator = crimson::compound_errorator_t<
+    call_errorator,
+    read_errorator,
+    write_ertr,
+    get_attr_errorator,
+    watch_errorator,
+    PGBackend::stat_errorator>;
+
+private:
+  // an operation can be divided into two stages: the main one and an
+  // effect-exposing one. The former is performed immediately on the call to
+  // `execute_op()` while the latter runs in `flush_changes()`, after the
+  // main stages of all involved operations have been processed successfully.
+  // When any main stage fails, none of the scheduled effect-exposing stages
+  // is executed.
+  // When an operation requires this division, some variant of `with_effect()`
+  // should be used.
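+  //
+  // Purely illustrative sketch (not part of this change; the op-specific
+  // context type and helper names below are made up) of how a caller could
+  // split an op into the two stages with with_effect_on_obc():
+  //
+  //   return with_effect_on_obc(
+  //     notify_ctx_t{},                          // per-op context
+  //     [&] (auto& ctx) {                        // main stage
+  //       return prepare_notify(ctx, osd_op);
+  //     },
+  //     // effect stage: runs only after all main stages have succeeded and,
+  //     // as enforced further below, must not capture anything
+  //     [] (notify_ctx_t&& ctx, ObjectContextRef obc) {
+  //       return send_notify(std::move(ctx), std::move(obc));
+  //     });
+  //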
+ struct effect_t { + virtual osd_op_errorator::future<> execute() = 0; + virtual ~effect_t() = default; + }; + + ObjectContextRef obc; + const OpInfo& op_info; + const pg_pool_t& pool_info; // for the sake of the ObjClass API + PGBackend& backend; + const MOSDOp& msg; + std::optional<osd_op_params_t> osd_op_params; + bool user_modify = false; + ceph::os::Transaction txn; + + size_t num_read = 0; ///< count read ops + size_t num_write = 0; ///< count update ops + + // this gizmo could be wrapped in std::optional for the sake of lazy + // initialization. we don't need it for ops that doesn't have effect + // TODO: verify the init overhead of chunked_fifo + seastar::chunked_fifo<std::unique_ptr<effect_t>> op_effects; + + template <class Context, class MainFunc, class EffectFunc> + auto with_effect_on_obc( + Context&& ctx, + MainFunc&& main_func, + EffectFunc&& effect_func); + + call_errorator::future<> do_op_call(class OSDOp& osd_op); + watch_errorator::future<> do_op_watch( + class OSDOp& osd_op, + class ObjectState& os, + ceph::os::Transaction& txn); + watch_errorator::future<> do_op_watch_subop_watch( + class OSDOp& osd_op, + class ObjectState& os, + ceph::os::Transaction& txn); + watch_errorator::future<> do_op_watch_subop_reconnect( + class OSDOp& osd_op, + class ObjectState& os, + ceph::os::Transaction& txn); + watch_errorator::future<> do_op_watch_subop_unwatch( + class OSDOp& osd_op, + class ObjectState& os, + ceph::os::Transaction& txn); + watch_errorator::future<> do_op_watch_subop_ping( + class OSDOp& osd_op, + class ObjectState& os, + ceph::os::Transaction& txn); + watch_errorator::future<> do_op_notify( + class OSDOp& osd_op, + const class ObjectState& os); + watch_errorator::future<> do_op_notify_ack( + class OSDOp& osd_op, + const class ObjectState& os); + + hobject_t &get_target() const { + return obc->obs.oi.soid; + } + + template <class Func> + auto do_const_op(Func&& f) { + // TODO: pass backend as read-only + return std::forward<Func>(f)(backend, std::as_const(obc->obs)); + } + + template <class Func> + auto do_read_op(Func&& f) { + ++num_read; + // TODO: pass backend as read-only + return do_const_op(std::forward<Func>(f)); + } + + template <class Func> + auto do_write_op(Func&& f, bool um) { + ++num_write; + if (!osd_op_params) { + osd_op_params.emplace(); + } + user_modify = um; + return std::forward<Func>(f)(backend, obc->obs, txn); + } + + decltype(auto) dont_do_legacy_op() { + return crimson::ct_error::operation_not_supported::make(); + } + +public: + OpsExecuter(ObjectContextRef obc, + const OpInfo& op_info, + const pg_pool_t& pool_info, + PGBackend& backend, + const MOSDOp& msg) + : obc(std::move(obc)), + op_info(op_info), + pool_info(pool_info), + backend(backend), + msg(msg) { + } + + osd_op_errorator::future<> execute_op(class OSDOp& osd_op); + + template <typename Func, typename MutFunc> + osd_op_errorator::future<> flush_changes(Func&& func, MutFunc&& mut_func) &&; + + const auto& get_message() const { + return msg; + } + + size_t get_processed_rw_ops_num() const { + return num_read + num_write; + } + + uint32_t get_pool_stripe_width() const { + return pool_info.get_stripe_width(); + } + + bool has_seen_write() const { + return num_write > 0; + } +}; + +template <class Context, class MainFunc, class EffectFunc> +auto OpsExecuter::with_effect_on_obc( + Context&& ctx, + MainFunc&& main_func, + EffectFunc&& effect_func) +{ + using context_t = std::decay_t<Context>; + // the language offers implicit conversion to pointer-to-function for + // lambda only when it's 
closureless. We enforce this restriction due + // the fact that `flush_changes()` std::moves many executer's parts. + using allowed_effect_func_t = + seastar::future<> (*)(context_t&&, ObjectContextRef); + static_assert(std::is_convertible_v<EffectFunc, allowed_effect_func_t>, + "with_effect function is not allowed to capture"); + struct task_t final : effect_t { + context_t ctx; + EffectFunc effect_func; + ObjectContextRef obc; + + task_t(Context&& ctx, EffectFunc&& effect_func, ObjectContextRef obc) + : ctx(std::move(ctx)), + effect_func(std::move(effect_func)), + obc(std::move(obc)) { + } + osd_op_errorator::future<> execute() final { + return std::move(effect_func)(std::move(ctx), std::move(obc)); + } + }; + auto task = + std::make_unique<task_t>(std::move(ctx), std::move(effect_func), obc); + auto& ctx_ref = task->ctx; + op_effects.emplace_back(std::move(task)); + return std::forward<MainFunc>(main_func)(ctx_ref); +} + +template <typename Func, + typename MutFunc> +OpsExecuter::osd_op_errorator::future<> OpsExecuter::flush_changes( + Func&& func, + MutFunc&& mut_func) && +{ + const bool want_mutate = !txn.empty(); + // osd_op_params are instantiated by every wr-like operation. + assert(osd_op_params || !want_mutate); + assert(obc); + if (__builtin_expect(op_effects.empty(), true)) { + return want_mutate ? std::forward<MutFunc>(mut_func)(std::move(txn), + std::move(obc), + std::move(*osd_op_params), + user_modify) + : std::forward<Func>(func)(std::move(obc)); + } else { + return (want_mutate ? std::forward<MutFunc>(mut_func)(std::move(txn), + std::move(obc), + std::move(*osd_op_params), + user_modify) + : std::forward<Func>(func)(std::move(obc)) + ).safe_then([this] { + // let's do the cleaning of `op_effects` in destructor + return crimson::do_for_each(op_effects, [] (auto& op_effect) { + return op_effect->execute(); + }); + }); + } +} + +// PgOpsExecuter -- a class for executing ops targeting a certain PG. 
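+// Unlike OpsExecuter above, which works on a single object, this class only
+// handles the PG-scoped listing ops (CEPH_OSD_OP_PGLS*/PGNLS*), so it needs
+// just the PG and the namespace carried by the client's MOSDOp.
+//
+// Hypothetical usage sketch (the surrounding caller is assumed and not part
+// of this change):
+//
+//   PgOpsExecuter executer{pg, *m};      // m is the incoming MOSDOp
+//   return executer.execute_op(osd_op);  // e.g. a CEPH_OSD_OP_PGNLS op
+//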
+class PgOpsExecuter { +public: + PgOpsExecuter(const PG& pg, const MOSDOp& msg) + : pg(pg), nspace(msg.get_hobj().nspace) { + } + + seastar::future<> execute_op(class OSDOp& osd_op); + +private: + const PG& pg; + const std::string& nspace; +}; + +} // namespace crimson::osd diff --git a/src/crimson/osd/osd.cc b/src/crimson/osd/osd.cc new file mode 100644 index 000000000..521cb9ba3 --- /dev/null +++ b/src/crimson/osd/osd.cc @@ -0,0 +1,1364 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "osd.h" + +#include <sys/utsname.h> + +#include <boost/iterator/counting_iterator.hpp> +#include <boost/range/join.hpp> +#include <boost/smart_ptr/make_local_shared.hpp> +#include <fmt/format.h> +#include <fmt/ostream.h> +#include <seastar/core/timer.hh> + +#include "common/pick_address.h" +#include "include/util.h" + +#include "messages/MCommand.h" +#include "messages/MOSDBeacon.h" +#include "messages/MOSDBoot.h" +#include "messages/MOSDMap.h" +#include "messages/MOSDMarkMeDown.h" +#include "messages/MOSDOp.h" +#include "messages/MOSDPGLog.h" +#include "messages/MOSDPGPull.h" +#include "messages/MOSDPGPush.h" +#include "messages/MOSDPGPushReply.h" +#include "messages/MOSDPGRecoveryDelete.h" +#include "messages/MOSDPGRecoveryDeleteReply.h" +#include "messages/MOSDRepOpReply.h" +#include "messages/MOSDScrub2.h" +#include "messages/MPGStats.h" + +#include "os/Transaction.h" +#include "osd/ClassHandler.h" +#include "osd/OSDCap.h" +#include "osd/PGPeeringEvent.h" +#include "osd/PeeringState.h" + +#include "crimson/admin/osd_admin.h" +#include "crimson/admin/pg_commands.h" +#include "crimson/common/exception.h" +#include "crimson/mon/MonClient.h" +#include "crimson/net/Connection.h" +#include "crimson/net/Messenger.h" +#include "crimson/os/futurized_collection.h" +#include "crimson/os/futurized_store.h" +#include "crimson/osd/heartbeat.h" +#include "crimson/osd/osd_meta.h" +#include "crimson/osd/pg.h" +#include "crimson/osd/pg_backend.h" +#include "crimson/osd/pg_meta.h" +#include "crimson/osd/osd_operations/client_request.h" +#include "crimson/osd/osd_operations/compound_peering_request.h" +#include "crimson/osd/osd_operations/peering_event.h" +#include "crimson/osd/osd_operations/pg_advance_map.h" +#include "crimson/osd/osd_operations/recovery_subrequest.h" +#include "crimson/osd/osd_operations/replicated_request.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_osd); + } + static constexpr int TICK_INTERVAL = 1; +} + +using crimson::common::local_conf; +using crimson::os::FuturizedStore; + +namespace crimson::osd { + +OSD::OSD(int id, uint32_t nonce, + crimson::net::MessengerRef cluster_msgr, + crimson::net::MessengerRef public_msgr, + crimson::net::MessengerRef hb_front_msgr, + crimson::net::MessengerRef hb_back_msgr) + : whoami{id}, + nonce{nonce}, + // do this in background + beacon_timer{[this] { (void)send_beacon(); }}, + cluster_msgr{cluster_msgr}, + public_msgr{public_msgr}, + monc{new crimson::mon::Client{*public_msgr, *this}}, + mgrc{new crimson::mgr::Client{*public_msgr, *this}}, + store{crimson::os::FuturizedStore::create( + local_conf().get_val<std::string>("osd_objectstore"), + local_conf().get_val<std::string>("osd_data"), + local_conf().get_config_values())}, + shard_services{*this, whoami, *cluster_msgr, *public_msgr, *monc, *mgrc, *store}, + heartbeat{new Heartbeat{whoami, shard_services, *monc, hb_front_msgr, hb_back_msgr}}, + // do this in background + tick_timer{[this] { + 
update_heartbeat_peers(); + update_stats(); + }}, + asok{seastar::make_lw_shared<crimson::admin::AdminSocket>()}, + osdmap_gate("OSD::osdmap_gate", std::make_optional(std::ref(shard_services))) +{ + osdmaps[0] = boost::make_local_shared<OSDMap>(); + for (auto msgr : {std::ref(cluster_msgr), std::ref(public_msgr), + std::ref(hb_front_msgr), std::ref(hb_back_msgr)}) { + msgr.get()->set_auth_server(monc.get()); + msgr.get()->set_auth_client(monc.get()); + } + + if (local_conf()->osd_open_classes_on_start) { + const int r = ClassHandler::get_instance().open_all_classes(); + if (r) { + logger().warn("{} warning: got an error loading one or more classes: {}", + __func__, cpp_strerror(r)); + } + } +} + +OSD::~OSD() = default; + +namespace { +// Initial features in new superblock. +// Features here are also automatically upgraded +CompatSet get_osd_initial_compat_set() +{ + CompatSet::FeatureSet ceph_osd_feature_compat; + CompatSet::FeatureSet ceph_osd_feature_ro_compat; + CompatSet::FeatureSet ceph_osd_feature_incompat; + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGINFO); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_OLOC); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEC); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BIGINFO); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HINTS); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGMETA); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_MISSING); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_FASTINFO); + ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_RECOVERY_DELETES); + return CompatSet(ceph_osd_feature_compat, + ceph_osd_feature_ro_compat, + ceph_osd_feature_incompat); +} +} + +seastar::future<> OSD::mkfs(uuid_d osd_uuid, uuid_d cluster_fsid) +{ + return store->start().then([this, osd_uuid] { + return store->mkfs(osd_uuid); + }).then([this] { + return store->mount(); + }).then([cluster_fsid, this] { + superblock.cluster_fsid = cluster_fsid; + superblock.osd_fsid = store->get_fsid(); + superblock.whoami = whoami; + superblock.compat_features = get_osd_initial_compat_set(); + + logger().info( + "{} writing superblock cluster_fsid {} osd_fsid {}", + __func__, + cluster_fsid, + superblock.osd_fsid); + return store->create_new_collection(coll_t::meta()); + }).then([this] (auto ch) { + meta_coll = make_unique<OSDMeta>(ch , store.get()); + ceph::os::Transaction t; + meta_coll->create(t); + meta_coll->store_superblock(t, superblock); + return store->do_transaction(meta_coll->collection(), std::move(t)); + }).then([cluster_fsid, this] { + return when_all_succeed( + store->write_meta("ceph_fsid", cluster_fsid.to_string()), + store->write_meta("whoami", std::to_string(whoami))); + }).then_unpack([cluster_fsid, this] { + fmt::print("created object store {} for osd.{} fsid {}\n", + local_conf().get_val<std::string>("osd_data"), + whoami, cluster_fsid); + return seastar::now(); + }); +} + +namespace { + entity_addrvec_t pick_addresses(int what) { + entity_addrvec_t addrs; + 
crimson::common::CephContext cct; + if (int r = ::pick_addresses(&cct, what, &addrs, -1); r < 0) { + throw std::runtime_error("failed to pick address"); + } + for (auto addr : addrs.v) { + logger().info("picked address {}", addr); + } + return addrs; + } + std::pair<entity_addrvec_t, bool> + replace_unknown_addrs(entity_addrvec_t maybe_unknowns, + const entity_addrvec_t& knowns) { + bool changed = false; + auto maybe_replace = [&](entity_addr_t addr) { + if (!addr.is_blank_ip()) { + return addr; + } + for (auto& b : knowns.v) { + if (addr.get_family() == b.get_family()) { + auto a = b; + a.set_nonce(addr.get_nonce()); + a.set_type(addr.get_type()); + a.set_port(addr.get_port()); + changed = true; + return a; + } + } + throw std::runtime_error("failed to replace unknown address"); + }; + entity_addrvec_t replaced; + std::transform(maybe_unknowns.v.begin(), + maybe_unknowns.v.end(), + std::back_inserter(replaced.v), + maybe_replace); + return {replaced, changed}; + } +} + +seastar::future<> OSD::start() +{ + logger().info("start"); + + startup_time = ceph::mono_clock::now(); + + return store->start().then([this] { + return store->mount(); + }).then([this] { + return store->open_collection(coll_t::meta()); + }).then([this](auto ch) { + meta_coll = make_unique<OSDMeta>(ch, store.get()); + return meta_coll->load_superblock(); + }).then([this](OSDSuperblock&& sb) { + superblock = std::move(sb); + return get_map(superblock.current_epoch); + }).then([this](cached_map_t&& map) { + shard_services.update_map(map); + osdmap_gate.got_map(map->get_epoch()); + osdmap = std::move(map); + return load_pgs(); + }).then([this] { + + uint64_t osd_required = + CEPH_FEATURE_UID | + CEPH_FEATURE_PGID64 | + CEPH_FEATURE_OSDENC; + using crimson::net::SocketPolicy; + + public_msgr->set_default_policy(SocketPolicy::stateless_server(0)); + public_msgr->set_policy(entity_name_t::TYPE_MON, + SocketPolicy::lossy_client(osd_required)); + public_msgr->set_policy(entity_name_t::TYPE_MGR, + SocketPolicy::lossy_client(osd_required)); + public_msgr->set_policy(entity_name_t::TYPE_OSD, + SocketPolicy::stateless_server(0)); + + cluster_msgr->set_default_policy(SocketPolicy::stateless_server(0)); + cluster_msgr->set_policy(entity_name_t::TYPE_MON, + SocketPolicy::lossy_client(0)); + cluster_msgr->set_policy(entity_name_t::TYPE_OSD, + SocketPolicy::lossless_peer(osd_required)); + cluster_msgr->set_policy(entity_name_t::TYPE_CLIENT, + SocketPolicy::stateless_server(0)); + + crimson::net::dispatchers_t dispatchers{this, monc.get(), mgrc.get()}; + return seastar::when_all_succeed( + cluster_msgr->try_bind(pick_addresses(CEPH_PICK_ADDRESS_CLUSTER), + local_conf()->ms_bind_port_min, + local_conf()->ms_bind_port_max) + .safe_then([this, dispatchers]() mutable { + return cluster_msgr->start(dispatchers); + }, crimson::net::Messenger::bind_ertr::all_same_way( + [] (const std::error_code& e) { + logger().error("cluster messenger try_bind(): address range is unavailable."); + ceph_abort(); + })), + public_msgr->try_bind(pick_addresses(CEPH_PICK_ADDRESS_PUBLIC), + local_conf()->ms_bind_port_min, + local_conf()->ms_bind_port_max) + .safe_then([this, dispatchers]() mutable { + return public_msgr->start(dispatchers); + }, crimson::net::Messenger::bind_ertr::all_same_way( + [] (const std::error_code& e) { + logger().error("public messenger try_bind(): address range is unavailable."); + ceph_abort(); + }))); + }).then_unpack([this] { + return seastar::when_all_succeed(monc->start(), + mgrc->start()); + }).then_unpack([this] { + return 
_add_me_to_crush(); + }).then([this] { + monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0); + monc->sub_want("mgrmap", 0, 0); + monc->sub_want("osdmap", 0, 0); + return monc->renew_subs(); + }).then([this] { + if (auto [addrs, changed] = + replace_unknown_addrs(cluster_msgr->get_myaddrs(), + public_msgr->get_myaddrs()); changed) { + return cluster_msgr->set_myaddrs(addrs); + } else { + return seastar::now(); + } + }).then([this] { + return heartbeat->start(public_msgr->get_myaddrs(), + cluster_msgr->get_myaddrs()); + }).then([this] { + // create the admin-socket server, and the objects that register + // to handle incoming commands + return start_asok_admin(); + }).then([this] { + return start_boot(); + }); +} + +seastar::future<> OSD::start_boot() +{ + state.set_preboot(); + return monc->get_version("osdmap").then([this](auto&& ret) { + auto [newest, oldest] = ret; + return _preboot(oldest, newest); + }); +} + +seastar::future<> OSD::_preboot(version_t oldest, version_t newest) +{ + logger().info("osd.{}: _preboot", whoami); + if (osdmap->get_epoch() == 0) { + logger().info("waiting for initial osdmap"); + } else if (osdmap->is_destroyed(whoami)) { + logger().warn("osdmap says I am destroyed"); + // provide a small margin so we don't livelock seeing if we + // un-destroyed ourselves. + if (osdmap->get_epoch() > newest - 1) { + throw std::runtime_error("i am destroyed"); + } + } else if (osdmap->is_noup(whoami)) { + logger().warn("osdmap NOUP flag is set, waiting for it to clear"); + } else if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE)) { + logger().error("osdmap SORTBITWISE OSDMap flag is NOT set; please set it"); + } else if (osdmap->require_osd_release < ceph_release_t::octopus) { + logger().error("osdmap require_osd_release < octopus; please upgrade to octopus"); + } else if (false) { + // TODO: update mon if current fullness state is different from osdmap + } else if (version_t n = local_conf()->osd_map_message_max; + osdmap->get_epoch() >= oldest - 1 && + osdmap->get_epoch() + n > newest) { + return _send_boot(); + } + // get all the latest maps + if (osdmap->get_epoch() + 1 >= oldest) { + return shard_services.osdmap_subscribe(osdmap->get_epoch() + 1, false); + } else { + return shard_services.osdmap_subscribe(oldest - 1, true); + } +} + +seastar::future<> OSD::_send_boot() +{ + state.set_booting(); + + logger().info("hb_back_msgr: {}", heartbeat->get_back_addrs()); + logger().info("hb_front_msgr: {}", heartbeat->get_front_addrs()); + logger().info("cluster_msgr: {}", cluster_msgr->get_myaddr()); + auto m = make_message<MOSDBoot>(superblock, + osdmap->get_epoch(), + osdmap->get_epoch(), + heartbeat->get_back_addrs(), + heartbeat->get_front_addrs(), + cluster_msgr->get_myaddrs(), + CEPH_FEATURES_ALL); + collect_sys_info(&m->metadata, NULL); + return monc->send_message(m); +} + +seastar::future<> OSD::_add_me_to_crush() +{ + if (!local_conf().get_val<bool>("osd_crush_update_on_start")) { + return seastar::now(); + } + auto get_weight = [this] { + if (auto w = local_conf().get_val<double>("osd_crush_initial_weight"); + w >= 0) { + return seastar::make_ready_future<double>(w); + } else { + return store->stat().then([](auto st) { + auto total = st.total; + return seastar::make_ready_future<double>( + std::max(.00001, + double(total) / double(1ull << 40))); // TB + }); + } + }; + return get_weight().then([this](auto weight) { + const crimson::crush::CrushLocation loc{make_unique<CephContext>().get()}; + logger().info("{} crush location is {}", __func__, loc); + string cmd = 
fmt::format(R"({{ + "prefix": "osd crush create-or-move", + "id": {}, + "weight": {:.4f}, + "args": [{}] + }})", whoami, weight, loc); + return monc->run_command({cmd}, {}); + }).then([](auto&& command_result) { + [[maybe_unused]] auto [code, message, out] = std::move(command_result); + if (code) { + logger().warn("fail to add to crush: {} ({})", message, code); + throw std::runtime_error("fail to add to crush"); + } else { + logger().info("added to crush: {}", message); + } + return seastar::now(); + }); +} + +seastar::future<> OSD::handle_command(crimson::net::ConnectionRef conn, + Ref<MCommand> m) +{ + return asok->handle_command(conn, std::move(m)); +} + +/* + The OSD's Admin Socket object created here has two servers (i.e. - blocks of commands + to handle) registered to it: + - OSD's specific commands are handled by the OSD object; + - there are some common commands registered to be directly handled by the AdminSocket object + itself. +*/ +seastar::future<> OSD::start_asok_admin() +{ + auto asok_path = local_conf().get_val<std::string>("admin_socket"); + using namespace crimson::admin; + return asok->start(asok_path).then([this] { + return seastar::when_all_succeed( + asok->register_admin_commands(), + asok->register_command(make_asok_hook<OsdStatusHook>(std::as_const(*this))), + asok->register_command(make_asok_hook<SendBeaconHook>(*this)), + asok->register_command(make_asok_hook<FlushPgStatsHook>(*this)), + asok->register_command(make_asok_hook<DumpPGStateHistory>(std::as_const(*this))), + asok->register_command(make_asok_hook<SeastarMetricsHook>()), + // PG commands + asok->register_command(make_asok_hook<pg::QueryCommand>(*this)), + asok->register_command(make_asok_hook<pg::MarkUnfoundLostCommand>(*this))); + }).then_unpack([] { + return seastar::now(); + }); +} + +seastar::future<> OSD::stop() +{ + logger().info("stop"); + // see also OSD::shutdown() + return prepare_to_stop().then([this] { + state.set_stopping(); + logger().debug("prepared to stop"); + public_msgr->stop(); + cluster_msgr->stop(); + auto gate_close_fut = gate.close(); + return asok->stop().then([this] { + return heartbeat->stop(); + }).then([this] { + return store->umount(); + }).then([this] { + return store->stop(); + }).then([this] { + return seastar::parallel_for_each(pg_map.get_pgs(), + [](auto& p) { + return p.second->stop(); + }); + }).then([this] { + return monc->stop(); + }).then([this] { + return mgrc->stop(); + }).then([fut=std::move(gate_close_fut)]() mutable { + return std::move(fut); + }).then([this] { + return when_all_succeed( + public_msgr->shutdown(), + cluster_msgr->shutdown()); + }).then_unpack([] { + return seastar::now(); + }).handle_exception([](auto ep) { + logger().error("error while stopping osd: {}", ep); + }); + }); +} + +void OSD::dump_status(Formatter* f) const +{ + f->dump_stream("cluster_fsid") << superblock.cluster_fsid; + f->dump_stream("osd_fsid") << superblock.osd_fsid; + f->dump_unsigned("whoami", superblock.whoami); + f->dump_string("state", state.to_string()); + f->dump_unsigned("oldest_map", superblock.oldest_map); + f->dump_unsigned("newest_map", superblock.newest_map); + f->dump_unsigned("num_pgs", pg_map.get_pgs().size()); +} + +void OSD::dump_pg_state_history(Formatter* f) const +{ + f->open_array_section("pgs"); + for (auto [pgid, pg] : pg_map.get_pgs()) { + f->open_object_section("pg"); + f->dump_stream("pg") << pgid; + const auto& peering_state = pg->get_peering_state(); + f->dump_string("currently", peering_state.get_current_state()); + peering_state.dump_history(f); 
+ f->close_section(); + } + f->close_section(); +} + +void OSD::print(std::ostream& out) const +{ + out << "{osd." << superblock.whoami << " " + << superblock.osd_fsid << " [" << superblock.oldest_map + << "," << superblock.newest_map << "] " << pg_map.get_pgs().size() + << " pgs}"; +} + +seastar::future<> OSD::load_pgs() +{ + return store->list_collections().then([this](auto colls) { + return seastar::parallel_for_each(colls, [this](auto coll) { + spg_t pgid; + if (coll.is_pg(&pgid)) { + return load_pg(pgid).then([pgid, this](auto&& pg) { + logger().info("load_pgs: loaded {}", pgid); + pg_map.pg_loaded(pgid, std::move(pg)); + shard_services.inc_pg_num(); + return seastar::now(); + }); + } else if (coll.is_temp(&pgid)) { + // TODO: remove the collection + return seastar::now(); + } else { + logger().warn("ignoring unrecognized collection: {}", coll); + return seastar::now(); + } + }); + }); +} + +seastar::future<Ref<PG>> OSD::make_pg(cached_map_t create_map, + spg_t pgid, + bool do_create) +{ + using ec_profile_t = map<string,string>; + auto get_pool_info = [create_map, pgid, this] { + if (create_map->have_pg_pool(pgid.pool())) { + pg_pool_t pi = *create_map->get_pg_pool(pgid.pool()); + string name = create_map->get_pool_name(pgid.pool()); + ec_profile_t ec_profile; + if (pi.is_erasure()) { + ec_profile = create_map->get_erasure_code_profile(pi.erasure_code_profile); + } + return seastar::make_ready_future<std::tuple<pg_pool_t, string, ec_profile_t>>( + std::make_tuple(std::move(pi), + std::move(name), + std::move(ec_profile))); + } else { + // pool was deleted; grab final pg_pool_t off disk. + return meta_coll->load_final_pool_info(pgid.pool()); + } + }; + auto get_collection = [pgid, do_create, this] { + const coll_t cid{pgid}; + if (do_create) { + return store->create_new_collection(cid); + } else { + return store->open_collection(cid); + } + }; + return seastar::when_all( + std::move(get_pool_info), + std::move(get_collection) + ).then([pgid, create_map, this] (auto&& ret) { + auto [pool, name, ec_profile] = std::move(std::get<0>(ret).get0()); + auto coll = std::move(std::get<1>(ret).get0()); + return seastar::make_ready_future<Ref<PG>>( + new PG{pgid, + pg_shard_t{whoami, pgid.shard}, + std::move(coll), + std::move(pool), + std::move(name), + create_map, + shard_services, + ec_profile}); + }); +} + +seastar::future<Ref<PG>> OSD::load_pg(spg_t pgid) +{ + logger().debug("{}: {}", __func__, pgid); + + return seastar::do_with(PGMeta(store.get(), pgid), [] (auto& pg_meta) { + return pg_meta.get_epoch(); + }).then([this](epoch_t e) { + return get_map(e); + }).then([pgid, this] (auto&& create_map) { + return make_pg(std::move(create_map), pgid, false); + }).then([this](Ref<PG> pg) { + return pg->read_state(store.get()).then([pg] { + return seastar::make_ready_future<Ref<PG>>(std::move(pg)); + }); + }).handle_exception([pgid](auto ep) { + logger().info("pg {} saw exception on load {}", pgid, ep); + ceph_abort("Could not load pg" == 0); + return seastar::make_exception_future<Ref<PG>>(ep); + }); +} + +std::optional<seastar::future<>> +OSD::ms_dispatch(crimson::net::ConnectionRef conn, MessageRef m) +{ + if (state.is_stopping()) { + return {}; + } + bool dispatched = true; + gate.dispatch_in_background(__func__, *this, [this, conn, &m, &dispatched] { + switch (m->get_type()) { + case CEPH_MSG_OSD_MAP: + return handle_osd_map(conn, boost::static_pointer_cast<MOSDMap>(m)); + case CEPH_MSG_OSD_OP: + return handle_osd_op(conn, boost::static_pointer_cast<MOSDOp>(m)); + case MSG_OSD_PG_CREATE2: + 
shard_services.start_operation<CompoundPeeringRequest>( + *this, + conn, + m); + return seastar::now(); + case MSG_COMMAND: + return handle_command(conn, boost::static_pointer_cast<MCommand>(m)); + case MSG_OSD_MARK_ME_DOWN: + return handle_mark_me_down(conn, boost::static_pointer_cast<MOSDMarkMeDown>(m)); + case MSG_OSD_PG_PULL: + [[fallthrough]]; + case MSG_OSD_PG_PUSH: + [[fallthrough]]; + case MSG_OSD_PG_PUSH_REPLY: + [[fallthrough]]; + case MSG_OSD_PG_RECOVERY_DELETE: + [[fallthrough]]; + case MSG_OSD_PG_RECOVERY_DELETE_REPLY: + [[fallthrough]]; + case MSG_OSD_PG_SCAN: + [[fallthrough]]; + case MSG_OSD_PG_BACKFILL: + [[fallthrough]]; + case MSG_OSD_PG_BACKFILL_REMOVE: + return handle_recovery_subreq(conn, boost::static_pointer_cast<MOSDFastDispatchOp>(m)); + case MSG_OSD_PG_LEASE: + [[fallthrough]]; + case MSG_OSD_PG_LEASE_ACK: + [[fallthrough]]; + case MSG_OSD_PG_NOTIFY2: + [[fallthrough]]; + case MSG_OSD_PG_INFO2: + [[fallthrough]]; + case MSG_OSD_PG_QUERY2: + [[fallthrough]]; + case MSG_OSD_BACKFILL_RESERVE: + [[fallthrough]]; + case MSG_OSD_RECOVERY_RESERVE: + [[fallthrough]]; + case MSG_OSD_PG_LOG: + return handle_peering_op(conn, boost::static_pointer_cast<MOSDPeeringOp>(m)); + case MSG_OSD_REPOP: + return handle_rep_op(conn, boost::static_pointer_cast<MOSDRepOp>(m)); + case MSG_OSD_REPOPREPLY: + return handle_rep_op_reply(conn, boost::static_pointer_cast<MOSDRepOpReply>(m)); + case MSG_OSD_SCRUB2: + return handle_scrub(conn, boost::static_pointer_cast<MOSDScrub2>(m)); + default: + dispatched = false; + return seastar::now(); + } + }); + return (dispatched ? std::make_optional(seastar::now()) : std::nullopt); +} + +void OSD::ms_handle_reset(crimson::net::ConnectionRef conn, bool is_replace) +{ + // TODO: cleanup the session attached to this connection + logger().warn("ms_handle_reset"); +} + +void OSD::ms_handle_remote_reset(crimson::net::ConnectionRef conn) +{ + logger().warn("ms_handle_remote_reset"); +} + +void OSD::handle_authentication(const EntityName& name, + const AuthCapsInfo& caps_info) +{ + // TODO: store the parsed cap and associate it with the connection + if (caps_info.allow_all) { + logger().debug("{} {} has all caps", __func__, name); + return; + } + if (caps_info.caps.length() > 0) { + auto p = caps_info.caps.cbegin(); + string str; + try { + decode(str, p); + } catch (ceph::buffer::error& e) { + logger().warn("{} {} failed to decode caps string", __func__, name); + return; + } + OSDCap caps; + if (caps.parse(str)) { + logger().debug("{} {} has caps {}", __func__, name, str); + } else { + logger().warn("{} {} failed to parse caps {}", __func__, name, str); + } + } +} + +void OSD::update_stats() +{ + osd_stat_seq++; + osd_stat.up_from = get_up_epoch(); + osd_stat.hb_peers = heartbeat->get_peers(); + osd_stat.seq = (static_cast<uint64_t>(get_up_epoch()) << 32) | osd_stat_seq; + gate.dispatch_in_background("statfs", *this, [this] { + (void) store->stat().then([this](store_statfs_t&& st) { + osd_stat.statfs = st; + }); + }); +} + +MessageRef OSD::get_stats() const +{ + // todo: m-to-n: collect stats using map-reduce + // MPGStats::had_map_for is not used since PGMonitor was removed + auto m = make_message<MPGStats>(monc->get_fsid(), osdmap->get_epoch()); + m->osd_stat = osd_stat; + for (auto [pgid, pg] : pg_map.get_pgs()) { + if (pg->is_primary()) { + auto stats = pg->get_stats(); + // todo: update reported_epoch,reported_seq,last_fresh + stats.reported_epoch = osdmap->get_epoch(); + m->pg_stat.emplace(pgid.pgid, std::move(stats)); + } + } + return m; +} + +uint64_t 
OSD::send_pg_stats() +{ + // mgr client sends the report message in background + mgrc->report(); + return osd_stat.seq; +} + +OSD::cached_map_t OSD::get_map() const +{ + return osdmap; +} + +seastar::future<OSD::cached_map_t> OSD::get_map(epoch_t e) +{ + // TODO: use LRU cache for managing osdmap, fallback to disk if we have to + if (auto found = osdmaps.find(e); found) { + return seastar::make_ready_future<cached_map_t>(std::move(found)); + } else { + return load_map(e).then([e, this](unique_ptr<OSDMap> osdmap) { + return seastar::make_ready_future<cached_map_t>( + osdmaps.insert(e, std::move(osdmap))); + }); + } +} + +void OSD::store_map_bl(ceph::os::Transaction& t, + epoch_t e, bufferlist&& bl) +{ + meta_coll->store_map(t, e, bl); + map_bl_cache.insert(e, std::move(bl)); +} + +seastar::future<bufferlist> OSD::load_map_bl(epoch_t e) +{ + if (std::optional<bufferlist> found = map_bl_cache.find(e); found) { + return seastar::make_ready_future<bufferlist>(*found); + } else { + return meta_coll->load_map(e); + } +} + +seastar::future<std::map<epoch_t, bufferlist>> OSD::load_map_bls( + epoch_t first, + epoch_t last) +{ + return seastar::map_reduce(boost::make_counting_iterator<epoch_t>(first), + boost::make_counting_iterator<epoch_t>(last + 1), + [this](epoch_t e) { + return load_map_bl(e).then([e](auto&& bl) { + return seastar::make_ready_future<pair<epoch_t, bufferlist>>( + std::make_pair(e, std::move(bl))); + }); + }, + std::map<epoch_t, bufferlist>{}, + [](auto&& bls, auto&& epoch_bl) { + bls.emplace(std::move(epoch_bl)); + return std::move(bls); + }); +} + +seastar::future<std::unique_ptr<OSDMap>> OSD::load_map(epoch_t e) +{ + auto o = std::make_unique<OSDMap>(); + if (e > 0) { + return load_map_bl(e).then([o=std::move(o)](bufferlist bl) mutable { + o->decode(bl); + return seastar::make_ready_future<unique_ptr<OSDMap>>(std::move(o)); + }); + } else { + return seastar::make_ready_future<unique_ptr<OSDMap>>(std::move(o)); + } +} + +seastar::future<> OSD::store_maps(ceph::os::Transaction& t, + epoch_t start, Ref<MOSDMap> m) +{ + return seastar::do_for_each(boost::make_counting_iterator(start), + boost::make_counting_iterator(m->get_last() + 1), + [&t, m, this](epoch_t e) { + if (auto p = m->maps.find(e); p != m->maps.end()) { + auto o = std::make_unique<OSDMap>(); + o->decode(p->second); + logger().info("store_maps osdmap.{}", e); + store_map_bl(t, e, std::move(std::move(p->second))); + osdmaps.insert(e, std::move(o)); + return seastar::now(); + } else if (auto p = m->incremental_maps.find(e); + p != m->incremental_maps.end()) { + return load_map(e - 1).then([e, bl=p->second, &t, this](auto o) { + OSDMap::Incremental inc; + auto i = bl.cbegin(); + inc.decode(i); + o->apply_incremental(inc); + bufferlist fbl; + o->encode(fbl, inc.encode_features | CEPH_FEATURE_RESERVED); + store_map_bl(t, e, std::move(fbl)); + osdmaps.insert(e, std::move(o)); + return seastar::now(); + }); + } else { + logger().error("MOSDMap lied about what maps it had?"); + return seastar::now(); + } + }); +} + +bool OSD::require_mon_peer(crimson::net::Connection *conn, Ref<Message> m) +{ + if (!conn->peer_is_mon()) { + logger().info("{} received from non-mon {}, {}", + __func__, + conn->get_peer_addr(), + *m); + return false; + } + return true; +} + +seastar::future<Ref<PG>> OSD::handle_pg_create_info( + std::unique_ptr<PGCreateInfo> info) { + return seastar::do_with( + std::move(info), + [this](auto &info) -> seastar::future<Ref<PG>> { + return get_map(info->epoch).then( + [&info, this](cached_map_t startmap) -> + 
seastar::future<std::tuple<Ref<PG>, cached_map_t>> { + const spg_t &pgid = info->pgid; + if (info->by_mon) { + int64_t pool_id = pgid.pgid.pool(); + const pg_pool_t *pool = osdmap->get_pg_pool(pool_id); + if (!pool) { + logger().debug( + "{} ignoring pgid {}, pool dne", + __func__, + pgid); + return seastar::make_ready_future<std::tuple<Ref<PG>, cached_map_t>>( + std::make_tuple(Ref<PG>(), startmap)); + } + ceph_assert(osdmap->require_osd_release >= ceph_release_t::octopus); + if (!pool->has_flag(pg_pool_t::FLAG_CREATING)) { + // this ensures we do not process old creating messages after the + // pool's initial pgs have been created (and pg are subsequently + // allowed to split or merge). + logger().debug( + "{} dropping {} create, pool does not have CREATING flag set", + __func__, + pgid); + return seastar::make_ready_future<std::tuple<Ref<PG>, cached_map_t>>( + std::make_tuple(Ref<PG>(), startmap)); + } + } + return make_pg(startmap, pgid, true).then( + [startmap=std::move(startmap)](auto pg) mutable { + return seastar::make_ready_future<std::tuple<Ref<PG>, cached_map_t>>( + std::make_tuple(std::move(pg), std::move(startmap))); + }); + }).then([this, &info](auto&& ret) -> + seastar::future<Ref<PG>> { + auto [pg, startmap] = std::move(ret); + if (!pg) + return seastar::make_ready_future<Ref<PG>>(Ref<PG>()); + PeeringCtx rctx{ceph_release_t::octopus}; + const pg_pool_t* pp = startmap->get_pg_pool(info->pgid.pool()); + + int up_primary, acting_primary; + vector<int> up, acting; + startmap->pg_to_up_acting_osds( + info->pgid.pgid, &up, &up_primary, &acting, &acting_primary); + + int role = startmap->calc_pg_role(pg_shard_t(whoami, info->pgid.shard), + acting); + + create_pg_collection( + rctx.transaction, + info->pgid, + info->pgid.get_split_bits(pp->get_pg_num())); + init_pg_ondisk( + rctx.transaction, + info->pgid, + pp); + + pg->init( + role, + up, + up_primary, + acting, + acting_primary, + info->history, + info->past_intervals, + false, + rctx.transaction); + + return shard_services.start_operation<PGAdvanceMap>( + *this, pg, pg->get_osdmap_epoch(), + osdmap->get_epoch(), std::move(rctx), true).second.then([pg=pg] { + return seastar::make_ready_future<Ref<PG>>(pg); + }); + }); + }); +} + +seastar::future<> OSD::handle_osd_map(crimson::net::ConnectionRef conn, + Ref<MOSDMap> m) +{ + logger().info("handle_osd_map {}", *m); + if (m->fsid != superblock.cluster_fsid) { + logger().warn("fsid mismatched"); + return seastar::now(); + } + if (state.is_initializing()) { + logger().warn("i am still initializing"); + return seastar::now(); + } + + const auto first = m->get_first(); + const auto last = m->get_last(); + logger().info("handle_osd_map epochs [{}..{}], i have {}, src has [{}..{}]", + first, last, superblock.newest_map, m->oldest_map, m->newest_map); + // make sure there is something new, here, before we bother flushing + // the queues and such + if (last <= superblock.newest_map) { + return seastar::now(); + } + // missing some? + bool skip_maps = false; + epoch_t start = superblock.newest_map + 1; + if (first > start) { + logger().info("handle_osd_map message skips epochs {}..{}", + start, first - 1); + if (m->oldest_map <= start) { + return shard_services.osdmap_subscribe(start, false); + } + // always try to get the full range of maps--as many as we can. this + // 1- is good to have + // 2- is at present the only way to ensure that we get a *full* map as + // the first map! 
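+    // (an incremental map can only be applied on top of the preceding full
+    // map, which is what store_maps() above relies on, so the first map we
+    // persist has to be a full one)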
+ if (m->oldest_map < first) { + return shard_services.osdmap_subscribe(m->oldest_map - 1, true); + } + skip_maps = true; + start = first; + } + + return seastar::do_with(ceph::os::Transaction{}, + [=](auto& t) { + return store_maps(t, start, m).then([=, &t] { + // even if this map isn't from a mon, we may have satisfied our subscription + monc->sub_got("osdmap", last); + if (!superblock.oldest_map || skip_maps) { + superblock.oldest_map = first; + } + superblock.newest_map = last; + superblock.current_epoch = last; + + // note in the superblock that we were clean thru the prior epoch + if (boot_epoch && boot_epoch >= superblock.mounted) { + superblock.mounted = boot_epoch; + superblock.clean_thru = last; + } + meta_coll->store_superblock(t, superblock); + return store->do_transaction(meta_coll->collection(), std::move(t)); + }); + }).then([=] { + // TODO: write to superblock and commit the transaction + return committed_osd_maps(start, last, m); + }); +} + +seastar::future<> OSD::committed_osd_maps(version_t first, + version_t last, + Ref<MOSDMap> m) +{ + logger().info("osd.{}: committed_osd_maps({}, {})", whoami, first, last); + // advance through the new maps + return seastar::do_for_each(boost::make_counting_iterator(first), + boost::make_counting_iterator(last + 1), + [this](epoch_t cur) { + return get_map(cur).then([this](cached_map_t&& o) { + osdmap = std::move(o); + shard_services.update_map(osdmap); + if (up_epoch == 0 && + osdmap->is_up(whoami) && + osdmap->get_addrs(whoami) == public_msgr->get_myaddrs()) { + up_epoch = osdmap->get_epoch(); + if (!boot_epoch) { + boot_epoch = osdmap->get_epoch(); + } + } + }); + }).then([m, this] { + if (osdmap->is_up(whoami) && + osdmap->get_addrs(whoami) == public_msgr->get_myaddrs() && + bind_epoch < osdmap->get_up_from(whoami)) { + if (state.is_booting()) { + logger().info("osd.{}: activating...", whoami); + state.set_active(); + beacon_timer.arm_periodic( + std::chrono::seconds(local_conf()->osd_beacon_report_interval)); + tick_timer.arm_periodic( + std::chrono::seconds(TICK_INTERVAL)); + } + } else if (!osdmap->is_up(whoami)) { + if (state.is_prestop()) { + got_stop_ack(); + return seastar::now(); + } + } + check_osdmap_features(); + // yay! 
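+    // consume_map() advances every PG to this epoch via PGAdvanceMap and then
+    // unblocks any requests waiting on osdmap_gate for it (see its definition
+    // further below).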
+ return consume_map(osdmap->get_epoch()); + }).then([m, this] { + if (state.is_active()) { + logger().info("osd.{}: now active", whoami); + if (!osdmap->exists(whoami)) { + return shutdown(); + } + if (should_restart()) { + return restart(); + } else { + return seastar::now(); + } + } else if (state.is_preboot()) { + logger().info("osd.{}: now preboot", whoami); + + if (m->get_source().is_mon()) { + return _preboot(m->oldest_map, m->newest_map); + } else { + logger().info("osd.{}: start_boot", whoami); + return start_boot(); + } + } else { + logger().info("osd.{}: now {}", whoami, state); + // XXX + return seastar::now(); + } + }); +} + +seastar::future<> OSD::handle_osd_op(crimson::net::ConnectionRef conn, + Ref<MOSDOp> m) +{ + (void) shard_services.start_operation<ClientRequest>( + *this, + conn, + std::move(m)); + return seastar::now(); +} + +seastar::future<> OSD::send_incremental_map(crimson::net::ConnectionRef conn, + epoch_t first) +{ + if (first >= superblock.oldest_map) { + return load_map_bls(first, superblock.newest_map) + .then([this, conn, first](auto&& bls) { + auto m = make_message<MOSDMap>(monc->get_fsid(), + osdmap->get_encoding_features()); + m->oldest_map = first; + m->newest_map = superblock.newest_map; + m->maps = std::move(bls); + return conn->send(m); + }); + } else { + return load_map_bl(osdmap->get_epoch()) + .then([this, conn](auto&& bl) mutable { + auto m = make_message<MOSDMap>(monc->get_fsid(), + osdmap->get_encoding_features()); + m->oldest_map = superblock.oldest_map; + m->newest_map = superblock.newest_map; + m->maps.emplace(osdmap->get_epoch(), std::move(bl)); + return conn->send(m); + }); + } +} + +seastar::future<> OSD::handle_rep_op(crimson::net::ConnectionRef conn, + Ref<MOSDRepOp> m) +{ + m->finish_decode(); + (void) shard_services.start_operation<RepRequest>( + *this, + std::move(conn), + std::move(m)); + return seastar::now(); +} + +seastar::future<> OSD::handle_rep_op_reply(crimson::net::ConnectionRef conn, + Ref<MOSDRepOpReply> m) +{ + const auto& pgs = pg_map.get_pgs(); + if (auto pg = pgs.find(m->get_spg()); pg != pgs.end()) { + m->finish_decode(); + pg->second->handle_rep_op_reply(conn, *m); + } else { + logger().warn("stale reply: {}", *m); + } + return seastar::now(); +} + +seastar::future<> OSD::handle_scrub(crimson::net::ConnectionRef conn, + Ref<MOSDScrub2> m) +{ + if (m->fsid != superblock.cluster_fsid) { + logger().warn("fsid mismatched"); + return seastar::now(); + } + return seastar::parallel_for_each(std::move(m->scrub_pgs), + [m, conn, this](spg_t pgid) { + pg_shard_t from_shard{static_cast<int>(m->get_source().num()), + pgid.shard}; + PeeringState::RequestScrub scrub_request{m->deep, m->repair}; + return shard_services.start_operation<RemotePeeringEvent>( + *this, + conn, + shard_services, + from_shard, + pgid, + PGPeeringEvent{m->epoch, m->epoch, scrub_request}).second; + }); +} + +seastar::future<> OSD::handle_mark_me_down(crimson::net::ConnectionRef conn, + Ref<MOSDMarkMeDown> m) +{ + if (state.is_prestop()) { + got_stop_ack(); + } + return seastar::now(); +} + +seastar::future<> OSD::handle_recovery_subreq(crimson::net::ConnectionRef conn, + Ref<MOSDFastDispatchOp> m) +{ + (void) shard_services.start_operation<RecoverySubRequest>( + *this, + conn, + std::move(m)); + return seastar::now(); +} + +bool OSD::should_restart() const +{ + if (!osdmap->is_up(whoami)) { + logger().info("map e {} marked osd.{} down", + osdmap->get_epoch(), whoami); + return true; + } else if (osdmap->get_addrs(whoami) != public_msgr->get_myaddrs()) { + 
logger().error("map e {} had wrong client addr ({} != my {})", + osdmap->get_epoch(), + osdmap->get_addrs(whoami), + public_msgr->get_myaddrs()); + return true; + } else if (osdmap->get_cluster_addrs(whoami) != cluster_msgr->get_myaddrs()) { + logger().error("map e {} had wrong cluster addr ({} != my {})", + osdmap->get_epoch(), + osdmap->get_cluster_addrs(whoami), + cluster_msgr->get_myaddrs()); + return true; + } else { + return false; + } +} + +seastar::future<> OSD::restart() +{ + beacon_timer.cancel(); + tick_timer.cancel(); + up_epoch = 0; + bind_epoch = osdmap->get_epoch(); + // TODO: promote to shutdown if being marked down for multiple times + // rebind messengers + return start_boot(); +} + +seastar::future<> OSD::shutdown() +{ + // TODO + superblock.mounted = boot_epoch; + superblock.clean_thru = osdmap->get_epoch(); + return seastar::now(); +} + +seastar::future<> OSD::send_beacon() +{ + if (!state.is_active()) { + return seastar::now(); + } + // FIXME: min lec should be calculated from pg_stat + // and should set m->pgs + epoch_t min_last_epoch_clean = osdmap->get_epoch(); + auto m = make_message<MOSDBeacon>(osdmap->get_epoch(), + min_last_epoch_clean, + superblock.last_purged_snaps_scrub, + local_conf()->osd_beacon_report_interval); + return monc->send_message(m); +} + +void OSD::update_heartbeat_peers() +{ + if (!state.is_active()) { + return; + } + for (auto& pg : pg_map.get_pgs()) { + vector<int> up, acting; + osdmap->pg_to_up_acting_osds(pg.first.pgid, + &up, nullptr, + &acting, nullptr); + for (int osd : boost::join(up, acting)) { + if (osd == CRUSH_ITEM_NONE || osd == whoami) { + continue; + } else { + heartbeat->add_peer(osd, osdmap->get_epoch()); + } + } + } + heartbeat->update_peers(whoami); +} + +seastar::future<> OSD::handle_peering_op( + crimson::net::ConnectionRef conn, + Ref<MOSDPeeringOp> m) +{ + const int from = m->get_source().num(); + logger().debug("handle_peering_op on {} from {}", m->get_spg(), from); + std::unique_ptr<PGPeeringEvent> evt(m->get_event()); + (void) shard_services.start_operation<RemotePeeringEvent>( + *this, + conn, + shard_services, + pg_shard_t{from, m->get_spg().shard}, + m->get_spg(), + std::move(*evt)); + return seastar::now(); +} + +void OSD::check_osdmap_features() +{ + heartbeat->set_require_authorizer(true); +} + +seastar::future<> OSD::consume_map(epoch_t epoch) +{ + // todo: m-to-n: broadcast this news to all shards + auto &pgs = pg_map.get_pgs(); + return seastar::parallel_for_each(pgs.begin(), pgs.end(), [=](auto& pg) { + return shard_services.start_operation<PGAdvanceMap>( + *this, pg.second, pg.second->get_osdmap_epoch(), epoch, + PeeringCtx{ceph_release_t::octopus}, false).second; + }).then([epoch, this] { + osdmap_gate.got_map(epoch); + return seastar::make_ready_future(); + }); +} + + +blocking_future<Ref<PG>> +OSD::get_or_create_pg( + spg_t pgid, + epoch_t epoch, + std::unique_ptr<PGCreateInfo> info) +{ + if (info) { + auto [fut, creating] = pg_map.wait_for_pg(pgid); + if (!creating) { + pg_map.set_creating(pgid); + (void)handle_pg_create_info(std::move(info)); + } + return std::move(fut); + } else { + return make_ready_blocking_future<Ref<PG>>(pg_map.get_pg(pgid)); + } +} + +blocking_future<Ref<PG>> OSD::wait_for_pg( + spg_t pgid) +{ + return pg_map.wait_for_pg(pgid).first; +} + +Ref<PG> OSD::get_pg(spg_t pgid) +{ + return pg_map.get_pg(pgid); +} + +seastar::future<> OSD::prepare_to_stop() +{ + if (osdmap && osdmap->is_up(whoami)) { + state.set_prestop(); + const auto timeout = + 
std::chrono::duration_cast<std::chrono::milliseconds>( + std::chrono::duration<double>( + local_conf().get_val<double>("osd_mon_shutdown_timeout"))); + + return seastar::with_timeout( + seastar::timer<>::clock::now() + timeout, + monc->send_message( + make_message<MOSDMarkMeDown>( + monc->get_fsid(), + whoami, + osdmap->get_addrs(whoami), + osdmap->get_epoch(), + true)).then([this] { + return stop_acked.get_future(); + }) + ).handle_exception_type( + [](seastar::timed_out_error&) { + return seastar::now(); + }); + } + return seastar::now(); +} + +} diff --git a/src/crimson/osd/osd.h b/src/crimson/osd/osd.h new file mode 100644 index 000000000..889960ced --- /dev/null +++ b/src/crimson/osd/osd.h @@ -0,0 +1,250 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <seastar/core/future.hh> +#include <seastar/core/shared_future.hh> +#include <seastar/core/gate.hh> +#include <seastar/core/shared_ptr.hh> +#include <seastar/core/shared_future.hh> +#include <seastar/core/timer.hh> + +#include "crimson/common/type_helpers.h" +#include "crimson/common/auth_handler.h" +#include "crimson/common/gated.h" +#include "crimson/admin/admin_socket.h" +#include "crimson/common/simple_lru.h" +#include "crimson/common/shared_lru.h" +#include "crimson/mgr/client.h" +#include "crimson/net/Dispatcher.h" +#include "crimson/osd/osdmap_service.h" +#include "crimson/osd/state.h" +#include "crimson/osd/shard_services.h" +#include "crimson/osd/osdmap_gate.h" +#include "crimson/osd/pg_map.h" +#include "crimson/osd/osd_operations/peering_event.h" + +#include "messages/MOSDOp.h" +#include "osd/PeeringState.h" +#include "osd/osd_types.h" +#include "osd/osd_perf_counters.h" +#include "osd/PGPeeringEvent.h" + +class MCommand; +class MOSDMap; +class MOSDRepOpReply; +class MOSDRepOp; +class MOSDScrub2; +class OSDMap; +class OSDMeta; +class Heartbeat; + +namespace ceph::os { + class Transaction; +} + +namespace crimson::mon { + class Client; +} + +namespace crimson::net { + class Messenger; +} + +namespace crimson::os { + class FuturizedStore; +} + +namespace crimson::osd { +class PG; + +class OSD final : public crimson::net::Dispatcher, + private OSDMapService, + private crimson::common::AuthHandler, + private crimson::mgr::WithStats { + const int whoami; + const uint32_t nonce; + seastar::timer<seastar::lowres_clock> beacon_timer; + // talk with osd + crimson::net::MessengerRef cluster_msgr; + // talk with client/mon/mgr + crimson::net::MessengerRef public_msgr; + std::unique_ptr<crimson::mon::Client> monc; + std::unique_ptr<crimson::mgr::Client> mgrc; + + SharedLRU<epoch_t, OSDMap> osdmaps; + SimpleLRU<epoch_t, bufferlist, false> map_bl_cache; + cached_map_t osdmap; + // TODO: use a wrapper for ObjectStore + std::unique_ptr<crimson::os::FuturizedStore> store; + std::unique_ptr<OSDMeta> meta_coll; + + OSDState state; + + /// _first_ epoch we were marked up (after this process started) + epoch_t boot_epoch = 0; + /// _most_recent_ epoch we were marked up + epoch_t up_epoch = 0; + //< epoch we last did a bind to new ip:ports + epoch_t bind_epoch = 0; + //< since when there is no more pending pg creates from mon + epoch_t last_pg_create_epoch = 0; + + ceph::mono_time startup_time; + + OSDSuperblock superblock; + + // Dispatcher methods + std::optional<seastar::future<>> ms_dispatch(crimson::net::ConnectionRef, MessageRef) final; + void ms_handle_reset(crimson::net::ConnectionRef conn, bool is_replace) final; + void 
ms_handle_remote_reset(crimson::net::ConnectionRef conn) final; + + // mgr::WithStats methods + // pg statistics including osd ones + osd_stat_t osd_stat; + uint32_t osd_stat_seq = 0; + void update_stats(); + MessageRef get_stats() const final; + + // AuthHandler methods + void handle_authentication(const EntityName& name, + const AuthCapsInfo& caps) final; + + crimson::osd::ShardServices shard_services; + + std::unique_ptr<Heartbeat> heartbeat; + seastar::timer<seastar::lowres_clock> tick_timer; + + // admin-socket + seastar::lw_shared_ptr<crimson::admin::AdminSocket> asok; + +public: + OSD(int id, uint32_t nonce, + crimson::net::MessengerRef cluster_msgr, + crimson::net::MessengerRef client_msgr, + crimson::net::MessengerRef hb_front_msgr, + crimson::net::MessengerRef hb_back_msgr); + ~OSD() final; + + seastar::future<> mkfs(uuid_d osd_uuid, uuid_d cluster_fsid); + + seastar::future<> start(); + seastar::future<> stop(); + + void dump_status(Formatter*) const; + void dump_pg_state_history(Formatter*) const; + void print(std::ostream&) const; + + seastar::future<> send_incremental_map(crimson::net::ConnectionRef conn, + epoch_t first); + + /// @return the seq id of the pg stats being sent + uint64_t send_pg_stats(); + +private: + seastar::future<> start_boot(); + seastar::future<> _preboot(version_t oldest_osdmap, version_t newest_osdmap); + seastar::future<> _send_boot(); + seastar::future<> _add_me_to_crush(); + + seastar::future<Ref<PG>> make_pg(cached_map_t create_map, + spg_t pgid, + bool do_create); + seastar::future<Ref<PG>> load_pg(spg_t pgid); + seastar::future<> load_pgs(); + + // OSDMapService methods + epoch_t get_up_epoch() const final { + return up_epoch; + } + seastar::future<cached_map_t> get_map(epoch_t e) final; + cached_map_t get_map() const final; + seastar::future<std::unique_ptr<OSDMap>> load_map(epoch_t e); + seastar::future<bufferlist> load_map_bl(epoch_t e); + seastar::future<std::map<epoch_t, bufferlist>> + load_map_bls(epoch_t first, epoch_t last); + void store_map_bl(ceph::os::Transaction& t, + epoch_t e, bufferlist&& bl); + seastar::future<> store_maps(ceph::os::Transaction& t, + epoch_t start, Ref<MOSDMap> m); + seastar::future<> osdmap_subscribe(version_t epoch, bool force_request); + + void write_superblock(ceph::os::Transaction& t); + seastar::future<> read_superblock(); + + bool require_mon_peer(crimson::net::Connection *conn, Ref<Message> m); + + seastar::future<Ref<PG>> handle_pg_create_info( + std::unique_ptr<PGCreateInfo> info); + + seastar::future<> handle_osd_map(crimson::net::ConnectionRef conn, + Ref<MOSDMap> m); + seastar::future<> handle_osd_op(crimson::net::ConnectionRef conn, + Ref<MOSDOp> m); + seastar::future<> handle_rep_op(crimson::net::ConnectionRef conn, + Ref<MOSDRepOp> m); + seastar::future<> handle_rep_op_reply(crimson::net::ConnectionRef conn, + Ref<MOSDRepOpReply> m); + seastar::future<> handle_peering_op(crimson::net::ConnectionRef conn, + Ref<MOSDPeeringOp> m); + seastar::future<> handle_recovery_subreq(crimson::net::ConnectionRef conn, + Ref<MOSDFastDispatchOp> m); + seastar::future<> handle_scrub(crimson::net::ConnectionRef conn, + Ref<MOSDScrub2> m); + seastar::future<> handle_mark_me_down(crimson::net::ConnectionRef conn, + Ref<MOSDMarkMeDown> m); + + seastar::future<> committed_osd_maps(version_t first, + version_t last, + Ref<MOSDMap> m); + + void check_osdmap_features(); + + seastar::future<> handle_command(crimson::net::ConnectionRef conn, + Ref<MCommand> m); + seastar::future<> start_asok_admin(); + +public: + OSDMapGate 
osdmap_gate; + + ShardServices &get_shard_services() { + return shard_services; + } + + seastar::future<> consume_map(epoch_t epoch); + +private: + PGMap pg_map; + crimson::common::Gated gate; + + seastar::promise<> stop_acked; + void got_stop_ack() { + stop_acked.set_value(); + } + seastar::future<> prepare_to_stop(); +public: + blocking_future<Ref<PG>> get_or_create_pg( + spg_t pgid, + epoch_t epoch, + std::unique_ptr<PGCreateInfo> info); + blocking_future<Ref<PG>> wait_for_pg( + spg_t pgid); + Ref<PG> get_pg(spg_t pgid); + + bool should_restart() const; + seastar::future<> restart(); + seastar::future<> shutdown(); + + seastar::future<> send_beacon(); + void update_heartbeat_peers(); + + friend class PGAdvanceMap; +}; + +inline std::ostream& operator<<(std::ostream& out, const OSD& osd) { + osd.print(out); + return out; +} + +} diff --git a/src/crimson/osd/osd_connection_priv.h b/src/crimson/osd/osd_connection_priv.h new file mode 100644 index 000000000..a265bb432 --- /dev/null +++ b/src/crimson/osd/osd_connection_priv.h @@ -0,0 +1,27 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "crimson/net/Connection.h" +#include "crimson/osd/osd_operation.h" +#include "crimson/osd/osd_operations/client_request.h" +#include "crimson/osd/osd_operations/peering_event.h" +#include "crimson/osd/osd_operations/replicated_request.h" + +namespace crimson::osd { + +struct OSDConnectionPriv : public crimson::net::Connection::user_private_t { + ClientRequest::ConnectionPipeline client_request_conn_pipeline; + RemotePeeringEvent::ConnectionPipeline peering_request_conn_pipeline; + RepRequest::ConnectionPipeline replicated_request_conn_pipeline; +}; + +static OSDConnectionPriv &get_osd_priv(crimson::net::Connection *conn) { + if (!conn->has_user_private()) { + conn->set_user_private(std::make_unique<OSDConnectionPriv>()); + } + return static_cast<OSDConnectionPriv&>(conn->get_user_private()); +} + +} diff --git a/src/crimson/osd/osd_meta.cc b/src/crimson/osd/osd_meta.cc new file mode 100644 index 000000000..9b9215f5b --- /dev/null +++ b/src/crimson/osd/osd_meta.cc @@ -0,0 +1,99 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "osd_meta.h" + +#include <fmt/format.h> +#include <fmt/ostream.h> + +#include "crimson/os/futurized_collection.h" +#include "crimson/os/futurized_store.h" +#include "os/Transaction.h" + +using read_errorator = crimson::os::FuturizedStore::read_errorator; + +void OSDMeta::create(ceph::os::Transaction& t) +{ + t.create_collection(coll->get_cid(), 0); +} + +void OSDMeta::store_map(ceph::os::Transaction& t, + epoch_t e, const bufferlist& m) +{ + t.write(coll->get_cid(), osdmap_oid(e), 0, m.length(), m); +} + +seastar::future<bufferlist> OSDMeta::load_map(epoch_t e) +{ + return store->read(coll, + osdmap_oid(e), 0, 0, + CEPH_OSD_OP_FLAG_FADVISE_WILLNEED).handle_error( + read_errorator::all_same_way([e] { + throw std::runtime_error(fmt::format("read gave enoent on {}", + osdmap_oid(e))); + })); +} + +void OSDMeta::store_superblock(ceph::os::Transaction& t, + const OSDSuperblock& superblock) +{ + bufferlist bl; + encode(superblock, bl); + t.write(coll->get_cid(), superblock_oid(), 0, bl.length(), bl); +} + +seastar::future<OSDSuperblock> OSDMeta::load_superblock() +{ + return store->read(coll, superblock_oid(), 0, 0).safe_then( + [] (bufferlist&& bl) { + auto p = bl.cbegin(); + OSDSuperblock superblock; + decode(superblock, p); + return 
seastar::make_ready_future<OSDSuperblock>(std::move(superblock)); + }, read_errorator::all_same_way([] { + throw std::runtime_error(fmt::format("read gave enoent on {}", + superblock_oid())); + })); +} + +seastar::future<std::tuple<pg_pool_t, + std::string, + OSDMeta::ec_profile_t>> +OSDMeta::load_final_pool_info(int64_t pool) { + return store->read(coll, final_pool_info_oid(pool), + 0, 0).safe_then([] (bufferlist&& bl) { + auto p = bl.cbegin(); + pg_pool_t pi; + string name; + ec_profile_t ec_profile; + decode(pi, p); + decode(name, p); + decode(ec_profile, p); + return seastar::make_ready_future<std::tuple<pg_pool_t, + string, + ec_profile_t>>( + std::make_tuple(std::move(pi), + std::move(name), + std::move(ec_profile))); + },read_errorator::all_same_way([pool] { + throw std::runtime_error(fmt::format("read gave enoent on {}", + final_pool_info_oid(pool))); + })); +} + +ghobject_t OSDMeta::osdmap_oid(epoch_t epoch) +{ + string name = fmt::format("osdmap.{}", epoch); + return ghobject_t(hobject_t(sobject_t(object_t(name), 0))); +} + +ghobject_t OSDMeta::final_pool_info_oid(int64_t pool) +{ + string name = fmt::format("final_pool_{}", pool); + return ghobject_t(hobject_t(sobject_t(object_t(name), CEPH_NOSNAP))); +} + +ghobject_t OSDMeta::superblock_oid() +{ + return ghobject_t(hobject_t(sobject_t(object_t("osd_superblock"), 0))); +} diff --git a/src/crimson/osd/osd_meta.h b/src/crimson/osd/osd_meta.h new file mode 100644 index 000000000..841572087 --- /dev/null +++ b/src/crimson/osd/osd_meta.h @@ -0,0 +1,56 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <map> +#include <string> +#include <seastar/core/future.hh> +#include "osd/osd_types.h" +#include "crimson/os/futurized_collection.h" + +namespace ceph::os { + class Transaction; +} + +namespace crimson::os { + class FuturizedCollection; + class FuturizedStore; +} + +/// metadata shared across PGs, or put in another way, +/// metadata not specific to certain PGs. 
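+///
+/// A minimal usage sketch (illustrative only; the collection ref, store
+/// pointer, epoch and encoded map are assumed to come from the caller):
+///
+///   OSDMeta meta{coll, store.get()};
+///   ceph::os::Transaction t;
+///   meta.create(t);
+///   meta.store_map(t, epoch, map_bl);
+///   // later, asynchronously:
+///   // meta.load_map(epoch).then([](bufferlist&& bl) { /* decode */ });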
+class OSDMeta { + template<typename T> using Ref = boost::intrusive_ptr<T>; + + crimson::os::FuturizedStore* store; + Ref<crimson::os::FuturizedCollection> coll; + +public: + OSDMeta(Ref<crimson::os::FuturizedCollection> coll, + crimson::os::FuturizedStore* store) + : store{store}, coll{coll} + {} + + auto collection() { + return coll; + } + void create(ceph::os::Transaction& t); + + void store_map(ceph::os::Transaction& t, + epoch_t e, const bufferlist& m); + seastar::future<bufferlist> load_map(epoch_t e); + + void store_superblock(ceph::os::Transaction& t, + const OSDSuperblock& sb); + seastar::future<OSDSuperblock> load_superblock(); + + using ec_profile_t = std::map<std::string, std::string>; + seastar::future<std::tuple<pg_pool_t, + std::string, + ec_profile_t>> load_final_pool_info(int64_t pool); +private: + static ghobject_t osdmap_oid(epoch_t epoch); + static ghobject_t final_pool_info_oid(int64_t pool); + static ghobject_t superblock_oid(); +}; diff --git a/src/crimson/osd/osd_operation.cc b/src/crimson/osd/osd_operation.cc new file mode 100644 index 000000000..b5f3c3cbb --- /dev/null +++ b/src/crimson/osd/osd_operation.cc @@ -0,0 +1,159 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "osd_operation.h" +#include "common/Formatter.h" + +namespace crimson::osd { + +void Operation::dump(ceph::Formatter* f) +{ + f->open_object_section("operation"); + f->dump_string("type", get_type_name()); + f->dump_unsigned("id", id); + { + f->open_object_section("detail"); + dump_detail(f); + f->close_section(); + } + f->open_array_section("blockers"); + for (auto &blocker : blockers) { + blocker->dump(f); + } + f->close_section(); + f->close_section(); +} + +void Operation::dump_brief(ceph::Formatter* f) +{ + f->open_object_section("operation"); + f->dump_string("type", get_type_name()); + f->dump_unsigned("id", id); + f->close_section(); +} + +std::ostream &operator<<(std::ostream &lhs, const Operation &rhs) { + lhs << rhs.get_type_name() << "(id=" << rhs.get_id() << ", detail="; + rhs.print(lhs); + lhs << ")"; + return lhs; +} + +void Blocker::dump(ceph::Formatter* f) const +{ + f->open_object_section("blocker"); + f->dump_string("op_type", get_type_name()); + { + f->open_object_section("detail"); + dump_detail(f); + f->close_section(); + } + f->close_section(); +} + +void AggregateBlocker::dump_detail(ceph::Formatter *f) const +{ + f->open_array_section("parent_blockers"); + for (auto b : parent_blockers) { + f->open_object_section("parent_blocker"); + b->dump(f); + f->close_section(); + } + f->close_section(); +} + +OperationThrottler::OperationThrottler(ConfigProxy &conf) + : scheduler(crimson::osd::scheduler::make_scheduler(conf)) +{ + conf.add_observer(this); + update_from_config(conf); +} + +void OperationThrottler::wake() +{ + while ((!max_in_progress || in_progress < max_in_progress) && + !scheduler->empty()) { + auto item = scheduler->dequeue(); + item.wake.set_value(); + ++in_progress; + --pending; + } +} + +void OperationThrottler::release_throttle() +{ + ceph_assert(in_progress > 0); + --in_progress; + wake(); +} + +blocking_future<> OperationThrottler::acquire_throttle( + crimson::osd::scheduler::params_t params) +{ + crimson::osd::scheduler::item_t item{params, seastar::promise<>()}; + auto fut = item.wake.get_future(); + scheduler->enqueue(std::move(item)); + return make_blocking_future(std::move(fut)); +} + +void OperationThrottler::dump_detail(Formatter *f) const +{ + f->dump_unsigned("max_in_progress", 
max_in_progress); + f->dump_unsigned("in_progress", in_progress); + f->open_object_section("scheduler"); + { + scheduler->dump(*f); + } + f->close_section(); +} + +void OperationThrottler::update_from_config(const ConfigProxy &conf) +{ + max_in_progress = conf.get_val<uint64_t>("crimson_osd_scheduler_concurrency"); + wake(); +} + +const char** OperationThrottler::get_tracked_conf_keys() const +{ + static const char* KEYS[] = { + "crimson_osd_scheduler_concurrency", + NULL + }; + return KEYS; +} + +void OperationThrottler::handle_conf_change( + const ConfigProxy& conf, + const std::set<std::string> &changed) +{ + update_from_config(conf); +} + + +void OrderedPipelinePhase::Handle::exit() +{ + if (phase) { + phase->mutex.unlock(); + phase = nullptr; + } +} + +blocking_future<> OrderedPipelinePhase::Handle::enter( + OrderedPipelinePhase &new_phase) +{ + auto fut = new_phase.mutex.lock(); + exit(); + phase = &new_phase; + return new_phase.make_blocking_future(std::move(fut)); +} + +OrderedPipelinePhase::Handle::~Handle() +{ + exit(); +} + +void OrderedPipelinePhase::dump_detail(ceph::Formatter* f) const +{ +} + +} diff --git a/src/crimson/osd/osd_operation.h b/src/crimson/osd/osd_operation.h new file mode 100644 index 000000000..5178749b0 --- /dev/null +++ b/src/crimson/osd/osd_operation.h @@ -0,0 +1,427 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <algorithm> +#include <array> +#include <set> +#include <vector> +#include <boost/intrusive/list.hpp> +#include <boost/intrusive_ptr.hpp> +#include <boost/smart_ptr/intrusive_ref_counter.hpp> +#include <seastar/core/shared_mutex.hh> +#include <seastar/core/future.hh> +#include <seastar/core/timer.hh> +#include <seastar/core/lowres_clock.hh> + +#include "include/ceph_assert.h" +#include "crimson/osd/scheduler/scheduler.h" + +namespace ceph { + class Formatter; +} + +namespace crimson::osd { + +enum class OperationTypeCode { + client_request = 0, + peering_event, + compound_peering_request, + pg_advance_map, + pg_creation, + replicated_request, + background_recovery, + background_recovery_sub, + last_op +}; + +static constexpr const char* const OP_NAMES[] = { + "client_request", + "peering_event", + "compound_peering_request", + "pg_advance_map", + "pg_creation", + "replicated_request", + "background_recovery", + "background_recovery_sub", +}; + +// prevent the addition of OperationTypeCode-s with no matching OP_NAMES entry: +static_assert( + (sizeof(OP_NAMES)/sizeof(OP_NAMES[0])) == + static_cast<int>(OperationTypeCode::last_op)); + +class OperationRegistry; + +using registry_hook_t = boost::intrusive::list_member_hook< + boost::intrusive::link_mode<boost::intrusive::auto_unlink>>; + +class Operation; +class Blocker; + +/** + * Provides an abstraction for registering and unregistering a blocker + * for the duration of a future becoming available. 
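+ *
+ * A minimal sketch of the intended use (illustrative; the blocker and the
+ * wrapped future are assumed to exist in the calling code):
+ *
+ *   blocking_future<> bf =
+ *     some_blocker.make_blocking_future(some_shared_mutex.lock());
+ *   return op->with_blocking_future(std::move(bf));
+ *
+ * While the wrapped future is pending the op lists `some_blocker` among
+ * its blockers; the registration is dropped once the future resolves.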
+ */ +template <typename Fut> +class blocking_future_detail { + friend class Operation; + friend class Blocker; + Blocker *blocker; + Fut fut; + blocking_future_detail(Blocker *b, Fut &&f) + : blocker(b), fut(std::move(f)) {} + + template <typename V, typename U> + friend blocking_future_detail<seastar::future<V>> make_ready_blocking_future(U&& args); + template <typename V, typename Exception> + friend blocking_future_detail<seastar::future<V>> + make_exception_blocking_future(Exception&& e); + + template <typename U> + friend blocking_future_detail<seastar::future<>> join_blocking_futures(U &&u); + + template <typename U> + friend class blocking_future_detail; + +public: + template <typename F> + auto then(F &&f) && { + using result = decltype(std::declval<Fut>().then(f)); + return blocking_future_detail<seastar::futurize_t<result>>( + blocker, + std::move(fut).then(std::forward<F>(f))); + } +}; + +template <typename T=void> +using blocking_future = blocking_future_detail<seastar::future<T>>; + +template <typename V, typename U> +blocking_future_detail<seastar::future<V>> make_ready_blocking_future(U&& args) { + return blocking_future<V>( + nullptr, + seastar::make_ready_future<V>(std::forward<U>(args))); +} + +template <typename V, typename Exception> +blocking_future_detail<seastar::future<V>> +make_exception_blocking_future(Exception&& e) { + return blocking_future<V>( + nullptr, + seastar::make_exception_future<V>(e)); +} + +/** + * Provides an interface for dumping diagnostic information about + * why a particular op is not making progress. + */ +class Blocker { +public: + template <typename T> + blocking_future<T> make_blocking_future(seastar::future<T> &&f) { + return blocking_future<T>(this, std::move(f)); + } + void dump(ceph::Formatter *f) const; + virtual ~Blocker() = default; + +private: + virtual void dump_detail(ceph::Formatter *f) const = 0; + virtual const char *get_type_name() const = 0; +}; + +template <typename T> +class BlockerT : public Blocker { +public: + virtual ~BlockerT() = default; +private: + const char *get_type_name() const final { + return T::type_name; + } +}; + +class AggregateBlocker : public BlockerT<AggregateBlocker> { + vector<Blocker*> parent_blockers; +public: + AggregateBlocker(vector<Blocker*> &&parent_blockers) + : parent_blockers(std::move(parent_blockers)) {} + static constexpr const char *type_name = "AggregateBlocker"; +private: + void dump_detail(ceph::Formatter *f) const final; +}; + +template <typename T> +blocking_future<> join_blocking_futures(T &&t) { + vector<Blocker*> blockers; + blockers.reserve(t.size()); + for (auto &&bf: t) { + blockers.push_back(bf.blocker); + bf.blocker = nullptr; + } + auto agg = std::make_unique<AggregateBlocker>(std::move(blockers)); + return agg->make_blocking_future( + seastar::parallel_for_each( + std::forward<T>(t), + [](auto &&bf) { + return std::move(bf.fut); + }).then([agg=std::move(agg)] { + return seastar::make_ready_future<>(); + })); +} + + +/** + * Common base for all crimson-osd operations. Mainly provides + * an interface for registering ops in flight and dumping + * diagnostic information. 
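+ *
+ * Sketch of a concrete op built on this base (names are placeholders, not
+ * part of this change; a real op supplies its own OperationTypeCode):
+ *
+ *   class ExampleOp final : public OperationT<ExampleOp> {
+ *   public:
+ *     static constexpr OperationTypeCode type =
+ *       OperationTypeCode::client_request;  // placeholder type code
+ *     void print(std::ostream &out) const final { out << "ExampleOp"; }
+ *   private:
+ *     void dump_detail(ceph::Formatter *) const final {}
+ *   };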
+ */ +class Operation : public boost::intrusive_ref_counter< + Operation, boost::thread_unsafe_counter> { + public: + uint64_t get_id() const { + return id; + } + + virtual OperationTypeCode get_type() const = 0; + virtual const char *get_type_name() const = 0; + virtual void print(std::ostream &) const = 0; + + template <typename T> + seastar::future<T> with_blocking_future(blocking_future<T> &&f) { + if (f.fut.available()) { + return std::move(f.fut); + } + assert(f.blocker); + add_blocker(f.blocker); + return std::move(f.fut).then_wrapped([this, blocker=f.blocker](auto &&arg) { + clear_blocker(blocker); + return std::move(arg); + }); + } + + void dump(ceph::Formatter *f); + void dump_brief(ceph::Formatter *f); + virtual ~Operation() = default; + + private: + virtual void dump_detail(ceph::Formatter *f) const = 0; + + private: + registry_hook_t registry_hook; + + std::vector<Blocker*> blockers; + uint64_t id = 0; + void set_id(uint64_t in_id) { + id = in_id; + } + + void add_blocker(Blocker *b) { + blockers.push_back(b); + } + + void clear_blocker(Blocker *b) { + auto iter = std::find(blockers.begin(), blockers.end(), b); + if (iter != blockers.end()) { + blockers.erase(iter); + } + } + + friend class OperationRegistry; +}; +using OperationRef = boost::intrusive_ptr<Operation>; + +std::ostream &operator<<(std::ostream &, const Operation &op); + +template <typename T> +class OperationT : public Operation { +public: + static constexpr const char *type_name = OP_NAMES[static_cast<int>(T::type)]; + using IRef = boost::intrusive_ptr<T>; + + OperationTypeCode get_type() const final { + return T::type; + } + + const char *get_type_name() const final { + return T::type_name; + } + + virtual ~OperationT() = default; + +private: + virtual void dump_detail(ceph::Formatter *f) const = 0; +}; + +/** + * Maintains a set of lists of all active ops. + */ +class OperationRegistry { + friend class Operation; + using op_list_member_option = boost::intrusive::member_hook< + Operation, + registry_hook_t, + &Operation::registry_hook + >; + using op_list = boost::intrusive::list< + Operation, + op_list_member_option, + boost::intrusive::constant_time_size<false>>; + + std::array< + op_list, + static_cast<int>(OperationTypeCode::last_op) + > registries; + + std::array< + uint64_t, + static_cast<int>(OperationTypeCode::last_op) + > op_id_counters = {}; + + seastar::timer<seastar::lowres_clock> shutdown_timer; + seastar::promise<> shutdown; +public: + template <typename T, typename... Args> + typename T::IRef create_operation(Args&&... args) { + typename T::IRef op = new T(std::forward<Args>(args)...); + registries[static_cast<int>(T::type)].push_back(*op); + op->set_id(op_id_counters[static_cast<int>(T::type)]++); + return op; + } + + seastar::future<> stop() { + shutdown_timer.set_callback([this] { + if (std::all_of(registries.begin(), + registries.end(), + [](auto& opl) { + return opl.empty(); + })) { + shutdown.set_value(); + shutdown_timer.cancel(); + } + }); + shutdown_timer.arm_periodic(std::chrono::milliseconds(100/*TODO: use option instead*/)); + return shutdown.get_future(); + } +}; + +/** + * Throttles set of currently running operations + * + * Very primitive currently, assumes all ops are equally + * expensive and simply limits the number that can be + * concurrently active. 
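+ *
+ * Illustrative call shape (sketch; the op ref, scheduler params and the
+ * per-iteration body are assumed to come from the caller):
+ *
+ *   return throttler.with_throttle_while(
+ *     op, params,
+ *     [] { return do_one_step(); });  // future<bool>; true means "run again"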
+ */ +class OperationThrottler : public Blocker, + private md_config_obs_t { +public: + OperationThrottler(ConfigProxy &conf); + + const char** get_tracked_conf_keys() const final; + void handle_conf_change(const ConfigProxy& conf, + const std::set<std::string> &changed) final; + void update_from_config(const ConfigProxy &conf); + + template <typename F> + auto with_throttle( + OperationRef op, + crimson::osd::scheduler::params_t params, + F &&f) { + if (!max_in_progress) return f(); + auto fut = acquire_throttle(params); + return op->with_blocking_future(std::move(fut)) + .then(std::forward<F>(f)) + .then([this](auto x) { + release_throttle(); + return x; + }); + } + + template <typename F> + seastar::future<> with_throttle_while( + OperationRef op, + crimson::osd::scheduler::params_t params, + F &&f) { + return with_throttle(op, params, f).then([this, params, op, f](bool cont) { + if (cont) + return with_throttle_while(op, params, f); + else + return seastar::make_ready_future<>(); + }); + } + +private: + void dump_detail(Formatter *f) const final; + const char *get_type_name() const final { + return "OperationThrottler"; + } + +private: + crimson::osd::scheduler::SchedulerRef scheduler; + + uint64_t max_in_progress = 0; + uint64_t in_progress = 0; + + uint64_t pending = 0; + + void wake(); + + blocking_future<> acquire_throttle( + crimson::osd::scheduler::params_t params); + + void release_throttle(); +}; + +/** + * Ensures that at most one op may consider itself in the phase at a time. + * Ops will see enter() unblock in the order in which they tried to enter + * the phase. entering (though not necessarily waiting for the future to + * resolve) a new phase prior to exiting the previous one will ensure that + * the op ordering is preserved. + */ +class OrderedPipelinePhase : public Blocker { +private: + void dump_detail(ceph::Formatter *f) const final; + const char *get_type_name() const final { + return name; + } + +public: + /** + * Used to encapsulate pipeline residency state. + */ + class Handle { + OrderedPipelinePhase *phase = nullptr; + + public: + Handle() = default; + + Handle(const Handle&) = delete; + Handle(Handle&&) = delete; + Handle &operator=(const Handle&) = delete; + Handle &operator=(Handle&&) = delete; + + /** + * Returns a future which unblocks when the handle has entered the passed + * OrderedPipelinePhase. If already in a phase, enter will also release + * that phase after placing itself in the queue for the next one to preserve + * ordering. + */ + blocking_future<> enter(OrderedPipelinePhase &phase); + + /** + * Releases the current phase if there is one. Called in ~Handle(). 
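+ *
+ * Typical lifecycle, as a sketch (phase names are placeholders): each
+ * enter() queues behind earlier ops and implicitly releases the previous
+ * phase; the destructor (or an explicit exit()) releases the last one:
+ *
+ *   return with_blocking_future(handle.enter(pipeline.await_map))
+ *     .then([this] { return with_blocking_future(handle.enter(pipeline.process)); })
+ *     .then([this] { handle.exit(); });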
+ */ + void exit(); + + ~Handle(); + }; + + OrderedPipelinePhase(const char *name) : name(name) {} + +private: + const char * name; + seastar::shared_mutex mutex; +}; + +} diff --git a/src/crimson/osd/osd_operations/background_recovery.cc b/src/crimson/osd/osd_operations/background_recovery.cc new file mode 100644 index 000000000..126e0e902 --- /dev/null +++ b/src/crimson/osd/osd_operations/background_recovery.cc @@ -0,0 +1,140 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <seastar/core/future.hh> + +#include "messages/MOSDOp.h" + +#include "crimson/osd/pg.h" +#include "crimson/osd/shard_services.h" +#include "common/Formatter.h" +#include "crimson/osd/osd_operations/background_recovery.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_osd); + } +} + +namespace crimson::osd { + +BackgroundRecovery::BackgroundRecovery( + Ref<PG> pg, + ShardServices &ss, + epoch_t epoch_started, + crimson::osd::scheduler::scheduler_class_t scheduler_class) + : pg(pg), + epoch_started(epoch_started), + ss(ss), + scheduler_class(scheduler_class) +{} + +void BackgroundRecovery::print(std::ostream &lhs) const +{ + lhs << "BackgroundRecovery(" << pg->get_pgid() << ")"; +} + +void BackgroundRecovery::dump_detail(Formatter *f) const +{ + f->dump_stream("pgid") << pg->get_pgid(); + f->open_object_section("recovery_detail"); + { + // TODO pg->dump_recovery_state(f); + } + f->close_section(); +} + +seastar::future<> BackgroundRecovery::start() +{ + logger().debug("{}: start", *this); + + IRef ref = this; + return ss.throttler.with_throttle_while( + this, get_scheduler_params(), [this] { + return do_recovery(); + }).handle_exception_type([ref, this](const std::system_error& err) { + if (err.code() == std::make_error_code(std::errc::interrupted)) { + logger().debug("{} recovery interruped: {}", *pg, err.what()); + return seastar::now(); + } + return seastar::make_exception_future<>(err); + }); +} + +seastar::future<bool> UrgentRecovery::do_recovery() +{ + if (!pg->has_reset_since(epoch_started)) { + return with_blocking_future( + pg->get_recovery_handler()->recover_missing(soid, need) + ).then([] { + return seastar::make_ready_future<bool>(false); + }); + } + return seastar::make_ready_future<bool>(false); +} + +void UrgentRecovery::print(std::ostream &lhs) const +{ + lhs << "UrgentRecovery(" << pg->get_pgid() << ", " + << soid << ", v" << need << ")"; +} + +void UrgentRecovery::dump_detail(Formatter *f) const +{ + f->dump_stream("pgid") << pg->get_pgid(); + f->open_object_section("recovery_detail"); + { + f->dump_stream("oid") << soid; + f->dump_stream("version") << need; + } + f->close_section(); +} + +PglogBasedRecovery::PglogBasedRecovery( + Ref<PG> pg, + ShardServices &ss, + const epoch_t epoch_started) + : BackgroundRecovery( + std::move(pg), + ss, + epoch_started, + crimson::osd::scheduler::scheduler_class_t::background_recovery) +{} + +seastar::future<bool> PglogBasedRecovery::do_recovery() +{ + if (pg->has_reset_since(epoch_started)) + return seastar::make_ready_future<bool>(false); + return with_blocking_future( + pg->get_recovery_handler()->start_recovery_ops( + crimson::common::local_conf()->osd_recovery_max_single_start)); +} + +BackfillRecovery::BackfillRecoveryPipeline &BackfillRecovery::bp(PG &pg) +{ + return pg.backfill_pipeline; +} + +seastar::future<bool> BackfillRecovery::do_recovery() +{ + logger().debug("{}", __func__); + + if (pg->has_reset_since(epoch_started)) { + logger().debug("{}: pg 
got reset since epoch_started={}", + __func__, epoch_started); + return seastar::make_ready_future<bool>(false); + } + // TODO: limits + return with_blocking_future( + // process_event() of our boost::statechart machine is non-reentrant. + // with the backfill_pipeline we protect it from a second entry from + // the implementation of BackfillListener. + // additionally, this stage serves to synchronize with PeeringEvent. + handle.enter(bp(*pg).process) + ).then([this] { + pg->get_recovery_handler()->dispatch_backfill_event(std::move(evt)); + return seastar::make_ready_future<bool>(false); + }); +} + +} // namespace crimson::osd diff --git a/src/crimson/osd/osd_operations/background_recovery.h b/src/crimson/osd/osd_operations/background_recovery.h new file mode 100644 index 000000000..37e46c588 --- /dev/null +++ b/src/crimson/osd/osd_operations/background_recovery.h @@ -0,0 +1,126 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <boost/statechart/event_base.hpp> + +#include "crimson/net/Connection.h" +#include "crimson/osd/osd_operation.h" +#include "crimson/common/type_helpers.h" + +#include "messages/MOSDOp.h" + +namespace crimson::osd { +class PG; +class ShardServices; + +class BackgroundRecovery : public OperationT<BackgroundRecovery> { +public: + static constexpr OperationTypeCode type = OperationTypeCode::background_recovery; + + BackgroundRecovery( + Ref<PG> pg, + ShardServices &ss, + epoch_t epoch_started, + crimson::osd::scheduler::scheduler_class_t scheduler_class); + + virtual void print(std::ostream &) const; + seastar::future<> start(); + +protected: + Ref<PG> pg; + const epoch_t epoch_started; + +private: + virtual void dump_detail(Formatter *f) const; + crimson::osd::scheduler::params_t get_scheduler_params() const { + return { + 1, // cost + 0, // owner + scheduler_class + }; + } + virtual seastar::future<bool> do_recovery() = 0; + ShardServices &ss; + const crimson::osd::scheduler::scheduler_class_t scheduler_class; +}; + +/// represent a recovery initiated for serving a client request +/// +/// unlike @c PglogBasedRecovery and @c BackfillRecovery, +/// @c UrgentRecovery is not throttled by the scheduler. and it +/// utilizes @c RecoveryBackend directly to recover the unreadable +/// object. 
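+///
+/// Sketch of how such a recovery is typically started from the client
+/// request path (illustrative; the shard services, pg ref, object and
+/// version are assumed to come from the surrounding code):
+///
+///   auto [op, fut] = shard_services.start_operation<UrgentRecovery>(
+///     soid, need, pgref, shard_services, pgref->get_osdmap_epoch());
+///   return std::move(fut);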
+class UrgentRecovery final : public BackgroundRecovery { +public: + UrgentRecovery( + const hobject_t& soid, + const eversion_t& need, + Ref<PG> pg, + ShardServices& ss, + epoch_t epoch_started) + : BackgroundRecovery{pg, ss, epoch_started, + crimson::osd::scheduler::scheduler_class_t::immediate}, + soid{soid}, need(need) {} + void print(std::ostream&) const final; + +private: + void dump_detail(Formatter* f) const final; + seastar::future<bool> do_recovery() override; + const hobject_t soid; + const eversion_t need; +}; + +class PglogBasedRecovery final : public BackgroundRecovery { +public: + PglogBasedRecovery( + Ref<PG> pg, + ShardServices &ss, + epoch_t epoch_started); + +private: + seastar::future<bool> do_recovery() override; +}; + +class BackfillRecovery final : public BackgroundRecovery { +public: + class BackfillRecoveryPipeline { + OrderedPipelinePhase process = { + "BackfillRecovery::PGPipeline::process" + }; + friend class BackfillRecovery; + friend class PeeringEvent; + }; + + template <class EventT> + BackfillRecovery( + Ref<PG> pg, + ShardServices &ss, + epoch_t epoch_started, + const EventT& evt); + + static BackfillRecoveryPipeline &bp(PG &pg); + +private: + boost::intrusive_ptr<const boost::statechart::event_base> evt; + OrderedPipelinePhase::Handle handle; + seastar::future<bool> do_recovery() override; +}; + +template <class EventT> +BackfillRecovery::BackfillRecovery( + Ref<PG> pg, + ShardServices &ss, + const epoch_t epoch_started, + const EventT& evt) + : BackgroundRecovery( + std::move(pg), + ss, + epoch_started, + crimson::osd::scheduler::scheduler_class_t::background_best_effort), + evt(evt.intrusive_from_this()) +{} + + +} diff --git a/src/crimson/osd/osd_operations/client_request.cc b/src/crimson/osd/osd_operations/client_request.cc new file mode 100644 index 000000000..87b8fc788 --- /dev/null +++ b/src/crimson/osd/osd_operations/client_request.cc @@ -0,0 +1,201 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <seastar/core/future.hh> + +#include "messages/MOSDOp.h" +#include "messages/MOSDOpReply.h" + +#include "crimson/common/exception.h" +#include "crimson/osd/pg.h" +#include "crimson/osd/osd.h" +#include "common/Formatter.h" +#include "crimson/osd/osd_operations/client_request.h" +#include "crimson/osd/osd_connection_priv.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_osd); + } +} + +namespace crimson::osd { + +ClientRequest::ClientRequest( + OSD &osd, crimson::net::ConnectionRef conn, Ref<MOSDOp> &&m) + : osd(osd), conn(conn), m(m) +{} + +void ClientRequest::print(std::ostream &lhs) const +{ + lhs << *m; +} + +void ClientRequest::dump_detail(Formatter *f) const +{ +} + +ClientRequest::ConnectionPipeline &ClientRequest::cp() +{ + return get_osd_priv(conn.get()).client_request_conn_pipeline; +} + +ClientRequest::PGPipeline &ClientRequest::pp(PG &pg) +{ + return pg.client_request_pg_pipeline; +} + +bool ClientRequest::is_pg_op() const +{ + return std::any_of( + begin(m->ops), end(m->ops), + [](auto& op) { return ceph_osd_op_type_pg(op.op.op); }); +} + +seastar::future<> ClientRequest::start() +{ + logger().debug("{}: start", *this); + + IRef opref = this; + return crimson::common::handle_system_shutdown( + [this, opref=std::move(opref)]() mutable { + return seastar::repeat([this, opref]() mutable { + return with_blocking_future(handle.enter(cp().await_map)) + .then([this]() { + return 
with_blocking_future(osd.osdmap_gate.wait_for_map(m->get_min_epoch())); + }).then([this](epoch_t epoch) { + return with_blocking_future(handle.enter(cp().get_pg)); + }).then([this] { + return with_blocking_future(osd.wait_for_pg(m->get_spg())); + }).then([this, opref](Ref<PG> pgref) { + PG &pg = *pgref; + if (pg.can_discard_op(*m)) { + return osd.send_incremental_map(conn, m->get_map_epoch()); + } + return with_blocking_future( + handle.enter(pp(pg).await_map) + ).then([this, &pg]() mutable { + return with_blocking_future( + pg.osdmap_gate.wait_for_map(m->get_min_epoch())); + }).then([this, &pg](auto map) mutable { + return with_blocking_future( + handle.enter(pp(pg).wait_for_active)); + }).then([this, &pg]() mutable { + return with_blocking_future(pg.wait_for_active_blocker.wait()); + }).then([this, pgref=std::move(pgref)]() mutable { + if (m->finish_decode()) { + m->clear_payload(); + } + if (is_pg_op()) { + return process_pg_op(pgref); + } else { + return process_op(pgref); + } + }); + }).then([] { + return seastar::stop_iteration::yes; + }).handle_exception_type([](crimson::common::actingset_changed& e) { + if (e.is_primary()) { + logger().debug("operation restart, acting set changed"); + return seastar::stop_iteration::no; + } else { + logger().debug("operation abort, up primary changed"); + return seastar::stop_iteration::yes; + } + }); + }); + }); +} + +seastar::future<> ClientRequest::process_pg_op( + Ref<PG> &pg) +{ + return pg->do_pg_ops(m) + .then([this, pg=std::move(pg)](Ref<MOSDOpReply> reply) { + return conn->send(reply); + }); +} + +seastar::future<> ClientRequest::process_op( + Ref<PG> &pgref) +{ + PG& pg = *pgref; + return with_blocking_future( + handle.enter(pp(pg).recover_missing) + ).then([this, &pg, pgref] { + eversion_t ver; + const hobject_t& soid = m->get_hobj(); + logger().debug("{} check for recovery, {}", *this, soid); + if (pg.is_unreadable_object(soid, &ver) || + pg.is_degraded_or_backfilling_object(soid)) { + logger().debug("{} need to wait for recovery, {}", *this, soid); + if (pg.get_recovery_backend()->is_recovering(soid)) { + return pg.get_recovery_backend()->get_recovering(soid).wait_for_recovered(); + } else { + auto [op, fut] = osd.get_shard_services().start_operation<UrgentRecovery>( + soid, ver, pgref, osd.get_shard_services(), pg.get_osdmap_epoch()); + return std::move(fut); + } + } + return seastar::now(); + }).then([this, &pg] { + return with_blocking_future(handle.enter(pp(pg).get_obc)); + }).then([this, &pg]() -> PG::load_obc_ertr::future<> { + op_info.set_from_op(&*m, *pg.get_osdmap()); + return pg.with_locked_obc(m, op_info, this, [this, &pg](auto obc) { + return with_blocking_future( + handle.enter(pp(pg).process) + ).then([this, &pg, obc] { + if (!pg.is_primary()) { + // primary can handle both normal ops and balanced reads + if (is_misdirected(pg)) { + logger().trace("process_op: dropping misdirected op"); + return seastar::make_ready_future<Ref<MOSDOpReply>>(); + } else if (const hobject_t& hoid = m->get_hobj(); + !pg.get_peering_state().can_serve_replica_read(hoid)) { + auto reply = make_message<MOSDOpReply>( + m.get(), -EAGAIN, pg.get_osdmap_epoch(), + m->get_flags() & (CEPH_OSD_FLAG_ACK|CEPH_OSD_FLAG_ONDISK), + !m->has_flag(CEPH_OSD_FLAG_RETURNVEC)); + return seastar::make_ready_future<Ref<MOSDOpReply>>(std::move(reply)); + } + } + return pg.do_osd_ops(m, obc, op_info); + }).then([this](Ref<MOSDOpReply> reply) { + if (reply) { + return conn->send(std::move(reply)); + } else { + return seastar::now(); + } + }); + }); + 
}).safe_then([pgref=std::move(pgref)] { + return seastar::now(); + }, PG::load_obc_ertr::all_same_way([](auto &code) { + logger().error("ClientRequest saw error code {}", code); + return seastar::now(); + })); +} + +bool ClientRequest::is_misdirected(const PG& pg) const +{ + // otherwise take a closer look + if (const int flags = m->get_flags(); + flags & CEPH_OSD_FLAG_BALANCE_READS || + flags & CEPH_OSD_FLAG_LOCALIZE_READS) { + if (!op_info.may_read()) { + // no read found, so it can't be balanced read + return true; + } + if (op_info.may_write() || op_info.may_cache()) { + // write op, but i am not primary + return true; + } + // balanced reads; any replica will do + return pg.is_nonprimary(); + } + // neither balanced nor localize reads + return true; +} + +} diff --git a/src/crimson/osd/osd_operations/client_request.h b/src/crimson/osd/osd_operations/client_request.h new file mode 100644 index 000000000..ea3124a93 --- /dev/null +++ b/src/crimson/osd/osd_operations/client_request.h @@ -0,0 +1,76 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "osd/osd_op_util.h" +#include "crimson/net/Connection.h" +#include "crimson/osd/osd_operation.h" +#include "crimson/common/type_helpers.h" +#include "messages/MOSDOp.h" + +namespace crimson::osd { +class PG; +class OSD; + +class ClientRequest final : public OperationT<ClientRequest> { + OSD &osd; + crimson::net::ConnectionRef conn; + Ref<MOSDOp> m; + OpInfo op_info; + OrderedPipelinePhase::Handle handle; + +public: + class ConnectionPipeline { + OrderedPipelinePhase await_map = { + "ClientRequest::ConnectionPipeline::await_map" + }; + OrderedPipelinePhase get_pg = { + "ClientRequest::ConnectionPipeline::get_pg" + }; + friend class ClientRequest; + }; + class PGPipeline { + OrderedPipelinePhase await_map = { + "ClientRequest::PGPipeline::await_map" + }; + OrderedPipelinePhase wait_for_active = { + "ClientRequest::PGPipeline::wait_for_active" + }; + OrderedPipelinePhase recover_missing = { + "ClientRequest::PGPipeline::recover_missing" + }; + OrderedPipelinePhase get_obc = { + "ClientRequest::PGPipeline::get_obc" + }; + OrderedPipelinePhase process = { + "ClientRequest::PGPipeline::process" + }; + friend class ClientRequest; + }; + + static constexpr OperationTypeCode type = OperationTypeCode::client_request; + + ClientRequest(OSD &osd, crimson::net::ConnectionRef, Ref<MOSDOp> &&m); + + void print(std::ostream &) const final; + void dump_detail(Formatter *f) const final; + +public: + seastar::future<> start(); + +private: + seastar::future<> process_pg_op( + Ref<PG> &pg); + seastar::future<> process_op( + Ref<PG> &pg); + bool is_pg_op() const; + + ConnectionPipeline &cp(); + PGPipeline &pp(PG &pg); + +private: + bool is_misdirected(const PG& pg) const; +}; + +} diff --git a/src/crimson/osd/osd_operations/compound_peering_request.cc b/src/crimson/osd/osd_operations/compound_peering_request.cc new file mode 100644 index 000000000..e55760096 --- /dev/null +++ b/src/crimson/osd/osd_operations/compound_peering_request.cc @@ -0,0 +1,170 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <seastar/core/future.hh> + +#include "osd/PeeringState.h" + +#include "messages/MOSDPGQuery.h" +#include "messages/MOSDPGCreate2.h" + +#include "common/Formatter.h" + +#include "crimson/common/exception.h" +#include "crimson/osd/pg.h" +#include "crimson/osd/osd.h" +#include "crimson/osd/osd_operations/compound_peering_request.h" 
+ +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_osd); + } +} + +namespace { +using namespace crimson::osd; + +struct compound_state { + seastar::promise<BufferedRecoveryMessages> promise; + // assuming crimson-osd won't need to be compatible with pre-octopus + // releases + BufferedRecoveryMessages ctx{ceph_release_t::octopus}; + compound_state() = default; + ~compound_state() { + promise.set_value(std::move(ctx)); + } +}; +using compound_state_ref = seastar::lw_shared_ptr<compound_state>; + +class PeeringSubEvent : public RemotePeeringEvent { + compound_state_ref state; +public: + template <typename... Args> + PeeringSubEvent(compound_state_ref state, Args &&... args) : + RemotePeeringEvent(std::forward<Args>(args)...), state(state) {} + + seastar::future<> complete_rctx(Ref<crimson::osd::PG> pg) final { + logger().debug("{}: submitting ctx transaction", *this); + state->ctx.accept_buffered_messages(ctx); + state = {}; + if (!pg) { + ceph_assert(ctx.transaction.empty()); + return seastar::now(); + } else { + return osd.get_shard_services().dispatch_context_transaction( + pg->get_collection_ref(), ctx); + } + } +}; + +std::vector<OperationRef> handle_pg_create( + OSD &osd, + crimson::net::ConnectionRef conn, + compound_state_ref state, + Ref<MOSDPGCreate2> m) +{ + std::vector<OperationRef> ret; + for (auto& [pgid, when] : m->pgs) { + const auto &[created, created_stamp] = when; + auto q = m->pg_extra.find(pgid); + ceph_assert(q != m->pg_extra.end()); + auto& [history, pi] = q->second; + logger().debug( + "{}: {} e{} @{} " + "history {} pi {}", + __func__, pgid, created, created_stamp, + history, pi); + if (!pi.empty() && + m->epoch < pi.get_bounds().second) { + logger().error( + "got pg_create on {} epoch {} " + "unmatched past_intervals {} (history {})", + pgid, m->epoch, + pi, history); + } else { + auto op = osd.get_shard_services().start_operation<PeeringSubEvent>( + state, + osd, + conn, + osd.get_shard_services(), + pg_shard_t(), + pgid, + m->epoch, + m->epoch, + NullEvt(), + true, + new PGCreateInfo(pgid, m->epoch, history, pi, true)).first; + ret.push_back(op); + } + } + return ret; +} + +struct SubOpBlocker : BlockerT<SubOpBlocker> { + static constexpr const char * type_name = "CompoundOpBlocker"; + + std::vector<OperationRef> subops; + SubOpBlocker(std::vector<OperationRef> &&subops) : subops(subops) {} + + virtual void dump_detail(Formatter *f) const { + f->open_array_section("dependent_operations"); + { + for (auto &i : subops) { + i->dump_brief(f); + } + } + f->close_section(); + } +}; + +} // namespace + +namespace crimson::osd { + +CompoundPeeringRequest::CompoundPeeringRequest( + OSD &osd, crimson::net::ConnectionRef conn, Ref<Message> m) + : osd(osd), + conn(conn), + m(m) +{} + +void CompoundPeeringRequest::print(std::ostream &lhs) const +{ + lhs << *m; +} + +void CompoundPeeringRequest::dump_detail(Formatter *f) const +{ + f->dump_stream("message") << *m; +} + +seastar::future<> CompoundPeeringRequest::start() +{ + logger().info("{}: starting", *this); + auto state = seastar::make_lw_shared<compound_state>(); + auto blocker = std::make_unique<SubOpBlocker>( + [&] { + assert((m->get_type() == MSG_OSD_PG_CREATE2)); + return handle_pg_create( + osd, + conn, + state, + boost::static_pointer_cast<MOSDPGCreate2>(m)); + }()); + + IRef ref = this; + logger().info("{}: about to fork future", *this); + return crimson::common::handle_system_shutdown( + [this, ref, blocker=std::move(blocker), state]() mutable { + return with_blocking_future( + 
blocker->make_blocking_future(state->promise.get_future()) + ).then([this, blocker=std::move(blocker)](auto &&ctx) { + logger().info("{}: sub events complete", *this); + return osd.get_shard_services().dispatch_context_messages(std::move(ctx)); + }).then([this, ref=std::move(ref)] { + logger().info("{}: complete", *this); + }); + }); +} + +} // namespace crimson::osd diff --git a/src/crimson/osd/osd_operations/compound_peering_request.h b/src/crimson/osd/osd_operations/compound_peering_request.h new file mode 100644 index 000000000..495306d75 --- /dev/null +++ b/src/crimson/osd/osd_operations/compound_peering_request.h @@ -0,0 +1,40 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <iostream> +#include <seastar/core/future.hh> + +#include "msg/MessageRef.h" + +#include "crimson/net/Connection.h" +#include "crimson/osd/osd_operation.h" + +namespace crimson::osd { + +class OSD; +class PG; + +using osd_id_t = int; + +class CompoundPeeringRequest : public OperationT<CompoundPeeringRequest> { +public: + static constexpr OperationTypeCode type = + OperationTypeCode::compound_peering_request; + +private: + OSD &osd; + crimson::net::ConnectionRef conn; + Ref<Message> m; + +public: + CompoundPeeringRequest( + OSD &osd, crimson::net::ConnectionRef conn, Ref<Message> m); + + void print(std::ostream &) const final; + void dump_detail(Formatter *f) const final; + seastar::future<> start(); +}; + +} diff --git a/src/crimson/osd/osd_operations/osdop_params.h b/src/crimson/osd/osd_operations/osdop_params.h new file mode 100644 index 000000000..a0bd9dcbb --- /dev/null +++ b/src/crimson/osd/osd_operations/osdop_params.h @@ -0,0 +1,27 @@ +#pragma once + +#include "messages/MOSDOp.h" +#include "osd/osd_types.h" +#include "crimson/common/type_helpers.h" + +// The fields in this struct are parameters that may be needed in multiple +// level of processing. I inclosed all those parameters in this struct to +// avoid passing each of them as a method parameter. 
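+//
+// Illustrative construction (sketch; the request and the version values
+// are assumed to come from the surrounding op processing):
+//
+//   osd_op_params_t params{std::move(request)};
+//   params.at_version = next_version;
+//   params.user_modify = true;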
+struct osd_op_params_t { + Ref<MOSDOp> req; + eversion_t at_version; + eversion_t pg_trim_to; + eversion_t min_last_complete_ondisk; + eversion_t last_complete; + version_t user_at_version = 0; + bool user_modify = false; + ObjectCleanRegions clean_regions; + + osd_op_params_t() = default; + osd_op_params_t(Ref<MOSDOp>&& req) : req(req) {} + osd_op_params_t(Ref<MOSDOp>&& req, eversion_t at_version, eversion_t pg_trim_to, + eversion_t mlcod, eversion_t lc, version_t user_at_version) : + req(req), at_version(at_version), pg_trim_to(pg_trim_to), + min_last_complete_ondisk(mlcod), last_complete(lc), + user_at_version(user_at_version) {} +}; diff --git a/src/crimson/osd/osd_operations/peering_event.cc b/src/crimson/osd/osd_operations/peering_event.cc new file mode 100644 index 000000000..d3c6ccf81 --- /dev/null +++ b/src/crimson/osd/osd_operations/peering_event.cc @@ -0,0 +1,173 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <seastar/core/future.hh> + +#include "messages/MOSDPGLog.h" + +#include "common/Formatter.h" +#include "crimson/osd/pg.h" +#include "crimson/osd/osd.h" +#include "crimson/osd/osd_operations/peering_event.h" +#include "crimson/osd/osd_connection_priv.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_osd); + } +} + +namespace crimson::osd { + +void PeeringEvent::print(std::ostream &lhs) const +{ + lhs << "PeeringEvent(" + << "from=" << from + << " pgid=" << pgid + << " sent=" << evt.get_epoch_sent() + << " requested=" << evt.get_epoch_requested() + << " evt=" << evt.get_desc() + << ")"; +} + +void PeeringEvent::dump_detail(Formatter *f) const +{ + f->open_object_section("PeeringEvent"); + f->dump_stream("from") << from; + f->dump_stream("pgid") << pgid; + f->dump_int("sent", evt.get_epoch_sent()); + f->dump_int("requested", evt.get_epoch_requested()); + f->dump_string("evt", evt.get_desc()); + f->close_section(); +} + + +PeeringEvent::PGPipeline &PeeringEvent::pp(PG &pg) +{ + return pg.peering_request_pg_pipeline; +} + +seastar::future<> PeeringEvent::start() +{ + + logger().debug("{}: start", *this); + + IRef ref = this; + return [this] { + if (delay) { + return seastar::sleep(std::chrono::milliseconds( + std::lround(delay*1000))); + } else { + return seastar::now(); + } + }().then([this] { + return get_pg(); + }).then([this](Ref<PG> pg) { + if (!pg) { + logger().warn("{}: pg absent, did not create", *this); + on_pg_absent(); + handle.exit(); + return complete_rctx(pg); + } else { + logger().debug("{}: pg present", *this); + return with_blocking_future(handle.enter(pp(*pg).await_map) + ).then([this, pg] { + return with_blocking_future( + pg->osdmap_gate.wait_for_map(evt.get_epoch_sent())); + }).then([this, pg](auto) { + return with_blocking_future(handle.enter(pp(*pg).process)); + }).then([this, pg] { + // TODO: likely we should synchronize also with the pg log-based + // recovery. + return with_blocking_future( + handle.enter(BackfillRecovery::bp(*pg).process)); + }).then([this, pg] { + pg->do_peering_event(evt, ctx); + handle.exit(); + return complete_rctx(pg); + }).then([this, pg] { + return pg->get_need_up_thru() ? 
shard_services.send_alive(pg->get_same_interval_since()) + : seastar::now(); + }); + } + }).then([this] { + return shard_services.send_pg_temp(); + }).then([this, ref=std::move(ref)] { + logger().debug("{}: complete", *this); + }); +} + +void PeeringEvent::on_pg_absent() +{ + logger().debug("{}: pg absent, dropping", *this); +} + +seastar::future<> PeeringEvent::complete_rctx(Ref<PG> pg) +{ + logger().debug("{}: submitting ctx", *this); + return shard_services.dispatch_context( + pg->get_collection_ref(), + std::move(ctx)); +} + +RemotePeeringEvent::ConnectionPipeline &RemotePeeringEvent::cp() +{ + return get_osd_priv(conn.get()).peering_request_conn_pipeline; +} + +void RemotePeeringEvent::on_pg_absent() +{ + if (auto& e = get_event().get_event(); + e.dynamic_type() == MQuery::static_type()) { + const auto map_epoch = + shard_services.get_osdmap_service().get_map()->get_epoch(); + const auto& q = static_cast<const MQuery&>(e); + const pg_info_t empty{spg_t{pgid.pgid, q.query.to}}; + if (q.query.type == q.query.LOG || + q.query.type == q.query.FULLLOG) { + auto m = ceph::make_message<MOSDPGLog>(q.query.from, q.query.to, + map_epoch, empty, + q.query.epoch_sent); + ctx.send_osd_message(q.from.osd, std::move(m)); + } else { + ctx.send_notify(q.from.osd, {q.query.from, q.query.to, + q.query.epoch_sent, + map_epoch, empty, + PastIntervals{}}); + } + } +} + +seastar::future<> RemotePeeringEvent::complete_rctx(Ref<PG> pg) +{ + if (pg) { + return PeeringEvent::complete_rctx(pg); + } else { + return shard_services.dispatch_context_messages(std::move(ctx)); + } +} + +seastar::future<Ref<PG>> RemotePeeringEvent::get_pg() +{ + return with_blocking_future( + handle.enter(cp().await_map) + ).then([this] { + return with_blocking_future( + osd.osdmap_gate.wait_for_map(evt.get_epoch_sent())); + }).then([this](auto epoch) { + logger().debug("{}: got map {}", *this, epoch); + return with_blocking_future(handle.enter(cp().get_pg)); + }).then([this] { + return with_blocking_future( + osd.get_or_create_pg( + pgid, evt.get_epoch_sent(), std::move(evt.create_info))); + }); +} + +seastar::future<Ref<PG>> LocalPeeringEvent::get_pg() { + return seastar::make_ready_future<Ref<PG>>(pg); +} + +LocalPeeringEvent::~LocalPeeringEvent() {} + +} diff --git a/src/crimson/osd/osd_operations/peering_event.h b/src/crimson/osd/osd_operations/peering_event.h new file mode 100644 index 000000000..3a6c0678c --- /dev/null +++ b/src/crimson/osd/osd_operations/peering_event.h @@ -0,0 +1,142 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <iostream> +#include <seastar/core/future.hh> + +#include "crimson/osd/osd_operation.h" +#include "osd/osd_types.h" +#include "osd/PGPeeringEvent.h" +#include "osd/PeeringState.h" + +namespace ceph { + class Formatter; +} + +namespace crimson::osd { + +class OSD; +class ShardServices; +class PG; + +class PeeringEvent : public OperationT<PeeringEvent> { +public: + static constexpr OperationTypeCode type = OperationTypeCode::peering_event; + + class PGPipeline { + OrderedPipelinePhase await_map = { + "PeeringEvent::PGPipeline::await_map" + }; + OrderedPipelinePhase process = { + "PeeringEvent::PGPipeline::process" + }; + friend class PeeringEvent; + friend class PGAdvanceMap; + }; + +protected: + OrderedPipelinePhase::Handle handle; + PGPipeline &pp(PG &pg); + + ShardServices &shard_services; + PeeringCtx ctx; + pg_shard_t from; + spg_t pgid; + float delay = 0; + PGPeeringEvent evt; + + const pg_shard_t get_from() const 
{ + return from; + } + + const spg_t get_pgid() const { + return pgid; + } + + const PGPeeringEvent &get_event() const { + return evt; + } + + virtual void on_pg_absent(); + virtual seastar::future<> complete_rctx(Ref<PG>); + virtual seastar::future<Ref<PG>> get_pg() = 0; + +public: + template <typename... Args> + PeeringEvent( + ShardServices &shard_services, const pg_shard_t &from, const spg_t &pgid, + Args&&... args) : + shard_services(shard_services), + ctx{ceph_release_t::octopus}, + from(from), + pgid(pgid), + evt(std::forward<Args>(args)...) + {} + template <typename... Args> + PeeringEvent( + ShardServices &shard_services, const pg_shard_t &from, const spg_t &pgid, + float delay, Args&&... args) : + shard_services(shard_services), + ctx{ceph_release_t::octopus}, + from(from), + pgid(pgid), + delay(delay), + evt(std::forward<Args>(args)...) + {} + + void print(std::ostream &) const final; + void dump_detail(ceph::Formatter* f) const final; + seastar::future<> start(); +}; + +class RemotePeeringEvent : public PeeringEvent { +protected: + OSD &osd; + crimson::net::ConnectionRef conn; + + void on_pg_absent() final; + seastar::future<> complete_rctx(Ref<PG> pg) override; + seastar::future<Ref<PG>> get_pg() final; + +public: + class ConnectionPipeline { + OrderedPipelinePhase await_map = { + "PeeringRequest::ConnectionPipeline::await_map" + }; + OrderedPipelinePhase get_pg = { + "PeeringRequest::ConnectionPipeline::get_pg" + }; + friend class RemotePeeringEvent; + }; + + template <typename... Args> + RemotePeeringEvent(OSD &osd, crimson::net::ConnectionRef conn, Args&&... args) : + PeeringEvent(std::forward<Args>(args)...), + osd(osd), + conn(conn) + {} + +private: + ConnectionPipeline &cp(); +}; + +class LocalPeeringEvent final : public PeeringEvent { +protected: + seastar::future<Ref<PG>> get_pg() final; + + Ref<PG> pg; + +public: + template <typename... Args> + LocalPeeringEvent(Ref<PG> pg, Args&&... 
args) : + PeeringEvent(std::forward<Args>(args)...), + pg(pg) + {} + + virtual ~LocalPeeringEvent(); +}; + + +} diff --git a/src/crimson/osd/osd_operations/pg_advance_map.cc b/src/crimson/osd/osd_operations/pg_advance_map.cc new file mode 100644 index 000000000..a96479d40 --- /dev/null +++ b/src/crimson/osd/osd_operations/pg_advance_map.cc @@ -0,0 +1,97 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "crimson/osd/osd_operations/pg_advance_map.h" + +#include <boost/smart_ptr/local_shared_ptr.hpp> +#include <seastar/core/future.hh> + +#include "include/types.h" +#include "common/Formatter.h" +#include "crimson/osd/pg.h" +#include "crimson/osd/osd.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_osd); + } +} + +namespace crimson::osd { + +PGAdvanceMap::PGAdvanceMap( + OSD &osd, Ref<PG> pg, epoch_t from, epoch_t to, + PeeringCtx &&rctx, bool do_init) + : osd(osd), pg(pg), from(from), to(to), + rctx(std::move(rctx)), do_init(do_init) {} + +PGAdvanceMap::~PGAdvanceMap() {} + +void PGAdvanceMap::print(std::ostream &lhs) const +{ + lhs << "PGAdvanceMap(" + << "pg=" << pg->get_pgid() + << " from=" << from + << " to=" << to; + if (do_init) { + lhs << " do_init"; + } + lhs << ")"; +} + +void PGAdvanceMap::dump_detail(Formatter *f) const +{ + f->open_object_section("PGAdvanceMap"); + f->dump_stream("pgid") << pg->get_pgid(); + f->dump_int("from", from); + f->dump_int("to", to); + f->dump_bool("do_init", do_init); + f->close_section(); +} + +seastar::future<> PGAdvanceMap::start() +{ + using cached_map_t = boost::local_shared_ptr<const OSDMap>; + + logger().debug("{}: start", *this); + + IRef ref = this; + return with_blocking_future( + handle.enter(pg->peering_request_pg_pipeline.process)) + .then([this] { + if (do_init) { + pg->handle_initialize(rctx); + pg->handle_activate_map(rctx); + } + return seastar::do_for_each( + boost::make_counting_iterator(from + 1), + boost::make_counting_iterator(to + 1), + [this](epoch_t next_epoch) { + return osd.get_map(next_epoch).then( + [this] (cached_map_t&& next_map) { + pg->handle_advance_map(next_map, rctx); + }); + }).then([this] { + pg->handle_activate_map(rctx); + handle.exit(); + if (do_init) { + osd.pg_map.pg_created(pg->get_pgid(), pg); + osd.shard_services.inc_pg_num(); + logger().info("PGAdvanceMap::start new pg {}", *pg); + } + return seastar::when_all_succeed( + pg->get_need_up_thru() \ + ? 
osd.shard_services.send_alive(pg->get_same_interval_since()) + : seastar::now(), + osd.shard_services.dispatch_context( + pg->get_collection_ref(), + std::move(rctx))); + }).then_unpack([this] { + return osd.shard_services.send_pg_temp(); + }); + }).then([this, ref=std::move(ref)] { + logger().debug("{}: complete", *this); + }); +} + +} diff --git a/src/crimson/osd/osd_operations/pg_advance_map.h b/src/crimson/osd/osd_operations/pg_advance_map.h new file mode 100644 index 000000000..1b27037eb --- /dev/null +++ b/src/crimson/osd/osd_operations/pg_advance_map.h @@ -0,0 +1,50 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <iostream> +#include <seastar/core/future.hh> + +#include "crimson/osd/osd_operation.h" +#include "osd/osd_types.h" +#include "crimson/common/type_helpers.h" +#include "osd/PeeringState.h" + +namespace ceph { + class Formatter; +} + +namespace crimson::osd { + +class OSD; +class PG; + +class PGAdvanceMap : public OperationT<PGAdvanceMap> { +public: + static constexpr OperationTypeCode type = OperationTypeCode::pg_advance_map; + +protected: + OrderedPipelinePhase::Handle handle; + + OSD &osd; + Ref<PG> pg; + + epoch_t from; + epoch_t to; + + PeeringCtx rctx; + const bool do_init; + +public: + PGAdvanceMap( + OSD &osd, Ref<PG> pg, epoch_t from, epoch_t to, + PeeringCtx &&rctx, bool do_init); + ~PGAdvanceMap(); + + void print(std::ostream &) const final; + void dump_detail(ceph::Formatter *f) const final; + seastar::future<> start(); +}; + +} diff --git a/src/crimson/osd/osd_operations/recovery_subrequest.cc b/src/crimson/osd/osd_operations/recovery_subrequest.cc new file mode 100644 index 000000000..820c7beab --- /dev/null +++ b/src/crimson/osd/osd_operations/recovery_subrequest.cc @@ -0,0 +1,29 @@ +#include <fmt/format.h> +#include <fmt/ostream.h> + +#include "crimson/osd/osd_operations/recovery_subrequest.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_osd); + } +} + +namespace crimson::osd { + +seastar::future<> RecoverySubRequest::start() { + logger().debug("{}: start", *this); + + IRef opref = this; + return with_blocking_future(osd.osdmap_gate.wait_for_map(m->get_min_epoch())) + .then([this] (epoch_t epoch) { + return with_blocking_future(osd.wait_for_pg(m->get_spg())); + }).then([this, opref=std::move(opref)] (Ref<PG> pgref) { + return seastar::do_with(std::move(pgref), std::move(opref), + [this](auto& pgref, auto& opref) { + return pgref->get_recovery_backend()->handle_recovery_op(m); + }); + }); +} + +} diff --git a/src/crimson/osd/osd_operations/recovery_subrequest.h b/src/crimson/osd/osd_operations/recovery_subrequest.h new file mode 100644 index 000000000..b151e5c1d --- /dev/null +++ b/src/crimson/osd/osd_operations/recovery_subrequest.h @@ -0,0 +1,45 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "osd/osd_op_util.h" +#include "crimson/net/Connection.h" +#include "crimson/osd/osd_operation.h" +#include "crimson/osd/osd.h" +#include "crimson/common/type_helpers.h" +#include "messages/MOSDPGPull.h" +#include "messages/MOSDPGPush.h" +#include "messages/MOSDPGPushReply.h" +#include "messages/MOSDPGRecoveryDelete.h" +#include "messages/MOSDPGRecoveryDeleteReply.h" + +namespace crimson::osd { + +class OSD; +class PG; + +class RecoverySubRequest final : public OperationT<RecoverySubRequest> { +public: + static constexpr OperationTypeCode type = 
OperationTypeCode::background_recovery_sub; + + RecoverySubRequest(OSD &osd, crimson::net::ConnectionRef conn, Ref<MOSDFastDispatchOp>&& m) + : osd(osd), conn(conn), m(m) {} + + void print(std::ostream& out) const final + { + out << *m; + } + + void dump_detail(Formatter *f) const final + { + } + + seastar::future<> start(); +private: + OSD& osd; + crimson::net::ConnectionRef conn; + Ref<MOSDFastDispatchOp> m; +}; + +} diff --git a/src/crimson/osd/osd_operations/replicated_request.cc b/src/crimson/osd/osd_operations/replicated_request.cc new file mode 100644 index 000000000..34487f9e4 --- /dev/null +++ b/src/crimson/osd/osd_operations/replicated_request.cc @@ -0,0 +1,74 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "replicated_request.h" + +#include "common/Formatter.h" +#include "messages/MOSDRepOp.h" + +#include "crimson/osd/osd.h" +#include "crimson/osd/osd_connection_priv.h" +#include "crimson/osd/pg.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_osd); + } +} + +namespace crimson::osd { + +RepRequest::RepRequest(OSD &osd, + crimson::net::ConnectionRef&& conn, + Ref<MOSDRepOp> &&req) + : osd{osd}, + conn{std::move(conn)}, + req{req} +{} + +void RepRequest::print(std::ostream& os) const +{ + os << "RepRequest(" + << "from=" << req->from + << " req=" << *req + << ")"; +} + +void RepRequest::dump_detail(Formatter *f) const +{ + f->open_object_section("RepRequest"); + f->dump_stream("reqid") << req->reqid; + f->dump_stream("pgid") << req->get_spg(); + f->dump_unsigned("map_epoch", req->get_map_epoch()); + f->dump_unsigned("min_epoch", req->get_min_epoch()); + f->dump_stream("oid") << req->poid; + f->dump_stream("from") << req->from; + f->close_section(); +} + +RepRequest::ConnectionPipeline &RepRequest::cp() +{ + return get_osd_priv(conn.get()).replicated_request_conn_pipeline; +} + +RepRequest::PGPipeline &RepRequest::pp(PG &pg) +{ + return pg.replicated_request_pg_pipeline; +} + +seastar::future<> RepRequest::start() +{ + logger().debug("{} start", *this); + IRef ref = this; + return with_blocking_future(handle.enter(cp().await_map)) + .then([this]() { + return with_blocking_future(osd.osdmap_gate.wait_for_map(req->get_min_epoch())); + }).then([this](epoch_t epoch) { + return with_blocking_future(handle.enter(cp().get_pg)); + }).then([this] { + return with_blocking_future(osd.wait_for_pg(req->get_spg())); + }).then([this, ref=std::move(ref)](Ref<PG> pg) { + return pg->handle_rep_op(std::move(req)); + }); +} +} diff --git a/src/crimson/osd/osd_operations/replicated_request.h b/src/crimson/osd/osd_operations/replicated_request.h new file mode 100644 index 000000000..8e9cfc9fe --- /dev/null +++ b/src/crimson/osd/osd_operations/replicated_request.h @@ -0,0 +1,58 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "crimson/net/Connection.h" +#include "crimson/osd/osd_operation.h" +#include "crimson/common/type_helpers.h" + +class MOSDRepOp; + +namespace ceph { + class Formatter; +} + +namespace crimson::osd { + +class OSD; +class PG; + +class RepRequest final : public OperationT<RepRequest> { +public: + class ConnectionPipeline { + OrderedPipelinePhase await_map = { + "RepRequest::ConnectionPipeline::await_map" + }; + OrderedPipelinePhase get_pg = { + "RepRequest::ConnectionPipeline::get_pg" + }; + friend RepRequest; + }; + class PGPipeline { + OrderedPipelinePhase await_map = { + 
"RepRequest::PGPipeline::await_map" + }; + OrderedPipelinePhase process = { + "RepRequest::PGPipeline::process" + }; + friend RepRequest; + }; + static constexpr OperationTypeCode type = OperationTypeCode::replicated_request; + RepRequest(OSD&, crimson::net::ConnectionRef&&, Ref<MOSDRepOp>&&); + + void print(std::ostream &) const final; + void dump_detail(ceph::Formatter* f) const final; + seastar::future<> start(); + +private: + ConnectionPipeline &cp(); + PGPipeline &pp(PG &pg); + + OSD &osd; + crimson::net::ConnectionRef conn; + Ref<MOSDRepOp> req; + OrderedPipelinePhase::Handle handle; +}; + +} diff --git a/src/crimson/osd/osdmap_gate.cc b/src/crimson/osd/osdmap_gate.cc new file mode 100644 index 000000000..90afc32b4 --- /dev/null +++ b/src/crimson/osd/osdmap_gate.cc @@ -0,0 +1,71 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "crimson/common/exception.h" +#include "crimson/osd/osdmap_gate.h" +#include "crimson/osd/shard_services.h" +#include "common/Formatter.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_osd); + } +} + +namespace crimson::osd { + +void OSDMapGate::OSDMapBlocker::dump_detail(Formatter *f) const +{ + f->open_object_section("OSDMapGate"); + f->dump_int("epoch", epoch); + f->close_section(); +} + +blocking_future<epoch_t> OSDMapGate::wait_for_map(epoch_t epoch) +{ + if (__builtin_expect(stopping, false)) { + return make_exception_blocking_future<epoch_t>( + crimson::common::system_shutdown_exception()); + } + if (current >= epoch) { + return make_ready_blocking_future<epoch_t>(current); + } else { + logger().info("evt epoch is {}, i have {}, will wait", epoch, current); + auto &blocker = waiting_peering.emplace( + epoch, make_pair(blocker_type, epoch)).first->second; + auto fut = blocker.promise.get_shared_future(); + if (shard_services) { + return blocker.make_blocking_future( + (*shard_services).get().osdmap_subscribe(current, true).then( + [fut=std::move(fut)]() mutable { + return std::move(fut); + })); + } else { + return blocker.make_blocking_future(std::move(fut)); + } + } +} + +void OSDMapGate::got_map(epoch_t epoch) { + current = epoch; + auto first = waiting_peering.begin(); + auto last = waiting_peering.upper_bound(epoch); + std::for_each(first, last, [epoch](auto& blocked_requests) { + blocked_requests.second.promise.set_value(epoch); + }); + waiting_peering.erase(first, last); +} + +seastar::future<> OSDMapGate::stop() { + logger().info("osdmap::stop"); + stopping = true; + auto first = waiting_peering.begin(); + auto last = waiting_peering.end(); + std::for_each(first, last, [](auto& blocked_requests) { + blocked_requests.second.promise.set_exception( + crimson::common::system_shutdown_exception()); + }); + return seastar::now(); +} + +} diff --git a/src/crimson/osd/osdmap_gate.h b/src/crimson/osd/osdmap_gate.h new file mode 100644 index 000000000..2b73d8959 --- /dev/null +++ b/src/crimson/osd/osdmap_gate.h @@ -0,0 +1,68 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <functional> +#include <map> +#include <optional> + +#include <seastar/core/future.hh> +#include <seastar/core/shared_future.hh> + +#include "include/types.h" +#include "crimson/osd/osd_operation.h" + +namespace ceph { + class Formatter; +} + +namespace crimson::osd { + +class ShardServices; + +class OSDMapGate { + struct OSDMapBlocker : public Blocker { + const char * type_name; + epoch_t epoch; + + 
OSDMapBlocker(std::pair<const char *, epoch_t> args) + : type_name(args.first), epoch(args.second) {} + + OSDMapBlocker(const OSDMapBlocker &) = delete; + OSDMapBlocker(OSDMapBlocker &&) = delete; + OSDMapBlocker &operator=(const OSDMapBlocker &) = delete; + OSDMapBlocker &operator=(OSDMapBlocker &&) = delete; + + seastar::shared_promise<epoch_t> promise; + + void dump_detail(Formatter *f) const final; + private: + const char *get_type_name() const final { + return type_name; + } + }; + + // order the promises in ascending order of the waited osdmap epoch, + // so we can access all the waiters expecting a map whose epoch is less + // than or equal to a given epoch + using waiting_peering_t = std::map<epoch_t, + OSDMapBlocker>; + const char *blocker_type; + waiting_peering_t waiting_peering; + epoch_t current = 0; + std::optional<std::reference_wrapper<ShardServices>> shard_services; + bool stopping = false; +public: + OSDMapGate( + const char *blocker_type, + std::optional<std::reference_wrapper<ShardServices>> shard_services) + : blocker_type(blocker_type), shard_services(shard_services) {} + + // wait for an osdmap whose epoch is greater or equal to given epoch + blocking_future<epoch_t> wait_for_map(epoch_t epoch); + void got_map(epoch_t epoch); + seastar::future<> stop(); +}; + +} diff --git a/src/crimson/osd/osdmap_service.h b/src/crimson/osd/osdmap_service.h new file mode 100644 index 000000000..effd45b79 --- /dev/null +++ b/src/crimson/osd/osdmap_service.h @@ -0,0 +1,20 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <boost/smart_ptr/local_shared_ptr.hpp> + +#include "include/types.h" + +class OSDMap; + +class OSDMapService { +public: + using cached_map_t = boost::local_shared_ptr<const OSDMap>; + virtual ~OSDMapService() = default; + virtual seastar::future<cached_map_t> get_map(epoch_t e) = 0; + /// get the latest map + virtual cached_map_t get_map() const = 0; + virtual epoch_t get_up_epoch() const = 0; +}; diff --git a/src/crimson/osd/pg.cc b/src/crimson/osd/pg.cc new file mode 100644 index 000000000..0f01c1607 --- /dev/null +++ b/src/crimson/osd/pg.cc @@ -0,0 +1,1102 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "pg.h" + +#include <functional> + +#include <boost/range/adaptor/filtered.hpp> +#include <boost/range/adaptor/map.hpp> +#include <boost/range/adaptor/transformed.hpp> +#include <boost/range/algorithm/copy.hpp> +#include <boost/range/algorithm/max_element.hpp> +#include <boost/range/numeric.hpp> +#include <fmt/format.h> +#include <fmt/ostream.h> + +#include "messages/MOSDOp.h" +#include "messages/MOSDOpReply.h" +#include "messages/MOSDRepOp.h" +#include "messages/MOSDRepOpReply.h" + +#include "osd/OSDMap.h" + +#include "os/Transaction.h" + +#include "crimson/common/exception.h" +#include "crimson/net/Connection.h" +#include "crimson/net/Messenger.h" +#include "crimson/os/cyanstore/cyan_store.h" +#include "crimson/os/futurized_collection.h" +#include "crimson/osd/exceptions.h" +#include "crimson/osd/pg_meta.h" +#include "crimson/osd/pg_backend.h" +#include "crimson/osd/ops_executer.h" +#include "crimson/osd/osd_operations/osdop_params.h" +#include "crimson/osd/osd_operations/peering_event.h" +#include "crimson/osd/pg_recovery.h" +#include "crimson/osd/replicated_recovery_backend.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_osd); + } +} + +namespace std::chrono { +std::ostream& 
operator<<(std::ostream& out, const signedspan& d) +{ + auto s = std::chrono::duration_cast<std::chrono::seconds>(d).count(); + auto ns = std::abs((d % 1s).count()); + fmt::print(out, "{}{}s", s, ns ? fmt::format(".{:0>9}", ns) : ""); + return out; +} +} + +namespace crimson::osd { + +using crimson::common::local_conf; + +class RecoverablePredicate : public IsPGRecoverablePredicate { +public: + bool operator()(const set<pg_shard_t> &have) const override { + return !have.empty(); + } +}; + +class ReadablePredicate: public IsPGReadablePredicate { + pg_shard_t whoami; +public: + explicit ReadablePredicate(pg_shard_t whoami) : whoami(whoami) {} + bool operator()(const set<pg_shard_t> &have) const override { + return have.count(whoami); + } +}; + +PG::PG( + spg_t pgid, + pg_shard_t pg_shard, + crimson::os::CollectionRef coll_ref, + pg_pool_t&& pool, + std::string&& name, + cached_map_t osdmap, + ShardServices &shard_services, + ec_profile_t profile) + : pgid{pgid}, + pg_whoami{pg_shard}, + coll_ref{coll_ref}, + pgmeta_oid{pgid.make_pgmeta_oid()}, + osdmap_gate("PG::osdmap_gate", std::nullopt), + shard_services{shard_services}, + osdmap{osdmap}, + backend( + PGBackend::create( + pgid.pgid, + pg_shard, + pool, + coll_ref, + shard_services, + profile)), + recovery_backend( + std::make_unique<ReplicatedRecoveryBackend>( + *this, shard_services, coll_ref, backend.get())), + recovery_handler( + std::make_unique<PGRecovery>(this)), + peering_state( + shard_services.get_cct(), + pg_shard, + pgid, + PGPool( + osdmap, + pgid.pool(), + pool, + name), + osdmap, + this, + this), + wait_for_active_blocker(this) +{ + peering_state.set_backend_predicates( + new ReadablePredicate(pg_whoami), + new RecoverablePredicate()); + osdmap_gate.got_map(osdmap->get_epoch()); +} + +PG::~PG() {} + +bool PG::try_flush_or_schedule_async() { + (void)shard_services.get_store().do_transaction( + coll_ref, + ObjectStore::Transaction()).then( + [this, epoch=get_osdmap_epoch()]() { + return shard_services.start_operation<LocalPeeringEvent>( + this, + shard_services, + pg_whoami, + pgid, + epoch, + epoch, + PeeringState::IntervalFlush()); + }); + return false; +} + +void PG::queue_check_readable(epoch_t last_peering_reset, ceph::timespan delay) +{ + // handle the peering event in the background + check_readable_timer.cancel(); + check_readable_timer.set_callback([last_peering_reset, this] { + (void) shard_services.start_operation<LocalPeeringEvent>( + this, + shard_services, + pg_whoami, + pgid, + last_peering_reset, + last_peering_reset, + PeeringState::CheckReadable{}); + }); + check_readable_timer.arm( + std::chrono::duration_cast<seastar::lowres_clock::duration>(delay)); +} + +void PG::recheck_readable() +{ + bool changed = false; + const auto mnow = shard_services.get_mnow(); + if (peering_state.state_test(PG_STATE_WAIT)) { + auto prior_readable_until_ub = peering_state.get_prior_readable_until_ub(); + if (mnow < prior_readable_until_ub) { + logger().info("{} will wait (mnow {} < prior_readable_until_ub {})", + __func__, mnow, prior_readable_until_ub); + } else { + logger().info("{} no longer wait (mnow {} >= prior_readable_until_ub {})", + __func__, mnow, prior_readable_until_ub); + peering_state.state_clear(PG_STATE_WAIT); + peering_state.clear_prior_readable_until_ub(); + changed = true; + } + } + if (peering_state.state_test(PG_STATE_LAGGY)) { + auto readable_until = peering_state.get_readable_until(); + if (readable_until == readable_until.zero()) { + logger().info("{} still laggy (mnow {}, readable_until zero)", + 
__func__, mnow); + } else if (mnow >= readable_until) { + logger().info("{} still laggy (mnow {} >= readable_until {})", + __func__, mnow, readable_until); + } else { + logger().info("{} no longer laggy (mnow {} < readable_until {})", + __func__, mnow, readable_until); + peering_state.state_clear(PG_STATE_LAGGY); + changed = true; + } + } + if (changed) { + publish_stats_to_osd(); + if (!peering_state.state_test(PG_STATE_WAIT) && + !peering_state.state_test(PG_STATE_LAGGY)) { + // TODO: requeue ops waiting for readable + } + } +} + +unsigned PG::get_target_pg_log_entries() const +{ + const unsigned num_pgs = shard_services.get_pg_num(); + const unsigned target = + local_conf().get_val<uint64_t>("osd_target_pg_log_entries_per_osd"); + const unsigned min_pg_log_entries = + local_conf().get_val<uint64_t>("osd_min_pg_log_entries"); + if (num_pgs > 0 && target > 0) { + // target an even spread of our budgeted log entries across all + // PGs. note that while we only get to control the entry count + // for primary PGs, we'll normally be responsible for a mix of + // primary and replica PGs (for the same pool(s) even), so this + // will work out. + const unsigned max_pg_log_entries = + local_conf().get_val<uint64_t>("osd_max_pg_log_entries"); + return std::clamp(target / num_pgs, + min_pg_log_entries, + max_pg_log_entries); + } else { + // fall back to a per-pg value. + return min_pg_log_entries; + } +} + +void PG::on_activate(interval_set<snapid_t>) +{ + projected_last_update = peering_state.get_info().last_update; +} + +void PG::on_activate_complete() +{ + wait_for_active_blocker.on_active(); + + if (peering_state.needs_recovery()) { + logger().info("{}: requesting recovery", + __func__); + (void) shard_services.start_operation<LocalPeeringEvent>( + this, + shard_services, + pg_whoami, + pgid, + get_osdmap_epoch(), + get_osdmap_epoch(), + PeeringState::DoRecovery{}); + } else if (peering_state.needs_backfill()) { + logger().info("{}: requesting backfill", + __func__); + (void) shard_services.start_operation<LocalPeeringEvent>( + this, + shard_services, + pg_whoami, + pgid, + get_osdmap_epoch(), + get_osdmap_epoch(), + PeeringState::RequestBackfill{}); + } else { + logger().debug("{}: no need to recover or backfill, AllReplicasRecovered", + " for pg: {}", __func__, pgid); + (void) shard_services.start_operation<LocalPeeringEvent>( + this, + shard_services, + pg_whoami, + pgid, + get_osdmap_epoch(), + get_osdmap_epoch(), + PeeringState::AllReplicasRecovered{}); + } + backend->on_activate_complete(); +} + +void PG::prepare_write(pg_info_t &info, + pg_info_t &last_written_info, + PastIntervals &past_intervals, + PGLog &pglog, + bool dirty_info, + bool dirty_big_info, + bool need_write_epoch, + ceph::os::Transaction &t) +{ + std::map<string,bufferlist> km; + std::string key_to_remove; + if (dirty_big_info || dirty_info) { + int ret = prepare_info_keymap( + shard_services.get_cct(), + &km, + &key_to_remove, + get_osdmap_epoch(), + info, + last_written_info, + past_intervals, + dirty_big_info, + need_write_epoch, + true, + nullptr, + this); + ceph_assert(ret == 0); + } + pglog.write_log_and_missing( + t, &km, coll_ref->get_cid(), pgmeta_oid, + peering_state.get_pool().info.require_rollback()); + if (!km.empty()) { + t.omap_setkeys(coll_ref->get_cid(), pgmeta_oid, km); + } + if (!key_to_remove.empty()) { + t.omap_rmkey(coll_ref->get_cid(), pgmeta_oid, key_to_remove); + } +} + +std::pair<ghobject_t, bool> +PG::do_delete_work(ceph::os::Transaction &t, ghobject_t _next) +{ + // TODO + 
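+ // object deletion is not implemented yet; for now just drop this PG from
+ // the shard's PG count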
shard_services.dec_pg_num(); + return {_next, false}; +} + +void PG::scrub_requested(scrub_level_t scrub_level, scrub_type_t scrub_type) +{ + // TODO: should update the stats upon finishing the scrub + peering_state.update_stats([scrub_level, this](auto& history, auto& stats) { + const utime_t now = ceph_clock_now(); + history.last_scrub = peering_state.get_info().last_update; + history.last_scrub_stamp = now; + history.last_clean_scrub_stamp = now; + if (scrub_level == scrub_level_t::deep) { + history.last_deep_scrub = history.last_scrub; + history.last_deep_scrub_stamp = now; + } + // yes, please publish the stats + return true; + }); +} + +void PG::log_state_enter(const char *state) { + logger().info("Entering state: {}", state); +} + +void PG::log_state_exit( + const char *state_name, utime_t enter_time, + uint64_t events, utime_t event_dur) { + logger().info( + "Exiting state: {}, entered at {}, {} spent on {} events", + state_name, + enter_time, + event_dur, + events); +} + +ceph::signedspan PG::get_mnow() +{ + return shard_services.get_mnow(); +} + +HeartbeatStampsRef PG::get_hb_stamps(int peer) +{ + return shard_services.get_hb_stamps(peer); +} + +void PG::schedule_renew_lease(epoch_t last_peering_reset, ceph::timespan delay) +{ + // handle the peering event in the background + renew_lease_timer.cancel(); + renew_lease_timer.set_callback([last_peering_reset, this] { + (void) shard_services.start_operation<LocalPeeringEvent>( + this, + shard_services, + pg_whoami, + pgid, + last_peering_reset, + last_peering_reset, + RenewLease{}); + }); + renew_lease_timer.arm( + std::chrono::duration_cast<seastar::lowres_clock::duration>(delay)); +} + + +void PG::init( + int role, + const vector<int>& newup, int new_up_primary, + const vector<int>& newacting, int new_acting_primary, + const pg_history_t& history, + const PastIntervals& pi, + bool backfill, + ObjectStore::Transaction &t) +{ + peering_state.init( + role, newup, new_up_primary, newacting, + new_acting_primary, history, pi, backfill, t); +} + +seastar::future<> PG::read_state(crimson::os::FuturizedStore* store) +{ + if (__builtin_expect(stopping, false)) { + return seastar::make_exception_future<>( + crimson::common::system_shutdown_exception()); + } + + return seastar::do_with(PGMeta(store, pgid), [] (auto& pg_meta) { + return pg_meta.load(); + }).then([this, store](auto&& ret) { + auto [pg_info, past_intervals] = std::move(ret); + return peering_state.init_from_disk_state( + std::move(pg_info), + std::move(past_intervals), + [this, store] (PGLog &pglog) { + return pglog.read_log_and_missing_crimson( + *store, + coll_ref, + peering_state.get_info(), + pgmeta_oid); + }); + }).then([this]() { + int primary, up_primary; + vector<int> acting, up; + peering_state.get_osdmap()->pg_to_up_acting_osds( + pgid.pgid, &up, &up_primary, &acting, &primary); + peering_state.init_primary_up_acting( + up, + acting, + up_primary, + primary); + int rr = OSDMap::calc_pg_role(pg_whoami, acting); + peering_state.set_role(rr); + + epoch_t epoch = get_osdmap_epoch(); + (void) shard_services.start_operation<LocalPeeringEvent>( + this, + shard_services, + pg_whoami, + pgid, + epoch, + epoch, + PeeringState::Initialize()); + + return seastar::now(); + }); +} + +void PG::do_peering_event( + const boost::statechart::event_base &evt, + PeeringCtx &rctx) +{ + peering_state.handle_event( + evt, + &rctx); + peering_state.write_if_dirty(rctx.transaction); +} + +void PG::do_peering_event( + PGPeeringEvent& evt, PeeringCtx &rctx) +{ + if 
(!peering_state.pg_has_reset_since(evt.get_epoch_requested())) { + logger().debug("{} handling {} for pg: {}", __func__, evt.get_desc(), pgid); + do_peering_event(evt.get_event(), rctx); + } else { + logger().debug("{} ignoring {} -- pg has reset", __func__, evt.get_desc()); + } +} + +void PG::handle_advance_map( + cached_map_t next_map, PeeringCtx &rctx) +{ + vector<int> newup, newacting; + int up_primary, acting_primary; + next_map->pg_to_up_acting_osds( + pgid.pgid, + &newup, &up_primary, + &newacting, &acting_primary); + peering_state.advance_map( + next_map, + peering_state.get_osdmap(), + newup, + up_primary, + newacting, + acting_primary, + rctx); + osdmap_gate.got_map(next_map->get_epoch()); +} + +void PG::handle_activate_map(PeeringCtx &rctx) +{ + peering_state.activate_map(rctx); +} + +void PG::handle_initialize(PeeringCtx &rctx) +{ + PeeringState::Initialize evt; + peering_state.handle_event(evt, &rctx); +} + + +void PG::print(ostream& out) const +{ + out << peering_state << " "; +} + +void PG::dump_primary(Formatter* f) +{ + peering_state.dump_peering_state(f); + + f->open_array_section("recovery_state"); + PeeringState::QueryState q(f); + peering_state.handle_event(q, 0); + f->close_section(); + + // TODO: snap_trimq + // TODO: scrubber state + // TODO: agent state +} + +std::ostream& operator<<(std::ostream& os, const PG& pg) +{ + os << " pg_epoch " << pg.get_osdmap_epoch() << " "; + pg.print(os); + return os; +} + +void PG::WaitForActiveBlocker::dump_detail(Formatter *f) const +{ + f->dump_stream("pgid") << pg->pgid; +} + +void PG::WaitForActiveBlocker::on_active() +{ + p.set_value(); + p = {}; +} + +blocking_future<> PG::WaitForActiveBlocker::wait() +{ + if (pg->peering_state.is_active()) { + return make_blocking_future(seastar::now()); + } else { + return make_blocking_future(p.get_shared_future()); + } +} + +seastar::future<> PG::WaitForActiveBlocker::stop() +{ + p.set_exception(crimson::common::system_shutdown_exception()); + return seastar::now(); +} + +seastar::future<> PG::submit_transaction(const OpInfo& op_info, + const std::vector<OSDOp>& ops, + ObjectContextRef&& obc, + ceph::os::Transaction&& txn, + const osd_op_params_t& osd_op_p) +{ + if (__builtin_expect(stopping, false)) { + return seastar::make_exception_future<>( + crimson::common::system_shutdown_exception()); + } + + epoch_t map_epoch = get_osdmap_epoch(); + + if (__builtin_expect(osd_op_p.at_version.epoch != map_epoch, false)) { + throw crimson::common::actingset_changed(is_primary()); + } + + std::vector<pg_log_entry_t> log_entries; + log_entries.emplace_back(obc->obs.exists ? + pg_log_entry_t::MODIFY : pg_log_entry_t::DELETE, + obc->obs.oi.soid, osd_op_p.at_version, obc->obs.oi.version, + osd_op_p.user_modify ? osd_op_p.at_version.version : 0, + osd_op_p.req->get_reqid(), osd_op_p.req->get_mtime(), + op_info.allows_returnvec() && !ops.empty() ? 
ops.back().rval.code : 0); + // TODO: refactor the submit_transaction + if (op_info.allows_returnvec()) { + // also the per-op values are recorded in the pg log + log_entries.back().set_op_returns(ops); + logger().debug("{} op_returns: {}", + __func__, log_entries.back().op_returns); + } + log_entries.back().clean_regions = std::move(osd_op_p.clean_regions); + peering_state.pre_submit_op(obc->obs.oi.soid, log_entries, osd_op_p.at_version); + peering_state.append_log_with_trim_to_updated(std::move(log_entries), osd_op_p.at_version, + txn, true, false); + + return backend->mutate_object(peering_state.get_acting_recovery_backfill(), + std::move(obc), + std::move(txn), + std::move(osd_op_p), + peering_state.get_last_peering_reset(), + map_epoch, + std::move(log_entries)).then( + [this, last_complete=peering_state.get_info().last_complete, + at_version=osd_op_p.at_version](auto acked) { + for (const auto& peer : acked) { + peering_state.update_peer_last_complete_ondisk( + peer.shard, peer.last_complete_ondisk); + } + peering_state.complete_write(at_version, last_complete); + return seastar::now(); + }); +} + +osd_op_params_t&& PG::fill_op_params_bump_pg_version( + osd_op_params_t&& osd_op_p, + Ref<MOSDOp> m, + const bool user_modify) +{ + osd_op_p.req = std::move(m); + osd_op_p.at_version = next_version(); + osd_op_p.pg_trim_to = get_pg_trim_to(); + osd_op_p.min_last_complete_ondisk = get_min_last_complete_ondisk(); + osd_op_p.last_complete = get_info().last_complete; + if (user_modify) { + osd_op_p.user_at_version = osd_op_p.at_version.version; + } + return std::move(osd_op_p); +} + +seastar::future<Ref<MOSDOpReply>> PG::handle_failed_op( + const std::error_code& e, + ObjectContextRef obc, + const OpsExecuter& ox, + const MOSDOp& m) const +{ + // Oops, an operation had failed. do_osd_ops() altogether with + // OpsExecuter already dropped the ObjectStore::Transaction if + // there was any. However, this is not enough to completely + // rollback as we gave OpsExecuter the very single copy of `obc` + // we maintain and we did it for both reading and writing. + // Now all modifications must be reverted. + // + // Let's just reload from the store. Evicting from the shared + // LRU would be tricky as next MOSDOp (the one at `get_obc` + // phase) could actually already finished the lookup. Fortunately, + // this is supposed to live on cold paths, so performance is not + // a concern -- simplicity wins. + // + // The conditional's purpose is to efficiently handle hot errors + // which may appear as a result of e.g. CEPH_OSD_OP_CMPXATTR or + // CEPH_OSD_OP_OMAP_CMP. These are read-like ops and clients + // typically append them before any write. If OpsExecuter hasn't + // seen any modifying operation, `obc` is supposed to be kept + // unchanged. + assert(e.value() > 0); + const bool need_reload_obc = ox.has_seen_write(); + logger().debug( + "{}: {} - object {} got error code {}, {}; need_reload_obc {}", + __func__, + m, + obc->obs.oi.soid, + e.value(), + e.message(), + need_reload_obc); + return (need_reload_obc ? 
reload_obc(*obc) + : load_obc_ertr::now() + ).safe_then([&e, &m, obc = std::move(obc), this] { + auto reply = make_message<MOSDOpReply>( + &m, -e.value(), get_osdmap_epoch(), 0, false); + reply->set_enoent_reply_versions( + peering_state.get_info().last_update, + peering_state.get_info().last_user_version); + return seastar::make_ready_future<Ref<MOSDOpReply>>(std::move(reply)); + }, load_obc_ertr::assert_all{ "can't live with object state messed up" }); +} + +seastar::future<Ref<MOSDOpReply>> PG::do_osd_ops( + Ref<MOSDOp> m, + ObjectContextRef obc, + const OpInfo &op_info) +{ + if (__builtin_expect(stopping, false)) { + throw crimson::common::system_shutdown_exception(); + } + + using osd_op_errorator = OpsExecuter::osd_op_errorator; + const auto oid = m->get_snapid() == CEPH_SNAPDIR ? m->get_hobj().get_head() + : m->get_hobj(); + auto ox = std::make_unique<OpsExecuter>( + obc, op_info, get_pool().info, get_backend(), *m); + return crimson::do_for_each( + m->ops, [obc, m, ox = ox.get()](OSDOp& osd_op) { + logger().debug( + "do_osd_ops: {} - object {} - handling op {}", + *m, + obc->obs.oi.soid, + ceph_osd_op_name(osd_op.op.op)); + return ox->execute_op(osd_op); + }).safe_then([this, obc, m, ox = ox.get(), &op_info] { + logger().debug( + "do_osd_ops: {} - object {} all operations successful", + *m, + obc->obs.oi.soid); + return std::move(*ox).flush_changes( + [m] (auto&& obc) -> osd_op_errorator::future<> { + logger().debug( + "do_osd_ops: {} - object {} txn is empty, bypassing mutate", + *m, + obc->obs.oi.soid); + return osd_op_errorator::now(); + }, + [this, m, &op_info] (auto&& txn, + auto&& obc, + auto&& osd_op_p, + bool user_modify) -> osd_op_errorator::future<> { + logger().debug( + "do_osd_ops: {} - object {} submitting txn", + *m, + obc->obs.oi.soid); + auto filled_osd_op_p = fill_op_params_bump_pg_version( + std::move(osd_op_p), + std::move(m), + user_modify); + return submit_transaction( + op_info, + filled_osd_op_p.req->ops, + std::move(obc), + std::move(txn), + std::move(filled_osd_op_p)); + }); + }).safe_then([this, + m, + obc, + rvec = op_info.allows_returnvec()] { + // TODO: should stop at the first op which returns a negative retval, + // cmpext uses it for returning the index of first unmatched byte + int result = m->ops.empty() ? 0 : m->ops.back().rval.code; + if (result > 0 && !rvec) { + result = 0; + } + auto reply = make_message<MOSDOpReply>(m.get(), + result, + get_osdmap_epoch(), + 0, + false); + reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK); + logger().debug( + "do_osd_ops: {} - object {} sending reply", + *m, + obc->obs.oi.soid); + return seastar::make_ready_future<Ref<MOSDOpReply>>(std::move(reply)); + }, osd_op_errorator::all_same_way([ox = ox.get(), + m, + obc, + this] (const std::error_code& e) { + return handle_failed_op(e, std::move(obc), *ox, *m); + })).handle_exception_type([ox_deleter = std::move(ox), + m, + obc, + this] (const crimson::osd::error& e) { + // we need this handler because throwing path which aren't errorated yet. 
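+ // a crimson::osd::error thrown outside the errorator is funnelled into
+ // the same handle_failed_op() path as the errorated failures above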
+ logger().debug("encountered the legacy error handling path!"); + return handle_failed_op(e.code(), std::move(obc), *ox_deleter, *m); + }); +} + +seastar::future<Ref<MOSDOpReply>> PG::do_pg_ops(Ref<MOSDOp> m) +{ + if (__builtin_expect(stopping, false)) { + throw crimson::common::system_shutdown_exception(); + } + + auto ox = std::make_unique<PgOpsExecuter>(std::as_const(*this), + std::as_const(*m)); + return seastar::do_for_each(m->ops, [ox = ox.get()](OSDOp& osd_op) { + logger().debug("will be handling pg op {}", ceph_osd_op_name(osd_op.op.op)); + return ox->execute_op(osd_op); + }).then([m, this, ox = std::move(ox)] { + auto reply = make_message<MOSDOpReply>(m.get(), 0, get_osdmap_epoch(), + CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK, + false); + return seastar::make_ready_future<Ref<MOSDOpReply>>(std::move(reply)); + }).handle_exception_type([=](const crimson::osd::error& e) { + auto reply = make_message<MOSDOpReply>( + m.get(), -e.code().value(), get_osdmap_epoch(), 0, false); + reply->set_enoent_reply_versions(peering_state.get_info().last_update, + peering_state.get_info().last_user_version); + return seastar::make_ready_future<Ref<MOSDOpReply>>(std::move(reply)); + }); +} + +hobject_t PG::get_oid(const MOSDOp &m) +{ + return (m.get_snapid() == CEPH_SNAPDIR ? + m.get_hobj().get_head() : + m.get_hobj()); +} + +RWState::State PG::get_lock_type(const OpInfo &op_info) +{ + + if (op_info.rwordered() && op_info.may_read()) { + return RWState::RWEXCL; + } else if (op_info.rwordered()) { + return RWState::RWWRITE; + } else { + ceph_assert(op_info.may_read()); + return RWState::RWREAD; + } +} + +std::optional<hobject_t> PG::resolve_oid( + const SnapSet &ss, + const hobject_t &oid) +{ + if (oid.snap > ss.seq) { + return oid.get_head(); + } else { + // which clone would it be? + auto clone = std::upper_bound( + begin(ss.clones), end(ss.clones), + oid.snap); + if (clone == end(ss.clones)) { + // Doesn't exist, > last clone, < ss.seq + return std::nullopt; + } + auto citer = ss.clone_snaps.find(*clone); + // TODO: how do we want to handle this kind of logic error? 
+ ceph_assert(citer != ss.clone_snaps.end()); + + if (std::find( + citer->second.begin(), + citer->second.end(), + *clone) == citer->second.end()) { + return std::nullopt; + } else { + auto soid = oid; + soid.snap = *clone; + return std::optional<hobject_t>(soid); + } + } +} + +template<RWState::State State> +PG::load_obc_ertr::future<> +PG::with_head_obc(hobject_t oid, with_obc_func_t&& func) +{ + assert(oid.is_head()); + auto [obc, existed] = shard_services.obc_registry.get_cached_obc(oid); + return obc->with_lock<State>( + [oid=std::move(oid), existed=existed, obc=std::move(obc), + func=std::move(func), this] { + auto loaded = load_obc_ertr::make_ready_future<ObjectContextRef>(obc); + if (existed) { + logger().debug("with_head_obc: found {} in cache", oid); + } else { + logger().debug("with_head_obc: cache miss on {}", oid); + loaded = obc->with_promoted_lock<State>([this, obc] { + return load_head_obc(obc); + }); + } + return loaded.safe_then([func=std::move(func)](auto obc) { + return func(std::move(obc)); + }); + }); +} + +template<RWState::State State> +PG::load_obc_ertr::future<> +PG::with_clone_obc(hobject_t oid, with_obc_func_t&& func) +{ + assert(!oid.is_head()); + return with_head_obc<RWState::RWREAD>(oid.get_head(), + [oid, func=std::move(func), this](auto head) -> load_obc_ertr::future<> { + auto coid = resolve_oid(head->get_ro_ss(), oid); + if (!coid) { + // TODO: return crimson::ct_error::enoent::make(); + logger().error("with_clone_obc: {} clone not found", coid); + return load_obc_ertr::make_ready_future<>(); + } + auto [clone, existed] = shard_services.obc_registry.get_cached_obc(*coid); + return clone->template with_lock<State>( + [coid=*coid, existed=existed, + head=std::move(head), clone=std::move(clone), + func=std::move(func), this]() -> load_obc_ertr::future<> { + auto loaded = load_obc_ertr::make_ready_future<ObjectContextRef>(clone); + if (existed) { + logger().debug("with_clone_obc: found {} in cache", coid); + } else { + logger().debug("with_clone_obc: cache miss on {}", coid); + loaded = clone->template with_promoted_lock<State>( + [coid, clone, head, this] { + return backend->load_metadata(coid).safe_then( + [coid, clone=std::move(clone), head=std::move(head)](auto md) mutable { + clone->set_clone_state(std::move(md->os), std::move(head)); + return clone; + }); + }); + } + return loaded.safe_then([func=std::move(func)](auto clone) { + return func(std::move(clone)); + }); + }); + }); +} + +// explicitly instantiate the used instantiations +template PG::load_obc_ertr::future<> +PG::with_head_obc<RWState::RWNONE>(hobject_t, with_obc_func_t&&); + +PG::load_obc_ertr::future<crimson::osd::ObjectContextRef> +PG::load_head_obc(ObjectContextRef obc) +{ + hobject_t oid = obc->get_oid(); + return backend->load_metadata(oid).safe_then([obc=std::move(obc)](auto md) + -> load_obc_ertr::future<crimson::osd::ObjectContextRef> { + const hobject_t& oid = md->os.oi.soid; + logger().debug( + "load_head_obc: loaded obs {} for {}", md->os.oi, oid); + if (!md->ss) { + logger().error( + "load_head_obc: oid {} missing snapset", oid); + return crimson::ct_error::object_corrupted::make(); + } + obc->set_head_state(std::move(md->os), std::move(*(md->ss))); + logger().debug( + "load_head_obc: returning obc {} for {}", + obc->obs.oi, obc->obs.oi.soid); + return load_obc_ertr::make_ready_future< + crimson::osd::ObjectContextRef>(obc); + }); +} + +PG::load_obc_ertr::future<> +PG::reload_obc(crimson::osd::ObjectContext& obc) const +{ + assert(obc.is_head()); + return 
backend->load_metadata(obc.get_oid()).safe_then([&obc](auto md) + -> load_obc_ertr::future<> { + logger().debug( + "{}: reloaded obs {} for {}", + __func__, + md->os.oi, + obc.get_oid()); + if (!md->ss) { + logger().error( + "{}: oid {} missing snapset", + __func__, + obc.get_oid()); + return crimson::ct_error::object_corrupted::make(); + } + obc.set_head_state(std::move(md->os), std::move(*(md->ss))); + return load_obc_ertr::now(); + }); +} + +PG::load_obc_ertr::future<> +PG::with_locked_obc(Ref<MOSDOp> &m, const OpInfo &op_info, + Operation *op, PG::with_obc_func_t &&f) +{ + if (__builtin_expect(stopping, false)) { + throw crimson::common::system_shutdown_exception(); + } + const hobject_t oid = get_oid(*m); + switch (get_lock_type(op_info)) { + case RWState::RWREAD: + if (oid.is_head()) { + return with_head_obc<RWState::RWREAD>(oid, std::move(f)); + } else { + return with_clone_obc<RWState::RWREAD>(oid, std::move(f)); + } + case RWState::RWWRITE: + if (oid.is_head()) { + return with_head_obc<RWState::RWWRITE>(oid, std::move(f)); + } else { + return with_clone_obc<RWState::RWWRITE>(oid, std::move(f)); + } + case RWState::RWEXCL: + if (oid.is_head()) { + return with_head_obc<RWState::RWWRITE>(oid, std::move(f)); + } else { + return with_clone_obc<RWState::RWWRITE>(oid, std::move(f)); + } + default: + ceph_abort(); + }; +} + +seastar::future<> PG::handle_rep_op(Ref<MOSDRepOp> req) +{ + if (__builtin_expect(stopping, false)) { + return seastar::make_exception_future<>( + crimson::common::system_shutdown_exception()); + } + + if (can_discard_replica_op(*req)) { + return seastar::now(); + } + + ceph::os::Transaction txn; + auto encoded_txn = req->get_data().cbegin(); + decode(txn, encoded_txn); + auto p = req->logbl.cbegin(); + std::vector<pg_log_entry_t> log_entries; + decode(log_entries, p); + peering_state.append_log(std::move(log_entries), req->pg_trim_to, + req->version, req->min_last_complete_ondisk, txn, !txn.empty(), false); + return shard_services.get_store().do_transaction(coll_ref, std::move(txn)) + .then([req, lcod=peering_state.get_info().last_complete, this] { + peering_state.update_last_complete_ondisk(lcod); + const auto map_epoch = get_osdmap_epoch(); + auto reply = make_message<MOSDRepOpReply>( + req.get(), pg_whoami, 0, + map_epoch, req->get_min_epoch(), CEPH_OSD_FLAG_ONDISK); + reply->set_last_complete_ondisk(lcod); + return shard_services.send_to_osd(req->from.osd, reply, map_epoch); + }); +} + +void PG::handle_rep_op_reply(crimson::net::ConnectionRef conn, + const MOSDRepOpReply& m) +{ + if (!can_discard_replica_op(m)) { + backend->got_rep_op_reply(m); + } +} + +template <typename MsgType> +bool PG::can_discard_replica_op(const MsgType& m) const +{ + // if a repop is replied after a replica goes down in a new osdmap, and + // before the pg advances to this new osdmap, the repop replies before this + // repop can be discarded by that replica OSD, because the primary resets the + // connection to it when handling the new osdmap marking it down, and also + // resets the messenger sesssion when the replica reconnects. to avoid the + // out-of-order replies, the messages from that replica should be discarded. + const auto osdmap = peering_state.get_osdmap(); + const int from_osd = m.get_source().num(); + if (osdmap->is_down(from_osd)) { + return true; + } + // Mostly, this overlaps with the old_peering_msg + // condition. An important exception is pushes + // sent by replicas not in the acting set, since + // if such a replica goes down it does not cause + // a new interval. 
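+ // the message was sent no later than the epoch in which the sender was
+ // last marked down, so it belongs to the stale pre-down session; drop it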
+ if (osdmap->get_down_at(from_osd) >= m.map_epoch) { + return true; + } + // same pg? + // if pg changes *at all*, we reset and repeer! + if (epoch_t lpr = peering_state.get_last_peering_reset(); + lpr > m.map_epoch) { + logger().debug("{}: pg changed {} after {}, dropping", + __func__, get_info().history, m.map_epoch); + return true; + } + return false; +} + +seastar::future<> PG::stop() +{ + logger().info("PG {} {}", pgid, __func__); + stopping = true; + return osdmap_gate.stop().then([this] { + return wait_for_active_blocker.stop(); + }).then([this] { + return recovery_handler->stop(); + }).then([this] { + return recovery_backend->stop(); + }).then([this] { + return backend->stop(); + }); +} + +void PG::on_change(ceph::os::Transaction &t) { + recovery_backend->on_peering_interval_change(t); + backend->on_actingset_changed({ is_primary() }); +} + +bool PG::can_discard_op(const MOSDOp& m) const { + return __builtin_expect(m.get_map_epoch() + < peering_state.get_info().history.same_primary_since, false); +} + +bool PG::is_degraded_or_backfilling_object(const hobject_t& soid) const { + /* The conditions below may clear (on_local_recover, before we queue + * the transaction) before we actually requeue the degraded waiters + * in on_global_recover after the transaction completes. + */ + if (peering_state.get_pg_log().get_missing().get_items().count(soid)) + return true; + ceph_assert(!get_acting_recovery_backfill().empty()); + for (auto& peer : get_acting_recovery_backfill()) { + if (peer == get_primary()) continue; + auto peer_missing_entry = peering_state.get_peer_missing().find(peer); + // If an object is missing on an async_recovery_target, return false. + // This will not block the op and the object is async recovered later. + if (peer_missing_entry != peering_state.get_peer_missing().end() && + peer_missing_entry->second.get_items().count(soid)) { + return true; + } + // Object is degraded if after last_backfill AND + // we are backfilling it + if (is_backfill_target(peer) && + peering_state.get_peer_info(peer).last_backfill <= soid && + recovery_handler->backfill_state->get_last_backfill_started() >= soid && + recovery_backend->is_recovering(soid)) { + return true; + } + } + return false; +} + +} diff --git a/src/crimson/osd/pg.h b/src/crimson/osd/pg.h new file mode 100644 index 000000000..34676ee7a --- /dev/null +++ b/src/crimson/osd/pg.h @@ -0,0 +1,704 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <memory> +#include <optional> +#include <boost/intrusive_ptr.hpp> +#include <boost/smart_ptr/intrusive_ref_counter.hpp> +#include <boost/smart_ptr/local_shared_ptr.hpp> +#include <seastar/core/future.hh> +#include <seastar/core/shared_future.hh> +#include <seastar/core/sleep.hh> + +#include "common/dout.h" +#include "crimson/net/Fwd.h" +#include "messages/MOSDRepOpReply.h" +#include "messages/MOSDOpReply.h" +#include "os/Transaction.h" +#include "osd/osd_types.h" +#include "crimson/osd/object_context.h" +#include "osd/PeeringState.h" + +#include "crimson/common/type_helpers.h" +#include "crimson/os/futurized_collection.h" +#include "crimson/osd/backfill_state.h" +#include "crimson/osd/osd_operations/client_request.h" +#include "crimson/osd/osd_operations/peering_event.h" +#include "crimson/osd/osd_operations/replicated_request.h" +#include "crimson/osd/osd_operations/background_recovery.h" +#include "crimson/osd/shard_services.h" +#include "crimson/osd/osdmap_gate.h" +#include "crimson/osd/pg_recovery.h" 
+#include "crimson/osd/pg_recovery_listener.h" +#include "crimson/osd/recovery_backend.h" + +class MQuery; +class OSDMap; +class PGBackend; +class PGPeeringEvent; +class osd_op_params_t; + +namespace recovery { + class Context; +} + +namespace crimson::net { + class Messenger; +} + +namespace crimson::os { + class FuturizedStore; +} + +namespace crimson::osd { +class ClientRequest; +class OpsExecuter; + +class PG : public boost::intrusive_ref_counter< + PG, + boost::thread_unsafe_counter>, + public PGRecoveryListener, + PeeringState::PeeringListener, + DoutPrefixProvider +{ + using ec_profile_t = std::map<std::string,std::string>; + using cached_map_t = boost::local_shared_ptr<const OSDMap>; + + ClientRequest::PGPipeline client_request_pg_pipeline; + PeeringEvent::PGPipeline peering_request_pg_pipeline; + RepRequest::PGPipeline replicated_request_pg_pipeline; + + spg_t pgid; + pg_shard_t pg_whoami; + crimson::os::CollectionRef coll_ref; + ghobject_t pgmeta_oid; + + seastar::timer<seastar::lowres_clock> check_readable_timer; + seastar::timer<seastar::lowres_clock> renew_lease_timer; + +public: + PG(spg_t pgid, + pg_shard_t pg_shard, + crimson::os::CollectionRef coll_ref, + pg_pool_t&& pool, + std::string&& name, + cached_map_t osdmap, + ShardServices &shard_services, + ec_profile_t profile); + + ~PG(); + + const pg_shard_t& get_pg_whoami() const final { + return pg_whoami; + } + + const spg_t& get_pgid() const final { + return pgid; + } + + PGBackend& get_backend() { + return *backend; + } + const PGBackend& get_backend() const { + return *backend; + } + // EpochSource + epoch_t get_osdmap_epoch() const final { + return peering_state.get_osdmap_epoch(); + } + + eversion_t get_pg_trim_to() const { + return peering_state.get_pg_trim_to(); + } + + eversion_t get_min_last_complete_ondisk() const { + return peering_state.get_min_last_complete_ondisk(); + } + + const pg_info_t& get_info() const final { + return peering_state.get_info(); + } + + // DoutPrefixProvider + std::ostream& gen_prefix(std::ostream& out) const final { + return out << *this; + } + crimson::common::CephContext *get_cct() const final { + return shard_services.get_cct(); + } + unsigned get_subsys() const final { + return ceph_subsys_osd; + } + + crimson::os::CollectionRef get_collection_ref() { + return coll_ref; + } + + // PeeringListener + void prepare_write( + pg_info_t &info, + pg_info_t &last_written_info, + PastIntervals &past_intervals, + PGLog &pglog, + bool dirty_info, + bool dirty_big_info, + bool need_write_epoch, + ceph::os::Transaction &t) final; + + void on_info_history_change() final { + // Not needed yet -- mainly for scrub scheduling + } + + void scrub_requested(scrub_level_t scrub_level, scrub_type_t scrub_type) final; + + uint64_t get_snap_trimq_size() const final { + return 0; + } + + void send_cluster_message( + int osd, MessageRef m, + epoch_t epoch, bool share_map_update=false) final { + (void)shard_services.send_to_osd(osd, m, epoch); + } + + void send_pg_created(pg_t pgid) final { + (void)shard_services.send_pg_created(pgid); + } + + bool try_flush_or_schedule_async() final; + + void start_flush_on_transaction( + ceph::os::Transaction &t) final { + t.register_on_commit( + new LambdaContext([this](int r){ + peering_state.complete_flush(); + })); + } + + void on_flushed() final { + // will be needed for unblocking IO operations/peering + } + + template <typename T> + void start_peering_event_operation(T &&evt, float delay = 0) { + (void) shard_services.start_operation<LocalPeeringEvent>( + this, + 
shard_services, + pg_whoami, + pgid, + delay, + std::forward<T>(evt)); + } + + void schedule_event_after( + PGPeeringEventRef event, + float delay) final { + start_peering_event_operation(std::move(*event), delay); + } + std::vector<pg_shard_t> get_replica_recovery_order() const final { + return peering_state.get_replica_recovery_order(); + } + void request_local_background_io_reservation( + unsigned priority, + PGPeeringEventURef on_grant, + PGPeeringEventURef on_preempt) final { + shard_services.local_reserver.request_reservation( + pgid, + on_grant ? make_lambda_context([this, on_grant=std::move(on_grant)] (int) { + start_peering_event_operation(std::move(*on_grant)); + }) : nullptr, + priority, + on_preempt ? make_lambda_context( + [this, on_preempt=std::move(on_preempt)] (int) { + start_peering_event_operation(std::move(*on_preempt)); + }) : nullptr); + } + + void update_local_background_io_priority( + unsigned priority) final { + shard_services.local_reserver.update_priority( + pgid, + priority); + } + + void cancel_local_background_io_reservation() final { + shard_services.local_reserver.cancel_reservation( + pgid); + } + + void request_remote_recovery_reservation( + unsigned priority, + PGPeeringEventURef on_grant, + PGPeeringEventURef on_preempt) final { + shard_services.remote_reserver.request_reservation( + pgid, + on_grant ? make_lambda_context([this, on_grant=std::move(on_grant)] (int) { + start_peering_event_operation(std::move(*on_grant)); + }) : nullptr, + priority, + on_preempt ? make_lambda_context( + [this, on_preempt=std::move(on_preempt)] (int) { + start_peering_event_operation(std::move(*on_preempt)); + }) : nullptr); + } + + void cancel_remote_recovery_reservation() final { + shard_services.remote_reserver.cancel_reservation( + pgid); + } + + void schedule_event_on_commit( + ceph::os::Transaction &t, + PGPeeringEventRef on_commit) final { + t.register_on_commit( + make_lambda_context( + [this, on_commit=std::move(on_commit)](int) { + start_peering_event_operation(std::move(*on_commit)); + })); + } + + void update_heartbeat_peers(set<int> peers) final { + // Not needed yet + } + void set_probe_targets(const set<pg_shard_t> &probe_set) final { + // Not needed yet + } + void clear_probe_targets() final { + // Not needed yet + } + void queue_want_pg_temp(const std::vector<int> &wanted) final { + shard_services.queue_want_pg_temp(pgid.pgid, wanted); + } + void clear_want_pg_temp() final { + shard_services.remove_want_pg_temp(pgid.pgid); + } + void publish_stats_to_osd() final { + if (!is_primary()) + return; + + (void) peering_state.prepare_stats_for_publish( + false, + pg_stat_t(), + object_stat_collection_t()); + } + void clear_publish_stats() final { + // Not needed yet + } + void check_recovery_sources(const OSDMapRef& newmap) final { + // Not needed yet + } + void check_blocklisted_watchers() final { + // Not needed yet + } + void clear_primary_state() final { + // Not needed yet + } + + void queue_check_readable(epoch_t last_peering_reset, + ceph::timespan delay) final; + void recheck_readable() final; + + unsigned get_target_pg_log_entries() const final; + + void on_pool_change() final { + // Not needed yet + } + void on_role_change() final { + // Not needed yet + } + void on_change(ceph::os::Transaction &t) final; + void on_activate(interval_set<snapid_t> to_trim) final; + void on_activate_complete() final; + void on_new_interval() final { + // Not needed yet + } + Context *on_clean() final { + // Not needed yet (will be needed for IO unblocking) + return nullptr; 
+ } + void on_activate_committed() final { + // Not needed yet (will be needed for IO unblocking) + } + void on_active_exit() final { + // Not needed yet + } + + void on_removal(ceph::os::Transaction &t) final { + // TODO + } + std::pair<ghobject_t, bool> + do_delete_work(ceph::os::Transaction &t, ghobject_t _next) final; + + // merge/split not ready + void clear_ready_to_merge() final {} + void set_not_ready_to_merge_target(pg_t pgid, pg_t src) final {} + void set_not_ready_to_merge_source(pg_t pgid) final {} + void set_ready_to_merge_target(eversion_t lu, epoch_t les, epoch_t lec) final {} + void set_ready_to_merge_source(eversion_t lu) final {} + + void on_active_actmap() final { + // Not needed yet + } + void on_active_advmap(const OSDMapRef &osdmap) final { + // Not needed yet + } + epoch_t oldest_stored_osdmap() final { + // TODO + return 0; + } + + void on_backfill_reserved() final { + recovery_handler->on_backfill_reserved(); + } + void on_backfill_canceled() final { + ceph_assert(0 == "Not implemented"); + } + + void on_recovery_reserved() final { + recovery_handler->start_pglogbased_recovery(); + } + + + bool try_reserve_recovery_space( + int64_t primary_num_bytes, int64_t local_num_bytes) final { + // TODO + return true; + } + void unreserve_recovery_space() final {} + + struct PGLogEntryHandler : public PGLog::LogEntryHandler { + PG *pg; + ceph::os::Transaction *t; + PGLogEntryHandler(PG *pg, ceph::os::Transaction *t) : pg(pg), t(t) {} + + // LogEntryHandler + void remove(const hobject_t &hoid) override { + // TODO + } + void try_stash(const hobject_t &hoid, version_t v) override { + // TODO + } + void rollback(const pg_log_entry_t &entry) override { + // TODO + } + void rollforward(const pg_log_entry_t &entry) override { + // TODO + } + void trim(const pg_log_entry_t &entry) override { + // TODO + } + }; + PGLog::LogEntryHandlerRef get_log_handler( + ceph::os::Transaction &t) final { + return std::make_unique<PG::PGLogEntryHandler>(this, &t); + } + + void rebuild_missing_set_with_deletes(PGLog &pglog) final { + ceph_assert(0 == "Impossible for crimson"); + } + + PerfCounters &get_peering_perf() final { + return shard_services.get_recoverystate_perf_logger(); + } + PerfCounters &get_perf_logger() final { + return shard_services.get_perf_logger(); + } + + void log_state_enter(const char *state) final; + void log_state_exit( + const char *state_name, utime_t enter_time, + uint64_t events, utime_t event_dur) final; + + void dump_recovery_info(Formatter *f) const final { + } + + OstreamTemp get_clog_info() final { + // not needed yet: replace with not a stub (needs to be wired up to monc) + return OstreamTemp(CLOG_INFO, nullptr); + } + OstreamTemp get_clog_debug() final { + // not needed yet: replace with not a stub (needs to be wired up to monc) + return OstreamTemp(CLOG_DEBUG, nullptr); + } + OstreamTemp get_clog_error() final { + // not needed yet: replace with not a stub (needs to be wired up to monc) + return OstreamTemp(CLOG_ERROR, nullptr); + } + + ceph::signedspan get_mnow() final; + HeartbeatStampsRef get_hb_stamps(int peer) final; + void schedule_renew_lease(epoch_t plr, ceph::timespan delay) final; + + + // Utility + bool is_primary() const final { + return peering_state.is_primary(); + } + bool is_nonprimary() const { + return peering_state.is_nonprimary(); + } + bool is_peered() const final { + return peering_state.is_peered(); + } + bool is_recovering() const final { + return peering_state.is_recovering(); + } + bool is_backfilling() const final { + return 
peering_state.is_backfilling(); + } + pg_stat_t get_stats() { + auto stats = peering_state.prepare_stats_for_publish( + false, + pg_stat_t(), + object_stat_collection_t()); + ceph_assert(stats); + return *stats; + } + bool get_need_up_thru() const { + return peering_state.get_need_up_thru(); + } + epoch_t get_same_interval_since() const { + return get_info().history.same_interval_since; + } + + const auto& get_pool() const { + return peering_state.get_pool(); + } + pg_shard_t get_primary() const { + return peering_state.get_primary(); + } + + /// initialize created PG + void init( + int role, + const std::vector<int>& up, + int up_primary, + const std::vector<int>& acting, + int acting_primary, + const pg_history_t& history, + const PastIntervals& pim, + bool backfill, + ceph::os::Transaction &t); + + seastar::future<> read_state(crimson::os::FuturizedStore* store); + + void do_peering_event( + PGPeeringEvent& evt, PeeringCtx &rctx); + + void handle_advance_map(cached_map_t next_map, PeeringCtx &rctx); + void handle_activate_map(PeeringCtx &rctx); + void handle_initialize(PeeringCtx &rctx); + + static hobject_t get_oid(const MOSDOp &m); + static RWState::State get_lock_type(const OpInfo &op_info); + static std::optional<hobject_t> resolve_oid( + const SnapSet &snapset, + const hobject_t &oid); + + using load_obc_ertr = crimson::errorator< + crimson::ct_error::object_corrupted>; + + load_obc_ertr::future<crimson::osd::ObjectContextRef> + load_head_obc(ObjectContextRef obc); + + load_obc_ertr::future<> + reload_obc(crimson::osd::ObjectContext& obc) const; + +public: + using with_obc_func_t = + std::function<load_obc_ertr::future<> (ObjectContextRef)>; + + template<RWState::State State> + load_obc_ertr::future<> with_head_obc(hobject_t oid, with_obc_func_t&& func); + + load_obc_ertr::future<> with_locked_obc( + Ref<MOSDOp> &m, + const OpInfo &op_info, + Operation *op, + with_obc_func_t&& f); + + seastar::future<> handle_rep_op(Ref<MOSDRepOp> m); + void handle_rep_op_reply(crimson::net::ConnectionRef conn, + const MOSDRepOpReply& m); + + void print(std::ostream& os) const; + void dump_primary(Formatter*); + +private: + template<RWState::State State> + load_obc_ertr::future<> with_clone_obc(hobject_t oid, with_obc_func_t&& func); + + load_obc_ertr::future<ObjectContextRef> get_locked_obc( + Operation *op, + const hobject_t &oid, + RWState::State type); + + void do_peering_event( + const boost::statechart::event_base &evt, + PeeringCtx &rctx); + osd_op_params_t&& fill_op_params_bump_pg_version( + osd_op_params_t&& osd_op_p, + Ref<MOSDOp> m, + const bool user_modify); + seastar::future<Ref<MOSDOpReply>> handle_failed_op( + const std::error_code& e, + ObjectContextRef obc, + const OpsExecuter& ox, + const MOSDOp& m) const; + seastar::future<Ref<MOSDOpReply>> do_osd_ops( + Ref<MOSDOp> m, + ObjectContextRef obc, + const OpInfo &op_info); + seastar::future<Ref<MOSDOpReply>> do_pg_ops(Ref<MOSDOp> m); + seastar::future<> submit_transaction(const OpInfo& op_info, + const std::vector<OSDOp>& ops, + ObjectContextRef&& obc, + ceph::os::Transaction&& txn, + const osd_op_params_t& oop); + +private: + OSDMapGate osdmap_gate; + ShardServices &shard_services; + + cached_map_t osdmap; + +public: + cached_map_t get_osdmap() { return osdmap; } + eversion_t next_version() { + return eversion_t(get_osdmap_epoch(), + ++projected_last_update.version); + } + ShardServices& get_shard_services() final { + return shard_services; + } + seastar::future<> stop(); + +private: + std::unique_ptr<PGBackend> backend; + 
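// (annotation, not part of the original change) The undocumented members that
// follow pair the I/O backend above with the recovery machinery:
// recovery_backend moves object data during recovery, recovery_handler
// (PGRecovery) decides what to recover and when, peering_state holds the
// shared peering metadata, and projected_last_update is the version counter
// bumped by next_version() above for each new write.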
std::unique_ptr<RecoveryBackend> recovery_backend; + std::unique_ptr<PGRecovery> recovery_handler; + + PeeringState peering_state; + eversion_t projected_last_update; +public: + RecoveryBackend* get_recovery_backend() final { + return recovery_backend.get(); + } + PGRecovery* get_recovery_handler() final { + return recovery_handler.get(); + } + PeeringState& get_peering_state() final { + return peering_state; + } + bool has_reset_since(epoch_t epoch) const final { + return peering_state.pg_has_reset_since(epoch); + } + + const pg_missing_tracker_t& get_local_missing() const { + return peering_state.get_pg_log().get_missing(); + } + epoch_t get_last_peering_reset() const final { + return peering_state.get_last_peering_reset(); + } + const set<pg_shard_t> &get_acting_recovery_backfill() const { + return peering_state.get_acting_recovery_backfill(); + } + bool is_backfill_target(pg_shard_t osd) const { + return peering_state.is_backfill_target(osd); + } + void begin_peer_recover(pg_shard_t peer, const hobject_t oid) { + peering_state.begin_peer_recover(peer, oid); + } + uint64_t min_peer_features() const { + return peering_state.get_min_peer_features(); + } + const map<hobject_t, set<pg_shard_t>>& + get_missing_loc_shards() const { + return peering_state.get_missing_loc().get_missing_locs(); + } + const map<pg_shard_t, pg_missing_t> &get_shard_missing() const { + return peering_state.get_peer_missing(); + } + const pg_missing_const_i* get_shard_missing(pg_shard_t shard) const { + if (shard == pg_whoami) + return &get_local_missing(); + else { + auto it = peering_state.get_peer_missing().find(shard); + if (it == peering_state.get_peer_missing().end()) + return nullptr; + else + return &it->second; + } + } + int get_recovery_op_priority() const { + int64_t pri = 0; + get_pool().info.opts.get(pool_opts_t::RECOVERY_OP_PRIORITY, &pri); + return pri > 0 ? pri : crimson::common::local_conf()->osd_recovery_op_priority; + } + seastar::future<> mark_unfound_lost(int) { + // TODO: see PrimaryLogPG::mark_all_unfound_lost() + return seastar::now(); + } + +private: + // instead of seastar::gate, we use a boolean flag to indicate + // whether the system is shutting down, as we don't need to track + // continuations here. 
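// (annotation, not part of the original change) For contrast, the
// seastar::gate pattern this comment alludes to would look roughly like:
//
//   seastar::gate gate;                      // member next to `stopping`
//
//   // per in-flight operation:
//   return seastar::with_gate(gate, [] { return do_io(); });
//
//   // in stop():
//   return gate.close();                     // waits for all entrants to leave
//
// Because nothing in this class has to wait for in-flight continuations,
// checking a plain boolean at the entry points is enough.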
+ bool stopping = false; + + class WaitForActiveBlocker : public BlockerT<WaitForActiveBlocker> { + PG *pg; + + const spg_t pgid; + seastar::shared_promise<> p; + + protected: + void dump_detail(Formatter *f) const; + + public: + static constexpr const char *type_name = "WaitForActiveBlocker"; + + WaitForActiveBlocker(PG *pg) : pg(pg) {} + void on_active(); + blocking_future<> wait(); + seastar::future<> stop(); + } wait_for_active_blocker; + + friend std::ostream& operator<<(std::ostream&, const PG& pg); + friend class ClientRequest; + friend class PGAdvanceMap; + friend class PeeringEvent; + friend class RepRequest; + friend class BackfillRecovery; + friend struct PGFacade; +private: + seastar::future<bool> find_unfound() { + return seastar::make_ready_future<bool>(true); + } + + template <typename MsgType> + bool can_discard_replica_op(const MsgType& m) const; + bool can_discard_op(const MOSDOp& m) const; + bool is_missing_object(const hobject_t& soid) const { + return peering_state.get_pg_log().get_missing().get_items().count(soid); + } + bool is_unreadable_object(const hobject_t &oid, + eversion_t* v = 0) const final { + return is_missing_object(oid) || + !peering_state.get_missing_loc().readable_with_acting( + oid, get_actingset(), v); + } + bool is_degraded_or_backfilling_object(const hobject_t& soid) const; + const set<pg_shard_t> &get_actingset() const { + return peering_state.get_actingset(); + } + +private: + BackfillRecovery::BackfillRecoveryPipeline backfill_pipeline; +}; + +std::ostream& operator<<(std::ostream&, const PG& pg); + +} diff --git a/src/crimson/osd/pg_backend.cc b/src/crimson/osd/pg_backend.cc new file mode 100644 index 000000000..38dbdbf41 --- /dev/null +++ b/src/crimson/osd/pg_backend.cc @@ -0,0 +1,1171 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "pg_backend.h" + +#include <optional> +#include <boost/range/adaptor/filtered.hpp> +#include <boost/range/adaptor/transformed.hpp> +#include <boost/range/algorithm/copy.hpp> +#include <fmt/format.h> +#include <fmt/ostream.h> +#include <seastar/core/print.hh> + +#include "messages/MOSDOp.h" +#include "os/Transaction.h" +#include "common/Checksummer.h" +#include "common/Clock.h" + +#include "crimson/common/exception.h" +#include "crimson/os/futurized_collection.h" +#include "crimson/os/futurized_store.h" +#include "crimson/osd/osd_operation.h" +#include "replicated_backend.h" +#include "replicated_recovery_backend.h" +#include "ec_backend.h" +#include "exceptions.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_osd); + } +} + +using crimson::common::local_conf; + +std::unique_ptr<PGBackend> +PGBackend::create(pg_t pgid, + const pg_shard_t pg_shard, + const pg_pool_t& pool, + crimson::os::CollectionRef coll, + crimson::osd::ShardServices& shard_services, + const ec_profile_t& ec_profile) +{ + switch (pool.type) { + case pg_pool_t::TYPE_REPLICATED: + return std::make_unique<ReplicatedBackend>(pgid, pg_shard, + coll, shard_services); + case pg_pool_t::TYPE_ERASURE: + return std::make_unique<ECBackend>(pg_shard.shard, coll, shard_services, + std::move(ec_profile), + pool.stripe_width); + default: + throw runtime_error(seastar::format("unsupported pool type '{}'", + pool.type)); + } +} + +PGBackend::PGBackend(shard_id_t shard, + CollectionRef coll, + crimson::os::FuturizedStore* store) + : shard{shard}, + coll{coll}, + store{store} +{} + +PGBackend::load_metadata_ertr::future<PGBackend::loaded_object_md_t::ref> 
+PGBackend::load_metadata(const hobject_t& oid) +{ + if (__builtin_expect(stopping, false)) { + throw crimson::common::system_shutdown_exception(); + } + + return store->get_attrs( + coll, + ghobject_t{oid, ghobject_t::NO_GEN, shard}).safe_then( + [oid](auto &&attrs) -> load_metadata_ertr::future<loaded_object_md_t::ref>{ + loaded_object_md_t::ref ret(new loaded_object_md_t()); + if (auto oiiter = attrs.find(OI_ATTR); oiiter != attrs.end()) { + bufferlist bl; + bl.push_back(std::move(oiiter->second)); + ret->os = ObjectState( + object_info_t(bl), + true); + } else { + logger().error( + "load_metadata: object {} present but missing object info", + oid); + return crimson::ct_error::object_corrupted::make(); + } + + if (oid.is_head()) { + if (auto ssiter = attrs.find(SS_ATTR); ssiter != attrs.end()) { + bufferlist bl; + bl.push_back(std::move(ssiter->second)); + ret->ss = SnapSet(bl); + } else { + /* TODO: add support for writing out snapsets + logger().error( + "load_metadata: object {} present but missing snapset", + oid); + //return crimson::ct_error::object_corrupted::make(); + */ + ret->ss = SnapSet(); + } + } + + return load_metadata_ertr::make_ready_future<loaded_object_md_t::ref>( + std::move(ret)); + }, crimson::ct_error::enoent::handle([oid] { + logger().debug( + "load_metadata: object {} doesn't exist, returning empty metadata", + oid); + return load_metadata_ertr::make_ready_future<loaded_object_md_t::ref>( + new loaded_object_md_t{ + ObjectState( + object_info_t(oid), + false), + oid.is_head() ? std::optional<SnapSet>(SnapSet()) : std::nullopt + }); + })); +} + +seastar::future<crimson::osd::acked_peers_t> +PGBackend::mutate_object( + std::set<pg_shard_t> pg_shards, + crimson::osd::ObjectContextRef &&obc, + ceph::os::Transaction&& txn, + const osd_op_params_t& osd_op_p, + epoch_t min_epoch, + epoch_t map_epoch, + std::vector<pg_log_entry_t>&& log_entries) +{ + logger().trace("mutate_object: num_ops={}", txn.get_num_ops()); + if (obc->obs.exists) { +#if 0 + obc->obs.oi.version = ctx->at_version; + obc->obs.oi.prior_version = ctx->obs->oi.version; +#endif + + auto& m = osd_op_p.req; + obc->obs.oi.prior_version = obc->obs.oi.version; + obc->obs.oi.version = osd_op_p.at_version; + if (osd_op_p.user_at_version > obc->obs.oi.user_version) + obc->obs.oi.user_version = osd_op_p.user_at_version; + obc->obs.oi.last_reqid = m->get_reqid(); + obc->obs.oi.mtime = m->get_mtime(); + obc->obs.oi.local_mtime = ceph_clock_now(); + + // object_info_t + { + ceph::bufferlist osv; + encode(obc->obs.oi, osv, CEPH_FEATURES_ALL); + // TODO: get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr)); + txn.setattr(coll->get_cid(), ghobject_t{obc->obs.oi.soid}, OI_ATTR, osv); + } + } else { + // reset cached ObjectState without enforcing eviction + obc->obs.oi = object_info_t(obc->obs.oi.soid); + } + return _submit_transaction( + std::move(pg_shards), obc->obs.oi.soid, std::move(txn), + std::move(osd_op_p), min_epoch, map_epoch, std::move(log_entries)); +} + +static inline bool _read_verify_data( + const object_info_t& oi, + const ceph::bufferlist& data) +{ + if (oi.is_data_digest() && oi.size == data.length()) { + // whole object? can we verify the checksum? 
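// (annotation, not part of the original change) The recorded data_digest
// covers the entire object, which is why the check above requires
// oi.size == data.length(): a partial read cannot be compared against it.
// The crc32c(-1) seed below matches the seed used when the digest was
// originally computed on write.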
+ if (auto crc = data.crc32c(-1); crc != oi.data_digest) { + logger().error("full-object read crc {} != expected {} on {}", + crc, oi.data_digest, oi.soid); + // todo: mark soid missing, perform recovery, and retry + return false; + } + } + return true; +} + +PGBackend::read_errorator::future<> +PGBackend::read(const ObjectState& os, OSDOp& osd_op) +{ + const auto& oi = os.oi; + const ceph_osd_op& op = osd_op.op; + const uint64_t offset = op.extent.offset; + uint64_t length = op.extent.length; + logger().trace("read: {} {}~{}", oi.soid, offset, length); + + if (!os.exists || os.oi.is_whiteout()) { + logger().debug("{}: {} DNE", __func__, os.oi.soid); + return crimson::ct_error::enoent::make(); + } + // are we beyond truncate_size? + size_t size = oi.size; + if ((op.extent.truncate_seq > oi.truncate_seq) && + (op.extent.truncate_size < offset + length) && + (op.extent.truncate_size < size)) { + size = op.extent.truncate_size; + } + if (offset >= size) { + // read size was trimmed to zero and it is expected to do nothing, + return read_errorator::now(); + } + if (!length) { + // read the whole object if length is 0 + length = size; + } + return _read(oi.soid, offset, length, op.flags).safe_then( + [&oi, &osd_op](auto&& bl) -> read_errorator::future<> { + if (!_read_verify_data(oi, bl)) { + return crimson::ct_error::object_corrupted::make(); + } + logger().debug("read: data length: {}", bl.length()); + osd_op.rval = bl.length(); + osd_op.outdata = std::move(bl); + return read_errorator::now(); + }); +} + +PGBackend::read_errorator::future<> +PGBackend::sparse_read(const ObjectState& os, OSDOp& osd_op) +{ + const auto& op = osd_op.op; + logger().trace("sparse_read: {} {}~{}", + os.oi.soid, op.extent.offset, op.extent.length); + return store->fiemap(coll, ghobject_t{os.oi.soid}, + op.extent.offset, + op.extent.length).then([&os, &osd_op, this](auto&& m) { + return seastar::do_with(interval_set<uint64_t>{std::move(m)}, + [&os, &osd_op, this](auto&& extents) { + return store->readv(coll, ghobject_t{os.oi.soid}, + extents, osd_op.op.flags).safe_then( + [&os, &osd_op, &extents](auto&& bl) -> read_errorator::future<> { + if (_read_verify_data(os.oi, bl)) { + osd_op.op.extent.length = bl.length(); + // re-encode since it might be modified + ceph::encode(extents, osd_op.outdata); + encode_destructively(bl, osd_op.outdata); + logger().trace("sparse_read got {} bytes from object {}", + osd_op.op.extent.length, os.oi.soid); + return read_errorator::make_ready_future<>(); + } else { + // TODO: repair it if crc mismatches + return crimson::ct_error::object_corrupted::make(); + } + }); + }); + }); +} + +namespace { + + template<class CSum> + PGBackend::checksum_errorator::future<> + do_checksum(ceph::bufferlist& init_value_bl, + size_t chunk_size, + const ceph::bufferlist& buf, + ceph::bufferlist& result) + { + typename CSum::init_value_t init_value; + auto init_value_p = init_value_bl.cbegin(); + try { + decode(init_value, init_value_p); + // chop off the consumed part + init_value_bl.splice(0, init_value_p.get_off()); + } catch (const ceph::buffer::end_of_buffer&) { + logger().warn("{}: init value not provided", __func__); + return crimson::ct_error::invarg::make(); + } + const uint32_t chunk_count = buf.length() / chunk_size; + ceph::bufferptr csum_data{ + ceph::buffer::create(sizeof(typename CSum::value_t) * chunk_count)}; + Checksummer::calculate<CSum>( + init_value, chunk_size, 0, buf.length(), buf, &csum_data); + encode(chunk_count, result); + result.append(std::move(csum_data)); + return 
PGBackend::checksum_errorator::now(); + } +} + +PGBackend::checksum_errorator::future<> +PGBackend::checksum(const ObjectState& os, OSDOp& osd_op) +{ + // sanity tests and normalize the argments + auto& checksum = osd_op.op.checksum; + if (checksum.offset == 0 && checksum.length == 0) { + // zeroed offset+length implies checksum whole object + checksum.length = os.oi.size; + } else if (checksum.offset >= os.oi.size) { + // read size was trimmed to zero, do nothing, + // see PGBackend::read() + return checksum_errorator::now(); + } + if (checksum.chunk_size > 0) { + if (checksum.length == 0) { + logger().warn("{}: length required when chunk size provided", __func__); + return crimson::ct_error::invarg::make(); + } + if (checksum.length % checksum.chunk_size != 0) { + logger().warn("{}: length not aligned to chunk size", __func__); + return crimson::ct_error::invarg::make(); + } + } else { + checksum.chunk_size = checksum.length; + } + if (checksum.length == 0) { + uint32_t count = 0; + encode(count, osd_op.outdata); + return checksum_errorator::now(); + } + + // read the chunk to be checksum'ed + return _read(os.oi.soid, checksum.offset, checksum.length, osd_op.op.flags).safe_then( + [&osd_op](auto&& read_bl) mutable -> checksum_errorator::future<> { + auto& checksum = osd_op.op.checksum; + if (read_bl.length() != checksum.length) { + logger().warn("checksum: bytes read {} != {}", + read_bl.length(), checksum.length); + return crimson::ct_error::invarg::make(); + } + // calculate its checksum and put the result in outdata + switch (checksum.type) { + case CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH32: + return do_checksum<Checksummer::xxhash32>(osd_op.indata, + checksum.chunk_size, + read_bl, + osd_op.outdata); + case CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH64: + return do_checksum<Checksummer::xxhash64>(osd_op.indata, + checksum.chunk_size, + read_bl, + osd_op.outdata); + case CEPH_OSD_CHECKSUM_OP_TYPE_CRC32C: + return do_checksum<Checksummer::crc32c>(osd_op.indata, + checksum.chunk_size, + read_bl, + osd_op.outdata); + default: + logger().warn("checksum: unknown crc type ({})", + static_cast<uint32_t>(checksum.type)); + return crimson::ct_error::invarg::make(); + } + }); +} + +PGBackend::cmp_ext_errorator::future<> +PGBackend::cmp_ext(const ObjectState& os, OSDOp& osd_op) +{ + const ceph_osd_op& op = osd_op.op; + // return the index of the first unmatched byte in the payload, hence the + // strange limit and check + if (op.extent.length > MAX_ERRNO) { + return crimson::ct_error::invarg::make(); + } + uint64_t obj_size = os.oi.size; + if (os.oi.truncate_seq < op.extent.truncate_seq && + op.extent.offset + op.extent.length > op.extent.truncate_size) { + obj_size = op.extent.truncate_size; + } + uint64_t ext_len; + if (op.extent.offset >= obj_size) { + ext_len = 0; + } else if (op.extent.offset + op.extent.length > obj_size) { + ext_len = obj_size - op.extent.offset; + } else { + ext_len = op.extent.length; + } + auto read_ext = ll_read_errorator::make_ready_future<ceph::bufferlist>(); + if (ext_len == 0) { + logger().debug("{}: zero length extent", __func__); + } else if (!os.exists || os.oi.is_whiteout()) { + logger().debug("{}: {} DNE", __func__, os.oi.soid); + } else { + read_ext = _read(os.oi.soid, op.extent.offset, ext_len, 0); + } + return read_ext.safe_then([&osd_op](auto&& read_bl) { + int32_t retcode = 0; + for (unsigned index = 0; index < osd_op.indata.length(); index++) { + char byte_in_op = osd_op.indata[index]; + char byte_from_disk = (index < read_bl.length() ? 
read_bl[index] : 0); + if (byte_in_op != byte_from_disk) { + logger().debug("cmp_ext: mismatch at {}", index); + retcode = -MAX_ERRNO - index; + break; + } + } + logger().debug("cmp_ext: {}", retcode); + osd_op.rval = retcode; + }); +} + +PGBackend::stat_errorator::future<> PGBackend::stat( + const ObjectState& os, + OSDOp& osd_op) +{ + if (os.exists/* TODO: && !os.is_whiteout() */) { + logger().debug("stat os.oi.size={}, os.oi.mtime={}", os.oi.size, os.oi.mtime); + encode(os.oi.size, osd_op.outdata); + encode(os.oi.mtime, osd_op.outdata); + } else { + logger().debug("stat object does not exist"); + return crimson::ct_error::enoent::make(); + } + return stat_errorator::now(); + // TODO: ctx->delta_stats.num_rd++; +} + +bool PGBackend::maybe_create_new_object( + ObjectState& os, + ceph::os::Transaction& txn) +{ + if (!os.exists) { + ceph_assert(!os.oi.is_whiteout()); + os.exists = true; + os.oi.new_object(); + + txn.touch(coll->get_cid(), ghobject_t{os.oi.soid}); + // TODO: delta_stats.num_objects++ + return false; + } else if (os.oi.is_whiteout()) { + os.oi.clear_flag(object_info_t::FLAG_WHITEOUT); + // TODO: delta_stats.num_whiteouts-- + } + return true; +} + +static bool is_offset_and_length_valid( + const std::uint64_t offset, + const std::uint64_t length) +{ + if (const std::uint64_t max = local_conf()->osd_max_object_size; + offset >= max || length > max || offset + length > max) { + logger().debug("{} osd_max_object_size: {}, offset: {}, len: {}; " + "Hard limit of object size is 4GB", + __func__, max, offset, length); + return false; + } else { + return true; + } +} + +seastar::future<> PGBackend::write( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& txn, + osd_op_params_t& osd_op_params) +{ + const ceph_osd_op& op = osd_op.op; + uint64_t offset = op.extent.offset; + uint64_t length = op.extent.length; + bufferlist buf = osd_op.indata; + if (auto seq = os.oi.truncate_seq; + seq != 0 && op.extent.truncate_seq < seq) { + // old write, arrived after trimtrunc + if (offset + length > os.oi.size) { + // no-op + if (offset > os.oi.size) { + length = 0; + buf.clear(); + } else { + // truncate + auto len = os.oi.size - offset; + buf.splice(len, length); + length = len; + } + } + } else if (op.extent.truncate_seq > seq) { + // write arrives before trimtrunc + if (os.exists && !os.oi.is_whiteout()) { + txn.truncate(coll->get_cid(), + ghobject_t{os.oi.soid}, op.extent.truncate_size); + if (op.extent.truncate_size != os.oi.size) { + os.oi.size = length; + // TODO: truncate_update_size_and_usage() + if (op.extent.truncate_size > os.oi.size) { + osd_op_params.clean_regions.mark_data_region_dirty(os.oi.size, + op.extent.truncate_size - os.oi.size); + } else { + osd_op_params.clean_regions.mark_data_region_dirty(op.extent.truncate_size, + os.oi.size - op.extent.truncate_size); + } + } + } + os.oi.truncate_seq = op.extent.truncate_seq; + os.oi.truncate_size = op.extent.truncate_size; + } + maybe_create_new_object(os, txn); + if (length == 0) { + if (offset > os.oi.size) { + txn.truncate(coll->get_cid(), ghobject_t{os.oi.soid}, op.extent.offset); + } else { + txn.nop(); + } + } else { + txn.write(coll->get_cid(), ghobject_t{os.oi.soid}, + offset, length, std::move(buf), op.flags); + os.oi.size = std::max(offset + length, os.oi.size); + } + osd_op_params.clean_regions.mark_data_region_dirty(op.extent.offset, + op.extent.length); + + return seastar::now(); +} + +seastar::future<> PGBackend::write_same( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& txn, + 
osd_op_params_t& osd_op_params) +{ + const ceph_osd_op& op = osd_op.op; + const uint64_t len = op.writesame.length; + if (len == 0) { + return seastar::now(); + } + if (op.writesame.data_length == 0 || + len % op.writesame.data_length != 0 || + op.writesame.data_length != osd_op.indata.length()) { + throw crimson::osd::invalid_argument(); + } + ceph::bufferlist repeated_indata; + for (uint64_t size = 0; size < len; size += op.writesame.data_length) { + repeated_indata.append(osd_op.indata); + } + maybe_create_new_object(os, txn); + txn.write(coll->get_cid(), ghobject_t{os.oi.soid}, + op.writesame.offset, len, + std::move(repeated_indata), op.flags); + os.oi.size = len; + osd_op_params.clean_regions.mark_data_region_dirty(op.writesame.offset, len); + return seastar::now(); +} + +seastar::future<> PGBackend::writefull( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& txn, + osd_op_params_t& osd_op_params) +{ + const ceph_osd_op& op = osd_op.op; + if (op.extent.length != osd_op.indata.length()) { + throw crimson::osd::invalid_argument(); + } + + const bool existing = maybe_create_new_object(os, txn); + if (existing && op.extent.length < os.oi.size) { + txn.truncate(coll->get_cid(), ghobject_t{os.oi.soid}, op.extent.length); + osd_op_params.clean_regions.mark_data_region_dirty(op.extent.length, + os.oi.size - op.extent.length); + } + if (op.extent.length) { + txn.write(coll->get_cid(), ghobject_t{os.oi.soid}, 0, op.extent.length, + osd_op.indata, op.flags); + os.oi.size = op.extent.length; + osd_op_params.clean_regions.mark_data_region_dirty(0, + std::max((uint64_t) op.extent.length, os.oi.size)); + } + return seastar::now(); +} + +PGBackend::append_errorator::future<> PGBackend::append( + ObjectState& os, + OSDOp& osd_op, + ceph::os::Transaction& txn, + osd_op_params_t& osd_op_params) +{ + const ceph_osd_op& op = osd_op.op; + if (op.extent.length != osd_op.indata.length()) { + return crimson::ct_error::invarg::make(); + } + maybe_create_new_object(os, txn); + if (op.extent.length) { + txn.write(coll->get_cid(), ghobject_t{os.oi.soid}, + os.oi.size /* offset */, op.extent.length, + std::move(osd_op.indata), op.flags); + os.oi.size += op.extent.length; + osd_op_params.clean_regions.mark_data_region_dirty(os.oi.size, + op.extent.length); + } + return seastar::now(); +} + +PGBackend::write_ertr::future<> PGBackend::truncate( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& txn, + osd_op_params_t& osd_op_params) +{ + if (!os.exists || os.oi.is_whiteout()) { + logger().debug("{} object dne, truncate is a no-op", __func__); + return write_ertr::now(); + } + const ceph_osd_op& op = osd_op.op; + if (!is_offset_and_length_valid(op.extent.offset, op.extent.length)) { + return crimson::ct_error::file_too_large::make(); + } + if (op.extent.truncate_seq) { + assert(op.extent.offset == op.extent.truncate_size); + if (op.extent.truncate_seq <= os.oi.truncate_seq) { + logger().debug("{} truncate seq {} <= current {}, no-op", + __func__, op.extent.truncate_seq, os.oi.truncate_seq); + return write_ertr::make_ready_future<>(); + } else { + logger().debug("{} truncate seq {} > current {}, truncating", + __func__, op.extent.truncate_seq, os.oi.truncate_seq); + os.oi.truncate_seq = op.extent.truncate_seq; + os.oi.truncate_size = op.extent.truncate_size; + } + } + maybe_create_new_object(os, txn); + if (os.oi.size != op.extent.offset) { + txn.truncate(coll->get_cid(), + ghobject_t{os.oi.soid}, op.extent.offset); + if (os.oi.size > op.extent.offset) { + // TODO: 
modified_ranges.union_of(trim); + osd_op_params.clean_regions.mark_data_region_dirty( + op.extent.offset, + os.oi.size - op.extent.offset); + } else { + // os.oi.size < op.extent.offset + osd_op_params.clean_regions.mark_data_region_dirty( + os.oi.size, + op.extent.offset - os.oi.size); + } + os.oi.size = op.extent.offset; + os.oi.clear_data_digest(); + } + // TODO: truncate_update_size_and_usage() + // TODO: ctx->delta_stats.num_wr++; + // ---- + // do no set exists, or we will break above DELETE -> TRUNCATE munging. + return write_ertr::now(); +} + +PGBackend::write_ertr::future<> PGBackend::zero( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& txn, + osd_op_params_t& osd_op_params) +{ + if (!os.exists || os.oi.is_whiteout()) { + logger().debug("{} object dne, zero is a no-op", __func__); + return write_ertr::now(); + } + const ceph_osd_op& op = osd_op.op; + if (!is_offset_and_length_valid(op.extent.offset, op.extent.length)) { + return crimson::ct_error::file_too_large::make(); + } + assert(op.extent.length); + txn.zero(coll->get_cid(), + ghobject_t{os.oi.soid}, + op.extent.offset, + op.extent.length); + // TODO: modified_ranges.union_of(zeroed); + osd_op_params.clean_regions.mark_data_region_dirty(op.extent.offset, + op.extent.length); + // TODO: ctx->delta_stats.num_wr++; + os.oi.clear_data_digest(); + return write_ertr::now(); +} + +seastar::future<> PGBackend::create( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& txn) +{ + if (os.exists && !os.oi.is_whiteout() && + (osd_op.op.flags & CEPH_OSD_OP_FLAG_EXCL)) { + // this is an exclusive create + throw crimson::osd::make_error(-EEXIST); + } + + if (osd_op.indata.length()) { + // handle the legacy. `category` is no longer implemented. + try { + auto p = osd_op.indata.cbegin(); + std::string category; + decode(category, p); + } catch (buffer::error&) { + throw crimson::osd::invalid_argument(); + } + } + maybe_create_new_object(os, txn); + txn.nop(); + return seastar::now(); +} + +seastar::future<> PGBackend::remove(ObjectState& os, + ceph::os::Transaction& txn) +{ + // todo: snapset + txn.remove(coll->get_cid(), + ghobject_t{os.oi.soid, ghobject_t::NO_GEN, shard}); + os.oi.size = 0; + os.oi.new_object(); + os.exists = false; + // todo: update watchers + if (os.oi.is_whiteout()) { + os.oi.clear_flag(object_info_t::FLAG_WHITEOUT); + } + return seastar::now(); +} + +seastar::future<std::tuple<std::vector<hobject_t>, hobject_t>> +PGBackend::list_objects(const hobject_t& start, uint64_t limit) const +{ + if (__builtin_expect(stopping, false)) { + throw crimson::common::system_shutdown_exception(); + } + + auto gstart = start.is_min() ? 
ghobject_t{} : ghobject_t{start, 0, shard}; + return store->list_objects(coll, + gstart, + ghobject_t::get_max(), + limit) + .then([](auto ret) { + auto& [gobjects, next] = ret; + std::vector<hobject_t> objects; + boost::copy(gobjects | + boost::adaptors::filtered([](const ghobject_t& o) { + if (o.is_pgmeta()) { + return false; + } else if (o.hobj.is_temp()) { + return false; + } else { + return o.is_no_gen(); + } + }) | + boost::adaptors::transformed([](const ghobject_t& o) { + return o.hobj; + }), + std::back_inserter(objects)); + return seastar::make_ready_future<std::tuple<std::vector<hobject_t>, hobject_t>>( + std::make_tuple(objects, next.hobj)); + }); +} + +seastar::future<> PGBackend::setxattr( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& txn) +{ + if (local_conf()->osd_max_attr_size > 0 && + osd_op.op.xattr.value_len > local_conf()->osd_max_attr_size) { + throw crimson::osd::make_error(-EFBIG); + } + + const auto max_name_len = std::min<uint64_t>( + store->get_max_attr_name_length(), local_conf()->osd_max_attr_name_len); + if (osd_op.op.xattr.name_len > max_name_len) { + throw crimson::osd::make_error(-ENAMETOOLONG); + } + + maybe_create_new_object(os, txn); + + std::string name{"_"}; + ceph::bufferlist val; + { + auto bp = osd_op.indata.cbegin(); + bp.copy(osd_op.op.xattr.name_len, name); + bp.copy(osd_op.op.xattr.value_len, val); + } + logger().debug("setxattr on obj={} for attr={}", os.oi.soid, name); + + txn.setattr(coll->get_cid(), ghobject_t{os.oi.soid}, name, val); + return seastar::now(); + //ctx->delta_stats.num_wr++; +} + +PGBackend::get_attr_errorator::future<> PGBackend::getxattr( + const ObjectState& os, + OSDOp& osd_op) const +{ + std::string name; + ceph::bufferlist val; + { + auto bp = osd_op.indata.cbegin(); + std::string aname; + bp.copy(osd_op.op.xattr.name_len, aname); + name = "_" + aname; + } + logger().debug("getxattr on obj={} for attr={}", os.oi.soid, name); + return getxattr(os.oi.soid, name).safe_then([&osd_op] (ceph::bufferptr val) { + osd_op.outdata.clear(); + osd_op.outdata.push_back(std::move(val)); + osd_op.op.xattr.value_len = osd_op.outdata.length(); + return get_attr_errorator::now(); + //ctx->delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10); + }); + //ctx->delta_stats.num_rd++; +} + +PGBackend::get_attr_errorator::future<ceph::bufferptr> PGBackend::getxattr( + const hobject_t& soid, + std::string_view key) const +{ + if (__builtin_expect(stopping, false)) { + throw crimson::common::system_shutdown_exception(); + } + + return store->get_attr(coll, ghobject_t{soid}, key); +} + +PGBackend::get_attr_errorator::future<> PGBackend::get_xattrs( + const ObjectState& os, + OSDOp& osd_op) const +{ + if (__builtin_expect(stopping, false)) { + throw crimson::common::system_shutdown_exception(); + } + return store->get_attrs(coll, ghobject_t{os.oi.soid}).safe_then( + [&osd_op](auto&& attrs) { + std::vector<std::pair<std::string, bufferlist>> user_xattrs; + for (auto& [key, val] : attrs) { + if (key.size() > 1 && key[0] == '_') { + ceph::bufferlist bl; + bl.append(std::move(val)); + user_xattrs.emplace_back(key.substr(1), std::move(bl)); + } + } + ceph::encode(user_xattrs, osd_op.outdata); + return get_attr_errorator::now(); + }); +} + +PGBackend::rm_xattr_ertr::future<> PGBackend::rm_xattr( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& txn) +{ + if (__builtin_expect(stopping, false)) { + throw crimson::common::system_shutdown_exception(); + } + if (!os.exists || os.oi.is_whiteout()) { + 
logger().debug("{}: {} DNE", __func__, os.oi.soid); + return crimson::ct_error::enoent::make(); + } + auto bp = osd_op.indata.cbegin(); + string attr_name{"_"}; + bp.copy(osd_op.op.xattr.name_len, attr_name); + txn.rmattr(coll->get_cid(), ghobject_t{os.oi.soid}, attr_name); + return rm_xattr_ertr::now(); +} + +using get_omap_ertr = + crimson::os::FuturizedStore::read_errorator::extend< + crimson::ct_error::enodata>; +static +get_omap_ertr::future< + crimson::os::FuturizedStore::omap_values_t> +maybe_get_omap_vals_by_keys( + crimson::os::FuturizedStore* store, + const crimson::os::CollectionRef& coll, + const object_info_t& oi, + const std::set<std::string>& keys_to_get) +{ + if (oi.is_omap()) { + return store->omap_get_values(coll, ghobject_t{oi.soid}, keys_to_get); + } else { + return crimson::ct_error::enodata::make(); + } +} + +static +get_omap_ertr::future< + std::tuple<bool, crimson::os::FuturizedStore::omap_values_t>> +maybe_get_omap_vals( + crimson::os::FuturizedStore* store, + const crimson::os::CollectionRef& coll, + const object_info_t& oi, + const std::string& start_after) +{ + if (oi.is_omap()) { + return store->omap_get_values(coll, ghobject_t{oi.soid}, start_after); + } else { + return crimson::ct_error::enodata::make(); + } +} + +PGBackend::ll_read_errorator::future<ceph::bufferlist> +PGBackend::omap_get_header( + const crimson::os::CollectionRef& c, + const ghobject_t& oid) const +{ + return store->omap_get_header(c, oid); +} + +PGBackend::ll_read_errorator::future<> +PGBackend::omap_get_header( + const ObjectState& os, + OSDOp& osd_op) const +{ + return omap_get_header(coll, ghobject_t{os.oi.soid}).safe_then( + [&osd_op] (ceph::bufferlist&& header) { + osd_op.outdata = std::move(header); + return seastar::now(); + }); +} + +PGBackend::ll_read_errorator::future<> +PGBackend::omap_get_keys( + const ObjectState& os, + OSDOp& osd_op) const +{ + if (__builtin_expect(stopping, false)) { + throw crimson::common::system_shutdown_exception(); + } + if (!os.exists || os.oi.is_whiteout()) { + logger().debug("{}: object does not exist: {}", os.oi.soid); + return crimson::ct_error::enoent::make(); + } + std::string start_after; + uint64_t max_return; + try { + auto p = osd_op.indata.cbegin(); + decode(start_after, p); + decode(max_return, p); + } catch (buffer::error&) { + throw crimson::osd::invalid_argument{}; + } + max_return = + std::min(max_return, local_conf()->osd_max_omap_entries_per_request); + + // TODO: truly chunk the reading + return maybe_get_omap_vals(store, coll, os.oi, start_after).safe_then( + [=, &osd_op](auto ret) { + ceph::bufferlist result; + bool truncated = false; + uint32_t num = 0; + for (auto &[key, val] : std::get<1>(ret)) { + if (num >= max_return || + result.length() >= local_conf()->osd_max_omap_bytes_per_request) { + truncated = true; + break; + } + encode(key, result); + ++num; + } + encode(num, osd_op.outdata); + osd_op.outdata.claim_append(result); + encode(truncated, osd_op.outdata); + return seastar::now(); + }).handle_error( + crimson::ct_error::enodata::handle([&osd_op] { + uint32_t num = 0; + bool truncated = false; + encode(num, osd_op.outdata); + encode(truncated, osd_op.outdata); + return seastar::now(); + }), + ll_read_errorator::pass_further{} + ); + // TODO: + //ctx->delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10); + //ctx->delta_stats.num_rd++; +} + +PGBackend::ll_read_errorator::future<> +PGBackend::omap_get_vals( + const ObjectState& os, + OSDOp& osd_op) const +{ + if (__builtin_expect(stopping, false)) { + throw 
crimson::common::system_shutdown_exception(); + } + + std::string start_after; + uint64_t max_return; + std::string filter_prefix; + try { + auto p = osd_op.indata.cbegin(); + decode(start_after, p); + decode(max_return, p); + decode(filter_prefix, p); + } catch (buffer::error&) { + throw crimson::osd::invalid_argument{}; + } + + max_return = \ + std::min(max_return, local_conf()->osd_max_omap_entries_per_request); + + // TODO: truly chunk the reading + return maybe_get_omap_vals(store, coll, os.oi, start_after).safe_then( + [=, &osd_op] (auto&& ret) { + auto [done, vals] = std::move(ret); + assert(done); + ceph::bufferlist result; + bool truncated = false; + uint32_t num = 0; + auto iter = filter_prefix > start_after ? vals.lower_bound(filter_prefix) + : std::begin(vals); + for (; iter != std::end(vals); ++iter) { + const auto& [key, value] = *iter; + if (key.substr(0, filter_prefix.size()) != filter_prefix) { + break; + } else if (num >= max_return || + result.length() >= local_conf()->osd_max_omap_bytes_per_request) { + truncated = true; + break; + } + encode(key, result); + encode(value, result); + ++num; + } + encode(num, osd_op.outdata); + osd_op.outdata.claim_append(result); + encode(truncated, osd_op.outdata); + return ll_read_errorator::now(); + }).handle_error( + crimson::ct_error::enodata::handle([&osd_op] { + encode(uint32_t{0} /* num */, osd_op.outdata); + encode(bool{false} /* truncated */, osd_op.outdata); + return ll_read_errorator::now(); + }), + ll_read_errorator::pass_further{} + ); + + // TODO: + //ctx->delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10); + //ctx->delta_stats.num_rd++; +} + +PGBackend::ll_read_errorator::future<> +PGBackend::omap_get_vals_by_keys( + const ObjectState& os, + OSDOp& osd_op) const +{ + if (__builtin_expect(stopping, false)) { + throw crimson::common::system_shutdown_exception(); + } + if (!os.exists || os.oi.is_whiteout()) { + logger().debug("{}: object does not exist: {}", os.oi.soid); + return crimson::ct_error::enoent::make(); + } + + std::set<std::string> keys_to_get; + try { + auto p = osd_op.indata.cbegin(); + decode(keys_to_get, p); + } catch (buffer::error&) { + throw crimson::osd::invalid_argument(); + } + return maybe_get_omap_vals_by_keys(store, coll, os.oi, keys_to_get).safe_then( + [&osd_op] (crimson::os::FuturizedStore::omap_values_t&& vals) { + encode(vals, osd_op.outdata); + return ll_read_errorator::now(); + }).handle_error( + crimson::ct_error::enodata::handle([&osd_op] { + uint32_t num = 0; + encode(num, osd_op.outdata); + return ll_read_errorator::now(); + }), + ll_read_errorator::pass_further{} + ); + + // TODO: + //ctx->delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10); + //ctx->delta_stats.num_rd++; +} + +seastar::future<> PGBackend::omap_set_vals( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& txn, + osd_op_params_t& osd_op_params) +{ + maybe_create_new_object(os, txn); + + ceph::bufferlist to_set_bl; + try { + auto p = osd_op.indata.cbegin(); + decode_str_str_map_to_bl(p, &to_set_bl); + } catch (buffer::error&) { + throw crimson::osd::invalid_argument{}; + } + + txn.omap_setkeys(coll->get_cid(), ghobject_t{os.oi.soid}, to_set_bl); + + // TODO: + //ctx->clean_regions.mark_omap_dirty(); + + // TODO: + //ctx->delta_stats.num_wr++; + //ctx->delta_stats.num_wr_kb += shift_round_up(to_set_bl.length(), 10); + os.oi.set_flag(object_info_t::FLAG_OMAP); + os.oi.clear_omap_digest(); + osd_op_params.clean_regions.mark_omap_dirty(); + return seastar::now(); +} + 
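// (annotation, not part of the original change) The omap_get_keys() and
// omap_get_vals() paths above share one pagination convention: the client
// supplies a start key and a maximum entry count, the count is clamped to
// osd_max_omap_entries_per_request, and the reply encodes the number of
// entries, the entries themselves, and finally a `truncated` flag telling the
// client to continue from the last key it received.  Below is a minimal
// sketch of that convention over plain std:: types; the byte budget
// (osd_max_omap_bytes_per_request) and omap_get_vals()'s filter_prefix
// handling are omitted, and the names Page/list_page are invented for the
// example.

#include <cstddef>
#include <map>
#include <string>
#include <utility>
#include <vector>

struct Page {
  std::vector<std::pair<std::string, std::string>> entries;
  bool truncated = false;   // more keys remain after the last returned entry
};

Page list_page(const std::map<std::string, std::string>& omap,
               const std::string& start_after,
               std::size_t max_return)
{
  Page page;
  // "start_after" is exclusive: resume strictly past the last key seen.
  for (auto it = omap.upper_bound(start_after); it != omap.end(); ++it) {
    if (page.entries.size() >= max_return) {
      page.truncated = true;  // caller should issue another request
      break;
    }
    page.entries.push_back(*it);
  }
  return page;
}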
+seastar::future<> PGBackend::omap_set_header( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& txn) +{ + maybe_create_new_object(os, txn); + txn.omap_setheader(coll->get_cid(), ghobject_t{os.oi.soid}, osd_op.indata); + //TODO: + //ctx->clean_regions.mark_omap_dirty(); + //ctx->delta_stats.num_wr++; + os.oi.set_flag(object_info_t::FLAG_OMAP); + os.oi.clear_omap_digest(); + return seastar::now(); +} + +seastar::future<> PGBackend::omap_remove_range( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& txn) +{ + std::string key_begin, key_end; + try { + auto p = osd_op.indata.cbegin(); + decode(key_begin, p); + decode(key_end, p); + } catch (buffer::error& e) { + throw crimson::osd::invalid_argument{}; + } + txn.omap_rmkeyrange(coll->get_cid(), ghobject_t{os.oi.soid}, key_begin, key_end); + //TODO: + //ctx->delta_stats.num_wr++; + os.oi.clear_omap_digest(); + return seastar::now(); +} + +PGBackend::omap_clear_ertr::future<> +PGBackend::omap_clear( + ObjectState& os, + OSDOp& osd_op, + ceph::os::Transaction& txn, + osd_op_params_t& osd_op_params) +{ + if (__builtin_expect(stopping, false)) { + throw crimson::common::system_shutdown_exception(); + } + if (!os.exists || os.oi.is_whiteout()) { + logger().debug("{}: object does not exist: {}", os.oi.soid); + return crimson::ct_error::enoent::make(); + } + if (!os.oi.is_omap()) { + return omap_clear_ertr::now(); + } + txn.omap_clear(coll->get_cid(), ghobject_t{os.oi.soid}); + osd_op_params.clean_regions.mark_omap_dirty(); + os.oi.clear_omap_digest(); + os.oi.clear_flag(object_info_t::FLAG_OMAP); + return omap_clear_ertr::now(); +} + +seastar::future<struct stat> PGBackend::stat( + CollectionRef c, + const ghobject_t& oid) const +{ + return store->stat(c, oid); +} + +seastar::future<std::map<uint64_t, uint64_t>> +PGBackend::fiemap( + CollectionRef c, + const ghobject_t& oid, + uint64_t off, + uint64_t len) +{ + return store->fiemap(c, oid, off, len); +} + +void PGBackend::on_activate_complete() { + peering.reset(); +} + diff --git a/src/crimson/osd/pg_backend.h b/src/crimson/osd/pg_backend.h new file mode 100644 index 000000000..d8fa8b2ac --- /dev/null +++ b/src/crimson/osd/pg_backend.h @@ -0,0 +1,235 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <map> +#include <memory> +#include <string> +#include <boost/smart_ptr/local_shared_ptr.hpp> +#include <boost/container/flat_set.hpp> + +#include "crimson/os/futurized_store.h" +#include "crimson/os/futurized_collection.h" +#include "crimson/osd/acked_peers.h" +#include "crimson/osd/pg.h" +#include "crimson/common/shared_lru.h" +#include "messages/MOSDOp.h" +#include "messages/MOSDOpReply.h" +#include "os/Transaction.h" +#include "osd/osd_types.h" +#include "crimson/osd/object_context.h" +#include "crimson/osd/osd_operation.h" +#include "crimson/osd/osd_operations/osdop_params.h" + +struct hobject_t; + +namespace ceph::os { + class Transaction; +} + +namespace crimson::osd { + class ShardServices; +} + +class PGBackend +{ +protected: + using CollectionRef = crimson::os::CollectionRef; + using ec_profile_t = std::map<std::string, std::string>; + // low-level read errorator + using ll_read_errorator = crimson::os::FuturizedStore::read_errorator; + +public: + using load_metadata_ertr = crimson::errorator< + crimson::ct_error::object_corrupted>; + PGBackend(shard_id_t shard, CollectionRef coll, crimson::os::FuturizedStore* store); + virtual ~PGBackend() = default; + static 
std::unique_ptr<PGBackend> create(pg_t pgid, + const pg_shard_t pg_shard, + const pg_pool_t& pool, + crimson::os::CollectionRef coll, + crimson::osd::ShardServices& shard_services, + const ec_profile_t& ec_profile); + using attrs_t = + std::map<std::string, ceph::bufferptr, std::less<>>; + using read_errorator = ll_read_errorator::extend< + crimson::ct_error::object_corrupted>; + read_errorator::future<> read( + const ObjectState& os, + OSDOp& osd_op); + read_errorator::future<> sparse_read( + const ObjectState& os, + OSDOp& osd_op); + using checksum_errorator = ll_read_errorator::extend< + crimson::ct_error::object_corrupted, + crimson::ct_error::invarg>; + checksum_errorator::future<> checksum( + const ObjectState& os, + OSDOp& osd_op); + using cmp_ext_errorator = ll_read_errorator::extend< + crimson::ct_error::invarg>; + cmp_ext_errorator::future<> cmp_ext( + const ObjectState& os, + OSDOp& osd_op); + using stat_errorator = crimson::errorator<crimson::ct_error::enoent>; + stat_errorator::future<> stat( + const ObjectState& os, + OSDOp& osd_op); + + // TODO: switch the entire write family to errorator. + using write_ertr = crimson::errorator< + crimson::ct_error::file_too_large>; + seastar::future<> create( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& trans); + seastar::future<> remove( + ObjectState& os, + ceph::os::Transaction& txn); + seastar::future<> write( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& trans, + osd_op_params_t& osd_op_params); + seastar::future<> write_same( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& trans, + osd_op_params_t& osd_op_params); + seastar::future<> writefull( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& trans, + osd_op_params_t& osd_op_params); + using append_errorator = crimson::errorator< + crimson::ct_error::invarg>; + append_errorator::future<> append( + ObjectState& os, + OSDOp& osd_op, + ceph::os::Transaction& trans, + osd_op_params_t& osd_op_params); + write_ertr::future<> truncate( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& trans, + osd_op_params_t& osd_op_params); + write_ertr::future<> zero( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& trans, + osd_op_params_t& osd_op_params); + seastar::future<crimson::osd::acked_peers_t> mutate_object( + std::set<pg_shard_t> pg_shards, + crimson::osd::ObjectContextRef &&obc, + ceph::os::Transaction&& txn, + const osd_op_params_t& osd_op_p, + epoch_t min_epoch, + epoch_t map_epoch, + std::vector<pg_log_entry_t>&& log_entries); + seastar::future<std::tuple<std::vector<hobject_t>, hobject_t>> list_objects( + const hobject_t& start, + uint64_t limit) const; + seastar::future<> setxattr( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& trans); + using get_attr_errorator = crimson::os::FuturizedStore::get_attr_errorator; + get_attr_errorator::future<> getxattr( + const ObjectState& os, + OSDOp& osd_op) const; + get_attr_errorator::future<ceph::bufferptr> getxattr( + const hobject_t& soid, + std::string_view key) const; + get_attr_errorator::future<> get_xattrs( + const ObjectState& os, + OSDOp& osd_op) const; + using rm_xattr_ertr = crimson::errorator<crimson::ct_error::enoent>; + rm_xattr_ertr::future<> rm_xattr( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& trans); + seastar::future<struct stat> stat( + CollectionRef c, + const ghobject_t& oid) const; + seastar::future<std::map<uint64_t, uint64_t>> fiemap( + CollectionRef c, + const 
ghobject_t& oid, + uint64_t off, + uint64_t len); + + // OMAP + ll_read_errorator::future<> omap_get_keys( + const ObjectState& os, + OSDOp& osd_op) const; + ll_read_errorator::future<> omap_get_vals( + const ObjectState& os, + OSDOp& osd_op) const; + ll_read_errorator::future<> omap_get_vals_by_keys( + const ObjectState& os, + OSDOp& osd_op) const; + seastar::future<> omap_set_vals( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& trans, + osd_op_params_t& osd_op_params); + ll_read_errorator::future<ceph::bufferlist> omap_get_header( + const crimson::os::CollectionRef& c, + const ghobject_t& oid) const; + ll_read_errorator::future<> omap_get_header( + const ObjectState& os, + OSDOp& osd_op) const; + seastar::future<> omap_set_header( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& trans); + seastar::future<> omap_remove_range( + ObjectState& os, + const OSDOp& osd_op, + ceph::os::Transaction& trans); + using omap_clear_ertr = crimson::errorator<crimson::ct_error::enoent>; + omap_clear_ertr::future<> omap_clear( + ObjectState& os, + OSDOp& osd_op, + ceph::os::Transaction& trans, + osd_op_params_t& osd_op_params); + + virtual void got_rep_op_reply(const MOSDRepOpReply&) {} + virtual seastar::future<> stop() = 0; + struct peering_info_t { + bool is_primary; + }; + virtual void on_actingset_changed(peering_info_t pi) = 0; + virtual void on_activate_complete(); +protected: + const shard_id_t shard; + CollectionRef coll; + crimson::os::FuturizedStore* store; + bool stopping = false; + std::optional<peering_info_t> peering; +public: + struct loaded_object_md_t { + ObjectState os; + std::optional<SnapSet> ss; + using ref = std::unique_ptr<loaded_object_md_t>; + }; + load_metadata_ertr::future<loaded_object_md_t::ref> load_metadata( + const hobject_t &oid); + +private: + virtual ll_read_errorator::future<ceph::bufferlist> _read( + const hobject_t& hoid, + size_t offset, + size_t length, + uint32_t flags) = 0; + + bool maybe_create_new_object(ObjectState& os, ceph::os::Transaction& txn); + virtual seastar::future<crimson::osd::acked_peers_t> + _submit_transaction(std::set<pg_shard_t>&& pg_shards, + const hobject_t& hoid, + ceph::os::Transaction&& txn, + const osd_op_params_t& osd_op_p, + epoch_t min_epoch, epoch_t max_epoch, + std::vector<pg_log_entry_t>&& log_entries) = 0; + friend class ReplicatedRecoveryBackend; +}; diff --git a/src/crimson/osd/pg_map.cc b/src/crimson/osd/pg_map.cc new file mode 100644 index 000000000..08071f260 --- /dev/null +++ b/src/crimson/osd/pg_map.cc @@ -0,0 +1,77 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "crimson/osd/pg_map.h" + +#include "crimson/osd/pg.h" +#include "common/Formatter.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_osd); + } +} + +namespace crimson::osd { + +PGMap::PGCreationState::PGCreationState(spg_t pgid) : pgid(pgid) {} +PGMap::PGCreationState::~PGCreationState() {} + +void PGMap::PGCreationState::dump_detail(Formatter *f) const +{ + f->dump_stream("pgid") << pgid; + f->dump_bool("creating", creating); +} + +std::pair<blocking_future<Ref<PG>>, bool> PGMap::wait_for_pg(spg_t pgid) +{ + if (auto pg = get_pg(pgid)) { + return make_pair(make_ready_blocking_future<Ref<PG>>(pg), true); + } else { + auto &state = pgs_creating.emplace(pgid, pgid).first->second; + return make_pair( + state.make_blocking_future(state.promise.get_shared_future()), + state.creating); + } +} + +Ref<PG> PGMap::get_pg(spg_t pgid) +{ 
+ if (auto pg = pgs.find(pgid); pg != pgs.end()) { + return pg->second; + } else { + return nullptr; + } +} + +void PGMap::set_creating(spg_t pgid) +{ + logger().debug("Creating {}", pgid); + ceph_assert(pgs.count(pgid) == 0); + auto pg = pgs_creating.find(pgid); + ceph_assert(pg != pgs_creating.end()); + ceph_assert(pg->second.creating == false); + pg->second.creating = true; +} + +void PGMap::pg_created(spg_t pgid, Ref<PG> pg) +{ + logger().debug("Created {}", pgid); + ceph_assert(!pgs.count(pgid)); + pgs.emplace(pgid, pg); + + auto state = pgs_creating.find(pgid); + ceph_assert(state != pgs_creating.end()); + state->second.promise.set_value(pg); + pgs_creating.erase(pgid); +} + +void PGMap::pg_loaded(spg_t pgid, Ref<PG> pg) +{ + ceph_assert(!pgs.count(pgid)); + pgs.emplace(pgid, pg); +} + +PGMap::~PGMap() {} + +} diff --git a/src/crimson/osd/pg_map.h b/src/crimson/osd/pg_map.h new file mode 100644 index 000000000..b3fe4b562 --- /dev/null +++ b/src/crimson/osd/pg_map.h @@ -0,0 +1,76 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <map> + +#include <seastar/core/future.hh> +#include <seastar/core/shared_future.hh> + +#include "include/types.h" +#include "crimson/common/type_helpers.h" +#include "crimson/osd/osd_operation.h" +#include "crimson/osd/pg.h" +#include "osd/osd_types.h" + +namespace crimson::osd { +class PG; + +class PGMap { + struct PGCreationState : BlockerT<PGCreationState> { + static constexpr const char * type_name = "PGCreation"; + + void dump_detail(Formatter *f) const final; + + spg_t pgid; + seastar::shared_promise<Ref<PG>> promise; + bool creating = false; + PGCreationState(spg_t pgid); + + PGCreationState(const PGCreationState &) = delete; + PGCreationState(PGCreationState &&) = delete; + PGCreationState &operator=(const PGCreationState &) = delete; + PGCreationState &operator=(PGCreationState &&) = delete; + + ~PGCreationState(); + }; + + std::map<spg_t, PGCreationState> pgs_creating; + using pgs_t = std::map<spg_t, Ref<PG>>; + pgs_t pgs; + +public: + /** + * Get future for pg with a bool indicating whether it's already being + * created. 
+ */ + std::pair<blocking_future<Ref<PG>>, bool> wait_for_pg(spg_t pgid); + + /** + * get PG in non-blocking manner + */ + Ref<PG> get_pg(spg_t pgid); + + /** + * Set creating + */ + void set_creating(spg_t pgid); + + /** + * Set newly created pg + */ + void pg_created(spg_t pgid, Ref<PG> pg); + + /** + * Add newly loaded pg + */ + void pg_loaded(spg_t pgid, Ref<PG> pg); + + pgs_t& get_pgs() { return pgs; } + const pgs_t& get_pgs() const { return pgs; } + PGMap() = default; + ~PGMap(); +}; + +} diff --git a/src/crimson/osd/pg_meta.cc b/src/crimson/osd/pg_meta.cc new file mode 100644 index 000000000..ad5385963 --- /dev/null +++ b/src/crimson/osd/pg_meta.cc @@ -0,0 +1,108 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "pg_meta.h" + +#include <string_view> + +#include "crimson/os/futurized_collection.h" +#include "crimson/os/futurized_store.h" + +// prefix pgmeta_oid keys with _ so that PGLog::read_log_and_missing() can +// easily skip them +using crimson::os::FuturizedStore; + +PGMeta::PGMeta(FuturizedStore* store, spg_t pgid) + : store{store}, + pgid{pgid} +{} + +namespace { + template<typename T> + std::optional<T> find_value(const FuturizedStore::omap_values_t& values, + string_view key) + { + auto found = values.find(key); + if (found == values.end()) { + return {}; + } + auto p = found->second.cbegin(); + T value; + decode(value, p); + return std::make_optional(std::move(value)); + } +} + +seastar::future<epoch_t> PGMeta::get_epoch() +{ + return store->open_collection(coll_t{pgid}).then([this](auto ch) { + return store->omap_get_values(ch, + pgid.make_pgmeta_oid(), + {string{infover_key}, + string{epoch_key}}).safe_then( + [](auto&& values) { + { + // sanity check + auto infover = find_value<__u8>(values, infover_key); + assert(infover); + if (*infover < 10) { + throw std::runtime_error("incompatible pg meta"); + } + } + { + auto epoch = find_value<epoch_t>(values, epoch_key); + assert(epoch); + return seastar::make_ready_future<epoch_t>(*epoch); + } + }, + FuturizedStore::read_errorator::assert_all{ + "PGMeta::get_epoch: unable to read pgmeta" + }); + }); +} + +seastar::future<std::tuple<pg_info_t, PastIntervals>> PGMeta::load() +{ + return store->open_collection(coll_t{pgid}).then([this](auto ch) { + return store->omap_get_values(ch, + pgid.make_pgmeta_oid(), + {string{infover_key}, + string{info_key}, + string{biginfo_key}, + string{fastinfo_key}}); + }).safe_then([](auto&& values) { + { + // sanity check + auto infover = find_value<__u8>(values, infover_key); + assert(infover); + if (infover < 10) { + throw std::runtime_error("incompatible pg meta"); + } + } + pg_info_t info; + { + auto found = find_value<pg_info_t>(values, info_key); + assert(found); + info = *std::move(found); + } + PastIntervals past_intervals; + { + using biginfo_t = std::pair<PastIntervals, decltype(info.purged_snaps)>; + auto big_info = find_value<biginfo_t>(values, biginfo_key); + assert(big_info); + past_intervals = std::move(big_info->first); + info.purged_snaps = std::move(big_info->second); + } + { + auto fast_info = find_value<pg_fast_info_t>(values, fastinfo_key); + if (fast_info) { + fast_info->try_apply_to(&info); + } + } + return seastar::make_ready_future<std::tuple<pg_info_t, PastIntervals>>( + std::make_tuple(std::move(info), std::move(past_intervals))); + }, + FuturizedStore::read_errorator::assert_all{ + "PGMeta::load: unable to read pgmeta" + }); +} diff --git a/src/crimson/osd/pg_meta.h b/src/crimson/osd/pg_meta.h new file 
mode 100644 index 000000000..e0aa02716 --- /dev/null +++ b/src/crimson/osd/pg_meta.h @@ -0,0 +1,23 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <tuple> +#include <seastar/core/future.hh> +#include "osd/osd_types.h" + +namespace crimson::os { + class FuturizedStore; +} + +/// PG related metadata +class PGMeta +{ + crimson::os::FuturizedStore* store; + const spg_t pgid; +public: + PGMeta(crimson::os::FuturizedStore *store, spg_t pgid); + seastar::future<epoch_t> get_epoch(); + seastar::future<std::tuple<pg_info_t, PastIntervals>> load(); +}; diff --git a/src/crimson/osd/pg_recovery.cc b/src/crimson/osd/pg_recovery.cc new file mode 100644 index 000000000..7d70b5e8f --- /dev/null +++ b/src/crimson/osd/pg_recovery.cc @@ -0,0 +1,550 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <fmt/format.h> +#include <fmt/ostream.h> + +#include "crimson/common/type_helpers.h" +#include "crimson/osd/backfill_facades.h" +#include "crimson/osd/osd_operations/background_recovery.h" +#include "crimson/osd/osd_operations/peering_event.h" +#include "crimson/osd/pg.h" +#include "crimson/osd/pg_backend.h" +#include "crimson/osd/pg_recovery.h" + +#include "messages/MOSDPGPull.h" +#include "messages/MOSDPGPush.h" +#include "messages/MOSDPGPushReply.h" +#include "messages/MOSDPGRecoveryDelete.h" +#include "messages/MOSDPGRecoveryDeleteReply.h" + +#include "osd/osd_types.h" +#include "osd/PeeringState.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_osd); + } +} + +void PGRecovery::start_pglogbased_recovery() +{ + using PglogBasedRecovery = crimson::osd::PglogBasedRecovery; + (void) pg->get_shard_services().start_operation<PglogBasedRecovery>( + static_cast<crimson::osd::PG*>(pg), + pg->get_shard_services(), + pg->get_osdmap_epoch()); +} + +crimson::osd::blocking_future<bool> +PGRecovery::start_recovery_ops(size_t max_to_start) +{ + assert(pg->is_primary()); + assert(pg->is_peered()); + assert(pg->is_recovering()); + // in ceph-osd the do_recovery() path handles both the pg log-based + // recovery and the backfill, albeit they are separated at the layer + // of PeeringState. In crimson-osd backfill has been cut from it, so + // and do_recovery() is actually solely for pg log-based recovery. + // At the time of writing it's considered to move it to FSM and fix + // the naming as well. 
+ assert(!pg->is_backfilling()); + assert(!pg->get_peering_state().is_deleting()); + + std::vector<crimson::osd::blocking_future<>> started; + started.reserve(max_to_start); + max_to_start -= start_primary_recovery_ops(max_to_start, &started); + if (max_to_start > 0) { + max_to_start -= start_replica_recovery_ops(max_to_start, &started); + } + return crimson::osd::join_blocking_futures(std::move(started)).then( + [this] { + bool done = !pg->get_peering_state().needs_recovery(); + if (done) { + logger().debug("start_recovery_ops: AllReplicasRecovered for pg: {}", + pg->get_pgid()); + using LocalPeeringEvent = crimson::osd::LocalPeeringEvent; + if (!pg->get_peering_state().needs_backfill()) { + logger().debug("start_recovery_ops: AllReplicasRecovered for pg: {}", + pg->get_pgid()); + (void) pg->get_shard_services().start_operation<LocalPeeringEvent>( + static_cast<crimson::osd::PG*>(pg), + pg->get_shard_services(), + pg->get_pg_whoami(), + pg->get_pgid(), + pg->get_osdmap_epoch(), + pg->get_osdmap_epoch(), + PeeringState::AllReplicasRecovered{}); + } else { + logger().debug("start_recovery_ops: RequestBackfill for pg: {}", + pg->get_pgid()); + (void) pg->get_shard_services().start_operation<LocalPeeringEvent>( + static_cast<crimson::osd::PG*>(pg), + pg->get_shard_services(), + pg->get_pg_whoami(), + pg->get_pgid(), + pg->get_osdmap_epoch(), + pg->get_osdmap_epoch(), + PeeringState::RequestBackfill{}); + } + } + return seastar::make_ready_future<bool>(!done); + }); +} + +size_t PGRecovery::start_primary_recovery_ops( + size_t max_to_start, + std::vector<crimson::osd::blocking_future<>> *out) +{ + if (!pg->is_recovering()) { + return 0; + } + + if (!pg->get_peering_state().have_missing()) { + pg->get_peering_state().local_recovery_complete(); + return 0; + } + + const auto &missing = pg->get_peering_state().get_pg_log().get_missing(); + + logger().info("{} recovering {} in pg {}, missing {}", __func__, + pg->get_recovery_backend()->total_recovering(), + *static_cast<crimson::osd::PG*>(pg), + missing); + + unsigned started = 0; + int skipped = 0; + + map<version_t, hobject_t>::const_iterator p = + missing.get_rmissing().lower_bound(pg->get_peering_state().get_pg_log().get_log().last_requested); + while (started < max_to_start && p != missing.get_rmissing().end()) { + // TODO: chain futures here to enable yielding to scheduler? + hobject_t soid; + version_t v = p->first; + + auto it_objects = pg->get_peering_state().get_pg_log().get_log().objects.find(p->second); + if (it_objects != pg->get_peering_state().get_pg_log().get_log().objects.end()) { + // look at log! + pg_log_entry_t *latest = it_objects->second; + assert(latest->is_update() || latest->is_delete()); + soid = latest->soid; + } else { + soid = p->second; + } + const pg_missing_item& item = missing.get_items().find(p->second)->second; + ++p; + + hobject_t head = soid.get_head(); + + logger().info( + "{} {} item.need {} {} {} {} {}", + __func__, + soid, + item.need, + missing.is_missing(soid) ? " (missing)":"", + missing.is_missing(head) ? " (missing head)":"", + pg->get_recovery_backend()->is_recovering(soid) ? " (recovering)":"", + pg->get_recovery_backend()->is_recovering(head) ? 
" (recovering head)":""); + + // TODO: handle lost/unfound + if (pg->get_recovery_backend()->is_recovering(soid)) { + auto& recovery_waiter = pg->get_recovery_backend()->get_recovering(soid); + out->push_back(recovery_waiter.wait_for_recovered_blocking()); + ++started; + } else if (pg->get_recovery_backend()->is_recovering(head)) { + ++skipped; + } else { + out->push_back(recover_missing(soid, item.need)); + ++started; + } + + if (!skipped) + pg->get_peering_state().set_last_requested(v); + } + + logger().info("{} started {} skipped {}", __func__, started, skipped); + + return started; +} + +size_t PGRecovery::start_replica_recovery_ops( + size_t max_to_start, + std::vector<crimson::osd::blocking_future<>> *out) +{ + if (!pg->is_recovering()) { + return 0; + } + uint64_t started = 0; + + assert(!pg->get_peering_state().get_acting_recovery_backfill().empty()); + + auto recovery_order = get_replica_recovery_order(); + for (auto &peer : recovery_order) { + assert(peer != pg->get_peering_state().get_primary()); + const auto& pm = pg->get_peering_state().get_peer_missing(peer); + + logger().debug("{}: peer osd.{} missing {} objects", __func__, + peer, pm.num_missing()); + logger().trace("{}: peer osd.{} missing {}", __func__, + peer, pm.get_items()); + + // recover oldest first + for (auto p = pm.get_rmissing().begin(); + p != pm.get_rmissing().end() && started < max_to_start; + ++p) { + const auto &soid = p->second; + + if (pg->get_peering_state().get_missing_loc().is_unfound(soid)) { + logger().debug("{}: object {} still unfound", __func__, soid); + continue; + } + + const pg_info_t &pi = pg->get_peering_state().get_peer_info(peer); + if (soid > pi.last_backfill) { + if (!pg->get_recovery_backend()->is_recovering(soid)) { + logger().error( + "{}: object {} in missing set for backfill (last_backfill {})" + " but not in recovering", + __func__, + soid, + pi.last_backfill); + ceph_abort(); + } + continue; + } + + if (pg->get_recovery_backend()->is_recovering(soid)) { + logger().debug("{}: already recovering object {}", __func__, soid); + auto& recovery_waiter = pg->get_recovery_backend()->get_recovering(soid); + out->push_back(recovery_waiter.wait_for_recovered_blocking()); + started++; + continue; + } + + if (pg->get_peering_state().get_missing_loc().is_deleted(soid)) { + logger().debug("{}: soid {} is a delete, removing", __func__, soid); + map<hobject_t,pg_missing_item>::const_iterator r = + pm.get_items().find(soid); + started += prep_object_replica_deletes( + soid, r->second.need, out); + continue; + } + + if (soid.is_snap() && + pg->get_peering_state().get_pg_log().get_missing().is_missing( + soid.get_head())) { + logger().debug("{}: head {} still missing on primary", __func__, + soid.get_head()); + continue; + } + + if (pg->get_peering_state().get_pg_log().get_missing().is_missing(soid)) { + logger().debug("{}: soid {} still missing on primary", __func__, soid); + continue; + } + + logger().debug("{}: recover_object_replicas({})", __func__,soid); + map<hobject_t,pg_missing_item>::const_iterator r = pm.get_items().find( + soid); + started += prep_object_replica_pushes( + soid, r->second.need, out); + } + } + + return started; +} + +crimson::osd::blocking_future<> PGRecovery::recover_missing( + const hobject_t &soid, eversion_t need) +{ + if (pg->get_peering_state().get_missing_loc().is_deleted(soid)) { + return pg->get_recovery_backend()->add_recovering(soid).make_blocking_future( + pg->get_recovery_backend()->recover_delete(soid, need)); + } else { + return 
pg->get_recovery_backend()->add_recovering(soid).make_blocking_future( + pg->get_recovery_backend()->recover_object(soid, need).handle_exception( + [=, soid = std::move(soid)] (auto e) { + on_failed_recover({ pg->get_pg_whoami() }, soid, need); + return seastar::make_ready_future<>(); + }) + ); + } +} + +size_t PGRecovery::prep_object_replica_deletes( + const hobject_t& soid, + eversion_t need, + std::vector<crimson::osd::blocking_future<>> *in_progress) +{ + in_progress->push_back( + pg->get_recovery_backend()->add_recovering(soid).make_blocking_future( + pg->get_recovery_backend()->push_delete(soid, need).then([=] { + object_stat_sum_t stat_diff; + stat_diff.num_objects_recovered = 1; + on_global_recover(soid, stat_diff, true); + return seastar::make_ready_future<>(); + }) + ) + ); + return 1; +} + +size_t PGRecovery::prep_object_replica_pushes( + const hobject_t& soid, + eversion_t need, + std::vector<crimson::osd::blocking_future<>> *in_progress) +{ + in_progress->push_back( + pg->get_recovery_backend()->add_recovering(soid).make_blocking_future( + pg->get_recovery_backend()->recover_object(soid, need).handle_exception( + [=, soid = std::move(soid)] (auto e) { + on_failed_recover({ pg->get_pg_whoami() }, soid, need); + return seastar::make_ready_future<>(); + }) + ) + ); + return 1; +} + +void PGRecovery::on_local_recover( + const hobject_t& soid, + const ObjectRecoveryInfo& recovery_info, + const bool is_delete, + ceph::os::Transaction& t) +{ + pg->get_peering_state().recover_got(soid, + recovery_info.version, is_delete, t); + + if (pg->is_primary()) { + if (!is_delete) { + auto& obc = pg->get_recovery_backend()->get_recovering(soid).obc; //TODO: move to pg backend? + obc->obs.exists = true; + obc->obs.oi = recovery_info.oi; + } + if (!pg->is_unreadable_object(soid)) { + pg->get_recovery_backend()->get_recovering(soid).set_readable(); + } + pg->publish_stats_to_osd(); + } +} + +void PGRecovery::on_global_recover ( + const hobject_t& soid, + const object_stat_sum_t& stat_diff, + const bool is_delete) +{ + logger().info("{} {}", __func__, soid); + pg->get_peering_state().object_recovered(soid, stat_diff); + pg->publish_stats_to_osd(); + auto& recovery_waiter = pg->get_recovery_backend()->get_recovering(soid); + if (!is_delete) + recovery_waiter.obc->drop_recovery_read(); + recovery_waiter.set_recovered(); + pg->get_recovery_backend()->remove_recovering(soid); +} + +void PGRecovery::on_failed_recover( + const set<pg_shard_t>& from, + const hobject_t& soid, + const eversion_t& v) +{ + for (auto pg_shard : from) { + if (pg_shard != pg->get_pg_whoami()) { + pg->get_peering_state().force_object_missing(pg_shard, soid, v); + } + } +} + +void PGRecovery::on_peer_recover( + pg_shard_t peer, + const hobject_t &oid, + const ObjectRecoveryInfo &recovery_info) +{ + crimson::get_logger(ceph_subsys_osd).debug( + "{}: {}, {} on {}", __func__, oid, + recovery_info.version, peer); + pg->get_peering_state().on_peer_recover(peer, oid, recovery_info.version); +} + +void PGRecovery::_committed_pushed_object(epoch_t epoch, + eversion_t last_complete) +{ + if (!pg->has_reset_since(epoch)) { + pg->get_peering_state().recovery_committed_to(last_complete); + } else { + crimson::get_logger(ceph_subsys_osd).debug( + "{} pg has changed, not touching last_complete_ondisk", + __func__); + } +} + +template <class EventT> +void PGRecovery::start_backfill_recovery(const EventT& evt) +{ + using BackfillRecovery = crimson::osd::BackfillRecovery; + std::ignore = pg->get_shard_services().start_operation<BackfillRecovery>( + 
static_cast<crimson::osd::PG*>(pg), + pg->get_shard_services(), + pg->get_osdmap_epoch(), + evt); +} + +void PGRecovery::request_replica_scan( + const pg_shard_t& target, + const hobject_t& begin, + const hobject_t& end) +{ + logger().debug("{}: target.osd={}", __func__, target.osd); + auto msg = make_message<MOSDPGScan>( + MOSDPGScan::OP_SCAN_GET_DIGEST, + pg->get_pg_whoami(), + pg->get_osdmap_epoch(), + pg->get_last_peering_reset(), + spg_t(pg->get_pgid().pgid, target.shard), + begin, + end); + std::ignore = pg->get_shard_services().send_to_osd( + target.osd, + std::move(msg), + pg->get_osdmap_epoch()); +} + +void PGRecovery::request_primary_scan( + const hobject_t& begin) +{ + logger().debug("{}", __func__); + using crimson::common::local_conf; + std::ignore = pg->get_recovery_backend()->scan_for_backfill( + begin, + local_conf()->osd_backfill_scan_min, + local_conf()->osd_backfill_scan_max + ).then([this] (BackfillInterval bi) { + logger().debug("request_primary_scan:{}", __func__); + using BackfillState = crimson::osd::BackfillState; + start_backfill_recovery(BackfillState::PrimaryScanned{ std::move(bi) }); + }); +} + +void PGRecovery::enqueue_push( + const hobject_t& obj, + const eversion_t& v) +{ + logger().debug("{}: obj={} v={}", + __func__, obj, v); + pg->get_recovery_backend()->add_recovering(obj); + std::ignore = pg->get_recovery_backend()->recover_object(obj, v).\ + handle_exception([] (auto) { + ceph_abort_msg("got exception on backfill's push"); + return seastar::make_ready_future<>(); + }).then([this, obj] { + logger().debug("enqueue_push:{}", __func__); + using BackfillState = crimson::osd::BackfillState; + start_backfill_recovery(BackfillState::ObjectPushed(std::move(obj))); + }); +} + +void PGRecovery::enqueue_drop( + const pg_shard_t& target, + const hobject_t& obj, + const eversion_t& v) +{ + // allocate a pair if target is seen for the first time + auto& req = backfill_drop_requests[target]; + if (!req) { + req = ceph::make_message<MOSDPGBackfillRemove>( + spg_t(pg->get_pgid().pgid, target.shard), pg->get_osdmap_epoch()); + } + req->ls.emplace_back(obj, v); +} + +void PGRecovery::maybe_flush() +{ + for (auto& [target, req] : backfill_drop_requests) { + std::ignore = pg->get_shard_services().send_to_osd( + target.osd, + std::move(req), + pg->get_osdmap_epoch()); + } + backfill_drop_requests.clear(); +} + +void PGRecovery::update_peers_last_backfill( + const hobject_t& new_last_backfill) +{ + logger().debug("{}: new_last_backfill={}", + __func__, new_last_backfill); + // If new_last_backfill == MAX, then we will send OP_BACKFILL_FINISH to + // all the backfill targets. Otherwise, we will move last_backfill up on + // those targets need it and send OP_BACKFILL_PROGRESS to them. + for (const auto& bt : pg->get_peering_state().get_backfill_targets()) { + if (const pg_info_t& pinfo = pg->get_peering_state().get_peer_info(bt); + new_last_backfill > pinfo.last_backfill) { + pg->get_peering_state().update_peer_last_backfill(bt, new_last_backfill); + auto m = make_message<MOSDPGBackfill>( + pinfo.last_backfill.is_max() ? 
MOSDPGBackfill::OP_BACKFILL_FINISH + : MOSDPGBackfill::OP_BACKFILL_PROGRESS, + pg->get_osdmap_epoch(), + pg->get_last_peering_reset(), + spg_t(pg->get_pgid().pgid, bt.shard)); + // Use default priority here, must match sub_op priority + // TODO: if pinfo.last_backfill.is_max(), then + // start_recovery_op(hobject_t::get_max()); + m->last_backfill = pinfo.last_backfill; + m->stats = pinfo.stats; + std::ignore = pg->get_shard_services().send_to_osd( + bt.osd, std::move(m), pg->get_osdmap_epoch()); + logger().info("{}: peer {} num_objects now {} / {}", + __func__, + bt, + pinfo.stats.stats.sum.num_objects, + pg->get_info().stats.stats.sum.num_objects); + } + } +} + +bool PGRecovery::budget_available() const +{ + // TODO: the limits! + return true; +} + +void PGRecovery::backfilled() +{ + using LocalPeeringEvent = crimson::osd::LocalPeeringEvent; + std::ignore = pg->get_shard_services().start_operation<LocalPeeringEvent>( + static_cast<crimson::osd::PG*>(pg), + pg->get_shard_services(), + pg->get_pg_whoami(), + pg->get_pgid(), + pg->get_osdmap_epoch(), + pg->get_osdmap_epoch(), + PeeringState::Backfilled{}); +} + +void PGRecovery::dispatch_backfill_event( + boost::intrusive_ptr<const boost::statechart::event_base> evt) +{ + logger().debug("{}", __func__); + backfill_state->process_event(evt); +} + +void PGRecovery::on_backfill_reserved() +{ + logger().debug("{}", __func__); + // PIMPL and dependency injection for the sake of unit-testability. + // I'm not worried about the performance here. + using BackfillState = crimson::osd::BackfillState; + backfill_state = std::make_unique<BackfillState>( + *this, + std::make_unique<crimson::osd::PeeringFacade>(pg->get_peering_state()), + std::make_unique<crimson::osd::PGFacade>( + *static_cast<crimson::osd::PG*>(pg))); + // yes, it's **not** backfilling yet. The PG_STATE_BACKFILLING + // will be set after on_backfill_reserved() returns. + // Backfill needs to take this into consideration when scheduling + // events -- they must be mutually exclusive with PeeringEvent + // instances. Otherwise the execution might begin without having + // the state updated.
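+  // The assert below guards exactly that: the initial Triggered event is
+  // queued while the PG is still not flagged as backfilling.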
+ ceph_assert(!pg->get_peering_state().is_backfilling()); + start_backfill_recovery(BackfillState::Triggered{}); +} diff --git a/src/crimson/osd/pg_recovery.h b/src/crimson/osd/pg_recovery.h new file mode 100644 index 000000000..86f259de5 --- /dev/null +++ b/src/crimson/osd/pg_recovery.h @@ -0,0 +1,113 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <seastar/core/future.hh> + +#include "crimson/osd/backfill_state.h" +#include "crimson/osd/osd_operation.h" +#include "crimson/osd/pg_recovery_listener.h" +#include "crimson/osd/scheduler/scheduler.h" +#include "crimson/osd/shard_services.h" + +#include "osd/object_state.h" + +class MOSDPGBackfillRemove; +class PGBackend; + +class PGRecovery : public crimson::osd::BackfillState::BackfillListener { +public: + PGRecovery(PGRecoveryListener* pg) : pg(pg) {} + virtual ~PGRecovery() {} + void start_pglogbased_recovery(); + + crimson::osd::blocking_future<bool> start_recovery_ops(size_t max_to_start); + void on_backfill_reserved(); + void dispatch_backfill_event( + boost::intrusive_ptr<const boost::statechart::event_base> evt); + + seastar::future<> stop() { return seastar::now(); } +private: + PGRecoveryListener* pg; + size_t start_primary_recovery_ops( + size_t max_to_start, + std::vector<crimson::osd::blocking_future<>> *out); + size_t start_replica_recovery_ops( + size_t max_to_start, + std::vector<crimson::osd::blocking_future<>> *out); + + std::vector<pg_shard_t> get_replica_recovery_order() const { + return pg->get_replica_recovery_order(); + } + crimson::osd::blocking_future<> recover_missing( + const hobject_t &soid, eversion_t need); + size_t prep_object_replica_deletes( + const hobject_t& soid, + eversion_t need, + std::vector<crimson::osd::blocking_future<>> *in_progress); + size_t prep_object_replica_pushes( + const hobject_t& soid, + eversion_t need, + std::vector<crimson::osd::blocking_future<>> *in_progress); + + void on_local_recover( + const hobject_t& soid, + const ObjectRecoveryInfo& recovery_info, + bool is_delete, + ceph::os::Transaction& t); + void on_global_recover ( + const hobject_t& soid, + const object_stat_sum_t& stat_diff, + bool is_delete); + void on_failed_recover( + const set<pg_shard_t>& from, + const hobject_t& soid, + const eversion_t& v); + void on_peer_recover( + pg_shard_t peer, + const hobject_t &oid, + const ObjectRecoveryInfo &recovery_info); + void _committed_pushed_object(epoch_t epoch, + eversion_t last_complete); + friend class ReplicatedRecoveryBackend; + friend class crimson::osd::UrgentRecovery; + seastar::future<> handle_pull(Ref<MOSDPGPull> m); + seastar::future<> handle_push(Ref<MOSDPGPush> m); + seastar::future<> handle_push_reply(Ref<MOSDPGPushReply> m); + seastar::future<> handle_recovery_delete(Ref<MOSDPGRecoveryDelete> m); + seastar::future<> handle_recovery_delete_reply( + Ref<MOSDPGRecoveryDeleteReply> m); + seastar::future<> handle_pull_response(Ref<MOSDPGPush> m); + seastar::future<> handle_scan(MOSDPGScan& m); + + // backfill begin + std::unique_ptr<crimson::osd::BackfillState> backfill_state; + std::map<pg_shard_t, + ceph::ref_t<MOSDPGBackfillRemove>> backfill_drop_requests; + + template <class EventT> + void start_backfill_recovery( + const EventT& evt); + void request_replica_scan( + const pg_shard_t& target, + const hobject_t& begin, + const hobject_t& end) final; + void request_primary_scan( + const hobject_t& begin) final; + void enqueue_push( + const hobject_t& obj, + const eversion_t& v) final; + 
void enqueue_drop( + const pg_shard_t& target, + const hobject_t& obj, + const eversion_t& v) final; + void maybe_flush() final; + void update_peers_last_backfill( + const hobject_t& new_last_backfill) final; + bool budget_available() const final; + void backfilled() final; + friend crimson::osd::BackfillState::PGFacade; + friend crimson::osd::PG; + // backfill end +}; diff --git a/src/crimson/osd/pg_recovery_listener.h b/src/crimson/osd/pg_recovery_listener.h new file mode 100644 index 000000000..c922b9956 --- /dev/null +++ b/src/crimson/osd/pg_recovery_listener.h @@ -0,0 +1,39 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <seastar/core/future.hh> + +#include "common/hobject.h" +#include "include/types.h" +#include "osd/osd_types.h" + +namespace crimson::osd { + class ShardServices; +}; + +class RecoveryBackend; +class PGRecovery; + +class PGRecoveryListener { +public: + virtual crimson::osd::ShardServices& get_shard_services() = 0; + virtual PGRecovery* get_recovery_handler() = 0; + virtual epoch_t get_osdmap_epoch() const = 0; + virtual bool is_primary() const = 0; + virtual bool is_peered() const = 0; + virtual bool is_recovering() const = 0; + virtual bool is_backfilling() const = 0; + virtual PeeringState& get_peering_state() = 0; + virtual const pg_shard_t& get_pg_whoami() const = 0; + virtual const spg_t& get_pgid() const = 0; + virtual RecoveryBackend* get_recovery_backend() = 0; + virtual bool is_unreadable_object(const hobject_t&, eversion_t* v = 0) const = 0; + virtual bool has_reset_since(epoch_t) const = 0; + virtual std::vector<pg_shard_t> get_replica_recovery_order() const = 0; + virtual epoch_t get_last_peering_reset() const = 0; + virtual const pg_info_t& get_info() const= 0; + virtual seastar::future<> stop() = 0; + virtual void publish_stats_to_osd() = 0; +}; diff --git a/src/crimson/osd/recovery_backend.cc b/src/crimson/osd/recovery_backend.cc new file mode 100644 index 000000000..aeec0d14b --- /dev/null +++ b/src/crimson/osd/recovery_backend.cc @@ -0,0 +1,298 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <fmt/format.h> + +#include "crimson/common/exception.h" +#include "crimson/osd/recovery_backend.h" +#include "crimson/osd/pg.h" +#include "crimson/osd/pg_backend.h" + +#include "messages/MOSDFastDispatchOp.h" +#include "osd/osd_types.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_osd); + } +} + +hobject_t RecoveryBackend::get_temp_recovery_object( + const hobject_t& target, + eversion_t version) const +{ + hobject_t hoid = + target.make_temp_hobject(fmt::format("temp_recovering_{}_{}_{}_{}", + pg.get_info().pgid, + version, + pg.get_info().history.same_interval_since, + target.snap)); + logger().debug("{} {}", __func__, hoid); + return hoid; +} + +void RecoveryBackend::clean_up(ceph::os::Transaction& t, + std::string_view why) +{ + for (auto& soid : temp_contents) { + t.remove(pg.get_collection_ref()->get_cid(), + ghobject_t(soid, ghobject_t::NO_GEN, pg.get_pg_whoami().shard)); + } + temp_contents.clear(); + + for (auto& [soid, recovery_waiter] : recovering) { + if ((recovery_waiter.pi && recovery_waiter.pi->is_complete()) + || (!recovery_waiter.pi + && recovery_waiter.obc && recovery_waiter.obc->obs.exists)) { + recovery_waiter.obc->interrupt( + ::crimson::common::actingset_changed( + pg.is_primary())); + recovery_waiter.interrupt(why); + } + } + recovering.clear(); +} + 
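+// Shutdown path for a single in-flight recovery: every outstanding promise is
+// failed with system_shutdown_exception so that any fiber blocked in one of
+// the wait_for_*() calls unwinds cleanly (RecoveryBackend::stop() below walks
+// all entries in `recovering` and calls this). A sketch of the typical life
+// cycle, with the exact call sites living in the recovery code:
+//   auto& waiter = add_recovering(soid);   // registers WaitForObjectRecovery
+//   ... start the pull/push for soid ...
+//   return waiter.wait_for_recovered();    // resolved via set_recovered()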
+void RecoveryBackend::WaitForObjectRecovery::stop() { + readable.set_exception( + crimson::common::system_shutdown_exception()); + recovered.set_exception( + crimson::common::system_shutdown_exception()); + pulled.set_exception( + crimson::common::system_shutdown_exception()); + for (auto& [pg_shard, pr] : pushes) { + pr.set_exception( + crimson::common::system_shutdown_exception()); + } +} + +void RecoveryBackend::handle_backfill_finish( + MOSDPGBackfill& m) +{ + logger().debug("{}", __func__); + ceph_assert(!pg.is_primary()); + ceph_assert(crimson::common::local_conf()->osd_kill_backfill_at != 1); + auto reply = make_message<MOSDPGBackfill>( + MOSDPGBackfill::OP_BACKFILL_FINISH_ACK, + pg.get_osdmap_epoch(), + m.query_epoch, + spg_t(pg.get_pgid().pgid, pg.get_primary().shard)); + reply->set_priority(pg.get_recovery_op_priority()); + std::ignore = m.get_connection()->send(std::move(reply)); + shard_services.start_operation<crimson::osd::LocalPeeringEvent>( + static_cast<crimson::osd::PG*>(&pg), + shard_services, + pg.get_pg_whoami(), + pg.get_pgid(), + pg.get_osdmap_epoch(), + pg.get_osdmap_epoch(), + RecoveryDone{}); +} + +seastar::future<> RecoveryBackend::handle_backfill_progress( + MOSDPGBackfill& m) +{ + logger().debug("{}", __func__); + ceph_assert(!pg.is_primary()); + ceph_assert(crimson::common::local_conf()->osd_kill_backfill_at != 2); + + ObjectStore::Transaction t; + pg.get_peering_state().update_backfill_progress( + m.last_backfill, + m.stats, + m.op == MOSDPGBackfill::OP_BACKFILL_PROGRESS, + t); + return shard_services.get_store().do_transaction( + pg.get_collection_ref(), std::move(t) + ).or_terminate(); +} + +seastar::future<> RecoveryBackend::handle_backfill_finish_ack( + MOSDPGBackfill& m) +{ + logger().debug("{}", __func__); + ceph_assert(pg.is_primary()); + ceph_assert(crimson::common::local_conf()->osd_kill_backfill_at != 3); + // TODO: + // finish_recovery_op(hobject_t::get_max()); + return seastar::now(); +} + +seastar::future<> RecoveryBackend::handle_backfill( + MOSDPGBackfill& m) +{ + logger().debug("{}", __func__); + switch (m.op) { + case MOSDPGBackfill::OP_BACKFILL_FINISH: + handle_backfill_finish(m); + [[fallthrough]]; + case MOSDPGBackfill::OP_BACKFILL_PROGRESS: + return handle_backfill_progress(m); + case MOSDPGBackfill::OP_BACKFILL_FINISH_ACK: + return handle_backfill_finish_ack(m); + default: + ceph_assert("unknown op type for pg backfill"); + return seastar::now(); + } +} + +seastar::future<> RecoveryBackend::handle_backfill_remove( + MOSDPGBackfillRemove& m) +{ + logger().debug("{} m.ls={}", __func__, m.ls); + assert(m.get_type() == MSG_OSD_PG_BACKFILL_REMOVE); + + ObjectStore::Transaction t; + for ([[maybe_unused]] const auto& [soid, ver] : m.ls) { + // TODO: the reserved space management. PG::try_reserve_recovery_space(). 
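+    // Queue the removal of every object listed in the message; the whole
+    // batch is then submitted in a single transaction below.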
+ t.remove(pg.get_collection_ref()->get_cid(), + ghobject_t(soid, ghobject_t::NO_GEN, pg.get_pg_whoami().shard)); + } + return shard_services.get_store().do_transaction( + pg.get_collection_ref(), std::move(t) + ).or_terminate(); +} + +seastar::future<BackfillInterval> RecoveryBackend::scan_for_backfill( + const hobject_t& start, + [[maybe_unused]] const std::int64_t min, + const std::int64_t max) +{ + logger().debug("{} starting from {}", __func__, start); + auto version_map = seastar::make_lw_shared<std::map<hobject_t, eversion_t>>(); + return backend->list_objects(start, max).then( + [this, start, version_map] (auto&& ret) { + auto&& [objects, next] = std::move(ret); + return seastar::parallel_for_each(std::move(objects), + [this, version_map] (const hobject_t& object) { + crimson::osd::ObjectContextRef obc; + if (pg.is_primary()) { + obc = shard_services.obc_registry.maybe_get_cached_obc(object); + } + if (obc) { + if (obc->obs.exists) { + logger().debug("scan_for_backfill found (primary): {} {}", + object, obc->obs.oi.version); + version_map->emplace(object, obc->obs.oi.version); + } else { + // if the object does not exist here, it must have been removed + // between the collection_list_partial and here. This can happen + // for the first item in the range, which is usually last_backfill. + } + return seastar::now(); + } else { + return backend->load_metadata(object).safe_then( + [version_map, object] (auto md) { + if (md->os.exists) { + logger().debug("scan_for_backfill found: {} {}", + object, md->os.oi.version); + version_map->emplace(object, md->os.oi.version); + } + return seastar::now(); + }, PGBackend::load_metadata_ertr::assert_all{}); + } + }).then([version_map, start=std::move(start), next=std::move(next), this] { + BackfillInterval bi; + bi.begin = std::move(start); + bi.end = std::move(next); + bi.version = pg.get_info().last_update; + bi.objects = std::move(*version_map); + logger().debug("{} BackfillInterval filled, leaving", + "scan_for_backfill"); + return seastar::make_ready_future<BackfillInterval>(std::move(bi)); + }); + }); +} + +seastar::future<> RecoveryBackend::handle_scan_get_digest( + MOSDPGScan& m) +{ + logger().debug("{}", __func__); + if (false /* FIXME: check for backfill too full */) { + std::ignore = shard_services.start_operation<crimson::osd::LocalPeeringEvent>( + // TODO: abstract start_background_recovery + static_cast<crimson::osd::PG*>(&pg), + shard_services, + pg.get_pg_whoami(), + pg.get_pgid(), + pg.get_osdmap_epoch(), + pg.get_osdmap_epoch(), + PeeringState::BackfillTooFull()); + return seastar::now(); + } + return scan_for_backfill( + std::move(m.begin), + crimson::common::local_conf().get_val<std::int64_t>("osd_backfill_scan_min"), + crimson::common::local_conf().get_val<std::int64_t>("osd_backfill_scan_max") + ).then([this, + query_epoch=m.query_epoch, + conn=m.get_connection()] (auto backfill_interval) { + auto reply = make_message<MOSDPGScan>( + MOSDPGScan::OP_SCAN_DIGEST, + pg.get_pg_whoami(), + pg.get_osdmap_epoch(), + query_epoch, + spg_t(pg.get_info().pgid.pgid, pg.get_primary().shard), + backfill_interval.begin, + backfill_interval.end); + encode(backfill_interval.objects, reply->get_data()); + return conn->send(std::move(reply)); + }); +} + +seastar::future<> RecoveryBackend::handle_scan_digest( + MOSDPGScan& m) +{ + logger().debug("{}", __func__); + // Check that from is in backfill_targets vector + ceph_assert(pg.is_backfill_target(m.from)); + + BackfillInterval bi; + bi.begin = m.begin; + bi.end = m.end; + { + auto p = 
m.get_data().cbegin(); + // take care to preserve ordering! + bi.clear_objects(); + ::decode_noclear(bi.objects, p); + } + shard_services.start_operation<crimson::osd::BackfillRecovery>( + static_cast<crimson::osd::PG*>(&pg), + shard_services, + pg.get_osdmap_epoch(), + crimson::osd::BackfillState::ReplicaScanned{ m.from, std::move(bi) }); + return seastar::now(); +} + +seastar::future<> RecoveryBackend::handle_scan( + MOSDPGScan& m) +{ + logger().debug("{}", __func__); + switch (m.op) { + case MOSDPGScan::OP_SCAN_GET_DIGEST: + return handle_scan_get_digest(m); + case MOSDPGScan::OP_SCAN_DIGEST: + return handle_scan_digest(m); + default: + // FIXME: move to errorator + ceph_assert("unknown op type for pg scan"); + return seastar::now(); + } +} + +seastar::future<> RecoveryBackend::handle_recovery_op( + Ref<MOSDFastDispatchOp> m) +{ + switch (m->get_header().type) { + case MSG_OSD_PG_BACKFILL: + return handle_backfill(*boost::static_pointer_cast<MOSDPGBackfill>(m)); + case MSG_OSD_PG_BACKFILL_REMOVE: + return handle_backfill_remove(*boost::static_pointer_cast<MOSDPGBackfillRemove>(m)); + case MSG_OSD_PG_SCAN: + return handle_scan(*boost::static_pointer_cast<MOSDPGScan>(m)); + default: + return seastar::make_exception_future<>( + std::invalid_argument(fmt::format("invalid request type: {}", + m->get_header().type))); + } +} diff --git a/src/crimson/osd/recovery_backend.h b/src/crimson/osd/recovery_backend.h new file mode 100644 index 000000000..cb0ae9f20 --- /dev/null +++ b/src/crimson/osd/recovery_backend.h @@ -0,0 +1,203 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <seastar/core/future.hh> + +#include "crimson/common/type_helpers.h" +#include "crimson/os/futurized_store.h" +#include "crimson/os/futurized_collection.h" +#include "crimson/osd/object_context.h" +#include "crimson/osd/shard_services.h" + +#include "messages/MOSDPGBackfill.h" +#include "messages/MOSDPGBackfillRemove.h" +#include "messages/MOSDPGScan.h" +#include "osd/recovery_types.h" +#include "osd/osd_types.h" + +namespace crimson::osd{ + class PG; +} + +class PGBackend; + +class RecoveryBackend { + void handle_backfill_finish( + MOSDPGBackfill& m); + seastar::future<> handle_backfill_progress( + MOSDPGBackfill& m); + seastar::future<> handle_backfill_finish_ack( + MOSDPGBackfill& m); + seastar::future<> handle_backfill(MOSDPGBackfill& m); + + seastar::future<> handle_backfill_remove(MOSDPGBackfillRemove& m); + + seastar::future<> handle_scan_get_digest( + MOSDPGScan& m); + seastar::future<> handle_scan_digest( + MOSDPGScan& m); + seastar::future<> handle_scan( + MOSDPGScan& m); +protected: + class WaitForObjectRecovery; +public: + RecoveryBackend(crimson::osd::PG& pg, + crimson::osd::ShardServices& shard_services, + crimson::os::CollectionRef coll, + PGBackend* backend) + : pg{pg}, + shard_services{shard_services}, + store{&shard_services.get_store()}, + coll{coll}, + backend{backend} {} + virtual ~RecoveryBackend() {} + WaitForObjectRecovery& add_recovering(const hobject_t& soid) { + auto [it, added] = recovering.emplace(soid, WaitForObjectRecovery{}); + assert(added); + return it->second; + } + WaitForObjectRecovery& get_recovering(const hobject_t& soid) { + assert(is_recovering(soid)); + return recovering.at(soid); + } + void remove_recovering(const hobject_t& soid) { + recovering.erase(soid); + } + bool is_recovering(const hobject_t& soid) const { + return recovering.count(soid) != 0; + } + uint64_t total_recovering() const { + return 
recovering.size(); + } + + virtual seastar::future<> handle_recovery_op( + Ref<MOSDFastDispatchOp> m); + + virtual seastar::future<> recover_object( + const hobject_t& soid, + eversion_t need) = 0; + virtual seastar::future<> recover_delete( + const hobject_t& soid, + eversion_t need) = 0; + virtual seastar::future<> push_delete( + const hobject_t& soid, + eversion_t need) = 0; + + seastar::future<BackfillInterval> scan_for_backfill( + const hobject_t& from, + std::int64_t min, + std::int64_t max); + + void on_peering_interval_change(ceph::os::Transaction& t) { + clean_up(t, "new peering interval"); + } + + seastar::future<> stop() { + for (auto& [soid, recovery_waiter] : recovering) { + recovery_waiter.stop(); + } + return on_stop(); + } +protected: + crimson::osd::PG& pg; + crimson::osd::ShardServices& shard_services; + crimson::os::FuturizedStore* store; + crimson::os::CollectionRef coll; + PGBackend* backend; + + struct PullInfo { + pg_shard_t from; + hobject_t soid; + ObjectRecoveryProgress recovery_progress; + ObjectRecoveryInfo recovery_info; + crimson::osd::ObjectContextRef head_ctx; + crimson::osd::ObjectContextRef obc; + object_stat_sum_t stat; + bool is_complete() const { + return recovery_progress.is_complete(recovery_info); + } + }; + + struct PushInfo { + ObjectRecoveryProgress recovery_progress; + ObjectRecoveryInfo recovery_info; + crimson::osd::ObjectContextRef obc; + object_stat_sum_t stat; + }; + + class WaitForObjectRecovery : public crimson::osd::BlockerT<WaitForObjectRecovery> { + seastar::shared_promise<> readable, recovered, pulled; + std::map<pg_shard_t, seastar::shared_promise<>> pushes; + public: + static constexpr const char* type_name = "WaitForObjectRecovery"; + + crimson::osd::ObjectContextRef obc; + std::optional<PullInfo> pi; + std::map<pg_shard_t, PushInfo> pushing; + + seastar::future<> wait_for_readable() { + return readable.get_shared_future(); + } + seastar::future<> wait_for_pushes(pg_shard_t shard) { + return pushes[shard].get_shared_future(); + } + seastar::future<> wait_for_recovered() { + return recovered.get_shared_future(); + } + crimson::osd::blocking_future<> + wait_for_recovered_blocking() { + return make_blocking_future( + recovered.get_shared_future()); + } + seastar::future<> wait_for_pull() { + return pulled.get_shared_future(); + } + void set_readable() { + readable.set_value(); + } + void set_recovered() { + recovered.set_value(); + } + void set_pushed(pg_shard_t shard) { + pushes[shard].set_value(); + } + void set_pulled() { + pulled.set_value(); + } + void set_push_failed(pg_shard_t shard, std::exception_ptr e) { + pushes.at(shard).set_exception(e); + } + void interrupt(std::string_view why) { + readable.set_exception(std::system_error( + std::make_error_code(std::errc::interrupted), why.data())); + recovered.set_exception(std::system_error( + std::make_error_code(std::errc::interrupted), why.data())); + pulled.set_exception(std::system_error( + std::make_error_code(std::errc::interrupted), why.data())); + for (auto& [pg_shard, pr] : pushes) { + pr.set_exception(std::system_error( + std::make_error_code(std::errc::interrupted), why.data())); + } + } + void stop(); + void dump_detail(Formatter* f) const { + } + }; + std::map<hobject_t, WaitForObjectRecovery> recovering; + hobject_t get_temp_recovery_object( + const hobject_t& target, + eversion_t version) const; + + boost::container::flat_set<hobject_t> temp_contents; + + void add_temp_obj(const hobject_t &oid) { + temp_contents.insert(oid); + } + void clear_temp_obj(const hobject_t 
&oid) { + temp_contents.erase(oid); + } + void clean_up(ceph::os::Transaction& t, std::string_view why); + virtual seastar::future<> on_stop() = 0; +}; diff --git a/src/crimson/osd/replicated_backend.cc b/src/crimson/osd/replicated_backend.cc new file mode 100644 index 000000000..3a131278b --- /dev/null +++ b/src/crimson/osd/replicated_backend.cc @@ -0,0 +1,144 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "replicated_backend.h" + +#include "messages/MOSDRepOpReply.h" + +#include "crimson/common/exception.h" +#include "crimson/common/log.h" +#include "crimson/os/futurized_store.h" +#include "crimson/osd/shard_services.h" +#include "osd/PeeringState.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_osd); + } +} + +ReplicatedBackend::ReplicatedBackend(pg_t pgid, + pg_shard_t whoami, + ReplicatedBackend::CollectionRef coll, + crimson::osd::ShardServices& shard_services) + : PGBackend{whoami.shard, coll, &shard_services.get_store()}, + pgid{pgid}, + whoami{whoami}, + shard_services{shard_services} +{} + +ReplicatedBackend::ll_read_errorator::future<ceph::bufferlist> +ReplicatedBackend::_read(const hobject_t& hoid, + const uint64_t off, + const uint64_t len, + const uint32_t flags) +{ + if (__builtin_expect(stopping, false)) { + throw crimson::common::system_shutdown_exception(); + } + return store->read(coll, ghobject_t{hoid}, off, len, flags); +} + +seastar::future<crimson::osd::acked_peers_t> +ReplicatedBackend::_submit_transaction(std::set<pg_shard_t>&& pg_shards, + const hobject_t& hoid, + ceph::os::Transaction&& txn, + const osd_op_params_t& osd_op_p, + epoch_t min_epoch, epoch_t map_epoch, + std::vector<pg_log_entry_t>&& log_entries) +{ + if (__builtin_expect(stopping, false)) { + throw crimson::common::system_shutdown_exception(); + } + if (__builtin_expect((bool)peering, false)) { + throw crimson::common::actingset_changed(peering->is_primary); + } + + const ceph_tid_t tid = next_txn_id++; + auto req_id = osd_op_p.req->get_reqid(); + auto pending_txn = + pending_trans.emplace(tid, pg_shards.size()).first; + bufferlist encoded_txn; + encode(txn, encoded_txn); + + return seastar::parallel_for_each(std::move(pg_shards), + [=, encoded_txn=std::move(encoded_txn), txn=std::move(txn)] + (auto pg_shard) mutable { + if (pg_shard == whoami) { + return shard_services.get_store().do_transaction(coll,std::move(txn)); + } else { + auto m = make_message<MOSDRepOp>(req_id, whoami, + spg_t{pgid, pg_shard.shard}, hoid, + CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK, + map_epoch, min_epoch, + tid, osd_op_p.at_version); + m->set_data(encoded_txn); + pending_txn->second.acked_peers.push_back({pg_shard, eversion_t{}}); + encode(log_entries, m->logbl); + m->pg_trim_to = osd_op_p.pg_trim_to; + m->min_last_complete_ondisk = osd_op_p.min_last_complete_ondisk; + m->set_rollback_to(osd_op_p.at_version); + // TODO: set more stuff. 
e.g., pg_states + return shard_services.send_to_osd(pg_shard.osd, std::move(m), map_epoch); + } + }).then([this, peers=pending_txn->second.weak_from_this()] { + if (!peers) { + // for now, only actingset_changed can cause peers + // to be nullptr + assert(peering); + throw crimson::common::actingset_changed(peering->is_primary); + } + if (--peers->pending == 0) { + peers->all_committed.set_value(); + peers->all_committed = {}; + return seastar::now(); + } + return peers->all_committed.get_future(); + }).then([pending_txn, this] { + auto acked_peers = std::move(pending_txn->second.acked_peers); + pending_trans.erase(pending_txn); + return seastar::make_ready_future<crimson::osd::acked_peers_t>(std::move(acked_peers)); + }); +} + +void ReplicatedBackend::on_actingset_changed(peering_info_t pi) +{ + peering.emplace(pi); + crimson::common::actingset_changed e_actingset_changed{peering->is_primary}; + for (auto& [tid, pending_txn] : pending_trans) { + pending_txn.all_committed.set_exception(e_actingset_changed); + } + pending_trans.clear(); +} + +void ReplicatedBackend::got_rep_op_reply(const MOSDRepOpReply& reply) +{ + auto found = pending_trans.find(reply.get_tid()); + if (found == pending_trans.end()) { + logger().warn("{}: no matched pending rep op: {}", __func__, reply); + return; + } + auto& peers = found->second; + for (auto& peer : peers.acked_peers) { + if (peer.shard == reply.from) { + peer.last_complete_ondisk = reply.get_last_complete_ondisk(); + if (--peers.pending == 0) { + peers.all_committed.set_value(); + peers.all_committed = {}; + } + return; + } + } +} + +seastar::future<> ReplicatedBackend::stop() +{ + logger().info("ReplicatedBackend::stop {}", coll->get_cid()); + stopping = true; + for (auto& [tid, pending_on] : pending_trans) { + pending_on.all_committed.set_exception( + crimson::common::system_shutdown_exception()); + } + pending_trans.clear(); + return seastar::now(); +} diff --git a/src/crimson/osd/replicated_backend.h b/src/crimson/osd/replicated_backend.h new file mode 100644 index 000000000..01c0bba64 --- /dev/null +++ b/src/crimson/osd/replicated_backend.h @@ -0,0 +1,55 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <boost/intrusive_ptr.hpp> +#include <seastar/core/future.hh> +#include <seastar/core/weak_ptr.hh> +#include "include/buffer_fwd.h" +#include "osd/osd_types.h" + +#include "acked_peers.h" +#include "pg_backend.h" + +namespace crimson::osd { + class ShardServices; +} + +class ReplicatedBackend : public PGBackend +{ +public: + ReplicatedBackend(pg_t pgid, pg_shard_t whoami, + CollectionRef coll, + crimson::osd::ShardServices& shard_services); + void got_rep_op_reply(const MOSDRepOpReply& reply) final; + seastar::future<> stop() final; + void on_actingset_changed(peering_info_t pi) final; +private: + ll_read_errorator::future<ceph::bufferlist> _read(const hobject_t& hoid, + uint64_t off, + uint64_t len, + uint32_t flags) override; + seastar::future<crimson::osd::acked_peers_t> + _submit_transaction(std::set<pg_shard_t>&& pg_shards, + const hobject_t& hoid, + ceph::os::Transaction&& txn, + const osd_op_params_t& osd_op_p, + epoch_t min_epoch, epoch_t max_epoch, + std::vector<pg_log_entry_t>&& log_entries) final; + const pg_t pgid; + const pg_shard_t whoami; + crimson::osd::ShardServices& shard_services; + ceph_tid_t next_txn_id = 0; + class pending_on_t : public seastar::weakly_referencable<pending_on_t> { + public: + pending_on_t(size_t pending) + : 
pending{static_cast<unsigned>(pending)} + {} + unsigned pending; + crimson::osd::acked_peers_t acked_peers; + seastar::promise<> all_committed; + }; + using pending_transactions_t = std::map<ceph_tid_t, pending_on_t>; + pending_transactions_t pending_trans; +}; diff --git a/src/crimson/osd/replicated_recovery_backend.cc b/src/crimson/osd/replicated_recovery_backend.cc new file mode 100644 index 000000000..0812003bb --- /dev/null +++ b/src/crimson/osd/replicated_recovery_backend.cc @@ -0,0 +1,1076 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <fmt/format.h> +#include <fmt/ostream.h> +#include <seastar/core/future.hh> +#include <seastar/core/do_with.hh> + +#include "crimson/osd/pg.h" +#include "crimson/osd/pg_backend.h" +#include "replicated_recovery_backend.h" + +#include "msg/Message.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_osd); + } +} + +seastar::future<> ReplicatedRecoveryBackend::recover_object( + const hobject_t& soid, + eversion_t need) +{ + logger().debug("{}: {}, {}", __func__, soid, need); + // always add_recovering(soid) before recover_object(soid) + assert(is_recovering(soid)); + // start tracking the recovery of soid + return maybe_pull_missing_obj(soid, need).then([this, soid, need] { + logger().debug("recover_object: loading obc: {}", soid); + return pg.with_head_obc<RWState::RWREAD>(soid, + [this, soid, need](auto obc) { + logger().debug("recover_object: loaded obc: {}", obc->obs.oi.soid); + auto& recovery_waiter = recovering.at(soid); + recovery_waiter.obc = obc; + recovery_waiter.obc->wait_recovery_read(); + return maybe_push_shards(soid, need); + }).handle_error( + crimson::osd::PG::load_obc_ertr::all_same_way([soid](auto& code) { + // TODO: may need eio handling? 
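+        // For now any error from loading the object context is only logged
+        // and the object is skipped rather than failing the whole operation.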
+ logger().error("recover_object saw error code {}, ignoring object {}", + code, soid); + })); + }); +} + +seastar::future<> +ReplicatedRecoveryBackend::maybe_push_shards( + const hobject_t& soid, + eversion_t need) +{ + return seastar::parallel_for_each(get_shards_to_push(soid), + [this, need, soid](auto shard) { + return prep_push(soid, need, shard).then([this, soid, shard](auto push) { + auto msg = make_message<MOSDPGPush>(); + msg->from = pg.get_pg_whoami(); + msg->pgid = pg.get_pgid(); + msg->map_epoch = pg.get_osdmap_epoch(); + msg->min_epoch = pg.get_last_peering_reset(); + msg->pushes.push_back(std::move(push)); + msg->set_priority(pg.get_recovery_op_priority()); + return shard_services.send_to_osd(shard.osd, + std::move(msg), + pg.get_osdmap_epoch()).then( + [this, soid, shard] { + return recovering.at(soid).wait_for_pushes(shard); + }); + }); + }).then([this, soid] { + auto &recovery = recovering.at(soid); + auto push_info = recovery.pushing.begin(); + object_stat_sum_t stat = {}; + if (push_info != recovery.pushing.end()) { + stat = push_info->second.stat; + } else { + // no push happened, take pull_info's stat + assert(recovery.pi); + stat = recovery.pi->stat; + } + pg.get_recovery_handler()->on_global_recover(soid, stat, false); + return seastar::make_ready_future<>(); + }).handle_exception([this, soid](auto e) { + auto &recovery = recovering.at(soid); + if (recovery.obc) { + recovery.obc->drop_recovery_read(); + } + recovering.erase(soid); + return seastar::make_exception_future<>(e); + }); +} + +seastar::future<> +ReplicatedRecoveryBackend::maybe_pull_missing_obj( + const hobject_t& soid, + eversion_t need) +{ + pg_missing_tracker_t local_missing = pg.get_local_missing(); + if (!local_missing.is_missing(soid)) { + return seastar::make_ready_future<>(); + } + PullOp po; + auto& recovery_waiter = recovering.at(soid); + recovery_waiter.pi = std::make_optional<RecoveryBackend::PullInfo>(); + auto& pi = *recovery_waiter.pi; + prepare_pull(po, pi, soid, need); + auto msg = make_message<MOSDPGPull>(); + msg->from = pg.get_pg_whoami(); + msg->set_priority(pg.get_recovery_op_priority()); + msg->pgid = pg.get_pgid(); + msg->map_epoch = pg.get_osdmap_epoch(); + msg->min_epoch = pg.get_last_peering_reset(); + msg->set_pulls({std::move(po)}); + return shard_services.send_to_osd( + pi.from.osd, + std::move(msg), + pg.get_osdmap_epoch() + ).then([&recovery_waiter] { + return recovery_waiter.wait_for_pull(); + }); +} + +seastar::future<> ReplicatedRecoveryBackend::push_delete( + const hobject_t& soid, + eversion_t need) +{ + logger().debug("{}: {}, {}", __func__, soid, need); + recovering[soid]; + epoch_t min_epoch = pg.get_last_peering_reset(); + + assert(pg.get_acting_recovery_backfill().size() > 0); + return seastar::parallel_for_each(pg.get_acting_recovery_backfill(), + [this, soid, need, min_epoch](pg_shard_t shard) { + if (shard == pg.get_pg_whoami()) + return seastar::make_ready_future<>(); + auto iter = pg.get_shard_missing().find(shard); + if (iter == pg.get_shard_missing().end()) + return seastar::make_ready_future<>(); + if (iter->second.is_missing(soid)) { + logger().debug("push_delete: will remove {} from {}", soid, shard); + pg.begin_peer_recover(shard, soid); + spg_t target_pg(pg.get_info().pgid.pgid, shard.shard); + auto msg = make_message<MOSDPGRecoveryDelete>( + pg.get_pg_whoami(), target_pg, pg.get_osdmap_epoch(), min_epoch); + msg->set_priority(pg.get_recovery_op_priority()); + msg->objects.push_back(std::make_pair(soid, need)); + return 
shard_services.send_to_osd(shard.osd, std::move(msg), + pg.get_osdmap_epoch()).then( + [this, soid, shard] { + return recovering.at(soid).wait_for_pushes(shard); + }); + } + return seastar::make_ready_future<>(); + }); +} + +seastar::future<> ReplicatedRecoveryBackend::handle_recovery_delete( + Ref<MOSDPGRecoveryDelete> m) +{ + logger().debug("{}: {}", __func__, *m); + + auto& p = m->objects.front(); //TODO: only one delete per message for now. + return local_recover_delete(p.first, p.second, pg.get_osdmap_epoch()).then( + [this, m] { + auto reply = make_message<MOSDPGRecoveryDeleteReply>(); + reply->from = pg.get_pg_whoami(); + reply->set_priority(m->get_priority()); + reply->pgid = spg_t(pg.get_info().pgid.pgid, m->from.shard); + reply->map_epoch = m->map_epoch; + reply->min_epoch = m->min_epoch; + reply->objects = m->objects; + return shard_services.send_to_osd(m->from.osd, std::move(reply), pg.get_osdmap_epoch()); + }); +} + +seastar::future<> ReplicatedRecoveryBackend::on_local_recover_persist( + const hobject_t& soid, + const ObjectRecoveryInfo& _recovery_info, + bool is_delete, + epoch_t epoch_frozen) +{ + logger().debug("{}", __func__); + ceph::os::Transaction t; + pg.get_recovery_handler()->on_local_recover(soid, _recovery_info, is_delete, t); + return shard_services.get_store().do_transaction(coll, std::move(t)).then( + [this, epoch_frozen, last_complete = pg.get_info().last_complete] { + pg.get_recovery_handler()->_committed_pushed_object(epoch_frozen, last_complete); + return seastar::make_ready_future<>(); + }); +} + +seastar::future<> ReplicatedRecoveryBackend::local_recover_delete( + const hobject_t& soid, + eversion_t need, + epoch_t epoch_to_freeze) +{ + logger().debug("{}: {}, {}", __func__, soid, need); + return backend->load_metadata(soid).safe_then([this] + (auto lomt) { + if (lomt->os.exists) { + return seastar::do_with(ceph::os::Transaction(), + [this, lomt = std::move(lomt)](auto& txn) { + return backend->remove(lomt->os, txn).then([this, &txn]() mutable { + return shard_services.get_store().do_transaction(coll, + std::move(txn)); + }); + }); + } + return seastar::make_ready_future<>(); + }).safe_then([this, soid, epoch_to_freeze, need] { + ObjectRecoveryInfo recovery_info; + recovery_info.soid = soid; + recovery_info.version = need; + return on_local_recover_persist(soid, recovery_info, + true, epoch_to_freeze); + }, PGBackend::load_metadata_ertr::all_same_way( + [this, soid, epoch_to_freeze, need] (auto e) { + ObjectRecoveryInfo recovery_info; + recovery_info.soid = soid; + recovery_info.version = need; + return on_local_recover_persist(soid, recovery_info, + true, epoch_to_freeze); + }) + ); +} + +seastar::future<> ReplicatedRecoveryBackend::recover_delete( + const hobject_t &soid, eversion_t need) +{ + logger().debug("{}: {}, {}", __func__, soid, need); + + epoch_t cur_epoch = pg.get_osdmap_epoch(); + return seastar::do_with(object_stat_sum_t(), + [this, soid, need, cur_epoch](auto& stat_diff) { + return local_recover_delete(soid, need, cur_epoch).then( + [this, &stat_diff, cur_epoch, soid, need] { + if (!pg.has_reset_since(cur_epoch)) { + bool object_missing = false; + for (const auto& shard : pg.get_acting_recovery_backfill()) { + if (shard == pg.get_pg_whoami()) + continue; + if (pg.get_shard_missing(shard)->is_missing(soid)) { + logger().debug("recover_delete: soid {} needs to deleted from replca {}", + soid, shard); + object_missing = true; + break; + } + } + + if (!object_missing) { + stat_diff.num_objects_recovered = 1; + return 
seastar::make_ready_future<>(); + } else { + return push_delete(soid, need); + } + } + return seastar::make_ready_future<>(); + }).then([this, soid, &stat_diff] { + pg.get_recovery_handler()->on_global_recover(soid, stat_diff, true); + return seastar::make_ready_future<>(); + }); + }); +} + +seastar::future<PushOp> +ReplicatedRecoveryBackend::prep_push( + const hobject_t& soid, + eversion_t need, + pg_shard_t pg_shard) +{ + logger().debug("{}: {}, {}", __func__, soid, need); + + auto& recovery_waiter = recovering.at(soid); + auto& obc = recovery_waiter.obc; + interval_set<uint64_t> data_subset; + if (obc->obs.oi.size) { + data_subset.insert(0, obc->obs.oi.size); + } + const auto& missing = pg.get_shard_missing().find(pg_shard)->second; + if (HAVE_FEATURE(pg.min_peer_features(), SERVER_OCTOPUS)) { + const auto it = missing.get_items().find(soid); + assert(it != missing.get_items().end()); + data_subset.intersection_of(it->second.clean_regions.get_dirty_regions()); + logger().debug("prep_push: {} data_subset {}", soid, data_subset); + } + + logger().debug("prep_push: {} to {}", soid, pg_shard); + auto& pi = recovery_waiter.pushing[pg_shard]; + pg.begin_peer_recover(pg_shard, soid); + const auto pmissing_iter = pg.get_shard_missing().find(pg_shard); + const auto missing_iter = pmissing_iter->second.get_items().find(soid); + assert(missing_iter != pmissing_iter->second.get_items().end()); + + pi.obc = obc; + pi.recovery_info.size = obc->obs.oi.size; + pi.recovery_info.copy_subset = data_subset; + pi.recovery_info.soid = soid; + pi.recovery_info.oi = obc->obs.oi; + pi.recovery_info.version = obc->obs.oi.version; + pi.recovery_info.object_exist = + missing_iter->second.clean_regions.object_is_exist(); + pi.recovery_progress.omap_complete = + (!missing_iter->second.clean_regions.omap_is_dirty() && + HAVE_FEATURE(pg.min_peer_features(), SERVER_OCTOPUS)); + + return build_push_op(pi.recovery_info, pi.recovery_progress, &pi.stat).then( + [this, soid, pg_shard](auto pop) { + auto& recovery_waiter = recovering.at(soid); + auto& pi = recovery_waiter.pushing[pg_shard]; + pi.recovery_progress = pop.after_progress; + return pop; + }); +} + +void ReplicatedRecoveryBackend::prepare_pull(PullOp& po, PullInfo& pi, + const hobject_t& soid, + eversion_t need) { + logger().debug("{}: {}, {}", __func__, soid, need); + + pg_missing_tracker_t local_missing = pg.get_local_missing(); + const auto missing_iter = local_missing.get_items().find(soid); + auto m = pg.get_missing_loc_shards(); + pg_shard_t fromshard = *(m[soid].begin()); + + //TODO: skipped snap objects case for now + po.recovery_info.copy_subset.insert(0, (uint64_t) -1); + if (HAVE_FEATURE(pg.min_peer_features(), SERVER_OCTOPUS)) + po.recovery_info.copy_subset.intersection_of( + missing_iter->second.clean_regions.get_dirty_regions()); + po.recovery_info.size = ((uint64_t) -1); + po.recovery_info.object_exist = + missing_iter->second.clean_regions.object_is_exist(); + po.recovery_info.soid = soid; + po.soid = soid; + po.recovery_progress.data_complete = false; + po.recovery_progress.omap_complete = + !missing_iter->second.clean_regions.omap_is_dirty() && + HAVE_FEATURE(pg.min_peer_features(), SERVER_OCTOPUS); + po.recovery_progress.data_recovered_to = 0; + po.recovery_progress.first = true; + + pi.from = fromshard; + pi.soid = soid; + pi.recovery_info = po.recovery_info; + pi.recovery_progress = po.recovery_progress; +} + +seastar::future<PushOp> ReplicatedRecoveryBackend::build_push_op( + const ObjectRecoveryInfo& recovery_info, + const 
ObjectRecoveryProgress& progress, + object_stat_sum_t* stat) +{ + logger().debug("{} {} @{}", + __func__, recovery_info.soid, recovery_info.version); + return seastar::do_with(ObjectRecoveryProgress(progress), + uint64_t(crimson::common::local_conf() + ->osd_recovery_max_chunk), + recovery_info.version, + PushOp(), + [this, &recovery_info, &progress, stat] + (auto new_progress, auto available, auto v, auto pop) { + return read_metadata_for_push_op(recovery_info.soid, + progress, new_progress, + v, &pop).then([&](eversion_t local_ver) mutable { + // If requestor didn't know the version, use ours + if (v == eversion_t()) { + v = local_ver; + } else if (v != local_ver) { + logger().error("build_push_op: {} push {} v{} failed because local copy is {}", + pg.get_pgid(), recovery_info.soid, recovery_info.version, local_ver); + // TODO: bail out + } + return read_omap_for_push_op(recovery_info.soid, + progress, + new_progress, + &available, &pop); + }).then([this, &recovery_info, &progress, &available, &pop]() mutable { + logger().debug("build_push_op: available: {}, copy_subset: {}", + available, recovery_info.copy_subset); + return read_object_for_push_op(recovery_info.soid, + recovery_info.copy_subset, + progress.data_recovered_to, + available, &pop); + }).then([&recovery_info, &v, &progress, &new_progress, stat, &pop] + (uint64_t recovered_to) mutable { + new_progress.data_recovered_to = recovered_to; + if (new_progress.is_complete(recovery_info)) { + new_progress.data_complete = true; + if (stat) + stat->num_objects_recovered++; + } else if (progress.first && progress.omap_complete) { + // If omap is not changed, we need recovery omap + // when recovery cannot be completed once + new_progress.omap_complete = false; + } + if (stat) { + stat->num_keys_recovered += pop.omap_entries.size(); + stat->num_bytes_recovered += pop.data.length(); + } + pop.version = v; + pop.soid = recovery_info.soid; + pop.recovery_info = recovery_info; + pop.after_progress = new_progress; + pop.before_progress = progress; + logger().debug("build_push_op: pop version: {}, pop data length: {}", + pop.version, pop.data.length()); + return seastar::make_ready_future<PushOp>(std::move(pop)); + }); + }); +} + +seastar::future<eversion_t> +ReplicatedRecoveryBackend::read_metadata_for_push_op( + const hobject_t& oid, + const ObjectRecoveryProgress& progress, + ObjectRecoveryProgress& new_progress, + eversion_t ver, + PushOp* push_op) +{ + if (!progress.first) { + return seastar::make_ready_future<eversion_t>(ver); + } + return seastar::when_all_succeed( + backend->omap_get_header(coll, ghobject_t(oid)).handle_error( + crimson::os::FuturizedStore::read_errorator::all_same_way( + [] (const std::error_code& e) { + return seastar::make_ready_future<bufferlist>(); + })), + store->get_attrs(coll, ghobject_t(oid)).handle_error( + crimson::os::FuturizedStore::get_attrs_ertr::all_same_way( + [] (const std::error_code& e) { + return seastar::make_ready_future<crimson::os::FuturizedStore::attrs_t>(); + })) + ).then_unpack([&new_progress, push_op](auto bl, auto attrs) { + if (bl.length() == 0) { + logger().error("read_metadata_for_push_op: fail to read omap header"); + return eversion_t{}; + } else if (attrs.empty()) { + logger().error("read_metadata_for_push_op: fail to read attrs"); + return eversion_t{}; + } + push_op->omap_header.claim_append(std::move(bl)); + for (auto&& [key, val] : std::move(attrs)) { + push_op->attrset[key].push_back(val); + } + logger().debug("read_metadata_for_push_op: {}", push_op->attrset[OI_ATTR]); + 
object_info_t oi; + oi.decode(push_op->attrset[OI_ATTR]); + new_progress.first = false; + return oi.version; + }); +} + +seastar::future<uint64_t> +ReplicatedRecoveryBackend::read_object_for_push_op( + const hobject_t& oid, + const interval_set<uint64_t>& copy_subset, + uint64_t offset, + uint64_t max_len, + PushOp* push_op) +{ + if (max_len == 0 || copy_subset.empty()) { + push_op->data_included.clear(); + return seastar::make_ready_future<uint64_t>(offset); + } + // 1. get the extents in the interested range + return backend->fiemap(coll, ghobject_t{oid}, + 0, copy_subset.range_end()).then_wrapped( + [=](auto&& fiemap_included) mutable { + interval_set<uint64_t> extents; + try { + extents.intersection_of(copy_subset, fiemap_included.get0()); + } catch (std::exception &) { + // if fiemap() fails, we will read nothing, as the intersection of + // copy_subset and an empty interval_set would be empty anyway + extents.clear(); + } + // 2. we can read up to "max_len" bytes from "offset", so truncate the + // extents down to this quota. no need to return the number of consumed + // bytes, as this is the last consumer of this quota + push_op->data_included.span_of(extents, offset, max_len); + // 3. read the truncated extents + // TODO: check if the returned extents are pruned + return store->readv(coll, ghobject_t{oid}, push_op->data_included, 0); + }).safe_then([push_op, range_end=copy_subset.range_end()](auto &&bl) { + push_op->data.claim_append(std::move(bl)); + uint64_t recovered_to = 0; + if (push_op->data_included.empty()) { + // zero filled section, skip to end! + recovered_to = range_end; + } else { + // note down the progress, we will start from there next time + recovered_to = push_op->data_included.range_end(); + } + return seastar::make_ready_future<uint64_t>(recovered_to); + }, PGBackend::read_errorator::all_same_way([](auto e) { + logger().debug("build_push_op: read exception"); + return seastar::make_exception_future<uint64_t>(e); + })); +} + +seastar::future<> +ReplicatedRecoveryBackend::read_omap_for_push_op( + const hobject_t& oid, + const ObjectRecoveryProgress& progress, + ObjectRecoveryProgress& new_progress, + uint64_t* max_len, + PushOp* push_op) +{ + if (progress.omap_complete) { + return seastar::make_ready_future<>(); + } + return shard_services.get_store().get_omap_iterator(coll, ghobject_t{oid}) + .then([&progress, &new_progress, max_len, push_op](auto omap_iter) { + return omap_iter->lower_bound(progress.omap_recovered_to).then( + [omap_iter, &new_progress, max_len, push_op] { + return seastar::do_until([omap_iter, &new_progress, max_len, push_op] { + if (!omap_iter->valid()) { + new_progress.omap_complete = true; + return true; + } + if (push_op->omap_entries.empty()) { + return false; + } + if (const uint64_t entries_per_chunk = + crimson::common::local_conf()->osd_recovery_max_omap_entries_per_chunk; + entries_per_chunk > 0 && + push_op->omap_entries.size() >= entries_per_chunk) { + new_progress.omap_recovered_to = omap_iter->key(); + return true; + } + if (omap_iter->key().size() + omap_iter->value().length() > *max_len) { + new_progress.omap_recovered_to = omap_iter->key(); + return true; + } + return false; + }, + [omap_iter, max_len, push_op] { + push_op->omap_entries.emplace(omap_iter->key(), omap_iter->value()); + if (const uint64_t entry_size = + omap_iter->key().size() + omap_iter->value().length(); + entry_size > *max_len) { + *max_len -= entry_size; + } else { + *max_len = 0; + } + return omap_iter->next(); + }); + }); + }); +} + +std::vector<pg_shard_t> 
+ReplicatedRecoveryBackend::get_shards_to_push(const hobject_t& soid) const +{ + std::vector<pg_shard_t> shards; + assert(pg.get_acting_recovery_backfill().size() > 0); + for (const auto& peer : pg.get_acting_recovery_backfill()) { + if (peer == pg.get_pg_whoami()) + continue; + auto shard_missing = + pg.get_shard_missing().find(peer); + assert(shard_missing != pg.get_shard_missing().end()); + if (shard_missing->second.is_missing(soid)) { + shards.push_back(shard_missing->first); + } + } + return shards; +} + +seastar::future<> ReplicatedRecoveryBackend::handle_pull(Ref<MOSDPGPull> m) +{ + logger().debug("{}: {}", __func__, *m); + return seastar::parallel_for_each(m->take_pulls(), + [this, from=m->from](auto& pull_op) { + const hobject_t& soid = pull_op.soid; + logger().debug("handle_pull: {}", soid); + return backend->stat(coll, ghobject_t(soid)).then( + [this, &pull_op](auto st) { + ObjectRecoveryInfo &recovery_info = pull_op.recovery_info; + ObjectRecoveryProgress &progress = pull_op.recovery_progress; + if (progress.first && recovery_info.size == ((uint64_t) -1)) { + // Adjust size and copy_subset + recovery_info.size = st.st_size; + if (st.st_size) { + interval_set<uint64_t> object_range; + object_range.insert(0, st.st_size); + recovery_info.copy_subset.intersection_of(object_range); + } else { + recovery_info.copy_subset.clear(); + } + assert(recovery_info.clone_subset.empty()); + } + return build_push_op(recovery_info, progress, 0); + }).then([this, from](auto pop) { + auto msg = make_message<MOSDPGPush>(); + msg->from = pg.get_pg_whoami(); + msg->pgid = pg.get_pgid(); + msg->map_epoch = pg.get_osdmap_epoch(); + msg->min_epoch = pg.get_last_peering_reset(); + msg->set_priority(pg.get_recovery_op_priority()); + msg->pushes.push_back(std::move(pop)); + return shard_services.send_to_osd(from.osd, std::move(msg), + pg.get_osdmap_epoch()); + }); + }); +} + +seastar::future<bool> ReplicatedRecoveryBackend::_handle_pull_response( + pg_shard_t from, + const PushOp& pop, + PullOp* response, + ceph::os::Transaction* t) +{ + logger().debug("handle_pull_response {} {} data.size() is {} data_included: {}", + pop.recovery_info, pop.after_progress, pop.data.length(), pop.data_included); + + const hobject_t &hoid = pop.soid; + auto& recovery_waiter = recovering.at(hoid); + auto& pi = *recovery_waiter.pi; + if (pi.recovery_info.size == (uint64_t(-1))) { + pi.recovery_info.size = pop.recovery_info.size; + pi.recovery_info.copy_subset.intersection_of( + pop.recovery_info.copy_subset); + } + + // If primary doesn't have object info and didn't know version + if (pi.recovery_info.version == eversion_t()) + pi.recovery_info.version = pop.version; + + auto prepare_waiter = seastar::make_ready_future<>(); + if (pi.recovery_progress.first) { + prepare_waiter = pg.with_head_obc<RWState::RWNONE>( + pi.recovery_info.soid, [&pi, &recovery_waiter, &pop](auto obc) { + pi.obc = obc; + recovery_waiter.obc = obc; + obc->obs.oi.decode(pop.attrset.at(OI_ATTR)); + pi.recovery_info.oi = obc->obs.oi; + return crimson::osd::PG::load_obc_ertr::now(); + }).handle_error(crimson::ct_error::assert_all{}); + }; + return prepare_waiter.then([this, &pi, &pop, t, response]() mutable { + const bool first = pi.recovery_progress.first; + pi.recovery_progress = pop.after_progress; + logger().debug("new recovery_info {}, new progress {}", + pi.recovery_info, pi.recovery_progress); + interval_set<uint64_t> data_zeros; + { + uint64_t offset = pop.before_progress.data_recovered_to; + uint64_t length = (pop.after_progress.data_recovered_to 
- + pop.before_progress.data_recovered_to); + if (length) { + data_zeros.insert(offset, length); + } + } + auto [usable_intervals, data] = + trim_pushed_data(pi.recovery_info.copy_subset, + pop.data_included, pop.data); + bool complete = pi.is_complete(); + bool clear_omap = !pop.before_progress.omap_complete; + return submit_push_data(pi.recovery_info, first, complete, clear_omap, + std::move(data_zeros), usable_intervals, data, pop.omap_header, + pop.attrset, pop.omap_entries, t).then( + [this, response, &pi, &pop, complete, t, bytes_recovered=data.length()] { + pi.stat.num_keys_recovered += pop.omap_entries.size(); + pi.stat.num_bytes_recovered += bytes_recovered; + + if (complete) { + pi.stat.num_objects_recovered++; + pg.get_recovery_handler()->on_local_recover( + pop.soid, recovering.at(pop.soid).pi->recovery_info, + false, *t); + return true; + } else { + response->soid = pop.soid; + response->recovery_info = pi.recovery_info; + response->recovery_progress = pi.recovery_progress; + return false; + } + }); + }); +} + +seastar::future<> ReplicatedRecoveryBackend::handle_pull_response( + Ref<MOSDPGPush> m) +{ + const PushOp& pop = m->pushes[0]; //TODO: only one push per message for now. + if (pop.version == eversion_t()) { + // replica doesn't have it! + pg.get_recovery_handler()->on_failed_recover({ m->from }, pop.soid, + get_recovering(pop.soid).pi->recovery_info.version); + return seastar::make_exception_future<>( + std::runtime_error(fmt::format( + "Error on pushing side {} when pulling obj {}", + m->from, pop.soid))); + } + + logger().debug("{}: {}", __func__, *m); + return seastar::do_with(PullOp(), [this, m](auto& response) { + return seastar::do_with(ceph::os::Transaction(), m.get(), + [this, &response](auto& t, auto& m) { + pg_shard_t from = m->from; + PushOp& pop = m->pushes[0]; // only one push per message for now + return _handle_pull_response(from, pop, &response, &t).then( + [this, &t](bool complete) { + epoch_t epoch_frozen = pg.get_osdmap_epoch(); + return shard_services.get_store().do_transaction(coll, std::move(t)) + .then([this, epoch_frozen, complete, + last_complete = pg.get_info().last_complete] { + pg.get_recovery_handler()->_committed_pushed_object(epoch_frozen, last_complete); + return seastar::make_ready_future<bool>(complete); + }); + }); + }).then([this, m, &response](bool complete) { + if (complete) { + auto& pop = m->pushes[0]; + recovering.at(pop.soid).set_pulled(); + return seastar::make_ready_future<>(); + } else { + auto reply = make_message<MOSDPGPull>(); + reply->from = pg.get_pg_whoami(); + reply->set_priority(m->get_priority()); + reply->pgid = pg.get_info().pgid; + reply->map_epoch = m->map_epoch; + reply->min_epoch = m->min_epoch; + reply->set_pulls({std::move(response)}); + return shard_services.send_to_osd(m->from.osd, std::move(reply), pg.get_osdmap_epoch()); + } + }); + }); +} + +seastar::future<> ReplicatedRecoveryBackend::_handle_push( + pg_shard_t from, + const PushOp &pop, + PushReplyOp *response, + ceph::os::Transaction *t) +{ + logger().debug("{}", __func__); + + bool first = pop.before_progress.first; + interval_set<uint64_t> data_zeros; + { + uint64_t offset = pop.before_progress.data_recovered_to; + uint64_t length = (pop.after_progress.data_recovered_to - + pop.before_progress.data_recovered_to); + if (length) { + data_zeros.insert(offset, length); + } + } + bool complete = (pop.after_progress.data_complete && + pop.after_progress.omap_complete); + bool clear_omap = !pop.before_progress.omap_complete; + response->soid = 
pop.recovery_info.soid; + + return submit_push_data(pop.recovery_info, first, complete, clear_omap, + std::move(data_zeros), pop.data_included, pop.data, pop.omap_header, + pop.attrset, pop.omap_entries, t).then([this, complete, &pop, t] { + if (complete) { + pg.get_recovery_handler()->on_local_recover( + pop.recovery_info.soid, pop.recovery_info, + false, *t); + } + }); +} + +seastar::future<> ReplicatedRecoveryBackend::handle_push( + Ref<MOSDPGPush> m) +{ + if (pg.is_primary()) { + return handle_pull_response(m); + } + + logger().debug("{}: {}", __func__, *m); + return seastar::do_with(PushReplyOp(), [this, m](auto& response) { + const PushOp& pop = m->pushes[0]; //TODO: only one push per message for now + return seastar::do_with(ceph::os::Transaction(), + [this, m, &pop, &response](auto& t) { + return _handle_push(m->from, pop, &response, &t).then( + [this, &t] { + epoch_t epoch_frozen = pg.get_osdmap_epoch(); + return shard_services.get_store().do_transaction(coll, std::move(t)).then( + [this, epoch_frozen, last_complete = pg.get_info().last_complete] { + //TODO: this should be grouped with pg.on_local_recover somehow. + pg.get_recovery_handler()->_committed_pushed_object(epoch_frozen, last_complete); + }); + }); + }).then([this, m, &response]() mutable { + auto reply = make_message<MOSDPGPushReply>(); + reply->from = pg.get_pg_whoami(); + reply->set_priority(m->get_priority()); + reply->pgid = pg.get_info().pgid; + reply->map_epoch = m->map_epoch; + reply->min_epoch = m->min_epoch; + std::vector<PushReplyOp> replies = { std::move(response) }; + reply->replies.swap(replies); + return shard_services.send_to_osd(m->from.osd, + std::move(reply), pg.get_osdmap_epoch()); + }); + }); +} + +seastar::future<std::optional<PushOp>> +ReplicatedRecoveryBackend::_handle_push_reply( + pg_shard_t peer, + const PushReplyOp &op) +{ + const hobject_t& soid = op.soid; + logger().debug("{}, soid {}, from {}", __func__, soid, peer); + auto recovering_iter = recovering.find(soid); + if (recovering_iter == recovering.end() + || !recovering_iter->second.pushing.count(peer)) { + logger().debug("huh, i wasn't pushing {} to osd.{}", soid, peer); + return seastar::make_ready_future<std::optional<PushOp>>(); + } else { + auto& pi = recovering_iter->second.pushing[peer]; + bool error = pi.recovery_progress.error; + if (!pi.recovery_progress.data_complete && !error) { + return build_push_op(pi.recovery_info, pi.recovery_progress, + &pi.stat).then([&pi] (auto pop) { + pi.recovery_progress = pop.after_progress; + return seastar::make_ready_future<std::optional<PushOp>>(std::move(pop)); + }).handle_exception([recovering_iter, &pi, peer] (auto e) { + pi.recovery_progress.error = true; + recovering_iter->second.set_push_failed(peer, e); + return seastar::make_ready_future<std::optional<PushOp>>(); + }); + } + if (!error) { + pg.get_recovery_handler()->on_peer_recover(peer, soid, pi.recovery_info); + } + recovering_iter->second.set_pushed(peer); + return seastar::make_ready_future<std::optional<PushOp>>(); + } +} + +seastar::future<> ReplicatedRecoveryBackend::handle_push_reply( + Ref<MOSDPGPushReply> m) +{ + logger().debug("{}: {}", __func__, *m); + auto from = m->from; + auto& push_reply = m->replies[0]; //TODO: only one reply per message + + return _handle_push_reply(from, push_reply).then( + [this, from](std::optional<PushOp> push_op) { + if (push_op) { + auto msg = make_message<MOSDPGPush>(); + msg->from = pg.get_pg_whoami(); + msg->pgid = pg.get_pgid(); + msg->map_epoch = pg.get_osdmap_epoch(); + msg->min_epoch = 
pg.get_last_peering_reset(); + msg->set_priority(pg.get_recovery_op_priority()); + msg->pushes.push_back(std::move(*push_op)); + return shard_services.send_to_osd(from.osd, + std::move(msg), + pg.get_osdmap_epoch()); + } else { + return seastar::make_ready_future<>(); + } + }); +} + +std::pair<interval_set<uint64_t>, + bufferlist> +ReplicatedRecoveryBackend::trim_pushed_data( + const interval_set<uint64_t> ©_subset, + const interval_set<uint64_t> &intervals_received, + ceph::bufferlist data_received) +{ + logger().debug("{}", __func__); + // what i have is only a subset of what i want + if (intervals_received.subset_of(copy_subset)) { + return {intervals_received, data_received}; + } + // only collect the extents included by copy_subset and intervals_received + interval_set<uint64_t> intervals_usable; + bufferlist data_usable; + intervals_usable.intersection_of(copy_subset, intervals_received); + uint64_t have_off = 0; + for (auto [have_start, have_len] : intervals_received) { + interval_set<uint64_t> want; + want.insert(have_start, have_len); + want.intersection_of(copy_subset); + for (auto [want_start, want_len] : want) { + bufferlist sub; + uint64_t data_off = have_off + (want_start - have_start); + sub.substr_of(data_received, data_off, want_len); + data_usable.claim_append(sub); + } + have_off += have_len; + } + return {intervals_usable, data_usable}; +} + +seastar::future<> ReplicatedRecoveryBackend::submit_push_data( + const ObjectRecoveryInfo &recovery_info, + bool first, + bool complete, + bool clear_omap, + interval_set<uint64_t> data_zeros, + const interval_set<uint64_t> &intervals_included, + bufferlist data_included, + bufferlist omap_header, + const map<string, bufferlist> &attrs, + const map<string, bufferlist> &omap_entries, + ObjectStore::Transaction *t) +{ + logger().debug("{}", __func__); + hobject_t target_oid; + if (first && complete) { + target_oid = recovery_info.soid; + } else { + target_oid = get_temp_recovery_object(recovery_info.soid, + recovery_info.version); + if (first) { + logger().debug("{}: Adding oid {} in the temp collection", + __func__, target_oid); + add_temp_obj(target_oid); + } + } + + return [this, &recovery_info, first, complete, t, + &omap_header, &attrs, target_oid, clear_omap] { + if (first) { + if (!complete) { + t->remove(coll->get_cid(), ghobject_t(target_oid)); + t->touch(coll->get_cid(), ghobject_t(target_oid)); + bufferlist bv = attrs.at(OI_ATTR); + object_info_t oi(bv); + t->set_alloc_hint(coll->get_cid(), ghobject_t(target_oid), + oi.expected_object_size, + oi.expected_write_size, + oi.alloc_hint_flags); + } else { + if (!recovery_info.object_exist) { + t->remove(coll->get_cid(), ghobject_t(target_oid)); + t->touch(coll->get_cid(), ghobject_t(target_oid)); + bufferlist bv = attrs.at(OI_ATTR); + object_info_t oi(bv); + t->set_alloc_hint(coll->get_cid(), ghobject_t(target_oid), + oi.expected_object_size, + oi.expected_write_size, + oi.alloc_hint_flags); + } + //remove xattr and update later if overwrite on original object + t->rmattrs(coll->get_cid(), ghobject_t(target_oid)); + //if need update omap, clear the previous content first + if (clear_omap) + t->omap_clear(coll->get_cid(), ghobject_t(target_oid)); + } + + t->truncate(coll->get_cid(), ghobject_t(target_oid), recovery_info.size); + if (omap_header.length()) + t->omap_setheader(coll->get_cid(), ghobject_t(target_oid), omap_header); + + return store->stat(coll, ghobject_t(recovery_info.soid)).then( + [this, &recovery_info, complete, t, target_oid, + omap_header = 
std::move(omap_header)] (auto st) { + //TODO: pg num bytes counting + if (!complete) { + //clone overlap content in local object + if (recovery_info.object_exist) { + uint64_t local_size = std::min(recovery_info.size, (uint64_t)st.st_size); + interval_set<uint64_t> local_intervals_included, local_intervals_excluded; + if (local_size) { + local_intervals_included.insert(0, local_size); + local_intervals_excluded.intersection_of(local_intervals_included, recovery_info.copy_subset); + local_intervals_included.subtract(local_intervals_excluded); + } + for (auto [off, len] : local_intervals_included) { + logger().debug(" clone_range {} {}~{}", + recovery_info.soid, off, len); + t->clone_range(coll->get_cid(), ghobject_t(recovery_info.soid), + ghobject_t(target_oid), off, len, off); + } + } + } + return seastar::make_ready_future<>(); + }); + } + return seastar::make_ready_future<>(); + }().then([this, data_zeros=std::move(data_zeros), + &recovery_info, &intervals_included, t, target_oid, + &omap_entries, &attrs, data_included, complete, first]() mutable { + uint32_t fadvise_flags = CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL; + // Punch zeros for data, if fiemap indicates nothing but it is marked dirty + if (!data_zeros.empty()) { + data_zeros.intersection_of(recovery_info.copy_subset); + assert(intervals_included.subset_of(data_zeros)); + data_zeros.subtract(intervals_included); + + logger().debug("submit_push_data recovering object {} copy_subset: {} " + "intervals_included: {} data_zeros: {}", + recovery_info.soid, recovery_info.copy_subset, + intervals_included, data_zeros); + + for (auto [start, len] : data_zeros) { + t->zero(coll->get_cid(), ghobject_t(target_oid), start, len); + } + } + uint64_t off = 0; + for (auto [start, len] : intervals_included) { + bufferlist bit; + bit.substr_of(data_included, off, len); + t->write(coll->get_cid(), ghobject_t(target_oid), + start, len, bit, fadvise_flags); + off += len; + } + + if (!omap_entries.empty()) + t->omap_setkeys(coll->get_cid(), ghobject_t(target_oid), omap_entries); + if (!attrs.empty()) + t->setattrs(coll->get_cid(), ghobject_t(target_oid), attrs); + + if (complete) { + if (!first) { + logger().debug("submit_push_data: Removing oid {} from the temp collection", + target_oid); + clear_temp_obj(target_oid); + t->remove(coll->get_cid(), ghobject_t(recovery_info.soid)); + t->collection_move_rename(coll->get_cid(), ghobject_t(target_oid), + coll->get_cid(), ghobject_t(recovery_info.soid)); + } + submit_push_complete(recovery_info, t); + } + logger().debug("submit_push_data: done"); + return seastar::make_ready_future<>(); + }); +} + +void ReplicatedRecoveryBackend::submit_push_complete( + const ObjectRecoveryInfo &recovery_info, + ObjectStore::Transaction *t) +{ + for (const auto& [oid, extents] : recovery_info.clone_subset) { + for (const auto [off, len] : extents) { + logger().debug(" clone_range {} {}~{}", oid, off, len); + t->clone_range(coll->get_cid(), ghobject_t(oid), ghobject_t(recovery_info.soid), + off, len, off); + } + } +} + +seastar::future<> ReplicatedRecoveryBackend::handle_recovery_delete_reply( + Ref<MOSDPGRecoveryDeleteReply> m) +{ + auto& p = m->objects.front(); + hobject_t soid = p.first; + ObjectRecoveryInfo recovery_info; + recovery_info.version = p.second; + pg.get_recovery_handler()->on_peer_recover(m->from, soid, recovery_info); + get_recovering(soid).set_pushed(m->from); + return seastar::now(); +} + +seastar::future<> ReplicatedRecoveryBackend::handle_recovery_op(Ref<MOSDFastDispatchOp> m) +{ + switch 
(m->get_header().type) { + case MSG_OSD_PG_PULL: + return handle_pull(boost::static_pointer_cast<MOSDPGPull>(m)); + case MSG_OSD_PG_PUSH: + return handle_push(boost::static_pointer_cast<MOSDPGPush>(m)); + case MSG_OSD_PG_PUSH_REPLY: + return handle_push_reply( + boost::static_pointer_cast<MOSDPGPushReply>(m)); + case MSG_OSD_PG_RECOVERY_DELETE: + return handle_recovery_delete( + boost::static_pointer_cast<MOSDPGRecoveryDelete>(m)); + case MSG_OSD_PG_RECOVERY_DELETE_REPLY: + return handle_recovery_delete_reply( + boost::static_pointer_cast<MOSDPGRecoveryDeleteReply>(m)); + default: + // delegate to parent class for handling backend-agnostic recovery ops. + return RecoveryBackend::handle_recovery_op(std::move(m)); + } +} + diff --git a/src/crimson/osd/replicated_recovery_backend.h b/src/crimson/osd/replicated_recovery_backend.h new file mode 100644 index 000000000..d99538a75 --- /dev/null +++ b/src/crimson/osd/replicated_recovery_backend.h @@ -0,0 +1,151 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "crimson/osd/recovery_backend.h" + +#include "messages/MOSDPGPull.h" +#include "messages/MOSDPGPush.h" +#include "messages/MOSDPGPushReply.h" +#include "messages/MOSDPGRecoveryDelete.h" +#include "messages/MOSDPGRecoveryDeleteReply.h" +#include "os/ObjectStore.h" + +class ReplicatedRecoveryBackend : public RecoveryBackend { +public: + ReplicatedRecoveryBackend(crimson::osd::PG& pg, + crimson::osd::ShardServices& shard_services, + crimson::os::CollectionRef coll, + PGBackend* backend) + : RecoveryBackend(pg, shard_services, coll, backend) {} + seastar::future<> handle_recovery_op( + Ref<MOSDFastDispatchOp> m) final; + + seastar::future<> recover_object( + const hobject_t& soid, + eversion_t need) final; + seastar::future<> recover_delete( + const hobject_t& soid, + eversion_t need) final; + seastar::future<> push_delete( + const hobject_t& soid, + eversion_t need) final; +protected: + seastar::future<> handle_pull( + Ref<MOSDPGPull> m); + seastar::future<> handle_pull_response( + Ref<MOSDPGPush> m); + seastar::future<> handle_push( + Ref<MOSDPGPush> m); + seastar::future<> handle_push_reply( + Ref<MOSDPGPushReply> m); + seastar::future<> handle_recovery_delete( + Ref<MOSDPGRecoveryDelete> m); + seastar::future<> handle_recovery_delete_reply( + Ref<MOSDPGRecoveryDeleteReply> m); + seastar::future<PushOp> prep_push( + const hobject_t& soid, + eversion_t need, + pg_shard_t pg_shard); + void prepare_pull( + PullOp& po, + PullInfo& pi, + const hobject_t& soid, + eversion_t need); + std::vector<pg_shard_t> get_shards_to_push( + const hobject_t& soid) const; + seastar::future<PushOp> build_push_op( + const ObjectRecoveryInfo& recovery_info, + const ObjectRecoveryProgress& progress, + object_stat_sum_t* stat); + /// @returns true if this push op is the last push op for + /// recovery @c pop.soid + seastar::future<bool> _handle_pull_response( + pg_shard_t from, + const PushOp& pop, + PullOp* response, + ceph::os::Transaction* t); + std::pair<interval_set<uint64_t>, ceph::bufferlist> trim_pushed_data( + const interval_set<uint64_t> ©_subset, + const interval_set<uint64_t> &intervals_received, + ceph::bufferlist data_received); + seastar::future<> submit_push_data( + const ObjectRecoveryInfo &recovery_info, + bool first, + bool complete, + bool clear_omap, + interval_set<uint64_t> data_zeros, + const interval_set<uint64_t> &intervals_included, + ceph::bufferlist data_included, + ceph::bufferlist omap_header, + const 
std::map<string, bufferlist> &attrs, + const std::map<string, bufferlist> &omap_entries, + ceph::os::Transaction *t); + void submit_push_complete( + const ObjectRecoveryInfo &recovery_info, + ObjectStore::Transaction *t); + seastar::future<> _handle_push( + pg_shard_t from, + const PushOp &pop, + PushReplyOp *response, + ceph::os::Transaction *t); + seastar::future<std::optional<PushOp>> _handle_push_reply( + pg_shard_t peer, + const PushReplyOp &op); + seastar::future<> on_local_recover_persist( + const hobject_t& soid, + const ObjectRecoveryInfo& _recovery_info, + bool is_delete, + epoch_t epoch_to_freeze); + seastar::future<> local_recover_delete( + const hobject_t& soid, + eversion_t need, + epoch_t epoch_frozen); + seastar::future<> on_stop() final { + return seastar::now(); + } +private: + /// pull missing object from peer + seastar::future<> maybe_pull_missing_obj( + const hobject_t& soid, + eversion_t need); + + /// load object context for recovery if it is not ready yet + using load_obc_ertr = crimson::errorator< + crimson::ct_error::object_corrupted>; + + seastar::future<> maybe_push_shards( + const hobject_t& soid, + eversion_t need); + + /// read the data attached to given object. the size of them is supposed to + /// be relatively small. + /// + /// @return @c oi.version + seastar::future<eversion_t> read_metadata_for_push_op( + const hobject_t& oid, + const ObjectRecoveryProgress& progress, + ObjectRecoveryProgress& new_progress, + eversion_t ver, + PushOp* push_op); + /// read the remaining extents of object to be recovered and fill push_op + /// with them + /// + /// @param oid object being recovered + /// @param copy_subset extents we want + /// @param offset the offset in object from where we should read + /// @return the new offset + seastar::future<uint64_t> read_object_for_push_op( + const hobject_t& oid, + const interval_set<uint64_t>& copy_subset, + uint64_t offset, + uint64_t max_len, + PushOp* push_op); + seastar::future<> read_omap_for_push_op( + const hobject_t& oid, + const ObjectRecoveryProgress& progress, + ObjectRecoveryProgress& new_progress, + uint64_t* max_len, + PushOp* push_op); +}; diff --git a/src/crimson/osd/scheduler/mclock_scheduler.cc b/src/crimson/osd/scheduler/mclock_scheduler.cc new file mode 100644 index 000000000..195ea8dd8 --- /dev/null +++ b/src/crimson/osd/scheduler/mclock_scheduler.cc @@ -0,0 +1,165 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 Red Hat Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#include <memory> +#include <functional> + +#include "crimson/osd/scheduler/mclock_scheduler.h" +#include "common/dout.h" + +namespace dmc = crimson::dmclock; +using namespace std::placeholders; + +#define dout_context cct +#define dout_subsys ceph_subsys_osd +#undef dout_prefix +#define dout_prefix *_dout + + +namespace crimson::osd::scheduler { + +mClockScheduler::mClockScheduler(ConfigProxy &conf) : + scheduler( + std::bind(&mClockScheduler::ClientRegistry::get_info, + &client_registry, + _1), + dmc::AtLimit::Allow, + conf.get_val<double>("osd_mclock_scheduler_anticipation_timeout")) +{ + conf.add_observer(this); + client_registry.update_from_config(conf); +} + +void mClockScheduler::ClientRegistry::update_from_config(const ConfigProxy &conf) +{ + default_external_client_info.update( + conf.get_val<uint64_t>("osd_mclock_scheduler_client_res"), + conf.get_val<uint64_t>("osd_mclock_scheduler_client_wgt"), + conf.get_val<uint64_t>("osd_mclock_scheduler_client_lim")); + + internal_client_infos[ + static_cast<size_t>(scheduler_class_t::background_recovery)].update( + conf.get_val<uint64_t>("osd_mclock_scheduler_background_recovery_res"), + conf.get_val<uint64_t>("osd_mclock_scheduler_background_recovery_wgt"), + conf.get_val<uint64_t>("osd_mclock_scheduler_background_recovery_lim")); + + internal_client_infos[ + static_cast<size_t>(scheduler_class_t::background_best_effort)].update( + conf.get_val<uint64_t>("osd_mclock_scheduler_background_best_effort_res"), + conf.get_val<uint64_t>("osd_mclock_scheduler_background_best_effort_wgt"), + conf.get_val<uint64_t>("osd_mclock_scheduler_background_best_effort_lim")); +} + +const dmc::ClientInfo *mClockScheduler::ClientRegistry::get_external_client( + const client_profile_id_t &client) const +{ + auto ret = external_client_infos.find(client); + if (ret == external_client_infos.end()) + return &default_external_client_info; + else + return &(ret->second); +} + +const dmc::ClientInfo *mClockScheduler::ClientRegistry::get_info( + const scheduler_id_t &id) const { + switch (id.class_id) { + case scheduler_class_t::immediate: + ceph_assert(0 == "Cannot schedule immediate"); + return (dmc::ClientInfo*)nullptr; + case scheduler_class_t::repop: + case scheduler_class_t::client: + return get_external_client(id.client_profile_id); + default: + ceph_assert(static_cast<size_t>(id.class_id) < internal_client_infos.size()); + return &internal_client_infos[static_cast<size_t>(id.class_id)]; + } +} + +void mClockScheduler::dump(ceph::Formatter &f) const +{ +} + +void mClockScheduler::enqueue(item_t&& item) +{ + auto id = get_scheduler_id(item); + auto cost = item.params.cost; + + if (scheduler_class_t::immediate == item.params.klass) { + immediate.push_front(std::move(item)); + } else { + scheduler.add_request( + std::move(item), + id, + cost); + } +} + +void mClockScheduler::enqueue_front(item_t&& item) +{ + immediate.push_back(std::move(item)); + // TODO: item may not be immediate, update mclock machinery to permit + // putting the item back in the queue +} + +item_t mClockScheduler::dequeue() +{ + if (!immediate.empty()) { + auto ret = std::move(immediate.back()); + immediate.pop_back(); + return ret; + } else { + mclock_queue_t::PullReq result = scheduler.pull_request(); + if (result.is_future()) { + ceph_assert( + 0 == "Not implemented, user would have to be able to be woken up"); + return std::move(*(item_t*)nullptr); + } else if (result.is_none()) { + ceph_assert( + 0 == "Impossible, must have checked empty() first"); + return 
std::move(*(item_t*)nullptr); + } else { + ceph_assert(result.is_retn()); + + auto &retn = result.get_retn(); + return std::move(*retn.request); + } + } +} + +const char** mClockScheduler::get_tracked_conf_keys() const +{ + static const char* KEYS[] = { + "osd_mclock_scheduler_client_res", + "osd_mclock_scheduler_client_wgt", + "osd_mclock_scheduler_client_lim", + "osd_mclock_scheduler_background_recovery_res", + "osd_mclock_scheduler_background_recovery_wgt", + "osd_mclock_scheduler_background_recovery_lim", + "osd_mclock_scheduler_background_best_effort_res", + "osd_mclock_scheduler_background_best_effort_wgt", + "osd_mclock_scheduler_background_best_effort_lim", + NULL + }; + return KEYS; +} + +void mClockScheduler::handle_conf_change( + const ConfigProxy& conf, + const std::set<std::string> &changed) +{ + client_registry.update_from_config(conf); +} + +} diff --git a/src/crimson/osd/scheduler/mclock_scheduler.h b/src/crimson/osd/scheduler/mclock_scheduler.h new file mode 100644 index 000000000..c3edbe729 --- /dev/null +++ b/src/crimson/osd/scheduler/mclock_scheduler.h @@ -0,0 +1,130 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 Red Hat Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#pragma once + +#include <ostream> +#include <map> +#include <vector> + +#include "boost/variant.hpp" + +#include "dmclock/src/dmclock_server.h" + +#include "crimson/osd/scheduler/scheduler.h" +#include "common/config.h" +#include "include/cmp.h" +#include "common/ceph_context.h" + + +namespace crimson::osd::scheduler { + +using client_id_t = uint64_t; +using profile_id_t = uint64_t; + +struct client_profile_id_t { + client_id_t client_id; + profile_id_t profile_id; +}; + +WRITE_EQ_OPERATORS_2(client_profile_id_t, client_id, profile_id) +WRITE_CMP_OPERATORS_2(client_profile_id_t, client_id, profile_id) + + +struct scheduler_id_t { + scheduler_class_t class_id; + client_profile_id_t client_profile_id; +}; + +WRITE_EQ_OPERATORS_2(scheduler_id_t, class_id, client_profile_id) +WRITE_CMP_OPERATORS_2(scheduler_id_t, class_id, client_profile_id) + +/** + * Scheduler implementation based on mclock. 
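+ * + * The reservation/weight/limit for each dmclock-managed class comes from + * the matching osd_mclock_scheduler_* options (see + * ClientRegistry::update_from_config() below); items in the immediate + * class bypass dmclock and are dispatched first.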
+ * + * TODO: explain configs + */ +class mClockScheduler : public Scheduler, md_config_obs_t { + + class ClientRegistry { + std::array< + crimson::dmclock::ClientInfo, + static_cast<size_t>(scheduler_class_t::client) + > internal_client_infos = { + // Placeholder, gets replaced with configured values + crimson::dmclock::ClientInfo(1, 1, 1), + crimson::dmclock::ClientInfo(1, 1, 1) + }; + + crimson::dmclock::ClientInfo default_external_client_info = {1, 1, 1}; + std::map<client_profile_id_t, + crimson::dmclock::ClientInfo> external_client_infos; + const crimson::dmclock::ClientInfo *get_external_client( + const client_profile_id_t &client) const; + public: + void update_from_config(const ConfigProxy &conf); + const crimson::dmclock::ClientInfo *get_info( + const scheduler_id_t &id) const; + } client_registry; + + using mclock_queue_t = crimson::dmclock::PullPriorityQueue< + scheduler_id_t, + item_t, + true, + true, + 2>; + mclock_queue_t scheduler; + std::list<item_t> immediate; + + static scheduler_id_t get_scheduler_id(const item_t &item) { + return scheduler_id_t{ + item.params.klass, + client_profile_id_t{ + item.params.owner, + 0 + } + }; + } + +public: + mClockScheduler(ConfigProxy &conf); + + // Enqueue op in the back of the regular queue + void enqueue(item_t &&item) final; + + // Enqueue the op in the front of the regular queue + void enqueue_front(item_t &&item) final; + + // Return an op to be dispatched + item_t dequeue() final; + + // Returns true if the queue is empty + bool empty() const final { + return immediate.empty() && scheduler.empty(); + } + + // Formatted output of the queue + void dump(ceph::Formatter &f) const final; + + void print(std::ostream &ostream) const final { + ostream << "mClockScheduler"; + } + + const char** get_tracked_conf_keys() const final; + void handle_conf_change(const ConfigProxy& conf, + const std::set<std::string> &changed) final; +}; + +} diff --git a/src/crimson/osd/scheduler/scheduler.cc b/src/crimson/osd/scheduler/scheduler.cc new file mode 100644 index 000000000..c85cb388e --- /dev/null +++ b/src/crimson/osd/scheduler/scheduler.cc @@ -0,0 +1,181 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2019 Red Hat Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include <ostream> + +#include <seastar/core/print.hh> + +#include "crimson/osd/scheduler/scheduler.h" +#include "crimson/osd/scheduler/mclock_scheduler.h" +#include "common/WeightedPriorityQueue.h" + +namespace crimson::osd::scheduler { + +std::ostream &operator<<(std::ostream &lhs, const scheduler_class_t &c) +{ + switch (c) { + case scheduler_class_t::background_best_effort: + return lhs << "background_best_effort"; + case scheduler_class_t::background_recovery: + return lhs << "background_recovery"; + case scheduler_class_t::client: + return lhs << "client"; + case scheduler_class_t::repop: + return lhs << "repop"; + case scheduler_class_t::immediate: + return lhs << "immediate"; + default: + return lhs; + } +} + +/** + * Implements Scheduler in terms of OpQueue + * + * Templated on queue type to avoid dynamic dispatch, T should implement + * OpQueue<Scheduleritem_t, client_t>. 
This adapter is mainly responsible for + * the boilerplate priority cutoff/strict concept which is needed for + * OpQueue based implementations. + */ +template <typename T> +class ClassedOpQueueScheduler final : public Scheduler { + const scheduler_class_t cutoff; + T queue; + + using priority_t = uint64_t; + std::array< + priority_t, + static_cast<size_t>(scheduler_class_t::immediate) + > priority_map = { + // Placeholder, gets replaced with configured values + 0, 0, 0 + }; + + static scheduler_class_t get_io_prio_cut(ConfigProxy &conf) { + if (conf.get_val<std::string>("osd_op_queue_cut_off") == "debug_random") { + srand(time(NULL)); + return (rand() % 2 < 1) ? + scheduler_class_t::repop : scheduler_class_t::immediate; + } else if (conf.get_val<std::string>("osd_op_queue_cut_off") == "high") { + return scheduler_class_t::immediate; + } else { + return scheduler_class_t::repop; + } + } + + bool use_strict(scheduler_class_t kl) const { + return static_cast<uint8_t>(kl) >= static_cast<uint8_t>(cutoff); + } + + priority_t get_priority(scheduler_class_t kl) const { + ceph_assert(static_cast<size_t>(kl) < + static_cast<size_t>(scheduler_class_t::immediate)); + return priority_map[static_cast<size_t>(kl)]; + } + +public: + template <typename... Args> + ClassedOpQueueScheduler(ConfigProxy &conf, Args&&... args) : + cutoff(get_io_prio_cut(conf)), + queue(std::forward<Args>(args)...) + { + priority_map[ + static_cast<size_t>(scheduler_class_t::background_best_effort) + ] = conf.get_val<uint64_t>("osd_scrub_priority"); + priority_map[ + static_cast<size_t>(scheduler_class_t::background_recovery) + ] = conf.get_val<uint64_t>("osd_recovery_op_priority"); + priority_map[ + static_cast<size_t>(scheduler_class_t::client) + ] = conf.get_val<uint64_t>("osd_client_op_priority"); + priority_map[ + static_cast<size_t>(scheduler_class_t::repop) + ] = conf.get_val<uint64_t>("osd_client_op_priority"); + } + + void enqueue(item_t &&item) final { + if (use_strict(item.params.klass)) + queue.enqueue_strict( + item.params.owner, get_priority(item.params.klass), std::move(item)); + else + queue.enqueue( + item.params.owner, get_priority(item.params.klass), + item.params.cost, std::move(item)); + } + + void enqueue_front(item_t &&item) final { + if (use_strict(item.params.klass)) + queue.enqueue_strict_front( + item.params.owner, get_priority(item.params.klass), std::move(item)); + else + queue.enqueue_front( + item.params.owner, get_priority(item.params.klass), + item.params.cost, std::move(item)); + } + + bool empty() const final { + return queue.empty(); + } + + item_t dequeue() final { + return queue.dequeue(); + } + + void dump(ceph::Formatter &f) const final { + return queue.dump(&f); + } + + void print(std::ostream &out) const final { + out << "ClassedOpQueueScheduler(queue="; + queue.print(out); + out << ", cutoff=" << cutoff << ")"; + } + + ~ClassedOpQueueScheduler() final {}; +}; + +SchedulerRef make_scheduler(ConfigProxy &conf) +{ + const std::string _type = conf.get_val<std::string>("osd_op_queue"); + const std::string *type = &_type; + if (*type == "debug_random") { + static const std::string index_lookup[] = { "mclock_scheduler", + "wpq" }; + srand(time(NULL)); + unsigned which = rand() % (sizeof(index_lookup) / sizeof(index_lookup[0])); + type = &index_lookup[which]; + } + + if (*type == "wpq" ) { + // default is 'wpq' + return std::make_unique< + ClassedOpQueueScheduler<WeightedPriorityQueue<item_t, client_t>>>( + conf, + conf.get_val<uint64_t>("osd_op_pq_max_tokens_per_priority"), + 
conf->osd_op_pq_min_cost + ); + } else if (*type == "mclock_scheduler") { + return std::make_unique<mClockScheduler>(conf); + } else { + ceph_assert("Invalid choice of wq" == 0); + return std::unique_ptr<mClockScheduler>(); + } +} + +std::ostream &operator<<(std::ostream &lhs, const Scheduler &rhs) { + rhs.print(lhs); + return lhs; +} + +} diff --git a/src/crimson/osd/scheduler/scheduler.h b/src/crimson/osd/scheduler/scheduler.h new file mode 100644 index 000000000..a014991ab --- /dev/null +++ b/src/crimson/osd/scheduler/scheduler.h @@ -0,0 +1,82 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2019 Red Hat Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#pragma once + +#include <seastar/core/future.hh> +#include <ostream> + +#include "crimson/common/config_proxy.h" + +namespace crimson::osd::scheduler { + +enum class scheduler_class_t : uint8_t { + background_best_effort = 0, + background_recovery, + client, + repop, + immediate, +}; + +std::ostream &operator<<(std::ostream &, const scheduler_class_t &); + +using client_t = uint64_t; +using cost_t = uint64_t; + +struct params_t { + cost_t cost = 1; + client_t owner; + scheduler_class_t klass; +}; + +struct item_t { + params_t params; + seastar::promise<> wake; +}; + +/** + * Base interface for classes responsible for choosing + * op processing order in the OSD. + */ +class Scheduler { +public: + // Enqueue op for scheduling + virtual void enqueue(item_t &&item) = 0; + + // Enqueue op for processing as though it were enqueued prior + // to other items already scheduled. 
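+ // (an implementation may only approximate this; see the TODO in + // mClockScheduler::enqueue_front)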
+ virtual void enqueue_front(item_t &&item) = 0; + + // Returns true iff there are no ops scheduled + virtual bool empty() const = 0; + + // Return next op to be processed + virtual item_t dequeue() = 0; + + // Dump formatted representation for the queue + virtual void dump(ceph::Formatter &f) const = 0; + + // Print human readable brief description with relevant parameters + virtual void print(std::ostream &out) const = 0; + + // Destructor + virtual ~Scheduler() {}; +}; + +std::ostream &operator<<(std::ostream &lhs, const Scheduler &); +using SchedulerRef = std::unique_ptr<Scheduler>; + +SchedulerRef make_scheduler(ConfigProxy &); + +} diff --git a/src/crimson/osd/shard_services.cc b/src/crimson/osd/shard_services.cc new file mode 100644 index 000000000..8c2cfc415 --- /dev/null +++ b/src/crimson/osd/shard_services.cc @@ -0,0 +1,311 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "crimson/osd/shard_services.h" + +#include "messages/MOSDAlive.h" + +#include "osd/osd_perf_counters.h" +#include "osd/PeeringState.h" +#include "crimson/common/config_proxy.h" +#include "crimson/mgr/client.h" +#include "crimson/mon/MonClient.h" +#include "crimson/net/Messenger.h" +#include "crimson/net/Connection.h" +#include "crimson/os/cyanstore/cyan_store.h" +#include "crimson/osd/osdmap_service.h" +#include "messages/MOSDPGTemp.h" +#include "messages/MOSDPGCreated.h" +#include "messages/MOSDPGNotify.h" +#include "messages/MOSDPGInfo.h" +#include "messages/MOSDPGQuery.h" + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_osd); + } +} + +namespace crimson::osd { + +ShardServices::ShardServices( + OSDMapService &osdmap_service, + const int whoami, + crimson::net::Messenger &cluster_msgr, + crimson::net::Messenger &public_msgr, + crimson::mon::Client &monc, + crimson::mgr::Client &mgrc, + crimson::os::FuturizedStore &store) + : osdmap_service(osdmap_service), + whoami(whoami), + cluster_msgr(cluster_msgr), + public_msgr(public_msgr), + monc(monc), + mgrc(mgrc), + store(store), + throttler(crimson::common::local_conf()), + obc_registry(crimson::common::local_conf()), + local_reserver( + &cct, + &finisher, + crimson::common::local_conf()->osd_max_backfills, + crimson::common::local_conf()->osd_min_recovery_priority), + remote_reserver( + &cct, + &finisher, + crimson::common::local_conf()->osd_max_backfills, + crimson::common::local_conf()->osd_min_recovery_priority) +{ + perf = build_osd_logger(&cct); + cct.get_perfcounters_collection()->add(perf); + + recoverystate_perf = build_recoverystate_perf(&cct); + cct.get_perfcounters_collection()->add(recoverystate_perf); + + crimson::common::local_conf().add_observer(this); +} + +const char** ShardServices::get_tracked_conf_keys() const +{ + static const char* KEYS[] = { + "osd_max_backfills", + "osd_min_recovery_priority", + nullptr + }; + return KEYS; +} + +void ShardServices::handle_conf_change(const ConfigProxy& conf, + const std::set <std::string> &changed) +{ + if (changed.count("osd_max_backfills")) { + local_reserver.set_max(conf->osd_max_backfills); + remote_reserver.set_max(conf->osd_max_backfills); + } + if (changed.count("osd_min_recovery_priority")) { + local_reserver.set_min_priority(conf->osd_min_recovery_priority); + remote_reserver.set_min_priority(conf->osd_min_recovery_priority); + } +} + +seastar::future<> ShardServices::send_to_osd( + int peer, Ref<Message> m, epoch_t from_epoch) { + if (osdmap->is_down(peer)) { + logger().info("{}: osd.{} is_down", 
__func__, peer); + return seastar::now(); + } else if (osdmap->get_info(peer).up_from > from_epoch) { + logger().info("{}: osd.{} {} > {}", __func__, peer, + osdmap->get_info(peer).up_from, from_epoch); + return seastar::now(); + } else { + auto conn = cluster_msgr.connect( + osdmap->get_cluster_addrs(peer).front(), CEPH_ENTITY_TYPE_OSD); + return conn->send(m); + } +} + +seastar::future<> ShardServices::dispatch_context_transaction( + crimson::os::CollectionRef col, PeeringCtx &ctx) { + auto ret = store.do_transaction( + col, + std::move(ctx.transaction)); + ctx.reset_transaction(); + return ret; +} + +seastar::future<> ShardServices::dispatch_context_messages( + BufferedRecoveryMessages &&ctx) +{ + auto ret = seastar::parallel_for_each(std::move(ctx.message_map), + [this](auto& osd_messages) { + auto& [peer, messages] = osd_messages; + logger().debug("dispatch_context_messages sending messages to {}", peer); + return seastar::parallel_for_each( + std::move(messages), [=, peer=peer](auto& m) { + return send_to_osd(peer, m, osdmap->get_epoch()); + }); + }); + ctx.message_map.clear(); + return ret; +} + +seastar::future<> ShardServices::dispatch_context( + crimson::os::CollectionRef col, + PeeringCtx &&ctx) +{ + ceph_assert(col || ctx.transaction.empty()); + return seastar::when_all_succeed( + dispatch_context_messages( + BufferedRecoveryMessages{ceph_release_t::octopus, ctx}), + col ? dispatch_context_transaction(col, ctx) : seastar::now() + ).then_unpack([] { + return seastar::now(); + }); +} + +void ShardServices::queue_want_pg_temp(pg_t pgid, + const vector<int>& want, + bool forced) +{ + auto p = pg_temp_pending.find(pgid); + if (p == pg_temp_pending.end() || + p->second.acting != want || + forced) { + pg_temp_wanted[pgid] = {want, forced}; + } +} + +void ShardServices::remove_want_pg_temp(pg_t pgid) +{ + pg_temp_wanted.erase(pgid); + pg_temp_pending.erase(pgid); +} + +void ShardServices::requeue_pg_temp() +{ + unsigned old_wanted = pg_temp_wanted.size(); + unsigned old_pending = pg_temp_pending.size(); + pg_temp_wanted.merge(pg_temp_pending); + pg_temp_pending.clear(); + logger().debug( + "{}: {} + {} -> {}", + __func__ , + old_wanted, + old_pending, + pg_temp_wanted.size()); +} + +std::ostream& operator<<( + std::ostream& out, + const ShardServices::pg_temp_t& pg_temp) +{ + out << pg_temp.acting; + if (pg_temp.forced) { + out << " (forced)"; + } + return out; +} + +seastar::future<> ShardServices::send_pg_temp() +{ + if (pg_temp_wanted.empty()) + return seastar::now(); + logger().debug("{}: {}", __func__, pg_temp_wanted); + boost::intrusive_ptr<MOSDPGTemp> ms[2] = {nullptr, nullptr}; + for (auto& [pgid, pg_temp] : pg_temp_wanted) { + auto& m = ms[pg_temp.forced]; + if (!m) { + m = make_message<MOSDPGTemp>(osdmap->get_epoch()); + m->forced = pg_temp.forced; + } + m->pg_temp.emplace(pgid, pg_temp.acting); + } + pg_temp_pending.merge(pg_temp_wanted); + pg_temp_wanted.clear(); + return seastar::parallel_for_each(std::begin(ms), std::end(ms), + [this](auto m) { + if (m) { + return monc.send_message(m); + } else { + return seastar::now(); + } + }); +} + +void ShardServices::update_map(cached_map_t new_osdmap) +{ + osdmap = std::move(new_osdmap); +} + +ShardServices::cached_map_t &ShardServices::get_osdmap() +{ + return osdmap; +} + +seastar::future<> ShardServices::send_pg_created(pg_t pgid) +{ + logger().debug(__func__); + auto o = get_osdmap(); + ceph_assert(o->require_osd_release >= ceph_release_t::luminous); + pg_created.insert(pgid); + return 
monc.send_message(make_message<MOSDPGCreated>(pgid)); +} + +seastar::future<> ShardServices::send_pg_created() +{ + logger().debug(__func__); + auto o = get_osdmap(); + ceph_assert(o->require_osd_release >= ceph_release_t::luminous); + return seastar::parallel_for_each(pg_created, + [this](auto &pgid) { + return monc.send_message(make_message<MOSDPGCreated>(pgid)); + }); +} + +void ShardServices::prune_pg_created() +{ + logger().debug(__func__); + auto o = get_osdmap(); + auto i = pg_created.begin(); + while (i != pg_created.end()) { + auto p = o->get_pg_pool(i->pool()); + if (!p || !p->has_flag(pg_pool_t::FLAG_CREATING)) { + logger().debug("{} pruning {}", __func__, *i); + i = pg_created.erase(i); + } else { + logger().debug("{} keeping {}", __func__, *i); + ++i; + } + } +} + +seastar::future<> ShardServices::osdmap_subscribe(version_t epoch, bool force_request) +{ + logger().info("{}({})", __func__, epoch); + if (monc.sub_want_increment("osdmap", epoch, CEPH_SUBSCRIBE_ONETIME) || + force_request) { + return monc.renew_subs(); + } else { + return seastar::now(); + } +} + +HeartbeatStampsRef ShardServices::get_hb_stamps(int peer) +{ + auto [stamps, added] = heartbeat_stamps.try_emplace(peer); + if (added) { + stamps->second = ceph::make_ref<HeartbeatStamps>(peer); + } + return stamps->second; +} + +seastar::future<> ShardServices::send_alive(const epoch_t want) +{ + logger().info( + "{} want={} up_thru_wanted={}", + __func__, + want, + up_thru_wanted); + + if (want > up_thru_wanted) { + up_thru_wanted = want; + } else { + logger().debug("{} want={} <= up_thru_wanted={}; skipping", + __func__, want, up_thru_wanted); + return seastar::now(); + } + if (!osdmap->exists(whoami)) { + logger().warn("{} DNE", __func__); + return seastar::now(); + } if (const epoch_t up_thru = osdmap->get_up_thru(whoami); + up_thru_wanted > up_thru) { + logger().debug("{} up_thru_wanted={} up_thru={}", __func__, want, up_thru); + return monc.send_message( + make_message<MOSDAlive>(osdmap->get_epoch(), want)); + } else { + logger().debug("{} {} <= {}", __func__, want, osdmap->get_up_thru(whoami)); + return seastar::now(); + } +} + +}; diff --git a/src/crimson/osd/shard_services.h b/src/crimson/osd/shard_services.h new file mode 100644 index 000000000..2957639c6 --- /dev/null +++ b/src/crimson/osd/shard_services.h @@ -0,0 +1,215 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <boost/intrusive_ptr.hpp> +#include <seastar/core/future.hh> + +#include "include/common_fwd.h" +#include "osd_operation.h" +#include "msg/MessageRef.h" +#include "crimson/common/exception.h" +#include "crimson/os/futurized_collection.h" +#include "osd/PeeringState.h" +#include "crimson/osd/osdmap_service.h" +#include "crimson/osd/object_context.h" +#include "common/AsyncReserver.h" + +namespace crimson::net { + class Messenger; +} + +namespace crimson::mgr { + class Client; +} + +namespace crimson::mon { + class Client; +} + +namespace crimson::os { + class FuturizedStore; +} + +class OSDMap; +class PeeringCtx; +class BufferedRecoveryMessages; + +namespace crimson::osd { + +/** + * Represents services available to each PG + */ +class ShardServices : public md_config_obs_t { + using cached_map_t = boost::local_shared_ptr<const OSDMap>; + OSDMapService &osdmap_service; + const int whoami; + crimson::net::Messenger &cluster_msgr; + crimson::net::Messenger &public_msgr; + crimson::mon::Client &monc; + crimson::mgr::Client &mgrc; + crimson::os::FuturizedStore &store; + 
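+ // cct is still needed by components shared with the classic OSD + // (the perf counters and the AsyncReserver instances below)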
+ crimson::common::CephContext cct; + + PerfCounters *perf = nullptr; + PerfCounters *recoverystate_perf = nullptr; + + const char** get_tracked_conf_keys() const final; + void handle_conf_change(const ConfigProxy& conf, + const std::set <std::string> &changed) final; +public: + ShardServices( + OSDMapService &osdmap_service, + const int whoami, + crimson::net::Messenger &cluster_msgr, + crimson::net::Messenger &public_msgr, + crimson::mon::Client &monc, + crimson::mgr::Client &mgrc, + crimson::os::FuturizedStore &store); + + seastar::future<> send_to_osd( + int peer, + MessageRef m, + epoch_t from_epoch); + + crimson::os::FuturizedStore &get_store() { + return store; + } + + crimson::common::CephContext *get_cct() { + return &cct; + } + + // OSDMapService + const OSDMapService &get_osdmap_service() const { + return osdmap_service; + } + + // Op Management + OperationRegistry registry; + OperationThrottler throttler; + + template <typename T, typename... Args> + auto start_operation(Args&&... args) { + if (__builtin_expect(stopping, false)) { + throw crimson::common::system_shutdown_exception(); + } + auto op = registry.create_operation<T>(std::forward<Args>(args)...); + return std::make_pair(op, op->start()); + } + + seastar::future<> stop() { + stopping = true; + return registry.stop(); + } + + // Loggers + PerfCounters &get_recoverystate_perf_logger() { + return *recoverystate_perf; + } + PerfCounters &get_perf_logger() { + return *perf; + } + + /// Dispatch and reset ctx transaction + seastar::future<> dispatch_context_transaction( + crimson::os::CollectionRef col, PeeringCtx &ctx); + + /// Dispatch and reset ctx messages + seastar::future<> dispatch_context_messages( + BufferedRecoveryMessages &&ctx); + + /// Dispatch ctx and dispose of context + seastar::future<> dispatch_context( + crimson::os::CollectionRef col, + PeeringCtx &&ctx); + + /// Dispatch ctx and dispose of ctx, transaction must be empty + seastar::future<> dispatch_context( + PeeringCtx &&ctx) { + return dispatch_context({}, std::move(ctx)); + } + + // PG Temp State +private: + // TODO: hook into map processing and some kind of heartbeat/peering + // message processing + struct pg_temp_t { + std::vector<int> acting; + bool forced = false; + }; + map<pg_t, pg_temp_t> pg_temp_wanted; + map<pg_t, pg_temp_t> pg_temp_pending; + friend std::ostream& operator<<(std::ostream&, const pg_temp_t&); +public: + void queue_want_pg_temp(pg_t pgid, const vector<int>& want, + bool forced = false); + void remove_want_pg_temp(pg_t pgid); + void requeue_pg_temp(); + seastar::future<> send_pg_temp(); + + // Shard-local OSDMap +private: + cached_map_t osdmap; +public: + void update_map(cached_map_t new_osdmap); + cached_map_t &get_osdmap(); + + // PG Created State +private: + set<pg_t> pg_created; +public: + seastar::future<> send_pg_created(pg_t pgid); + seastar::future<> send_pg_created(); + void prune_pg_created(); + + unsigned get_pg_num() const { + return num_pgs; + } + void inc_pg_num() { + ++num_pgs; + } + void dec_pg_num() { + --num_pgs; + } + + seastar::future<> osdmap_subscribe(version_t epoch, bool force_request); + + // Time state + ceph::mono_time startup_time = ceph::mono_clock::now(); + ceph::signedspan get_mnow() const { + return ceph::mono_clock::now() - startup_time; + } + HeartbeatStampsRef get_hb_stamps(int peer); + std::map<int, HeartbeatStampsRef> heartbeat_stamps; + + crimson::osd::ObjectContextRegistry obc_registry; + + // Async Reservers +private: + unsigned num_pgs = 0; + + struct DirectFinisher { + void 
queue(Context *c) { + c->complete(0); + } + } finisher; + // prevent creating new osd operations when system is shutting down, + // this is necessary because there are chances that a new operation + // is created, after the interruption of all ongoing operations, and + // creats and waits on a new and may-never-resolve future, in which + // case the shutdown may never succeed. + bool stopping = false; +public: + AsyncReserver<spg_t, DirectFinisher> local_reserver; + AsyncReserver<spg_t, DirectFinisher> remote_reserver; + +private: + epoch_t up_thru_wanted = 0; +public: + seastar::future<> send_alive(epoch_t want); +}; + +} diff --git a/src/crimson/osd/state.h b/src/crimson/osd/state.h new file mode 100644 index 000000000..ba48cd36f --- /dev/null +++ b/src/crimson/osd/state.h @@ -0,0 +1,79 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <string_view> +#include <ostream> + +class OSDMap; + +class OSDState { + + enum class State { + INITIALIZING, + PREBOOT, + BOOTING, + ACTIVE, + PRESTOP, + STOPPING, + WAITING_FOR_HEALTHY, + }; + + State state = State::INITIALIZING; + +public: + bool is_initializing() const { + return state == State::INITIALIZING; + } + bool is_preboot() const { + return state == State::PREBOOT; + } + bool is_booting() const { + return state == State::BOOTING; + } + bool is_active() const { + return state == State::ACTIVE; + } + bool is_prestop() const { + return state == State::PRESTOP; + } + bool is_stopping() const { + return state == State::STOPPING; + } + bool is_waiting_for_healthy() const { + return state == State::WAITING_FOR_HEALTHY; + } + void set_preboot() { + state = State::PREBOOT; + } + void set_booting() { + state = State::BOOTING; + } + void set_active() { + state = State::ACTIVE; + } + void set_prestop() { + state = State::PRESTOP; + } + void set_stopping() { + state = State::STOPPING; + } + std::string_view to_string() const { + switch (state) { + case State::INITIALIZING: return "initializing"; + case State::PREBOOT: return "preboot"; + case State::BOOTING: return "booting"; + case State::ACTIVE: return "active"; + case State::PRESTOP: return "prestop"; + case State::STOPPING: return "stopping"; + case State::WAITING_FOR_HEALTHY: return "waiting_for_healthy"; + default: return "???"; + } + } +}; + +inline std::ostream& +operator<<(std::ostream& os, const OSDState& s) { + return os << s.to_string(); +} diff --git a/src/crimson/osd/watch.cc b/src/crimson/osd/watch.cc new file mode 100644 index 000000000..a7a3311aa --- /dev/null +++ b/src/crimson/osd/watch.cc @@ -0,0 +1,169 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "crimson/osd/watch.h" +#include "messages/MWatchNotify.h" + + +namespace { + seastar::logger& logger() { + return crimson::get_logger(ceph_subsys_osd); + } +} + +namespace crimson::osd { + +bool Watch::NotifyCmp::operator()(NotifyRef lhs, NotifyRef rhs) const +{ + ceph_assert(lhs); + ceph_assert(rhs); + return lhs->get_id() < rhs->get_id(); +} + +seastar::future<> Watch::connect(crimson::net::ConnectionRef conn, bool) +{ + if (this->conn == conn) { + logger().debug("conn={} already connected", conn); + } + + this->conn = std::move(conn); + return seastar::now(); +} + +seastar::future<> Watch::send_notify_msg(NotifyRef notify) +{ + logger().info("{} for notify(id={})", __func__, notify->ninfo.notify_id); + return conn->send(make_message<MWatchNotify>( + winfo.cookie, + notify->user_version, + 
notify->ninfo.notify_id, + CEPH_WATCH_EVENT_NOTIFY, + notify->ninfo.bl, + notify->client_gid)); +} + +seastar::future<> Watch::start_notify(NotifyRef notify) +{ + logger().info("{} adding notify(id={})", __func__, notify->ninfo.notify_id); + auto [ it, emplaced ] = in_progress_notifies.emplace(std::move(notify)); + ceph_assert(emplaced); + ceph_assert(is_alive()); + return is_connected() ? send_notify_msg(*it) : seastar::now(); +} + +seastar::future<> Watch::notify_ack( + const uint64_t notify_id, + const ceph::bufferlist& reply_bl) +{ + logger().info("{}", __func__); + return seastar::do_for_each(in_progress_notifies, + [this_shared=shared_from_this(), &reply_bl] (auto notify) { + return notify->complete_watcher(this_shared, reply_bl); + } + ).then([this] { + in_progress_notifies.clear(); + return seastar::now(); + }); +} + +seastar::future<> Watch::send_disconnect_msg() +{ + if (!is_connected()) { + return seastar::now(); + } + ceph::bufferlist empty; + return conn->send(make_message<MWatchNotify>( + winfo.cookie, + 0, + 0, + CEPH_WATCH_EVENT_DISCONNECT, + empty)); +} + +void Watch::discard_state() +{ + ceph_assert(obc); + in_progress_notifies.clear(); +} + +seastar::future<> Watch::remove(const bool send_disconnect) +{ + logger().info("{}", __func__); + auto disconnected = send_disconnect ? send_disconnect_msg() + : seastar::now(); + return std::move(disconnected).then([this] { + return seastar::do_for_each(in_progress_notifies, + [this_shared=shared_from_this()] (auto notify) { + return notify->remove_watcher(this_shared); + }).then([this] { + discard_state(); + return seastar::now(); + }); + }); +} + +bool notify_reply_t::operator<(const notify_reply_t& rhs) const +{ + // comparing std::pairs to emphasize our legacy. ceph-osd stores + // notify_replies as std::multimap<std::pair<gid, cookie>, bl>. + // unfortunately, what seems to be an implementation detail, got + // exposed as part of our public API (the `reply_buffer` parameter + // of the `rados_notify` family). 
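+  // The resulting order is lexicographic: replies sort by watcher_gid first
+  // and by watcher_cookie second, which is the ordering the std::multiset of
+  // notify_reply_t in Notify relies on when the completion reply is encoded.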
+ const auto lhsp = std::make_pair(watcher_gid, watcher_cookie); + const auto rhsp = std::make_pair(rhs.watcher_gid, rhs.watcher_cookie); + return lhsp < rhsp; +} + +seastar::future<> Notify::remove_watcher(WatchRef watch) +{ + if (discarded || complete) { + return seastar::now(); + } + [[maybe_unused]] const auto num_removed = watchers.erase(watch); + assert(num_removed > 0); + return maybe_send_completion(); +} + + +seastar::future<> Notify::complete_watcher( + WatchRef watch, + const ceph::bufferlist& reply_bl) +{ + if (discarded || complete) { + return seastar::now(); + } + notify_replies.emplace(notify_reply_t{ + watch->get_watcher_gid(), + watch->get_cookie(), + reply_bl}); + return remove_watcher(std::move(watch)); +} + +seastar::future<> Notify::maybe_send_completion() +{ + logger().info("{} -- {} in progress watchers", __func__, watchers.size()); + if (watchers.empty()) { + // prepare reply + ceph::bufferlist bl; + encode(notify_replies, bl); + // FIXME: this is just a stub + std::list<std::pair<uint64_t,uint64_t>> missed; + encode(missed, bl); + + complete = true; + + ceph::bufferlist empty; + auto reply = make_message<MWatchNotify>( + ninfo.cookie, + user_version, + ninfo.notify_id, + CEPH_WATCH_EVENT_NOTIFY_COMPLETE, + empty, + client_gid); + reply->set_data(bl); + return conn->send(std::move(reply)); + } + return seastar::now(); +} + +} // namespace crimson::osd diff --git a/src/crimson/osd/watch.h b/src/crimson/osd/watch.h new file mode 100644 index 000000000..6049e16cf --- /dev/null +++ b/src/crimson/osd/watch.h @@ -0,0 +1,194 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <iterator> +#include <map> +#include <set> + +#include <seastar/core/shared_ptr.hh> + +#include "crimson/net/Connection.h" +#include "crimson/osd/object_context.h" +#include "include/denc.h" + +namespace crimson::osd { + +class Notify; +using NotifyRef = seastar::shared_ptr<Notify>; + +// NOTE: really need to have this public. Otherwise `shared_from_this()` +// will abort. According to cppreference.com: +// +// "The constructors of std::shared_ptr detect the presence +// of an unambiguous and accessible (ie. public inheritance +// is mandatory) (since C++17) enable_shared_from_this base". +// +// I expect the `seastar::shared_ptr` shares this behaviour. +class Watch : public seastar::enable_shared_from_this<Watch> { + // this is a private tag for the public constructor that turns it into + // de facto private one. The motivation behind the hack is make_shared + // used by create(). 
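+  // seastar::make_shared() needs an accessible constructor, so the
+  // constructor stays public while the tag type it requires stays private;
+  // only Watch::create() can name the tag, which keeps construction
+  // effectively restricted to the factory.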
+ struct private_ctag_t{}; + + struct NotifyCmp { + inline bool operator()(NotifyRef lhs, NotifyRef rhs) const; + }; + std::set<NotifyRef, NotifyCmp> in_progress_notifies; + crimson::net::ConnectionRef conn; + crimson::osd::ObjectContextRef obc; + + watch_info_t winfo; + entity_name_t entity_name; + + seastar::future<> start_notify(NotifyRef); + seastar::future<> send_notify_msg(NotifyRef); + seastar::future<> send_disconnect_msg(); + void discard_state(); + + friend Notify; + +public: + Watch(private_ctag_t, + crimson::osd::ObjectContextRef obc, + const watch_info_t& winfo, + const entity_name_t& entity_name) + : obc(std::move(obc)), + winfo(winfo), + entity_name(entity_name) { + } + + seastar::future<> connect(crimson::net::ConnectionRef, bool); + bool is_alive() const { + return true; + } + bool is_connected() const { + return static_cast<bool>(conn); + } + void got_ping(utime_t) { + // NOP + } + + seastar::future<> remove(bool send_disconnect); + + /// Call when notify_ack received on notify_id + seastar::future<> notify_ack( + uint64_t notify_id, ///< [in] id of acked notify + const ceph::bufferlist& reply_bl); ///< [in] notify reply buffer + + template <class... Args> + static seastar::shared_ptr<Watch> create(Args&&... args) { + return seastar::make_shared<Watch>(private_ctag_t{}, + std::forward<Args>(args)...); + }; + + uint64_t get_watcher_gid() const { + return entity_name.num(); + } + uint64_t get_cookie() const { + return winfo.cookie; + } +}; + +using WatchRef = seastar::shared_ptr<Watch>; + +struct notify_reply_t { + uint64_t watcher_gid; + uint64_t watcher_cookie; + ceph::bufferlist bl; + + bool operator<(const notify_reply_t& rhs) const; + DENC(notify_reply_t, v, p) { + DENC_START(1, 1, p); + denc(v.watcher_gid, p); + denc(v.watcher_cookie, p); + denc(v.bl, p); + DENC_FINISH(p); + } +}; + +class Notify { + std::set<WatchRef> watchers; + notify_info_t ninfo; + crimson::net::ConnectionRef conn; + uint64_t client_gid; + uint64_t user_version; + bool complete = false; + bool discarded = false; + + /// (gid,cookie) -> reply_bl for everyone who acked the notify + std::multiset<notify_reply_t> notify_replies; + + uint64_t get_id() const { return ninfo.notify_id; } + seastar::future<> maybe_send_completion(); + + template <class WatchIteratorT> + Notify(WatchIteratorT begin, + WatchIteratorT end, + crimson::net::ConnectionRef conn, + const notify_info_t& ninfo, + const uint64_t client_gid, + const uint64_t user_version); + // this is a private tag for the public constructor that turns it into + // de facto private one. The motivation behind the hack is make_shared + // used by create_n_propagate factory. + struct private_ctag_t{}; + + friend Watch; + +public: + template <class... Args> + Notify(private_ctag_t, Args&&... args) : Notify(std::forward<Args>(args)...) { + } + + template <class WatchIteratorT, class... Args> + static seastar::future<> create_n_propagate( + WatchIteratorT begin, + WatchIteratorT end, + Args&&... args); + + seastar::future<> remove_watcher(WatchRef watch); + seastar::future<> complete_watcher(WatchRef watch, + const ceph::bufferlist& reply_bl); +}; + + +template <class WatchIteratorT> +Notify::Notify(WatchIteratorT begin, + WatchIteratorT end, + crimson::net::ConnectionRef conn, + const notify_info_t& ninfo, + const uint64_t client_gid, + const uint64_t user_version) + : watchers(begin, end), + ninfo(ninfo), + conn(std::move(conn)), + client_gid(client_gid), + user_version(user_version) { +} + +template <class WatchIteratorT, class... 
Args> +seastar::future<> Notify::create_n_propagate( + WatchIteratorT begin, + WatchIteratorT end, + Args&&... args) +{ + static_assert( + std::is_same_v<typename std::iterator_traits<WatchIteratorT>::value_type, + crimson::osd::WatchRef>); + auto notify = seastar::make_shared<Notify>( + private_ctag_t{}, + begin, + end, + std::forward<Args>(args)...); + return seastar::do_for_each(begin, end, [=] (auto& watchref) { + return watchref->start_notify(notify); + }).then([notify = std::move(notify)] { + return notify->maybe_send_completion(); + }); +} + +} // namespace crimson::osd + +WRITE_CLASS_DENC(crimson::osd::notify_reply_t)
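
For reference, the ordering that notify_reply_t::operator< establishes, and that Notify's std::multiset of replies depends on, can be reduced to a standalone sketch. The snippet below is illustrative only and is not part of the diff: reply_t is a hypothetical stand-in for notify_reply_t, and std::string replaces ceph::bufferlist so the example compiles on its own.

// Standalone sketch (not part of the diff above): models the notify reply
// ordering. std::string stands in for ceph::bufferlist.
#include <cstdint>
#include <iostream>
#include <set>
#include <string>
#include <utility>

struct reply_t {
  uint64_t watcher_gid;
  uint64_t watcher_cookie;
  std::string bl;

  // Same scheme as notify_reply_t::operator<: compare (gid, cookie) pairs,
  // so replies sort by gid first, then by cookie; duplicates are allowed.
  bool operator<(const reply_t& rhs) const {
    return std::make_pair(watcher_gid, watcher_cookie) <
           std::make_pair(rhs.watcher_gid, rhs.watcher_cookie);
  }
};

int main() {
  std::multiset<reply_t> replies;
  replies.insert({42, 7, "second cookie"});
  replies.insert({42, 3, "first cookie"});
  replies.insert({17, 9, "lowest gid sorts first"});

  // Iteration visits replies grouped by gid and then cookie, mirroring the
  // order in which maybe_send_completion() encodes notify_replies into the
  // NOTIFY_COMPLETE payload.
  for (const auto& r : replies) {
    std::cout << r.watcher_gid << "/" << r.watcher_cookie
              << " -> " << r.bl << "\n";
  }
  return 0;
}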