// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
#pragma once

// Declarations for the scrubber's state machine: the event types, the
// ScrubMachine class and its top-level states. Everything is built on
// boost::statechart (aliased as 'sc' below) and boost::mpl (aliased as 'mpl').
//
// NOTE(review): in this copy of the file the first ten '#include' directives
// have no operand, and the statechart/mpl templates (sc::event,
// sc::state_machine, sc::state, sc::transition, sc::custom_reaction,
// mpl::list) appear without their template-argument lists. This looks like an
// extraction artifact (every '<...>' span is gone) - restore them from the
// upstream file before compiling. The code tokens are left untouched here.

#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include "common/version.h"
#include "include/Context.h"
#include "scrub_machine_lstnr.h"
#include "scrubber_common.h"

using namespace std::string_literals;

class PG;  // holding a pointer to that one - just for testing
class PgScrubber;

namespace Scrub {

namespace sc = ::boost::statechart;
namespace mpl = ::boost::mpl;

//
//  EVENTS
//

/// Called by an MEV-generated event's constructor, with the event's type name,
/// when that type's instance counter was 0 before the increment.
/// Declared here only; the definition is not part of this header.
void on_event_creation(std::string_view nm);

/// Called by an MEV-generated event's destructor, with the event's type name,
/// when the decrement brings that type's instance counter to 0.
/// Declared here only; the definition is not part of this header.
void on_event_discard(std::string_view nm);

/**
 * MEV(E): defines an event type named E, derived from sc::event.
 *
 * Each generated type carries a static counter ('actv') shared by all objects
 * of that type:
 *  - the default constructor calls on_event_creation() with the stringified
 *    name of E if the counter was 0 before being incremented;
 *  - the destructor calls on_event_discard() with the same name if the
 *    decrement brings the counter to 0.
 * print(std::ostream*) writes the event's name to the stream; print() returns
 * the name as a string_view.
 *
 * NOTE(review): only the default constructor increments 'actv'. The
 * implicitly-generated copy constructor does not, while every destruction
 * decrements it - if events are ever copied, the counter will not balance.
 * Confirm whether that is intended.
 * NOTE(review): 'actv' is a plain int and no synchronization is visible here -
 * confirm that events of one type are never created/destroyed concurrently.
 */
#define MEV(E)                                          \
  struct E : sc::event {                                \
    inline static int actv{0};                          \
    E()                                                 \
    {                                                   \
      if (!actv++)                                      \
        on_event_creation(#E);                          \
    }                                                   \
    ~E()                                                \
    {                                                   \
      if (!--actv)                                      \
        on_event_discard(#E);                           \
    }                                                   \
    void print(std::ostream* out) const { *out << #E; } \
    std::string_view print() const { return #E; }       \
  };

MEV(RemotesReserved)  ///< all replicas have granted our reserve request

MEV(ReservationFailure)  ///< a reservation request has failed

MEV(StartScrub)  ///< initiate a new scrubbing session (relevant if we are a Primary)

MEV(AfterRepairScrub)  ///< initiate a new scrubbing session. Only triggered at Recovery
                       ///< completion.

MEV(Unblocked)  ///< triggered when the PG unblocked an object that was marked for
                ///< scrubbing. Via the PGScrubUnblocked op

MEV(InternalSchedScrub)

MEV(SelectedChunkFree)  ///< a react() handler for this event is declared in NewChunk

MEV(ChunkIsBusy)

MEV(ActivePushesUpd)  ///< Update to active_pushes. 'active_pushes' represents recovery
                      ///< that is in-flight to the local ObjectStore

MEV(UpdatesApplied)  ///< (Primary only) all updates are committed

MEV(InternalAllUpdates)  ///< the internal counterpart of UpdatesApplied

MEV(GotReplicas)  ///< got a map from a replica

MEV(IntBmPreempted)  ///< internal - BuildMap preempted. Required, as detected within the
                     ///< ctor

MEV(InternalError)  ///< a react() handler for this event is declared in ActiveScrubbing

MEV(IntLocalMapDone)  ///< a react() handler for this event is declared in BuildMap

MEV(DigestUpdate)  ///< external. called upon success of a MODIFY op. See
                   ///< scrub_snapshot_metadata()

MEV(MapsCompared)  ///< (Crimson) maps_compare_n_cleanup() transactions are done

MEV(StartReplica)  ///< initiating replica scrub.

MEV(StartReplicaNoWait)  ///< 'start replica' when there are no pending updates

MEV(SchedReplica)  ///< a react() handler for this event is declared in ActiveReplica

MEV(ReplicaPushesUpd)  ///< Update to active_pushes. 'active_pushes' represents recovery
                       ///< that is in-flight to the local ObjectStore

MEV(FullReset)  ///< guarantee that the FSM is in the quiescent state (i.e. NotActive)

MEV(NextChunk)  ///< finished handling this chunk. Go get the next one

MEV(ScrubFinished)  ///< all chunks handled

// forward declarations of the top-level states

struct NotActive;           ///< the quiescent state. No active scrubbing.
struct ReservingReplicas;   ///< securing scrub resources from replicas' OSDs
struct ActiveScrubbing;     ///< the active state for a Primary. A sub-machine.
struct ReplicaWaitUpdates;  ///< an active state for a replica. Waiting for all active
                            ///< operations to finish.
struct ActiveReplica;       ///< an active state for a replica.

/**
 * The scrubber's state machine (derived from sc::state_machine).
 *
 * Constructed from a PG pointer and a ScrubMachineListener pointer.
 * PgScrubber is declared a friend.
 */
class ScrubMachine : public sc::state_machine {
 public:
  friend class PgScrubber;

 public:
  explicit ScrubMachine(PG* pg, ScrubMachineListener* pg_scrub);
  ~ScrubMachine();

  spg_t m_pg_id;  // presumably the id of the PG passed to the ctor - TODO confirm
  ScrubMachineListener* m_scrbr;  // presumably the 'pg_scrub' ctor argument - TODO confirm

  /// Takes and returns an output stream.
  /// NOTE(review): presumably writes a logging prefix to 'out' - confirm in the .cc
  std::ostream& gen_prefix(std::ostream& out) const;

  /// Returns a string.
  /// NOTE(review): by its name, a description of the currently active state(s) - confirm
  std::string current_states_desc() const;

  /// NOTE(review): by its name, asserts that the machine is in the quiescent
  /// (NotActive) state - confirm in the .cc
  void assert_not_active() const;

  /// NOTE(review): presumably true while in ReservingReplicas - confirm in the .cc
  [[nodiscard]] bool is_reserving() const;

  /// NOTE(review): exact criteria are not visible in this header - see the .cc
  [[nodiscard]] bool is_accepting_updates() const;
};

/**
 *  The Scrubber's base (quiescent) state.
 *  Scrubbing is triggered by one of the following events:
 *  - (standard scenario for a Primary): 'StartScrub'. Initiates the OSDs resources
 *    reservation process. Will be issued by PG::scrub(), following a
 *    queued "PGScrub" op.
 *  - a special end-of-recovery Primary scrub event ('AfterRepairScrub') that is
 *    not required to reserve resources.
 *  - (for a replica) 'StartReplica' or 'StartReplicaNoWait', triggered by an incoming
 *    MOSDRepScrub message.
 *
 *  note (20.8.21): originally, AfterRepairScrub was triggering a scrub without waiting
 *   for replica resources to be acquired. But once replicas started using the
 *   resource-request to identify and tag the scrub session, this bypass cannot be
 *   supported anymore.
 */
struct NotActive : sc::state {
  explicit NotActive(my_context ctx);

  // NOTE(review): the event/target arguments of the entries below are missing
  // in this copy of the file.
  using reactions = mpl::list,
                              // a scrubbing that was initiated at recovery completion,
                              // and requires no resource reservations:
                              sc::transition,
                              sc::transition,
                              sc::transition>;
};

/**
 * The state in which scrub resources are being secured from the replicas' OSDs
 * (see the forward declaration).
 *
 * Custom react() handlers are declared for FullReset and ReservationFailure.
 */
struct ReservingReplicas : sc::state {

  explicit ReservingReplicas(my_context ctx);

  // NOTE(review): the template arguments of the entries below are missing in
  // this copy of the file.
  using reactions = mpl::list,
                              // all replicas granted our resources request
                              sc::transition,
                              sc::custom_reaction>;

  sc::result react(const FullReset&);

  /// at least one replica denied us the scrub resources we've requested
  sc::result react(const ReservationFailure&);
};


// the "active" sub-states

struct RangeBlocked;  ///< the objects range is blocked
struct PendingTimer;  ///< either delaying the scrub by some time and requeuing, or just
                      ///< requeue
struct NewChunk;      ///< select a chunk to scrub, and verify its availability
struct WaitPushes;
struct WaitLastUpdate;
struct BuildMap;
struct DrainReplMaps;  ///< a problem during BuildMap. Wait for all replicas to report,
                       ///< then restart.
struct WaitReplicas;  ///< wait for all replicas to report
struct WaitDigestUpdate;

// NOTE(review): in this copy of the file the statechart/mpl templates used
// below (sc::state, sc::transition, sc::custom_reaction, sc::in_state_reaction,
// sc::deferral, mpl::list) appear without their template-argument lists. This
// looks like an extraction artifact - restore them from the upstream file
// before compiling. The code tokens are left untouched here.

/**
 * The active state for a Primary (a sub-machine, per its forward declaration).
 *
 * Custom react() handlers are declared for FullReset and InternalError.
 */
struct ActiveScrubbing : sc::state {

  explicit ActiveScrubbing(my_context ctx);
  ~ActiveScrubbing();

  using reactions = mpl::list<
    sc::custom_reaction,
    sc::custom_reaction>;

  sc::result react(const FullReset&);
  sc::result react(const InternalError&);
};

/// The objects range is blocked (per its forward declaration).
/// Declares a single-entry reactions list and no react() handlers.
struct RangeBlocked : sc::state {
  explicit RangeBlocked(my_context ctx);
  using reactions = mpl::list>;
};

/// Either delaying the scrub by some time and requeuing, or just requeue
/// (per its forward declaration).
/// Declares a single-entry reactions list and no react() handlers.
struct PendingTimer : sc::state {

  explicit PendingTimer(my_context ctx);

  using reactions = mpl::list>;
};

/// Select a chunk to scrub, and verify its availability (per its forward
/// declaration). A custom react() handler is declared for SelectedChunkFree.
struct NewChunk : sc::state {

  explicit NewChunk(my_context ctx);

  using reactions = mpl::list,
                              sc::custom_reaction>;

  sc::result react(const SelectedChunkFree&);
};

/**
 * initiate the update process for this chunk
 *
 * Wait for 'active_pushes' to clear.
 * 'active_pushes' represents recovery that is in-flight to the local Objectstore, hence
 * scrub waits until the correct data is readable (in-flight data to the Objectstore is
 * not readable until written to disk, termed 'applied' here)
 */
struct WaitPushes : sc::state {

  explicit WaitPushes(my_context ctx);

  using reactions = mpl::list>;

  sc::result react(const ActivePushesUpd&);
};

/// A custom react() handler is declared for InternalAllUpdates, plus a member
/// function taking an UpdatesApplied event.
struct WaitLastUpdate : sc::state {

  explicit WaitLastUpdate(my_context ctx);

  // NOTE(review): presumably the handler named by the sc::in_state_reaction
  // entry below (its arguments are missing in this copy) - confirm.
  void on_new_updates(const UpdatesApplied&);

  using reactions = mpl::list,
                              sc::in_state_reaction>;

  sc::result react(const InternalAllUpdates&);
};

/// A custom react() handler is declared for IntLocalMapDone.
struct BuildMap : sc::state {
  explicit BuildMap(my_context ctx);

  // possible error scenarios:
  // - an error reported by the backend will trigger an 'InternalError' event,
  //   handled by our parent state;
  // - if preempted, we switch to DrainReplMaps, where we will wait for all
  //   replicas to send their maps before acknowledging the preemption;
  // - an interval change will be handled by the relevant 'send-event' functions,
  //   and will be translated into a 'FullReset' event.
  using reactions = mpl::list,
                              sc::transition,  // looping, waiting
                                               // for the backend to
                                               // finish
                              sc::custom_reaction>;

  sc::result react(const IntLocalMapDone&);
};

/*
 *  "drain" scrub-maps responses from replicas
 */
struct DrainReplMaps : sc::state {
  explicit DrainReplMaps(my_context ctx);

  using reactions =
    mpl::list  // all replicas are accounted for
      >;

  sc::result react(const GotReplicas&);
};

/// Wait for all replicas to report (per its forward declaration).
/// A custom react() handler is declared for GotReplicas.
struct WaitReplicas : sc::state {
  explicit WaitReplicas(my_context ctx);

  using reactions =
    mpl::list,  // all replicas are accounted for
              sc::transition,
              sc::deferral  // might arrive before we've reached WDU
              >;

  sc::result react(const GotReplicas&);

  bool all_maps_already_called{false};  // see comment in react code
};

/// Custom react() handlers are declared for DigestUpdate and ScrubFinished.
struct WaitDigestUpdate : sc::state {
  explicit WaitDigestUpdate(my_context ctx);

  using reactions = mpl::list,
                              sc::custom_reaction,
                              sc::transition>;

  sc::result react(const DigestUpdate&);
  sc::result react(const ScrubFinished&);
};

// ----------------------------- the "replica active" states -----------------------

/*
 * Waiting for 'active_pushes' to complete
 *
 * When in this state:
 * - the details of the Primary's request were internalized by PgScrubber;
 * - 'active' scrubbing is set
 *
 * Custom react() handlers are declared for ReplicaPushesUpd and FullReset.
 */
struct ReplicaWaitUpdates : sc::state {
  explicit ReplicaWaitUpdates(my_context ctx);

  using reactions = mpl::list,
                              sc::custom_reaction>;

  sc::result react(const ReplicaPushesUpd&);
  sc::result react(const FullReset&);
};

/// An active state for a replica (per its forward declaration).
/// Custom react() handlers are declared for SchedReplica and FullReset.
struct ActiveReplica : sc::state {
  explicit ActiveReplica(my_context ctx);

  using reactions = mpl::list,
                              sc::custom_reaction,
                              sc::transition>;

  sc::result react(const SchedReplica&);
  sc::result react(const FullReset&);
};

}  // namespace Scrub