From 19fcec84d8d7d21e796c7624e521b60d28ee21ed Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sun, 7 Apr 2024 20:45:59 +0200 Subject: Adding upstream version 16.2.11+ds. Signed-off-by: Daniel Baumann --- src/osd/pg_scrubber.h | 821 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 821 insertions(+) create mode 100644 src/osd/pg_scrubber.h (limited to 'src/osd/pg_scrubber.h') diff --git a/src/osd/pg_scrubber.h b/src/osd/pg_scrubber.h new file mode 100644 index 000000000..392a4a588 --- /dev/null +++ b/src/osd/pg_scrubber.h @@ -0,0 +1,821 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "PG.h" +#include "ScrubStore.h" +#include "scrub_machine_lstnr.h" +#include "scrubber_common.h" + +class Callback; + +namespace Scrub { +class ScrubMachine; +struct BuildMap; + +/** + * Reserving/freeing scrub resources at the replicas. + * + * When constructed - sends reservation requests to the acting_set. + * A rejection triggers a "couldn't acquire the replicas' scrub resources" event. + * All previous requests, whether already granted or not, are explicitly released. + * + * A note re performance: I've measured a few container alternatives for + * m_reserved_peers, with its specific usage pattern. Std::set is extremely slow, as + * expected. flat_set is only slightly better. Surprisingly - std::vector (with no + * sorting) is better than boost::small_vec. And for std::vector: no need to pre-reserve. + */ +class ReplicaReservations { + using OrigSet = decltype(std::declval().get_actingset()); + + PG* m_pg; + OrigSet m_acting_set; + OSDService* m_osds; + std::vector m_waited_for_peers; + std::vector m_reserved_peers; + bool m_had_rejections{false}; + int m_pending{-1}; + + void release_replica(pg_shard_t peer, epoch_t epoch); + + void send_all_done(); ///< all reservations are granted + + /// notify the scrubber that we have failed to reserve replicas' resources + void send_reject(); + + public: + std::string m_log_msg_prefix; + + /** + * quietly discard all knowledge about existing reservations. No messages + * are sent to peers. + * To be used upon interval change, as we know the the running scrub is no longer + * relevant, and that the replicas had reset the reservations on their side. + */ + void discard_all(); + + ReplicaReservations(PG* pg, pg_shard_t whoami); + + ~ReplicaReservations(); + + void handle_reserve_grant(OpRequestRef op, pg_shard_t from); + + void handle_reserve_reject(OpRequestRef op, pg_shard_t from); + + std::ostream& gen_prefix(std::ostream& out) const; +}; + +/** + * wraps the local OSD scrub resource reservation in an RAII wrapper + */ +class LocalReservation { + OSDService* m_osds; + bool m_holding_local_reservation{false}; + + public: + LocalReservation(OSDService* osds); + ~LocalReservation(); + bool is_reserved() const { return m_holding_local_reservation; } +}; + +/** + * wraps the OSD resource we are using when reserved as a replica by a scrubbing master. + */ +class ReservedByRemotePrimary { + const PgScrubber* m_scrubber; ///< we will be using its gen_prefix() + PG* m_pg; + OSDService* m_osds; + bool m_reserved_by_remote_primary{false}; + const epoch_t m_reserved_at; + + public: + ReservedByRemotePrimary(const PgScrubber* scrubber, PG* pg, OSDService* osds, epoch_t epoch); + ~ReservedByRemotePrimary(); + [[nodiscard]] bool is_reserved() const { return m_reserved_by_remote_primary; } + + /// compare the remembered reserved-at epoch to the current interval + [[nodiscard]] bool is_stale() const; + + std::ostream& gen_prefix(std::ostream& out) const; +}; + +/** + * Once all replicas' scrub maps are received, we go on to compare the maps. That is - + * unless we we have not yet completed building our own scrub map. MapsCollectionStatus + * combines the status of waiting for both the local map and the replicas, without + * resorting to adding dummy entries into a list. + */ +class MapsCollectionStatus { + + bool m_local_map_ready{false}; + std::vector m_maps_awaited_for; + + public: + [[nodiscard]] bool are_all_maps_available() const + { + return m_local_map_ready && m_maps_awaited_for.empty(); + } + + void mark_local_map_ready() { m_local_map_ready = true; } + + void mark_replica_map_request(pg_shard_t from_whom) + { + m_maps_awaited_for.push_back(from_whom); + } + + /// @returns true if indeed waiting for this one. Otherwise: an error string + auto mark_arriving_map(pg_shard_t from) -> std::tuple; + + std::vector get_awaited() const { return m_maps_awaited_for; } + + void reset(); + + std::string dump() const; + + friend ostream& operator<<(ostream& out, const MapsCollectionStatus& sf); +}; + +} // namespace Scrub + + +/** + * the scrub operation flags. Primary only. + * Set at scrub start. Checked in multiple locations - mostly + * at finish. + */ +struct scrub_flags_t { + + unsigned int priority{0}; + + /** + * set by queue_scrub() if either planned_scrub.auto_repair or + * need_auto were set. + * Tested at scrub end. + */ + bool auto_repair{false}; + + /// this flag indicates that we are scrubbing post repair to verify everything is fixed + bool check_repair{false}; + + /// checked at the end of the scrub, to possibly initiate a deep-scrub + bool deep_scrub_on_error{false}; + + /** + * scrub must not be aborted. + * Set for explicitly requested scrubs, and for scrubs originated by the pairing + * process with the 'repair' flag set (in the RequestScrub event). + */ + bool required{false}; +}; + +ostream& operator<<(ostream& out, const scrub_flags_t& sf); + + +/** + * The part of PG-scrubbing code that isn't state-machine wiring. + * + * Why the separation? I wish to move to a different FSM implementation. Thus I + * am forced to strongly decouple the state-machine implementation details from + * the actual scrubbing code. + */ +class PgScrubber : public ScrubPgIF, public ScrubMachineListener { + + public: + explicit PgScrubber(PG* pg); + + // ------------------ the I/F exposed to the PG (ScrubPgIF) ------------- + + /// are we waiting for resource reservation grants form our replicas? + [[nodiscard]] bool is_reserving() const final; + + void initiate_regular_scrub(epoch_t epoch_queued) final; + + void initiate_scrub_after_repair(epoch_t epoch_queued) final; + + void send_scrub_resched(epoch_t epoch_queued) final; + + void active_pushes_notification(epoch_t epoch_queued) final; + + void update_applied_notification(epoch_t epoch_queued) final; + + void send_scrub_unblock(epoch_t epoch_queued) final; + + void digest_update_notification(epoch_t epoch_queued) final; + + void send_replica_maps_ready(epoch_t epoch_queued) final; + + void send_start_replica(epoch_t epoch_queued, Scrub::act_token_t token) final; + + void send_sched_replica(epoch_t epoch_queued, Scrub::act_token_t token) final; + + void send_replica_pushes_upd(epoch_t epoch_queued) final; + /** + * The PG has updated its 'applied version'. It might be that we are waiting for this + * information: after selecting a range of objects to scrub, we've marked the latest + * version of these objects in m_subset_last_update. We will not start the map building + * before we know that the PG has reached this version. + */ + void on_applied_when_primary(const eversion_t& applied_version) final; + + void send_full_reset(epoch_t epoch_queued) final; + + void send_chunk_free(epoch_t epoch_queued) final; + + void send_chunk_busy(epoch_t epoch_queued) final; + + void send_local_map_done(epoch_t epoch_queued) final; + + void send_maps_compared(epoch_t epoch_queued) final; + + void send_get_next_chunk(epoch_t epoch_queued) final; + + void send_scrub_is_finished(epoch_t epoch_queued) final; + + /** + * we allow some number of preemptions of the scrub, which mean we do + * not block. Then we start to block. Once we start blocking, we do + * not stop until the scrub range is completed. + */ + bool write_blocked_by_scrub(const hobject_t& soid) final; + + /// true if the given range intersects the scrub interval in any way + bool range_intersects_scrub(const hobject_t& start, const hobject_t& end) final; + + /** + * we are a replica being asked by the Primary to reserve OSD resources for + * scrubbing + */ + void handle_scrub_reserve_request(OpRequestRef op) final; + + void handle_scrub_reserve_grant(OpRequestRef op, pg_shard_t from) final; + void handle_scrub_reserve_reject(OpRequestRef op, pg_shard_t from) final; + void handle_scrub_reserve_release(OpRequestRef op) final; + void discard_replica_reservations() final; + void clear_scrub_reservations() final; // PG::clear... fwds to here + void unreserve_replicas() final; + + // managing scrub op registration + + void reg_next_scrub(const requested_scrub_t& request_flags) final; + + void unreg_next_scrub() final; + + void scrub_requested(scrub_level_t scrub_level, + scrub_type_t scrub_type, + requested_scrub_t& req_flags) final; + + /** + * Reserve local scrub resources (managed by the OSD) + * + * Fails if OSD's local-scrubs budget was exhausted + * \returns were local resources reserved? + */ + bool reserve_local() final; + + void handle_query_state(ceph::Formatter* f) final; + + void dump(ceph::Formatter* f) const override; + + // used if we are a replica + + void replica_scrub_op(OpRequestRef op) final; + + /// the op priority, taken from the primary's request message + Scrub::scrub_prio_t replica_op_priority() const final + { + return m_replica_request_priority; + }; + + unsigned int scrub_requeue_priority(Scrub::scrub_prio_t with_priority, + unsigned int suggested_priority) const final; + /// the version that refers to m_flags.priority + unsigned int scrub_requeue_priority(Scrub::scrub_prio_t with_priority) const final; + + void add_callback(Context* context) final { m_callbacks.push_back(context); } + + [[nodiscard]] bool are_callbacks_pending() const final // used for an assert in PG.cc + { + return !m_callbacks.empty(); + } + + /// handle a message carrying a replica map + void map_from_replica(OpRequestRef op) final; + + void scrub_clear_state() final; + + bool is_queued_or_active() const final; + + /** + * add to scrub statistics, but only if the soid is below the scrub start + */ + virtual void stats_of_handled_objects(const object_stat_sum_t& delta_stats, + const hobject_t& soid) override + { + ceph_assert(false); + } + + /** + * finalize the parameters of the initiated scrubbing session: + * + * The "current scrub" flags (m_flags) are set from the 'planned_scrub' flag-set; + * PG_STATE_SCRUBBING, and possibly PG_STATE_DEEP_SCRUB & PG_STATE_REPAIR are set. + */ + void set_op_parameters(requested_scrub_t& request) final; + + void cleanup_store(ObjectStore::Transaction* t) final; + + bool get_store_errors(const scrub_ls_arg_t& arg, + scrub_ls_result_t& res_inout) const override + { + return false; + } + + // ------------------------------------------------------------------------------------------- + // the I/F used by the state-machine (i.e. the implementation of ScrubMachineListener) + + [[nodiscard]] bool is_primary() const final { return m_pg->recovery_state.is_primary(); } + + void select_range_n_notify() final; + + /// walk the log to find the latest update that affects our chunk + eversion_t search_log_for_updates() const final; + + eversion_t get_last_update_applied() const final + { + return m_pg->recovery_state.get_last_update_applied(); + } + + int pending_active_pushes() const final { return m_pg->active_pushes; } + + void on_init() final; + void on_replica_init() final; + void replica_handling_done() final; + + /// the version of 'scrub_clear_state()' that does not try to invoke FSM services + /// (thus can be called from FSM reactions) + void clear_pgscrub_state() final; + + /* + * Send an 'InternalSchedScrub' FSM event either immediately, or - if 'm_need_sleep' + * is asserted - after a configuration-dependent timeout. + */ + void add_delayed_scheduling() final; + + void get_replicas_maps(bool replica_can_preempt) final; + + void on_digest_updates() final; + + void scrub_begin() final; + + void scrub_finish() final; + + ScrubMachineListener::MsgAndEpoch + prep_replica_map_msg(Scrub::PreemptionNoted was_preempted) final; + + void send_replica_map(const ScrubMachineListener::MsgAndEpoch& preprepared) final; + + void send_preempted_replica() final; + + void send_remotes_reserved(epoch_t epoch_queued) final; + void send_reservation_failure(epoch_t epoch_queued) final; + + /** + * does the PG have newer updates than what we (the scrubber) know? + */ + [[nodiscard]] bool has_pg_marked_new_updates() const final; + + void set_subset_last_update(eversion_t e) final; + + void maps_compare_n_cleanup() final; + + Scrub::preemption_t& get_preemptor() final; + + int build_primary_map_chunk() final; + + int build_replica_map_chunk() final; + + void reserve_replicas() final; + + [[nodiscard]] bool was_epoch_changed() const final; + + void set_queued_or_active() final; + void clear_queued_or_active() final; + + void mark_local_map_ready() final; + + [[nodiscard]] bool are_all_maps_available() const final; + + std::string dump_awaited_maps() const final; + + std::ostream& gen_prefix(std::ostream& out) const final; + + protected: + bool state_test(uint64_t m) const { return m_pg->state_test(m); } + void state_set(uint64_t m) { m_pg->state_set(m); } + void state_clear(uint64_t m) { m_pg->state_clear(m); } + + [[nodiscard]] bool is_scrub_registered() const; + + virtual void _scrub_clear_state() {} + + utime_t m_scrub_reg_stamp; ///< stamp we registered for + + ostream& show(ostream& out) const override; + + public: + // ------------------------------------------------------------------------------------------- + + friend ostream& operator<<(ostream& out, const PgScrubber& scrubber); + + static utime_t scrub_must_stamp() { return utime_t(1, 1); } + + virtual ~PgScrubber(); // must be defined separately, in the .cc file + + [[nodiscard]] bool is_scrub_active() const final { return m_active; } + + private: + void reset_internal_state(); + + /** + * the current scrubbing operation is done. We should mark that fact, so that + * all events related to the previous operation can be discarded. + */ + void advance_token(); + + bool is_token_current(Scrub::act_token_t received_token); + + void requeue_waiting() const { m_pg->requeue_ops(m_pg->waiting_for_scrub); } + + void _scan_snaps(ScrubMap& smap); + + ScrubMap clean_meta_map(); + + /** + * mark down some parameters of the initiated scrub: + * - the epoch when started; + * - the depth of the scrub requested (from the PG_STATE variable) + */ + void reset_epoch(epoch_t epoch_queued); + + void run_callbacks(); + + // ----- methods used to verify the relevance of incoming events: + + /** + * is the incoming event still relevant, and should be processed? + * + * It isn't if: + * - (1) we are no longer 'actively scrubbing'; or + * - (2) the message is from an epoch prior to when we started the current scrub + * session; or + * - (3) the message epoch is from a previous interval; or + * - (4) the 'abort' configuration flags were set. + * + * For (1) & (2) - teh incoming message is discarded, w/o further action. + * + * For (3): (see check_interval() for a full description) if we have not reacted yet + * to this specific new interval, we do now: + * - replica reservations are silently discarded (we count on the replicas to notice + * the interval change and un-reserve themselves); + * - the scrubbing is halted. + * + * For (4): the message will be discarded, but also: + * if this is the first time we've noticed the 'abort' request, we perform the abort. + * + * \returns should the incoming event be processed? + */ + bool is_message_relevant(epoch_t epoch_to_verify); + + /** + * check the 'no scrub' configuration options. + */ + [[nodiscard]] bool should_abort() const; + + /** + * Check the 'no scrub' configuration flags. + * + * Reset everything if the abort was not handled before. + * @returns false if the message was discarded due to abort flag. + */ + [[nodiscard]] bool verify_against_abort(epoch_t epoch_to_verify); + + [[nodiscard]] bool check_interval(epoch_t epoch_to_verify); + + epoch_t m_last_aborted{}; // last time we've noticed a request to abort + + /** + * return true if any inconsistency/missing is repaired, false otherwise + */ + [[nodiscard]] bool scrub_process_inconsistent(); + + void scrub_compare_maps(); + + bool m_needs_sleep{true}; ///< should we sleep before being rescheduled? always + ///< 'true', unless we just got out of a sleep period + + utime_t m_sleep_started_at; + + + // 'optional', as 'ReplicaReservations' & 'LocalReservation' are 'RAII-designed' + // to guarantee un-reserving when deleted. + std::optional m_reservations; + std::optional m_local_osd_resource; + + /// the 'remote' resource we, as a replica, grant our Primary when it is scrubbing + std::optional m_remote_osd_resource; + + void cleanup_on_finish(); // scrub_clear_state() as called for a Primary when + // Active->NotActive + + protected: + PG* const m_pg; + + /** + * the derivative-specific scrub-finishing touches: + */ + virtual void _scrub_finish() {} + + /** + * Validate consistency of the object info and snap sets. + */ + virtual void scrub_snapshot_metadata(ScrubMap& map, const missing_map_t& missing_digest) + {} + + // common code used by build_primary_map_chunk() and build_replica_map_chunk(): + int build_scrub_map_chunk(ScrubMap& map, // primary or replica? + ScrubMapBuilder& pos, + hobject_t start, + hobject_t end, + bool deep); + + std::unique_ptr m_fsm; + const spg_t m_pg_id; ///< a local copy of m_pg->pg_id + OSDService* const m_osds; + const pg_shard_t m_pg_whoami; ///< a local copy of m_pg->pg_whoami; + + epoch_t m_interval_start{0}; ///< interval's 'from' of when scrubbing was first scheduled + /* + * the exact epoch when the scrubbing actually started (started here - cleared checks + * for no-scrub conf). Incoming events are verified against this, with stale events + * discarded. + */ + epoch_t m_epoch_start{0}; ///< the actual epoch when scrubbing started + + /** + * (replica) a tag identifying a specific scrub "session". Incremented whenever the + * Primary releases the replica scrub resources. + * When the scrub session is terminated (even if the interval remains unchanged, as + * might happen following an asok no-scrub command), stale scrub-resched messages + * triggered by the backend will be discarded. + */ + Scrub::act_token_t m_current_token{1}; + + scrub_flags_t m_flags; + + bool m_active{false}; + + /** + * a flag designed to prevent the initiation of a second scrub on a PG for which scrubbing + * has been initiated. + * + * set once scrubbing was initiated (i.e. - even before the FSM event that + * will trigger a state-change out of Inactive was handled), and only reset + * once the FSM is back in Inactive. + * In other words - its ON period encompasses: + * - the time period covered today by 'queued', and + * - the time when m_active is set, and + * - all the time from scrub_finish() calling update_stats() till the + * FSM handles the 'finished' event + * + * Compared with 'm_active', this flag is asserted earlier and remains ON for longer. + */ + bool m_queued_or_active{false}; + + eversion_t m_subset_last_update{}; + + std::unique_ptr m_store; + + int num_digest_updates_pending{0}; + hobject_t m_start, m_end; ///< note: half-closed: [start,end) + + /// Returns reference to current osdmap + const OSDMapRef& get_osdmap() const; + + /// Returns epoch of current osdmap + epoch_t get_osdmap_epoch() const { return get_osdmap()->get_epoch(); } + + CephContext* get_pg_cct() const { return m_pg->cct; } + + // collected statistics + int m_shallow_errors{0}; + int m_deep_errors{0}; + int m_fixed_count{0}; + + /// Maps from objects with errors to missing peers + HobjToShardSetMapping m_missing; + + protected: + /** + * 'm_is_deep' - is the running scrub a deep one? + * + * Note that most of the code directly checks PG_STATE_DEEP_SCRUB, which is + * primary-only (and is set earlier - when scheduling the scrub). 'm_is_deep' is + * meaningful both for the primary and the replicas, and is used as a parameter when + * building the scrub maps. + */ + bool m_is_deep{false}; + + /** + * If set: affects the backend & scrubber-backend functions called after all + * scrub maps are available. + * + * Replaces code that directly checks PG_STATE_REPAIR (which was meant to be + * a "user facing" status display only). + */ + bool m_is_repair{false}; + + /** + * User-readable summary of the scrubber's current mode of operation. Used for + * both osd.*.log and the cluster log. + * One of: + * "repair" + * "deep-scrub", + * "scrub + * + * Note: based on PG_STATE_REPAIR, and not on m_is_repair. I.e. for + * auto_repair will show as "deep-scrub" and not as "repair" (until the first error + * is detected). + */ + std::string_view m_mode_desc; + + void update_op_mode_text(); + +private: + + /** + * initiate a deep-scrub after the current scrub ended with errors. + */ + void request_rescrubbing(requested_scrub_t& req_flags); + + /* + * Select a range of objects to scrub. + * + * By: + * - setting tentative range based on conf and divisor + * - requesting a partial list of elements from the backend; + * - handling some head/clones issues + * + * The selected range is set directly into 'm_start' and 'm_end' + */ + bool select_range(); + + std::list m_callbacks; + + /** + * send a replica (un)reservation request to the acting set + * + * @param opcode - one of MOSDScrubReserve::REQUEST + * or MOSDScrubReserve::RELEASE + */ + void message_all_replicas(int32_t opcode, std::string_view op_text); + + hobject_t m_max_end; ///< Largest end that may have been sent to replicas + ScrubMap m_primary_scrubmap; + ScrubMapBuilder m_primary_scrubmap_pos; + + std::map m_received_maps; + + /// Cleaned std::map pending snap metadata scrub + ScrubMap m_cleaned_meta_map; + + void _request_scrub_map(pg_shard_t replica, + eversion_t version, + hobject_t start, + hobject_t end, + bool deep, + bool allow_preemption); + + + Scrub::MapsCollectionStatus m_maps_status; + + omap_stat_t m_omap_stats = (const struct omap_stat_t){0}; + + /// Maps from objects with errors to inconsistent peers + HobjToShardSetMapping m_inconsistent; + + /// Maps from object with errors to good peers + std::map>> m_authoritative; + + // ------------ members used if we are a replica + + epoch_t m_replica_min_epoch; ///< the min epoch needed to handle this message + + ScrubMapBuilder replica_scrubmap_pos; + ScrubMap replica_scrubmap; + + /** + * we mark the request priority as it arrived. It influences the queuing priority + * when we wait for local updates + */ + Scrub::scrub_prio_t m_replica_request_priority; + + /** + * the 'preemption' "state-machine". + * Note: I was considering an orthogonal sub-machine implementation, but as + * the state diagram is extremely simple, the added complexity wasn't justified. + */ + class preemption_data_t : public Scrub::preemption_t { + public: + preemption_data_t(PG* pg); // the PG access is used for conf access (and logs) + + [[nodiscard]] bool is_preemptable() const final { return m_preemptable; } + + bool do_preempt() final + { + if (m_preempted || !m_preemptable) + return false; + + std::lock_guard lk{m_preemption_lock}; + if (!m_preemptable) + return false; + + m_preempted = true; + return true; + } + + /// same as 'do_preempt()' but w/o checks (as once a replica + /// was preempted, we cannot continue) + void replica_preempted() { m_preempted = true; } + + void enable_preemption() + { + std::lock_guard lk{m_preemption_lock}; + if (are_preemptions_left() && !m_preempted) { + m_preemptable = true; + } + } + + /// used by a replica to set preemptability state according to the Primary's request + void force_preemptability(bool is_allowed) + { + // note: no need to lock for a replica + m_preempted = false; + m_preemptable = is_allowed; + } + + bool disable_and_test() final + { + std::lock_guard lk{m_preemption_lock}; + m_preemptable = false; + return m_preempted; + } + + [[nodiscard]] bool was_preempted() const { return m_preempted; } + + [[nodiscard]] size_t chunk_divisor() const { return m_size_divisor; } + + void reset(); + + void adjust_parameters() final + { + std::lock_guard lk{m_preemption_lock}; + + if (m_preempted) { + m_preempted = false; + m_preemptable = adjust_left(); + } else { + m_preemptable = are_preemptions_left(); + } + } + + private: + PG* m_pg; + mutable std::mutex m_preemption_lock; + bool m_preemptable{false}; + bool m_preempted{false}; + int m_left; + size_t m_size_divisor{1}; + bool are_preemptions_left() const { return m_left > 0; } + + bool adjust_left() + { + if (m_left > 0) { + --m_left; + m_size_divisor *= 2; + } + return m_left > 0; + } + }; + + preemption_data_t preemption_data; +}; -- cgit v1.2.3