From e6918187568dbd01842d8d1d2c808ce16a894239 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sun, 21 Apr 2024 13:54:28 +0200 Subject: Adding upstream version 18.2.2. Signed-off-by: Daniel Baumann --- src/osd/scrubber/pg_scrubber.h | 1047 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 1047 insertions(+) create mode 100644 src/osd/scrubber/pg_scrubber.h (limited to 'src/osd/scrubber/pg_scrubber.h') diff --git a/src/osd/scrubber/pg_scrubber.h b/src/osd/scrubber/pg_scrubber.h new file mode 100644 index 000000000..10e08b72e --- /dev/null +++ b/src/osd/scrubber/pg_scrubber.h @@ -0,0 +1,1047 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +// clang-format off +/* + +Main Scrubber interfaces: + +┌──────────────────────────────────────────────┬────┐ +│ │ │ +│ │ │ +│ PG │ │ +│ │ │ +│ │ │ +├──────────────────────────────────────────────┘ │ +│ │ +│ PrimaryLogPG │ +└────────────────────────────────┬──────────────────┘ + │ + │ + │ ownes & uses + │ + │ + │ +┌────────────────────────────────▼──────────────────┐ +│ <> │ +└───────────────────────────▲───────────────────────┘ + │ + │ + │implements + │ + │ + │ +┌───────────────────────────┴───────────────┬───────┐ +│ │ │ +│ PgScrubber │ │ +│ │ │ +│ │ ├───────┐ +├───────────────────────────────────────────┘ │ │ +│ │ │ +│ PrimaryLogScrub │ │ +└─────┬───────────────────┬─────────────────────────┘ │ + │ │ implements + │ ownes & uses │ │ + │ │ ┌─────────────────────────▼──────┐ + │ │ │ <> │ + │ │ └─────────▲──────────────────────┘ + │ │ │ + │ │ │ + │ ▼ │ + │ ┌────────────────────────────────┴───────┐ + │ │ │ + │ │ ScrubMachine │ + │ │ │ + │ └────────────────────────────────────────┘ + │ + ┌───▼─────────────────────────────────┐ + │ │ + │ ScrubStore │ + │ │ + └─────────────────────────────────────┘ + +*/ +// clang-format on + + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "osd/PG.h" +#include "osd/scrubber_common.h" + +#include "ScrubStore.h" +#include "osd_scrub_sched.h" +#include "scrub_backend.h" +#include "scrub_machine_lstnr.h" + +namespace Scrub { +class ScrubMachine; +struct BuildMap; + +/** + * Reserving/freeing scrub resources at the replicas. + * + * When constructed - sends reservation requests to the acting_set. + * A rejection triggers a "couldn't acquire the replicas' scrub resources" + * event. All previous requests, whether already granted or not, are explicitly + * released. + * + * Timeouts: + * + * Slow-Secondary Warning: + * Once at least half of the replicas have accepted the reservation, we start + * reporting any secondary that takes too long (more than milliseconds + * after the previous response received) to respond to the reservation request. + * (Why? because we have encountered real-life situations where a specific OSD + * was systematically very slow (e.g. 5 seconds) to respond to the reservation + * requests, slowing the scrub process to a crawl). + * + * Reservation Timeout: + * We limit the total time we wait for the replicas to respond to the + * reservation request. If we don't get all the responses (either Grant or + * Reject) within milliseconds, we give up and release all the + * reservations we have acquired so far. + * (Why? because we have encountered instances where a reservation request was + * lost - either due to a bug or due to a network issue.) + * + * A note re performance: I've measured a few container alternatives for + * m_reserved_peers, with its specific usage pattern. Std::set is extremely + * slow, as expected. flat_set is only slightly better. Surprisingly - + * std::vector (with no sorting) is better than boost::small_vec. And for + * std::vector: no need to pre-reserve. + */ +class ReplicaReservations { + using clock = std::chrono::system_clock; + using tpoint_t = std::chrono::time_point; + + /// a no-reply timeout handler + struct no_reply_t { + explicit no_reply_t( + OSDService* osds, + const ConfigProxy& conf, + ReplicaReservations& parent, + std::string_view log_prfx); + + ~no_reply_t(); + OSDService* m_osds; + const ConfigProxy& m_conf; + ReplicaReservations& m_parent; + std::string m_log_prfx; + Context* m_abort_callback{nullptr}; + }; + + PG* m_pg; + std::set m_acting_set; + OSDService* m_osds; + std::vector m_waited_for_peers; + std::vector m_reserved_peers; + bool m_had_rejections{false}; + int m_pending{-1}; + const pg_info_t& m_pg_info; + ScrubQueue::ScrubJobRef m_scrub_job; ///< a ref to this PG's scrub job + const ConfigProxy& m_conf; + + // detecting slow peers (see 'slow-secondary' above) + std::chrono::milliseconds m_timeout; + std::optional m_timeout_point; + + // detecting & handling a "no show" of a replica + std::unique_ptr m_no_reply; + + void release_replica(pg_shard_t peer, epoch_t epoch); + + void send_all_done(); ///< all reservations are granted + + /// notify the scrubber that we have failed to reserve replicas' resources + void send_reject(); + + std::optional update_latecomers(tpoint_t now_is); + + public: + std::string m_log_msg_prefix; + + /** + * quietly discard all knowledge about existing reservations. No messages + * are sent to peers. + * To be used upon interval change, as we know the the running scrub is no + * longer relevant, and that the replicas had reset the reservations on + * their side. + */ + void discard_all(); + + ReplicaReservations(PG* pg, + pg_shard_t whoami, + ScrubQueue::ScrubJobRef scrubjob, + const ConfigProxy& conf); + + ~ReplicaReservations(); + + void handle_reserve_grant(OpRequestRef op, pg_shard_t from); + + void handle_reserve_reject(OpRequestRef op, pg_shard_t from); + + // if timing out on receiving replies from our replicas: + void handle_no_reply_timeout(); + + std::ostream& gen_prefix(std::ostream& out) const; +}; + +/** + * wraps the local OSD scrub resource reservation in an RAII wrapper + */ +class LocalReservation { + OSDService* m_osds; + bool m_holding_local_reservation{false}; + + public: + explicit LocalReservation(OSDService* osds); + ~LocalReservation(); + bool is_reserved() const { return m_holding_local_reservation; } +}; + +/** + * wraps the OSD resource we are using when reserved as a replica by a + * scrubbing primary. + */ +class ReservedByRemotePrimary { + const PgScrubber* m_scrubber; ///< we will be using its gen_prefix() + PG* m_pg; + OSDService* m_osds; + bool m_reserved_by_remote_primary{false}; + const epoch_t m_reserved_at; + + public: + ReservedByRemotePrimary(const PgScrubber* scrubber, + PG* pg, + OSDService* osds, + epoch_t epoch); + ~ReservedByRemotePrimary(); + [[nodiscard]] bool is_reserved() const + { + return m_reserved_by_remote_primary; + } + + /// compare the remembered reserved-at epoch to the current interval + [[nodiscard]] bool is_stale() const; + + std::ostream& gen_prefix(std::ostream& out) const; +}; + +/** + * Once all replicas' scrub maps are received, we go on to compare the maps. + * That is - unless we we have not yet completed building our own scrub map. + * MapsCollectionStatus combines the status of waiting for both the local map + * and the replicas, without resorting to adding dummy entries into a list. + */ +class MapsCollectionStatus { + + bool m_local_map_ready{false}; + std::vector m_maps_awaited_for; + + public: + [[nodiscard]] bool are_all_maps_available() const + { + return m_local_map_ready && m_maps_awaited_for.empty(); + } + + void mark_local_map_ready() { m_local_map_ready = true; } + + void mark_replica_map_request(pg_shard_t from_whom) + { + m_maps_awaited_for.push_back(from_whom); + } + + /// @returns true if indeed waiting for this one. Otherwise: an error string + auto mark_arriving_map(pg_shard_t from) -> std::tuple; + + [[nodiscard]] std::vector get_awaited() const + { + return m_maps_awaited_for; + } + + void reset(); + + std::string dump() const; + + friend ostream& operator<<(ostream& out, const MapsCollectionStatus& sf); +}; + + +} // namespace Scrub + + +/** + * the scrub operation flags. Primary only. + * Set at scrub start. Checked in multiple locations - mostly + * at finish. + */ +struct scrub_flags_t { + + unsigned int priority{0}; + + /** + * set by queue_scrub() if either planned_scrub.auto_repair or + * need_auto were set. + * Tested at scrub end. + */ + bool auto_repair{false}; + + /// this flag indicates that we are scrubbing post repair to verify everything + /// is fixed + bool check_repair{false}; + + /// checked at the end of the scrub, to possibly initiate a deep-scrub + bool deep_scrub_on_error{false}; + + /** + * scrub must not be aborted. + * Set for explicitly requested scrubs, and for scrubs originated by the + * pairing process with the 'repair' flag set (in the RequestScrub event). + */ + bool required{false}; +}; + +ostream& operator<<(ostream& out, const scrub_flags_t& sf); + + +/** + * The part of PG-scrubbing code that isn't state-machine wiring. + * + * Why the separation? I wish to move to a different FSM implementation. Thus I + * am forced to strongly decouple the state-machine implementation details from + * the actual scrubbing code. + */ +class PgScrubber : public ScrubPgIF, + public ScrubMachineListener, + public ScrubBeListener { + public: + explicit PgScrubber(PG* pg); + + friend class ScrubBackend; // will be replaced by a limited interface + + // ------------------ the I/F exposed to the PG (ScrubPgIF) ------------- + + /// are we waiting for resource reservation grants form our replicas? + [[nodiscard]] bool is_reserving() const final; + + void initiate_regular_scrub(epoch_t epoch_queued) final; + + void initiate_scrub_after_repair(epoch_t epoch_queued) final; + + void send_scrub_resched(epoch_t epoch_queued) final; + + void active_pushes_notification(epoch_t epoch_queued) final; + + void update_applied_notification(epoch_t epoch_queued) final; + + void send_scrub_unblock(epoch_t epoch_queued) final; + + void digest_update_notification(epoch_t epoch_queued) final; + + void send_replica_maps_ready(epoch_t epoch_queued) final; + + void send_start_replica(epoch_t epoch_queued, Scrub::act_token_t token) final; + + void send_sched_replica(epoch_t epoch_queued, Scrub::act_token_t token) final; + + void send_replica_pushes_upd(epoch_t epoch_queued) final; + /** + * The PG has updated its 'applied version'. It might be that we are waiting + * for this information: after selecting a range of objects to scrub, we've + * marked the latest version of these objects in m_subset_last_update. We will + * not start the map building before we know that the PG has reached this + * version. + */ + void on_applied_when_primary(const eversion_t& applied_version) final; + + void send_full_reset(epoch_t epoch_queued) final; + + void send_chunk_free(epoch_t epoch_queued) final; + + void send_chunk_busy(epoch_t epoch_queued) final; + + void send_local_map_done(epoch_t epoch_queued) final; + + void send_get_next_chunk(epoch_t epoch_queued) final; + + void send_scrub_is_finished(epoch_t epoch_queued) final; + + /** + * we allow some number of preemptions of the scrub, which mean we do + * not block. Then we start to block. Once we start blocking, we do + * not stop until the scrub range is completed. + */ + bool write_blocked_by_scrub(const hobject_t& soid) final; + + /// true if the given range intersects the scrub interval in any way + bool range_intersects_scrub(const hobject_t& start, + const hobject_t& end) final; + + /** + * we are a replica being asked by the Primary to reserve OSD resources for + * scrubbing + */ + void handle_scrub_reserve_request(OpRequestRef op) final; + + void handle_scrub_reserve_grant(OpRequestRef op, pg_shard_t from) final; + void handle_scrub_reserve_reject(OpRequestRef op, pg_shard_t from) final; + void handle_scrub_reserve_release(OpRequestRef op) final; + void discard_replica_reservations() final; + void clear_scrub_reservations() final; // PG::clear... fwds to here + void unreserve_replicas() final; + + // managing scrub op registration + + void update_scrub_job(const requested_scrub_t& request_flags) final; + + void rm_from_osd_scrubbing() final; + + void on_primary_change( + std::string_view caller, + const requested_scrub_t& request_flags) final; + + void scrub_requested(scrub_level_t scrub_level, + scrub_type_t scrub_type, + requested_scrub_t& req_flags) final; + + /** + * Reserve local scrub resources (managed by the OSD) + * + * Fails if OSD's local-scrubs budget was exhausted + * \returns were local resources reserved? + */ + bool reserve_local() final; + + void handle_query_state(ceph::Formatter* f) final; + + pg_scrubbing_status_t get_schedule() const final; + + void dump_scrubber(ceph::Formatter* f, + const requested_scrub_t& request_flags) const final; + + // used if we are a replica + + void replica_scrub_op(OpRequestRef op) final; + + /// the op priority, taken from the primary's request message + Scrub::scrub_prio_t replica_op_priority() const final + { + return m_replica_request_priority; + }; + + unsigned int scrub_requeue_priority( + Scrub::scrub_prio_t with_priority, + unsigned int suggested_priority) const final; + /// the version that refers to m_flags.priority + unsigned int scrub_requeue_priority( + Scrub::scrub_prio_t with_priority) const final; + + void add_callback(Context* context) final { m_callbacks.push_back(context); } + + [[nodiscard]] bool are_callbacks_pending() const final // used for an assert + // in PG.cc + { + return !m_callbacks.empty(); + } + + /// handle a message carrying a replica map + void map_from_replica(OpRequestRef op) final; + + void scrub_clear_state() final; + + bool is_queued_or_active() const final; + + /** + * add to scrub statistics, but only if the soid is below the scrub start + */ + void stats_of_handled_objects(const object_stat_sum_t& delta_stats, + const hobject_t& soid) override + { + ceph_assert(false); + } + + /** + * finalize the parameters of the initiated scrubbing session: + * + * The "current scrub" flags (m_flags) are set from the 'planned_scrub' + * flag-set; PG_STATE_SCRUBBING, and possibly PG_STATE_DEEP_SCRUB & + * PG_STATE_REPAIR are set. + */ + void set_op_parameters(const requested_scrub_t& request) final; + + void cleanup_store(ObjectStore::Transaction* t) final; + + bool get_store_errors(const scrub_ls_arg_t& arg, + scrub_ls_result_t& res_inout) const override + { + return false; + } + + void update_scrub_stats(ceph::coarse_real_clock::time_point now_is) final; + + int asok_debug(std::string_view cmd, + std::string param, + Formatter* f, + std::stringstream& ss) override; + int m_debug_blockrange{0}; + + // -------------------------------------------------------------------------- + // the I/F used by the state-machine (i.e. the implementation of + // ScrubMachineListener) + + [[nodiscard]] bool is_primary() const final + { + return m_pg->recovery_state.is_primary(); + } + + void set_state_name(const char* name) final + { + m_fsm_state_name = name; + } + + void select_range_n_notify() final; + + Scrub::BlockedRangeWarning acquire_blocked_alarm() final; + void set_scrub_blocked(utime_t since) final; + void clear_scrub_blocked() final; + + + /// walk the log to find the latest update that affects our chunk + eversion_t search_log_for_updates() const final; + + eversion_t get_last_update_applied() const final + { + return m_pg->recovery_state.get_last_update_applied(); + } + + int pending_active_pushes() const final { return m_pg->active_pushes; } + + void on_init() final; + void on_replica_init() final; + void replica_handling_done() final; + + /// the version of 'scrub_clear_state()' that does not try to invoke FSM + /// services (thus can be called from FSM reactions) + void clear_pgscrub_state() final; + + /* + * Send an 'InternalSchedScrub' FSM event either immediately, or - if + * 'm_need_sleep' is asserted - after a configuration-dependent timeout. + */ + void add_delayed_scheduling() final; + + void get_replicas_maps(bool replica_can_preempt) final; + + void on_digest_updates() final; + + void scrub_finish() final; + + ScrubMachineListener::MsgAndEpoch prep_replica_map_msg( + Scrub::PreemptionNoted was_preempted) final; + + void send_replica_map( + const ScrubMachineListener::MsgAndEpoch& preprepared) final; + + void send_preempted_replica() final; + + void send_remotes_reserved(epoch_t epoch_queued) final; + void send_reservation_failure(epoch_t epoch_queued) final; + + /** + * does the PG have newer updates than what we (the scrubber) know? + */ + [[nodiscard]] bool has_pg_marked_new_updates() const final; + + void set_subset_last_update(eversion_t e) final; + + void maps_compare_n_cleanup() final; + + Scrub::preemption_t& get_preemptor() final; + + int build_primary_map_chunk() final; + + int build_replica_map_chunk() final; + + void reserve_replicas() final; + + void set_reserving_now() final; + void clear_reserving_now() final; + + [[nodiscard]] bool was_epoch_changed() const final; + + void set_queued_or_active() final; + /// Clears `m_queued_or_active` and restarts snaptrimming + void clear_queued_or_active() final; + + void mark_local_map_ready() final; + + [[nodiscard]] bool are_all_maps_available() const final; + + std::string dump_awaited_maps() const final; + + void set_scrub_begin_time() final; + + void set_scrub_duration() final; + + utime_t scrub_begin_stamp; + std::ostream& gen_prefix(std::ostream& out) const final; + + /// facilitate scrub-backend access to SnapMapper mappings + Scrub::SnapMapReaderI& get_snap_mapper_accessor() + { + return m_pg->snap_mapper; + } + + void log_cluster_warning(const std::string& warning) const final; + + protected: + bool state_test(uint64_t m) const { return m_pg->state_test(m); } + void state_set(uint64_t m) { m_pg->state_set(m); } + void state_clear(uint64_t m) { m_pg->state_clear(m); } + + [[nodiscard]] bool is_scrub_registered() const; + + /// the 'is-in-scheduling-queue' status, using relaxed-semantics access to the + /// status + std::string_view registration_state() const; + + virtual void _scrub_clear_state() {} + + utime_t m_scrub_reg_stamp; ///< stamp we registered for + ScrubQueue::ScrubJobRef m_scrub_job; ///< the scrub-job used by the OSD to + ///< schedule us + + ostream& show(ostream& out) const override; + + public: + // ------------------ the I/F used by the ScrubBackend (ScrubBeListener) + + // note: the reason we must have these forwarders, is because of the + // artificial PG vs. PrimaryLogPG distinction. Some of the services used + // by the scrubber backend are PrimaryLog-specific. + + void add_to_stats(const object_stat_sum_t& stat) override + { + ceph_assert(0 && "expecting a PrimaryLogScrub object"); + } + + void submit_digest_fixes(const digests_fixes_t& fixes) override + { + ceph_assert(0 && "expecting a PrimaryLogScrub object"); + } + + CephContext* get_pg_cct() const final { return m_pg->cct; } + + LoggerSinkSet& get_logger() const final; + + spg_t get_pgid() const final { return m_pg->get_pgid(); } + + /// Returns reference to current osdmap + const OSDMapRef& get_osdmap() const final; + + + // --------------------------------------------------------------------------- + + friend ostream& operator<<(ostream& out, const PgScrubber& scrubber); + + static utime_t scrub_must_stamp() { return utime_t(1, 1); } + + virtual ~PgScrubber(); // must be defined separately, in the .cc file + + [[nodiscard]] bool is_scrub_active() const final { return m_active; } + + private: + void reset_internal_state(); + + /** + * the current scrubbing operation is done. We should mark that fact, so that + * all events related to the previous operation can be discarded. + */ + void advance_token(); + + bool is_token_current(Scrub::act_token_t received_token); + + void requeue_waiting() const { m_pg->requeue_ops(m_pg->waiting_for_scrub); } + + /** + * mark down some parameters of the initiated scrub: + * - the epoch when started; + * - the depth of the scrub requested (from the PG_STATE variable) + */ + void reset_epoch(epoch_t epoch_queued); + + void run_callbacks(); + + // 'query' command data for an active scrub + void dump_active_scrubber(ceph::Formatter* f, bool is_deep) const; + + // ----- methods used to verify the relevance of incoming events: + + /** + * is the incoming event still relevant and should be forwarded to the FSM? + * + * It isn't if: + * - (1) we are no longer 'actively scrubbing'; or + * - (2) the message is from an epoch prior to when we started the current + * scrub session; or + * - (3) the message epoch is from a previous interval; or + * - (4) the 'abort' configuration flags were set. + * + * For (1) & (2) - the incoming message is discarded, w/o further action. + * + * For (3): (see check_interval() for a full description) if we have not + * reacted yet to this specific new interval, we do now: + * - replica reservations are silently discarded (we count on the replicas to + * notice the interval change and un-reserve themselves); + * - the scrubbing is halted. + * + * For (4): the message will be discarded, but also: + * if this is the first time we've noticed the 'abort' request, we perform + * the abort. + * + * \returns should the incoming event be processed? + */ + bool is_message_relevant(epoch_t epoch_to_verify); + + /** + * check the 'no scrub' configuration options. + */ + [[nodiscard]] bool should_abort() const; + + /** + * Check the 'no scrub' configuration flags. + * + * Reset everything if the abort was not handled before. + * @returns false if the message was discarded due to abort flag. + */ + [[nodiscard]] bool verify_against_abort(epoch_t epoch_to_verify); + + [[nodiscard]] bool check_interval(epoch_t epoch_to_verify); + + epoch_t m_last_aborted{}; // last time we've noticed a request to abort + + bool m_needs_sleep{true}; ///< should we sleep before being rescheduled? + ///< always 'true', unless we just got out of a + ///< sleep period + + utime_t m_sleep_started_at; + + + // 'optional', as 'ReplicaReservations' & 'LocalReservation' are + // 'RAII-designed' to guarantee un-reserving when deleted. + std::optional m_reservations; + std::optional m_local_osd_resource; + + /// the 'remote' resource we, as a replica, grant our Primary when it is + /// scrubbing + std::optional m_remote_osd_resource; + + void cleanup_on_finish(); // scrub_clear_state() as called for a Primary when + // Active->NotActive + + protected: + PG* const m_pg; + + /** + * the derivative-specific scrub-finishing touches: + */ + virtual void _scrub_finish() {} + + // common code used by build_primary_map_chunk() and + // build_replica_map_chunk(): + int build_scrub_map_chunk(ScrubMap& map, // primary or replica? + ScrubMapBuilder& pos, + hobject_t start, + hobject_t end, + bool deep); + + std::unique_ptr m_fsm; + /// the FSM state, as a string for logging + const char* m_fsm_state_name{nullptr}; + const spg_t m_pg_id; ///< a local copy of m_pg->pg_id + OSDService* const m_osds; + const pg_shard_t m_pg_whoami; ///< a local copy of m_pg->pg_whoami; + + epoch_t m_interval_start{0}; ///< interval's 'from' of when scrubbing was + ///< first scheduled + + void repair_oinfo_oid(ScrubMap& smap); + + /* + * the exact epoch when the scrubbing actually started (started here - cleared + * checks for no-scrub conf). Incoming events are verified against this, with + * stale events discarded. + */ + epoch_t m_epoch_start{0}; ///< the actual epoch when scrubbing started + + /** + * (replica) a tag identifying a specific scrub "session". Incremented + * whenever the Primary releases the replica scrub resources. When the scrub + * session is terminated (even if the interval remains unchanged, as might + * happen following an asok no-scrub command), stale scrub-resched messages + * triggered by the backend will be discarded. + */ + Scrub::act_token_t m_current_token{1}; + + /** + * (primary/replica) a test aid. A counter that is incremented whenever a + * scrub starts, and again when it terminates. Exposed as part of the 'pg + * query' command, to be used by test scripts. + * + * @ATTN: not guaranteed to be accurate. To be only used for tests. This is + * why it is initialized to a meaningless number; + */ + int32_t m_sessions_counter{ + (int32_t)((int64_t)(this) & 0x0000'0000'00ff'fff0)}; + bool m_publish_sessions{false}; //< will the counter be part of 'query' + //output? + + scrub_flags_t m_flags; + + /// a reference to the details of the next scrub (as requested and managed by + /// the PG) + requested_scrub_t& m_planned_scrub; + + bool m_active{false}; + + /** + * a flag designed to prevent the initiation of a second scrub on a PG for + * which scrubbing has been initiated. + * + * set once scrubbing was initiated (i.e. - even before the FSM event that + * will trigger a state-change out of Inactive was handled), and only reset + * once the FSM is back in Inactive. + * In other words - its ON period encompasses: + * - the time period covered today by 'queued', and + * - the time when m_active is set, and + * - all the time from scrub_finish() calling update_stats() till the + * FSM handles the 'finished' event + * + * Compared with 'm_active', this flag is asserted earlier and remains ON for + * longer. + */ + bool m_queued_or_active{false}; + + eversion_t m_subset_last_update{}; + + std::unique_ptr m_store; + + int num_digest_updates_pending{0}; + hobject_t m_start, m_end; ///< note: half-closed: [start,end) + + /// Returns epoch of current osdmap + epoch_t get_osdmap_epoch() const { return get_osdmap()->get_epoch(); } + + // collected statistics + int m_shallow_errors{0}; + int m_deep_errors{0}; + int m_fixed_count{0}; + + protected: + /** + * 'm_is_deep' - is the running scrub a deep one? + * + * Note that most of the code directly checks PG_STATE_DEEP_SCRUB, which is + * primary-only (and is set earlier - when scheduling the scrub). 'm_is_deep' + * is meaningful both for the primary and the replicas, and is used as a + * parameter when building the scrub maps. + */ + bool m_is_deep{false}; + + /** + * If set: affects the backend & scrubber-backend functions called after all + * scrub maps are available. + * + * Replaces code that directly checks PG_STATE_REPAIR (which was meant to be + * a "user facing" status display only). + */ + bool m_is_repair{false}; + + /** + * User-readable summary of the scrubber's current mode of operation. Used for + * both osd.*.log and the cluster log. + * One of: + * "repair" + * "deep-scrub", + * "scrub + * + * Note: based on PG_STATE_REPAIR, and not on m_is_repair. I.e. for + * auto_repair will show as "deep-scrub" and not as "repair" (until the first + * error is detected). + */ + std::string_view m_mode_desc; + + void update_op_mode_text(); + + private: + /** + * initiate a deep-scrub after the current scrub ended with errors. + */ + void request_rescrubbing(requested_scrub_t& req_flags); + + void unregister_from_osd(); + + /* + * Select a range of objects to scrub. + * + * By: + * - setting tentative range based on conf and divisor + * - requesting a partial list of elements from the backend; + * - handling some head/clones issues + * + * The selected range is set directly into 'm_start' and 'm_end' + */ + bool select_range(); + + std::list m_callbacks; + + /** + * send a replica (un)reservation request to the acting set + * + * @param opcode - one of MOSDScrubReserve::REQUEST + * or MOSDScrubReserve::RELEASE + */ + void message_all_replicas(int32_t opcode, std::string_view op_text); + + hobject_t m_max_end; ///< Largest end that may have been sent to replicas + ScrubMapBuilder m_primary_scrubmap_pos; + + void _request_scrub_map(pg_shard_t replica, + eversion_t version, + hobject_t start, + hobject_t end, + bool deep, + bool allow_preemption); + + + Scrub::MapsCollectionStatus m_maps_status; + + void persist_scrub_results(inconsistent_objs_t&& all_errors); + void apply_snap_mapper_fixes( + const std::vector& fix_list); + + // our latest periodic 'publish_stats_to_osd()'. Required frequency depends on + // scrub state. + ceph::coarse_real_clock::time_point m_last_stat_upd{}; + + // ------------ members used if we are a replica + + epoch_t m_replica_min_epoch; ///< the min epoch needed to handle this message + + ScrubMapBuilder replica_scrubmap_pos; + ScrubMap replica_scrubmap; + + // the backend, handling the details of comparing maps & fixing objects + std::unique_ptr m_be; + + /** + * we mark the request priority as it arrived. It influences the queuing + * priority when we wait for local updates + */ + Scrub::scrub_prio_t m_replica_request_priority; + + /** + * the 'preemption' "state-machine". + * Note: I was considering an orthogonal sub-machine implementation, but as + * the state diagram is extremely simple, the added complexity wasn't + * justified. + */ + class preemption_data_t : public Scrub::preemption_t { + public: + explicit preemption_data_t(PG* pg); // the PG access is used for conf + // access (and logs) + + [[nodiscard]] bool is_preemptable() const final { return m_preemptable; } + + preemption_data_t(const preemption_data_t&) = delete; + preemption_data_t(preemption_data_t&&) = delete; + + bool do_preempt() final + { + if (m_preempted || !m_preemptable) + return false; + + std::lock_guard lk{m_preemption_lock}; + if (!m_preemptable) + return false; + + m_preempted = true; + return true; + } + + /// same as 'do_preempt()' but w/o checks (as once a replica + /// was preempted, we cannot continue) + void replica_preempted() { m_preempted = true; } + + void enable_preemption() + { + std::lock_guard lk{m_preemption_lock}; + if (are_preemptions_left() && !m_preempted) { + m_preemptable = true; + } + } + + /// used by a replica to set preemptability state according to the Primary's + /// request + void force_preemptability(bool is_allowed) + { + // note: no need to lock for a replica + m_preempted = false; + m_preemptable = is_allowed; + } + + bool disable_and_test() final + { + std::lock_guard lk{m_preemption_lock}; + m_preemptable = false; + return m_preempted; + } + + [[nodiscard]] bool was_preempted() const { return m_preempted; } + + [[nodiscard]] size_t chunk_divisor() const { return m_size_divisor; } + + void reset(); + + void adjust_parameters() final + { + std::lock_guard lk{m_preemption_lock}; + + if (m_preempted) { + m_preempted = false; + m_preemptable = adjust_left(); + } else { + m_preemptable = are_preemptions_left(); + } + } + + private: + PG* m_pg; + mutable ceph::mutex m_preemption_lock = ceph::make_mutex("preemption_lock"); + bool m_preemptable{false}; + bool m_preempted{false}; + int m_left; + size_t m_size_divisor{1}; + bool are_preemptions_left() const { return m_left > 0; } + + bool adjust_left() + { + if (m_left > 0) { + --m_left; + m_size_divisor *= 2; + } + return m_left > 0; + } + }; + + preemption_data_t preemption_data; +}; -- cgit v1.2.3