1 files changed, 821 insertions, 0 deletions
diff --git a/src/osd/pg_scrubber.h b/src/osd/pg_scrubber.h
new file mode 100644
index 000000000..392a4a588
--- /dev/null
+++ b/src/osd/pg_scrubber.h
@@ -0,0 +1,821 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <cassert>
+#include <chrono>
+#include <memory>
+#include <mutex>
+#include <optional>
+#include <string>
+#include <string_view>
+#include <vector>
+
+#include "PG.h"
+#include "ScrubStore.h"
+#include "scrub_machine_lstnr.h"
+#include "scrubber_common.h"
+
+class Callback;
+
+namespace Scrub {
+class ScrubMachine;
+struct BuildMap;
+
+/**
+ * Reserving/freeing scrub resources at the replicas.
+ *
+ *  When constructed - sends reservation requests to the acting_set.
+ *  A rejection triggers a "couldn't acquire the replicas' scrub resources" event.
+ *  All previous requests, whether already granted or not, are explicitly released.
+ *
+ *  A note re performance: I've measured a few container alternatives for
+ *  m_reserved_peers, with its specific usage pattern. Std::set is extremely slow, as
+ *  expected. flat_set is only slightly better. Surprisingly - std::vector (with no
+ *  sorting) is better than boost::small_vec. And for std::vector: no need to pre-reserve.
+ */
+class ReplicaReservations {
+  using OrigSet = decltype(std::declval<PG>().get_actingset());
+
+  PG* m_pg;
+  OrigSet m_acting_set;
+  OSDService* m_osds;
+  std::vector<pg_shard_t> m_waited_for_peers;
+  std::vector<pg_shard_t> m_reserved_peers;
+  bool m_had_rejections{false};
+  int m_pending{-1};
+
+  void release_replica(pg_shard_t peer, epoch_t epoch);
+
+  void send_all_done();	 ///< all reservations are granted
+
+  /// notify the scrubber that we have failed to reserve replicas' resources
+  void send_reject();
+
+ public:
+  std::string m_log_msg_prefix;
+
+  /**
+   *  quietly discard all knowledge about existing reservations. No messages
+   *  are sent to peers.
+   *  To be used upon interval change, as we know the the running scrub is no longer
+   *  relevant, and that the replicas had reset the reservations on their side.
+   */
+  void discard_all();
+
+  ReplicaReservations(PG* pg, pg_shard_t whoami);
+
+  ~ReplicaReservations();
+
+  void handle_reserve_grant(OpRequestRef op, pg_shard_t from);
+
+  void handle_reserve_reject(OpRequestRef op, pg_shard_t from);
+
+  std::ostream& gen_prefix(std::ostream& out) const;
+};
+
+/**
+ *  wraps the local OSD scrub resource reservation in an RAII wrapper
+ */
+class LocalReservation {
+  OSDService* m_osds;
+  bool m_holding_local_reservation{false};
+
+ public:
+  LocalReservation(OSDService* osds);
+  ~LocalReservation();
+  bool is_reserved() const { return m_holding_local_reservation; }
+};
+
+/**
+ *  wraps the OSD resource we are using when reserved as a replica by a scrubbing master.
+ */
+class ReservedByRemotePrimary {
+  const PgScrubber* m_scrubber; ///< we will be using its gen_prefix()
+  PG* m_pg;
+  OSDService* m_osds;
+  bool m_reserved_by_remote_primary{false};
+  const epoch_t m_reserved_at;
+
+ public:
+  ReservedByRemotePrimary(const PgScrubber* scrubber, PG* pg, OSDService* osds, epoch_t epoch);
+  ~ReservedByRemotePrimary();
+  [[nodiscard]] bool is_reserved() const { return m_reserved_by_remote_primary; }
+
+  /// compare the remembered reserved-at epoch to the current interval
+  [[nodiscard]] bool is_stale() const;
+
+  std::ostream& gen_prefix(std::ostream& out) const;
+};
+
+/**
+ * Once all replicas' scrub maps are received, we go on to compare the maps. That is -
+ * unless we we have not yet completed building our own scrub map. MapsCollectionStatus
+ * combines the status of waiting for both the local map and the replicas, without
+ * resorting to adding dummy entries into a list.
+ */
+class MapsCollectionStatus {
+
+  bool m_local_map_ready{false};
+  std::vector<pg_shard_t> m_maps_awaited_for;
+
+ public:
+  [[nodiscard]] bool are_all_maps_available() const
+  {
+    return m_local_map_ready && m_maps_awaited_for.empty();
+  }
+
+  void mark_local_map_ready() { m_local_map_ready = true; }
+
+  void mark_replica_map_request(pg_shard_t from_whom)
+  {
+    m_maps_awaited_for.push_back(from_whom);
+  }
+
+  /// @returns true if indeed waiting for this one. Otherwise: an error string
+  auto mark_arriving_map(pg_shard_t from) -> std::tuple<bool, std::string_view>;
+
+  std::vector<pg_shard_t> get_awaited() const { return m_maps_awaited_for; }
+
+  void reset();
+
+  std::string dump() const;
+
+  friend ostream& operator<<(ostream& out, const MapsCollectionStatus& sf);
+};
+
+}  // namespace Scrub
+
+
+/**
+ * the scrub operation flags. Primary only.
+ * Set at scrub start. Checked in multiple locations - mostly
+ * at finish.
+ */
+struct scrub_flags_t {
+
+  unsigned int priority{0};
+
+  /**
+   * set by queue_scrub() if either planned_scrub.auto_repair or
+   * need_auto were set.
+   * Tested at scrub end.
+   */
+  bool auto_repair{false};
+
+  /// this flag indicates that we are scrubbing post repair to verify everything is fixed
+  bool check_repair{false};
+
+  /// checked at the end of the scrub, to possibly initiate a deep-scrub
+  bool deep_scrub_on_error{false};
+
+  /**
+   * scrub must not be aborted.
+   * Set for explicitly requested scrubs, and for scrubs originated by the pairing
+   * process with the 'repair' flag set (in the RequestScrub event).
+   */
+  bool required{false};
+};
+
+ostream& operator<<(ostream& out, const scrub_flags_t& sf);
+
+
+/**
+ * The part of PG-scrubbing code that isn't state-machine wiring.
+ *
+ * Why the separation? I wish to move to a different FSM implementation. Thus I
+ * am forced to strongly decouple the state-machine implementation details from
+ * the actual scrubbing code.
+ */
+class PgScrubber : public ScrubPgIF, public ScrubMachineListener {
+
+ public:
+  explicit PgScrubber(PG* pg);
+
+  //  ------------------  the I/F exposed to the PG (ScrubPgIF) -------------
+
+  /// are we waiting for resource reservation grants form our replicas?
+  [[nodiscard]] bool is_reserving() const final;
+
+  void initiate_regular_scrub(epoch_t epoch_queued) final;
+
+  void initiate_scrub_after_repair(epoch_t epoch_queued) final;
+
+  void send_scrub_resched(epoch_t epoch_queued) final;
+
+  void active_pushes_notification(epoch_t epoch_queued) final;
+
+  void update_applied_notification(epoch_t epoch_queued) final;
+
+  void send_scrub_unblock(epoch_t epoch_queued) final;
+
+  void digest_update_notification(epoch_t epoch_queued) final;
+
+  void send_replica_maps_ready(epoch_t epoch_queued) final;
+
+  void send_start_replica(epoch_t epoch_queued, Scrub::act_token_t token) final;
+
+  void send_sched_replica(epoch_t epoch_queued, Scrub::act_token_t token) final;
+
+  void send_replica_pushes_upd(epoch_t epoch_queued) final;
+  /**
+   *  The PG has updated its 'applied version'. It might be that we are waiting for this
+   *  information: after selecting a range of objects to scrub, we've marked the latest
+   *  version of these objects in m_subset_last_update. We will not start the map building
+   *  before we know that the PG has reached this version.
+   */
+  void on_applied_when_primary(const eversion_t& applied_version) final;
+
+  void send_full_reset(epoch_t epoch_queued) final;
+
+  void send_chunk_free(epoch_t epoch_queued) final;
+
+  void send_chunk_busy(epoch_t epoch_queued) final;
+
+  void send_local_map_done(epoch_t epoch_queued) final;
+
+  void send_maps_compared(epoch_t epoch_queued) final;
+
+  void send_get_next_chunk(epoch_t epoch_queued) final;
+
+  void send_scrub_is_finished(epoch_t epoch_queued) final;
+
+  /**
+   *  we allow some number of preemptions of the scrub, which mean we do
+   *  not block.  Then we start to block.  Once we start blocking, we do
+   *  not stop until the scrub range is completed.
+   */
+  bool write_blocked_by_scrub(const hobject_t& soid) final;
+
+  /// true if the given range intersects the scrub interval in any way
+  bool range_intersects_scrub(const hobject_t& start, const hobject_t& end) final;
+
+  /**
+   *  we are a replica being asked by the Primary to reserve OSD resources for
+   *  scrubbing
+   */
+  void handle_scrub_reserve_request(OpRequestRef op) final;
+
+  void handle_scrub_reserve_grant(OpRequestRef op, pg_shard_t from) final;
+  void handle_scrub_reserve_reject(OpRequestRef op, pg_shard_t from) final;
+  void handle_scrub_reserve_release(OpRequestRef op) final;
+  void discard_replica_reservations() final;
+  void clear_scrub_reservations() final;  // PG::clear... fwds to here
+  void unreserve_replicas() final;
+
+  // managing scrub op registration
+
+  void reg_next_scrub(const requested_scrub_t& request_flags) final;
+
+  void unreg_next_scrub() final;
+
+  void scrub_requested(scrub_level_t scrub_level,
+		       scrub_type_t scrub_type,
+		       requested_scrub_t& req_flags) final;
+
+  /**
+   * Reserve local scrub resources (managed by the OSD)
+   *
+   * Fails if OSD's local-scrubs budget was exhausted
+   * \returns were local resources reserved?
+   */
+  bool reserve_local() final;
+
+  void handle_query_state(ceph::Formatter* f) final;
+
+  void dump(ceph::Formatter* f) const override;
+
+  // used if we are a replica
+
+  void replica_scrub_op(OpRequestRef op) final;
+
+  /// the op priority, taken from the primary's request message
+  Scrub::scrub_prio_t replica_op_priority() const final
+  {
+    return m_replica_request_priority;
+  };
+
+  unsigned int scrub_requeue_priority(Scrub::scrub_prio_t with_priority,
+				      unsigned int suggested_priority) const final;
+  /// the version that refers to m_flags.priority
+  unsigned int scrub_requeue_priority(Scrub::scrub_prio_t with_priority) const final;
+
+  void add_callback(Context* context) final { m_callbacks.push_back(context); }
+
+  [[nodiscard]] bool are_callbacks_pending() const final  // used for an assert in PG.cc
+  {
+    return !m_callbacks.empty();
+  }
+
+  /// handle a message carrying a replica map
+  void map_from_replica(OpRequestRef op) final;
+
+  void scrub_clear_state() final;
+
+  bool is_queued_or_active() const final;
+
+  /**
+   *  add to scrub statistics, but only if the soid is below the scrub start
+   */
+  virtual void stats_of_handled_objects(const object_stat_sum_t& delta_stats,
+					const hobject_t& soid) override
+  {
+    ceph_assert(false);
+  }
+
+  /**
+   * finalize the parameters of the initiated scrubbing session:
+   *
+   * The "current scrub" flags (m_flags) are set from the 'planned_scrub' flag-set;
+   * PG_STATE_SCRUBBING, and possibly PG_STATE_DEEP_SCRUB & PG_STATE_REPAIR are set.
+   */
+  void set_op_parameters(requested_scrub_t& request) final;
+
+  void cleanup_store(ObjectStore::Transaction* t) final;
+
+  bool get_store_errors(const scrub_ls_arg_t& arg,
+			scrub_ls_result_t& res_inout) const override
+  {
+    return false;
+  }
+
+  // -------------------------------------------------------------------------------------------
+  // the I/F used by the state-machine (i.e. the implementation of ScrubMachineListener)
+
+  [[nodiscard]] bool is_primary() const final { return m_pg->recovery_state.is_primary(); }
+
+  void select_range_n_notify() final;
+
+  /// walk the log to find the latest update that affects our chunk
+  eversion_t search_log_for_updates() const final;
+
+  eversion_t get_last_update_applied() const final
+  {
+    return m_pg->recovery_state.get_last_update_applied();
+  }
+
+  int pending_active_pushes() const final { return m_pg->active_pushes; }
+
+  void on_init() final;
+  void on_replica_init() final;
+  void replica_handling_done() final;
+
+  /// the version of 'scrub_clear_state()' that does not try to invoke FSM services
+  /// (thus can be called from FSM reactions)
+  void clear_pgscrub_state() final;
+
+  /*
+   * Send an 'InternalSchedScrub' FSM event either immediately, or - if 'm_need_sleep'
+   * is asserted - after a configuration-dependent timeout.
+   */
+  void add_delayed_scheduling() final;
+
+  void get_replicas_maps(bool replica_can_preempt) final;
+
+  void on_digest_updates() final;
+
+  void scrub_begin() final;
+
+  void scrub_finish() final;
+
+  ScrubMachineListener::MsgAndEpoch
+  prep_replica_map_msg(Scrub::PreemptionNoted was_preempted) final;
+
+  void send_replica_map(const ScrubMachineListener::MsgAndEpoch& preprepared) final;
+
+  void send_preempted_replica() final;
+
+  void send_remotes_reserved(epoch_t epoch_queued) final;
+  void send_reservation_failure(epoch_t epoch_queued) final;
+
+  /**
+   *  does the PG have newer updates than what we (the scrubber) know?
+   */
+  [[nodiscard]] bool has_pg_marked_new_updates() const final;
+
+  void set_subset_last_update(eversion_t e) final;
+
+  void maps_compare_n_cleanup() final;
+
+  Scrub::preemption_t& get_preemptor() final;
+
+  int build_primary_map_chunk() final;
+
+  int build_replica_map_chunk() final;
+
+  void reserve_replicas() final;
+
+  [[nodiscard]] bool was_epoch_changed() const final;
+
+  void set_queued_or_active() final;
+  void clear_queued_or_active() final;
+
+  void mark_local_map_ready() final;
+
+  [[nodiscard]] bool are_all_maps_available() const final;
+
+  std::string dump_awaited_maps() const final;
+
+  std::ostream& gen_prefix(std::ostream& out) const final;
+
+ protected:
+  bool state_test(uint64_t m) const { return m_pg->state_test(m); }
+  void state_set(uint64_t m) { m_pg->state_set(m); }
+  void state_clear(uint64_t m) { m_pg->state_clear(m); }
+
+  [[nodiscard]] bool is_scrub_registered() const;
+
+  virtual void _scrub_clear_state() {}
+
+  utime_t m_scrub_reg_stamp;  ///< stamp we registered for
+
+  ostream& show(ostream& out) const override;
+
+ public:
+  // -------------------------------------------------------------------------------------------
+
+  friend ostream& operator<<(ostream& out, const PgScrubber& scrubber);
+
+  static utime_t scrub_must_stamp() { return utime_t(1, 1); }
+
+  virtual ~PgScrubber();  // must be defined separately, in the .cc file
+
+  [[nodiscard]] bool is_scrub_active() const final { return m_active; }
+
+ private:
+  void reset_internal_state();
+
+  /**
+   *  the current scrubbing operation is done. We should mark that fact, so that
+   *  all events related to the previous operation can be discarded.
+   */
+  void advance_token();
+
+  bool is_token_current(Scrub::act_token_t received_token);
+
+  void requeue_waiting() const { m_pg->requeue_ops(m_pg->waiting_for_scrub); }
+
+  void _scan_snaps(ScrubMap& smap);
+
+  ScrubMap clean_meta_map();
+
+  /**
+   *  mark down some parameters of the initiated scrub:
+   *  - the epoch when started;
+   *  - the depth of the scrub requested (from the PG_STATE variable)
+   */
+  void reset_epoch(epoch_t epoch_queued);
+
+  void run_callbacks();
+
+  // -----     methods used to verify the relevance of incoming events:
+
+  /**
+   *  is the incoming event still relevant, and should be processed?
+   *
+   *  It isn't if:
+   *  - (1) we are no longer 'actively scrubbing'; or
+   *  - (2) the message is from an epoch prior to when we started the current scrub
+   * session; or
+   *  - (3) the message epoch is from a previous interval; or
+   *  - (4) the 'abort' configuration flags were set.
+   *
+   *  For (1) & (2) - teh incoming message is discarded, w/o further action.
+   *
+   *  For (3): (see check_interval() for a full description) if we have not reacted yet
+   *  to this specific new interval, we do now:
+   *  - replica reservations are silently discarded (we count on the replicas to notice
+   *        the interval change and un-reserve themselves);
+   *  - the scrubbing is halted.
+   *
+   *  For (4): the message will be discarded, but also:
+   *    if this is the first time we've noticed the 'abort' request, we perform the abort.
+   *
+   *  \returns should the incoming event be processed?
+   */
+  bool is_message_relevant(epoch_t epoch_to_verify);
+
+  /**
+   * check the 'no scrub' configuration options.
+   */
+  [[nodiscard]] bool should_abort() const;
+
+  /**
+   * Check the 'no scrub' configuration flags.
+   *
+   * Reset everything if the abort was not handled before.
+   * @returns false if the message was discarded due to abort flag.
+   */
+  [[nodiscard]] bool verify_against_abort(epoch_t epoch_to_verify);
+
+  [[nodiscard]] bool check_interval(epoch_t epoch_to_verify);
+
+  epoch_t m_last_aborted{};  // last time we've noticed a request to abort
+
+  /**
+   * return true if any inconsistency/missing is repaired, false otherwise
+   */
+  [[nodiscard]] bool scrub_process_inconsistent();
+
+  void scrub_compare_maps();
+
+  bool m_needs_sleep{true};  ///< should we sleep before being rescheduled? always
+			     ///< 'true', unless we just got out of a sleep period
+
+  utime_t m_sleep_started_at;
+
+
+  // 'optional', as 'ReplicaReservations' & 'LocalReservation' are 'RAII-designed'
+  // to guarantee un-reserving when deleted.
+  std::optional<Scrub::ReplicaReservations> m_reservations;
+  std::optional<Scrub::LocalReservation> m_local_osd_resource;
+
+  /// the 'remote' resource we, as a replica, grant our Primary when it is scrubbing
+  std::optional<Scrub::ReservedByRemotePrimary> m_remote_osd_resource;
+
+  void cleanup_on_finish();  // scrub_clear_state() as called for a Primary when
+			     // Active->NotActive
+
+ protected:
+  PG* const m_pg;
+
+  /**
+   * the derivative-specific scrub-finishing touches:
+   */
+  virtual void _scrub_finish() {}
+
+  /**
+   * Validate consistency of the object info and snap sets.
+   */
+  virtual void scrub_snapshot_metadata(ScrubMap& map, const missing_map_t& missing_digest)
+  {}
+
+  // common code used by build_primary_map_chunk() and build_replica_map_chunk():
+  int build_scrub_map_chunk(ScrubMap& map,  // primary or replica?
+			    ScrubMapBuilder& pos,
+			    hobject_t start,
+			    hobject_t end,
+			    bool deep);
+
+  std::unique_ptr<Scrub::ScrubMachine> m_fsm;
+  const spg_t m_pg_id;	///< a local copy of m_pg->pg_id
+  OSDService* const m_osds;
+  const pg_shard_t m_pg_whoami;	 ///< a local copy of m_pg->pg_whoami;
+
+  epoch_t m_interval_start{0};  ///< interval's 'from' of when scrubbing was first scheduled
+  /*
+   * the exact epoch when the scrubbing actually started (started here - cleared checks
+   *  for no-scrub conf). Incoming events are verified against this, with stale events
+   *  discarded.
+   */
+  epoch_t m_epoch_start{0};  ///< the actual epoch when scrubbing started
+
+  /**
+   *  (replica) a tag identifying a specific scrub "session". Incremented whenever the
+   *  Primary releases the replica scrub resources.
+   *  When the scrub session is terminated (even if the interval remains unchanged, as
+   *  might happen following an asok no-scrub command), stale scrub-resched messages
+   *  triggered by the backend will be discarded.
+   */
+  Scrub::act_token_t m_current_token{1};
+
+  scrub_flags_t m_flags;
+
+  bool m_active{false};
+
+  /**
+   * a flag designed to prevent the initiation of a second scrub on a PG for which scrubbing
+   * has been initiated.
+   *
+   * set once scrubbing was initiated (i.e. - even before the FSM event that
+   * will trigger a state-change out of Inactive was handled), and only reset
+   * once the FSM is back in Inactive.
+   * In other words - its ON period encompasses:
+   *   - the time period covered today by 'queued', and
+   *   - the time when m_active is set, and
+   *   - all the time from scrub_finish() calling update_stats() till the
+   *     FSM handles the 'finished' event
+   *
+   * Compared with 'm_active', this flag is asserted earlier  and remains ON for longer.
+   */
+  bool m_queued_or_active{false};
+
+  eversion_t m_subset_last_update{};
+
+  std::unique_ptr<Scrub::Store> m_store;
+
+  int num_digest_updates_pending{0};
+  hobject_t m_start, m_end;  ///< note: half-closed: [start,end)
+
+  /// Returns reference to current osdmap
+  const OSDMapRef& get_osdmap() const;
+
+  /// Returns epoch of current osdmap
+  epoch_t get_osdmap_epoch() const { return get_osdmap()->get_epoch(); }
+
+  CephContext* get_pg_cct() const { return m_pg->cct; }
+
+  // collected statistics
+  int m_shallow_errors{0};
+  int m_deep_errors{0};
+  int m_fixed_count{0};
+
+  /// Maps from objects with errors to missing peers
+  HobjToShardSetMapping m_missing;
+
+ protected:
+  /**
+   * 'm_is_deep' - is the running scrub a deep one?
+   *
+   * Note that most of the code directly checks PG_STATE_DEEP_SCRUB, which is
+   * primary-only (and is set earlier - when scheduling the scrub). 'm_is_deep' is
+   * meaningful both for the primary and the replicas, and is used as a parameter when
+   * building the scrub maps.
+   */
+  bool m_is_deep{false};
+
+  /**
+   * If set: affects the backend & scrubber-backend functions called after all
+   * scrub maps are available.
+   *
+   * Replaces code that directly checks PG_STATE_REPAIR (which was meant to be
+   * a "user facing" status display only).
+   */
+  bool m_is_repair{false};
+
+  /**
+   * User-readable summary of the scrubber's current mode of operation. Used for
+   * both osd.*.log and the cluster log.
+   * One of:
+   *    "repair"
+   *    "deep-scrub",
+   *    "scrub
+   *
+   * Note: based on PG_STATE_REPAIR, and not on m_is_repair. I.e. for
+   * auto_repair will show as "deep-scrub" and not as "repair" (until the first error
+   * is detected).
+   */
+  std::string_view m_mode_desc;
+
+  void update_op_mode_text();
+
+private:
+
+  /**
+   * initiate a deep-scrub after the current scrub ended with errors.
+   */
+  void request_rescrubbing(requested_scrub_t& req_flags);
+
+  /*
+   * Select a range of objects to scrub.
+   *
+   * By:
+   * - setting tentative range based on conf and divisor
+   * - requesting a partial list of elements from the backend;
+   * - handling some head/clones issues
+   *
+   * The selected range is set directly into 'm_start' and 'm_end'
+   */
+  bool select_range();
+
+  std::list<Context*> m_callbacks;
+
+  /**
+   * send a replica (un)reservation request to the acting set
+   *
+   * @param opcode - one of MOSDScrubReserve::REQUEST
+   *                  or MOSDScrubReserve::RELEASE
+   */
+  void message_all_replicas(int32_t opcode, std::string_view op_text);
+
+  hobject_t m_max_end;	///< Largest end that may have been sent to replicas
+  ScrubMap m_primary_scrubmap;
+  ScrubMapBuilder m_primary_scrubmap_pos;
+
+  std::map<pg_shard_t, ScrubMap> m_received_maps;
+
+  /// Cleaned std::map pending snap metadata scrub
+  ScrubMap m_cleaned_meta_map;
+
+  void _request_scrub_map(pg_shard_t replica,
+			  eversion_t version,
+			  hobject_t start,
+			  hobject_t end,
+			  bool deep,
+			  bool allow_preemption);
+
+
+  Scrub::MapsCollectionStatus m_maps_status;
+
+  omap_stat_t m_omap_stats = (const struct omap_stat_t){0};
+
+  /// Maps from objects with errors to inconsistent peers
+  HobjToShardSetMapping m_inconsistent;
+
+  /// Maps from object with errors to good peers
+  std::map<hobject_t, std::list<std::pair<ScrubMap::object, pg_shard_t>>> m_authoritative;
+
+  // ------------ members used if we are a replica
+
+  epoch_t m_replica_min_epoch;	///< the min epoch needed to handle this message
+
+  ScrubMapBuilder replica_scrubmap_pos;
+  ScrubMap replica_scrubmap;
+
+  /**
+   * we mark the request priority as it arrived. It influences the queuing priority
+   * when we wait for local updates
+   */
+  Scrub::scrub_prio_t m_replica_request_priority;
+
+  /**
+   * the 'preemption' "state-machine".
+   * Note: I was considering an orthogonal sub-machine implementation, but as
+   * the state diagram is extremely simple, the added complexity wasn't justified.
+   */
+  class preemption_data_t : public Scrub::preemption_t {
+   public:
+    preemption_data_t(PG* pg);	// the PG access is used for conf access (and logs)
+
+    [[nodiscard]] bool is_preemptable() const final { return m_preemptable; }
+
+    bool do_preempt() final
+    {
+      if (m_preempted || !m_preemptable)
+	return false;
+
+      std::lock_guard<std::mutex> lk{m_preemption_lock};
+      if (!m_preemptable)
+	return false;
+
+      m_preempted = true;
+      return true;
+    }
+
+    /// same as 'do_preempt()' but w/o checks (as once a replica
+    /// was preempted, we cannot continue)
+    void replica_preempted() { m_preempted = true; }
+
+    void enable_preemption()
+    {
+      std::lock_guard<std::mutex> lk{m_preemption_lock};
+      if (are_preemptions_left() && !m_preempted) {
+	m_preemptable = true;
+      }
+    }
+
+    /// used by a replica to set preemptability state according to the Primary's request
+    void force_preemptability(bool is_allowed)
+    {
+      // note: no need to lock for a replica
+      m_preempted = false;
+      m_preemptable = is_allowed;
+    }
+
+    bool disable_and_test() final
+    {
+      std::lock_guard<std::mutex> lk{m_preemption_lock};
+      m_preemptable = false;
+      return m_preempted;
+    }
+
+    [[nodiscard]] bool was_preempted() const { return m_preempted; }
+
+    [[nodiscard]] size_t chunk_divisor() const { return m_size_divisor; }
+
+    void reset();
+
+    void adjust_parameters() final
+    {
+      std::lock_guard<std::mutex> lk{m_preemption_lock};
+
+      if (m_preempted) {
+	m_preempted = false;
+	m_preemptable = adjust_left();
+      } else {
+	m_preemptable = are_preemptions_left();
+      }
+    }
+
+   private:
+    PG* m_pg;
+    mutable std::mutex m_preemption_lock;
+    bool m_preemptable{false};
+    bool m_preempted{false};
+    int m_left;
+    size_t m_size_divisor{1};
+    bool are_preemptions_left() const { return m_left > 0; }
+
+    bool adjust_left()
+    {
+      if (m_left > 0) {
+	--m_left;
+	m_size_divisor *= 2;
+      }
+      return m_left > 0;
+    }
+  };
+
+  preemption_data_t preemption_data;
+};