diff options
Diffstat (limited to 'src/osd/scrubber/osd_scrub_sched.h')
-rw-r--r-- | src/osd/scrubber/osd_scrub_sched.h | 553 |
1 file changed, 553 insertions, 0 deletions
diff --git a/src/osd/scrubber/osd_scrub_sched.h b/src/osd/scrubber/osd_scrub_sched.h new file mode 100644 index 000000000..17cd0e493 --- /dev/null +++ b/src/osd/scrubber/osd_scrub_sched.h @@ -0,0 +1,553 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once +// clang-format off +/* +┌───────────────────────┐ +│ OSD │ +│ OSDService ─┼───┐ +│ │ │ +│ │ │ +└───────────────────────┘ │ Ownes & uses the following + │ ScrubQueue interfaces: + │ + │ + │ - resource management (*1) + │ + │ - environment conditions (*2) + │ + │ - scrub scheduling (*3) + │ + │ + │ + │ + │ + │ + ScrubQueue │ +┌───────────────────────────▼────────────┐ +│ │ +│ │ +│ ScrubQContainer to_scrub <>────────┼────────┐ +│ ScrubQContainer penalized │ │ +│ │ │ +│ │ │ +│ OSD_wide resource counters │ │ +│ │ │ +│ │ │ +│ "env scrub conditions" monitoring │ │ +│ │ │ +│ │ │ +│ │ │ +│ │ │ +└─▲──────────────────────────────────────┘ │ + │ │ + │ │ + │uses interface <4> │ + │ │ + │ │ + │ ┌──────────────────────────────────┘ + │ │ shared ownership of jobs + │ │ + │ ┌─────▼──────┐ + │ │ScrubJob │ + │ │ ├┐ + │ │ ││ + │ │ │┼┐ + │ │ │┼│ + └──────┤ │┼┤◄──────┐ + │ │┼│ │ + │ │┼│ │ + │ │┼│ │ + └┬───────────┼┼│ │shared ownership + └─┼┼┼┼┼┼┼┼┼┼┼┼│ │ + └───────────┘ │ + │ + │ + │ + │ +┌───────────────────────────────┼─┐ +│ <>│ +│PgScrubber │ +│ │ +│ │ +│ │ +│ │ +│ │ +└─────────────────────────────────┘ + + +ScrubQueue interfaces (main functions): + +<1> - OSD/PG resources management: + + - can_inc_scrubs() + - {inc/dec}_scrubs_{local/remote}() + - dump_scrub_reservations() + - {set/clear/is}_reserving_now() + +<2> - environment conditions: + + - update_loadavg() + + - scrub_load_below_threshold() + - scrub_time_permit() + +<3> - scheduling scrubs: + + - select_pg_and_scrub() + - dump_scrubs() + +<4> - manipulating a job's state: + + - register_with_osd() + - remove_from_osd_queue() + - update_job() + + */ +// clang-format on + +#include <atomic> +#include <chrono> 
#include <memory>
#include <optional>
#include <vector>

#include "common/RefCountedObj.h"
#include "common/ceph_atomic.h"
#include "osd/osd_types.h"
#include "osd/scrubber_common.h"
#include "include/utime_fmt.h"
#include "osd/osd_types_fmt.h"
#include "utime.h"

class PG;

namespace Scrub {

using namespace ::std::literals;

// possible outcome when trying to select a PG and scrub it
enum class schedule_result_t {
  scrub_initiated,     // successfully started a scrub
  none_ready,          // no pg to scrub
  no_local_resources,  // failure to secure local OSD scrub resource
  already_started,     // failed, as already started scrubbing this pg
  no_such_pg,          // can't find this pg
  bad_pg_state,        // pg state (clean, active, etc.)
  preconditions        // time, configuration, etc.
};

// the OSD services provided to the scrub scheduler
class ScrubSchedListener {
 public:
  virtual int get_nodeid() const = 0;  // returns the OSD number ('whoami')

  /**
   * A callback used by the ScrubQueue object to initiate a scrub on a specific
   * PG.
   *
   * The request might fail for multiple reasons, as ScrubQueue cannot on its
   * own check some of the PG-specific preconditions; those are checked here.
   * See the schedule_result_t definition.
   *
   * @return a Scrub::schedule_result_t detailing either a success, or the
   *   failure reason.
   */
  virtual schedule_result_t initiate_a_scrub(
    spg_t pgid,
    bool allow_requested_repair_only) = 0;

  virtual ~ScrubSchedListener() {}
};

}  // namespace Scrub

/**
 * the queue of PGs waiting to be scrubbed.
 * Main operations are scheduling/unscheduling a PG to be scrubbed at a certain
 * time.
 *
 * A "penalty" queue maintains those PGs that have failed to reserve the
 * resources of their replicas. The PGs in this list will be reinstated into the
 * scrub queue when all eligible PGs were already handled, or after a timeout
 * (or if their deadline has passed [[disabled at this time]]).
 */
class ScrubQueue {
 public:
  enum class must_scrub_t { not_mandatory, mandatory };

  enum class qu_state_t {
    not_registered,  // not a primary, thus not considered for scrubbing by this
                     // OSD (also the temporary state when just created)
    registered,      // in either of the two queues ('to_scrub' or 'penalized')
    unregistering    // in the process of being unregistered. Will be finalized
                     // under lock
  };

  ScrubQueue(CephContext* cct, Scrub::ScrubSchedListener& osds);
  virtual ~ScrubQueue() = default;

  /// a time scheduled for scrub, paired with the deadline by which the scrub
  /// must have started (see ScrubJob::schedule below)
  struct scrub_schedule_t {
    utime_t scheduled_at{};
    utime_t deadline{0, 0};
  };

  /// the input to update_job()/adjust_target_time(): a proposed scrub time
  /// plus the pool/config intervals used to derive the actual schedule
  struct sched_params_t {
    utime_t proposed_time{};
    double min_interval{0.0};
    double max_interval{0.0};
    must_scrub_t is_must{ScrubQueue::must_scrub_t::not_mandatory};
  };

  struct ScrubJob final : public RefCountedObject {

    /**
     * a time scheduled for scrub, and a deadline: The scrub could be delayed
     * if system load is too high (but not if after the deadline), or if
     * trying to scrub out of scrub hours.
     */
    scrub_schedule_t schedule;

    /// pg to be scrubbed
    const spg_t pgid;

    /// the OSD id (for the log)
    const int whoami;

    ceph::atomic<qu_state_t> state{qu_state_t::not_registered};

    /**
     * the old 'is_registered'. Set whenever the job is registered with the
     * OSD, i.e. is in either the 'to_scrub' or the 'penalized' vectors.
     */
    std::atomic_bool in_queues{false};

    /// last scrub attempt failed to secure replica resources
    bool resources_failure{false};

    /**
     * 'updated' is a temporary flag, used to create a barrier after
     * 'sched_time' and 'deadline' (or any other job entry) were modified by
     * a different task.
     * 'updated' also signals the need to move a job back from the penalized
     * queue to the regular one.
     */
    std::atomic_bool updated{false};

    /**
     * the scrubber is waiting for locked objects to be unlocked.
     * Set after a grace period has passed.
     */
    bool blocked{false};
    utime_t blocked_since{};

    utime_t penalty_timeout{0, 0};

    CephContext* cct;

    ScrubJob(CephContext* cct, const spg_t& pg, int node_id);

    utime_t get_sched_time() const { return schedule.scheduled_at; }

    /**
     * relatively low-cost(*) access to the scrub job's state, to be used in
     * logging.
     * (*) not a low-cost access on x64 architecture
     */
    std::string_view state_desc() const
    {
      return ScrubQueue::qu_state_text(state.load(std::memory_order_relaxed));
    }

    void update_schedule(const ScrubQueue::scrub_schedule_t& adjusted);

    void dump(ceph::Formatter* f) const;

    /*
     * as the atomic 'in_queues' appears in many log prints, accessing it for
     * display-only should be made less expensive (on ARM. On x86 the _relaxed
     * produces the same code as '_cs')
     */
    std::string_view registration_state() const
    {
      return in_queues.load(std::memory_order_relaxed) ? "in-queue"
                                                       : "not-queued";
    }

    /**
     * a text description of the "scheduling intentions" of this PG:
     * are we already scheduled for a scrub/deep scrub? when?
     */
    std::string scheduling_state(utime_t now_is, bool is_deep_expected) const;

    friend std::ostream& operator<<(std::ostream& out, const ScrubJob& pg);
  };

  friend class TestOSDScrub;
  friend class ScrubSchedTestWrapper;  ///< unit-tests structure

  using ScrubJobRef = ceph::ref_t<ScrubJob>;
  using ScrubQContainer = std::vector<ScrubJobRef>;

  static std::string_view qu_state_text(qu_state_t st);

  /**
   * called periodically by the OSD to select the first scrub-eligible PG
   * and scrub it.
   *
   * Selection is affected by:
   * - time of day: scheduled scrubbing might be configured to only happen
   *   during certain hours;
   * - same for days of the week, and for the system load;
   *
   * @param preconds: what types of scrub are allowed, given system status &
   *   config. Some of the preconditions are calculated here.
   * @return Scrub::schedule_result_t::scrub_initiated if a scrub session was
   *   successfully initiated. Otherwise - the failure cause.
   *
   * locking: locks jobs_lock
   */
  Scrub::schedule_result_t select_pg_and_scrub(Scrub::ScrubPreconds& preconds);

  /**
   * Translate Scrub::schedule_result_t values into readable text
   */
  static std::string_view attempt_res_text(Scrub::schedule_result_t v);

  /**
   * remove the pg from set of PGs to be scanned for scrubbing.
   * To be used if we are no longer the PG's primary, or if the PG is removed.
   */
  void remove_from_osd_queue(ScrubJobRef sjob);

  /**
   * @return the list (not std::set!) of all scrub jobs registered
   * (apart from PGs in the process of being removed)
   */
  ScrubQContainer list_registered_jobs() const;

  /**
   * Add the scrub job to the list of jobs (i.e. list of PGs) to be
   * periodically scrubbed by the OSD.
   * The registration is active as long as the PG exists and the OSD is its
   * primary.
   *
   * See update_job() for the handling of the 'suggested' parameter.
   *
   * locking: might lock jobs_lock
   */
  void register_with_osd(ScrubJobRef sjob, const sched_params_t& suggested);

  /**
   * modify a scrub-job's scheduled time and deadline
   *
   * There are 3 argument combinations to consider:
   * - 'must' is asserted, and the suggested time is 'scrub_must_stamp':
   *   the registration will be with "beginning of time" target, making the
   *   scrub-job eligible to immediate scrub (given that external conditions
   *   do not prevent scrubbing)
   *
   * - 'must' is asserted, and the suggested time is 'now':
   *   This happens if our stats are unknown. The results are similar to the
   *   previous scenario.
   *
   * - not a 'must': we take the suggested time as a basis, and add to it some
   *   configuration / random delays.
   *
   * ('must' is sched_params_t.is_must)
   *
   * locking: not using the jobs_lock
   */
  void update_job(ScrubJobRef sjob, const sched_params_t& suggested);

  /// derive the sched_params_t (proposed time, intervals, 'must' flag) for a
  /// PG from its requested-scrub flags, info and pool configuration
  sched_params_t determine_scrub_time(const requested_scrub_t& request_flags,
                                      const pg_info_t& pg_info,
                                      const pool_opts_t& pool_conf) const;

 public:
  void dump_scrubs(ceph::Formatter* f) const;

  /**
   * No new scrub session will start while a scrub was initiated on a PG,
   * and that PG is trying to acquire replica resources.
   */
  void set_reserving_now() { a_pg_is_reserving = true; }
  void clear_reserving_now() { a_pg_is_reserving = false; }
  bool is_reserving_now() const { return a_pg_is_reserving; }

  bool can_inc_scrubs() const;
  bool inc_scrubs_local();
  void dec_scrubs_local();
  bool inc_scrubs_remote();
  void dec_scrubs_remote();
  void dump_scrub_reservations(ceph::Formatter* f) const;

  /// counting the number of PGs stuck while scrubbing, waiting for objects
  void mark_pg_scrub_blocked(spg_t blocked_pg);
  void clear_pg_scrub_blocked(spg_t blocked_pg);
  int get_blocked_pgs_count() const;

  /**
   * Pacing the scrub operation by inserting delays (mostly between chunks)
   *
   * Special handling for regular scrubs that continued into "no scrub" times.
   * Scrubbing will continue, but the delays will be controlled by a separate
   * (read - with higher value) configuration element
   * (osd_scrub_extended_sleep).
   */
  double scrub_sleep_time(bool must_scrub) const;  /// \todo (future) return
                                                   /// milliseconds

  /**
   * called every heartbeat to update the "daily" load average
   *
   * @returns a load value for the logger
   */
  [[nodiscard]] std::optional<double> update_load_average();

 private:
  CephContext* cct;
  Scrub::ScrubSchedListener& osd_service;

#ifdef WITH_SEASTAR
  auto& conf() const { return local_conf(); }
#else
  auto& conf() const { return cct->_conf; }
#endif

  /**
   * jobs_lock protects the job containers and the relevant scrub-jobs state
   * variables. Specifically, the following are guaranteed:
   * - 'in_queues' is asserted only if the job is in one of the queues;
   * - a job will only be in state 'registered' if in one of the queues;
   * - no job will be in the two queues simultaneously;
   *
   * Note that PG locks should not be acquired while holding jobs_lock.
   */
  mutable ceph::mutex jobs_lock = ceph::make_mutex("ScrubQueue::jobs_lock");

  ScrubQContainer to_scrub;   ///< scrub jobs (i.e. PGs) to scrub
  ScrubQContainer penalized;  ///< those that failed to reserve remote resources
  bool restore_penalized{false};

  double daily_loadavg{0.0};

  /// predicate: the job is registered (i.e. in one of the two queues)
  static inline constexpr auto registered_job = [](const auto& jobref) -> bool {
    return jobref->state == qu_state_t::registered;
  };

  /// predicate: the job is dead (not registered) and may be culled
  static inline constexpr auto invalid_state = [](const auto& jobref) -> bool {
    return jobref->state == qu_state_t::not_registered;
  };

  /**
   * Are there scrub jobs that should be reinstated?
   */
  void scan_penalized(bool forgive_all, utime_t time_now);

  /**
   * clear dead entries (unregistered, or belonging to removed PGs) from a
   * queue. Job state is changed to match new status.
   */
  void rm_unregistered_jobs(ScrubQContainer& group);

  /**
   * the set of all scrub jobs in 'group' which are ready to be scrubbed
   * (ready = their scheduled time has passed).
   * The scrub jobs in the new collection are sorted according to
   * their scheduled time.
   *
   * Note that the returned container holds independent refs to the
   * scrub jobs.
   */
  ScrubQContainer collect_ripe_jobs(ScrubQContainer& group, utime_t time_now);


  /// scrub resources management lock (guarding scrubs_local & scrubs_remote)
  mutable ceph::mutex resource_lock =
    ceph::make_mutex("ScrubQueue::resource_lock");

  /// the counters used to manage scrub activity parallelism:
  int scrubs_local{0};
  int scrubs_remote{0};

  /**
   * The scrubbing of PGs might be delayed if the scrubbed chunk of objects is
   * locked by some other operation. A bug might cause this to be an infinite
   * delay. If that happens, the OSD's "scrub resources" (i.e. the
   * counters that limit the number of concurrent scrub operations) might
   * be exhausted.
   * We do issue a cluster-log warning on such occasions, but that message is
   * easy to miss. The 'some pg is blocked' global flag is used to note the
   * existence of such a situation in the scrub-queue log messages.
   */
  std::atomic_int_fast16_t blocked_scrubs_cnt{0};

  std::atomic_bool a_pg_is_reserving{false};

  [[nodiscard]] bool scrub_load_below_threshold() const;
  [[nodiscard]] bool scrub_time_permit(utime_t now) const;

  /**
   * If the scrub job was not explicitly requested, we postpone it by some
   * random length of time.
   * And if delaying the scrub - we calculate, based on pool parameters, a
   * deadline we should scrub before.
   *
   * @return a pair of values: the determined scrub time, and the deadline
   */
  scrub_schedule_t adjust_target_time(
    const sched_params_t& recomputed_params) const;

  /**
   * Look for scrub jobs that have their 'resources_failure' set. These jobs
   * have failed to acquire remote resources last time we've initiated a scrub
   * session on them. They are now moved from the 'to_scrub' queue to the
   * 'penalized' set.
   *
   * locking: called with job_lock held
   */
  void move_failed_pgs(utime_t now_is);

  /**
   * try the jobs in 'group', asking the OSD to initiate a scrub for each.
   * NOTE(review): exact iteration/stop semantics are not visible in this
   * header - confirm against the implementation (used by
   * select_pg_and_scrub() with the output of collect_ripe_jobs()).
   */
  Scrub::schedule_result_t select_from_group(
    ScrubQContainer& group,
    const Scrub::ScrubPreconds& preconds,
    utime_t now_is);

 protected:  // used by the unit-tests
  /**
   * unit-tests will override this function to return a mock time
   */
  virtual utime_t time_now() const { return ceph_clock_now(); }
};

/// fmt support for qu_state_t: formats the state's textual name
template <>
struct fmt::formatter<ScrubQueue::qu_state_t>
    : fmt::formatter<std::string_view> {
  template <typename FormatContext>
  auto format(const ScrubQueue::qu_state_t& s, FormatContext& ctx)
  {
    auto out = ctx.out();
    out = fmt::formatter<string_view>::format(
      std::string{ScrubQueue::qu_state_text(s)}, ctx);
    return out;
  }
};

/// fmt support for ScrubJob: one-line summary (schedule, deadline, queue state)
template <>
struct fmt::formatter<ScrubQueue::ScrubJob> {
  constexpr auto parse(format_parse_context& ctx) { return ctx.begin(); }

  template <typename FormatContext>
  auto format(const ScrubQueue::ScrubJob& sjob, FormatContext& ctx)
  {
    return fmt::format_to(
      ctx.out(),
      "pg[{}] @ {:s} (dl:{:s}) - <{}> / failure: {} / pen. t.o.: {:s} / queue "
      "state: {:.7}",
      sjob.pgid, sjob.schedule.scheduled_at, sjob.schedule.deadline,
      sjob.registration_state(), sjob.resources_failure, sjob.penalty_timeout,
      sjob.state.load(std::memory_order_relaxed));
  }
};