summaryrefslogtreecommitdiffstats
path: root/src/osd/scrubber/osd_scrub_sched.h
diff options
context:
space:
mode:
Diffstat (limited to 'src/osd/scrubber/osd_scrub_sched.h')
-rw-r--r--src/osd/scrubber/osd_scrub_sched.h553
1 files changed, 553 insertions, 0 deletions
diff --git a/src/osd/scrubber/osd_scrub_sched.h b/src/osd/scrubber/osd_scrub_sched.h
new file mode 100644
index 000000000..17cd0e493
--- /dev/null
+++ b/src/osd/scrubber/osd_scrub_sched.h
@@ -0,0 +1,553 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+// clang-format off
+/*
+┌───────────────────────┐
+│ OSD │
+│ OSDService ─┼───┐
+│ │ │
+│ │ │
+└───────────────────────┘ │ Ownes & uses the following
+ │ ScrubQueue interfaces:
+ │
+ │
+ │ - resource management (*1)
+ │
+ │ - environment conditions (*2)
+ │
+ │ - scrub scheduling (*3)
+ │
+ │
+ │
+ │
+ │
+ │
+ ScrubQueue │
+┌───────────────────────────▼────────────┐
+│ │
+│ │
+│ ScrubQContainer to_scrub <>────────┼────────┐
+│ ScrubQContainer penalized │ │
+│ │ │
+│ │ │
+│ OSD_wide resource counters │ │
+│ │ │
+│ │ │
+│ "env scrub conditions" monitoring │ │
+│ │ │
+│ │ │
+│ │ │
+│ │ │
+└─▲──────────────────────────────────────┘ │
+ │ │
+ │ │
+ │uses interface <4> │
+ │ │
+ │ │
+ │ ┌──────────────────────────────────┘
+ │ │ shared ownership of jobs
+ │ │
+ │ ┌─────▼──────┐
+ │ │ScrubJob │
+ │ │ ├┐
+ │ │ ││
+ │ │ │┼┐
+ │ │ │┼│
+ └──────┤ │┼┤◄──────┐
+ │ │┼│ │
+ │ │┼│ │
+ │ │┼│ │
+ └┬───────────┼┼│ │shared ownership
+ └─┼┼┼┼┼┼┼┼┼┼┼┼│ │
+ └───────────┘ │
+ │
+ │
+ │
+ │
+┌───────────────────────────────┼─┐
+│ <>│
+│PgScrubber │
+│ │
+│ │
+│ │
+│ │
+│ │
+└─────────────────────────────────┘
+
+
+ScrubQueue interfaces (main functions):
+
+<1> - OSD/PG resources management:
+
+ - can_inc_scrubs()
+ - {inc/dec}_scrubs_{local/remote}()
+ - dump_scrub_reservations()
+ - {set/clear/is}_reserving_now()
+
+<2> - environment conditions:
+
+ - update_loadavg()
+
+ - scrub_load_below_threshold()
+ - scrub_time_permit()
+
+<3> - scheduling scrubs:
+
+ - select_pg_and_scrub()
+ - dump_scrubs()
+
+<4> - manipulating a job's state:
+
+ - register_with_osd()
+ - remove_from_osd_queue()
+ - update_job()
+
+ */
+// clang-format on
+
+#include <atomic>
+#include <chrono>
+#include <memory>
+#include <optional>
+#include <vector>
+
+#include "common/RefCountedObj.h"
+#include "common/ceph_atomic.h"
+#include "osd/osd_types.h"
+#include "osd/scrubber_common.h"
+#include "include/utime_fmt.h"
+#include "osd/osd_types_fmt.h"
+#include "utime.h"
+
+class PG;
+
+namespace Scrub {
+
+using namespace ::std::literals;
+
+// possible outcome when trying to select a PG and scrub it
+enum class schedule_result_t {
+  scrub_initiated,     // successfully started a scrub
+  none_ready,	       // no pg to scrub
+  no_local_resources,  // failure to secure local OSD scrub resource
+  already_started,     // failed, as already started scrubbing this pg
+  no_such_pg,	       // can't find this pg
+  bad_pg_state,	       // failed due to pg state (not clean, not active, etc.)
+  preconditions	       // failed due to preconditions (time, configuration, etc.)
+};
+
+// the OSD services provided to the scrub scheduler
+class ScrubSchedListener {
+ public:
+  virtual int get_nodeid() const = 0;  // returns the OSD number ('whoami')
+
+  /**
+   * A callback used by the ScrubQueue object to initiate a scrub on a specific
+   * PG.
+   *
+   * The request might fail for multiple reasons, as ScrubQueue cannot on its
+   * own check some of the PG-specific preconditions and those are checked here.
+   * See schedule_result_t definition.
+   *
+   * @return a Scrub::schedule_result_t detailing either a success, or the
+   * failure reason.
+   */
+  virtual schedule_result_t initiate_a_scrub(
+    spg_t pgid,
+    bool allow_requested_repair_only) = 0;
+
+  virtual ~ScrubSchedListener() {}
+};
+
+} // namespace Scrub
+
+/**
+ * the queue of PGs waiting to be scrubbed.
+ * Main operations are scheduling/unscheduling a PG to be scrubbed at a certain
+ * time.
+ *
+ * A "penalty" queue maintains those PGs that have failed to reserve the
+ * resources of their replicas. The PGs in this list will be reinstated into the
+ * scrub queue when all eligible PGs were already handled, or after a timeout
+ * (or if their deadline has passed [[disabled at this time]]).
+ */
+class ScrubQueue {
+ public:
+  enum class must_scrub_t { not_mandatory, mandatory };
+
+  enum class qu_state_t {
+    not_registered,  // not a primary, thus not considered for scrubbing by this
+                     // OSD (also the temporary state when just created)
+    registered,      // in either of the two queues ('to_scrub' or 'penalized')
+    unregistering    // in the process of being unregistered. Will be finalized
+                     // under lock
+  };
+
+  ScrubQueue(CephContext* cct, Scrub::ScrubSchedListener& osds);
+  virtual ~ScrubQueue() = default;
+
+  struct scrub_schedule_t {
+    utime_t scheduled_at{};
+    utime_t deadline{0, 0};
+  };
+
+  struct sched_params_t {
+    utime_t proposed_time{};
+    double min_interval{0.0};
+    double max_interval{0.0};
+    must_scrub_t is_must{ScrubQueue::must_scrub_t::not_mandatory};
+  };
+
+  struct ScrubJob final : public RefCountedObject {
+
+    /**
+     * a time scheduled for scrub, and a deadline: The scrub could be delayed
+     * if system load is too high (but not if after the deadline), or if trying
+     * to scrub out of scrub hours.
+     */
+    scrub_schedule_t schedule;
+
+    /// pg to be scrubbed
+    const spg_t pgid;
+
+    /// the OSD id (for the log)
+    const int whoami;
+
+    ceph::atomic<qu_state_t> state{qu_state_t::not_registered};
+
+    /**
+     * the old 'is_registered'. Set whenever the job is registered with the OSD,
+     * i.e. is in either the 'to_scrub' or the 'penalized' vectors.
+     */
+    std::atomic_bool in_queues{false};
+
+    /// last scrub attempt failed to secure replica resources
+    bool resources_failure{false};
+
+    /**
+     * 'updated' is a temporary flag, used to create a barrier after
+     * 'sched_time' and 'deadline' (or any other job entry) were modified by
+     * a different task.
+     * 'updated' also signals the need to move a job back from the penalized
+     * queue to the regular one.
+     */
+    std::atomic_bool updated{false};
+
+    /**
+     * the scrubber is waiting for locked objects to be unlocked.
+     * Set after a grace period has passed.
+     */
+    bool blocked{false};
+    utime_t blocked_since{};
+
+    utime_t penalty_timeout{0, 0};
+
+    CephContext* cct;
+
+    ScrubJob(CephContext* cct, const spg_t& pg, int node_id);
+
+    utime_t get_sched_time() const { return schedule.scheduled_at; }
+
+    /**
+     * relatively low-cost(*) access to the scrub job's state, to be used in
+     * logging.
+     *  (*) not a low-cost access on x64 architecture
+     */
+    std::string_view state_desc() const
+    {
+      return ScrubQueue::qu_state_text(state.load(std::memory_order_relaxed));
+    }
+
+    void update_schedule(const ScrubQueue::scrub_schedule_t& adjusted);
+
+    void dump(ceph::Formatter* f) const;
+
+    /*
+     * as the atomic 'in_queues' appears in many log prints, accessing it for
+     * display-only should be made less expensive (on ARM. On x86 the _relaxed
+     * produces the same code as '_cs')
+     */
+    std::string_view registration_state() const
+    {
+      return in_queues.load(std::memory_order_relaxed) ? "in-queue"
+						       : "not-queued";
+    }
+
+    /**
+     * a text description of the "scheduling intentions" of this PG:
+     * are we already scheduled for a scrub/deep scrub? when?
+     */
+    std::string scheduling_state(utime_t now_is, bool is_deep_expected) const;
+
+    friend std::ostream& operator<<(std::ostream& out, const ScrubJob& pg);
+  };
+
+  friend class TestOSDScrub;
+  friend class ScrubSchedTestWrapper;  ///< unit-tests structure
+
+  using ScrubJobRef = ceph::ref_t<ScrubJob>;
+  using ScrubQContainer = std::vector<ScrubJobRef>;
+
+  static std::string_view qu_state_text(qu_state_t st);
+
+  /**
+   * called periodically by the OSD to select the first scrub-eligible PG
+   * and scrub it.
+   *
+   * Selection is affected by:
+   * - time of day: scheduled scrubbing might be configured to only happen
+   *   during certain hours;
+   * - same for days of the week, and for the system load;
+   *
+   * @param preconds: what types of scrub are allowed, given system status &
+   *                  config. Some of the preconditions are calculated here.
+   * @return Scrub::schedule_result_t::scrub_initiated if a scrub session was
+   *         successfully initiated. Otherwise - the failure cause.
+   *
+   * locking: locks jobs_lock
+   */
+  Scrub::schedule_result_t select_pg_and_scrub(Scrub::ScrubPreconds& preconds);
+
+  /**
+   * Translate schedule_result_t values into readable text
+   */
+  static std::string_view attempt_res_text(Scrub::schedule_result_t v);
+
+  /**
+   * remove the pg from set of PGs to be scanned for scrubbing.
+   * To be used if we are no longer the PG's primary, or if the PG is removed.
+   */
+  void remove_from_osd_queue(ScrubJobRef sjob);
+
+  /**
+   * @return the list (not std::set!) of all scrub jobs registered
+   *   (apart from PGs in the process of being removed)
+   */
+  ScrubQContainer list_registered_jobs() const;
+
+  /**
+   * Add the scrub job to the list of jobs (i.e. list of PGs) to be periodically
+   * scrubbed by the OSD.
+   * The registration is active as long as the PG exists and the OSD is its
+   * primary.
+   *
+   * See update_job() for the handling of the 'suggested' parameter.
+   *
+   * locking: might lock jobs_lock
+   */
+  void register_with_osd(ScrubJobRef sjob, const sched_params_t& suggested);
+
+  /**
+   * modify a scrub-job's scheduled time and deadline
+   *
+   * There are 3 argument combinations to consider:
+   * - 'must' is asserted, and the suggested time is 'scrub_must_stamp':
+   *   the registration will be with "beginning of time" target, making the
+   *   scrub-job eligible to immediate scrub (given that external conditions
+   *   do not prevent scrubbing)
+   *
+   * - 'must' is asserted, and the suggested time is 'now':
+   *   This happens if our stats are unknown. The results are similar to the
+   *   previous scenario.
+   *
+   * - not a 'must': we take the suggested time as a basis, and add to it some
+   *   configuration / random delays.
+   *
+   * ('must' is sched_params_t.is_must)
+   *
+   * locking: not using the jobs_lock
+   */
+  void update_job(ScrubJobRef sjob, const sched_params_t& suggested);
+
+  sched_params_t determine_scrub_time(const requested_scrub_t& request_flags,
+				      const pg_info_t& pg_info,
+				      const pool_opts_t& pool_conf) const;
+
+ public:
+  void dump_scrubs(ceph::Formatter* f) const;
+
+  /**
+   * No new scrub session will start while a scrub was initiated on a PG,
+   * and that PG is trying to acquire replica resources.
+   */
+  void set_reserving_now() { a_pg_is_reserving = true; }
+  void clear_reserving_now() { a_pg_is_reserving = false; }
+  bool is_reserving_now() const { return a_pg_is_reserving; }
+
+  bool can_inc_scrubs() const;
+  bool inc_scrubs_local();
+  void dec_scrubs_local();
+  bool inc_scrubs_remote();
+  void dec_scrubs_remote();
+  void dump_scrub_reservations(ceph::Formatter* f) const;
+
+  /// counting the number of PGs stuck while scrubbing, waiting for objects
+  void mark_pg_scrub_blocked(spg_t blocked_pg);
+  void clear_pg_scrub_blocked(spg_t blocked_pg);
+  int get_blocked_pgs_count() const;
+
+  /**
+   * Pacing the scrub operation by inserting delays (mostly between chunks)
+   *
+   * Special handling for regular scrubs that continued into "no scrub" times.
+   * Scrubbing will continue, but the delays will be controlled by a separate
+   * (read - with higher value) configuration element
+   * (osd_scrub_extended_sleep).
+   */
+  double scrub_sleep_time(bool must_scrub) const;  /// \todo (future) return
+						   /// milliseconds
+
+  /**
+   * called every heartbeat to update the "daily" load average
+   *
+   * @returns a load value for the logger
+   */
+  [[nodiscard]] std::optional<double> update_load_average();
+
+ private:
+  CephContext* cct;
+  Scrub::ScrubSchedListener& osd_service;
+
+#ifdef WITH_SEASTAR
+  auto& conf() const { return local_conf(); }
+#else
+  auto& conf() const { return cct->_conf; }
+#endif
+
+  /**
+   * jobs_lock protects the job containers and the relevant scrub-jobs state
+   * variables. Specifically, the following are guaranteed:
+   * - 'in_queues' is asserted only if the job is in one of the queues;
+   * - a job will only be in state 'registered' if in one of the queues;
+   * - no job will be in the two queues simultaneously;
+   *
+   * Note that PG locks should not be acquired while holding jobs_lock.
+   */
+  mutable ceph::mutex jobs_lock = ceph::make_mutex("ScrubQueue::jobs_lock");
+
+  ScrubQContainer to_scrub;   ///< scrub jobs (i.e. PGs) to scrub
+  ScrubQContainer penalized;  ///< those that failed to reserve remote resources
+  bool restore_penalized{false};
+
+  double daily_loadavg{0.0};
+
+  static inline constexpr auto registered_job = [](const auto& jobref) -> bool {
+    return jobref->state == qu_state_t::registered;
+  };
+
+  static inline constexpr auto invalid_state = [](const auto& jobref) -> bool {
+    return jobref->state == qu_state_t::not_registered;
+  };
+
+  /**
+   * Are there scrub jobs that should be reinstated?
+   */
+  void scan_penalized(bool forgive_all, utime_t time_now);
+
+  /**
+   * clear dead entries (unregistered, or belonging to removed PGs) from a
+   * queue. Job state is changed to match new status.
+   */
+  void rm_unregistered_jobs(ScrubQContainer& group);
+
+  /**
+   * the set of all scrub jobs in 'group' which are ready to be scrubbed
+   * (ready = their scheduled time has passed).
+   * The scrub jobs in the new collection are sorted according to
+   * their scheduled time.
+   *
+   * Note that the returned container holds independent refs to the
+   * scrub jobs.
+   */
+  ScrubQContainer collect_ripe_jobs(ScrubQContainer& group, utime_t time_now);
+
+
+  /// scrub resources management lock (guarding scrubs_local & scrubs_remote)
+  mutable ceph::mutex resource_lock =
+    ceph::make_mutex("ScrubQueue::resource_lock");
+
+  /// the counters used to manage scrub activity parallelism:
+  int scrubs_local{0};
+  int scrubs_remote{0};
+
+  /**
+   * The scrubbing of PGs might be delayed if the scrubbed chunk of objects is
+   * locked by some other operation. A bug might cause this to be an infinite
+   * delay. If that happens, the OSDs "scrub resources" (i.e. the
+   * counters that limit the number of concurrent scrub operations) might
+   * be exhausted.
+   * We do issue a cluster-log warning in such occasions, but that message is
+   * easy to miss. The 'some pg is blocked' global flag is used to note the
+   * existence of such a situation in the scrub-queue log messages.
+   */
+  std::atomic_int_fast16_t blocked_scrubs_cnt{0};
+
+  std::atomic_bool a_pg_is_reserving{false};
+
+  [[nodiscard]] bool scrub_load_below_threshold() const;
+  [[nodiscard]] bool scrub_time_permit(utime_t now) const;
+
+  /**
+   * If the scrub job was not explicitly requested, we postpone it by some
+   * random length of time.
+   * And if delaying the scrub - we calculate, based on pool parameters, a
+   * deadline we should scrub before.
+   *
+   * @return a pair of values: the determined scrub time, and the deadline
+   */
+  scrub_schedule_t adjust_target_time(
+    const sched_params_t& recomputed_params) const;
+
+  /**
+   * Look for scrub jobs that have their 'resources_failure' set. These jobs
+   * have failed to acquire remote resources last time we've initiated a scrub
+   * session on them. They are now moved from the 'to_scrub' queue to the
+   * 'penalized' set.
+   *
+   * locking: called with jobs_lock held
+   */
+  void move_failed_pgs(utime_t now_is);
+
+  Scrub::schedule_result_t select_from_group(
+    ScrubQContainer& group,
+    const Scrub::ScrubPreconds& preconds,
+    utime_t now_is);
+
+protected: // used by the unit-tests
+  /**
+   * unit-tests will override this function to return a mock time
+   */
+  virtual utime_t time_now() const { return ceph_clock_now(); }
+};
+
+// fmt support for ScrubQueue::qu_state_t: formats the queue-state as its
+// textual description (via ScrubQueue::qu_state_text())
+template <>
+struct fmt::formatter<ScrubQueue::qu_state_t>
+    : fmt::formatter<std::string_view> {
+  template <typename FormatContext>
+  auto format(const ScrubQueue::qu_state_t& s, FormatContext& ctx)
+  {
+    auto out = ctx.out();
+    out = fmt::formatter<string_view>::format(
+      std::string{ScrubQueue::qu_state_text(s)}, ctx);
+    return out;
+  }
+};
+
+// fmt support for ScrubJob: a one-line summary of the job's target pg, its
+// scheduled time & deadline, registration state, and penalty status
+template <>
+struct fmt::formatter<ScrubQueue::ScrubJob> {
+  constexpr auto parse(format_parse_context& ctx) { return ctx.begin(); }
+
+  template <typename FormatContext>
+  auto format(const ScrubQueue::ScrubJob& sjob, FormatContext& ctx)
+  {
+    return fmt::format_to(
+      ctx.out(),
+      "pg[{}] @ {:s} (dl:{:s}) - <{}> / failure: {} / pen. t.o.: {:s} / queue "
+      "state: {:.7}",
+      sjob.pgid, sjob.schedule.scheduled_at, sjob.schedule.deadline,
+      sjob.registration_state(), sjob.resources_failure, sjob.penalty_timeout,
+      sjob.state.load(std::memory_order_relaxed));
+  }
+};