diff options
Diffstat (limited to 'src/osd/scrubber/osd_scrub_sched.h')
-rw-r--r-- | src/osd/scrubber/osd_scrub_sched.h | 553 |
1 file changed, 553 insertions, 0 deletions
diff --git a/src/osd/scrubber/osd_scrub_sched.h b/src/osd/scrubber/osd_scrub_sched.h new file mode 100644 index 000000000..17cd0e493 --- /dev/null +++ b/src/osd/scrubber/osd_scrub_sched.h @@ -0,0 +1,553 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once +// clang-format off +/* +┌───────────────────────┐ +│ OSD │ +│ OSDService ─┼───┐ +│ │ │ +│ │ │ +└───────────────────────┘ │ Ownes & uses the following + │ ScrubQueue interfaces: + │ + │ + │ - resource management (*1) + │ + │ - environment conditions (*2) + │ + │ - scrub scheduling (*3) + │ + │ + │ + │ + │ + │ + ScrubQueue │ +┌───────────────────────────▼────────────┐ +│ │ +│ │ +│ ScrubQContainer to_scrub <>────────┼────────┐ +│ ScrubQContainer penalized │ │ +│ │ │ +│ │ │ +│ OSD_wide resource counters │ │ +│ │ │ +│ │ │ +│ "env scrub conditions" monitoring │ │ +│ │ │ +│ │ │ +│ │ │ +│ │ │ +└─▲──────────────────────────────────────┘ │ + │ │ + │ │ + │uses interface <4> │ + │ │ + │ │ + │ ┌──────────────────────────────────┘ + │ │ shared ownership of jobs + │ │ + │ ┌─────▼──────┐ + │ │ScrubJob │ + │ │ ├┐ + │ │ ││ + │ │ │┼┐ + │ │ │┼│ + └──────┤ │┼┤◄──────┐ + │ │┼│ │ + │ │┼│ │ + │ │┼│ │ + └┬───────────┼┼│ │shared ownership + └─┼┼┼┼┼┼┼┼┼┼┼┼│ │ + └───────────┘ │ + │ + │ + │ + │ +┌───────────────────────────────┼─┐ +│ <>│ +│PgScrubber │ +│ │ +│ │ +│ │ +│ │ +│ │ +└─────────────────────────────────┘ + + +ScrubQueue interfaces (main functions): + +<1> - OSD/PG resources management: + + - can_inc_scrubs() + - {inc/dec}_scrubs_{local/remote}() + - dump_scrub_reservations() + - {set/clear/is}_reserving_now() + +<2> - environment conditions: + + - update_loadavg() + + - scrub_load_below_threshold() + - scrub_time_permit() + +<3> - scheduling scrubs: + + - select_pg_and_scrub() + - dump_scrubs() + +<4> - manipulating a job's state: + + - register_with_osd() + - remove_from_osd_queue() + - update_job() + + */ +// clang-format on + +#include <atomic> +#include <chrono> 
#include <memory>
#include <optional>
#include <vector>

#include "common/RefCountedObj.h"
#include "common/ceph_atomic.h"
#include "osd/osd_types.h"
#include "osd/scrubber_common.h"
#include "include/utime_fmt.h"
#include "osd/osd_types_fmt.h"
#include "utime.h"

class PG;

namespace Scrub {

using namespace ::std::literals;

// possible outcome when trying to select a PG and scrub it
enum class schedule_result_t {
  scrub_initiated,     // successfully started a scrub
  none_ready,          // no pg to scrub
  no_local_resources,  // failure to secure local OSD scrub resource
  already_started,     // failed, as already started scrubbing this pg
  no_such_pg,          // can't find this pg
  bad_pg_state,        // pg state (clean, active, etc.)
  preconditions        // time, configuration, etc.
};

// the OSD services provided to the scrub scheduler
class ScrubSchedListener {
 public:
  virtual int get_nodeid() const = 0;  // returns the OSD number ('whoami')

  /**
   * A callback used by the ScrubQueue object to initiate a scrub on a specific
   * PG.
   *
   * The request might fail for multiple reasons, as ScrubQueue cannot on its
   * own check some of the PG-specific preconditions; those are checked here.
   * See the schedule_result_t definition.
   *
   * @return a Scrub::schedule_result_t detailing either a success, or the
   *   failure reason.
   */
  virtual schedule_result_t initiate_a_scrub(
    spg_t pgid,
    bool allow_requested_repair_only) = 0;

  virtual ~ScrubSchedListener() {}
};

}  // namespace Scrub

/**
 * the queue of PGs waiting to be scrubbed.
 * Main operations are scheduling/unscheduling a PG to be scrubbed at a certain
 * time.
 *
 * A "penalty" queue maintains those PGs that have failed to reserve the
 * resources of their replicas. The PGs in this list will be reinstated into the
 * scrub queue when all eligible PGs were already handled, or after a timeout
 * (or if their deadline has passed [[disabled at this time]]).
 */
class ScrubQueue {
 public:
  enum class must_scrub_t { not_mandatory, mandatory };

  enum class qu_state_t {
    not_registered,  // not a primary, thus not considered for scrubbing by this
                     // OSD (also the temporary state when just created)
    registered,      // in either of the two queues ('to_scrub' or 'penalized')
    unregistering    // in the process of being unregistered. Will be finalized
                     // under lock
  };

  ScrubQueue(CephContext* cct, Scrub::ScrubSchedListener& osds);
  virtual ~ScrubQueue() = default;

  /// a time scheduled for scrub, paired with the deadline by which the scrub
  /// must have started (see ScrubJob::schedule below)
  struct scrub_schedule_t {
    utime_t scheduled_at{};
    utime_t deadline{0, 0};
  };

  /// the input to update_job()/adjust_target_time(): a proposed scrub time
  /// plus the pool/config intervals used to derive the actual schedule
  struct sched_params_t {
    utime_t proposed_time{};
    double min_interval{0.0};
    double max_interval{0.0};
    must_scrub_t is_must{ScrubQueue::must_scrub_t::not_mandatory};
  };

  struct ScrubJob final : public RefCountedObject {

    /**
     * a time scheduled for scrub, and a deadline: The scrub could be delayed
     * if system load is too high (but not if after the deadline), or if
     * trying to scrub out of scrub hours.
     */
    scrub_schedule_t schedule;

    /// pg to be scrubbed
    const spg_t pgid;

    /// the OSD id (for the log)
    const int whoami;

    ceph::atomic<qu_state_t> state{qu_state_t::not_registered};

    /**
     * the old 'is_registered'. Set whenever the job is registered with the
     * OSD, i.e. is in either the 'to_scrub' or the 'penalized' vectors.
     */
    std::atomic_bool in_queues{false};

    /// last scrub attempt failed to secure replica resources
    bool resources_failure{false};

    /**
     * 'updated' is a temporary flag, used to create a barrier after
     * 'sched_time' and 'deadline' (or any other job entry) were modified by
     * a different task.
     * 'updated' also signals the need to move a job back from the penalized
     * queue to the regular one.
     */
    std::atomic_bool updated{false};

    /**
     * the scrubber is waiting for locked objects to be unlocked.
     * Set after a grace period has passed.
     */
    bool blocked{false};
    utime_t blocked_since{};

    utime_t penalty_timeout{0, 0};

    CephContext* cct;

    ScrubJob(CephContext* cct, const spg_t& pg, int node_id);

    utime_t get_sched_time() const { return schedule.scheduled_at; }

    /**
     * relatively low-cost(*) access to the scrub job's state, to be used in
     * logging.
     * (*) not a low-cost access on x64 architecture
     */
    std::string_view state_desc() const
    {
      return ScrubQueue::qu_state_text(state.load(std::memory_order_relaxed));
    }

    void update_schedule(const ScrubQueue::scrub_schedule_t& adjusted);

    void dump(ceph::Formatter* f) const;

    /*
     * as the atomic 'in_queues' appears in many log prints, accessing it for
     * display-only should be made less expensive (on ARM. On x86 the _relaxed
     * produces the same code as '_cs')
     */
    std::string_view registration_state() const
    {
      return in_queues.load(std::memory_order_relaxed) ? "in-queue"
                                                       : "not-queued";
    }

    /**
     * a text description of the "scheduling intentions" of this PG:
     * are we already scheduled for a scrub/deep scrub? when?
     */
    std::string scheduling_state(utime_t now_is, bool is_deep_expected) const;

    friend std::ostream& operator<<(std::ostream& out, const ScrubJob& pg);
  };

  friend class TestOSDScrub;
  friend class ScrubSchedTestWrapper;  ///< unit-tests structure

  using ScrubJobRef = ceph::ref_t<ScrubJob>;
  using ScrubQContainer = std::vector<ScrubJobRef>;

  static std::string_view qu_state_text(qu_state_t st);

  /**
   * called periodically by the OSD to select the first scrub-eligible PG
   * and scrub it.
   *
   * Selection is affected by:
   * - time of day: scheduled scrubbing might be configured to only happen
   *   during certain hours;
   * - same for days of the week, and for the system load;
   *
   * @param preconds: what types of scrub are allowed, given system status &
   *   config. Some of the preconditions are calculated here.
   * @return Scrub::schedule_result_t::scrub_initiated if a scrub session was
   *   successfully initiated. Otherwise - the failure cause.
   *
   * locking: locks jobs_lock
   */
  Scrub::schedule_result_t select_pg_and_scrub(Scrub::ScrubPreconds& preconds);

  /**
   * Translate Scrub::schedule_result_t values into readable text
   */
  static std::string_view attempt_res_text(Scrub::schedule_result_t v);

  /**
   * remove the pg from set of PGs to be scanned for scrubbing.
   * To be used if we are no longer the PG's primary, or if the PG is removed.
   */
  void remove_from_osd_queue(ScrubJobRef sjob);

  /**
   * @return the list (not std::set!) of all scrub jobs registered
   * (apart from PGs in the process of being removed)
   */
  ScrubQContainer list_registered_jobs() const;

  /**
   * Add the scrub job to the list of jobs (i.e. list of PGs) to be
   * periodically scrubbed by the OSD.
   * The registration is active as long as the PG exists and the OSD is its
   * primary.
   *
   * See update_job() for the handling of the 'suggested' parameter.
   *
   * locking: might lock jobs_lock
   */
  void register_with_osd(ScrubJobRef sjob, const sched_params_t& suggested);

  /**
   * modify a scrub-job's scheduled time and deadline
   *
   * There are 3 argument combinations to consider:
   * - 'must' is asserted, and the suggested time is 'scrub_must_stamp':
   *   the registration will be with "beginning of time" target, making the
   *   scrub-job eligible to immediate scrub (given that external conditions
   *   do not prevent scrubbing)
   *
   * - 'must' is asserted, and the suggested time is 'now':
   *   This happens if our stats are unknown. The results are similar to the
   *   previous scenario.
   *
   * - not a 'must': we take the suggested time as a basis, and add to it some
   *   configuration / random delays.
   *
   * ('must' is sched_params_t.is_must)
   *
   * locking: not using the jobs_lock
   */
  void update_job(ScrubJobRef sjob, const sched_params_t& suggested);

  /// derive the sched_params_t (proposed time, intervals, 'must' flag) for a
  /// PG from its requested-scrub flags, info and pool configuration
  sched_params_t determine_scrub_time(const requested_scrub_t& request_flags,
                                      const pg_info_t& pg_info,
                                      const pool_opts_t& pool_conf) const;

 public:
  void dump_scrubs(ceph::Formatter* f) const;

  /**
   * No new scrub session will start while a scrub was initiated on a PG,
   * and that PG is trying to acquire replica resources.
   */
  void set_reserving_now() { a_pg_is_reserving = true; }
  void clear_reserving_now() { a_pg_is_reserving = false; }
  bool is_reserving_now() const { return a_pg_is_reserving; }

  bool can_inc_scrubs() const;
  bool inc_scrubs_local();
  void dec_scrubs_local();
  bool inc_scrubs_remote();
  void dec_scrubs_remote();
  void dump_scrub_reservations(ceph::Formatter* f) const;

  /// counting the number of PGs stuck while scrubbing, waiting for objects
  void mark_pg_scrub_blocked(spg_t blocked_pg);
  void clear_pg_scrub_blocked(spg_t blocked_pg);
  int get_blocked_pgs_count() const;

  /**
   * Pacing the scrub operation by inserting delays (mostly between chunks)
   *
   * Special handling for regular scrubs that continued into "no scrub" times.
   * Scrubbing will continue, but the delays will be controlled by a separate
   * (read - with higher value) configuration element
   * (osd_scrub_extended_sleep).
   */
  double scrub_sleep_time(bool must_scrub) const;  /// \todo (future) return
                                                   /// milliseconds

  /**
   * called every heartbeat to update the "daily" load average
   *
   * @returns a load value for the logger
   */
  [[nodiscard]] std::optional<double> update_load_average();

 private:
  CephContext* cct;
  Scrub::ScrubSchedListener& osd_service;

#ifdef WITH_SEASTAR
  auto& conf() const { return local_conf(); }
#else
  auto& conf() const { return cct->_conf; }
#endif

  /**
   * jobs_lock protects the job containers and the relevant scrub-jobs state
   * variables. Specifically, the following are guaranteed:
   * - 'in_queues' is asserted only if the job is in one of the queues;
   * - a job will only be in state 'registered' if in one of the queues;
   * - no job will be in the two queues simultaneously;
   *
   * Note that PG locks should not be acquired while holding jobs_lock.
   */
  mutable ceph::mutex jobs_lock = ceph::make_mutex("ScrubQueue::jobs_lock");

  ScrubQContainer to_scrub;   ///< scrub jobs (i.e. PGs) to scrub
  ScrubQContainer penalized;  ///< those that failed to reserve remote resources
  bool restore_penalized{false};

  double daily_loadavg{0.0};

  /// predicate: the job is registered (i.e. in one of the two queues)
  static inline constexpr auto registered_job = [](const auto& jobref) -> bool {
    return jobref->state == qu_state_t::registered;
  };

  /// predicate: the job is dead (not registered) and may be culled
  static inline constexpr auto invalid_state = [](const auto& jobref) -> bool {
    return jobref->state == qu_state_t::not_registered;
  };

  /**
   * Are there scrub jobs that should be reinstated?
   */
  void scan_penalized(bool forgive_all, utime_t time_now);

  /**
   * clear dead entries (unregistered, or belonging to removed PGs) from a
   * queue. Job state is changed to match new status.
   */
  void rm_unregistered_jobs(ScrubQContainer& group);

  /**
   * the set of all scrub jobs in 'group' which are ready to be scrubbed
   * (ready = their scheduled time has passed).
   * The scrub jobs in the new collection are sorted according to
   * their scheduled time.
   *
   * Note that the returned container holds independent refs to the
   * scrub jobs.
   */
  ScrubQContainer collect_ripe_jobs(ScrubQContainer& group, utime_t time_now);


  /// scrub resources management lock (guarding scrubs_local & scrubs_remote)
  mutable ceph::mutex resource_lock =
    ceph::make_mutex("ScrubQueue::resource_lock");

  /// the counters used to manage scrub activity parallelism:
  int scrubs_local{0};
  int scrubs_remote{0};

  /**
   * The scrubbing of PGs might be delayed if the scrubbed chunk of objects is
   * locked by some other operation. A bug might cause this to be an infinite
   * delay. If that happens, the OSD's "scrub resources" (i.e. the
   * counters that limit the number of concurrent scrub operations) might
   * be exhausted.
   * We do issue a cluster-log warning on such occasions, but that message is
   * easy to miss. The 'some pg is blocked' global flag is used to note the
   * existence of such a situation in the scrub-queue log messages.
   */
  std::atomic_int_fast16_t blocked_scrubs_cnt{0};

  std::atomic_bool a_pg_is_reserving{false};

  [[nodiscard]] bool scrub_load_below_threshold() const;
  [[nodiscard]] bool scrub_time_permit(utime_t now) const;

  /**
   * If the scrub job was not explicitly requested, we postpone it by some
   * random length of time.
   * And if delaying the scrub - we calculate, based on pool parameters, a
   * deadline we should scrub before.
   *
   * @return a pair of values: the determined scrub time, and the deadline
   */
  scrub_schedule_t adjust_target_time(
    const sched_params_t& recomputed_params) const;

  /**
   * Look for scrub jobs that have their 'resources_failure' set. These jobs
   * have failed to acquire remote resources last time we've initiated a scrub
   * session on them. They are now moved from the 'to_scrub' queue to the
   * 'penalized' set.
   *
   * locking: called with job_lock held
   */
  void move_failed_pgs(utime_t now_is);

  /**
   * try the jobs in 'group', asking the OSD to initiate a scrub for each.
   * NOTE(review): exact iteration/stop semantics are not visible in this
   * header - confirm against the implementation (used by
   * select_pg_and_scrub() with the output of collect_ripe_jobs()).
   */
  Scrub::schedule_result_t select_from_group(
    ScrubQContainer& group,
    const Scrub::ScrubPreconds& preconds,
    utime_t now_is);

 protected:  // used by the unit-tests
  /**
   * unit-tests will override this function to return a mock time
   */
  virtual utime_t time_now() const { return ceph_clock_now(); }
};

/// fmt support for qu_state_t: formats the state's textual name
template <>
struct fmt::formatter<ScrubQueue::qu_state_t>
    : fmt::formatter<std::string_view> {
  template <typename FormatContext>
  auto format(const ScrubQueue::qu_state_t& s, FormatContext& ctx)
  {
    auto out = ctx.out();
    out = fmt::formatter<string_view>::format(
      std::string{ScrubQueue::qu_state_text(s)}, ctx);
    return out;
  }
};

/// fmt support for ScrubJob: one-line summary (schedule, deadline, queue state)
template <>
struct fmt::formatter<ScrubQueue::ScrubJob> {
  constexpr auto parse(format_parse_context& ctx) { return ctx.begin(); }

  template <typename FormatContext>
  auto format(const ScrubQueue::ScrubJob& sjob, FormatContext& ctx)
  {
    return fmt::format_to(
      ctx.out(),
      "pg[{}] @ {:s} (dl:{:s}) - <{}> / failure: {} / pen. t.o.: {:s} / queue "
      "state: {:.7}",
      sjob.pgid, sjob.schedule.scheduled_at, sjob.schedule.deadline,
      sjob.registration_state(), sjob.resources_failure, sjob.penalty_timeout,
      sjob.state.load(std::memory_order_relaxed));
  }
};