author    Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-21 11:54:28 +0000
committer Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-21 11:54:28 +0000
commit    e6918187568dbd01842d8d1d2c808ce16a894239
tree      64f88b554b444a49f656b6c656111a145cbbaa28
parent    Initial commit.
Adding upstream version 18.2.2.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/osd/scrubber/osd_scrub_sched.cc')
-rw-r--r--  src/osd/scrubber/osd_scrub_sched.cc | 817
1 file changed, 817 insertions(+), 0 deletions(-)
diff --git a/src/osd/scrubber/osd_scrub_sched.cc b/src/osd/scrubber/osd_scrub_sched.cc
new file mode 100644
index 000000000..82b7c689d
--- /dev/null
+++ b/src/osd/scrubber/osd_scrub_sched.cc
@@ -0,0 +1,817 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#include "./osd_scrub_sched.h"
+
+#include "osd/OSD.h"
+
+#include "pg_scrubber.h"
+
+using namespace ::std::literals;
+
+// ////////////////////////////////////////////////////////////////////////// //
+// ScrubJob
+
+#define dout_context (cct)
+#define dout_subsys ceph_subsys_osd
+#undef dout_prefix
+#define dout_prefix *_dout << "osd." << whoami << " "
+
+ScrubQueue::ScrubJob::ScrubJob(CephContext* cct, const spg_t& pg, int node_id)
+ : RefCountedObject{cct}
+ , pgid{pg}
+ , whoami{node_id}
+ , cct{cct}
+{}
+
+// debug usage only
+ostream& operator<<(ostream& out, const ScrubQueue::ScrubJob& sjob)
+{
+ out << sjob.pgid << ", " << sjob.schedule.scheduled_at
+ << " dead: " << sjob.schedule.deadline << " - "
+ << sjob.registration_state() << " / failure: " << sjob.resources_failure
+ << " / pen. t.o.: " << sjob.penalty_timeout
+ << " / queue state: " << ScrubQueue::qu_state_text(sjob.state);
+
+ return out;
+}
+
+void ScrubQueue::ScrubJob::update_schedule(
+ const ScrubQueue::scrub_schedule_t& adjusted)
+{
+ schedule = adjusted;
+ penalty_timeout = utime_t(0, 0); // helps with debugging
+
+ // 'updated' is changed here while not holding jobs_lock. That's OK, as
+ // the (atomic) flag will only be cleared by select_pg_and_scrub() after
+  // scan_penalized() is called and the job has been moved to the to_scrub queue.
+ updated = true;
+ dout(10) << fmt::format("{}: pg[{}] adjusted: {:s} ({})", __func__, pgid,
+ schedule.scheduled_at, registration_state()) << dendl;
+}
+
+std::string ScrubQueue::ScrubJob::scheduling_state(utime_t now_is,
+ bool is_deep_expected) const
+{
+ // if not in the OSD scheduling queues, not a candidate for scrubbing
+ if (state != qu_state_t::registered) {
+ return "no scrub is scheduled";
+ }
+
+ // if the time has passed, we are surely in the queue
+  // (note that for now we do not tell the client if 'penalized')
+ if (now_is > schedule.scheduled_at) {
+ // we are never sure that the next scrub will indeed be shallow:
+ return fmt::format("queued for {}scrub", (is_deep_expected ? "deep " : ""));
+ }
+
+ return fmt::format("{}scrub scheduled @ {:s}",
+ (is_deep_expected ? "deep " : ""),
+ schedule.scheduled_at);
+}
+
+
+// ////////////////////////////////////////////////////////////////////////// //
+// ScrubQueue
+
+#undef dout_context
+#define dout_context (cct)
+#undef dout_prefix
+#define dout_prefix \
+ *_dout << "osd." << osd_service.get_nodeid() << " scrub-queue::" << __func__ \
+ << " "
+
+
+ScrubQueue::ScrubQueue(CephContext* cct, Scrub::ScrubSchedListener& osds)
+ : cct{cct}
+ , osd_service{osds}
+{
+ // initialize the daily loadavg with current 15min loadavg
+ if (double loadavgs[3]; getloadavg(loadavgs, 3) == 3) {
+ daily_loadavg = loadavgs[2];
+ } else {
+ derr << "OSD::init() : couldn't read loadavgs\n" << dendl;
+ daily_loadavg = 1.0;
+ }
+}
+
+std::optional<double> ScrubQueue::update_load_average()
+{
+ int hb_interval = conf()->osd_heartbeat_interval;
+ int n_samples = 60 * 24 * 24;
+ if (hb_interval > 1) {
+ n_samples /= hb_interval;
+ if (n_samples < 1)
+ n_samples = 1;
+ }
+
+ // get CPU load avg
+ double loadavg;
+ if (getloadavg(&loadavg, 1) == 1) {
+ daily_loadavg = (daily_loadavg * (n_samples - 1) + loadavg) / n_samples;
+ dout(17) << "heartbeat: daily_loadavg " << daily_loadavg << dendl;
+ return 100 * loadavg;
+ }
+
+ return std::nullopt;
+}
+
+/*
+ * Modify the scrub job state:
+ * - if 'registered' (as expected): mark as 'unregistering'. The job will be
+ * dequeued the next time sched_scrub() is called.
+ * - if already 'not_registered': shouldn't really happen, but not a problem.
+ * The state will not be modified.
+ * - same for 'unregistering'.
+ *
+ * Note: not holding the jobs lock
+ */
+void ScrubQueue::remove_from_osd_queue(ScrubJobRef scrub_job)
+{
+ dout(15) << "removing pg[" << scrub_job->pgid << "] from OSD scrub queue"
+ << dendl;
+
+ qu_state_t expected_state{qu_state_t::registered};
+ auto ret =
+ scrub_job->state.compare_exchange_strong(expected_state,
+ qu_state_t::unregistering);
+
+ if (ret) {
+
+ dout(10) << "pg[" << scrub_job->pgid << "] sched-state changed from "
+ << qu_state_text(expected_state) << " to "
+ << qu_state_text(scrub_job->state) << dendl;
+
+ } else {
+
+ // job wasn't in state 'registered' coming in
+ dout(5) << "removing pg[" << scrub_job->pgid
+ << "] failed. State was: " << qu_state_text(expected_state)
+ << dendl;
+ }
+}
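
// For reference (not part of this file): the lock-free transition above is
// the standard compare-exchange idiom. A minimal sketch, with a hypothetical
// enum standing in for qu_state_t:
#include <atomic>

enum class state_t { registered, unregistering };

// returns true only for the thread that actually performed the
// registered -> unregistering transition; on failure, 'expected' is
// overwritten with the value that was actually observed.
bool try_unregister(std::atomic<state_t>& state)
{
  state_t expected = state_t::registered;
  return state.compare_exchange_strong(expected, state_t::unregistering);
}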
+
+void ScrubQueue::register_with_osd(
+ ScrubJobRef scrub_job,
+ const ScrubQueue::sched_params_t& suggested)
+{
+ qu_state_t state_at_entry = scrub_job->state.load();
+ dout(20) << fmt::format(
+ "pg[{}] state at entry: <{:.14}>", scrub_job->pgid,
+ state_at_entry)
+ << dendl;
+
+ switch (state_at_entry) {
+ case qu_state_t::registered:
+ // just updating the schedule?
+ update_job(scrub_job, suggested);
+ break;
+
+ case qu_state_t::not_registered:
+ // insertion under lock
+ {
+ std::unique_lock lck{jobs_lock};
+
+ if (state_at_entry != scrub_job->state) {
+ lck.unlock();
+ dout(5) << " scrub job state changed. Retrying." << dendl;
+ // retry
+ register_with_osd(scrub_job, suggested);
+ break;
+ }
+
+ update_job(scrub_job, suggested);
+ to_scrub.push_back(scrub_job);
+ scrub_job->in_queues = true;
+ scrub_job->state = qu_state_t::registered;
+ }
+ break;
+
+ case qu_state_t::unregistering:
+ // restore to the to_sched queue
+ {
+ // must be under lock, as the job might be removed from the queue
+      // at any moment
+ std::lock_guard lck{jobs_lock};
+
+ update_job(scrub_job, suggested);
+ if (scrub_job->state == qu_state_t::not_registered) {
+ dout(5) << " scrub job state changed to 'not registered'" << dendl;
+ to_scrub.push_back(scrub_job);
+ }
+ scrub_job->in_queues = true;
+ scrub_job->state = qu_state_t::registered;
+ }
+ break;
+ }
+
+ dout(10) << fmt::format(
+ "pg[{}] sched-state changed from <{:.14}> to <{:.14}> (@{:s})",
+ scrub_job->pgid, state_at_entry, scrub_job->state.load(),
+ scrub_job->schedule.scheduled_at)
+ << dendl;
+}
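
// For reference (not part of this file): the not_registered branch above
// peeks at the state without the lock, then re-checks it under the lock and
// retries on a race. The idiom, reduced to a self-contained sketch (the
// names here are hypothetical):
#include <atomic>
#include <mutex>

template <typename State, typename Mutate>
bool mutate_if_unchanged(std::mutex& lock,
                         std::atomic<State>& state,
                         State observed,
                         Mutate&& mutate)
{
  std::unique_lock lck{lock};
  if (observed != state.load()) {
    return false;  // lost the race - the caller re-reads and retries
  }
  mutate();  // still in the observed state: safe to modify under the lock
  return true;
}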
+
+// look mommy - no locks!
+void ScrubQueue::update_job(ScrubJobRef scrub_job,
+ const ScrubQueue::sched_params_t& suggested)
+{
+ // adjust the suggested scrub time according to OSD-wide status
+ auto adjusted = adjust_target_time(suggested);
+ scrub_job->update_schedule(adjusted);
+}
+
+ScrubQueue::sched_params_t ScrubQueue::determine_scrub_time(
+ const requested_scrub_t& request_flags,
+ const pg_info_t& pg_info,
+ const pool_opts_t& pool_conf) const
+{
+ ScrubQueue::sched_params_t res;
+
+ if (request_flags.must_scrub || request_flags.need_auto) {
+
+ // Set the smallest time that isn't utime_t()
+ res.proposed_time = PgScrubber::scrub_must_stamp();
+ res.is_must = ScrubQueue::must_scrub_t::mandatory;
+ // we do not need the interval data in this case
+
+ } else if (pg_info.stats.stats_invalid && conf()->osd_scrub_invalid_stats) {
+ res.proposed_time = time_now();
+ res.is_must = ScrubQueue::must_scrub_t::mandatory;
+
+ } else {
+ res.proposed_time = pg_info.history.last_scrub_stamp;
+ res.min_interval = pool_conf.value_or(pool_opts_t::SCRUB_MIN_INTERVAL, 0.0);
+ res.max_interval = pool_conf.value_or(pool_opts_t::SCRUB_MAX_INTERVAL, 0.0);
+ }
+
+ dout(15) << fmt::format(
+ "suggested: {:s} hist: {:s} v:{}/{} must:{} pool-min:{} {}",
+ res.proposed_time, pg_info.history.last_scrub_stamp,
+ (bool)pg_info.stats.stats_invalid,
+ conf()->osd_scrub_invalid_stats,
+ (res.is_must == must_scrub_t::mandatory ? "y" : "n"),
+ res.min_interval, request_flags)
+ << dendl;
+ return res;
+}
+
+
+// used under jobs_lock
+void ScrubQueue::move_failed_pgs(utime_t now_is)
+{
+ int punished_cnt{0}; // for log/debug only
+
+ for (auto job = to_scrub.begin(); job != to_scrub.end();) {
+ if ((*job)->resources_failure) {
+ auto sjob = *job;
+
+      // the last time it was scheduled for a scrub, this PG failed to
+      // secure remote resources. Move it to the secondary scrub queue.
+
+ dout(15) << "moving " << sjob->pgid
+ << " state: " << ScrubQueue::qu_state_text(sjob->state) << dendl;
+
+ // determine the penalty time, after which the job should be reinstated
+ utime_t after = now_is;
+ after += conf()->osd_scrub_sleep * 2 + utime_t{300'000ms};
+
+      // note: 'deadline' is currently not taken into account when
+      // determining 'penalty_timeout'.
+ sjob->penalty_timeout = after;
+ sjob->resources_failure = false;
+      sjob->updated = false;  // otherwise the job would be pardoned immediately
+
+ // place in the penalty list, and remove from the to-scrub group
+ penalized.push_back(sjob);
+ job = to_scrub.erase(job);
+ punished_cnt++;
+ } else {
+ job++;
+ }
+ }
+
+ if (punished_cnt) {
+ dout(15) << "# of jobs penalized: " << punished_cnt << dendl;
+ }
+}
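
// For reference (not part of this file): the penalty window computed above,
// restated with std::chrono. Assuming osd_scrub_sleep is at its usual
// default of 0.0 seconds, a failed job is reinstated after five minutes.
#include <chrono>

std::chrono::system_clock::time_point
penalty_end(std::chrono::system_clock::time_point now_is, double scrub_sleep)
{
  using namespace std::chrono_literals;
  auto sleep_part =
      std::chrono::duration_cast<std::chrono::system_clock::duration>(
	  std::chrono::duration<double>(scrub_sleep * 2));
  return now_is + sleep_part + 300s;  // the 300'000ms above == five minutes
}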
+
+// clang-format off
+/*
+ * Implementation note:
+ * Clang (10 & 11) produces efficient table-based code here, comparable to
+ * using a direct index into an array of strings.
+ * Gcc (11, trunk) is almost as efficient.
+ */
+std::string_view ScrubQueue::attempt_res_text(Scrub::schedule_result_t v)
+{
+ switch (v) {
+ case Scrub::schedule_result_t::scrub_initiated: return "scrubbing"sv;
+ case Scrub::schedule_result_t::none_ready: return "no ready job"sv;
+ case Scrub::schedule_result_t::no_local_resources: return "local resources shortage"sv;
+ case Scrub::schedule_result_t::already_started: return "denied as already started"sv;
+ case Scrub::schedule_result_t::no_such_pg: return "pg not found"sv;
+ case Scrub::schedule_result_t::bad_pg_state: return "prevented by pg state"sv;
+ case Scrub::schedule_result_t::preconditions: return "preconditions not met"sv;
+ }
+  // g++ (unlike clang) requires an extra 'return' here
+ return "(unknown)"sv;
+}
+
+std::string_view ScrubQueue::qu_state_text(qu_state_t st)
+{
+ switch (st) {
+ case qu_state_t::not_registered: return "not registered w/ OSD"sv;
+ case qu_state_t::registered: return "registered"sv;
+ case qu_state_t::unregistering: return "unregistering"sv;
+ }
+  // g++ (unlike clang) requires an extra 'return' here
+ return "(unknown)"sv;
+}
+// clang-format on
+
+/**
+ * a note regarding 'to_scrub_copy':
+ * 'to_scrub_copy' is a sorted set of all the ripe jobs from 'to_scrub'.
+ * As we usually expect to refer to only the first job in this set, we could
+ * consider an alternative implementation (sketched after the function body
+ * below):
+ * - have collect_ripe_jobs() return the copied set without sorting it;
+ * - loop, performing:
+ *   - use std::min_element() to find a candidate;
+ *   - try that one. If not suitable, discard it from 'to_scrub_copy'
+ */
+Scrub::schedule_result_t ScrubQueue::select_pg_and_scrub(
+ Scrub::ScrubPreconds& preconds)
+{
+ dout(10) << " reg./pen. sizes: " << to_scrub.size() << " / "
+ << penalized.size() << dendl;
+
+ utime_t now_is = time_now();
+
+ preconds.time_permit = scrub_time_permit(now_is);
+ preconds.load_is_low = scrub_load_below_threshold();
+ preconds.only_deadlined = !preconds.time_permit || !preconds.load_is_low;
+
+  // create a list of candidates (copying, as otherwise we would risk a
+  // deadlock):
+  // - possibly restore penalized
+  // - (if we didn't handle directly) remove invalid jobs
+  // - create a copy of the to_scrub (possibly up to the first not-ripe job)
+  // - same for the penalized (although that is usually a waste)
+  // then unlock and try the lists
+
+ std::unique_lock lck{jobs_lock};
+
+  // pardon all penalized jobs that have passed their penalty timeout
+  // (or were updated)
+ scan_penalized(restore_penalized, now_is);
+ restore_penalized = false;
+
+ // remove the 'updated' flag from all entries
+ std::for_each(to_scrub.begin(),
+ to_scrub.end(),
+ [](const auto& jobref) -> void { jobref->updated = false; });
+
+ // add failed scrub attempts to the penalized list
+ move_failed_pgs(now_is);
+
+ // collect all valid & ripe jobs from the two lists. Note that we must copy,
+ // as when we use the lists we will not be holding jobs_lock (see
+ // explanation above)
+
+ auto to_scrub_copy = collect_ripe_jobs(to_scrub, now_is);
+ auto penalized_copy = collect_ripe_jobs(penalized, now_is);
+ lck.unlock();
+
+ // try the regular queue first
+ auto res = select_from_group(to_scrub_copy, preconds, now_is);
+
+  // in the sole scenario in which we've gone over all ripe jobs without
+  // success, we will try the penalized list
+ if (res == Scrub::schedule_result_t::none_ready && !penalized_copy.empty()) {
+ res = select_from_group(penalized_copy, preconds, now_is);
+ dout(10) << "tried the penalized. Res: "
+ << ScrubQueue::attempt_res_text(res) << dendl;
+ restore_penalized = true;
+ }
+
+ dout(15) << dendl;
+ return res;
+}
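
// For reference (not part of this file): the alternative flow mentioned in
// the note above select_pg_and_scrub(), sketched. try_one_job() is a
// hypothetical stand-in for a single iteration of select_from_group().
#include <algorithm>

Scrub::schedule_result_t select_min_first(ScrubQueue::ScrubQContainer jobs)
{
  while (!jobs.empty()) {
    // no up-front sort; just find the earliest-scheduled candidate
    auto cand = std::min_element(jobs.begin(), jobs.end(), cmp_sched_time_t{});
    if (try_one_job(*cand) == Scrub::schedule_result_t::scrub_initiated) {
      return Scrub::schedule_result_t::scrub_initiated;
    }
    jobs.erase(cand);  // not suitable - discard and try the next earliest
  }
  return Scrub::schedule_result_t::none_ready;
}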
+
+// must be called under lock
+void ScrubQueue::rm_unregistered_jobs(ScrubQContainer& group)
+{
+ std::for_each(group.begin(), group.end(), [](auto& job) {
+ if (job->state == qu_state_t::unregistering) {
+ job->in_queues = false;
+ job->state = qu_state_t::not_registered;
+ } else if (job->state == qu_state_t::not_registered) {
+ job->in_queues = false;
+ }
+ });
+
+ group.erase(std::remove_if(group.begin(), group.end(), invalid_state),
+ group.end());
+}
+
+namespace {
+struct cmp_sched_time_t {
+ bool operator()(const ScrubQueue::ScrubJobRef& lhs,
+ const ScrubQueue::ScrubJobRef& rhs) const
+ {
+ return lhs->schedule.scheduled_at < rhs->schedule.scheduled_at;
+ }
+};
+} // namespace
+
+// called under lock
+ScrubQueue::ScrubQContainer ScrubQueue::collect_ripe_jobs(
+ ScrubQContainer& group,
+ utime_t time_now)
+{
+ rm_unregistered_jobs(group);
+
+ // copy ripe jobs
+ ScrubQueue::ScrubQContainer ripes;
+ ripes.reserve(group.size());
+
+ std::copy_if(group.begin(),
+ group.end(),
+ std::back_inserter(ripes),
+ [time_now](const auto& jobref) -> bool {
+ return jobref->schedule.scheduled_at <= time_now;
+ });
+ std::sort(ripes.begin(), ripes.end(), cmp_sched_time_t{});
+
+ if (g_conf()->subsys.should_gather<ceph_subsys_osd, 20>()) {
+ for (const auto& jobref : group) {
+ if (jobref->schedule.scheduled_at > time_now) {
+ dout(20) << " not ripe: " << jobref->pgid << " @ "
+ << jobref->schedule.scheduled_at << dendl;
+ }
+ }
+ }
+
+ return ripes;
+}
+
+// not holding jobs_lock. 'group' is a copy of the actual list.
+Scrub::schedule_result_t ScrubQueue::select_from_group(
+ ScrubQContainer& group,
+ const Scrub::ScrubPreconds& preconds,
+ utime_t now_is)
+{
+ dout(15) << "jobs #: " << group.size() << dendl;
+
+ for (auto& candidate : group) {
+
+ // we expect the first job in the list to be a good candidate (if any)
+
+ dout(20) << "try initiating scrub for " << candidate->pgid << dendl;
+
+ if (preconds.only_deadlined && (candidate->schedule.deadline.is_zero() ||
+ candidate->schedule.deadline >= now_is)) {
+ dout(15) << " not scheduling scrub for " << candidate->pgid << " due to "
+ << (preconds.time_permit ? "high load" : "time not permitting")
+ << dendl;
+ continue;
+ }
+
+ // we have a candidate to scrub. We turn to the OSD to verify that the PG
+ // configuration allows the specified type of scrub, and to initiate the
+ // scrub.
+ switch (
+ osd_service.initiate_a_scrub(candidate->pgid,
+ preconds.allow_requested_repair_only)) {
+
+ case Scrub::schedule_result_t::scrub_initiated:
+ // the happy path. We are done
+ dout(20) << " initiated for " << candidate->pgid << dendl;
+ return Scrub::schedule_result_t::scrub_initiated;
+
+ case Scrub::schedule_result_t::already_started:
+ case Scrub::schedule_result_t::preconditions:
+ case Scrub::schedule_result_t::bad_pg_state:
+ // continue with the next job
+ dout(20) << "failed (state/cond/started) " << candidate->pgid << dendl;
+ break;
+
+ case Scrub::schedule_result_t::no_such_pg:
+ // The pg is no longer there
+ dout(20) << "failed (no pg) " << candidate->pgid << dendl;
+ break;
+
+ case Scrub::schedule_result_t::no_local_resources:
+ // failure to secure local resources. No point in trying the other
+ // PGs at this time. Note that this is not the same as replica resources
+ // failure!
+ dout(20) << "failed (local) " << candidate->pgid << dendl;
+ return Scrub::schedule_result_t::no_local_resources;
+
+ case Scrub::schedule_result_t::none_ready:
+ // can't happen. Just for the compiler.
+ dout(5) << "failed !!! " << candidate->pgid << dendl;
+ return Scrub::schedule_result_t::none_ready;
+ }
+ }
+
+ dout(20) << " returning 'none ready'" << dendl;
+ return Scrub::schedule_result_t::none_ready;
+}
+
+ScrubQueue::scrub_schedule_t ScrubQueue::adjust_target_time(
+ const sched_params_t& times) const
+{
+ ScrubQueue::scrub_schedule_t sched_n_dead{
+ times.proposed_time, times.proposed_time};
+
+ if (times.is_must == ScrubQueue::must_scrub_t::not_mandatory) {
+ // unless explicitly requested, postpone the scrub with a random delay
+ double scrub_min_interval = times.min_interval > 0
+ ? times.min_interval
+ : conf()->osd_scrub_min_interval;
+ double scrub_max_interval = times.max_interval > 0
+ ? times.max_interval
+ : conf()->osd_scrub_max_interval;
+
+ sched_n_dead.scheduled_at += scrub_min_interval;
+ double r = rand() / (double)RAND_MAX;
+ sched_n_dead.scheduled_at +=
+ scrub_min_interval * conf()->osd_scrub_interval_randomize_ratio * r;
+
+ if (scrub_max_interval <= 0) {
+ sched_n_dead.deadline = utime_t{};
+ } else {
+ sched_n_dead.deadline += scrub_max_interval;
+ }
+ // note: no specific job can be named in the log message
+ dout(20) << fmt::format(
+ "not-must. Was:{:s} {{min:{}/{} max:{}/{} ratio:{}}} "
+ "Adjusted:{:s} ({:s})",
+ times.proposed_time, fmt::group_digits(times.min_interval),
+ fmt::group_digits(conf()->osd_scrub_min_interval),
+ fmt::group_digits(times.max_interval),
+ fmt::group_digits(conf()->osd_scrub_max_interval),
+ conf()->osd_scrub_interval_randomize_ratio,
+ sched_n_dead.scheduled_at, sched_n_dead.deadline)
+ << dendl;
+ }
+ // else - no log needed. All relevant data will be logged by the caller
+ return sched_n_dead;
+}
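
// For reference (not part of this file): the randomized postponement above,
// standalone. The values 86400 and 0.5 mirror the usual defaults of
// osd_scrub_min_interval and osd_scrub_interval_randomize_ratio, but are
// assumptions here; <random> replaces the rand() call used above.
#include <random>

double randomized_target(double proposed,
                         double min_interval = 86400.0,
                         double randomize_ratio = 0.5)
{
  static std::mt19937 gen{std::random_device{}()};
  std::uniform_real_distribution<double> r{0.0, 1.0};
  // uniform in [proposed + min, proposed + min * (1 + ratio)]
  return proposed + min_interval + min_interval * randomize_ratio * r(gen);
}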
+
+double ScrubQueue::scrub_sleep_time(bool must_scrub) const
+{
+ double regular_sleep_period = conf()->osd_scrub_sleep;
+
+ if (must_scrub || scrub_time_permit(time_now())) {
+ return regular_sleep_period;
+ }
+
+ // relevant if scrubbing started during allowed time, but continued into
+ // forbidden hours
+ double extended_sleep = conf()->osd_scrub_extended_sleep;
+ dout(20) << "w/ extended sleep (" << extended_sleep << ")" << dendl;
+ return std::max(extended_sleep, regular_sleep_period);
+}
+
+bool ScrubQueue::scrub_load_below_threshold() const
+{
+ double loadavgs[3];
+ if (getloadavg(loadavgs, 3) != 3) {
+ dout(10) << __func__ << " couldn't read loadavgs\n" << dendl;
+ return false;
+ }
+
+ // allow scrub if below configured threshold
+ long cpus = sysconf(_SC_NPROCESSORS_ONLN);
+ double loadavg_per_cpu = cpus > 0 ? loadavgs[0] / cpus : loadavgs[0];
+ if (loadavg_per_cpu < conf()->osd_scrub_load_threshold) {
+ dout(20) << "loadavg per cpu " << loadavg_per_cpu << " < max "
+ << conf()->osd_scrub_load_threshold << " = yes" << dendl;
+ return true;
+ }
+
+ // allow scrub if below daily avg and currently decreasing
+ if (loadavgs[0] < daily_loadavg && loadavgs[0] < loadavgs[2]) {
+ dout(20) << "loadavg " << loadavgs[0] << " < daily_loadavg "
+ << daily_loadavg << " and < 15m avg " << loadavgs[2] << " = yes"
+ << dendl;
+ return true;
+ }
+
+ dout(20) << "loadavg " << loadavgs[0] << " >= max "
+ << conf()->osd_scrub_load_threshold << " and ( >= daily_loadavg "
+ << daily_loadavg << " or >= 15m avg " << loadavgs[2] << ") = no"
+ << dendl;
+ return false;
+}
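
// For reference (not part of this file): a worked example of the first test
// above. On an 8-CPU node with a 1-minute loadavg of 2.0, the per-CPU load
// is 0.25, which passes an osd_scrub_load_threshold of 0.5 (an assumed
// value) and permits scrubbing regardless of the daily average:
//
//   loadavg_per_cpu = 2.0 / 8 = 0.25;  0.25 < 0.5  ->  scrub allowed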
+
+
+// note: called with jobs_lock held
+void ScrubQueue::scan_penalized(bool forgive_all, utime_t time_now)
+{
+ dout(20) << time_now << (forgive_all ? " all " : " - ") << penalized.size()
+ << dendl;
+
+  // clear dead entries (deleted PGs, or PGs for which we are no longer
+  // the primary)
+ rm_unregistered_jobs(penalized);
+
+ if (forgive_all) {
+
+ std::copy(penalized.begin(), penalized.end(), std::back_inserter(to_scrub));
+ penalized.clear();
+
+ } else {
+
+ auto forgiven_last = std::partition(
+ penalized.begin(),
+ penalized.end(),
+ [time_now](const auto& e) {
+ return (*e).updated || ((*e).penalty_timeout <= time_now);
+ });
+
+ std::copy(penalized.begin(), forgiven_last, std::back_inserter(to_scrub));
+ penalized.erase(penalized.begin(), forgiven_last);
+ dout(20) << "penalized after screening: " << penalized.size() << dendl;
+ }
+}
+
+// checks for half-closed ranges. Modify the (p < till) to '<=' to check for
+// closed ranges.
+static inline bool isbetween_modulo(int64_t from, int64_t till, int p)
+{
+ // the 1st condition is because we have defined from==till as "always true"
+ return (till == from) || ((till >= from) ^ (p >= from) ^ (p < till));
+}
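
// For reference (not part of this file): concrete evaluations of the
// predicate above, with day numbers as in tm_wday (0 = Sunday):
//
//   isbetween_modulo(0, 0, 4)  -> true   (from == till: always true)
//   isbetween_modulo(1, 5, 3)  -> true   (Wednesday within Mon..Fri)
//   isbetween_modulo(1, 5, 5)  -> false  (half-closed: 'till' is excluded)
//   isbetween_modulo(5, 1, 6)  -> true   (Saturday within the wrapped
//                                         Fri..Mon range)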
+
+bool ScrubQueue::scrub_time_permit(utime_t now) const
+{
+ tm bdt;
+ time_t tt = now.sec();
+ localtime_r(&tt, &bdt);
+
+ bool day_permit = isbetween_modulo(conf()->osd_scrub_begin_week_day,
+ conf()->osd_scrub_end_week_day,
+ bdt.tm_wday);
+ if (!day_permit) {
+ dout(20) << "should run between week day "
+ << conf()->osd_scrub_begin_week_day << " - "
+ << conf()->osd_scrub_end_week_day << " now " << bdt.tm_wday
+ << " - no" << dendl;
+ return false;
+ }
+
+ bool time_permit = isbetween_modulo(conf()->osd_scrub_begin_hour,
+ conf()->osd_scrub_end_hour,
+ bdt.tm_hour);
+ dout(20) << "should run between " << conf()->osd_scrub_begin_hour << " - "
+ << conf()->osd_scrub_end_hour << " now (" << bdt.tm_hour
+ << ") = " << (time_permit ? "yes" : "no") << dendl;
+ return time_permit;
+}
+
+void ScrubQueue::ScrubJob::dump(ceph::Formatter* f) const
+{
+ f->open_object_section("scrub");
+ f->dump_stream("pgid") << pgid;
+ f->dump_stream("sched_time") << schedule.scheduled_at;
+ f->dump_stream("deadline") << schedule.deadline;
+ f->dump_bool("forced",
+ schedule.scheduled_at == PgScrubber::scrub_must_stamp());
+ f->close_section();
+}
+
+void ScrubQueue::dump_scrubs(ceph::Formatter* f) const
+{
+ ceph_assert(f != nullptr);
+ std::lock_guard lck(jobs_lock);
+
+ f->open_array_section("scrubs");
+
+ std::for_each(to_scrub.cbegin(), to_scrub.cend(), [&f](const ScrubJobRef& j) {
+ j->dump(f);
+ });
+
+ std::for_each(penalized.cbegin(),
+ penalized.cend(),
+ [&f](const ScrubJobRef& j) { j->dump(f); });
+
+ f->close_section();
+}
+
+ScrubQueue::ScrubQContainer ScrubQueue::list_registered_jobs() const
+{
+ ScrubQueue::ScrubQContainer all_jobs;
+ all_jobs.reserve(to_scrub.size() + penalized.size());
+ dout(20) << " size: " << all_jobs.capacity() << dendl;
+
+ std::lock_guard lck{jobs_lock};
+
+ std::copy_if(to_scrub.begin(),
+ to_scrub.end(),
+ std::back_inserter(all_jobs),
+ registered_job);
+ std::copy_if(penalized.begin(),
+ penalized.end(),
+ std::back_inserter(all_jobs),
+ registered_job);
+
+ return all_jobs;
+}
+
+// ////////////////////////////////////////////////////////////////////////// //
+// ScrubQueue - scrub resource management
+
+bool ScrubQueue::can_inc_scrubs() const
+{
+ // consider removing the lock here. Caller already handles delayed
+ // inc_scrubs_local() failures
+ std::lock_guard lck{resource_lock};
+
+ if (scrubs_local + scrubs_remote < conf()->osd_max_scrubs) {
+ return true;
+ }
+
+ dout(20) << " == false. " << scrubs_local << " local + " << scrubs_remote
+ << " remote >= max " << conf()->osd_max_scrubs << dendl;
+ return false;
+}
+
+bool ScrubQueue::inc_scrubs_local()
+{
+ std::lock_guard lck{resource_lock};
+
+ if (scrubs_local + scrubs_remote < conf()->osd_max_scrubs) {
+ ++scrubs_local;
+ return true;
+ }
+
+ dout(20) << ": " << scrubs_local << " local + " << scrubs_remote
+ << " remote >= max " << conf()->osd_max_scrubs << dendl;
+ return false;
+}
+
+void ScrubQueue::dec_scrubs_local()
+{
+ std::lock_guard lck{resource_lock};
+ dout(20) << ": " << scrubs_local << " -> " << (scrubs_local - 1) << " (max "
+ << conf()->osd_max_scrubs << ", remote " << scrubs_remote << ")"
+ << dendl;
+
+ --scrubs_local;
+ ceph_assert(scrubs_local >= 0);
+}
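
// For reference (not part of this file): a hypothetical caller of the
// counters above. Every successful inc_scrubs_local() must eventually be
// paired with a dec_scrubs_local(); the remote counter works the same way.
//
//   if (scrub_queue.inc_scrubs_local()) {
//     // ... perform the local side of the scrub ...
//     scrub_queue.dec_scrubs_local();
//   } else {
//     // at osd_max_scrubs - try again later
//   }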
+
+bool ScrubQueue::inc_scrubs_remote()
+{
+ std::lock_guard lck{resource_lock};
+
+ if (scrubs_local + scrubs_remote < conf()->osd_max_scrubs) {
+ dout(20) << ": " << scrubs_remote << " -> " << (scrubs_remote + 1)
+ << " (max " << conf()->osd_max_scrubs << ", local "
+ << scrubs_local << ")" << dendl;
+ ++scrubs_remote;
+ return true;
+ }
+
+ dout(20) << ": " << scrubs_local << " local + " << scrubs_remote
+ << " remote >= max " << conf()->osd_max_scrubs << dendl;
+ return false;
+}
+
+void ScrubQueue::dec_scrubs_remote()
+{
+ std::lock_guard lck{resource_lock};
+ dout(20) << ": " << scrubs_remote << " -> " << (scrubs_remote - 1) << " (max "
+ << conf()->osd_max_scrubs << ", local " << scrubs_local << ")"
+ << dendl;
+ --scrubs_remote;
+ ceph_assert(scrubs_remote >= 0);
+}
+
+void ScrubQueue::dump_scrub_reservations(ceph::Formatter* f) const
+{
+ std::lock_guard lck{resource_lock};
+ f->dump_int("scrubs_local", scrubs_local);
+ f->dump_int("scrubs_remote", scrubs_remote);
+ f->dump_int("osd_max_scrubs", conf()->osd_max_scrubs);
+}
+
+void ScrubQueue::clear_pg_scrub_blocked(spg_t blocked_pg)
+{
+ dout(5) << fmt::format(": pg {} is unblocked", blocked_pg) << dendl;
+ --blocked_scrubs_cnt;
+ ceph_assert(blocked_scrubs_cnt >= 0);
+}
+
+void ScrubQueue::mark_pg_scrub_blocked(spg_t blocked_pg)
+{
+ dout(5) << fmt::format(": pg {} is blocked on an object", blocked_pg)
+ << dendl;
+ ++blocked_scrubs_cnt;
+}
+
+int ScrubQueue::get_blocked_pgs_count() const
+{
+ return blocked_scrubs_cnt;
+}