Diffstat (limited to 'src/osd/scheduler')

 src/osd/scheduler/OpScheduler.cc     |  60 +
 src/osd/scheduler/OpScheduler.h      | 149 +
 src/osd/scheduler/OpSchedulerItem.cc | 272 +
 src/osd/scheduler/OpSchedulerItem.h  | 607 +
 src/osd/scheduler/mClockScheduler.cc | 597 +
 src/osd/scheduler/mClockScheduler.h  | 277 +
 6 files changed, 1962 insertions(+), 0 deletions(-)
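The OpScheduler interface introduced below hands work back as a WorkItem variant: std::monostate (nothing usable), an OpSchedulerItem to run, or a double giving the earliest future time at which mClock will release an item. The following is a minimal consumer sketch, not code from this change; the drain_one helper is hypothetical and the surrounding OSD plumbing (PG lock, shard data, thread-pool handle) is assumed to exist in a full Ceph build.

#include <variant>
#include "osd/scheduler/OpScheduler.h"

using namespace ceph::osd::scheduler;

// Drain one item from a shard's scheduler, mirroring how the OSD work
// queue is expected to interpret the WorkItem variant.
void drain_one(OpScheduler &sched) {
  if (sched.empty()) {
    return;                       // nothing queued on this shard
  }
  WorkItem wi = sched.dequeue();
  if (auto *item = std::get_if<OpSchedulerItem>(&wi)) {
    // An op is ready; the real caller locks the PG and then calls
    // item->run(osd, sdata, pg, handle).
    (void)item;
  } else if (auto *future = std::get_if<double>(&wi)) {
    // mClock holds work, but nothing is eligible yet; *future is the
    // earliest time a retry can succeed, so the caller waits until then.
    (void)future;
  }
  // std::monostate: nothing was returned at all; the caller just retries.
}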
diff --git a/src/osd/scheduler/OpScheduler.cc b/src/osd/scheduler/OpScheduler.cc new file mode 100644 index 000000000..cb5ef13b6 --- /dev/null +++ b/src/osd/scheduler/OpScheduler.cc @@ -0,0 +1,60 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2019 Red Hat Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include <ostream> + +#include "osd/scheduler/OpScheduler.h" + +#include "common/WeightedPriorityQueue.h" +#include "osd/scheduler/mClockScheduler.h" + +namespace ceph::osd::scheduler { + +OpSchedulerRef make_scheduler( + CephContext *cct, int whoami, uint32_t num_shards, int shard_id, + bool is_rotational, std::string_view osd_objectstore, MonClient *monc) +{ + const std::string *type = &cct->_conf->osd_op_queue; + if (*type == "debug_random") { + static const std::string index_lookup[] = { "mclock_scheduler", + "wpq" }; + srand(time(NULL)); + unsigned which = rand() % (sizeof(index_lookup) / sizeof(index_lookup[0])); + type = &index_lookup[which]; + } + + // Force the use of 'wpq' scheduler for filestore OSDs. + // The 'mclock_scheduler' is not supported for filestore OSDs. + if (*type == "wpq" || osd_objectstore == "filestore") { + return std::make_unique< + ClassedOpQueueScheduler<WeightedPriorityQueue<OpSchedulerItem, client>>>( + cct, + cct->_conf->osd_op_pq_max_tokens_per_priority, + cct->_conf->osd_op_pq_min_cost + ); + } else if (*type == "mclock_scheduler") { + // default is 'mclock_scheduler' + return std::make_unique< + mClockScheduler>(cct, whoami, num_shards, shard_id, is_rotational, monc); + } else { + ceph_assert("Invalid choice of wq" == 0); + } +} + +std::ostream &operator<<(std::ostream &lhs, const OpScheduler &rhs) { + rhs.print(lhs); + return lhs; +} + +} diff --git a/src/osd/scheduler/OpScheduler.h b/src/osd/scheduler/OpScheduler.h new file mode 100644 index 000000000..1575bcae4 --- /dev/null +++ b/src/osd/scheduler/OpScheduler.h @@ -0,0 +1,149 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2019 Red Hat Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#pragma once + +#include <ostream> +#include <variant> + +#include "common/ceph_context.h" +#include "mon/MonClient.h" +#include "osd/scheduler/OpSchedulerItem.h" + +namespace ceph::osd::scheduler { + +using client = uint64_t; +using WorkItem = std::variant<std::monostate, OpSchedulerItem, double>; + +/** + * Base interface for classes responsible for choosing + * op processing order in the OSD. + */ +class OpScheduler { +public: + // Enqueue op for scheduling + virtual void enqueue(OpSchedulerItem &&item) = 0; + + // Enqueue op for processing as though it were enqueued prior + // to other items already scheduled. 
+ virtual void enqueue_front(OpSchedulerItem &&item) = 0; + + // Returns true iff there are no ops scheduled + virtual bool empty() const = 0; + + // Return next op to be processed + virtual WorkItem dequeue() = 0; + + // Dump formatted representation for the queue + virtual void dump(ceph::Formatter &f) const = 0; + + // Print human readable brief description with relevant parameters + virtual void print(std::ostream &out) const = 0; + + // Apply config changes to the scheduler (if any) + virtual void update_configuration() = 0; + + // Destructor + virtual ~OpScheduler() {}; +}; + +std::ostream &operator<<(std::ostream &lhs, const OpScheduler &); +using OpSchedulerRef = std::unique_ptr<OpScheduler>; + +OpSchedulerRef make_scheduler( + CephContext *cct, int whoami, uint32_t num_shards, int shard_id, + bool is_rotational, std::string_view osd_objectstore, MonClient *monc); + +/** + * Implements OpScheduler in terms of OpQueue + * + * Templated on queue type to avoid dynamic dispatch, T should implement + * OpQueue<OpSchedulerItem, client>. This adapter is mainly responsible for + * the boilerplate priority cutoff/strict concept which is needed for + * OpQueue based implementations. + */ +template <typename T> +class ClassedOpQueueScheduler final : public OpScheduler { + unsigned cutoff; + T queue; + + static unsigned int get_io_prio_cut(CephContext *cct) { + if (cct->_conf->osd_op_queue_cut_off == "debug_random") { + srand(time(NULL)); + return (rand() % 2 < 1) ? CEPH_MSG_PRIO_HIGH : CEPH_MSG_PRIO_LOW; + } else if (cct->_conf->osd_op_queue_cut_off == "high") { + return CEPH_MSG_PRIO_HIGH; + } else { + // default / catch-all is 'low' + return CEPH_MSG_PRIO_LOW; + } + } +public: + template <typename... Args> + ClassedOpQueueScheduler(CephContext *cct, Args&&... args) : + cutoff(get_io_prio_cut(cct)), + queue(std::forward<Args>(args)...) + {} + + void enqueue(OpSchedulerItem &&item) final { + unsigned priority = item.get_priority(); + unsigned cost = item.get_cost(); + + if (priority >= cutoff) + queue.enqueue_strict( + item.get_owner(), priority, std::move(item)); + else + queue.enqueue( + item.get_owner(), priority, cost, std::move(item)); + } + + void enqueue_front(OpSchedulerItem &&item) final { + unsigned priority = item.get_priority(); + unsigned cost = item.get_cost(); + if (priority >= cutoff) + queue.enqueue_strict_front( + item.get_owner(), + priority, std::move(item)); + else + queue.enqueue_front( + item.get_owner(), + priority, cost, std::move(item)); + } + + bool empty() const final { + return queue.empty(); + } + + WorkItem dequeue() final { + return queue.dequeue(); + } + + void dump(ceph::Formatter &f) const final { + return queue.dump(&f); + } + + void print(std::ostream &out) const final { + out << "ClassedOpQueueScheduler(queue="; + queue.print(out); + out << ", cutoff=" << cutoff << ")"; + } + + void update_configuration() final { + // no-op + } + + ~ClassedOpQueueScheduler() final {}; +}; + +} diff --git a/src/osd/scheduler/OpSchedulerItem.cc b/src/osd/scheduler/OpSchedulerItem.cc new file mode 100644 index 000000000..d1abc264a --- /dev/null +++ b/src/osd/scheduler/OpSchedulerItem.cc @@ -0,0 +1,272 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 Red Hat Inc. 
+ * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "osd/scheduler/OpSchedulerItem.h" +#include "osd/OSD.h" +#include "osd/osd_tracer.h" + + +namespace ceph::osd::scheduler { + +std::ostream& operator<<(std::ostream& out, const op_scheduler_class& class_id) { + out << static_cast<size_t>(class_id); + return out; +} + +void PGOpItem::run( + OSD *osd, + OSDShard *sdata, + PGRef& pg, + ThreadPool::TPHandle &handle) +{ + osd->dequeue_op(pg, op, handle); + pg->unlock(); +} + +void PGPeeringItem::run( + OSD *osd, + OSDShard *sdata, + PGRef& pg, + ThreadPool::TPHandle &handle) +{ + osd->dequeue_peering_evt(sdata, pg.get(), evt, handle); +} + +void PGSnapTrim::run( + OSD *osd, + OSDShard *sdata, + PGRef& pg, + ThreadPool::TPHandle &handle) +{ + pg->snap_trimmer(epoch_queued); + pg->unlock(); +} + +void PGScrub::run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) +{ + pg->scrub(epoch_queued, handle); + pg->unlock(); +} + +void PGScrubAfterRepair::run(OSD* osd, + OSDShard* sdata, + PGRef& pg, + ThreadPool::TPHandle& handle) +{ + pg->recovery_scrub(epoch_queued, handle); + pg->unlock(); +} + +void PGScrubResched::run(OSD* osd, + OSDShard* sdata, + PGRef& pg, + ThreadPool::TPHandle& handle) +{ + pg->scrub_send_scrub_resched(epoch_queued, handle); + pg->unlock(); +} + +void PGScrubResourcesOK::run(OSD* osd, + OSDShard* sdata, + PGRef& pg, + ThreadPool::TPHandle& handle) +{ + pg->scrub_send_resources_granted(epoch_queued, handle); + pg->unlock(); +} + +void PGScrubDenied::run(OSD* osd, + OSDShard* sdata, + PGRef& pg, + ThreadPool::TPHandle& handle) +{ + pg->scrub_send_resources_denied(epoch_queued, handle); + pg->unlock(); +} + +void PGScrubPushesUpdate::run(OSD* osd, + OSDShard* sdata, + PGRef& pg, + ThreadPool::TPHandle& handle) +{ + pg->scrub_send_pushes_update(epoch_queued, handle); + pg->unlock(); +} + +void PGScrubAppliedUpdate::run(OSD* osd, + OSDShard* sdata, + PGRef& pg, + ThreadPool::TPHandle& handle) +{ + pg->scrub_send_applied_update(epoch_queued, handle); + pg->unlock(); +} + +void PGScrubUnblocked::run(OSD* osd, + OSDShard* sdata, + PGRef& pg, + ThreadPool::TPHandle& handle) +{ + pg->scrub_send_unblocking(epoch_queued, handle); + pg->unlock(); +} + +void PGScrubDigestUpdate::run(OSD* osd, + OSDShard* sdata, + PGRef& pg, + ThreadPool::TPHandle& handle) +{ + pg->scrub_send_digest_update(epoch_queued, handle); + pg->unlock(); +} + +void PGScrubGotLocalMap::run(OSD* osd, + OSDShard* sdata, + PGRef& pg, + ThreadPool::TPHandle& handle) +{ + pg->scrub_send_local_map_ready(epoch_queued, handle); + pg->unlock(); +} + +void PGScrubGotReplMaps::run(OSD* osd, + OSDShard* sdata, + PGRef& pg, + ThreadPool::TPHandle& handle) +{ + pg->scrub_send_replmaps_ready(epoch_queued, handle); + pg->unlock(); +} + +void PGRepScrub::run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) +{ + pg->replica_scrub(epoch_queued, activation_index, handle); + pg->unlock(); +} + +void PGRepScrubResched::run(OSD* osd, + OSDShard* sdata, + PGRef& pg, + ThreadPool::TPHandle& handle) +{ + pg->replica_scrub_resched(epoch_queued, activation_index, handle); + pg->unlock(); +} + +void PGScrubReplicaPushes::run([[maybe_unused]] OSD* osd, + OSDShard* sdata, + PGRef& pg, + ThreadPool::TPHandle& handle) +{ + pg->scrub_send_replica_pushes(epoch_queued, handle); + pg->unlock(); +} + +void 
PGScrubScrubFinished::run([[maybe_unused]] OSD* osd,
+			  OSDShard* sdata,
+			  PGRef& pg,
+			  ThreadPool::TPHandle& handle)
+{
+  pg->scrub_send_scrub_is_finished(epoch_queued, handle);
+  pg->unlock();
+}
+
+void PGScrubGetNextChunk::run([[maybe_unused]] OSD* osd,
+			      OSDShard* sdata,
+			      PGRef& pg,
+			      ThreadPool::TPHandle& handle)
+{
+  pg->scrub_send_get_next_chunk(epoch_queued, handle);
+  pg->unlock();
+}
+
+void PGScrubChunkIsBusy::run([[maybe_unused]] OSD* osd,
+			     OSDShard* sdata,
+			     PGRef& pg,
+			     ThreadPool::TPHandle& handle)
+{
+  pg->scrub_send_chunk_busy(epoch_queued, handle);
+  pg->unlock();
+}
+
+void PGScrubChunkIsFree::run([[maybe_unused]] OSD* osd,
+			     OSDShard* sdata,
+			     PGRef& pg,
+			     ThreadPool::TPHandle& handle)
+{
+  pg->scrub_send_chunk_free(epoch_queued, handle);
+  pg->unlock();
+}
+
+void PGRecovery::run(
+  OSD *osd,
+  OSDShard *sdata,
+  PGRef& pg,
+  ThreadPool::TPHandle &handle)
+{
+  osd->logger->tinc(
+    l_osd_recovery_queue_lat,
+    ceph_clock_now() - time_queued);
+  osd->do_recovery(pg.get(), epoch_queued, reserved_pushes, priority, handle);
+  pg->unlock();
+}
+
+void PGRecoveryContext::run(
+  OSD *osd,
+  OSDShard *sdata,
+  PGRef& pg,
+  ThreadPool::TPHandle &handle)
+{
+  osd->logger->tinc(
+    l_osd_recovery_context_queue_lat,
+    ceph_clock_now() - time_queued);
+  c.release()->complete(handle);
+  pg->unlock();
+}
+
+void PGDelete::run(
+  OSD *osd,
+  OSDShard *sdata,
+  PGRef& pg,
+  ThreadPool::TPHandle &handle)
+{
+  osd->dequeue_delete(sdata, pg.get(), epoch_queued, handle);
+}
+
+void PGRecoveryMsg::run(
+  OSD *osd,
+  OSDShard *sdata,
+  PGRef& pg,
+  ThreadPool::TPHandle &handle)
+{
+  auto latency = ceph_clock_now() - time_queued;
+  switch (op->get_req()->get_type()) {
+  case MSG_OSD_PG_PUSH:
+    osd->logger->tinc(l_osd_recovery_push_queue_lat, latency);
+    break;
+  case MSG_OSD_PG_PUSH_REPLY:
+    osd->logger->tinc(l_osd_recovery_push_reply_queue_lat, latency);
+    break;
+  case MSG_OSD_PG_PULL:
+    osd->logger->tinc(l_osd_recovery_pull_queue_lat, latency);
+    break;
+  case MSG_OSD_PG_BACKFILL:
+    osd->logger->tinc(l_osd_recovery_backfill_queue_lat, latency);
+    break;
+  case MSG_OSD_PG_BACKFILL_REMOVE:
+    osd->logger->tinc(l_osd_recovery_backfill_remove_queue_lat, latency);
+    break;
+  case MSG_OSD_PG_SCAN:
+    osd->logger->tinc(l_osd_recovery_scan_queue_lat, latency);
+    break;
+  }
+  osd->dequeue_op(pg, op, handle);
+  pg->unlock();
+}
+
+}
diff --git a/src/osd/scheduler/OpSchedulerItem.h b/src/osd/scheduler/OpSchedulerItem.h
new file mode 100644
index 000000000..a9ec14de3
--- /dev/null
+++ b/src/osd/scheduler/OpSchedulerItem.h
@@ -0,0 +1,607 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 Red Hat Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ * + */ + +#pragma once + +#include <ostream> + +#include "include/types.h" +#include "include/utime.h" +#include "osd/OpRequest.h" +#include "osd/PG.h" +#include "osd/PGPeeringEvent.h" +#include "messages/MOSDOp.h" + + +class OSD; +struct OSDShard; + +namespace ceph::osd::scheduler { + +enum class op_scheduler_class : uint8_t { + background_recovery = 0, + background_best_effort, + immediate, + client, +}; + +std::ostream& operator<<(std::ostream& out, const op_scheduler_class& class_id); + +class OpSchedulerItem { +public: + // Abstraction for operations queueable in the op queue + class OpQueueable { + public: + using Ref = std::unique_ptr<OpQueueable>; + + /// Items with the same queue token will end up in the same shard + virtual uint32_t get_queue_token() const = 0; + + /* Items will be dequeued and locked atomically w.r.t. other items with the + * same ordering token */ + virtual const spg_t& get_ordering_token() const = 0; + + virtual std::optional<OpRequestRef> maybe_get_op() const { + return std::nullopt; + } + + virtual uint64_t get_reserved_pushes() const { + return 0; + } + + virtual bool is_peering() const { + return false; + } + virtual bool peering_requires_pg() const { + ceph_abort(); + } + virtual const PGCreateInfo *creates_pg() const { + return nullptr; + } + + virtual std::ostream &print(std::ostream &rhs) const = 0; + + virtual void run(OSD *osd, OSDShard *sdata, PGRef& pg, ThreadPool::TPHandle &handle) = 0; + virtual op_scheduler_class get_scheduler_class() const = 0; + + virtual ~OpQueueable() {} + friend std::ostream& operator<<(std::ostream& out, const OpQueueable& q) { + return q.print(out); + } + + }; + +private: + OpQueueable::Ref qitem; + int cost; + unsigned priority; + utime_t start_time; + uint64_t owner; ///< global id (e.g., client.XXX) + epoch_t map_epoch; ///< an epoch we expect the PG to exist in + + /** + * qos_cost + * + * Set by mClockScheduler iff queued into mclock proper and not the + * high/immediate queues. Represents mClockScheduler's adjusted + * cost value. 
+ */ + uint32_t qos_cost = 0; + + /// True iff queued via mclock proper, not the high/immediate queues + bool was_queued_via_mclock() const { + return qos_cost > 0; + } + +public: + OpSchedulerItem( + OpQueueable::Ref &&item, + int cost, + unsigned priority, + utime_t start_time, + uint64_t owner, + epoch_t e) + : qitem(std::move(item)), + cost(cost), + priority(priority), + start_time(start_time), + owner(owner), + map_epoch(e) {} + OpSchedulerItem(OpSchedulerItem &&) = default; + OpSchedulerItem(const OpSchedulerItem &) = delete; + OpSchedulerItem &operator=(OpSchedulerItem &&) = default; + OpSchedulerItem &operator=(const OpSchedulerItem &) = delete; + + uint32_t get_queue_token() const { + return qitem->get_queue_token(); + } + const spg_t& get_ordering_token() const { + return qitem->get_ordering_token(); + } + std::optional<OpRequestRef> maybe_get_op() const { + return qitem->maybe_get_op(); + } + uint64_t get_reserved_pushes() const { + return qitem->get_reserved_pushes(); + } + void run(OSD *osd, OSDShard *sdata,PGRef& pg, ThreadPool::TPHandle &handle) { + qitem->run(osd, sdata, pg, handle); + } + unsigned get_priority() const { return priority; } + int get_cost() const { return cost; } + utime_t get_start_time() const { return start_time; } + uint64_t get_owner() const { return owner; } + epoch_t get_map_epoch() const { return map_epoch; } + + bool is_peering() const { + return qitem->is_peering(); + } + + const PGCreateInfo *creates_pg() const { + return qitem->creates_pg(); + } + + bool peering_requires_pg() const { + return qitem->peering_requires_pg(); + } + + op_scheduler_class get_scheduler_class() const { + return qitem->get_scheduler_class(); + } + + void set_qos_cost(uint32_t scaled_cost) { + qos_cost = scaled_cost; + } + + friend std::ostream& operator<<(std::ostream& out, const OpSchedulerItem& item) { + out << "OpSchedulerItem(" + << item.get_ordering_token() << " " << *item.qitem; + + out << " class_id " << item.get_scheduler_class(); + + out << " prio " << item.get_priority(); + + if (item.was_queued_via_mclock()) { + out << " qos_cost " << item.qos_cost; + } + + out << " cost " << item.get_cost() + << " e" << item.get_map_epoch(); + + if (item.get_reserved_pushes()) { + out << " reserved_pushes " << item.get_reserved_pushes(); + } + + return out << ")"; + } +}; // class OpSchedulerItem + +/// Implements boilerplate for operations queued for the pg lock +class PGOpQueueable : public OpSchedulerItem::OpQueueable { + spg_t pgid; +protected: + const spg_t& get_pgid() const { + return pgid; + } + + static op_scheduler_class priority_to_scheduler_class(int priority) { + if (priority >= CEPH_MSG_PRIO_HIGH) { + return op_scheduler_class::immediate; + } else if (priority >= PeeringState::recovery_msg_priority_t::DEGRADED) { + return op_scheduler_class::background_recovery; + } else { + return op_scheduler_class::background_best_effort; + } + } + +public: + explicit PGOpQueueable(spg_t pg) : pgid(pg) {} + uint32_t get_queue_token() const final { + return get_pgid().ps(); + } + + const spg_t& get_ordering_token() const final { + return get_pgid(); + } +}; + +class PGOpItem : public PGOpQueueable { + OpRequestRef op; + +public: + PGOpItem(spg_t pg, OpRequestRef op) : PGOpQueueable(pg), op(std::move(op)) {} + + std::ostream &print(std::ostream &rhs) const final { + return rhs << "PGOpItem(op=" << *(op->get_req()) << ")"; + } + + std::optional<OpRequestRef> maybe_get_op() const final { + return op; + } + + op_scheduler_class get_scheduler_class() const final { + auto type = 
op->get_req()->get_type(); + if (type == CEPH_MSG_OSD_OP || + type == CEPH_MSG_OSD_BACKOFF) { + return op_scheduler_class::client; + } else { + return op_scheduler_class::immediate; + } + } + + void run(OSD *osd, OSDShard *sdata, PGRef& pg, ThreadPool::TPHandle &handle) final; +}; + +class PGPeeringItem : public PGOpQueueable { + PGPeeringEventRef evt; +public: + PGPeeringItem(spg_t pg, PGPeeringEventRef e) : PGOpQueueable(pg), evt(e) {} + std::ostream &print(std::ostream &rhs) const final { + return rhs << "PGPeeringEvent(" << evt->get_desc() << ")"; + } + void run(OSD *osd, OSDShard *sdata, PGRef& pg, ThreadPool::TPHandle &handle) final; + bool is_peering() const override { + return true; + } + bool peering_requires_pg() const override { + return evt->requires_pg; + } + const PGCreateInfo *creates_pg() const override { + return evt->create_info.get(); + } + op_scheduler_class get_scheduler_class() const final { + return op_scheduler_class::immediate; + } +}; + +class PGSnapTrim : public PGOpQueueable { + epoch_t epoch_queued; +public: + PGSnapTrim( + spg_t pg, + epoch_t epoch_queued) + : PGOpQueueable(pg), epoch_queued(epoch_queued) {} + std::ostream &print(std::ostream &rhs) const final { + return rhs << "PGSnapTrim(pgid=" << get_pgid() + << " epoch_queued=" << epoch_queued + << ")"; + } + void run( + OSD *osd, OSDShard *sdata, PGRef& pg, ThreadPool::TPHandle &handle) final; + op_scheduler_class get_scheduler_class() const final { + return op_scheduler_class::background_best_effort; + } +}; + +class PGScrub : public PGOpQueueable { + epoch_t epoch_queued; +public: + PGScrub( + spg_t pg, + epoch_t epoch_queued) + : PGOpQueueable(pg), epoch_queued(epoch_queued) {} + std::ostream &print(std::ostream &rhs) const final { + return rhs << "PGScrub(pgid=" << get_pgid() + << "epoch_queued=" << epoch_queued + << ")"; + } + void run( + OSD *osd, OSDShard *sdata, PGRef& pg, ThreadPool::TPHandle &handle) final; + op_scheduler_class get_scheduler_class() const final { + return op_scheduler_class::background_best_effort; + } +}; + +class PGScrubItem : public PGOpQueueable { + protected: + epoch_t epoch_queued; + Scrub::act_token_t activation_index; + std::string_view message_name; + PGScrubItem(spg_t pg, epoch_t epoch_queued, std::string_view derivative_name) + : PGOpQueueable{pg} + , epoch_queued{epoch_queued} + , activation_index{0} + , message_name{derivative_name} + {} + PGScrubItem(spg_t pg, + epoch_t epoch_queued, + Scrub::act_token_t op_index, + std::string_view derivative_name) + : PGOpQueueable{pg} + , epoch_queued{epoch_queued} + , activation_index{op_index} + , message_name{derivative_name} + {} + std::ostream& print(std::ostream& rhs) const final + { + return rhs << message_name << "(pgid=" << get_pgid() + << "epoch_queued=" << epoch_queued + << " scrub-token=" << activation_index << ")"; + } + void run(OSD* osd, + OSDShard* sdata, + PGRef& pg, + ThreadPool::TPHandle& handle) override = 0; + op_scheduler_class get_scheduler_class() const final + { + return op_scheduler_class::background_best_effort; + } +}; + +class PGScrubResched : public PGScrubItem { + public: + PGScrubResched(spg_t pg, epoch_t epoch_queued) + : PGScrubItem{pg, epoch_queued, "PGScrubResched"} + {} + void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final; +}; + +/** + * all replicas have granted our scrub resources request + */ +class PGScrubResourcesOK : public PGScrubItem { + public: + PGScrubResourcesOK(spg_t pg, epoch_t epoch_queued) + : PGScrubItem{pg, epoch_queued, "PGScrubResourcesOK"} 
+ {} + void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final; +}; + +/** + * scrub resources requests denied by replica(s) + */ +class PGScrubDenied : public PGScrubItem { + public: + PGScrubDenied(spg_t pg, epoch_t epoch_queued) + : PGScrubItem{pg, epoch_queued, "PGScrubDenied"} + {} + void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final; +}; + +/** + * called when a repair process completes, to initiate scrubbing. No local/remote + * resources are allocated. + */ +class PGScrubAfterRepair : public PGScrubItem { + public: + PGScrubAfterRepair(spg_t pg, epoch_t epoch_queued) + : PGScrubItem{pg, epoch_queued, "PGScrubAfterRepair"} + {} + void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final; +}; + +class PGScrubPushesUpdate : public PGScrubItem { + public: + PGScrubPushesUpdate(spg_t pg, epoch_t epoch_queued) + : PGScrubItem{pg, epoch_queued, "PGScrubPushesUpdate"} + {} + void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final; +}; + +class PGScrubAppliedUpdate : public PGScrubItem { + public: + PGScrubAppliedUpdate(spg_t pg, epoch_t epoch_queued) + : PGScrubItem{pg, epoch_queued, "PGScrubAppliedUpdate"} + {} + void run(OSD* osd, + OSDShard* sdata, + PGRef& pg, + [[maybe_unused]] ThreadPool::TPHandle& handle) final; +}; + +class PGScrubUnblocked : public PGScrubItem { + public: + PGScrubUnblocked(spg_t pg, epoch_t epoch_queued) + : PGScrubItem{pg, epoch_queued, "PGScrubUnblocked"} + {} + void run(OSD* osd, + OSDShard* sdata, + PGRef& pg, + [[maybe_unused]] ThreadPool::TPHandle& handle) final; +}; + +class PGScrubDigestUpdate : public PGScrubItem { + public: + PGScrubDigestUpdate(spg_t pg, epoch_t epoch_queued) + : PGScrubItem{pg, epoch_queued, "PGScrubDigestUpdate"} + {} + void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final; +}; + +class PGScrubGotLocalMap : public PGScrubItem { + public: + PGScrubGotLocalMap(spg_t pg, epoch_t epoch_queued) + : PGScrubItem{pg, epoch_queued, "PGScrubGotLocalMap"} + {} + void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final; +}; + +class PGScrubGotReplMaps : public PGScrubItem { + public: + PGScrubGotReplMaps(spg_t pg, epoch_t epoch_queued) + : PGScrubItem{pg, epoch_queued, "PGScrubGotReplMaps"} + {} + void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final; +}; + +class PGRepScrub : public PGScrubItem { + public: + PGRepScrub(spg_t pg, epoch_t epoch_queued, Scrub::act_token_t op_token) + : PGScrubItem{pg, epoch_queued, op_token, "PGRepScrub"} + {} + void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final; +}; + +class PGRepScrubResched : public PGScrubItem { + public: + PGRepScrubResched(spg_t pg, epoch_t epoch_queued, Scrub::act_token_t op_token) + : PGScrubItem{pg, epoch_queued, op_token, "PGRepScrubResched"} + {} + void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final; +}; + +class PGScrubReplicaPushes : public PGScrubItem { + public: + PGScrubReplicaPushes(spg_t pg, epoch_t epoch_queued) + : PGScrubItem{pg, epoch_queued, "PGScrubReplicaPushes"} + {} + void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final; +}; + +class PGScrubScrubFinished : public PGScrubItem { + public: + PGScrubScrubFinished(spg_t pg, epoch_t epoch_queued) + : PGScrubItem{pg, epoch_queued, "PGScrubScrubFinished"} + {} + void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final; 
+}; + +class PGScrubGetNextChunk : public PGScrubItem { + public: + PGScrubGetNextChunk(spg_t pg, epoch_t epoch_queued) + : PGScrubItem{pg, epoch_queued, "PGScrubGetNextChunk"} + {} + void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final; +}; + +class PGScrubChunkIsBusy : public PGScrubItem { + public: + PGScrubChunkIsBusy(spg_t pg, epoch_t epoch_queued) + : PGScrubItem{pg, epoch_queued, "PGScrubChunkIsBusy"} + {} + void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final; +}; + +class PGScrubChunkIsFree : public PGScrubItem { + public: + PGScrubChunkIsFree(spg_t pg, epoch_t epoch_queued) + : PGScrubItem{pg, epoch_queued, "PGScrubChunkIsFree"} + {} + void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final; +}; + +class PGRecovery : public PGOpQueueable { + utime_t time_queued; + epoch_t epoch_queued; + uint64_t reserved_pushes; + int priority; +public: + PGRecovery( + spg_t pg, + epoch_t epoch_queued, + uint64_t reserved_pushes, + int priority) + : PGOpQueueable(pg), + time_queued(ceph_clock_now()), + epoch_queued(epoch_queued), + reserved_pushes(reserved_pushes), + priority(priority) {} + std::ostream &print(std::ostream &rhs) const final { + return rhs << "PGRecovery(pgid=" << get_pgid() + << " epoch_queued=" << epoch_queued + << " reserved_pushes=" << reserved_pushes + << ")"; + } + uint64_t get_reserved_pushes() const final { + return reserved_pushes; + } + void run( + OSD *osd, OSDShard *sdata, PGRef& pg, ThreadPool::TPHandle &handle) final; + op_scheduler_class get_scheduler_class() const final { + return priority_to_scheduler_class(priority); + } +}; + +class PGRecoveryContext : public PGOpQueueable { + utime_t time_queued; + std::unique_ptr<GenContext<ThreadPool::TPHandle&>> c; + epoch_t epoch; + int priority; +public: + PGRecoveryContext(spg_t pgid, + GenContext<ThreadPool::TPHandle&> *c, epoch_t epoch, + int priority) + : PGOpQueueable(pgid), + time_queued(ceph_clock_now()), + c(c), epoch(epoch), priority(priority) {} + std::ostream &print(std::ostream &rhs) const final { + return rhs << "PGRecoveryContext(pgid=" << get_pgid() + << " c=" << c.get() << " epoch=" << epoch + << ")"; + } + void run( + OSD *osd, OSDShard *sdata, PGRef& pg, ThreadPool::TPHandle &handle) final; + op_scheduler_class get_scheduler_class() const final { + return priority_to_scheduler_class(priority); + } +}; + +class PGDelete : public PGOpQueueable { + epoch_t epoch_queued; +public: + PGDelete( + spg_t pg, + epoch_t epoch_queued) + : PGOpQueueable(pg), + epoch_queued(epoch_queued) {} + std::ostream &print(std::ostream &rhs) const final { + return rhs << "PGDelete(" << get_pgid() + << " e" << epoch_queued + << ")"; + } + void run( + OSD *osd, OSDShard *sdata, PGRef& pg, ThreadPool::TPHandle &handle) final; + op_scheduler_class get_scheduler_class() const final { + return op_scheduler_class::background_best_effort; + } +}; + +class PGRecoveryMsg : public PGOpQueueable { + utime_t time_queued; + OpRequestRef op; + +public: + PGRecoveryMsg(spg_t pg, OpRequestRef op) + : PGOpQueueable(pg), time_queued(ceph_clock_now()), op(std::move(op)) {} + + static bool is_recovery_msg(OpRequestRef &op) { + switch (op->get_req()->get_type()) { + case MSG_OSD_PG_PUSH: + case MSG_OSD_PG_PUSH_REPLY: + case MSG_OSD_PG_PULL: + case MSG_OSD_PG_BACKFILL: + case MSG_OSD_PG_BACKFILL_REMOVE: + case MSG_OSD_PG_SCAN: + return true; + default: + return false; + } + } + + std::ostream &print(std::ostream &rhs) const final { + return rhs << 
"PGRecoveryMsg(op=" << *(op->get_req()) << ")"; + } + + std::optional<OpRequestRef> maybe_get_op() const final { + return op; + } + + op_scheduler_class get_scheduler_class() const final { + return priority_to_scheduler_class(op->get_req()->get_priority()); + } + + void run(OSD *osd, OSDShard *sdata, PGRef& pg, ThreadPool::TPHandle &handle) final; +}; + +} diff --git a/src/osd/scheduler/mClockScheduler.cc b/src/osd/scheduler/mClockScheduler.cc new file mode 100644 index 000000000..0ea519655 --- /dev/null +++ b/src/osd/scheduler/mClockScheduler.cc @@ -0,0 +1,597 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 Red Hat Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#include <memory> +#include <functional> + +#include "osd/scheduler/mClockScheduler.h" +#include "common/dout.h" + +namespace dmc = crimson::dmclock; +using namespace std::placeholders; + +#define dout_context cct +#define dout_subsys ceph_subsys_mclock +#undef dout_prefix +#define dout_prefix *_dout << "mClockScheduler: " + + +namespace ceph::osd::scheduler { + +mClockScheduler::mClockScheduler(CephContext *cct, + int whoami, + uint32_t num_shards, + int shard_id, + bool is_rotational, + MonClient *monc) + : cct(cct), + whoami(whoami), + num_shards(num_shards), + shard_id(shard_id), + is_rotational(is_rotational), + monc(monc), + scheduler( + std::bind(&mClockScheduler::ClientRegistry::get_info, + &client_registry, + _1), + dmc::AtLimit::Wait, + cct->_conf.get_val<double>("osd_mclock_scheduler_anticipation_timeout")) +{ + cct->_conf.add_observer(this); + ceph_assert(num_shards > 0); + set_osd_capacity_params_from_config(); + set_config_defaults_from_profile(); + client_registry.update_from_config( + cct->_conf, osd_bandwidth_capacity_per_shard); +} + +/* ClientRegistry holds the dmclock::ClientInfo configuration parameters + * (reservation (bytes/second), weight (unitless), limit (bytes/second)) + * for each IO class in the OSD (client, background_recovery, + * background_best_effort). + * + * mclock expects limit and reservation to have units of <cost>/second + * (bytes/second), but osd_mclock_scheduler_client_(lim|res) are provided + * as ratios of the OSD's capacity. We convert from the one to the other + * using the capacity_per_shard parameter. + * + * Note, mclock profile information will already have been set as a default + * for the osd_mclock_scheduler_client_* parameters prior to calling + * update_from_config -- see set_config_defaults_from_profile(). 
+ */ +void mClockScheduler::ClientRegistry::update_from_config( + const ConfigProxy &conf, + const double capacity_per_shard) +{ + + auto get_res = [&](double res) { + if (res) { + return res * capacity_per_shard; + } else { + return default_min; // min reservation + } + }; + + auto get_lim = [&](double lim) { + if (lim) { + return lim * capacity_per_shard; + } else { + return default_max; // high limit + } + }; + + // Set external client infos + double res = conf.get_val<double>( + "osd_mclock_scheduler_client_res"); + double lim = conf.get_val<double>( + "osd_mclock_scheduler_client_lim"); + uint64_t wgt = conf.get_val<uint64_t>( + "osd_mclock_scheduler_client_wgt"); + default_external_client_info.update( + get_res(res), + wgt, + get_lim(lim)); + + // Set background recovery client infos + res = conf.get_val<double>( + "osd_mclock_scheduler_background_recovery_res"); + lim = conf.get_val<double>( + "osd_mclock_scheduler_background_recovery_lim"); + wgt = conf.get_val<uint64_t>( + "osd_mclock_scheduler_background_recovery_wgt"); + internal_client_infos[ + static_cast<size_t>(op_scheduler_class::background_recovery)].update( + get_res(res), + wgt, + get_lim(lim)); + + // Set background best effort client infos + res = conf.get_val<double>( + "osd_mclock_scheduler_background_best_effort_res"); + lim = conf.get_val<double>( + "osd_mclock_scheduler_background_best_effort_lim"); + wgt = conf.get_val<uint64_t>( + "osd_mclock_scheduler_background_best_effort_wgt"); + internal_client_infos[ + static_cast<size_t>(op_scheduler_class::background_best_effort)].update( + get_res(res), + wgt, + get_lim(lim)); +} + +const dmc::ClientInfo *mClockScheduler::ClientRegistry::get_external_client( + const client_profile_id_t &client) const +{ + auto ret = external_client_infos.find(client); + if (ret == external_client_infos.end()) + return &default_external_client_info; + else + return &(ret->second); +} + +const dmc::ClientInfo *mClockScheduler::ClientRegistry::get_info( + const scheduler_id_t &id) const { + switch (id.class_id) { + case op_scheduler_class::immediate: + ceph_assert(0 == "Cannot schedule immediate"); + return (dmc::ClientInfo*)nullptr; + case op_scheduler_class::client: + return get_external_client(id.client_profile_id); + default: + ceph_assert(static_cast<size_t>(id.class_id) < internal_client_infos.size()); + return &internal_client_infos[static_cast<size_t>(id.class_id)]; + } +} + +void mClockScheduler::set_osd_capacity_params_from_config() +{ + uint64_t osd_bandwidth_capacity; + double osd_iop_capacity; + + std::tie(osd_bandwidth_capacity, osd_iop_capacity) = [&, this] { + if (is_rotational) { + return std::make_tuple( + cct->_conf.get_val<Option::size_t>( + "osd_mclock_max_sequential_bandwidth_hdd"), + cct->_conf.get_val<double>("osd_mclock_max_capacity_iops_hdd")); + } else { + return std::make_tuple( + cct->_conf.get_val<Option::size_t>( + "osd_mclock_max_sequential_bandwidth_ssd"), + cct->_conf.get_val<double>("osd_mclock_max_capacity_iops_ssd")); + } + }(); + + osd_bandwidth_capacity = std::max<uint64_t>(1, osd_bandwidth_capacity); + osd_iop_capacity = std::max<double>(1.0, osd_iop_capacity); + + osd_bandwidth_cost_per_io = + static_cast<double>(osd_bandwidth_capacity) / osd_iop_capacity; + osd_bandwidth_capacity_per_shard = static_cast<double>(osd_bandwidth_capacity) + / static_cast<double>(num_shards); + + dout(1) << __func__ << ": osd_bandwidth_cost_per_io: " + << std::fixed << std::setprecision(2) + << osd_bandwidth_cost_per_io << " bytes/io" + << ", 
osd_bandwidth_capacity_per_shard " + << osd_bandwidth_capacity_per_shard << " bytes/second" + << dendl; +} + +/** + * profile_t + * + * mclock profile -- 3 params for each of 3 client classes + * 0 (min): specifies no minimum reservation + * 0 (max): specifies no upper limit + */ +struct profile_t { + struct client_config_t { + double reservation; + uint64_t weight; + double limit; + }; + client_config_t client; + client_config_t background_recovery; + client_config_t background_best_effort; +}; + +static std::ostream &operator<<( + std::ostream &lhs, const profile_t::client_config_t &rhs) +{ + return lhs << "{res: " << rhs.reservation + << ", wgt: " << rhs.weight + << ", lim: " << rhs.limit + << "}"; +} + +static std::ostream &operator<<(std::ostream &lhs, const profile_t &rhs) +{ + return lhs << "[client: " << rhs.client + << ", background_recovery: " << rhs.background_recovery + << ", background_best_effort: " << rhs.background_best_effort + << "]"; +} + +void mClockScheduler::set_config_defaults_from_profile() +{ + // Let only a single osd shard (id:0) set the profile configs + if (shard_id > 0) { + return; + } + + /** + * high_client_ops + * + * Client Allocation: + * reservation: 60% | weight: 2 | limit: 0 (max) | + * Background Recovery Allocation: + * reservation: 40% | weight: 1 | limit: 0 (max) | + * Background Best Effort Allocation: + * reservation: 0 (min) | weight: 1 | limit: 70% | + */ + static constexpr profile_t high_client_ops_profile{ + { .6, 2, 0 }, + { .4, 1, 0 }, + { 0, 1, .7 } + }; + + /** + * high_recovery_ops + * + * Client Allocation: + * reservation: 30% | weight: 1 | limit: 0 (max) | + * Background Recovery Allocation: + * reservation: 70% | weight: 2 | limit: 0 (max) | + * Background Best Effort Allocation: + * reservation: 0 (min) | weight: 1 | limit: 0 (max) | + */ + static constexpr profile_t high_recovery_ops_profile{ + { .3, 1, 0 }, + { .7, 2, 0 }, + { 0, 1, 0 } + }; + + /** + * balanced + * + * Client Allocation: + * reservation: 50% | weight: 1 | limit: 0 (max) | + * Background Recovery Allocation: + * reservation: 50% | weight: 1 | limit: 0 (max) | + * Background Best Effort Allocation: + * reservation: 0 (min) | weight: 1 | limit: 90% | + */ + static constexpr profile_t balanced_profile{ + { .5, 1, 0 }, + { .5, 1, 0 }, + { 0, 1, .9 } + }; + + const profile_t *profile = nullptr; + auto mclock_profile = cct->_conf.get_val<std::string>("osd_mclock_profile"); + if (mclock_profile == "high_client_ops") { + profile = &high_client_ops_profile; + dout(10) << "Setting high_client_ops profile " << *profile << dendl; + } else if (mclock_profile == "high_recovery_ops") { + profile = &high_recovery_ops_profile; + dout(10) << "Setting high_recovery_ops profile " << *profile << dendl; + } else if (mclock_profile == "balanced") { + profile = &balanced_profile; + dout(10) << "Setting balanced profile " << *profile << dendl; + } else if (mclock_profile == "custom") { + dout(10) << "Profile set to custom, not setting defaults" << dendl; + return; + } else { + derr << "Invalid mclock profile: " << mclock_profile << dendl; + ceph_assert("Invalid choice of mclock profile" == 0); + return; + } + ceph_assert(nullptr != profile); + + auto set_config = [&conf = cct->_conf](const char *key, auto val) { + conf.set_val_default(key, std::to_string(val)); + }; + + set_config("osd_mclock_scheduler_client_res", profile->client.reservation); + set_config("osd_mclock_scheduler_client_wgt", profile->client.weight); + set_config("osd_mclock_scheduler_client_lim", profile->client.limit); + 
+ set_config( + "osd_mclock_scheduler_background_recovery_res", + profile->background_recovery.reservation); + set_config( + "osd_mclock_scheduler_background_recovery_wgt", + profile->background_recovery.weight); + set_config( + "osd_mclock_scheduler_background_recovery_lim", + profile->background_recovery.limit); + + set_config( + "osd_mclock_scheduler_background_best_effort_res", + profile->background_best_effort.reservation); + set_config( + "osd_mclock_scheduler_background_best_effort_wgt", + profile->background_best_effort.weight); + set_config( + "osd_mclock_scheduler_background_best_effort_lim", + profile->background_best_effort.limit); + + cct->_conf.apply_changes(nullptr); +} + +uint32_t mClockScheduler::calc_scaled_cost(int item_cost) +{ + auto cost = static_cast<uint32_t>( + std::max<int>( + 1, // ensure cost is non-zero and positive + item_cost)); + auto cost_per_io = static_cast<uint32_t>(osd_bandwidth_cost_per_io); + + return std::max<uint32_t>(cost, cost_per_io); +} + +void mClockScheduler::update_configuration() +{ + // Apply configuration change. The expectation is that + // at least one of the tracked mclock config option keys + // is modified before calling this method. + cct->_conf.apply_changes(nullptr); +} + +void mClockScheduler::dump(ceph::Formatter &f) const +{ + // Display queue sizes + f.open_object_section("queue_sizes"); + f.dump_int("high_priority_queue", high_priority.size()); + f.dump_int("scheduler", scheduler.request_count()); + f.close_section(); + + // client map and queue tops (res, wgt, lim) + std::ostringstream out; + f.open_object_section("mClockClients"); + f.dump_int("client_count", scheduler.client_count()); + out << scheduler; + f.dump_string("clients", out.str()); + f.close_section(); + + // Display sorted queues (res, wgt, lim) + f.open_object_section("mClockQueues"); + f.dump_string("queues", display_queues()); + f.close_section(); + + f.open_object_section("HighPriorityQueue"); + for (auto it = high_priority.begin(); + it != high_priority.end(); it++) { + f.dump_int("priority", it->first); + f.dump_int("queue_size", it->second.size()); + } + f.close_section(); +} + +void mClockScheduler::enqueue(OpSchedulerItem&& item) +{ + auto id = get_scheduler_id(item); + unsigned priority = item.get_priority(); + + // TODO: move this check into OpSchedulerItem, handle backwards compat + if (op_scheduler_class::immediate == id.class_id) { + enqueue_high(immediate_class_priority, std::move(item)); + } else if (priority >= cutoff_priority) { + enqueue_high(priority, std::move(item)); + } else { + auto cost = calc_scaled_cost(item.get_cost()); + item.set_qos_cost(cost); + dout(20) << __func__ << " " << id + << " item_cost: " << item.get_cost() + << " scaled_cost: " << cost + << dendl; + + // Add item to scheduler queue + scheduler.add_request( + std::move(item), + id, + cost); + } + + dout(20) << __func__ << " client_count: " << scheduler.client_count() + << " queue_sizes: [ " + << " high_priority_queue: " << high_priority.size() + << " sched: " << scheduler.request_count() << " ]" + << dendl; + dout(30) << __func__ << " mClockClients: " + << scheduler + << dendl; + dout(30) << __func__ << " mClockQueues: { " + << display_queues() << " }" + << dendl; +} + +void mClockScheduler::enqueue_front(OpSchedulerItem&& item) +{ + unsigned priority = item.get_priority(); + auto id = get_scheduler_id(item); + + if (op_scheduler_class::immediate == id.class_id) { + enqueue_high(immediate_class_priority, std::move(item), true); + } else if (priority >= cutoff_priority) { 
+ enqueue_high(priority, std::move(item), true); + } else { + // mClock does not support enqueue at front, so we use + // the high queue with priority 0 + enqueue_high(0, std::move(item), true); + } +} + +void mClockScheduler::enqueue_high(unsigned priority, + OpSchedulerItem&& item, + bool front) +{ + if (front) { + high_priority[priority].push_back(std::move(item)); + } else { + high_priority[priority].push_front(std::move(item)); + } +} + +WorkItem mClockScheduler::dequeue() +{ + if (!high_priority.empty()) { + auto iter = high_priority.begin(); + // invariant: high_priority entries are never empty + assert(!iter->second.empty()); + WorkItem ret{std::move(iter->second.back())}; + iter->second.pop_back(); + if (iter->second.empty()) { + // maintain invariant, high priority entries are never empty + high_priority.erase(iter); + } + ceph_assert(std::get_if<OpSchedulerItem>(&ret)); + return ret; + } else { + mclock_queue_t::PullReq result = scheduler.pull_request(); + if (result.is_future()) { + return result.getTime(); + } else if (result.is_none()) { + ceph_assert( + 0 == "Impossible, must have checked empty() first"); + return {}; + } else { + ceph_assert(result.is_retn()); + + auto &retn = result.get_retn(); + return std::move(*retn.request); + } + } +} + +std::string mClockScheduler::display_queues() const +{ + std::ostringstream out; + scheduler.display_queues(out); + return out.str(); +} + +const char** mClockScheduler::get_tracked_conf_keys() const +{ + static const char* KEYS[] = { + "osd_mclock_scheduler_client_res", + "osd_mclock_scheduler_client_wgt", + "osd_mclock_scheduler_client_lim", + "osd_mclock_scheduler_background_recovery_res", + "osd_mclock_scheduler_background_recovery_wgt", + "osd_mclock_scheduler_background_recovery_lim", + "osd_mclock_scheduler_background_best_effort_res", + "osd_mclock_scheduler_background_best_effort_wgt", + "osd_mclock_scheduler_background_best_effort_lim", + "osd_mclock_max_capacity_iops_hdd", + "osd_mclock_max_capacity_iops_ssd", + "osd_mclock_max_sequential_bandwidth_hdd", + "osd_mclock_max_sequential_bandwidth_ssd", + "osd_mclock_profile", + NULL + }; + return KEYS; +} + +void mClockScheduler::handle_conf_change( + const ConfigProxy& conf, + const std::set<std::string> &changed) +{ + if (changed.count("osd_mclock_max_capacity_iops_hdd") || + changed.count("osd_mclock_max_capacity_iops_ssd")) { + set_osd_capacity_params_from_config(); + client_registry.update_from_config( + conf, osd_bandwidth_capacity_per_shard); + } + if (changed.count("osd_mclock_max_sequential_bandwidth_hdd") || + changed.count("osd_mclock_max_sequential_bandwidth_ssd")) { + set_osd_capacity_params_from_config(); + client_registry.update_from_config( + conf, osd_bandwidth_capacity_per_shard); + } + if (changed.count("osd_mclock_profile")) { + set_config_defaults_from_profile(); + client_registry.update_from_config( + conf, osd_bandwidth_capacity_per_shard); + } + + auto get_changed_key = [&changed]() -> std::optional<std::string> { + static const std::vector<std::string> qos_params = { + "osd_mclock_scheduler_client_res", + "osd_mclock_scheduler_client_wgt", + "osd_mclock_scheduler_client_lim", + "osd_mclock_scheduler_background_recovery_res", + "osd_mclock_scheduler_background_recovery_wgt", + "osd_mclock_scheduler_background_recovery_lim", + "osd_mclock_scheduler_background_best_effort_res", + "osd_mclock_scheduler_background_best_effort_wgt", + "osd_mclock_scheduler_background_best_effort_lim" + }; + + for (auto &qp : qos_params) { + if (changed.count(qp)) { + return 
qp; + } + } + return std::nullopt; + }; + + if (auto key = get_changed_key(); key.has_value()) { + auto mclock_profile = cct->_conf.get_val<std::string>("osd_mclock_profile"); + if (mclock_profile == "custom") { + client_registry.update_from_config( + conf, osd_bandwidth_capacity_per_shard); + } else { + // Attempt to change QoS parameter for a built-in profile. Restore the + // profile defaults by making one of the OSD shards remove the key from + // config monitor store. Note: monc is included in the check since the + // mock unit test currently doesn't initialize it. + if (shard_id == 0 && monc) { + static const std::vector<std::string> osds = { + "osd", + "osd." + std::to_string(whoami) + }; + + for (auto osd : osds) { + std::string cmd = + "{" + "\"prefix\": \"config rm\", " + "\"who\": \"" + osd + "\", " + "\"name\": \"" + *key + "\"" + "}"; + std::vector<std::string> vcmd{cmd}; + + dout(10) << __func__ << " Removing Key: " << *key + << " for " << osd << " from Mon db" << dendl; + monc->start_mon_command(vcmd, {}, nullptr, nullptr, nullptr); + } + } + } + // Alternatively, the QoS parameter, if set ephemerally for this OSD via + // the 'daemon' or 'tell' interfaces must be removed. + if (!cct->_conf.rm_val(*key)) { + dout(10) << __func__ << " Restored " << *key << " to default" << dendl; + cct->_conf.apply_changes(nullptr); + } + } +} + +mClockScheduler::~mClockScheduler() +{ + cct->_conf.remove_observer(this); +} + +} diff --git a/src/osd/scheduler/mClockScheduler.h b/src/osd/scheduler/mClockScheduler.h new file mode 100644 index 000000000..f708b1d7a --- /dev/null +++ b/src/osd/scheduler/mClockScheduler.h @@ -0,0 +1,277 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 Red Hat Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#pragma once + +#include <functional> +#include <ostream> +#include <map> +#include <vector> + +#include "boost/variant.hpp" + +#include "dmclock/src/dmclock_server.h" + +#include "osd/scheduler/OpScheduler.h" +#include "common/config.h" +#include "common/ceph_context.h" +#include "common/mClockPriorityQueue.h" +#include "osd/scheduler/OpSchedulerItem.h" + + +namespace ceph::osd::scheduler { + +constexpr double default_min = 0.0; +constexpr double default_max = std::numeric_limits<double>::is_iec559 ? + std::numeric_limits<double>::infinity() : + std::numeric_limits<double>::max(); + +/** + * client_profile_id_t + * + * client_id - global id (client.####) for client QoS + * profile_id - id generated by client's QoS profile + * + * Currently (Reef and below), both members are set to + * 0 which ensures that all external clients share the + * mClock profile allocated reservation and limit + * bandwidth. + * + * Note: Post Reef, both members will be set to non-zero + * values when the distributed feature of the mClock + * algorithm is utilized. 
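+ *
+ * Illustration (follows from get_scheduler_id() below, which always
+ * constructs a default client_profile_id_t): every external client op on
+ * a shard currently maps to the same
+ * scheduler_id_t{op_scheduler_class::client, {0, 0}}, so dmclock tracks
+ * all external clients as a single logical client per shard.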
+ */
+struct client_profile_id_t {
+  uint64_t client_id = 0;
+  uint64_t profile_id = 0;
+
+  client_profile_id_t(uint64_t _client_id, uint64_t _profile_id) :
+    client_id(_client_id),
+    profile_id(_profile_id) {}
+
+  client_profile_id_t() = default;
+
+  auto operator<=>(const client_profile_id_t&) const = default;
+  friend std::ostream& operator<<(std::ostream& out,
+                                  const client_profile_id_t& client_profile) {
+    out << " client_id: " << client_profile.client_id
+        << " profile_id: " << client_profile.profile_id;
+    return out;
+  }
+};
+
+struct scheduler_id_t {
+  op_scheduler_class class_id;
+  client_profile_id_t client_profile_id;
+
+  auto operator<=>(const scheduler_id_t&) const = default;
+  friend std::ostream& operator<<(std::ostream& out,
+                                  const scheduler_id_t& sched_id) {
+    out << "{ class_id: " << sched_id.class_id
+        << sched_id.client_profile_id;
+    return out << " }";
+  }
+};
+
+/**
+ * Scheduler implementation based on mclock.
+ *
+ * TODO: explain configs
+ */
+class mClockScheduler : public OpScheduler, md_config_obs_t {
+
+  CephContext *cct;
+  const int whoami;
+  const uint32_t num_shards;
+  const int shard_id;
+  const bool is_rotational;
+  MonClient *monc;
+
+  /**
+   * osd_bandwidth_cost_per_io
+   *
+   * mClock expects all queued items to have a uniform expression of
+   * "cost". However, IO devices generally have quite different capacity
+   * for sequential IO vs small random IO. This implementation handles this
+   * by expressing all costs as a number of sequential bytes written, adding
+   * additional cost for each random IO equal to osd_bandwidth_cost_per_io.
+   *
+   * Thus, an IO operation requiring a total of <size> bytes to be written
+   * across <iops> different locations will have a cost of
+   * <size> + (osd_bandwidth_cost_per_io * <iops>) bytes.
+   *
+   * Set in set_osd_capacity_params_from_config in the constructor and upon
+   * config change.
+   *
+   * Has units bytes/io.
+   */
+  double osd_bandwidth_cost_per_io;
+
+  /**
+   * osd_bandwidth_capacity_per_shard
+   *
+   * mClock expects reservation and limit parameters to be expressed in units
+   * of cost/second -- which means bytes/second for this implementation.
+   *
+   * Rather than expecting users to compute appropriate limit and reservation
+   * values for each class of OSDs in their cluster, we instead express
+   * reservation and limit parameters as ratios of the OSD's maximum capacity.
+   * osd_bandwidth_capacity_per_shard is that capacity divided by the number
+   * of shards.
+   *
+   * Set in set_osd_capacity_params_from_config in the constructor and upon
+   * config change.
+   *
+   * This value gets passed to ClientRegistry::update_from_config in order
+   * to resolve the full reservation and limit parameters for mclock from
+   * the configured ratios.
+   *
+   * Has units bytes/second.
+   */
+  double osd_bandwidth_capacity_per_shard;
+
+  class ClientRegistry {
+    std::array<
+      crimson::dmclock::ClientInfo,
+      static_cast<size_t>(op_scheduler_class::immediate)
+    > internal_client_infos = {
+      // Placeholder, gets replaced with configured values
+      crimson::dmclock::ClientInfo(1, 1, 1),
+      crimson::dmclock::ClientInfo(1, 1, 1)
+    };
+
+    crimson::dmclock::ClientInfo default_external_client_info = {1, 1, 1};
+    std::map<client_profile_id_t,
+             crimson::dmclock::ClientInfo> external_client_infos;
+    const crimson::dmclock::ClientInfo *get_external_client(
+      const client_profile_id_t &client) const;
+  public:
+    /**
+     * update_from_config
+     *
+     * Sets the mclock parameters (reservation, weight, and limit)
+     * for each class of IO (background_recovery, background_best_effort,
+     * and client).
+     */
+    void update_from_config(
+      const ConfigProxy &conf,
+      double capacity_per_shard);
+    const crimson::dmclock::ClientInfo *get_info(
+      const scheduler_id_t &id) const;
+  } client_registry;
+
+  using mclock_queue_t = crimson::dmclock::PullPriorityQueue<
+    scheduler_id_t,
+    OpSchedulerItem,
+    true,
+    true,
+    2>;
+  using priority_t = unsigned;
+  using SubQueue = std::map<priority_t,
+                            std::list<OpSchedulerItem>,
+                            std::greater<priority_t>>;
+  mclock_queue_t scheduler;
+  /**
+   * high_priority
+   *
+   * Holds entries to be dequeued in strict order ahead of mClock.
+   * Invariant: the mapped queues are never empty.
+   */
+  SubQueue high_priority;
+  priority_t immediate_class_priority = std::numeric_limits<priority_t>::max();
+
+  static scheduler_id_t get_scheduler_id(const OpSchedulerItem &item) {
+    return scheduler_id_t{
+      item.get_scheduler_class(),
+      client_profile_id_t()
+    };
+  }
+
+  static unsigned int get_io_prio_cut(CephContext *cct) {
+    if (cct->_conf->osd_op_queue_cut_off == "debug_random") {
+      std::random_device rd;
+      std::mt19937 random_gen(rd());
+      return (random_gen() % 2 < 1) ? CEPH_MSG_PRIO_HIGH : CEPH_MSG_PRIO_LOW;
+    } else if (cct->_conf->osd_op_queue_cut_off == "high") {
+      return CEPH_MSG_PRIO_HIGH;
+    } else {
+      // default / catch-all is 'low'
+      return CEPH_MSG_PRIO_LOW;
+    }
+  }
+
+  unsigned cutoff_priority = get_io_prio_cut(cct);
+
+  /**
+   * set_osd_capacity_params_from_config
+   *
+   * mClockScheduler uses two parameters, osd_bandwidth_cost_per_io
+   * and osd_bandwidth_capacity_per_shard, internally. These two
+   * parameters are derived from config parameters
+   * osd_mclock_max_capacity_iops_(hdd|ssd) and
+   * osd_mclock_max_sequential_bandwidth_(hdd|ssd) as well as num_shards.
+   * Invoking set_osd_capacity_params_from_config() resets those derived
+   * params based on the current config and should be invoked any time they
+   * are modified as well as in the constructor. See handle_conf_change().
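+   *
+   * For example (illustrative values, not the shipped defaults): a
+   * rotational OSD configured for 150 MiB/s of sequential bandwidth and
+   * 315 IOPS gets osd_bandwidth_cost_per_io = 157286400 / 315, roughly
+   * 499322 bytes (~0.5 MiB) per IO; with 8 shards,
+   * osd_bandwidth_capacity_per_shard is 157286400 / 8 = 19660800
+   * bytes/second.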
+   */
+  void set_osd_capacity_params_from_config();
+
+  // Set the mclock related config params based on the profile
+  void set_config_defaults_from_profile();
+
+public:
+  mClockScheduler(CephContext *cct, int whoami, uint32_t num_shards,
+                  int shard_id, bool is_rotational, MonClient *monc);
+  ~mClockScheduler() override;
+
+  /// Calculate scaled cost per item
+  uint32_t calc_scaled_cost(int cost);
+
+  // Helper method to display mclock queues
+  std::string display_queues() const;
+
+  // Enqueue op in the back of the regular queue
+  void enqueue(OpSchedulerItem &&item) final;
+
+  // Enqueue the op in the front of the high priority queue
+  void enqueue_front(OpSchedulerItem &&item) final;
+
+  // Return an op to be dispatched
+  WorkItem dequeue() final;
+
+  // Returns true if the queue is empty
+  bool empty() const final {
+    return scheduler.empty() && high_priority.empty();
+  }
+
+  // Formatted output of the queue
+  void dump(ceph::Formatter &f) const final;
+
+  void print(std::ostream &ostream) const final {
+    ostream << "mClockScheduler";
+  }
+
+  // Update data associated with the modified mclock config key(s)
+  void update_configuration() final;
+
+  const char** get_tracked_conf_keys() const final;
+  void handle_conf_change(const ConfigProxy& conf,
+                          const std::set<std::string> &changed) final;
+private:
+  // Enqueue the op to the high priority queue
+  void enqueue_high(unsigned prio, OpSchedulerItem &&item, bool front = false);
+};
+
+}
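The cost scaling performed by mClockScheduler::calc_scaled_cost() above takes the maximum of an item's own cost and the per-IO bandwidth cost, so small random ops are never under-charged relative to large sequential ones. Below is a standalone sketch of that model; the helper mirrors the logic shown in mClockScheduler.cc, and the constants are illustrative, not Ceph defaults.

#include <algorithm>
#include <cstdint>
#include <cstdio>

// Mirror of the scaling in mClockScheduler::calc_scaled_cost(): clamp the
// item cost to at least 1, then charge at least one "average IO" worth of
// sequential bytes.
static uint32_t calc_scaled_cost(int item_cost, double cost_per_io) {
  uint32_t cost = static_cast<uint32_t>(std::max(1, item_cost));
  return std::max<uint32_t>(cost, static_cast<uint32_t>(cost_per_io));
}

int main() {
  // e.g. 150 MiB/s of sequential bandwidth spread over 315 IOPS
  double cost_per_io = (150.0 * 1024 * 1024) / 315;  // ~499322 bytes/io
  // A 4 KiB op is charged the per-IO floor; a 4 MiB op is charged its size.
  std::printf("4KiB op -> %u bytes\n", calc_scaled_cost(4096, cost_per_io));
  std::printf("4MiB op -> %u bytes\n",
              calc_scaled_cost(4 * 1024 * 1024, cost_per_io));
  return 0;
}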