summaryrefslogtreecommitdiffstats
path: root/src/osd/scheduler/mClockScheduler.cc
diff options
context:
space:
mode:
Diffstat (limited to 'src/osd/scheduler/mClockScheduler.cc')
-rw-r--r--src/osd/scheduler/mClockScheduler.cc597
1 files changed, 597 insertions, 0 deletions
diff --git a/src/osd/scheduler/mClockScheduler.cc b/src/osd/scheduler/mClockScheduler.cc
new file mode 100644
index 000000000..0ea519655
--- /dev/null
+++ b/src/osd/scheduler/mClockScheduler.cc
@@ -0,0 +1,597 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 Red Hat Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#include <memory>
+#include <functional>
+
+#include "osd/scheduler/mClockScheduler.h"
+#include "common/dout.h"
+
+namespace dmc = crimson::dmclock;
+using namespace std::placeholders;
+
+#define dout_context cct
+#define dout_subsys ceph_subsys_mclock
+#undef dout_prefix
+#define dout_prefix *_dout << "mClockScheduler: "
+
+
+namespace ceph::osd::scheduler {
+
/* Construct the scheduler for one OSD op shard.
 *
 * cct           - ceph context; used for config access and logging
 * whoami        - id of the owning OSD (identifies this OSD when removing
 *                 keys from the mon config store in handle_conf_change())
 * num_shards    - total op shards on this OSD; the OSD's bandwidth capacity
 *                 is divided evenly between shards
 * shard_id      - id of this shard; only shard 0 applies profile defaults
 * is_rotational - selects the hdd vs ssd capacity config options
 * monc          - mon client; may be nullptr (e.g. in mock unit tests)
 */
mClockScheduler::mClockScheduler(CephContext *cct,
  int whoami,
  uint32_t num_shards,
  int shard_id,
  bool is_rotational,
  MonClient *monc)
  : cct(cct),
    whoami(whoami),
    num_shards(num_shards),
    shard_id(shard_id),
    is_rotational(is_rotational),
    monc(monc),
    scheduler(
      // dmclock resolves per-client QoS parameters via the registry
      std::bind(&mClockScheduler::ClientRegistry::get_info,
		&client_registry,
		_1),
      dmc::AtLimit::Wait,
      cct->_conf.get_val<double>("osd_mclock_scheduler_anticipation_timeout"))
{
  // Register for config-change notifications (see get_tracked_conf_keys())
  cct->_conf.add_observer(this);
  ceph_assert(num_shards > 0);
  // Derive capacity parameters first; the client registry depends on
  // osd_bandwidth_capacity_per_shard computed here.
  set_osd_capacity_params_from_config();
  set_config_defaults_from_profile();
  client_registry.update_from_config(
    cct->_conf, osd_bandwidth_capacity_per_shard);
}
+
+/* ClientRegistry holds the dmclock::ClientInfo configuration parameters
+ * (reservation (bytes/second), weight (unitless), limit (bytes/second))
+ * for each IO class in the OSD (client, background_recovery,
+ * background_best_effort).
+ *
+ * mclock expects limit and reservation to have units of <cost>/second
+ * (bytes/second), but osd_mclock_scheduler_client_(lim|res) are provided
+ * as ratios of the OSD's capacity. We convert from the one to the other
+ * using the capacity_per_shard parameter.
+ *
+ * Note, mclock profile information will already have been set as a default
+ * for the osd_mclock_scheduler_client_* parameters prior to calling
+ * update_from_config -- see set_config_defaults_from_profile().
+ */
+void mClockScheduler::ClientRegistry::update_from_config(
+ const ConfigProxy &conf,
+ const double capacity_per_shard)
+{
+
+ auto get_res = [&](double res) {
+ if (res) {
+ return res * capacity_per_shard;
+ } else {
+ return default_min; // min reservation
+ }
+ };
+
+ auto get_lim = [&](double lim) {
+ if (lim) {
+ return lim * capacity_per_shard;
+ } else {
+ return default_max; // high limit
+ }
+ };
+
+ // Set external client infos
+ double res = conf.get_val<double>(
+ "osd_mclock_scheduler_client_res");
+ double lim = conf.get_val<double>(
+ "osd_mclock_scheduler_client_lim");
+ uint64_t wgt = conf.get_val<uint64_t>(
+ "osd_mclock_scheduler_client_wgt");
+ default_external_client_info.update(
+ get_res(res),
+ wgt,
+ get_lim(lim));
+
+ // Set background recovery client infos
+ res = conf.get_val<double>(
+ "osd_mclock_scheduler_background_recovery_res");
+ lim = conf.get_val<double>(
+ "osd_mclock_scheduler_background_recovery_lim");
+ wgt = conf.get_val<uint64_t>(
+ "osd_mclock_scheduler_background_recovery_wgt");
+ internal_client_infos[
+ static_cast<size_t>(op_scheduler_class::background_recovery)].update(
+ get_res(res),
+ wgt,
+ get_lim(lim));
+
+ // Set background best effort client infos
+ res = conf.get_val<double>(
+ "osd_mclock_scheduler_background_best_effort_res");
+ lim = conf.get_val<double>(
+ "osd_mclock_scheduler_background_best_effort_lim");
+ wgt = conf.get_val<uint64_t>(
+ "osd_mclock_scheduler_background_best_effort_wgt");
+ internal_client_infos[
+ static_cast<size_t>(op_scheduler_class::background_best_effort)].update(
+ get_res(res),
+ wgt,
+ get_lim(lim));
+}
+
+const dmc::ClientInfo *mClockScheduler::ClientRegistry::get_external_client(
+ const client_profile_id_t &client) const
+{
+ auto ret = external_client_infos.find(client);
+ if (ret == external_client_infos.end())
+ return &default_external_client_info;
+ else
+ return &(ret->second);
+}
+
+const dmc::ClientInfo *mClockScheduler::ClientRegistry::get_info(
+ const scheduler_id_t &id) const {
+ switch (id.class_id) {
+ case op_scheduler_class::immediate:
+ ceph_assert(0 == "Cannot schedule immediate");
+ return (dmc::ClientInfo*)nullptr;
+ case op_scheduler_class::client:
+ return get_external_client(id.client_profile_id);
+ default:
+ ceph_assert(static_cast<size_t>(id.class_id) < internal_client_infos.size());
+ return &internal_client_infos[static_cast<size_t>(id.class_id)];
+ }
+}
+
+void mClockScheduler::set_osd_capacity_params_from_config()
+{
+ uint64_t osd_bandwidth_capacity;
+ double osd_iop_capacity;
+
+ std::tie(osd_bandwidth_capacity, osd_iop_capacity) = [&, this] {
+ if (is_rotational) {
+ return std::make_tuple(
+ cct->_conf.get_val<Option::size_t>(
+ "osd_mclock_max_sequential_bandwidth_hdd"),
+ cct->_conf.get_val<double>("osd_mclock_max_capacity_iops_hdd"));
+ } else {
+ return std::make_tuple(
+ cct->_conf.get_val<Option::size_t>(
+ "osd_mclock_max_sequential_bandwidth_ssd"),
+ cct->_conf.get_val<double>("osd_mclock_max_capacity_iops_ssd"));
+ }
+ }();
+
+ osd_bandwidth_capacity = std::max<uint64_t>(1, osd_bandwidth_capacity);
+ osd_iop_capacity = std::max<double>(1.0, osd_iop_capacity);
+
+ osd_bandwidth_cost_per_io =
+ static_cast<double>(osd_bandwidth_capacity) / osd_iop_capacity;
+ osd_bandwidth_capacity_per_shard = static_cast<double>(osd_bandwidth_capacity)
+ / static_cast<double>(num_shards);
+
+ dout(1) << __func__ << ": osd_bandwidth_cost_per_io: "
+ << std::fixed << std::setprecision(2)
+ << osd_bandwidth_cost_per_io << " bytes/io"
+ << ", osd_bandwidth_capacity_per_shard "
+ << osd_bandwidth_capacity_per_shard << " bytes/second"
+ << dendl;
+}
+
+/**
+ * profile_t
+ *
+ * mclock profile -- 3 params for each of 3 client classes
+ * 0 (min): specifies no minimum reservation
+ * 0 (max): specifies no upper limit
+ */
struct profile_t {
  // QoS knobs for one IO class. reservation/limit are fractions of the
  // OSD's capacity; 0 means "no minimum" / "no maximum" respectively.
  struct client_config_t {
    double reservation;   // 0 => no minimum reservation
    uint64_t weight;      // proportional share (unitless)
    double limit;         // 0 => no upper limit
  };
  client_config_t client;
  client_config_t background_recovery;
  client_config_t background_best_effort;
};

// Render one class config as "{res: R, wgt: W, lim: L}".
static std::ostream &operator<<(
  std::ostream &lhs, const profile_t::client_config_t &rhs)
{
  lhs << "{res: " << rhs.reservation;
  lhs << ", wgt: " << rhs.weight;
  lhs << ", lim: " << rhs.limit;
  return lhs << "}";
}

// Render a whole profile as "[client: ..., background_recovery: ...,
// background_best_effort: ...]".
static std::ostream &operator<<(std::ostream &lhs, const profile_t &rhs)
{
  lhs << "[client: " << rhs.client;
  lhs << ", background_recovery: " << rhs.background_recovery;
  lhs << ", background_best_effort: " << rhs.background_best_effort;
  return lhs << "]";
}
+
+void mClockScheduler::set_config_defaults_from_profile()
+{
+ // Let only a single osd shard (id:0) set the profile configs
+ if (shard_id > 0) {
+ return;
+ }
+
+ /**
+ * high_client_ops
+ *
+ * Client Allocation:
+ * reservation: 60% | weight: 2 | limit: 0 (max) |
+ * Background Recovery Allocation:
+ * reservation: 40% | weight: 1 | limit: 0 (max) |
+ * Background Best Effort Allocation:
+ * reservation: 0 (min) | weight: 1 | limit: 70% |
+ */
+ static constexpr profile_t high_client_ops_profile{
+ { .6, 2, 0 },
+ { .4, 1, 0 },
+ { 0, 1, .7 }
+ };
+
+ /**
+ * high_recovery_ops
+ *
+ * Client Allocation:
+ * reservation: 30% | weight: 1 | limit: 0 (max) |
+ * Background Recovery Allocation:
+ * reservation: 70% | weight: 2 | limit: 0 (max) |
+ * Background Best Effort Allocation:
+ * reservation: 0 (min) | weight: 1 | limit: 0 (max) |
+ */
+ static constexpr profile_t high_recovery_ops_profile{
+ { .3, 1, 0 },
+ { .7, 2, 0 },
+ { 0, 1, 0 }
+ };
+
+ /**
+ * balanced
+ *
+ * Client Allocation:
+ * reservation: 50% | weight: 1 | limit: 0 (max) |
+ * Background Recovery Allocation:
+ * reservation: 50% | weight: 1 | limit: 0 (max) |
+ * Background Best Effort Allocation:
+ * reservation: 0 (min) | weight: 1 | limit: 90% |
+ */
+ static constexpr profile_t balanced_profile{
+ { .5, 1, 0 },
+ { .5, 1, 0 },
+ { 0, 1, .9 }
+ };
+
+ const profile_t *profile = nullptr;
+ auto mclock_profile = cct->_conf.get_val<std::string>("osd_mclock_profile");
+ if (mclock_profile == "high_client_ops") {
+ profile = &high_client_ops_profile;
+ dout(10) << "Setting high_client_ops profile " << *profile << dendl;
+ } else if (mclock_profile == "high_recovery_ops") {
+ profile = &high_recovery_ops_profile;
+ dout(10) << "Setting high_recovery_ops profile " << *profile << dendl;
+ } else if (mclock_profile == "balanced") {
+ profile = &balanced_profile;
+ dout(10) << "Setting balanced profile " << *profile << dendl;
+ } else if (mclock_profile == "custom") {
+ dout(10) << "Profile set to custom, not setting defaults" << dendl;
+ return;
+ } else {
+ derr << "Invalid mclock profile: " << mclock_profile << dendl;
+ ceph_assert("Invalid choice of mclock profile" == 0);
+ return;
+ }
+ ceph_assert(nullptr != profile);
+
+ auto set_config = [&conf = cct->_conf](const char *key, auto val) {
+ conf.set_val_default(key, std::to_string(val));
+ };
+
+ set_config("osd_mclock_scheduler_client_res", profile->client.reservation);
+ set_config("osd_mclock_scheduler_client_wgt", profile->client.weight);
+ set_config("osd_mclock_scheduler_client_lim", profile->client.limit);
+
+ set_config(
+ "osd_mclock_scheduler_background_recovery_res",
+ profile->background_recovery.reservation);
+ set_config(
+ "osd_mclock_scheduler_background_recovery_wgt",
+ profile->background_recovery.weight);
+ set_config(
+ "osd_mclock_scheduler_background_recovery_lim",
+ profile->background_recovery.limit);
+
+ set_config(
+ "osd_mclock_scheduler_background_best_effort_res",
+ profile->background_best_effort.reservation);
+ set_config(
+ "osd_mclock_scheduler_background_best_effort_wgt",
+ profile->background_best_effort.weight);
+ set_config(
+ "osd_mclock_scheduler_background_best_effort_lim",
+ profile->background_best_effort.limit);
+
+ cct->_conf.apply_changes(nullptr);
+}
+
+uint32_t mClockScheduler::calc_scaled_cost(int item_cost)
+{
+ auto cost = static_cast<uint32_t>(
+ std::max<int>(
+ 1, // ensure cost is non-zero and positive
+ item_cost));
+ auto cost_per_io = static_cast<uint32_t>(osd_bandwidth_cost_per_io);
+
+ return std::max<uint32_t>(cost, cost_per_io);
+}
+
/* Re-apply pending configuration changes.
 *
 * Apply configuration change. The expectation is that
 * at least one of the tracked mclock config option keys
 * is modified before calling this method.
 */
void mClockScheduler::update_configuration()
{
  cct->_conf.apply_changes(nullptr);
}
+
+void mClockScheduler::dump(ceph::Formatter &f) const
+{
+ // Display queue sizes
+ f.open_object_section("queue_sizes");
+ f.dump_int("high_priority_queue", high_priority.size());
+ f.dump_int("scheduler", scheduler.request_count());
+ f.close_section();
+
+ // client map and queue tops (res, wgt, lim)
+ std::ostringstream out;
+ f.open_object_section("mClockClients");
+ f.dump_int("client_count", scheduler.client_count());
+ out << scheduler;
+ f.dump_string("clients", out.str());
+ f.close_section();
+
+ // Display sorted queues (res, wgt, lim)
+ f.open_object_section("mClockQueues");
+ f.dump_string("queues", display_queues());
+ f.close_section();
+
+ f.open_object_section("HighPriorityQueue");
+ for (auto it = high_priority.begin();
+ it != high_priority.end(); it++) {
+ f.dump_int("priority", it->first);
+ f.dump_int("queue_size", it->second.size());
+ }
+ f.close_section();
+}
+
+void mClockScheduler::enqueue(OpSchedulerItem&& item)
+{
+ auto id = get_scheduler_id(item);
+ unsigned priority = item.get_priority();
+
+ // TODO: move this check into OpSchedulerItem, handle backwards compat
+ if (op_scheduler_class::immediate == id.class_id) {
+ enqueue_high(immediate_class_priority, std::move(item));
+ } else if (priority >= cutoff_priority) {
+ enqueue_high(priority, std::move(item));
+ } else {
+ auto cost = calc_scaled_cost(item.get_cost());
+ item.set_qos_cost(cost);
+ dout(20) << __func__ << " " << id
+ << " item_cost: " << item.get_cost()
+ << " scaled_cost: " << cost
+ << dendl;
+
+ // Add item to scheduler queue
+ scheduler.add_request(
+ std::move(item),
+ id,
+ cost);
+ }
+
+ dout(20) << __func__ << " client_count: " << scheduler.client_count()
+ << " queue_sizes: [ "
+ << " high_priority_queue: " << high_priority.size()
+ << " sched: " << scheduler.request_count() << " ]"
+ << dendl;
+ dout(30) << __func__ << " mClockClients: "
+ << scheduler
+ << dendl;
+ dout(30) << __func__ << " mClockQueues: { "
+ << display_queues() << " }"
+ << dendl;
+}
+
+void mClockScheduler::enqueue_front(OpSchedulerItem&& item)
+{
+ unsigned priority = item.get_priority();
+ auto id = get_scheduler_id(item);
+
+ if (op_scheduler_class::immediate == id.class_id) {
+ enqueue_high(immediate_class_priority, std::move(item), true);
+ } else if (priority >= cutoff_priority) {
+ enqueue_high(priority, std::move(item), true);
+ } else {
+ // mClock does not support enqueue at front, so we use
+ // the high queue with priority 0
+ enqueue_high(0, std::move(item), true);
+ }
+}
+
/* Insert an item into the strict high priority queue.
 *
 * Note: dequeue() services each priority's deque from the *back*
 * (back()/pop_back()), so push_back() here means "dequeued next"
 * (front of the logical queue) and push_front() means "dequeued last".
 * The apparent inversion below is therefore intentional.
 */
void mClockScheduler::enqueue_high(unsigned priority,
                                   OpSchedulerItem&& item,
                                   bool front)
{
  if (front) {
    high_priority[priority].push_back(std::move(item));
  } else {
    high_priority[priority].push_front(std::move(item));
  }
}
+
/* Remove and return the next item of work.
 *
 * Strict high priority items are drained before anything is pulled from
 * the dmclock scheduler. If the dmclock queue has requests but none is
 * currently eligible, the returned WorkItem carries the time at which
 * the next request becomes schedulable instead of an OpSchedulerItem.
 * The caller is expected to have checked for emptiness first.
 */
WorkItem mClockScheduler::dequeue()
{
  if (!high_priority.empty()) {
    // Service the first map entry (which priority that is depends on
    // the map's comparator declared in the header).
    auto iter = high_priority.begin();
    // invariant: high_priority entries are never empty
    assert(!iter->second.empty());
    // Take from the back; enqueue_high() relies on this to implement
    // front-insertion via push_back().
    WorkItem ret{std::move(iter->second.back())};
    iter->second.pop_back();
    if (iter->second.empty()) {
      // maintain invariant, high priority entries are never empty
      high_priority.erase(iter);
    }
    ceph_assert(std::get_if<OpSchedulerItem>(&ret));
    return ret;
  } else {
    mclock_queue_t::PullReq result = scheduler.pull_request();
    if (result.is_future()) {
      // Nothing eligible yet; report when to retry.
      return result.getTime();
    } else if (result.is_none()) {
      ceph_assert(
	0 == "Impossible, must have checked empty() first");
      return {};
    } else {
      ceph_assert(result.is_retn());

      auto &retn = result.get_retn();
      return std::move(*retn.request);
    }
  }
}
+
+std::string mClockScheduler::display_queues() const
+{
+ std::ostringstream out;
+ scheduler.display_queues(out);
+ return out.str();
+}
+
/* Config keys this observer wants change notifications for.
 *
 * Must stay in sync with the keys consulted in handle_conf_change()
 * and ClientRegistry::update_from_config(). Returns a NULL-terminated
 * static array.
 */
const char** mClockScheduler::get_tracked_conf_keys() const
{
  static const char* KEYS[] = {
    "osd_mclock_scheduler_client_res",
    "osd_mclock_scheduler_client_wgt",
    "osd_mclock_scheduler_client_lim",
    "osd_mclock_scheduler_background_recovery_res",
    "osd_mclock_scheduler_background_recovery_wgt",
    "osd_mclock_scheduler_background_recovery_lim",
    "osd_mclock_scheduler_background_best_effort_res",
    "osd_mclock_scheduler_background_best_effort_wgt",
    "osd_mclock_scheduler_background_best_effort_lim",
    "osd_mclock_max_capacity_iops_hdd",
    "osd_mclock_max_capacity_iops_ssd",
    "osd_mclock_max_sequential_bandwidth_hdd",
    "osd_mclock_max_sequential_bandwidth_ssd",
    "osd_mclock_profile",
    NULL
  };
  return KEYS;
}
+
/* Config observer callback: react to runtime changes of the tracked
 * mclock options (see get_tracked_conf_keys()).
 *
 * Capacity changes recompute the per-shard bandwidth parameters and
 * rebuild the client registry. A profile switch re-applies profile
 * defaults first. Changing an individual QoS parameter is only honored
 * under the "custom" profile; under a built-in profile the change is
 * reverted by removing the key from the mon config store (shard 0
 * only) and from any ephemeral local override.
 */
void mClockScheduler::handle_conf_change(
  const ConfigProxy& conf,
  const std::set<std::string> &changed)
{
  // IOPS capacity changed: recompute capacity params, then refresh
  // client infos from the new per-shard capacity.
  if (changed.count("osd_mclock_max_capacity_iops_hdd") ||
      changed.count("osd_mclock_max_capacity_iops_ssd")) {
    set_osd_capacity_params_from_config();
    client_registry.update_from_config(
      conf, osd_bandwidth_capacity_per_shard);
  }
  // Sequential bandwidth changed: same recomputation as above.
  if (changed.count("osd_mclock_max_sequential_bandwidth_hdd") ||
      changed.count("osd_mclock_max_sequential_bandwidth_ssd")) {
    set_osd_capacity_params_from_config();
    client_registry.update_from_config(
      conf, osd_bandwidth_capacity_per_shard);
  }
  // Profile switched: install the new profile's defaults first.
  if (changed.count("osd_mclock_profile")) {
    set_config_defaults_from_profile();
    client_registry.update_from_config(
      conf, osd_bandwidth_capacity_per_shard);
  }

  // Returns the first changed QoS parameter key, if any.
  auto get_changed_key = [&changed]() -> std::optional<std::string> {
    static const std::vector<std::string> qos_params = {
      "osd_mclock_scheduler_client_res",
      "osd_mclock_scheduler_client_wgt",
      "osd_mclock_scheduler_client_lim",
      "osd_mclock_scheduler_background_recovery_res",
      "osd_mclock_scheduler_background_recovery_wgt",
      "osd_mclock_scheduler_background_recovery_lim",
      "osd_mclock_scheduler_background_best_effort_res",
      "osd_mclock_scheduler_background_best_effort_wgt",
      "osd_mclock_scheduler_background_best_effort_lim"
    };

    for (auto &qp : qos_params) {
      if (changed.count(qp)) {
        return qp;
      }
    }
    return std::nullopt;
  };

  if (auto key = get_changed_key(); key.has_value()) {
    auto mclock_profile = cct->_conf.get_val<std::string>("osd_mclock_profile");
    if (mclock_profile == "custom") {
      // Custom profile: honor the changed QoS parameter directly.
      client_registry.update_from_config(
        conf, osd_bandwidth_capacity_per_shard);
    } else {
      // Attempt to change QoS parameter for a built-in profile. Restore the
      // profile defaults by making one of the OSD shards remove the key from
      // config monitor store. Note: monc is included in the check since the
      // mock unit test currently doesn't initialize it.
      if (shard_id == 0 && monc) {
        static const std::vector<std::string> osds = {
          "osd",
          "osd." + std::to_string(whoami)
        };

        for (auto osd : osds) {
          // Build the mon "config rm" command as JSON for this target.
          std::string cmd =
            "{"
              "\"prefix\": \"config rm\", "
              "\"who\": \"" + osd + "\", "
              "\"name\": \"" + *key + "\""
            "}";
          std::vector<std::string> vcmd{cmd};

          dout(10) << __func__ << " Removing Key: " << *key
                   << " for " << osd << " from Mon db" << dendl;
          monc->start_mon_command(vcmd, {}, nullptr, nullptr, nullptr);
        }
      }
    }
    // Alternatively, the QoS parameter, if set ephemerally for this OSD via
    // the 'daemon' or 'tell' interfaces must be removed.
    if (!cct->_conf.rm_val(*key)) {
      dout(10) << __func__ << " Restored " << *key << " to default" << dendl;
      cct->_conf.apply_changes(nullptr);
    }
  }
}
+
/* Unregister from config-change notifications before destruction
 * (pairs with add_observer() in the constructor).
 */
mClockScheduler::~mClockScheduler()
{
  cct->_conf.remove_observer(this);
}
+
+}