Diffstat (limited to 'src/osd/scheduler/mClockScheduler.h')
-rw-r--r--  src/osd/scheduler/mClockScheduler.h  277
1 file changed, 277 insertions, 0 deletions
diff --git a/src/osd/scheduler/mClockScheduler.h b/src/osd/scheduler/mClockScheduler.h
new file mode 100644
index 000000000..f708b1d7a
--- /dev/null
+++ b/src/osd/scheduler/mClockScheduler.h
@@ -0,0 +1,277 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 Red Hat Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#pragma once
+
+#include <array>
+#include <functional>
+#include <limits>
+#include <list>
+#include <map>
+#include <ostream>
+#include <random>
+#include <vector>
+
+#include "boost/variant.hpp"
+
+#include "dmclock/src/dmclock_server.h"
+
+#include "osd/scheduler/OpScheduler.h"
+#include "common/config.h"
+#include "common/ceph_context.h"
+#include "common/mClockPriorityQueue.h"
+#include "osd/scheduler/OpSchedulerItem.h"
+
+
+namespace ceph::osd::scheduler {
+
+constexpr double default_min = 0.0;
+constexpr double default_max = std::numeric_limits<double>::is_iec559 ?
+ std::numeric_limits<double>::infinity() :
+ std::numeric_limits<double>::max();
+
+/**
+ * client_profile_id_t
+ *
+ * client_id - global id (client.####) for client QoS
+ * profile_id - id generated by client's QoS profile
+ *
+ * Currently (Reef and below), both members are set to
+ * 0, which ensures that all external clients share the
+ * reservation and limit bandwidth allocated by the
+ * mClock profile.
+ *
+ * Note: Post Reef, both members will be set to non-zero
+ * values when the distributed feature of the mClock
+ * algorithm is utilized.
+ */
+struct client_profile_id_t {
+ uint64_t client_id = 0;
+ uint64_t profile_id = 0;
+
+ client_profile_id_t(uint64_t _client_id, uint64_t _profile_id) :
+ client_id(_client_id),
+ profile_id(_profile_id) {}
+
+ client_profile_id_t() = default;
+
+ auto operator<=>(const client_profile_id_t&) const = default;
+ friend std::ostream& operator<<(std::ostream& out,
+ const client_profile_id_t& client_profile) {
+ out << " client_id: " << client_profile.client_id
+ << " profile_id: " << client_profile.profile_id;
+ return out;
+ }
+};
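+
+// For example: get_scheduler_id() further below builds ids with a
+// default-constructed client_profile_id_t, i.e. client_profile_id_t{0, 0},
+// so (as of Reef) every external client resolves to the same per-class
+// QoS bucket.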
+
+struct scheduler_id_t {
+ op_scheduler_class class_id;
+ client_profile_id_t client_profile_id;
+
+ auto operator<=>(const scheduler_id_t&) const = default;
+ friend std::ostream& operator<<(std::ostream& out,
+ const scheduler_id_t& sched_id) {
+ out << "{ class_id: " << sched_id.class_id
+ << sched_id.client_profile_id;
+ return out << " }";
+ }
+};
+
+/**
+ * Scheduler implementation based on mclock.
+ *
+ * TODO: explain configs
+ */
+class mClockScheduler : public OpScheduler, md_config_obs_t {
+
+ CephContext *cct;
+ const int whoami;
+ const uint32_t num_shards;
+ const int shard_id;
+ const bool is_rotational;
+ MonClient *monc;
+
+ /**
+ * osd_bandwidth_cost_per_io
+ *
+ * mClock expects all queued items to have a uniform expression of
+ * "cost". However, IO devices generally have quite different capacity
+ * for sequential IO vs small random IO. This implementation handles the
+ * difference by expressing all costs as a number of sequential bytes written,
+ * adding an additional cost of osd_bandwidth_cost_per_io for each random IO.
+ *
+ * Thus, an IO operation requiring a total of <size> bytes to be written
+ * across <iops> different locations will have a cost of
+ * <size> + (osd_bandwidth_cost_per_io * <iops>) bytes.
+ *
+ * Set in set_osd_capacity_params_from_config in the constructor and upon
+ * config change.
+ *
+ * Has units bytes/io.
+ */
+ double osd_bandwidth_cost_per_io;
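+
+ // Worked example of the cost formula above (the numbers are illustrative
+ // assumptions, not configured defaults): with
+ // osd_bandwidth_cost_per_io = 40000 bytes/io, an op writing a total of
+ // 65536 bytes across 4 distinct locations is charged
+ //   65536 + (40000 * 4) = 225536 bytes
+ // of cost, i.e. it is billed like a ~220 KiB sequential write.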
+
+ /**
+ * osd_bandwidth_capacity_per_shard
+ *
+ * mClock expects reservation and limit parameters to be expressed in units
+ * of cost/second -- which means bytes/second for this implementation.
+ *
+ * Rather than expecting users to compute appropriate limit and reservation
+ * values for each class of OSDs in their cluster, we instead express
+ * reservation and limit parameters as ratios of the OSD's maximum capacity.
+ * osd_bandwidth_capacity_per_shard is that capacity divided by the number
+ * of shards.
+ *
+ * Set in set_osd_capacity_params_from_config in the constructor and upon
+ * config change.
+ *
+ * This value gets passed to ClientRegistry::update_from_config in order
+ * to resolve the full reservation and limit parameters for mclock from
+ * the configured ratios.
+ *
+ * Has units bytes/second.
+ */
+ double osd_bandwidth_capacity_per_shard;
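+
+ // Illustrative example (values are assumptions, not defaults): if the OSD's
+ // maximum bandwidth is 500 MiB/s and num_shards is 5, then
+ // osd_bandwidth_capacity_per_shard = 500 MiB/s / 5 = 100 MiB/s. A class whose
+ // configured reservation ratio is 0.2 then resolves to an absolute mclock
+ // reservation of 0.2 * 100 MiB/s = 20 MiB/s for that shard.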
+
+ class ClientRegistry {
+ std::array<
+ crimson::dmclock::ClientInfo,
+ static_cast<size_t>(op_scheduler_class::immediate)
+ > internal_client_infos = {
+ // Placeholder, gets replaced with configured values
+ crimson::dmclock::ClientInfo(1, 1, 1),
+ crimson::dmclock::ClientInfo(1, 1, 1)
+ };
+
+ crimson::dmclock::ClientInfo default_external_client_info = {1, 1, 1};
+ std::map<client_profile_id_t,
+ crimson::dmclock::ClientInfo> external_client_infos;
+ const crimson::dmclock::ClientInfo *get_external_client(
+ const client_profile_id_t &client) const;
+ public:
+ /**
+ * update_from_config
+ *
+ * Sets the mclock parameters (reservation, weight, and limit)
+ * for each class of IO (background_recovery, background_best_effort,
+ * and client).
+ */
+ void update_from_config(
+ const ConfigProxy &conf,
+ double capacity_per_shard);
+ const crimson::dmclock::ClientInfo *get_info(
+ const scheduler_id_t &id) const;
+ } client_registry;
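+
+ // A minimal sketch (not the actual implementation) of the translation
+ // update_from_config performs for one IO class; the option names and the
+ // use of get_val here are assumptions for illustration only:
+ //
+ //   double res = conf.get_val<double>("osd_mclock_scheduler_client_res");
+ //   double wgt = conf.get_val<double>("osd_mclock_scheduler_client_wgt");
+ //   double lim = conf.get_val<double>("osd_mclock_scheduler_client_lim");
+ //   // dmclock ClientInfo takes (reservation, weight, limit); only the
+ //   // reservation and limit ratios are scaled by capacity_per_shard.
+ //   crimson::dmclock::ClientInfo info(
+ //     res * capacity_per_shard, wgt, lim * capacity_per_shard);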
+
+ using mclock_queue_t = crimson::dmclock::PullPriorityQueue<
+ scheduler_id_t,
+ OpSchedulerItem,
+ true,
+ true,
+ 2>;
+ using priority_t = unsigned;
+ using SubQueue = std::map<priority_t,
+ std::list<OpSchedulerItem>,
+ std::greater<priority_t>>;
+ mclock_queue_t scheduler;
+ /**
+ * high_priority
+ *
+ * Holds entries to be dequeued in strict order ahead of mClock
+ * Invariant: no priority bucket (mapped list) is ever empty
+ */
+ SubQueue high_priority;
+ priority_t immediate_class_priority = std::numeric_limits<priority_t>::max();
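+
+ // For example: if high_priority holds buckets at priorities 200 and 100,
+ // every item queued at 200 is dequeued before any item at 100, and all of
+ // high_priority drains ahead of items waiting in the mclock queue.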
+
+ static scheduler_id_t get_scheduler_id(const OpSchedulerItem &item) {
+ return scheduler_id_t{
+ item.get_scheduler_class(),
+ client_profile_id_t()
+ };
+ }
+
+ static unsigned int get_io_prio_cut(CephContext *cct) {
+ if (cct->_conf->osd_op_queue_cut_off == "debug_random") {
+ std::random_device rd;
+ std::mt19937 random_gen(rd());
+ return (random_gen() % 2 < 1) ? CEPH_MSG_PRIO_HIGH : CEPH_MSG_PRIO_LOW;
+ } else if (cct->_conf->osd_op_queue_cut_off == "high") {
+ return CEPH_MSG_PRIO_HIGH;
+ } else {
+ // default / catch-all is 'low'
+ return CEPH_MSG_PRIO_LOW;
+ }
+ }
+
+ unsigned cutoff_priority = get_io_prio_cut(cct);
+
+ /**
+ * set_osd_capacity_params_from_config
+ *
+ * mClockScheduler uses two parameters, osd_bandwidth_cost_per_io
+ * and osd_bandwidth_capacity_per_shard, internally. These two
+ * parameters are derived from config parameters
+ * osd_mclock_max_capacity_iops_(hdd|ssd) and
+ * osd_mclock_max_sequential_bandwidth_(hdd|ssd) as well as num_shards.
+ * Invoking set_osd_capacity_params_from_config() resets those derived
+ * params based on the current config and should be invoked any time they
+ * are modified as well as in the constructor. See handle_conf_change().
+ */
+ void set_osd_capacity_params_from_config();
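+
+ // Rough arithmetic sketch of the derivation described above; the concrete
+ // values (and the exact formulas) are assumptions for illustration:
+ //   osd_mclock_max_sequential_bandwidth_ssd = 1250 MiB/s
+ //   osd_mclock_max_capacity_iops_ssd        = 80000 iops
+ //   num_shards                              = 8
+ // would yield approximately
+ //   osd_bandwidth_cost_per_io        ~ (1250 MiB/s) / (80000 io/s) = 16 KiB/io
+ //   osd_bandwidth_capacity_per_shard = (1250 MiB/s) / 8           ~ 156 MiB/s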
+
+ // Set the mclock related config params based on the profile
+ void set_config_defaults_from_profile();
+
+public:
+ mClockScheduler(CephContext *cct, int whoami, uint32_t num_shards,
+ int shard_id, bool is_rotational, MonClient *monc);
+ ~mClockScheduler() override;
+
+ /// Calculate scaled cost per item
+ uint32_t calc_scaled_cost(int cost);
+
+ // Helper method to display mclock queues
+ std::string display_queues() const;
+
+ // Enqueue op at the back of the regular queue
+ void enqueue(OpSchedulerItem &&item) final;
+
+ // Enqueue the op at the front of the high priority queue
+ void enqueue_front(OpSchedulerItem &&item) final;
+
+ // Return an op to be dispatched
+ WorkItem dequeue() final;
+
+ // Returns whether the queue is empty
+ bool empty() const final {
+ return scheduler.empty() && high_priority.empty();
+ }
+
+ // Formatted output of the queue
+ void dump(ceph::Formatter &f) const final;
+
+ void print(std::ostream &ostream) const final {
+ ostream << "mClockScheduler";
+ }
+
+ // Update data associated with the modified mclock config key(s)
+ void update_configuration() final;
+
+ const char** get_tracked_conf_keys() const final;
+ void handle_conf_change(const ConfigProxy& conf,
+ const std::set<std::string> &changed) final;
+private:
+ // Enqueue the op to the high priority queue
+ void enqueue_high(unsigned prio, OpSchedulerItem &&item, bool front = false);
+};
+
+} // namespace ceph::osd::scheduler