// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2016 Red Hat Inc.
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */


#pragma once

#include <functional>
#include <ostream>
#include <map>
#include <vector>

#include "boost/variant.hpp"

#include "dmclock/src/dmclock_server.h"

#include "osd/scheduler/OpScheduler.h"
#include "common/config.h"
#include "common/ceph_context.h"
#include "common/mClockPriorityQueue.h"
#include "osd/scheduler/OpSchedulerItem.h"


namespace ceph::osd::scheduler {

constexpr double default_min = 0.0;
constexpr double default_max = std::numeric_limits<double>::is_iec559 ?
  std::numeric_limits<double>::infinity() :
  std::numeric_limits<double>::max();
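// Together these defaults impose no reservation floor (0.0) and no limit
// ceiling (infinity where IEC 559 doubles are available, otherwise the
// largest finite double).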

/**
 * client_profile_id_t
 *
 * client_id - global id (client.####) for client QoS
 * profile_id - id generated by client's QoS profile
 *
 * Currently (Reef and below), both members are set to
 * 0, which ensures that all external clients share the
 * reservation and limit bandwidth allocated by the
 * mClock profile.
 *
 * Note: Post Reef, both members will be set to non-zero
 * values when the distributed feature of the mClock
 * algorithm is utilized.
 */
struct client_profile_id_t {
  uint64_t client_id = 0;
  uint64_t profile_id = 0;

  client_profile_id_t(uint64_t _client_id, uint64_t _profile_id) :
    client_id(_client_id),
    profile_id(_profile_id) {}

  client_profile_id_t() = default;

  auto operator<=>(const client_profile_id_t&) const = default;
  friend std::ostream& operator<<(std::ostream& out,
                                  const client_profile_id_t& client_profile) {
    out << " client_id: " << client_profile.client_id
        << " profile_id: " << client_profile.profile_id;
    return out;
  }
};

struct scheduler_id_t {
  op_scheduler_class class_id;
  client_profile_id_t client_profile_id;

  auto operator<=>(const scheduler_id_t&) const = default;
  friend std::ostream& operator<<(std::ostream& out,
                                  const scheduler_id_t& sched_id) {
    out << "{ class_id: " << sched_id.class_id
        << sched_id.client_profile_id;
    return out << " }";
  }
};

/**
 * Scheduler implementation based on mclock.
 *
 * TODO: explain configs
 */
class mClockScheduler : public OpScheduler, md_config_obs_t {

  CephContext *cct;
  const int whoami;
  const uint32_t num_shards;
  const int shard_id;
  const bool is_rotational;
  MonClient *monc;

  /**
   * osd_bandwidth_cost_per_io
   *
   * mClock expects all queued items to have a uniform expression of
   * "cost".  However, IO devices generally have quite different capacity
   * for sequential IO vs small random IO.  This implementation handles this
 * by expressing all costs as a number of sequential bytes written,
 * adding an additional cost equal to osd_bandwidth_cost_per_io for
 * each random IO.
   *
   * Thus, an IO operation requiring a total of <size> bytes to be written
 * across <iops> different locations will have a cost of
   * <size> + (osd_bandwidth_cost_per_io * <iops>) bytes.
   *
   * Set in set_osd_capacity_params_from_config in the constructor and upon
   * config change.
   *
   * Has units bytes/io.
   */
  double osd_bandwidth_cost_per_io;
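
  /*
   * Worked example of the cost formula above (the numbers are
   * hypothetical, chosen only for illustration): with
   * osd_bandwidth_cost_per_io = 40000 bytes/io, an op writing
   * 1 MiB (1048576 bytes) across 4 locations would have a cost of
   * 1048576 + (40000 * 4) = 1208576 bytes.
   */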

  /**
   * osd_bandwidth_capacity_per_shard
   *
   * mClock expects reservation and limit parameters to be expressed in units
   * of cost/second -- which means bytes/second for this implementation.
   *
   * Rather than expecting users to compute appropriate limit and reservation
   * values for each class of OSDs in their cluster, we instead express
   * reservation and limit parameters as ratios of the OSD's maximum capacity.
   * osd_bandwidth_capacity_per_shard is that capacity divided by the number
   * of shards.
   *
   * Set in set_osd_capacity_params_from_config in the constructor and upon
   * config change.
   *
   * This value gets passed to ClientRegistry::update_from_config in order
   * to resolve the full reservation and limit parameters for mclock from
   * the configured ratios.
   *
   * Has units bytes/second.
   */
  double osd_bandwidth_capacity_per_shard;
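
  /*
   * Worked example (hypothetical values, not defaults): with
   * osd_mclock_max_sequential_bandwidth_ssd = 1000 MiB/s and
   * num_shards = 4, osd_bandwidth_capacity_per_shard would be
   * 250 MiB/s.  A configured reservation ratio of 0.2 would then
   * resolve to an absolute reservation of 50 MiB/s for that class.
   */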

  class ClientRegistry {
    std::array<
      crimson::dmclock::ClientInfo,
      static_cast<size_t>(op_scheduler_class::immediate)
    > internal_client_infos = {
      // Placeholder, gets replaced with configured values
      crimson::dmclock::ClientInfo(1, 1, 1),
      crimson::dmclock::ClientInfo(1, 1, 1)
    };

    crimson::dmclock::ClientInfo default_external_client_info = {1, 1, 1};
    std::map<client_profile_id_t,
	     crimson::dmclock::ClientInfo> external_client_infos;
    const crimson::dmclock::ClientInfo *get_external_client(
      const client_profile_id_t &client) const;
  public:
    /**
     * update_from_config
     *
     * Sets the mclock parameters (reservation, weight, and limit)
     * for each class of IO (background_recovery, background_best_effort,
     * and client).
     */
    void update_from_config(
      const ConfigProxy &conf,
      double capacity_per_shard);
    const crimson::dmclock::ClientInfo *get_info(
      const scheduler_id_t &id) const;
  } client_registry;
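
  /*
   * Resolution sketch (inferred from the declarations above, hedged):
   * get_info() serves the background classes from internal_client_infos,
   * indexed by class_id, while ops of the client class go through
   * get_external_client(), which looks up external_client_infos by
   * client_profile_id and falls back to default_external_client_info
   * when no per-client entry exists.
   */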

  using mclock_queue_t = crimson::dmclock::PullPriorityQueue<
    scheduler_id_t,
    OpSchedulerItem,
    true,
    true,
    2>;
  using priority_t = unsigned;
  using SubQueue = std::map<priority_t,
	std::list<OpSchedulerItem>,
	std::greater<priority_t>>;
  mclock_queue_t scheduler;
  /**
   * high_priority
   *
   * Holds entries to be dequeued in strict order ahead of mClock
   * Invariant: no priority bucket (list) in the map is ever empty
   */
  SubQueue high_priority;
  priority_t immediate_class_priority = std::numeric_limits<priority_t>::max();
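
  /*
   * Dequeue-order sketch (hedged): ops of class immediate are queued
   * under immediate_class_priority, the maximum priority_t value, so
   * they drain ahead of every other high_priority entry; SubQueue's
   * std::greater comparator then yields the remaining buckets from
   * highest to lowest priority.
   */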

  static scheduler_id_t get_scheduler_id(const OpSchedulerItem &item) {
    return scheduler_id_t{
      item.get_scheduler_class(),
      client_profile_id_t()
    };
  }
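
  // Note: per the client_profile_id_t comment above, every item
  // currently maps to the default (zeroed) client_profile_id_t, so
  // mClock differentiation is by op_scheduler_class only.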

  static unsigned int get_io_prio_cut(CephContext *cct) {
    if (cct->_conf->osd_op_queue_cut_off == "debug_random") {
      std::random_device rd;
      std::mt19937 random_gen(rd());
      return (random_gen() % 2 < 1) ? CEPH_MSG_PRIO_HIGH : CEPH_MSG_PRIO_LOW;
    } else if (cct->_conf->osd_op_queue_cut_off == "high") {
      return CEPH_MSG_PRIO_HIGH;
    } else {
      // default / catch-all is 'low'
      return CEPH_MSG_PRIO_LOW;
    }
  }

  unsigned cutoff_priority = get_io_prio_cut(cct);

  /**
   * set_osd_capacity_params_from_config
   *
   * mClockScheduler uses two parameters, osd_bandwidth_cost_per_io
   * and osd_bandwidth_capacity_per_shard, internally.  These two
   * parameters are derived from config parameters
   * osd_mclock_max_capacity_iops_(hdd|ssd) and
   * osd_mclock_max_sequential_bandwidth_(hdd|ssd) as well as num_shards.
   * Invoking set_osd_capacity_params_from_config() resets those derived
   * params based on the current config and should be invoked any time they
   * are modified as well as in the constructor.  See handle_conf_change().
   */
  void set_osd_capacity_params_from_config();
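
  /*
   * A hedged sketch of the derivation (see the definition in
   * mClockScheduler.cc for the authoritative version):
   *
   *   bandwidth = osd_mclock_max_sequential_bandwidth_(hdd|ssd)
   *   iops      = osd_mclock_max_capacity_iops_(hdd|ssd)
   *
   *   osd_bandwidth_cost_per_io        = bandwidth / iops
   *   osd_bandwidth_capacity_per_shard = bandwidth / num_shards
   */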

  // Set the mclock related config params based on the profile
  void set_config_defaults_from_profile();

public:
  mClockScheduler(CephContext *cct, int whoami, uint32_t num_shards,
    int shard_id, bool is_rotational, MonClient *monc);
  ~mClockScheduler() override;

  /// Calculate scaled cost per item
  uint32_t calc_scaled_cost(int cost);
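
  /*
   * Presumed behavior (hedged; see mClockScheduler.cc): the returned
   * cost is clamped so that every op pays at least
   * osd_bandwidth_cost_per_io, i.e. roughly
   *   std::max<uint32_t>(cost, osd_bandwidth_cost_per_io)
   */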

  // Helper method to display mclock queues
  std::string display_queues() const;

  // Enqueue op in the back of the regular queue
  void enqueue(OpSchedulerItem &&item) final;

  // Enqueue the op in the front of the high priority queue
  void enqueue_front(OpSchedulerItem &&item) final;

  // Return an op to be dispatched
  WorkItem dequeue() final;

  // Returns whether the queue is empty
  bool empty() const final {
    return scheduler.empty() && high_priority.empty();
  }

  // Formatted output of the queue
  void dump(ceph::Formatter &f) const final;

  void print(std::ostream &ostream) const final {
    ostream << "mClockScheduler";
  }

  // Update data associated with the modified mclock config key(s)
  void update_configuration() final;

  const char** get_tracked_conf_keys() const final;
  void handle_conf_change(const ConfigProxy& conf,
			  const std::set<std::string> &changed) final;
private:
  // Enqueue the op to the high priority queue
  void enqueue_high(unsigned prio, OpSchedulerItem &&item, bool front = false);
};

} // namespace ceph::osd::scheduler