Diffstat
-rw-r--r--   src/osd/scheduler/mClockScheduler.cc   514
1 file changed, 514 insertions, 0 deletions
diff --git a/src/osd/scheduler/mClockScheduler.cc b/src/osd/scheduler/mClockScheduler.cc
new file mode 100644
index 000000000..f2f0ffc3d
--- /dev/null
+++ b/src/osd/scheduler/mClockScheduler.cc
@@ -0,0 +1,514 @@

// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2016 Red Hat Inc.
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */


#include <memory>
#include <functional>

#include "osd/scheduler/mClockScheduler.h"
#include "common/dout.h"

namespace dmc = crimson::dmclock;
using namespace std::placeholders;

#define dout_context cct
#define dout_subsys ceph_subsys_osd
#undef dout_prefix
#define dout_prefix *_dout << "mClockScheduler: "


namespace ceph::osd::scheduler {

mClockScheduler::mClockScheduler(CephContext *cct,
  uint32_t num_shards,
  bool is_rotational)
  : cct(cct),
    num_shards(num_shards),
    is_rotational(is_rotational),
    scheduler(
      std::bind(&mClockScheduler::ClientRegistry::get_info,
                &client_registry,
                _1),
      dmc::AtLimit::Wait,
      cct->_conf.get_val<double>("osd_mclock_scheduler_anticipation_timeout"))
{
  cct->_conf.add_observer(this);
  ceph_assert(num_shards > 0);
  set_max_osd_capacity();
  set_osd_mclock_cost_per_io();
  set_osd_mclock_cost_per_byte();
  set_mclock_profile();
  enable_mclock_profile_settings();
  client_registry.update_from_config(cct->_conf);
}

void mClockScheduler::ClientRegistry::update_from_config(const ConfigProxy &conf)
{
  default_external_client_info.update(
    conf.get_val<uint64_t>("osd_mclock_scheduler_client_res"),
    conf.get_val<uint64_t>("osd_mclock_scheduler_client_wgt"),
    conf.get_val<uint64_t>("osd_mclock_scheduler_client_lim"));

  internal_client_infos[
    static_cast<size_t>(op_scheduler_class::background_recovery)].update(
      conf.get_val<uint64_t>("osd_mclock_scheduler_background_recovery_res"),
      conf.get_val<uint64_t>("osd_mclock_scheduler_background_recovery_wgt"),
      conf.get_val<uint64_t>("osd_mclock_scheduler_background_recovery_lim"));

  internal_client_infos[
    static_cast<size_t>(op_scheduler_class::background_best_effort)].update(
      conf.get_val<uint64_t>("osd_mclock_scheduler_background_best_effort_res"),
      conf.get_val<uint64_t>("osd_mclock_scheduler_background_best_effort_wgt"),
      conf.get_val<uint64_t>("osd_mclock_scheduler_background_best_effort_lim"));
}

const dmc::ClientInfo *mClockScheduler::ClientRegistry::get_external_client(
  const client_profile_id_t &client) const
{
  auto ret = external_client_infos.find(client);
  if (ret == external_client_infos.end())
    return &default_external_client_info;
  else
    return &(ret->second);
}

const dmc::ClientInfo *mClockScheduler::ClientRegistry::get_info(
  const scheduler_id_t &id) const {
  switch (id.class_id) {
  case op_scheduler_class::immediate:
    ceph_assert(0 == "Cannot schedule immediate");
    return (dmc::ClientInfo*)nullptr;
  case op_scheduler_class::client:
    return get_external_client(id.client_profile_id);
  default:
    ceph_assert(static_cast<size_t>(id.class_id) < internal_client_infos.size());
    return &internal_client_infos[static_cast<size_t>(id.class_id)];
  }
}

void mClockScheduler::set_max_osd_capacity()
{
  if (is_rotational) {
    max_osd_capacity =
      cct->_conf.get_val<double>("osd_mclock_max_capacity_iops_hdd");
  } else {
    max_osd_capacity =
      cct->_conf.get_val<double>("osd_mclock_max_capacity_iops_ssd");
  }
  // Set per op-shard iops limit
  max_osd_capacity /= num_shards;
  dout(1) << __func__ << " #op shards: " << num_shards
          << std::fixed << std::setprecision(2)
          << " max osd capacity(iops) per shard: " << max_osd_capacity
          << dendl;
}
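To make the per-shard arithmetic above concrete, here is a minimal standalone C++ sketch (not part of this patch) using hypothetical values for the whole-OSD IOPS limit and the shard count:

// Standalone sketch: per-shard capacity math, assuming an illustrative
// whole-OSD limit of 315 IOPS (hypothetical osd_mclock_max_capacity_iops_hdd)
// split across 5 op shards.
#include <cstdint>
#include <iostream>

int main() {
  double max_osd_capacity = 315.0; // hypothetical config value
  const uint32_t num_shards = 5;   // hypothetical shard count
  max_osd_capacity /= num_shards;  // each shard schedules against its own slice
  std::cout << "per-shard capacity: " << max_osd_capacity << " iops\n"; // 63
  return 0;
}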
cct->_conf.get_val<double>("osd_mclock_max_capacity_iops_hdd"); + } else { + max_osd_capacity = + cct->_conf.get_val<double>("osd_mclock_max_capacity_iops_ssd"); + } + // Set per op-shard iops limit + max_osd_capacity /= num_shards; + dout(1) << __func__ << " #op shards: " << num_shards + << std::fixed << std::setprecision(2) + << " max osd capacity(iops) per shard: " << max_osd_capacity + << dendl; +} + +void mClockScheduler::set_osd_mclock_cost_per_io() +{ + std::chrono::seconds sec(1); + if (cct->_conf.get_val<double>("osd_mclock_cost_per_io_usec")) { + osd_mclock_cost_per_io = + cct->_conf.get_val<double>("osd_mclock_cost_per_io_usec"); + } else { + if (is_rotational) { + osd_mclock_cost_per_io = + cct->_conf.get_val<double>("osd_mclock_cost_per_io_usec_hdd"); + // For HDDs, convert value to seconds + osd_mclock_cost_per_io /= std::chrono::microseconds(sec).count(); + } else { + // For SSDs, convert value to milliseconds + osd_mclock_cost_per_io = + cct->_conf.get_val<double>("osd_mclock_cost_per_io_usec_ssd"); + osd_mclock_cost_per_io /= std::chrono::milliseconds(sec).count(); + } + } + dout(1) << __func__ << " osd_mclock_cost_per_io: " + << std::fixed << std::setprecision(7) << osd_mclock_cost_per_io + << dendl; +} + +void mClockScheduler::set_osd_mclock_cost_per_byte() +{ + std::chrono::seconds sec(1); + if (cct->_conf.get_val<double>("osd_mclock_cost_per_byte_usec")) { + osd_mclock_cost_per_byte = + cct->_conf.get_val<double>("osd_mclock_cost_per_byte_usec"); + } else { + if (is_rotational) { + osd_mclock_cost_per_byte = + cct->_conf.get_val<double>("osd_mclock_cost_per_byte_usec_hdd"); + // For HDDs, convert value to seconds + osd_mclock_cost_per_byte /= std::chrono::microseconds(sec).count(); + } else { + osd_mclock_cost_per_byte = + cct->_conf.get_val<double>("osd_mclock_cost_per_byte_usec_ssd"); + // For SSDs, convert value to milliseconds + osd_mclock_cost_per_byte /= std::chrono::milliseconds(sec).count(); + } + } + dout(1) << __func__ << " osd_mclock_cost_per_byte: " + << std::fixed << std::setprecision(7) << osd_mclock_cost_per_byte + << dendl; +} + +void mClockScheduler::set_mclock_profile() +{ + mclock_profile = cct->_conf.get_val<std::string>("osd_mclock_profile"); + dout(1) << __func__ << " mclock profile: " << mclock_profile << dendl; +} + +std::string mClockScheduler::get_mclock_profile() +{ + return mclock_profile; +} + +void mClockScheduler::set_balanced_profile_allocations() +{ + // Client Allocation: + // reservation: 40% | weight: 1 | limit: 100% | + // Background Recovery Allocation: + // reservation: 40% | weight: 1 | limit: 150% | + // Background Best Effort Allocation: + // reservation: 20% | weight: 2 | limit: max | + + // Client + uint64_t client_res = static_cast<uint64_t>( + std::round(0.40 * max_osd_capacity)); + uint64_t client_lim = static_cast<uint64_t>( + std::round(max_osd_capacity)); + uint64_t client_wgt = default_min; + + // Background Recovery + uint64_t rec_res = static_cast<uint64_t>( + std::round(0.40 * max_osd_capacity)); + uint64_t rec_lim = static_cast<uint64_t>( + std::round(1.5 * max_osd_capacity)); + uint64_t rec_wgt = default_min; + + // Background Best Effort + uint64_t best_effort_res = static_cast<uint64_t>( + std::round(0.20 * max_osd_capacity)); + uint64_t best_effort_lim = default_max; + uint64_t best_effort_wgt = 2; + + // Set the allocations for the mclock clients + client_allocs[ + static_cast<size_t>(op_scheduler_class::client)].update( + client_res, + client_wgt, + client_lim); + client_allocs[ + 
void mClockScheduler::set_balanced_profile_allocations()
{
  // Client Allocation:
  //   reservation: 40% | weight: 1 | limit: 100% |
  // Background Recovery Allocation:
  //   reservation: 40% | weight: 1 | limit: 150% |
  // Background Best Effort Allocation:
  //   reservation: 20% | weight: 2 | limit: max |

  // Client
  uint64_t client_res = static_cast<uint64_t>(
    std::round(0.40 * max_osd_capacity));
  uint64_t client_lim = static_cast<uint64_t>(
    std::round(max_osd_capacity));
  uint64_t client_wgt = default_min;

  // Background Recovery
  uint64_t rec_res = static_cast<uint64_t>(
    std::round(0.40 * max_osd_capacity));
  uint64_t rec_lim = static_cast<uint64_t>(
    std::round(1.5 * max_osd_capacity));
  uint64_t rec_wgt = default_min;

  // Background Best Effort
  uint64_t best_effort_res = static_cast<uint64_t>(
    std::round(0.20 * max_osd_capacity));
  uint64_t best_effort_lim = default_max;
  uint64_t best_effort_wgt = 2;

  // Set the allocations for the mclock clients
  client_allocs[
    static_cast<size_t>(op_scheduler_class::client)].update(
      client_res,
      client_wgt,
      client_lim);
  client_allocs[
    static_cast<size_t>(op_scheduler_class::background_recovery)].update(
      rec_res,
      rec_wgt,
      rec_lim);
  client_allocs[
    static_cast<size_t>(op_scheduler_class::background_best_effort)].update(
      best_effort_res,
      best_effort_wgt,
      best_effort_lim);
}

void mClockScheduler::set_high_recovery_ops_profile_allocations()
{
  // Client Allocation:
  //   reservation: 30% | weight: 1 | limit: 80% |
  // Background Recovery Allocation:
  //   reservation: 60% | weight: 2 | limit: 200% |
  // Background Best Effort Allocation:
  //   reservation: 1 | weight: 2 | limit: max |

  // Client
  uint64_t client_res = static_cast<uint64_t>(
    std::round(0.30 * max_osd_capacity));
  uint64_t client_lim = static_cast<uint64_t>(
    std::round(0.80 * max_osd_capacity));
  uint64_t client_wgt = default_min;

  // Background Recovery
  uint64_t rec_res = static_cast<uint64_t>(
    std::round(0.60 * max_osd_capacity));
  uint64_t rec_lim = static_cast<uint64_t>(
    std::round(2.0 * max_osd_capacity));
  uint64_t rec_wgt = 2;

  // Background Best Effort
  uint64_t best_effort_res = default_min;
  uint64_t best_effort_lim = default_max;
  uint64_t best_effort_wgt = 2;

  // Set the allocations for the mclock clients
  client_allocs[
    static_cast<size_t>(op_scheduler_class::client)].update(
      client_res,
      client_wgt,
      client_lim);
  client_allocs[
    static_cast<size_t>(op_scheduler_class::background_recovery)].update(
      rec_res,
      rec_wgt,
      rec_lim);
  client_allocs[
    static_cast<size_t>(op_scheduler_class::background_best_effort)].update(
      best_effort_res,
      best_effort_wgt,
      best_effort_lim);
}

void mClockScheduler::set_high_client_ops_profile_allocations()
{
  // Client Allocation:
  //   reservation: 50% | weight: 2 | limit: max |
  // Background Recovery Allocation:
  //   reservation: 25% | weight: 1 | limit: 100% |
  // Background Best Effort Allocation:
  //   reservation: 25% | weight: 2 | limit: max |

  // Client
  uint64_t client_res = static_cast<uint64_t>(
    std::round(0.50 * max_osd_capacity));
  uint64_t client_wgt = 2;
  uint64_t client_lim = default_max;

  // Background Recovery
  uint64_t rec_res = static_cast<uint64_t>(
    std::round(0.25 * max_osd_capacity));
  uint64_t rec_lim = static_cast<uint64_t>(
    std::round(max_osd_capacity));
  uint64_t rec_wgt = default_min;

  // Background Best Effort
  uint64_t best_effort_res = static_cast<uint64_t>(
    std::round(0.25 * max_osd_capacity));
  uint64_t best_effort_lim = default_max;
  uint64_t best_effort_wgt = 2;

  // Set the allocations for the mclock clients
  client_allocs[
    static_cast<size_t>(op_scheduler_class::client)].update(
      client_res,
      client_wgt,
      client_lim);
  client_allocs[
    static_cast<size_t>(op_scheduler_class::background_recovery)].update(
      rec_res,
      rec_wgt,
      rec_lim);
  client_allocs[
    static_cast<size_t>(op_scheduler_class::background_best_effort)].update(
      best_effort_res,
      best_effort_wgt,
      best_effort_lim);
}

void mClockScheduler::enable_mclock_profile_settings()
{
  // Nothing to do for "custom" profile
  if (mclock_profile == "custom") {
    return;
  }

  // Set mclock and ceph config options for the chosen profile
  if (mclock_profile == "balanced") {
    set_balanced_profile_allocations();
  } else if (mclock_profile == "high_recovery_ops") {
    set_high_recovery_ops_profile_allocations();
  } else if (mclock_profile == "high_client_ops") {
    set_high_client_ops_profile_allocations();
  } else {
    ceph_assert("Invalid choice of mclock profile" == 0);
    return;
  }

  // Set the mclock config parameters
  set_profile_config();
}
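A standalone sketch (not part of this patch) of how the "balanced" percentages above translate into integer reservations and limits, assuming a hypothetical per-shard capacity of 63 IOPS:

// Standalone sketch: the balanced profile's rounding arithmetic.
#include <cmath>
#include <cstdint>
#include <iostream>

int main() {
  const double max_osd_capacity = 63.0; // hypothetical per-shard IOPS
  auto pct = [&](double frac) {
    return static_cast<uint64_t>(std::round(frac * max_osd_capacity));
  };
  std::cout << "client:      res=" << pct(0.40) << " lim=" << pct(1.0) << "\n"   // 25, 63
            << "recovery:    res=" << pct(0.40) << " lim=" << pct(1.5) << "\n"   // 25, 95
            << "best effort: res=" << pct(0.20) << " (limit unbounded)\n";       // 13
}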
void mClockScheduler::set_profile_config()
{
  ClientAllocs client = client_allocs[
    static_cast<size_t>(op_scheduler_class::client)];
  ClientAllocs rec = client_allocs[
    static_cast<size_t>(op_scheduler_class::background_recovery)];
  ClientAllocs best_effort = client_allocs[
    static_cast<size_t>(op_scheduler_class::background_best_effort)];

  // Set external client params
  cct->_conf.set_val("osd_mclock_scheduler_client_res",
    std::to_string(client.res));
  cct->_conf.set_val("osd_mclock_scheduler_client_wgt",
    std::to_string(client.wgt));
  cct->_conf.set_val("osd_mclock_scheduler_client_lim",
    std::to_string(client.lim));

  // Set background recovery client params
  cct->_conf.set_val("osd_mclock_scheduler_background_recovery_res",
    std::to_string(rec.res));
  cct->_conf.set_val("osd_mclock_scheduler_background_recovery_wgt",
    std::to_string(rec.wgt));
  cct->_conf.set_val("osd_mclock_scheduler_background_recovery_lim",
    std::to_string(rec.lim));

  // Set background best effort client params
  cct->_conf.set_val("osd_mclock_scheduler_background_best_effort_res",
    std::to_string(best_effort.res));
  cct->_conf.set_val("osd_mclock_scheduler_background_best_effort_wgt",
    std::to_string(best_effort.wgt));
  cct->_conf.set_val("osd_mclock_scheduler_background_best_effort_lim",
    std::to_string(best_effort.lim));
}

int mClockScheduler::calc_scaled_cost(int item_cost)
{
  // Calculate total scaled cost in secs
  int scaled_cost =
    std::round(osd_mclock_cost_per_io + (osd_mclock_cost_per_byte * item_cost));
  return std::max(scaled_cost, 1);
}

void mClockScheduler::update_configuration()
{
  // Apply configuration change. The expectation is that
  // at least one of the tracked mclock config option keys
  // is modified before calling this method.
  cct->_conf.apply_changes(nullptr);
}
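calc_scaled_cost() above folds a fixed per-IO cost and a size-proportional per-byte cost into one integer tag, with a floor of 1 so every op consumes a nonzero amount of the mclock budget. A standalone sketch (not part of this patch) with hypothetical cost values:

// Standalone sketch: the cost-scaling formula with a floor of 1.
#include <algorithm>
#include <cmath>
#include <iostream>

int calc_scaled_cost(double cost_per_io, double cost_per_byte, int item_cost) {
  int scaled = std::round(cost_per_io + (cost_per_byte * item_cost));
  return std::max(scaled, 1); // never let an op cost zero
}

int main() {
  // hypothetical values: tiny costs round to 0, so the floor kicks in
  std::cout << calc_scaled_cost(0.0114, 5.2e-6, 4096) << "\n"; // 1
  // hypothetical larger costs: 2 + 0.001 * 4096 = 6.096 -> 6
  std::cout << calc_scaled_cost(2.0, 0.001, 4096) << "\n";     // 6
}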
void mClockScheduler::dump(ceph::Formatter &f) const
{
}

void mClockScheduler::enqueue(OpSchedulerItem&& item)
{
  auto id = get_scheduler_id(item);

  // TODO: move this check into OpSchedulerItem, handle backwards compat
  if (op_scheduler_class::immediate == id.class_id) {
    immediate.push_front(std::move(item));
  } else {
    int cost = calc_scaled_cost(item.get_cost());
    // Add item to scheduler queue
    scheduler.add_request(
      std::move(item),
      id,
      cost);
  }
}

void mClockScheduler::enqueue_front(OpSchedulerItem&& item)
{
  immediate.push_back(std::move(item));
  // TODO: item may not be immediate, update mclock machinery to permit
  // putting the item back in the queue
}

WorkItem mClockScheduler::dequeue()
{
  if (!immediate.empty()) {
    WorkItem work_item{std::move(immediate.back())};
    immediate.pop_back();
    return work_item;
  } else {
    mclock_queue_t::PullReq result = scheduler.pull_request();
    if (result.is_future()) {
      return result.getTime();
    } else if (result.is_none()) {
      ceph_assert(
        0 == "Impossible, must have checked empty() first");
      return {};
    } else {
      ceph_assert(result.is_retn());

      auto &retn = result.get_retn();
      return std::move(*retn.request);
    }
  }
}

const char** mClockScheduler::get_tracked_conf_keys() const
{
  static const char* KEYS[] = {
    "osd_mclock_scheduler_client_res",
    "osd_mclock_scheduler_client_wgt",
    "osd_mclock_scheduler_client_lim",
    "osd_mclock_scheduler_background_recovery_res",
    "osd_mclock_scheduler_background_recovery_wgt",
    "osd_mclock_scheduler_background_recovery_lim",
    "osd_mclock_scheduler_background_best_effort_res",
    "osd_mclock_scheduler_background_best_effort_wgt",
    "osd_mclock_scheduler_background_best_effort_lim",
    "osd_mclock_cost_per_io_usec",
    "osd_mclock_cost_per_io_usec_hdd",
    "osd_mclock_cost_per_io_usec_ssd",
    "osd_mclock_cost_per_byte_usec",
    "osd_mclock_cost_per_byte_usec_hdd",
    "osd_mclock_cost_per_byte_usec_ssd",
    "osd_mclock_max_capacity_iops_hdd",
    "osd_mclock_max_capacity_iops_ssd",
    "osd_mclock_profile",
    NULL
  };
  return KEYS;
}

void mClockScheduler::handle_conf_change(
  const ConfigProxy& conf,
  const std::set<std::string> &changed)
{
  if (changed.count("osd_mclock_cost_per_io_usec") ||
      changed.count("osd_mclock_cost_per_io_usec_hdd") ||
      changed.count("osd_mclock_cost_per_io_usec_ssd")) {
    set_osd_mclock_cost_per_io();
  }
  if (changed.count("osd_mclock_cost_per_byte_usec") ||
      changed.count("osd_mclock_cost_per_byte_usec_hdd") ||
      changed.count("osd_mclock_cost_per_byte_usec_ssd")) {
    set_osd_mclock_cost_per_byte();
  }
  if (changed.count("osd_mclock_max_capacity_iops_hdd") ||
      changed.count("osd_mclock_max_capacity_iops_ssd")) {
    set_max_osd_capacity();
    if (mclock_profile != "custom") {
      enable_mclock_profile_settings();
      client_registry.update_from_config(conf);
    }
  }
  if (changed.count("osd_mclock_profile")) {
    set_mclock_profile();
    if (mclock_profile != "custom") {
      enable_mclock_profile_settings();
      client_registry.update_from_config(conf);
    }
  }
  if (changed.count("osd_mclock_scheduler_client_res") ||
      changed.count("osd_mclock_scheduler_client_wgt") ||
      changed.count("osd_mclock_scheduler_client_lim") ||
      changed.count("osd_mclock_scheduler_background_recovery_res") ||
      changed.count("osd_mclock_scheduler_background_recovery_wgt") ||
      changed.count("osd_mclock_scheduler_background_recovery_lim") ||
      changed.count("osd_mclock_scheduler_background_best_effort_res") ||
      changed.count("osd_mclock_scheduler_background_best_effort_wgt") ||
      changed.count("osd_mclock_scheduler_background_best_effort_lim")) {
    if (mclock_profile == "custom") {
      client_registry.update_from_config(conf);
    }
  }
}
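dequeue() above distinguishes three outcomes from the dmclock queue: a ready request, a future readiness time, and an empty queue. The following standalone sketch (not part of this patch) models that tri-state result with std::variant as a stand-in for the real mclock_queue_t::PullReq type:

// Standalone sketch: a simplified model of the three-way pull result.
#include <chrono>
#include <iostream>
#include <string>
#include <variant>

using time_point = std::chrono::steady_clock::time_point;
struct None {};                                 // nothing queued at all
using PullResult = std::variant<None, time_point, std::string /*request*/>;

void handle(const PullResult &r) {
  if (std::holds_alternative<std::string>(r)) {
    std::cout << "dispatch: " << std::get<std::string>(r) << "\n";
  } else if (std::holds_alternative<time_point>(r)) {
    std::cout << "no request ready yet; retry at the returned time\n";
  } else {
    std::cout << "queue empty\n";
  }
}

int main() {
  handle(PullResult{std::string{"osd_op"}});
  handle(PullResult{std::chrono::steady_clock::now()});
  handle(PullResult{None{}});
}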
changed.count("osd_mclock_scheduler_background_best_effort_res") || + changed.count("osd_mclock_scheduler_background_best_effort_wgt") || + changed.count("osd_mclock_scheduler_background_best_effort_lim")) { + if (mclock_profile == "custom") { + client_registry.update_from_config(conf); + } + } +} + +mClockScheduler::~mClockScheduler() +{ + cct->_conf.remove_observer(this); +} + +} |