// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab /* * Ceph - scalable distributed file system * * Copyright (C) 2016 Red Hat Inc. * * This is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License version 2.1, as published by the Free Software * Foundation. See file COPYING. * */ #include #include #include "osd/scheduler/mClockScheduler.h" #include "common/dout.h" namespace dmc = crimson::dmclock; using namespace std::placeholders; #define dout_context cct #define dout_subsys ceph_subsys_osd #undef dout_prefix #define dout_prefix *_dout << "mClockScheduler: " namespace ceph::osd::scheduler { mClockScheduler::mClockScheduler(CephContext *cct, uint32_t num_shards, bool is_rotational) : cct(cct), num_shards(num_shards), is_rotational(is_rotational), scheduler( std::bind(&mClockScheduler::ClientRegistry::get_info, &client_registry, _1), dmc::AtLimit::Wait, cct->_conf.get_val("osd_mclock_scheduler_anticipation_timeout")) { cct->_conf.add_observer(this); ceph_assert(num_shards > 0); set_max_osd_capacity(); set_osd_mclock_cost_per_io(); set_osd_mclock_cost_per_byte(); set_mclock_profile(); enable_mclock_profile_settings(); client_registry.update_from_config(cct->_conf); } void mClockScheduler::ClientRegistry::update_from_config(const ConfigProxy &conf) { default_external_client_info.update( conf.get_val("osd_mclock_scheduler_client_res"), conf.get_val("osd_mclock_scheduler_client_wgt"), conf.get_val("osd_mclock_scheduler_client_lim")); internal_client_infos[ static_cast(op_scheduler_class::background_recovery)].update( conf.get_val("osd_mclock_scheduler_background_recovery_res"), conf.get_val("osd_mclock_scheduler_background_recovery_wgt"), conf.get_val("osd_mclock_scheduler_background_recovery_lim")); internal_client_infos[ static_cast(op_scheduler_class::background_best_effort)].update( conf.get_val("osd_mclock_scheduler_background_best_effort_res"), conf.get_val("osd_mclock_scheduler_background_best_effort_wgt"), conf.get_val("osd_mclock_scheduler_background_best_effort_lim")); } const dmc::ClientInfo *mClockScheduler::ClientRegistry::get_external_client( const client_profile_id_t &client) const { auto ret = external_client_infos.find(client); if (ret == external_client_infos.end()) return &default_external_client_info; else return &(ret->second); } const dmc::ClientInfo *mClockScheduler::ClientRegistry::get_info( const scheduler_id_t &id) const { switch (id.class_id) { case op_scheduler_class::immediate: ceph_assert(0 == "Cannot schedule immediate"); return (dmc::ClientInfo*)nullptr; case op_scheduler_class::client: return get_external_client(id.client_profile_id); default: ceph_assert(static_cast(id.class_id) < internal_client_infos.size()); return &internal_client_infos[static_cast(id.class_id)]; } } void mClockScheduler::set_max_osd_capacity() { if (is_rotational) { max_osd_capacity = cct->_conf.get_val("osd_mclock_max_capacity_iops_hdd"); } else { max_osd_capacity = cct->_conf.get_val("osd_mclock_max_capacity_iops_ssd"); } // Set per op-shard iops limit max_osd_capacity /= num_shards; dout(1) << __func__ << " #op shards: " << num_shards << std::fixed << std::setprecision(2) << " max osd capacity(iops) per shard: " << max_osd_capacity << dendl; } void mClockScheduler::set_osd_mclock_cost_per_io() { std::chrono::seconds sec(1); if (cct->_conf.get_val("osd_mclock_cost_per_io_usec")) { osd_mclock_cost_per_io = cct->_conf.get_val("osd_mclock_cost_per_io_usec"); } else { if (is_rotational) { osd_mclock_cost_per_io = cct->_conf.get_val("osd_mclock_cost_per_io_usec_hdd"); // For HDDs, convert value to seconds osd_mclock_cost_per_io /= std::chrono::microseconds(sec).count(); } else { // For SSDs, convert value to milliseconds osd_mclock_cost_per_io = cct->_conf.get_val("osd_mclock_cost_per_io_usec_ssd"); osd_mclock_cost_per_io /= std::chrono::milliseconds(sec).count(); } } dout(1) << __func__ << " osd_mclock_cost_per_io: " << std::fixed << std::setprecision(7) << osd_mclock_cost_per_io << dendl; } void mClockScheduler::set_osd_mclock_cost_per_byte() { std::chrono::seconds sec(1); if (cct->_conf.get_val("osd_mclock_cost_per_byte_usec")) { osd_mclock_cost_per_byte = cct->_conf.get_val("osd_mclock_cost_per_byte_usec"); } else { if (is_rotational) { osd_mclock_cost_per_byte = cct->_conf.get_val("osd_mclock_cost_per_byte_usec_hdd"); // For HDDs, convert value to seconds osd_mclock_cost_per_byte /= std::chrono::microseconds(sec).count(); } else { osd_mclock_cost_per_byte = cct->_conf.get_val("osd_mclock_cost_per_byte_usec_ssd"); // For SSDs, convert value to milliseconds osd_mclock_cost_per_byte /= std::chrono::milliseconds(sec).count(); } } dout(1) << __func__ << " osd_mclock_cost_per_byte: " << std::fixed << std::setprecision(7) << osd_mclock_cost_per_byte << dendl; } void mClockScheduler::set_mclock_profile() { mclock_profile = cct->_conf.get_val("osd_mclock_profile"); dout(1) << __func__ << " mclock profile: " << mclock_profile << dendl; } std::string mClockScheduler::get_mclock_profile() { return mclock_profile; } void mClockScheduler::set_balanced_profile_allocations() { // Client Allocation: // reservation: 40% | weight: 1 | limit: 100% | // Background Recovery Allocation: // reservation: 40% | weight: 1 | limit: 150% | // Background Best Effort Allocation: // reservation: 20% | weight: 2 | limit: max | // Client uint64_t client_res = static_cast( std::round(0.40 * max_osd_capacity)); uint64_t client_lim = static_cast( std::round(max_osd_capacity)); uint64_t client_wgt = default_min; // Background Recovery uint64_t rec_res = static_cast( std::round(0.40 * max_osd_capacity)); uint64_t rec_lim = static_cast( std::round(1.5 * max_osd_capacity)); uint64_t rec_wgt = default_min; // Background Best Effort uint64_t best_effort_res = static_cast( std::round(0.20 * max_osd_capacity)); uint64_t best_effort_lim = default_max; uint64_t best_effort_wgt = 2; // Set the allocations for the mclock clients client_allocs[ static_cast(op_scheduler_class::client)].update( client_res, client_wgt, client_lim); client_allocs[ static_cast(op_scheduler_class::background_recovery)].update( rec_res, rec_wgt, rec_lim); client_allocs[ static_cast(op_scheduler_class::background_best_effort)].update( best_effort_res, best_effort_wgt, best_effort_lim); } void mClockScheduler::set_high_recovery_ops_profile_allocations() { // Client Allocation: // reservation: 30% | weight: 1 | limit: 80% | // Background Recovery Allocation: // reservation: 60% | weight: 2 | limit: 200% | // Background Best Effort Allocation: // reservation: 1 | weight: 2 | limit: max | // Client uint64_t client_res = static_cast( std::round(0.30 * max_osd_capacity)); uint64_t client_lim = static_cast( std::round(0.80 * max_osd_capacity)); uint64_t client_wgt = default_min; // Background Recovery uint64_t rec_res = static_cast( std::round(0.60 * max_osd_capacity)); uint64_t rec_lim = static_cast( std::round(2.0 * max_osd_capacity)); uint64_t rec_wgt = 2; // Background Best Effort uint64_t best_effort_res = default_min; uint64_t best_effort_lim = default_max; uint64_t best_effort_wgt = 2; // Set the allocations for the mclock clients client_allocs[ static_cast(op_scheduler_class::client)].update( client_res, client_wgt, client_lim); client_allocs[ static_cast(op_scheduler_class::background_recovery)].update( rec_res, rec_wgt, rec_lim); client_allocs[ static_cast(op_scheduler_class::background_best_effort)].update( best_effort_res, best_effort_wgt, best_effort_lim); } void mClockScheduler::set_high_client_ops_profile_allocations() { // Client Allocation: // reservation: 50% | weight: 2 | limit: max | // Background Recovery Allocation: // reservation: 25% | weight: 1 | limit: 100% | // Background Best Effort Allocation: // reservation: 25% | weight: 2 | limit: max | // Client uint64_t client_res = static_cast( std::round(0.50 * max_osd_capacity)); uint64_t client_wgt = 2; uint64_t client_lim = default_max; // Background Recovery uint64_t rec_res = static_cast( std::round(0.25 * max_osd_capacity)); uint64_t rec_lim = static_cast( std::round(max_osd_capacity)); uint64_t rec_wgt = default_min; // Background Best Effort uint64_t best_effort_res = static_cast( std::round(0.25 * max_osd_capacity)); uint64_t best_effort_lim = default_max; uint64_t best_effort_wgt = 2; // Set the allocations for the mclock clients client_allocs[ static_cast(op_scheduler_class::client)].update( client_res, client_wgt, client_lim); client_allocs[ static_cast(op_scheduler_class::background_recovery)].update( rec_res, rec_wgt, rec_lim); client_allocs[ static_cast(op_scheduler_class::background_best_effort)].update( best_effort_res, best_effort_wgt, best_effort_lim); } void mClockScheduler::enable_mclock_profile_settings() { // Nothing to do for "custom" profile if (mclock_profile == "custom") { return; } // Set mclock and ceph config options for the chosen profile if (mclock_profile == "balanced") { set_balanced_profile_allocations(); } else if (mclock_profile == "high_recovery_ops") { set_high_recovery_ops_profile_allocations(); } else if (mclock_profile == "high_client_ops") { set_high_client_ops_profile_allocations(); } else { ceph_assert("Invalid choice of mclock profile" == 0); return; } // Set the mclock config parameters set_profile_config(); } void mClockScheduler::set_profile_config() { ClientAllocs client = client_allocs[ static_cast(op_scheduler_class::client)]; ClientAllocs rec = client_allocs[ static_cast(op_scheduler_class::background_recovery)]; ClientAllocs best_effort = client_allocs[ static_cast(op_scheduler_class::background_best_effort)]; // Set external client params cct->_conf.set_val("osd_mclock_scheduler_client_res", std::to_string(client.res)); cct->_conf.set_val("osd_mclock_scheduler_client_wgt", std::to_string(client.wgt)); cct->_conf.set_val("osd_mclock_scheduler_client_lim", std::to_string(client.lim)); // Set background recovery client params cct->_conf.set_val("osd_mclock_scheduler_background_recovery_res", std::to_string(rec.res)); cct->_conf.set_val("osd_mclock_scheduler_background_recovery_wgt", std::to_string(rec.wgt)); cct->_conf.set_val("osd_mclock_scheduler_background_recovery_lim", std::to_string(rec.lim)); // Set background best effort client params cct->_conf.set_val("osd_mclock_scheduler_background_best_effort_res", std::to_string(best_effort.res)); cct->_conf.set_val("osd_mclock_scheduler_background_best_effort_wgt", std::to_string(best_effort.wgt)); cct->_conf.set_val("osd_mclock_scheduler_background_best_effort_lim", std::to_string(best_effort.lim)); } int mClockScheduler::calc_scaled_cost(int item_cost) { // Calculate total scaled cost in secs int scaled_cost = std::round(osd_mclock_cost_per_io + (osd_mclock_cost_per_byte * item_cost)); return std::max(scaled_cost, 1); } void mClockScheduler::update_configuration() { // Apply configuration change. The expectation is that // at least one of the tracked mclock config option keys // is modified before calling this method. cct->_conf.apply_changes(nullptr); } void mClockScheduler::dump(ceph::Formatter &f) const { } void mClockScheduler::enqueue(OpSchedulerItem&& item) { auto id = get_scheduler_id(item); // TODO: move this check into OpSchedulerItem, handle backwards compat if (op_scheduler_class::immediate == id.class_id) { immediate.push_front(std::move(item)); } else { int cost = calc_scaled_cost(item.get_cost()); // Add item to scheduler queue scheduler.add_request( std::move(item), id, cost); } } void mClockScheduler::enqueue_front(OpSchedulerItem&& item) { immediate.push_back(std::move(item)); // TODO: item may not be immediate, update mclock machinery to permit // putting the item back in the queue } WorkItem mClockScheduler::dequeue() { if (!immediate.empty()) { WorkItem work_item{std::move(immediate.back())}; immediate.pop_back(); return work_item; } else { mclock_queue_t::PullReq result = scheduler.pull_request(); if (result.is_future()) { return result.getTime(); } else if (result.is_none()) { ceph_assert( 0 == "Impossible, must have checked empty() first"); return {}; } else { ceph_assert(result.is_retn()); auto &retn = result.get_retn(); return std::move(*retn.request); } } } const char** mClockScheduler::get_tracked_conf_keys() const { static const char* KEYS[] = { "osd_mclock_scheduler_client_res", "osd_mclock_scheduler_client_wgt", "osd_mclock_scheduler_client_lim", "osd_mclock_scheduler_background_recovery_res", "osd_mclock_scheduler_background_recovery_wgt", "osd_mclock_scheduler_background_recovery_lim", "osd_mclock_scheduler_background_best_effort_res", "osd_mclock_scheduler_background_best_effort_wgt", "osd_mclock_scheduler_background_best_effort_lim", "osd_mclock_cost_per_io_usec", "osd_mclock_cost_per_io_usec_hdd", "osd_mclock_cost_per_io_usec_ssd", "osd_mclock_cost_per_byte_usec", "osd_mclock_cost_per_byte_usec_hdd", "osd_mclock_cost_per_byte_usec_ssd", "osd_mclock_max_capacity_iops_hdd", "osd_mclock_max_capacity_iops_ssd", "osd_mclock_profile", NULL }; return KEYS; } void mClockScheduler::handle_conf_change( const ConfigProxy& conf, const std::set &changed) { if (changed.count("osd_mclock_cost_per_io_usec") || changed.count("osd_mclock_cost_per_io_usec_hdd") || changed.count("osd_mclock_cost_per_io_usec_ssd")) { set_osd_mclock_cost_per_io(); } if (changed.count("osd_mclock_cost_per_byte_usec") || changed.count("osd_mclock_cost_per_byte_usec_hdd") || changed.count("osd_mclock_cost_per_byte_usec_ssd")) { set_osd_mclock_cost_per_byte(); } if (changed.count("osd_mclock_max_capacity_iops_hdd") || changed.count("osd_mclock_max_capacity_iops_ssd")) { set_max_osd_capacity(); if (mclock_profile != "custom") { enable_mclock_profile_settings(); client_registry.update_from_config(conf); } } if (changed.count("osd_mclock_profile")) { set_mclock_profile(); if (mclock_profile != "custom") { enable_mclock_profile_settings(); client_registry.update_from_config(conf); } } if (changed.count("osd_mclock_scheduler_client_res") || changed.count("osd_mclock_scheduler_client_wgt") || changed.count("osd_mclock_scheduler_client_lim") || changed.count("osd_mclock_scheduler_background_recovery_res") || changed.count("osd_mclock_scheduler_background_recovery_wgt") || changed.count("osd_mclock_scheduler_background_recovery_lim") || changed.count("osd_mclock_scheduler_background_best_effort_res") || changed.count("osd_mclock_scheduler_background_best_effort_wgt") || changed.count("osd_mclock_scheduler_background_best_effort_lim")) { if (mclock_profile == "custom") { client_registry.update_from_config(conf); } } } mClockScheduler::~mClockScheduler() { cct->_conf.remove_observer(this); } }