path: root/src/mds/MDBalancer.cc
Diffstat (limited to 'src/mds/MDBalancer.cc')
-rw-r--r--   src/mds/MDBalancer.cc   1515
1 files changed, 1515 insertions, 0 deletions
diff --git a/src/mds/MDBalancer.cc b/src/mds/MDBalancer.cc
new file mode 100644
index 000000000..baa43bb43
--- /dev/null
+++ b/src/mds/MDBalancer.cc
@@ -0,0 +1,1515 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "include/compat.h"
+#include "mdstypes.h"
+
+#include "mon/MonClient.h"
+#include "MDBalancer.h"
+#include "MDSRank.h"
+#include "MDSMap.h"
+#include "CInode.h"
+#include "CDir.h"
+#include "MDCache.h"
+#include "Migrator.h"
+#include "Mantle.h"
+
+#include "include/Context.h"
+#include "msg/Messenger.h"
+
+#include <fstream>
+#include <vector>
+#include <map>
+
+using namespace std;
+
+#include "common/config.h"
+#include "common/errno.h"
+
+/* Note, by default debug_mds_balancer is 1/5. For debug messages 1<lvl<=5,
+ * should_gather (below) will be true; so, debug_mds will be ignored even if
+ * set to 20/20. For this reason, some messages may not appear in the log.
+ * Increase both debug levels to get expected output!
+ */
+#define dout_context g_ceph_context
+#undef dout_prefix
+#define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".bal " << __func__ << " "
+#undef dout
+#define dout(lvl) \
+ do {\
+ auto subsys = ceph_subsys_mds;\
+ if ((dout_context)->_conf->subsys.should_gather(ceph_subsys_mds_balancer, lvl)) {\
+ subsys = ceph_subsys_mds_balancer;\
+ }\
+ dout_impl(dout_context, ceph::dout::need_dynamic(subsys), lvl) dout_prefix
+#undef dendl
+#define dendl dendl_impl; } while (0)
+
+
+#define MIN_LOAD 50 // ??
+#define MIN_REEXPORT 5 // will automatically reexport
+#define MIN_OFFLOAD 10 // point at which i stop trying, close enough
+
+
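+// Dispatch balancer messages from peer ranks; only MDS heartbeats are expected.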
+int MDBalancer::proc_message(const cref_t<Message> &m)
+{
+ switch (m->get_type()) {
+
+ case MSG_MDS_HEARTBEAT:
+ handle_heartbeat(ref_cast<MHeartbeat>(m));
+ break;
+
+ default:
+ derr << " balancer unknown message " << m->get_type() << dendl_impl;
+ ceph_abort_msg("balancer unknown message");
+ }
+
+ return 0;
+}
+
+MDBalancer::MDBalancer(MDSRank *m, Messenger *msgr, MonClient *monc) :
+ mds(m), messenger(msgr), mon_client(monc)
+{
+ bal_fragment_dirs = g_conf().get_val<bool>("mds_bal_fragment_dirs");
+ bal_fragment_interval = g_conf().get_val<int64_t>("mds_bal_fragment_interval");
+}
+
+void MDBalancer::handle_conf_change(const std::set<std::string>& changed, const MDSMap& mds_map)
+{
+ if (changed.count("mds_bal_fragment_dirs")) {
+ bal_fragment_dirs = g_conf().get_val<bool>("mds_bal_fragment_dirs");
+ }
+ if (changed.count("mds_bal_fragment_interval")) {
+ bal_fragment_interval = g_conf().get_val<int64_t>("mds_bal_fragment_interval");
+ }
+}
+
+bool MDBalancer::test_rank_mask(mds_rank_t rank)
+{
+ return mds->mdsmap->get_bal_rank_mask_bitset().test(rank);
+}
+
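+/* Walk the export-pin queue and our auth subtrees, resolving each inode's pin
+ * policy (explicit rank, ephemeral random, ephemeral distributed) into a target
+ * rank: create or clear aux subtrees for local targets, queue the splits a
+ * distributed pin requires, export pinned dirfrags whose target is another
+ * rank, and park pins beyond max_mds on the delayed queue.
+ */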
+void MDBalancer::handle_export_pins(void)
+{
+ const mds_rank_t max_mds = mds->mdsmap->get_max_mds();
+ auto mdcache = mds->mdcache;
+
+ auto &q = mdcache->export_pin_queue;
+ auto it = q.begin();
+ dout(20) << "export_pin_queue size=" << q.size() << dendl;
+ while (it != q.end()) {
+ auto cur = it++;
+ CInode *in = *cur;
+ ceph_assert(in->is_dir());
+
+ mds_rank_t export_pin = in->get_export_pin(false);
+ in->check_pin_policy(export_pin);
+
+ if (export_pin >= max_mds) {
+ dout(20) << " delay export_pin=" << export_pin << " on " << *in << dendl;
+ in->state_clear(CInode::STATE_QUEUEDEXPORTPIN);
+ q.erase(cur);
+
+ in->state_set(CInode::STATE_DELAYEDEXPORTPIN);
+ mdcache->export_pin_delayed_queue.insert(in);
+ continue;
+ }
+
+ dout(20) << " executing export_pin=" << export_pin << " on " << *in << dendl;
+ unsigned min_frag_bits = 0;
+ mds_rank_t target = MDS_RANK_NONE;
+ if (export_pin >= 0)
+ target = export_pin;
+ else if (export_pin == MDS_RANK_EPHEMERAL_RAND)
+ target = mdcache->hash_into_rank_bucket(in->ino());
+ else if (export_pin == MDS_RANK_EPHEMERAL_DIST)
+ min_frag_bits = mdcache->get_ephemeral_dist_frag_bits();
+
+ bool remove = true;
+ for (auto&& dir : in->get_dirfrags()) {
+ if (!dir->is_auth())
+ continue;
+
+ if (export_pin == MDS_RANK_EPHEMERAL_DIST) {
+ if (dir->get_frag().bits() < min_frag_bits) {
+ if (!dir->state_test(CDir::STATE_CREATING) &&
+ !dir->is_frozen() && !dir->is_freezing()) {
+ queue_split(dir, true);
+ }
+ remove = false;
+ continue;
+ }
+ target = mdcache->hash_into_rank_bucket(in->ino(), dir->get_frag());
+ }
+
+ if (target == MDS_RANK_NONE) {
+ if (dir->state_test(CDir::STATE_AUXSUBTREE)) {
+ if (dir->is_frozen() || dir->is_freezing()) {
+ // try again later
+ remove = false;
+ continue;
+ }
+ dout(10) << " clear auxsubtree on " << *dir << dendl;
+ dir->state_clear(CDir::STATE_AUXSUBTREE);
+ mds->mdcache->try_subtree_merge(dir);
+ }
+ } else if (target == mds->get_nodeid()) {
+ if (dir->state_test(CDir::STATE_AUXSUBTREE)) {
+ ceph_assert(dir->is_subtree_root());
+ } else if (dir->state_test(CDir::STATE_CREATING) ||
+ dir->is_frozen() || dir->is_freezing()) {
+ // try again later
+ remove = false;
+ continue;
+ } else if (!dir->is_subtree_root()) {
+ dir->state_set(CDir::STATE_AUXSUBTREE);
+ mds->mdcache->adjust_subtree_auth(dir, mds->get_nodeid());
+ dout(10) << " create aux subtree on " << *dir << dendl;
+ } else {
+ dout(10) << " set auxsubtree bit on " << *dir << dendl;
+ dir->state_set(CDir::STATE_AUXSUBTREE);
+ }
+ } else {
+ /* Only export a directory if it's non-empty. An empty directory will
+ * be sent back by the importer.
+ */
+ if (dir->get_num_head_items() > 0) {
+ mds->mdcache->migrator->export_dir(dir, target);
+ }
+ remove = false;
+ }
+ }
+
+ if (remove) {
+ in->state_clear(CInode::STATE_QUEUEDEXPORTPIN);
+ q.erase(cur);
+ }
+ }
+
+ std::vector<CDir*> authsubs = mdcache->get_auth_subtrees();
+ bool print_auth_subtrees = true;
+
+ if (authsubs.size() > AUTH_TREES_THRESHOLD &&
+ !g_conf()->subsys.should_gather<ceph_subsys_mds, 25>()) {
+ dout(15) << "number of auth trees = " << authsubs.size() << "; not "
+ "printing auth trees" << dendl;
+ print_auth_subtrees = false;
+ }
+
+ for (auto &cd : authsubs) {
+ mds_rank_t export_pin = cd->inode->get_export_pin();
+ cd->inode->check_pin_policy(export_pin);
+
+ if (export_pin == MDS_RANK_EPHEMERAL_DIST) {
+ export_pin = mdcache->hash_into_rank_bucket(cd->ino(), cd->get_frag());
+ } else if (export_pin == MDS_RANK_EPHEMERAL_RAND) {
+ export_pin = mdcache->hash_into_rank_bucket(cd->ino());
+ }
+
+ if (print_auth_subtrees)
+ dout(25) << "auth tree " << *cd << " export_pin=" << export_pin << dendl;
+
+ if (export_pin >= 0 && export_pin != mds->get_nodeid() &&
+ export_pin < mds->mdsmap->get_max_mds()) {
+ mdcache->migrator->export_dir(cd, export_pin);
+ }
+ }
+}
+
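+/* Periodic driver, called from the MDS tick: apply export pins, refresh the
+ * popularity sample timestamp, and on an active rank 0 send a heartbeat every
+ * mds_bal_interval (bounded by mds_bal_max / mds_bal_max_until) to start a
+ * balancing round.
+ */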
+void MDBalancer::tick()
+{
+ static int num_bal_times = g_conf()->mds_bal_max;
+ auto bal_interval = g_conf().get_val<int64_t>("mds_bal_interval");
+ auto bal_max_until = g_conf().get_val<int64_t>("mds_bal_max_until");
+ time now = clock::now();
+
+ if (g_conf()->mds_bal_export_pin) {
+ handle_export_pins();
+ }
+
+ // sample?
+ if (chrono::duration<double>(now-last_sample).count() >
+ g_conf()->mds_bal_sample_interval) {
+ dout(15) << "tick last_sample now " << now << dendl;
+ last_sample = now;
+ }
+
+ // We can use duration_cast below, although the result is an int,
+ // because the values from g_conf are also integers.
+ // balance?
+ if (mds->get_nodeid() == 0
+ && mds->is_active()
+ && bal_interval > 0
+ && chrono::duration_cast<chrono::seconds>(now - last_heartbeat).count() >= bal_interval
+ && (num_bal_times || (bal_max_until >= 0 && mds->get_uptime().count() > bal_max_until))) {
+ last_heartbeat = now;
+ send_heartbeat();
+ num_bal_times--;
+ }
+
+ mds->mdcache->show_subtrees(10, true);
+}
+
+
+
+
+class C_Bal_SendHeartbeat : public MDSInternalContext {
+public:
+ explicit C_Bal_SendHeartbeat(MDSRank *mds_) : MDSInternalContext(mds_) { }
+ void finish(int f) override {
+ mds->balancer->send_heartbeat();
+ }
+};
+
+
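+/* Collapse a load vector into one scalar, per mds_bal_mode:
+ *  0 = weighted auth/all meta load + request rate + queue length
+ *  1 = request rate + queue length
+ *  2 = cpu load average
+ */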
+double mds_load_t::mds_load() const
+{
+ switch(g_conf()->mds_bal_mode) {
+ case 0:
+ return
+ .8 * auth.meta_load() +
+ .2 * all.meta_load() +
+ req_rate +
+ 10.0 * queue_len;
+
+ case 1:
+ return req_rate + 10.0*queue_len;
+
+ case 2:
+ return cpu_load_avg;
+
+ }
+ ceph_abort();
+ return 0;
+}
+
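+/* Sample this rank's current load: root dirfrag popularity (auth and all),
+ * request rate, cpu time from /proc/self/stat, dispatch queue length and
+ * cache-hit rate. Rates are computed against the previous sample once at
+ * least one second has elapsed.
+ */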
+mds_load_t MDBalancer::get_load()
+{
+ auto now = clock::now();
+
+ mds_load_t load{DecayRate()}; /* zero DecayRate! */
+
+ if (mds->mdcache->get_root()) {
+ auto&& ls = mds->mdcache->get_root()->get_dirfrags();
+ for (auto &d : ls) {
+ load.auth.add(d->pop_auth_subtree_nested);
+ load.all.add(d->pop_nested);
+ }
+ } else {
+ dout(20) << "no root, no load" << dendl;
+ }
+
+ uint64_t num_requests = mds->get_num_requests();
+ uint64_t num_traverse = mds->logger->get(l_mds_traverse);
+ uint64_t num_traverse_hit = mds->logger->get(l_mds_traverse_hit);
+
+ uint64_t cpu_time = 1;
+ {
+ string stat_path = PROCPREFIX "/proc/self/stat";
+ ifstream stat_file(stat_path);
+ if (stat_file.is_open()) {
+ vector<string> stat_vec(std::istream_iterator<string>{stat_file},
+ std::istream_iterator<string>());
+ if (stat_vec.size() >= 15) {
+ // utime + stime
+ cpu_time = strtoll(stat_vec[13].c_str(), nullptr, 10) +
+ strtoll(stat_vec[14].c_str(), nullptr, 10);
+ } else {
+ derr << "input file '" << stat_path << "' not resolvable" << dendl_impl;
+ }
+ } else {
+ derr << "input file '" << stat_path << "' not found" << dendl_impl;
+ }
+ }
+
+ load.queue_len = messenger->get_dispatch_queue_len();
+
+ bool update_last = true;
+ if (last_get_load != clock::zero() &&
+ now > last_get_load) {
+ double el = std::chrono::duration<double>(now-last_get_load).count();
+ if (el >= 1.0) {
+ if (num_requests > last_num_requests)
+ load.req_rate = (num_requests - last_num_requests) / el;
+ if (cpu_time > last_cpu_time)
+ load.cpu_load_avg = (cpu_time - last_cpu_time) / el;
+ if (num_traverse > last_num_traverse && num_traverse_hit > last_num_traverse_hit)
+ load.cache_hit_rate = (double)(num_traverse_hit - last_num_traverse_hit) / (num_traverse - last_num_traverse);
+ } else {
+ auto p = mds_load.find(mds->get_nodeid());
+ if (p != mds_load.end()) {
+ load.req_rate = p->second.req_rate;
+ load.cpu_load_avg = p->second.cpu_load_avg;
+ load.cache_hit_rate = p->second.cache_hit_rate;
+ }
+ if (num_requests >= last_num_requests && cpu_time >= last_cpu_time &&
+ num_traverse >= last_num_traverse && num_traverse_hit >= last_num_traverse_hit)
+ update_last = false;
+ }
+ }
+
+ if (update_last) {
+ last_num_requests = num_requests;
+ last_cpu_time = cpu_time;
+ last_get_load = now;
+ last_num_traverse = num_traverse;
+ last_num_traverse_hit = num_traverse_hit;
+ }
+
+ dout(15) << load << dendl;
+ return load;
+}
+
+/*
+ * Read synchronously from RADOS using a timeout. We cannot do daemon-local
+ * fallbacks (i.e. kick off async read when we are processing the map and
+ * check status when we get here) with the way the mds is structured.
+ */
+int MDBalancer::localize_balancer()
+{
+ /* reset everything */
+ bool ack = false;
+ int r = 0;
+ bufferlist lua_src;
+ ceph::mutex lock = ceph::make_mutex("lock");
+ ceph::condition_variable cond;
+
+ /* we assume that balancer is in the metadata pool */
+ object_t oid = object_t(mds->mdsmap->get_balancer());
+ object_locator_t oloc(mds->get_metadata_pool());
+ ceph_tid_t tid = mds->objecter->read(oid, oloc, 0, 0, CEPH_NOSNAP, &lua_src, 0,
+ new C_SafeCond(lock, cond, &ack, &r));
+ dout(15) << "launched non-blocking read tid=" << tid
+ << " oid=" << oid << " oloc=" << oloc << dendl;
+
+ /* timeout: if we waste half our time waiting for RADOS, then abort! */
+ std::cv_status ret_t = [&] {
+ auto bal_interval = g_conf().get_val<int64_t>("mds_bal_interval");
+ std::unique_lock locker{lock};
+ return cond.wait_for(locker, std::chrono::seconds(bal_interval / 2));
+ }();
+ /* success: store the balancer in memory and set the version. */
+ if (!r) {
+ if (ret_t == std::cv_status::timeout) {
+ mds->objecter->op_cancel(tid, -CEPHFS_ECANCELED);
+ return -CEPHFS_ETIMEDOUT;
+ }
+ bal_code.assign(lua_src.to_str());
+ bal_version.assign(oid.name);
+ dout(10) "bal_code=" << bal_code << dendl;
+ }
+ return r;
+}
+
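+/* Broadcast this rank's load and import map to every other up rank; rank 0
+ * advances the beat epoch and clears stale load info first. Deferred until
+ * the cache has opened root, and skipped while the cluster is degraded.
+ */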
+void MDBalancer::send_heartbeat()
+{
+ if (mds->is_cluster_degraded()) {
+ dout(10) << "degraded" << dendl;
+ return;
+ }
+
+ if (!mds->mdcache->is_open()) {
+ dout(10) << "not open" << dendl;
+ mds->mdcache->wait_for_open(new C_Bal_SendHeartbeat(mds));
+ return;
+ }
+
+ if (mds->get_nodeid() == 0) {
+ beat_epoch++;
+ mds_load.clear();
+ }
+
+ // my load
+ mds_load_t load = get_load();
+ mds->logger->set(l_mds_load_cent, 100 * load.mds_load());
+ mds->logger->set(l_mds_dispatch_queue_len, load.queue_len);
+
+ auto em = mds_load.emplace(std::piecewise_construct, std::forward_as_tuple(mds->get_nodeid()), std::forward_as_tuple(load));
+ if (!em.second) {
+ em.first->second = load;
+ }
+
+ // import_map -- how much do i import from whom
+ map<mds_rank_t, float> import_map;
+ for (auto& im : mds->mdcache->get_auth_subtrees()) {
+ mds_rank_t from = im->inode->authority().first;
+ if (from == mds->get_nodeid()) continue;
+ if (im->get_inode()->is_stray()) continue;
+ import_map[from] += im->pop_auth_subtree.meta_load();
+ }
+ mds_import_map[ mds->get_nodeid() ] = import_map;
+
+
+ dout(3) << " epoch " << beat_epoch << " load " << load << dendl;
+ for (const auto& [rank, load] : import_map) {
+ dout(5) << " import_map from " << rank << " -> " << load << dendl;
+ }
+
+
+ set<mds_rank_t> up;
+ mds->get_mds_map()->get_up_mds_set(up);
+ for (const auto& r : up) {
+ if (r == mds->get_nodeid())
+ continue;
+ auto hb = make_message<MHeartbeat>(load, beat_epoch);
+ hb->get_import_map() = import_map;
+ mds->send_message_mds(hb, r);
+ }
+}
+
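+/* Record the sender's load and import map. Non-zero ranks answer rank 0's
+ * beat with their own heartbeat; once loads for every in rank of this epoch
+ * are known, run the Mantle balancer (if configured) or the default
+ * prep_rebalance.
+ */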
+void MDBalancer::handle_heartbeat(const cref_t<MHeartbeat> &m)
+{
+ mds_rank_t who = mds_rank_t(m->get_source().num());
+ dout(25) << "=== got heartbeat " << m->get_beat() << " from " << m->get_source().num() << " " << m->get_load() << dendl;
+
+ if (!mds->is_active())
+ return;
+
+ if (!mds->mdcache->is_open()) {
+ dout(10) << "opening root on handle_heartbeat" << dendl;
+ mds->mdcache->wait_for_open(new C_MDS_RetryMessage(mds, m));
+ return;
+ }
+
+ if (mds->is_cluster_degraded()) {
+ dout(10) << " degraded, ignoring" << dendl;
+ return;
+ }
+
+ if (mds->get_nodeid() != 0 && m->get_beat() > beat_epoch) {
+ dout(10) << "receive next epoch " << m->get_beat() << " from mds." << who << " before mds0" << dendl;
+
+ beat_epoch = m->get_beat();
+ // clear the mds load info whose epoch is less than beat_epoch
+ mds_load.clear();
+ }
+
+ if (who == 0) {
+ dout(20) << " from mds0, new epoch " << m->get_beat() << dendl;
+ if (beat_epoch != m->get_beat()) {
+ beat_epoch = m->get_beat();
+ mds_load.clear();
+ }
+
+ send_heartbeat();
+
+ mds->mdcache->show_subtrees();
+ } else if (mds->get_nodeid() == 0) {
+ if (beat_epoch != m->get_beat()) {
+ dout(10) << " old heartbeat epoch, ignoring" << dendl;
+ return;
+ }
+ }
+
+ {
+ auto em = mds_load.emplace(std::piecewise_construct, std::forward_as_tuple(who), std::forward_as_tuple(m->get_load()));
+ if (!em.second) {
+ em.first->second = m->get_load();
+ }
+ }
+ mds_import_map[who] = m->get_import_map();
+
+ mds->mdsmap->update_num_mdss_in_rank_mask_bitset();
+
+ if (mds->mdsmap->get_num_mdss_in_rank_mask_bitset() > 0)
+ {
+ unsigned cluster_size = mds->get_mds_map()->get_num_in_mds();
+ if (mds_load.size() == cluster_size) {
+ // let's go!
+ //export_empties(); // no!
+
+ /* avoid spamming ceph -w if user does not turn mantle on */
+ if (mds->mdsmap->get_balancer() != "") {
+ int r = mantle_prep_rebalance();
+ if (!r) return;
+ mds->clog->warn() << "using old balancer; mantle failed for "
+ << "balancer=" << mds->mdsmap->get_balancer()
+ << " : " << cpp_strerror(r);
+ }
+ prep_rebalance(m->get_beat());
+ }
+ }
+}
+
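+// Move up to min(maxex, maxim) load units from exporter ex to importer im in
+// the balance state; if we are the exporter, record it as an export target.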
+double MDBalancer::try_match(balance_state_t& state, mds_rank_t ex, double& maxex,
+ mds_rank_t im, double& maxim)
+{
+ if (maxex <= 0 || maxim <= 0) return 0.0;
+
+ double howmuch = std::min(maxex, maxim);
+
+ dout(5) << " - mds." << ex << " exports " << howmuch << " to mds." << im << dendl;
+
+ if (ex == mds->get_nodeid())
+ state.targets[im] += howmuch;
+
+ state.exported[ex] += howmuch;
+ state.imported[im] += howmuch;
+
+ maxex -= howmuch;
+ maxim -= howmuch;
+
+ return howmuch;
+}
+
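+/* Queue a split of dir by mds_bal_split_bits. Fast splits run via the MDS
+ * waiter queue at the end of the current dispatch; normal splits wait
+ * bal_fragment_interval so a burst of ops can finish before the dir freezes.
+ */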
+void MDBalancer::queue_split(const CDir *dir, bool fast)
+{
+ dout(10) << __func__ << " enqueuing " << *dir
+ << " (fast=" << fast << ")" << dendl;
+
+ const dirfrag_t df = dir->dirfrag();
+
+ auto callback = [this, df](int r) {
+ if (split_pending.erase(df) == 0) {
+ // Someone beat me to it. This can happen in the fast splitting
+ // path, because we spawn two contexts, one with mds->timer and
+ // one with mds->queue_waiter. The loser can safely just drop
+ // out.
+ return;
+ }
+
+ auto mdcache = mds->mdcache;
+
+ CDir *dir = mdcache->get_dirfrag(df);
+ if (!dir) {
+ dout(10) << "drop split on " << df << " because not in cache" << dendl;
+ return;
+ }
+ if (!dir->is_auth()) {
+ dout(10) << "drop split on " << df << " because non-auth" << dendl;
+ return;
+ }
+
+ // Pass on to MDCache: note that the split might still not
+ // happen if the checks in MDCache::can_fragment fail.
+ dout(10) << __func__ << " splitting " << *dir << dendl;
+ int bits = g_conf()->mds_bal_split_bits;
+ if (dir->inode->is_ephemeral_dist()) {
+ unsigned min_frag_bits = mdcache->get_ephemeral_dist_frag_bits();
+ if (df.frag.bits() + bits < min_frag_bits)
+ bits = min_frag_bits - df.frag.bits();
+ }
+ mdcache->split_dir(dir, bits);
+ };
+
+ auto ret = split_pending.insert(df);
+ bool is_new = ret.second;
+
+ if (fast) {
+ // Do the split ASAP: enqueue it in the MDSRank waiters which are
+ // run at the end of dispatching the current request
+ mds->queue_waiter(new MDSInternalContextWrapper(mds,
+ new LambdaContext(std::move(callback))));
+ } else if (is_new) {
+ // Set a timer to really do the split: we don't do it immediately
+ // so that bursts of ops on a directory have a chance to go through
+ // before we freeze it.
+ mds->timer.add_event_after(bal_fragment_interval,
+ new LambdaContext(std::move(callback)));
+ }
+}
+
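+/* Queue a merge of dir after bal_fragment_interval; the callback walks up the
+ * frag tree as long as every sibling frag is auth and should_merge().
+ */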
+void MDBalancer::queue_merge(CDir *dir)
+{
+ const auto frag = dir->dirfrag();
+ auto callback = [this, frag](int r) {
+ ceph_assert(frag.frag != frag_t());
+
+ // frag must be in this set because only one context is in flight
+ // for a given frag at a time (because merge_pending is checked before
+ // starting one), and this context is the only one that erases it.
+ merge_pending.erase(frag);
+
+ auto mdcache = mds->mdcache;
+ CDir *dir = mdcache->get_dirfrag(frag);
+ if (!dir) {
+ dout(10) << "drop merge on " << frag << " because not in cache" << dendl;
+ return;
+ }
+ ceph_assert(dir->dirfrag() == frag);
+
+ if(!dir->is_auth()) {
+ dout(10) << "drop merge on " << *dir << " because lost auth" << dendl;
+ return;
+ }
+
+ dout(10) << "merging " << *dir << dendl;
+
+ CInode *diri = dir->get_inode();
+
+ unsigned min_frag_bits = 0;
+ if (diri->is_ephemeral_dist())
+ min_frag_bits = mdcache->get_ephemeral_dist_frag_bits();
+
+ frag_t fg = dir->get_frag();
+ while (fg.bits() > min_frag_bits) {
+ frag_t sibfg = fg.get_sibling();
+ auto&& [complete, sibs] = diri->get_dirfrags_under(sibfg);
+ if (!complete) {
+ dout(10) << " not all sibs under " << sibfg << " in cache (have " << sibs << ")" << dendl;
+ break;
+ }
+ bool all = true;
+ for (auto& sib : sibs) {
+ if (!sib->is_auth() || !sib->should_merge()) {
+ all = false;
+ break;
+ }
+ }
+ if (!all) {
+ dout(10) << " not all sibs under " << sibfg << " " << sibs << " should_merge" << dendl;
+ break;
+ }
+ dout(10) << " all sibs under " << sibfg << " " << sibs << " should merge" << dendl;
+ fg = fg.parent();
+ }
+
+ if (fg != dir->get_frag())
+ mdcache->merge_dir(diri, fg);
+ };
+
+ if (merge_pending.count(frag) == 0) {
+ dout(20) << " enqueued dir " << *dir << dendl;
+ merge_pending.insert(frag);
+ mds->timer.add_event_after(bal_fragment_interval,
+ new LambdaContext(std::move(callback)));
+ } else {
+ dout(20) << " dir already in queue " << *dir << dendl;
+ }
+}
+
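+/* Default balancer. Rescale each rank's mds_load into meta-load units, compute
+ * the per-rank target load over the ranks allowed by the bal rank mask, decide
+ * whether this rank has been overloaded long enough, split ranks into importers
+ * and exporters, and match them (import-map matches first, then big-to-big or
+ * small-to-big depending on beat parity) to fill state.targets.
+ */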
+void MDBalancer::prep_rebalance(int beat)
+{
+ balance_state_t state;
+
+ if (g_conf()->mds_thrash_exports) {
+ //we're going to randomly export to all the mds in the cluster
+ set<mds_rank_t> up_mds;
+ mds->get_mds_map()->get_up_mds_set(up_mds);
+ for (const auto &rank : up_mds) {
+ state.targets[rank] = 0.0;
+ }
+ } else {
+ int cluster_size = mds->get_mds_map()->get_num_in_mds();
+ mds_rank_t whoami = mds->get_nodeid();
+ rebalance_time = clock::now();
+
+ dout(7) << "cluster loads are" << dendl;
+
+ mds->mdcache->migrator->clear_export_queue();
+
+ // rescale! turn my mds_load back into meta_load units
+ double load_fac = 1.0;
+ map<mds_rank_t, mds_load_t>::iterator m = mds_load.find(whoami);
+ if ((m != mds_load.end()) && (m->second.mds_load() > 0)) {
+ double metald = m->second.auth.meta_load();
+ double mdsld = m->second.mds_load();
+ load_fac = metald / mdsld;
+ dout(7) << " load_fac is " << load_fac
+ << " <- " << m->second.auth << " " << metald
+ << " / " << mdsld
+ << dendl;
+ }
+
+ mds_meta_load.clear();
+
+ double total_load = 0.0;
+ multimap<double,mds_rank_t> load_map;
+ for (mds_rank_t i=mds_rank_t(0); i < mds_rank_t(cluster_size); i++) {
+ mds_load_t& load = mds_load.at(i);
+
+ double l = load.mds_load() * load_fac;
+ mds_meta_load[i] = l;
+
+ if (whoami == 0)
+ dout(7) << " mds." << i
+ << " " << load
+ << " = " << load.mds_load()
+ << " ~ " << l << dendl;
+
+ if (whoami == i) my_load = l;
+ total_load += l;
+
+ load_map.insert(pair<double,mds_rank_t>( l, i ));
+ }
+
+ // target load
+ target_load = total_load / (double)mds->mdsmap->get_num_mdss_in_rank_mask_bitset();
+ dout(7) << "my load " << my_load
+ << " target " << target_load
+ << " total " << total_load
+ << dendl;
+
+ // under or over?
+ for (const auto& [load, rank] : load_map) {
+ if (test_rank_mask(rank) &&
+ load < target_load * (1.0 + g_conf()->mds_bal_min_rebalance)) {
+ dout(7) << " mds." << rank << " is underloaded or barely overloaded." << dendl;
+ mds_last_epoch_under_map[rank] = beat_epoch;
+ }
+ }
+
+ int last_epoch_under = mds_last_epoch_under_map[whoami];
+ if (last_epoch_under == beat_epoch) {
+ dout(7) << " i am underloaded or barely overloaded, doing nothing." << dendl;
+ return;
+ }
+ // am i over long enough?
+ if (last_epoch_under && beat_epoch - last_epoch_under < 2) {
+ dout(7) << " i am overloaded, but only for " << (beat_epoch - last_epoch_under) << " epochs" << dendl;
+ return;
+ }
+
+ dout(7) << " i am sufficiently overloaded" << dendl;
+
+
+ // first separate exporters and importers
+ multimap<double,mds_rank_t> importers;
+ multimap<double,mds_rank_t> exporters;
+ set<mds_rank_t> importer_set;
+ set<mds_rank_t> exporter_set;
+
+ for (multimap<double,mds_rank_t>::iterator it = load_map.begin();
+ it != load_map.end();
+ ++it) {
+ if (it->first < target_load && test_rank_mask(it->second)) {
+ dout(15) << " mds." << it->second << " is importer" << dendl;
+ importers.insert(pair<double,mds_rank_t>(it->first,it->second));
+ importer_set.insert(it->second);
+ } else {
+ int mds_last_epoch_under = mds_last_epoch_under_map[it->second];
+ if (!(mds_last_epoch_under && beat_epoch - mds_last_epoch_under < 2)) {
+ dout(15) << " mds." << it->second << " is exporter" << dendl;
+ exporters.insert(pair<double,mds_rank_t>(it->first,it->second));
+ exporter_set.insert(it->second);
+ }
+ }
+ }
+
+
+ // determine load transfer mapping
+
+ if (true) {
+ // analyze import_map; do any matches i can
+
+ dout(15) << " matching exporters to import sources" << dendl;
+
+ // big -> small exporters
+ for (multimap<double,mds_rank_t>::reverse_iterator ex = exporters.rbegin();
+ ex != exporters.rend();
+ ++ex) {
+ double ex_target_load = test_rank_mask(ex->second) ? target_load : 0.0;
+ double maxex = get_maxex(state, ex->second, ex_target_load);
+ if (maxex <= .001) continue;
+
+ // check importers. for now, just in arbitrary order (no intelligent matching).
+ for (map<mds_rank_t, float>::iterator im = mds_import_map[ex->second].begin();
+ im != mds_import_map[ex->second].end();
+ ++im) {
+ double maxim = get_maxim(state, im->first, target_load);
+ if (maxim <= .001) continue;
+ try_match(state, ex->second, maxex, im->first, maxim);
+ if (maxex <= .001) break;
+ }
+ }
+ }
+
+ // old way
+ if (beat % 2 == 1) {
+ dout(15) << " matching big exporters to big importers" << dendl;
+ // big exporters to big importers
+ multimap<double,mds_rank_t>::reverse_iterator ex = exporters.rbegin();
+ multimap<double,mds_rank_t>::iterator im = importers.begin();
+ while (ex != exporters.rend() &&
+ im != importers.end()) {
+ double ex_target_load = test_rank_mask(ex->second) ? target_load : 0.0;
+ double maxex = get_maxex(state, ex->second, ex_target_load);
+ double maxim = get_maxim(state, im->second, target_load);
+ if (maxex < .001 || maxim < .001) break;
+ try_match(state, ex->second, maxex, im->second, maxim);
+ if (maxex <= .001) ++ex;
+ if (maxim <= .001) ++im;
+ }
+ } else { // new way
+ dout(15) << " matching small exporters to big importers" << dendl;
+ // small exporters to big importers
+ multimap<double,mds_rank_t>::iterator ex = exporters.begin();
+ multimap<double,mds_rank_t>::iterator im = importers.begin();
+ while (ex != exporters.end() &&
+ im != importers.end()) {
+ double ex_target_load = test_rank_mask(ex->second) ? target_load : 0.0;
+ double maxex = get_maxex(state, ex->second, ex_target_load);
+ double maxim = get_maxim(state, im->second, target_load);
+ if (maxex < .001 || maxim < .001) break;
+ try_match(state, ex->second, maxex, im->second, maxim);
+ if (maxex <= .001) ++ex;
+ if (maxim <= .001) ++im;
+ }
+ }
+ }
+ try_rebalance(state);
+}
+
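+/* Mantle (Lua) balancer: re-fetch the balancer script from the metadata pool
+ * when its version changes, hand per-rank metrics to Mantle, and rebalance with
+ * the targets it returns. On failure the caller falls back to prep_rebalance.
+ */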
+int MDBalancer::mantle_prep_rebalance()
+{
+ balance_state_t state;
+
+ /* refresh balancer if it has changed */
+ if (bal_version != mds->mdsmap->get_balancer()) {
+ bal_version.assign("");
+ int r = localize_balancer();
+ if (r) return r;
+
+ /* only spam the cluster log from 1 mds on version changes */
+ if (mds->get_nodeid() == 0)
+ mds->clog->info() << "mantle balancer version changed: " << bal_version;
+ }
+
+ /* prepare for balancing */
+ int cluster_size = mds->get_mds_map()->get_num_in_mds();
+ rebalance_time = clock::now();
+ mds->mdcache->migrator->clear_export_queue();
+
+ /* fill in the metrics for each mds by grabbing load struct */
+ vector < map<string, double> > metrics (cluster_size);
+ for (mds_rank_t i=mds_rank_t(0); i < mds_rank_t(cluster_size); i++) {
+ mds_load_t& load = mds_load.at(i);
+
+ metrics[i] = {{"auth.meta_load", load.auth.meta_load()},
+ {"all.meta_load", load.all.meta_load()},
+ {"req_rate", load.req_rate},
+ {"queue_len", load.queue_len},
+ {"cpu_load_avg", load.cpu_load_avg}};
+ }
+
+ /* execute the balancer */
+ Mantle mantle;
+ int ret = mantle.balance(bal_code, mds->get_nodeid(), metrics, state.targets);
+ dout(7) << " mantle decided that new targets=" << state.targets << dendl;
+
+ /* mantle doesn't know about cluster size, so check target len here */
+ if ((int) state.targets.size() != cluster_size)
+ return -CEPHFS_EINVAL;
+ else if (ret)
+ return ret;
+
+ try_rebalance(state);
+ return 0;
+}
+
+
+
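+/* Act on state.targets: hand idle imports back to their original ranks, then
+ * re-export subtrees originally imported from the target, then other imported
+ * subtrees, and finally fragments of the local workload chosen by find_exports.
+ */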
+void MDBalancer::try_rebalance(balance_state_t& state)
+{
+ if (g_conf()->mds_thrash_exports) {
+ dout(5) << "mds_thrash is on; not performing standard rebalance operation!"
+ << dendl;
+ return;
+ }
+
+ // make a sorted list of my imports
+ multimap<double, CDir*> import_pop_map;
+ multimap<mds_rank_t, pair<CDir*, double> > import_from_map;
+
+ for (auto& dir : mds->mdcache->get_fullauth_subtrees()) {
+ CInode *diri = dir->get_inode();
+ if (diri->is_mdsdir())
+ continue;
+ if (diri->get_export_pin(false) != MDS_RANK_NONE)
+ continue;
+ if (dir->is_freezing() || dir->is_frozen())
+      continue; // export probably already in progress
+
+ mds_rank_t from = diri->authority().first;
+ double pop = dir->pop_auth_subtree.meta_load();
+ if (g_conf()->mds_bal_idle_threshold > 0 &&
+ pop < g_conf()->mds_bal_idle_threshold &&
+ diri != mds->mdcache->get_root() &&
+ from != mds->get_nodeid()) {
+ dout(5) << " exporting idle (" << pop << ") import " << *dir
+ << " back to mds." << from << dendl;
+ mds->mdcache->migrator->export_dir_nicely(dir, from);
+ continue;
+ }
+
+ dout(15) << " map: i imported " << *dir << " from " << from << dendl;
+ import_pop_map.insert(make_pair(pop, dir));
+ import_from_map.insert(make_pair(from, make_pair(dir, pop)));
+ }
+
+ // do my exports!
+ map<mds_rank_t, double> export_pop_map;
+
+ for (auto &it : state.targets) {
+ mds_rank_t target = it.first;
+ double amount = it.second;
+
+ if (amount < MIN_OFFLOAD) {
+ continue;
+ }
+ if (amount * 10 * state.targets.size() < target_load) {
+ continue;
+ }
+
+ dout(5) << "want to send " << amount << " to mds." << target
+ //<< " .. " << (*it).second << " * " << load_fac
+ << " -> " << amount
+ << dendl;//" .. fudge is " << fudge << dendl;
+
+ double& have = export_pop_map[target];
+
+ mds->mdcache->show_subtrees();
+
+ // search imports from target
+ if (import_from_map.count(target)) {
+ dout(7) << " aha, looking through imports from target mds." << target << dendl;
+ for (auto p = import_from_map.equal_range(target);
+ p.first != p.second; ) {
+ CDir *dir = p.first->second.first;
+ double pop = p.first->second.second;
+ dout(7) << "considering " << *dir << " from " << (*p.first).first << dendl;
+ auto plast = p.first++;
+
+ if (dir->inode->is_base())
+ continue;
+ ceph_assert(dir->inode->authority().first == target); // cuz that's how i put it in the map, dummy
+
+ if (pop <= amount-have) {
+ dout(7) << "reexporting " << *dir << " pop " << pop
+ << " back to mds." << target << dendl;
+ mds->mdcache->migrator->export_dir_nicely(dir, target);
+ have += pop;
+ import_from_map.erase(plast);
+ for (auto q = import_pop_map.equal_range(pop);
+ q.first != q.second; ) {
+ if (q.first->second == dir) {
+ import_pop_map.erase(q.first);
+ break;
+ }
+ q.first++;
+ }
+ } else {
+ dout(7) << "can't reexport " << *dir << ", too big " << pop << dendl;
+ }
+ if (amount-have < MIN_OFFLOAD)
+ break;
+ }
+ }
+ }
+
+ // any other imports
+ for (auto &it : state.targets) {
+ mds_rank_t target = it.first;
+ double amount = it.second;
+
+ if (!export_pop_map.count(target))
+ continue;
+ double& have = export_pop_map[target];
+ if (amount-have < MIN_OFFLOAD)
+ continue;
+
+ for (auto p = import_pop_map.begin();
+ p != import_pop_map.end(); ) {
+ CDir *dir = p->second;
+ if (dir->inode->is_base()) {
+ ++p;
+ continue;
+ }
+
+ double pop = p->first;
+ if (pop <= amount-have && pop > MIN_REEXPORT) {
+ dout(5) << "reexporting " << *dir << " pop " << pop
+ << " to mds." << target << dendl;
+ have += pop;
+ mds->mdcache->migrator->export_dir_nicely(dir, target);
+ import_pop_map.erase(p++);
+ } else {
+ ++p;
+ }
+ if (amount-have < MIN_OFFLOAD)
+ break;
+ }
+ }
+
+ set<CDir*> already_exporting;
+
+ for (auto &it : state.targets) {
+ mds_rank_t target = it.first;
+ double amount = it.second;
+
+ if (!export_pop_map.count(target))
+ continue;
+ double& have = export_pop_map[target];
+ if (amount-have < MIN_OFFLOAD)
+ continue;
+
+ // okay, search for fragments of my workload
+ std::vector<CDir*> exports;
+
+ for (auto p = import_pop_map.rbegin();
+ p != import_pop_map.rend();
+ ++p) {
+ CDir *dir = p->second;
+ find_exports(dir, amount, &exports, have, already_exporting);
+ if (amount-have < MIN_OFFLOAD)
+ break;
+ }
+ //fudge = amount - have;
+
+ for (const auto& dir : exports) {
+ dout(5) << " - exporting " << dir->pop_auth_subtree
+ << " " << dir->pop_auth_subtree.meta_load()
+ << " to mds." << target << " " << *dir << dendl;
+ mds->mdcache->migrator->export_dir_nicely(dir, target);
+ }
+ }
+
+ dout(7) << "done" << dendl;
+ mds->mdcache->show_subtrees();
+}
+
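+/* Recursively collect dirfrags under dir to export until 'have' is close enough
+ * to 'amount': take a frag whose popularity lands between needmin and needmax if
+ * one exists, otherwise grab mid-sized smaller frags and descend into bigger
+ * frags (unreplicated first, then replicated). Bails out if the balancer has
+ * already been running for more than 0.1s.
+ */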
+void MDBalancer::find_exports(CDir *dir,
+ double amount,
+ std::vector<CDir*>* exports,
+ double& have,
+ set<CDir*>& already_exporting)
+{
+ auto now = clock::now();
+ auto duration = std::chrono::duration<double>(now-rebalance_time).count();
+ if (duration > 0.1) {
+ derr << " balancer runs too long" << dendl_impl;
+ have = amount;
+ return;
+ }
+
+ ceph_assert(dir->is_auth());
+
+ double need = amount - have;
+ if (need < amount * g_conf()->mds_bal_min_start)
+ return; // good enough!
+
+ double needmax = need * g_conf()->mds_bal_need_max;
+ double needmin = need * g_conf()->mds_bal_need_min;
+ double midchunk = need * g_conf()->mds_bal_midchunk;
+ double minchunk = need * g_conf()->mds_bal_minchunk;
+
+ std::vector<CDir*> bigger_rep, bigger_unrep;
+ multimap<double, CDir*> smaller;
+
+ double dir_pop = dir->pop_auth_subtree.meta_load();
+ dout(7) << "in " << dir_pop << " " << *dir << " need " << need << " (" << needmin << " - " << needmax << ")" << dendl;
+
+ double subdir_sum = 0;
+ for (elist<CInode*>::iterator it = dir->pop_lru_subdirs.begin_use_current();
+ !it.end(); ) {
+ CInode *in = *it;
+ ++it;
+
+ ceph_assert(in->is_dir());
+ ceph_assert(in->get_parent_dir() == dir);
+
+ auto&& dfls = in->get_nested_dirfrags();
+
+ size_t num_idle_frags = 0;
+ for (const auto& subdir : dfls) {
+ if (already_exporting.count(subdir))
+ continue;
+
+ // we know all ancestor dirfrags up to subtree root are not freezing or frozen.
+ // It's more efficient to use CDir::is_{freezing,frozen}_tree_root()
+ if (subdir->is_frozen_dir() || subdir->is_frozen_tree_root() ||
+ subdir->is_freezing_dir() || subdir->is_freezing_tree_root())
+ continue; // can't export this right now!
+
+ // how popular?
+ double pop = subdir->pop_auth_subtree.meta_load();
+ subdir_sum += pop;
+ dout(15) << " subdir pop " << pop << " " << *subdir << dendl;
+
+ if (pop < minchunk) {
+ num_idle_frags++;
+ continue;
+ }
+
+ // lucky find?
+ if (pop > needmin && pop < needmax) {
+ exports->push_back(subdir);
+ already_exporting.insert(subdir);
+ have += pop;
+ return;
+ }
+
+ if (pop > need) {
+ if (subdir->is_rep())
+ bigger_rep.push_back(subdir);
+ else
+ bigger_unrep.push_back(subdir);
+ } else
+ smaller.insert(pair<double,CDir*>(pop, subdir));
+ }
+ if (dfls.size() == num_idle_frags)
+ in->item_pop_lru.remove_myself();
+ }
+ dout(15) << " sum " << subdir_sum << " / " << dir_pop << dendl;
+
+ // grab some sufficiently big small items
+ multimap<double,CDir*>::reverse_iterator it;
+ for (it = smaller.rbegin();
+ it != smaller.rend();
+ ++it) {
+
+ if ((*it).first < midchunk)
+ break; // try later
+
+ dout(7) << " taking smaller " << *(*it).second << dendl;
+
+ exports->push_back((*it).second);
+ already_exporting.insert((*it).second);
+ have += (*it).first;
+ if (have > needmin)
+ return;
+ }
+
+  // apparently not enough; drill deeper into the hierarchy (if non-replicated)
+ for (const auto& dir : bigger_unrep) {
+ dout(15) << " descending into " << *dir << dendl;
+ find_exports(dir, amount, exports, have, already_exporting);
+ if (have > needmin)
+ return;
+ }
+
+ // ok fine, use smaller bits
+ for (;
+ it != smaller.rend();
+ ++it) {
+ dout(7) << " taking (much) smaller " << it->first << " " << *(*it).second << dendl;
+
+ exports->push_back((*it).second);
+ already_exporting.insert((*it).second);
+ have += (*it).first;
+ if (have > needmin)
+ return;
+ }
+
+ // ok fine, drill into replicated dirs
+ for (const auto& dir : bigger_rep) {
+ dout(7) << " descending into replicated " << *dir << dendl;
+ find_exports(dir, amount, exports, have, already_exporting);
+ if (have > needmin)
+ return;
+ }
+}
+
+void MDBalancer::hit_inode(CInode *in, int type)
+{
+ // hit inode
+ in->pop.get(type).hit();
+
+ if (in->get_parent_dn())
+ hit_dir(in->get_parent_dn()->get_dir(), type);
+}
+
+void MDBalancer::maybe_fragment(CDir *dir, bool hot)
+{
+ // split/merge
+ if (bal_fragment_dirs && bal_fragment_interval > 0 &&
+ dir->is_auth() &&
+ !dir->inode->is_base() && // not root/mdsdir (for now at least)
+ !dir->inode->is_stray()) { // not straydir
+
+ // split
+ if (dir->should_split() || hot) {
+ if (split_pending.count(dir->dirfrag()) == 0) {
+ queue_split(dir, false);
+ } else {
+ if (dir->should_split_fast()) {
+ queue_split(dir, true);
+ } else {
+ dout(10) << ": fragment already enqueued to split: "
+ << *dir << dendl;
+ }
+ }
+ }
+
+ // merge?
+ if (dir->should_merge() && merge_pending.count(dir->dirfrag()) == 0) {
+ queue_merge(dir);
+ }
+ }
+}
+
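+/* Record a popularity hit on dir: possibly queue a split or merge, replicate
+ * (or unreplicate) a hot read-mostly directory, and propagate the hit up the
+ * nested and auth-subtree counters of every ancestor.
+ */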
+void MDBalancer::hit_dir(CDir *dir, int type, double amount)
+{
+ if (dir->inode->is_stray())
+ return;
+ // hit me
+ double v = dir->pop_me.get(type).hit(amount);
+
+ const bool hot = (v > g_conf()->mds_bal_split_rd && type == META_POP_IRD) ||
+ (v > g_conf()->mds_bal_split_wr && type == META_POP_IWR);
+
+ dout(20) << type << " pop is " << v << ", frag " << dir->get_frag()
+ << " size " << dir->get_frag_size() << " " << dir->pop_me << dendl;
+
+ maybe_fragment(dir, hot);
+
+ // replicate?
+ const bool readop = (type == META_POP_IRD || type == META_POP_READDIR);
+ double rd_adj = 0.0;
+ if (readop && dir->last_popularity_sample < last_sample) {
+ double dir_pop = dir->pop_auth_subtree.get(type).get(); // hmm??
+ dir_pop += v * 10;
+ dir->last_popularity_sample = last_sample;
+
+ dout(20) << type << " pop " << dir_pop << " spread in " << *dir << dendl;
+ if (dir->is_auth() && !dir->is_ambiguous_auth() && dir->can_rep()) {
+ if (dir_pop >= g_conf()->mds_bal_replicate_threshold) {
+ // replicate
+ double rdp = dir->pop_me.get(META_POP_IRD).get();
+ rd_adj = rdp / mds->get_mds_map()->get_num_in_mds() - rdp;
+ rd_adj /= 2.0; // temper somewhat
+
+ dout(5) << "replicating dir " << *dir << " pop " << dir_pop << " .. rdp " << rdp << " adj " << rd_adj << dendl;
+
+ dir->dir_rep = CDir::REP_ALL;
+ mds->mdcache->send_dir_updates(dir, true);
+
+ // fixme this should adjust the whole pop hierarchy
+ dir->pop_me.get(META_POP_IRD).adjust(rd_adj);
+ dir->pop_auth_subtree.get(META_POP_IRD).adjust(rd_adj);
+ }
+
+ if (dir->ino() != 1 &&
+ dir->is_rep() &&
+ dir_pop < g_conf()->mds_bal_unreplicate_threshold) {
+ // unreplicate
+ dout(5) << "unreplicating dir " << *dir << " pop " << dir_pop << dendl;
+
+ dir->dir_rep = CDir::REP_NONE;
+ mds->mdcache->send_dir_updates(dir);
+ }
+ }
+ }
+
+ // adjust ancestors
+ bool hit_subtree = dir->is_auth(); // current auth subtree (if any)
+ bool hit_subtree_nested = dir->is_auth(); // all nested auth subtrees
+
+ while (true) {
+ CDir *pdir = dir->inode->get_parent_dir();
+ dir->pop_nested.get(type).hit(amount);
+ if (rd_adj != 0.0)
+ dir->pop_nested.get(META_POP_IRD).adjust(rd_adj);
+
+ if (hit_subtree) {
+ dir->pop_auth_subtree.get(type).hit(amount);
+
+ if (rd_adj != 0.0)
+ dir->pop_auth_subtree.get(META_POP_IRD).adjust(rd_adj);
+
+ if (dir->is_subtree_root())
+ hit_subtree = false; // end of auth domain, stop hitting auth counters.
+ else if (pdir)
+ pdir->pop_lru_subdirs.push_front(&dir->get_inode()->item_pop_lru);
+ }
+
+ if (hit_subtree_nested) {
+ dir->pop_auth_subtree_nested.get(type).hit(amount);
+ if (rd_adj != 0.0)
+ dir->pop_auth_subtree_nested.get(META_POP_IRD).adjust(rd_adj);
+ }
+ if (!pdir) break;
+ dir = pdir;
+ }
+}
+
+
+/*
+ * subtract off an exported chunk.
+ * this excludes *dir itself (encode_export_dir should have taken care of that)
+ * we _just_ do the parents' nested counters.
+ *
+ * NOTE: call me _after_ forcing *dir into a subtree root,
+ * but _before_ doing the encode_export_dirs.
+ */
+void MDBalancer::subtract_export(CDir *dir)
+{
+ dirfrag_load_vec_t subload = dir->pop_auth_subtree;
+
+ while (true) {
+ dir = dir->inode->get_parent_dir();
+ if (!dir) break;
+
+ dir->pop_nested.sub(subload);
+ dir->pop_auth_subtree_nested.sub(subload);
+ }
+}
+
+
+void MDBalancer::add_import(CDir *dir)
+{
+ dirfrag_load_vec_t subload = dir->pop_auth_subtree;
+
+ while (true) {
+ dir = dir->inode->get_parent_dir();
+ if (!dir) break;
+
+ dir->pop_nested.add(subload);
+ dir->pop_auth_subtree_nested.add(subload);
+ }
+}
+
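+/* After a rename, add (inc=true) or subtract (inc=false) dir's popularity to or
+ * from pdir and its ancestors, updating nested counters all the way up and
+ * auth-subtree counters up to the enclosing subtree root.
+ */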
+void MDBalancer::adjust_pop_for_rename(CDir *pdir, CDir *dir, bool inc)
+{
+ bool adjust_subtree_nest = dir->is_auth();
+ bool adjust_subtree = adjust_subtree_nest && !dir->is_subtree_root();
+ CDir *cur = dir;
+ while (true) {
+ if (inc) {
+ pdir->pop_nested.add(dir->pop_nested);
+ if (adjust_subtree) {
+ pdir->pop_auth_subtree.add(dir->pop_auth_subtree);
+ pdir->pop_lru_subdirs.push_front(&cur->get_inode()->item_pop_lru);
+ }
+
+ if (adjust_subtree_nest)
+ pdir->pop_auth_subtree_nested.add(dir->pop_auth_subtree_nested);
+ } else {
+ pdir->pop_nested.sub(dir->pop_nested);
+ if (adjust_subtree)
+ pdir->pop_auth_subtree.sub(dir->pop_auth_subtree);
+
+ if (adjust_subtree_nest)
+ pdir->pop_auth_subtree_nested.sub(dir->pop_auth_subtree_nested);
+ }
+
+ if (pdir->is_subtree_root())
+ adjust_subtree = false;
+ cur = pdir;
+ pdir = pdir->inode->get_parent_dir();
+ if (!pdir) break;
+ }
+}
+
+void MDBalancer::handle_mds_failure(mds_rank_t who)
+{
+ if (0 == who) {
+ mds_last_epoch_under_map.clear();
+ }
+}
+
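+/* Dump dirfrag popularity breadth-first from the root (down to 'depth' levels),
+ * plus this rank's view of per-rank mds_load, mds_meta_load and the import
+ * maps, to the given Formatter.
+ */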
+int MDBalancer::dump_loads(Formatter *f, int64_t depth) const
+{
+ std::deque<pair<CDir*, int>> dfs;
+ std::deque<CDir*> dfs_root;
+ if (mds->mdcache->get_root()) {
+ mds->mdcache->get_root()->get_dirfrags(dfs_root);
+ while (!dfs_root.empty()) {
+ CDir *dir = dfs_root.front();
+ dfs_root.pop_front();
+ dfs.push_back(make_pair(dir, 0));
+ }
+ } else {
+ dout(10) << "no root" << dendl;
+ }
+
+ f->open_object_section("loads");
+
+ f->open_array_section("dirfrags");
+ while (!dfs.empty()) {
+ auto [dir, cur_depth] = dfs.front();
+ dfs.pop_front();
+
+ f->open_object_section("dir");
+ dir->dump_load(f);
+ f->close_section();
+
+ //limit output dirfrags depth
+ if (depth >= 0 && (cur_depth + 1) > depth) {
+ continue;
+ }
+
+ for (auto it = dir->begin(); it != dir->end(); ++it) {
+ CInode *in = it->second->get_linkage()->get_inode();
+ if (!in || !in->is_dir())
+ continue;
+
+ auto&& ls = in->get_dirfrags();
+ for (const auto& subdir : ls) {
+
+ if (subdir->pop_nested.meta_load() < .001)
+ continue;
+ dfs.push_back(make_pair(subdir, cur_depth+1));
+ }
+ }
+ }
+ f->close_section(); // dirfrags array
+
+ f->open_object_section("mds_load");
+ {
+
+ auto dump_mds_load = [f](const mds_load_t& load) {
+ f->dump_float("request_rate", load.req_rate);
+ f->dump_float("cache_hit_rate", load.cache_hit_rate);
+ f->dump_float("queue_length", load.queue_len);
+ f->dump_float("cpu_load", load.cpu_load_avg);
+ f->dump_float("mds_load", load.mds_load());
+
+ f->open_object_section("auth_dirfrags");
+ load.auth.dump(f);
+ f->close_section();
+ f->open_object_section("all_dirfrags");
+ load.all.dump(f);
+ f->close_section();
+ };
+
+ for (const auto& [rank, load] : mds_load) {
+ CachedStackStringStream css;
+ *css << "mds." << rank;
+ f->open_object_section(css->strv());
+ dump_mds_load(load);
+ f->close_section();
+ }
+ }
+ f->close_section(); // mds_load
+
+ f->open_object_section("mds_meta_load");
+ for (auto& [rank, mload] : mds_meta_load) {
+ CachedStackStringStream css;
+ *css << "mds." << rank;
+ f->dump_float(css->strv(), mload);
+ }
+ f->close_section(); // mds_meta_load
+
+ f->open_object_section("mds_import_map");
+ for (auto& [rank, imports] : mds_import_map) {
+ {
+ CachedStackStringStream css;
+ *css << "mds." << rank;
+ f->open_array_section(css->strv());
+ }
+ for (auto& [rank_from, mload] : imports) {
+ f->open_object_section("from");
+ CachedStackStringStream css;
+ *css << "mds." << rank_from;
+ f->dump_float(css->strv(), mload);
+ f->close_section();
+ }
+ f->close_section(); // mds.? array
+ }
+ f->close_section(); // mds_import_map
+
+ f->close_section(); // loads
+ return 0;
+}