From 19fcec84d8d7d21e796c7624e521b60d28ee21ed Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sun, 7 Apr 2024 20:45:59 +0200 Subject: Adding upstream version 16.2.11+ds. Signed-off-by: Daniel Baumann --- src/mds/MDSMap.cc | 1146 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 1146 insertions(+) create mode 100644 src/mds/MDSMap.cc (limited to 'src/mds/MDSMap.cc') diff --git a/src/mds/MDSMap.cc b/src/mds/MDSMap.cc new file mode 100644 index 000000000..f611d86a5 --- /dev/null +++ b/src/mds/MDSMap.cc @@ -0,0 +1,1146 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include + +#include "common/debug.h" +#include "mon/health_check.h" + +#include "MDSMap.h" + +using std::dec; +using std::hex; +using std::list; +using std::make_pair; +using std::map; +using std::multimap; +using std::ostream; +using std::pair; +using std::string; +using std::set; + +using ceph::bufferlist; +using ceph::Formatter; + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_ + +// features +CompatSet MDSMap::get_compat_set_all() { + CompatSet::FeatureSet feature_compat; + CompatSet::FeatureSet feature_ro_compat; + CompatSet::FeatureSet feature_incompat; + feature_incompat.insert(MDS_FEATURE_INCOMPAT_BASE); + feature_incompat.insert(MDS_FEATURE_INCOMPAT_CLIENTRANGES); + feature_incompat.insert(MDS_FEATURE_INCOMPAT_FILELAYOUT); + feature_incompat.insert(MDS_FEATURE_INCOMPAT_DIRINODE); + feature_incompat.insert(MDS_FEATURE_INCOMPAT_ENCODING); + feature_incompat.insert(MDS_FEATURE_INCOMPAT_OMAPDIRFRAG); + feature_incompat.insert(MDS_FEATURE_INCOMPAT_INLINE); + feature_incompat.insert(MDS_FEATURE_INCOMPAT_NOANCHOR); + feature_incompat.insert(MDS_FEATURE_INCOMPAT_FILE_LAYOUT_V2); + feature_incompat.insert(MDS_FEATURE_INCOMPAT_SNAPREALM_V2); + + return CompatSet(feature_compat, feature_ro_compat, feature_incompat); +} + +CompatSet MDSMap::get_compat_set_default() { + CompatSet::FeatureSet feature_compat; + CompatSet::FeatureSet feature_ro_compat; + CompatSet::FeatureSet feature_incompat; + feature_incompat.insert(MDS_FEATURE_INCOMPAT_BASE); + feature_incompat.insert(MDS_FEATURE_INCOMPAT_CLIENTRANGES); + feature_incompat.insert(MDS_FEATURE_INCOMPAT_FILELAYOUT); + feature_incompat.insert(MDS_FEATURE_INCOMPAT_DIRINODE); + feature_incompat.insert(MDS_FEATURE_INCOMPAT_ENCODING); + feature_incompat.insert(MDS_FEATURE_INCOMPAT_OMAPDIRFRAG); + feature_incompat.insert(MDS_FEATURE_INCOMPAT_NOANCHOR); + feature_incompat.insert(MDS_FEATURE_INCOMPAT_FILE_LAYOUT_V2); + feature_incompat.insert(MDS_FEATURE_INCOMPAT_SNAPREALM_V2); + + return CompatSet(feature_compat, feature_ro_compat, feature_incompat); +} + +// base (pre v0.20) +CompatSet MDSMap::get_compat_set_base() { + CompatSet::FeatureSet feature_compat_base; + CompatSet::FeatureSet feature_incompat_base; + feature_incompat_base.insert(MDS_FEATURE_INCOMPAT_BASE); + CompatSet::FeatureSet feature_ro_compat_base; + + return CompatSet(feature_compat_base, feature_ro_compat_base, feature_incompat_base); +} + +// pre-v16.2.5 CompatSet in MDS beacon +CompatSet MDSMap::get_compat_set_v16_2_4() { + CompatSet::FeatureSet feature_compat; + CompatSet::FeatureSet feature_ro_compat; + CompatSet::FeatureSet feature_incompat; + feature_incompat.insert(MDS_FEATURE_INCOMPAT_BASE); + feature_incompat.insert(MDS_FEATURE_INCOMPAT_CLIENTRANGES); + feature_incompat.insert(MDS_FEATURE_INCOMPAT_FILELAYOUT); + feature_incompat.insert(MDS_FEATURE_INCOMPAT_DIRINODE); + feature_incompat.insert(MDS_FEATURE_INCOMPAT_ENCODING); + feature_incompat.insert(MDS_FEATURE_INCOMPAT_OMAPDIRFRAG); + feature_incompat.insert(MDS_FEATURE_INCOMPAT_INLINE); + feature_incompat.insert(MDS_FEATURE_INCOMPAT_NOANCHOR); + feature_incompat.insert(MDS_FEATURE_INCOMPAT_FILE_LAYOUT_V2); + feature_incompat.insert(MDS_FEATURE_INCOMPAT_SNAPREALM_V2); + return CompatSet(feature_compat, feature_ro_compat, feature_incompat); +} + +void MDSMap::mds_info_t::dump(Formatter *f) const +{ + f->dump_unsigned("gid", global_id); + f->dump_string("name", name); + f->dump_int("rank", rank); + f->dump_int("incarnation", inc); + f->dump_stream("state") << ceph_mds_state_name(state); + f->dump_int("state_seq", state_seq); + f->dump_stream("addr") << addrs.get_legacy_str(); + f->dump_object("addrs", addrs); + f->dump_int("join_fscid", join_fscid); + if (laggy_since != utime_t()) + f->dump_stream("laggy_since") << laggy_since; + + f->open_array_section("export_targets"); + for (set::iterator p = export_targets.begin(); + p != export_targets.end(); ++p) { + f->dump_int("mds", *p); + } + f->close_section(); + f->dump_unsigned("features", mds_features); + f->dump_unsigned("flags", flags); + f->dump_object("compat", compat); +} + +void MDSMap::mds_info_t::dump(std::ostream& o) const +{ + o << "[mds." << name << "{" << rank << ":" << global_id << "}" + << " state " << ceph_mds_state_name(state) + << " seq " << state_seq; + if (laggy()) { + o << " laggy since " << laggy_since; + } + if (!export_targets.empty()) { + o << " export targets " << export_targets; + } + if (is_frozen()) { + o << " frozen"; + } + if (join_fscid != FS_CLUSTER_ID_NONE) { + o << " join_fscid=" << join_fscid; + } + o << " addr " << addrs; + o << " compat "; + compat.printlite(o); + o << "]"; +} + +void MDSMap::mds_info_t::generate_test_instances(std::list& ls) +{ + mds_info_t *sample = new mds_info_t(); + ls.push_back(sample); + sample = new mds_info_t(); + sample->global_id = 1; + sample->name = "test_instance"; + sample->rank = 0; + ls.push_back(sample); +} + +void MDSMap::dump(Formatter *f) const +{ + f->dump_int("epoch", epoch); + f->dump_unsigned("flags", flags); + f->dump_unsigned("ever_allowed_features", ever_allowed_features); + f->dump_unsigned("explicitly_allowed_features", explicitly_allowed_features); + f->dump_stream("created") << created; + f->dump_stream("modified") << modified; + f->dump_int("tableserver", tableserver); + f->dump_int("root", root); + f->dump_int("session_timeout", session_timeout); + f->dump_int("session_autoclose", session_autoclose); + f->open_object_section("required_client_features"); + cephfs_dump_features(f, required_client_features); + f->close_section(); + f->dump_int("max_file_size", max_file_size); + f->dump_int("last_failure", last_failure); + f->dump_int("last_failure_osd_epoch", last_failure_osd_epoch); + f->open_object_section("compat"); + compat.dump(f); + f->close_section(); + f->dump_int("max_mds", max_mds); + f->open_array_section("in"); + for (set::const_iterator p = in.begin(); p != in.end(); ++p) + f->dump_int("mds", *p); + f->close_section(); + f->open_object_section("up"); + for (map::const_iterator p = up.begin(); p != up.end(); ++p) { + char s[14]; + sprintf(s, "mds_%d", int(p->first)); + f->dump_int(s, p->second); + } + f->close_section(); + f->open_array_section("failed"); + for (set::const_iterator p = failed.begin(); p != failed.end(); ++p) + f->dump_int("mds", *p); + f->close_section(); + f->open_array_section("damaged"); + for (set::const_iterator p = damaged.begin(); p != damaged.end(); ++p) + f->dump_int("mds", *p); + f->close_section(); + f->open_array_section("stopped"); + for (set::const_iterator p = stopped.begin(); p != stopped.end(); ++p) + f->dump_int("mds", *p); + f->close_section(); + f->open_object_section("info"); + for (const auto& [gid, info] : mds_info) { + char s[25]; // 'gid_' + len(str(ULLONG_MAX)) + '\0' + sprintf(s, "gid_%llu", (long long unsigned)gid); + f->open_object_section(s); + info.dump(f); + f->close_section(); + } + f->close_section(); + f->open_array_section("data_pools"); + for (const auto& p: data_pools) + f->dump_int("pool", p); + f->close_section(); + f->dump_int("metadata_pool", metadata_pool); + f->dump_bool("enabled", enabled); + f->dump_string("fs_name", fs_name); + f->dump_string("balancer", balancer); + f->dump_int("standby_count_wanted", std::max(0, standby_count_wanted)); +} + +void MDSMap::generate_test_instances(std::list& ls) +{ + MDSMap *m = new MDSMap(); + m->max_mds = 1; + m->data_pools.push_back(0); + m->metadata_pool = 1; + m->cas_pool = 2; + m->compat = get_compat_set_all(); + + // these aren't the defaults, just in case anybody gets confused + m->session_timeout = 61; + m->session_autoclose = 301; + m->max_file_size = 1<<24; + ls.push_back(m); +} + +void MDSMap::print(ostream& out) const +{ + out << "fs_name\t" << fs_name << "\n"; + out << "epoch\t" << epoch << "\n"; + out << "flags\t" << hex << flags << dec << "\n"; + out << "created\t" << created << "\n"; + out << "modified\t" << modified << "\n"; + out << "tableserver\t" << tableserver << "\n"; + out << "root\t" << root << "\n"; + out << "session_timeout\t" << session_timeout << "\n" + << "session_autoclose\t" << session_autoclose << "\n"; + out << "max_file_size\t" << max_file_size << "\n"; + out << "required_client_features\t" << cephfs_stringify_features(required_client_features) << "\n"; + out << "last_failure\t" << last_failure << "\n" + << "last_failure_osd_epoch\t" << last_failure_osd_epoch << "\n"; + out << "compat\t" << compat << "\n"; + out << "max_mds\t" << max_mds << "\n"; + out << "in\t" << in << "\n" + << "up\t" << up << "\n" + << "failed\t" << failed << "\n" + << "damaged\t" << damaged << "\n" + << "stopped\t" << stopped << "\n"; + out << "data_pools\t" << data_pools << "\n"; + out << "metadata_pool\t" << metadata_pool << "\n"; + out << "inline_data\t" << (inline_data_enabled ? "enabled" : "disabled") << "\n"; + out << "balancer\t" << balancer << "\n"; + out << "standby_count_wanted\t" << std::max(0, standby_count_wanted) << "\n"; + + multimap< pair, mds_gid_t > foo; + for (const auto &p : mds_info) { + foo.insert(std::make_pair( + std::make_pair(p.second.rank, p.second.inc-1), p.first)); + } + + for (const auto &p : foo) { + out << mds_info.at(p.second) << "\n"; + } +} + +void MDSMap::print_summary(Formatter *f, ostream *out) const +{ + map by_rank; + map by_state; + + if (f) { + f->dump_unsigned("epoch", get_epoch()); + f->dump_unsigned("up", up.size()); + f->dump_unsigned("in", in.size()); + f->dump_unsigned("max", max_mds); + } else { + *out << "e" << get_epoch() << ": " << up.size() << "/" << in.size() << "/" << max_mds << " up"; + } + + if (f) + f->open_array_section("by_rank"); + for (const auto &p : mds_info) { + string s = ceph_mds_state_name(p.second.state); + if (p.second.laggy()) + s += "(laggy or crashed)"; + + if (p.second.rank >= 0 && p.second.state != MDSMap::STATE_STANDBY_REPLAY) { + if (f) { + f->open_object_section("mds"); + f->dump_unsigned("rank", p.second.rank); + f->dump_string("name", p.second.name); + f->dump_string("status", s); + f->close_section(); + } else { + by_rank[p.second.rank] = p.second.name + "=" + s; + } + } else { + by_state[s]++; + } + } + if (f) { + f->close_section(); + } else { + if (!by_rank.empty()) + *out << " " << by_rank; + } + + for (map::reverse_iterator p = by_state.rbegin(); p != by_state.rend(); ++p) { + if (f) { + f->dump_unsigned(p->first.c_str(), p->second); + } else { + *out << ", " << p->second << " " << p->first; + } + } + + if (!failed.empty()) { + if (f) { + f->dump_unsigned("failed", failed.size()); + } else { + *out << ", " << failed.size() << " failed"; + } + } + + if (!damaged.empty()) { + if (f) { + f->dump_unsigned("damaged", damaged.size()); + } else { + *out << ", " << damaged.size() << " damaged"; + } + } + //if (stopped.size()) + //out << ", " << stopped.size() << " stopped"; +} + +void MDSMap::get_health(list >& summary, + list > *detail) const +{ + if (!failed.empty()) { + CachedStackStringStream css; + *css << "mds rank" + << ((failed.size() > 1) ? "s ":" ") + << failed + << ((failed.size() > 1) ? " have":" has") + << " failed"; + summary.push_back(make_pair(HEALTH_ERR, css->str())); + if (detail) { + for (const auto& r : failed) { + CachedStackStringStream css; + *css << "mds." << r << " has failed"; + detail->push_back(make_pair(HEALTH_ERR, css->str())); + } + } + } + + if (!damaged.empty()) { + CachedStackStringStream css; + *css << "mds rank" + << ((damaged.size() > 1) ? "s ":" ") + << damaged + << ((damaged.size() > 1) ? " are":" is") + << " damaged"; + summary.push_back(make_pair(HEALTH_ERR, css->str())); + if (detail) { + for (const auto& r : damaged) { + CachedStackStringStream css; + *css << "mds." << r << " is damaged"; + detail->push_back(make_pair(HEALTH_ERR, css->str())); + } + } + } + + if (is_degraded()) { + summary.push_back(make_pair(HEALTH_WARN, "mds cluster is degraded")); + if (detail) { + detail->push_back(make_pair(HEALTH_WARN, "mds cluster is degraded")); + for (mds_rank_t i = mds_rank_t(0); i< get_max_mds(); i++) { + if (!is_up(i)) + continue; + mds_gid_t gid = up.find(i)->second; + const auto& info = mds_info.at(gid); + CachedStackStringStream css; + if (is_resolve(i)) + *css << "mds." << info.name << " at " << info.addrs + << " rank " << i << " is resolving"; + if (is_replay(i)) + *css << "mds." << info.name << " at " << info.addrs + << " rank " << i << " is replaying journal"; + if (is_rejoin(i)) + *css << "mds." << info.name << " at " << info.addrs + << " rank " << i << " is rejoining"; + if (is_reconnect(i)) + *css << "mds." << info.name << " at " << info.addrs + << " rank " << i << " is reconnecting to clients"; + if (css->strv().length()) + detail->push_back(make_pair(HEALTH_WARN, css->str())); + } + } + } + + { + CachedStackStringStream css; + *css << fs_name << " max_mds " << max_mds; + summary.push_back(make_pair(HEALTH_WARN, css->str())); + } + + if ((mds_rank_t)up.size() < max_mds) { + CachedStackStringStream css; + *css << fs_name << " has " << up.size() + << " active MDS(s), but has max_mds of " << max_mds; + summary.push_back(make_pair(HEALTH_WARN, css->str())); + } + + set laggy; + for (const auto &u : up) { + const auto& info = mds_info.at(u.second); + if (info.laggy()) { + laggy.insert(info.name); + if (detail) { + CachedStackStringStream css; + *css << "mds." << info.name << " at " << info.addrs + << " is laggy/unresponsive"; + detail->push_back(make_pair(HEALTH_WARN, css->str())); + } + } + } + + if (!laggy.empty()) { + CachedStackStringStream css; + *css << "mds " << laggy + << ((laggy.size() > 1) ? " are":" is") + << " laggy"; + summary.push_back(make_pair(HEALTH_WARN, css->str())); + } + + if (get_max_mds() > 1 && + was_snaps_ever_allowed() && !allows_multimds_snaps()) { + CachedStackStringStream css; + *css << "multi-active mds while there are snapshots possibly created by pre-mimic MDS"; + summary.push_back(make_pair(HEALTH_WARN, css->str())); + } +} + +void MDSMap::get_health_checks(health_check_map_t *checks) const +{ + // MDS_DAMAGE + if (!damaged.empty()) { + health_check_t& check = checks->get_or_add("MDS_DAMAGE", HEALTH_ERR, + "%num% mds daemon%plurals% damaged", + damaged.size()); + for (const auto& p : damaged) { + CachedStackStringStream css; + *css << "fs " << fs_name << " mds." << p << " is damaged"; + check.detail.push_back(css->str()); + } + } + + // FS_DEGRADED + if (is_degraded()) { + health_check_t& fscheck = checks->get_or_add( + "FS_DEGRADED", HEALTH_WARN, + "%num% filesystem%plurals% %isorare% degraded", 1); + CachedStackStringStream css; + *css << "fs " << fs_name << " is degraded"; + fscheck.detail.push_back(css->str()); + + list detail; + for (mds_rank_t i = mds_rank_t(0); i< get_max_mds(); i++) { + if (!is_up(i)) + continue; + mds_gid_t gid = up.find(i)->second; + const auto& info = mds_info.at(gid); + CachedStackStringStream css; + *css << "fs " << fs_name << " mds." << info.name << " at " + << info.addrs << " rank " << i; + if (is_resolve(i)) + *css << " is resolving"; + if (is_replay(i)) + *css << " is replaying journal"; + if (is_rejoin(i)) + *css << " is rejoining"; + if (is_reconnect(i)) + *css << " is reconnecting to clients"; + if (css->strv().length()) + detail.push_back(css->str()); + } + } + + // MDS_UP_LESS_THAN_MAX + if ((mds_rank_t)get_num_in_mds() < get_max_mds()) { + health_check_t& check = checks->add( + "MDS_UP_LESS_THAN_MAX", HEALTH_WARN, + "%num% filesystem%plurals% %isorare% online with fewer MDS than max_mds", 1); + CachedStackStringStream css; + *css << "fs " << fs_name << " has " << get_num_in_mds() + << " MDS online, but wants " << get_max_mds(); + check.detail.push_back(css->str()); + } + + // MDS_ALL_DOWN + if ((mds_rank_t)get_num_up_mds() == 0 && get_max_mds() > 0) { + health_check_t &check = checks->add( + "MDS_ALL_DOWN", HEALTH_ERR, + "%num% filesystem%plurals% %isorare% offline", 1); + CachedStackStringStream css; + *css << "fs " << fs_name << " is offline because no MDS is active for it."; + check.detail.push_back(css->str()); + } + + if (get_max_mds() > 1 && + was_snaps_ever_allowed() && !allows_multimds_snaps()) { + health_check_t &check = checks->add( + "MULTIMDS_WITH_OLDSNAPS", HEALTH_ERR, + "%num% filesystem%plurals% %isorare% multi-active mds with old snapshots", 1); + CachedStackStringStream css; + *css << "multi-active mds while there are snapshots possibly created by pre-mimic MDS"; + check.detail.push_back(css->str()); + } + + if (get_inline_data_enabled()) { + health_check_t &check = checks->add( + "FS_INLINE_DATA_DEPRECATED", HEALTH_WARN, + "%num% filesystem%plurals% with deprecated feature inline_data", 1); + CachedStackStringStream css; + *css << "fs " << fs_name << " has deprecated feature inline_data enabled."; + check.detail.push_back(css->str()); + } +} + +void MDSMap::mds_info_t::encode_versioned(bufferlist& bl, uint64_t features) const +{ + __u8 v = 10; + if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) { + v = 7; + } + ENCODE_START(v, 4, bl); + encode(global_id, bl); + encode(name, bl); + encode(rank, bl); + encode(inc, bl); + encode((int32_t)state, bl); + encode(state_seq, bl); + if (v < 8) { + encode(addrs.legacy_addr(), bl, features); + } else { + encode(addrs, bl, features); + } + encode(laggy_since, bl); + encode(MDS_RANK_NONE, bl); /* standby_for_rank */ + encode(std::string(), bl); /* standby_for_name */ + encode(export_targets, bl); + encode(mds_features, bl); + encode(join_fscid, bl); /* formerly: standby_for_fscid */ + encode(false, bl); + if (v >= 9) { + encode(flags, bl); + } + if (v >= 10) { + encode(compat, bl); + } + ENCODE_FINISH(bl); +} + +void MDSMap::mds_info_t::encode_unversioned(bufferlist& bl) const +{ + __u8 struct_v = 3; + using ceph::encode; + encode(struct_v, bl); + encode(global_id, bl); + encode(name, bl); + encode(rank, bl); + encode(inc, bl); + encode((int32_t)state, bl); + encode(state_seq, bl); + encode(addrs.legacy_addr(), bl, 0); + encode(laggy_since, bl); + encode(MDS_RANK_NONE, bl); + encode(std::string(), bl); + encode(export_targets, bl); +} + +void MDSMap::mds_info_t::decode(bufferlist::const_iterator& bl) +{ + DECODE_START_LEGACY_COMPAT_LEN(10, 4, 4, bl); + decode(global_id, bl); + decode(name, bl); + decode(rank, bl); + decode(inc, bl); + int32_t raw_state; + decode(raw_state, bl); + state = (MDSMap::DaemonState)raw_state; + decode(state_seq, bl); + decode(addrs, bl); + decode(laggy_since, bl); + { + mds_rank_t standby_for_rank; + decode(standby_for_rank, bl); + } + { + std::string standby_for_name; + decode(standby_for_name, bl); + } + if (struct_v >= 2) + decode(export_targets, bl); + if (struct_v >= 5) + decode(mds_features, bl); + if (struct_v >= 6) { + decode(join_fscid, bl); + } + if (struct_v >= 7) { + bool standby_replay; + decode(standby_replay, bl); + } + if (struct_v >= 9) { + decode(flags, bl); + } + if (struct_v >= 10) { + decode(compat, bl); + } else { + compat = MDSMap::get_compat_set_v16_2_4(); + } + DECODE_FINISH(bl); +} + +std::string MDSMap::mds_info_t::human_name() const +{ + // Like "daemon mds.myhost restarted", "Activating daemon mds.myhost" + CachedStackStringStream css; + *css << "daemon mds." << name; + return css->str(); +} + +void MDSMap::encode(bufferlist& bl, uint64_t features) const +{ + std::map inc; // Legacy field, fake it so that + // old-mon peers have something sane + // during upgrade + for (const auto rank : in) { + inc.insert(std::make_pair(rank, epoch)); + } + + using ceph::encode; + if ((features & CEPH_FEATURE_PGID64) == 0) { + __u16 v = 2; + encode(v, bl); + encode(epoch, bl); + encode(flags, bl); + encode(last_failure, bl); + encode(root, bl); + encode(session_timeout, bl); + encode(session_autoclose, bl); + encode(max_file_size, bl); + encode(max_mds, bl); + __u32 n = mds_info.size(); + encode(n, bl); + for (map::const_iterator i = mds_info.begin(); + i != mds_info.end(); ++i) { + encode(i->first, bl); + encode(i->second, bl, features); + } + n = data_pools.size(); + encode(n, bl); + for (const auto p: data_pools) { + n = p; + encode(n, bl); + } + + int32_t m = cas_pool; + encode(m, bl); + return; + } else if ((features & CEPH_FEATURE_MDSENC) == 0) { + __u16 v = 3; + encode(v, bl); + encode(epoch, bl); + encode(flags, bl); + encode(last_failure, bl); + encode(root, bl); + encode(session_timeout, bl); + encode(session_autoclose, bl); + encode(max_file_size, bl); + encode(max_mds, bl); + __u32 n = mds_info.size(); + encode(n, bl); + for (map::const_iterator i = mds_info.begin(); + i != mds_info.end(); ++i) { + encode(i->first, bl); + encode(i->second, bl, features); + } + encode(data_pools, bl); + encode(cas_pool, bl); + + __u16 ev = 5; + encode(ev, bl); + encode(compat, bl); + encode(metadata_pool, bl); + encode(created, bl); + encode(modified, bl); + encode(tableserver, bl); + encode(in, bl); + encode(inc, bl); + encode(up, bl); + encode(failed, bl); + encode(stopped, bl); + encode(last_failure_osd_epoch, bl); + return; + } + + ENCODE_START(5, 4, bl); + encode(epoch, bl); + encode(flags, bl); + encode(last_failure, bl); + encode(root, bl); + encode(session_timeout, bl); + encode(session_autoclose, bl); + encode(max_file_size, bl); + encode(max_mds, bl); + encode(mds_info, bl, features); + encode(data_pools, bl); + encode(cas_pool, bl); + + __u16 ev = 16; + encode(ev, bl); + encode(compat, bl); + encode(metadata_pool, bl); + encode(created, bl); + encode(modified, bl); + encode(tableserver, bl); + encode(in, bl); + encode(inc, bl); + encode(up, bl); + encode(failed, bl); + encode(stopped, bl); + encode(last_failure_osd_epoch, bl); + encode(ever_allowed_features, bl); + encode(explicitly_allowed_features, bl); + encode(inline_data_enabled, bl); + encode(enabled, bl); + encode(fs_name, bl); + encode(damaged, bl); + encode(balancer, bl); + encode(standby_count_wanted, bl); + encode(old_max_mds, bl); + { + ceph_release_t min_compat_client = ceph_release_t::unknown; + encode(min_compat_client, bl); + } + encode(required_client_features, bl); + ENCODE_FINISH(bl); +} + +void MDSMap::sanitize(const std::function& pool_exists) +{ + /* Before we did stricter checking, it was possible to remove a data pool + * without also deleting it from the MDSMap. Check for that here after + * decoding the data pools. + */ + + for (auto it = data_pools.begin(); it != data_pools.end();) { + if (!pool_exists(*it)) { + dout(0) << "removed non-existant data pool " << *it << " from MDSMap" << dendl; + it = data_pools.erase(it); + } else { + it++; + } + } +} + +void MDSMap::decode(bufferlist::const_iterator& p) +{ + std::map inc; // Legacy field, parse and drop + + cached_up_features = 0; + DECODE_START_LEGACY_COMPAT_LEN_16(5, 4, 4, p); + decode(epoch, p); + decode(flags, p); + decode(last_failure, p); + decode(root, p); + decode(session_timeout, p); + decode(session_autoclose, p); + decode(max_file_size, p); + decode(max_mds, p); + decode(mds_info, p); + if (struct_v < 3) { + __u32 n; + decode(n, p); + while (n--) { + __u32 m; + decode(m, p); + data_pools.push_back(m); + } + __s32 s; + decode(s, p); + cas_pool = s; + } else { + decode(data_pools, p); + decode(cas_pool, p); + } + + // kclient ignores everything from here + __u16 ev = 1; + if (struct_v >= 2) + decode(ev, p); + if (ev >= 3) + decode(compat, p); + else + compat = get_compat_set_base(); + if (ev < 5) { + __u32 n; + decode(n, p); + metadata_pool = n; + } else { + decode(metadata_pool, p); + } + decode(created, p); + decode(modified, p); + decode(tableserver, p); + decode(in, p); + decode(inc, p); + decode(up, p); + decode(failed, p); + decode(stopped, p); + if (ev >= 4) + decode(last_failure_osd_epoch, p); + if (ev >= 6) { + if (ev < 10) { + // previously this was a bool about snaps, not a flag map + bool flag; + decode(flag, p); + ever_allowed_features = flag ? CEPH_MDSMAP_ALLOW_SNAPS : 0; + decode(flag, p); + explicitly_allowed_features = flag ? CEPH_MDSMAP_ALLOW_SNAPS : 0; + } else { + decode(ever_allowed_features, p); + decode(explicitly_allowed_features, p); + } + } else { + ever_allowed_features = 0; + explicitly_allowed_features = 0; + } + if (ev >= 7) + decode(inline_data_enabled, p); + + if (ev >= 8) { + ceph_assert(struct_v >= 5); + decode(enabled, p); + decode(fs_name, p); + } else { + if (epoch > 1) { + // If an MDS has ever been started, epoch will be greater than 1, + // assume filesystem is enabled. + enabled = true; + } else { + // Upgrading from a cluster that never used an MDS, switch off + // filesystem until it's explicitly enabled. + enabled = false; + } + } + + if (ev >= 9) { + decode(damaged, p); + } + + if (ev >= 11) { + decode(balancer, p); + } + + if (ev >= 12) { + decode(standby_count_wanted, p); + } + + if (ev >= 13) { + decode(old_max_mds, p); + } + + if (ev >= 14) { + ceph_release_t min_compat_client; + if (ev == 14) { + int8_t r; + decode(r, p); + if (r < 0) { + min_compat_client = ceph_release_t::unknown; + } else { + min_compat_client = ceph_release_t{static_cast(r)}; + } + } else if (ev >= 15) { + decode(min_compat_client, p); + } + if (ev >= 16) { + decode(required_client_features, p); + } else { + set_min_compat_client(min_compat_client); + } + } + + /* All MDS since at least v14.0.0 understand INLINE */ + /* TODO: remove after R is released */ + compat.incompat.insert(MDS_FEATURE_INCOMPAT_INLINE); + + for (auto& p: mds_info) { + static const CompatSet empty; + auto& info = p.second; + if (empty.compare(info.compat) == 0) { + /* bootstrap old compat; mds_info_t::decode does not have access to MDSMap */ + info.compat = compat; + } + /* All MDS since at least v14.0.0 understand INLINE */ + /* TODO: remove after R is released */ + info.compat.incompat.insert(MDS_FEATURE_INCOMPAT_INLINE); + } + + DECODE_FINISH(p); +} + +MDSMap::availability_t MDSMap::is_cluster_available() const +{ + if (epoch == 0) { + // If I'm a client, this means I'm looking at an MDSMap instance + // that was never actually initialized from the mons. Client should + // wait. + return TRANSIENT_UNAVAILABLE; + } + + // If a rank is marked damage (unavailable until operator intervenes) + if (damaged.size()) { + return STUCK_UNAVAILABLE; + } + + // If no ranks are created (filesystem not initialized) + if (in.empty()) { + return STUCK_UNAVAILABLE; + } + + for (const auto rank : in) { + if (up.count(rank) && mds_info.at(up.at(rank)).laggy()) { + // This might only be transient, but because we can't see + // standbys, we have no way of knowing whether there is a + // standby available to replace the laggy guy. + return STUCK_UNAVAILABLE; + } + } + + if (get_num_mds(CEPH_MDS_STATE_ACTIVE) > 0) { + // Nobody looks stuck, so indicate to client they should go ahead + // and try mounting if anybody is active. This may include e.g. + // one MDS failing over and another active: the client should + // proceed to start talking to the active one and let the + // transiently-unavailable guy catch up later. + return AVAILABLE; + } else { + // Nothing indicating we were stuck, but nobody active (yet) + //return TRANSIENT_UNAVAILABLE; + + // Because we don't have standbys in the MDSMap any more, we can't + // reliably indicate transient vs. stuck, so always say stuck so + // that the client doesn't block. + return STUCK_UNAVAILABLE; + } +} + +bool MDSMap::state_transition_valid(DaemonState prev, DaemonState next) +{ + if (next == prev) + return true; + if (next == MDSMap::STATE_DAMAGED) + return true; + + if (prev == MDSMap::STATE_BOOT) { + return next == MDSMap::STATE_STANDBY; + } else if (prev == MDSMap::STATE_STANDBY) { + return next == MDSMap::STATE_STANDBY_REPLAY || + next == MDSMap::STATE_REPLAY || + next == MDSMap::STATE_CREATING || + next == MDSMap::STATE_STARTING; + } else if (prev == MDSMap::STATE_CREATING || prev == MDSMap::STATE_STARTING) { + return next == MDSMap::STATE_ACTIVE; + } else if (prev == MDSMap::STATE_STANDBY_REPLAY) { + return next == MDSMap::STATE_REPLAY; + } else if (prev == MDSMap::STATE_REPLAY) { + return next == MDSMap::STATE_RESOLVE || + next == MDSMap::STATE_RECONNECT; + } else if (prev >= MDSMap::STATE_RESOLVE && prev < MDSMap::STATE_ACTIVE) { + // Once I have entered replay, the only allowable transitions are to + // the next next along in the sequence. + // Except... + if (prev == MDSMap::STATE_REJOIN && + (next == MDSMap::STATE_ACTIVE || // No need to do client replay + next == MDSMap::STATE_STOPPED)) { // no subtrees + return true; + } + return next == prev + 1; + } else if (prev == MDSMap::STATE_ACTIVE) { + return next == MDSMap::STATE_STOPPING; + } else if (prev == MDSMap::STATE_STOPPING) { + return next == MDSMap::STATE_STOPPED; + } else { + derr << __func__ << ": Unknown prev state " + << ceph_mds_state_name(prev) << "(" << prev << ")" << dendl; + return false; + } +} + +bool MDSMap::check_health(mds_rank_t standby_daemon_count) +{ + std::set standbys; + get_standby_replay_mds_set(standbys); + std::set actives; + get_active_mds_set(actives); + mds_rank_t standbys_avail = (mds_rank_t)standbys.size()+standby_daemon_count; + + /* If there are standby daemons available/replaying and + * standby_count_wanted is unset (default), then we set it to 1. This will + * happen during health checks by the mons. Also, during initial creation + * of the FS we will have no actives so we don't want to change the default + * yet. + */ + if (standby_count_wanted == -1 && actives.size() > 0 && standbys_avail > 0) { + set_standby_count_wanted(1); + return true; + } + return false; +} + +mds_gid_t MDSMap::find_mds_gid_by_name(std::string_view s) const { + for (const auto& [gid, info] : mds_info) { + if (info.name == s) { + return gid; + } + } + return MDS_GID_NONE; +} + +unsigned MDSMap::get_num_mds(int state) const { + unsigned n = 0; + for (std::map::const_iterator p = mds_info.begin(); + p != mds_info.end(); + ++p) + if (p->second.state == state) ++n; + return n; +} + +void MDSMap::get_up_mds_set(std::set& s) const { + for (std::map::const_iterator p = up.begin(); + p != up.end(); + ++p) + s.insert(p->first); +} + +uint64_t MDSMap::get_up_features() { + if (!cached_up_features) { + bool first = true; + for (std::map::const_iterator p = up.begin(); + p != up.end(); + ++p) { + std::map::const_iterator q = + mds_info.find(p->second); + ceph_assert(q != mds_info.end()); + if (first) { + cached_up_features = q->second.mds_features; + first = false; + } else { + cached_up_features &= q->second.mds_features; + } + } + } + return cached_up_features; +} + +void MDSMap::get_recovery_mds_set(std::set& s) const { + s = failed; + for (const auto& p : damaged) + s.insert(p); + for (const auto& p : mds_info) + if (p.second.state >= STATE_REPLAY && p.second.state <= STATE_STOPPING) + s.insert(p.second.rank); +} + +void MDSMap::get_mds_set_lower_bound(std::set& s, DaemonState first) const { + for (std::map::const_iterator p = mds_info.begin(); + p != mds_info.end(); + ++p) + if (p->second.state >= first && p->second.state <= STATE_STOPPING) + s.insert(p->second.rank); +} + +void MDSMap::get_mds_set(std::set& s, DaemonState state) const { + for (std::map::const_iterator p = mds_info.begin(); + p != mds_info.end(); + ++p) + if (p->second.state == state) + s.insert(p->second.rank); +} + +mds_gid_t MDSMap::get_standby_replay(mds_rank_t r) const { + for (auto& [gid,info] : mds_info) { + if (info.rank == r && info.state == STATE_STANDBY_REPLAY) { + return gid; + } + } + return MDS_GID_NONE; +} + +bool MDSMap::is_degraded() const { + if (!failed.empty() || !damaged.empty()) + return true; + for (const auto& p : mds_info) { + if (p.second.is_degraded()) + return true; + } + return false; +} + +void MDSMap::set_min_compat_client(ceph_release_t version) +{ + vector bits = CEPHFS_FEATURES_MDS_REQUIRED; + + if (version >= ceph_release_t::octopus) + bits.push_back(CEPHFS_FEATURE_OCTOPUS); + else if (version >= ceph_release_t::nautilus) + bits.push_back(CEPHFS_FEATURE_NAUTILUS); + else if (version >= ceph_release_t::mimic) + bits.push_back(CEPHFS_FEATURE_MIMIC); + else if (version >= ceph_release_t::luminous) + bits.push_back(CEPHFS_FEATURE_LUMINOUS); + else if (version >= ceph_release_t::kraken) + bits.push_back(CEPHFS_FEATURE_KRAKEN); + else if (version >= ceph_release_t::jewel) + bits.push_back(CEPHFS_FEATURE_JEWEL); + + std::sort(bits.begin(), bits.end()); + required_client_features = feature_bitset_t(bits); +} -- cgit v1.2.3