From 17d6a993fc17d533460c5f40f3908c708e057c18 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Thu, 23 May 2024 18:45:17 +0200 Subject: Merging upstream version 18.2.3. Signed-off-by: Daniel Baumann --- src/mon/AuthMonitor.cc | 5 ---- src/mon/ConfigMap.cc | 4 +-- src/mon/ConfigMap.h | 2 ++ src/mon/ConfigMonitor.cc | 1 + src/mon/ConnectionTracker.cc | 34 ++++++++++++++++------ src/mon/FSCommands.cc | 58 +++++++++++++++++++++++++++++++++++++ src/mon/HealthMonitor.cc | 2 +- src/mon/MDSMonitor.cc | 1 + src/mon/MgrMap.h | 13 +++++++-- src/mon/MgrMonitor.cc | 69 ++++++++++++++++++++++++++++++++++---------- src/mon/MgrMonitor.h | 3 +- src/mon/MonCommands.h | 29 +++++++++++++++---- src/mon/OSDMonitor.cc | 32 +++++++++++++------- src/mon/OSDMonitor.h | 22 ++++++++++++-- 14 files changed, 220 insertions(+), 55 deletions(-) (limited to 'src/mon') diff --git a/src/mon/AuthMonitor.cc b/src/mon/AuthMonitor.cc index 395ff4926..59adc404e 100644 --- a/src/mon/AuthMonitor.cc +++ b/src/mon/AuthMonitor.cc @@ -1769,11 +1769,6 @@ bool AuthMonitor::prepare_command(MonOpRequestRef op) ++it; } - if (cap != "r" && cap.compare(0, 2, "rw")) { - ss << "Permission flags must start with 'r' or 'rw'."; - err = -EINVAL; - goto done; - } if (cap.compare(0, 2, "rw") == 0) osd_cap_wanted = "rw"; diff --git a/src/mon/ConfigMap.cc b/src/mon/ConfigMap.cc index 763b8ce9b..7a639947b 100644 --- a/src/mon/ConfigMap.cc +++ b/src/mon/ConfigMap.cc @@ -66,7 +66,7 @@ void OptionMask::dump(Formatter *f) const void MaskedOption::dump(Formatter *f) const { - f->dump_string("name", opt->name); + f->dump_string("name", localized_name); f->dump_string("value", raw_value); f->dump_string("level", Option::level_to_str(opt->level)); f->dump_bool("can_update_at_runtime", opt->can_update_at_runtime()); @@ -76,7 +76,7 @@ void MaskedOption::dump(Formatter *f) const ostream& operator<<(ostream& out, const MaskedOption& o) { - out << o.opt->name; + out << o.localized_name; if (o.mask.location_type.size()) { out << "@" << 
o.mask.location_type << '=' << o.mask.location_value; } diff --git a/src/mon/ConfigMap.h b/src/mon/ConfigMap.h index a21e77265..34af942a6 100644 --- a/src/mon/ConfigMap.h +++ b/src/mon/ConfigMap.h @@ -63,6 +63,7 @@ struct MaskedOption { const Option *opt; ///< the option OptionMask mask; std::unique_ptr<const Option> unknown_opt; ///< if fabricated for an unknown option + std::string localized_name; ///< localized name for the option MaskedOption(const Option *o, bool fab=false) : opt(o) { if (fab) { @@ -74,6 +75,7 @@ struct MaskedOption { opt = o.opt; mask = std::move(o.mask); unknown_opt = std::move(o.unknown_opt); + localized_name = std::move(o.localized_name); } const MaskedOption& operator=(const MaskedOption& o) = delete; const MaskedOption& operator=(MaskedOption&& o) = delete; diff --git a/src/mon/ConfigMonitor.cc b/src/mon/ConfigMonitor.cc index e24ccbc18..cccda96f4 100644 --- a/src/mon/ConfigMonitor.cc +++ b/src/mon/ConfigMonitor.cc @@ -831,6 +831,7 @@ void ConfigMonitor::load_config() MaskedOption mopt(opt); mopt.raw_value = value; + mopt.localized_name = name; string section_name; if (who.size() && !ConfigMap::parse_mask(who, &section_name, &mopt.mask)) { diff --git a/src/mon/ConnectionTracker.cc b/src/mon/ConnectionTracker.cc index 272ad40c2..c87d614f6 100644 --- a/src/mon/ConnectionTracker.cc +++ b/src/mon/ConnectionTracker.cc @@ -62,7 +62,9 @@ void ConnectionTracker::receive_peer_report(const ConnectionTracker& o) ldout(cct, 30) << __func__ << dendl; for (auto& i : o.peer_reports) { const ConnectionReport& report = i.second; - if (i.first == rank) continue; + if (i.first == rank || i.first < 0) { + continue; + } ConnectionReport& existing = *reports(i.first); if (report.epoch > existing.epoch || (report.epoch == existing.epoch && @@ -79,26 +81,32 @@ void ConnectionTracker::receive_peer_report(const ConnectionTracker& o) bool ConnectionTracker::increase_epoch(epoch_t e) { ldout(cct, 30) << __func__ << " to " << e << dendl; - if (e > epoch) { + if (e > epoch && rank >=
0) { my_reports.epoch_version = version = 0; my_reports.epoch = epoch = e; peer_reports[rank] = my_reports; encoding.clear(); return true; } + ldout(cct, 10) << "Either got a report from a rank -1 or our epoch is >= to " + << e << " not increasing our epoch!" << dendl; return false; } void ConnectionTracker::increase_version() { ldout(cct, 30) << __func__ << " to " << version+1 << dendl; - encoding.clear(); - ++version; - my_reports.epoch_version = version; - peer_reports[rank] = my_reports; - if ((version % persist_interval) == 0 ) { - ldout(cct, 30) << version << " % " << persist_interval << " == 0" << dendl; - owner->persist_connectivity_scores(); + if (rank >= 0) { + encoding.clear(); + ++version; + my_reports.epoch_version = version; + peer_reports[rank] = my_reports; + if ((version % persist_interval) == 0 ) { + ldout(cct, 30) << version << " % " << persist_interval << " == 0" << dendl; + owner->persist_connectivity_scores(); + } + } else { + ldout(cct, 10) << "Got a report from a rank -1, not increasing our version!" << dendl; } } @@ -110,6 +118,10 @@ void ConnectionTracker::report_live_connection(int peer_rank, double units_alive lderr(cct) << "Got a report from my own rank, hopefully this is startup weirdness, dropping" << dendl; return; } + if (peer_rank < 0) { + ldout(cct, 10) << "Got a report from a rank -1, not adding that to our report!" << dendl; + return; + } // we need to "auto-initialize" to 1, do shenanigans auto i = my_reports.history.find(peer_rank); if (i == my_reports.history.end()) { @@ -138,6 +150,10 @@ void ConnectionTracker::report_dead_connection(int peer_rank, double units_dead) lderr(cct) << "Got a report from my own rank, hopefully this is startup weirdness, dropping" << dendl; return; } + if (peer_rank < 0) { + ldout(cct, 10) << "Got a report from a rank -1, not adding that to our report!" 
<< dendl; + return; + } // we need to "auto-initialize" to 1, do shenanigans auto i = my_reports.history.find(peer_rank); if (i == my_reports.history.end()) { diff --git a/src/mon/FSCommands.cc b/src/mon/FSCommands.cc index 65d2c356b..05d28cc4c 100644 --- a/src/mon/FSCommands.cc +++ b/src/mon/FSCommands.cc @@ -461,6 +461,17 @@ public: { fs->mds_map.set_max_filesize(n); }); + } else if (var == "max_xattr_size") { + if (interr.length()) { + ss << var << " requires an integer value"; + return -EINVAL; + } + fsmap.modify_filesystem( + fs->fscid, + [n](std::shared_ptr<Filesystem> fs) + { + fs->mds_map.set_max_xattr_size(n); + }); } else if (var == "allow_new_snaps") { bool enable_snaps = false; int r = parse_bool(val, &enable_snaps, ss); @@ -667,6 +678,21 @@ public: } }; fsmap.modify_filesystem(fs->fscid, std::move(f)); + } else if (var == "balance_automate") { + bool allow = false; + int r = parse_bool(val, &allow, ss); + if (r != 0) { + return r; + } + + auto f = [allow](auto&& fs) { + if (allow) { + fs->mds_map.set_balance_automate(); + } else { + fs->mds_map.clear_balance_automate(); + } + }; + fsmap.modify_filesystem(fs->fscid, std::move(f)); } else if (var == "min_compat_client") { auto vno = ceph_release_from_name(val.c_str()); if (!vno) { @@ -713,6 +739,38 @@ public: ss << "client(s) already allowed to establish new session(s)"; } } + } else if (var == "refuse_standby_for_another_fs") { + bool refuse_standby_for_another_fs = false; + int r = parse_bool(val, &refuse_standby_for_another_fs, ss); + if (r != 0) { + return r; + } + + if (refuse_standby_for_another_fs) { + if (!(fs->mds_map.test_flag(CEPH_MDSMAP_REFUSE_STANDBY_FOR_ANOTHER_FS))) { + fsmap.modify_filesystem( + fs->fscid, + [](std::shared_ptr<Filesystem> fs) + { + fs->mds_map.set_flag(CEPH_MDSMAP_REFUSE_STANDBY_FOR_ANOTHER_FS); + }); + ss << "set to refuse standby for another fs"; + } else { + ss << "to refuse standby for another fs is already set"; + } + } else { + if
(fs->mds_map.test_flag(CEPH_MDSMAP_REFUSE_STANDBY_FOR_ANOTHER_FS)) { + fsmap.modify_filesystem( + fs->fscid, + [](std::shared_ptr<Filesystem> fs) + { + fs->mds_map.clear_flag(CEPH_MDSMAP_REFUSE_STANDBY_FOR_ANOTHER_FS); + }); + ss << "allowed to use standby for another fs"; + } else { + ss << "to use standby for another fs is already allowed"; + } + } } else { ss << "unknown variable " << var; return -EINVAL; diff --git a/src/mon/HealthMonitor.cc b/src/mon/HealthMonitor.cc index 6eed2b0f0..0c42734ef 100644 --- a/src/mon/HealthMonitor.cc +++ b/src/mon/HealthMonitor.cc @@ -400,7 +400,7 @@ void HealthMonitor::tick() bool HealthMonitor::check_mutes() { - bool changed = true; + bool changed = false; auto now = ceph_clock_now(); health_check_map_t all; gather_all_health_checks(&all); diff --git a/src/mon/MDSMonitor.cc b/src/mon/MDSMonitor.cc index 4b27d828c..0ac5060f7 100644 --- a/src/mon/MDSMonitor.cc +++ b/src/mon/MDSMonitor.cc @@ -136,6 +136,7 @@ void MDSMonitor::update_from_paxos(bool *need_bootstrap) << ", my e " << get_fsmap().epoch << dendl; ceph_assert(version > get_fsmap().epoch); + load_metadata(pending_metadata); load_health(); // read and decode diff --git a/src/mon/MgrMap.h b/src/mon/MgrMap.h index f37ed97fd..b36acef5a 100644 --- a/src/mon/MgrMap.h +++ b/src/mon/MgrMap.h @@ -225,6 +225,10 @@ public: epoch_t epoch = 0; epoch_t last_failure_osd_epoch = 0; + + static const uint64_t FLAG_DOWN = (1<<0); + uint64_t flags = 0; + /// global_id of the ceph-mgr instance selected as a leader uint64_t active_gid = 0; /// server address reported by the leader once it is active @@ -401,7 +405,7 @@ public: ENCODE_FINISH(bl); return; } - ENCODE_START(12, 6, bl); + ENCODE_START(13, 6, bl); encode(epoch, bl); encode(active_addrs, bl, features); encode(active_gid, bl); @@ -425,13 +429,14 @@ public: // backwards compatible messsage for older monitors.
encode(clients_addrs, bl, features); encode(clients_names, bl, features); + encode(flags, bl); ENCODE_FINISH(bl); return; } void decode(ceph::buffer::list::const_iterator& p) { - DECODE_START(12, p); + DECODE_START(13, p); decode(epoch, p); decode(active_addrs, p); decode(active_gid, p); @@ -498,11 +503,15 @@ public: } } } + if (struct_v >= 13) { + decode(flags, p); + } DECODE_FINISH(p); } void dump(ceph::Formatter *f) const { f->dump_int("epoch", epoch); + f->dump_int("flags", flags); f->dump_int("active_gid", get_active_gid()); f->dump_string("active_name", get_active_name()); f->dump_object("active_addrs", active_addrs); diff --git a/src/mon/MgrMonitor.cc b/src/mon/MgrMonitor.cc index 958bf6691..01ce3f1a0 100644 --- a/src/mon/MgrMonitor.cc +++ b/src/mon/MgrMonitor.cc @@ -601,22 +601,23 @@ bool MgrMonitor::prepare_beacon(MonOpRequestRef op) if (pending_map.standbys.count(m->get_gid())) { drop_standby(m->get_gid(), false); } - dout(4) << "selecting new active " << m->get_gid() - << " " << m->get_name() - << " (was " << pending_map.active_gid << " " - << pending_map.active_name << ")" << dendl; - pending_map.active_gid = m->get_gid(); - pending_map.active_name = m->get_name(); - pending_map.active_change = ceph_clock_now(); - pending_map.active_mgr_features = m->get_mgr_features(); - pending_map.available_modules = m->get_available_modules(); - encode(m->get_metadata(), pending_metadata[m->get_name()]); - pending_metadata_rm.erase(m->get_name()); - - mon.clog->info() << "Activating manager daemon " - << pending_map.active_name; + if (!(pending_map.flags & MgrMap::FLAG_DOWN)) { + dout(4) << "selecting new active " << m->get_gid() + << " " << m->get_name() + << " (was " << pending_map.active_gid << " " + << pending_map.active_name << ")" << dendl; + pending_map.active_gid = m->get_gid(); + pending_map.active_name = m->get_name(); + pending_map.active_change = ceph_clock_now(); + pending_map.active_mgr_features = m->get_mgr_features(); + pending_map.available_modules 
= m->get_available_modules(); + encode(m->get_metadata(), pending_metadata[m->get_name()]); + pending_metadata_rm.erase(m->get_name()); - updated = true; + mon.clog->info() << "Activating manager daemon " + << pending_map.active_name; + updated = true; + } } else { if (pending_map.standbys.count(m->get_gid()) > 0) { dout(10) << "from existing standby " << m->get_gid() << dendl; @@ -891,6 +892,9 @@ void MgrMonitor::on_restart() bool MgrMonitor::promote_standby() { ceph_assert(pending_map.active_gid == 0); + if (pending_map.flags & MgrMap::FLAG_DOWN) { + return false; + } if (pending_map.standbys.size()) { // Promote a replacement (arbitrary choice of standby) auto replacement_gid = pending_map.standbys.begin()->first; @@ -904,6 +908,9 @@ bool MgrMonitor::promote_standby() pending_map.active_addrs = entity_addrvec_t(); pending_map.active_change = ceph_clock_now(); + mon.clog->info() << "Activating manager daemon " + << pending_map.active_name; + drop_standby(replacement_gid, false); return true; @@ -1195,7 +1202,37 @@ bool MgrMonitor::prepare_command(MonOpRequestRef op) int r = 0; bool plugged = false; - if (prefix == "mgr fail") { + if (prefix == "mgr set") { + std::string var; + if (!cmd_getval(cmdmap, "var", var) || var.empty()) { + ss << "Invalid variable"; + return -EINVAL; + } + string val; + if (!cmd_getval(cmdmap, "val", val)) { + return -EINVAL; + } + + if (var == "down") { + bool enable_down = false; + int r = parse_bool(val, &enable_down, ss); + if (r != 0) { + return r; + } + if (enable_down) { + if (!mon.osdmon()->is_writeable()) { + mon.osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op)); + return false; + } + pending_map.flags |= MgrMap::FLAG_DOWN; + plugged |= drop_active(); + } else { + pending_map.flags &= ~(MgrMap::FLAG_DOWN); + } + } else { + return -EINVAL; + } + } else if (prefix == "mgr fail") { string who; if (!cmd_getval(cmdmap, "who", who)) { if (!map.active_gid) { diff --git a/src/mon/MgrMonitor.h b/src/mon/MgrMonitor.h index 
79d4e5005..a2a84c141 100644 --- a/src/mon/MgrMonitor.h +++ b/src/mon/MgrMonitor.h @@ -21,8 +21,9 @@ #include "MgrMap.h" #include "PaxosService.h" #include "MonCommand.h" +#include "CommandHandler.h" -class MgrMonitor: public PaxosService +class MgrMonitor: public PaxosService, public CommandHandler { MgrMap map; MgrMap pending_map; diff --git a/src/mon/MonCommands.h b/src/mon/MonCommands.h index 52af09c8c..58e30f64a 100644 --- a/src/mon/MonCommands.h +++ b/src/mon/MonCommands.h @@ -374,11 +374,26 @@ COMMAND("fs get name=fs_name,type=CephString", "fs", "r") COMMAND("fs set " "name=fs_name,type=CephString " - "name=var,type=CephChoices,strings=max_mds|max_file_size" - "|allow_new_snaps|inline_data|cluster_down|allow_dirfrags|balancer" - "|standby_count_wanted|session_timeout|session_autoclose" - "|allow_standby_replay|down|joinable|min_compat_client|bal_rank_mask" - "|refuse_client_session " + "name=var,type=CephChoices,strings=max_mds" + "|allow_dirfrags" + "|allow_new_snaps" + "|allow_standby_replay" + "|bal_rank_mask" + "|balance_automate" + "|balancer" + "|cluster_down" + "|down" + "|inline_data" + "|joinable" + "|max_file_size" + "|max_xattr_size" + "|min_compat_client" + "|refuse_client_session" + "|refuse_standby_for_another_fs" + "|session_autoclose" + "|session_timeout" + "|standby_count_wanted" + " " "name=val,type=CephString " "name=yes_i_really_mean_it,type=CephBool,req=false " "name=yes_i_really_really_mean_it,type=CephBool,req=false", @@ -1273,6 +1288,10 @@ COMMAND("mgr dump " "name=epoch,type=CephInt,range=0,req=false", "dump the latest MgrMap", "mgr", "r") +COMMAND("mgr set " + "name=var,type=CephChoices,strings=down " + "name=val,type=CephString ", + "set mgr parameter <var> to <val>", "mgr", "rw") COMMAND("mgr fail name=who,type=CephString,req=false", "treat the named manager daemon as failed", "mgr", "rw") COMMAND("mgr module ls", diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index 360bd036b..6543da85c 100644 --- a/src/mon/OSDMonitor.cc +++
b/src/mon/OSDMonitor.cc @@ -395,7 +395,7 @@ void LastEpochClean::report(unsigned pg_num, const pg_t& pg, return lec.report(pg_num, pg.ps(), last_epoch_clean); } -epoch_t LastEpochClean::get_lower_bound(const OSDMap& latest) const +epoch_t LastEpochClean::get_lower_bound_by_pool(const OSDMap& latest) const { auto floor = latest.get_epoch(); for (auto& pool : latest.get_pools()) { @@ -901,12 +901,7 @@ void OSDMonitor::update_from_paxos(bool *need_bootstrap) if (state & CEPH_OSD_UP) { // could be marked up *or* down, but we're too lazy to check which last_osd_report.erase(osd); - } - } - for (auto [osd, weight] : inc.new_weight) { - if (weight == CEPH_OSD_OUT) { - // manually marked out, so drop it - osd_epochs.erase(osd); + osd_epochs.erase(osd); } } } @@ -2329,13 +2324,21 @@ version_t OSDMonitor::get_trim_to() const return 0; } +/* There are two constraints on trimming: + * 1. we must not trim past the last_epoch_clean for any pg + * 2. we must not trim past the last reported epoch for any up + * osds. + * + * LastEpochClean::get_lower_bound_by_pool gives a value <= constraint 1. 
+ * For constraint 2, we take the min over osd_epochs, which is populated with + * MOSDBeacon::version, see OSDMonitor::prepare_beacon + */ epoch_t OSDMonitor::get_min_last_epoch_clean() const { - auto floor = last_epoch_clean.get_lower_bound(osdmap); - // also scan osd epochs - // don't trim past the oldest reported osd epoch + auto floor = last_epoch_clean.get_lower_bound_by_pool(osdmap); for (auto [osd, epoch] : osd_epochs) { if (epoch < floor) { + ceph_assert(osdmap.is_up(osd)); floor = epoch; } } @@ -4443,8 +4446,8 @@ bool OSDMonitor::prepare_beacon(MonOpRequestRef op) last_osd_report[from].first = ceph_clock_now(); last_osd_report[from].second = beacon->osd_beacon_report_interval; + ceph_assert(osdmap.is_up(from)); osd_epochs[from] = beacon->version; - for (const auto& pg : beacon->pgs) { if (auto* pool = osdmap.get_pg_pool(pg.pool()); pool != nullptr) { unsigned pg_num = pool->get_pg_num(); @@ -11302,6 +11305,8 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op, err = 0; goto reply_no_propose; } + bool force_no_fake = false; + cmd_getval(cmdmap, "yes_i_really_mean_it", force_no_fake); if (!force) { err = -EPERM; ss << "will not override erasure code profile " << name @@ -11310,6 +11315,11 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op, << " is different from the proposed profile " << profile_map; goto reply_no_propose; + } else if (!force_no_fake) { + err = -EPERM; + ss << "overriding erasure code profile can be DANGEROUS" + << "; add --yes-i-really-mean-it to do it anyway"; + goto reply_no_propose; } } diff --git a/src/mon/OSDMonitor.h b/src/mon/OSDMonitor.h index 16880f077..e20da536b 100644 --- a/src/mon/OSDMonitor.h +++ b/src/mon/OSDMonitor.h @@ -114,7 +114,13 @@ class LastEpochClean { public: void report(unsigned pg_num, const pg_t& pg, epoch_t last_epoch_clean); void remove_pool(uint64_t pool); - epoch_t get_lower_bound(const OSDMap& latest) const; + /** + * get_lower_bound_by_pool + * + * Returns epoch e such that e <= 
pg.last_epoch_clean for all pgs in cluster. + * May return 0 if any pool does not have comprehensive values for all pgs. + */ + epoch_t get_lower_bound_by_pool(const OSDMap& latest) const; void dump(Formatter *f) const; }; @@ -643,8 +649,18 @@ protected: // when we last received PG stats from each osd and the osd's osd_beacon_report_interval std::map<int, std::pair<utime_t, int>> last_osd_report; - // TODO: use last_osd_report to store the osd report epochs, once we don't - // need to upgrade from pre-luminous releases. + /** + * osd_epochs + * + * Records the MOSDBeacon::version (the osd epoch at which the OSD sent the + * beacon) of the most recent beacon recevied from each currently up OSD. + * Used in OSDMonitor::get_min_last_epoch_clean(). + * Down osds are trimmed upon commit of each map + * (OSDMonitor::update_from_paxos). + * + * TODO: use last_osd_report to store the osd report epochs, once we don't + * need to upgrade from pre-luminous releases. + */ std::map<int,epoch_t> osd_epochs; LastEpochClean last_epoch_clean; bool preprocess_beacon(MonOpRequestRef op); -- cgit v1.2.3