Diffstat
61 files changed, 60089 insertions, 0 deletions
diff --git a/src/mon/AuthMonitor.cc b/src/mon/AuthMonitor.cc
new file mode 100644
index 000000000..f08608c61
--- /dev/null
+++ b/src/mon/AuthMonitor.cc
@@ -0,0 +1,2033 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <sstream>
+
+#include "mon/AuthMonitor.h"
+#include "mon/Monitor.h"
+#include "mon/MonitorDBStore.h"
+#include "mon/OSDMonitor.h"
+#include "mon/MDSMonitor.h"
+#include "mon/ConfigMonitor.h"
+
+#include "messages/MMonCommand.h"
+#include "messages/MAuth.h"
+#include "messages/MAuthReply.h"
+#include "messages/MMonGlobalID.h"
+#include "msg/Messenger.h"
+
+#include "auth/AuthServiceHandler.h"
+#include "auth/KeyRing.h"
+#include "include/stringify.h"
+#include "include/ceph_assert.h"
+
+#include "mds/MDSAuthCaps.h"
+#include "mgr/MgrCap.h"
+#include "osd/OSDCap.h"
+
+#define dout_subsys ceph_subsys_mon
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, mon, get_last_committed())
+using namespace TOPNSPC::common;
+
+using std::cerr;
+using std::cout;
+using std::dec;
+using std::hex;
+using std::list;
+using std::map;
+using std::make_pair;
+using std::ostream;
+using std::ostringstream;
+using std::pair;
+using std::set;
+using std::setfill;
+using std::string;
+using std::stringstream;
+using std::to_string;
+using std::vector;
+using std::unique_ptr;
+
+using ceph::bufferlist;
+using ceph::decode;
+using ceph::encode;
+using ceph::Formatter;
+using ceph::JSONFormatter;
+using ceph::make_message;
+using ceph::mono_clock;
+using ceph::mono_time;
+using ceph::timespan_str;
+static ostream& _prefix(std::ostream *_dout, Monitor &mon, version_t v) {
+  return *_dout << "mon." << mon.name << "@" << mon.rank
+                << "(" << mon.get_state_name()
+                << ").auth v" << v << " ";
+}
+
+ostream& operator<<(ostream &out, const AuthMonitor &pm)
+{
+  return out << "auth";
+}
+
+bool AuthMonitor::check_rotate()
+{
+  KeyServerData::Incremental rot_inc;
+  rot_inc.op = KeyServerData::AUTH_INC_SET_ROTATING;
+  if (mon.key_server.prepare_rotating_update(rot_inc.rotating_bl)) {
+    dout(10) << __func__ << " updating rotating" << dendl;
+    push_cephx_inc(rot_inc);
+    return true;
+  }
+  return false;
+}
+
+/*
+  Tick function: runs every N seconds and updates the auth map if the
+  rotating service keys or the global_id ceiling need refreshing.
+*/
+
+void AuthMonitor::tick()
+{
+  if (!is_active()) return;
+
+  dout(10) << *this << dendl;
+
+  // increase global_id?
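// The question above is answered by _should_increase_max_global_id(); as a
// standalone sketch (illustrative only -- `needs_more_ids` is a hypothetical
// name, and the default of mon_globalid_prealloc depends on the build):
bool needs_more_ids(uint64_t max_id, uint64_t last_alloc, uint64_t prealloc) {
  // grow the ceiling when it was never initialized, or once fewer than
  // half of the preallocated ids remain below it
  return max_id < prealloc || last_alloc + 1 >= max_id - prealloc / 2;
}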
+  bool propose = false;
+  bool increase;
+  {
+    std::lock_guard l(mon.auth_lock);
+    increase = _should_increase_max_global_id();
+  }
+  if (increase) {
+    if (mon.is_leader()) {
+      increase_max_global_id();
+      propose = true;
+    } else {
+      dout(10) << __func__ << " requesting more ids from leader" << dendl;
+      int leader = mon.get_leader();
+      MMonGlobalID *req = new MMonGlobalID();
+      req->old_max_id = max_global_id;
+      mon.send_mon_message(req, leader);
+    }
+  }
+
+  if (!mon.is_leader()) {
+    return;
+  }
+
+  if (check_rotate()) {
+    propose = true;
+  }
+
+  if (propose) {
+    propose_pending();
+  }
+}
+
+void AuthMonitor::on_active()
+{
+  dout(10) << "AuthMonitor::on_active()" << dendl;
+
+  if (!mon.is_leader())
+    return;
+
+  mon.key_server.start_server();
+
+  if (is_writeable()) {
+    bool propose = false;
+    if (check_rotate()) {
+      propose = true;
+    }
+    bool increase;
+    {
+      std::lock_guard l(mon.auth_lock);
+      increase = _should_increase_max_global_id();
+    }
+    if (increase) {
+      increase_max_global_id();
+      propose = true;
+    }
+    if (propose) {
+      propose_pending();
+    }
+  }
+}
+
+bufferlist _encode_cap(const string& cap)
+{
+  bufferlist bl;
+  encode(cap, bl);
+  return bl;
+}
+
+void AuthMonitor::get_initial_keyring(KeyRing *keyring)
+{
+  dout(10) << __func__ << dendl;
+  ceph_assert(keyring != nullptr);
+
+  bufferlist bl;
+  int ret = mon.store->get("mkfs", "keyring", bl);
+  if (ret == -ENOENT) {
+    return;
+  }
+  // fail hard only if there's an error we're not expecting to see
+  ceph_assert(ret == 0);
+
+  auto p = bl.cbegin();
+  decode(*keyring, p);
+}
+
+void _generate_bootstrap_keys(
+    list<pair<EntityName,EntityAuth> >* auth_lst)
+{
+  ceph_assert(auth_lst != nullptr);
+
+  map<string,map<string,bufferlist> > bootstrap = {
+    { "admin", {
+        { "mon", _encode_cap("allow *") },
+        { "osd", _encode_cap("allow *") },
+        { "mds", _encode_cap("allow *") },
+        { "mgr", _encode_cap("allow *") }
+    } },
+    { "bootstrap-osd", {
+        { "mon", _encode_cap("allow profile bootstrap-osd") }
+    } },
+    { "bootstrap-rgw", {
+        { "mon", _encode_cap("allow profile bootstrap-rgw") }
+    } },
+    { "bootstrap-mds", {
+        { "mon", _encode_cap("allow profile bootstrap-mds") }
+    } },
+    { "bootstrap-mgr", {
+        { "mon", _encode_cap("allow profile bootstrap-mgr") }
+    } },
+    { "bootstrap-rbd", {
+        { "mon", _encode_cap("allow profile bootstrap-rbd") }
+    } },
+    { "bootstrap-rbd-mirror", {
+        { "mon", _encode_cap("allow profile bootstrap-rbd-mirror") }
+    } }
+  };
+
+  for (auto &p : bootstrap) {
+    EntityName name;
+    name.from_str("client."
+ p.first); + EntityAuth auth; + auth.key.create(g_ceph_context, CEPH_CRYPTO_AES); + auth.caps = p.second; + + auth_lst->push_back(make_pair(name, auth)); + } +} + +void AuthMonitor::create_initial_keys(KeyRing *keyring) +{ + dout(10) << __func__ << " with keyring" << dendl; + ceph_assert(keyring != nullptr); + + list<pair<EntityName,EntityAuth> > auth_lst; + _generate_bootstrap_keys(&auth_lst); + + for (auto &p : auth_lst) { + if (keyring->exists(p.first)) { + continue; + } + keyring->add(p.first, p.second); + } +} + +void AuthMonitor::create_initial() +{ + dout(10) << "create_initial -- creating initial map" << dendl; + + // initialize rotating keys + mon.key_server.clear_secrets(); + check_rotate(); + ceph_assert(pending_auth.size() == 1); + + if (mon.is_keyring_required()) { + KeyRing keyring; + // attempt to obtain an existing mkfs-time keyring + get_initial_keyring(&keyring); + // create missing keys in the keyring + create_initial_keys(&keyring); + // import the resulting keyring + import_keyring(keyring); + } + + max_global_id = MIN_GLOBAL_ID; + + Incremental inc; + inc.inc_type = GLOBAL_ID; + inc.max_global_id = max_global_id; + pending_auth.push_back(inc); + + format_version = 3; +} + +void AuthMonitor::update_from_paxos(bool *need_bootstrap) +{ + dout(10) << __func__ << dendl; + load_health(); + + version_t version = get_last_committed(); + version_t keys_ver = mon.key_server.get_ver(); + if (version == keys_ver) + return; + ceph_assert(version > keys_ver); + + version_t latest_full = get_version_latest_full(); + + dout(10) << __func__ << " version " << version << " keys ver " << keys_ver + << " latest " << latest_full << dendl; + + if ((latest_full > 0) && (latest_full > keys_ver)) { + bufferlist latest_bl; + int err = get_version_full(latest_full, latest_bl); + ceph_assert(err == 0); + ceph_assert(latest_bl.length() != 0); + dout(7) << __func__ << " loading summary e " << latest_full << dendl; + dout(7) << __func__ << " latest length " << latest_bl.length() << dendl; + auto p = latest_bl.cbegin(); + __u8 struct_v; + decode(struct_v, p); + decode(max_global_id, p); + decode(mon.key_server, p); + mon.key_server.set_ver(latest_full); + keys_ver = latest_full; + } + + dout(10) << __func__ << " key server version " << mon.key_server.get_ver() << dendl; + + // walk through incrementals + while (version > keys_ver) { + bufferlist bl; + int ret = get_version(keys_ver+1, bl); + ceph_assert(ret == 0); + ceph_assert(bl.length()); + + // reset if we are moving to initial state. we will normally have + // keys in here temporarily for bootstrapping that we need to + // clear out. 
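// The while loop below is the standard PaxosService catch-up pattern: after
// loading the newest stashed full map (above), replay committed incrementals
// one version at a time. A compressed sketch, with hypothetical helper names:
//
//   while (committed > mine) {
//     bufferlist bl = read_committed_version(mine + 1);  // hypothetical
//     replay_incrementals(bl);                           // hypothetical
//     set_local_version(++mine);
//   }
//
// The keys_ver == 0 special case that follows clears bootstrap-time secrets
// before the first committed version is replayed.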
+ if (keys_ver == 0) + mon.key_server.clear_secrets(); + + dout(20) << __func__ << " walking through version " << (keys_ver+1) + << " len " << bl.length() << dendl; + + auto p = bl.cbegin(); + __u8 v; + decode(v, p); + while (!p.end()) { + Incremental inc; + decode(inc, p); + switch (inc.inc_type) { + case GLOBAL_ID: + max_global_id = inc.max_global_id; + break; + + case AUTH_DATA: + { + KeyServerData::Incremental auth_inc; + auto iter = inc.auth_data.cbegin(); + decode(auth_inc, iter); + mon.key_server.apply_data_incremental(auth_inc); + break; + } + } + } + + keys_ver++; + mon.key_server.set_ver(keys_ver); + + if (keys_ver == 1 && mon.is_keyring_required()) { + auto t(std::make_shared<MonitorDBStore::Transaction>()); + t->erase("mkfs", "keyring"); + mon.store->apply_transaction(t); + } + } + + { + std::lock_guard l(mon.auth_lock); + if (last_allocated_id == 0) { + last_allocated_id = max_global_id; + dout(10) << __func__ << " last_allocated_id initialized to " + << max_global_id << dendl; + } + } + + dout(10) << __func__ << " max_global_id=" << max_global_id + << " format_version " << format_version + << dendl; + + mon.key_server.dump(); +} + +bool AuthMonitor::_should_increase_max_global_id() +{ + ceph_assert(ceph_mutex_is_locked(mon.auth_lock)); + auto num_prealloc = g_conf()->mon_globalid_prealloc; + if (max_global_id < num_prealloc || + (last_allocated_id + 1) >= max_global_id - num_prealloc / 2) { + return true; + } + return false; +} + +void AuthMonitor::increase_max_global_id() +{ + ceph_assert(mon.is_leader()); + + Incremental inc; + inc.inc_type = GLOBAL_ID; + inc.max_global_id = max_global_id + g_conf()->mon_globalid_prealloc; + dout(10) << "increasing max_global_id to " << inc.max_global_id << dendl; + pending_auth.push_back(inc); +} + +bool AuthMonitor::should_propose(double& delay) +{ + return (!pending_auth.empty()); +} + +void AuthMonitor::create_pending() +{ + pending_auth.clear(); + dout(10) << "create_pending v " << (get_last_committed() + 1) << dendl; +} + +void AuthMonitor::encode_pending(MonitorDBStore::TransactionRef t) +{ + dout(10) << __func__ << " v " << (get_last_committed() + 1) << dendl; + + bufferlist bl; + + __u8 v = 1; + encode(v, bl); + vector<Incremental>::iterator p; + for (p = pending_auth.begin(); p != pending_auth.end(); ++p) + p->encode(bl, mon.get_quorum_con_features()); + + version_t version = get_last_committed() + 1; + put_version(t, version, bl); + put_last_committed(t, version); + + // health + health_check_map_t next; + map<string,list<string>> bad_detail; // entity -> details + for (auto i = mon.key_server.secrets_begin(); + i != mon.key_server.secrets_end(); + ++i) { + for (auto& p : i->second.caps) { + ostringstream ss; + if (!valid_caps(p.first, p.second, &ss)) { + ostringstream ss2; + ss2 << i->first << " " << ss.str(); + bad_detail[i->first.to_str()].push_back(ss2.str()); + } + } + } + for (auto& inc : pending_auth) { + if (inc.inc_type == AUTH_DATA) { + KeyServerData::Incremental auth_inc; + auto iter = inc.auth_data.cbegin(); + decode(auth_inc, iter); + if (auth_inc.op == KeyServerData::AUTH_INC_DEL) { + bad_detail.erase(auth_inc.name.to_str()); + } else if (auth_inc.op == KeyServerData::AUTH_INC_ADD) { + for (auto& p : auth_inc.auth.caps) { + ostringstream ss; + if (!valid_caps(p.first, p.second, &ss)) { + ostringstream ss2; + ss2 << auth_inc.name << " " << ss.str(); + bad_detail[auth_inc.name.to_str()].push_back(ss2.str()); + } + } + } + } + } + if (bad_detail.size()) { + ostringstream ss; + ss << bad_detail.size() << " auth entities 
have invalid capabilities"; + health_check_t *check = &next.add("AUTH_BAD_CAPS", HEALTH_ERR, ss.str(), + bad_detail.size()); + for (auto& i : bad_detail) { + for (auto& j : i.second) { + check->detail.push_back(j); + } + } + } + encode_health(next, t); +} + +void AuthMonitor::encode_full(MonitorDBStore::TransactionRef t) +{ + version_t version = mon.key_server.get_ver(); + // do not stash full version 0 as it will never be removed nor read + if (version == 0) + return; + + dout(10) << __func__ << " auth v " << version << dendl; + ceph_assert(get_last_committed() == version); + + bufferlist full_bl; + std::scoped_lock l{mon.key_server.get_lock()}; + dout(20) << __func__ << " key server has " + << (mon.key_server.has_secrets() ? "" : "no ") + << "secrets!" << dendl; + __u8 v = 1; + encode(v, full_bl); + encode(max_global_id, full_bl); + encode(mon.key_server, full_bl); + + put_version_full(t, version, full_bl); + put_version_latest_full(t, version); +} + +version_t AuthMonitor::get_trim_to() const +{ + unsigned max = g_conf()->paxos_max_join_drift * 2; + version_t version = get_last_committed(); + if (mon.is_leader() && (version > max)) + return version - max; + return 0; +} + +bool AuthMonitor::preprocess_query(MonOpRequestRef op) +{ + auto m = op->get_req<PaxosServiceMessage>(); + dout(10) << "preprocess_query " << *m << " from " << m->get_orig_source_inst() << dendl; + switch (m->get_type()) { + case MSG_MON_COMMAND: + try { + return preprocess_command(op); + } catch (const bad_cmd_get& e) { + bufferlist bl; + mon.reply_command(op, -EINVAL, e.what(), bl, get_last_committed()); + return true; + } + + case CEPH_MSG_AUTH: + return prep_auth(op, false); + + case MSG_MON_GLOBAL_ID: + return false; + + default: + ceph_abort(); + return true; + } +} + +bool AuthMonitor::prepare_update(MonOpRequestRef op) +{ + auto m = op->get_req<PaxosServiceMessage>(); + dout(10) << "prepare_update " << *m << " from " << m->get_orig_source_inst() << dendl; + switch (m->get_type()) { + case MSG_MON_COMMAND: + try { + return prepare_command(op); + } catch (const bad_cmd_get& e) { + bufferlist bl; + mon.reply_command(op, -EINVAL, e.what(), bl, get_last_committed()); + return true; + } + case MSG_MON_GLOBAL_ID: + return prepare_global_id(op); + case CEPH_MSG_AUTH: + return prep_auth(op, true); + default: + ceph_abort(); + return false; + } +} + +void AuthMonitor::_set_mon_num_rank(int num, int rank) +{ + dout(10) << __func__ << " num " << num << " rank " << rank << dendl; + ceph_assert(ceph_mutex_is_locked(mon.auth_lock)); + mon_num = num; + mon_rank = rank; +} + +uint64_t AuthMonitor::_assign_global_id() +{ + ceph_assert(ceph_mutex_is_locked(mon.auth_lock)); + if (mon_num < 1 || mon_rank < 0) { + dout(10) << __func__ << " inactive (num_mon " << mon_num + << " rank " << mon_rank << ")" << dendl; + return 0; + } + if (!last_allocated_id) { + dout(10) << __func__ << " last_allocated_id == 0" << dendl; + return 0; + } + + uint64_t id = last_allocated_id + 1; + int remainder = id % mon_num; + if (remainder) { + remainder = mon_num - remainder; + } + id += remainder + mon_rank; + + if (id >= max_global_id) { + dout(10) << __func__ << " failed (max " << max_global_id << ")" << dendl; + return 0; + } + + last_allocated_id = id; + dout(10) << __func__ << " " << id << " (max " << max_global_id << ")" + << dendl; + return id; +} + +uint64_t AuthMonitor::assign_global_id(bool should_increase_max) +{ + uint64_t id; + { + std::lock_guard l(mon.auth_lock); + id =_assign_global_id(); + if (should_increase_max) { + 
should_increase_max = _should_increase_max_global_id(); + } + } + if (mon.is_leader() && + should_increase_max) { + increase_max_global_id(); + } + return id; +} + +bool AuthMonitor::prep_auth(MonOpRequestRef op, bool paxos_writable) +{ + auto m = op->get_req<MAuth>(); + dout(10) << "prep_auth() blob_size=" << m->get_auth_payload().length() << dendl; + + MonSession *s = op->get_session(); + if (!s) { + dout(10) << "no session, dropping" << dendl; + return true; + } + + int ret = 0; + MAuthReply *reply; + bufferlist response_bl; + auto indata = m->auth_payload.cbegin(); + __u32 proto = m->protocol; + bool start = false; + bool finished = false; + EntityName entity_name; + bool is_new_global_id = false; + + // set up handler? + if (m->protocol == 0 && !s->auth_handler) { + set<__u32> supported; + + try { + __u8 struct_v = 1; + decode(struct_v, indata); + decode(supported, indata); + decode(entity_name, indata); + decode(s->con->peer_global_id, indata); + } catch (const ceph::buffer::error &e) { + dout(10) << "failed to decode initial auth message" << dendl; + ret = -EINVAL; + goto reply; + } + + // do we require cephx signatures? + + if (!m->get_connection()->has_feature(CEPH_FEATURE_MSG_AUTH)) { + if (entity_name.get_type() == CEPH_ENTITY_TYPE_MON || + entity_name.get_type() == CEPH_ENTITY_TYPE_OSD || + entity_name.get_type() == CEPH_ENTITY_TYPE_MDS || + entity_name.get_type() == CEPH_ENTITY_TYPE_MGR) { + if (g_conf()->cephx_cluster_require_signatures || + g_conf()->cephx_require_signatures) { + dout(1) << m->get_source_inst() + << " supports cephx but not signatures and" + << " 'cephx [cluster] require signatures = true';" + << " disallowing cephx" << dendl; + supported.erase(CEPH_AUTH_CEPHX); + } + } else { + if (g_conf()->cephx_service_require_signatures || + g_conf()->cephx_require_signatures) { + dout(1) << m->get_source_inst() + << " supports cephx but not signatures and" + << " 'cephx [service] require signatures = true';" + << " disallowing cephx" << dendl; + supported.erase(CEPH_AUTH_CEPHX); + } + } + } else if (!m->get_connection()->has_feature(CEPH_FEATURE_CEPHX_V2)) { + if (entity_name.get_type() == CEPH_ENTITY_TYPE_MON || + entity_name.get_type() == CEPH_ENTITY_TYPE_OSD || + entity_name.get_type() == CEPH_ENTITY_TYPE_MDS || + entity_name.get_type() == CEPH_ENTITY_TYPE_MGR) { + if (g_conf()->cephx_cluster_require_version >= 2 || + g_conf()->cephx_require_version >= 2) { + dout(1) << m->get_source_inst() + << " supports cephx but not v2 and" + << " 'cephx [cluster] require version >= 2';" + << " disallowing cephx" << dendl; + supported.erase(CEPH_AUTH_CEPHX); + } + } else { + if (g_conf()->cephx_service_require_version >= 2 || + g_conf()->cephx_require_version >= 2) { + dout(1) << m->get_source_inst() + << " supports cephx but not v2 and" + << " 'cephx [service] require version >= 2';" + << " disallowing cephx" << dendl; + supported.erase(CEPH_AUTH_CEPHX); + } + } + } + + int type; + if (entity_name.get_type() == CEPH_ENTITY_TYPE_MON || + entity_name.get_type() == CEPH_ENTITY_TYPE_OSD || + entity_name.get_type() == CEPH_ENTITY_TYPE_MDS || + entity_name.get_type() == CEPH_ENTITY_TYPE_MGR) + type = mon.auth_cluster_required.pick(supported); + else + type = mon.auth_service_required.pick(supported); + + s->auth_handler = get_auth_service_handler(type, g_ceph_context, &mon.key_server); + if (!s->auth_handler) { + dout(1) << "client did not provide supported auth type" << dendl; + ret = -ENOTSUP; + goto reply; + } + start = true; + proto = type; + } else if (!s->auth_handler) { + 
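// How the pick() above typically resolves (illustrative): the monitor
// intersects the client's advertised set with its configured method list
// and takes the first configured match, e.g.
//
//   supported = { CEPH_AUTH_CEPHX, CEPH_AUTH_NONE }
//   auth_cluster_required = "cephx"   ->   pick() == CEPH_AUTH_CEPHX
//
// and an empty intersection leads to the -ENOTSUP reply above.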
dout(10) << "protocol specified but no s->auth_handler" << dendl; + ret = -EINVAL; + goto reply; + } + + /* assign a new global_id? we assume this should only happen on the first + request. If a client tries to send it later, it'll screw up its auth + session */ + if (!s->con->peer_global_id) { + s->con->peer_global_id = assign_global_id(paxos_writable); + if (!s->con->peer_global_id) { + + delete s->auth_handler; + s->auth_handler = NULL; + + if (mon.is_leader() && paxos_writable) { + dout(10) << "increasing global id, waitlisting message" << dendl; + wait_for_active(op, new C_RetryMessage(this, op)); + goto done; + } + + if (!mon.is_leader()) { + dout(10) << "not the leader, requesting more ids from leader" << dendl; + int leader = mon.get_leader(); + MMonGlobalID *req = new MMonGlobalID(); + req->old_max_id = max_global_id; + mon.send_mon_message(req, leader); + wait_for_finished_proposal(op, new C_RetryMessage(this, op)); + return true; + } + + ceph_assert(!paxos_writable); + return false; + } + is_new_global_id = true; + } + + try { + if (start) { + // new session + ret = s->auth_handler->start_session(entity_name, + s->con->peer_global_id, + is_new_global_id, + &response_bl, + &s->con->peer_caps_info); + } else { + // request + ret = s->auth_handler->handle_request( + indata, + 0, // no connection_secret needed + &response_bl, + &s->con->peer_caps_info, + nullptr, nullptr); + } + if (ret == -EIO) { + wait_for_active(op, new C_RetryMessage(this,op)); + goto done; + } + if (ret > 0) { + if (!s->authenticated && + mon.ms_handle_authentication(s->con.get()) > 0) { + finished = true; + } + ret = 0; + } + } catch (const ceph::buffer::error &err) { + ret = -EINVAL; + dout(0) << "caught error when trying to handle auth request, probably malformed request" << dendl; + } + +reply: + reply = new MAuthReply(proto, &response_bl, ret, s->con->peer_global_id); + mon.send_reply(op, reply); + if (finished) { + // always send the latest monmap. 
+ if (m->monmap_epoch < mon.monmap->get_epoch()) + mon.send_latest_monmap(m->get_connection().get()); + + mon.configmon()->check_sub(s); + } +done: + return true; +} + +bool AuthMonitor::preprocess_command(MonOpRequestRef op) +{ + auto m = op->get_req<MMonCommand>(); + int r = -1; + bufferlist rdata; + stringstream ss, ds; + + cmdmap_t cmdmap; + if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) { + // ss has reason for failure + string rs = ss.str(); + mon.reply_command(op, -EINVAL, rs, rdata, get_last_committed()); + return true; + } + + string prefix; + cmd_getval(cmdmap, "prefix", prefix); + if (prefix == "auth add" || + prefix == "auth del" || + prefix == "auth rm" || + prefix == "auth get-or-create" || + prefix == "auth get-or-create-key" || + prefix == "fs authorize" || + prefix == "auth import" || + prefix == "auth caps") { + return false; + } + + MonSession *session = op->get_session(); + if (!session) { + mon.reply_command(op, -EACCES, "access denied", rdata, get_last_committed()); + return true; + } + + // entity might not be supplied, but if it is, it should be valid + string entity_name; + cmd_getval(cmdmap, "entity", entity_name); + EntityName entity; + if (!entity_name.empty() && !entity.from_str(entity_name)) { + ss << "invalid entity_auth " << entity_name; + mon.reply_command(op, -EINVAL, ss.str(), get_last_committed()); + return true; + } + + string format; + cmd_getval(cmdmap, "format", format, string("plain")); + boost::scoped_ptr<Formatter> f(Formatter::create(format)); + + if (prefix == "auth export") { + KeyRing keyring; + export_keyring(keyring); + if (!entity_name.empty()) { + EntityAuth eauth; + if (keyring.get_auth(entity, eauth)) { + KeyRing kr; + kr.add(entity, eauth); + if (f) + kr.encode_formatted("auth", f.get(), rdata); + else + kr.encode_plaintext(rdata); + ss << "export " << eauth; + r = 0; + } else { + ss << "no key for " << eauth; + r = -ENOENT; + } + } else { + if (f) + keyring.encode_formatted("auth", f.get(), rdata); + else + keyring.encode_plaintext(rdata); + + ss << "exported master keyring"; + r = 0; + } + } else if (prefix == "auth get" && !entity_name.empty()) { + KeyRing keyring; + EntityAuth entity_auth; + if(!mon.key_server.get_auth(entity, entity_auth)) { + ss << "failed to find " << entity_name << " in keyring"; + r = -ENOENT; + } else { + keyring.add(entity, entity_auth); + if (f) + keyring.encode_formatted("auth", f.get(), rdata); + else + keyring.encode_plaintext(rdata); + ss << "exported keyring for " << entity_name; + r = 0; + } + } else if (prefix == "auth print-key" || + prefix == "auth print_key" || + prefix == "auth get-key") { + EntityAuth auth; + if (!mon.key_server.get_auth(entity, auth)) { + ss << "don't have " << entity; + r = -ENOENT; + goto done; + } + if (f) { + auth.key.encode_formatted("auth", f.get(), rdata); + } else { + auth.key.encode_plaintext(rdata); + } + r = 0; + } else if (prefix == "auth list" || + prefix == "auth ls") { + if (f) { + mon.key_server.encode_formatted("auth", f.get(), rdata); + } else { + mon.key_server.encode_plaintext(rdata); + if (rdata.length() > 0) + ss << "installed auth entries:" << std::endl; + else + ss << "no installed auth entries!" 
<< std::endl; + } + r = 0; + goto done; + } else { + ss << "invalid command"; + r = -EINVAL; + } + + done: + rdata.append(ds); + string rs; + getline(ss, rs, '\0'); + mon.reply_command(op, r, rs, rdata, get_last_committed()); + return true; +} + +void AuthMonitor::export_keyring(KeyRing& keyring) +{ + mon.key_server.export_keyring(keyring); +} + +int AuthMonitor::import_keyring(KeyRing& keyring) +{ + dout(10) << __func__ << " " << keyring.size() << " keys" << dendl; + + for (map<EntityName, EntityAuth>::iterator p = keyring.get_keys().begin(); + p != keyring.get_keys().end(); + ++p) { + if (p->second.caps.empty()) { + dout(0) << "import: no caps supplied" << dendl; + return -EINVAL; + } + int err = add_entity(p->first, p->second); + ceph_assert(err == 0); + } + return 0; +} + +int AuthMonitor::remove_entity(const EntityName &entity) +{ + dout(10) << __func__ << " " << entity << dendl; + if (!mon.key_server.contains(entity)) + return -ENOENT; + + KeyServerData::Incremental auth_inc; + auth_inc.name = entity; + auth_inc.op = KeyServerData::AUTH_INC_DEL; + push_cephx_inc(auth_inc); + + return 0; +} + +bool AuthMonitor::entity_is_pending(EntityName& entity) +{ + // are we about to have it? + for (auto& p : pending_auth) { + if (p.inc_type == AUTH_DATA) { + KeyServerData::Incremental inc; + auto q = p.auth_data.cbegin(); + decode(inc, q); + if (inc.op == KeyServerData::AUTH_INC_ADD && + inc.name == entity) { + return true; + } + } + } + return false; +} + +int AuthMonitor::exists_and_matches_entity( + const auth_entity_t& entity, + bool has_secret, + stringstream& ss) +{ + return exists_and_matches_entity(entity.name, entity.auth, + entity.auth.caps, has_secret, ss); +} + +int AuthMonitor::exists_and_matches_entity( + const EntityName& name, + const EntityAuth& auth, + const map<string,bufferlist>& caps, + bool has_secret, + stringstream& ss) +{ + + dout(20) << __func__ << " entity " << name << " auth " << auth + << " caps " << caps << " has_secret " << has_secret << dendl; + + EntityAuth existing_auth; + // does entry already exist? + if (mon.key_server.get_auth(name, existing_auth)) { + // key match? + if (has_secret) { + if (existing_auth.key.get_secret().cmp(auth.key.get_secret())) { + ss << "entity " << name << " exists but key does not match"; + return -EEXIST; + } + } + + // caps match? + if (caps.size() != existing_auth.caps.size()) { + ss << "entity " << name << " exists but caps do not match"; + return -EINVAL; + } + for (auto& it : caps) { + if (existing_auth.caps.count(it.first) == 0 || + !existing_auth.caps[it.first].contents_equal(it.second)) { + ss << "entity " << name << " exists but cap " + << it.first << " does not match"; + return -EINVAL; + } + } + + // they match, no-op + return 0; + } + return -ENOENT; +} + +int AuthMonitor::add_entity( + const EntityName& name, + const EntityAuth& auth) +{ + + // okay, add it. + KeyServerData::Incremental auth_inc; + auth_inc.op = KeyServerData::AUTH_INC_ADD; + auth_inc.name = name; + auth_inc.auth = auth; + + dout(10) << " add auth entity " << auth_inc.name << dendl; + dout(30) << " " << auth_inc.auth << dendl; + push_cephx_inc(auth_inc); + return 0; +} + +int AuthMonitor::validate_osd_destroy( + int32_t id, + const uuid_d& uuid, + EntityName& cephx_entity, + EntityName& lockbox_entity, + stringstream& ss) +{ + ceph_assert(paxos.is_plugged()); + + dout(10) << __func__ << " id " << id << " uuid " << uuid << dendl; + + string cephx_str = "osd." + stringify(id); + string lockbox_str = "client.osd-lockbox." 
+ stringify(uuid); + + if (!cephx_entity.from_str(cephx_str)) { + dout(10) << __func__ << " invalid cephx entity '" + << cephx_str << "'" << dendl; + ss << "invalid cephx key entity '" << cephx_str << "'"; + return -EINVAL; + } + + if (!lockbox_entity.from_str(lockbox_str)) { + dout(10) << __func__ << " invalid lockbox entity '" + << lockbox_str << "'" << dendl; + ss << "invalid lockbox key entity '" << lockbox_str << "'"; + return -EINVAL; + } + + if (!mon.key_server.contains(cephx_entity) && + !mon.key_server.contains(lockbox_entity)) { + return -ENOENT; + } + + return 0; +} + +int AuthMonitor::do_osd_destroy( + const EntityName& cephx_entity, + const EntityName& lockbox_entity) +{ + ceph_assert(paxos.is_plugged()); + + dout(10) << __func__ << " cephx " << cephx_entity + << " lockbox " << lockbox_entity << dendl; + + bool removed = false; + + int err = remove_entity(cephx_entity); + if (err == -ENOENT) { + dout(10) << __func__ << " " << cephx_entity << " does not exist" << dendl; + } else { + removed = true; + } + + err = remove_entity(lockbox_entity); + if (err == -ENOENT) { + dout(10) << __func__ << " " << lockbox_entity << " does not exist" << dendl; + } else { + removed = true; + } + + if (!removed) { + dout(10) << __func__ << " entities do not exist -- no-op." << dendl; + return 0; + } + + // given we have paxos plugged, this will not result in a proposal + // being triggered, but it will still be needed so that we get our + // pending state encoded into the paxos' pending transaction. + propose_pending(); + return 0; +} + +int _create_auth( + EntityAuth& auth, + const string& key, + const map<string,bufferlist>& caps) +{ + if (key.empty()) + return -EINVAL; + try { + auth.key.decode_base64(key); + } catch (ceph::buffer::error& e) { + return -EINVAL; + } + auth.caps = caps; + return 0; +} + +int AuthMonitor::validate_osd_new( + int32_t id, + const uuid_d& uuid, + const string& cephx_secret, + const string& lockbox_secret, + auth_entity_t& cephx_entity, + auth_entity_t& lockbox_entity, + stringstream& ss) +{ + + dout(10) << __func__ << " osd." << id << " uuid " << uuid << dendl; + + map<string,bufferlist> cephx_caps = { + { "osd", _encode_cap("allow *") }, + { "mon", _encode_cap("allow profile osd") }, + { "mgr", _encode_cap("allow profile osd") } + }; + map<string,bufferlist> lockbox_caps = { + { "mon", _encode_cap("allow command \"config-key get\" " + "with key=\"dm-crypt/osd/" + + stringify(uuid) + + "/luks\"") } + }; + + bool has_lockbox = !lockbox_secret.empty(); + + string cephx_name = "osd." + stringify(id); + string lockbox_name = "client.osd-lockbox." + stringify(uuid); + + if (!cephx_entity.name.from_str(cephx_name)) { + dout(10) << __func__ << " invalid cephx entity '" + << cephx_name << "'" << dendl; + ss << "invalid cephx key entity '" << cephx_name << "'"; + return -EINVAL; + } + + if (has_lockbox) { + if (!lockbox_entity.name.from_str(lockbox_name)) { + dout(10) << __func__ << " invalid cephx lockbox entity '" + << lockbox_name << "'" << dendl; + ss << "invalid cephx lockbox entity '" << lockbox_name << "'"; + return -EINVAL; + } + } + + if (entity_is_pending(cephx_entity.name) || + (has_lockbox && entity_is_pending(lockbox_entity.name))) { + // If we have pending entities for either the cephx secret or the + // lockbox secret, then our safest bet is to retry the command at + // a later time. 
These entities may be pending because an `osd new` + // command has been run (which is unlikely, due to the nature of + // the operation, which will force a paxos proposal), or (more likely) + // because a competing client created those entities before we handled + // the `osd new` command. Regardless, let's wait and see. + return -EAGAIN; + } + + if (!is_valid_cephx_key(cephx_secret)) { + ss << "invalid cephx secret."; + return -EINVAL; + } + + if (has_lockbox && !is_valid_cephx_key(lockbox_secret)) { + ss << "invalid cephx lockbox secret."; + return -EINVAL; + } + + int err = _create_auth(cephx_entity.auth, cephx_secret, cephx_caps); + ceph_assert(0 == err); + + bool cephx_is_idempotent = false, lockbox_is_idempotent = false; + err = exists_and_matches_entity(cephx_entity, true, ss); + + if (err != -ENOENT) { + if (err < 0) { + return err; + } + ceph_assert(0 == err); + cephx_is_idempotent = true; + } + + if (has_lockbox) { + err = _create_auth(lockbox_entity.auth, lockbox_secret, lockbox_caps); + ceph_assert(err == 0); + err = exists_and_matches_entity(lockbox_entity, true, ss); + if (err != -ENOENT) { + if (err < 0) { + return err; + } + ceph_assert(0 == err); + lockbox_is_idempotent = true; + } + } + + if (cephx_is_idempotent && (!has_lockbox || lockbox_is_idempotent)) { + return EEXIST; + } + + return 0; +} + +int AuthMonitor::do_osd_new( + const auth_entity_t& cephx_entity, + const auth_entity_t& lockbox_entity, + bool has_lockbox) +{ + ceph_assert(paxos.is_plugged()); + + dout(10) << __func__ << " cephx " << cephx_entity.name + << " lockbox "; + if (has_lockbox) { + *_dout << lockbox_entity.name; + } else { + *_dout << "n/a"; + } + *_dout << dendl; + + // we must have validated before reaching this point. + // if keys exist, then this means they also match; otherwise we would + // have failed before calling this function. + bool cephx_exists = mon.key_server.contains(cephx_entity.name); + + if (!cephx_exists) { + int err = add_entity(cephx_entity.name, cephx_entity.auth); + ceph_assert(0 == err); + } + + if (has_lockbox && + !mon.key_server.contains(lockbox_entity.name)) { + int err = add_entity(lockbox_entity.name, lockbox_entity.auth); + ceph_assert(0 == err); + } + + // given we have paxos plugged, this will not result in a proposal + // being triggered, but it will still be needed so that we get our + // pending state encoded into the paxos' pending transaction. 
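// The calling pattern assumed by the paxos-plugged comment above, roughly
// (a sketch; method names as used by the osd-creation path):
//
//   paxos.plug();               // queue updates instead of proposing
//   authmon->do_osd_new(...);   // stages increments, calls propose_pending()
//   paxos.unplug();             // staged state rides the next proposal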
+ propose_pending(); + return 0; +} + +bool AuthMonitor::valid_caps( + const string& type, + const string& caps, + ostream *out) +{ + if (type == "mon") { + MonCap moncap; + if (!moncap.parse(caps, out)) { + return false; + } + return true; + } + + if (!g_conf().get_val<bool>("mon_auth_validate_all_caps")) { + return true; + } + + if (type == "mgr") { + MgrCap mgrcap; + if (!mgrcap.parse(caps, out)) { + return false; + } + } else if (type == "osd") { + OSDCap ocap; + if (!ocap.parse(caps, out)) { + return false; + } + } else if (type == "mds") { + MDSAuthCaps mdscap; + if (!mdscap.parse(g_ceph_context, caps, out)) { + return false; + } + } else { + if (out) { + *out << "unknown cap type '" << type << "'"; + } + return false; + } + return true; +} + +bool AuthMonitor::valid_caps(const vector<string>& caps, ostream *out) +{ + for (vector<string>::const_iterator p = caps.begin(); + p != caps.end(); p += 2) { + if ((p+1) == caps.end()) { + *out << "cap '" << *p << "' has no value"; + return false; + } + if (!valid_caps(*p, *(p+1), out)) { + return false; + } + } + return true; +} + +bool AuthMonitor::prepare_command(MonOpRequestRef op) +{ + auto m = op->get_req<MMonCommand>(); + stringstream ss, ds; + bufferlist rdata; + string rs; + int err = -EINVAL; + + cmdmap_t cmdmap; + if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) { + // ss has reason for failure + string rs = ss.str(); + mon.reply_command(op, -EINVAL, rs, rdata, get_last_committed()); + return true; + } + + string prefix; + vector<string>caps_vec; + string entity_name; + EntityName entity; + + cmd_getval(cmdmap, "prefix", prefix); + + string format; + cmd_getval(cmdmap, "format", format, string("plain")); + boost::scoped_ptr<Formatter> f(Formatter::create(format)); + + MonSession *session = op->get_session(); + if (!session) { + mon.reply_command(op, -EACCES, "access denied", rdata, get_last_committed()); + return true; + } + + cmd_getval(cmdmap, "caps", caps_vec); + // fs authorize command's can have odd number of caps arguments + if ((prefix != "fs authorize") && (caps_vec.size() % 2) != 0) { + ss << "bad capabilities request; odd number of arguments"; + err = -EINVAL; + goto done; + } + + cmd_getval(cmdmap, "entity", entity_name); + if (!entity_name.empty() && !entity.from_str(entity_name)) { + ss << "bad entity name"; + err = -EINVAL; + goto done; + } + + if (prefix == "auth import") { + bufferlist bl = m->get_data(); + if (bl.length() == 0) { + ss << "auth import: no data supplied"; + getline(ss, rs); + mon.reply_command(op, -EINVAL, rs, get_last_committed()); + return true; + } + auto iter = bl.cbegin(); + KeyRing keyring; + try { + decode(keyring, iter); + } catch (const ceph::buffer::error &ex) { + ss << "error decoding keyring" << " " << ex.what(); + err = -EINVAL; + goto done; + } + err = import_keyring(keyring); + if (err < 0) { + ss << "auth import: no caps supplied"; + getline(ss, rs); + mon.reply_command(op, -EINVAL, rs, get_last_committed()); + return true; + } + ss << "imported keyring"; + getline(ss, rs); + err = 0; + wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs, + get_last_committed() + 1)); + return true; + } else if (prefix == "auth add" && !entity_name.empty()) { + /* expected behavior: + * - if command reproduces current state, return 0. + * - if command adds brand new entity, handle it. + * - if command adds new state to existing entity, return error. 
+ */ + KeyServerData::Incremental auth_inc; + auth_inc.name = entity; + bufferlist bl = m->get_data(); + bool has_keyring = (bl.length() > 0); + map<string,bufferlist> new_caps; + + KeyRing new_keyring; + if (has_keyring) { + auto iter = bl.cbegin(); + try { + decode(new_keyring, iter); + } catch (const ceph::buffer::error &ex) { + ss << "error decoding keyring"; + err = -EINVAL; + goto done; + } + } + + if (!valid_caps(caps_vec, &ss)) { + err = -EINVAL; + goto done; + } + + // are we about to have it? + if (entity_is_pending(entity)) { + wait_for_finished_proposal(op, + new Monitor::C_Command(mon, op, 0, rs, get_last_committed() + 1)); + return true; + } + + // build new caps from provided arguments (if available) + for (vector<string>::iterator it = caps_vec.begin(); + it != caps_vec.end() && (it + 1) != caps_vec.end(); + it += 2) { + string sys = *it; + bufferlist cap; + encode(*(it+1), cap); + new_caps[sys] = cap; + } + + // pull info out of provided keyring + EntityAuth new_inc; + if (has_keyring) { + if (!new_keyring.get_auth(auth_inc.name, new_inc)) { + ss << "key for " << auth_inc.name + << " not found in provided keyring"; + err = -EINVAL; + goto done; + } + if (!new_caps.empty() && !new_inc.caps.empty()) { + ss << "caps cannot be specified both in keyring and in command"; + err = -EINVAL; + goto done; + } + if (new_caps.empty()) { + new_caps = new_inc.caps; + } + } + + err = exists_and_matches_entity(auth_inc.name, new_inc, + new_caps, has_keyring, ss); + // if entity/key/caps do not exist in the keyring, just fall through + // and add the entity; otherwise, make sure everything matches (in + // which case it's a no-op), because if not we must fail. + if (err != -ENOENT) { + if (err < 0) { + goto done; + } + // no-op. + ceph_assert(err == 0); + goto done; + } + err = 0; + + // okay, add it. + if (!has_keyring) { + dout(10) << "AuthMonitor::prepare_command generating random key for " + << auth_inc.name << dendl; + new_inc.key.create(g_ceph_context, CEPH_CRYPTO_AES); + } + new_inc.caps = new_caps; + + err = add_entity(auth_inc.name, new_inc); + ceph_assert(err == 0); + + ss << "added key for " << auth_inc.name; + getline(ss, rs); + wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs, + get_last_committed() + 1)); + return true; + } else if ((prefix == "auth get-or-create-key" || + prefix == "auth get-or-create") && + !entity_name.empty()) { + // auth get-or-create <name> [mon osdcapa osd osdcapb ...] + + if (!valid_caps(caps_vec, &ss)) { + err = -EINVAL; + goto done; + } + + // Parse the list of caps into a map + std::map<std::string, bufferlist> wanted_caps; + for (vector<string>::const_iterator it = caps_vec.begin(); + it != caps_vec.end() && (it + 1) != caps_vec.end(); + it += 2) { + const std::string &sys = *it; + bufferlist cap; + encode(*(it+1), cap); + wanted_caps[sys] = cap; + } + + // do we have it? 
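// Example of the cap parsing just above (illustrative): the invocation
//
//   ceph auth get-or-create client.foo mon 'allow r' osd 'allow rw'
//
// arrives as caps_vec = {"mon", "allow r", "osd", "allow rw"} and becomes
// wanted_caps = { "mon" -> encode("allow r"), "osd" -> encode("allow rw") }.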
+ EntityAuth entity_auth; + if (mon.key_server.get_auth(entity, entity_auth)) { + for (const auto &sys_cap : wanted_caps) { + if (entity_auth.caps.count(sys_cap.first) == 0 || + !entity_auth.caps[sys_cap.first].contents_equal(sys_cap.second)) { + ss << "key for " << entity << " exists but cap " << sys_cap.first + << " does not match"; + err = -EINVAL; + goto done; + } + } + + if (prefix == "auth get-or-create-key") { + if (f) { + entity_auth.key.encode_formatted("auth", f.get(), rdata); + } else { + ds << entity_auth.key; + } + } else { + KeyRing kr; + kr.add(entity, entity_auth.key); + if (f) { + kr.set_caps(entity, entity_auth.caps); + kr.encode_formatted("auth", f.get(), rdata); + } else { + kr.encode_plaintext(rdata); + } + } + err = 0; + goto done; + } + + // ...or are we about to? + for (vector<Incremental>::iterator p = pending_auth.begin(); + p != pending_auth.end(); + ++p) { + if (p->inc_type == AUTH_DATA) { + KeyServerData::Incremental auth_inc; + auto q = p->auth_data.cbegin(); + decode(auth_inc, q); + if (auth_inc.op == KeyServerData::AUTH_INC_ADD && + auth_inc.name == entity) { + wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs, + get_last_committed() + 1)); + return true; + } + } + } + + // create it + KeyServerData::Incremental auth_inc; + auth_inc.op = KeyServerData::AUTH_INC_ADD; + auth_inc.name = entity; + auth_inc.auth.key.create(g_ceph_context, CEPH_CRYPTO_AES); + auth_inc.auth.caps = wanted_caps; + + push_cephx_inc(auth_inc); + + if (prefix == "auth get-or-create-key") { + if (f) { + auth_inc.auth.key.encode_formatted("auth", f.get(), rdata); + } else { + ds << auth_inc.auth.key; + } + } else { + KeyRing kr; + kr.add(entity, auth_inc.auth.key); + if (f) { + kr.set_caps(entity, wanted_caps); + kr.encode_formatted("auth", f.get(), rdata); + } else { + kr.encode_plaintext(rdata); + } + } + + rdata.append(ds); + getline(ss, rs); + wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs, rdata, + get_last_committed() + 1)); + return true; + } else if (prefix == "fs authorize") { + string filesystem; + cmd_getval(cmdmap, "filesystem", filesystem); + string mon_cap_string = "allow r"; + string mds_cap_string, osd_cap_string; + string osd_cap_wanted = "r"; + + std::shared_ptr<const Filesystem> fs; + if (filesystem != "*" && filesystem != "all") { + fs = mon.mdsmon()->get_fsmap().get_filesystem(filesystem); + if (fs == nullptr) { + ss << "filesystem " << filesystem << " does not exist."; + err = -EINVAL; + goto done; + } else { + mon_cap_string += " fsname=" + std::string(fs->mds_map.get_fs_name()); + } + } + + for (auto it = caps_vec.begin(); + it != caps_vec.end() && (it + 1) != caps_vec.end(); + it += 2) { + const string &path = *it; + const string &cap = *(it+1); + bool root_squash = false; + if ((it + 2) != caps_vec.end() && *(it+2) == "root_squash") { + root_squash = true; + ++it; + } + + if (cap != "r" && cap.compare(0, 2, "rw")) { + ss << "Permission flags must start with 'r' or 'rw'."; + err = -EINVAL; + goto done; + } + if (cap.compare(0, 2, "rw") == 0) + osd_cap_wanted = "rw"; + + char last='\0'; + for (size_t i = 2; i < cap.size(); ++i) { + char c = cap.at(i); + if (last >= c) { + ss << "Permission flags (except 'rw') must be specified in alphabetical order."; + err = -EINVAL; + goto done; + } + switch (c) { + case 'p': + break; + case 's': + break; + default: + ss << "Unknown permission flag '" << c << "'."; + err = -EINVAL; + goto done; + } + } + + mds_cap_string += mds_cap_string.empty() ? 
"" : ", "; + mds_cap_string += "allow " + cap; + + if (filesystem != "*" && filesystem != "all" && fs != nullptr) { + mds_cap_string += " fsname=" + std::string(fs->mds_map.get_fs_name()); + } + + if (path != "/") { + mds_cap_string += " path=" + path; + } + + if (root_squash) { + mds_cap_string += " root_squash"; + } + } + + osd_cap_string += osd_cap_string.empty() ? "" : ", "; + osd_cap_string += "allow " + osd_cap_wanted + + " tag " + pg_pool_t::APPLICATION_NAME_CEPHFS + + " data=" + filesystem; + + std::map<string, bufferlist> wanted_caps = { + { "mon", _encode_cap(mon_cap_string) }, + { "osd", _encode_cap(osd_cap_string) }, + { "mds", _encode_cap(mds_cap_string) } + }; + + if (!valid_caps("mon", mon_cap_string, &ss) || + !valid_caps("osd", osd_cap_string, &ss) || + !valid_caps("mds", mds_cap_string, &ss)) { + err = -EINVAL; + goto done; + } + + EntityAuth entity_auth; + if (mon.key_server.get_auth(entity, entity_auth)) { + for (const auto &sys_cap : wanted_caps) { + if (entity_auth.caps.count(sys_cap.first) == 0 || + !entity_auth.caps[sys_cap.first].contents_equal(sys_cap.second)) { + ss << entity << " already has fs capabilities that differ from " + << "those supplied. To generate a new auth key for " << entity + << ", first remove " << entity << " from configuration files, " + << "execute 'ceph auth rm " << entity << "', then execute this " + << "command again."; + err = -EINVAL; + goto done; + } + } + + KeyRing kr; + kr.add(entity, entity_auth.key); + if (f) { + kr.set_caps(entity, entity_auth.caps); + kr.encode_formatted("auth", f.get(), rdata); + } else { + kr.encode_plaintext(rdata); + } + err = 0; + goto done; + } + + KeyServerData::Incremental auth_inc; + auth_inc.op = KeyServerData::AUTH_INC_ADD; + auth_inc.name = entity; + auth_inc.auth.key.create(g_ceph_context, CEPH_CRYPTO_AES); + auth_inc.auth.caps = wanted_caps; + + push_cephx_inc(auth_inc); + KeyRing kr; + kr.add(entity, auth_inc.auth.key); + if (f) { + kr.set_caps(entity, wanted_caps); + kr.encode_formatted("auth", f.get(), rdata); + } else { + kr.encode_plaintext(rdata); + } + + rdata.append(ds); + getline(ss, rs); + wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs, rdata, + get_last_committed() + 1)); + return true; + } else if (prefix == "auth caps" && !entity_name.empty()) { + KeyServerData::Incremental auth_inc; + auth_inc.name = entity; + if (!mon.key_server.get_auth(auth_inc.name, auth_inc.auth)) { + ss << "couldn't find entry " << auth_inc.name; + err = -ENOENT; + goto done; + } + + if (!valid_caps(caps_vec, &ss)) { + err = -EINVAL; + goto done; + } + + map<string,bufferlist> newcaps; + for (vector<string>::iterator it = caps_vec.begin(); + it != caps_vec.end(); it += 2) + encode(*(it+1), newcaps[*it]); + + auth_inc.op = KeyServerData::AUTH_INC_ADD; + auth_inc.auth.caps = newcaps; + push_cephx_inc(auth_inc); + + ss << "updated caps for " << auth_inc.name; + getline(ss, rs); + wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs, + get_last_committed() + 1)); + return true; + } else if ((prefix == "auth del" || prefix == "auth rm") && + !entity_name.empty()) { + KeyServerData::Incremental auth_inc; + auth_inc.name = entity; + if (!mon.key_server.contains(auth_inc.name)) { + ss << "entity " << entity << " does not exist"; + err = 0; + goto done; + } + auth_inc.op = KeyServerData::AUTH_INC_DEL; + push_cephx_inc(auth_inc); + + ss << "updated"; + getline(ss, rs); + wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs, + get_last_committed() + 1)); + return true; + 
} +done: + rdata.append(ds); + getline(ss, rs, '\0'); + mon.reply_command(op, err, rs, rdata, get_last_committed()); + return false; +} + +bool AuthMonitor::prepare_global_id(MonOpRequestRef op) +{ + dout(10) << "AuthMonitor::prepare_global_id" << dendl; + increase_max_global_id(); + + return true; +} + +bool AuthMonitor::_upgrade_format_to_dumpling() +{ + dout(1) << __func__ << " upgrading from format 0 to 1" << dendl; + ceph_assert(format_version == 0); + + bool changed = false; + map<EntityName, EntityAuth>::iterator p; + for (p = mon.key_server.secrets_begin(); + p != mon.key_server.secrets_end(); + ++p) { + // grab mon caps, if any + string mon_caps; + if (p->second.caps.count("mon") == 0) + continue; + try { + auto it = p->second.caps["mon"].cbegin(); + decode(mon_caps, it); + } + catch (const ceph::buffer::error&) { + dout(10) << __func__ << " unable to parse mon cap for " + << p->first << dendl; + continue; + } + + string n = p->first.to_str(); + string new_caps; + + // set daemon profiles + if ((p->first.is_osd() || p->first.is_mds()) && + mon_caps == "allow rwx") { + new_caps = string("allow profile ") + std::string(p->first.get_type_name()); + } + + // update bootstrap keys + if (n == "client.bootstrap-osd") { + new_caps = "allow profile bootstrap-osd"; + } + if (n == "client.bootstrap-mds") { + new_caps = "allow profile bootstrap-mds"; + } + + if (new_caps.length() > 0) { + dout(5) << __func__ << " updating " << p->first << " mon cap from " + << mon_caps << " to " << new_caps << dendl; + + bufferlist bl; + encode(new_caps, bl); + + KeyServerData::Incremental auth_inc; + auth_inc.name = p->first; + auth_inc.auth = p->second; + auth_inc.auth.caps["mon"] = bl; + auth_inc.op = KeyServerData::AUTH_INC_ADD; + push_cephx_inc(auth_inc); + changed = true; + } + } + return changed; +} + +bool AuthMonitor::_upgrade_format_to_luminous() +{ + dout(1) << __func__ << " upgrading from format 1 to 2" << dendl; + ceph_assert(format_version == 1); + + bool changed = false; + map<EntityName, EntityAuth>::iterator p; + for (p = mon.key_server.secrets_begin(); + p != mon.key_server.secrets_end(); + ++p) { + string n = p->first.to_str(); + + string newcap; + if (n == "client.admin") { + // admin gets it all + newcap = "allow *"; + } else if (n.find("osd.") == 0 || + n.find("mds.") == 0 || + n.find("mon.") == 0) { + // daemons follow their profile + string type = n.substr(0, 3); + newcap = "allow profile " + type; + } else if (p->second.caps.count("mon")) { + // if there are any mon caps, give them 'r' mgr caps + newcap = "allow r"; + } + + if (newcap.length() > 0) { + dout(5) << " giving " << n << " mgr '" << newcap << "'" << dendl; + bufferlist bl; + encode(newcap, bl); + + EntityAuth auth = p->second; + auth.caps["mgr"] = bl; + + add_entity(p->first, auth); + changed = true; + } + + if (n.find("mgr.") == 0 && + p->second.caps.count("mon")) { + // the kraken ceph-mgr@.service set the mon cap to 'allow *'. 
+      auto blp = p->second.caps["mon"].cbegin();
+      string oldcaps;
+      decode(oldcaps, blp);
+      if (oldcaps == "allow *") {
+        dout(5) << " fixing " << n << " mon cap to 'allow profile mgr'"
+                << dendl;
+        bufferlist bl;
+        encode("allow profile mgr", bl);
+
+        EntityAuth auth = p->second;
+        auth.caps["mon"] = bl;
+        add_entity(p->first, auth);
+        changed = true;
+      }
+    }
+  }
+
+  // add bootstrap key if it does not already exist
+  // (might have already been get-or-create'd by
+  //  ceph-create-keys)
+  EntityName bootstrap_mgr_name;
+  int r = bootstrap_mgr_name.from_str("client.bootstrap-mgr");
+  ceph_assert(r);
+  if (!mon.key_server.contains(bootstrap_mgr_name)) {
+
+    EntityName name = bootstrap_mgr_name;
+    EntityAuth auth;
+    encode("allow profile bootstrap-mgr", auth.caps["mon"]);
+    auth.key.create(g_ceph_context, CEPH_CRYPTO_AES);
+    add_entity(name, auth);
+    changed = true;
+  }
+  return changed;
+}
+
+bool AuthMonitor::_upgrade_format_to_mimic()
+{
+  dout(1) << __func__ << " upgrading from format 2 to 3" << dendl;
+  ceph_assert(format_version == 2);
+
+  list<pair<EntityName,EntityAuth> > auth_lst;
+  _generate_bootstrap_keys(&auth_lst);
+
+  bool changed = false;
+  for (auto &p : auth_lst) {
+    if (mon.key_server.contains(p.first)) {
+      continue;
+    }
+    int err = add_entity(p.first, p.second);
+    ceph_assert(err == 0);
+    changed = true;
+  }
+
+  return changed;
+}
+
+void AuthMonitor::upgrade_format()
+{
+  constexpr unsigned int FORMAT_NONE = 0;
+  constexpr unsigned int FORMAT_DUMPLING = 1;
+  constexpr unsigned int FORMAT_LUMINOUS = 2;
+  constexpr unsigned int FORMAT_MIMIC = 3;
+
+  // when upgrading from the current format to a new format, ensure that
+  // the new format doesn't break the older format. I.e., if a given format N
+  // changes or adds something, ensure that when upgrading from N-1 to N+1, we
+  // still observe the changes for format N if those have not been superseded
+  // by N+1.
+
+  unsigned int current = FORMAT_MIMIC;
+  if (!mon.get_quorum_mon_features().contains_all(
+        ceph::features::mon::FEATURE_LUMINOUS)) {
+    // pre-luminous quorum
+    current = FORMAT_DUMPLING;
+  } else if (!mon.get_quorum_mon_features().contains_all(
+        ceph::features::mon::FEATURE_MIMIC)) {
+    // pre-mimic quorum
+    current = FORMAT_LUMINOUS;
+  }
+  if (format_version >= current) {
+    dout(20) << __func__ << " format " << format_version
+             << " is current" << dendl;
+    return;
+  }
+
+  // perform a rolling upgrade of the new format, if necessary.
+  // i.e., if we are moving from format NONE to MIMIC, we will first upgrade
+  // to DUMPLING, then to LUMINOUS, and finally to MIMIC, in several different
+  // proposals.
+
+  bool changed = false;
+  if (format_version == FORMAT_NONE) {
+    changed = _upgrade_format_to_dumpling();
+  } else if (format_version == FORMAT_DUMPLING) {
+    changed = _upgrade_format_to_luminous();
+  } else if (format_version == FORMAT_LUMINOUS) {
+    changed = _upgrade_format_to_mimic();
+  }
+
+  if (changed) {
+    // note new format
+    dout(10) << __func__ << " proposing update from format " << format_version
+             << " -> " << current << dendl;
+    format_version = current;
+    propose_pending();
+  }
+}
+
+void AuthMonitor::dump_info(Formatter *f)
+{
+  /*** WARNING: do not include any privileged information here!
***/ + f->open_object_section("auth"); + f->dump_unsigned("first_committed", get_first_committed()); + f->dump_unsigned("last_committed", get_last_committed()); + f->dump_unsigned("num_secrets", mon.key_server.get_num_secrets()); + f->close_section(); +} diff --git a/src/mon/AuthMonitor.h b/src/mon/AuthMonitor.h new file mode 100644 index 000000000..4312b5607 --- /dev/null +++ b/src/mon/AuthMonitor.h @@ -0,0 +1,237 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_AUTHMONITOR_H +#define CEPH_AUTHMONITOR_H + +#include <map> +#include <set> + +#include "global/global_init.h" +#include "include/ceph_features.h" +#include "include/types.h" +#include "mon/PaxosService.h" +#include "mon/MonitorDBStore.h" + +class MAuth; +class KeyRing; +class Monitor; + +#define MIN_GLOBAL_ID 0x1000 + +class AuthMonitor : public PaxosService { +public: + enum IncType { + GLOBAL_ID, + AUTH_DATA, + }; + struct Incremental { + IncType inc_type; + uint64_t max_global_id; + uint32_t auth_type; + ceph::buffer::list auth_data; + + Incremental() : inc_type(GLOBAL_ID), max_global_id(0), auth_type(0) {} + + void encode(ceph::buffer::list& bl, uint64_t features=-1) const { + using ceph::encode; + ENCODE_START(2, 2, bl); + __u32 _type = (__u32)inc_type; + encode(_type, bl); + if (_type == GLOBAL_ID) { + encode(max_global_id, bl); + } else { + encode(auth_type, bl); + encode(auth_data, bl); + } + ENCODE_FINISH(bl); + } + void decode(ceph::buffer::list::const_iterator& bl) { + DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl); + __u32 _type; + decode(_type, bl); + inc_type = (IncType)_type; + ceph_assert(inc_type >= GLOBAL_ID && inc_type <= AUTH_DATA); + if (_type == GLOBAL_ID) { + decode(max_global_id, bl); + } else { + decode(auth_type, bl); + decode(auth_data, bl); + } + DECODE_FINISH(bl); + } + void dump(ceph::Formatter *f) const { + f->dump_int("type", inc_type); + f->dump_int("max_global_id", max_global_id); + f->dump_int("auth_type", auth_type); + f->dump_int("auth_data_len", auth_data.length()); + } + static void generate_test_instances(std::list<Incremental*>& ls) { + ls.push_back(new Incremental); + ls.push_back(new Incremental); + ls.back()->inc_type = GLOBAL_ID; + ls.back()->max_global_id = 1234; + ls.push_back(new Incremental); + ls.back()->inc_type = AUTH_DATA; + ls.back()->auth_type = 12; + ls.back()->auth_data.append("foo"); + } + }; + + struct auth_entity_t { + EntityName name; + EntityAuth auth; + }; + + +private: + std::vector<Incremental> pending_auth; + uint64_t max_global_id; + uint64_t last_allocated_id; + + // these are protected by mon->auth_lock + int mon_num = 0, mon_rank = 0; + + bool _upgrade_format_to_dumpling(); + bool _upgrade_format_to_luminous(); + bool _upgrade_format_to_mimic(); + void upgrade_format() override; + + void export_keyring(KeyRing& keyring); + int import_keyring(KeyRing& keyring); + + void push_cephx_inc(KeyServerData::Incremental& auth_inc) { + Incremental inc; + inc.inc_type = AUTH_DATA; + encode(auth_inc, inc.auth_data); + inc.auth_type = CEPH_AUTH_CEPHX; + pending_auth.push_back(inc); + } + + /* validate mon/osd/mds caps; fail on unrecognized service/type */ + bool valid_caps(const 
std::string& type, const std::string& caps, std::ostream *out); + bool valid_caps(const std::string& type, const ceph::buffer::list& bl, std::ostream *out) { + auto p = bl.begin(); + std::string v; + try { + using ceph::decode; + decode(v, p); + } catch (ceph::buffer::error& e) { + *out << "corrupt capability encoding"; + return false; + } + return valid_caps(type, v, out); + } + bool valid_caps(const std::vector<std::string>& caps, std::ostream *out); + + void on_active() override; + bool should_propose(double& delay) override; + void get_initial_keyring(KeyRing *keyring); + void create_initial_keys(KeyRing *keyring); + void create_initial() override; + void update_from_paxos(bool *need_bootstrap) override; + void create_pending() override; // prepare a new pending + bool prepare_global_id(MonOpRequestRef op); + bool _should_increase_max_global_id(); ///< called under mon->auth_lock + void increase_max_global_id(); + uint64_t assign_global_id(bool should_increase_max); +public: + uint64_t _assign_global_id(); ///< called under mon->auth_lock + void _set_mon_num_rank(int num, int rank); ///< called under mon->auth_lock + +private: + // propose pending update to peers + void encode_pending(MonitorDBStore::TransactionRef t) override; + void encode_full(MonitorDBStore::TransactionRef t) override; + version_t get_trim_to() const override; + + bool preprocess_query(MonOpRequestRef op) override; // true if processed. + bool prepare_update(MonOpRequestRef op) override; + + bool prep_auth(MonOpRequestRef op, bool paxos_writable); + + bool preprocess_command(MonOpRequestRef op); + bool prepare_command(MonOpRequestRef op); + + bool check_rotate(); + + bool entity_is_pending(EntityName& entity); + int exists_and_matches_entity( + const auth_entity_t& entity, + bool has_secret, + std::stringstream& ss); + int exists_and_matches_entity( + const EntityName& name, + const EntityAuth& auth, + const std::map<std::string,ceph::buffer::list>& caps, + bool has_secret, + std::stringstream& ss); + int remove_entity(const EntityName &entity); + int add_entity( + const EntityName& name, + const EntityAuth& auth); + + public: + AuthMonitor(Monitor &mn, Paxos &p, const std::string& service_name) + : PaxosService(mn, p, service_name), + max_global_id(0), + last_allocated_id(0) + {} + + void pre_auth(MAuth *m); + + void tick() override; // check state, take actions + + int validate_osd_destroy( + int32_t id, + const uuid_d& uuid, + EntityName& cephx_entity, + EntityName& lockbox_entity, + std::stringstream& ss); + int do_osd_destroy( + const EntityName& cephx_entity, + const EntityName& lockbox_entity); + + int do_osd_new( + const auth_entity_t& cephx_entity, + const auth_entity_t& lockbox_entity, + bool has_lockbox); + int validate_osd_new( + int32_t id, + const uuid_d& uuid, + const std::string& cephx_secret, + const std::string& lockbox_secret, + auth_entity_t& cephx_entity, + auth_entity_t& lockbox_entity, + std::stringstream& ss); + + void dump_info(ceph::Formatter *f); + + bool is_valid_cephx_key(const std::string& k) { + if (k.empty()) + return false; + + EntityAuth ea; + try { + ea.key.decode_base64(k); + return true; + } catch (ceph::buffer::error& e) { /* fallthrough */ } + return false; + } +}; + + +WRITE_CLASS_ENCODER_FEATURES(AuthMonitor::Incremental) + +#endif diff --git a/src/mon/CMakeLists.txt b/src/mon/CMakeLists.txt new file mode 100644 index 000000000..b4056fdb1 --- /dev/null +++ b/src/mon/CMakeLists.txt @@ -0,0 +1,42 @@ +set(lib_mon_srcs + ${CMAKE_SOURCE_DIR}/src/auth/cephx/CephxKeyServer.cc + 
${CMAKE_SOURCE_DIR}/src/auth/cephx/CephxServiceHandler.cc + ${CMAKE_SOURCE_DIR}/src/auth/AuthServiceHandler.cc + Paxos.cc + PaxosService.cc + OSDMonitor.cc + MDSMonitor.cc + CommandHandler.cc + FSCommands.cc + MgrMonitor.cc + MgrStatMonitor.cc + Monitor.cc + MonmapMonitor.cc + LogMonitor.cc + AuthMonitor.cc + ConfigMap.cc + ConfigMonitor.cc + Elector.cc + ElectionLogic.cc + ConnectionTracker.cc + HealthMonitor.cc + KVMonitor.cc + ../mds/MDSAuthCaps.cc + ../mgr/mgr_commands.cc + ../osd/OSDCap.cc + $<TARGET_OBJECTS:mgr_cap_obj>) + +if(HAVE_GSSAPI) + list(APPEND lib_mon_srcs + ${CMAKE_SOURCE_DIR}/src/auth/krb/KrbServiceHandler.cpp) +endif() + +add_library(mon STATIC + ${lib_mon_srcs}) +target_link_libraries(mon + kv + heap_profiler + fmt::fmt) +if(WITH_JAEGER) + target_link_libraries(mon jaeger-base) +endif() diff --git a/src/mon/CommandHandler.cc b/src/mon/CommandHandler.cc new file mode 100644 index 000000000..903d35927 --- /dev/null +++ b/src/mon/CommandHandler.cc @@ -0,0 +1,43 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2019 Red Hat Ltd + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "CommandHandler.h" + +#include "common/strtol.h" +#include "include/ceph_assert.h" + +#include <ostream> +#include <string> +#include <string_view> + +int CommandHandler::parse_bool(std::string_view str, bool* result, std::ostream& ss) +{ + ceph_assert(result != nullptr); + + std::string interr; + int64_t n = strict_strtoll(str.data(), 10, &interr); + + if (str == "false" || str == "no" + || (interr.length() == 0 && n == 0)) { + *result = false; + return 0; + } else if (str == "true" || str == "yes" + || (interr.length() == 0 && n == 1)) { + *result = true; + return 0; + } else { + ss << "value must be false|no|0 or true|yes|1"; + return -EINVAL; + } +} diff --git a/src/mon/CommandHandler.h b/src/mon/CommandHandler.h new file mode 100644 index 000000000..167b4587f --- /dev/null +++ b/src/mon/CommandHandler.h @@ -0,0 +1,35 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2019 Red Hat Ltd + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#ifndef COMMAND_HANDLER_H_ +#define COMMAND_HANDLER_H_ + +#include <ostream> +#include <string_view> + +class CommandHandler +{ +public: + /** + * Parse a true|yes|1 style boolean string from `str`. + * `result` must be non-null. + * `ss` will be populated with an error message on error.
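+   * Illustrative usage (hypothetical caller, shown for clarity only):
+   *   bool b;
+   *   std::ostringstream ss;
+   *   parse_bool("yes", &b, ss);  // returns 0, b == true
+   *   parse_bool("2", &b, ss);    // returns -EINVAL, ss holds the message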
+ * + * @return 0 on success, else -EINVAL + */ + int parse_bool(std::string_view str, bool* result, std::ostream& ss); +}; + +#endif diff --git a/src/mon/ConfigMap.cc b/src/mon/ConfigMap.cc new file mode 100644 index 000000000..763b8ce9b --- /dev/null +++ b/src/mon/ConfigMap.cc @@ -0,0 +1,291 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <boost/algorithm/string/split.hpp> + +#include "ConfigMap.h" +#include "crush/CrushWrapper.h" +#include "common/entity_name.h" + +using namespace std::literals; + +using std::cerr; +using std::cout; +using std::dec; +using std::hex; +using std::list; +using std::map; +using std::make_pair; +using std::ostream; +using std::ostringstream; +using std::pair; +using std::set; +using std::setfill; +using std::string; +using std::stringstream; +using std::to_string; +using std::vector; +using std::unique_ptr; + +using ceph::bufferlist; +using ceph::decode; +using ceph::encode; +using ceph::Formatter; +using ceph::JSONFormatter; +using ceph::mono_clock; +using ceph::mono_time; +using ceph::timespan_str; + +int MaskedOption::get_precision(const CrushWrapper *crush) +{ + // 0 = most precise + if (mask.location_type.size()) { + int r = crush->get_type_id(mask.location_type); + if (r >= 0) { + return r; + } + // bad type name, ignore it + } + int num_types = crush->get_num_type_names(); + if (mask.device_class.size()) { + return num_types; + } + return num_types + 1; +} + +void OptionMask::dump(Formatter *f) const +{ + if (location_type.size()) { + f->dump_string("location_type", location_type); + f->dump_string("location_value", location_value); + } + if (device_class.size()) { + f->dump_string("device_class", device_class); + } +} + +void MaskedOption::dump(Formatter *f) const +{ + f->dump_string("name", opt->name); + f->dump_string("value", raw_value); + f->dump_string("level", Option::level_to_str(opt->level)); + f->dump_bool("can_update_at_runtime", opt->can_update_at_runtime()); + f->dump_string("mask", mask.to_str()); + mask.dump(f); +} + +ostream& operator<<(ostream& out, const MaskedOption& o) +{ + out << o.opt->name; + if (o.mask.location_type.size()) { + out << "@" << o.mask.location_type << '=' << o.mask.location_value; + } + if (o.mask.device_class.size()) { + out << "@class=" << o.mask.device_class; + } + return out; +} + +// ---------- + +void Section::dump(Formatter *f) const +{ + for (auto& i : options) { + f->dump_object(i.first.c_str(), i.second); + } +} + +std::string Section::get_minimal_conf() const +{ + std::string r; + for (auto& i : options) { + if (i.second.opt->has_flag(Option::FLAG_NO_MON_UPDATE) || + i.second.opt->has_flag(Option::FLAG_MINIMAL_CONF)) { + if (i.second.mask.empty()) { + r += "\t"s + i.first + " = " + i.second.raw_value + "\n"; + } else { + r += "\t# masked option excluded: " + i.first + " = " + + i.second.raw_value + "\n"; + } + } + } + return r; +} + + +// ------------ + +void ConfigMap::dump(Formatter *f) const +{ + f->dump_object("global", global); + f->open_object_section("by_type"); + for (auto& i : by_type) { + f->dump_object(i.first.c_str(), i.second); + } + f->close_section(); + f->open_object_section("by_id"); + for (auto& i : by_id) { + f->dump_object(i.first.c_str(), i.second); + } + f->close_section(); +} + +std::map<std::string,std::string,std::less<>> +ConfigMap::generate_entity_map( + const EntityName& name, + const map<std::string,std::string>& crush_location, + const CrushWrapper *crush, + const std::string& device_class, + 
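+    // (src, the trailing out-parameter below, is optional: for each
+    //  returned option it records which section and masked entry won;
+    //  the "config get" handler uses it to report provenance)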
std::map<std::string,pair<std::string,const MaskedOption*>> *src) +{ + // global, then by type, then by name prefix component(s), then name. + // name prefix components are .-separated, + // e.g. client.a.b.c -> [global, client, client.a, client.a.b, client.a.b.c] + vector<pair<string,Section*>> sections = { make_pair("global", &global) }; + auto p = by_type.find(name.get_type_name()); + if (p != by_type.end()) { + sections.emplace_back(name.get_type_name(), &p->second); + } + vector<std::string> name_bits; + boost::split(name_bits, name.to_str(), [](char c){ return c == '.'; }); + std::string tname; + for (unsigned p = 0; p < name_bits.size(); ++p) { + if (p) { + tname += '.'; + } + tname += name_bits[p]; + auto q = by_id.find(tname); + if (q != by_id.end()) { + sections.push_back(make_pair(tname, &q->second)); + } + } + std::map<std::string,std::string,std::less<>> out; + MaskedOption *prev = nullptr; + for (auto s : sections) { + for (auto& i : s.second->options) { + auto& o = i.second; + // match against crush location, class + if (o.mask.device_class.size() && + o.mask.device_class != device_class) { + continue; + } + if (o.mask.location_type.size()) { + auto p = crush_location.find(o.mask.location_type); + if (p == crush_location.end() || + p->second != o.mask.location_value) { + continue; + } + } + if (prev && prev->opt->name != i.first) { + prev = nullptr; + } + if (prev && + prev->get_precision(crush) < o.get_precision(crush)) { + continue; + } + out[i.first] = o.raw_value; + if (src) { + (*src)[i.first] = make_pair(s.first, &o); + } + prev = &o; + } + } + return out; +} + +bool ConfigMap::parse_mask( + const std::string& who, + std::string *section, + OptionMask *mask) +{ + vector<std::string> split; + boost::split(split, who, [](char c){ return c == '/'; }); + for (unsigned j = 0; j < split.size(); ++j) { + auto& i = split[j]; + if (i == "global") { + *section = "global"; + continue; + } + size_t delim = i.find(':'); + if (delim != std::string::npos) { + string k = i.substr(0, delim); + if (k == "class") { + mask->device_class = i.substr(delim + 1); + } else { + mask->location_type = k; + mask->location_value = i.substr(delim + 1); + } + continue; + } + string type, id; + auto dotpos = i.find('.'); + if (dotpos != std::string::npos) { + type = i.substr(0, dotpos); + id = i.substr(dotpos + 1); + } else { + type = i; + } + if (EntityName::str_to_ceph_entity_type(type) == CEPH_ENTITY_TYPE_ANY) { + return false; + } + *section = i; + } + return true; +} + +void ConfigMap::parse_key( + const std::string& key, + std::string *name, + std::string *who) +{ + auto last_slash = key.rfind('/'); + if (last_slash == std::string::npos) { + *name = key; + } else if (auto mgrpos = key.find("/mgr/"); mgrpos != std::string::npos) { + *name = key.substr(mgrpos + 1); + *who = key.substr(0, mgrpos); + } else { + *name = key.substr(last_slash + 1); + *who = key.substr(0, last_slash); + } +} + + +// -------------- + +void ConfigChangeSet::dump(Formatter *f) const +{ + f->dump_int("version", version); + f->dump_stream("timestamp") << stamp; + f->dump_string("name", name); + f->open_array_section("changes"); + for (auto& i : diff) { + f->open_object_section("change"); + f->dump_string("name", i.first); + if (i.second.first) { + f->dump_string("previous_value", *i.second.first); + } + if (i.second.second) { + f->dump_string("new_value", *i.second.second); + } + f->close_section(); + } + f->close_section(); +} + +void ConfigChangeSet::print(ostream& out) const +{ + out << "--- " << version << " --- " << 
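+  // illustrative output shape:
+  //   --- <version> --- <stamp> [--- <name>] ---
+  //   - <key> = <old value>
+  //   + <key> = <new value>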
stamp; + if (name.size()) { + out << " --- " << name; + } + out << " ---\n"; + for (auto& i : diff) { + if (i.second.first) { + out << "- " << i.first << " = " << *i.second.first << "\n"; + } + if (i.second.second) { + out << "+ " << i.first << " = " << *i.second.second << "\n"; + } + } +} diff --git a/src/mon/ConfigMap.h b/src/mon/ConfigMap.h new file mode 100644 index 000000000..2ecdcc071 --- /dev/null +++ b/src/mon/ConfigMap.h @@ -0,0 +1,153 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <map> +#include <ostream> +#include <string> + +#include "include/utime.h" +#include "common/options.h" +#include "common/entity_name.h" + +class CrushWrapper; + +// the precedence is thus: +// +// global +// crush location (coarse to fine, ordered by type id) +// daemon type (e.g., osd) +// device class (osd only) +// crush location (coarse to fine, ordered by type id) +// daemon name (e.g., mds.foo) +// +// Note that this means that if we have +// +// config/host:foo/a = 1 +// config/osd/rack:foo/a = 2 +// +// then we get a = 2. The osd-level config wins, even though rack +// is less precise than host, because the crush limiters are only +// resolved within a section (global, per-daemon, per-instance). + +struct OptionMask { + std::string location_type, location_value; ///< matches crush_location + std::string device_class; ///< matches device class + + bool empty() const { + return location_type.size() == 0 + && location_value.size() == 0 + && device_class.size() == 0; + } + + std::string to_str() const { + std::string r; + if (location_type.size()) { + r += location_type + ":" + location_value; + } + if (device_class.size()) { + if (r.size()) { + r += "/"; + } + r += "class:" + device_class; + } + return r; + } + void dump(ceph::Formatter *f) const; +}; + +struct MaskedOption { + std::string raw_value; ///< raw, unparsed, unvalidated value + const Option *opt; ///< the option + OptionMask mask; + std::unique_ptr<const Option> unknown_opt; ///< if fabricated for an unknown option + + MaskedOption(const Option *o, bool fab=false) : opt(o) { + if (fab) { + unknown_opt.reset(o); + } + } + MaskedOption(MaskedOption&& o) { + raw_value = std::move(o.raw_value); + opt = o.opt; + mask = std::move(o.mask); + unknown_opt = std::move(o.unknown_opt); + } + const MaskedOption& operator=(const MaskedOption& o) = delete; + const MaskedOption& operator=(MaskedOption&& o) = delete; + + /// return a precision metric (smaller is more precise) + int get_precision(const CrushWrapper *crush); + + friend std::ostream& operator<<(std::ostream& out, const MaskedOption& o); + + void dump(ceph::Formatter *f) const; +}; + +struct Section { + std::multimap<std::string,MaskedOption> options; + + void clear() { + options.clear(); + } + void dump(ceph::Formatter *f) const; + std::string get_minimal_conf() const; +}; + +struct ConfigMap { + Section global; + std::map<std::string,Section, std::less<>> by_type; + std::map<std::string,Section, std::less<>> by_id; + std::list<std::unique_ptr<Option>> stray_options; + + Section *find_section(const std::string& name) { + if (name == "global") { + return &global; + } + auto i = by_type.find(name); + if (i != by_type.end()) { + return &i->second; + } + i = by_id.find(name); + if (i != by_id.end()) { + return &i->second; + } + return nullptr; + } + void clear() { + global.clear(); + by_type.clear(); + by_id.clear(); + stray_options.clear(); + } + void dump(ceph::Formatter *f) const; + 
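+  /// Compute the effective option->value map for one entity.
+  /// Illustrative call (hypothetical names and values):
+  ///   std::map<std::string,std::string> loc = {{"rack", "r1"}};
+  ///   auto eff = cm.generate_entity_map(name, loc, crush, "ssd");
+  /// Among options with the same name, the most precise matching mask
+  /// wins (see get_precision() and the precedence note above).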
std::map<std::string,std::string,std::less<>> generate_entity_map( + const EntityName& name, + const std::map<std::string,std::string>& crush_location, + const CrushWrapper *crush, + const std::string& device_class, + std::map<std::string,std::pair<std::string,const MaskedOption*>> *src=0); + + void parse_key( + const std::string& key, + std::string *name, + std::string *who); + static bool parse_mask( + const std::string& in, + std::string *section, + OptionMask *mask); +}; + + +struct ConfigChangeSet { + version_t version; + utime_t stamp; + std::string name; + + // key -> (old value, new value) + std::map<std::string,std::pair<boost::optional<std::string>,boost::optional<std::string>>> diff; + + void dump(ceph::Formatter *f) const; + void print(std::ostream& out) const; +}; diff --git a/src/mon/ConfigMonitor.cc b/src/mon/ConfigMonitor.cc new file mode 100644 index 000000000..c82a8417a --- /dev/null +++ b/src/mon/ConfigMonitor.cc @@ -0,0 +1,1028 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <boost/algorithm/string/predicate.hpp> + +#include "mon/Monitor.h" +#include "mon/ConfigMonitor.h" +#include "mon/KVMonitor.h" +#include "mon/MgrMonitor.h" +#include "mon/OSDMonitor.h" +#include "messages/MConfig.h" +#include "messages/MGetConfig.h" +#include "messages/MMonCommand.h" +#include "common/Formatter.h" +#include "common/TextTable.h" +#include "common/cmdparse.h" +#include "include/stringify.h" + +#define dout_subsys ceph_subsys_mon +#undef dout_prefix +#define dout_prefix _prefix(_dout, mon, this) +using namespace TOPNSPC::common; + +using namespace std::literals; + +using std::cerr; +using std::cout; +using std::dec; +using std::hex; +using std::list; +using std::map; +using std::make_pair; +using std::ostream; +using std::ostringstream; +using std::pair; +using std::set; +using std::setfill; +using std::string; +using std::stringstream; +using std::to_string; +using std::vector; +using std::unique_ptr; + +using ceph::bufferlist; +using ceph::decode; +using ceph::encode; +using ceph::Formatter; +using ceph::JSONFormatter; +using ceph::mono_clock; +using ceph::mono_time; +using ceph::timespan_str; +static ostream& _prefix(std::ostream *_dout, const Monitor &mon, + const ConfigMonitor *hmon) { + return *_dout << "mon." << mon.name << "@" << mon.rank + << "(" << mon.get_state_name() << ").config "; +} + +const string KEY_PREFIX("config/"); +const string HISTORY_PREFIX("config-history/"); + +ConfigMonitor::ConfigMonitor(Monitor &m, Paxos &p, const string& service_name) + : PaxosService(m, p, service_name) { +} + +void ConfigMonitor::init() +{ + dout(10) << __func__ << dendl; +} + +void ConfigMonitor::create_initial() +{ + dout(10) << __func__ << dendl; + version = 0; + pending.clear(); +} + +void ConfigMonitor::update_from_paxos(bool *need_bootstrap) +{ + if (version == get_last_committed()) { + return; + } + version = get_last_committed(); + dout(10) << __func__ << " " << version << dendl; + load_config(); + check_all_subs(); +} + +void ConfigMonitor::create_pending() +{ + dout(10) << " " << version << dendl; + pending.clear(); + pending_description.clear(); +} + +void ConfigMonitor::encode_pending(MonitorDBStore::TransactionRef t) +{ + dout(10) << " " << (version+1) << dendl; + put_last_committed(t, version+1); + // NOTE: caller should have done encode_pending_to_kvmon() and + // kvmon->propose_pending() to commit the actual config changes. 
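+  // (see prepare_command() below: it plugs paxos, has the KV monitor
+  // propose the enqueued config keys, unplugs, and then forces an
+  // immediate proposal here so that both services advance together)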
+} + +void ConfigMonitor::encode_pending_to_kvmon() +{ + // we need to pass our data through KVMonitor so that it is properly + // versioned and shared with subscribers. + for (auto& [key, value] : pending_cleanup) { + if (pending.count(key) == 0) { + derr << __func__ << " repair: adjusting config key '" << key << "'" + << dendl; + pending[key] = value; + } + } + pending_cleanup.clear(); + + // TODO: record changed sections (osd, mds.foo, rack:bar, ...) + + string history = HISTORY_PREFIX + stringify(version+1) + "/"; + { + bufferlist metabl; + ::encode(ceph_clock_now(), metabl); + ::encode(pending_description, metabl); + mon.kvmon()->enqueue_set(history, metabl); + } + for (auto& p : pending) { + string key = KEY_PREFIX + p.first; + auto q = current.find(p.first); + if (q != current.end()) { + if (p.second && *p.second == q->second) { + continue; + } + mon.kvmon()->enqueue_set(history + "-" + p.first, q->second); + } else if (!p.second) { + continue; + } + if (p.second) { + dout(20) << __func__ << " set " << key << dendl; + mon.kvmon()->enqueue_set(key, *p.second); + mon.kvmon()->enqueue_set(history + "+" + p.first, *p.second); + } else { + dout(20) << __func__ << " rm " << key << dendl; + mon.kvmon()->enqueue_rm(key); + } + } +} + +version_t ConfigMonitor::get_trim_to() const +{ + // we don't actually need *any* old states, but keep a few. + if (version > 5) { + return version - 5; + } + return 0; +} + +bool ConfigMonitor::preprocess_query(MonOpRequestRef op) +{ + switch (op->get_req()->get_type()) { + case MSG_MON_COMMAND: + try { + return preprocess_command(op); + } catch (const bad_cmd_get& e) { + bufferlist bl; + mon.reply_command(op, -EINVAL, e.what(), bl, get_last_committed()); + return true; + } + } + return false; +} + +static string indent_who(const string& who) +{ + if (who == "global") { + return who; + } + if (who.find('.') == string::npos) { + return " " + who; + } + return " " + who; +} + +bool ConfigMonitor::preprocess_command(MonOpRequestRef op) +{ + auto m = op->get_req<MMonCommand>(); + std::stringstream ss; + int err = 0; + + cmdmap_t cmdmap; + if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) { + string rs = ss.str(); + mon.reply_command(op, -EINVAL, rs, get_last_committed()); + return true; + } + string format; + cmd_getval(cmdmap, "format", format, string("plain")); + boost::scoped_ptr<Formatter> f(Formatter::create(format)); + + string prefix; + cmd_getval(cmdmap, "prefix", prefix); + + bufferlist odata; + if (prefix == "config help") { + stringstream ss; + string name; + cmd_getval(cmdmap, "key", name); + name = ConfFile::normalize_key_name(name); + const Option *opt = g_conf().find_option(name); + if (!opt) { + opt = mon.mgrmon()->find_module_option(name); + } + if (opt) { + if (f) { + f->dump_object("option", *opt); + } else { + opt->print(&ss); + } + } else { + ss << "configuration option '" << name << "' not recognized"; + err = -ENOENT; + goto reply; + } + if (f) { + f->flush(odata); + } else { + odata.append(ss.str()); + } + } else if (prefix == "config ls") { + ostringstream ss; + if (f) { + f->open_array_section("options"); + } + for (auto& i : ceph_options) { + if (f) { + f->dump_string("option", i.name); + } else { + ss << i.name << "\n"; + } + } + for (auto& i : mon.mgrmon()->get_mgr_module_options()) { + if (f) { + f->dump_string("option", i.first); + } else { + ss << i.first << "\n"; + } + } + if (f) { + f->close_section(); + f->flush(odata); + } else { + odata.append(ss.str()); + } + } else if (prefix == "config dump") { + list<pair<string,Section*>> 
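+    // assembled in display order: global first, then each daemon type
+    // followed by the per-id sections sharing its prefix (osd, osd.0, ...)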
sections = { + make_pair("global", &config_map.global) + }; + for (string type : { "mon", "mgr", "osd", "mds", "client" }) { + auto i = config_map.by_type.find(type); + if (i != config_map.by_type.end()) { + sections.push_back(make_pair(i->first, &i->second)); + } + auto j = config_map.by_id.lower_bound(type); + while (j != config_map.by_id.end() && + j->first.find(type) == 0) { + sections.push_back(make_pair(j->first, &j->second)); + ++j; + } + } + TextTable tbl; + if (!f) { + tbl.define_column("WHO", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("MASK", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("LEVEL", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("OPTION", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("VALUE", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("RO", TextTable::LEFT, TextTable::LEFT); + } else { + f->open_array_section("config"); + } + for (auto s : sections) { + for (auto& i : s.second->options) { + if (!f) { + tbl << indent_who(s.first); + tbl << i.second.mask.to_str(); + tbl << Option::level_to_str(i.second.opt->level); + tbl << i.first; + tbl << i.second.raw_value; + tbl << (i.second.opt->can_update_at_runtime() ? "" : "*"); + tbl << TextTable::endrow; + } else { + f->open_object_section("option"); + f->dump_string("section", s.first); + i.second.dump(f.get()); + f->close_section(); + } + } + } + if (!f) { + odata.append(stringify(tbl)); + } else { + f->close_section(); + f->flush(odata); + } + } else if (prefix == "config get") { + string who, name; + cmd_getval(cmdmap, "who", who); + + EntityName entity; + if (!entity.from_str(who) && + !entity.from_str(who + ".")) { + ss << "unrecognized entity '" << who << "'"; + err = -EINVAL; + goto reply; + } + + map<string,string> crush_location; + string device_class; + if (entity.is_osd()) { + mon.osdmon()->osdmap.crush->get_full_location(who, &crush_location); + int id = atoi(entity.get_id().c_str()); + const char *c = mon.osdmon()->osdmap.crush->get_item_class(id); + if (c) { + device_class = c; + } + dout(10) << __func__ << " crush_location " << crush_location + << " class " << device_class << dendl; + } + + std::map<std::string,pair<std::string,const MaskedOption*>> src; + auto config = config_map.generate_entity_map( + entity, + crush_location, + mon.osdmon()->osdmap.crush.get(), + device_class, + &src); + + if (cmd_getval(cmdmap, "key", name)) { + name = ConfFile::normalize_key_name(name); + const Option *opt = g_conf().find_option(name); + if (!opt) { + opt = mon.mgrmon()->find_module_option(name); + } + if (!opt) { + ss << "unrecognized key '" << name << "'"; + err = -ENOENT; + goto reply; + } + if (opt->has_flag(Option::FLAG_NO_MON_UPDATE)) { + // handle special options + if (name == "fsid") { + odata.append(stringify(mon.monmap->get_fsid())); + odata.append("\n"); + goto reply; + } + err = -EINVAL; + ss << name << " is special and cannot be stored by the mon"; + goto reply; + } + // get a single value + auto p = config.find(name); + if (p != config.end()) { + odata.append(p->second); + odata.append("\n"); + goto reply; + } + if (!entity.is_client() && + !boost::get<boost::blank>(&opt->daemon_value)) { + odata.append(Option::to_str(opt->daemon_value)); + } else { + odata.append(Option::to_str(opt->value)); + } + odata.append("\n"); + } else { + // dump all (non-default) values for this entity + TextTable tbl; + if (!f) { + tbl.define_column("WHO", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("MASK", TextTable::LEFT, TextTable::LEFT); + 
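+      // the RO column defined below prints '*' for options that cannot
+      // take effect at runtime (can_update_at_runtime() == false)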
tbl.define_column("LEVEL", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("OPTION", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("VALUE", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("RO", TextTable::LEFT, TextTable::LEFT); + } else { + f->open_object_section("config"); + } + auto p = config.begin(); + auto q = src.begin(); + for (; p != config.end(); ++p, ++q) { + if (name.size() && p->first != name) { + continue; + } + if (!f) { + tbl << q->second.first; + tbl << q->second.second->mask.to_str(); + tbl << Option::level_to_str(q->second.second->opt->level); + tbl << p->first; + tbl << p->second; + tbl << (q->second.second->opt->can_update_at_runtime() ? "" : "*"); + tbl << TextTable::endrow; + } else { + f->open_object_section(p->first.c_str()); + f->dump_string("value", p->second); + f->dump_string("section", q->second.first); + f->dump_object("mask", q->second.second->mask); + f->dump_bool("can_update_at_runtime", + q->second.second->opt->can_update_at_runtime()); + f->close_section(); + } + } + if (!f) { + odata.append(stringify(tbl)); + } else { + f->close_section(); + f->flush(odata); + } + } + } else if (prefix == "config log") { + int64_t num = 10; + cmd_getval(cmdmap, "num", num); + ostringstream ds; + if (f) { + f->open_array_section("changesets"); + } + for (version_t v = version; v > version - std::min(version, (version_t)num); --v) { + ConfigChangeSet ch; + load_changeset(v, &ch); + if (f) { + f->dump_object("changeset", ch); + } else { + ch.print(ds); + } + } + if (f) { + f->close_section(); + f->flush(odata); + } else { + odata.append(ds.str()); + } + } else if (prefix == "config generate-minimal-conf") { + ostringstream conf; + conf << "# minimal ceph.conf for " << mon.monmap->get_fsid() << "\n"; + + // the basics + conf << "[global]\n"; + conf << "\tfsid = " << mon.monmap->get_fsid() << "\n"; + conf << "\tmon_host = "; + for (auto i = mon.monmap->mon_info.begin(); + i != mon.monmap->mon_info.end(); + ++i) { + if (i != mon.monmap->mon_info.begin()) { + conf << " "; + } + if (i->second.public_addrs.size() == 1 && + i->second.public_addrs.front().is_legacy() && + i->second.public_addrs.front().get_port() == CEPH_MON_PORT_LEGACY) { + // if this is a legacy addr on the legacy default port, then + // use the legacy-compatible formatting so that old clients + // can use this config. new code will see the :6789 and correctly + // interpret this as a v1 address. 
conf << i->second.public_addrs.get_legacy_str(); + } else { + conf << i->second.public_addrs; + } + } + conf << "\n"; + conf << config_map.global.get_minimal_conf(); + for (auto m : { &config_map.by_type, &config_map.by_id }) { + for (auto& i : *m) { + auto s = i.second.get_minimal_conf(); + if (s.size()) { + conf << "\n[" << i.first << "]\n" << s; + } + } + } + odata.append(conf.str()); + err = 0; + } else { + return false; + } + + reply: + mon.reply_command(op, err, ss.str(), odata, get_last_committed()); + return true; +} + +void ConfigMonitor::handle_get_config(MonOpRequestRef op) +{ + auto m = op->get_req<MGetConfig>(); + dout(10) << __func__ << " " << m->name << " host " << m->host << dendl; + + const OSDMap& osdmap = mon.osdmon()->osdmap; + map<string,string> crush_location; + osdmap.crush->get_full_location(m->host, &crush_location); + auto out = config_map.generate_entity_map( + m->name, + crush_location, + osdmap.crush.get(), + m->device_class); + dout(20) << " config is " << out << dendl; + m->get_connection()->send_message(new MConfig{std::move(out)}); +} + +bool ConfigMonitor::prepare_update(MonOpRequestRef op) +{ + Message *m = op->get_req(); + dout(7) << "prepare_update " << *m + << " from " << m->get_orig_source_inst() << dendl; + switch (m->get_type()) { + case MSG_MON_COMMAND: + try { + return prepare_command(op); + } catch (const bad_cmd_get& e) { + bufferlist bl; + mon.reply_command(op, -EINVAL, e.what(), bl, get_last_committed()); + return true; + } + } + return false; +} + +bool ConfigMonitor::prepare_command(MonOpRequestRef op) +{ + auto m = op->get_req<MMonCommand>(); + std::stringstream ss; + int err = -EINVAL; + + // make sure kv is writeable. + if (!mon.kvmon()->is_writeable()) { + dout(10) << __func__ << " waiting for kv mon to be writeable" << dendl; + mon.kvmon()->wait_for_writeable(op, new C_RetryMessage(this, op)); + return false; + } + + cmdmap_t cmdmap; + if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) { + string rs = ss.str(); + mon.reply_command(op, -EINVAL, rs, get_last_committed()); + return true; + } + + string prefix; + cmd_getval(cmdmap, "prefix", prefix); + bufferlist odata; + + if (prefix == "config set" || + prefix == "config rm") { + string who; + string name, value; + bool force = false; + cmd_getval(cmdmap, "who", who); + cmd_getval(cmdmap, "name", name); + cmd_getval(cmdmap, "value", value); + cmd_getval(cmdmap, "force", force); + name = ConfFile::normalize_key_name(name); + + if (prefix == "config set" && !force) { + const Option *opt = g_conf().find_option(name); + if (!opt) { + opt = mon.mgrmon()->find_module_option(name); + } + if (!opt) { + ss << "unrecognized config option '" << name << "'"; + err = -EINVAL; + goto reply; + } + + Option::value_t real_value; + string errstr; + err = opt->parse_value(value, &real_value, &errstr, &value); + if (err < 0) { + ss << "error parsing value: " << errstr; + goto reply; + } + + if (opt->has_flag(Option::FLAG_NO_MON_UPDATE)) { + err = -EINVAL; + ss << name << " is special and cannot be stored by the mon"; + goto reply; + } + } + + string section; + OptionMask mask; + if (!ConfigMap::parse_mask(who, &section, &mask)) { + ss << "unrecognized config target '" << who << "'"; + err = -EINVAL; + goto reply; + } + + string key; + if (section.size()) { + key += section + "/"; + } else { + key += "global/"; + } + string mask_str = mask.to_str(); + if (mask_str.size()) { + key += mask_str + "/"; + } + key += name; + + if (prefix == "config set") { + bufferlist bl; + bl.append(value); + pending[key] = bl; + } else {
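+      // "config rm": a none value marks the key for deletion; it becomes
+      // an enqueue_rm() when the pending map is encoded to the KV monitor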
+ pending[key] = boost::none; + } + goto update; + } else if (prefix == "config reset") { + int64_t revert_to = -1; + cmd_getval(cmdmap, "num", revert_to); + if (revert_to < 0 || + revert_to > (int64_t)version) { + err = -EINVAL; + ss << "must specify a valid historical version to revert to; " + << "see 'ceph config log' for a list of available configuration " + << "historical versions"; + goto reply; + } + if (revert_to == (int64_t)version) { + err = 0; + goto reply; + } + for (int64_t v = version; v > revert_to; --v) { + ConfigChangeSet ch; + load_changeset(v, &ch); + for (auto& i : ch.diff) { + if (i.second.first) { + bufferlist bl; + bl.append(*i.second.first); + pending[i.first] = bl; + } else if (i.second.second) { + pending[i.first] = boost::none; + } + } + } + pending_description = string("reset to ") + stringify(revert_to); + goto update; + } else if (prefix == "config assimilate-conf") { + ConfFile cf; + bufferlist bl = m->get_data(); + err = cf.parse_bufferlist(&bl, &ss); + if (err < 0) { + goto reply; + } + bool updated = false; + ostringstream newconf; + for (auto& [section, s] : cf) { + dout(20) << __func__ << " [" << section << "]" << dendl; + bool did_section = false; + for (auto& [key, val] : s) { + Option::value_t real_value; + string value; + string errstr; + if (key.empty()) { + continue; + } + // a known and worthy option? + const Option *o = g_conf().find_option(key); + if (!o) { + o = mon.mgrmon()->find_module_option(key); + } + if (!o || + (o->flags & Option::FLAG_NO_MON_UPDATE) || + (o->flags & Option::FLAG_CLUSTER_CREATE)) { + goto skip; + } + // normalize + err = o->parse_value(val, &real_value, &errstr, &value); + if (err < 0) { + dout(20) << __func__ << " failed to parse " << key << " = '" + << val << "'" << dendl; + goto skip; + } + // does it conflict with an existing value?
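+	// (if the monitor already stores a different value, the stored one
+	// wins and the file's line is echoed back in the residual conf)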
+ { + const Section *s = config_map.find_section(section); + if (s) { + auto k = s->options.find(key); + if (k != s->options.end()) { + if (value != k->second.raw_value) { + dout(20) << __func__ << " have " << key + << " = " << k->second.raw_value + << " (not " << value << ")" << dendl; + goto skip; + } + dout(20) << __func__ << " already have " << key + << " = " << k->second.raw_value << dendl; + continue; + } + } + } + dout(20) << __func__ << " add " << key << " = " << value + << " (" << val << ")" << dendl; + { + bufferlist bl; + bl.append(value); + pending[section + "/" + key] = bl; + updated = true; + } + continue; + + skip: + dout(20) << __func__ << " skip " << key << " = " << value + << " (" << val << ")" << dendl; + if (!did_section) { + newconf << "\n[" << section << "]\n"; + did_section = true; + } + newconf << "\t" << key << " = " << val << "\n"; + } + } + odata.append(newconf.str()); + if (updated) { + goto update; + } + } else { + ss << "unknown command " << prefix; + err = -EINVAL; + } + +reply: + mon.reply_command(op, err, ss.str(), odata, get_last_committed()); + return false; + +update: + // see if there is an actual change + auto p = pending.begin(); + while (p != pending.end()) { + auto q = current.find(p->first); + if (p->second && q != current.end() && *p->second == q->second) { + // set to same value + p = pending.erase(p); + } else if (!p->second && q == current.end()) { + // erasing non-existent value + p = pending.erase(p); + } else { + ++p; + } + } + if (pending.empty()) { + err = 0; + goto reply; + } + // immediately propose *with* KV mon + encode_pending_to_kvmon(); + paxos.plug(); + mon.kvmon()->propose_pending(); + paxos.unplug(); + force_immediate_propose(); + wait_for_finished_proposal( + op, + new Monitor::C_Command( + mon, op, 0, ss.str(), odata, + get_last_committed() + 1)); + return true; +} + +void ConfigMonitor::tick() +{ + if (!is_active() || !mon.is_leader()) { + return; + } + dout(10) << __func__ << dendl; + bool changed = false; + if (!pending_cleanup.empty()) { + changed = true; + } + if (changed && mon.kvmon()->is_writeable()) { + paxos.plug(); + encode_pending_to_kvmon(); + mon.kvmon()->propose_pending(); + paxos.unplug(); + propose_pending(); + } +} + +void ConfigMonitor::on_active() +{ +} + +void ConfigMonitor::load_config() +{ + std::map<std::string,std::string> renamed_pacific = { + { "mon_osd_blacklist_default_expire", "mon_osd_blocklist_default_expire" }, + { "mon_mds_blacklist_interval", "mon_mds_blocklist_interval" }, + { "mon_mgr_blacklist_interval", "mon_mgr_blocklist_interval" }, + { "rbd_blacklist_on_break_lock", "rbd_blocklist_on_break_lock" }, + { "rbd_blacklist_expire_seconds", "rbd_blocklist_expire_seconds" }, + { "mds_session_blacklist_on_timeout", "mds_session_blocklist_on_timeout" }, + { "mds_session_blacklist_on_evict", "mds_session_blocklist_on_evict" }, + }; + + unsigned num = 0; + KeyValueDB::Iterator it = mon.store->get_iterator(KV_PREFIX); + it->lower_bound(KEY_PREFIX); + config_map.clear(); + current.clear(); + pending_cleanup.clear(); + while (it->valid() && + it->key().compare(0, KEY_PREFIX.size(), KEY_PREFIX) == 0) { + string key = it->key().substr(KEY_PREFIX.size()); + string value = it->value().to_str(); + + current[key] = it->value(); + + string name; + string who; + config_map.parse_key(key, &name, &who); + + // has this option been renamed? 
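+    // e.g. a stored mon_osd_blacklist_default_expire is read back as
+    // mon_osd_blocklist_default_expire; once every mon runs pacific the
+    // old key is queued for rewrite under the new name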
+ { + auto p = renamed_pacific.find(name); + if (p != renamed_pacific.end()) { + if (mon.monmap->min_mon_release >= ceph_release_t::pacific) { + // schedule a cleanup + pending_cleanup[key] = boost::none; + pending_cleanup[who + "/" + p->second] = it->value(); + } + // continue loading under the new name + name = p->second; + } + } + + const Option *opt = g_conf().find_option(name); + if (!opt) { + opt = mon.mgrmon()->find_module_option(name); + } + if (!opt) { + dout(10) << __func__ << " unrecognized option '" << name << "'" << dendl; + config_map.stray_options.push_back( + std::unique_ptr<Option>( + new Option(name, Option::TYPE_STR, Option::LEVEL_UNKNOWN))); + opt = config_map.stray_options.back().get(); + } + + string err; + int r = opt->pre_validate(&value, &err); + if (r < 0) { + dout(10) << __func__ << " pre-validate failed on '" << name << "' = '" + << value << "'" << dendl; + } + + MaskedOption mopt(opt); + mopt.raw_value = value; + string section_name; + if (who.size() && + !ConfigMap::parse_mask(who, &section_name, &mopt.mask)) { + derr << __func__ << " invalid mask for key " << key << dendl; + pending_cleanup[key] = boost::none; + } else if (opt->has_flag(Option::FLAG_NO_MON_UPDATE)) { + dout(10) << __func__ << " NO_MON_UPDATE option '" + << name << "' = '" << value << "'" + << dendl; + pending_cleanup[key] = boost::none; + } else { + if (section_name.empty()) { + // we prefer global/$option instead of just $option + derr << __func__ << " adding global/ prefix to key '" << key << "'" + << dendl; + pending_cleanup[key] = boost::none; + pending_cleanup["global/"s + key] = it->value(); + } + Section *section = &config_map.global; + if (section_name.size() && section_name != "global") { + if (section_name.find('.') != std::string::npos) { + section = &config_map.by_id[section_name]; + } else { + section = &config_map.by_type[section_name]; + } + } + section->options.insert(make_pair(name, std::move(mopt))); + ++num; + } + it->next(); + } + dout(10) << __func__ << " got " << num << " keys" << dendl; + + // refresh our own config + { + const OSDMap& osdmap = mon.osdmon()->osdmap; + map<string,string> crush_location; + osdmap.crush->get_full_location(g_conf()->host, &crush_location); + auto out = config_map.generate_entity_map( + g_conf()->name, + crush_location, + osdmap.crush.get(), + string{}); // no device class + g_conf().set_mon_vals(g_ceph_context, out, nullptr); + } +} + +void ConfigMonitor::load_changeset(version_t v, ConfigChangeSet *ch) +{ + ch->version = v; + string prefix = HISTORY_PREFIX + stringify(v) + "/"; + KeyValueDB::Iterator it = mon.store->get_iterator(KV_PREFIX); + it->lower_bound(prefix); + while (it->valid() && it->key().find(prefix) == 0) { + if (it->key() == prefix) { + bufferlist bl = it->value(); + auto p = bl.cbegin(); + try { + decode(ch->stamp, p); + decode(ch->name, p); + } + catch (ceph::buffer::error& e) { + derr << __func__ << " failure decoding changeset " << v << dendl; + } + } else { + char op = it->key()[prefix.length()]; + string key = it->key().substr(prefix.length() + 1); + if (op == '-') { + ch->diff[key].first = it->value().to_str(); + } else if (op == '+') { + ch->diff[key].second = it->value().to_str(); + } + } + it->next(); + } +} + +bool ConfigMonitor::refresh_config(MonSession *s) +{ + const OSDMap& osdmap = mon.osdmon()->osdmap; + map<string,string> crush_location; + if (s->remote_host.size()) { + osdmap.crush->get_full_location(s->remote_host, &crush_location); + dout(10) << __func__ << " crush_location for 
remote_host " << s->remote_host + << " is " << crush_location << dendl; + } + + string device_class; + if (s->name.is_osd()) { + const char *c = osdmap.crush->get_item_class(s->name.num()); + if (c) { + device_class = c; + dout(10) << __func__ << " device_class " << device_class << dendl; + } + } + + dout(20) << __func__ << " " << s->entity_name << " crush " << crush_location + << " device_class " << device_class << dendl; + auto out = config_map.generate_entity_map( + s->entity_name, + crush_location, + osdmap.crush.get(), + device_class); + + if (out == s->last_config && s->any_config) { + dout(20) << __func__ << " no change, " << out << dendl; + return false; + } + // removing this to hide sensitive data going into logs + // leaving this for debugging purposes + // dout(20) << __func__ << " " << out << dendl; + s->last_config = std::move(out); + s->any_config = true; + return true; +} + +bool ConfigMonitor::maybe_send_config(MonSession *s) +{ + bool changed = refresh_config(s); + dout(10) << __func__ << " to " << s->name << " " + << (changed ? "(changed)" : "(unchanged)") + << dendl; + if (changed) { + send_config(s); + } + return changed; +} + +void ConfigMonitor::send_config(MonSession *s) +{ + dout(10) << __func__ << " to " << s->name << dendl; + auto m = new MConfig(s->last_config); + s->con->send_message(m); +} + +void ConfigMonitor::check_sub(MonSession *s) +{ + if (!s->authenticated) { + dout(20) << __func__ << " not authenticated " << s->entity_name << dendl; + return; + } + auto p = s->sub_map.find("config"); + if (p != s->sub_map.end()) { + check_sub(p->second); + } +} + +void ConfigMonitor::check_sub(Subscription *sub) +{ + dout(10) << __func__ + << " next " << sub->next + << " have " << version << dendl; + if (sub->next <= version) { + maybe_send_config(sub->session); + if (sub->onetime) { + mon.with_session_map([sub](MonSessionMap& session_map) { + session_map.remove_sub(sub); + }); + } else { + sub->next = version + 1; + } + } +} + +void ConfigMonitor::check_all_subs() +{ + dout(10) << __func__ << dendl; + auto subs = mon.session_map.subs.find("config"); + if (subs == mon.session_map.subs.end()) { + return; + } + int updated = 0, total = 0; + auto p = subs->second->begin(); + while (!p.end()) { + auto sub = *p; + ++p; + ++total; + if (maybe_send_config(sub->session)) { + ++updated; + } + } + dout(10) << __func__ << " updated " << updated << " / " << total << dendl; +} diff --git a/src/mon/ConfigMonitor.h b/src/mon/ConfigMonitor.h new file mode 100644 index 000000000..e6c12a3d7 --- /dev/null +++ b/src/mon/ConfigMonitor.h @@ -0,0 +1,58 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <boost/optional.hpp> + +#include "ConfigMap.h" +#include "mon/PaxosService.h" + +class MonSession; + +class ConfigMonitor : public PaxosService +{ + version_t version = 0; + ConfigMap config_map; + std::map<std::string,boost::optional<ceph::buffer::list>> pending; + std::string pending_description; + std::map<std::string,boost::optional<ceph::buffer::list>> pending_cleanup; + + std::map<std::string,ceph::buffer::list> current; + + void encode_pending_to_kvmon(); + +public: + ConfigMonitor(Monitor &m, Paxos &p, const std::string& service_name); + + void init() override; + + void load_config(); + void load_changeset(version_t v, ConfigChangeSet *ch); + + bool preprocess_query(MonOpRequestRef op) override; + bool prepare_update(MonOpRequestRef op) override; + + bool preprocess_command(MonOpRequestRef op); + bool 
prepare_command(MonOpRequestRef op); + + void handle_get_config(MonOpRequestRef op); + + void create_initial() override; + void update_from_paxos(bool *need_bootstrap) override; + void create_pending() override; + void encode_pending(MonitorDBStore::TransactionRef t) override; + version_t get_trim_to() const override; + + void encode_full(MonitorDBStore::TransactionRef t) override { } + + void on_active() override; + void tick() override; + + bool refresh_config(MonSession *s); + bool maybe_send_config(MonSession *s); + void send_config(MonSession *s); + void check_sub(MonSession *s); + void check_sub(Subscription *sub); + void check_all_subs(); +}; diff --git a/src/mon/ConnectionTracker.cc b/src/mon/ConnectionTracker.cc new file mode 100644 index 000000000..272ad40c2 --- /dev/null +++ b/src/mon/ConnectionTracker.cc @@ -0,0 +1,361 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2019 Red Hat + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "ConnectionTracker.h" +#include "common/Formatter.h" +#include "common/dout.h" +#include "include/ceph_assert.h" + +#define dout_subsys ceph_subsys_mon +#undef dout_prefix +#define dout_prefix _prefix(_dout, rank, epoch, version) + +static std::ostream& _prefix(std::ostream *_dout, int rank, epoch_t epoch, uint64_t version) { + return *_dout << "rank: " << rank << " version: "<< version << " ConnectionTracker(" << epoch << ") "; +} + +std::ostream& operator<<(std::ostream&o, const ConnectionReport& c) { + o << "rank=" << c.rank << ",epoch=" << c.epoch << ",version=" << c.epoch_version + << ", current links: " << c.current << ", history: " << c.history; + return o; +} + +std::ostream& operator<<(std::ostream& o, const ConnectionTracker& c) { + o << "rank=" << c.rank << ", epoch=" << c.epoch << ", version=" << c.version + << ", half_life=" << c.half_life << ", reports: " << c.peer_reports; + return o; +} + +ConnectionReport *ConnectionTracker::reports(int p) +{ + auto i = peer_reports.find(p); + if (i == peer_reports.end()) { + ceph_assert(p != rank); + auto[j,k] = peer_reports.insert(std::pair<int,ConnectionReport>(p,ConnectionReport())); + i = j; + } + return &i->second; +} + +const ConnectionReport *ConnectionTracker::reports(int p) const +{ + auto i = peer_reports.find(p); + if (i == peer_reports.end()) { + return NULL; + } + return &i->second; +} + +void ConnectionTracker::receive_peer_report(const ConnectionTracker& o) +{ + ldout(cct, 30) << __func__ << dendl; + for (auto& i : o.peer_reports) { + const ConnectionReport& report = i.second; + if (i.first == rank) continue; + ConnectionReport& existing = *reports(i.first); + if (report.epoch > existing.epoch || + (report.epoch == existing.epoch && + report.epoch_version > existing.epoch_version)) { + ldout(cct, 30) << " new peer_report is more updated" << dendl; + ldout(cct, 30) << "existing: " << existing << dendl; + ldout(cct, 30) << "new: " << report << dendl; + existing = report; + } + } + encoding.clear(); +} + +bool ConnectionTracker::increase_epoch(epoch_t e) +{ + ldout(cct, 30) << __func__ << " to " << e << dendl; + if (e > epoch) { + my_reports.epoch_version = version = 0; + my_reports.epoch = epoch = e; + peer_reports[rank] = my_reports; + encoding.clear(); + return true; + } + return 
false; +} + +void ConnectionTracker::increase_version() +{ + ldout(cct, 30) << __func__ << " to " << version+1 << dendl; + encoding.clear(); + ++version; + my_reports.epoch_version = version; + peer_reports[rank] = my_reports; + if ((version % persist_interval) == 0) { + ldout(cct, 30) << version << " % " << persist_interval << " == 0" << dendl; + owner->persist_connectivity_scores(); + } +} + +void ConnectionTracker::report_live_connection(int peer_rank, double units_alive) +{ + ldout(cct, 30) << __func__ << " peer_rank: " << peer_rank << " units_alive: " << units_alive << dendl; + ldout(cct, 30) << "my_reports before: " << my_reports << dendl; + if (peer_rank == rank) { + lderr(cct) << "Got a report from my own rank, hopefully this is startup weirdness, dropping" << dendl; + return; + } + // we need to "auto-initialize" to 1, do shenanigans + auto i = my_reports.history.find(peer_rank); + if (i == my_reports.history.end()) { + ldout(cct, 30) << "couldn't find: " << peer_rank + << " in my_reports.history" << "... inserting: " + << "(" << peer_rank << ", 1)" << dendl; + auto[j,k] = my_reports.history.insert(std::pair<int,double>(peer_rank,1.0)); + i = j; + } + double& pscore = i->second; + ldout(cct, 30) << "adding new pscore to my_reports" << dendl; + pscore = pscore * (1 - units_alive / (2 * half_life)) + + (units_alive / (2 * half_life)); + pscore = std::min(pscore, 1.0); + my_reports.current[peer_rank] = true; + + increase_version(); + ldout(cct, 30) << "my_reports after: " << my_reports << dendl; +} + +void ConnectionTracker::report_dead_connection(int peer_rank, double units_dead) +{ + ldout(cct, 30) << __func__ << " peer_rank: " << peer_rank << " units_dead: " << units_dead << dendl; + ldout(cct, 30) << "my_reports before: " << my_reports << dendl; + if (peer_rank == rank) { + lderr(cct) << "Got a report from my own rank, hopefully this is startup weirdness, dropping" << dendl; + return; + } + // we need to "auto-initialize" to 1, do shenanigans + auto i = my_reports.history.find(peer_rank); + if (i == my_reports.history.end()) { + ldout(cct, 30) << "couldn't find: " << peer_rank + << " in my_reports.history" << "... inserting: " + << "(" << peer_rank << ", 1)" << dendl; + auto[j,k] = my_reports.history.insert(std::pair<int,double>(peer_rank,1.0)); + i = j; + } + double& pscore = i->second; + ldout(cct, 30) << "adding new pscore to my_reports" << dendl; + pscore = pscore * (1 - units_dead / (2 * half_life)) - + (units_dead / (2*half_life)); + pscore = std::max(pscore, 0.0); + my_reports.current[peer_rank] = false; + + increase_version(); + ldout(cct, 30) << "my_reports after: " << my_reports << dendl; +} + +void ConnectionTracker::get_total_connection_score(int peer_rank, double *rating, + int *live_count) const +{ + ldout(cct, 30) << __func__ << dendl; + *rating = 0; + *live_count = 0; + double rate = 0; + int live = 0; + + for (const auto& i : peer_reports) { // loop through all the scores + if (i.first == peer_rank) { // ... except the ones it has for itself, of course! 
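+      // (counting a mon's self-report here would let it inflate its own score)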
+ continue; + } + const auto& report = i.second; + auto score_i = report.history.find(peer_rank); + auto live_i = report.current.find(peer_rank); + if (score_i != report.history.end()) { + if (live_i->second) { + rate += score_i->second; + ++live; + } + } + } + *rating = rate; + *live_count = live; +} + +void ConnectionTracker::notify_rank_changed(int new_rank) +{ + ldout(cct, 20) << __func__ << " to " << new_rank << dendl; + if (new_rank == rank) return; + ldout(cct, 20) << "peer_reports before: " << peer_reports << dendl; + peer_reports.erase(rank); + peer_reports.erase(new_rank); + my_reports.rank = new_rank; + rank = new_rank; + encoding.clear(); + ldout(cct, 20) << "peer_reports after: " << peer_reports << dendl; + + increase_version(); +} + +void ConnectionTracker::notify_rank_removed(int rank_removed, int new_rank) +{ + ldout(cct, 20) << __func__ << " " << rank_removed + << " new_rank: " << new_rank << dendl; + ldout(cct, 20) << "my_reports before: " << my_reports << dendl; + ldout(cct, 20) << "peer_reports before: " << peer_reports << dendl; + ldout(cct, 20) << "my rank before: " << rank << dendl; + + encoding.clear(); + size_t starting_size_current = my_reports.current.size(); + // Lets adjust everything in my report. + my_reports.current.erase(rank_removed); + my_reports.history.erase(rank_removed); + auto ci = my_reports.current.upper_bound(rank_removed); + auto hi = my_reports.history.upper_bound(rank_removed); + while (ci != my_reports.current.end()) { + ceph_assert(ci->first == hi->first); + my_reports.current[ci->first - 1] = ci->second; + my_reports.history[hi->first - 1] = hi->second; + my_reports.current.erase(ci++); + my_reports.history.erase(hi++); + } + ceph_assert((my_reports.current.size() == starting_size_current) || + (my_reports.current.size() + 1 == starting_size_current)); + + size_t starting_size = peer_reports.size(); + auto pi = peer_reports.upper_bound(rank_removed); + // Remove the target rank and adjust everything that comes after. + // Note that we don't adjust current and history for our peer_reports + // because it is better to rely on our peers on that information. + peer_reports.erase(rank_removed); + while (pi != peer_reports.end()) { + peer_reports[pi->first - 1] = pi->second; // copy content of next rank to ourself. + peer_reports.erase(pi++); // destroy our next rank and move on. + } + + ceph_assert((peer_reports.size() == starting_size) || + (peer_reports.size() + 1 == starting_size)); + + if (rank_removed < rank) { // if the rank removed is lower than us, we need to adjust. + --rank; + my_reports.rank = rank; // also adjust my_reports.rank. + } + + ldout(cct, 20) << "my rank after: " << rank << dendl; + ldout(cct, 20) << "peer_reports after: " << peer_reports << dendl; + ldout(cct, 20) << "my_reports after: " << my_reports << dendl; + + //check if the new_rank from monmap is equal to our adjusted rank. + ceph_assert(rank == new_rank); + + increase_version(); +} + +bool ConnectionTracker::is_clean(int mon_rank, int monmap_size) +{ + ldout(cct, 30) << __func__ << dendl; + // check consistency between our rank according + // to monmap and our rank according to our report. + if (rank != mon_rank || + my_reports.rank != mon_rank) { + return false; + } else if (!peer_reports.empty()){ + // if peer_report max rank is greater than monmap max rank + // then there is a problem. 
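+    // (valid ranks run from 0 to monmap_size - 1, so a report keyed past
+    // that range is leftover state from a removed peer)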
+ if (peer_reports.rbegin()->first > monmap_size - 1) return false; + } + return true; +} + +void ConnectionTracker::encode(bufferlist &bl) const +{ + ENCODE_START(1, 1, bl); + encode(rank, bl); + encode(epoch, bl); + encode(version, bl); + encode(half_life, bl); + encode(peer_reports, bl); + ENCODE_FINISH(bl); +} + +void ConnectionTracker::decode(bufferlist::const_iterator& bl) { + clear_peer_reports(); + encoding.clear(); + + DECODE_START(1, bl); + decode(rank, bl); + decode(epoch, bl); + decode(version, bl); + decode(half_life, bl); + decode(peer_reports, bl); + DECODE_FINISH(bl); + if (rank >=0) + my_reports = peer_reports[rank]; +} + +const bufferlist& ConnectionTracker::get_encoded_bl() +{ + if (!encoding.length()) { + encode(encoding); + } + return encoding; +} + +void ConnectionReport::dump(ceph::Formatter *f) const +{ + f->dump_int("rank", rank); + f->dump_int("epoch", epoch); + f->dump_int("version", epoch_version); + f->open_object_section("peer_scores"); + for (auto i : history) { + f->open_object_section("peer"); + f->dump_int("peer_rank", i.first); + f->dump_float("peer_score", i.second); + f->dump_bool("peer_alive", current.find(i.first)->second); + f->close_section(); + } + f->close_section(); // peer scores +} + +void ConnectionReport::generate_test_instances(std::list<ConnectionReport*>& o) +{ + o.push_back(new ConnectionReport); + o.push_back(new ConnectionReport); + o.back()->rank = 1; + o.back()->epoch = 2; + o.back()->epoch_version = 3; + o.back()->current[0] = true; + o.back()->history[0] = .4; +} + +void ConnectionTracker::dump(ceph::Formatter *f) const +{ + f->dump_int("rank", rank); + f->dump_int("epoch", epoch); + f->dump_int("version", version); + f->dump_float("half_life", half_life); + f->dump_int("persist_interval", persist_interval); + f->open_object_section("reports"); + for (const auto& i : peer_reports) { + f->open_object_section("report"); + i.second.dump(f); + f->close_section(); + } + f->close_section(); // reports +} + +void ConnectionTracker::generate_test_instances(std::list<ConnectionTracker*>& o) +{ + o.push_back(new ConnectionTracker); + o.push_back(new ConnectionTracker); + ConnectionTracker *e = o.back(); + e->rank = 2; + e->epoch = 3; + e->version = 4; + e->peer_reports[0]; + e->peer_reports[1]; + e->my_reports = e->peer_reports[2]; +} diff --git a/src/mon/ConnectionTracker.h b/src/mon/ConnectionTracker.h new file mode 100644 index 000000000..09506636d --- /dev/null +++ b/src/mon/ConnectionTracker.h @@ -0,0 +1,205 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2019 Red Hat + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#pragma once +#include "include/types.h" + +struct ConnectionReport { + int rank = -1; // mon rank this state belongs to + std::map<int, bool> current; // true if connected to the other mon + std::map<int, double> history; // [0-1]; the connection reliability + epoch_t epoch = 0; // the (local) election epoch the ConnectionReport came from + uint64_t epoch_version = 0; // version of the ConnectionReport within the epoch + void encode(bufferlist& bl) const { + ENCODE_START(1, 1, bl); + encode(rank, bl); + encode(current, bl); + encode(history, bl); + encode(epoch, bl); + encode(epoch_version, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::const_iterator& bl) { + DECODE_START(1, bl); + decode(rank, bl); + decode(current, bl); + decode(history, bl); + decode(epoch, bl); + decode(epoch_version, bl); + DECODE_FINISH(bl); + } + bool operator==(const ConnectionReport& o) const { + return o.rank == rank && o.current == current && + o.history == history && o.epoch == epoch && + o.epoch_version == epoch_version; + } + friend std::ostream& operator<<(std::ostream&o, const ConnectionReport& c); + + void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list<ConnectionReport*>& o); +}; +WRITE_CLASS_ENCODER(ConnectionReport); + +class RankProvider { + public: + /** + * Get the rank of the running daemon. + * It can be -1, meaning unknown/invalid, or it + * can be >= 0 (ranks are 0-based). + * You should not invoke the function get_total_connection_score() + * with an unknown rank. + */ + virtual int get_my_rank() const = 0; + /** + * Asks our owner to encode us and persist it to disk. + * Presently we do this every tenth update. + */ + virtual void persist_connectivity_scores() = 0; + virtual ~RankProvider() {} +}; + +class ConnectionTracker { + public: + /** + * Receive a report from a peer and update our internal state + * if the peer has newer data. + */ + void receive_peer_report(const ConnectionTracker& o); + /** + * Bump up the epoch to the specified number. + * Validates that it is > current epoch and resets + * version to 0; returns false if not. + */ + bool increase_epoch(epoch_t e); + /** + * Bump up the version within our epoch. + * If the new version is a multiple of persist_interval (ten by + * default), we also persist it. + */ + void increase_version(); + + /** + * Report a connection to a peer rank has been considered alive for + * the given time duration. We assume the units_alive is <= the time + * since the previous reporting call. + * (Or, more precisely, we assume that the total amount of time + * passed in is less than or equal to the time which has actually + * passed -- you can report a 10-second death immediately followed + * by reporting 5 seconds of liveness if your metrics are delayed.) + */ + void report_live_connection(int peer_rank, double units_alive); + /** + * Report a connection to a peer rank has been considered dead for + * the given time duration, analogous to that above. + */ + void report_dead_connection(int peer_rank, double units_dead); + /** + * Set the half-life for dropping connection state + * out of the ongoing score. + * Whenever you add a new data point: + * new_score = old_score * (1 - units/(2d)) + (units/(2d)) + * where units is the units reported alive (for dead, you subtract them). + */ + void set_half_life(double d) { + half_life = d; + } + /** + * Get the total connection score of a rank across + * all peers, and the count of how many electors think it's alive. + * For this summation, if a rank reports a peer as down its score is zero.
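+   * Illustrative example (hypothetical numbers): if three peer reports
+   * score rank 2 at 0.9, 0.8 and 0.7, and their current flags for it
+   * are up, up and down respectively, the result is rating = 1.7 and
+   * live_count = 2.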
+   */
+  void get_total_connection_score(int peer_rank, double *rating,
+                                  int *live_count) const;
+  /**
+   * Check if our ranks are clean and make
+   * sure there are no stale peer_reports lingering.
+   * In the future we also want to check the current and
+   * history maps of each peer_report.
+   */
+  bool is_clean(int mon_rank, int monmap_size);
+  /**
+   * Encode this ConnectionTracker. Useful both for storing on disk
+   * and for sending off to peers for decoding and import
+   * with receive_peer_report() above.
+   */
+  void encode(bufferlist &bl) const;
+  void decode(bufferlist::const_iterator& bl);
+  /**
+   * Get a bufferlist containing the ConnectionTracker.
+   * This is like encode() but holds a copy so it
+   * doesn't re-encode on every invocation.
+   */
+  const bufferlist& get_encoded_bl();
+ private:
+  epoch_t epoch;
+  uint64_t version;
+  std::map<int,ConnectionReport> peer_reports;
+  ConnectionReport my_reports;
+  double half_life;
+  RankProvider *owner;
+  int rank;
+  int persist_interval;
+  bufferlist encoding;
+  CephContext *cct;
+  int get_my_rank() const { return rank; }
+  ConnectionReport *reports(int p);
+  const ConnectionReport *reports(int p) const;
+
+  void clear_peer_reports() {
+    encoding.clear();
+    peer_reports.clear();
+    my_reports = ConnectionReport();
+    my_reports.rank = rank;
+  }
+
+ public:
+  ConnectionTracker() : epoch(0), version(0), half_life(12*60*60),
+                        owner(NULL), rank(-1), persist_interval(10),
+                        cct(nullptr) {
+  }
+  ConnectionTracker(RankProvider *o, int rank, double hl,
+                    int persist_i, CephContext *c) :
+    epoch(0), version(0),
+    half_life(hl), owner(o), rank(rank), persist_interval(persist_i), cct(c) {
+    my_reports.rank = rank;
+  }
+  ConnectionTracker(const bufferlist& bl, CephContext *c) :
+    epoch(0), version(0),
+    half_life(0), owner(NULL), rank(-1), persist_interval(10), cct(c)
+  {
+    auto bi = bl.cbegin();
+    decode(bi);
+  }
+  ConnectionTracker(const ConnectionTracker& o) :
+    epoch(o.epoch), version(o.version),
+    half_life(o.half_life), owner(o.owner), rank(o.rank),
+    persist_interval(o.persist_interval), cct(o.cct)
+  {
+    peer_reports = o.peer_reports;
+    my_reports = o.my_reports;
+  }
+  void notify_reset() { clear_peer_reports(); }
+  void set_rank(int new_rank) {
+    rank = new_rank;
+    my_reports.rank = rank;
+  }
+
+  void notify_rank_changed(int new_rank);
+  void notify_rank_removed(int rank_removed, int new_rank);
+  friend std::ostream& operator<<(std::ostream& o, const ConnectionTracker& c);
+  friend ConnectionReport *get_connection_reports(ConnectionTracker& ct);
+  friend std::map<int,ConnectionReport> *get_peer_reports(ConnectionTracker& ct);
+  void dump(ceph::Formatter *f) const;
+  static void generate_test_instances(std::list<ConnectionTracker*>& o);
+};
+
+WRITE_CLASS_ENCODER(ConnectionTracker);
diff --git a/src/mon/CreatingPGs.h b/src/mon/CreatingPGs.h
new file mode 100644
index 000000000..0075f81e7
--- /dev/null
+++ b/src/mon/CreatingPGs.h
@@ -0,0 +1,234 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <map>
+#include <set>
+#include <vector>
+
+#include "include/encoding.h"
+#include "include/utime.h"
+
+#include "osd/osd_types.h"
+
+struct creating_pgs_t {
+  epoch_t last_scan_epoch = 0;
+
+  struct pg_create_info {
+    epoch_t create_epoch;
+    utime_t create_stamp;
+
+    // NOTE: pre-octopus instances of this class will have a
+    // zeroed-out history
+    std::vector<int> up;
+    int up_primary = -1;
+    std::vector<int> acting;
+    int acting_primary = -1;
+    pg_history_t history;
+    PastIntervals past_intervals;
+
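+    // (Descriptive note: encode() below is feature-gated. Peers without
+    // SERVER_OCTOPUS receive only the legacy pair<create_epoch, create_stamp>,
+    // while newer peers get the full versioned struct including up/acting,
+    // history, and past_intervals.)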
+    void encode(ceph::buffer::list& bl, uint64_t features) const {
+      using ceph::encode;
+      if (!HAVE_FEATURE(features, SERVER_OCTOPUS)) {
+        // was pair<epoch_t,utime_t> prior to octopus
+        encode(create_epoch, bl);
+        encode(create_stamp, bl);
+        return;
+      }
+      ENCODE_START(1, 1, bl);
+      encode(create_epoch, bl);
+      encode(create_stamp, bl);
+      encode(up, bl);
+      encode(up_primary, bl);
+      encode(acting, bl);
+      encode(acting_primary, bl);
+      encode(history, bl);
+      encode(past_intervals, bl);
+      ENCODE_FINISH(bl);
+    }
+    void decode_legacy(ceph::buffer::list::const_iterator& p) {
+      using ceph::decode;
+      decode(create_epoch, p);
+      decode(create_stamp, p);
+    }
+    void decode(ceph::buffer::list::const_iterator& p) {
+      using ceph::decode;
+      DECODE_START(1, p);
+      decode(create_epoch, p);
+      decode(create_stamp, p);
+      decode(up, p);
+      decode(up_primary, p);
+      decode(acting, p);
+      decode(acting_primary, p);
+      decode(history, p);
+      decode(past_intervals, p);
+      DECODE_FINISH(p);
+    }
+    void dump(ceph::Formatter *f) const {
+      f->dump_unsigned("create_epoch", create_epoch);
+      f->dump_stream("create_stamp") << create_stamp;
+      f->open_array_section("up");
+      for (auto& i : up) {
+        f->dump_unsigned("osd", i);
+      }
+      f->close_section();
+      f->dump_int("up_primary", up_primary);
+      f->open_array_section("acting");
+      for (auto& i : acting) {
+        f->dump_unsigned("osd", i);
+      }
+      f->close_section();
+      f->dump_int("acting_primary", acting_primary);
+      f->dump_object("pg_history", history);
+      f->dump_object("past_intervals", past_intervals);
+    }
+
+    pg_create_info() {}
+    pg_create_info(epoch_t e, utime_t t)
+      : create_epoch(e),
+        create_stamp(t) {
+      // NOTE: we don't initialize the other fields here; see
+      // OSDMonitor::update_pending_pgs()
+    }
+  };
+
+  /// pgs we are currently creating
+  std::map<pg_t, pg_create_info> pgs;
+
+  struct pool_create_info {
+    epoch_t created;
+    utime_t modified;
+    uint64_t start = 0;
+    uint64_t end = 0;
+    bool done() const {
+      return start >= end;
+    }
+    void encode(ceph::buffer::list& bl) const {
+      using ceph::encode;
+      encode(created, bl);
+      encode(modified, bl);
+      encode(start, bl);
+      encode(end, bl);
+    }
+    void decode(ceph::buffer::list::const_iterator& p) {
+      using ceph::decode;
+      decode(created, p);
+      decode(modified, p);
+      decode(start, p);
+      decode(end, p);
+    }
+  };
+
+  /// queue of pgs we still need to create (poolid -> <created, set of ps>)
+  std::map<int64_t,pool_create_info> queue;
+
+  /// pools that exist in the osdmap for which at least one pg has been created
+  std::set<int64_t> created_pools;
+
+  bool still_creating_pool(int64_t poolid) {
+    for (auto& i : pgs) {
+      if (i.first.pool() == poolid) {
+        return true;
+      }
+    }
+    if (queue.count(poolid)) {
+      return true;
+    }
+    return false;
+  }
+  void create_pool(int64_t poolid, uint32_t pg_num,
+                   epoch_t created, utime_t modified) {
+    ceph_assert(created_pools.count(poolid) == 0);
+    auto& c = queue[poolid];
+    c.created = created;
+    c.modified = modified;
+    c.end = pg_num;
+    created_pools.insert(poolid);
+  }
+  unsigned remove_pool(int64_t removed_pool) {
+    const unsigned total = pgs.size();
+    auto first = pgs.lower_bound(pg_t{0, (uint64_t)removed_pool});
+    auto last = pgs.lower_bound(pg_t{0, (uint64_t)removed_pool + 1});
+    pgs.erase(first, last);
+    created_pools.erase(removed_pool);
+    queue.erase(removed_pool);
+    return total - pgs.size();
+  }
+  void encode(ceph::buffer::list& bl, uint64_t features) const {
+    unsigned v = 3;
+    if (!HAVE_FEATURE(features, SERVER_OCTOPUS)) {
+      v = 2;
+    }
+    ENCODE_START(v, 1, bl);
+    encode(last_scan_epoch, bl);
+
encode(pgs, bl, features); + encode(created_pools, bl); + encode(queue, bl); + ENCODE_FINISH(bl); + } + void decode(ceph::buffer::list::const_iterator& bl) { + DECODE_START(3, bl); + decode(last_scan_epoch, bl); + if (struct_v >= 3) { + decode(pgs, bl); + } else { + // legacy pg encoding + pgs.clear(); + uint32_t num; + decode(num, bl); + while (num--) { + pg_t pgid; + decode(pgid, bl); + pgs[pgid].decode_legacy(bl); + } + } + decode(created_pools, bl); + if (struct_v >= 2) + decode(queue, bl); + DECODE_FINISH(bl); + } + void dump(ceph::Formatter *f) const { + f->dump_unsigned("last_scan_epoch", last_scan_epoch); + f->open_array_section("creating_pgs"); + for (auto& pg : pgs) { + f->open_object_section("pg"); + f->dump_stream("pgid") << pg.first; + f->dump_object("pg_create_info", pg.second); + f->close_section(); + } + f->close_section(); + f->open_array_section("queue"); + for (auto& p : queue) { + f->open_object_section("pool"); + f->dump_unsigned("pool", p.first); + f->dump_unsigned("created", p.second.created); + f->dump_stream("modified") << p.second.modified; + f->dump_unsigned("ps_start", p.second.start); + f->dump_unsigned("ps_end", p.second.end); + f->close_section(); + } + f->close_section(); + f->open_array_section("created_pools"); + for (auto pool : created_pools) { + f->dump_unsigned("pool", pool); + } + f->close_section(); + } + static void generate_test_instances(std::list<creating_pgs_t*>& o) { + auto c = new creating_pgs_t; + c->last_scan_epoch = 17; + c->pgs.emplace(pg_t{42, 2}, pg_create_info(31, utime_t{891, 113})); + c->pgs.emplace(pg_t{44, 2}, pg_create_info(31, utime_t{891, 113})); + c->created_pools = {0, 1}; + o.push_back(c); + c = new creating_pgs_t; + c->last_scan_epoch = 18; + c->pgs.emplace(pg_t{42, 3}, pg_create_info(31, utime_t{891, 113})); + c->created_pools = {}; + o.push_back(c); + } +}; +WRITE_CLASS_ENCODER_FEATURES(creating_pgs_t::pg_create_info) +WRITE_CLASS_ENCODER(creating_pgs_t::pool_create_info) +WRITE_CLASS_ENCODER_FEATURES(creating_pgs_t) diff --git a/src/mon/ElectionLogic.cc b/src/mon/ElectionLogic.cc new file mode 100644 index 000000000..e22a85bed --- /dev/null +++ b/src/mon/ElectionLogic.cc @@ -0,0 +1,556 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "ElectionLogic.h" + +#include "include/ceph_assert.h" +#include "common/dout.h" + +#define dout_subsys ceph_subsys_mon +#undef dout_prefix +#define dout_prefix _prefix(_dout, epoch, elector) +using std::cerr; +using std::cout; +using std::dec; +using std::hex; +using std::list; +using std::map; +using std::make_pair; +using std::ostream; +using std::ostringstream; +using std::pair; +using std::set; +using std::setfill; +using std::string; +using std::stringstream; +using std::to_string; +using std::vector; +using std::unique_ptr; + +using ceph::bufferlist; +using ceph::decode; +using ceph::encode; +using ceph::Formatter; +using ceph::JSONFormatter; +using ceph::mono_clock; +using ceph::mono_time; +using ceph::timespan_str; +static ostream& _prefix(std::ostream *_dout, epoch_t epoch, ElectionOwner* elector) { + return *_dout << "paxos." 
<< elector->get_my_rank()
+                << ".electionLogic(" << epoch << ") ";
+}
+void ElectionLogic::init()
+{
+  epoch = elector->read_persisted_epoch();
+  if (!epoch) {
+    ldout(cct, 1) << "init, first boot, initializing epoch at 1" << dendl;
+    epoch = 1;
+  } else if (epoch % 2) {
+    ldout(cct, 1) << "init, last seen epoch " << epoch
+                  << ", mid-election, bumping" << dendl;
+    ++epoch;
+    elector->persist_epoch(epoch);
+  } else {
+    ldout(cct, 1) << "init, last seen epoch " << epoch << dendl;
+  }
+}
+
+void ElectionLogic::bump_epoch(epoch_t e)
+{
+  ldout(cct, 10) << __func__ << " to " << e << dendl;
+  ceph_assert(epoch <= e);
+  epoch = e;
+  peer_tracker->increase_epoch(e);
+  elector->persist_epoch(epoch);
+  // clear up some state
+  electing_me = false;
+  acked_me.clear();
+  elector->notify_bump_epoch();
+}
+
+void ElectionLogic::declare_standalone_victory()
+{
+  ceph_assert(elector->paxos_size() == 1 && elector->get_my_rank() == 0);
+  init();
+  bump_epoch(epoch+1);
+}
+
+void ElectionLogic::clear_live_election_state()
+{
+  leader_acked = -1;
+  electing_me = false;
+  reset_stable_tracker();
+  leader_peer_tracker.reset();
+}
+
+void ElectionLogic::reset_stable_tracker()
+{
+  stable_peer_tracker.reset(new ConnectionTracker(*peer_tracker));
+}
+
+void ElectionLogic::connectivity_bump_epoch_in_election(epoch_t mepoch)
+{
+  ldout(cct, 30) << __func__ << " to " << mepoch << dendl;
+  ceph_assert(mepoch > epoch);
+  bump_epoch(mepoch);
+  reset_stable_tracker();
+  double lscore, my_score;
+  my_score = connectivity_election_score(elector->get_my_rank());
+  lscore = connectivity_election_score(leader_acked);
+  if (my_score > lscore) {
+    leader_acked = -1;
+    leader_peer_tracker.reset();
+  }
+}
+
+void ElectionLogic::start()
+{
+  if (!participating) {
+    ldout(cct, 0) << "not starting new election -- not participating" << dendl;
+    return;
+  }
+  ldout(cct, 5) << "start -- can i be leader?" << dendl;
+
+  acked_me.clear();
+  init();
+
+  // start by trying to elect me
+  if (epoch % 2 == 0) {
+    bump_epoch(epoch+1);  // odd == election cycle
+  } else {
+    elector->validate_store();
+  }
+  acked_me.insert(elector->get_my_rank());
+  clear_live_election_state();
+  reset_stable_tracker();
+  electing_me = true;
+
+  bufferlist bl;
+  if (strategy == CONNECTIVITY) {
+    stable_peer_tracker->encode(bl);
+  }
+  elector->propose_to_peers(epoch, bl);
+  elector->_start();
+}
+
+void ElectionLogic::defer(int who)
+{
+  if (strategy == CLASSIC) {
+    ldout(cct, 5) << "defer to " << who << dendl;
+    ceph_assert(who < elector->get_my_rank());
+  } else {
+    ldout(cct, 5) << "defer to " << who << ", disallowed_leaders=" << elector->get_disallowed_leaders() << dendl;
+    ceph_assert(!elector->get_disallowed_leaders().count(who));
+  }
+
+  if (electing_me) {
+    // drop out
+    acked_me.clear();
+    electing_me = false;
+  }
+
+  // ack them
+  leader_acked = who;
+  elector->_defer_to(who);
+}
+
+void ElectionLogic::end_election_period()
+{
+  ldout(cct, 5) << "election period ended" << dendl;
+
+  // did i win?
+  if (electing_me &&
+      acked_me.size() > (elector->paxos_size() / 2)) {
+    // i win
+    declare_victory();
+  } else {
+    // whoever i deferred to didn't declare victory quickly enough.
+    if (elector->ever_participated())
+      start();
+    else
+      elector->reset_election();
+  }
+}
+
+
+void ElectionLogic::declare_victory()
+{
+  ldout(cct, 5) << "I win! acked_me=" << acked_me << dendl;
+  last_election_winner = elector->get_my_rank();
+  last_voted_for = last_election_winner;
+  clear_live_election_state();
+
+  set<int> new_quorum;
+  new_quorum.swap(acked_me);
+
+  ceph_assert(epoch % 2 == 1);  // election
+  bump_epoch(epoch+1);          // is over!
+
+  elector->message_victory(new_quorum);
+}
+
+bool ElectionLogic::propose_classic_prefix(int from, epoch_t mepoch)
+{
+  if (mepoch > epoch) {
+    bump_epoch(mepoch);
+  } else if (mepoch < epoch) {
+    // got an "old" propose,
+    if (epoch % 2 == 0 &&  // in a non-election cycle
+        !elector->is_current_member(from)) {  // from someone outside the quorum
+      // a mon just started up, call a new election so they can rejoin!
+      ldout(cct, 5) << " got propose from old epoch, "
+                    << from << " must have just started" << dendl;
+      // we may be active; make sure we reset things in the monitor appropriately.
+      elector->trigger_new_election();
+    } else {
+      ldout(cct, 5) << " ignoring old propose" << dendl;
+    }
+    return true;
+  }
+  return false;
+}
+
+void ElectionLogic::receive_propose(int from, epoch_t mepoch,
+                                    const ConnectionTracker *ct)
+{
+  ldout(cct, 20) << __func__ << " from " << from << dendl;
+  if (from == elector->get_my_rank()) {
+    lderr(cct) << "I got a propose from my own rank, hopefully this is startup weirdness, dropping" << dendl;
+    return;
+  }
+  switch (strategy) {
+  case CLASSIC:
+    propose_classic_handler(from, mepoch);
+    break;
+  case DISALLOW:
+    propose_disallow_handler(from, mepoch);
+    break;
+  case CONNECTIVITY:
+    propose_connectivity_handler(from, mepoch, ct);
+    break;
+  default:
+    ceph_assert(0 == "how did election strategy become an invalid value?");
+  }
+}
+
+void ElectionLogic::propose_disallow_handler(int from, epoch_t mepoch)
+{
+  if (propose_classic_prefix(from, mepoch)) {
+    return;
+  }
+  const set<int>& disallowed_leaders = elector->get_disallowed_leaders();
+  int my_rank = elector->get_my_rank();
+  bool me_disallowed = disallowed_leaders.count(my_rank);
+  bool from_disallowed = disallowed_leaders.count(from);
+  bool my_win = !me_disallowed && // we are allowed to lead
+    (my_rank < from || from_disallowed); // we are a better choice than them
+  bool their_win = !from_disallowed && // they are allowed to lead
+    (my_rank > from || me_disallowed) && // they are a better choice than us
+    (leader_acked < 0 || leader_acked >= from); // they are a better choice than our previously-acked choice
+
+
+  if (my_win) {
+    // i would win over them.
+    if (leader_acked >= 0) {  // we already acked someone
+      ceph_assert(leader_acked < from || from_disallowed);  // and they still win, of course
+      ldout(cct, 5) << "no, we already acked " << leader_acked << dendl;
+    } else {
+      // wait, i should win!
+      if (!electing_me) {
+        elector->trigger_new_election();
+      }
+    }
+  } else {
+    // they would win over me
+    if (their_win) {
+      defer(from);
+    } else {
+      // ignore them!
+      ldout(cct, 5) << "no, we already acked " << leader_acked << dendl;
+    }
+  }
+}
+
+void ElectionLogic::propose_classic_handler(int from, epoch_t mepoch)
+{
+  if (propose_classic_prefix(from, mepoch)) {
+    return;
+  }
+  if (elector->get_my_rank() < from) {
+    // i would win over them.
+    if (leader_acked >= 0) {  // we already acked someone
+      ceph_assert(leader_acked < from);  // and they still win, of course
+      ldout(cct, 5) << "no, we already acked " << leader_acked << dendl;
+    } else {
+      // wait, i should win!
+      if (!electing_me) {
+        elector->trigger_new_election();
+      }
+    }
+  } else {
+    // they would win over me
+    if (leader_acked < 0 ||     // haven't acked anyone yet, or
+        leader_acked > from ||  // they would win over who you did ack, or
+        leader_acked == from) { // this is the guy we're already deferring to
+      defer(from);
+    } else {
+      // ignore them!
+      ldout(cct, 5) << "no, we already acked " << leader_acked << dendl;
+    }
+  }
+}
+
+double ElectionLogic::connectivity_election_score(int rank)
+{
+  ldout(cct, 30) << __func__ << " of " << rank << dendl;
+  if (elector->get_disallowed_leaders().count(rank)) {
+    return -1;
+  }
+  double score;
+  int liveness;
+  if (stable_peer_tracker) {
+    ldout(cct, 30) << "stable_peer_tracker exists so using that ..." << dendl;
+    stable_peer_tracker->get_total_connection_score(rank, &score, &liveness);
+  } else {
+    ldout(cct, 30) << "stable_peer_tracker does not exist, using peer_tracker ..." << dendl;
+    peer_tracker->get_total_connection_score(rank, &score, &liveness);
+  }
+  return score;
+}
+
+void ElectionLogic::propose_connectivity_handler(int from, epoch_t mepoch,
+                                                 const ConnectionTracker *ct)
+{
+  ldout(cct, 10) << __func__ << " from " << from << " mepoch: "
+                 << mepoch << " epoch: " << epoch << dendl;
+  ldout(cct, 30) << "last_election_winner: " << last_election_winner << dendl;
+  if ((epoch % 2 == 0) &&
+      last_election_winner != elector->get_my_rank() &&
+      !elector->is_current_member(from)) {
+    // To prevent election flapping, peons ignore proposals from out-of-quorum
+    // peers unless their vote would materially change from the last election
+    ldout(cct, 30) << "Let's see if this out-of-quorum peer is worth it" << dendl;
+    int best_scorer = 0;
+    double best_score = 0;
+    double last_voted_for_score = 0;
+    ldout(cct, 30) << "elector->paxos_size(): " << elector->paxos_size() << dendl;
+    for (unsigned i = 0; i < elector->paxos_size(); ++i) {
+      double score = connectivity_election_score(i);
+      if (score > best_score) {
+        best_scorer = i;
+        best_score = score;
+      }
+      if (last_voted_for >= 0 && i == static_cast<unsigned>(last_voted_for)) {
+        last_voted_for_score = score;
+      }
+    }
+    ldout(cct, 30) << "best_scorer: " << best_scorer << " best_score: " << best_score
+                   << " last_voted_for: " << last_voted_for << " last_voted_for_score: "
+                   << last_voted_for_score << dendl;
+    if (best_scorer == last_voted_for ||
+        (best_score - last_voted_for_score < ignore_propose_margin)) {
+      // drop this message; it won't change our vote so we defer to leader
+      ldout(cct, 30) << "drop this message; it won't change our vote so we defer to leader" << dendl;
+      return;
+    }
+  }
+  if (mepoch > epoch) {
+    ldout(cct, 20) << "mepoch > epoch" << dendl;
+    connectivity_bump_epoch_in_election(mepoch);
+  } else if (mepoch < epoch) {
+    // got an "old" propose,
+    if (epoch % 2 == 0 &&  // in a non-election cycle
+        !elector->is_current_member(from)) {  // from someone outside the quorum
+      // a mon just started up, call a new election so they can rejoin!
+      ldout(cct, 5) << " got propose from old epoch, "
+                    << from << " must have just started" << dendl;
+      ldout(cct, 10) << "triggering new election" << dendl;
+      // we may be active; make sure we reset things in the monitor appropriately.
+      elector->trigger_new_election();
+    } else {
+      ldout(cct, 5) << " ignoring old propose" << dendl;
+    }
+    return;
+  }
+
+  int my_rank = elector->get_my_rank();
+  double my_score = connectivity_election_score(my_rank);
+  double from_score = connectivity_election_score(from);
+  double leader_score = -1;
+  if (leader_acked >= 0) {
+    leader_score = connectivity_election_score(leader_acked);
+  }
+
+  ldout(cct, 20) << "propose from rank=" << from << ", tracker: "
+                 << (stable_peer_tracker ? *stable_peer_tracker : *peer_tracker) << dendl;
+
+  ldout(cct, 10) << "propose from rank=" << from << ",from_score=" << from_score
+                 << "; my score=" << my_score
+                 << "; currently acked " << leader_acked
+                 << ",leader_score=" << leader_score << dendl;
+
+  bool my_win = (my_score >= 0) && // my score is non-negative; I am allowed to lead
+    ((my_rank < from && my_score >= from_score) || // I have at least their score and the lower rank, or
+     (my_score > from_score)); // my score is higher
+
+  bool their_win = (from_score >= 0) && // their score is non-negative; they're allowed to lead, AND
+    ((from < my_rank && from_score >= my_score) || // either they have at least my score and the lower rank, or
+     (from_score > my_score)) && // their score is higher, AND
+    ((from <= leader_acked && from_score >= leader_score) || // same conditions compared to leader, or IS leader
+     (from_score > leader_score));
+
+  if (my_win) {
+    ldout(cct, 10) << " conditionally I win" << dendl;
+    // i would win over them.
+    if (leader_acked >= 0) {  // we already acked someone
+      ceph_assert(leader_score >= from_score);  // and they still win, of course
+      ldout(cct, 5) << "no, we already acked " << leader_acked << dendl;
+    } else {
+      // wait, i should win!
+      if (!electing_me) {
+        ldout(cct, 10) << " wait, i should win! triggering new election ..." << dendl;
+        elector->trigger_new_election();
+      }
+    }
+  } else {
+    ldout(cct, 10) << " conditionally they win" << dendl;
+    // they would win over me
+    if (their_win || from == leader_acked) {
+      if (leader_acked >= 0 && from != leader_acked) {
+        // we have to make sure our acked leader will ALSO defer to them, or else
+        // we can't, to maintain guarantees!
+        ldout(cct, 10) << " make sure acked leader defers to: " << from << dendl;
+        double leader_from_score;
+        int leader_from_liveness;
+        leader_peer_tracker->
+          get_total_connection_score(from, &leader_from_score,
+                                     &leader_from_liveness);
+        double leader_leader_score;
+        int leader_leader_liveness;
+        leader_peer_tracker->
+          get_total_connection_score(leader_acked, &leader_leader_score,
+                                     &leader_leader_liveness);
+        if ((from < leader_acked && leader_from_score >= leader_leader_score) ||
+            (leader_from_score > leader_leader_score)) {
+          ldout(cct, 10) << "deferring to " << from << dendl;
+          defer(from);
+          leader_peer_tracker.reset(new ConnectionTracker(*ct));
+        } else { // we can't defer to them *this* round even though they should win...
+          double cur_leader_score, cur_from_score;
+          int cur_leader_live, cur_from_live;
+          peer_tracker->get_total_connection_score(leader_acked, &cur_leader_score, &cur_leader_live);
+          peer_tracker->get_total_connection_score(from, &cur_from_score, &cur_from_live);
+          if ((from < leader_acked && cur_from_score >= cur_leader_score) ||
+              (cur_from_score > cur_leader_score)) {
+            ldout(cct, 5) << "Bumping epoch and starting new election; acked "
+                          << leader_acked << " should defer to " << from
+                          << " but there is score disagreement!"
<< dendl;
+            bump_epoch(epoch+1);
+            start();
+          } else {
+            ldout(cct, 5) << "no, we already acked " << leader_acked
+                          << " and it won't defer to " << from
+                          << " despite better round scores" << dendl;
+          }
+        }
+      } else {
+        ldout(cct, 10) << "deferring to " << from << dendl;
+        defer(from);
+        leader_peer_tracker.reset(new ConnectionTracker(*ct));
+      }
+    } else {
+      // ignore them!
+      ldout(cct, 5) << "no, we already acked " << leader_acked << " with score >= " << from_score << dendl;
+    }
+  }
+}
+
+void ElectionLogic::receive_ack(int from, epoch_t from_epoch)
+{
+  ceph_assert(from_epoch % 2 == 1);  // sender in an election epoch
+  if (from_epoch > epoch) {
+    ldout(cct, 5) << "woah, that's a newer epoch, i must have rebooted. bumping and re-starting!" << dendl;
+    bump_epoch(from_epoch);
+    start();
+    return;
+  }
+  // is that _everyone_?
+  if (electing_me) {
+    acked_me.insert(from);
+    if (acked_me.size() == elector->paxos_size()) {
+      // if yes, shortcut to election finish
+      declare_victory();
+    }
+  } else {
+    // ignore, i'm deferring already.
+    ceph_assert(leader_acked >= 0);
+  }
+}
+
+bool ElectionLogic::victory_makes_sense(int from)
+{
+  bool makes_sense = false;
+  switch (strategy) {
+  case CLASSIC:
+    makes_sense = (from < elector->get_my_rank());
+    break;
+  case DISALLOW:
+    makes_sense = (from < elector->get_my_rank()) ||
+      elector->get_disallowed_leaders().count(elector->get_my_rank());
+    break;
+  case CONNECTIVITY:
+    double my_score, leader_score;
+    my_score = connectivity_election_score(elector->get_my_rank());
+    leader_score = connectivity_election_score(from);
+    ldout(cct, 5) << "victory from " << from << " makes sense? lscore:"
+                  << leader_score
+                  << "; my score:" << my_score << dendl;
+
+    makes_sense = (leader_score >= my_score);
+    break;
+  default:
+    ceph_assert(0 == "how did you get a nonsense election strategy assigned?");
+  }
+  return makes_sense;
+}
+
+bool ElectionLogic::receive_victory_claim(int from, epoch_t from_epoch)
+{
+  bool election_okay = victory_makes_sense(from);
+
+  last_election_winner = from;
+  last_voted_for = leader_acked;
+  clear_live_election_state();
+
+  if (!election_okay) {
+    ceph_assert(strategy == CONNECTIVITY);
+    ldout(cct, 1) << "I should have been elected over this leader; bumping and restarting!" << dendl;
+    bump_epoch(from_epoch);
+    start();
+    return false;
+  }
+
+  // i should have seen this election if i'm getting the victory.
+  if (from_epoch != epoch + 1) {
+    ldout(cct, 5) << "woah, that's a funny epoch, i must have rebooted. bumping and re-starting!" << dendl;
+    bump_epoch(from_epoch);
+    start();
+    return false;
+  }
+
+  bump_epoch(from_epoch);
+
+  // they win
+  return true;
+}
diff --git a/src/mon/ElectionLogic.h b/src/mon/ElectionLogic.h
new file mode 100644
index 000000000..65c727ca1
--- /dev/null
+++ b/src/mon/ElectionLogic.h
@@ -0,0 +1,459 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef CEPH_ELECTIONLOGIC_H
+#define CEPH_ELECTIONLOGIC_H
+
+#include <map>
+#include "include/types.h"
+#include "ConnectionTracker.h"
+
+class ElectionOwner {
+public:
+  /**
+   * Write down the given epoch in persistent storage, such that it
+   * can later be retrieved by read_persisted_epoch even across process
+   * or machine restarts.
+   *
+   * @param e The epoch to write
+   */
+  virtual void persist_epoch(epoch_t e) = 0;
+  /**
+   * Retrieve the most recently persisted epoch.
+   *
+   * @returns The latest epoch passed to persist_epoch()
+   */
+  virtual epoch_t read_persisted_epoch() const = 0;
+  /**
+   * Validate that the persistent store is working by committing
+   * to it. (There is no interface for retrieving the value; this
+   * tests local functionality before doing things like triggering
+   * elections to try and join a quorum.)
+   */
+  virtual void validate_store() = 0;
+  /**
+   * Notify the ElectionOwner that ElectionLogic has increased its
+   * election epoch. This resets an election (either on local loss or victory,
+   * or when trying a new election round) and the ElectionOwner
+   * should reset any tracking of its own to match. (The ElectionLogic
+   * will further trigger sending election messages if that is
+   * appropriate.)
+   */
+  virtual void notify_bump_epoch() = 0;
+  /**
+   * Notify the ElectionOwner we must start a new election.
+   */
+  virtual void trigger_new_election() = 0;
+  /**
+   * Retrieve this Paxos instance's rank.
+   */
+  virtual int get_my_rank() const = 0;
+  /**
+   * Send a PROPOSE message to all our peers. This happens when
+   * we have started a new election (which may mean attempting to
+   * override a current one).
+   *
+   * @param e The election epoch of our proposal.
+   * @param bl A bufferlist containing data the logic wishes to share
+   */
+  virtual void propose_to_peers(epoch_t e, bufferlist& bl) = 0;
+  /**
+   * The election has failed and we aren't sure what the state of the
+   * quorum is, so reset the entire system as if from scratch.
+   */
+  virtual void reset_election() = 0;
+  /**
+   * Ask the ElectionOwner if we-the-Monitor have ever participated in the
+   * quorum (including across process restarts!).
+   *
+   * @returns true if we have participated, false otherwise
+   */
+  virtual bool ever_participated() const = 0;
+  /**
+   * Ask the ElectionOwner for the size of the Paxos set. This includes
+   * those monitors which may not be in the current quorum!
+   * The value returned by this function can change between elections,
+   * but not during them. (In practical terms, it can be updated
+   * by making a paxos commit, but not by injecting values while
+   * an election is ongoing.)
+   */
+  virtual unsigned paxos_size() const = 0;
+  /**
+   * Retrieve a set of ranks which are not allowed to become the leader.
+   * Like paxos_size(), this set can change between elections, but not
+   * during them.
+   */
+  virtual const std::set<int>& get_disallowed_leaders() const = 0;
+  /**
+   * Tell the ElectionOwner we have started a new election.
+   *
+   * The ElectionOwner is responsible for timing out the election (by invoking
+   * end_election_period()) if it takes too long (as defined by the ElectionOwner).
+   * This function is the opportunity to do that and to clean up any other external
+   * election state it may be maintaining.
+   */
+  virtual void _start() = 0;
+  /**
+   * Tell the ElectionOwner to defer to the identified peer. Tell that peer
+   * we have deferred to it.
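+   * (In the monitor implementation this results in an MMonElection OP_ACK
+   * message being sent to @p who; see Elector::_defer_to() in Elector.cc.)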
+ * + * @post we sent an ack message to @p who + */ + virtual void _defer_to(int who) = 0; + /** + * We have won an election, so have the ElectionOwner message that to + * our new quorum! + * + * @param quorum The ranks of our peers which deferred to us and + * must be told of our victory + */ + virtual void message_victory(const std::set<int>& quorum) = 0; + /** + * Query the ElectionOwner about if a given rank is in the + * currently active quorum. + * @param rank the Paxos rank whose status we are checking + * @returns true if the rank is in our current quorum, false otherwise. + */ + virtual bool is_current_member(int rank) const = 0; + virtual ~ElectionOwner() {} +}; + +/** + * This class maintains local state for running an election + * between Paxos instances. It receives input requests + * and calls back out to its ElectionOwner to do persistence + * and message other entities. + */ + +class ElectionLogic { + ElectionOwner *elector; + ConnectionTracker *peer_tracker; + + CephContext *cct; + /** + * Latest epoch we've seen. + * + * @remarks if its value is odd, we're electing; if it's even, then we're + * stable. + */ + epoch_t epoch = 0; + /** + * The last rank which won an election we participated in + */ + int last_election_winner = -1; + /** + * Only used in the connectivity handler. + * The rank we voted for in the last election we voted in. + */ + int last_voted_for = -1; + double ignore_propose_margin = 0.0001; + /** + * Only used in the connectivity handler. + * Points at a stable copy of the peer_tracker we use to keep scores + * throughout an election period. + */ + std::unique_ptr<ConnectionTracker> stable_peer_tracker; + std::unique_ptr<ConnectionTracker> leader_peer_tracker; + /** + * Indicates who we have acked + */ + int leader_acked; + +public: + enum election_strategy { + // Keep in sync with MonMap.h! + CLASSIC = 1, // the original rank-based one + DISALLOW = 2, // disallow a set from being leader + CONNECTIVITY = 3 // includes DISALLOW, extends to prefer stronger connections + }; + election_strategy strategy; + + /** + * Indicates if we are participating in the quorum. + * + * @remarks By default, we are created as participating. We may stop + * participating if something explicitly sets our value + * false, though. If that happens, it will + * have to set participating=true and invoke start() for us to resume + * participating in the quorum. + */ + bool participating; + /** + * Indicates if we are the ones being elected. + * + * We always attempt to be the one being elected if we are the ones starting + * the election. If we are not the ones that started it, we will only attempt + * to be elected if we think we might have a chance (i.e., the other guy's + * rank is lower than ours). + */ + bool electing_me; + /** + * Set containing all those that acked our proposal to become the Leader. + * + * If we are acked by ElectionOwner::paxos_size() peers, we will declare + * victory. + */ + std::set<int> acked_me; + + ElectionLogic(ElectionOwner *e, election_strategy es, ConnectionTracker *t, + double ipm, + CephContext *c) : elector(e), peer_tracker(t), cct(c), + last_election_winner(-1), last_voted_for(-1), + ignore_propose_margin(ipm), + stable_peer_tracker(), + leader_peer_tracker(), + leader_acked(-1), + strategy(es), + participating(true), + electing_me(false) {} + /** + * Set the election strategy to use. If this is not consistent across the + * electing cluster, you're going to have a bad time. + * Defaults to CLASSIC. 
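+   * (Illustrative use, assuming an instance named `logic` and a strategy
+   * taken from the monmap rather than this exact literal:
+   *   logic.set_election_strategy(ElectionLogic::CONNECTIVITY);)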
+   */
+  void set_election_strategy(election_strategy es) {
+    strategy = es;
+  }
+  /**
+   * If there are no other peers in this Paxos group, ElectionOwner
+   * can simply declare victory and we will make it so.
+   *
+   * @pre paxos_size() is 1
+   * @pre get_my_rank is 0
+   */
+  void declare_standalone_victory();
+  /**
+   * Start a new election by proposing ourselves as the new Leader.
+   *
+   * Basically, send propose messages to all the peers.
+   *
+   * @pre participating is true
+   * @post epoch is an odd value
+   * @post electing_me is true
+   * @post We have invoked propose_to_peers() on our ElectionOwner
+   * @post We have invoked _start() on our ElectionOwner
+   */
+  void start();
+  /**
+   * ElectionOwner has decided the election has taken too long and expired.
+   *
+   * This will happen when no one declared victory or started a new election
+   * during the allowed time span.
+   *
+   * When the election expires, we will check if we were the ones who won, and
+   * if so we will declare victory. If that is not the case, then we assume
+   * that the one we deferred to didn't declare victory quickly enough (in fact,
+   * as far as we know, it may even be dead); so, just propose ourselves as the
+   * Leader.
+   */
+  void end_election_period();
+  /**
+   * Handle a proposal from some other node asking to become
+   * the Leader.
+   *
+   * If the message appears to be old (i.e., its epoch is lower than our epoch),
+   * then we may take one of two actions:
+   *
+   * @li Ignore it because it's nothing more than an old proposal
+   * @li Start new elections if we verify that it was sent by a monitor from
+   *     outside the quorum; given its old state, it's fair to assume it just
+   *     started, so we should start new elections so it may rejoin. (Some
+   *     handlers may choose to ignore even these, if they think it's flapping.)
+   *
+   * We pass the propose off to a propose_*_handler function based
+   * on the election strategy we're using.
+   * Only the Connectivity strategy cares about the ConnectionTracker; it should
+   * be NULL if other strategies are in use. The logic never takes ownership of
+   * the pointer: it copies whatever data it wants to keep, and the caller
+   * remains responsible for deleting it.
+   *
+   * @pre Message epoch is from the current or a newer epoch
+   * @param mepoch The epoch of the proposal
+   * @param from The rank proposing itself as leader
+   * @param ct Any incoming ConnectionTracker data sent with the message.
+   * Callers are responsible for deleting this -- we will copy it if we want
+   * to keep the data.
+   */
+  void receive_propose(int from, epoch_t mepoch, const ConnectionTracker *ct);
+  /**
+   * Handle a message from some other participant Acking us as the Leader.
+   *
+   * When we receive such a message, one of three things may be happening:
+   * @li We received a message with a newer epoch, which means we must have
+   *     somehow lost track of what was going on (maybe we rebooted), thus we
+   *     will start a new election
+   * @li We consider ourselves in the run for the Leader (i.e., @p electing_me
+   *     is true), and we are actually being Acked by someone; thus simply add
+   *     the one acking us to the @p acked_me set. If we do now have acks from
+   *     all the participants, then we can declare victory
+   * @li We already deferred the election to somebody else, so we will just
+   *     ignore this message
+   *
+   * @pre Message epoch is from the current or a newer epoch
+   * @post Election is on-going if we deferred to somebody else
+   * @post Election is on-going if we are still waiting for further Acks
+   * @post Election is not on-going if we are victorious
+   * @post Election is not on-going if we must start a new one
+   *
+   * @param from The rank which acked us
+   * @param from_epoch The election epoch the ack belongs to
+   */
+  void receive_ack(int from, epoch_t from_epoch);
+  /**
+   * Handle a message from some other participant declaring Victory.
+   *
+   * We just got a message from someone declaring themselves Victorious, thus
+   * the new Leader.
+   *
+   * However, if the message's epoch happens to be different from our epoch+1,
+   * then it means we lost track of something and we must start a new election.
+   *
+   * If that is not the case, then we will simply update our epoch to the one
+   * in the message and invoke start() to reset the quorum.
+   *
+   * @pre from_epoch is the current or a newer epoch
+   * @post Election is not on-going
+   * @post Updated @p epoch
+   * @post We are a peon in a new quorum if we lost the election
+   *
+   * @param from The victory-claiming rank
+   * @param from_epoch The election epoch in which they claim victory
+   */
+  bool receive_victory_claim(int from, epoch_t from_epoch);
+  /**
+   * Obtain our epoch
+   *
+   * @returns Our current epoch number
+   */
+  epoch_t get_epoch() const { return epoch; }
+  int get_election_winner() { return last_election_winner; }
+
+private:
+  /**
+   * Initiate the ElectionLogic class.
+   *
+   * Basically, we will simply read whatever epoch value we have in our stable
+   * storage, or consider it to be 1 if none is read.
+   *
+   * @post @p epoch is set to 1 or higher.
+   */
+  void init();
+  /**
+   * Update our epoch.
+   *
+   * If we come across a higher epoch, we simply update ours, also making
+   * sure we are no longer being elected (even though we could have been,
+   * we no longer are since we no longer are on that old epoch).
+   *
+   * @pre Our epoch is not larger than @p e
+   * @post Our epoch equals @p e
+   *
+   * @param e Epoch to which we will update our epoch
+   */
+  void bump_epoch(epoch_t e);
+  /**
+   * If the incoming proposal is newer, bump our own epoch; if
+   * it comes from an out-of-quorum peer, trigger a new election.
+   * @returns true if you should drop this proposal, false otherwise.
+   */
+  bool propose_classic_prefix(int from, epoch_t mepoch);
+  /**
+   * Handle a proposal from another rank using the classic strategy.
+   * We will take one of the following actions:
+   *
+   * @li Ignore it because we already acked another node with higher rank
+   * @li Ignore it and start a new election because we outrank it
+   * @li Defer to it because it outranks us and the node we previously
+   *     acked, if any
+   */
+  void propose_classic_handler(int from, epoch_t mepoch);
+  /**
+   * Handle a proposal from another rank using our disallow strategy.
+   * This is the same as the classic strategy except we also disallow
+   * certain ranks from becoming the leader.
+   */
+  void propose_disallow_handler(int from, epoch_t mepoch);
+  /**
+   * Handle a proposal from another rank using the connectivity strategy.
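+   * (Scores come from connectivity_election_score(), which returns -1 for
+   * ranks on the disallowed list and the summed ConnectionTracker rating
+   * otherwise, so disallowed peers always lose these comparisons.)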
+   * We will choose to defer or not based on the ordered criteria:
+   *
+   * @li Whether the other monitor (or ourself) is on the disallow list
+   * @li Whether the other monitor or ourself has the most connectivity to peers
+   * @li Whether the other monitor or ourself has the lower rank
+   */
+  void propose_connectivity_handler(int from, epoch_t mepoch, const ConnectionTracker *ct);
+  /**
+   * Helper function for the connectivity handler. Combines the disallowed list
+   * with ConnectionTracker scores.
+   */
+  double connectivity_election_score(int rank);
+  /**
+   * Defer the current election to some other monitor.
+   *
+   * This means that we will ack some other monitor and drop out from the run
+   * to become the Leader. We will only defer an election if the monitor we
+   * are deferring to outranks us.
+   *
+   * @pre @p who outranks us (i.e., who < our rank)
+   * @pre @p who outranks any other monitor we have deferred to in the past
+   * @post electing_me is false
+   * @post leader_acked equals @p who
+   * @post we triggered ElectionOwner's _defer_to() on @p who
+   *
+   * @param who Some other monitor's numeric identifier.
+   */
+  void defer(int who);
+  /**
+   * Declare Victory.
+   *
+   * We won. Or at least we believe we won, but for all intents and purposes
+   * that does not matter. What matters is that we Won.
+   *
+   * That said, we must now bump our epoch to reflect that the election is over
+   * and then we must let everybody in the quorum know we are their brand new
+   * Leader.
+   *
+   * Actually, the quorum will be now defined as the group of monitors that
+   * acked us during the election process.
+   *
+   * @pre Election is on-going
+   * @pre electing_me is true
+   * @post electing_me is false
+   * @post epoch is bumped up into an even value
+   * @post Election is not on-going
+   * @post We have a quorum, composed of the monitors that acked us
+   * @post We invoked message_victory() on the ElectionOwner
+   */
+  void declare_victory();
+  /**
+   * This is just a helper function to validate that the victory claim we
+   * get from another rank makes any sense.
+   */
+  bool victory_makes_sense(int from);
+  /**
+   * Reset some data members which we only care about while we are in an election
+   * or need to be set consistently during stable states.
+   */
+  void clear_live_election_state();
+  void reset_stable_tracker();
+  /**
+   * Only for the connectivity handler: bump the epoch
+   * when we get a message from a newer one, and clear
+   * out leader and stable tracker data so that we can
+   * switch our allegiance.
+   */
+  void connectivity_bump_epoch_in_election(epoch_t mepoch);
+};
+
+#endif
diff --git a/src/mon/Elector.cc b/src/mon/Elector.cc
new file mode 100644
index 000000000..671c08d85
--- /dev/null
+++ b/src/mon/Elector.cc
@@ -0,0 +1,807 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ * + */ + +#include "Elector.h" +#include "Monitor.h" + +#include "common/Timer.h" +#include "MonitorDBStore.h" +#include "messages/MMonElection.h" +#include "messages/MMonPing.h" + +#include "common/config.h" +#include "include/ceph_assert.h" + +#define dout_subsys ceph_subsys_mon +#undef dout_prefix +#define dout_prefix _prefix(_dout, mon, get_epoch()) +using std::cerr; +using std::cout; +using std::dec; +using std::hex; +using std::list; +using std::map; +using std::make_pair; +using std::ostream; +using std::ostringstream; +using std::pair; +using std::set; +using std::setfill; +using std::string; +using std::stringstream; +using std::to_string; +using std::vector; +using std::unique_ptr; + +using ceph::bufferlist; +using ceph::decode; +using ceph::encode; +using ceph::Formatter; +using ceph::JSONFormatter; +using ceph::mono_clock; +using ceph::mono_time; +using ceph::timespan_str; +static ostream& _prefix(std::ostream *_dout, Monitor *mon, epoch_t epoch) { + return *_dout << "mon." << mon->name << "@" << mon->rank + << "(" << mon->get_state_name() + << ").elector(" << epoch << ") "; +} + +Elector::Elector(Monitor *m, int strategy) : logic(this, static_cast<ElectionLogic::election_strategy>(strategy), + &peer_tracker, + m->cct->_conf.get_val<double>("mon_elector_ignore_propose_margin"), + m->cct), + peer_tracker(this, m->rank, + m->cct->_conf.get_val<uint64_t>("mon_con_tracker_score_halflife"), + m->cct->_conf.get_val<uint64_t>("mon_con_tracker_persist_interval"), m->cct), + ping_timeout(m->cct->_conf.get_val<double>("mon_elector_ping_timeout")), + PING_DIVISOR(m->cct->_conf.get_val<uint64_t>("mon_elector_ping_divisor")), + mon(m), elector(this) { + bufferlist bl; + mon->store->get(Monitor::MONITOR_NAME, "connectivity_scores", bl); + if (bl.length()) { + bufferlist::const_iterator bi = bl.begin(); + peer_tracker.decode(bi); + } +} + + +void Elector::persist_epoch(epoch_t e) +{ + auto t(std::make_shared<MonitorDBStore::Transaction>()); + t->put(Monitor::MONITOR_NAME, "election_epoch", e); + t->put(Monitor::MONITOR_NAME, "connectivity_scores", peer_tracker.get_encoded_bl()); + mon->store->apply_transaction(t); +} + +void Elector::persist_connectivity_scores() +{ + dout(20) << __func__ << dendl; + auto t(std::make_shared<MonitorDBStore::Transaction>()); + t->put(Monitor::MONITOR_NAME, "connectivity_scores", peer_tracker.get_encoded_bl()); + mon->store->apply_transaction(t); +} + +epoch_t Elector::read_persisted_epoch() const +{ + return mon->store->get(Monitor::MONITOR_NAME, "election_epoch"); +} + +void Elector::validate_store() +{ + auto t(std::make_shared<MonitorDBStore::Transaction>()); + t->put(Monitor::MONITOR_NAME, "election_writeable_test", rand()); + int r = mon->store->apply_transaction(t); + ceph_assert(r >= 0); +} + +bool Elector::is_current_member(int rank) const +{ + return mon->quorum.count(rank); +} + +void Elector::trigger_new_election() +{ + mon->start_election(); +} + +int Elector::get_my_rank() const +{ + return mon->rank; +} + +void Elector::reset_election() +{ + mon->bootstrap(); +} + +bool Elector::ever_participated() const +{ + return mon->has_ever_joined; +} + +unsigned Elector::paxos_size() const +{ + return (unsigned)mon->monmap->size(); +} + +void Elector::shutdown() +{ + cancel_timer(); +} + +void Elector::notify_bump_epoch() +{ + mon->join_election(); +} + +void Elector::propose_to_peers(epoch_t e, bufferlist& logic_bl) +{ + // bcast to everyone else + for (unsigned i=0; i<mon->monmap->size(); ++i) { + if ((int)i == mon->rank) continue; + MMonElection *m = + 
new MMonElection(MMonElection::OP_PROPOSE, e,
+                       peer_tracker.get_encoded_bl(),
+                       logic.strategy, mon->monmap);
+    m->sharing_bl = logic_bl;
+    m->mon_features = ceph::features::mon::get_supported();
+    m->mon_release = ceph_release();
+    mon->send_mon_message(m, i);
+  }
+}
+
+void Elector::_start()
+{
+  peer_info.clear();
+  peer_info[mon->rank].cluster_features = CEPH_FEATURES_ALL;
+  peer_info[mon->rank].mon_release = ceph_release();
+  peer_info[mon->rank].mon_features = ceph::features::mon::get_supported();
+  mon->collect_metadata(&peer_info[mon->rank].metadata);
+  reset_timer();
+}
+
+void Elector::_defer_to(int who)
+{
+  MMonElection *m = new MMonElection(MMonElection::OP_ACK, get_epoch(),
+                                     peer_tracker.get_encoded_bl(),
+                                     logic.strategy, mon->monmap);
+  m->mon_features = ceph::features::mon::get_supported();
+  m->mon_release = ceph_release();
+  mon->collect_metadata(&m->metadata);
+
+  mon->send_mon_message(m, who);
+
+  // set a timer
+  reset_timer(1.0);  // give the leader some extra time to declare victory
+}
+
+
+void Elector::reset_timer(double plus)
+{
+  // set the timer
+  cancel_timer();
+  /**
+   * This callback is invoked when the expire_event timer fires. If it
+   * fires, then it means that we had an election going, either started
+   * by us or by some other participant, but it took too long, thus
+   * expiring.
+   *
+   * When the election expires, we will check if we were the ones who won, and
+   * if so we will declare victory. If that is not the case, then we assume
+   * that the one we deferred to didn't declare victory quickly enough (in fact,
+   * as far as we know, it may even be dead); so, just propose ourselves as the
+   * Leader.
+   */
+  expire_event = mon->timer.add_event_after(
+    g_conf()->mon_election_timeout + plus,
+    new C_MonContext{mon, [this](int) {
+        logic.end_election_period();
+      }});
+}
+
+
+void Elector::cancel_timer()
+{
+  if (expire_event) {
+    mon->timer.cancel_event(expire_event);
+    expire_event = 0;
+  }
+}
+
+void Elector::assimilate_connection_reports(const bufferlist& tbl)
+{
+  dout(10) << __func__ << dendl;
+  ConnectionTracker pct(tbl, mon->cct);
+  peer_tracker.receive_peer_report(pct);
+}
+
+void Elector::message_victory(const std::set<int>& quorum)
+{
+  uint64_t cluster_features = CEPH_FEATURES_ALL;
+  mon_feature_t mon_features = ceph::features::mon::get_supported();
+  map<int,Metadata> metadata;
+  ceph_release_t min_mon_release{ceph_release_t::unknown};
+  for (auto id : quorum) {
+    auto i = peer_info.find(id);
+    ceph_assert(i != peer_info.end());
+    auto& info = i->second;
+    cluster_features &= info.cluster_features;
+    mon_features &= info.mon_features;
+    metadata[id] = info.metadata;
+    if (min_mon_release == ceph_release_t::unknown ||
+        info.mon_release < min_mon_release) {
+      min_mon_release = info.mon_release;
+    }
+  }
+
+  cancel_timer();
+
+
+  // tell everyone!
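+  // (Descriptive note: each peer that acked us gets an OP_VICTORY message
+  // carrying the intersected cluster/mon features, the minimum mon release,
+  // and the leader's command set via sharing_bl, so the whole new quorum
+  // agrees on capabilities.)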
+  for (set<int>::iterator p = quorum.begin();
+       p != quorum.end();
+       ++p) {
+    if (*p == mon->rank) continue;
+    MMonElection *m = new MMonElection(MMonElection::OP_VICTORY, get_epoch(),
+                                       peer_tracker.get_encoded_bl(),
+                                       logic.strategy, mon->monmap);
+    m->quorum = quorum;
+    m->quorum_features = cluster_features;
+    m->mon_features = mon_features;
+    m->sharing_bl = mon->get_local_commands_bl(mon_features);
+    m->mon_release = min_mon_release;
+    mon->send_mon_message(m, *p);
+  }
+
+  // tell monitor
+  mon->win_election(get_epoch(), quorum,
+                    cluster_features, mon_features, min_mon_release,
+                    metadata);
+}
+
+
+void Elector::handle_propose(MonOpRequestRef op)
+{
+  op->mark_event("elector:handle_propose");
+  auto m = op->get_req<MMonElection>();
+  dout(5) << "handle_propose from " << m->get_source() << dendl;
+  int from = m->get_source().num();
+
+  ceph_assert(m->epoch % 2 == 1); // election
+  uint64_t required_features = mon->get_required_features();
+  mon_feature_t required_mon_features = mon->get_required_mon_features();
+
+  dout(10) << __func__ << " required features " << required_features
+           << " " << required_mon_features
+           << ", peer features " << m->get_connection()->get_features()
+           << " " << m->mon_features
+           << dendl;
+
+  if ((required_features ^ m->get_connection()->get_features()) &
+      required_features) {
+    dout(5) << " ignoring propose from mon" << from
+            << " without required features" << dendl;
+    nak_old_peer(op);
+    return;
+  } else if (mon->monmap->min_mon_release > m->mon_release) {
+    dout(5) << " ignoring propose from mon" << from
+            << " release " << (int)m->mon_release
+            << " < min_mon_release " << (int)mon->monmap->min_mon_release
+            << dendl;
+    nak_old_peer(op);
+    return;
+  } else if (!m->mon_features.contains_all(required_mon_features)) {
+    // all the features in 'required_mon_features' not in 'm->mon_features'
+    mon_feature_t missing = required_mon_features.diff(m->mon_features);
+    dout(5) << " ignoring propose from mon." << from
+            << " without required mon_features " << missing
+            << dendl;
+    nak_old_peer(op);
+    return;
+  }
+  ConnectionTracker *oct = NULL;
+  if (m->sharing_bl.length()) {
+    oct = new ConnectionTracker(m->sharing_bl, mon->cct);
+  }
+  logic.receive_propose(from, m->epoch, oct);
+  delete oct;
+}
+
+void Elector::handle_ack(MonOpRequestRef op)
+{
+  op->mark_event("elector:handle_ack");
+  auto m = op->get_req<MMonElection>();
+  dout(5) << "handle_ack from " << m->get_source() << dendl;
+  int from = m->get_source().num();
+
+  ceph_assert(m->epoch == get_epoch());
+  uint64_t required_features = mon->get_required_features();
+  if ((required_features ^ m->get_connection()->get_features()) &
+      required_features) {
+    dout(5) << " ignoring ack from mon" << from
+            << " without required features" << dendl;
+    return;
+  }
+
+  mon_feature_t required_mon_features = mon->get_required_mon_features();
+  if (!m->mon_features.contains_all(required_mon_features)) {
+    mon_feature_t missing = required_mon_features.diff(m->mon_features);
+    dout(5) << " ignoring ack from mon."
<< from + << " without required mon_features " << missing + << dendl; + return; + } + + if (logic.electing_me) { + // thanks + peer_info[from].cluster_features = m->get_connection()->get_features(); + peer_info[from].mon_features = m->mon_features; + peer_info[from].mon_release = m->mon_release; + peer_info[from].metadata = m->metadata; + dout(5) << " so far i have {"; + for (auto q = logic.acked_me.begin(); + q != logic.acked_me.end(); + ++q) { + auto p = peer_info.find(*q); + ceph_assert(p != peer_info.end()); + if (q != logic.acked_me.begin()) + *_dout << ","; + *_dout << " mon." << p->first << ":" + << " features " << p->second.cluster_features + << " " << p->second.mon_features; + } + *_dout << " }" << dendl; + } + + logic.receive_ack(from, m->epoch); +} + +void Elector::handle_victory(MonOpRequestRef op) +{ + op->mark_event("elector:handle_victory"); + auto m = op->get_req<MMonElection>(); + dout(5) << "handle_victory from " << m->get_source() + << " quorum_features " << m->quorum_features + << " " << m->mon_features + << dendl; + int from = m->get_source().num(); + + bool accept_victory = logic.receive_victory_claim(from, m->epoch); + + if (!accept_victory) { + return; + } + + mon->lose_election(get_epoch(), m->quorum, from, + m->quorum_features, m->mon_features, m->mon_release); + + // cancel my timer + cancel_timer(); + + // stash leader's commands + ceph_assert(m->sharing_bl.length()); + vector<MonCommand> new_cmds; + auto bi = m->sharing_bl.cbegin(); + MonCommand::decode_vector(new_cmds, bi); + mon->set_leader_commands(new_cmds); +} + +void Elector::nak_old_peer(MonOpRequestRef op) +{ + op->mark_event("elector:nak_old_peer"); + auto m = op->get_req<MMonElection>(); + uint64_t supported_features = m->get_connection()->get_features(); + uint64_t required_features = mon->get_required_features(); + mon_feature_t required_mon_features = mon->get_required_mon_features(); + dout(10) << "sending nak to peer " << m->get_source() + << " supports " << supported_features << " " << m->mon_features + << ", required " << required_features << " " << required_mon_features + << ", release " << (int)m->mon_release + << " vs required " << (int)mon->monmap->min_mon_release + << dendl; + MMonElection *reply = new MMonElection(MMonElection::OP_NAK, m->epoch, + peer_tracker.get_encoded_bl(), + logic.strategy, mon->monmap); + reply->quorum_features = required_features; + reply->mon_features = required_mon_features; + reply->mon_release = mon->monmap->min_mon_release; + mon->features.encode(reply->sharing_bl); + m->get_connection()->send_message(reply); +} + +void Elector::handle_nak(MonOpRequestRef op) +{ + op->mark_event("elector:handle_nak"); + auto m = op->get_req<MMonElection>(); + dout(1) << "handle_nak from " << m->get_source() + << " quorum_features " << m->quorum_features + << " " << m->mon_features + << " min_mon_release " << (int)m->mon_release + << dendl; + + if (m->mon_release > ceph_release()) { + derr << "Shutting down because I am release " << (int)ceph_release() + << " < min_mon_release " << (int)m->mon_release << dendl; + } else { + CompatSet other; + auto bi = m->sharing_bl.cbegin(); + other.decode(bi); + CompatSet diff = Monitor::get_supported_features().unsupported(other); + + mon_feature_t mon_supported = ceph::features::mon::get_supported(); + // all features in 'm->mon_features' not in 'mon_supported' + mon_feature_t mon_diff = m->mon_features.diff(mon_supported); + + derr << "Shutting down because I lack required monitor features: { " + << diff << " } " << mon_diff << dendl; + } 
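+  // Either way we are incompatible with this quorum's requirements, so
+  // shutting down is safer than continuing to disrupt elections we can
+  // never win.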
+  exit(0);
+  // the end!
+}
+
+void Elector::begin_peer_ping(int peer)
+{
+  dout(20) << __func__ << " against " << peer << dendl;
+  if (live_pinging.count(peer)) {
+    dout(20) << peer << " already in live_pinging ... return " << dendl;
+    return;
+  }
+
+  if (!mon->get_quorum_mon_features().contains_all(
+        ceph::features::mon::FEATURE_PINGING)) {
+    return;
+  }
+
+  peer_tracker.report_live_connection(peer, 0); // init this peer as existing
+  live_pinging.insert(peer);
+  dead_pinging.erase(peer);
+  peer_acked_ping[peer] = ceph_clock_now();
+  if (!send_peer_ping(peer)) return;
+  mon->timer.add_event_after(ping_timeout / PING_DIVISOR,
+                             new C_MonContext{mon, [this, peer](int) {
+                                 ping_check(peer);
+                               }});
+}
+
+bool Elector::send_peer_ping(int peer, const utime_t *n)
+{
+  dout(10) << __func__ << " to peer " << peer << dendl;
+  if (static_cast<unsigned>(peer) >= mon->monmap->ranks.size()) {
+    // Monitor no longer exists in the monmap,
+    // therefore, we shouldn't ping this monitor
+    // since we cannot lookup the address!
+    dout(5) << "peer: " << peer << " >= ranks_size: "
+            << mon->monmap->ranks.size() << " ... dropping to prevent "
+            << "https://tracker.ceph.com/issues/50089" << dendl;
+    live_pinging.erase(peer);
+    return false;
+  }
+  utime_t now;
+  if (n != NULL) {
+    now = *n;
+  } else {
+    now = ceph_clock_now();
+  }
+  MMonPing *ping = new MMonPing(MMonPing::PING, now, peer_tracker.get_encoded_bl());
+  mon->messenger->send_to_mon(ping, mon->monmap->get_addrs(peer));
+  peer_sent_ping[peer] = now;
+  return true;
+}
+
+void Elector::ping_check(int peer)
+{
+  dout(20) << __func__ << " to peer " << peer << dendl;
+
+  if (!live_pinging.count(peer) &&
+      !dead_pinging.count(peer)) {
+    dout(20) << __func__ << " " << peer << " is no longer marked for pinging" << dendl;
+    return;
+  }
+  utime_t now = ceph_clock_now();
+  utime_t& acked_ping = peer_acked_ping[peer];
+  utime_t& newest_ping = peer_sent_ping[peer];
+  if (!acked_ping.is_zero() && acked_ping < now - ping_timeout) {
+    peer_tracker.report_dead_connection(peer, now - acked_ping);
+    acked_ping = now;
+    begin_dead_ping(peer);
+    return;
+  }
+
+  if (acked_ping == newest_ping) {
+    if (!send_peer_ping(peer, &now)) return;
+  }
+
+  mon->timer.add_event_after(ping_timeout / PING_DIVISOR,
+                             new C_MonContext{mon, [this, peer](int) {
+                                 ping_check(peer);
+                               }});
+}
+
+void Elector::begin_dead_ping(int peer)
+{
+  dout(20) << __func__ << " to peer " << peer << dendl;
+  if (dead_pinging.count(peer)) {
+    return;
+  }
+
+  live_pinging.erase(peer);
+  dead_pinging.insert(peer);
+  mon->timer.add_event_after(ping_timeout,
+                             new C_MonContext{mon, [this, peer](int) {
+                                 dead_ping(peer);
+                               }});
+}
+
+void Elector::dead_ping(int peer)
+{
+  dout(20) << __func__ << " to peer " << peer << dendl;
+  if (!dead_pinging.count(peer)) {
+    dout(20) << __func__ << " " << peer << " is no longer marked for dead pinging" << dendl;
+    return;
+  }
+  ceph_assert(!live_pinging.count(peer));
+
+  utime_t now = ceph_clock_now();
+  utime_t& acked_ping = peer_acked_ping[peer];
+
+  peer_tracker.report_dead_connection(peer, now - acked_ping);
+  acked_ping = now;
+  mon->timer.add_event_after(ping_timeout,
+                             new C_MonContext{mon, [this, peer](int) {
+                                 dead_ping(peer);
+                               }});
+}
+
+void Elector::handle_ping(MonOpRequestRef op)
+{
+  MMonPing *m = static_cast<MMonPing*>(op->get_req());
+  int prank = mon->monmap->get_rank(m->get_source_addr());
+  dout(20) << __func__ << " from: " << prank << dendl;
+  begin_peer_ping(prank);
+  assimilate_connection_reports(m->tracker_bl);
+  switch (m->op) {
+  case MMonPing::PING:
+    {
+      MMonPing *reply = new
MMonPing(MMonPing::PING_REPLY, m->stamp, peer_tracker.get_encoded_bl()); + m->get_connection()->send_message(reply); + } + break; + + case MMonPing::PING_REPLY: + + const utime_t& previous_acked = peer_acked_ping[prank]; + const utime_t& newest = peer_sent_ping[prank]; + + if (m->stamp > newest && !newest.is_zero()) { + derr << "dropping PING_REPLY stamp " << m->stamp + << " as it is newer than newest sent " << newest << dendl; + return; + } + + if (m->stamp > previous_acked) { + dout(20) << "m->stamp > previous_acked" << dendl; + peer_tracker.report_live_connection(prank, m->stamp - previous_acked); + peer_acked_ping[prank] = m->stamp; + } else{ + dout(20) << "m->stamp <= previous_acked .. we don't report_live_connection" << dendl; + } + utime_t now = ceph_clock_now(); + dout(30) << "now: " << now << " m->stamp: " << m->stamp << " ping_timeout: " + << ping_timeout << " PING_DIVISOR: " << PING_DIVISOR << dendl; + if (now - m->stamp > ping_timeout / PING_DIVISOR) { + if (!send_peer_ping(prank, &now)) return; + } + break; + } +} + +void Elector::dispatch(MonOpRequestRef op) +{ + op->mark_event("elector:dispatch"); + ceph_assert(op->is_type_election_or_ping()); + + switch (op->get_req()->get_type()) { + + case MSG_MON_ELECTION: + { + if (!logic.participating) { + return; + } + if (op->get_req()->get_source().num() >= mon->monmap->size()) { + dout(5) << " ignoring bogus election message with bad mon rank " + << op->get_req()->get_source() << dendl; + return; + } + + auto em = op->get_req<MMonElection>(); + dout(20) << __func__ << " from: " << mon->monmap->get_rank(em->get_source_addr()) << dendl; + // assume an old message encoding would have matched + if (em->fsid != mon->monmap->fsid) { + dout(0) << " ignoring election msg fsid " + << em->fsid << " != " << mon->monmap->fsid << dendl; + return; + } + + if (!mon->monmap->contains(em->get_source_addr())) { + dout(1) << "discarding election message: " << em->get_source_addr() + << " not in my monmap " << *mon->monmap << dendl; + return; + } + + MonMap peermap; + peermap.decode(em->monmap_bl); + if (peermap.epoch > mon->monmap->epoch) { + dout(0) << em->get_source_inst() << " has newer monmap epoch " << peermap.epoch + << " > my epoch " << mon->monmap->epoch + << ", taking it" + << dendl; + mon->monmap->decode(em->monmap_bl); + auto t(std::make_shared<MonitorDBStore::Transaction>()); + t->put("monmap", mon->monmap->epoch, em->monmap_bl); + t->put("monmap", "last_committed", mon->monmap->epoch); + mon->store->apply_transaction(t); + //mon->monmon()->paxos->stash_latest(mon->monmap->epoch, em->monmap_bl); + cancel_timer(); + mon->notify_new_monmap(false); + mon->bootstrap(); + return; + } + if (peermap.epoch < mon->monmap->epoch) { + dout(0) << em->get_source_inst() << " has older monmap epoch " << peermap.epoch + << " < my epoch " << mon->monmap->epoch + << dendl; + } + + if (em->strategy != logic.strategy) { + dout(5) << __func__ << " somehow got an Election message with different strategy " + << em->strategy << " from local " << logic.strategy + << "; dropping for now to let race resolve" << dendl; + return; + } + + if (em->scoring_bl.length()) { + assimilate_connection_reports(em->scoring_bl); + } + + begin_peer_ping(mon->monmap->get_rank(em->get_source_addr())); + switch (em->op) { + case MMonElection::OP_PROPOSE: + handle_propose(op); + return; + } + + if (em->epoch < get_epoch()) { + dout(5) << "old epoch, dropping" << dendl; + break; + } + + switch (em->op) { + case MMonElection::OP_ACK: + handle_ack(op); + return; + case 
MMonElection::OP_VICTORY:
+        handle_victory(op);
+        return;
+      case MMonElection::OP_NAK:
+        handle_nak(op);
+        return;
+      default:
+        ceph_abort();
+      }
+    }
+    break;
+
+  case MSG_MON_PING:
+    handle_ping(op);
+    break;
+
+  default:
+    ceph_abort();
+  }
+}
+
+void Elector::start_participating()
+{
+  logic.participating = true;
+}
+
+bool Elector::peer_tracker_is_clean()
+{
+  return peer_tracker.is_clean(mon->rank, paxos_size());
+}
+
+void Elector::notify_clear_peer_state()
+{
+  dout(10) << __func__ << dendl;
+  dout(20) << " peer_tracker before: " << peer_tracker << dendl;
+  peer_tracker.notify_reset();
+  peer_tracker.set_rank(mon->rank);
+  dout(20) << " peer_tracker after: " << peer_tracker << dendl;
+}
+
+void Elector::notify_rank_changed(int new_rank)
+{
+  dout(10) << __func__ << " to " << new_rank << dendl;
+  peer_tracker.notify_rank_changed(new_rank);
+  live_pinging.erase(new_rank);
+  dead_pinging.erase(new_rank);
+}
+
+void Elector::notify_rank_removed(int rank_removed, int new_rank)
+{
+  dout(10) << __func__ << ": " << rank_removed << dendl;
+  peer_tracker.notify_rank_removed(rank_removed, new_rank);
+  /* we have to clean up the pinging state, which is annoying
+     because it's not indexed anywhere (and adding indexing
+     would also be annoying).
+     In the case where we are removing any rank that is not the
+     highest, we start with the removed rank and examine the state
+     of the surrounding ranks.
+     Everybody who remains with a larger rank gets a new rank one lower
+     than before, and we have to figure out what to do with the
+     remaining scheduled ping contexts. So, starting one past the
+     removed rank, we:
+     * check if the current rank is alive or dead
+     * examine our new rank (one less than before, initially the removed
+       rank)
+       * erase it if it's in the wrong set
+       * start pinging it if we're not already
+     * check if the next rank is in the same pinging set, and delete
+       ourselves if not.
+     In the case where we are removing the highest rank,
+     we erase the removed rank from all sets.
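+     For example, dropping rank 1 from ranks {0,1,2,3}: old ranks 2 and 3
+     become new ranks 1 and 2, so each one's live/dead pinging membership
+     is shifted down one slot and the stale entry for the old top rank
+     is erased.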
+ */ + if (rank_removed < paxos_size()) { + for (unsigned i = rank_removed + 1; i <= paxos_size() ; ++i) { + if (live_pinging.count(i)) { + dead_pinging.erase(i-1); + if (!live_pinging.count(i-1)) { + begin_peer_ping(i-1); + } + if (!live_pinging.count(i+1)) { + live_pinging.erase(i); + } + } + else if (dead_pinging.count(i)) { + live_pinging.erase(i-1); + if (!dead_pinging.count(i-1)) { + begin_dead_ping(i-1); + } + if (!dead_pinging.count(i+1)) { + dead_pinging.erase(i); + } + } else { + // we aren't pinging rank i at all + if (i-1 == (unsigned)rank_removed) { + // so we special case to make sure we + // actually nuke the removed rank + dead_pinging.erase(rank_removed); + live_pinging.erase(rank_removed); + } + } + } + } else { + if (live_pinging.count(rank_removed)) { + live_pinging.erase(rank_removed); + } + if (dead_pinging.count(rank_removed)) { + dead_pinging.erase(rank_removed); + } + } +} + +void Elector::notify_strategy_maybe_changed(int strategy) +{ + logic.set_election_strategy(static_cast<ElectionLogic::election_strategy>(strategy)); +} diff --git a/src/mon/Elector.h b/src/mon/Elector.h new file mode 100644 index 000000000..2a53c1fc4 --- /dev/null +++ b/src/mon/Elector.h @@ -0,0 +1,406 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#ifndef CEPH_MON_ELECTOR_H +#define CEPH_MON_ELECTOR_H + +#include <map> + +#include "include/types.h" +#include "include/Context.h" +#include "mon/MonOpRequest.h" +#include "mon/mon_types.h" +#include "mon/ElectionLogic.h" +#include "mon/ConnectionTracker.h" + +class Monitor; + + +/** + * This class is responsible for handling messages and maintaining + * an ElectionLogic which holds the local state when electing + * a new Leader. We may win or we may lose. If we win, it means we became the + * Leader; if we lose, it means we are a Peon. + */ +class Elector : public ElectionOwner, RankProvider { + /** + * @defgroup Elector_h_class Elector + * @{ + */ + ElectionLogic logic; + // connectivity validation and scoring + ConnectionTracker peer_tracker; + map<int, utime_t> peer_acked_ping; // rank -> last ping stamp they acked + map<int, utime_t> peer_sent_ping; // rank -> last ping stamp we sent + set<int> live_pinging; // ranks which we are currently pinging + set<int> dead_pinging; // ranks which didn't answer (degrading scores) + double ping_timeout; // the timeout after which we consider a ping to be dead + int PING_DIVISOR = 2; // we time out pings + + /** + * @defgroup Elector_h_internal_types Internal Types + * @{ + */ + /** + * This struct will hold the features from a given peer. + * Features may both be the cluster's (in the form of a uint64_t), or + * mon-specific features. Instead of keeping maps to hold them both, or + * a pair, which would be weird, a struct to keep them seems appropriate. + */ + struct elector_info_t { + uint64_t cluster_features = 0; + mon_feature_t mon_features; + ceph_release_t mon_release{0}; + std::map<std::string,std::string> metadata; + }; + + /** + * @} + */ + + /** + * The Monitor instance associated with this class. 
+   */
+  Monitor *mon;
+
+  /**
+   * Event callback responsible for dealing with an expired election once a
+   * timer runs out and fires up.
+   */
+  Context *expire_event = nullptr;
+
+  /**
+   * Resets the expire_event timer, by cancelling any existing one and
+   * scheduling a new one.
+   *
+   * @remarks This function assumes as a default firing value the duration of
+   *          the monitor's lease interval, and adds to it the value specified
+   *          in @e plus
+   *
+   * @post expire_event is set
+   *
+   * @param plus The amount of time to be added to the default firing value.
+   */
+  void reset_timer(double plus=0.0);
+  /**
+   * Cancel the expire_event timer, if it is defined.
+   *
+   * @post expire_event is not set
+   */
+  void cancel_timer();
+
+  // electing me
+  /**
+   * @defgroup Elector_h_electing_me_vars We are being elected
+   * @{
+   */
+  /**
+   * Map containing info of all those that acked our proposal to become the Leader.
+   * Note each peer's info.
+   */
+  std::map<int, elector_info_t> peer_info;
+  /**
+   * @}
+   */
+
+  /**
+   * Handle a message from some other node proposing itself to become
+   * the Leader.
+   *
+   * We validate that the sending Monitor is allowed to participate based on
+   * its supported features, then pass the request to our ElectionLogic.
+   *
+   * @invariant The received message is an operation of type OP_PROPOSE
+   *
+   * @pre Message epoch is from the current or a newer epoch
+   *
+   * @param m A message sent by another participant in the quorum.
+   */
+  void handle_propose(MonOpRequestRef op);
+  /**
+   * Handle a message from some other participant Acking us as the Leader.
+   *
+   * We validate that the sending Monitor is allowed to participate based on
+   * its supported features, add it to peer_info, and pass the ack to our
+   * ElectionLogic.
+   *
+   * @pre Message epoch is from the current or a newer epoch
+   *
+   * @param m A message with an operation type of OP_ACK
+   */
+  void handle_ack(MonOpRequestRef op);
+  /**
+   * Handle a message from some other participant declaring Victory.
+   *
+   * We just got a message from someone declaring themselves Victorious, thus
+   * the new Leader.
+   *
+   * We pass the Victory to our ElectionLogic, and if it confirms the
+   * victory we lose the election and start following this Leader. Otherwise,
+   * drop the message.
+   *
+   * @pre Message epoch is from the current or a newer epoch
+   * @post Election is not on-going
+   * @post Updated @p epoch
+   * @post We have a new quorum if we lost the election
+   *
+   * @param m A message with an operation type of OP_VICTORY
+   */
+  void handle_victory(MonOpRequestRef op);
+  /**
+   * Send a nak to a peer who's out of date, containing information about why.
+   *
+   * If we get a message from a peer who can't support the required quorum
+   * features, we have to ignore them. This function will at least send
+   * them a message about *why* they're being ignored -- if they're new
+   * enough to support such a message.
+   *
+   * @param m A message from a monitor not supporting required features. We
+   *          take ownership of the reference.
+   */
+  void nak_old_peer(MonOpRequestRef op);
+  /**
+   * Handle a message from some other participant declaring
+   * we cannot join the quorum.
+   *
+   * Apparently the quorum requires some feature that we do not implement. Shut
+   * down gracefully.
+   *
+   * @pre Election is on-going.
+   * @post We've shut down.
+   *
+   * @param m A message with an operation type of OP_NAK
+   */
+  void handle_nak(MonOpRequestRef op);
+  /**
+   * Send a ping to the specified peer.
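+   * Returns false (and drops the peer from live_pinging) if the peer's
+   * rank is no longer present in the monmap; returns true once the ping
+   * has been sent.
+   *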
+ * @n optional time that we will use instead of calling ceph_clock_now() + */ + bool send_peer_ping(int peer, const utime_t *n=NULL); + /** + * Check the state of pinging the specified peer. This is our + * "tick" for heartbeating; scheduled by itself and begin_peer_ping(). + */ + void ping_check(int peer); + /** + * Move the peer out of live_pinging into dead_pinging set + * and schedule dead_ping()ing on it. + */ + void begin_dead_ping(int peer); + /** + * Checks that the peer is still marked for dead pinging, + * and then marks it as dead for the appropriate interval. + */ + void dead_ping(int peer); + /** + * Handle a ping from another monitor and assimilate the data it contains. + */ + void handle_ping(MonOpRequestRef op); + /** + * Update our view of everybody else's connectivity based on the provided + * tracker bufferlist + */ + void assimilate_connection_reports(const bufferlist& bl); + + public: + /** + * @defgroup Elector_h_ElectionOwner Functions from the ElectionOwner interface + * @{ + */ + /* Commit the given epoch to our MonStore. + * We also take the opportunity to persist our peer_tracker. + */ + void persist_epoch(epoch_t e); + /* Read the epoch out of our MonStore */ + epoch_t read_persisted_epoch() const; + /* Write a nonsense key "election_writeable_test" to our MonStore */ + void validate_store(); + /* Reset my tracking. Currently, just call Monitor::join_election() */ + void notify_bump_epoch(); + /* Call a new election: Invoke Monitor::start_election() */ + void trigger_new_election(); + /* Retrieve rank from the Monitor */ + int get_my_rank() const; + /* Send MMonElection OP_PROPOSE to every monitor in the map. */ + void propose_to_peers(epoch_t e, bufferlist &bl); + /* bootstrap() the Monitor */ + void reset_election(); + /* Retrieve the Monitor::has_ever_joined member */ + bool ever_participated() const; + /* Retrieve monmap->size() */ + unsigned paxos_size() const; + /* Right now we don't disallow anybody */ + set<int> disallowed_leaders; + const set<int>& get_disallowed_leaders() const { return disallowed_leaders; } + /** + * Reset the expire_event timer so we can limit the amount of time we + * will be electing. Clean up our peer_info. + * + * @post we reset the expire_event timer + */ + void _start(); + /** + * Send an MMonElection message deferring to the identified monitor. We + * also increase the election timeout so the monitor we defer to + * has some time to gather deferrals and actually win. (FIXME: necessary to protocol?) + * + * @post we sent an ack message to @p who + * @post we reset the expire_event timer + * + * @param who Some other monitor's numeric identifier. + */ + void _defer_to(int who); + /** + * Our ElectionLogic told us we won an election! Identify the quorum + * features, tell our new peons we've won, and invoke Monitor::win_election(). + */ + void message_victory(const std::set<int>& quorum); + /* Check if rank is in mon->quorum */ + bool is_current_member(int rank) const; + /* + * @} + */ + /** + * Persist our peer_tracker to disk. + */ + void persist_connectivity_scores(); + + Elector *elector; + + /** + * Create an Elector class + * + * @param m A Monitor instance + * @param strategy The election strategy to use, defined in MonMap/ElectionLogic + */ + explicit Elector(Monitor *m, int strategy); + virtual ~Elector() {} + + /** + * Inform this class it is supposed to shutdown. + * + * We will simply cancel the @p expire_event if any exists. 
+ * + * @post @p expire_event is cancelled + */ + void shutdown(); + + /** + * Obtain our epoch from ElectionLogic. + * + * @returns Our current epoch number + */ + epoch_t get_epoch() { return logic.get_epoch(); } + + /** + * If the Monitor knows there are no Paxos peers (so + * we are rank 0 and there are no others) we can declare victory. + */ + void declare_standalone_victory() { + logic.declare_standalone_victory(); + } + /** + * Tell the Elector to start pinging a given peer. + * Do this when you discover a peer and it has a rank assigned. + * We do it ourselves on receipt of pings and when receiving other messages. + */ + void begin_peer_ping(int peer); + /** + * Handle received messages. + * + * We will ignore all messages that are not of type @p MSG_MON_ELECTION + * (i.e., messages whose interface is not of type @p MMonElection). All of + * those that are will then be dispatched to their operation-specific + * functions. + * + * @param m A received message + */ + void dispatch(MonOpRequestRef op); + + /** + * Call an election. + * + * This function simply calls ElectionLogic::start. + */ + void call_election() { + logic.start(); + } + + /** + * Stop participating in subsequent Elections. + * + * @post @p participating is false + */ + void stop_participating() { logic.participating = false; } + /** + * Start participating in Elections. + * + * If we are already participating (i.e., @p participating is true), then + * calling this function is moot. + * + * However, if we are not participating (i.e., @p participating is false), + * then we will start participating by setting @p participating to true and + * we will call for an Election. + * + * @post @p participating is true + */ + void start_participating(); + /** + * Check if our peer_tracker is self-consistent, not suffering from + * https://tracker.ceph.com/issues/58049 + */ + bool peer_tracker_is_clean(); + /** + * Forget everything about our peers. :( + */ + void notify_clear_peer_state(); + /** + * Notify that our local rank has changed + * and we may need to update internal data structures. + */ + void notify_rank_changed(int new_rank); + /** + * A peer has been removed so we should clean up state related to it. + * This is safe to call even if we haven't joined or are currently + * in a quorum. + */ + void notify_rank_removed(int rank_removed, int new_rank); + void notify_strategy_maybe_changed(int strategy); + /** + * Set the disallowed leaders. + * + * If you call this and the new disallowed set + * contains your current leader, you are + * responsible for calling an election! + * + * @returns false if the set is unchanged, + * true if the set changed + */ + bool set_disallowed_leaders(const set<int>& dl) { + if (dl == disallowed_leaders) return false; + disallowed_leaders = dl; + return true; + } + void dump_connection_scores(Formatter *f) { + f->open_object_section("connection scores"); + peer_tracker.dump(f); + f->close_section(); + } + /** + * @} + */ +}; + +#endif diff --git a/src/mon/FSCommands.cc b/src/mon/FSCommands.cc new file mode 100644 index 000000000..0b1bb2a03 --- /dev/null +++ b/src/mon/FSCommands.cc @@ -0,0 +1,1516 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2017 Red Hat Ltd + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. 
See file COPYING. + * + */ + + +#include "OSDMonitor.h" + +#include "FSCommands.h" +#include "MDSMonitor.h" +#include "MgrStatMonitor.h" +#include "mds/cephfs_features.h" + +using TOPNSPC::common::cmd_getval; + +using std::dec; +using std::hex; +using std::list; +using std::map; +using std::make_pair; +using std::pair; +using std::set; +using std::string; +using std::to_string; +using std::vector; + +using ceph::bufferlist; +using ceph::decode; +using ceph::encode; +using ceph::ErasureCodeInterfaceRef; +using ceph::ErasureCodeProfile; +using ceph::Formatter; +using ceph::JSONFormatter; +using ceph::make_message; +using ceph::mono_clock; +using ceph::mono_time; + +class FlagSetHandler : public FileSystemCommandHandler +{ + public: + FlagSetHandler() + : FileSystemCommandHandler("fs flag set") + { + } + + int handle( + Monitor *mon, + FSMap& fsmap, + MonOpRequestRef op, + const cmdmap_t& cmdmap, + std::ostream &ss) override + { + string flag_name; + cmd_getval(cmdmap, "flag_name", flag_name); + + string flag_val; + cmd_getval(cmdmap, "val", flag_val); + + bool sure = false; + cmd_getval(cmdmap, "yes_i_really_mean_it", sure); + + if (flag_name == "enable_multiple") { + bool flag_bool = false; + int r = parse_bool(flag_val, &flag_bool, ss); + if (r != 0) { + ss << "Invalid boolean value '" << flag_val << "'"; + return r; + } + + fsmap.set_enable_multiple(flag_bool); + return 0; + } else { + ss << "Unknown flag '" << flag_name << "'"; + return -EINVAL; + } + } +}; + +class FailHandler : public FileSystemCommandHandler +{ + public: + FailHandler() + : FileSystemCommandHandler("fs fail") + { + } + + int handle( + Monitor* mon, + FSMap& fsmap, + MonOpRequestRef op, + const cmdmap_t& cmdmap, + std::ostream& ss) override + { + if (!mon->osdmon()->is_writeable()) { + // not allowed to write yet, so retry when we can + mon->osdmon()->wait_for_writeable(op, new PaxosService::C_RetryMessage(mon->mdsmon(), op)); + return -EAGAIN; + } + + std::string fs_name; + if (!cmd_getval(cmdmap, "fs_name", fs_name) || fs_name.empty()) { + ss << "Missing filesystem name"; + return -EINVAL; + } + + auto fs = fsmap.get_filesystem(fs_name); + + auto f = [](auto fs) { + fs->mds_map.set_flag(CEPH_MDSMAP_NOT_JOINABLE); + }; + fsmap.modify_filesystem(fs->fscid, std::move(f)); + + std::vector<mds_gid_t> to_fail; + for (const auto& p : fs->mds_map.get_mds_info()) { + to_fail.push_back(p.first); + } + + for (const auto& gid : to_fail) { + mon->mdsmon()->fail_mds_gid(fsmap, gid); + } + if (!to_fail.empty()) { + mon->osdmon()->propose_pending(); + } + + ss << fs_name; + ss << " marked not joinable; MDS cannot join the cluster. 
All MDS ranks marked failed.";
+
+    return 0;
+  }
+};
+
+class FsNewHandler : public FileSystemCommandHandler
+{
+  public:
+  explicit FsNewHandler(Paxos *paxos)
+    : FileSystemCommandHandler("fs new"), m_paxos(paxos)
+  {
+  }
+
+  bool batched_propose() override {
+    return true;
+  }
+
+  int handle(
+      Monitor *mon,
+      FSMap& fsmap,
+      MonOpRequestRef op,
+      const cmdmap_t& cmdmap,
+      std::ostream &ss) override
+  {
+    ceph_assert(m_paxos->is_plugged());
+
+    string metadata_name;
+    cmd_getval(cmdmap, "metadata", metadata_name);
+    int64_t metadata = mon->osdmon()->osdmap.lookup_pg_pool_name(metadata_name);
+    if (metadata < 0) {
+      ss << "pool '" << metadata_name << "' does not exist";
+      return -ENOENT;
+    }
+
+    string data_name;
+    cmd_getval(cmdmap, "data", data_name);
+    int64_t data = mon->osdmon()->osdmap.lookup_pg_pool_name(data_name);
+    if (data < 0) {
+      ss << "pool '" << data_name << "' does not exist";
+      return -ENOENT;
+    }
+    if (data == 0) {
+      ss << "pool '" << data_name << "' has id 0, which CephFS does not allow. Use another pool or recreate it to get a non-zero pool id.";
+      return -EINVAL;
+    }
+
+    string fs_name;
+    cmd_getval(cmdmap, "fs_name", fs_name);
+    if (fs_name.empty()) {
+        // Ensure fs name is not empty so that we can implement
+        // commands that refer to FS by name in future.
+        ss << "Filesystem name may not be empty";
+        return -EINVAL;
+    }
+
+    if (fsmap.get_filesystem(fs_name)) {
+      auto fs = fsmap.get_filesystem(fs_name);
+      if (*(fs->mds_map.get_data_pools().begin()) == data
+          && fs->mds_map.get_metadata_pool() == metadata) {
+        // Identical FS created already, this is a no-op
+        ss << "filesystem '" << fs_name << "' already exists";
+        return 0;
+      } else {
+        ss << "filesystem already exists with name '" << fs_name << "'";
+        return -EINVAL;
+      }
+    }
+
+    bool force = false;
+    cmd_getval(cmdmap, "force", force);
+
+    const pool_stat_t *stat = mon->mgrstatmon()->get_pool_stat(metadata);
+    if (stat) {
+      int64_t metadata_num_objects = stat->stats.sum.num_objects;
+      if (!force && metadata_num_objects > 0) {
+        ss << "pool '" << metadata_name
+           << "' already contains some objects. Use an empty pool instead.";
+        return -EINVAL;
+      }
+    }
+
+    if (fsmap.filesystem_count() > 0
+        && !fsmap.get_enable_multiple()) {
+      ss << "Creation of multiple filesystems is disabled. To enable "
+            "this experimental feature, use 'ceph fs flag set enable_multiple "
+            "true'";
+      return -EINVAL;
+    }
+
+    for (auto& fs : fsmap.get_filesystems()) {
+      const std::vector<int64_t> &data_pools = fs->mds_map.get_data_pools();
+
+      bool sure = false;
+      cmd_getval(cmdmap,
+                 "allow_dangerous_metadata_overlay", sure);
+
+      if ((std::find(data_pools.begin(), data_pools.end(), data) != data_pools.end()
+           || fs->mds_map.get_metadata_pool() == metadata)
+          && !sure) {
+        ss << "Filesystem '" << fs_name
+           << "' is already using one of the specified RADOS pools. This should ONLY be done in emergencies and after careful reading of the documentation. 
Pass --allow-dangerous-metadata-overlay to permit this."; + return -EEXIST; + } + } + + int64_t fscid = FS_CLUSTER_ID_NONE; + if (cmd_getval(cmdmap, "fscid", fscid)) { + if (!force) { + ss << "Pass --force to create a file system with a specific ID"; + return -EINVAL; + } + if (fsmap.filesystem_exists(fscid)) { + ss << "filesystem already exists with id '" << fscid << "'"; + return -EINVAL; + } + } + + pg_pool_t const *data_pool = mon->osdmon()->osdmap.get_pg_pool(data); + ceph_assert(data_pool != NULL); // Checked it existed above + pg_pool_t const *metadata_pool = mon->osdmon()->osdmap.get_pg_pool(metadata); + ceph_assert(metadata_pool != NULL); // Checked it existed above + + int r = _check_pool(mon->osdmon()->osdmap, data, POOL_DATA_DEFAULT, force, &ss); + if (r < 0) { + return r; + } + + r = _check_pool(mon->osdmon()->osdmap, metadata, POOL_METADATA, force, &ss); + if (r < 0) { + return r; + } + + if (!mon->osdmon()->is_writeable()) { + // not allowed to write yet, so retry when we can + mon->osdmon()->wait_for_writeable(op, new PaxosService::C_RetryMessage(mon->mdsmon(), op)); + return -EAGAIN; + } + mon->osdmon()->do_application_enable(data, + pg_pool_t::APPLICATION_NAME_CEPHFS, + "data", fs_name, true); + mon->osdmon()->do_application_enable(metadata, + pg_pool_t::APPLICATION_NAME_CEPHFS, + "metadata", fs_name, true); + mon->osdmon()->do_set_pool_opt(metadata, + pool_opts_t::RECOVERY_PRIORITY, + static_cast<int64_t>(5)); + mon->osdmon()->do_set_pool_opt(metadata, + pool_opts_t::PG_NUM_MIN, + static_cast<int64_t>(16)); + mon->osdmon()->do_set_pool_opt(metadata, + pool_opts_t::PG_AUTOSCALE_BIAS, + static_cast<double>(4.0)); + mon->osdmon()->propose_pending(); + + bool recover = false; + cmd_getval(cmdmap, "recover", recover); + + // All checks passed, go ahead and create. + auto&& fs = fsmap.create_filesystem(fs_name, metadata, data, + mon->get_quorum_con_features(), fscid, recover); + + ss << "new fs with metadata pool " << metadata << " and data pool " << data; + + if (recover) { + return 0; + } + + // assign a standby to rank 0 to avoid health warnings + auto info = fsmap.find_replacement_for({fs->fscid, 0}); + + if (info) { + mon->clog->info() << info->human_name() << " assigned to filesystem " + << fs_name << " as rank 0"; + fsmap.promote(info->global_id, *fs, 0); + } + + return 0; + } + +private: + Paxos *m_paxos; +}; + +class SetHandler : public FileSystemCommandHandler +{ +public: + SetHandler() + : FileSystemCommandHandler("fs set") + {} + + int handle( + Monitor *mon, + FSMap& fsmap, + MonOpRequestRef op, + const cmdmap_t& cmdmap, + std::ostream &ss) override + { + std::string fs_name; + if (!cmd_getval(cmdmap, "fs_name", fs_name) || fs_name.empty()) { + ss << "Missing filesystem name"; + return -EINVAL; + } + + auto fs = fsmap.get_filesystem(fs_name); + string var; + if (!cmd_getval(cmdmap, "var", var) || var.empty()) { + ss << "Invalid variable"; + return -EINVAL; + } + string val; + string interr; + int64_t n = 0; + if (!cmd_getval(cmdmap, "val", val)) { + return -EINVAL; + } + // we got a string. see if it contains an int. + n = strict_strtoll(val.c_str(), 10, &interr); + if (var == "max_mds") { + // NOTE: see also "mds set_max_mds", which can modify the same field. 
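+      // "val" was parsed with strict_strtoll above; a non-empty interr
+      // means it was not a valid integer.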
+      if (interr.length()) {
+        ss << interr;
+        return -EINVAL;
+      }
+
+      if (n <= 0) {
+        ss << "You must specify at least one MDS";
+        return -EINVAL;
+      }
+
+      if (n > 1 && n > fs->mds_map.get_max_mds()) {
+        if (fs->mds_map.was_snaps_ever_allowed() &&
+            !fs->mds_map.allows_multimds_snaps()) {
+          ss << "multi-active MDS is not allowed while there are snapshots possibly created by pre-mimic MDS";
+          return -EINVAL;
+        }
+      }
+      if (n > MAX_MDS) {
+        ss << "may not have more than " << MAX_MDS << " MDS ranks";
+        return -EINVAL;
+      }
+
+      fsmap.modify_filesystem(
+          fs->fscid,
+          [n](std::shared_ptr<Filesystem> fs)
+        {
+          fs->mds_map.clear_flag(CEPH_MDSMAP_NOT_JOINABLE);
+          fs->mds_map.set_max_mds(n);
+        });
+    } else if (var == "inline_data") {
+      bool enable_inline = false;
+      int r = parse_bool(val, &enable_inline, ss);
+      if (r != 0) {
+        return r;
+      }
+
+      if (enable_inline) {
+        bool confirm = false;
+        cmd_getval(cmdmap, "yes_i_really_really_mean_it", confirm);
+        if (!confirm) {
+          ss << "Inline data support is deprecated and will be removed in a future release. "
+             << "Add --yes-i-really-really-mean-it if you are certain you want this enabled.";
+          return -EPERM;
+        }
+        ss << "inline data enabled";
+
+        fsmap.modify_filesystem(
+            fs->fscid,
+            [](std::shared_ptr<Filesystem> fs)
+          {
+            fs->mds_map.set_inline_data_enabled(true);
+          });
+      } else {
+        ss << "inline data disabled";
+        fsmap.modify_filesystem(
+            fs->fscid,
+            [](std::shared_ptr<Filesystem> fs)
+          {
+            fs->mds_map.set_inline_data_enabled(false);
+          });
+      }
+    } else if (var == "balancer") {
+      if (val.empty()) {
+        ss << "unsetting the metadata load balancer";
+      } else {
+        ss << "setting the metadata load balancer to " << val;
+      }
+      fsmap.modify_filesystem(
+          fs->fscid,
+          [val](std::shared_ptr<Filesystem> fs)
+        {
+          fs->mds_map.set_balancer(val);
+        });
+      return 0;
+    } else if (var == "max_file_size") {
+      if (interr.length()) {
+        ss << var << " requires an integer value";
+        return -EINVAL;
+      }
+      if (n < CEPH_MIN_STRIPE_UNIT) {
+        ss << var << " must be at least " << CEPH_MIN_STRIPE_UNIT;
+        return -ERANGE;
+      }
+      fsmap.modify_filesystem(
+          fs->fscid,
+          [n](std::shared_ptr<Filesystem> fs)
+        {
+          fs->mds_map.set_max_filesize(n);
+        });
+    } else if (var == "allow_new_snaps") {
+      bool enable_snaps = false;
+      int r = parse_bool(val, &enable_snaps, ss);
+      if (r != 0) {
+        return r;
+      }
+
+      if (!enable_snaps) {
+        fsmap.modify_filesystem(
+            fs->fscid,
+            [](std::shared_ptr<Filesystem> fs)
+          {
+            fs->mds_map.clear_snaps_allowed();
+          });
+        ss << "disabled new snapshots";
+      } else {
+        fsmap.modify_filesystem(
+            fs->fscid,
+            [](std::shared_ptr<Filesystem> fs)
+          {
+            fs->mds_map.set_snaps_allowed();
+          });
+        ss << "enabled new snapshots";
+      }
+    } else if (var == "allow_multimds") {
+      ss << "Multiple MDS is always enabled. Use the max_mds"
+         << " parameter to control the number of active MDSs"
+         << " allowed. This command is DEPRECATED and will be"
+         << " REMOVED from future releases.";
+    } else if (var == "allow_multimds_snaps") {
+      bool enable = false;
+      int r = parse_bool(val, &enable, ss);
+      if (r != 0) {
+        return r;
+      }
+
+      string confirm;
+      if (!cmd_getval(cmdmap, "confirm", confirm) ||
+          confirm != "--yes-i-am-really-a-mds") {
+        ss << "Warning! This command is for MDS only. 
Do not run it manually"; + return -EPERM; + } + + if (enable) { + ss << "enabled multimds with snapshot"; + fsmap.modify_filesystem( + fs->fscid, + [](std::shared_ptr<Filesystem> fs) + { + fs->mds_map.set_multimds_snaps_allowed(); + }); + } else { + ss << "disabled multimds with snapshot"; + fsmap.modify_filesystem( + fs->fscid, + [](std::shared_ptr<Filesystem> fs) + { + fs->mds_map.clear_multimds_snaps_allowed(); + }); + } + } else if (var == "allow_dirfrags") { + ss << "Directory fragmentation is now permanently enabled." + << " This command is DEPRECATED and will be REMOVED from future releases."; + } else if (var == "down") { + bool is_down = false; + int r = parse_bool(val, &is_down, ss); + if (r != 0) { + return r; + } + + ss << fs->mds_map.get_fs_name(); + + fsmap.modify_filesystem( + fs->fscid, + [is_down](std::shared_ptr<Filesystem> fs) + { + if (is_down) { + if (fs->mds_map.get_max_mds() > 0) { + fs->mds_map.set_old_max_mds(); + fs->mds_map.set_max_mds(0); + } /* else already down! */ + } else { + mds_rank_t oldmax = fs->mds_map.get_old_max_mds(); + fs->mds_map.set_max_mds(oldmax ? oldmax : 1); + } + }); + + if (is_down) { + ss << " marked down. "; + } else { + ss << " marked up, max_mds = " << fs->mds_map.get_max_mds(); + } + } else if (var == "cluster_down" || var == "joinable") { + bool joinable = true; + int r = parse_bool(val, &joinable, ss); + if (r != 0) { + return r; + } + if (var == "cluster_down") { + joinable = !joinable; + } + + ss << fs->mds_map.get_fs_name(); + + fsmap.modify_filesystem( + fs->fscid, + [joinable](std::shared_ptr<Filesystem> fs) + { + if (joinable) { + fs->mds_map.clear_flag(CEPH_MDSMAP_NOT_JOINABLE); + } else { + fs->mds_map.set_flag(CEPH_MDSMAP_NOT_JOINABLE); + } + }); + + if (joinable) { + ss << " marked joinable; MDS may join as newly active."; + } else { + ss << " marked not joinable; MDS cannot join as newly active."; + } + + if (var == "cluster_down") { + ss << " WARNING: cluster_down flag is deprecated and will be" + << " removed in a future version. 
Please use \"joinable\"."; + } + } else if (var == "standby_count_wanted") { + if (interr.length()) { + ss << var << " requires an integer value"; + return -EINVAL; + } + if (n < 0) { + ss << var << " must be non-negative"; + return -ERANGE; + } + fsmap.modify_filesystem( + fs->fscid, + [n](std::shared_ptr<Filesystem> fs) + { + fs->mds_map.set_standby_count_wanted(n); + }); + } else if (var == "session_timeout") { + if (interr.length()) { + ss << var << " requires an integer value"; + return -EINVAL; + } + if (n < 30) { + ss << var << " must be at least 30s"; + return -ERANGE; + } + fsmap.modify_filesystem( + fs->fscid, + [n](std::shared_ptr<Filesystem> fs) + { + fs->mds_map.set_session_timeout((uint32_t)n); + }); + } else if (var == "session_autoclose") { + if (interr.length()) { + ss << var << " requires an integer value"; + return -EINVAL; + } + if (n < 30) { + ss << var << " must be at least 30s"; + return -ERANGE; + } + fsmap.modify_filesystem( + fs->fscid, + [n](std::shared_ptr<Filesystem> fs) + { + fs->mds_map.set_session_autoclose((uint32_t)n); + }); + } else if (var == "allow_standby_replay") { + bool allow = false; + int r = parse_bool(val, &allow, ss); + if (r != 0) { + return r; + } + + if (!allow) { + if (!mon->osdmon()->is_writeable()) { + // not allowed to write yet, so retry when we can + mon->osdmon()->wait_for_writeable(op, new PaxosService::C_RetryMessage(mon->mdsmon(), op)); + return -EAGAIN; + } + std::vector<mds_gid_t> to_fail; + for (const auto& [gid, info]: fs->mds_map.get_mds_info()) { + if (info.state == MDSMap::STATE_STANDBY_REPLAY) { + to_fail.push_back(gid); + } + } + + for (const auto& gid : to_fail) { + mon->mdsmon()->fail_mds_gid(fsmap, gid); + } + if (!to_fail.empty()) { + mon->osdmon()->propose_pending(); + } + } + + auto f = [allow](auto& fs) { + if (allow) { + fs->mds_map.set_standby_replay_allowed(); + } else { + fs->mds_map.clear_standby_replay_allowed(); + } + }; + fsmap.modify_filesystem(fs->fscid, std::move(f)); + } else if (var == "min_compat_client") { + auto vno = ceph_release_from_name(val.c_str()); + if (!vno) { + ss << "version " << val << " is not recognized"; + return -EINVAL; + } + ss << "WARNING: setting min_compat_client is deprecated" + " and may not do what you want.\n" + "The oldest release to set is octopus.\n" + "Please migrate to `ceph fs required_client_features ...`."; + auto f = [vno](auto&& fs) { + fs->mds_map.set_min_compat_client(vno); + }; + fsmap.modify_filesystem(fs->fscid, std::move(f)); + } else { + ss << "unknown variable " << var; + return -EINVAL; + } + + return 0; + } +}; + +class CompatSetHandler : public FileSystemCommandHandler +{ + public: + CompatSetHandler() + : FileSystemCommandHandler("fs compat") + { + } + + int handle( + Monitor *mon, + FSMap &fsmap, + MonOpRequestRef op, + const cmdmap_t& cmdmap, + std::ostream &ss) override + { + static const std::set<std::string> subops = {"rm_incompat", "rm_compat", "add_incompat", "add_compat"}; + + std::string fs_name; + if (!cmd_getval(cmdmap, "fs_name", fs_name) || fs_name.empty()) { + ss << "Missing filesystem name"; + return -EINVAL; + } + auto fs = fsmap.get_filesystem(fs_name); + if (fs == nullptr) { + ss << "Not found: '" << fs_name << "'"; + return -ENOENT; + } + + string subop; + if (!cmd_getval(cmdmap, "subop", subop) || subops.count(subop) == 0) { + ss << "subop `" << subop << "' not recognized. 
Must be one of: " << subops; + return -EINVAL; + } + + int64_t feature; + if (!cmd_getval(cmdmap, "feature", feature) || feature <= 0) { + ss << "Invalid feature"; + return -EINVAL; + } + + if (fs->mds_map.get_num_up_mds() > 0) { + ss << "file system must be failed or down; use `ceph fs fail` to bring down"; + return -EBUSY; + } + + CompatSet cs = fs->mds_map.compat; + if (subop == "rm_compat") { + if (cs.compat.contains(feature)) { + ss << "removed compat feature " << feature; + cs.compat.remove(feature); + } else { + ss << "already removed compat feature " << feature; + } + } else if (subop == "rm_incompat") { + if (cs.incompat.contains(feature)) { + ss << "removed incompat feature " << feature; + cs.incompat.remove(feature); + } else { + ss << "already removed incompat feature " << feature; + } + } else if (subop == "add_compat" || subop == "add_incompat") { + string feature_str; + if (!cmd_getval(cmdmap, "feature_str", feature_str) || feature_str.empty()) { + ss << "adding a feature requires a feature string"; + return -EINVAL; + } + auto f = CompatSet::Feature(feature, feature_str); + if (subop == "add_compat") { + if (cs.compat.contains(feature)) { + auto name = cs.compat.get_name(feature); + if (name == feature_str) { + ss << "feature already exists"; + } else { + ss << "feature with differing name `" << name << "' exists"; + return -EEXIST; + } + } else { + cs.compat.insert(f); + ss << "added compat feature " << f; + } + } else if (subop == "add_incompat") { + if (cs.incompat.contains(feature)) { + auto name = cs.incompat.get_name(feature); + if (name == feature_str) { + ss << "feature already exists"; + } else { + ss << "feature with differing name `" << name << "' exists"; + return -EEXIST; + } + } else { + cs.incompat.insert(f); + ss << "added incompat feature " << f; + } + } else ceph_assert(0); + } else ceph_assert(0); + + auto modifyf = [cs = std::move(cs)](auto&& fs) { + fs->mds_map.compat = cs; + }; + + fsmap.modify_filesystem(fs->fscid, std::move(modifyf)); + return 0; + } +}; + +class RequiredClientFeaturesHandler : public FileSystemCommandHandler +{ + public: + RequiredClientFeaturesHandler() + : FileSystemCommandHandler("fs required_client_features") + { + } + + int handle( + Monitor *mon, + FSMap &fsmap, + MonOpRequestRef op, + const cmdmap_t& cmdmap, + std::ostream &ss) override + { + std::string fs_name; + if (!cmd_getval(cmdmap, "fs_name", fs_name) || fs_name.empty()) { + ss << "Missing filesystem name"; + return -EINVAL; + } + auto fs = fsmap.get_filesystem(fs_name); + if (fs == nullptr) { + ss << "Not found: '" << fs_name << "'"; + return -ENOENT; + } + string subop; + if (!cmd_getval(cmdmap, "subop", subop) || + (subop != "add" && subop != "rm")) { + ss << "Must either add or rm a feature; " << subop << " is not recognized"; + return -EINVAL; + } + string val; + if (!cmd_getval(cmdmap, "val", val) || val.empty()) { + ss << "Missing feature id/name"; + return -EINVAL; + } + + int feature = cephfs_feature_from_name(val); + if (feature < 0) { + string err; + feature = strict_strtol(val.c_str(), 10, &err); + if (err.length()) { + ss << "Invalid feature name: " << val; + return -EINVAL; + } + if (feature < 0 || feature > CEPHFS_FEATURE_MAX) { + ss << "Invalid feature id: " << feature; + return -EINVAL; + } + } + + if (subop == "add") { + bool ret = false; + fsmap.modify_filesystem( + fs->fscid, + [feature, &ret](auto&& fs) + { + if (fs->mds_map.get_required_client_features().test(feature)) + return; + fs->mds_map.add_required_client_feature(feature); + ret = true; + 
}); + if (ret) { + ss << "added feature '" << cephfs_feature_name(feature) << "' to required_client_features"; + } else { + ss << "feature '" << cephfs_feature_name(feature) << "' is already set"; + } + } else { + bool ret = false; + fsmap.modify_filesystem( + fs->fscid, + [feature, &ret](auto&& fs) + { + if (!fs->mds_map.get_required_client_features().test(feature)) + return; + fs->mds_map.remove_required_client_feature(feature); + ret = true; + }); + if (ret) { + ss << "removed feature '" << cephfs_feature_name(feature) << "' from required_client_features"; + } else { + ss << "feature '" << cephfs_feature_name(feature) << "' is already unset"; + } + } + return 0; + } +}; + + +class AddDataPoolHandler : public FileSystemCommandHandler +{ + public: + explicit AddDataPoolHandler(Paxos *paxos) + : FileSystemCommandHandler("fs add_data_pool"), m_paxos(paxos) + {} + + bool batched_propose() override { + return true; + } + + int handle( + Monitor *mon, + FSMap& fsmap, + MonOpRequestRef op, + const cmdmap_t& cmdmap, + std::ostream &ss) override + { + ceph_assert(m_paxos->is_plugged()); + + string poolname; + cmd_getval(cmdmap, "pool", poolname); + + std::string fs_name; + if (!cmd_getval(cmdmap, "fs_name", fs_name) + || fs_name.empty()) { + ss << "Missing filesystem name"; + return -EINVAL; + } + + int64_t poolid = mon->osdmon()->osdmap.lookup_pg_pool_name(poolname); + if (poolid < 0) { + string err; + poolid = strict_strtol(poolname.c_str(), 10, &err); + if (err.length()) { + ss << "pool '" << poolname << "' does not exist"; + return -ENOENT; + } + } + + int r = _check_pool(mon->osdmon()->osdmap, poolid, POOL_DATA_EXTRA, false, &ss); + if (r != 0) { + return r; + } + + auto fs = fsmap.get_filesystem(fs_name); + // no-op when the data_pool already on fs + if (fs->mds_map.is_data_pool(poolid)) { + ss << "data pool " << poolid << " is already on fs " << fs_name; + return 0; + } + + if (!mon->osdmon()->is_writeable()) { + // not allowed to write yet, so retry when we can + mon->osdmon()->wait_for_writeable(op, new PaxosService::C_RetryMessage(mon->mdsmon(), op)); + return -EAGAIN; + } + mon->osdmon()->do_application_enable(poolid, + pg_pool_t::APPLICATION_NAME_CEPHFS, + "data", fs_name, true); + mon->osdmon()->propose_pending(); + + fsmap.modify_filesystem( + fs->fscid, + [poolid](std::shared_ptr<Filesystem> fs) + { + fs->mds_map.add_data_pool(poolid); + }); + + ss << "added data pool " << poolid << " to fsmap"; + + return 0; + } + +private: + Paxos *m_paxos; +}; + +class SetDefaultHandler : public FileSystemCommandHandler +{ + public: + SetDefaultHandler() + : FileSystemCommandHandler("fs set-default") + {} + + int handle( + Monitor *mon, + FSMap& fsmap, + MonOpRequestRef op, + const cmdmap_t& cmdmap, + std::ostream &ss) override + { + std::string fs_name; + cmd_getval(cmdmap, "fs_name", fs_name); + auto fs = fsmap.get_filesystem(fs_name); + if (fs == nullptr) { + ss << "filesystem '" << fs_name << "' does not exist"; + return -ENOENT; + } + + fsmap.set_legacy_client_fscid(fs->fscid); + return 0; + } +}; + +class RemoveFilesystemHandler : public FileSystemCommandHandler +{ + public: + RemoveFilesystemHandler() + : FileSystemCommandHandler("fs rm") + {} + + int handle( + Monitor *mon, + FSMap& fsmap, + MonOpRequestRef op, + const cmdmap_t& cmdmap, + std::ostream &ss) override + { + /* We may need to blocklist ranks. 
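+     Blocklisting writes to the OSDMap, so if the osdmon cannot accept
+     writes right now we return -EAGAIN and retry once it is writeable.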
*/ + if (!mon->osdmon()->is_writeable()) { + // not allowed to write yet, so retry when we can + mon->osdmon()->wait_for_writeable(op, new PaxosService::C_RetryMessage(mon->mdsmon(), op)); + return -EAGAIN; + } + + // Check caller has correctly named the FS to delete + // (redundant while there is only one FS, but command + // syntax should apply to multi-FS future) + string fs_name; + cmd_getval(cmdmap, "fs_name", fs_name); + auto fs = fsmap.get_filesystem(fs_name); + if (fs == nullptr) { + // Consider absence success to make deletes idempotent + ss << "filesystem '" << fs_name << "' does not exist"; + return 0; + } + + // Check that no MDS daemons are active + if (fs->mds_map.get_num_up_mds() > 0) { + ss << "all MDS daemons must be inactive/failed before removing filesystem. See `ceph fs fail`."; + return -EINVAL; + } + + // Check for confirmation flag + bool sure = false; + cmd_getval(cmdmap, "yes_i_really_mean_it", sure); + if (!sure) { + ss << "this is a DESTRUCTIVE operation and will make data in your filesystem permanently" \ + " inaccessible. Add --yes-i-really-mean-it if you are sure you wish to continue."; + return -EPERM; + } + + if (fsmap.get_legacy_client_fscid() == fs->fscid) { + fsmap.set_legacy_client_fscid(FS_CLUSTER_ID_NONE); + } + + std::vector<mds_gid_t> to_fail; + // There may be standby_replay daemons left here + for (const auto &i : fs->mds_map.get_mds_info()) { + ceph_assert(i.second.state == MDSMap::STATE_STANDBY_REPLAY); + to_fail.push_back(i.first); + } + + for (const auto &gid : to_fail) { + // Standby replays don't write, so it isn't important to + // wait for an osdmap propose here: ignore return value. + mon->mdsmon()->fail_mds_gid(fsmap, gid); + } + if (!to_fail.empty()) { + mon->osdmon()->propose_pending(); /* maybe new blocklists */ + } + + fsmap.erase_filesystem(fs->fscid); + + return 0; + } +}; + +class ResetFilesystemHandler : public FileSystemCommandHandler +{ + public: + ResetFilesystemHandler() + : FileSystemCommandHandler("fs reset") + {} + + int handle( + Monitor *mon, + FSMap& fsmap, + MonOpRequestRef op, + const cmdmap_t& cmdmap, + std::ostream &ss) override + { + string fs_name; + cmd_getval(cmdmap, "fs_name", fs_name); + auto fs = fsmap.get_filesystem(fs_name); + if (fs == nullptr) { + ss << "filesystem '" << fs_name << "' does not exist"; + // Unlike fs rm, we consider this case an error + return -ENOENT; + } + + // Check that no MDS daemons are active + if (fs->mds_map.get_num_up_mds() > 0) { + ss << "all MDS daemons must be inactive before resetting filesystem: set the cluster_down flag" + " and use `ceph mds fail` to make this so"; + return -EINVAL; + } + + // Check for confirmation flag + bool sure = false; + cmd_getval(cmdmap, "yes_i_really_mean_it", sure); + if (!sure) { + ss << "this is a potentially destructive operation, only for use by experts in disaster recovery. 
" + "Add --yes-i-really-mean-it if you are sure you wish to continue."; + return -EPERM; + } + + fsmap.reset_filesystem(fs->fscid); + + return 0; + } +}; + +class RemoveDataPoolHandler : public FileSystemCommandHandler +{ + public: + RemoveDataPoolHandler() + : FileSystemCommandHandler("fs rm_data_pool") + {} + + int handle( + Monitor *mon, + FSMap& fsmap, + MonOpRequestRef op, + const cmdmap_t& cmdmap, + std::ostream &ss) override + { + string poolname; + cmd_getval(cmdmap, "pool", poolname); + + std::string fs_name; + if (!cmd_getval(cmdmap, "fs_name", fs_name) + || fs_name.empty()) { + ss << "Missing filesystem name"; + return -EINVAL; + } + + int64_t poolid = mon->osdmon()->osdmap.lookup_pg_pool_name(poolname); + if (poolid < 0) { + string err; + poolid = strict_strtol(poolname.c_str(), 10, &err); + if (err.length()) { + ss << "pool '" << poolname << "' does not exist"; + return -ENOENT; + } else if (poolid < 0) { + ss << "invalid pool id '" << poolid << "'"; + return -EINVAL; + } + } + + ceph_assert(poolid >= 0); // Checked by parsing code above + + auto fs = fsmap.get_filesystem(fs_name); + if (fs->mds_map.get_first_data_pool() == poolid) { + ss << "cannot remove default data pool"; + return -EINVAL; + } + + int r = 0; + fsmap.modify_filesystem(fs->fscid, + [&r, poolid](std::shared_ptr<Filesystem> fs) + { + r = fs->mds_map.remove_data_pool(poolid); + }); + if (r == -ENOENT) { + // It was already removed, succeed in silence + return 0; + } else if (r == 0) { + // We removed it, succeed + ss << "removed data pool " << poolid << " from fsmap"; + return 0; + } else { + // Unexpected error, bubble up + return r; + } + } +}; + +/** + * For commands with an alternative prefix + */ +template<typename T> +class AliasHandler : public T +{ + std::string alias_prefix; + + public: + explicit AliasHandler(const std::string &new_prefix) + : T() + { + alias_prefix = new_prefix; + } + + std::string const &get_prefix() const override {return alias_prefix;} + + int handle( + Monitor *mon, + FSMap& fsmap, + MonOpRequestRef op, + const cmdmap_t& cmdmap, + std::ostream &ss) override + { + return T::handle(mon, fsmap, op, cmdmap, ss); + } +}; + +class MirrorHandlerEnable : public FileSystemCommandHandler +{ +public: + MirrorHandlerEnable() + : FileSystemCommandHandler("fs mirror enable") + {} + + int handle(Monitor *mon, + FSMap &fsmap, MonOpRequestRef op, + const cmdmap_t& cmdmap, std::ostream &ss) override { + std::string fs_name; + if (!cmd_getval(cmdmap, "fs_name", fs_name) || fs_name.empty()) { + ss << "Missing filesystem name"; + return -EINVAL; + } + + auto fs = fsmap.get_filesystem(fs_name); + if (fs == nullptr) { + ss << "Filesystem '" << fs_name << "' not found"; + return -ENOENT; + } + + if (fs->mirror_info.is_mirrored()) { + return 0; + } + + auto f = [](auto &&fs) { + fs->mirror_info.enable_mirroring(); + }; + fsmap.modify_filesystem(fs->fscid, std::move(f)); + + return 0; + } +}; + +class MirrorHandlerDisable : public FileSystemCommandHandler +{ +public: + MirrorHandlerDisable() + : FileSystemCommandHandler("fs mirror disable") + {} + + int handle(Monitor *mon, + FSMap &fsmap, MonOpRequestRef op, + const cmdmap_t& cmdmap, std::ostream &ss) override { + std::string fs_name; + if (!cmd_getval(cmdmap, "fs_name", fs_name) || fs_name.empty()) { + ss << "Missing filesystem name"; + return -EINVAL; + } + + auto fs = fsmap.get_filesystem(fs_name); + if (fs == nullptr) { + ss << "Filesystem '" << fs_name << "' not found"; + return -ENOENT; + } + + if (!fs->mirror_info.is_mirrored()) { + return 0; + } 
+ + auto f = [](auto &&fs) { + fs->mirror_info.disable_mirroring(); + }; + fsmap.modify_filesystem(fs->fscid, std::move(f)); + + return 0; + } +}; + +class MirrorHandlerAddPeer : public FileSystemCommandHandler +{ +public: + MirrorHandlerAddPeer() + : FileSystemCommandHandler("fs mirror peer_add") + {} + + boost::optional<std::pair<string, string>> + extract_remote_cluster_conf(const std::string &spec) { + auto pos = spec.find("@"); + if (pos == std::string_view::npos) { + return boost::optional<std::pair<string, string>>(); + } + + auto client = spec.substr(0, pos); + auto cluster = spec.substr(pos+1); + + return std::make_pair(client, cluster); + } + + bool peer_add(FSMap &fsmap, Filesystem::const_ref &&fs, + const cmdmap_t &cmdmap, std::ostream &ss) { + string peer_uuid; + string remote_spec; + string remote_fs_name; + cmd_getval(cmdmap, "uuid", peer_uuid); + cmd_getval(cmdmap, "remote_cluster_spec", remote_spec); + cmd_getval(cmdmap, "remote_fs_name", remote_fs_name); + + // verify (and extract) remote cluster specification + auto remote_conf = extract_remote_cluster_conf(remote_spec); + if (!remote_conf) { + ss << "invalid remote cluster spec -- should be <client>@<cluster>"; + return false; + } + + if (fs->mirror_info.has_peer(peer_uuid)) { + ss << "peer already exists"; + return true; + } + if (fs->mirror_info.has_peer((*remote_conf).first, (*remote_conf).second, + remote_fs_name)) { + ss << "peer already exists"; + return true; + } + + auto f = [peer_uuid, remote_conf, remote_fs_name](auto &&fs) { + fs->mirror_info.peer_add(peer_uuid, (*remote_conf).first, + (*remote_conf).second, remote_fs_name); + }; + fsmap.modify_filesystem(fs->fscid, std::move(f)); + return true; + } + + int handle(Monitor *mon, + FSMap &fsmap, MonOpRequestRef op, + const cmdmap_t& cmdmap, std::ostream &ss) override { + std::string fs_name; + if (!cmd_getval(cmdmap, "fs_name", fs_name) || fs_name.empty()) { + ss << "Missing filesystem name"; + return -EINVAL; + } + + auto fs = fsmap.get_filesystem(fs_name); + if (fs == nullptr) { + ss << "Filesystem '" << fs_name << "' not found"; + return -ENOENT; + } + + if (!fs->mirror_info.is_mirrored()) { + ss << "Mirroring not enabled for filesystem '" << fs_name << "'"; + return -EINVAL; + } + + auto res = peer_add(fsmap, std::move(fs), cmdmap, ss); + if (!res) { + return -EINVAL; + } + + return 0; + } +}; + +class MirrorHandlerRemovePeer : public FileSystemCommandHandler +{ +public: + MirrorHandlerRemovePeer() + : FileSystemCommandHandler("fs mirror peer_remove") + {} + + bool peer_remove(FSMap &fsmap, Filesystem::const_ref &&fs, + const cmdmap_t &cmdmap, std::ostream &ss) { + string peer_uuid; + cmd_getval(cmdmap, "uuid", peer_uuid); + + if (!fs->mirror_info.has_peer(peer_uuid)) { + ss << "cannot find peer with uuid: " << peer_uuid; + return true; + } + + auto f = [peer_uuid](auto &&fs) { + fs->mirror_info.peer_remove(peer_uuid); + }; + fsmap.modify_filesystem(fs->fscid, std::move(f)); + return true; + } + + int handle(Monitor *mon, + FSMap &fsmap, MonOpRequestRef op, + const cmdmap_t& cmdmap, std::ostream &ss) override { + std::string fs_name; + if (!cmd_getval(cmdmap, "fs_name", fs_name) || fs_name.empty()) { + ss << "Missing filesystem name"; + return -EINVAL; + } + + auto fs = fsmap.get_filesystem(fs_name); + if (fs == nullptr) { + ss << "Filesystem '" << fs_name << "' not found"; + return -ENOENT; + } + + if (!fs->mirror_info.is_mirrored()) { + ss << "Mirroring not enabled for filesystem '" << fs_name << "'"; + return -EINVAL; + } + + auto res = 
peer_remove(fsmap, std::move(fs), cmdmap, ss); + if (!res) { + return -EINVAL; + } + + return 0; + } +}; + +std::list<std::shared_ptr<FileSystemCommandHandler> > +FileSystemCommandHandler::load(Paxos *paxos) +{ + std::list<std::shared_ptr<FileSystemCommandHandler> > handlers; + + handlers.push_back(std::make_shared<SetHandler>()); + handlers.push_back(std::make_shared<FailHandler>()); + handlers.push_back(std::make_shared<FlagSetHandler>()); + handlers.push_back(std::make_shared<CompatSetHandler>()); + handlers.push_back(std::make_shared<RequiredClientFeaturesHandler>()); + handlers.push_back(std::make_shared<AddDataPoolHandler>(paxos)); + handlers.push_back(std::make_shared<RemoveDataPoolHandler>()); + handlers.push_back(std::make_shared<FsNewHandler>(paxos)); + handlers.push_back(std::make_shared<RemoveFilesystemHandler>()); + handlers.push_back(std::make_shared<ResetFilesystemHandler>()); + + handlers.push_back(std::make_shared<SetDefaultHandler>()); + handlers.push_back(std::make_shared<AliasHandler<SetDefaultHandler> >( + "fs set_default")); + handlers.push_back(std::make_shared<MirrorHandlerEnable>()); + handlers.push_back(std::make_shared<MirrorHandlerDisable>()); + handlers.push_back(std::make_shared<MirrorHandlerAddPeer>()); + handlers.push_back(std::make_shared<MirrorHandlerRemovePeer>()); + + return handlers; +} + +int FileSystemCommandHandler::_check_pool( + OSDMap &osd_map, + const int64_t pool_id, + int type, + bool force, + std::ostream *ss) const +{ + ceph_assert(ss != NULL); + + const pg_pool_t *pool = osd_map.get_pg_pool(pool_id); + if (!pool) { + *ss << "pool id '" << pool_id << "' does not exist"; + return -ENOENT; + } + + const string& pool_name = osd_map.get_pool_name(pool_id); + + if (pool->is_erasure()) { + if (type == POOL_METADATA) { + *ss << "pool '" << pool_name << "' (id '" << pool_id << "')" + << " is an erasure-coded pool. Use of erasure-coded pools" + << " for CephFS metadata is not permitted"; + return -EINVAL; + } else if (type == POOL_DATA_DEFAULT && !force) { + *ss << "pool '" << pool_name << "' (id '" << pool_id << "')" + " is an erasure-coded pool." + " Use of an EC pool for the default data pool is discouraged;" + " see the online CephFS documentation for more information." + " Use --force to override."; + return -EINVAL; + } else if (!pool->allows_ecoverwrites()) { + // non-overwriteable EC pools are only acceptable with a cache tier overlay + if (!pool->has_tiers() || !pool->has_read_tier() || !pool->has_write_tier()) { + *ss << "pool '" << pool_name << "' (id '" << pool_id << "')" + << " is an erasure-coded pool, with no overwrite support"; + return -EINVAL; + } + + // That cache tier overlay must be writeback, not readonly (it's the + // write operations like modify+truncate we care about support for) + const pg_pool_t *write_tier = osd_map.get_pg_pool( + pool->write_tier); + ceph_assert(write_tier != NULL); // OSDMonitor shouldn't allow DNE tier + if (write_tier->cache_mode == pg_pool_t::CACHEMODE_FORWARD + || write_tier->cache_mode == pg_pool_t::CACHEMODE_READONLY) { + *ss << "EC pool '" << pool_name << "' has a write tier (" + << osd_map.get_pool_name(pool->write_tier) + << ") that is configured " + "to forward writes. 
Use a cache mode such as 'writeback' for "
+               "CephFS";
+        return -EINVAL;
+      }
+    }
+  }
+
+  if (pool->is_tier()) {
+    *ss << " pool '" << pool_name << "' (id '" << pool_id
+       << "') is already in use as a cache tier.";
+    return -EINVAL;
+  }
+
+  if (!force && !pool->application_metadata.empty() &&
+      pool->application_metadata.count(
+        pg_pool_t::APPLICATION_NAME_CEPHFS) == 0) {
+    *ss << " pool '" << pool_name << "' (id '" << pool_id
+       << "') has a non-CephFS application enabled.";
+    return -EINVAL;
+  }
+
+  // Nothing special about this pool, so it is permissible
+  return 0;
+}
+
+int FileSystemCommandHandler::is_op_allowed(
+    const MonOpRequestRef& op, const FSMap& fsmap, const cmdmap_t& cmdmap,
+    std::ostream &ss) const
+{
+  string fs_name;
+  cmd_getval(cmdmap, "fs_name", fs_name);
+
+  // so that the fsmap can be filtered and the original copy is untouched
+  FSMap fsmap_copy = fsmap;
+  fsmap_copy.filter(op->get_session()->get_allowed_fs_names());
+
+  auto fs = fsmap_copy.get_filesystem(fs_name);
+  if (fs == nullptr) {
+    /* let "fs rm" handle idempotent case where file system does not exist */
+    if (!(get_prefix() == "fs rm" && fsmap.get_filesystem(fs_name) == nullptr)) {
+      ss << "Filesystem not found: '" << fs_name << "'";
+      return -ENOENT;
+    }
+  }
+
+  if (!op->get_session()->fs_name_capable(fs_name, MON_CAP_W)) {
+    ss << "Permission denied: '" << fs_name << "'";
+    return -EPERM;
+  }
+
+  return 1;
+}
diff --git a/src/mon/FSCommands.h b/src/mon/FSCommands.h
new file mode 100644
index 000000000..4b59225f9
--- /dev/null
+++ b/src/mon/FSCommands.h
@@ -0,0 +1,93 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 Red Hat Ltd
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+
+#ifndef FS_COMMANDS_H_
+#define FS_COMMANDS_H_
+
+#include "Monitor.h"
+#include "CommandHandler.h"
+
+#include "osd/OSDMap.h"
+#include "mds/FSMap.h"
+
+#include <string>
+#include <ostream>
+
+class FileSystemCommandHandler : protected CommandHandler
+{
+protected:
+  std::string prefix;
+
+  enum {
+    POOL_METADATA,
+    POOL_DATA_DEFAULT,
+    POOL_DATA_EXTRA,
+  };
+  /**
+   * Return 0 if the pool is suitable for use with CephFS, or
+   * in case of errors return a negative error code, and populate
+   * the passed ostream with an explanation.
+   *
+   * @param type the pool's intended role (POOL_METADATA is subject
+   *             to stricter checks than the data pool types)
+   */
+  int _check_pool(
+      OSDMap &osd_map,
+      const int64_t pool_id,
+      int type,
+      bool force,
+      std::ostream *ss) const;
+
+  virtual std::string const &get_prefix() const {return prefix;}
+
+public:
+  FileSystemCommandHandler(const std::string &prefix_)
+    : prefix(prefix_)
+  {}
+
+  virtual ~FileSystemCommandHandler()
+  {}
+
+  int is_op_allowed(const MonOpRequestRef& op, const FSMap& fsmap,
+                    const cmdmap_t& cmdmap, std::ostream &ss) const;
+
+  int can_handle(std::string const &prefix_, MonOpRequestRef& op, FSMap& fsmap,
+                 const cmdmap_t& cmdmap, std::ostream &ss) const
+  {
+    if (get_prefix() != prefix_) {
+      return 0;
+    }
+
+    if (get_prefix() == "fs new" || get_prefix() == "fs flag set") {
+      return 1;
+    }
+
+    return is_op_allowed(op, fsmap, cmdmap, ss);
+  }
+
+  static std::list<std::shared_ptr<FileSystemCommandHandler> > load(Paxos *paxos);
+
+  virtual bool batched_propose() {
+    return false;
+  }
+
+  virtual int handle(
+    Monitor *mon,
+    FSMap &fsmap,
+    MonOpRequestRef op,
+    const cmdmap_t& cmdmap,
+    std::ostream &ss) = 0;
+};
+
+#endif
diff --git a/src/mon/HealthMonitor.cc b/src/mon/HealthMonitor.cc
new file mode 100644
index 000000000..a45159e7c
--- /dev/null
+++ b/src/mon/HealthMonitor.cc
@@ -0,0 +1,877 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Inktank, Inc
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include <stdlib.h>
+#include <limits.h>
+#include <sstream>
+#include <regex>
+#include <time.h>
+#include <iterator>
+
+#include "include/ceph_assert.h"
+#include "include/common_fwd.h"
+#include "include/stringify.h"
+
+#include "mon/Monitor.h"
+#include "mon/HealthMonitor.h"
+
+#include "messages/MMonHealthChecks.h"
+
+#include "common/Formatter.h"
+
+#define dout_subsys ceph_subsys_mon
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, mon, this)
+using namespace TOPNSPC::common;
+
+using namespace std::literals;
+using std::cerr;
+using std::cout;
+using std::dec;
+using std::hex;
+using std::list;
+using std::map;
+using std::make_pair;
+using std::ostream;
+using std::ostringstream;
+using std::pair;
+using std::set;
+using std::setfill;
+using std::string;
+using std::stringstream;
+using std::to_string;
+using std::vector;
+using std::unique_ptr;
+
+using ceph::bufferlist;
+using ceph::decode;
+using ceph::encode;
+using ceph::Formatter;
+using ceph::JSONFormatter;
+using ceph::mono_clock;
+using ceph::mono_time;
+using ceph::parse_timespan;
+using ceph::timespan_str;
+static ostream& _prefix(std::ostream *_dout, const Monitor &mon,
+                        const HealthMonitor *hmon) {
+  return *_dout << "mon."
<< mon.name << "@" << mon.rank + << "(" << mon.get_state_name() << ").health "; +} + +HealthMonitor::HealthMonitor(Monitor &m, Paxos &p, const string& service_name) + : PaxosService(m, p, service_name) { +} + +void HealthMonitor::init() +{ + dout(10) << __func__ << dendl; +} + +void HealthMonitor::create_initial() +{ + dout(10) << __func__ << dendl; +} + +void HealthMonitor::update_from_paxos(bool *need_bootstrap) +{ + version = get_last_committed(); + dout(10) << __func__ << dendl; + load_health(); + + bufferlist qbl; + mon.store->get(service_name, "quorum", qbl); + if (qbl.length()) { + auto p = qbl.cbegin(); + decode(quorum_checks, p); + } else { + quorum_checks.clear(); + } + + bufferlist lbl; + mon.store->get(service_name, "leader", lbl); + if (lbl.length()) { + auto p = lbl.cbegin(); + decode(leader_checks, p); + } else { + leader_checks.clear(); + } + + { + bufferlist bl; + mon.store->get(service_name, "mutes", bl); + if (bl.length()) { + auto p = bl.cbegin(); + decode(mutes, p); + } else { + mutes.clear(); + } + } + + dout(20) << "dump:"; + JSONFormatter jf(true); + jf.open_object_section("health"); + jf.open_object_section("quorum_health"); + for (auto& p : quorum_checks) { + string s = string("mon.") + stringify(p.first); + jf.dump_object(s.c_str(), p.second); + } + jf.close_section(); + jf.dump_object("leader_health", leader_checks); + jf.close_section(); + jf.flush(*_dout); + *_dout << dendl; +} + +void HealthMonitor::create_pending() +{ + dout(10) << " " << version << dendl; + pending_mutes = mutes; +} + +void HealthMonitor::encode_pending(MonitorDBStore::TransactionRef t) +{ + ++version; + dout(10) << " " << version << dendl; + put_last_committed(t, version); + + bufferlist qbl; + encode(quorum_checks, qbl); + t->put(service_name, "quorum", qbl); + bufferlist lbl; + encode(leader_checks, lbl); + t->put(service_name, "leader", lbl); + { + bufferlist bl; + encode(pending_mutes, bl); + t->put(service_name, "mutes", bl); + } + + health_check_map_t pending_health; + + // combine per-mon details carefully... + map<string,set<string>> names; // code -> <mon names> + for (auto p : quorum_checks) { + for (auto q : p.second.checks) { + names[q.first].insert(mon.monmap->get_name(p.first)); + } + pending_health.merge(p.second); + } + for (auto &p : pending_health.checks) { + p.second.summary = std::regex_replace( + p.second.summary, + std::regex("%hasorhave%"), + names[p.first].size() > 1 ? "have" : "has"); + p.second.summary = std::regex_replace( + p.second.summary, + std::regex("%names%"), stringify(names[p.first])); + p.second.summary = std::regex_replace( + p.second.summary, + std::regex("%plurals%"), + names[p.first].size() > 1 ? "s" : ""); + p.second.summary = std::regex_replace( + p.second.summary, + std::regex("%isorare%"), + names[p.first].size() > 1 ? "are" : "is"); + } + + pending_health.merge(leader_checks); + encode_health(pending_health, t); +} + +version_t HealthMonitor::get_trim_to() const +{ + // we don't actually need *any* old states, but keep a few. 
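+  // (five is an arbitrary safety margin; the current health state is
+  // re-written in full on every commit, so old versions are never read back)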
+ if (version > 5) { + return version - 5; + } + return 0; +} + +bool HealthMonitor::preprocess_query(MonOpRequestRef op) +{ + auto m = op->get_req<PaxosServiceMessage>(); + switch (m->get_type()) { + case MSG_MON_COMMAND: + return preprocess_command(op); + case MSG_MON_HEALTH_CHECKS: + return false; + default: + mon.no_reply(op); + derr << "Unhandled message type " << m->get_type() << dendl; + return true; + } +} + +bool HealthMonitor::prepare_update(MonOpRequestRef op) +{ + Message *m = op->get_req(); + dout(7) << "prepare_update " << *m + << " from " << m->get_orig_source_inst() << dendl; + switch (m->get_type()) { + case MSG_MON_HEALTH_CHECKS: + return prepare_health_checks(op); + case MSG_MON_COMMAND: + return prepare_command(op); + default: + return false; + } +} + +bool HealthMonitor::preprocess_command(MonOpRequestRef op) +{ + auto m = op->get_req<MMonCommand>(); + std::stringstream ss; + bufferlist rdata; + + cmdmap_t cmdmap; + if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) { + string rs = ss.str(); + mon.reply_command(op, -EINVAL, rs, rdata, get_last_committed()); + return true; + } + + MonSession *session = op->get_session(); + if (!session) { + mon.reply_command(op, -EACCES, "access denied", rdata, + get_last_committed()); + return true; + } + // more sanity checks + try { + string format; + cmd_getval(cmdmap, "format", format); + string prefix; + cmd_getval(cmdmap, "prefix", prefix); + } catch (const bad_cmd_get& e) { + mon.reply_command(op, -EINVAL, e.what(), rdata, get_last_committed()); + return true; + } + return false; +} + +bool HealthMonitor::prepare_command(MonOpRequestRef op) +{ + auto m = op->get_req<MMonCommand>(); + + std::stringstream ss; + bufferlist rdata; + + cmdmap_t cmdmap; + if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) { + string rs = ss.str(); + mon.reply_command(op, -EINVAL, rs, rdata, get_last_committed()); + return true; + } + + MonSession *session = op->get_session(); + if (!session) { + mon.reply_command(op, -EACCES, "access denied", rdata, get_last_committed()); + return true; + } + + string format; + cmd_getval(cmdmap, "format", format, string("plain")); + boost::scoped_ptr<Formatter> f(Formatter::create(format)); + + string prefix; + cmd_getval(cmdmap, "prefix", prefix); + + int r = 0; + + if (prefix == "health mute") { + string code; + bool sticky = false; + if (!cmd_getval(cmdmap, "code", code) || + code == "") { + r = -EINVAL; + ss << "must specify an alert code to mute"; + goto out; + } + cmd_getval(cmdmap, "sticky", sticky); + string ttl_str; + utime_t ttl; + if (cmd_getval(cmdmap, "ttl", ttl_str)) { + auto secs = parse_timespan(ttl_str); + if (secs == 0s) { + r = -EINVAL; + ss << "not a valid duration: " << ttl_str; + goto out; + } + ttl = ceph_clock_now(); + ttl += std::chrono::duration<double>(secs).count(); + } + health_check_map_t all; + gather_all_health_checks(&all); + string summary; + int64_t count = 0; + if (!sticky) { + auto p = all.checks.find(code); + if (p == all.checks.end()) { + r = -ENOENT; + ss << "health alert " << code << " is not currently raised"; + goto out; + } + count = p->second.count; + summary = p->second.summary; + } + auto& m = pending_mutes[code]; + m.code = code; + m.ttl = ttl; + m.sticky = sticky; + m.summary = summary; + m.count = count; + } else if (prefix == "health unmute") { + string code; + if (cmd_getval(cmdmap, "code", code)) { + pending_mutes.erase(code); + } else { + pending_mutes.clear(); + } + } else { + ss << "Command '" << prefix << "' not implemented!"; + r = -ENOSYS; + } + +out: + dout(4) << __func__ 
<< " done, r=" << r << dendl; + /* Compose response */ + string rs; + getline(ss, rs); + + if (r >= 0) { + // success.. delay reply + wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, r, rs, + get_last_committed() + 1)); + return true; + } else { + // reply immediately + mon.reply_command(op, r, rs, rdata, get_last_committed()); + return false; + } +} + +bool HealthMonitor::prepare_health_checks(MonOpRequestRef op) +{ + auto m = op->get_req<MMonHealthChecks>(); + // no need to check if it's changed, the peon has done so + quorum_checks[m->get_source().num()] = std::move(m->health_checks); + return true; +} + +void HealthMonitor::tick() +{ + if (!is_active()) { + return; + } + dout(10) << __func__ << dendl; + bool changed = false; + if (check_member_health()) { + changed = true; + } + if (!mon.is_leader()) { + return; + } + if (check_leader_health()) { + changed = true; + } + if (check_mutes()) { + changed = true; + } + if (changed) { + propose_pending(); + } +} + +bool HealthMonitor::check_mutes() +{ + bool changed = true; + auto now = ceph_clock_now(); + health_check_map_t all; + gather_all_health_checks(&all); + auto p = pending_mutes.begin(); + while (p != pending_mutes.end()) { + if (p->second.ttl != utime_t() && + p->second.ttl <= now) { + mon.clog->info() << "Health alert mute " << p->first + << " cleared (passed TTL " << p->second.ttl << ")"; + p = pending_mutes.erase(p); + changed = true; + continue; + } + if (!p->second.sticky) { + auto q = all.checks.find(p->first); + if (q == all.checks.end()) { + mon.clog->info() << "Health alert mute " << p->first + << " cleared (health alert cleared)"; + p = pending_mutes.erase(p); + changed = true; + continue; + } + if (p->second.count) { + // count-based mute + if (q->second.count > p->second.count) { + mon.clog->info() << "Health alert mute " << p->first + << " cleared (count increased from " << p->second.count + << " to " << q->second.count << ")"; + p = pending_mutes.erase(p); + changed = true; + continue; + } + if (q->second.count < p->second.count) { + // rachet down the mute + dout(10) << __func__ << " mute " << p->first << " count " + << p->second.count << " -> " << q->second.count + << dendl; + p->second.count = q->second.count; + changed = true; + } + } else { + // summary-based mute + if (p->second.summary != q->second.summary) { + mon.clog->info() << "Health alert mute " << p->first + << " cleared (summary changed)"; + p = pending_mutes.erase(p); + changed = true; + continue; + } + } + } + ++p; + } + return changed; +} + +void HealthMonitor::gather_all_health_checks(health_check_map_t *all) +{ + for (auto& svc : mon.paxos_service) { + all->merge(svc->get_health_checks()); + } +} + +health_status_t HealthMonitor::get_health_status( + bool want_detail, + Formatter *f, + std::string *plain, + const char *sep1, + const char *sep2) +{ + health_check_map_t all; + gather_all_health_checks(&all); + health_status_t r = HEALTH_OK; + for (auto& p : all.checks) { + if (!mutes.count(p.first)) { + if (r > p.second.severity) { + r = p.second.severity; + } + } + } + if (f) { + f->open_object_section("health"); + f->dump_stream("status") << r; + f->open_object_section("checks"); + for (auto& p : all.checks) { + f->open_object_section(p.first.c_str()); + p.second.dump(f, want_detail); + f->dump_bool("muted", mutes.count(p.first)); + f->close_section(); + } + f->close_section(); + f->open_array_section("mutes"); + for (auto& p : mutes) { + f->dump_object("mute", p.second); + } + f->close_section(); + f->close_section(); + } else { + 
auto now = ceph_clock_now(); + // one-liner: HEALTH_FOO[ thing1[; thing2 ...]] + string summary; + for (auto& p : all.checks) { + if (!mutes.count(p.first)) { + if (!summary.empty()) { + summary += sep2; + } + summary += p.second.summary; + } + } + *plain = stringify(r); + if (summary.size()) { + *plain += sep1; + *plain += summary; + } + if (!mutes.empty()) { + if (summary.size()) { + *plain += sep2; + } else { + *plain += sep1; + } + *plain += "(muted:"; + for (auto& p : mutes) { + *plain += " "; + *plain += p.first; + if (p.second.ttl) { + if (p.second.ttl > now) { + auto left = p.second.ttl; + left -= now; + *plain += "("s + utimespan_str(left) + ")"; + } else { + *plain += "(0s)"; + } + } + } + *plain += ")"; + } + *plain += "\n"; + // detail + if (want_detail) { + for (auto& p : all.checks) { + auto q = mutes.find(p.first); + if (q != mutes.end()) { + *plain += "(MUTED"; + if (q->second.ttl != utime_t()) { + if (q->second.ttl > now) { + auto left = q->second.ttl; + left -= now; + *plain += " ttl "; + *plain += utimespan_str(left); + } else { + *plain += "0s"; + } + } + if (q->second.sticky) { + *plain += ", STICKY"; + } + *plain += ") "; + } + *plain += "["s + short_health_string(p.second.severity) + "] " + + p.first + ": " + p.second.summary + "\n"; + for (auto& d : p.second.detail) { + *plain += " "; + *plain += d; + *plain += "\n"; + } + } + } + } + return r; +} + +bool HealthMonitor::check_member_health() +{ + dout(20) << __func__ << dendl; + bool changed = false; + const auto max = g_conf().get_val<uint64_t>("mon_health_max_detail"); + + // snapshot of usage + DataStats stats; + get_fs_stats(stats.fs_stats, g_conf()->mon_data.c_str()); + map<string,uint64_t> extra; + uint64_t store_size = mon.store->get_estimated_size(extra); + ceph_assert(store_size > 0); + stats.store_stats.bytes_total = store_size; + stats.store_stats.bytes_sst = extra["sst"]; + stats.store_stats.bytes_log = extra["log"]; + stats.store_stats.bytes_misc = extra["misc"]; + stats.last_update = ceph_clock_now(); + dout(10) << __func__ << " avail " << stats.fs_stats.avail_percent << "%" + << " total " << byte_u_t(stats.fs_stats.byte_total) + << ", used " << byte_u_t(stats.fs_stats.byte_used) + << ", avail " << byte_u_t(stats.fs_stats.byte_avail) << dendl; + + // MON_DISK_{LOW,CRIT,BIG} + health_check_map_t next; + if (stats.fs_stats.avail_percent <= g_conf()->mon_data_avail_crit) { + stringstream ss, ss2; + ss << "mon%plurals% %names% %isorare% very low on available space"; + auto& d = next.add("MON_DISK_CRIT", HEALTH_ERR, ss.str(), 1); + ss2 << "mon." << mon.name << " has " << stats.fs_stats.avail_percent + << "% avail"; + d.detail.push_back(ss2.str()); + } else if (stats.fs_stats.avail_percent <= g_conf()->mon_data_avail_warn) { + stringstream ss, ss2; + ss << "mon%plurals% %names% %isorare% low on available space"; + auto& d = next.add("MON_DISK_LOW", HEALTH_WARN, ss.str(), 1); + ss2 << "mon." << mon.name << " has " << stats.fs_stats.avail_percent + << "% avail"; + d.detail.push_back(ss2.str()); + } + if (stats.store_stats.bytes_total >= g_conf()->mon_data_size_warn) { + stringstream ss, ss2; + ss << "mon%plurals% %names% %isorare% using a lot of disk space"; + auto& d = next.add("MON_DISK_BIG", HEALTH_WARN, ss.str(), 1); + ss2 << "mon." 
<< mon.name << " is "
+        << byte_u_t(stats.store_stats.bytes_total)
+        << " >= mon_data_size_warn ("
+        << byte_u_t(g_conf()->mon_data_size_warn) << ")";
+    d.detail.push_back(ss2.str());
+  }
+
+  // OSD_NO_DOWN_OUT_INTERVAL
+  {
+    // Warn if 'mon_osd_down_out_interval' is set to zero.
+    // Having this option set to zero on the leader acts much like the
+    // 'noout' flag.  It's hard to figure out what's going on with a
+    // cluster that doesn't have the 'noout' flag set but acts as though
+    // it does, so we report a HEALTH_WARN in case this option is set to
+    // zero.
+    // This is an ugly hack to get the warning out, but until we find a
+    // way to spread global options throughout the mon cluster and have
+    // all mons using a base set of the same options, we need to work
+    // around this sort of thing.
+    // There's also the obvious drawback that if this is set on a single
+    // monitor on a 3-monitor cluster, this warning will only be shown
+    // every third monitor connection.
+    if (g_conf()->mon_warn_on_osd_down_out_interval_zero &&
+        g_conf()->mon_osd_down_out_interval == 0) {
+      ostringstream ss, ds;
+      ss << "mon%plurals% %names% %hasorhave% mon_osd_down_out_interval set to 0";
+      auto& d = next.add("OSD_NO_DOWN_OUT_INTERVAL", HEALTH_WARN, ss.str(), 1);
+      ds << "mon." << mon.name << " has mon_osd_down_out_interval set to 0";
+      d.detail.push_back(ds.str());
+    }
+  }
+
+  // AUTH_INSECURE_GLOBAL_ID_RECLAIM
+  if (g_conf().get_val<bool>("mon_warn_on_insecure_global_id_reclaim") &&
+      g_conf().get_val<bool>("auth_allow_insecure_global_id_reclaim")) {
+    // Warn if there are any clients that are insecurely renewing their global_id
+    std::lock_guard l(mon.session_map_lock);
+    list<std::string> detail;
+    for (auto p = mon.session_map.sessions.begin();
+         p != mon.session_map.sessions.end();
+         ++p) {
+      if ((*p)->global_id_status == global_id_status_t::RECLAIM_INSECURE) {
+        ostringstream ds;
+        ds << (*p)->entity_name << " at " << (*p)->addrs
+           << " is using insecure global_id reclaim";
+        detail.push_back(ds.str());
+        if (detail.size() >= max) {
+          detail.push_back("...");
+          break;
+        }
+      }
+    }
+    if (!detail.empty()) {
+      ostringstream ss;
+      ss << "client%plurals% %isorare% using insecure global_id reclaim";
+      auto& d = next.add("AUTH_INSECURE_GLOBAL_ID_RECLAIM", HEALTH_WARN, ss.str(),
+                         detail.size());
+      d.detail.swap(detail);
+    }
+  }
+  // AUTH_INSECURE_GLOBAL_ID_RECLAIM_ALLOWED
+  if (g_conf().get_val<bool>("mon_warn_on_insecure_global_id_reclaim_allowed") &&
+      g_conf().get_val<bool>("auth_allow_insecure_global_id_reclaim")) {
+    ostringstream ss, ds;
+    ss << "mon%plurals% %isorare% allowing insecure global_id reclaim";
+    auto& d = next.add("AUTH_INSECURE_GLOBAL_ID_RECLAIM_ALLOWED", HEALTH_WARN, ss.str(), 1);
+    ds << "mon."
<< mon.name << " has auth_allow_insecure_global_id_reclaim set to true"; + d.detail.push_back(ds.str()); + } + + auto p = quorum_checks.find(mon.rank); + if (p == quorum_checks.end()) { + if (next.empty()) { + return false; + } + } else { + if (p->second == next) { + return false; + } + } + + if (mon.is_leader()) { + // prepare to propose + quorum_checks[mon.rank] = next; + changed = true; + } else { + // tell the leader + mon.send_mon_message(new MMonHealthChecks(next), mon.get_leader()); + } + + return changed; +} + +bool HealthMonitor::check_leader_health() +{ + dout(20) << __func__ << dendl; + bool changed = false; + + // prune quorum_health + { + auto& qset = mon.get_quorum(); + auto p = quorum_checks.begin(); + while (p != quorum_checks.end()) { + if (qset.count(p->first) == 0) { + p = quorum_checks.erase(p); + changed = true; + } else { + ++p; + } + } + } + + health_check_map_t next; + + // DAEMON_OLD_VERSION + if (g_conf().get_val<bool>("mon_warn_on_older_version")) { + check_for_older_version(&next); + } + // MON_DOWN + check_for_mon_down(&next); + // MON_CLOCK_SKEW + check_for_clock_skew(&next); + // MON_MSGR2_NOT_ENABLED + if (g_conf().get_val<bool>("mon_warn_on_msgr2_not_enabled")) { + check_if_msgr2_enabled(&next); + } + + if (next != leader_checks) { + changed = true; + leader_checks = next; + } + return changed; +} + +void HealthMonitor::check_for_older_version(health_check_map_t *checks) +{ + static ceph::coarse_mono_time old_version_first_time = + ceph::coarse_mono_clock::zero(); + + auto now = ceph::coarse_mono_clock::now(); + if (ceph::coarse_mono_clock::is_zero(old_version_first_time)) { + old_version_first_time = now; + } + const auto warn_delay = g_conf().get_val<std::chrono::seconds>("mon_warn_older_version_delay"); + if (now - old_version_first_time > warn_delay) { + std::map<string, std::list<string> > all_versions; + mon.get_all_versions(all_versions); + if (all_versions.size() > 1) { + dout(20) << __func__ << " all_versions=" << all_versions << dendl; + // The last entry has the largest version + dout(20) << __func__ << " highest version daemon count " + << all_versions.rbegin()->second.size() << dendl; + // Erase last element (the highest version running) + all_versions.erase(all_versions.rbegin()->first); + ceph_assert(all_versions.size() > 0); + ostringstream ss; + unsigned daemon_count = 0; + for (auto& g : all_versions) { + daemon_count += g.second.size(); + } + int ver_count = all_versions.size(); + ceph_assert(!(daemon_count == 1 && ver_count != 1)); + ss << "There " << (daemon_count == 1 ? "is a daemon" : "are daemons") + << " running " << (ver_count > 1 ? "multiple old versions" : "an older version") << " of ceph"; + health_status_t status; + if (ver_count > 1) + status = HEALTH_ERR; + else + status = HEALTH_WARN; + auto& d = checks->add("DAEMON_OLD_VERSION", status, ss.str(), all_versions.size()); + for (auto& g : all_versions) { + ostringstream ds; + for (auto& i : g.second) { // Daemon list + ds << i << " "; + } + ds << (g.second.size() == 1 ? 
"is" : "are") + << " running an older version of ceph: " << g.first; + d.detail.push_back(ds.str()); + } + } else { + old_version_first_time = ceph::coarse_mono_clock::zero(); + } + } +} + +void HealthMonitor::check_for_mon_down(health_check_map_t *checks) +{ + int max = mon.monmap->size(); + int actual = mon.get_quorum().size(); + const auto now = ceph::real_clock::now(); + if (actual < max && + now > mon.monmap->created.to_real_time() + g_conf().get_val<std::chrono::seconds>("mon_down_mkfs_grace")) { + ostringstream ss; + ss << (max-actual) << "/" << max << " mons down, quorum " + << mon.get_quorum_names(); + auto& d = checks->add("MON_DOWN", HEALTH_WARN, ss.str(), max - actual); + set<int> q = mon.get_quorum(); + for (int i=0; i<max; i++) { + if (q.count(i) == 0) { + ostringstream ss; + ss << "mon." << mon.monmap->get_name(i) << " (rank " << i + << ") addr " << mon.monmap->get_addrs(i) + << " is down (out of quorum)"; + d.detail.push_back(ss.str()); + } + } + } +} + +void HealthMonitor::check_for_clock_skew(health_check_map_t *checks) +{ + if (!mon.timecheck_skews.empty()) { + list<string> warns; + list<string> details; + for (auto& i : mon.timecheck_skews) { + double skew = i.second; + double latency = mon.timecheck_latencies[i.first]; + string name = mon.monmap->get_name(i.first); + ostringstream tcss; + health_status_t tcstatus = mon.timecheck_status(tcss, skew, latency); + if (tcstatus != HEALTH_OK) { + warns.push_back(name); + ostringstream tmp_ss; + tmp_ss << "mon." << name << " " << tcss.str() + << " (latency " << latency << "s)"; + details.push_back(tmp_ss.str()); + } + } + if (!warns.empty()) { + ostringstream ss; + ss << "clock skew detected on"; + while (!warns.empty()) { + ss << " mon." << warns.front(); + warns.pop_front(); + if (!warns.empty()) + ss << ","; + } + auto& d = checks->add("MON_CLOCK_SKEW", HEALTH_WARN, ss.str(), details.size()); + d.detail.swap(details); + } + } +} + +void HealthMonitor::check_if_msgr2_enabled(health_check_map_t *checks) +{ + if (g_conf().get_val<bool>("ms_bind_msgr2") && + mon.monmap->get_required_features().contains_all( + ceph::features::mon::FEATURE_NAUTILUS)) { + list<string> details; + for (auto& i : mon.monmap->mon_info) { + if (!i.second.public_addrs.has_msgr2()) { + ostringstream ds; + ds << "mon." << i.first << " is not bound to a msgr2 port, only " + << i.second.public_addrs; + details.push_back(ds.str()); + } + } + if (!details.empty()) { + ostringstream ss; + ss << details.size() << " monitors have not enabled msgr2"; + auto &d = checks->add("MON_MSGR2_NOT_ENABLED", HEALTH_WARN, ss.str(), + details.size()); + d.detail.swap(details); + } + } +} diff --git a/src/mon/HealthMonitor.h b/src/mon/HealthMonitor.h new file mode 100644 index 000000000..c0e79d033 --- /dev/null +++ b/src/mon/HealthMonitor.h @@ -0,0 +1,75 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2013 Inktank, Inc + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ +#ifndef CEPH_HEALTH_MONITOR_H +#define CEPH_HEALTH_MONITOR_H + +#include "mon/PaxosService.h" + +class HealthMonitor : public PaxosService +{ + version_t version = 0; + std::map<int,health_check_map_t> quorum_checks; // for each quorum member + health_check_map_t leader_checks; // leader only + std::map<std::string,health_mute_t> mutes; + + std::map<std::string,health_mute_t> pending_mutes; + +public: + HealthMonitor(Monitor &m, Paxos &p, const std::string& service_name); + + /** + * @defgroup HealthMonitor_Inherited_h Inherited abstract methods + * @{ + */ + void init() override; + + bool preprocess_query(MonOpRequestRef op) override; + bool prepare_update(MonOpRequestRef op) override; + + void create_initial() override; + void update_from_paxos(bool *need_bootstrap) override; + void create_pending() override; + void encode_pending(MonitorDBStore::TransactionRef t) override; + version_t get_trim_to() const override; + + void encode_full(MonitorDBStore::TransactionRef t) override { } + + void tick() override; + + void gather_all_health_checks(health_check_map_t *all); + health_status_t get_health_status( + bool want_detail, + ceph::Formatter *f, + std::string *plain, + const char *sep1 = " ", + const char *sep2 = "; "); + + /** + * @} // HealthMonitor_Inherited_h + */ +private: + bool preprocess_command(MonOpRequestRef op); + + bool prepare_command(MonOpRequestRef op); + bool prepare_health_checks(MonOpRequestRef op); + void check_for_older_version(health_check_map_t *checks); + void check_for_mon_down(health_check_map_t *checks); + void check_for_clock_skew(health_check_map_t *checks); + void check_if_msgr2_enabled(health_check_map_t *checks); + bool check_leader_health(); + bool check_member_health(); + bool check_mutes(); +}; + +#endif // CEPH_HEALTH_MONITOR_H diff --git a/src/mon/KVMonitor.cc b/src/mon/KVMonitor.cc new file mode 100644 index 000000000..699cbe417 --- /dev/null +++ b/src/mon/KVMonitor.cc @@ -0,0 +1,525 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "mon/Monitor.h" +#include "mon/KVMonitor.h" +#include "include/stringify.h" +#include "messages/MKVData.h" + +#define dout_subsys ceph_subsys_mon +#undef dout_prefix +#define dout_prefix _prefix(_dout, mon, this) + +static ostream& _prefix(std::ostream *_dout, const Monitor &mon, + const KVMonitor *hmon) { + return *_dout << "mon." << mon.name << "@" << mon.rank + << "(" << mon.get_state_name() << ").kv "; +} + +const string KV_PREFIX = "mon_config_key"; + +const int MAX_HISTORY = 50; + + +static bool is_binary_string(const string& s) +{ + for (auto c : s) { + // \n and \t are escaped in JSON; other control characters are not. 
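+    // i.e. treat anything outside printable ASCII as binary, modulo
+    // those two JSON-escapable whitespace characters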
+ if ((c < 0x20 && c != '\n' && c != '\t') || c >= 0x7f) { + return true; + } + } + return false; +} + + +KVMonitor::KVMonitor(Monitor &m, Paxos &p, const string& service_name) + : PaxosService(m, p, service_name) { +} + +void KVMonitor::init() +{ + dout(10) << __func__ << dendl; +} + +void KVMonitor::create_initial() +{ + dout(10) << __func__ << dendl; + version = 0; + pending.clear(); +} + +void KVMonitor::update_from_paxos(bool *need_bootstrap) +{ + if (version == get_last_committed()) { + return; + } + version = get_last_committed(); + dout(10) << __func__ << " " << version << dendl; + check_all_subs(); +} + +void KVMonitor::create_pending() +{ + dout(10) << " " << version << dendl; + pending.clear(); +} + +void KVMonitor::encode_pending(MonitorDBStore::TransactionRef t) +{ + dout(10) << " " << (version+1) << dendl; + put_last_committed(t, version+1); + + // record the delta for this commit point + bufferlist bl; + encode(pending, bl); + put_version(t, version+1, bl); + + // make actual changes + for (auto& p : pending) { + string key = p.first; + if (p.second) { + dout(20) << __func__ << " set " << key << dendl; + t->put(KV_PREFIX, key, *p.second); + } else { + dout(20) << __func__ << " rm " << key << dendl; + t->erase(KV_PREFIX, key); + } + } +} + +version_t KVMonitor::get_trim_to() const +{ + // we don't need that many old states, but keep a few + if (version > MAX_HISTORY) { + return version - MAX_HISTORY; + } + return 0; +} + +void KVMonitor::get_store_prefixes(set<string>& s) const +{ + s.insert(service_name); + s.insert(KV_PREFIX); +} + +void KVMonitor::tick() +{ + if (!is_active() || !mon.is_leader()) { + return; + } + dout(10) << __func__ << dendl; +} + +void KVMonitor::on_active() +{ +} + + +bool KVMonitor::preprocess_query(MonOpRequestRef op) +{ + switch (op->get_req()->get_type()) { + case MSG_MON_COMMAND: + try { + return preprocess_command(op); + } catch (const bad_cmd_get& e) { + bufferlist bl; + mon.reply_command(op, -EINVAL, e.what(), bl, get_last_committed()); + return true; + } + } + return false; +} + +bool KVMonitor::preprocess_command(MonOpRequestRef op) +{ + auto m = op->get_req<MMonCommand>(); + std::stringstream ss; + int err = 0; + + cmdmap_t cmdmap; + if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) { + string rs = ss.str(); + mon.reply_command(op, -EINVAL, rs, get_last_committed()); + return true; + } + string format; + cmd_getval(cmdmap, "format", format, string("plain")); + boost::scoped_ptr<Formatter> f(Formatter::create(format)); + + string prefix; + cmd_getval(cmdmap, "prefix", prefix); + string key; + cmd_getval(cmdmap, "key", key); + + bufferlist odata; + + if (prefix == "config-key get") { + err = mon.store->get(KV_PREFIX, key, odata); + } + else if (prefix == "config-key exists") { + bool exists = mon.store->exists(KV_PREFIX, key); + ss << "key '" << key << "'"; + if (exists) { + ss << " exists"; + err = 0; + } else { + ss << " doesn't exist"; + err = -ENOENT; + } + } + else if (prefix == "config-key list" || + prefix == "config-key ls") { + if (!f) { + f.reset(Formatter::create("json-pretty")); + } + KeyValueDB::Iterator iter = mon.store->get_iterator(KV_PREFIX); + f->open_array_section("keys"); + while (iter->valid()) { + string key(iter->key()); + f->dump_string("key", key); + iter->next(); + } + f->close_section(); + + stringstream tmp_ss; + f->flush(tmp_ss); + odata.append(tmp_ss); + err = 0; + } + else if (prefix == "config-key dump") { + if (!f) { + f.reset(Formatter::create("json-pretty")); + } + + KeyValueDB::Iterator iter = 
mon.store->get_iterator(KV_PREFIX); + if (key.size()) { + iter->lower_bound(key); + } + f->open_object_section("config-key store"); + while (iter->valid()) { + if (key.size() && + iter->key().find(key) != 0) { + break; + } + string s = iter->value().to_str(); + if (is_binary_string(s)) { + ostringstream ss; + ss << "<<< binary blob of length " << s.size() << " >>>"; + f->dump_string(iter->key().c_str(), ss.str()); + } else { + f->dump_string(iter->key().c_str(), s); + } + iter->next(); + } + f->close_section(); + + stringstream tmp_ss; + f->flush(tmp_ss); + odata.append(tmp_ss); + err = 0; + } + else { + return false; + } + + mon.reply_command(op, err, ss.str(), odata, get_last_committed()); + return true; +} + +bool KVMonitor::prepare_update(MonOpRequestRef op) +{ + Message *m = op->get_req(); + dout(7) << "prepare_update " << *m + << " from " << m->get_orig_source_inst() << dendl; + switch (m->get_type()) { + case MSG_MON_COMMAND: + try { + return prepare_command(op); + } catch (const bad_cmd_get& e) { + bufferlist bl; + mon.reply_command(op, -EINVAL, e.what(), bl, get_last_committed()); + return true; + } + } + return false; +} + + +bool KVMonitor::prepare_command(MonOpRequestRef op) +{ + auto m = op->get_req<MMonCommand>(); + std::stringstream ss; + int err = 0; + bufferlist odata; + + cmdmap_t cmdmap; + if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) { + string rs = ss.str(); + mon.reply_command(op, -EINVAL, rs, get_last_committed()); + return true; + } + + string prefix; + cmd_getval(cmdmap, "prefix", prefix); + string key; + if (!cmd_getval(cmdmap, "key", key)) { + err = -EINVAL; + ss << "must specify a key"; + goto reply; + } + + + if (prefix == "config-key set" || + prefix == "config-key put") { + bufferlist data; + string val; + if (cmd_getval(cmdmap, "val", val)) { + // they specified a value in the command instead of a file + data.append(val); + } else if (m->get_data_len() > 0) { + // they specified '-i <file>' + data = m->get_data(); + } + if (data.length() > (size_t) g_conf()->mon_config_key_max_entry_size) { + err = -EFBIG; // File too large + ss << "error: entry size limited to " + << g_conf()->mon_config_key_max_entry_size << " bytes. 
" + << "Use 'mon config key max entry size' to manually adjust"; + goto reply; + } + + ss << "set " << key; + pending[key] = data; + goto update; + } + else if (prefix == "config-key del" || + prefix == "config-key rm") { + ss << "key deleted"; + pending[key] = boost::none; + goto update; + } + else { + ss << "unknown command " << prefix; + err = -EINVAL; + } + +reply: + mon.reply_command(op, err, ss.str(), odata, get_last_committed()); + return false; + +update: + // see if there is an actual change + if (pending.empty()) { + err = 0; + goto reply; + } + force_immediate_propose(); // faster response + wait_for_finished_proposal( + op, + new Monitor::C_Command( + mon, op, 0, ss.str(), odata, + get_last_committed() + 1)); + return true; +} + + + + +static string _get_dmcrypt_prefix(const uuid_d& uuid, const string k) +{ + return "dm-crypt/osd/" + stringify(uuid) + "/" + k; +} + +bool KVMonitor::_have_prefix(const string &prefix) +{ + KeyValueDB::Iterator iter = mon.store->get_iterator(KV_PREFIX); + + while (iter->valid()) { + string key(iter->key()); + size_t p = key.find(prefix); + if (p != string::npos && p == 0) { + return true; + } + iter->next(); + } + return false; +} + +int KVMonitor::validate_osd_destroy( + const int32_t id, + const uuid_d& uuid) +{ + string dmcrypt_prefix = _get_dmcrypt_prefix(uuid, ""); + string daemon_prefix = + "daemon-private/osd." + stringify(id) + "/"; + + if (!_have_prefix(dmcrypt_prefix) && + !_have_prefix(daemon_prefix)) { + return -ENOENT; + } + return 0; +} + +void KVMonitor::do_osd_destroy(int32_t id, uuid_d& uuid) +{ + string dmcrypt_prefix = _get_dmcrypt_prefix(uuid, ""); + string daemon_prefix = + "daemon-private/osd." + stringify(id) + "/"; + + for (auto& prefix : { dmcrypt_prefix, daemon_prefix }) { + KeyValueDB::Iterator iter = mon.store->get_iterator(KV_PREFIX); + iter->lower_bound(prefix); + if (iter->key().find(prefix) != 0) { + break; + } + pending[iter->key()] = boost::none; + } + + propose_pending(); +} + +int KVMonitor::validate_osd_new( + const uuid_d& uuid, + const string& dmcrypt_key, + stringstream& ss) +{ + string dmcrypt_prefix = _get_dmcrypt_prefix(uuid, "luks"); + bufferlist value; + value.append(dmcrypt_key); + + if (mon.store->exists(KV_PREFIX, dmcrypt_prefix)) { + bufferlist existing_value; + int err = mon.store->get(KV_PREFIX, dmcrypt_prefix, existing_value); + if (err < 0) { + dout(10) << __func__ << " unable to get dm-crypt key from store (r = " + << err << ")" << dendl; + return err; + } + if (existing_value.contents_equal(value)) { + // both values match; this will be an idempotent op. 
+ return EEXIST; + } + ss << "dm-crypt key already exists and does not match"; + return -EEXIST; + } + return 0; +} + +void KVMonitor::do_osd_new( + const uuid_d& uuid, + const string& dmcrypt_key) +{ + ceph_assert(paxos.is_plugged()); + + string dmcrypt_key_prefix = _get_dmcrypt_prefix(uuid, "luks"); + bufferlist dmcrypt_key_value; + dmcrypt_key_value.append(dmcrypt_key); + + pending[dmcrypt_key_prefix] = dmcrypt_key_value; + + propose_pending(); +} + + +void KVMonitor::check_sub(MonSession *s) +{ + if (!s->authenticated) { + dout(20) << __func__ << " not authenticated " << s->entity_name << dendl; + return; + } + for (auto& p : s->sub_map) { + if (p.first.find("kv:") == 0) { + check_sub(p.second); + } + } +} + +void KVMonitor::check_sub(Subscription *sub) +{ + dout(10) << __func__ + << " next " << sub->next + << " have " << version << dendl; + if (sub->next <= version) { + maybe_send_update(sub); + if (sub->onetime) { + mon.with_session_map([sub](MonSessionMap& session_map) { + session_map.remove_sub(sub); + }); + } + } +} + +void KVMonitor::check_all_subs() +{ + dout(10) << __func__ << dendl; + int updated = 0, total = 0; + for (auto& i : mon.session_map.subs) { + if (i.first.find("kv:") == 0) { + auto p = i.second->begin(); + while (!p.end()) { + auto sub = *p; + ++p; + ++total; + if (maybe_send_update(sub)) { + ++updated; + } + } + } + } + dout(10) << __func__ << " updated " << updated << " / " << total << dendl; +} + +bool KVMonitor::maybe_send_update(Subscription *sub) +{ + if (sub->next > version) { + return false; + } + + auto m = new MKVData; + m->prefix = sub->type.substr(3); + m->version = version; + + if (sub->next && sub->next > get_first_committed()) { + // incremental + m->incremental = true; + + for (version_t cur = sub->next; cur <= version; ++cur) { + bufferlist bl; + int err = get_version(cur, bl); + ceph_assert(err == 0); + + std::map<std::string,boost::optional<ceph::buffer::list>> pending; + auto p = bl.cbegin(); + ceph::decode(pending, p); + + for (auto& i : pending) { + if (i.first.find(m->prefix) == 0) { + m->data[i.first] = i.second; + } + } + } + + dout(10) << __func__ << " incremental keys for " << m->prefix + << ", v " << sub->next << ".." 
<< version + << ", " << m->data.size() << " keys" + << dendl; + } else { + m->incremental = false; + + KeyValueDB::Iterator iter = mon.store->get_iterator(KV_PREFIX); + iter->lower_bound(m->prefix); + while (iter->valid() && + iter->key().find(m->prefix) == 0) { + m->data[iter->key()] = iter->value(); + iter->next(); + } + + dout(10) << __func__ << " sending full dump of " << m->prefix + << ", " << m->data.size() << " keys" + << dendl; + } + sub->session->con->send_message(m); + sub->next = version + 1; + return true; +} diff --git a/src/mon/KVMonitor.h b/src/mon/KVMonitor.h new file mode 100644 index 000000000..c14c16380 --- /dev/null +++ b/src/mon/KVMonitor.h @@ -0,0 +1,69 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <boost/optional.hpp> + +#include "mon/PaxosService.h" + +class MonSession; + +extern const std::string KV_PREFIX; + +class KVMonitor : public PaxosService +{ + version_t version = 0; + std::map<std::string,boost::optional<ceph::buffer::list>> pending; + + bool _have_prefix(const string &prefix); + +public: + KVMonitor(Monitor &m, Paxos &p, const std::string& service_name); + + void init() override; + + void get_store_prefixes(set<string>& s) const override; + + bool preprocess_command(MonOpRequestRef op); + bool prepare_command(MonOpRequestRef op); + + bool preprocess_query(MonOpRequestRef op) override; + bool prepare_update(MonOpRequestRef op) override; + + void create_initial() override; + void update_from_paxos(bool *need_bootstrap) override; + void create_pending() override; + void encode_pending(MonitorDBStore::TransactionRef t) override; + version_t get_trim_to() const override; + + void encode_full(MonitorDBStore::TransactionRef t) override { } + + void on_active() override; + void tick() override; + + int validate_osd_destroy(const int32_t id, const uuid_d& uuid); + void do_osd_destroy(int32_t id, uuid_d& uuid); + int validate_osd_new( + const uuid_d& uuid, + const std::string& dmcrypt_key, + std::stringstream& ss); + void do_osd_new(const uuid_d& uuid, const std::string& dmcrypt_key); + + void check_sub(MonSession *s); + void check_sub(Subscription *sub); + void check_all_subs(); + + bool maybe_send_update(Subscription *sub); + + + // used by other services to adjust kv content; note that callers MUST ensure that + // propose_pending() is called and a commit is forced to provide atomicity and + // proper subscriber notifications. + void enqueue_set(const std::string& key, bufferlist &v) { + pending[key] = v; + } + void enqueue_rm(const std::string& key) { + pending[key] = boost::none; + } +}; diff --git a/src/mon/LogMonitor.cc b/src/mon/LogMonitor.cc new file mode 100644 index 000000000..88327663a --- /dev/null +++ b/src/mon/LogMonitor.cc @@ -0,0 +1,947 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ *
+ */
+
+#include <boost/algorithm/string/predicate.hpp>
+
+#include <sstream>
+#include <syslog.h>
+
+#include "LogMonitor.h"
+#include "Monitor.h"
+#include "MonitorDBStore.h"
+
+#include "messages/MMonCommand.h"
+#include "messages/MLog.h"
+#include "messages/MLogAck.h"
+#include "common/Graylog.h"
+#include "common/errno.h"
+#include "common/strtol.h"
+#include "include/ceph_assert.h"
+#include "include/str_list.h"
+#include "include/str_map.h"
+#include "include/compat.h"
+
+#define dout_subsys ceph_subsys_mon
+
+using namespace TOPNSPC::common;
+
+using std::cerr;
+using std::cout;
+using std::dec;
+using std::hex;
+using std::list;
+using std::map;
+using std::make_pair;
+using std::ostream;
+using std::ostringstream;
+using std::pair;
+using std::set;
+using std::setfill;
+using std::string;
+using std::stringstream;
+using std::to_string;
+using std::vector;
+using std::unique_ptr;
+
+using ceph::bufferlist;
+using ceph::decode;
+using ceph::encode;
+using ceph::Formatter;
+using ceph::JSONFormatter;
+using ceph::make_message;
+using ceph::mono_clock;
+using ceph::mono_time;
+using ceph::timespan_str;
+
+string LogMonitor::log_channel_info::get_log_file(const string &channel)
+{
+  dout(25) << __func__ << " for channel '"
+           << channel << "'" << dendl;
+
+  if (expanded_log_file.count(channel) == 0) {
+    string fname = expand_channel_meta(
+      get_str_map_key(log_file, channel, &CLOG_CONFIG_DEFAULT_KEY),
+      channel);
+    expanded_log_file[channel] = fname;
+
+    dout(20) << __func__ << " for channel '"
+             << channel << "' expanded to '"
+             << fname << "'" << dendl;
+  }
+  return expanded_log_file[channel];
+}
+
+
+void LogMonitor::log_channel_info::expand_channel_meta(map<string,string> &m)
+{
+  dout(20) << __func__ << " expand map: " << m << dendl;
+  for (map<string,string>::iterator p = m.begin(); p != m.end(); ++p) {
+    m[p->first] = expand_channel_meta(p->second, p->first);
+  }
+  dout(20) << __func__ << " expanded map: " << m << dendl;
+}
+
+string LogMonitor::log_channel_info::expand_channel_meta(
+    const string &input,
+    const string &change_to)
+{
+  size_t pos = string::npos;
+  string s(input);
+  while ((pos = s.find(LOG_META_CHANNEL)) != string::npos) {
+    string tmp = s.substr(0, pos) + change_to;
+    if (pos+LOG_META_CHANNEL.length() < s.length())
+      tmp += s.substr(pos+LOG_META_CHANNEL.length());
+    s = tmp;
+  }
+  dout(20) << __func__ << " from '" << input
+           << "' to '" << s << "'" << dendl;
+
+  return s;
+}
+
+bool LogMonitor::log_channel_info::do_log_to_syslog(const string &channel) {
+  string v = get_str_map_key(log_to_syslog, channel,
+                             &CLOG_CONFIG_DEFAULT_KEY);
+  // We expect booleans, but they are in k/v pairs, kept
+  // as strings, in 'log_to_syslog'. We must ensure
+  // compatibility with existing boolean handling, and so
+  // we are here using a modified version of how
+  // md_config_t::set_val_raw() handles booleans. We will
+  // accept both 'true' and 'false', but will also check for
+  // '1' and '0'. The main distinction between this and the
+  // original code is that we will assume everything not '1',
+  // '0', 'true' or 'false' to be 'false'.
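+  // ('1'/'0' and other numeric strings are handled by the strict_strtol
+  // fallback below; only an exact parse of 1 counts as true)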
+ bool ret = false; + + if (boost::iequals(v, "false")) { + ret = false; + } else if (boost::iequals(v, "true")) { + ret = true; + } else { + std::string err; + int b = strict_strtol(v.c_str(), 10, &err); + ret = (err.empty() && b == 1); + } + + return ret; +} + +ceph::logging::Graylog::Ref LogMonitor::log_channel_info::get_graylog( + const string &channel) +{ + dout(25) << __func__ << " for channel '" + << channel << "'" << dendl; + + if (graylogs.count(channel) == 0) { + auto graylog(std::make_shared<ceph::logging::Graylog>("mon")); + + graylog->set_fsid(g_conf().get_val<uuid_d>("fsid")); + graylog->set_hostname(g_conf()->host); + graylog->set_destination(get_str_map_key(log_to_graylog_host, channel, + &CLOG_CONFIG_DEFAULT_KEY), + atoi(get_str_map_key(log_to_graylog_port, channel, + &CLOG_CONFIG_DEFAULT_KEY).c_str())); + + graylogs[channel] = graylog; + dout(20) << __func__ << " for channel '" + << channel << "' to graylog host '" + << log_to_graylog_host[channel] << ":" + << log_to_graylog_port[channel] + << "'" << dendl; + } + return graylogs[channel]; +} + + +#undef dout_prefix +#define dout_prefix _prefix(_dout, mon, get_last_committed()) +static ostream& _prefix(std::ostream *_dout, Monitor &mon, version_t v) { + return *_dout << "mon." << mon.name << "@" << mon.rank + << "(" << mon.get_state_name() + << ").log v" << v << " "; +} + +ostream& operator<<(ostream &out, const LogMonitor &pm) +{ + return out << "log"; +} + +/* + Tick function to update the map based on performance every N seconds +*/ + +void LogMonitor::tick() +{ + if (!is_active()) return; + + dout(10) << *this << dendl; + +} + +void LogMonitor::create_initial() +{ + dout(10) << "create_initial -- creating initial map" << dendl; + LogEntry e; + e.name = g_conf()->name; + e.rank = entity_name_t::MON(mon.rank); + e.addrs = mon.messenger->get_myaddrs(); + e.stamp = ceph_clock_now(); + e.prio = CLOG_INFO; + std::stringstream ss; + ss << "mkfs " << mon.monmap->get_fsid(); + e.msg = ss.str(); + e.seq = 0; + pending_log.insert(pair<utime_t,LogEntry>(e.stamp, e)); +} + +void LogMonitor::update_from_paxos(bool *need_bootstrap) +{ + dout(10) << __func__ << dendl; + version_t version = get_last_committed(); + dout(10) << __func__ << " version " << version + << " summary v " << summary.version << dendl; + if (version == summary.version) + return; + ceph_assert(version >= summary.version); + + map<string,bufferlist> channel_blog; + + version_t latest_full = get_version_latest_full(); + dout(10) << __func__ << " latest full " << latest_full << dendl; + if ((latest_full > 0) && (latest_full > summary.version)) { + bufferlist latest_bl; + get_version_full(latest_full, latest_bl); + ceph_assert(latest_bl.length() != 0); + dout(7) << __func__ << " loading summary e" << latest_full << dendl; + auto p = latest_bl.cbegin(); + decode(summary, p); + dout(7) << __func__ << " loaded summary e" << summary.version << dendl; + } + + // walk through incrementals + while (version > summary.version) { + bufferlist bl; + int err = get_version(summary.version+1, bl); + ceph_assert(err == 0); + ceph_assert(bl.length()); + + auto p = bl.cbegin(); + __u8 v; + decode(v, p); + while (!p.end()) { + LogEntry le; + le.decode(p); + dout(7) << "update_from_paxos applying incremental log " << summary.version+1 << " " << le << dendl; + + string channel = le.channel; + if (channel.empty()) // keep retrocompatibility + channel = CLOG_CHANNEL_CLUSTER; + + if (g_conf().get_val<bool>("mon_cluster_log_to_stderr")) { + cerr << channel << " " << le << std::endl; + } + + 
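+      // fan the entry out to each configured sink in turn: syslog,
+      // graylog, and the per-channel log file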
if (channels.do_log_to_syslog(channel)) {
+        string level = channels.get_level(channel);
+        string facility = channels.get_facility(channel);
+        if (level.empty() || facility.empty()) {
+          derr << __func__ << " unable to log to syslog -- level or facility"
+               << " not defined (level: " << level << ", facility: "
+               << facility << ")" << dendl;
+          continue;
+        }
+        le.log_to_syslog(channels.get_level(channel),
+                         channels.get_facility(channel));
+      }
+
+      if (channels.do_log_to_graylog(channel)) {
+        ceph::logging::Graylog::Ref graylog = channels.get_graylog(channel);
+        if (graylog) {
+          graylog->log_log_entry(&le);
+        }
+        dout(7) << "graylog: " << channel << " " << graylog
+                << " host:" << channels.log_to_graylog_host << dendl;
+      }
+
+      if (g_conf()->mon_cluster_log_to_file) {
+        string log_file = channels.get_log_file(channel);
+        dout(20) << __func__ << " logging for channel '" << channel
+                 << "' to file '" << log_file << "'" << dendl;
+
+        if (!log_file.empty()) {
+          string log_file_level = channels.get_log_file_level(channel);
+          if (log_file_level.empty()) {
+            dout(1) << __func__ << " warning: log file level not defined for"
+                    << " channel '" << channel << "' yet a log file is --"
+                    << " will assume lowest level possible" << dendl;
+          }
+
+          int min = string_to_syslog_level(log_file_level);
+          int l = clog_type_to_syslog_level(le.prio);
+          if (l <= min) {
+            stringstream ss;
+            ss << le << "\n";
+            // init entry if DNE
+            bufferlist &blog = channel_blog[channel];
+            blog.append(ss.str());
+          }
+        }
+      }
+
+      summary.add(le);
+    }
+
+    summary.version++;
+    summary.prune(g_conf()->mon_log_max_summary);
+  }
+
+  dout(15) << __func__ << " logging for "
+           << channel_blog.size() << " channels" << dendl;
+  for(map<string,bufferlist>::iterator p = channel_blog.begin();
+      p != channel_blog.end(); ++p) {
+    if (!p->second.length()) {
+      dout(15) << __func__ << " channel '" << p->first
+               << "': nothing to log" << dendl;
+      continue;
+    }
+
+    dout(15) << __func__ << " channel '" << p->first
+             << "' logging " << p->second.length() << " bytes" << dendl;
+    string log_file = channels.get_log_file(p->first);
+
+    int fd = ::open(log_file.c_str(), O_WRONLY|O_APPEND|O_CREAT|O_CLOEXEC, 0600);
+    if (fd < 0) {
+      int err = -errno;
+      dout(1) << "unable to write to '" << log_file << "' for channel '"
+              << p->first << "': " << cpp_strerror(err) << dendl;
+    } else {
+      int err = p->second.write_fd(fd);
+      if (err < 0) {
+        dout(1) << "error writing to '" << log_file << "' for channel '"
+                << p->first << "': " << cpp_strerror(err) << dendl;
+      }
+      VOID_TEMP_FAILURE_RETRY(::close(fd));
+    }
+  }
+
+  check_subs();
+}
+
+void LogMonitor::create_pending()
+{
+  pending_log.clear();
+  pending_summary = summary;
+  dout(10) << "create_pending v " << (get_last_committed() + 1) << dendl;
+}
+
+void LogMonitor::encode_pending(MonitorDBStore::TransactionRef t)
+{
+  version_t version = get_last_committed() + 1;
+  bufferlist bl;
+  dout(10) << __func__ << " v" << version << dendl;
+  __u8 v = 1;
+  encode(v, bl);
+  for (auto p = pending_log.begin(); p != pending_log.end(); ++p)
+    p->second.encode(bl, mon.get_quorum_con_features());
+
+  put_version(t, version, bl);
+  put_last_committed(t, version);
+}
+
+void LogMonitor::encode_full(MonitorDBStore::TransactionRef t)
+{
+  dout(10) << __func__ << " log v " << summary.version << dendl;
+  ceph_assert(get_last_committed() == summary.version);
+
+  bufferlist summary_bl;
+  encode(summary, summary_bl, mon.get_quorum_con_features());
+
+  put_version_full(t, summary.version, summary_bl);
+  put_version_latest_full(t, summary.version);
+}
+ +version_t LogMonitor::get_trim_to() const +{ + if (!mon.is_leader()) + return 0; + + unsigned max = g_conf()->mon_max_log_epochs; + version_t version = get_last_committed(); + if (version > max) + return version - max; + return 0; +} + +bool LogMonitor::preprocess_query(MonOpRequestRef op) +{ + op->mark_logmon_event("preprocess_query"); + auto m = op->get_req<PaxosServiceMessage>(); + dout(10) << "preprocess_query " << *m << " from " << m->get_orig_source_inst() << dendl; + switch (m->get_type()) { + case MSG_MON_COMMAND: + try { + return preprocess_command(op); + } catch (const bad_cmd_get& e) { + bufferlist bl; + mon.reply_command(op, -EINVAL, e.what(), bl, get_last_committed()); + return true; + } + + case MSG_LOG: + return preprocess_log(op); + + default: + ceph_abort(); + return true; + } +} + +bool LogMonitor::prepare_update(MonOpRequestRef op) +{ + op->mark_logmon_event("prepare_update"); + auto m = op->get_req<PaxosServiceMessage>(); + dout(10) << "prepare_update " << *m << " from " << m->get_orig_source_inst() << dendl; + switch (m->get_type()) { + case MSG_MON_COMMAND: + try { + return prepare_command(op); + } catch (const bad_cmd_get& e) { + bufferlist bl; + mon.reply_command(op, -EINVAL, e.what(), bl, get_last_committed()); + return true; + } + case MSG_LOG: + return prepare_log(op); + default: + ceph_abort(); + return false; + } +} + +bool LogMonitor::preprocess_log(MonOpRequestRef op) +{ + op->mark_logmon_event("preprocess_log"); + auto m = op->get_req<MLog>(); + dout(10) << "preprocess_log " << *m << " from " << m->get_orig_source() << dendl; + int num_new = 0; + + MonSession *session = op->get_session(); + if (!session) + goto done; + if (!session->is_capable("log", MON_CAP_W)) { + dout(0) << "preprocess_log got MLog from entity with insufficient privileges " + << session->caps << dendl; + goto done; + } + + for (auto p = m->entries.begin(); + p != m->entries.end(); + ++p) { + if (!pending_summary.contains(p->key())) + num_new++; + } + if (!num_new) { + dout(10) << " nothing new" << dendl; + goto done; + } + + return false; + + done: + mon.no_reply(op); + return true; +} + +struct LogMonitor::C_Log : public C_MonOp { + LogMonitor *logmon; + C_Log(LogMonitor *p, MonOpRequestRef o) : + C_MonOp(o), logmon(p) {} + void _finish(int r) override { + if (r == -ECANCELED) { + return; + } + logmon->_updated_log(op); + } +}; + +bool LogMonitor::prepare_log(MonOpRequestRef op) +{ + op->mark_logmon_event("prepare_log"); + auto m = op->get_req<MLog>(); + dout(10) << "prepare_log " << *m << " from " << m->get_orig_source() << dendl; + + if (m->fsid != mon.monmap->fsid) { + dout(0) << "handle_log on fsid " << m->fsid << " != " << mon.monmap->fsid + << dendl; + return false; + } + + for (auto p = m->entries.begin(); + p != m->entries.end(); + ++p) { + dout(10) << " logging " << *p << dendl; + if (!pending_summary.contains(p->key())) { + pending_summary.add(*p); + pending_log.insert(pair<utime_t,LogEntry>(p->stamp, *p)); + } + } + pending_summary.prune(g_conf()->mon_log_max_summary); + wait_for_finished_proposal(op, new C_Log(this, op)); + return true; +} + +void LogMonitor::_updated_log(MonOpRequestRef op) +{ + auto m = op->get_req<MLog>(); + dout(7) << "_updated_log for " << m->get_orig_source_inst() << dendl; + mon.send_reply(op, new MLogAck(m->fsid, m->entries.rbegin()->seq)); +} + +bool LogMonitor::should_propose(double& delay) +{ + // commit now if we have a lot of pending events + if (g_conf()->mon_max_log_entries_per_event > 0 && + pending_log.size() >= 
(unsigned)g_conf()->mon_max_log_entries_per_event)
+    return true;
+
+  // otherwise fall back to generic policy
+  return PaxosService::should_propose(delay);
+}
+
+
+bool LogMonitor::preprocess_command(MonOpRequestRef op)
+{
+  op->mark_logmon_event("preprocess_command");
+  auto m = op->get_req<MMonCommand>();
+  int r = -EINVAL;
+  bufferlist rdata;
+  stringstream ss;
+
+  cmdmap_t cmdmap;
+  if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
+    string rs = ss.str();
+    mon.reply_command(op, -EINVAL, rs, get_last_committed());
+    return true;
+  }
+  MonSession *session = op->get_session();
+  if (!session) {
+    mon.reply_command(op, -EACCES, "access denied", get_last_committed());
+    return true;
+  }
+
+  string prefix;
+  cmd_getval(cmdmap, "prefix", prefix);
+
+  string format;
+  cmd_getval(cmdmap, "format", format, string("plain"));
+  boost::scoped_ptr<Formatter> f(Formatter::create(format));
+
+  if (prefix == "log last") {
+    int64_t num = 20;
+    cmd_getval(cmdmap, "num", num);
+    if (f) {
+      f->open_array_section("tail");
+    }
+
+    std::string level_str;
+    clog_type level;
+    if (cmd_getval(cmdmap, "level", level_str)) {
+      level = LogEntry::str_to_level(level_str);
+      if (level == CLOG_UNKNOWN) {
+        ss << "Invalid severity '" << level_str << "'";
+        mon.reply_command(op, -EINVAL, ss.str(), get_last_committed());
+        return true;
+      }
+    } else {
+      level = CLOG_INFO;
+    }
+
+    std::string channel;
+    if (!cmd_getval(cmdmap, "channel", channel)) {
+      channel = CLOG_CHANNEL_DEFAULT;
+    }
+
+    // We'll apply this twice, once while counting out lines
+    // and once while outputting them.
+    auto match = [level](const LogEntry &entry) {
+      return entry.prio >= level;
+    };
+
+    ostringstream ss; // note: shadows the error stream above; collects the plain-text tail
+    if (channel == "*") {
+      list<LogEntry> full_tail;
+      summary.build_ordered_tail(&full_tail);
+      derr << "full " << full_tail << dendl;
+      auto rp = full_tail.rbegin();
+      for (; num > 0 && rp != full_tail.rend(); ++rp) {
+        if (match(*rp)) {
+          num--;
+        }
+      }
+      if (rp == full_tail.rend()) {
+        --rp;
+      }
+
+      // Decrement a reverse iterator such that going past rbegin()
+      // sets it to rend().  This is for writing a for() loop that
+      // goes up to (and including) rbegin()
+      auto dec = [&rp, &full_tail] () {
+        if (rp == full_tail.rbegin()) {
+          rp = full_tail.rend();
+        } else {
+          --rp;
+        }
+      };
+
+      // Walk forward to the end of the container (the newest entry) by
+      // decrementing the reverse iterator, emitting matching entries.
+      for (; rp != full_tail.rend(); dec()) {
+        if (!match(*rp)) {
+          continue;
+        }
+        if (f) {
+          f->dump_object("entry", *rp);
+        } else {
+          ss << *rp << "\n";
+        }
+      }
+    } else {
+      auto p = summary.tail_by_channel.find(channel);
+      if (p != summary.tail_by_channel.end()) {
+        auto rp = p->second.rbegin();
+        for (; num > 0 && rp != p->second.rend(); ++rp) {
+          if (match(rp->second)) {
+            num--;
+          }
+        }
+        if (rp == p->second.rend()) {
+          --rp;
+        }
+
+        // Decrement a reverse iterator such that going past rbegin()
+        // sets it to rend().  This is for writing a for() loop that
+        // goes up to (and including) rbegin()
+        auto dec = [&rp, &p] () {
+          if (rp == p->second.rbegin()) {
+            rp = p->second.rend();
+          } else {
+            --rp;
+          }
+        };
+
+        // Walk forward to the end of the container (the newest entry) by
+        // decrementing the reverse iterator, emitting matching entries.
+ for (; rp != p->second.rend(); dec()) { + if (!match(rp->second)) { + continue; + } + if (f) { + f->dump_object("entry", rp->second); + } else { + ss << rp->second << "\n"; + } + } + } + } + if (f) { + f->close_section(); + f->flush(rdata); + } else { + rdata.append(ss.str()); + } + r = 0; + } else { + return false; + } + + string rs; + getline(ss, rs); + mon.reply_command(op, r, rs, rdata, get_last_committed()); + return true; +} + + +bool LogMonitor::prepare_command(MonOpRequestRef op) +{ + op->mark_logmon_event("prepare_command"); + auto m = op->get_req<MMonCommand>(); + stringstream ss; + string rs; + int err = -EINVAL; + + cmdmap_t cmdmap; + if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) { + // ss has reason for failure + string rs = ss.str(); + mon.reply_command(op, -EINVAL, rs, get_last_committed()); + return true; + } + + string prefix; + cmd_getval(cmdmap, "prefix", prefix); + + MonSession *session = op->get_session(); + if (!session) { + mon.reply_command(op, -EACCES, "access denied", get_last_committed()); + return true; + } + + if (prefix == "log") { + vector<string> logtext; + string level_str; + cmd_getval(cmdmap, "logtext", logtext); + LogEntry le; + le.rank = m->get_orig_source(); + le.addrs.v.push_back(m->get_orig_source_addr()); + le.name = session->entity_name; + le.stamp = m->get_recv_stamp(); + le.seq = 0; + cmd_getval(cmdmap, "level", level_str, string("info")); + le.prio = LogEntry::str_to_level(level_str); + le.channel = CLOG_CHANNEL_DEFAULT; + le.msg = str_join(logtext, " "); + pending_summary.add(le); + pending_summary.prune(g_conf()->mon_log_max_summary); + pending_log.insert(pair<utime_t,LogEntry>(le.stamp, le)); + wait_for_finished_proposal(op, new Monitor::C_Command( + mon, op, 0, string(), get_last_committed() + 1)); + return true; + } + + getline(ss, rs); + mon.reply_command(op, err, rs, get_last_committed()); + return false; +} + + +int LogMonitor::sub_name_to_id(const string& n) +{ + if (n.substr(0, 4) == "log-" && n.size() > 4) { + return LogEntry::str_to_level(n.substr(4)); + } else { + return CLOG_UNKNOWN; + } +} + +void LogMonitor::check_subs() +{ + dout(10) << __func__ << dendl; + for (map<string, xlist<Subscription*>*>::iterator i = mon.session_map.subs.begin(); + i != mon.session_map.subs.end(); + ++i) { + for (xlist<Subscription*>::iterator j = i->second->begin(); !j.end(); ++j) { + if (sub_name_to_id((*j)->type) >= 0) + check_sub(*j); + } + } +} + +void LogMonitor::check_sub(Subscription *s) +{ + dout(10) << __func__ << " client wants " << s->type << " ver " << s->next << dendl; + + int sub_level = sub_name_to_id(s->type); + ceph_assert(sub_level >= 0); + + version_t summary_version = summary.version; + if (s->next > summary_version) { + dout(10) << __func__ << " client " << s->session->name + << " requested version (" << s->next << ") is greater than ours (" + << summary_version << "), which means we already sent him" + << " everything we have." << dendl; + return; + } + + MLog *mlog = new MLog(mon.monmap->fsid); + + if (s->next == 0) { + /* First timer, heh? */ + _create_sub_incremental(mlog, sub_level, get_last_committed()); + } else { + /* let us send you an incremental log... 
*/ + _create_sub_incremental(mlog, sub_level, s->next); + } + + dout(10) << __func__ << " sending message to " << s->session->name + << " with " << mlog->entries.size() << " entries" + << " (version " << mlog->version << ")" << dendl; + + if (!mlog->entries.empty()) { + s->session->con->send_message(mlog); + } else { + mlog->put(); + } + if (s->onetime) + mon.session_map.remove_sub(s); + else + s->next = summary_version+1; +} + +/** + * Create an incremental log message from version \p sv to \p summary.version + * + * @param mlog Log message we'll send to the client with the messages received + * since version \p sv, inclusive. + * @param level The max log level of the messages the client is interested in. + * @param sv The version the client is looking for. + */ +void LogMonitor::_create_sub_incremental(MLog *mlog, int level, version_t sv) +{ + dout(10) << __func__ << " level " << level << " ver " << sv + << " cur summary ver " << summary.version << dendl; + + if (sv < get_first_committed()) { + dout(10) << __func__ << " skipped from " << sv + << " to first_committed " << get_first_committed() << dendl; + LogEntry le; + le.stamp = ceph_clock_now(); + le.prio = CLOG_WARN; + ostringstream ss; + ss << "skipped log messages from " << sv << " to " << get_first_committed(); + le.msg = ss.str(); + mlog->entries.push_back(le); + sv = get_first_committed(); + } + + version_t summary_ver = summary.version; + while (sv && sv <= summary_ver) { + bufferlist bl; + int err = get_version(sv, bl); + ceph_assert(err == 0); + ceph_assert(bl.length()); + auto p = bl.cbegin(); + __u8 v; + decode(v,p); + while (!p.end()) { + LogEntry le; + le.decode(p); + + if (le.prio < level) { + dout(20) << __func__ << " requested " << level + << " entry " << le.prio << dendl; + continue; + } + + mlog->entries.push_back(le); + } + mlog->version = sv++; + } + + dout(10) << __func__ << " incremental message ready (" + << mlog->entries.size() << " entries)" << dendl; +} + +void LogMonitor::update_log_channels() +{ + ostringstream oss; + + channels.clear(); + + int r = get_conf_str_map_helper( + g_conf().get_val<string>("mon_cluster_log_to_syslog"), + oss, &channels.log_to_syslog, + CLOG_CONFIG_DEFAULT_KEY); + if (r < 0) { + derr << __func__ << " error parsing 'mon_cluster_log_to_syslog'" << dendl; + return; + } + + r = get_conf_str_map_helper( + g_conf().get_val<string>("mon_cluster_log_to_syslog_level"), + oss, &channels.syslog_level, + CLOG_CONFIG_DEFAULT_KEY); + if (r < 0) { + derr << __func__ << " error parsing 'mon_cluster_log_to_syslog_level'" + << dendl; + return; + } + + r = get_conf_str_map_helper( + g_conf().get_val<string>("mon_cluster_log_to_syslog_facility"), + oss, &channels.syslog_facility, + CLOG_CONFIG_DEFAULT_KEY); + if (r < 0) { + derr << __func__ << " error parsing 'mon_cluster_log_to_syslog_facility'" + << dendl; + return; + } + + r = get_conf_str_map_helper( + g_conf().get_val<string>("mon_cluster_log_file"), oss, + &channels.log_file, + CLOG_CONFIG_DEFAULT_KEY); + if (r < 0) { + derr << __func__ << " error parsing 'mon_cluster_log_file'" << dendl; + return; + } + + r = get_conf_str_map_helper( + g_conf().get_val<string>("mon_cluster_log_file_level"), oss, + &channels.log_file_level, + CLOG_CONFIG_DEFAULT_KEY); + if (r < 0) { + derr << __func__ << " error parsing 'mon_cluster_log_file_level'" + << dendl; + return; + } + + r = get_conf_str_map_helper( + g_conf().get_val<string>("mon_cluster_log_to_graylog"), oss, + &channels.log_to_graylog, + CLOG_CONFIG_DEFAULT_KEY); + if (r < 0) { + derr << __func__ << " 
error parsing 'mon_cluster_log_to_graylog'" + << dendl; + return; + } + + r = get_conf_str_map_helper( + g_conf().get_val<string>("mon_cluster_log_to_graylog_host"), oss, + &channels.log_to_graylog_host, + CLOG_CONFIG_DEFAULT_KEY); + if (r < 0) { + derr << __func__ << " error parsing 'mon_cluster_log_to_graylog_host'" + << dendl; + return; + } + + r = get_conf_str_map_helper( + g_conf().get_val<string>("mon_cluster_log_to_graylog_port"), oss, + &channels.log_to_graylog_port, + CLOG_CONFIG_DEFAULT_KEY); + if (r < 0) { + derr << __func__ << " error parsing 'mon_cluster_log_to_graylog_port'" + << dendl; + return; + } + + channels.expand_channel_meta(); +} + + +void LogMonitor::handle_conf_change(const ConfigProxy& conf, + const std::set<std::string> &changed) +{ + if (changed.count("mon_cluster_log_to_syslog") || + changed.count("mon_cluster_log_to_syslog_level") || + changed.count("mon_cluster_log_to_syslog_facility") || + changed.count("mon_cluster_log_file") || + changed.count("mon_cluster_log_file_level") || + changed.count("mon_cluster_log_to_graylog") || + changed.count("mon_cluster_log_to_graylog_host") || + changed.count("mon_cluster_log_to_graylog_port")) { + update_log_channels(); + } +} diff --git a/src/mon/LogMonitor.h b/src/mon/LogMonitor.h new file mode 100644 index 000000000..6d6a0b71c --- /dev/null +++ b/src/mon/LogMonitor.h @@ -0,0 +1,189 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef CEPH_LOGMONITOR_H +#define CEPH_LOGMONITOR_H + +#include <map> +#include <set> + +#include "include/types.h" +#include "PaxosService.h" + +#include "common/config_fwd.h" +#include "common/LogEntry.h" +#include "include/str_map.h" + +class MLog; + +static const std::string LOG_META_CHANNEL = "$channel"; + +namespace ceph { +namespace logging { + class Graylog; +} +} + +class LogMonitor : public PaxosService, + public md_config_obs_t { +private: + std::multimap<utime_t,LogEntry> pending_log; + LogSummary pending_summary, summary; + + struct log_channel_info { + + std::map<std::string,std::string> log_to_syslog; + std::map<std::string,std::string> syslog_level; + std::map<std::string,std::string> syslog_facility; + std::map<std::string,std::string> log_file; + std::map<std::string,std::string> expanded_log_file; + std::map<std::string,std::string> log_file_level; + std::map<std::string,std::string> log_to_graylog; + std::map<std::string,std::string> log_to_graylog_host; + std::map<std::string,std::string> log_to_graylog_port; + + std::map<std::string, std::shared_ptr<ceph::logging::Graylog>> graylogs; + uuid_d fsid; + std::string host; + + void clear() { + log_to_syslog.clear(); + syslog_level.clear(); + syslog_facility.clear(); + log_file.clear(); + expanded_log_file.clear(); + log_file_level.clear(); + log_to_graylog.clear(); + log_to_graylog_host.clear(); + log_to_graylog_port.clear(); + graylogs.clear(); + } + + /** expands $channel meta variable on all maps *EXCEPT* log_file + * + * We won't expand the log_file map meta variables here because we + * intend to do that selectively during get_log_file() + */ + void expand_channel_meta() { + expand_channel_meta(log_to_syslog); + expand_channel_meta(syslog_level); + expand_channel_meta(syslog_facility); + expand_channel_meta(log_file_level); + } + void expand_channel_meta(std::map<std::string,std::string> &m); + std::string expand_channel_meta(const std::string &input, + const std::string &change_to); + + bool do_log_to_syslog(const std::string &channel); + + std::string get_facility(const std::string &channel) { + return get_str_map_key(syslog_facility, channel, + &CLOG_CONFIG_DEFAULT_KEY); + } + + std::string get_level(const std::string &channel) { + return get_str_map_key(syslog_level, channel, + &CLOG_CONFIG_DEFAULT_KEY); + } + + std::string get_log_file(const std::string &channel); + + std::string get_log_file_level(const std::string &channel) { + return get_str_map_key(log_file_level, channel, + &CLOG_CONFIG_DEFAULT_KEY); + } + + bool do_log_to_graylog(const std::string &channel) { + return (get_str_map_key(log_to_graylog, channel, + &CLOG_CONFIG_DEFAULT_KEY) == "true"); + } + + std::shared_ptr<ceph::logging::Graylog> get_graylog(const std::string &channel); + } channels; + + void update_log_channels(); + + void create_initial() override; + void update_from_paxos(bool *need_bootstrap) override; + void create_pending() override; // prepare a new pending + // propose pending update to peers + void encode_pending(MonitorDBStore::TransactionRef t) override; + void encode_full(MonitorDBStore::TransactionRef t) override; + version_t get_trim_to() const override; + bool preprocess_query(MonOpRequestRef op) override; // true if processed. 
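+  // handle updates that require a proposal (MLog or MMonCommand)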
+ bool prepare_update(MonOpRequestRef op) override; + + bool preprocess_log(MonOpRequestRef op); + bool prepare_log(MonOpRequestRef op); + void _updated_log(MonOpRequestRef op); + + bool should_propose(double& delay) override; + + bool should_stash_full() override { + // commit a LogSummary on every commit + return true; + } + + struct C_Log; + + bool preprocess_command(MonOpRequestRef op); + bool prepare_command(MonOpRequestRef op); + + void _create_sub_incremental(MLog *mlog, int level, version_t sv); + + public: + LogMonitor(Monitor &mn, Paxos &p, const std::string& service_name) + : PaxosService(mn, p, service_name) { } + + void init() override { + generic_dout(10) << "LogMonitor::init" << dendl; + g_conf().add_observer(this); + update_log_channels(); + } + + void tick() override; // check state, take actions + + void check_subs(); + void check_sub(Subscription *s); + + /** + * translate log sub name ('log-info') to integer id + * + * @param n name + * @return id, or -1 if unrecognized + */ + int sub_name_to_id(const std::string& n); + + void on_shutdown() override { + g_conf().remove_observer(this); + } + + const char **get_tracked_conf_keys() const override { + static const char* KEYS[] = { + "mon_cluster_log_to_syslog", + "mon_cluster_log_to_syslog_level", + "mon_cluster_log_to_syslog_facility", + "mon_cluster_log_file", + "mon_cluster_log_file_level", + "mon_cluster_log_to_graylog", + "mon_cluster_log_to_graylog_host", + "mon_cluster_log_to_graylog_port", + NULL + }; + return KEYS; + } + void handle_conf_change(const ConfigProxy& conf, + const std::set<std::string> &changed) override; +}; +#endif diff --git a/src/mon/MDSMonitor.cc b/src/mon/MDSMonitor.cc new file mode 100644 index 000000000..2ec7a2018 --- /dev/null +++ b/src/mon/MDSMonitor.cc @@ -0,0 +1,2370 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#include <regex> +#include <sstream> +#include <boost/utility.hpp> + +#include "MDSMonitor.h" +#include "FSCommands.h" +#include "Monitor.h" +#include "MonitorDBStore.h" +#include "OSDMonitor.h" + +#include "common/strtol.h" +#include "common/perf_counters.h" +#include "common/config.h" +#include "common/cmdparse.h" +#include "messages/MMDSMap.h" +#include "messages/MFSMap.h" +#include "messages/MFSMapUser.h" +#include "messages/MMDSLoadTargets.h" +#include "messages/MMonCommand.h" +#include "messages/MGenericMessage.h" + +#include "include/ceph_assert.h" +#include "include/str_list.h" +#include "include/stringify.h" +#include "mds/mdstypes.h" +#include "Session.h" + +using namespace TOPNSPC::common; + +using std::dec; +using std::hex; +using std::list; +using std::map; +using std::make_pair; +using std::ostream; +using std::ostringstream; +using std::pair; +using std::set; +using std::string; +using std::string_view; +using std::stringstream; +using std::to_string; +using std::vector; + +using ceph::bufferlist; +using ceph::decode; +using ceph::encode; +using ceph::ErasureCodeInterfaceRef; +using ceph::ErasureCodeProfile; +using ceph::Formatter; +using ceph::JSONFormatter; +using ceph::make_message; +using ceph::mono_clock; +using ceph::mono_time; + +#define dout_subsys ceph_subsys_mon +#undef dout_prefix +#define dout_prefix _prefix(_dout, mon, get_fsmap()) +static ostream& _prefix(std::ostream *_dout, Monitor &mon, const FSMap& fsmap) { + return *_dout << "mon." << mon.name << "@" << mon.rank + << "(" << mon.get_state_name() + << ").mds e" << fsmap.get_epoch() << " "; +} + +static const string MDS_METADATA_PREFIX("mds_metadata"); +static const string MDS_HEALTH_PREFIX("mds_health"); + + +/* + * Specialized implementation of cmd_getval to allow us to parse + * out strongly-typedef'd types + */ +namespace TOPNSPC::common { +template<> bool cmd_getval(const cmdmap_t& cmdmap, + const std::string& k, mds_gid_t &val) +{ + return cmd_getval(cmdmap, k, (int64_t&)val); +} + +template<> bool cmd_getval(const cmdmap_t& cmdmap, + const std::string& k, mds_rank_t &val) +{ + return cmd_getval(cmdmap, k, (int64_t&)val); +} + +template<> bool cmd_getval(const cmdmap_t& cmdmap, + const std::string& k, MDSMap::DaemonState &val) +{ + return cmd_getval(cmdmap, k, (int64_t&)val); +} +} +// my methods + +template <int dblV> +void MDSMonitor::print_map(const FSMap& m) +{ + dout(dblV) << "print_map\n"; + m.print(*_dout); + *_dout << dendl; +} + +// service methods +void MDSMonitor::create_initial() +{ + dout(10) << "create_initial" << dendl; +} + +void MDSMonitor::get_store_prefixes(std::set<string>& s) const +{ + s.insert(service_name); + s.insert(MDS_METADATA_PREFIX); + s.insert(MDS_HEALTH_PREFIX); +} + +void MDSMonitor::update_from_paxos(bool *need_bootstrap) +{ + version_t version = get_last_committed(); + if (version == get_fsmap().epoch) + return; + + dout(10) << __func__ << " version " << version + << ", my e " << get_fsmap().epoch << dendl; + ceph_assert(version > get_fsmap().epoch); + + load_health(); + + // read and decode + bufferlist fsmap_bl; + fsmap_bl.clear(); + int err = get_version(version, fsmap_bl); + ceph_assert(err == 0); + + ceph_assert(fsmap_bl.length() > 0); + dout(10) << __func__ << " got " << version << dendl; + try { + PaxosFSMap::decode(fsmap_bl); + } catch (const ceph::buffer::malformed_input& e) { + derr << "unable to decode FSMap: " << e.what() << dendl; + throw; + } + + // new map + dout(0) << "new map" << dendl; + print_map<0>(get_fsmap()); + if 
(!g_conf()->mon_mds_skip_sanity) { + get_fsmap().sanity(); + } + + check_subs(); +} + +void MDSMonitor::init() +{ + (void)load_metadata(pending_metadata); +} + +void MDSMonitor::create_pending() +{ + auto &fsmap = PaxosFSMap::create_pending(); + + if (mon.osdmon()->is_readable()) { + const auto &osdmap = mon.osdmon()->osdmap; + fsmap.sanitize([&osdmap](int64_t pool){return osdmap.have_pg_pool(pool);}); + } + + dout(10) << "create_pending e" << fsmap.epoch << dendl; +} + +void MDSMonitor::encode_pending(MonitorDBStore::TransactionRef t) +{ + auto &pending = get_pending_fsmap_writeable(); + auto &epoch = pending.epoch; + + dout(10) << "encode_pending e" << epoch << dendl; + + // print map iff 'debug mon = 30' or higher + print_map<30>(pending); + if (!g_conf()->mon_mds_skip_sanity) { + pending.sanity(true); + } + + // Set 'modified' on maps modified this epoch + for (auto &p : pending.filesystems) { + if (p.second->mds_map.epoch == epoch) { + p.second->mds_map.modified = ceph_clock_now(); + } + } + + // apply to paxos + ceph_assert(get_last_committed() + 1 == pending.epoch); + bufferlist pending_bl; + pending.encode(pending_bl, mon.get_quorum_con_features()); + + /* put everything in the transaction */ + put_version(t, pending.epoch, pending_bl); + put_last_committed(t, pending.epoch); + + // Encode MDSHealth data + for (std::map<uint64_t, MDSHealth>::iterator i = pending_daemon_health.begin(); + i != pending_daemon_health.end(); ++i) { + bufferlist bl; + i->second.encode(bl); + t->put(MDS_HEALTH_PREFIX, stringify(i->first), bl); + } + + for (std::set<uint64_t>::iterator i = pending_daemon_health_rm.begin(); + i != pending_daemon_health_rm.end(); ++i) { + t->erase(MDS_HEALTH_PREFIX, stringify(*i)); + } + pending_daemon_health_rm.clear(); + remove_from_metadata(pending, t); + + // health + health_check_map_t new_checks; + const auto &info_map = pending.get_mds_info(); + for (const auto &i : info_map) { + const auto &gid = i.first; + const auto &info = i.second; + if (pending_daemon_health_rm.count(gid)) { + continue; + } + MDSHealth health; + auto p = pending_daemon_health.find(gid); + if (p != pending_daemon_health.end()) { + health = p->second; + } else { + bufferlist bl; + mon.store->get(MDS_HEALTH_PREFIX, stringify(gid), bl); + if (!bl.length()) { + derr << "Missing health data for MDS " << gid << dendl; + continue; + } + auto bl_i = bl.cbegin(); + health.decode(bl_i); + } + for (const auto &metric : health.metrics) { + if (metric.type == MDS_HEALTH_DUMMY) { + continue; + } + const auto rank = info.rank; + health_check_t *check = &new_checks.get_or_add( + mds_metric_name(metric.type), + metric.sev, + mds_metric_summary(metric.type), + 1); + ostringstream ss; + ss << "mds." << info.name << "(mds." << rank << "): " << metric.message; + bool first = true; + for (auto &p : metric.metadata) { + if (first) { + ss << " "; + } else { + ss << ", "; + } + ss << p.first << ": " << p.second; + first = false; + } + check->detail.push_back(ss.str()); + } + } + pending.get_health_checks(&new_checks); + for (auto& p : new_checks.checks) { + p.second.summary = std::regex_replace( + p.second.summary, + std::regex("%num%"), + stringify(p.second.detail.size())); + p.second.summary = std::regex_replace( + p.second.summary, + std::regex("%plurals%"), + p.second.detail.size() > 1 ? "s" : ""); + p.second.summary = std::regex_replace( + p.second.summary, + std::regex("%isorare%"), + p.second.detail.size() > 1 ? 
"are" : "is"); + p.second.summary = std::regex_replace( + p.second.summary, + std::regex("%hasorhave%"), + p.second.detail.size() > 1 ? "have" : "has"); + } + encode_health(new_checks, t); +} + +version_t MDSMonitor::get_trim_to() const +{ + version_t floor = 0; + if (g_conf()->mon_mds_force_trim_to > 0 && + g_conf()->mon_mds_force_trim_to <= (int)get_last_committed()) { + floor = g_conf()->mon_mds_force_trim_to; + dout(10) << __func__ << " explicit mon_mds_force_trim_to = " + << floor << dendl; + } + + unsigned max = g_conf()->mon_max_mdsmap_epochs; + version_t last = get_last_committed(); + + if (last - get_first_committed() > max && floor < last - max) { + floor = last-max; + } + + dout(20) << __func__ << " = " << floor << dendl; + return floor; +} + +bool MDSMonitor::preprocess_query(MonOpRequestRef op) +{ + op->mark_mdsmon_event(__func__); + auto m = op->get_req<PaxosServiceMessage>(); + dout(10) << "preprocess_query " << *m << " from " << m->get_orig_source() + << " " << m->get_orig_source_addrs() << dendl; + + switch (m->get_type()) { + + case MSG_MDS_BEACON: + return preprocess_beacon(op); + + case MSG_MON_COMMAND: + try { + return preprocess_command(op); + } catch (const bad_cmd_get& e) { + bufferlist bl; + mon.reply_command(op, -EINVAL, e.what(), bl, get_last_committed()); + return true; + } + + case MSG_MDS_OFFLOAD_TARGETS: + return preprocess_offload_targets(op); + + default: + ceph_abort(); + return true; + } +} + +void MDSMonitor::_note_beacon(MMDSBeacon *m) +{ + mds_gid_t gid = mds_gid_t(m->get_global_id()); + version_t seq = m->get_seq(); + + dout(5) << "_note_beacon " << *m << " noting time" << dendl; + auto &beacon = last_beacon[gid]; + beacon.stamp = mono_clock::now(); + beacon.seq = seq; +} + +bool MDSMonitor::preprocess_beacon(MonOpRequestRef op) +{ + op->mark_mdsmon_event(__func__); + auto m = op->get_req<MMDSBeacon>(); + MDSMap::DaemonState state = m->get_state(); + mds_gid_t gid = m->get_global_id(); + version_t seq = m->get_seq(); + MDSMap::mds_info_t info; + epoch_t effective_epoch = 0; + + const auto &fsmap = get_fsmap(); + + // check privileges, ignore if fails + MonSession *session = op->get_session(); + if (!session) + goto ignore; + if (!session->is_capable("mds", MON_CAP_X)) { + dout(0) << "preprocess_beacon got MMDSBeacon from entity with insufficient privileges " + << session->caps << dendl; + goto ignore; + } + + if (m->get_fsid() != mon.monmap->fsid) { + dout(0) << "preprocess_beacon on fsid " << m->get_fsid() << " != " << mon.monmap->fsid << dendl; + goto ignore; + } + + dout(5) << "preprocess_beacon " << *m + << " from " << m->get_orig_source() + << " " << m->get_orig_source_addrs() + << " " << m->get_compat() + << dendl; + + // make sure the address has a port + if (m->get_orig_source_addr().get_port() == 0) { + dout(1) << " ignoring boot message without a port" << dendl; + goto ignore; + } + + // fw to leader? + if (!is_leader()) + return false; + + // booted, but not in map? + if (!fsmap.gid_exists(gid)) { + if (state != MDSMap::STATE_BOOT) { + dout(7) << "mds_beacon " << *m << " is not in fsmap (state " + << ceph_mds_state_name(state) << ")" << dendl; + + /* We can't send an MDSMap this MDS was a part of because we no longer + * know which FS it was part of. Nor does this matter. Sending an empty + * MDSMap is sufficient for getting the MDS to respawn. + */ + auto m = make_message<MMDSMap>(mon.monmap->fsid, MDSMap::create_null_mdsmap()); + mon.send_reply(op, m.detach()); + return true; + } else { + return false; // not booted yet. 
+ } + } + dout(10) << __func__ << ": GID exists in map: " << gid << dendl; + info = fsmap.get_info_gid(gid); + + if (state == MDSMap::STATE_DNE) { + return false; + } + + // old seq? + if (info.state_seq > seq) { + dout(7) << "mds_beacon " << *m << " has old seq, ignoring" << dendl; + goto ignore; + } + + // Work out the latest epoch that this daemon should have seen + { + fs_cluster_id_t fscid = fsmap.mds_roles.at(gid); + if (fscid == FS_CLUSTER_ID_NONE) { + effective_epoch = fsmap.standby_epochs.at(gid); + } else { + effective_epoch = fsmap.get_filesystem(fscid)->mds_map.epoch; + } + if (effective_epoch != m->get_last_epoch_seen()) { + dout(10) << "mds_beacon " << *m + << " ignoring requested state, because mds hasn't seen latest map" << dendl; + goto reply; + } + } + + if (info.laggy()) { + _note_beacon(m); + return false; // no longer laggy, need to update map. + } + if (state == MDSMap::STATE_BOOT) { + // ignore, already booted. + goto ignore; + } + + // did the join_fscid change + if (m->get_fs().size()) { + fs_cluster_id_t fscid = FS_CLUSTER_ID_NONE; + auto f = fsmap.get_filesystem(m->get_fs()); + if (f) { + fscid = f->fscid; + } + if (info.join_fscid != fscid) { + dout(10) << __func__ << " standby mds_join_fs changed to " << fscid + << " (" << m->get_fs() << ")" << dendl; + _note_beacon(m); + return false; + } + } else { + if (info.join_fscid != FS_CLUSTER_ID_NONE) { + dout(10) << __func__ << " standby mds_join_fs was cleared" << dendl; + _note_beacon(m); + return false; + } + } + + // is there a state change here? + if (info.state != state) { + _note_beacon(m); + return false; + } + + // Comparing known daemon health with m->get_health() + // and return false (i.e. require proposal) if they + // do not match, to update our stored + if (!(pending_daemon_health[gid] == m->get_health())) { + dout(10) << __func__ << " health metrics for gid " << gid << " were updated" << dendl; + _note_beacon(m); + return false; + } + + reply: + // note time and reply + ceph_assert(effective_epoch > 0); + _note_beacon(m); + { + auto beacon = make_message<MMDSBeacon>(mon.monmap->fsid, + m->get_global_id(), m->get_name(), effective_epoch, + state, seq, CEPH_FEATURES_SUPPORTED_DEFAULT); + mon.send_reply(op, beacon.detach()); + } + return true; + + ignore: + // I won't reply this beacon, drop it. 
+ mon.no_reply(op); + return true; +} + +bool MDSMonitor::preprocess_offload_targets(MonOpRequestRef op) +{ + op->mark_mdsmon_event(__func__); + auto m = op->get_req<MMDSLoadTargets>(); + dout(10) << "preprocess_offload_targets " << *m << " from " << m->get_orig_source() << dendl; + + const auto &fsmap = get_fsmap(); + + // check privileges, ignore message if fails + MonSession *session = op->get_session(); + if (!session) + goto ignore; + if (!session->is_capable("mds", MON_CAP_X)) { + dout(0) << "preprocess_offload_targets got MMDSLoadTargets from entity with insufficient caps " + << session->caps << dendl; + goto ignore; + } + + if (fsmap.gid_exists(m->global_id) && + m->targets == fsmap.get_info_gid(m->global_id).export_targets) + goto ignore; + + return false; + + ignore: + mon.no_reply(op); + return true; +} + + +bool MDSMonitor::prepare_update(MonOpRequestRef op) +{ + op->mark_mdsmon_event(__func__); + auto m = op->get_req<PaxosServiceMessage>(); + dout(7) << "prepare_update " << *m << dendl; + + switch (m->get_type()) { + + case MSG_MDS_BEACON: + return prepare_beacon(op); + + case MSG_MON_COMMAND: + try { + return prepare_command(op); + } catch (const bad_cmd_get& e) { + bufferlist bl; + mon.reply_command(op, -EINVAL, e.what(), bl, get_last_committed()); + return true; + } + + case MSG_MDS_OFFLOAD_TARGETS: + return prepare_offload_targets(op); + + default: + ceph_abort(); + } + + return true; +} + +bool MDSMonitor::prepare_beacon(MonOpRequestRef op) +{ + op->mark_mdsmon_event(__func__); + auto m = op->get_req<MMDSBeacon>(); + // -- this is an update -- + dout(12) << "prepare_beacon " << *m << " from " << m->get_orig_source() + << " " << m->get_orig_source_addrs() << dendl; + entity_addrvec_t addrs = m->get_orig_source_addrs(); + mds_gid_t gid = m->get_global_id(); + MDSMap::DaemonState state = m->get_state(); + version_t seq = m->get_seq(); + + auto &pending = get_pending_fsmap_writeable(); + + dout(15) << __func__ << " got health from gid " << gid << " with " << m->get_health().metrics.size() << " metrics." << dendl; + + // Calculate deltas of health metrics created and removed + // Do this by type rather than MDSHealthMetric equality, because messages can + // change a lot when they include e.g. a number of items. + const auto &old_health = pending_daemon_health[gid].metrics; + const auto &new_health = m->get_health().metrics; + + std::set<mds_metric_t> old_types; + for (const auto &i : old_health) { + old_types.insert(i.type); + } + + std::set<mds_metric_t> new_types; + for (const auto &i : new_health) { + if (i.type == MDS_HEALTH_DUMMY) { + continue; + } + new_types.insert(i.type); + } + + for (const auto &new_metric: new_health) { + if (new_metric.type == MDS_HEALTH_DUMMY) { + continue; + } + if (old_types.count(new_metric.type) == 0) { + dout(10) << "MDS health message (" << m->get_orig_source() + << "): " << new_metric.sev << " " << new_metric.message << dendl; + } + } + + // Log the disappearance of health messages at INFO + for (const auto &old_metric : old_health) { + if (new_types.count(old_metric.type) == 0) { + mon.clog->info() << "MDS health message cleared (" + << m->get_orig_source() << "): " << old_metric.message; + } + } + + // Store health + pending_daemon_health[gid] = m->get_health(); + + const auto& cs = m->get_compat(); + if (state == MDSMap::STATE_BOOT) { + // zap previous instance of this name? 
+ if (g_conf()->mds_enforce_unique_name) { + bool failed_mds = false; + while (mds_gid_t existing = pending.find_mds_gid_by_name(m->get_name())) { + if (!mon.osdmon()->is_writeable()) { + mon.osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op)); + return false; + } + const auto& existing_info = pending.get_info_gid(existing); + mon.clog->info() << existing_info.human_name() << " restarted"; + fail_mds_gid(pending, existing); + failed_mds = true; + } + if (failed_mds) { + ceph_assert(mon.osdmon()->is_writeable()); + request_proposal(mon.osdmon()); + } + } + + // Add this daemon to the map + if (pending.mds_roles.count(gid) == 0) { + MDSMap::mds_info_t new_info; + new_info.global_id = gid; + new_info.name = m->get_name(); + new_info.addrs = addrs; + new_info.mds_features = m->get_mds_features(); + new_info.state = MDSMap::STATE_STANDBY; + new_info.state_seq = seq; + new_info.compat = cs; + if (m->get_fs().size()) { + fs_cluster_id_t fscid = FS_CLUSTER_ID_NONE; + auto f = pending.get_filesystem(m->get_fs()); + if (f) { + fscid = f->fscid; + } + new_info.join_fscid = fscid; + } + pending.insert(new_info); + } + + // initialize the beacon timer + auto &beacon = last_beacon[gid]; + beacon.stamp = mono_clock::now(); + beacon.seq = seq; + + update_metadata(m->get_global_id(), m->get_sys_info()); + } else { + // state update + + if (!pending.gid_exists(gid)) { + /* gid has been removed from pending, send null map */ + dout(5) << "mds_beacon " << *m << " is not in fsmap (state " + << ceph_mds_state_name(state) << ")" << dendl; + + /* We can't send an MDSMap this MDS was a part of because we no longer + * know which FS it was part of. Nor does this matter. Sending an empty + * MDSMap is sufficient for getting the MDS to respawn. + */ + goto null; + } + + const auto& info = pending.get_info_gid(gid); + + // did the reported compat change? That's illegal! + if (cs.compare(info.compat) != 0) { + if (!mon.osdmon()->is_writeable()) { + mon.osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op)); + return false; + } + mon.clog->warn() << info.human_name() << " compat changed unexpectedly"; + fail_mds_gid(pending, gid); + request_proposal(mon.osdmon()); + return true; + } + + if (state == MDSMap::STATE_DNE) { + dout(1) << __func__ << ": DNE from " << info << dendl; + goto evict; + } + + // legal state change? + if ((info.state == MDSMap::STATE_STANDBY && state != info.state) || + (info.state == MDSMap::STATE_STANDBY_REPLAY && state != info.state && state != MDSMap::STATE_DAMAGED)) { + // Standby daemons should never modify their own state. + // Except that standby-replay can indicate the rank is damaged due to failure to replay. + // Reject any attempts to do so. + derr << "standby " << gid << " attempted to change state to " + << ceph_mds_state_name(state) << ", rejecting" << dendl; + goto evict; + } else if (info.state != MDSMap::STATE_STANDBY && state != info.state && + !MDSMap::state_transition_valid(info.state, state)) { + // Validate state transitions for daemons that hold a rank + derr << "daemon " << gid << " (rank " << info.rank << ") " + << "reported invalid state transition " + << ceph_mds_state_name(info.state) << " -> " + << ceph_mds_state_name(state) << dendl; + goto evict; + } + + if (info.laggy()) { + dout(1) << "prepare_beacon clearing laggy flag on " << addrs << dendl; + pending.modify_daemon(info.global_id, [](auto& info) + { + info.clear_laggy(); + } + ); + } + + dout(5) << "prepare_beacon mds." 
<< info.rank + << " " << ceph_mds_state_name(info.state) + << " -> " << ceph_mds_state_name(state) + << dendl; + + fs_cluster_id_t fscid = FS_CLUSTER_ID_NONE; + if (m->get_fs().size()) { + auto f = pending.get_filesystem(m->get_fs()); + if (f) { + fscid = f->fscid; + } + } + pending.modify_daemon(gid, [fscid](auto& info) { + info.join_fscid = fscid; + }); + + if (state == MDSMap::STATE_STOPPED) { + const auto fscid = pending.mds_roles.at(gid); + const auto &fs = pending.get_filesystem(fscid); + + mon.clog->info() << info.human_name() << " finished " + << "stopping rank " << info.rank << " in filesystem " + << fs->mds_map.fs_name << " (now has " + << fs->mds_map.get_num_in_mds() - 1 << " ranks)"; + + auto erased = pending.stop(gid); + erased.push_back(gid); + + for (const auto& erased_gid : erased) { + last_beacon.erase(erased_gid); + if (pending_daemon_health.count(erased_gid)) { + pending_daemon_health.erase(erased_gid); + pending_daemon_health_rm.insert(erased_gid); + } + } + } else if (state == MDSMap::STATE_DAMAGED) { + if (!mon.osdmon()->is_writeable()) { + dout(1) << __func__ << ": DAMAGED from rank " << info.rank + << " waiting for osdmon writeable to blocklist it" << dendl; + mon.osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op)); + return false; + } + + auto rank = info.rank; + + // Record this MDS rank as damaged, so that other daemons + // won't try to run it. + dout(0) << __func__ << ": marking rank " << rank << " damaged" << dendl; + + auto fs = pending.get_filesystem(gid); + auto rankgid = fs->mds_map.get_gid(rank); + auto rankinfo = pending.get_info_gid(rankgid); + auto followergid = fs->mds_map.get_standby_replay(rank); + + ceph_assert(gid == rankgid || gid == followergid); + + utime_t until = ceph_clock_now(); + until += g_conf().get_val<double>("mon_mds_blocklist_interval"); + const auto blocklist_epoch = mon.osdmon()->blocklist(rankinfo.addrs, until); + if (followergid != MDS_GID_NONE) { + fail_mds_gid(pending, followergid); + last_beacon.erase(followergid); + } + request_proposal(mon.osdmon()); + pending.damaged(rankgid, blocklist_epoch); + last_beacon.erase(rankgid); + + /* MDS expects beacon reply back */ + } else { + if (info.state != MDSMap::STATE_ACTIVE && state == MDSMap::STATE_ACTIVE) { + const auto &fscid = pending.mds_roles.at(gid); + const auto &fs = pending.get_filesystem(fscid); + mon.clog->info() << info.human_name() << " is now active in " + << "filesystem " << fs->mds_map.fs_name << " as rank " + << info.rank; + } + + // Made it through special cases and validations, record the + // daemon's reported state to the FSMap. 
+ pending.modify_daemon(gid, [state, seq](auto& info) { + info.state = state; + info.state_seq = seq; + }); + } + } + + dout(5) << "prepare_beacon pending map now:" << dendl; + print_map(pending); + + wait_for_finished_proposal(op, new LambdaContext([op, this](int r){ + if (r >= 0) + _updated(op); // success + else if (r == -ECANCELED) { + mon.no_reply(op); + } else { + dispatch(op); // try again + } + })); + + return true; + +evict: + if (!mon.osdmon()->is_writeable()) { + dout(1) << __func__ << ": waiting for writeable OSDMap to evict" << dendl; + mon.osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op)); + return false; + } + + fail_mds_gid(pending, gid); + request_proposal(mon.osdmon()); + dout(5) << __func__ << ": pending map now:" << dendl; + print_map(pending); + + goto null; + +null: + wait_for_finished_proposal(op, new LambdaContext([op, this](int r){ + if (r >= 0) { + auto m = make_message<MMDSMap>(mon.monmap->fsid, MDSMap::create_null_mdsmap()); + mon.send_reply(op, m.detach()); + } else { + dispatch(op); // try again + } + })); + + return true; +} + +bool MDSMonitor::prepare_offload_targets(MonOpRequestRef op) +{ + auto &pending = get_pending_fsmap_writeable(); + + op->mark_mdsmon_event(__func__); + auto m = op->get_req<MMDSLoadTargets>(); + mds_gid_t gid = m->global_id; + if (pending.gid_has_rank(gid)) { + dout(10) << "prepare_offload_targets " << gid << " " << m->targets << dendl; + pending.update_export_targets(gid, m->targets); + } else { + dout(10) << "prepare_offload_targets " << gid << " not in map" << dendl; + } + mon.no_reply(op); + return true; +} + +bool MDSMonitor::should_propose(double& delay) +{ + // delegate to PaxosService to assess whether we should propose + return PaxosService::should_propose(delay); +} + +void MDSMonitor::_updated(MonOpRequestRef op) +{ + const auto &fsmap = get_fsmap(); + op->mark_mdsmon_event(__func__); + auto m = op->get_req<MMDSBeacon>(); + dout(10) << "_updated " << m->get_orig_source() << " " << *m << dendl; + mon.clog->debug() << m->get_orig_source() << " " + << m->get_orig_source_addrs() << " " + << ceph_mds_state_name(m->get_state()); + + if (m->get_state() == MDSMap::STATE_STOPPED) { + // send the map manually (they're out of the map, so they won't get it automatic) + auto m = make_message<MMDSMap>(mon.monmap->fsid, MDSMap::create_null_mdsmap()); + mon.send_reply(op, m.detach()); + } else { + auto beacon = make_message<MMDSBeacon>(mon.monmap->fsid, + m->get_global_id(), m->get_name(), fsmap.get_epoch(), + m->get_state(), m->get_seq(), CEPH_FEATURES_SUPPORTED_DEFAULT); + mon.send_reply(op, beacon.detach()); + } +} + +void MDSMonitor::on_active() +{ + tick(); + + if (is_leader()) { + mon.clog->debug() << "fsmap " << get_fsmap(); + } +} + +void MDSMonitor::dump_info(Formatter *f) +{ + f->open_object_section("fsmap"); + get_fsmap().dump(f); + f->close_section(); + + f->dump_unsigned("mdsmap_first_committed", get_first_committed()); + f->dump_unsigned("mdsmap_last_committed", get_last_committed()); +} + +bool MDSMonitor::preprocess_command(MonOpRequestRef op) +{ + op->mark_mdsmon_event(__func__); + auto m = op->get_req<MMonCommand>(); + int r = -1; + bufferlist rdata; + stringstream ss, ds; + + cmdmap_t cmdmap; + if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) { + // ss has reason for failure + string rs = ss.str(); + mon.reply_command(op, -EINVAL, rs, rdata, get_last_committed()); + return true; + } + + string prefix; + cmd_getval(cmdmap, "prefix", prefix); + string format; + cmd_getval(cmdmap, "format", format, string("plain")); 
+  std::unique_ptr<Formatter> f(Formatter::create(format));
+
+  MonSession *session = op->get_session();
+  if (!session) {
+    mon.reply_command(op, -EACCES, "access denied", rdata, get_last_committed());
+    return true;
+  }
+
+  // to use const qualifier filter fsmap beforehand
+  FSMap _fsmap_copy = get_fsmap();
+  _fsmap_copy.filter(session->get_allowed_fs_names());
+  const auto& fsmap = _fsmap_copy;
+
+  if (prefix == "mds stat") {
+    if (f) {
+      f->open_object_section("mds_stat");
+      dump_info(f.get());
+      f->close_section();
+      f->flush(ds);
+    } else {
+      ds << fsmap;
+    }
+    r = 0;
+  } else if (prefix == "mds ok-to-stop") {
+    vector<string> ids;
+    if (!cmd_getval(cmdmap, "ids", ids)) {
+      r = -EINVAL;
+      ss << "must specify mds id";
+      goto out;
+    }
+    if (fsmap.is_any_degraded()) {
+      ss << "one or more filesystems is currently degraded";
+      r = -EBUSY;
+      goto out;
+    }
+    set<mds_gid_t> stopping;
+    for (auto& id : ids) {
+      ostringstream ess;
+      mds_gid_t gid = gid_from_arg(fsmap, id, ess);
+      if (gid == MDS_GID_NONE) {
+        // the mds doesn't exist, but no file systems are unhappy, so losing it
+        // can't have any effect.
+        continue;
+      }
+      stopping.insert(gid);
+    }
+    set<mds_gid_t> active;
+    set<mds_gid_t> standby;
+    for (auto gid : stopping) {
+      if (fsmap.gid_has_rank(gid)) {
+        // ignore standby-replay daemons (at this level)
+        if (!fsmap.is_standby_replay(gid)) {
+          auto standby = fsmap.get_standby_replay(gid);
+          if (standby == MDS_GID_NONE ||
+              stopping.count(standby)) {
+            // no standby-replay, or we're also stopping the standby-replay
+            // for this mds
+            active.insert(gid);
+          }
+        }
+      } else {
+        // net loss of a standby
+        standby.insert(gid);
+      }
+    }
+    if (fsmap.get_num_standby() - standby.size() < active.size()) {
+      r = -EBUSY;
+      ss << "insufficient standby MDS daemons to stop active gids "
+         << stringify(active)
+         << " and/or standby gids " << stringify(standby);
+      goto out;
+    }
+    r = 0;
+    ss << "should be safe to stop " << ids;
+  } else if (prefix == "fs dump") {
+    int64_t epocharg;
+    epoch_t epoch;
+
+    const FSMap *fsmapp = &fsmap;
+    FSMap dummy;
+    if (cmd_getval(cmdmap, "epoch", epocharg)) {
+      epoch = epocharg;
+      bufferlist b;
+      int err = get_version(epoch, b);
+      if (err == -ENOENT) {
+        r = -ENOENT;
+        goto out;
+      } else {
+        ceph_assert(err == 0);
+        ceph_assert(b.length());
+        dummy.decode(b);
+        fsmapp = &dummy;
+      }
+    }
+
+    stringstream ds;
+    if (f != NULL) {
+      f->open_object_section("fsmap");
+      fsmapp->dump(f.get());
+      f->close_section();
+      f->flush(ds);
+      r = 0;
+    } else {
+      fsmapp->print(ds);
+      r = 0;
+    }
+
+    rdata.append(ds);
+    ss << "dumped fsmap epoch " << fsmapp->get_epoch();
+  } else if (prefix == "mds metadata") {
+    if (!f)
+      f.reset(Formatter::create("json-pretty"));
+
+    string who;
+    bool all = !cmd_getval(cmdmap, "who", who);
+    dout(1) << "all = " << all << dendl;
+    if (all) {
+      r = 0;
+      // Dump all MDSs' metadata
+      const auto all_info = fsmap.get_mds_info();
+
+      f->open_array_section("mds_metadata");
+      for (const auto &i : all_info) {
+        const auto &info = i.second;
+
+        f->open_object_section("mds");
+        f->dump_string("name", info.name);
+        std::ostringstream get_err;
+        r = dump_metadata(fsmap, info.name, f.get(), get_err);
+        if (r == -EINVAL || r == -ENOENT) {
+          // Drop error, list what metadata we do have
+          dout(1) << get_err.str() << dendl;
+          r = 0;
+        } else if (r != 0) {
+          derr << "Unexpected error reading metadata: " << cpp_strerror(r)
+               << dendl;
+          ss << get_err.str();
+          f->close_section();
+          break;
+        }
+        f->close_section();
+      }
+      f->close_section();
+    } else {
+      // Dump a single daemon's
metadata + f->open_object_section("mds_metadata"); + r = dump_metadata(fsmap, who, f.get(), ss); + f->close_section(); + } + f->flush(ds); + } else if (prefix == "mds versions") { + if (!f) + f.reset(Formatter::create("json-pretty")); + count_metadata("ceph_version", f.get()); + f->flush(ds); + r = 0; + } else if (prefix == "mds count-metadata") { + if (!f) + f.reset(Formatter::create("json-pretty")); + string field; + cmd_getval(cmdmap, "property", field); + count_metadata(field, f.get()); + f->flush(ds); + r = 0; + } else if (prefix == "fs compat show") { + string fs_name; + cmd_getval(cmdmap, "fs_name", fs_name); + const auto &fs = fsmap.get_filesystem(fs_name); + if (fs == nullptr) { + ss << "filesystem '" << fs_name << "' not found"; + r = -ENOENT; + goto out; + } + + if (f) { + f->open_object_section("mds_compat"); + fs->mds_map.compat.dump(f.get()); + f->close_section(); + f->flush(ds); + } else { + ds << fs->mds_map.compat; + } + r = 0; + } else if (prefix == "mds compat show") { + if (f) { + f->open_object_section("mds_compat"); + fsmap.default_compat.dump(f.get()); + f->close_section(); + f->flush(ds); + } else { + ds << fsmap.default_compat; + } + r = 0; + } else if (prefix == "fs get") { + string fs_name; + cmd_getval(cmdmap, "fs_name", fs_name); + const auto &fs = fsmap.get_filesystem(fs_name); + if (fs == nullptr) { + ss << "filesystem '" << fs_name << "' not found"; + r = -ENOENT; + } else { + if (f != nullptr) { + f->open_object_section("filesystem"); + fs->dump(f.get()); + f->close_section(); + f->flush(ds); + r = 0; + } else { + fs->print(ds); + r = 0; + } + } + } else if (prefix == "fs ls") { + if (f) { + f->open_array_section("filesystems"); + for (const auto &p : fsmap.filesystems) { + const auto &fs = p.second; + f->open_object_section("filesystem"); + { + const MDSMap &mds_map = fs->mds_map; + f->dump_string("name", mds_map.fs_name); + /* Output both the names and IDs of pools, for use by + * humans and machines respectively */ + f->dump_string("metadata_pool", mon.osdmon()->osdmap.get_pool_name( + mds_map.metadata_pool)); + f->dump_int("metadata_pool_id", mds_map.metadata_pool); + f->open_array_section("data_pool_ids"); + for (const auto &id : mds_map.data_pools) { + f->dump_int("data_pool_id", id); + } + f->close_section(); + + f->open_array_section("data_pools"); + for (const auto &id : mds_map.data_pools) { + const auto &name = mon.osdmon()->osdmap.get_pool_name(id); + f->dump_string("data_pool", name); + } + f->close_section(); + } + f->close_section(); + } + f->close_section(); + f->flush(ds); + } else { + for (const auto &p : fsmap.filesystems) { + const auto &fs = p.second; + const MDSMap &mds_map = fs->mds_map; + const string &md_pool_name = mon.osdmon()->osdmap.get_pool_name( + mds_map.metadata_pool); + + ds << "name: " << mds_map.fs_name << ", metadata pool: " + << md_pool_name << ", data pools: ["; + for (const auto &id : mds_map.data_pools) { + const string &pool_name = mon.osdmon()->osdmap.get_pool_name(id); + ds << pool_name << " "; + } + ds << "]" << std::endl; + } + + if (fsmap.filesystems.empty()) { + ds << "No filesystems enabled" << std::endl; + } + } + r = 0; + } else if (prefix == "fs feature ls") { + if (f) { + f->open_array_section("cephfs_features"); + for (size_t i = 0; i <= CEPHFS_FEATURE_MAX; ++i) { + f->open_object_section("feature"); + f->dump_int("index", i); + f->dump_string("name", cephfs_feature_name(i)); + f->close_section(); + } + f->close_section(); + f->flush(ds); + } else { + for (size_t i = 0; i <= CEPHFS_FEATURE_MAX; ++i) { + 
ds << i << " " << cephfs_feature_name(i) << std::endl;
+      }
+    }
+    r = 0;
+  }
+
+out:
+  if (r != -1) {
+    rdata.append(ds);
+    string rs;
+    getline(ss, rs);
+    mon.reply_command(op, r, rs, rdata, get_last_committed());
+    return true;
+  } else
+    return false;
+}
+
+bool MDSMonitor::fail_mds_gid(FSMap &fsmap, mds_gid_t gid)
+{
+  const auto& info = fsmap.get_info_gid(gid);
+  dout(1) << "fail_mds_gid " << gid << " mds." << info.name << " role " << info.rank << dendl;
+
+  ceph_assert(mon.osdmon()->is_writeable());
+
+  epoch_t blocklist_epoch = 0;
+  if (info.rank >= 0 && info.state != MDSMap::STATE_STANDBY_REPLAY) {
+    utime_t until = ceph_clock_now();
+    until += g_conf().get_val<double>("mon_mds_blocklist_interval");
+    blocklist_epoch = mon.osdmon()->blocklist(info.addrs, until);
+  }
+
+  fsmap.erase(gid, blocklist_epoch);
+  last_beacon.erase(gid);
+  if (pending_daemon_health.count(gid)) {
+    pending_daemon_health.erase(gid);
+    pending_daemon_health_rm.insert(gid);
+  }
+
+  return blocklist_epoch != 0;
+}
+
+mds_gid_t MDSMonitor::gid_from_arg(const FSMap &fsmap, const std::string &arg, std::ostream &ss)
+{
+  // Try parsing as a role
+  mds_role_t role;
+  std::ostringstream ignore_err;  // Don't spam 'ss' with parse_role errors
+  int r = fsmap.parse_role(arg, &role, ignore_err);
+  if (r == 0) {
+    // See if a GID is assigned to this role
+    const auto &fs = fsmap.get_filesystem(role.fscid);
+    ceph_assert(fs != nullptr);  // parse_role ensures it exists
+    if (fs->mds_map.is_up(role.rank)) {
+      dout(10) << __func__ << ": validated rank/GID " << role
+               << " as a rank" << dendl;
+      return fs->mds_map.get_mds_info(role.rank).global_id;
+    }
+  }
+
+  // Try parsing as a gid
+  std::string err;
+  unsigned long long maybe_gid = strict_strtoll(arg.c_str(), 10, &err);
+  if (!err.empty()) {
+    // Not a role or a GID, try as a daemon name
+    const MDSMap::mds_info_t *mds_info = fsmap.find_by_name(arg);
+    if (!mds_info) {
+      ss << "MDS named '" << arg
+         << "' does not exist, or is not up";
+      return MDS_GID_NONE;
+    }
+    dout(10) << __func__ << ": resolved MDS name '" << arg
+             << "' to GID " << mds_info->global_id << dendl;
+    return mds_info->global_id;
+  } else {
+    // Not a role, but parses as an integer, so it might be a GID
+    dout(10) << __func__ << ": treating MDS reference '" << arg
+             << "' as an integer " << maybe_gid << dendl;
+
+    if (fsmap.gid_exists(mds_gid_t(maybe_gid))) {
+      return mds_gid_t(maybe_gid);
+    }
+  }
+
+  dout(1) << __func__ << ": rank/GID " << arg
+          << " is not an existing rank or GID" << dendl;
+  return MDS_GID_NONE;
+}
+
+int MDSMonitor::fail_mds(FSMap &fsmap, std::ostream &ss,
+                         const std::string &arg, MDSMap::mds_info_t *failed_info)
+{
+  ceph_assert(failed_info != nullptr);
+
+  mds_gid_t gid = gid_from_arg(fsmap, arg, ss);
+  if (gid == MDS_GID_NONE) {
+    return 0;
+  }
+  if (!mon.osdmon()->is_writeable()) {
+    return -EAGAIN;
+  }
+
+  // Take a copy of the info before removing the MDS from the map,
+  // so that the caller knows which mds (if any) they ended up removing.
+ *failed_info = fsmap.get_info_gid(gid); + + fail_mds_gid(fsmap, gid); + ss << "failed mds gid " << gid; + ceph_assert(mon.osdmon()->is_writeable()); + request_proposal(mon.osdmon()); + return 0; +} + +bool MDSMonitor::prepare_command(MonOpRequestRef op) +{ + op->mark_mdsmon_event(__func__); + auto m = op->get_req<MMonCommand>(); + int r = -EINVAL; + stringstream ss; + bufferlist rdata; + + cmdmap_t cmdmap; + if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) { + string rs = ss.str(); + mon.reply_command(op, -EINVAL, rs, rdata, get_last_committed()); + return true; + } + + string prefix; + cmd_getval(cmdmap, "prefix", prefix); + + /* Refuse access if message not associated with a valid session */ + MonSession *session = op->get_session(); + if (!session) { + mon.reply_command(op, -EACCES, "access denied", rdata, get_last_committed()); + return true; + } + + auto &pending = get_pending_fsmap_writeable(); + + bool batched_propose = false; + for (const auto &h : handlers) { + r = h->can_handle(prefix, op, pending, cmdmap, ss); + if (r == 1) { + ; // pass, since we got the right handler. + } else if (r == 0) { + continue; + } else { + goto out; + } + + batched_propose = h->batched_propose(); + if (batched_propose) { + paxos.plug(); + } + r = h->handle(&mon, pending, op, cmdmap, ss); + if (batched_propose) { + paxos.unplug(); + } + + if (r == -EAGAIN) { + // message has been enqueued for retry; return. + dout(4) << __func__ << " enqueue for retry by prepare_command" << dendl; + return false; + } else { + if (r == 0) { + // On successful updates, print the updated map + print_map(pending); + } + // Successful or not, we're done: respond. + goto out; + } + } + + r = filesystem_command(pending, op, prefix, cmdmap, ss); + if (r >= 0) { + goto out; + } else if (r == -EAGAIN) { + // Do not reply, the message has been enqueued for retry + dout(4) << __func__ << " enqueue for retry by filesystem_command" << dendl; + return false; + } else if (r != -ENOSYS) { + goto out; + } + + if (r == -ENOSYS && ss.str().empty()) { + ss << "unrecognized command"; + } + +out: + dout(4) << __func__ << " done, r=" << r << dendl; + /* Compose response */ + string rs; + getline(ss, rs); + + if (r >= 0) { + // success.. 
delay reply
+    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, r, rs,
+                                              get_last_committed() + 1));
+    if (batched_propose) {
+      force_immediate_propose();
+    }
+    return true;
+  } else {
+    // reply immediately
+    mon.reply_command(op, r, rs, rdata, get_last_committed());
+    return false;
+  }
+}
+
+int MDSMonitor::filesystem_command(
+    FSMap &fsmap,
+    MonOpRequestRef op,
+    std::string const &prefix,
+    const cmdmap_t& cmdmap,
+    std::stringstream &ss)
+{
+  dout(4) << __func__ << " prefix='" << prefix << "'" << dendl;
+  op->mark_mdsmon_event(__func__);
+  int r = 0;
+  string whostr;
+  cmd_getval(cmdmap, "role", whostr);
+
+  if (prefix == "mds set_state") {
+    mds_gid_t gid;
+    if (!cmd_getval(cmdmap, "gid", gid)) {
+      ss << "error parsing 'gid' value '"
+         << cmd_vartype_stringify(cmdmap.at("gid")) << "'";
+      return -EINVAL;
+    }
+    MDSMap::DaemonState state;
+    if (!cmd_getval(cmdmap, "state", state)) {
+      ss << "error parsing 'state' string value '"
+         << cmd_vartype_stringify(cmdmap.at("state")) << "'";
+      return -EINVAL;
+    }
+    if (fsmap.gid_exists(gid, op->get_session()->get_allowed_fs_names())) {
+      fsmap.modify_daemon(gid, [state](auto& info) {
+        info.state = state;
+      });
+      ss << "set mds gid " << gid << " to state " << state << " "
+         << ceph_mds_state_name(state);
+      return 0;
+    }
+  } else if (prefix == "mds fail") {
+    string who;
+    cmd_getval(cmdmap, "role_or_gid", who);
+
+    MDSMap::mds_info_t failed_info;
+    mds_gid_t gid = gid_from_arg(fsmap, who, ss);
+    if (gid == MDS_GID_NONE) {
+      ss << "MDS named '" << who << "' does not exist, is not up, or you "
+         << "lack permission to see it.";
+      return 0;
+    }
+    if (!fsmap.gid_exists(gid, op->get_session()->get_allowed_fs_names())) {
+      ss << "MDS named '" << who << "' does not exist, is not up, or you "
+         << "lack permission to see it.";
+      return -EINVAL;
+    }
+    string_view fs_name = fsmap.fs_name_from_gid(gid);
+    if (!op->get_session()->fs_name_capable(fs_name, MON_CAP_W)) {
+      ss << "Permission denied.";
+      return -EPERM;
+    }
+
+    r = fail_mds(fsmap, ss, who, &failed_info);
+    if (r == -EAGAIN) {
+      mon.osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
+      return -EAGAIN;  // don't propose yet; wait for message to be retried
+    } else if (r == 0) {
+      // Only log if we really did something (not when it was already gone)
+      if (failed_info.global_id != MDS_GID_NONE) {
+        mon.clog->info() << failed_info.human_name() << " marked failed by "
+                         << op->get_session()->entity_name;
+      }
+    }
+  } else if (prefix == "mds rm") {
+    mds_gid_t gid;
+    if (!cmd_getval(cmdmap, "gid", gid)) {
+      ss << "error parsing 'gid' value '"
+         << cmd_vartype_stringify(cmdmap.at("gid")) << "'";
+      return -EINVAL;
+    }
+    if (!fsmap.gid_exists(gid, op->get_session()->get_allowed_fs_names())) {
+      ss << "mds gid " << gid << " does not exist";
+      return 0;
+    }
+    string_view fs_name = fsmap.fs_name_from_gid(gid);
+    if (!op->get_session()->fs_name_capable(fs_name, MON_CAP_W)) {
+      ss << "Permission denied.";
+      return -EPERM;
+    }
+    const auto &info = fsmap.get_info_gid(gid);
+    MDSMap::DaemonState state = info.state;
+    if (state > 0) {
+      ss << "cannot remove active mds." << info.name
+         << " rank " << info.rank;
+      return -EBUSY;
+    } else {
+      fsmap.erase(gid, {});
+      ss << "removed mds gid " << gid;
+      return 0;
+    }
+  } else if (prefix == "mds rmfailed") {
+    bool confirm = false;
+    cmd_getval(cmdmap, "yes_i_really_mean_it", confirm);
+    if (!confirm) {
+      ss << "WARNING: this can make your filesystem inaccessible! 
" + "Add --yes-i-really-mean-it if you are sure you wish to continue."; + return -EPERM; + } + + std::string role_str; + cmd_getval(cmdmap, "role", role_str); + mds_role_t role; + const auto fs_names = op->get_session()->get_allowed_fs_names(); + int r = fsmap.parse_role(role_str, &role, ss, fs_names); + if (r < 0) { + ss << "invalid role '" << role_str << "'"; + return -EINVAL; + } + string_view fs_name = fsmap.get_filesystem(role.fscid)->mds_map.get_fs_name(); + if (!op->get_session()->fs_name_capable(fs_name, MON_CAP_W)) { + ss << "Permission denied."; + return -EPERM; + } + + fsmap.modify_filesystem( + role.fscid, + [role](std::shared_ptr<Filesystem> fs) + { + fs->mds_map.failed.erase(role.rank); + }); + + ss << "removed failed mds." << role; + return 0; + /* TODO: convert to fs commands to update defaults */ + } else if (prefix == "mds compat rm_compat") { + int64_t f; + if (!cmd_getval(cmdmap, "feature", f)) { + ss << "error parsing feature value '" + << cmd_vartype_stringify(cmdmap.at("feature")) << "'"; + return -EINVAL; + } + if (fsmap.default_compat.compat.contains(f)) { + ss << "removing compat feature " << f; + fsmap.default_compat.compat.remove(f); + } else { + ss << "compat feature " << f << " not present in " << fsmap.default_compat; + } + r = 0; + } else if (prefix == "mds compat rm_incompat") { + int64_t f; + if (!cmd_getval(cmdmap, "feature", f)) { + ss << "error parsing feature value '" + << cmd_vartype_stringify(cmdmap.at("feature")) << "'"; + return -EINVAL; + } + if (fsmap.default_compat.incompat.contains(f)) { + ss << "removing incompat feature " << f; + fsmap.default_compat.incompat.remove(f); + } else { + ss << "incompat feature " << f << " not present in " << fsmap.default_compat; + } + r = 0; + } else if (prefix == "mds repaired") { + std::string role_str; + cmd_getval(cmdmap, "role", role_str); + mds_role_t role; + const auto fs_names = op->get_session()->get_allowed_fs_names(); + r = fsmap.parse_role(role_str, &role, ss, fs_names); + if (r < 0) { + return r; + } + string_view fs_name = fsmap.get_filesystem(role.fscid)->mds_map.get_fs_name(); + if (!op->get_session()->fs_name_capable(fs_name, MON_CAP_W)) { + ss << "Permission denied."; + return -EPERM; + } + + bool modified = fsmap.undamaged(role.fscid, role.rank); + if (modified) { + ss << "repaired: restoring rank " << role; + } else { + ss << "nothing to do: rank is not damaged"; + } + + r = 0; + } else if (prefix == "mds freeze") { + std::string who; + cmd_getval(cmdmap, "role_or_gid", who); + mds_gid_t gid = gid_from_arg(fsmap, who, ss); + if (gid == MDS_GID_NONE) { + return -EINVAL; + } + + string_view fs_name = fsmap.fs_name_from_gid(gid); + if (!op->get_session()->fs_name_capable(fs_name, MON_CAP_W)) { + ss << "Permission denied."; + return -EPERM; + } + + bool freeze = false; + { + std::string str; + cmd_getval(cmdmap, "val", str); + if ((r = parse_bool(str, &freeze, ss)) != 0) { + return r; + } + } + + auto f = [freeze,gid,&ss](auto& info) { + if (freeze) { + ss << "freezing mds." << gid; + info.freeze(); + } else { + ss << "unfreezing mds." << gid; + info.unfreeze(); + } + }; + fsmap.modify_daemon(gid, f); + r = 0; + } else { + return -ENOSYS; + } + + return r; +} + +void MDSMonitor::check_subs() +{ + // Subscriptions may be to "mdsmap" (MDS and legacy clients), + // "mdsmap.<namespace>", or to "fsmap" for the full state of all + // filesystems. Build a list of all the types we service + // subscriptions for. 
+
+  std::vector<std::string> types = {
+    "fsmap",
+    "fsmap.user",
+    "mdsmap",
+  };
+
+  for (const auto &p : get_fsmap().filesystems) {
+    const auto &fscid = p.first;
+    CachedStackStringStream cos;
+    *cos << "mdsmap." << fscid;
+    types.push_back(std::string(cos->strv()));
+  }
+
+  for (const auto &type : types) {
+    auto& subs = mon.session_map.subs;
+    auto subs_it = subs.find(type);
+    if (subs_it == subs.end())
+      continue;
+    auto sub_it = subs_it->second->begin();
+    while (!sub_it.end()) {
+      auto sub = *sub_it;
+      ++sub_it; // N.B. check_sub may remove sub!
+      check_sub(sub);
+    }
+  }
+}
+
+
+void MDSMonitor::check_sub(Subscription *sub)
+{
+  dout(20) << __func__ << ": " << sub->type << dendl;
+
+  // Filter the fsmap up front so the rest of this function can work on a
+  // const reference restricted to what this session is allowed to see.
+  FSMap _fsmap_copy = get_fsmap();
+  _fsmap_copy.filter(sub->session->get_allowed_fs_names());
+  const auto& fsmap = _fsmap_copy;
+  if (sub->next > fsmap.get_epoch()) {
+    return;
+  }
+
+  if (sub->type == "fsmap") {
+    sub->session->con->send_message(new MFSMap(mon.monmap->fsid, fsmap));
+    if (sub->onetime) {
+      mon.session_map.remove_sub(sub);
+    } else {
+      sub->next = fsmap.get_epoch() + 1;
+    }
+  } else if (sub->type == "fsmap.user") {
+    FSMapUser fsmap_u;
+    fsmap_u.epoch = fsmap.get_epoch();
+    fsmap_u.legacy_client_fscid = fsmap.legacy_client_fscid;
+    for (const auto &p : fsmap.filesystems) {
+      FSMapUser::fs_info_t& fs_info = fsmap_u.filesystems[p.second->fscid];
+      fs_info.cid = p.second->fscid;
+      fs_info.name = p.second->mds_map.fs_name;
+    }
+    sub->session->con->send_message(new MFSMapUser(mon.monmap->fsid, fsmap_u));
+    if (sub->onetime) {
+      mon.session_map.remove_sub(sub);
+    } else {
+      sub->next = fsmap.get_epoch() + 1;
+    }
+  } else if (sub->type.compare(0, 6, "mdsmap") == 0) {
+    const bool is_mds = sub->session->name.is_mds();
+    mds_gid_t mds_gid = MDS_GID_NONE;
+    fs_cluster_id_t fscid = FS_CLUSTER_ID_NONE;
+    if (is_mds) {
+      // What (if any) namespace are you assigned to?
+      auto mds_info = fsmap.get_mds_info();
+      for (const auto &p : mds_info) {
+        if (p.second.addrs == sub->session->addrs) {
+          mds_gid = p.first;
+          fscid = fsmap.mds_roles.at(mds_gid);
+        }
+      }
+    } else {
+      // You're a client.  Did you request a particular
+      // namespace?
+      if (sub->type.compare(0, 7, "mdsmap.") == 0) {
+        auto namespace_id_str = sub->type.substr(std::string("mdsmap.").size());
+        dout(10) << __func__ << ": namespace_id " << namespace_id_str << dendl;
+        std::string err;
+        fscid = strict_strtoll(namespace_id_str.c_str(), 10, &err);
+        if (!err.empty()) {
+          // Client asked for a non-existent namespace, send them nothing
+          dout(1) << "Invalid client subscription '" << sub->type
+                  << "'" << dendl;
+          return;
+        }
+      } else {
+        // Unqualified request for "mdsmap": give it the one marked
+        // for use by legacy clients.
+        if (fsmap.legacy_client_fscid != FS_CLUSTER_ID_NONE) {
+          fscid = fsmap.legacy_client_fscid;
+        } else {
+          dout(1) << "Client subscribed for legacy filesystem but "
+                     "none is configured" << dendl;
+          return;
+        }
+      }
+      if (!fsmap.filesystem_exists(fscid)) {
+        // Client asked for a non-existent namespace, send them nothing
+        // TODO: something more graceful for when a client has a filesystem
+        // mounted, and the filesystem is deleted.  Add a "shut down you fool"
+        // flag to MMDSMap?
+ dout(1) << "Client subscribed to non-existent namespace '" << + fscid << "'" << dendl; + return; + } + } + dout(10) << __func__ << ": is_mds=" << is_mds << ", fscid= " << fscid << dendl; + + // Work out the effective latest epoch + const MDSMap *mds_map = nullptr; + MDSMap null_map = MDSMap::create_null_mdsmap(); + if (fscid == FS_CLUSTER_ID_NONE) { + // For a client, we should have already dropped out + ceph_assert(is_mds); + + auto it = fsmap.standby_daemons.find(mds_gid); + if (it != fsmap.standby_daemons.end()) { + // For an MDS, we need to feed it an MDSMap with its own state in + null_map.mds_info[mds_gid] = it->second; + null_map.epoch = fsmap.standby_epochs.at(mds_gid); + } else { + null_map.epoch = fsmap.epoch; + } + mds_map = &null_map; + } else { + // Check the effective epoch + mds_map = &fsmap.get_filesystem(fscid)->mds_map; + } + + ceph_assert(mds_map != nullptr); + dout(10) << __func__ << " selected MDS map epoch " << + mds_map->epoch << " for namespace " << fscid << " for subscriber " + << sub->session->name << " who wants epoch " << sub->next << dendl; + + if (sub->next > mds_map->epoch) { + return; + } + auto msg = make_message<MMDSMap>(mon.monmap->fsid, *mds_map, + mds_map->fs_name); + + sub->session->con->send_message(msg.detach()); + if (sub->onetime) { + mon.session_map.remove_sub(sub); + } else { + sub->next = mds_map->get_epoch() + 1; + } + } +} + + +void MDSMonitor::update_metadata(mds_gid_t gid, + const map<string, string>& metadata) +{ + if (metadata.empty()) { + return; + } + pending_metadata[gid] = metadata; + + MonitorDBStore::TransactionRef t = paxos.get_pending_transaction(); + bufferlist bl; + encode(pending_metadata, bl); + t->put(MDS_METADATA_PREFIX, "last_metadata", bl); + paxos.trigger_propose(); +} + +void MDSMonitor::remove_from_metadata(const FSMap &fsmap, MonitorDBStore::TransactionRef t) +{ + bool update = false; + for (auto it = pending_metadata.begin(); it != pending_metadata.end(); ) { + if (!fsmap.gid_exists(it->first)) { + it = pending_metadata.erase(it); + update = true; + } else { + ++it; + } + } + if (!update) + return; + bufferlist bl; + encode(pending_metadata, bl); + t->put(MDS_METADATA_PREFIX, "last_metadata", bl); +} + +int MDSMonitor::load_metadata(map<mds_gid_t, Metadata>& m) +{ + bufferlist bl; + int r = mon.store->get(MDS_METADATA_PREFIX, "last_metadata", bl); + if (r) { + dout(5) << "Unable to load 'last_metadata'" << dendl; + return r; + } + + auto it = bl.cbegin(); + ceph::decode(m, it); + return 0; +} + +void MDSMonitor::count_metadata(const std::string &field, map<string,int> *out) +{ + map<mds_gid_t,Metadata> meta; + load_metadata(meta); + for (auto& p : meta) { + auto q = p.second.find(field); + if (q == p.second.end()) { + (*out)["unknown"]++; + } else { + (*out)[q->second]++; + } + } +} + +void MDSMonitor::count_metadata(const std::string &field, Formatter *f) +{ + map<string,int> by_val; + count_metadata(field, &by_val); + f->open_object_section(field.c_str()); + for (auto& p : by_val) { + f->dump_int(p.first.c_str(), p.second); + } + f->close_section(); +} + +void MDSMonitor::get_versions(std::map<string, list<string> > &versions) +{ + map<mds_gid_t,Metadata> meta; + load_metadata(meta); + const auto &fsmap = get_fsmap(); + std::map<mds_gid_t, mds_info_t> map = fsmap.get_mds_info(); + dout(10) << __func__ << " mds meta=" << meta << dendl; + for (auto& p : meta) { + auto q = p.second.find("ceph_version_short"); + if (q == p.second.end()) continue; + versions[q->second].push_back(string("mds.") + map[p.first].name); + } 
+}
+
+int MDSMonitor::dump_metadata(const FSMap& fsmap, const std::string &who,
+    Formatter *f, ostream& err)
+{
+  ceph_assert(f);
+
+  mds_gid_t gid = gid_from_arg(fsmap, who, err);
+  if (gid == MDS_GID_NONE) {
+    return -EINVAL;
+  }
+
+  map<mds_gid_t, Metadata> metadata;
+  if (int r = load_metadata(metadata)) {
+    err << "Unable to load 'last_metadata'";
+    return r;
+  }
+
+  if (!metadata.count(gid)) {
+    return -ENOENT;
+  }
+  const Metadata& m = metadata[gid];
+  for (Metadata::const_iterator p = m.begin(); p != m.end(); ++p) {
+    f->dump_string(p->first.c_str(), p->second);
+  }
+  return 0;
+}
+
+int MDSMonitor::print_nodes(Formatter *f)
+{
+  ceph_assert(f);
+
+  const auto &fsmap = get_fsmap();
+
+  map<mds_gid_t, Metadata> metadata;
+  if (int r = load_metadata(metadata)) {
+    return r;
+  }
+
+  map<string, list<string> > mdses; // hostname => mds
+  for (const auto &p : metadata) {
+    const mds_gid_t& gid = p.first;
+    const Metadata& m = p.second;
+    Metadata::const_iterator hostname = m.find("hostname");
+    if (hostname == m.end()) {
+      // not likely though
+      continue;
+    }
+    if (!fsmap.gid_exists(gid)) {
+      dout(5) << __func__ << ": GID " << gid << " does not exist" << dendl;
+      continue;
+    }
+    const MDSMap::mds_info_t& mds_info = fsmap.get_info_gid(gid);
+    mdses[hostname->second].push_back(mds_info.name);
+  }
+
+  dump_services(f, mdses, "mds");
+  return 0;
+}
+
+/**
+ * If a cluster is undersized (with respect to max_mds), then
+ * attempt to find daemons to grow it. If the cluster is oversized
+ * (with respect to max_mds) then shrink it by stopping its highest rank.
+ */
+bool MDSMonitor::maybe_resize_cluster(FSMap &fsmap, fs_cluster_id_t fscid)
+{
+  auto &current_mds_map = get_fsmap().get_filesystem(fscid)->mds_map;
+  auto&& fs = fsmap.get_filesystem(fscid);
+  auto &mds_map = fs->mds_map;
+
+  int in = mds_map.get_num_in_mds();
+  int max = mds_map.get_max_mds();
+
+  dout(20) << __func__ << " in " << in << " max " << max << dendl;
+
+  /* Check that both the current epoch's mds_map and the pending batch of
+   * changes are resizeable. This is important if an MDS is
+   * becoming active in the next epoch.
+   */
+  if (!current_mds_map.is_resizeable() ||
+      !mds_map.is_resizeable()) {
+    dout(5) << __func__ << " mds_map is not currently resizeable" << dendl;
+    return false;
+  }
+
+  if (in < max && !mds_map.test_flag(CEPH_MDSMAP_NOT_JOINABLE)) {
+    mds_rank_t mds = mds_rank_t(0);
+    while (mds_map.is_in(mds)) {
+      mds++;
+    }
+    auto info = fsmap.find_replacement_for({fscid, mds});
+    if (!info) {
+      return false;
+    }
+
+    dout(1) << "assigned standby " << info->addrs
+            << " as mds." << mds << dendl;
+    mon.clog->info() << info->human_name() << " assigned to "
+        "filesystem " << mds_map.fs_name << " as rank "
+        << mds << " (now has " << mds_map.get_num_in_mds() + 1
+        << " ranks)";
+    fsmap.promote(info->global_id, *fs, mds);
+    return true;
+  } else if (in > max) {
+    mds_rank_t target = in - 1;
+    const auto &info = mds_map.get_info(target);
+    if (mds_map.is_active(target)) {
+      dout(1) << "stopping " << target << dendl;
+      mon.clog->info() << "stopping " << info.human_name();
+      auto f = [](auto& info) {
+        info.state = MDSMap::STATE_STOPPING;
+      };
+      fsmap.modify_daemon(info.global_id, f);
+      return true;
+    } else {
+      dout(20) << "skipping stop of " << target << dendl;
+      return false;
+    }
+  }
+
+  return false;
+}
+
+
+/**
+ * Fail a daemon and replace it with a suitable standby.
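+ * Returns true if the FSMap was modified; *osd_propose is additionally set
+ * when an OSD blocklist was queued, in which case the OSDMonitor needs a
+ * proposal as well.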
+ */ +bool MDSMonitor::drop_mds(FSMap &fsmap, mds_gid_t gid, const mds_info_t* rep_info, bool *osd_propose) +{ + ceph_assert(osd_propose != nullptr); + + const auto fscid = fsmap.mds_roles.at(gid); + const auto& info = fsmap.get_info_gid(gid); + const auto rank = info.rank; + const auto state = info.state; + + if (info.is_frozen()) { + return false; + } else if (state == MDSMap::STATE_STANDBY_REPLAY || + state == MDSMap::STATE_STANDBY) { + dout(1) << " failing and removing standby " << gid << " " << info.addrs + << " mds." << rank + << "." << info.inc << " " << ceph_mds_state_name(state) + << dendl; + *osd_propose |= fail_mds_gid(fsmap, gid); + return true; + } else if (rank >= 0 && rep_info) { + auto fs = fsmap.filesystems.at(fscid); + if (fs->mds_map.test_flag(CEPH_MDSMAP_NOT_JOINABLE)) { + return false; + } + // are we in? + // and is there a non-laggy standby that can take over for us? + dout(1) << " replacing " << gid << " " << info.addrs + << " mds." << rank << "." << info.inc + << " " << ceph_mds_state_name(state) + << " with " << rep_info->global_id << "/" << rep_info->name << " " << rep_info->addrs + << dendl; + + mon.clog->warn() << "Replacing " << info.human_name() + << " as rank " << rank + << " with standby " << rep_info->human_name(); + + // Remove the old one + *osd_propose |= fail_mds_gid(fsmap, gid); + + // Promote the replacement + fsmap.promote(rep_info->global_id, *fs, rank); + + return true; + } + return false; +} + +bool MDSMonitor::check_health(FSMap& fsmap, bool* propose_osdmap) +{ + bool do_propose = false; + const auto now = mono_clock::now(); + const bool osdmap_writeable = mon.osdmon()->is_writeable(); + const auto mds_beacon_grace = g_conf().get_val<double>("mds_beacon_grace"); + const auto mds_beacon_interval = g_conf().get_val<double>("mds_beacon_interval"); + + if (mono_clock::is_zero(last_tick)) { + last_tick = now; + } + + { + auto since_last = std::chrono::duration<double>(now-last_tick); + + if (since_last.count() > (mds_beacon_grace-mds_beacon_interval)) { + // This case handles either local slowness (calls being delayed + // for whatever reason) or cluster election slowness (a long gap + // between calls while an election happened) + dout(1) << __func__ << ": resetting beacon timeouts due to mon delay " + "(slow election?) of " << since_last.count() << " seconds" << dendl; + for (auto& p : last_beacon) { + p.second.stamp = now; + } + } + } + + // make sure last_beacon is fully populated + for (auto& p : fsmap.mds_roles) { + auto& gid = p.first; + last_beacon.emplace(std::piecewise_construct, + std::forward_as_tuple(gid), + std::forward_as_tuple(now, 0)); + } + + // We will only take decisive action (replacing/removing a daemon) + // if we have some indication that some other daemon(s) are successfully + // getting beacons through recently. 
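+  // If every tracked beacon is stale, the cause is more likely on our side
+  // (e.g. a mon election or a local stall) than all MDS daemons failing at
+  // once, so replacement is deferred; see may_replace below.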
+ mono_time latest_beacon = mono_clock::zero(); + for (const auto& p : last_beacon) { + latest_beacon = std::max(p.second.stamp, latest_beacon); + } + auto since = std::chrono::duration<double>(now-latest_beacon); + const bool may_replace = since.count() < + std::max(g_conf()->mds_beacon_interval, g_conf()->mds_beacon_grace * 0.5); + + // check beacon timestamps + std::vector<mds_gid_t> to_remove; + const bool mon_down = mon.is_mon_down(); + const auto mds_beacon_mon_down_grace = + g_conf().get_val<std::chrono::seconds>("mds_beacon_mon_down_grace"); + const auto quorum_age = std::chrono::seconds(mon.quorum_age()); + const bool new_quorum = quorum_age < mds_beacon_mon_down_grace; + for (auto it = last_beacon.begin(); it != last_beacon.end(); ) { + auto& [gid, beacon_info] = *it; + auto since_last = std::chrono::duration<double>(now-beacon_info.stamp); + + if (!fsmap.gid_exists(gid)) { + // gid no longer exists, remove from tracked beacons + it = last_beacon.erase(it); + continue; + } + + if (since_last.count() >= g_conf()->mds_beacon_grace) { + auto& info = fsmap.get_info_gid(gid); + dout(1) << "no beacon from mds." << info.rank << "." << info.inc + << " (gid: " << gid << " addr: " << info.addrs + << " state: " << ceph_mds_state_name(info.state) << ")" + << " since " << since_last.count() << dendl; + if ((mon_down || new_quorum) && since_last < mds_beacon_mon_down_grace) { + /* The MDS may be sending beacons to a monitor not yet in quorum or + * temporarily partitioned. Hold off on removal for a little longer... + */ + dout(10) << "deferring removal for mds_beacon_mon_down_grace during MON_DOWN" << dendl; + ++it; + continue; + } + // If the OSDMap is writeable, we can blocklist things, so we can + // try failing any laggy MDS daemons. Consider each one for failure. + if (!info.laggy()) { + dout(1) << " marking " << gid << " " << info.addrs + << " mds." << info.rank << "." << info.inc + << " " << ceph_mds_state_name(info.state) + << " laggy" << dendl; + fsmap.modify_daemon(info.global_id, [](auto& info) { + info.laggy_since = ceph_clock_now(); + }); + do_propose = true; + } + if (osdmap_writeable && may_replace) { + to_remove.push_back(gid); // drop_mds may invalidate iterator + } + } + + ++it; + } + + for (const auto& gid : to_remove) { + auto info = fsmap.get_info_gid(gid); + const mds_info_t* rep_info = nullptr; + if (info.rank >= 0) { + auto fscid = fsmap.fscid_from_gid(gid); + rep_info = fsmap.find_replacement_for({fscid, info.rank}); + } + bool dropped = drop_mds(fsmap, gid, rep_info, propose_osdmap); + if (dropped) { + mon.clog->info() << "MDS " << info.human_name() + << " is removed because it is dead or otherwise unavailable."; + do_propose = true; + } + } + + if (osdmap_writeable) { + for (auto& [fscid, fs] : fsmap.filesystems) { + if (!fs->mds_map.test_flag(CEPH_MDSMAP_NOT_JOINABLE) && + fs->mds_map.is_resizeable()) { + // Check if a rank or standby-replay should be replaced with a stronger + // affinity standby. This looks at ranks and standby-replay: + for (const auto& [gid, info] : fs->mds_map.get_mds_info()) { + const auto join_fscid = info.join_fscid; + if (join_fscid == fscid) + continue; + const auto rank = info.rank; + const auto state = info.state; + const mds_info_t* rep_info = nullptr; + if (state == MDSMap::STATE_STANDBY_REPLAY) { + rep_info = fsmap.get_available_standby(*fs); + } else if (state == MDSMap::STATE_ACTIVE) { + rep_info = fsmap.find_replacement_for({fscid, rank}); + } else { + /* N.B. 
!is_degraded() */ + ceph_abort_msg("invalid state in MDSMap"); + } + if (!rep_info) { + break; + } + bool better_affinity = false; + if (join_fscid == FS_CLUSTER_ID_NONE) { + better_affinity = (rep_info->join_fscid == fscid); + } else { + better_affinity = (rep_info->join_fscid == fscid) || + (rep_info->join_fscid == FS_CLUSTER_ID_NONE); + } + if (better_affinity) { + if (state == MDSMap::STATE_STANDBY_REPLAY) { + mon.clog->info() << "Dropping low affinity standby-replay " + << info.human_name() + << " in favor of higher affinity standby."; + *propose_osdmap |= fail_mds_gid(fsmap, gid); + /* Now let maybe_promote_standby do the promotion. */ + } else { + mon.clog->info() << "Dropping low affinity active " + << info.human_name() + << " in favor of higher affinity standby."; + do_propose |= drop_mds(fsmap, gid, rep_info, propose_osdmap); + } + break; /* don't replace more than one per tick per fs */ + } + } + } + } + } + return do_propose; +} + +bool MDSMonitor::maybe_promote_standby(FSMap &fsmap, Filesystem& fs) +{ + if (fs.mds_map.test_flag(CEPH_MDSMAP_NOT_JOINABLE)) { + return false; + } + + bool do_propose = false; + + // have a standby take over? + set<mds_rank_t> failed; + fs.mds_map.get_failed_mds_set(failed); + for (const auto& rank : failed) { + auto info = fsmap.find_replacement_for({fs.fscid, rank}); + if (info) { + dout(1) << " taking over failed mds." << rank << " with " << info->global_id + << "/" << info->name << " " << info->addrs << dendl; + mon.clog->info() << "Standby " << info->human_name() + << " assigned to filesystem " << fs.mds_map.fs_name + << " as rank " << rank; + + fsmap.promote(info->global_id, fs, rank); + do_propose = true; + } + } + + if (fs.mds_map.is_resizeable() && fs.mds_map.allows_standby_replay()) { + // There were no failures to replace, so try using any available standbys + // as standby-replay daemons. Don't do this when the cluster is degraded + // as a standby-replay daemon may try to read a journal being migrated. + for (;;) { + auto info = fsmap.get_available_standby(fs); + if (!info) break; + dout(20) << "standby available mds." << info->global_id << dendl; + bool changed = false; + for (const auto& rank : fs.mds_map.in) { + dout(20) << "examining " << rank << dendl; + if (fs.mds_map.is_followable(rank)) { + dout(1) << " setting mds." << info->global_id + << " to follow mds rank " << rank << dendl; + fsmap.assign_standby_replay(info->global_id, fs.fscid, rank); + do_propose = true; + changed = true; + break; + } + } + if (!changed) break; + } + } + + return do_propose; +} + +void MDSMonitor::tick() +{ + if (!is_active() || !is_leader()) return; + + auto &pending = get_pending_fsmap_writeable(); + + bool do_propose = false; + bool propose_osdmap = false; + + if (check_fsmap_struct_version) { + /* Allow time for trimming otherwise PaxosService::is_writeable will always + * be false. + */ + + auto now = clock::now(); + auto elapsed = now - last_fsmap_struct_flush; + if (elapsed > std::chrono::seconds(30)) { + FSMap fsmap; + bufferlist bl; + auto v = get_first_committed(); + int err = get_version(v, bl); + if (err) { + derr << "could not get version " << v << dendl; + ceph_abort(); + } + try { + fsmap.decode(bl); + } catch (const ceph::buffer::malformed_input& e) { + dout(5) << "flushing old fsmap struct because unable to decode FSMap: " << e.what() << dendl; + } + /* N.B. 
FSMap::is_struct_old is also true for undecoded (failed to decode) FSMap */ + if (fsmap.is_struct_old()) { + dout(5) << "fsmap struct is too old; proposing to flush out old versions" << dendl; + do_propose = true; + last_fsmap_struct_flush = now; + } else { + dout(20) << "struct is recent" << dendl; + check_fsmap_struct_version = false; + } + } + } + + do_propose |= pending.check_health(); + + /* Check health and affinity of ranks */ + do_propose |= check_health(pending, &propose_osdmap); + + /* Resize the cluster according to max_mds. */ + for (auto& p : pending.filesystems) { + do_propose |= maybe_resize_cluster(pending, p.second->fscid); + } + + /* Replace any failed ranks. */ + for (auto& p : pending.filesystems) { + do_propose |= maybe_promote_standby(pending, *p.second); + } + + if (propose_osdmap) { + request_proposal(mon.osdmon()); + } + + if (do_propose) { + propose_pending(); + } + + last_tick = mono_clock::now(); +} + +MDSMonitor::MDSMonitor(Monitor &mn, Paxos &p, string service_name) + : PaxosService(mn, p, service_name) +{ + handlers = FileSystemCommandHandler::load(&p); +} + +void MDSMonitor::on_restart() +{ + // Clear out the leader-specific state. + last_tick = mono_clock::now(); + last_beacon.clear(); +} + diff --git a/src/mon/MDSMonitor.h b/src/mon/MDSMonitor.h new file mode 100644 index 000000000..c70814996 --- /dev/null +++ b/src/mon/MDSMonitor.h @@ -0,0 +1,158 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +/* Metadata Server Monitor + */ + +#ifndef CEPH_MDSMONITOR_H +#define CEPH_MDSMONITOR_H + +#include <map> +#include <set> + +#include "include/types.h" +#include "PaxosFSMap.h" +#include "PaxosService.h" +#include "msg/Messenger.h" +#include "messages/MMDSBeacon.h" +#include "CommandHandler.h" + +class FileSystemCommandHandler; + +class MDSMonitor : public PaxosService, public PaxosFSMap, protected CommandHandler { + public: + using clock = ceph::coarse_mono_clock; + using time = ceph::coarse_mono_time; + + MDSMonitor(Monitor &mn, Paxos &p, std::string service_name); + + // service methods + void create_initial() override; + void get_store_prefixes(std::set<std::string>& s) const override; + void update_from_paxos(bool *need_bootstrap) override; + void init() override; + void create_pending() override; + void encode_pending(MonitorDBStore::TransactionRef t) override; + // we don't require full versions; don't encode any. + void encode_full(MonitorDBStore::TransactionRef t) override { } + version_t get_trim_to() const override; + + bool preprocess_query(MonOpRequestRef op) override; // true if processed. + bool prepare_update(MonOpRequestRef op) override; + bool should_propose(double& delay) override; + + bool should_print_status() const { + auto& fs = get_fsmap(); + auto fs_count = fs.filesystem_count(); + auto standby_count = fs.get_num_standby(); + return fs_count > 0 || standby_count > 0; + } + + void on_active() override; + void on_restart() override; + + void check_subs(); + void check_sub(Subscription *sub); + + void dump_info(ceph::Formatter *f); + int print_nodes(ceph::Formatter *f); + + /** + * Return true if a blocklist was done (i.e. 
OSD propose needed) + */ + bool fail_mds_gid(FSMap &fsmap, mds_gid_t gid); + + bool is_leader() const override { return mon.is_leader(); } + + protected: + using mds_info_t = MDSMap::mds_info_t; + + // my helpers + template<int dblV = 7> + void print_map(const FSMap &m); + + void _updated(MonOpRequestRef op); + + void _note_beacon(class MMDSBeacon *m); + bool preprocess_beacon(MonOpRequestRef op); + bool prepare_beacon(MonOpRequestRef op); + + bool preprocess_offload_targets(MonOpRequestRef op); + bool prepare_offload_targets(MonOpRequestRef op); + + int fail_mds(FSMap &fsmap, std::ostream &ss, + const std::string &arg, mds_info_t *failed_info); + + bool preprocess_command(MonOpRequestRef op); + bool prepare_command(MonOpRequestRef op); + + int filesystem_command( + FSMap &fsmap, + MonOpRequestRef op, + std::string const &prefix, + const cmdmap_t& cmdmap, + std::stringstream &ss); + + // beacons + struct beacon_info_t { + ceph::mono_time stamp = ceph::mono_clock::zero(); + uint64_t seq = 0; + beacon_info_t() {} + beacon_info_t(ceph::mono_time stamp, uint64_t seq) : stamp(stamp), seq(seq) {} + }; + std::map<mds_gid_t, beacon_info_t> last_beacon; + + std::list<std::shared_ptr<FileSystemCommandHandler> > handlers; + + bool maybe_promote_standby(FSMap& fsmap, Filesystem& fs); + bool maybe_resize_cluster(FSMap &fsmap, fs_cluster_id_t fscid); + bool drop_mds(FSMap &fsmap, mds_gid_t gid, const mds_info_t* rep_info, bool* osd_propose); + bool check_health(FSMap &fsmap, bool* osd_propose); + void tick() override; // check state, take actions + + int dump_metadata(const FSMap &fsmap, const std::string &who, ceph::Formatter *f, + std::ostream& err); + + void update_metadata(mds_gid_t gid, const Metadata& metadata); + void remove_from_metadata(const FSMap &fsmap, MonitorDBStore::TransactionRef t); + int load_metadata(std::map<mds_gid_t, Metadata>& m); + void count_metadata(const std::string& field, ceph::Formatter *f); + +public: + void print_fs_summary(ostream& out) { + get_fsmap().print_fs_summary(out); + } + void count_metadata(const std::string& field, std::map<std::string,int> *out); + void get_versions(std::map<std::string, std::list<std::string>> &versions); + +protected: + // MDS daemon GID to latest health state from that GID + std::map<uint64_t, MDSHealth> pending_daemon_health; + std::set<uint64_t> pending_daemon_health_rm; + + std::map<mds_gid_t, Metadata> pending_metadata; + + mds_gid_t gid_from_arg(const FSMap &fsmap, const std::string &arg, std::ostream& err); + + // When did the mon last call into our tick() method? Used for detecting + // when the mon was not updating us for some period (e.g. during slow + // election) to reset last_beacon timeouts + ceph::mono_time last_tick = ceph::mono_clock::zero(); + +private: + time last_fsmap_struct_flush = clock::zero(); + bool check_fsmap_struct_version = true; +}; + +#endif diff --git a/src/mon/MgrMap.h b/src/mon/MgrMap.h new file mode 100644 index 000000000..5342fc51f --- /dev/null +++ b/src/mon/MgrMap.h @@ -0,0 +1,601 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 John Spray <john.spray@redhat.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ */ + +#ifndef MGR_MAP_H_ +#define MGR_MAP_H_ + +#include <sstream> +#include <set> + +#include "msg/msg_types.h" +#include "include/encoding.h" +#include "include/utime.h" +#include "common/Formatter.h" +#include "common/ceph_releases.h" +#include "common/version.h" +#include "common/options.h" +#include "common/Clock.h" + + +class MgrMap +{ +public: + struct ModuleOption { + std::string name; + uint8_t type = Option::TYPE_STR; // Option::type_t TYPE_* + uint8_t level = Option::LEVEL_ADVANCED; // Option::level_t LEVEL_* + uint32_t flags = 0; // Option::flag_t FLAG_* + std::string default_value; + std::string min, max; + std::set<std::string> enum_allowed; + std::string desc, long_desc; + std::set<std::string> tags; + std::set<std::string> see_also; + + void encode(ceph::buffer::list& bl) const { + ENCODE_START(1, 1, bl); + encode(name, bl); + encode(type, bl); + encode(level, bl); + encode(flags, bl); + encode(default_value, bl); + encode(min, bl); + encode(max, bl); + encode(enum_allowed, bl); + encode(desc, bl); + encode(long_desc, bl); + encode(tags, bl); + encode(see_also, bl); + ENCODE_FINISH(bl); + } + void decode(ceph::buffer::list::const_iterator& p) { + DECODE_START(1, p); + decode(name, p); + decode(type, p); + decode(level, p); + decode(flags, p); + decode(default_value, p); + decode(min, p); + decode(max, p); + decode(enum_allowed, p); + decode(desc, p); + decode(long_desc, p); + decode(tags, p); + decode(see_also, p); + DECODE_FINISH(p); + } + void dump(ceph::Formatter *f) const { + f->dump_string("name", name); + f->dump_string("type", Option::type_to_str( + static_cast<Option::type_t>(type))); + f->dump_string("level", Option::level_to_str( + static_cast<Option::level_t>(level))); + f->dump_unsigned("flags", flags); + f->dump_string("default_value", default_value); + f->dump_string("min", min); + f->dump_string("max", max); + f->open_array_section("enum_allowed"); + for (auto& i : enum_allowed) { + f->dump_string("value", i); + } + f->close_section(); + f->dump_string("desc", desc); + f->dump_string("long_desc", long_desc); + f->open_array_section("tags"); + for (auto& i : tags) { + f->dump_string("tag", i); + } + f->close_section(); + f->open_array_section("see_also"); + for (auto& i : see_also) { + f->dump_string("option", i); + } + f->close_section(); + } + }; + + class ModuleInfo + { + public: + std::string name; + bool can_run = true; + std::string error_string; + std::map<std::string,ModuleOption> module_options; + + // We do not include the module's `failed` field in the beacon, + // because it is exposed via health checks. 
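+    // Encoding note: module_options was appended in struct version 2
+    // (ENCODE_START(2, 1, ...)), so a decoder reading an older blob with
+    // struct_v < 2 simply leaves module_options empty.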
+ void encode(ceph::buffer::list &bl) const { + ENCODE_START(2, 1, bl); + encode(name, bl); + encode(can_run, bl); + encode(error_string, bl); + encode(module_options, bl); + ENCODE_FINISH(bl); + } + + void decode(ceph::buffer::list::const_iterator &bl) { + DECODE_START(1, bl); + decode(name, bl); + decode(can_run, bl); + decode(error_string, bl); + if (struct_v >= 2) { + decode(module_options, bl); + } + DECODE_FINISH(bl); + } + + bool operator==(const ModuleInfo &rhs) const + { + return (name == rhs.name) && (can_run == rhs.can_run); + } + + void dump(ceph::Formatter *f) const { + f->open_object_section("module"); + f->dump_string("name", name); + f->dump_bool("can_run", can_run); + f->dump_string("error_string", error_string); + f->open_object_section("module_options"); + for (auto& i : module_options) { + f->dump_object(i.first.c_str(), i.second); + } + f->close_section(); + f->close_section(); + } + }; + + class StandbyInfo + { + public: + uint64_t gid = 0; + std::string name; + std::vector<ModuleInfo> available_modules; + uint64_t mgr_features = 0; + + StandbyInfo(uint64_t gid_, const std::string &name_, + const std::vector<ModuleInfo>& am, + uint64_t feat) + : gid(gid_), name(name_), available_modules(am), + mgr_features(feat) + {} + + StandbyInfo() {} + + void encode(ceph::buffer::list& bl) const + { + ENCODE_START(4, 1, bl); + encode(gid, bl); + encode(name, bl); + std::set<std::string> old_available_modules; + for (const auto &i : available_modules) { + old_available_modules.insert(i.name); + } + encode(old_available_modules, bl); // version 2 + encode(available_modules, bl); // version 3 + encode(mgr_features, bl); // v4 + ENCODE_FINISH(bl); + } + + void decode(ceph::buffer::list::const_iterator& p) + { + DECODE_START(4, p); + decode(gid, p); + decode(name, p); + if (struct_v >= 2) { + std::set<std::string> old_available_modules; + decode(old_available_modules, p); + if (struct_v < 3) { + for (const auto &name : old_available_modules) { + MgrMap::ModuleInfo info; + info.name = name; + available_modules.push_back(std::move(info)); + } + } + } + if (struct_v >= 3) { + decode(available_modules, p); + } + if (struct_v >= 4) { + decode(mgr_features, p); + } + DECODE_FINISH(p); + } + + bool have_module(const std::string &module_name) const + { + auto it = std::find_if(available_modules.begin(), + available_modules.end(), + [module_name](const ModuleInfo &m) -> bool { + return m.name == module_name; + }); + + return it != available_modules.end(); + } + }; + + epoch_t epoch = 0; + epoch_t last_failure_osd_epoch = 0; + + /// global_id of the ceph-mgr instance selected as a leader + uint64_t active_gid = 0; + /// server address reported by the leader once it is active + entity_addrvec_t active_addrs; + /// whether the nominated leader is active (i.e. has initialized its server) + bool available = false; + /// the name (foo in mgr.<foo>) of the active daemon + std::string active_name; + /// when the active mgr became active, or we lost the active mgr + utime_t active_change; + /// features + uint64_t active_mgr_features = 0; + + std::vector<entity_addrvec_t> clients; // for blocklist + + std::map<uint64_t, StandbyInfo> standbys; + + // Modules which are enabled + std::set<std::string> modules; + + // Modules which should always be enabled. A manager daemon will enable + // modules from the union of this set and the `modules` set above, latest + // active version. 
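+  // (Keyed by release ordinal; get_always_on_modules() below selects the
+  // entry matching the currently running release, falling back to the most
+  // recent older entry.)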
+ std::map<uint32_t, std::set<std::string>> always_on_modules; + + // Modules which are reported to exist + std::vector<ModuleInfo> available_modules; + + // Map of module name to URI, indicating services exposed by + // running modules on the active mgr daemon. + std::map<std::string, std::string> services; + + epoch_t get_epoch() const { return epoch; } + epoch_t get_last_failure_osd_epoch() const { return last_failure_osd_epoch; } + const entity_addrvec_t& get_active_addrs() const { return active_addrs; } + uint64_t get_active_gid() const { return active_gid; } + bool get_available() const { return available; } + const std::string &get_active_name() const { return active_name; } + const utime_t& get_active_change() const { return active_change; } + int get_num_standby() const { return standbys.size(); } + + bool all_support_module(const std::string& module) { + if (!have_module(module)) { + return false; + } + for (auto& p : standbys) { + if (!p.second.have_module(module)) { + return false; + } + } + return true; + } + + bool have_module(const std::string &module_name) const + { + for (const auto &i : available_modules) { + if (i.name == module_name) { + return true; + } + } + + return false; + } + + const ModuleInfo *get_module_info(const std::string &module_name) const { + for (const auto &i : available_modules) { + if (i.name == module_name) { + return &i; + } + } + return nullptr; + } + + bool can_run_module(const std::string &module_name, std::string *error) const + { + for (const auto &i : available_modules) { + if (i.name == module_name) { + *error = i.error_string; + return i.can_run; + } + } + + std::ostringstream oss; + oss << "Module '" << module_name << "' does not exist"; + throw std::logic_error(oss.str()); + } + + bool module_enabled(const std::string& module_name) const + { + return modules.find(module_name) != modules.end(); + } + + bool any_supports_module(const std::string& module) const { + if (have_module(module)) { + return true; + } + for (auto& p : standbys) { + if (p.second.have_module(module)) { + return true; + } + } + return false; + } + + bool have_name(const std::string& name) const { + if (active_name == name) { + return true; + } + for (auto& p : standbys) { + if (p.second.name == name) { + return true; + } + } + return false; + } + + std::set<std::string> get_all_names() const { + std::set<std::string> ls; + if (active_name.size()) { + ls.insert(active_name); + } + for (auto& p : standbys) { + ls.insert(p.second.name); + } + return ls; + } + + std::set<std::string> get_always_on_modules() const { + unsigned rnum = to_integer<uint32_t>(ceph_release()); + auto it = always_on_modules.find(rnum); + if (it == always_on_modules.end()) { + // ok, try the most recent release + if (always_on_modules.empty()) { + return {}; // ugh + } + --it; + if (it->first < rnum) { + return it->second; + } + return {}; // wth + } + return it->second; + } + + void encode(ceph::buffer::list& bl, uint64_t features) const + { + if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) { + ENCODE_START(5, 1, bl); + encode(epoch, bl); + encode(active_addrs.legacy_addr(), bl, features); + encode(active_gid, bl); + encode(available, bl); + encode(active_name, bl); + encode(standbys, bl); + encode(modules, bl); + + // Pre-version 4 std::string std::list of available modules + // (replaced by direct encode of ModuleInfo below) + std::set<std::string> old_available_modules; + for (const auto &i : available_modules) { + old_available_modules.insert(i.name); + } + encode(old_available_modules, bl); + + 
encode(services, bl);
+      encode(available_modules, bl);
+      ENCODE_FINISH(bl);
+      return;
+    }
+    ENCODE_START(11, 6, bl);
+    encode(epoch, bl);
+    encode(active_addrs, bl, features);
+    encode(active_gid, bl);
+    encode(available, bl);
+    encode(active_name, bl);
+    encode(standbys, bl);
+    encode(modules, bl);
+    encode(services, bl);
+    encode(available_modules, bl);
+    encode(active_change, bl);
+    encode(always_on_modules, bl);
+    encode(active_mgr_features, bl);
+    encode(last_failure_osd_epoch, bl);
+    encode(clients, bl, features);
+    ENCODE_FINISH(bl);
+    return;
+  }
+
+  void decode(ceph::buffer::list::const_iterator& p)
+  {
+    DECODE_START(11, p);
+    decode(epoch, p);
+    decode(active_addrs, p);
+    decode(active_gid, p);
+    decode(available, p);
+    decode(active_name, p);
+    decode(standbys, p);
+    if (struct_v >= 2) {
+      decode(modules, p);
+
+      if (struct_v < 6) {
+        // Reconstitute ModuleInfos from names
+        std::set<std::string> module_name_list;
+        decode(module_name_list, p);
+        // Only need to unpack this field if we won't have the full
+        // MgrMap::ModuleInfo structures added in v4
+        if (struct_v < 4) {
+          for (const auto &i : module_name_list) {
+            MgrMap::ModuleInfo info;
+            info.name = i;
+            available_modules.push_back(std::move(info));
+          }
+        }
+      }
+    }
+    if (struct_v >= 3) {
+      decode(services, p);
+    }
+    if (struct_v >= 4) {
+      decode(available_modules, p);
+    }
+    if (struct_v >= 7) {
+      decode(active_change, p);
+    } else {
+      active_change = {};
+    }
+    if (struct_v >= 8) {
+      decode(always_on_modules, p);
+    }
+    if (struct_v >= 9) {
+      decode(active_mgr_features, p);
+    }
+    if (struct_v >= 10) {
+      decode(last_failure_osd_epoch, p);
+    }
+    if (struct_v >= 11) {
+      decode(clients, p);
+    }
+    DECODE_FINISH(p);
+  }
+
+  void dump(ceph::Formatter *f) const {
+    f->dump_int("epoch", epoch);
+    f->dump_int("active_gid", get_active_gid());
+    f->dump_string("active_name", get_active_name());
+    f->dump_object("active_addrs", active_addrs);
+    f->dump_stream("active_addr") << active_addrs.get_legacy_str();
+    f->dump_stream("active_change") << active_change;
+    f->dump_unsigned("active_mgr_features", active_mgr_features);
+    f->dump_bool("available", available);
+    f->open_array_section("standbys");
+    for (const auto &i : standbys) {
+      f->open_object_section("standby");
+      f->dump_int("gid", i.second.gid);
+      f->dump_string("name", i.second.name);
+      f->dump_unsigned("mgr_features", i.second.mgr_features);
+      f->open_array_section("available_modules");
+      for (const auto& j : i.second.available_modules) {
+        j.dump(f);
+      }
+      f->close_section();
+      f->close_section();
+    }
+    f->close_section();
+    f->open_array_section("modules");
+    for (auto& i : modules) {
+      f->dump_string("module", i);
+    }
+    f->close_section();
+    f->open_array_section("available_modules");
+    for (const auto& j : available_modules) {
+      j.dump(f);
+    }
+    f->close_section();
+
+    f->open_object_section("services");
+    for (const auto &i : services) {
+      f->dump_string(i.first.c_str(), i.second);
+    }
+    f->close_section();
+
+    f->open_object_section("always_on_modules");
+    for (auto& v : always_on_modules) {
+      f->open_array_section(ceph_release_name(v.first));
+      for (auto& m : v.second) {
+        f->dump_string("module", m);
+      }
+      f->close_section();
+    }
+    // close "always_on_modules" before dumping the remaining top-level
+    // fields, so they do not end up nested inside it
+    f->close_section();
+    f->dump_int("last_failure_osd_epoch", last_failure_osd_epoch);
+    f->open_array_section("active_clients");
+    for (const auto &c : clients) {
+      f->dump_object("client", c);
+    }
+    f->close_section();
+  }
+
+  static void generate_test_instances(std::list<MgrMap*> &l) {
+    l.push_back(new MgrMap);
+  }
+
+  void 
print_summary(ceph::Formatter *f, std::ostream *ss) const + { + // One or the other, not both + ceph_assert((ss != nullptr) != (f != nullptr)); + if (f) { + f->dump_bool("available", available); + f->dump_int("num_standbys", standbys.size()); + f->open_array_section("modules"); + for (auto& i : modules) { + f->dump_string("module", i); + } + f->close_section(); + f->open_object_section("services"); + for (const auto &i : services) { + f->dump_string(i.first.c_str(), i.second); + } + f->close_section(); + } else { + utime_t now = ceph_clock_now(); + if (get_active_gid() != 0) { + *ss << get_active_name(); + if (!available) { + // If the daemon hasn't gone active yet, indicate that. + *ss << "(active, starting"; + } else { + *ss << "(active"; + } + if (active_change) { + *ss << ", since " << utimespan_str(now - active_change); + } + *ss << ")"; + } else { + *ss << "no daemons active"; + if (active_change) { + *ss << " (since " << utimespan_str(now - active_change) << ")"; + } + } + if (standbys.size()) { + *ss << ", standbys: "; + bool first = true; + for (const auto &i : standbys) { + if (!first) { + *ss << ", "; + } + *ss << i.second.name; + first = false; + } + } + } + } + + friend std::ostream& operator<<(std::ostream& out, const MgrMap& m) { + std::ostringstream ss; + m.print_summary(nullptr, &ss); + return out << ss.str(); + } + + friend std::ostream& operator<<(std::ostream& out, const std::vector<ModuleInfo>& mi) { + for (const auto &i : mi) { + out << i.name << " "; + } + return out; + } +}; + +WRITE_CLASS_ENCODER_FEATURES(MgrMap) +WRITE_CLASS_ENCODER(MgrMap::StandbyInfo) +WRITE_CLASS_ENCODER(MgrMap::ModuleInfo); +WRITE_CLASS_ENCODER(MgrMap::ModuleOption); + +#endif + diff --git a/src/mon/MgrMonitor.cc b/src/mon/MgrMonitor.cc new file mode 100644 index 000000000..bf5e2ed31 --- /dev/null +++ b/src/mon/MgrMonitor.cc @@ -0,0 +1,1356 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 John Spray <john.spray@redhat.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + */ + +#include <boost/tokenizer.hpp> + +#include "messages/MMgrBeacon.h" +#include "messages/MMgrMap.h" +#include "messages/MMgrDigest.h" + +#include "include/stringify.h" +#include "mgr/MgrContext.h" +#include "mgr/mgr_commands.h" +#include "OSDMonitor.h" +#include "ConfigMonitor.h" +#include "HealthMonitor.h" + +#include "MgrMonitor.h" + +#define MGR_METADATA_PREFIX "mgr_metadata" + +#define dout_subsys ceph_subsys_mon +#undef dout_prefix +#define dout_prefix _prefix(_dout, mon, map) +using namespace TOPNSPC::common; + +using std::dec; +using std::hex; +using std::list; +using std::map; +using std::make_pair; +using std::ostream; +using std::ostringstream; +using std::pair; +using std::set; +using std::string; +using std::stringstream; +using std::to_string; +using std::vector; + +using ceph::bufferlist; +using ceph::decode; +using ceph::encode; +using ceph::ErasureCodeInterfaceRef; +using ceph::ErasureCodeProfile; +using ceph::Formatter; +using ceph::JSONFormatter; +using ceph::make_message; +using ceph::mono_clock; +using ceph::mono_time; + +static ostream& _prefix(std::ostream *_dout, Monitor &mon, + const MgrMap& mgrmap) { + return *_dout << "mon." 
<< mon.name << "@" << mon.rank + << "(" << mon.get_state_name() + << ").mgr e" << mgrmap.get_epoch() << " "; +} + +// the system treats always_on_modules as if they provide built-in functionality +// by ensuring that they are always enabled. +const static std::map<uint32_t, std::set<std::string>> always_on_modules = { + { + CEPH_RELEASE_NAUTILUS, { + "crash", + "status", + "progress", + "balancer", + "devicehealth", + "orchestrator_cli", + "rbd_support", + "volumes", + } + }, + { + CEPH_RELEASE_OCTOPUS, { + "crash", + "status", + "progress", + "balancer", + "devicehealth", + "orchestrator", + "rbd_support", + "volumes", + "pg_autoscaler", + "telemetry", + } + }, + { + CEPH_RELEASE_PACIFIC, { + "crash", + "status", + "progress", + "balancer", + "devicehealth", + "orchestrator", + "rbd_support", + "volumes", + "pg_autoscaler", + "telemetry", + } + } +}; + +// Prefix for mon store of active mgr's command descriptions +const static std::string command_descs_prefix = "mgr_command_descs"; + +const Option *MgrMonitor::find_module_option(const string& name) +{ + // we have two forms of names: "mgr/$module/$option" and + // localized "mgr/$module/$instance/$option". normalize to the + // former by stripping out $instance. + string real_name; + if (name.substr(0, 4) != "mgr/") { + return nullptr; + } + auto second_slash = name.find('/', 5); + if (second_slash == std::string::npos) { + return nullptr; + } + auto third_slash = name.find('/', second_slash + 1); + if (third_slash != std::string::npos) { + // drop the $instance part between the second and third slash + real_name = name.substr(0, second_slash) + name.substr(third_slash); + } else { + real_name = name; + } + auto p = mgr_module_options.find(real_name); + if (p != mgr_module_options.end()) { + return &p->second; + } + return nullptr; +} + +version_t MgrMonitor::get_trim_to() const +{ + int64_t max = g_conf().get_val<int64_t>("mon_max_mgrmap_epochs"); + if (map.epoch > max) { + return map.epoch - max; + } + return 0; +} + +void MgrMonitor::create_initial() +{ + // Take a local copy of initial_modules for tokenizer to iterate over. 
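+  // (boost::tokenizer stores iterators into the container it is constructed
+  // with, so tokenizing the temporary returned by get_val() directly would
+  // leave them dangling.)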
+ auto initial_modules = g_conf().get_val<std::string>("mgr_initial_modules"); + boost::tokenizer<> tok(initial_modules); + for (auto& m : tok) { + pending_map.modules.insert(m); + } + pending_map.always_on_modules = always_on_modules; + pending_command_descs = mgr_commands; + dout(10) << __func__ << " initial modules " << pending_map.modules + << ", always on modules " << pending_map.get_always_on_modules() + << ", " << pending_command_descs.size() << " commands" + << dendl; +} + +void MgrMonitor::get_store_prefixes(std::set<string>& s) const +{ + s.insert(service_name); + s.insert(command_descs_prefix); + s.insert(MGR_METADATA_PREFIX); +} + +void MgrMonitor::update_from_paxos(bool *need_bootstrap) +{ + version_t version = get_last_committed(); + if (version != map.epoch) { + dout(4) << "loading version " << version << dendl; + + bufferlist bl; + int err = get_version(version, bl); + ceph_assert(err == 0); + + bool old_available = map.get_available(); + uint64_t old_gid = map.get_active_gid(); + + auto p = bl.cbegin(); + map.decode(p); + + dout(4) << "active server: " << map.active_addrs + << "(" << map.active_gid << ")" << dendl; + + ever_had_active_mgr = get_value("ever_had_active_mgr"); + + load_health(); + + if (map.available) { + first_seen_inactive = utime_t(); + } else { + first_seen_inactive = ceph_clock_now(); + } + + check_subs(); + + if (version == 1 + || command_descs.empty() + || (map.get_available() + && (!old_available || old_gid != map.get_active_gid()))) { + dout(4) << "mkfs or daemon transitioned to available, loading commands" + << dendl; + bufferlist loaded_commands; + int r = mon.store->get(command_descs_prefix, "", loaded_commands); + if (r < 0) { + derr << "Failed to load mgr commands: " << cpp_strerror(r) << dendl; + } else { + auto p = loaded_commands.cbegin(); + decode(command_descs, p); + } + } + } + + // populate module options + mgr_module_options.clear(); + misc_option_strings.clear(); + for (auto& i : map.available_modules) { + for (auto& j : i.module_options) { + string name = string("mgr/") + i.name + "/" + j.second.name; + auto p = mgr_module_options.emplace( + name, + Option(name, static_cast<Option::type_t>(j.second.type), + static_cast<Option::level_t>(j.second.level))); + Option& opt = p.first->second; + opt.set_flags(static_cast<Option::flag_t>(j.second.flags)); + opt.set_flag(Option::FLAG_MGR); + opt.set_description(j.second.desc.c_str()); + opt.set_long_description(j.second.long_desc.c_str()); + for (auto& k : j.second.tags) { + opt.add_tag(k.c_str()); + } + for (auto& k : j.second.see_also) { + if (i.module_options.count(k)) { + // it's another module option + misc_option_strings.push_back(string("mgr/") + i.name + "/" + k); + opt.add_see_also(misc_option_strings.back().c_str()); + } else { + // it's a native option + opt.add_see_also(k.c_str()); + } + } + Option::value_t v, v2; + std::string err; + if (j.second.default_value.size() && + !opt.parse_value(j.second.default_value, &v, &err)) { + opt.set_default(v); + } + if (j.second.min.size() && + j.second.max.size() && + !opt.parse_value(j.second.min, &v, &err) && + !opt.parse_value(j.second.max, &v2, &err)) { + opt.set_min_max(v, v2); + } + std::vector<const char *> enum_allowed; + for (auto& k : j.second.enum_allowed) { + enum_allowed.push_back(k.c_str()); + } + opt.set_enum_allowed(enum_allowed); + } + } + // force ConfigMonitor to refresh, since it uses const Option * + // pointers into our mgr_module_options (which we just rebuilt). 
+  mon.configmon()->load_config();
+
+  if (!mon.is_init()) {
+    // feed our pet MgrClient, unless we are in Monitor::[pre]init()
+    prime_mgr_client();
+  }
+}
+
+void MgrMonitor::prime_mgr_client()
+{
+  dout(10) << __func__ << dendl;
+  mon.mgr_client.ms_dispatch2(make_message<MMgrMap>(map));
+}
+
+void MgrMonitor::create_pending()
+{
+  pending_map = map;
+  pending_map.epoch++;
+}
+
+health_status_t MgrMonitor::should_warn_about_mgr_down()
+{
+  utime_t now = ceph_clock_now();
+  // we warn if we have osds AND we've exceeded the grace period,
+  // since a new mon cluster can stay HEALTH_OK indefinitely as long as
+  // no OSDs are ever created.
+  if (mon.osdmon()->osdmap.get_num_osds() > 0 &&
+      now > mon.monmap->created + g_conf().get_val<int64_t>("mon_mgr_mkfs_grace")) {
+    health_status_t level = HEALTH_WARN;
+    if (first_seen_inactive != utime_t() &&
+        now - first_seen_inactive > g_conf().get_val<int64_t>("mon_mgr_inactive_grace")) {
+      level = HEALTH_ERR;
+    }
+    return level;
+  }
+  return HEALTH_OK;
+}
+
+void MgrMonitor::post_paxos_update()
+{
+  // are we handling digest subscribers?
+  if (digest_event) {
+    bool send = false;
+    if (prev_health_checks.empty()) {
+      prev_health_checks.resize(mon.paxos_service.size());
+      send = true;
+    }
+    ceph_assert(prev_health_checks.size() == mon.paxos_service.size());
+    for (auto i = 0u; i < prev_health_checks.size(); i++) {
+      const auto& curr = mon.paxos_service[i]->get_health_checks();
+      if (!send && curr != prev_health_checks[i]) {
+        send = true;
+      }
+      prev_health_checks[i] = curr;
+    }
+    if (send) {
+      if (is_active()) {
+        send_digests();
+      } else {
+        cancel_timer();
+        wait_for_active_ctx(new C_MonContext{&mon, [this](int) {
+          send_digests();
+        }});
+      }
+    }
+  }
+}
+
+void MgrMonitor::encode_pending(MonitorDBStore::TransactionRef t)
+{
+  dout(10) << __func__ << " " << pending_map << dendl;
+  bufferlist bl;
+  pending_map.encode(bl, mon.get_quorum_con_features());
+  put_version(t, pending_map.epoch, bl);
+  put_last_committed(t, pending_map.epoch);
+
+  for (auto& p : pending_metadata) {
+    dout(10) << __func__ << " set metadata for " << p.first << dendl;
+    t->put(MGR_METADATA_PREFIX, p.first, p.second);
+  }
+  for (auto& name : pending_metadata_rm) {
+    dout(10) << __func__ << " rm metadata for " << name << dendl;
+    t->erase(MGR_METADATA_PREFIX, name);
+  }
+  pending_metadata.clear();
+  pending_metadata_rm.clear();
+
+  health_check_map_t next;
+  if (pending_map.active_gid == 0) {
+    auto level = should_warn_about_mgr_down();
+    if (level != HEALTH_OK) {
+      next.add("MGR_DOWN", level, "no active mgr", 0);
+    } else {
+      dout(10) << __func__ << " no health warning (never active and new cluster)"
+               << dendl;
+    }
+  } else {
+    put_value(t, "ever_had_active_mgr", 1);
+  }
+  encode_health(next, t);
+
+  if (pending_command_descs.size()) {
+    dout(4) << __func__ << " encoding " << pending_command_descs.size()
+            << " command_descs" << dendl;
+    for (auto& p : pending_command_descs) {
+      p.set_flag(MonCommand::FLAG_MGR);
+    }
+    bufferlist bl;
+    encode(pending_command_descs, bl);
+    t->put(command_descs_prefix, "", bl);
+    pending_command_descs.clear();
+  }
+}
+
+bool MgrMonitor::check_caps(MonOpRequestRef op, const uuid_d& fsid)
+{
+  // check permissions
+  MonSession *session = op->get_session();
+  if (!session)
+    return false;
+  if (!session->is_capable("mgr", MON_CAP_X)) {
+    dout(1) << __func__ << " insufficient caps " << session->caps << dendl;
+    return false;
+  }
+  if (fsid != mon.monmap->fsid) {
+    dout(1) << __func__ << " op fsid " << fsid
+            << " != " << mon.monmap->fsid << dendl;
+ return false; + } + return true; +} + +bool MgrMonitor::preprocess_query(MonOpRequestRef op) +{ + auto m = op->get_req<PaxosServiceMessage>(); + switch (m->get_type()) { + case MSG_MGR_BEACON: + return preprocess_beacon(op); + case MSG_MON_COMMAND: + try { + return preprocess_command(op); + } catch (const bad_cmd_get& e) { + bufferlist bl; + mon.reply_command(op, -EINVAL, e.what(), bl, get_last_committed()); + return true; + } + + default: + mon.no_reply(op); + derr << "Unhandled message type " << m->get_type() << dendl; + return true; + } +} + +bool MgrMonitor::prepare_update(MonOpRequestRef op) +{ + auto m = op->get_req<PaxosServiceMessage>(); + switch (m->get_type()) { + case MSG_MGR_BEACON: + return prepare_beacon(op); + + case MSG_MON_COMMAND: + try { + return prepare_command(op); + } catch (const bad_cmd_get& e) { + bufferlist bl; + mon.reply_command(op, -EINVAL, e.what(), bl, get_last_committed()); + return true; + } + + default: + mon.no_reply(op); + derr << "Unhandled message type " << m->get_type() << dendl; + return true; + } +} + + + +class C_Updated : public Context { + MgrMonitor *mm; + MonOpRequestRef op; +public: + C_Updated(MgrMonitor *a, MonOpRequestRef c) : + mm(a), op(c) {} + void finish(int r) override { + if (r >= 0) { + // Success + } else if (r == -ECANCELED) { + mm->mon.no_reply(op); + } else { + mm->dispatch(op); // try again + } + } +}; + +bool MgrMonitor::preprocess_beacon(MonOpRequestRef op) +{ + auto m = op->get_req<MMgrBeacon>(); + mon.no_reply(op); // we never reply to beacons + dout(4) << "beacon from " << m->get_gid() << dendl; + + if (!check_caps(op, m->get_fsid())) { + // drop it on the floor + return true; + } + + // always send this to the leader's prepare_beacon() + return false; +} + +bool MgrMonitor::prepare_beacon(MonOpRequestRef op) +{ + auto m = op->get_req<MMgrBeacon>(); + dout(4) << "beacon from " << m->get_gid() << dendl; + + // See if we are seeing same name, new GID for the active daemon + if (m->get_name() == pending_map.active_name + && m->get_gid() != pending_map.active_gid) + { + dout(4) << "Active daemon restart (mgr." << m->get_name() << ")" << dendl; + mon.clog->info() << "Active manager daemon " << m->get_name() + << " restarted"; + if (!mon.osdmon()->is_writeable()) { + dout(1) << __func__ << ": waiting for osdmon writeable to" + " blocklist old instance." << dendl; + mon.osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op)); + return false; + } + drop_active(); + } + + // See if we are seeing same name, new GID for any standbys + for (const auto &i : pending_map.standbys) { + const MgrMap::StandbyInfo &s = i.second; + if (s.name == m->get_name() && s.gid != m->get_gid()) { + dout(4) << "Standby daemon restart (mgr." << m->get_name() << ")" << dendl; + mon.clog->debug() << "Standby manager daemon " << m->get_name() + << " restarted"; + drop_standby(i.first); + break; + } + } + + last_beacon[m->get_gid()] = ceph::coarse_mono_clock::now(); + + // Track whether we modified pending_map + bool updated = false; + + if (pending_map.active_gid == m->get_gid()) { + if (pending_map.services != m->get_services()) { + dout(4) << "updated services from mgr." 
<< m->get_name() + << ": " << m->get_services() << dendl; + pending_map.services = m->get_services(); + updated = true; + } + + // A beacon from the currently active daemon + if (pending_map.active_addrs != m->get_server_addrs()) { + dout(4) << "learned address " << m->get_server_addrs() + << " (was " << pending_map.active_addrs << ")" << dendl; + pending_map.active_addrs = m->get_server_addrs(); + updated = true; + } + + if (pending_map.get_available() != m->get_available()) { + dout(4) << "available " << m->get_gid() << dendl; + mon.clog->info() << "Manager daemon " << pending_map.active_name + << " is now available"; + + // This beacon should include command descriptions + pending_command_descs = m->get_command_descs(); + if (pending_command_descs.empty()) { + // This should not happen, but it also isn't fatal: we just + // won't successfully update our list of commands. + dout(4) << "First available beacon from " << pending_map.active_name + << "(" << m->get_gid() << ") does not include command descs" + << dendl; + } else { + dout(4) << "First available beacon from " << pending_map.active_name + << "(" << m->get_gid() << ") includes " + << pending_command_descs.size() << " command descs" << dendl; + } + + pending_map.available = m->get_available(); + updated = true; + } + if (pending_map.available_modules != m->get_available_modules()) { + dout(4) << "available_modules " << m->get_available_modules() + << " (was " << pending_map.available_modules << ")" << dendl; + pending_map.available_modules = m->get_available_modules(); + updated = true; + } + const auto& clients = m->get_clients(); + if (pending_map.clients != clients) { + dout(4) << "active's RADOS clients " << clients + << " (was " << pending_map.clients << ")" << dendl; + pending_map.clients = clients; + updated = true; + } + } else if (pending_map.active_gid == 0) { + // There is no currently active daemon, select this one. 
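+    // If this gid had previously registered as a standby, drop that
+    // entry first, keeping its metadata (drop_meta=false) since the
+    // same daemon is about to become the active mgr.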
+ if (pending_map.standbys.count(m->get_gid())) { + drop_standby(m->get_gid(), false); + } + dout(4) << "selecting new active " << m->get_gid() + << " " << m->get_name() + << " (was " << pending_map.active_gid << " " + << pending_map.active_name << ")" << dendl; + pending_map.active_gid = m->get_gid(); + pending_map.active_name = m->get_name(); + pending_map.active_change = ceph_clock_now(); + pending_map.active_mgr_features = m->get_mgr_features(); + pending_map.available_modules = m->get_available_modules(); + encode(m->get_metadata(), pending_metadata[m->get_name()]); + pending_metadata_rm.erase(m->get_name()); + + mon.clog->info() << "Activating manager daemon " + << pending_map.active_name; + + updated = true; + } else { + if (pending_map.standbys.count(m->get_gid()) > 0) { + dout(10) << "from existing standby " << m->get_gid() << dendl; + if (pending_map.standbys[m->get_gid()].available_modules != + m->get_available_modules()) { + dout(10) << "existing standby " << m->get_gid() << " available_modules " + << m->get_available_modules() << " (was " + << pending_map.standbys[m->get_gid()].available_modules << ")" + << dendl; + pending_map.standbys[m->get_gid()].available_modules = + m->get_available_modules(); + updated = true; + } + } else { + dout(10) << "new standby " << m->get_gid() << dendl; + mon.clog->debug() << "Standby manager daemon " << m->get_name() + << " started"; + pending_map.standbys[m->get_gid()] = {m->get_gid(), m->get_name(), + m->get_available_modules(), + m->get_mgr_features()}; + encode(m->get_metadata(), pending_metadata[m->get_name()]); + pending_metadata_rm.erase(m->get_name()); + updated = true; + } + } + + if (updated) { + dout(4) << "updating map" << dendl; + wait_for_finished_proposal(op, new C_Updated(this, op)); + } else { + dout(10) << "no change" << dendl; + } + + return updated; +} + +void MgrMonitor::check_subs() +{ + const std::string type = "mgrmap"; + if (mon.session_map.subs.count(type) == 0) + return; + for (auto sub : *(mon.session_map.subs[type])) { + check_sub(sub); + } +} + +void MgrMonitor::check_sub(Subscription *sub) +{ + if (sub->type == "mgrmap") { + if (sub->next <= map.get_epoch()) { + dout(20) << "Sending map to subscriber " << sub->session->con + << " " << sub->session->con->get_peer_addr() << dendl; + sub->session->con->send_message2(make_message<MMgrMap>(map)); + if (sub->onetime) { + mon.session_map.remove_sub(sub); + } else { + sub->next = map.get_epoch() + 1; + } + } + } else { + ceph_assert(sub->type == "mgrdigest"); + if (sub->next == 0) { + // new registration; cancel previous timer + cancel_timer(); + } + if (digest_event == nullptr) { + send_digests(); + } + } +} + +/** + * Handle digest subscriptions separately (outside of check_sub) because + * they are going to be periodic rather than version-driven. 
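+ * The timer is re-armed from send_digests() itself, so for as long as
+ * at least one mgrdigest subscriber exists, a fresh digest goes out
+ * every mon_mgr_digest_period seconds.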
+ */ +void MgrMonitor::send_digests() +{ + cancel_timer(); + + const std::string type = "mgrdigest"; + if (mon.session_map.subs.count(type) == 0) { + prev_health_checks.clear(); + return; + } + + if (!is_active()) { + // if paxos is currently not active, don't send a digest but reenable timer + goto timer; + } + dout(10) << __func__ << dendl; + + for (auto sub : *(mon.session_map.subs[type])) { + dout(10) << __func__ << " sending digest to subscriber " << sub->session->con + << " " << sub->session->con->get_peer_addr() << dendl; + auto mdigest = make_message<MMgrDigest>(); + + JSONFormatter f; + mon.healthmon()->get_health_status(true, &f, nullptr, nullptr, nullptr); + f.flush(mdigest->health_json); + f.reset(); + + mon.get_mon_status(&f); + f.flush(mdigest->mon_status_json); + f.reset(); + + sub->session->con->send_message2(mdigest); + } + +timer: + digest_event = mon.timer.add_event_after( + g_conf().get_val<int64_t>("mon_mgr_digest_period"), + new C_MonContext{&mon, [this](int) { + send_digests(); + }}); +} + +void MgrMonitor::cancel_timer() +{ + if (digest_event) { + mon.timer.cancel_event(digest_event); + digest_event = nullptr; + } +} + +void MgrMonitor::on_active() +{ + if (!mon.is_leader()) { + return; + } + mon.clog->debug() << "mgrmap e" << map.epoch << ": " << map; + if (!HAVE_FEATURE(mon.get_quorum_con_features(), SERVER_NAUTILUS)) { + return; + } + if (pending_map.always_on_modules == always_on_modules) { + return; + } + dout(4) << "always on modules changed, pending " + << pending_map.always_on_modules << " != wanted " + << always_on_modules << dendl; + pending_map.always_on_modules = always_on_modules; + propose_pending(); +} + +void MgrMonitor::tick() +{ + if (!is_active() || !mon.is_leader()) + return; + + const auto now = ceph::coarse_mono_clock::now(); + + const auto mgr_beacon_grace = + g_conf().get_val<std::chrono::seconds>("mon_mgr_beacon_grace"); + + // Note that this is the mgr daemon's tick period, not ours (the + // beacon is sent with this period). + const auto mgr_tick_period = + g_conf().get_val<std::chrono::seconds>("mgr_tick_period"); + + if (last_tick != ceph::coarse_mono_clock::time_point::min() + && (now - last_tick > (mgr_beacon_grace - mgr_tick_period))) { + // This case handles either local slowness (calls being delayed + // for whatever reason) or cluster election slowness (a long gap + // between calls while an election happened) + dout(4) << __func__ << ": resetting beacon timeouts due to mon delay " + "(slow election?) of " << now - last_tick << " seconds" << dendl; + for (auto &i : last_beacon) { + i.second = now; + } + } + + last_tick = now; + + // Populate any missing beacons (i.e. no beacon since MgrMonitor + // instantiation) with the current time, so that they will + // eventually look laggy if they fail to give us a beacon. + if (pending_map.active_gid != 0 + && last_beacon.count(pending_map.active_gid) == 0) { + last_beacon[pending_map.active_gid] = now; + } + for (auto s : pending_map.standbys) { + if (last_beacon.count(s.first) == 0) { + last_beacon[s.first] = now; + } + } + + // Cull standbys first so that any remaining standbys + // will be eligible to take over from the active if we cull him. 
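+  // Collect the laggy standby gids first and drop them afterwards, so
+  // that pending_map.standbys is never mutated while we iterate it.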
+  std::list<uint64_t> dead_standbys;
+  const auto cutoff = now - mgr_beacon_grace;
+  for (const auto &i : pending_map.standbys) {
+    auto last_beacon_time = last_beacon.at(i.first);
+    if (last_beacon_time < cutoff) {
+      dead_standbys.push_back(i.first);
+    }
+  }
+
+  bool propose = false;
+
+  for (auto i : dead_standbys) {
+    dout(4) << "Dropping laggy standby " << i << dendl;
+    drop_standby(i);
+    propose = true;
+  }
+
+  if (pending_map.active_gid != 0
+      && last_beacon.at(pending_map.active_gid) < cutoff
+      && mon.osdmon()->is_writeable()) {
+    const std::string old_active_name = pending_map.active_name;
+    dout(4) << "Dropping active " << pending_map.active_gid << dendl;
+    drop_active();
+    propose = true;
+    if (promote_standby()) {
+      dout(4) << "Promoted standby " << pending_map.active_gid << dendl;
+      mon.clog->info() << "Manager daemon " << old_active_name
+                       << " is unresponsive, replacing it with standby"
+                       << " daemon " << pending_map.active_name;
+    } else {
+      dout(4) << "Active is laggy but have no standbys to replace it" << dendl;
+      mon.clog->info() << "Manager daemon " << old_active_name
+                       << " is unresponsive. No standby daemons available.";
+    }
+  } else if (pending_map.active_gid == 0) {
+    if (promote_standby()) {
+      dout(4) << "Promoted standby " << pending_map.active_gid << dendl;
+      mon.clog->info() << "Activating manager daemon "
+                       << pending_map.active_name;
+      propose = true;
+    }
+  }
+
+  if (!pending_map.available &&
+      !ever_had_active_mgr &&
+      should_warn_about_mgr_down() != HEALTH_OK) {
+    dout(10) << " exceeded mon_mgr_mkfs_grace "
+             << g_conf().get_val<int64_t>("mon_mgr_mkfs_grace")
+             << " seconds" << dendl;
+    propose = true;
+  }
+
+  // obsolete modules?
+  if (mon.monmap->min_mon_release >= ceph_release_t::octopus &&
+      pending_map.module_enabled("orchestrator_cli")) {
+    dout(10) << " disabling obsolete/renamed 'orchestrator_cli'" << dendl;
+    // we don't need to enable 'orchestrator' because it's now always-on
+    pending_map.modules.erase("orchestrator_cli");
+    propose = true;
+  }
+
+  if (propose) {
+    propose_pending();
+  }
+}
+
+void MgrMonitor::on_restart()
+{
+  // Clear out the leader-specific state.
+  last_beacon.clear();
+  last_tick = ceph::coarse_mono_clock::now();
+}
+
+
+bool MgrMonitor::promote_standby()
+{
+  ceph_assert(pending_map.active_gid == 0);
+  if (pending_map.standbys.size()) {
+    // Promote a replacement (arbitrary choice of standby)
+    auto replacement_gid = pending_map.standbys.begin()->first;
+    pending_map.active_gid = replacement_gid;
+    pending_map.active_name = pending_map.standbys.at(replacement_gid).name;
+    pending_map.available_modules =
+      pending_map.standbys.at(replacement_gid).available_modules;
+    pending_map.active_mgr_features =
+      pending_map.standbys.at(replacement_gid).mgr_features;
+    pending_map.available = false;
+    pending_map.active_addrs = entity_addrvec_t();
+    pending_map.active_change = ceph_clock_now();
+
+    drop_standby(replacement_gid, false);
+
+    return true;
+  } else {
+    return false;
+  }
+}
+
+void MgrMonitor::drop_active()
+{
+  ceph_assert(mon.osdmon()->is_writeable());
+
+  if (last_beacon.count(pending_map.active_gid) > 0) {
+    last_beacon.erase(pending_map.active_gid);
+  }
+
+  ceph_assert(pending_map.active_gid > 0);
+  auto until = ceph_clock_now();
+  until += g_conf().get_val<double>("mon_mgr_blocklist_interval");
+  dout(5) << "blocklisting previous mgr." << pending_map.active_name << "."
+          << pending_map.active_gid << " ("
+          << pending_map.active_addrs << ")" << dendl;
+  auto blocklist_epoch = mon.osdmon()->blocklist(pending_map.active_addrs, until);
+
+  /* blocklist RADOS clients in use by the mgr */
+  for (const auto& a : pending_map.clients) {
+    mon.osdmon()->blocklist(a, until);
+  }
+  request_proposal(mon.osdmon());
+
+  pending_metadata_rm.insert(pending_map.active_name);
+  pending_metadata.erase(pending_map.active_name);
+  pending_map.active_name = "";
+  pending_map.active_gid = 0;
+  pending_map.active_change = ceph_clock_now();
+  pending_map.active_mgr_features = 0;
+  pending_map.available = false;
+  pending_map.active_addrs = entity_addrvec_t();
+  pending_map.services.clear();
+  pending_map.clients.clear();
+  pending_map.last_failure_osd_epoch = blocklist_epoch;
+
+  // So that when new active mgr subscribes to mgrdigest, it will
+  // get an immediate response instead of waiting for next timer
+  cancel_timer();
+}
+
+void MgrMonitor::drop_standby(uint64_t gid, bool drop_meta)
+{
+  if (drop_meta) {
+    pending_metadata_rm.insert(pending_map.standbys[gid].name);
+    pending_metadata.erase(pending_map.standbys[gid].name);
+  }
+  pending_map.standbys.erase(gid);
+  if (last_beacon.count(gid) > 0) {
+    last_beacon.erase(gid);
+  }
+}
+
+bool MgrMonitor::preprocess_command(MonOpRequestRef op)
+{
+  auto m = op->get_req<MMonCommand>();
+  std::stringstream ss;
+  bufferlist rdata;
+
+  cmdmap_t cmdmap;
+  if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
+    string rs = ss.str();
+    mon.reply_command(op, -EINVAL, rs, rdata, get_last_committed());
+    return true;
+  }
+
+  MonSession *session = op->get_session();
+  if (!session) {
+    mon.reply_command(op, -EACCES, "access denied", rdata,
+                      get_last_committed());
+    return true;
+  }
+
+  string format;
+  cmd_getval(cmdmap, "format", format);
+  boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty",
+                                                   "json-pretty"));
+
+  string prefix;
+  cmd_getval(cmdmap, "prefix", prefix);
+  int r = 0;
+
+  if (prefix == "mgr stat") {
+    f->open_object_section("stat");
+    f->dump_unsigned("epoch", map.get_epoch());
+    f->dump_bool("available", map.get_available());
+    f->dump_string("active_name", map.get_active_name());
+    f->dump_unsigned("num_standby", map.get_num_standby());
+    f->close_section();
+    f->flush(rdata);
+  } else if (prefix == "mgr dump") {
+    int64_t epoch = 0;
+    cmd_getval(cmdmap, "epoch", epoch, (int64_t)map.get_epoch());
+    if (epoch == (int64_t)map.get_epoch()) {
+      f->dump_object("mgrmap", map);
+    } else {
+      bufferlist bl;
+      int err = get_version(epoch, bl);
+      if (err == -ENOENT) {
+        r = -ENOENT;
+        ss << "there is no map for epoch " << epoch;
+        goto reply;
+      }
+      MgrMap m;
+      auto p = bl.cbegin();
+      m.decode(p);
+      f->dump_object("mgrmap", m);
+    }
+    f->flush(rdata);
+  } else if (prefix == "mgr module ls") {
+    f->open_object_section("modules");
+    {
+      f->open_array_section("always_on_modules");
+      for (auto& p : map.get_always_on_modules()) {
+        f->dump_string("module", p);
+      }
+      f->close_section();
+      f->open_array_section("enabled_modules");
+      for (auto& p : map.modules) {
+        if (map.get_always_on_modules().count(p) > 0)
+          continue;
+        // We only show the name for enabled modules; any errors
+        // etc. will show up as health checks.
+ f->dump_string("module", p); + } + f->close_section(); + f->open_array_section("disabled_modules"); + for (auto& p : map.available_modules) { + if (map.modules.count(p.name) == 0 && + map.get_always_on_modules().count(p.name) == 0) { + // For disabled modules, we show the full info, to + // give a hint about whether enabling it will work + p.dump(f.get()); + } + } + f->close_section(); + } + f->close_section(); + f->flush(rdata); + } else if (prefix == "mgr services") { + f->open_object_section("services"); + for (const auto &i : map.services) { + f->dump_string(i.first.c_str(), i.second); + } + f->close_section(); + f->flush(rdata); + } else if (prefix == "mgr metadata") { + string name; + cmd_getval(cmdmap, "who", name); + if (name.size() > 0 && !map.have_name(name)) { + ss << "mgr." << name << " does not exist"; + r = -ENOENT; + goto reply; + } + string format; + cmd_getval(cmdmap, "format", format); + boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty")); + if (name.size()) { + f->open_object_section("mgr_metadata"); + f->dump_string("name", name); + r = dump_metadata(name, f.get(), &ss); + if (r < 0) + goto reply; + f->close_section(); + } else { + r = 0; + f->open_array_section("mgr_metadata"); + for (auto& i : map.get_all_names()) { + f->open_object_section("mgr"); + f->dump_string("name", i); + r = dump_metadata(i, f.get(), NULL); + if (r == -EINVAL || r == -ENOENT) { + // Drop error, continue to get other daemons' metadata + dout(4) << "No metadata for mgr." << i << dendl; + r = 0; + } else if (r < 0) { + // Unexpected error + goto reply; + } + f->close_section(); + } + f->close_section(); + } + f->flush(rdata); + } else if (prefix == "mgr versions") { + count_metadata("ceph_version", f.get()); + f->flush(rdata); + r = 0; + } else if (prefix == "mgr count-metadata") { + string field; + cmd_getval(cmdmap, "property", field); + count_metadata(field, f.get()); + f->flush(rdata); + r = 0; + } else { + return false; + } + +reply: + string rs; + getline(ss, rs); + mon.reply_command(op, r, rs, rdata, get_last_committed()); + return true; +} + +bool MgrMonitor::prepare_command(MonOpRequestRef op) +{ + auto m = op->get_req<MMonCommand>(); + + std::stringstream ss; + bufferlist rdata; + + cmdmap_t cmdmap; + if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) { + string rs = ss.str(); + mon.reply_command(op, -EINVAL, rs, rdata, get_last_committed()); + return true; + } + + MonSession *session = op->get_session(); + if (!session) { + mon.reply_command(op, -EACCES, "access denied", rdata, get_last_committed()); + return true; + } + + string format; + cmd_getval(cmdmap, "format", format, string("plain")); + boost::scoped_ptr<Formatter> f(Formatter::create(format)); + + string prefix; + cmd_getval(cmdmap, "prefix", prefix); + + int r = 0; + + if (prefix == "mgr fail") { + string who; + if (!cmd_getval(cmdmap, "who", who)) { + if (!map.active_gid) { + ss << "Currently no active mgr"; + goto out; + } + who = map.active_name; + } + + std::string err; + uint64_t gid = strict_strtol(who.c_str(), 10, &err); + bool changed = false; + if (!err.empty()) { + // Does not parse as a gid, treat it as a name + if (pending_map.active_name == who) { + if (!mon.osdmon()->is_writeable()) { + mon.osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op)); + return false; + } + drop_active(); + changed = true; + } else { + gid = 0; + for (const auto &i : pending_map.standbys) { + if (i.second.name == who) { + gid = i.first; + break; + } + } + if (gid != 0) { + drop_standby(gid); + 
changed = true; + } else { + ss << "Daemon not found '" << who << "', already failed?"; + } + } + } else { + if (pending_map.active_gid == gid) { + if (!mon.osdmon()->is_writeable()) { + mon.osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op)); + return false; + } + drop_active(); + changed = true; + } else if (pending_map.standbys.count(gid) > 0) { + drop_standby(gid); + changed = true; + } else { + ss << "Daemon not found '" << gid << "', already failed?"; + } + } + + if (changed && pending_map.active_gid == 0) { + promote_standby(); + } + } else if (prefix == "mgr module enable") { + string module; + cmd_getval(cmdmap, "module", module); + if (module.empty()) { + r = -EINVAL; + goto out; + } + if (pending_map.get_always_on_modules().count(module) > 0) { + ss << "module '" << module << "' is already enabled (always-on)"; + goto out; + } + string force; + cmd_getval(cmdmap, "force", force); + if (!pending_map.all_support_module(module) && + force != "--force") { + ss << "all mgr daemons do not support module '" << module << "', pass " + << "--force to force enablement"; + r = -ENOENT; + goto out; + } + + std::string can_run_error; + if (force != "--force" && !pending_map.can_run_module(module, &can_run_error)) { + ss << "module '" << module << "' reports that it cannot run on the active " + "manager daemon: " << can_run_error << " (pass --force to force " + "enablement)"; + r = -ENOENT; + goto out; + } + + if (pending_map.module_enabled(module)) { + ss << "module '" << module << "' is already enabled"; + r = 0; + goto out; + } + pending_map.modules.insert(module); + } else if (prefix == "mgr module disable") { + string module; + cmd_getval(cmdmap, "module", module); + if (module.empty()) { + r = -EINVAL; + goto out; + } + if (pending_map.get_always_on_modules().count(module) > 0) { + ss << "module '" << module << "' cannot be disabled (always-on)"; + r = -EINVAL; + goto out; + } + if (!pending_map.module_enabled(module)) { + ss << "module '" << module << "' is already disabled"; + r = 0; + goto out; + } + if (!pending_map.modules.count(module)) { + ss << "module '" << module << "' is not enabled"; + } + pending_map.modules.erase(module); + } else { + ss << "Command '" << prefix << "' not implemented!"; + r = -ENOSYS; + } + +out: + dout(4) << __func__ << " done, r=" << r << dendl; + /* Compose response */ + string rs; + getline(ss, rs); + + if (r >= 0) { + // success.. delay reply + wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, r, rs, + get_last_committed() + 1)); + return true; + } else { + // reply immediately + mon.reply_command(op, r, rs, rdata, get_last_committed()); + return false; + } +} + +void MgrMonitor::init() +{ + if (digest_event == nullptr) { + send_digests(); // To get it to schedule its own event + } +} + +void MgrMonitor::on_shutdown() +{ + cancel_timer(); +} + +int MgrMonitor::load_metadata(const string& name, std::map<string, string>& m, + ostream *err) const +{ + bufferlist bl; + int r = mon.store->get(MGR_METADATA_PREFIX, name, bl); + if (r < 0) + return r; + try { + auto p = bl.cbegin(); + decode(m, p); + } + catch (ceph::buffer::error& e) { + if (err) + *err << "mgr." 
<< name << " metadata is corrupt"; + return -EIO; + } + return 0; +} + +void MgrMonitor::count_metadata(const string& field, std::map<string,int> *out) +{ + std::set<string> ls = map.get_all_names(); + for (auto& name : ls) { + std::map<string,string> meta; + load_metadata(name, meta, nullptr); + auto p = meta.find(field); + if (p == meta.end()) { + (*out)["unknown"]++; + } else { + (*out)[p->second]++; + } + } +} + +void MgrMonitor::count_metadata(const string& field, Formatter *f) +{ + std::map<string,int> by_val; + count_metadata(field, &by_val); + f->open_object_section(field.c_str()); + for (auto& p : by_val) { + f->dump_int(p.first.c_str(), p.second); + } + f->close_section(); +} + +void MgrMonitor::get_versions(std::map<string, list<string> > &versions) +{ + std::set<string> ls = map.get_all_names(); + for (auto& name : ls) { + std::map<string,string> meta; + load_metadata(name, meta, nullptr); + auto p = meta.find("ceph_version_short"); + if (p == meta.end()) continue; + versions[p->second].push_back(string("mgr.") + name); + } +} + +int MgrMonitor::dump_metadata(const string& name, Formatter *f, ostream *err) +{ + std::map<string,string> m; + if (int r = load_metadata(name, m, err)) + return r; + for (auto& p : m) { + f->dump_string(p.first.c_str(), p.second); + } + return 0; +} + +void MgrMonitor::print_nodes(Formatter *f) const +{ + ceph_assert(f); + + std::map<string, list<string> > mgrs; // hostname => mgr + auto ls = map.get_all_names(); + for (auto& name : ls) { + std::map<string,string> meta; + if (load_metadata(name, meta, nullptr)) { + continue; + } + auto hostname = meta.find("hostname"); + if (hostname == meta.end()) { + // not likely though + continue; + } + mgrs[hostname->second].push_back(name); + } + + dump_services(f, mgrs, "mgr"); +} + +const std::vector<MonCommand> &MgrMonitor::get_command_descs() const +{ + if (command_descs.empty()) { + // must have just upgraded; fallback to static commands + return mgr_commands; + } else { + return command_descs; + } +} diff --git a/src/mon/MgrMonitor.h b/src/mon/MgrMonitor.h new file mode 100644 index 000000000..be75602ab --- /dev/null +++ b/src/mon/MgrMonitor.h @@ -0,0 +1,140 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 John Spray <john.spray@redhat.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + */ + +#ifndef CEPH_MGRMONITOR_H +#define CEPH_MGRMONITOR_H + +#include <map> +#include <set> + +#include "include/Context.h" +#include "MgrMap.h" +#include "PaxosService.h" +#include "MonCommand.h" + +class MgrMonitor: public PaxosService +{ + MgrMap map; + MgrMap pending_map; + bool ever_had_active_mgr = false; + + std::map<std::string, ceph::buffer::list> pending_metadata; + std::set<std::string> pending_metadata_rm; + + std::map<std::string,Option> mgr_module_options; + std::list<std::string> misc_option_strings; + + utime_t first_seen_inactive; + + std::map<uint64_t, ceph::coarse_mono_clock::time_point> last_beacon; + + /** + * If a standby is available, make it active, given that + * there is currently no active daemon. + * + * @return true if a standby was promoted + */ + bool promote_standby(); + void drop_active(); + + /** + * Remove this gid from the list of standbys. By default, + * also remove metadata (i.e. 
forget the daemon entirely). + * + * Set `drop_meta` to false if you would like to keep + * the daemon's metadata, for example if you're dropping + * it as a standby before reinstating it as the active daemon. + */ + void drop_standby(uint64_t gid, bool drop_meta=true); + + Context *digest_event = nullptr; + void cancel_timer(); + + std::vector<health_check_map_t> prev_health_checks; + + bool check_caps(MonOpRequestRef op, const uuid_d& fsid); + + health_status_t should_warn_about_mgr_down(); + + // Command descriptions we've learned from the active mgr + std::vector<MonCommand> command_descs; + std::vector<MonCommand> pending_command_descs; + +public: + MgrMonitor(Monitor &mn, Paxos &p, const std::string& service_name) + : PaxosService(mn, p, service_name) + {} + ~MgrMonitor() override {} + + void init() override; + void on_shutdown() override; + + const MgrMap &get_map() const { return map; } + + const std::map<std::string,Option>& get_mgr_module_options() { + return mgr_module_options; + } + const Option *find_module_option(const std::string& name); + + bool in_use() const { return map.epoch > 0; } + + version_t get_trim_to() const override; + + void prime_mgr_client(); + + void create_initial() override; + void get_store_prefixes(std::set<std::string>& s) const override; + void update_from_paxos(bool *need_bootstrap) override; + void post_paxos_update() override; + void create_pending() override; + void encode_pending(MonitorDBStore::TransactionRef t) override; + + bool preprocess_query(MonOpRequestRef op) override; + bool prepare_update(MonOpRequestRef op) override; + + bool preprocess_command(MonOpRequestRef op); + bool prepare_command(MonOpRequestRef op); + + void encode_full(MonitorDBStore::TransactionRef t) override { } + + bool preprocess_beacon(MonOpRequestRef op); + bool prepare_beacon(MonOpRequestRef op); + + void check_sub(Subscription *sub); + void check_subs(); + void send_digests(); + + void on_active() override; + void on_restart() override; + + void tick() override; + + void print_summary(ceph::Formatter *f, std::ostream *ss) const; + + const std::vector<MonCommand> &get_command_descs() const; + + int load_metadata(const std::string& name, std::map<std::string, std::string>& m, + std::ostream *err) const; + int dump_metadata(const std::string& name, ceph::Formatter *f, std::ostream *err); + void print_nodes(ceph::Formatter *f) const; + void count_metadata(const std::string& field, ceph::Formatter *f); + void count_metadata(const std::string& field, std::map<std::string,int> *out); + void get_versions(std::map<std::string, std::list<std::string>> &versions); + + // When did the mon last call into our tick() method? Used for detecting + // when the mon was not updating us for some period (e.g. 
during slow + // election) to reset last_beacon timeouts + ceph::coarse_mono_clock::time_point last_tick; +}; + +#endif diff --git a/src/mon/MgrStatMonitor.cc b/src/mon/MgrStatMonitor.cc new file mode 100644 index 000000000..9da4c50da --- /dev/null +++ b/src/mon/MgrStatMonitor.cc @@ -0,0 +1,367 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "MgrStatMonitor.h" +#include "mon/OSDMonitor.h" +#include "mon/MgrMonitor.h" +#include "mon/PGMap.h" +#include "messages/MGetPoolStats.h" +#include "messages/MGetPoolStatsReply.h" +#include "messages/MMonMgrReport.h" +#include "messages/MStatfs.h" +#include "messages/MStatfsReply.h" +#include "messages/MServiceMap.h" + +#include "include/ceph_assert.h" // re-clobber assert + +#define dout_subsys ceph_subsys_mon +#undef dout_prefix +#define dout_prefix _prefix(_dout, mon) + +using std::dec; +using std::hex; +using std::list; +using std::map; +using std::make_pair; +using std::ostream; +using std::ostringstream; +using std::pair; +using std::set; +using std::string; +using std::stringstream; +using std::to_string; +using std::vector; + +using ceph::bufferlist; +using ceph::decode; +using ceph::encode; +using ceph::ErasureCodeInterfaceRef; +using ceph::ErasureCodeProfile; +using ceph::Formatter; +using ceph::JSONFormatter; +using ceph::make_message; +using ceph::mono_clock; +using ceph::mono_time; + +static ostream& _prefix(std::ostream *_dout, Monitor &mon) { + return *_dout << "mon." << mon.name << "@" << mon.rank + << "(" << mon.get_state_name() + << ").mgrstat "; +} + +MgrStatMonitor::MgrStatMonitor(Monitor &mn, Paxos &p, const string& service_name) + : PaxosService(mn, p, service_name) +{ +} + +MgrStatMonitor::~MgrStatMonitor() = default; + +void MgrStatMonitor::create_initial() +{ + dout(10) << __func__ << dendl; + version = 0; + service_map.epoch = 1; + service_map.modified = ceph_clock_now(); + pending_service_map_bl.clear(); + encode(service_map, pending_service_map_bl, CEPH_FEATURES_ALL); +} + +void MgrStatMonitor::update_from_paxos(bool *need_bootstrap) +{ + version = get_last_committed(); + dout(10) << " " << version << dendl; + load_health(); + bufferlist bl; + get_version(version, bl); + if (version) { + ceph_assert(bl.length()); + try { + auto p = bl.cbegin(); + decode(digest, p); + decode(service_map, p); + if (!p.end()) { + decode(progress_events, p); + } + dout(10) << __func__ << " v" << version + << " service_map e" << service_map.epoch + << " " << progress_events.size() << " progress events" + << dendl; + } + catch (ceph::buffer::error& e) { + derr << "failed to decode mgrstat state; luminous dev version? 
" + << e.what() << dendl; + } + } + check_subs(); + update_logger(); + mon.osdmon()->notify_new_pg_digest(); +} + +void MgrStatMonitor::update_logger() +{ + dout(20) << __func__ << dendl; + + mon.cluster_logger->set(l_cluster_osd_bytes, digest.osd_sum.statfs.total); + mon.cluster_logger->set(l_cluster_osd_bytes_used, + digest.osd_sum.statfs.get_used_raw()); + mon.cluster_logger->set(l_cluster_osd_bytes_avail, + digest.osd_sum.statfs.available); + + mon.cluster_logger->set(l_cluster_num_pool, digest.pg_pool_sum.size()); + uint64_t num_pg = 0; + for (auto i : digest.num_pg_by_pool) { + num_pg += i.second; + } + mon.cluster_logger->set(l_cluster_num_pg, num_pg); + + unsigned active = 0, active_clean = 0, peering = 0; + for (auto p = digest.num_pg_by_state.begin(); + p != digest.num_pg_by_state.end(); + ++p) { + if (p->first & PG_STATE_ACTIVE) { + active += p->second; + if (p->first & PG_STATE_CLEAN) + active_clean += p->second; + } + if (p->first & PG_STATE_PEERING) + peering += p->second; + } + mon.cluster_logger->set(l_cluster_num_pg_active_clean, active_clean); + mon.cluster_logger->set(l_cluster_num_pg_active, active); + mon.cluster_logger->set(l_cluster_num_pg_peering, peering); + + mon.cluster_logger->set(l_cluster_num_object, digest.pg_sum.stats.sum.num_objects); + mon.cluster_logger->set(l_cluster_num_object_degraded, digest.pg_sum.stats.sum.num_objects_degraded); + mon.cluster_logger->set(l_cluster_num_object_misplaced, digest.pg_sum.stats.sum.num_objects_misplaced); + mon.cluster_logger->set(l_cluster_num_object_unfound, digest.pg_sum.stats.sum.num_objects_unfound); + mon.cluster_logger->set(l_cluster_num_bytes, digest.pg_sum.stats.sum.num_bytes); + +} + +void MgrStatMonitor::create_pending() +{ + dout(10) << " " << version << dendl; + pending_digest = digest; + pending_health_checks = get_health_checks(); + pending_service_map_bl.clear(); + encode(service_map, pending_service_map_bl, mon.get_quorum_con_features()); +} + +void MgrStatMonitor::encode_pending(MonitorDBStore::TransactionRef t) +{ + ++version; + dout(10) << " " << version << dendl; + bufferlist bl; + encode(pending_digest, bl, mon.get_quorum_con_features()); + ceph_assert(pending_service_map_bl.length()); + bl.append(pending_service_map_bl); + encode(pending_progress_events, bl); + put_version(t, version, bl); + put_last_committed(t, version); + + encode_health(pending_health_checks, t); +} + +version_t MgrStatMonitor::get_trim_to() const +{ + // we don't actually need *any* old states, but keep a few. 
+  if (version > 5) {
+    return version - 5;
+  }
+  return 0;
+}
+
+void MgrStatMonitor::on_active()
+{
+  update_logger();
+}
+
+void MgrStatMonitor::tick()
+{
+}
+
+bool MgrStatMonitor::preprocess_query(MonOpRequestRef op)
+{
+  auto m = op->get_req<PaxosServiceMessage>();
+  switch (m->get_type()) {
+  case CEPH_MSG_STATFS:
+    return preprocess_statfs(op);
+  case MSG_MON_MGR_REPORT:
+    return preprocess_report(op);
+  case MSG_GETPOOLSTATS:
+    return preprocess_getpoolstats(op);
+  default:
+    mon.no_reply(op);
+    derr << "Unhandled message type " << m->get_type() << dendl;
+    return true;
+  }
+}
+
+bool MgrStatMonitor::prepare_update(MonOpRequestRef op)
+{
+  auto m = op->get_req<PaxosServiceMessage>();
+  switch (m->get_type()) {
+  case MSG_MON_MGR_REPORT:
+    return prepare_report(op);
+  default:
+    mon.no_reply(op);
+    derr << "Unhandled message type " << m->get_type() << dendl;
+    return true;
+  }
+}
+
+bool MgrStatMonitor::preprocess_report(MonOpRequestRef op)
+{
+  auto m = op->get_req<MMonMgrReport>();
+  mon.no_reply(op);
+  if (m->gid &&
+      m->gid != mon.mgrmon()->get_map().get_active_gid()) {
+    dout(10) << "ignoring report from non-active mgr " << m->gid
+             << dendl;
+    return true;
+  }
+  return false;
+}
+
+bool MgrStatMonitor::prepare_report(MonOpRequestRef op)
+{
+  auto m = op->get_req<MMonMgrReport>();
+  bufferlist bl = m->get_data();
+  auto p = bl.cbegin();
+  decode(pending_digest, p);
+  pending_health_checks.swap(m->health_checks);
+  if (m->service_map_bl.length()) {
+    pending_service_map_bl.swap(m->service_map_bl);
+  }
+  pending_progress_events.swap(m->progress_events);
+  dout(10) << __func__ << " " << pending_digest << ", "
+           << pending_health_checks.checks.size() << " health checks, "
+           << progress_events.size() << " progress events" << dendl;
+  dout(20) << "pending_digest:\n";
+  JSONFormatter jf(true);
+  jf.open_object_section("pending_digest");
+  pending_digest.dump(&jf);
+  jf.close_section();
+  jf.flush(*_dout);
+  *_dout << dendl;
+  dout(20) << "health checks:\n";
+  jf.open_object_section("health_checks");
+  pending_health_checks.dump(&jf);
+  jf.close_section();
+  jf.flush(*_dout);
+  *_dout << dendl;
+  dout(20) << "progress events:\n";
+  jf.open_object_section("progress_events");
+  for (auto& i : pending_progress_events) {
+    jf.dump_object(i.first.c_str(), i.second);
+  }
+  jf.close_section();
+  jf.flush(*_dout);
+  *_dout << dendl;
+  return true;
+}
+
+bool MgrStatMonitor::preprocess_getpoolstats(MonOpRequestRef op)
+{
+  op->mark_pgmon_event(__func__);
+  auto m = op->get_req<MGetPoolStats>();
+  auto session = op->get_session();
+  if (!session)
+    return true;
+  if (!session->is_capable("pg", MON_CAP_R)) {
+    dout(0) << "MGetPoolStats received from entity with insufficient caps "
+            << session->caps << dendl;
+    return true;
+  }
+  if (m->fsid != mon.monmap->fsid) {
+    dout(0) << __func__ << " on fsid "
+            << m->fsid << " != " << mon.monmap->fsid << dendl;
+    return true;
+  }
+  epoch_t ver = get_last_committed();
+  auto reply = new MGetPoolStatsReply(m->fsid, m->get_tid(), ver);
+  reply->per_pool = digest.use_per_pool_stats();
+  for (const auto& pool_name : m->pools) {
+    const auto pool_id = mon.osdmon()->osdmap.lookup_pg_pool_name(pool_name);
+    if (pool_id == -ENOENT)
+      continue;
+    auto pool_stat = get_pool_stat(pool_id);
+    if (!pool_stat)
+      continue;
+    reply->pool_stats[pool_name] = *pool_stat;
+  }
+  mon.send_reply(op, reply);
+  return true;
+}
+
+bool MgrStatMonitor::preprocess_statfs(MonOpRequestRef op)
+{
+  op->mark_pgmon_event(__func__);
+  auto
statfs = op->get_req<MStatfs>(); + auto session = op->get_session(); + + if (!session) + return true; + if (!session->is_capable("pg", MON_CAP_R)) { + dout(0) << "MStatfs received from entity with insufficient privileges " + << session->caps << dendl; + return true; + } + if (statfs->fsid != mon.monmap->fsid) { + dout(0) << __func__ << " on fsid " << statfs->fsid + << " != " << mon.monmap->fsid << dendl; + return true; + } + const auto& pool = statfs->data_pool; + if (pool && !mon.osdmon()->osdmap.have_pg_pool(*pool)) { + // There's no error field for MStatfsReply so just ignore the request. + // This is known to happen when a client is still accessing a removed fs. + dout(1) << __func__ << " on removed pool " << *pool << dendl; + return true; + } + dout(10) << __func__ << " " << *statfs + << " from " << statfs->get_orig_source() << dendl; + epoch_t ver = get_last_committed(); + auto reply = new MStatfsReply(statfs->fsid, statfs->get_tid(), ver); + reply->h.st = get_statfs(mon.osdmon()->osdmap, pool); + mon.send_reply(op, reply); + return true; +} + +void MgrStatMonitor::check_sub(Subscription *sub) +{ + dout(10) << __func__ + << " next " << sub->next + << " vs service_map.epoch " << service_map.epoch << dendl; + if (sub->next <= service_map.epoch) { + auto m = new MServiceMap(service_map); + sub->session->con->send_message(m); + if (sub->onetime) { + mon.with_session_map([sub](MonSessionMap& session_map) { + session_map.remove_sub(sub); + }); + } else { + sub->next = service_map.epoch + 1; + } + } +} + +void MgrStatMonitor::check_subs() +{ + dout(10) << __func__ << dendl; + if (!service_map.epoch) { + return; + } + auto subs = mon.session_map.subs.find("servicemap"); + if (subs == mon.session_map.subs.end()) { + return; + } + auto p = subs->second->begin(); + while (!p.end()) { + auto sub = *p; + ++p; + check_sub(sub); + } +} diff --git a/src/mon/MgrStatMonitor.h b/src/mon/MgrStatMonitor.h new file mode 100644 index 000000000..7c31f2c13 --- /dev/null +++ b/src/mon/MgrStatMonitor.h @@ -0,0 +1,107 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "include/Context.h" +#include "PaxosService.h" +#include "mon/PGMap.h" +#include "mgr/ServiceMap.h" + +class MgrStatMonitor : public PaxosService { + // live version + version_t version = 0; + PGMapDigest digest; + ServiceMap service_map; + std::map<std::string,ProgressEvent> progress_events; + + // pending commit + PGMapDigest pending_digest; + health_check_map_t pending_health_checks; + std::map<std::string,ProgressEvent> pending_progress_events; + ceph::buffer::list pending_service_map_bl; + +public: + MgrStatMonitor(Monitor &mn, Paxos &p, const std::string& service_name); + ~MgrStatMonitor() override; + + void init() override {} + void on_shutdown() override {} + + void create_initial() override; + void update_from_paxos(bool *need_bootstrap) override; + void create_pending() override; + void encode_pending(MonitorDBStore::TransactionRef t) override; + version_t get_trim_to() const override; + + bool definitely_converted_snapsets() const { + return digest.definitely_converted_snapsets(); + } + + bool preprocess_query(MonOpRequestRef op) override; + bool prepare_update(MonOpRequestRef op) override; + + void encode_full(MonitorDBStore::TransactionRef t) override { } + + bool preprocess_report(MonOpRequestRef op); + bool prepare_report(MonOpRequestRef op); + + bool preprocess_getpoolstats(MonOpRequestRef op); + bool preprocess_statfs(MonOpRequestRef op); + + void 
check_sub(Subscription *sub); + void check_subs(); + void send_digests(); + + void on_active() override; + void tick() override; + + uint64_t get_last_osd_stat_seq(int osd) { + return digest.get_last_osd_stat_seq(osd); + } + + void update_logger(); + + const ServiceMap& get_service_map() const { + return service_map; + } + + const std::map<std::string,ProgressEvent>& get_progress_events() { + return progress_events; + } + + // pg stat access + const pool_stat_t* get_pool_stat(int64_t poolid) const { + auto i = digest.pg_pool_sum.find(poolid); + if (i != digest.pg_pool_sum.end()) { + return &i->second; + } + return nullptr; + } + + const PGMapDigest& get_digest() { + return digest; + } + + ceph_statfs get_statfs(OSDMap& osdmap, + boost::optional<int64_t> data_pool) const { + return digest.get_statfs(osdmap, data_pool); + } + + void print_summary(ceph::Formatter *f, std::ostream *out) const { + digest.print_summary(f, out); + } + void dump_info(ceph::Formatter *f) const { + digest.dump(f); + f->dump_object("servicemap", get_service_map()); + } + void dump_cluster_stats(std::stringstream *ss, + ceph::Formatter *f, + bool verbose) const { + digest.dump_cluster_stats(ss, f, verbose); + } + void dump_pool_stats(const OSDMap& osdm, std::stringstream *ss, ceph::Formatter *f, + bool verbose) const { + digest.dump_pool_stats_full(osdm, ss, f, verbose); + } +}; diff --git a/src/mon/MonCap.cc b/src/mon/MonCap.cc new file mode 100644 index 000000000..e1dc37239 --- /dev/null +++ b/src/mon/MonCap.cc @@ -0,0 +1,679 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2013 Inktank + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#include <boost/config/warning_disable.hpp> +#include <boost/spirit/include/qi_uint.hpp> +#include <boost/spirit/include/qi.hpp> +#include <boost/fusion/include/std_pair.hpp> +#include <boost/spirit/include/phoenix.hpp> +#include <boost/fusion/adapted/struct/adapt_struct.hpp> +#include <boost/fusion/include/adapt_struct.hpp> +#include <boost/algorithm/string/predicate.hpp> + +#include "MonCap.h" +#include "include/stringify.h" +#include "include/ipaddr.h" +#include "common/debug.h" +#include "common/Formatter.h" + +#include <algorithm> +#include <regex> + +#include "include/ceph_assert.h" + +using std::list; +using std::map; +using std::ostream; +using std::pair; +using std::string; +using std::vector; + +using ceph::bufferlist; +using ceph::Formatter; + +static inline bool is_not_alnum_space(char c) +{ + return !(isalpha(c) || isdigit(c) || (c == '-') || (c == '_')); +} + +static std::string maybe_quote_string(const std::string& str) +{ + if (find_if(str.begin(), str.end(), is_not_alnum_space) == str.end()) + return str; + return string("\"") + str + string("\""); +} + +#define dout_subsys ceph_subsys_mon + +ostream& operator<<(ostream& out, const mon_rwxa_t& p) +{ + if (p == MON_CAP_ANY) + return out << "*"; + + if (p & MON_CAP_R) + out << "r"; + if (p & MON_CAP_W) + out << "w"; + if (p & MON_CAP_X) + out << "x"; + return out; +} + +ostream& operator<<(ostream& out, const StringConstraint& c) +{ + switch (c.match_type) { + case StringConstraint::MATCH_TYPE_EQUAL: + return out << "value " << c.value; + case StringConstraint::MATCH_TYPE_PREFIX: + return out << "prefix " << c.value; + case StringConstraint::MATCH_TYPE_REGEX: + return out << "regex " << c.value; + default: + break; + } + return out; +} + +ostream& operator<<(ostream& out, const MonCapGrant& m) +{ + out << "allow"; + if (m.service.length()) { + out << " service " << maybe_quote_string(m.service); + } + if (m.command.length()) { + out << " command " << maybe_quote_string(m.command); + if (!m.command_args.empty()) { + out << " with"; + for (auto p = m.command_args.begin(); + p != m.command_args.end(); + ++p) { + switch (p->second.match_type) { + case StringConstraint::MATCH_TYPE_EQUAL: + out << " " << maybe_quote_string(p->first) << "=" + << maybe_quote_string(p->second.value); + break; + case StringConstraint::MATCH_TYPE_PREFIX: + out << " " << maybe_quote_string(p->first) << " prefix " + << maybe_quote_string(p->second.value); + break; + case StringConstraint::MATCH_TYPE_REGEX: + out << " " << maybe_quote_string(p->first) << " regex " + << maybe_quote_string(p->second.value); + break; + default: + break; + } + } + } + } + if (m.profile.length()) { + out << " profile " << maybe_quote_string(m.profile); + } + if (m.allow != 0) + out << " " << m.allow; + if (m.network.size()) + out << " network " << m.network; + return out; +} + + +// <magic> +// fusion lets us easily populate structs via the qi parser. 
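+//
+// For example, parsing "allow service osd rw" fills the adapted
+// MonCapGrant fields positionally: (service="osd", profile="",
+// command="", command_args={}, allow=rw, network="", fs_name="").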
+ +typedef map<string,StringConstraint> kvmap; + +BOOST_FUSION_ADAPT_STRUCT(MonCapGrant, + (std::string, service) + (std::string, profile) + (std::string, command) + (kvmap, command_args) + (mon_rwxa_t, allow) + (std::string, network) + (std::string, fs_name)) + +BOOST_FUSION_ADAPT_STRUCT(StringConstraint, + (StringConstraint::MatchType, match_type) + (std::string, value)) + +// </magic> + +void MonCapGrant::parse_network() +{ + network_valid = ::parse_network(network.c_str(), &network_parsed, + &network_prefix); +} + +void MonCapGrant::expand_profile(const EntityName& name) const +{ + // only generate this list once + if (!profile_grants.empty()) + return; + + if (profile == "read-only") { + // grants READ-ONLY caps monitor-wide + // 'auth' requires MON_CAP_X even for RO, which we do not grant here. + profile_grants.push_back(mon_rwxa_t(MON_CAP_R)); + return; + } + + if (profile == "read-write") { + // grants READ-WRITE caps monitor-wide + // 'auth' requires MON_CAP_X for all operations, which we do not grant. + profile_grants.push_back(mon_rwxa_t(MON_CAP_R | MON_CAP_W)); + return; + } + + if (profile == "mon") { + profile_grants.push_back(MonCapGrant("mon", MON_CAP_ALL)); + profile_grants.push_back(MonCapGrant("log", MON_CAP_ALL)); + } + if (profile == "osd") { + profile_grants.push_back(MonCapGrant("osd", MON_CAP_ALL)); + profile_grants.push_back(MonCapGrant("mon", MON_CAP_R)); + profile_grants.push_back(MonCapGrant("pg", MON_CAP_R | MON_CAP_W)); + profile_grants.push_back(MonCapGrant("log", MON_CAP_W)); + StringConstraint constraint(StringConstraint::MATCH_TYPE_REGEX, + string("osd_mclock_max_capacity_iops_(hdd|ssd)")); + profile_grants.push_back(MonCapGrant("config set", "name", constraint)); + } + if (profile == "mds") { + profile_grants.push_back(MonCapGrant("mds", MON_CAP_ALL)); + profile_grants.push_back(MonCapGrant("mon", MON_CAP_R)); + profile_grants.push_back(MonCapGrant("osd", MON_CAP_R)); + // This command grant is checked explicitly in MRemoveSnaps handling + profile_grants.push_back(MonCapGrant("osd pool rmsnap")); + profile_grants.push_back(MonCapGrant("osd blocklist")); + profile_grants.push_back(MonCapGrant("osd blacklist")); // for compat + profile_grants.push_back(MonCapGrant("log", MON_CAP_W)); + } + if (profile == "mgr") { + profile_grants.push_back(MonCapGrant("mgr", MON_CAP_ALL)); + profile_grants.push_back(MonCapGrant("log", MON_CAP_R | MON_CAP_W)); + profile_grants.push_back(MonCapGrant("mon", MON_CAP_R | MON_CAP_W)); + profile_grants.push_back(MonCapGrant("mds", MON_CAP_R | MON_CAP_W)); + profile_grants.push_back(MonCapGrant("fs", MON_CAP_R | MON_CAP_W)); + profile_grants.push_back(MonCapGrant("osd", MON_CAP_R | MON_CAP_W)); + profile_grants.push_back(MonCapGrant("auth", MON_CAP_R | MON_CAP_W | MON_CAP_X)); + profile_grants.push_back(MonCapGrant("config-key", MON_CAP_R | MON_CAP_W)); + profile_grants.push_back(MonCapGrant("config", MON_CAP_R | MON_CAP_W)); + // cephadm orchestrator provisions new daemon keys and updates caps + profile_grants.push_back(MonCapGrant("auth get-or-create")); + profile_grants.push_back(MonCapGrant("auth caps")); + profile_grants.push_back(MonCapGrant("auth rm")); + // tell commands (this is a bit of a kludge) + profile_grants.push_back(MonCapGrant("smart")); + } + if (profile == "osd" || profile == "mds" || profile == "mon" || + profile == "mgr") { + StringConstraint constraint(StringConstraint::MATCH_TYPE_PREFIX, + string("daemon-private/") + stringify(name) + + string("/")); + std::string prefix = string("daemon-private/") + 
stringify(name) + string("/"); + profile_grants.push_back(MonCapGrant("config-key get", "key", constraint)); + profile_grants.push_back(MonCapGrant("config-key put", "key", constraint)); + profile_grants.push_back(MonCapGrant("config-key set", "key", constraint)); + profile_grants.push_back(MonCapGrant("config-key exists", "key", constraint)); + profile_grants.push_back(MonCapGrant("config-key delete", "key", constraint)); + } + if (profile == "bootstrap-osd") { + profile_grants.push_back(MonCapGrant("mon", MON_CAP_R)); // read monmap + profile_grants.push_back(MonCapGrant("osd", MON_CAP_R)); // read osdmap + profile_grants.push_back(MonCapGrant("mon getmap")); + profile_grants.push_back(MonCapGrant("osd new")); + profile_grants.push_back(MonCapGrant("osd purge-new")); + } + if (profile == "bootstrap-mds") { + profile_grants.push_back(MonCapGrant("mon", MON_CAP_R)); // read monmap + profile_grants.push_back(MonCapGrant("osd", MON_CAP_R)); // read osdmap + profile_grants.push_back(MonCapGrant("mon getmap")); + profile_grants.push_back(MonCapGrant("auth get-or-create")); // FIXME: this can expose other mds keys + profile_grants.back().command_args["entity"] = StringConstraint( + StringConstraint::MATCH_TYPE_PREFIX, "mds."); + profile_grants.back().command_args["caps_mon"] = StringConstraint( + StringConstraint::MATCH_TYPE_EQUAL, "allow profile mds"); + profile_grants.back().command_args["caps_osd"] = StringConstraint( + StringConstraint::MATCH_TYPE_EQUAL, "allow rwx"); + profile_grants.back().command_args["caps_mds"] = StringConstraint( + StringConstraint::MATCH_TYPE_EQUAL, "allow"); + } + if (profile == "bootstrap-mgr") { + profile_grants.push_back(MonCapGrant("mon", MON_CAP_R)); // read monmap + profile_grants.push_back(MonCapGrant("osd", MON_CAP_R)); // read osdmap + profile_grants.push_back(MonCapGrant("mon getmap")); + profile_grants.push_back(MonCapGrant("auth get-or-create")); // FIXME: this can expose other mgr keys + profile_grants.back().command_args["entity"] = StringConstraint( + StringConstraint::MATCH_TYPE_PREFIX, "mgr."); + profile_grants.back().command_args["caps_mon"] = StringConstraint( + StringConstraint::MATCH_TYPE_EQUAL, "allow profile mgr"); + } + if (profile == "bootstrap-rgw") { + profile_grants.push_back(MonCapGrant("mon", MON_CAP_R)); // read monmap + profile_grants.push_back(MonCapGrant("osd", MON_CAP_R)); // read osdmap + profile_grants.push_back(MonCapGrant("mon getmap")); + profile_grants.push_back(MonCapGrant("auth get-or-create")); // FIXME: this can expose other mds keys + profile_grants.back().command_args["entity"] = StringConstraint( + StringConstraint::MATCH_TYPE_PREFIX, "client.rgw."); + profile_grants.back().command_args["caps_mon"] = StringConstraint( + StringConstraint::MATCH_TYPE_EQUAL, "allow rw"); + profile_grants.back().command_args["caps_osd"] = StringConstraint( + StringConstraint::MATCH_TYPE_EQUAL, "allow rwx"); + } + if (profile == "bootstrap-rbd" || profile == "bootstrap-rbd-mirror") { + profile_grants.push_back(MonCapGrant("mon", MON_CAP_R)); // read monmap + profile_grants.push_back(MonCapGrant("auth get-or-create")); // FIXME: this can expose other rbd keys + profile_grants.back().command_args["entity"] = StringConstraint( + StringConstraint::MATCH_TYPE_PREFIX, "client."); + profile_grants.back().command_args["caps_mon"] = StringConstraint( + StringConstraint::MATCH_TYPE_EQUAL, + (profile == "bootstrap-rbd-mirror" ? 
"profile rbd-mirror" : + "profile rbd")); + profile_grants.back().command_args["caps_osd"] = StringConstraint( + StringConstraint::MATCH_TYPE_REGEX, + "^([ ,]*profile(=|[ ]+)['\"]?rbd[^ ,'\"]*['\"]?([ ]+pool(=|[ ]+)['\"]?[^,'\"]+['\"]?)?)+$"); + } + if (profile == "fs-client") { + profile_grants.push_back(MonCapGrant("mon", MON_CAP_R)); + profile_grants.push_back(MonCapGrant("mds", MON_CAP_R)); + profile_grants.push_back(MonCapGrant("osd", MON_CAP_R)); + profile_grants.push_back(MonCapGrant("pg", MON_CAP_R)); + } + if (profile == "simple-rados-client") { + profile_grants.push_back(MonCapGrant("mon", MON_CAP_R)); + profile_grants.push_back(MonCapGrant("osd", MON_CAP_R)); + profile_grants.push_back(MonCapGrant("pg", MON_CAP_R)); + } + if (profile == "simple-rados-client-with-blocklist") { + profile_grants.push_back(MonCapGrant("mon", MON_CAP_R)); + profile_grants.push_back(MonCapGrant("osd", MON_CAP_R)); + profile_grants.push_back(MonCapGrant("pg", MON_CAP_R)); + profile_grants.push_back(MonCapGrant("osd blocklist")); + profile_grants.back().command_args["blocklistop"] = StringConstraint( + StringConstraint::MATCH_TYPE_EQUAL, "add"); + profile_grants.back().command_args["addr"] = StringConstraint( + StringConstraint::MATCH_TYPE_REGEX, "^[^/]+/[0-9]+$"); + + } + if (boost::starts_with(profile, "rbd")) { + profile_grants.push_back(MonCapGrant("mon", MON_CAP_R)); + profile_grants.push_back(MonCapGrant("osd", MON_CAP_R)); + profile_grants.push_back(MonCapGrant("pg", MON_CAP_R)); + + // exclusive lock dead-client blocklisting (IP+nonce required) + profile_grants.push_back(MonCapGrant("osd blocklist")); + profile_grants.back().command_args["blocklistop"] = StringConstraint( + StringConstraint::MATCH_TYPE_EQUAL, "add"); + profile_grants.back().command_args["addr"] = StringConstraint( + StringConstraint::MATCH_TYPE_REGEX, "^[^/]+/[0-9]+$"); + + // for compat, + profile_grants.push_back(MonCapGrant("osd blacklist")); + profile_grants.back().command_args["blacklistop"] = StringConstraint( + StringConstraint::MATCH_TYPE_EQUAL, "add"); + profile_grants.back().command_args["addr"] = StringConstraint( + StringConstraint::MATCH_TYPE_REGEX, "^[^/]+/[0-9]+$"); + + } + if (profile == "rbd-mirror") { + StringConstraint constraint(StringConstraint::MATCH_TYPE_PREFIX, + "rbd/mirror/"); + profile_grants.push_back(MonCapGrant("config-key get", "key", constraint)); + } else if (profile == "rbd-mirror-peer") { + StringConstraint constraint(StringConstraint::MATCH_TYPE_REGEX, + "rbd/mirror/[^/]+"); + profile_grants.push_back(MonCapGrant("config-key get", "key", constraint)); + + constraint = StringConstraint(StringConstraint::MATCH_TYPE_PREFIX, + "rbd/mirror/peer/"); + profile_grants.push_back(MonCapGrant("config-key set", "key", constraint)); + } + else if (profile == "crash") { + // TODO: we could limit this to getting the monmap and mgrmap... + profile_grants.push_back(MonCapGrant("mon", MON_CAP_R)); + } + if (profile == "cephfs-mirror") { + profile_grants.push_back(MonCapGrant("mon", MON_CAP_R)); + profile_grants.push_back(MonCapGrant("mds", MON_CAP_R)); + profile_grants.push_back(MonCapGrant("osd", MON_CAP_R)); + profile_grants.push_back(MonCapGrant("pg", MON_CAP_R)); + StringConstraint constraint(StringConstraint::MATCH_TYPE_PREFIX, + "cephfs/mirror/peer/"); + profile_grants.push_back(MonCapGrant("config-key get", "key", constraint)); + + } + if (profile == "role-definer") { + // grants ALL caps to the auth subsystem, read-only on the + // monitor subsystem and nothing else. 
+ profile_grants.push_back(MonCapGrant("mon", MON_CAP_R)); + profile_grants.push_back(MonCapGrant("auth", MON_CAP_ALL)); + } +} + +mon_rwxa_t MonCapGrant::get_allowed(CephContext *cct, + EntityName name, + const std::string& s, const std::string& c, + const map<string,string>& c_args) const +{ + if (profile.length()) { + expand_profile(name); + mon_rwxa_t a; + for (auto p = profile_grants.begin(); + p != profile_grants.end(); ++p) + a = a | p->get_allowed(cct, name, s, c, c_args); + return a; + } + if (service.length()) { + if (service != s) + return 0; + return allow; + } + if (command.length()) { + if (command != c) + return 0; + for (map<string,StringConstraint>::const_iterator p = command_args.begin(); p != command_args.end(); ++p) { + map<string,string>::const_iterator q = c_args.find(p->first); + // argument must be present if a constraint exists + if (q == c_args.end()) + return 0; + switch (p->second.match_type) { + case StringConstraint::MATCH_TYPE_EQUAL: + if (p->second.value != q->second) + return 0; + break; + case StringConstraint::MATCH_TYPE_PREFIX: + if (q->second.find(p->second.value) != 0) + return 0; + break; + case StringConstraint::MATCH_TYPE_REGEX: + try { + std::regex pattern( + p->second.value, std::regex::extended); + if (!std::regex_match(q->second, pattern)) + return 0; + } catch(const std::regex_error&) { + return 0; + } + break; + default: + break; + } + } + return MON_CAP_ALL; + } + // we don't allow config-key service to be accessed with blanket caps other + // than '*' (i.e., 'any'), and that should have been checked by the caller + // via 'is_allow_all()'. + if (s == "config-key") { + return 0; + } + return allow; +} + +ostream& operator<<(ostream&out, const MonCap& m) +{ + for (vector<MonCapGrant>::const_iterator p = m.grants.begin(); p != m.grants.end(); ++p) { + if (p != m.grants.begin()) + out << ", "; + out << *p; + } + return out; +} + +bool MonCap::is_allow_all() const +{ + for (vector<MonCapGrant>::const_iterator p = grants.begin(); p != grants.end(); ++p) + if (p->is_allow_all()) + return true; + return false; +} + +void MonCap::set_allow_all() +{ + grants.clear(); + grants.push_back(MonCapGrant(MON_CAP_ANY)); + text = "allow *"; +} + +bool MonCap::is_capable( + CephContext *cct, + EntityName name, + const string& service, + const string& command, const map<string,string>& command_args, + bool op_may_read, bool op_may_write, bool op_may_exec, + const entity_addr_t& addr) const +{ + if (cct) + ldout(cct, 20) << "is_capable service=" << service << " command=" << command + << (op_may_read ? " read":"") + << (op_may_write ? " write":"") + << (op_may_exec ? 
" exec":"") + << " addr " << addr + << " on cap " << *this + << dendl; + + mon_rwxa_t allow = 0; + for (vector<MonCapGrant>::const_iterator p = grants.begin(); + p != grants.end(); ++p) { + if (cct) + ldout(cct, 20) << " allow so far " << allow << ", doing grant " << *p + << dendl; + + if (p->network.size() && + (!p->network_valid || + !network_contains(p->network_parsed, + p->network_prefix, + addr))) { + continue; + } + + if (p->is_allow_all()) { + if (cct) + ldout(cct, 20) << " allow all" << dendl; + return true; + } + + // check enumerated caps + allow = allow | p->get_allowed(cct, name, service, command, command_args); + if ((!op_may_read || (allow & MON_CAP_R)) && + (!op_may_write || (allow & MON_CAP_W)) && + (!op_may_exec || (allow & MON_CAP_X))) { + if (cct) + ldout(cct, 20) << " match" << dendl; + return true; + } + } + return false; +} + +void MonCap::encode(bufferlist& bl) const +{ + ENCODE_START(4, 4, bl); // legacy MonCaps was 3, 3 + encode(text, bl); + ENCODE_FINISH(bl); +} + +void MonCap::decode(bufferlist::const_iterator& bl) +{ + std::string s; + DECODE_START(4, bl); + decode(s, bl); + DECODE_FINISH(bl); + parse(s, NULL); +} + +void MonCap::dump(Formatter *f) const +{ + f->dump_string("text", text); +} + +void MonCap::generate_test_instances(list<MonCap*>& ls) +{ + ls.push_back(new MonCap); + ls.push_back(new MonCap); + ls.back()->parse("allow *"); + ls.push_back(new MonCap); + ls.back()->parse("allow rwx"); + ls.push_back(new MonCap); + ls.back()->parse("allow service foo x"); + ls.push_back(new MonCap); + ls.back()->parse("allow command bar x"); + ls.push_back(new MonCap); + ls.back()->parse("allow service foo r, allow command bar x"); + ls.push_back(new MonCap); + ls.back()->parse("allow command bar with k1=v1 x"); + ls.push_back(new MonCap); + ls.back()->parse("allow command bar with k1=v1 k2=v2 x"); +} + +// grammar +namespace qi = boost::spirit::qi; +namespace ascii = boost::spirit::ascii; +namespace phoenix = boost::phoenix; + + +template <typename Iterator> +struct MonCapParser : qi::grammar<Iterator, MonCap()> +{ + MonCapParser() : MonCapParser::base_type(moncap) + { + using qi::char_; + using qi::int_; + using qi::ulong_long; + using qi::lexeme; + using qi::alnum; + using qi::_val; + using qi::_1; + using qi::_2; + using qi::_3; + using qi::eps; + using qi::lit; + + quoted_string %= + lexeme['"' >> +(char_ - '"') >> '"'] | + lexeme['\'' >> +(char_ - '\'') >> '\'']; + unquoted_word %= +char_("a-zA-Z0-9_./-"); + str %= quoted_string | unquoted_word; + network_str %= +char_("/.:a-fA-F0-9]["); + fs_name_str %= +char_("a-zA-Z0-9_.-"); + + spaces = +(lit(' ') | lit('\n') | lit('\t')); + + // command := command[=]cmd [k1=v1 k2=v2 ...] 
+ str_match = '=' >> qi::attr(StringConstraint::MATCH_TYPE_EQUAL) >> str; + str_prefix = spaces >> lit("prefix") >> spaces >> + qi::attr(StringConstraint::MATCH_TYPE_PREFIX) >> str; + str_regex = spaces >> lit("regex") >> spaces >> + qi::attr(StringConstraint::MATCH_TYPE_REGEX) >> str; + kv_pair = str >> (str_match | str_prefix | str_regex); + kv_map %= kv_pair >> *(spaces >> kv_pair); + command_match = -spaces >> lit("allow") >> spaces >> lit("command") >> (lit('=') | spaces) + >> qi::attr(string()) >> qi::attr(string()) + >> str + >> -(spaces >> lit("with") >> spaces >> kv_map) + >> qi::attr(0) + >> -(spaces >> lit("network") >> spaces >> network_str); + + // service foo rwxa + service_match %= -spaces >> lit("allow") >> spaces >> lit("service") >> (lit('=') | spaces) + >> str >> qi::attr(string()) >> qi::attr(string()) + >> qi::attr(map<string,StringConstraint>()) + >> spaces >> rwxa + >> -(spaces >> lit("network") >> spaces >> network_str); + + // profile foo + profile_match %= -spaces >> -(lit("allow") >> spaces) + >> lit("profile") >> (lit('=') | spaces) + >> qi::attr(string()) + >> str + >> qi::attr(string()) + >> qi::attr(map<string,StringConstraint>()) + >> qi::attr(0) + >> -(spaces >> lit("network") >> spaces >> network_str); + + // rwxa + rwxa_match %= -spaces >> lit("allow") >> spaces + >> qi::attr(string()) >> qi::attr(string()) >> qi::attr(string()) + >> qi::attr(map<string,StringConstraint>()) + >> rwxa + >> -(spaces >> lit("network") >> spaces >> network_str) + >> -(spaces >> lit("fsname") >> (lit('=') | spaces) >> fs_name_str); + + // rwxa := * | [r][w][x] + rwxa = + (lit("*")[_val = MON_CAP_ANY]) | + (lit("all")[_val = MON_CAP_ANY]) | + ( eps[_val = 0] >> + ( lit('r')[_val |= MON_CAP_R] || + lit('w')[_val |= MON_CAP_W] || + lit('x')[_val |= MON_CAP_X] + ) + ); + + // grant := allow ... + grant = -spaces >> (rwxa_match | profile_match | service_match | command_match) >> -spaces; + + // moncap := grant [grant ...] + grants %= (grant % (*lit(' ') >> (lit(';') | lit(',')) >> *lit(' '))); + moncap = grants [_val = phoenix::construct<MonCap>(_1)]; + + } + qi::rule<Iterator> spaces; + qi::rule<Iterator, unsigned()> rwxa; + qi::rule<Iterator, string()> quoted_string; + qi::rule<Iterator, string()> unquoted_word; + qi::rule<Iterator, string()> str, network_str; + qi::rule<Iterator, string()> fs_name_str; + + qi::rule<Iterator, StringConstraint()> str_match, str_prefix, str_regex; + qi::rule<Iterator, pair<string, StringConstraint>()> kv_pair; + qi::rule<Iterator, map<string, StringConstraint>()> kv_map; + + qi::rule<Iterator, MonCapGrant()> rwxa_match; + qi::rule<Iterator, MonCapGrant()> command_match; + qi::rule<Iterator, MonCapGrant()> service_match; + qi::rule<Iterator, MonCapGrant()> profile_match; + qi::rule<Iterator, MonCapGrant()> grant; + qi::rule<Iterator, std::vector<MonCapGrant>()> grants; + qi::rule<Iterator, MonCap()> moncap; +}; + +bool MonCap::parse(const string& str, ostream *err) +{ + auto iter = str.begin(); + auto end = str.end(); + + MonCapParser<string::const_iterator> exp; + bool r = qi::parse(iter, end, exp, *this); + if (r && iter == end) { + text = str; + for (auto& g : grants) { + g.parse_network(); + } + return true; + } + + // Make sure no grants are kept after parsing failed! 
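+  // e.g. "allow r, allow bogus" parses the first grant and then stops at
+  // "allow bogus"; without this clear() a caller that ignored the false
+  // return value would be left holding the partial "allow r" cap.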
+ grants.clear(); + + if (err) { + if (iter != end) + *err << "mon capability parse failed, stopped at '" + << std::string(iter, end) + << "' of '" << str << "'"; + else + *err << "mon capability parse failed, stopped at end of '" << str << "'"; + } + + return false; +} + diff --git a/src/mon/MonCap.h b/src/mon/MonCap.h new file mode 100644 index 000000000..ab4e35bc9 --- /dev/null +++ b/src/mon/MonCap.h @@ -0,0 +1,227 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_MONCAP_H +#define CEPH_MONCAP_H + +#include <ostream> + +#include "include/common_fwd.h" +#include "include/types.h" +#include "common/entity_name.h" +#include "mds/mdstypes.h" + +static const __u8 MON_CAP_R = (1 << 1); // read +static const __u8 MON_CAP_W = (1 << 2); // write +static const __u8 MON_CAP_X = (1 << 3); // execute +static const __u8 MON_CAP_ALL = MON_CAP_R | MON_CAP_W | MON_CAP_X; +static const __u8 MON_CAP_ANY = 0xff; // * + +struct mon_rwxa_t { + __u8 val; + + // cppcheck-suppress noExplicitConstructor + mon_rwxa_t(__u8 v = 0) : val(v) {} + mon_rwxa_t& operator=(__u8 v) { + val = v; + return *this; + } + operator __u8() const { + return val; + } +}; + +std::ostream& operator<<(std::ostream& out, const mon_rwxa_t& p); + +struct StringConstraint { + enum MatchType { + MATCH_TYPE_NONE, + MATCH_TYPE_EQUAL, + MATCH_TYPE_PREFIX, + MATCH_TYPE_REGEX + }; + + MatchType match_type = MATCH_TYPE_NONE; + std::string value; + + StringConstraint() {} + StringConstraint(MatchType match_type, std::string value) + : match_type(match_type), value(value) { + } +}; + +std::ostream& operator<<(std::ostream& out, const StringConstraint& c); + +struct MonCapGrant { + /* + * A grant can come in one of five forms: + * + * - a blanket allow ('allow rw', 'allow *') + * - this will match against any service and the read/write/exec flags + * in the mon code. semantics of what X means are somewhat ad hoc. + * + * - a service allow ('allow service mds rw') + * - this will match against a specific service and the r/w/x flags. + * + * - a profile ('allow profile osd') + * - this will match against specific monitor-enforced semantics of what + * this type of user should need to do. examples include 'osd', 'mds', + * 'bootstrap-osd'. + * + * - a command ('allow command foo', 'allow command bar with arg1=val1 arg2 prefix val2') + * this includes the command name (the prefix string), and a set + * of key/value pairs that constrain use of that command. if no pairs + * are specified, any arguments are allowed; if a pair is specified, that + * argument must be present and equal or match a prefix. + * + * - an fs name ('allow fsname foo') + * - this will restrict access to MDSMaps in the FSMap to the provided + * fs name. + */ + std::string service; + std::string profile; + std::string command; + std::map<std::string, StringConstraint> command_args; + std::string fs_name; + + // restrict by network + std::string network; + + // these are filled in by parse_network(), called by MonCap::parse() + entity_addr_t network_parsed; + unsigned network_prefix = 0; + bool network_valid = true; + + void parse_network(); + + mon_rwxa_t allow; + + // explicit grants that a profile grant expands to; populated as + // needed by expand_profile() (via is_match()) and cached here. 
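+  // e.g. 'profile simple-rados-client' expands to read-only grants on
+  // the mon, osd and pg services (see expand_profile() in MonCap.cc).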
+ mutable std::list<MonCapGrant> profile_grants; + + void expand_profile(const EntityName& name) const; + + MonCapGrant() : allow(0) {} + // cppcheck-suppress noExplicitConstructor + MonCapGrant(mon_rwxa_t a) : allow(a) {} + MonCapGrant(std::string s, mon_rwxa_t a) : service(std::move(s)), allow(a) {} + // cppcheck-suppress noExplicitConstructor + MonCapGrant(std::string c) : command(std::move(c)) {} + MonCapGrant(std::string c, std::string a, StringConstraint co) : command(std::move(c)) { + command_args[a] = co; + } + MonCapGrant(mon_rwxa_t a, std::string fsname) : fs_name(fsname), allow(a) {} + + /** + * check if given request parameters match our constraints + * + * @param cct context + * @param name entity name + * @param service service (if any) + * @param command command (if any) + * @param command_args command args (if any) + * @return bits we allow + */ + mon_rwxa_t get_allowed(CephContext *cct, + EntityName name, + const std::string& service, + const std::string& command, + const std::map<std::string, std::string>& command_args) const; + + bool is_allow_all() const { + return + allow == MON_CAP_ANY && + service.length() == 0 && + profile.length() == 0 && + command.length() == 0 && + fs_name.empty(); + } +}; + +std::ostream& operator<<(std::ostream& out, const MonCapGrant& g); + +struct MonCap { + std::string text; + std::vector<MonCapGrant> grants; + + MonCap() {} + explicit MonCap(const std::vector<MonCapGrant> &g) : grants(g) {} + + std::string get_str() const { + return text; + } + + bool is_allow_all() const; + void set_allow_all(); + bool parse(const std::string& str, std::ostream *err=NULL); + + /** + * check if we are capable of something + * + * This method actually checks a description of a particular operation against + * what the capability has specified. 
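+ *
+ * For example, a cap parsed from "allow service osd w" behaves as below
+ * (illustrative; name/addr stand for any EntityName / entity_addr_t):
+ *
+ *   MonCap cap;
+ *   cap.parse("allow service osd w");
+ *   cap.is_capable(nullptr, name, "osd", "", {},
+ *                  false, true, false, addr);  // -> true  (write only)
+ *   cap.is_capable(nullptr, name, "osd", "", {},
+ *                  true, true, false, addr);   // -> false (no read bit)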
+ * + * @param service service name + * @param command command id + * @param command_args + * @param op_may_read whether the operation may need to read + * @param op_may_write whether the operation may need to write + * @param op_may_exec whether the operation may exec + * @return true if the operation is allowed, false otherwise + */ + bool is_capable(CephContext *cct, + EntityName name, + const std::string& service, + const std::string& command, + const std::map<std::string, std::string>& command_args, + bool op_may_read, bool op_may_write, bool op_may_exec, + const entity_addr_t& addr) const; + + void encode(ceph::buffer::list& bl) const; + void decode(ceph::buffer::list::const_iterator& bl); + void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list<MonCap*>& ls); + + std::vector<string> allowed_fs_names() const { + std::vector<string> ret; + for (auto& g : grants) { + if (not g.fs_name.empty()) { + ret.push_back(g.fs_name); + } else { + return {}; + } + } + return ret; + } + + bool fs_name_capable(const EntityName& ename, string_view fs_name, + __u8 mask) { + for (auto& g : grants) { + if (g.is_allow_all()) { + return true; + } + + if ((g.fs_name.empty() || g.fs_name == fs_name) && (mask & g.allow)) { + return true; + } + + g.expand_profile(ename); + for (auto& pg : g.profile_grants) { + if ((pg.service == "fs" || pg.service == "mds") && + (pg.fs_name.empty() || pg.fs_name == fs_name) && + (pg.allow & mask)) { + return true; + } + } + } + + return false; + } + +}; +WRITE_CLASS_ENCODER(MonCap) + +std::ostream& operator<<(std::ostream& out, const MonCap& cap); + +#endif diff --git a/src/mon/MonClient.cc b/src/mon/MonClient.cc new file mode 100644 index 000000000..9c637bf8a --- /dev/null +++ b/src/mon/MonClient.cc @@ -0,0 +1,2025 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ *
+ */
+
+#include <algorithm>
+#include <iterator>
+#include <random>
+#include <boost/range/adaptor/map.hpp>
+#include <boost/range/adaptor/filtered.hpp>
+#include <boost/range/algorithm/copy.hpp>
+#include <boost/range/algorithm_ext/copy_n.hpp>
+#include "common/weighted_shuffle.h"
+
+#include "include/random.h"
+#include "include/scope_guard.h"
+#include "include/stringify.h"
+
+#include "messages/MMonGetMap.h"
+#include "messages/MMonGetVersion.h"
+#include "messages/MMonGetVersionReply.h"
+#include "messages/MMonMap.h"
+#include "messages/MConfig.h"
+#include "messages/MGetConfig.h"
+#include "messages/MAuth.h"
+#include "messages/MLogAck.h"
+#include "messages/MAuthReply.h"
+#include "messages/MMonCommand.h"
+#include "messages/MMonCommandAck.h"
+#include "messages/MCommand.h"
+#include "messages/MCommandReply.h"
+#include "messages/MPing.h"
+
+#include "messages/MMonSubscribe.h"
+#include "messages/MMonSubscribeAck.h"
+#include "common/errno.h"
+#include "common/hostname.h"
+#include "common/LogClient.h"
+
+#include "MonClient.h"
+#include "error_code.h"
+#include "MonMap.h"
+
+#include "auth/Auth.h"
+#include "auth/KeyRing.h"
+#include "auth/AuthClientHandler.h"
+#include "auth/AuthRegistry.h"
+#include "auth/RotatingKeyRing.h"
+
+#define dout_subsys ceph_subsys_monc
+#undef dout_prefix
+#define dout_prefix *_dout << "monclient" << (_hunting() ? "(hunting)":"") << ": "
+
+namespace bs = boost::system;
+using std::string;
+using namespace std::literals;
+
+MonClient::MonClient(CephContext *cct_, boost::asio::io_context& service) :
+  Dispatcher(cct_),
+  AuthServer(cct_),
+  messenger(NULL),
+  timer(cct_, monc_lock),
+  service(service),
+  initialized(false),
+  log_client(NULL),
+  more_log_pending(false),
+  want_monmap(true),
+  had_a_connection(false),
+  reopen_interval_multiplier(
+    cct_->_conf.get_val<double>("mon_client_hunt_interval_min_multiple")),
+  last_mon_command_tid(0),
+  version_req_id(0)
+{}
+
+MonClient::~MonClient()
+{
+}
+
+int MonClient::build_initial_monmap()
+{
+  ldout(cct, 10) << __func__ << dendl;
+  int r = monmap.build_initial(cct, false, std::cerr);
+  ldout(cct,10) << "monmap:\n";
+  monmap.print(*_dout);
+  *_dout << dendl;
+  return r;
+}
+
+int MonClient::get_monmap()
+{
+  ldout(cct, 10) << __func__ << dendl;
+  std::unique_lock l(monc_lock);
+
+  sub.want("monmap", 0, 0);
+  if (!_opened())
+    _reopen_session();
+  map_cond.wait(l, [this] { return !want_monmap; });
+  ldout(cct, 10) << __func__ << " done" << dendl;
+  return 0;
+}
+
+int MonClient::get_monmap_and_config()
+{
+  ldout(cct, 10) << __func__ << dendl;
+  ceph_assert(!messenger);
+
+  int tries = 10;
+
+  cct->init_crypto();
+  auto shutdown_crypto = make_scope_guard([this] {
+    cct->shutdown_crypto();
+  });
+
+  int r = build_initial_monmap();
+  if (r < 0) {
+    lderr(cct) << __func__ << " cannot identify monitors to contact" << dendl;
+    return r;
+  }
+
+  messenger = Messenger::create_client_messenger(
+    cct, "temp_mon_client");
+  ceph_assert(messenger);
+  messenger->add_dispatcher_head(this);
+  messenger->start();
+  auto shutdown_msgr = make_scope_guard([this] {
+    messenger->shutdown();
+    messenger->wait();
+    delete messenger;
+    messenger = nullptr;
+    if (!monmap.fsid.is_zero()) {
+      cct->_conf.set_val("fsid", stringify(monmap.fsid));
+    }
+  });
+
+  want_bootstrap_config = true;
+  auto shutdown_config = make_scope_guard([this] {
+    std::unique_lock l(monc_lock);
+    want_bootstrap_config = false;
+    bootstrap_config.reset();
+  });
+
+  ceph::ref_t<MConfig> config;
+  while (tries-- > 0) {
+    r = init();
+    if (r < 0) {
+      return r;
+    }
+    r = authenticate(std::chrono::duration<double>(cct->_conf.get_val<std::chrono::seconds>("client_mount_timeout")).count());
+    if (r == -ETIMEDOUT) {
+      shutdown();
+      continue;
+    }
+    if (r < 0) {
+      break;
+    }
+    {
+      std::unique_lock l(monc_lock);
+      if (monmap.get_epoch() &&
+          !monmap.persistent_features.contains_all(
+            ceph::features::mon::FEATURE_MIMIC)) {
+        ldout(cct,10) << __func__ << " pre-mimic monitor, no config to fetch"
+                      << dendl;
+        r = 0;
+        break;
+      }
+      while ((!bootstrap_config || monmap.get_epoch() == 0) && r == 0) {
+        ldout(cct,20) << __func__ << " waiting for monmap|config" << dendl;
+        auto status = map_cond.wait_for(l, ceph::make_timespan(
+            cct->_conf->mon_client_hunt_interval));
+        if (status == std::cv_status::timeout) {
+          r = -ETIMEDOUT;
+        }
+      }
+
+      if (bootstrap_config) {
+        ldout(cct,10) << __func__ << " success" << dendl;
+        config = std::move(bootstrap_config);
+        r = 0;
+        break;
+      }
+    }
+    lderr(cct) << __func__ << " failed to get config" << dendl;
+    shutdown();
+    continue;
+  }
+
+  if (config) {
+    // apply the bootstrap config to ensure it's applied prior to completing
+    // the bootstrap
+    cct->_conf.set_mon_vals(cct, config->config, config_cb);
+  }
+
+  shutdown();
+  return r;
+}
+
+
+/**
+ * Ping the monitor with id @p mon_id and set the resulting reply in
+ * the provided @p result_reply, if this last parameter is not NULL.
+ *
+ * So that we don't rely on the MonClient's default messenger, set up
+ * during connect(), we create our own messenger to communicate with the
+ * specified monitor.  This is advantageous in the following ways:
+ *
+ * - Isolate the ping procedure from the rest of the MonClient's operations,
+ *   allowing us to not acquire or manage the big monc_lock, thus not
+ *   having to block waiting for some other operation to finish before we
+ *   can proceed.
+ *   * for instance, we can ping mon.FOO even if we are currently hunting
+ *     or blocked waiting for auth to complete with mon.BAR.
+ *
+ * - Ping a monitor prior to establishing a connection (using connect())
+ *   and properly establishing the MonClient's messenger.  This frees us
+ *   from dealing with the complexity that lives in connect().
+ *
+ * We also don't rely on MonClient as a dispatcher for this messenger,
+ * unlike what happens with the MonClient's default messenger. This allows
+ * us to sandbox the whole ping, treating it much like a separate entity in
+ * the MonClient class, considerably simplifying the handling and dispatching
+ * of messages without needing to consider monc_lock.
+ *
+ * The current drawback is that we will establish a messenger for each ping
+ * we want to issue, instead of keeping a single messenger instance that
+ * would be used for all pings.
+ */
+int MonClient::ping_monitor(const string &mon_id, string *result_reply)
+{
+  ldout(cct, 10) << __func__ << dendl;
+
+  string new_mon_id;
+  if (monmap.contains("noname-"+mon_id)) {
+    new_mon_id = "noname-"+mon_id;
+  } else {
+    new_mon_id = mon_id;
+  }
+
+  if (new_mon_id.empty()) {
+    ldout(cct, 10) << __func__ << " specified mon id is empty!" << dendl;
+    return -EINVAL;
+  } else if (!monmap.contains(new_mon_id)) {
+    ldout(cct, 10) << __func__ << " no such monitor 'mon." << new_mon_id << "'"
+                   << dendl;
+    return -ENOENT;
+  }
+
+  // N.B.
monc isn't initialized + + auth_registry.refresh_config(); + + KeyRing keyring; + keyring.from_ceph_context(cct); + RotatingKeyRing rkeyring(cct, cct->get_module_type(), &keyring); + + MonClientPinger *pinger = new MonClientPinger(cct, + &rkeyring, + result_reply); + + Messenger *smsgr = Messenger::create_client_messenger(cct, "temp_ping_client"); + smsgr->add_dispatcher_head(pinger); + smsgr->set_auth_client(pinger); + smsgr->start(); + + ConnectionRef con = smsgr->connect_to_mon(monmap.get_addrs(new_mon_id)); + ldout(cct, 10) << __func__ << " ping mon." << new_mon_id + << " " << con->get_peer_addr() << dendl; + + pinger->mc.reset(new MonConnection(cct, con, 0, &auth_registry)); + pinger->mc->start(monmap.get_epoch(), entity_name); + con->send_message(new MPing); + + int ret = pinger->wait_for_reply(cct->_conf->mon_client_ping_timeout); + if (ret == 0) { + ldout(cct,10) << __func__ << " got ping reply" << dendl; + } else { + ret = -ret; + } + + con->mark_down(); + pinger->mc.reset(); + smsgr->shutdown(); + smsgr->wait(); + delete smsgr; + delete pinger; + return ret; +} + +bool MonClient::ms_dispatch(Message *m) +{ + // we only care about these message types + switch (m->get_type()) { + case CEPH_MSG_MON_MAP: + case CEPH_MSG_AUTH_REPLY: + case CEPH_MSG_MON_SUBSCRIBE_ACK: + case CEPH_MSG_MON_GET_VERSION_REPLY: + case MSG_MON_COMMAND_ACK: + case MSG_COMMAND_REPLY: + case MSG_LOGACK: + case MSG_CONFIG: + break; + case CEPH_MSG_PING: + m->put(); + return true; + default: + return false; + } + + std::lock_guard lock(monc_lock); + + if (!m->get_connection()->is_anon() && + m->get_source().type() == CEPH_ENTITY_TYPE_MON) { + if (_hunting()) { + auto p = _find_pending_con(m->get_connection()); + if (p == pending_cons.end()) { + // ignore any messages outside hunting sessions + ldout(cct, 10) << "discarding stray monitor message " << *m << dendl; + m->put(); + return true; + } + } else if (!active_con || active_con->get_con() != m->get_connection()) { + // ignore any messages outside our session(s) + ldout(cct, 10) << "discarding stray monitor message " << *m << dendl; + m->put(); + return true; + } + } + + switch (m->get_type()) { + case CEPH_MSG_MON_MAP: + handle_monmap(static_cast<MMonMap*>(m)); + if (passthrough_monmap) { + return false; + } else { + m->put(); + } + break; + case CEPH_MSG_AUTH_REPLY: + handle_auth(static_cast<MAuthReply*>(m)); + break; + case CEPH_MSG_MON_SUBSCRIBE_ACK: + handle_subscribe_ack(static_cast<MMonSubscribeAck*>(m)); + break; + case CEPH_MSG_MON_GET_VERSION_REPLY: + handle_get_version_reply(static_cast<MMonGetVersionReply*>(m)); + break; + case MSG_MON_COMMAND_ACK: + handle_mon_command_ack(static_cast<MMonCommandAck*>(m)); + break; + case MSG_COMMAND_REPLY: + if (m->get_connection()->is_anon() && + m->get_source().type() == CEPH_ENTITY_TYPE_MON) { + // this connection is from 'tell'... ignore everything except our command + // reply. (we'll get misc other message because we authenticated, but we + // don't need them.) 
+ handle_command_reply(static_cast<MCommandReply*>(m)); + return true; + } + // leave the message for another dispatch handler (e.g., Objecter) + return false; + case MSG_LOGACK: + if (log_client) { + log_client->handle_log_ack(static_cast<MLogAck*>(m)); + m->put(); + if (more_log_pending) { + send_log(); + } + } else { + m->put(); + } + break; + case MSG_CONFIG: + handle_config(static_cast<MConfig*>(m)); + break; + } + return true; +} + +void MonClient::send_log(bool flush) +{ + if (log_client) { + auto lm = log_client->get_mon_log_message(flush); + if (lm) + _send_mon_message(std::move(lm)); + more_log_pending = log_client->are_pending(); + } +} + +void MonClient::flush_log() +{ + std::lock_guard l(monc_lock); + send_log(); +} + +/* Unlike all the other message-handling functions, we don't put away a reference +* because we want to support MMonMap passthrough to other Dispatchers. */ +void MonClient::handle_monmap(MMonMap *m) +{ + ldout(cct, 10) << __func__ << " " << *m << dendl; + auto con_addrs = m->get_source_addrs(); + string old_name = monmap.get_name(con_addrs); + const auto old_epoch = monmap.get_epoch(); + + auto p = m->monmapbl.cbegin(); + decode(monmap, p); + + ldout(cct, 10) << " got monmap " << monmap.epoch + << " from mon." << old_name + << " (according to old e" << monmap.get_epoch() << ")" + << dendl; + ldout(cct, 10) << "dump:\n"; + monmap.print(*_dout); + *_dout << dendl; + + if (old_epoch != monmap.get_epoch()) { + tried.clear(); + } + if (old_name.size() == 0) { + ldout(cct,10) << " can't identify which mon we were connected to" << dendl; + _reopen_session(); + } else { + auto new_name = monmap.get_name(con_addrs); + if (new_name.empty()) { + ldout(cct, 10) << "mon." << old_name << " at " << con_addrs + << " went away" << dendl; + // can't find the mon we were talking to (above) + _reopen_session(); + } else if (messenger->should_use_msgr2() && + monmap.get_addrs(new_name).has_msgr2() && + !con_addrs.has_msgr2()) { + ldout(cct,1) << " mon." << new_name << " has (v2) addrs " + << monmap.get_addrs(new_name) << " but i'm connected to " + << con_addrs << ", reconnecting" << dendl; + _reopen_session(); + } + } + + cct->set_mon_addrs(monmap); + + sub.got("monmap", monmap.get_epoch()); + map_cond.notify_all(); + want_monmap = false; + + if (authenticate_err == 1) { + _finish_auth(0); + } +} + +void MonClient::handle_config(MConfig *m) +{ + ldout(cct,10) << __func__ << " " << *m << dendl; + + if (want_bootstrap_config) { + // get_monmap_and_config is waiting for config which it will apply + // synchronously + bootstrap_config = ceph::ref_t<MConfig>(m, false); + map_cond.notify_all(); + return; + } + + // Take the sledgehammer approach to ensuring we don't depend on + // anything in MonClient. + boost::asio::post(finish_strand, + [m, cct = boost::intrusive_ptr<CephContext>(cct), + config_notify_cb = config_notify_cb, + config_cb = config_cb]() { + cct->_conf.set_mon_vals(cct.get(), m->config, config_cb); + if (config_notify_cb) { + config_notify_cb(); + } + m->put(); + }); +} + +// ---------------------- + +int MonClient::init() +{ + ldout(cct, 10) << __func__ << dendl; + + entity_name = cct->_conf->name; + + auth_registry.refresh_config(); + + std::lock_guard l(monc_lock); + keyring.reset(new KeyRing); + if (auth_registry.is_supported_method(messenger->get_mytype(), + CEPH_AUTH_CEPHX)) { + // this should succeed, because auth_registry just checked! 
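+  // (from_ceph_context() loads the keyring named by the "keyring" conf
+  // option, e.g. /etc/ceph/$cluster.$name.keyring by default.)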
+ int r = keyring->from_ceph_context(cct); + if (r != 0) { + // but be somewhat graceful in case there was a race condition + lderr(cct) << "keyring not found" << dendl; + return r; + } + } + if (!auth_registry.any_supported_methods(messenger->get_mytype())) { + return -ENOENT; + } + + rotating_secrets.reset( + new RotatingKeyRing(cct, cct->get_module_type(), keyring.get())); + + initialized = true; + + messenger->set_auth_client(this); + messenger->add_dispatcher_head(this); + + timer.init(); + schedule_tick(); + + return 0; +} + +void MonClient::shutdown() +{ + ldout(cct, 10) << __func__ << dendl; + monc_lock.lock(); + stopping = true; + while (!version_requests.empty()) { + ceph::async::post(std::move(version_requests.begin()->second), + monc_errc::shutting_down, 0, 0); + ldout(cct, 20) << __func__ << " canceling and discarding version request " + << version_requests.begin()->first << dendl; + version_requests.erase(version_requests.begin()); + } + while (!mon_commands.empty()) { + auto tid = mon_commands.begin()->first; + _cancel_mon_command(tid); + } + ldout(cct, 20) << __func__ << " discarding " << waiting_for_session.size() + << " pending message(s)" << dendl; + waiting_for_session.clear(); + + active_con.reset(); + pending_cons.clear(); + + auth.reset(); + global_id = 0; + authenticate_err = 0; + authenticated = false; + + monc_lock.unlock(); + + if (initialized) { + initialized = false; + } + monc_lock.lock(); + timer.shutdown(); + stopping = false; + monc_lock.unlock(); +} + +int MonClient::authenticate(double timeout) +{ + std::unique_lock lock{monc_lock}; + + if (active_con) { + ldout(cct, 5) << "already authenticated" << dendl; + return 0; + } + sub.want("monmap", monmap.get_epoch() ? monmap.get_epoch() + 1 : 0, 0); + sub.want("config", 0, 0); + if (!_opened()) + _reopen_session(); + + auto until = ceph::real_clock::now(); + until += ceph::make_timespan(timeout); + if (timeout > 0.0) + ldout(cct, 10) << "authenticate will time out at " << until << dendl; + while (!active_con && authenticate_err >= 0) { + if (timeout > 0.0) { + auto r = auth_cond.wait_until(lock, until); + if (r == std::cv_status::timeout && !active_con) { + ldout(cct, 0) << "authenticate timed out after " << timeout << dendl; + authenticate_err = -ETIMEDOUT; + } + } else { + auth_cond.wait(lock); + } + } + + if (active_con) { + ldout(cct, 5) << __func__ << " success, global_id " + << active_con->get_global_id() << dendl; + // active_con should not have been set if there was an error + ceph_assert(authenticate_err >= 0); + authenticated = true; + } + + if (authenticate_err < 0 && auth_registry.no_keyring_disabled_cephx()) { + lderr(cct) << __func__ << " NOTE: no keyring found; disabled cephx authentication" << dendl; + } + + return authenticate_err; +} + +void MonClient::handle_auth(MAuthReply *m) +{ + ceph_assert(ceph_mutex_is_locked(monc_lock)); + + if (m->get_connection()->is_anon()) { + // anon connection, used for mon tell commands + for (auto& p : mon_commands) { + if (p.second->target_con == m->get_connection()) { + auto& mc = p.second->target_session; + int ret = mc->handle_auth(m, entity_name, + CEPH_ENTITY_TYPE_MON, + rotating_secrets.get()); + (void)ret; // we don't care + break; + } + } + m->put(); + return; + } + + if (!_hunting()) { + std::swap(active_con->get_auth(), auth); + int ret = active_con->authenticate(m); + m->put(); + std::swap(auth, active_con->get_auth()); + if (global_id != active_con->get_global_id()) { + lderr(cct) << __func__ << " peer assigned me a different global_id: " + << 
active_con->get_global_id() << dendl; + } + if (ret != -EAGAIN) { + _finish_auth(ret); + } + return; + } + + // hunting + auto found = _find_pending_con(m->get_connection()); + ceph_assert(found != pending_cons.end()); + int auth_err = found->second.handle_auth(m, entity_name, want_keys, + rotating_secrets.get()); + m->put(); + if (auth_err == -EAGAIN) { + return; + } + if (auth_err) { + pending_cons.erase(found); + if (!pending_cons.empty()) { + // keep trying with pending connections + return; + } + // the last try just failed, give up. + } else { + auto& mc = found->second; + ceph_assert(mc.have_session()); + active_con.reset(new MonConnection(std::move(mc))); + pending_cons.clear(); + } + + _finish_hunting(auth_err); + _finish_auth(auth_err); +} + +void MonClient::_finish_auth(int auth_err) +{ + ldout(cct,10) << __func__ << " " << auth_err << dendl; + authenticate_err = auth_err; + // _resend_mon_commands() could _reopen_session() if the connected mon is not + // the one the MonCommand is targeting. + if (!auth_err && active_con) { + ceph_assert(auth); + _check_auth_tickets(); + } + auth_cond.notify_all(); +} + +// --------- + +void MonClient::send_mon_message(MessageRef m) +{ + std::lock_guard l{monc_lock}; + _send_mon_message(std::move(m)); +} + +void MonClient::_send_mon_message(MessageRef m) +{ + ceph_assert(ceph_mutex_is_locked(monc_lock)); + if (active_con) { + auto cur_con = active_con->get_con(); + ldout(cct, 10) << "_send_mon_message to mon." + << monmap.get_name(cur_con->get_peer_addr()) + << " at " << cur_con->get_peer_addr() << dendl; + cur_con->send_message2(std::move(m)); + } else { + waiting_for_session.push_back(std::move(m)); + } +} + +void MonClient::_reopen_session(int rank) +{ + ceph_assert(ceph_mutex_is_locked(monc_lock)); + ldout(cct, 10) << __func__ << " rank " << rank << dendl; + + active_con.reset(); + pending_cons.clear(); + + authenticate_err = 1; // == in progress + + _start_hunting(); + + if (rank >= 0) { + _add_conn(rank); + } else { + _add_conns(); + } + + // throw out old queued messages + waiting_for_session.clear(); + + // throw out version check requests + while (!version_requests.empty()) { + ceph::async::post(std::move(version_requests.begin()->second), + monc_errc::session_reset, 0, 0); + version_requests.erase(version_requests.begin()); + } + + for (auto& c : pending_cons) { + c.second.start(monmap.get_epoch(), entity_name); + } + + if (sub.reload()) { + _renew_subs(); + } +} + +void MonClient::_add_conn(unsigned rank) +{ + auto peer = monmap.get_addrs(rank); + auto conn = messenger->connect_to_mon(peer); + MonConnection mc(cct, conn, global_id, &auth_registry); + if (auth) { + mc.get_auth().reset(auth->clone()); + } + pending_cons.insert(std::make_pair(peer, std::move(mc))); + ldout(cct, 10) << "picked mon." 
<< monmap.get_name(rank) + << " con " << conn + << " addr " << peer + << dendl; +} + +void MonClient::_add_conns() +{ + // collect the next batch of candidates who are listed right next to the ones + // already tried + auto get_next_batch = [this]() -> std::vector<unsigned> { + std::multimap<uint16_t, unsigned> ranks_by_priority; + boost::copy( + monmap.mon_info | boost::adaptors::filtered( + [this](auto& info) { + auto rank = monmap.get_rank(info.first); + return tried.count(rank) == 0; + }) | boost::adaptors::transformed( + [this](auto& info) { + auto rank = monmap.get_rank(info.first); + return std::make_pair(info.second.priority, rank); + }), std::inserter(ranks_by_priority, end(ranks_by_priority))); + if (ranks_by_priority.empty()) { + return {}; + } + // only choose the monitors with lowest priority + auto cands = boost::make_iterator_range( + ranks_by_priority.equal_range(ranks_by_priority.begin()->first)); + std::vector<unsigned> ranks; + boost::range::copy(cands | boost::adaptors::map_values, + std::back_inserter(ranks)); + return ranks; + }; + auto ranks = get_next_batch(); + if (ranks.empty()) { + tried.clear(); // start over + ranks = get_next_batch(); + } + ceph_assert(!ranks.empty()); + if (ranks.size() > 1) { + std::vector<uint16_t> weights; + for (auto i : ranks) { + auto rank_name = monmap.get_name(i); + weights.push_back(monmap.get_weight(rank_name)); + } + random_device_t rd; + if (std::accumulate(begin(weights), end(weights), 0u) == 0) { + std::shuffle(begin(ranks), end(ranks), std::mt19937{rd()}); + } else { + weighted_shuffle(begin(ranks), end(ranks), begin(weights), end(weights), + std::mt19937{rd()}); + } + } + ldout(cct, 10) << __func__ << " ranks=" << ranks << dendl; + unsigned n = cct->_conf->mon_client_hunt_parallel; + if (n == 0 || n > ranks.size()) { + n = ranks.size(); + } + for (unsigned i = 0; i < n; i++) { + _add_conn(ranks[i]); + tried.insert(ranks[i]); + } +} + +bool MonClient::ms_handle_reset(Connection *con) +{ + std::lock_guard lock(monc_lock); + + if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON) + return false; + + if (con->is_anon()) { + auto p = mon_commands.begin(); + while (p != mon_commands.end()) { + auto cmd = p->second; + ++p; + if (cmd->target_con == con) { + _send_command(cmd); // may retry or fail + break; + } + } + return true; + } + + if (_hunting()) { + if (pending_cons.count(con->get_peer_addrs())) { + ldout(cct, 10) << __func__ << " hunted mon " << con->get_peer_addrs() + << dendl; + } else { + ldout(cct, 10) << __func__ << " stray mon " << con->get_peer_addrs() + << dendl; + } + return true; + } else { + if (active_con && con == active_con->get_con()) { + ldout(cct, 10) << __func__ << " current mon " << con->get_peer_addrs() + << dendl; + _reopen_session(); + return false; + } else { + ldout(cct, 10) << "ms_handle_reset stray mon " << con->get_peer_addrs() + << dendl; + return true; + } + } +} + +bool MonClient::_opened() const +{ + ceph_assert(ceph_mutex_is_locked(monc_lock)); + return active_con || _hunting(); +} + +bool MonClient::_hunting() const +{ + return !pending_cons.empty(); +} + +void MonClient::_start_hunting() +{ + ceph_assert(!_hunting()); + // adjust timeouts if necessary + if (!had_a_connection) + return; + reopen_interval_multiplier *= cct->_conf->mon_client_hunt_interval_backoff; + if (reopen_interval_multiplier > + cct->_conf->mon_client_hunt_interval_max_multiple) { + reopen_interval_multiplier = + cct->_conf->mon_client_hunt_interval_max_multiple; + } +} + +void MonClient::_finish_hunting(int auth_err) +{ + 
ldout(cct,10) << __func__ << " " << auth_err << dendl;
+  ceph_assert(ceph_mutex_is_locked(monc_lock));
+  // the pending conns have been cleaned.
+  ceph_assert(!_hunting());
+  if (active_con) {
+    auto con = active_con->get_con();
+    ldout(cct, 1) << "found mon."
+                  << monmap.get_name(con->get_peer_addr())
+                  << dendl;
+  } else {
+    ldout(cct, 1) << "no mon sessions established" << dendl;
+  }
+
+  had_a_connection = true;
+  _un_backoff();
+
+  if (!auth_err) {
+    last_rotating_renew_sent = utime_t();
+    while (!waiting_for_session.empty()) {
+      _send_mon_message(std::move(waiting_for_session.front()));
+      waiting_for_session.pop_front();
+    }
+    _resend_mon_commands();
+    send_log(true);
+    if (active_con) {
+      auth = std::move(active_con->get_auth());
+      if (global_id && global_id != active_con->get_global_id()) {
+        lderr(cct) << __func__ << " global_id changed from " << global_id
+                   << " to " << active_con->get_global_id() << dendl;
+      }
+      global_id = active_con->get_global_id();
+    }
+  }
+}
+
+void MonClient::tick()
+{
+  ldout(cct, 10) << __func__ << dendl;
+
+  utime_t now = ceph_clock_now();
+
+  auto reschedule_tick = make_scope_guard([this] {
+    schedule_tick();
+  });
+
+  _check_auth_tickets();
+  _check_tell_commands();
+
+  if (_hunting()) {
+    ldout(cct, 1) << "continuing hunt" << dendl;
+    return _reopen_session();
+  } else if (active_con) {
+    // just renew as needed
+    auto cur_con = active_con->get_con();
+    if (!cur_con->has_feature(CEPH_FEATURE_MON_STATEFUL_SUB)) {
+      const bool maybe_renew = sub.need_renew();
+      ldout(cct, 10) << "renew subs? -- " << (maybe_renew ? "yes" : "no")
+                     << dendl;
+      if (maybe_renew) {
+        _renew_subs();
+      }
+    }
+
+    if (now > last_keepalive + cct->_conf->mon_client_ping_interval) {
+      cur_con->send_keepalive();
+      last_keepalive = now;
+
+      if (cct->_conf->mon_client_ping_timeout > 0 &&
+          cur_con->has_feature(CEPH_FEATURE_MSGR_KEEPALIVE2)) {
+        utime_t lk = cur_con->get_last_keepalive_ack();
+        utime_t interval = now - lk;
+        if (interval > cct->_conf->mon_client_ping_timeout) {
+          ldout(cct, 1) << "no keepalive since " << lk << " (" << interval
+                        << " seconds), reconnecting" << dendl;
+          return _reopen_session();
+        }
+      }
+
+      _un_backoff();
+    }
+
+    if (now > last_send_log + cct->_conf->mon_client_log_interval) {
+      send_log();
+      last_send_log = now;
+    }
+  }
+}
+
+void MonClient::_un_backoff()
+{
+  // un-backoff our reconnect interval
+  reopen_interval_multiplier = std::max(
+    cct->_conf.get_val<double>("mon_client_hunt_interval_min_multiple"),
+    reopen_interval_multiplier /
+    cct->_conf.get_val<double>("mon_client_hunt_interval_backoff"));
+  ldout(cct, 20) << __func__ << " reopen_interval_multiplier now "
+                 << reopen_interval_multiplier << dendl;
+}
+
+void MonClient::schedule_tick()
+{
+  auto do_tick = make_lambda_context([this](int) { tick(); });
+  if (!is_connected()) {
+    // start another round of hunting
+    const auto hunt_interval = (cct->_conf->mon_client_hunt_interval *
+                                reopen_interval_multiplier);
+    timer.add_event_after(hunt_interval, do_tick);
+  } else {
+    // keep in touch
+    timer.add_event_after(std::min(cct->_conf->mon_client_ping_interval,
+                                   cct->_conf->mon_client_log_interval),
+                          do_tick);
+  }
+}
+
+// ---------
+
+void MonClient::_renew_subs()
+{
+  ceph_assert(ceph_mutex_is_locked(monc_lock));
+  if (!sub.have_new()) {
+    ldout(cct, 10) << __func__ << " - empty" << dendl;
+    return;
+  }
+
+  ldout(cct, 10) << __func__ << dendl;
+  if (!_opened())
+    _reopen_session();
+  else {
+    auto m = ceph::make_message<MMonSubscribe>();
+    m->what = sub.get_subs();
+    m->hostname =
ceph_get_short_hostname(); + _send_mon_message(std::move(m)); + sub.renewed(); + } +} + +void MonClient::handle_subscribe_ack(MMonSubscribeAck *m) +{ + sub.acked(m->interval); + m->put(); +} + +int MonClient::_check_auth_tickets() +{ + ceph_assert(ceph_mutex_is_locked(monc_lock)); + if (active_con && auth) { + if (auth->need_tickets()) { + ldout(cct, 10) << __func__ << " getting new tickets!" << dendl; + auto m = ceph::make_message<MAuth>(); + m->protocol = auth->get_protocol(); + auth->prepare_build_request(); + auth->build_request(m->auth_payload); + _send_mon_message(m); + } + + _check_auth_rotating(); + } + return 0; +} + +int MonClient::_check_auth_rotating() +{ + ceph_assert(ceph_mutex_is_locked(monc_lock)); + if (!rotating_secrets || + !auth_principal_needs_rotating_keys(entity_name)) { + ldout(cct, 20) << "_check_auth_rotating not needed by " << entity_name << dendl; + return 0; + } + + if (!active_con || !auth) { + ldout(cct, 10) << "_check_auth_rotating waiting for auth session" << dendl; + return 0; + } + + utime_t now = ceph_clock_now(); + utime_t cutoff = now; + cutoff -= std::min(30.0, cct->_conf->auth_service_ticket_ttl / 4.0); + utime_t issued_at_lower_bound = now; + issued_at_lower_bound -= cct->_conf->auth_service_ticket_ttl; + if (!rotating_secrets->need_new_secrets(cutoff)) { + ldout(cct, 10) << "_check_auth_rotating have uptodate secrets (they expire after " << cutoff << ")" << dendl; + rotating_secrets->dump_rotating(); + return 0; + } + + ldout(cct, 10) << "_check_auth_rotating renewing rotating keys (they expired before " << cutoff << ")" << dendl; + if (!rotating_secrets->need_new_secrets() && + rotating_secrets->need_new_secrets(issued_at_lower_bound)) { + // the key has expired before it has been issued? + lderr(cct) << __func__ << " possible clock skew, rotating keys expired way too early" + << " (before " << issued_at_lower_bound << ")" << dendl; + } + if ((now > last_rotating_renew_sent) && + double(now - last_rotating_renew_sent) < 1) { + ldout(cct, 10) << __func__ << " called too often (last: " + << last_rotating_renew_sent << "), skipping refresh" << dendl; + return 0; + } + auto m = ceph::make_message<MAuth>(); + m->protocol = auth->get_protocol(); + if (auth->build_rotating_request(m->auth_payload)) { + last_rotating_renew_sent = now; + _send_mon_message(std::move(m)); + } + return 0; +} + +int MonClient::wait_auth_rotating(double timeout) +{ + std::unique_lock l(monc_lock); + + // Must be initialized + ceph_assert(auth != nullptr); + + if (auth->get_protocol() == CEPH_AUTH_NONE) + return 0; + + if (!rotating_secrets) + return 0; + + ldout(cct, 10) << __func__ << " waiting for " << timeout << dendl; + utime_t cutoff = ceph_clock_now(); + cutoff -= std::min(30.0, cct->_conf->auth_service_ticket_ttl / 4.0); + if (auth_cond.wait_for(l, ceph::make_timespan(timeout), [this, cutoff] { + return (!auth_principal_needs_rotating_keys(entity_name) || + !rotating_secrets->need_new_secrets(cutoff)); + })) { + ldout(cct, 10) << __func__ << " done" << dendl; + return 0; + } else { + ldout(cct, 0) << __func__ << " timed out after " << timeout << dendl; + return -ETIMEDOUT; + } +} + +// --------- + +void MonClient::_send_command(MonCommand *r) +{ + if (r->is_tell()) { + ++r->send_attempts; + if (r->send_attempts > cct->_conf->mon_client_directed_command_retry) { + _finish_command(r, monc_errc::mon_unavailable, "mon unavailable", {}); + return; + } + // tell-style command + if (monmap.min_mon_release >= ceph_release_t::octopus) { + if (r->target_con) { + 
r->target_con->mark_down();
+      }
+      if (r->target_rank >= 0) {
+        if (r->target_rank >= (int)monmap.size()) {
+          ldout(cct, 10) << " target " << r->target_rank
+                         << " >= max mon " << monmap.size() << dendl;
+          _finish_command(r, monc_errc::rank_dne, "mon rank dne"sv, {});
+          return;
+        }
+        r->target_con = messenger->connect_to_mon(
+          monmap.get_addrs(r->target_rank), true /* anon */);
+      } else {
+        if (!monmap.contains(r->target_name)) {
+          ldout(cct, 10) << " target " << r->target_name
+                         << " not present in monmap" << dendl;
+          _finish_command(r, monc_errc::mon_dne, "mon dne"sv, {});
+          return;
+        }
+        r->target_con = messenger->connect_to_mon(
+          monmap.get_addrs(r->target_name), true /* anon */);
+      }
+
+      r->target_session.reset(new MonConnection(cct, r->target_con, 0,
+                                                &auth_registry));
+      r->target_session->start(monmap.get_epoch(), entity_name);
+      r->last_send_attempt = ceph_clock_now();
+
+      MCommand *m = new MCommand(monmap.fsid);
+      m->set_tid(r->tid);
+      m->cmd = r->cmd;
+      m->set_data(r->inbl);
+      r->target_session->queue_command(m);
+      return;
+    }
+
+    // ugly legacy handling of pre-octopus mons
+    entity_addr_t peer;
+    if (active_con) {
+      peer = active_con->get_con()->get_peer_addr();
+    }
+
+    if (r->target_rank >= 0 &&
+        r->target_rank != monmap.get_rank(peer)) {
+      ldout(cct, 10) << __func__ << " " << r->tid << " " << r->cmd
+                     << " wants rank " << r->target_rank
+                     << ", reopening session"
+                     << dendl;
+      if (r->target_rank >= (int)monmap.size()) {
+        ldout(cct, 10) << " target " << r->target_rank
+                       << " >= max mon " << monmap.size() << dendl;
+        _finish_command(r, monc_errc::rank_dne, "mon rank dne"sv, {});
+        return;
+      }
+      _reopen_session(r->target_rank);
+      return;
+    }
+    if (r->target_name.length() &&
+        r->target_name != monmap.get_name(peer)) {
+      ldout(cct, 10) << __func__ << " " << r->tid << " " << r->cmd
+                     << " wants mon " << r->target_name
+                     << ", reopening session"
+                     << dendl;
+      if (!monmap.contains(r->target_name)) {
+        ldout(cct, 10) << " target " << r->target_name
+                       << " not present in monmap" << dendl;
+        _finish_command(r, monc_errc::mon_dne, "mon dne"sv, {});
+        return;
+      }
+      _reopen_session(monmap.get_rank(r->target_name));
+      return;
+    }
+    // fall-thru to send 'normal' CLI command
+  }
+
+  // normal CLI command
+  ldout(cct, 10) << __func__ << " " << r->tid << " " << r->cmd << dendl;
+  auto m = ceph::make_message<MMonCommand>(monmap.fsid);
+  m->set_tid(r->tid);
+  m->cmd = r->cmd;
+  m->set_data(r->inbl);
+  _send_mon_message(std::move(m));
+  return;
+}
+
+void MonClient::_check_tell_commands()
+{
+  // resend any requests
+  auto now = ceph_clock_now();
+  auto p = mon_commands.begin();
+  while (p != mon_commands.end()) {
+    auto cmd = p->second;
+    ++p;
+    if (cmd->is_tell() &&
+        cmd->last_send_attempt != utime_t() &&
+        now - cmd->last_send_attempt > cct->_conf->mon_client_hunt_interval) {
+      ldout(cct,5) << __func__ << " timeout tell command " << cmd->tid << dendl;
+      _send_command(cmd); // might remove cmd from mon_commands
+    }
+  }
+}
+
+void MonClient::_resend_mon_commands()
+{
+  // resend any requests
+  auto p = mon_commands.begin();
+  while (p != mon_commands.end()) {
+    auto cmd = p->second;
+    ++p;
+    if (cmd->is_tell() && monmap.min_mon_release >= ceph_release_t::octopus) {
+      // starting with octopus, tell commands use their own connection and
+      // need no special resend when we finish hunting.
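+      // (their retry is driven by _check_tell_commands() above, which
+      // resends once a command has gone mon_client_hunt_interval without
+      // an ack.)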
+ } else { + _send_command(cmd); // might remove cmd from mon_commands + } + } +} + +void MonClient::handle_mon_command_ack(MMonCommandAck *ack) +{ + MonCommand *r = NULL; + uint64_t tid = ack->get_tid(); + + if (tid == 0 && !mon_commands.empty()) { + r = mon_commands.begin()->second; + ldout(cct, 10) << __func__ << " has tid 0, assuming it is " << r->tid << dendl; + } else { + auto p = mon_commands.find(tid); + if (p == mon_commands.end()) { + ldout(cct, 10) << __func__ << " " << ack->get_tid() << " not found" << dendl; + ack->put(); + return; + } + r = p->second; + } + + ldout(cct, 10) << __func__ << " " << r->tid << " " << r->cmd << dendl; + auto ec = ack->r < 0 ? bs::error_code(-ack->r, mon_category()) + : bs::error_code(); + _finish_command(r, ec, ack->rs, + std::move(ack->get_data())); + ack->put(); +} + +void MonClient::handle_command_reply(MCommandReply *reply) +{ + MonCommand *r = NULL; + uint64_t tid = reply->get_tid(); + + if (tid == 0 && !mon_commands.empty()) { + r = mon_commands.begin()->second; + ldout(cct, 10) << __func__ << " has tid 0, assuming it is " << r->tid + << dendl; + } else { + auto p = mon_commands.find(tid); + if (p == mon_commands.end()) { + ldout(cct, 10) << __func__ << " " << reply->get_tid() << " not found" + << dendl; + reply->put(); + return; + } + r = p->second; + } + + ldout(cct, 10) << __func__ << " " << r->tid << " " << r->cmd << dendl; + auto ec = reply->r < 0 ? bs::error_code(-reply->r, mon_category()) + : bs::error_code(); + _finish_command(r, ec, reply->rs, std::move(reply->get_data())); + reply->put(); +} + +int MonClient::_cancel_mon_command(uint64_t tid) +{ + ceph_assert(ceph_mutex_is_locked(monc_lock)); + + auto it = mon_commands.find(tid); + if (it == mon_commands.end()) { + ldout(cct, 10) << __func__ << " tid " << tid << " dne" << dendl; + return -ENOENT; + } + + ldout(cct, 10) << __func__ << " tid " << tid << dendl; + + MonCommand *cmd = it->second; + _finish_command(cmd, monc_errc::timed_out, "timed out"sv, {}); + return 0; +} + +void MonClient::_finish_command(MonCommand *r, bs::error_code ret, + std::string_view rs, ceph::buffer::list&& bl) +{ + ldout(cct, 10) << __func__ << " " << r->tid << " = " << ret << " " << rs + << dendl; + ceph::async::post(std::move(r->onfinish), ret, std::string(rs), + std::move(bl)); + if (r->target_con) { + r->target_con->mark_down(); + } + mon_commands.erase(r->tid); + delete r; +} + +// --------- + +void MonClient::handle_get_version_reply(MMonGetVersionReply* m) +{ + ceph_assert(ceph_mutex_is_locked(monc_lock)); + auto iter = version_requests.find(m->handle); + if (iter == version_requests.end()) { + ldout(cct, 0) << __func__ << " version request with handle " << m->handle + << " not found" << dendl; + } else { + auto req = std::move(iter->second); + ldout(cct, 10) << __func__ << " finishing " << iter->first << " version " + << m->version << dendl; + version_requests.erase(iter); + ceph::async::post(std::move(req), bs::error_code(), + m->version, m->oldest_version); + } + m->put(); +} + +int MonClient::get_auth_request( + Connection *con, + AuthConnectionMeta *auth_meta, + uint32_t *auth_method, + std::vector<uint32_t> *preferred_modes, + ceph::buffer::list *bl) +{ + std::lock_guard l(monc_lock); + ldout(cct,10) << __func__ << " con " << con << " auth_method " << *auth_method + << dendl; + + // connection to mon? 
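+  // (three possibilities: an anon 'tell' connection owned by a pending
+  // MonCommand, a hunting connection in pending_cons, or an unknown
+  // connection, for which we return -ENOENT below.)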
+ if (con->get_peer_type() == CEPH_ENTITY_TYPE_MON) { + ceph_assert(!auth_meta->authorizer); + if (con->is_anon()) { + for (auto& i : mon_commands) { + if (i.second->target_con == con) { + return i.second->target_session->get_auth_request( + auth_method, preferred_modes, bl, + entity_name, want_keys, rotating_secrets.get()); + } + } + } + for (auto& i : pending_cons) { + if (i.second.is_con(con)) { + return i.second.get_auth_request( + auth_method, preferred_modes, bl, + entity_name, want_keys, rotating_secrets.get()); + } + } + return -ENOENT; + } + + // generate authorizer + if (!auth) { + lderr(cct) << __func__ << " but no auth handler is set up" << dendl; + return -EACCES; + } + auth_meta->authorizer.reset(auth->build_authorizer(con->get_peer_type())); + if (!auth_meta->authorizer) { + lderr(cct) << __func__ << " failed to build_authorizer for type " + << ceph_entity_type_name(con->get_peer_type()) << dendl; + return -EACCES; + } + auth_meta->auth_method = auth_meta->authorizer->protocol; + auth_registry.get_supported_modes(con->get_peer_type(), + auth_meta->auth_method, + preferred_modes); + *bl = auth_meta->authorizer->bl; + return 0; +} + +int MonClient::handle_auth_reply_more( + Connection *con, + AuthConnectionMeta *auth_meta, + const ceph::buffer::list& bl, + ceph::buffer::list *reply) +{ + std::lock_guard l(monc_lock); + + if (con->get_peer_type() == CEPH_ENTITY_TYPE_MON) { + if (con->is_anon()) { + for (auto& i : mon_commands) { + if (i.second->target_con == con) { + return i.second->target_session->handle_auth_reply_more( + auth_meta, bl, reply); + } + } + } + for (auto& i : pending_cons) { + if (i.second.is_con(con)) { + return i.second.handle_auth_reply_more(auth_meta, bl, reply); + } + } + return -ENOENT; + } + + // authorizer challenges + if (!auth || !auth_meta->authorizer) { + lderr(cct) << __func__ << " no authorizer?" 
<< dendl; + return -1; + } + auth_meta->authorizer->add_challenge(cct, bl); + *reply = auth_meta->authorizer->bl; + return 0; +} + +int MonClient::handle_auth_done( + Connection *con, + AuthConnectionMeta *auth_meta, + uint64_t global_id, + uint32_t con_mode, + const ceph::buffer::list& bl, + CryptoKey *session_key, + std::string *connection_secret) +{ + if (con->get_peer_type() == CEPH_ENTITY_TYPE_MON) { + std::lock_guard l(monc_lock); + if (con->is_anon()) { + for (auto& i : mon_commands) { + if (i.second->target_con == con) { + return i.second->target_session->handle_auth_done( + auth_meta, global_id, bl, + session_key, connection_secret); + } + } + } + for (auto& i : pending_cons) { + if (i.second.is_con(con)) { + int r = i.second.handle_auth_done( + auth_meta, global_id, bl, + session_key, connection_secret); + if (r) { + pending_cons.erase(i.first); + if (!pending_cons.empty()) { + return r; + } + } else { + active_con.reset(new MonConnection(std::move(i.second))); + pending_cons.clear(); + ceph_assert(active_con->have_session()); + } + + _finish_hunting(r); + if (r || monmap.get_epoch() > 0) { + _finish_auth(r); + } + return r; + } + } + return -ENOENT; + } else { + // verify authorizer reply + auto p = bl.begin(); + if (!auth_meta->authorizer->verify_reply(p, &auth_meta->connection_secret)) { + ldout(cct, 0) << __func__ << " failed verifying authorizer reply" + << dendl; + return -EACCES; + } + auth_meta->session_key = auth_meta->authorizer->session_key; + return 0; + } +} + +int MonClient::handle_auth_bad_method( + Connection *con, + AuthConnectionMeta *auth_meta, + uint32_t old_auth_method, + int result, + const std::vector<uint32_t>& allowed_methods, + const std::vector<uint32_t>& allowed_modes) +{ + auth_meta->allowed_methods = allowed_methods; + + std::lock_guard l(monc_lock); + if (con->get_peer_type() == CEPH_ENTITY_TYPE_MON) { + if (con->is_anon()) { + for (auto& i : mon_commands) { + if (i.second->target_con == con) { + int r = i.second->target_session->handle_auth_bad_method( + old_auth_method, + result, + allowed_methods, + allowed_modes); + if (r < 0) { + auto ec = bs::error_code(-r, mon_category()); + _finish_command(i.second, ec, "auth failed"sv, {}); + } + return r; + } + } + } + for (auto& i : pending_cons) { + if (i.second.is_con(con)) { + int r = i.second.handle_auth_bad_method(old_auth_method, + result, + allowed_methods, + allowed_modes); + if (r == 0) { + return r; // try another method on this con + } + pending_cons.erase(i.first); + if (!pending_cons.empty()) { + return r; // fail this con, maybe another con will succeed + } + // fail hunt + _finish_hunting(r); + _finish_auth(r); + return r; + } + } + return -ENOENT; + } else { + // huh... + ldout(cct,10) << __func__ << " hmm, they didn't like " << old_auth_method + << " result " << cpp_strerror(result) + << " and auth is " << (auth ? auth->get_protocol() : 0) + << dendl; + return -EACCES; + } +} + +int MonClient::handle_auth_request( + Connection *con, + AuthConnectionMeta *auth_meta, + bool more, + uint32_t auth_method, + const ceph::buffer::list& payload, + ceph::buffer::list *reply) +{ + if (payload.length() == 0) { + // for some channels prior to nautilus (osd heartbeat), we + // tolerate the lack of an authorizer. 
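+    // (e.g. an osd heartbeat peer on a messenger created with
+    // require_authorizer == false is handed straight to
+    // ms_handle_authentication() below.)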
+ if (!con->get_messenger()->require_authorizer) { + handle_authentication_dispatcher->ms_handle_authentication(con); + return 1; + } + return -EACCES; + } + auth_meta->auth_mode = payload[0]; + if (auth_meta->auth_mode < AUTH_MODE_AUTHORIZER || + auth_meta->auth_mode > AUTH_MODE_AUTHORIZER_MAX) { + return -EACCES; + } + AuthAuthorizeHandler *ah = get_auth_authorize_handler(con->get_peer_type(), + auth_method); + if (!ah) { + lderr(cct) << __func__ << " no AuthAuthorizeHandler found for auth method " + << auth_method << dendl; + return -EOPNOTSUPP; + } + + auto ac = &auth_meta->authorizer_challenge; + if (auth_meta->skip_authorizer_challenge) { + ldout(cct, 10) << __func__ << " skipping challenge on " << con << dendl; + ac = nullptr; + } + + bool was_challenge = (bool)auth_meta->authorizer_challenge; + bool isvalid = ah->verify_authorizer( + cct, + *rotating_secrets, + payload, + auth_meta->get_connection_secret_length(), + reply, + &con->peer_name, + &con->peer_global_id, + &con->peer_caps_info, + &auth_meta->session_key, + &auth_meta->connection_secret, + ac); + if (isvalid) { + handle_authentication_dispatcher->ms_handle_authentication(con); + return 1; + } + if (!more && !was_challenge && auth_meta->authorizer_challenge) { + ldout(cct,10) << __func__ << " added challenge on " << con << dendl; + return 0; + } + ldout(cct,10) << __func__ << " bad authorizer on " << con << dendl; + // discard old challenge + auth_meta->authorizer_challenge.reset(); + return -EACCES; +} + +AuthAuthorizer* MonClient::build_authorizer(int service_id) const { + std::lock_guard l(monc_lock); + if (auth) { + return auth->build_authorizer(service_id); + } else { + ldout(cct, 0) << __func__ << " for " << ceph_entity_type_name(service_id) + << ", but no auth is available now" << dendl; + return nullptr; + } +} + +#define dout_subsys ceph_subsys_monc +#undef dout_prefix +#define dout_prefix *_dout << "monclient" << (have_session() ? ": " : "(hunting): ") + +MonConnection::MonConnection( + CephContext *cct, ConnectionRef con, uint64_t global_id, + AuthRegistry *ar) + : cct(cct), con(con), global_id(global_id), auth_registry(ar) +{} + +MonConnection::~MonConnection() +{ + if (con) { + con->mark_down(); + con.reset(); + } +} + +bool MonConnection::have_session() const +{ + return state == State::HAVE_SESSION; +} + +void MonConnection::start(epoch_t epoch, + const EntityName& entity_name) +{ + using ceph::encode; + auth_start = ceph_clock_now(); + + if (con->get_peer_addr().is_msgr2()) { + ldout(cct, 10) << __func__ << " opening mon connection" << dendl; + state = State::AUTHENTICATING; + con->send_message(new MMonGetMap()); + return; + } + + // restart authentication handshake + state = State::NEGOTIATING; + + // send an initial keepalive to ensure our timestamp is valid by the + // time we are in an OPENED state (by sequencing this before + // authentication). 
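+  // (handle_auth_done() later seeds the keepalive state with this
+  // timestamp via con->set_last_keepalive_ack(auth_start).)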
+  con->send_keepalive();
+
+  auto m = new MAuth;
+  m->protocol = CEPH_AUTH_UNKNOWN;
+  m->monmap_epoch = epoch;
+  __u8 struct_v = 1;
+  encode(struct_v, m->auth_payload);
+  std::vector<uint32_t> auth_supported;
+  auth_registry->get_supported_methods(con->get_peer_type(), &auth_supported);
+  encode(auth_supported, m->auth_payload);
+  encode(entity_name, m->auth_payload);
+  encode(global_id, m->auth_payload);
+  con->send_message(m);
+}
+
+int MonConnection::get_auth_request(
+  uint32_t *method,
+  std::vector<uint32_t> *preferred_modes,
+  ceph::buffer::list *bl,
+  const EntityName& entity_name,
+  uint32_t want_keys,
+  RotatingKeyRing* keyring)
+{
+  using ceph::encode;
+  // choose method
+  if (auth_method < 0) {
+    std::vector<uint32_t> as;
+    auth_registry->get_supported_methods(con->get_peer_type(), &as);
+    if (as.empty()) {
+      return -EACCES;
+    }
+    auth_method = as.front();
+  }
+  *method = auth_method;
+  auth_registry->get_supported_modes(con->get_peer_type(), auth_method,
+                                     preferred_modes);
+  ldout(cct,10) << __func__ << " method " << *method
+                << " preferred_modes " << *preferred_modes << dendl;
+  if (preferred_modes->empty()) {
+    return -EACCES;
+  }
+
+  int r = _init_auth(*method, entity_name, want_keys, keyring, true);
+  ceph_assert(r == 0);
+
+  // initial request includes some boilerplate...
+  encode((char)AUTH_MODE_MON, *bl);
+  encode(entity_name, *bl);
+  encode(global_id, *bl);
+
+  // and (maybe) some method-specific initial payload
+  auth->build_initial_request(bl);
+
+  return 0;
+}
+
+int MonConnection::handle_auth_reply_more(
+  AuthConnectionMeta *auth_meta,
+  const ceph::buffer::list& bl,
+  ceph::buffer::list *reply)
+{
+  ldout(cct, 10) << __func__ << " payload " << bl.length() << dendl;
+  ldout(cct, 30) << __func__ << " got\n";
+  bl.hexdump(*_dout);
+  *_dout << dendl;
+
+  auto p = bl.cbegin();
+  ldout(cct, 10) << __func__ << " payload_len " << bl.length() << dendl;
+  int r = auth->handle_response(0, p, &auth_meta->session_key,
+                                &auth_meta->connection_secret);
+  if (r == -EAGAIN) {
+    auth->prepare_build_request();
+    auth->build_request(*reply);
+    ldout(cct, 10) << __func__ << " responding with " << reply->length()
+                   << " bytes" << dendl;
+    r = 0;
+  } else if (r < 0) {
+    lderr(cct) << __func__ << " handle_response returned " << r << dendl;
+  } else {
+    ldout(cct, 10) << __func__ << " authenticated!"
+                   << dendl;
+    // FIXME
+    ceph_abort(cct, "write me");
+  }
+  return r;
+}
+
+int MonConnection::handle_auth_done(
+  AuthConnectionMeta *auth_meta,
+  uint64_t new_global_id,
+  const ceph::buffer::list& bl,
+  CryptoKey *session_key,
+  std::string *connection_secret)
+{
+  ldout(cct,10) << __func__ << " global_id " << new_global_id
+                << " payload " << bl.length()
+                << dendl;
+  global_id = new_global_id;
+  auth->set_global_id(global_id);
+  auto p = bl.begin();
+  int auth_err = auth->handle_response(0, p, &auth_meta->session_key,
+                                       &auth_meta->connection_secret);
+  if (auth_err >= 0) {
+    state = State::HAVE_SESSION;
+  }
+  con->set_last_keepalive_ack(auth_start);
+
+  if (pending_tell_command) {
+    con->send_message2(std::move(pending_tell_command));
+  }
+  return auth_err;
+}
+
+int MonConnection::handle_auth_bad_method(
+  uint32_t old_auth_method,
+  int result,
+  const std::vector<uint32_t>& allowed_methods,
+  const std::vector<uint32_t>& allowed_modes)
+{
+  ldout(cct,10) << __func__ << " old_auth_method " << old_auth_method
+                << " result " << cpp_strerror(result)
+                << " allowed_methods " << allowed_methods << dendl;
+  std::vector<uint32_t> auth_supported;
+  auth_registry->get_supported_methods(con->get_peer_type(), &auth_supported);
+  auto p = std::find(auth_supported.begin(), auth_supported.end(),
+                     old_auth_method);
+  assert(p != auth_supported.end());
+  p = std::find_first_of(std::next(p), auth_supported.end(),
+                         allowed_methods.begin(), allowed_methods.end());
+  if (p == auth_supported.end()) {
+    lderr(cct) << __func__ << " server allowed_methods " << allowed_methods
+               << " but I only support " << auth_supported << dendl;
+    return -EACCES;
+  }
+  auth_method = *p;
+  ldout(cct,10) << __func__ << " will try " << auth_method << " next" << dendl;
+  return 0;
+}
+
+int MonConnection::handle_auth(MAuthReply* m,
+                               const EntityName& entity_name,
+                               uint32_t want_keys,
+                               RotatingKeyRing* keyring)
+{
+  if (state == State::NEGOTIATING) {
+    int r = _negotiate(m, entity_name, want_keys, keyring);
+    if (r) {
+      return r;
+    }
+    state = State::AUTHENTICATING;
+  }
+  int r = authenticate(m);
+  if (!r) {
+    state = State::HAVE_SESSION;
+  }
+  return r;
+}
+
+int MonConnection::_negotiate(MAuthReply *m,
+                              const EntityName& entity_name,
+                              uint32_t want_keys,
+                              RotatingKeyRing* keyring)
+{
+  int r = _init_auth(m->protocol, entity_name, want_keys, keyring, false);
+  if (r == -ENOTSUP) {
+    if (m->result == -ENOTSUP) {
+      ldout(cct, 10) << "none of our auth protocols are supported by the server"
+                     << dendl;
+    }
+    return m->result;
+  }
+  return r;
+}
+
+int MonConnection::_init_auth(
+  uint32_t method,
+  const EntityName& entity_name,
+  uint32_t want_keys,
+  RotatingKeyRing* keyring,
+  bool msgr2)
+{
+  ldout(cct, 10) << __func__ << " method " << method << dendl;
+  if (auth && auth->get_protocol() == (int)method) {
+    ldout(cct, 10) << __func__ << " already have auth, resetting" << dendl;
+    auth->reset();
+    return 0;
+  }
+
+  ldout(cct, 10) << __func__ << " creating new auth" << dendl;
+  auth.reset(AuthClientHandler::create(cct, method, keyring));
+  if (!auth) {
+    ldout(cct, 10) << " no handler for protocol " << method << dendl;
+    return -ENOTSUP;
+  }
+
+  // do not request MGR key unless the mon has the SERVER_KRAKEN
+  // feature. otherwise it will give us an auth error. note that
+  // we have to use the FEATUREMASK because pre-jewel the kraken
+  // feature bit was used for something else.
+ if (!msgr2 && + (want_keys & CEPH_ENTITY_TYPE_MGR) && + !(con->has_features(CEPH_FEATUREMASK_SERVER_KRAKEN))) { + ldout(cct, 1) << __func__ + << " not requesting MGR keys from pre-kraken monitor" + << dendl; + want_keys &= ~CEPH_ENTITY_TYPE_MGR; + } + auth->set_want_keys(want_keys); + auth->init(entity_name); + auth->set_global_id(global_id); + return 0; +} + +int MonConnection::authenticate(MAuthReply *m) +{ + ceph_assert(auth); + if (!m->global_id) { + ldout(cct, 1) << "peer sent an invalid global_id" << dendl; + } + if (m->global_id != global_id) { + // it's a new session + auth->reset(); + global_id = m->global_id; + auth->set_global_id(global_id); + ldout(cct, 10) << "my global_id is " << m->global_id << dendl; + } + auto p = m->result_bl.cbegin(); + int ret = auth->handle_response(m->result, p, nullptr, nullptr); + if (ret == -EAGAIN) { + auto ma = new MAuth; + ma->protocol = auth->get_protocol(); + auth->prepare_build_request(); + auth->build_request(ma->auth_payload); + con->send_message(ma); + } + if (ret == 0 && pending_tell_command) { + con->send_message2(std::move(pending_tell_command)); + } + + return ret; +} + +void MonClient::register_config_callback(md_config_t::config_callback fn) { + ceph_assert(!config_cb); + config_cb = fn; +} + +md_config_t::config_callback MonClient::get_config_callback() { + return config_cb; +} + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wnon-virtual-dtor" +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wnon-virtual-dtor" +class monc_error_category : public ceph::converting_category { +public: + monc_error_category(){} + const char* name() const noexcept override; + const char* message(int ev, char*, std::size_t) const noexcept override; + std::string message(int ev) const override; + bs::error_condition default_error_condition(int ev) const noexcept + override; + bool equivalent(int ev, const bs::error_condition& c) const + noexcept override; + using ceph::converting_category::equivalent; + int from_code(int ev) const noexcept override; +}; +#pragma GCC diagnostic pop +#pragma clang diagnostic pop + +const char* monc_error_category::name() const noexcept { + return "monc"; +} + +const char* monc_error_category::message(int ev, char*, std::size_t) const noexcept { + if (ev == 0) + return "No error"; + + switch (static_cast<monc_errc>(ev)) { + case monc_errc::shutting_down: // Command failed due to MonClient shutting down + return "Command failed due to MonClient shutting down"; + case monc_errc::session_reset: + return "Monitor session was reset"; + case monc_errc::rank_dne: + return "Requested monitor rank does not exist"; + case monc_errc::mon_dne: + return "Requested monitor does not exist"; + case monc_errc::timed_out: + return "Monitor operation timed out"; + case monc_errc::mon_unavailable: + return "Monitor unavailable"; + } + + return "Unknown error"; +} + +std::string monc_error_category::message(int ev) const { + return message(ev, nullptr, 0); +} + +bs::error_condition monc_error_category::default_error_condition(int ev) const noexcept { + switch (static_cast<monc_errc>(ev)) { + case monc_errc::shutting_down: + return bs::errc::operation_canceled; + case monc_errc::session_reset: + return bs::errc::resource_unavailable_try_again; + case monc_errc::rank_dne: + [[fallthrough]]; + case monc_errc::mon_dne: + return ceph::errc::not_in_map; + case monc_errc::timed_out: + return bs::errc::timed_out; + case monc_errc::mon_unavailable: + return bs::errc::no_such_device; + } + return { ev, *this }; +} + 
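+// rank_dne and mon_dne additionally compare equal to ENOENT-style error
+// conditions; everything else defers to default_error_condition() above.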
+bool monc_error_category::equivalent(int ev, const bs::error_condition& c) const noexcept { + switch (static_cast<monc_errc>(ev)) { + case monc_errc::rank_dne: + [[fallthrough]]; + case monc_errc::mon_dne: + return c == bs::errc::no_such_file_or_directory; + default: + return default_error_condition(ev) == c; + } +} + +int monc_error_category::from_code(int ev) const noexcept { + if (ev == 0) + return 0; + + switch (static_cast<monc_errc>(ev)) { + case monc_errc::shutting_down: + return -ECANCELED; + case monc_errc::session_reset: + return -EAGAIN; + case monc_errc::rank_dne: + [[fallthrough]]; + case monc_errc::mon_dne: + return -ENOENT; + case monc_errc::timed_out: + return -ETIMEDOUT; + case monc_errc::mon_unavailable: + return -ENXIO; + } + return -EDOM; +} + +const bs::error_category& monc_category() noexcept { + static const monc_error_category c; + return c; +} diff --git a/src/mon/MonClient.h b/src/mon/MonClient.h new file mode 100644 index 000000000..6a7daa814 --- /dev/null +++ b/src/mon/MonClient.h @@ -0,0 +1,774 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ +#ifndef CEPH_MONCLIENT_H +#define CEPH_MONCLIENT_H + +#include <functional> +#include <list> +#include <map> +#include <memory> +#include <set> +#include <string> +#include <vector> + +#include "msg/Messenger.h" + +#include "MonMap.h" +#include "MonSub.h" + +#include "common/async/completion.h" +#include "common/Timer.h" +#include "common/config.h" +#include "messages/MMonGetVersion.h" + +#include "auth/AuthClient.h" +#include "auth/AuthServer.h" + +class MMonMap; +class MConfig; +class MMonGetVersionReply; +class MMonCommandAck; +class LogClient; +class AuthClientHandler; +class AuthRegistry; +class KeyRing; +class RotatingKeyRing; + +class MonConnection { +public: + MonConnection(CephContext *cct, + ConnectionRef conn, + uint64_t global_id, + AuthRegistry *auth_registry); + ~MonConnection(); + MonConnection(MonConnection&& rhs) = default; + MonConnection& operator=(MonConnection&&) = default; + MonConnection(const MonConnection& rhs) = delete; + MonConnection& operator=(const MonConnection&) = delete; + int handle_auth(MAuthReply *m, + const EntityName& entity_name, + uint32_t want_keys, + RotatingKeyRing* keyring); + int authenticate(MAuthReply *m); + void start(epoch_t epoch, + const EntityName& entity_name); + bool have_session() const; + uint64_t get_global_id() const { + return global_id; + } + ConnectionRef get_con() { + return con; + } + std::unique_ptr<AuthClientHandler>& get_auth() { + return auth; + } + + int get_auth_request( + uint32_t *method, + std::vector<uint32_t> *preferred_modes, + ceph::buffer::list *out, + const EntityName& entity_name, + uint32_t want_keys, + RotatingKeyRing* keyring); + int handle_auth_reply_more( + AuthConnectionMeta *auth_meta, + const ceph::buffer::list& bl, + ceph::buffer::list *reply); + int handle_auth_done( + AuthConnectionMeta *auth_meta, + uint64_t global_id, + const ceph::buffer::list& bl, + CryptoKey *session_key, + std::string *connection_secret); + int handle_auth_bad_method( + uint32_t old_auth_method, + int result, + const std::vector<uint32_t>& allowed_methods, + const 
std::vector<uint32_t>& allowed_modes); + + bool is_con(Connection *c) const { + return con.get() == c; + } + void queue_command(Message *m) { + pending_tell_command = m; + } + +private: + int _negotiate(MAuthReply *m, + const EntityName& entity_name, + uint32_t want_keys, + RotatingKeyRing* keyring); + int _init_auth(uint32_t method, + const EntityName& entity_name, + uint32_t want_keys, + RotatingKeyRing* keyring, + bool msgr2); + +private: + CephContext *cct; + enum class State { + NONE, + NEGOTIATING, // v1 only + AUTHENTICATING, // v1 and v2 + HAVE_SESSION, + }; + State state = State::NONE; + ConnectionRef con; + int auth_method = -1; + utime_t auth_start; + + std::unique_ptr<AuthClientHandler> auth; + uint64_t global_id; + + MessageRef pending_tell_command; + + AuthRegistry *auth_registry; +}; + + +struct MonClientPinger : public Dispatcher, + public AuthClient { + ceph::mutex lock = ceph::make_mutex("MonClientPinger::lock"); + ceph::condition_variable ping_recvd_cond; + std::string *result; + bool done; + RotatingKeyRing *keyring; + std::unique_ptr<MonConnection> mc; + + MonClientPinger(CephContext *cct_, + RotatingKeyRing *keyring, + std::string *res_) : + Dispatcher(cct_), + result(res_), + done(false), + keyring(keyring) + { } + + int wait_for_reply(double timeout = 0.0) { + std::unique_lock locker{lock}; + if (timeout <= 0) { + timeout = std::chrono::duration<double>(cct->_conf.get_val<std::chrono::seconds>("client_mount_timeout")).count(); + } + done = false; + if (ping_recvd_cond.wait_for(locker, + ceph::make_timespan(timeout), + [this] { return done; })) { + return 0; + } else { + return ETIMEDOUT; + } + } + + bool ms_dispatch(Message *m) override { + using ceph::decode; + std::lock_guard l(lock); + if (m->get_type() != CEPH_MSG_PING) + return false; + + ceph::buffer::list &payload = m->get_payload(); + if (result && payload.length() > 0) { + auto p = std::cbegin(payload); + decode(*result, p); + } + done = true; + ping_recvd_cond.notify_all(); + m->put(); + return true; + } + bool ms_handle_reset(Connection *con) override { + std::lock_guard l(lock); + done = true; + ping_recvd_cond.notify_all(); + return true; + } + void ms_handle_remote_reset(Connection *con) override {} + bool ms_handle_refused(Connection *con) override { + return false; + } + + // AuthClient + int get_auth_request( + Connection *con, + AuthConnectionMeta *auth_meta, + uint32_t *auth_method, + std::vector<uint32_t> *preferred_modes, + ceph::buffer::list *bl) override { + return mc->get_auth_request(auth_method, preferred_modes, bl, + cct->_conf->name, 0, keyring); + } + int handle_auth_reply_more( + Connection *con, + AuthConnectionMeta *auth_meta, + const ceph::buffer::list& bl, + ceph::buffer::list *reply) override { + return mc->handle_auth_reply_more(auth_meta, bl, reply); + } + int handle_auth_done( + Connection *con, + AuthConnectionMeta *auth_meta, + uint64_t global_id, + uint32_t con_mode, + const ceph::buffer::list& bl, + CryptoKey *session_key, + std::string *connection_secret) override { + return mc->handle_auth_done(auth_meta, global_id, bl, + session_key, connection_secret); + } + int handle_auth_bad_method( + Connection *con, + AuthConnectionMeta *auth_meta, + uint32_t old_auth_method, + int result, + const std::vector<uint32_t>& allowed_methods, + const std::vector<uint32_t>& allowed_modes) override { + return mc->handle_auth_bad_method(old_auth_method, result, + allowed_methods, allowed_modes); + } +}; + +const boost::system::error_category& monc_category() noexcept; + +enum class monc_errc 
{ + shutting_down = 1, // Command failed due to MonClient shutting down + session_reset, // Monitor session was reset + rank_dne, // Requested monitor rank does not exist + mon_dne, // Requested monitor does not exist + timed_out, // Monitor operation timed out + mon_unavailable // Monitor unavailable +}; + +namespace boost::system { +template<> +struct is_error_code_enum<::monc_errc> { + static const bool value = true; +}; +} + +// implicit conversion: +inline boost::system::error_code make_error_code(monc_errc e) noexcept { + return { static_cast<int>(e), monc_category() }; +} + +// explicit conversion: +inline boost::system::error_condition make_error_condition(monc_errc e) noexcept { + return { static_cast<int>(e), monc_category() }; +} + +const boost::system::error_category& monc_category() noexcept; + +class MonClient : public Dispatcher, + public AuthClient, + public AuthServer /* for mgr, osd, mds */ { + static constexpr auto dout_subsys = ceph_subsys_monc; +public: + // Error, Newest, Oldest + using VersionSig = void(boost::system::error_code, version_t, version_t); + using VersionCompletion = ceph::async::Completion<VersionSig>; + + using CommandSig = void(boost::system::error_code, std::string, + ceph::buffer::list); + using CommandCompletion = ceph::async::Completion<CommandSig>; + + MonMap monmap; + std::map<std::string,std::string> config_mgr; + +private: + Messenger *messenger; + + std::unique_ptr<MonConnection> active_con; + std::map<entity_addrvec_t, MonConnection> pending_cons; + std::set<unsigned> tried; + + EntityName entity_name; + + mutable ceph::mutex monc_lock = ceph::make_mutex("MonClient::monc_lock"); + SafeTimer timer; + boost::asio::io_context& service; + boost::asio::io_context::strand finish_strand{service}; + + bool initialized; + bool stopping = false; + + LogClient *log_client; + bool more_log_pending; + + void send_log(bool flush = false); + + bool ms_dispatch(Message *m) override; + bool ms_handle_reset(Connection *con) override; + void ms_handle_remote_reset(Connection *con) override {} + bool ms_handle_refused(Connection *con) override { return false; } + + void handle_monmap(MMonMap *m); + void handle_config(MConfig *m); + + void handle_auth(MAuthReply *m); + + // monitor session + utime_t last_keepalive; + utime_t last_send_log; + + void tick(); + void schedule_tick(); + + // monclient + bool want_monmap; + ceph::condition_variable map_cond; + bool passthrough_monmap = false; + + bool want_bootstrap_config = false; + ceph::ref_t<MConfig> bootstrap_config; + + // authenticate + std::unique_ptr<AuthClientHandler> auth; + uint32_t want_keys = 0; + uint64_t global_id = 0; + ceph::condition_variable auth_cond; + int authenticate_err = 0; + bool authenticated = false; + + std::list<MessageRef> waiting_for_session; + utime_t last_rotating_renew_sent; + bool had_a_connection; + double reopen_interval_multiplier; + + Dispatcher *handle_authentication_dispatcher = nullptr; + bool _opened() const; + bool _hunting() const; + void _start_hunting(); + void _finish_hunting(int auth_err); + void _finish_auth(int auth_err); + void _reopen_session(int rank = -1); + void _add_conn(unsigned rank); + void _add_conns(); + void _un_backoff(); + void _send_mon_message(MessageRef m); + + std::map<entity_addrvec_t, MonConnection>::iterator _find_pending_con( + const ConnectionRef& con) { + for (auto i = pending_cons.begin(); i != pending_cons.end(); ++i) { + if (i->second.get_con() == con) { + return i; + } + } + return pending_cons.end(); + } + +public: + // AuthClient + int 
get_auth_request( + Connection *con, + AuthConnectionMeta *auth_meta, + uint32_t *method, + std::vector<uint32_t> *preferred_modes, + ceph::buffer::list *bl) override; + int handle_auth_reply_more( + Connection *con, + AuthConnectionMeta *auth_meta, + const ceph::buffer::list& bl, + ceph::buffer::list *reply) override; + int handle_auth_done( + Connection *con, + AuthConnectionMeta *auth_meta, + uint64_t global_id, + uint32_t con_mode, + const ceph::buffer::list& bl, + CryptoKey *session_key, + std::string *connection_secret) override; + int handle_auth_bad_method( + Connection *con, + AuthConnectionMeta *auth_meta, + uint32_t old_auth_method, + int result, + const std::vector<uint32_t>& allowed_methods, + const std::vector<uint32_t>& allowed_modes) override; + // AuthServer + int handle_auth_request( + Connection *con, + AuthConnectionMeta *auth_meta, + bool more, + uint32_t auth_method, + const ceph::buffer::list& bl, + ceph::buffer::list *reply) override; + + void set_entity_name(EntityName name) { entity_name = name; } + void set_handle_authentication_dispatcher(Dispatcher *d) { + handle_authentication_dispatcher = d; + } + int _check_auth_tickets(); + int _check_auth_rotating(); + int wait_auth_rotating(double timeout); + + int authenticate(double timeout=0.0); + bool is_authenticated() const {return authenticated;} + + bool is_connected() const { return active_con != nullptr; } + + /** + * Try to flush as many log messages as we can in a single + * message. Use this before shutting down to transmit your + * last message. + */ + void flush_log(); + +private: + // mon subscriptions + MonSub sub; + void _renew_subs(); + void handle_subscribe_ack(MMonSubscribeAck* m); + +public: + void renew_subs() { + std::lock_guard l(monc_lock); + _renew_subs(); + } + bool sub_want(std::string what, version_t start, unsigned flags) { + std::lock_guard l(monc_lock); + return sub.want(what, start, flags); + } + void sub_got(std::string what, version_t have) { + std::lock_guard l(monc_lock); + sub.got(what, have); + } + void sub_unwant(std::string what) { + std::lock_guard l(monc_lock); + sub.unwant(what); + } + bool sub_want_increment(std::string what, version_t start, unsigned flags) { + std::lock_guard l(monc_lock); + return sub.inc_want(what, start, flags); + } + + std::unique_ptr<KeyRing> keyring; + std::unique_ptr<RotatingKeyRing> rotating_secrets; + + public: + MonClient(CephContext *cct_, boost::asio::io_context& service); + MonClient(const MonClient &) = delete; + MonClient& operator=(const MonClient &) = delete; + ~MonClient() override; + + int init(); + void shutdown(); + + void set_log_client(LogClient *clog) { + log_client = clog; + } + LogClient *get_log_client() { + return log_client; + } + + int build_initial_monmap(); + int get_monmap(); + int get_monmap_and_config(); + /** + * If you want to see MonMap messages, set this and + * the MonClient will tell the Messenger it hasn't + * dealt with it. + * Note that if you do this, *you* are of course responsible for + * putting the message reference! + */ + void set_passthrough_monmap() { + std::lock_guard l(monc_lock); + passthrough_monmap = true; + } + void unset_passthrough_monmap() { + std::lock_guard l(monc_lock); + passthrough_monmap = false; + } + /** + * Ping monitor with ID @p mon_id and record the resulting + * reply in @p result_reply. 
+ * + * @param[in] mon_id Target monitor's ID + * @param[out] result_reply reply from mon.ID, if param != NULL + * @returns 0 in case of success; < 0 in case of error, + * -ETIMEDOUT if monitor didn't reply before timeout + * expired (default: conf->client_mount_timeout). + */ + int ping_monitor(const std::string &mon_id, std::string *result_reply); + + void send_mon_message(Message *m) { + send_mon_message(MessageRef{m, false}); + } + void send_mon_message(MessageRef m); + + void reopen_session() { + std::lock_guard l(monc_lock); + _reopen_session(); + } + + const uuid_d& get_fsid() const { + return monmap.fsid; + } + + entity_addrvec_t get_mon_addrs(unsigned i) const { + std::lock_guard l(monc_lock); + if (i < monmap.size()) + return monmap.get_addrs(i); + return entity_addrvec_t(); + } + int get_num_mon() const { + std::lock_guard l(monc_lock); + return monmap.size(); + } + + uint64_t get_global_id() const { + std::lock_guard l(monc_lock); + return global_id; + } + + void set_messenger(Messenger *m) { messenger = m; } + entity_addrvec_t get_myaddrs() const { return messenger->get_myaddrs(); } + AuthAuthorizer* build_authorizer(int service_id) const; + + void set_want_keys(uint32_t want) { + want_keys = want; + } + + // admin commands +private: + uint64_t last_mon_command_tid; + + struct MonCommand { + // for tell only + std::string target_name; + int target_rank = -1; + ConnectionRef target_con; + std::unique_ptr<MonConnection> target_session; + unsigned send_attempts = 0; ///< attempt count for legacy mons + utime_t last_send_attempt; + uint64_t tid; + std::vector<std::string> cmd; + ceph::buffer::list inbl; + std::unique_ptr<CommandCompletion> onfinish; + std::optional<boost::asio::steady_timer> cancel_timer; + + MonCommand(MonClient& monc, uint64_t t, std::unique_ptr<CommandCompletion> onfinish) + : tid(t), onfinish(std::move(onfinish)) { + auto timeout = + monc.cct->_conf.get_val<std::chrono::seconds>("rados_mon_op_timeout"); + if (timeout.count() > 0) { + cancel_timer.emplace(monc.service, timeout); + cancel_timer->async_wait( + [this, &monc](boost::system::error_code ec) { + if (ec) + return; + std::scoped_lock l(monc.monc_lock); + monc._cancel_mon_command(tid); + }); + } + } + + bool is_tell() const { + return target_name.size() || target_rank >= 0; + } + }; + friend MonCommand; + std::map<uint64_t,MonCommand*> mon_commands; + + void _send_command(MonCommand *r); + void _check_tell_commands(); + void _resend_mon_commands(); + int _cancel_mon_command(uint64_t tid); + void _finish_command(MonCommand *r, boost::system::error_code ret, std::string_view rs, + bufferlist&& bl); + void _finish_auth(); + void handle_mon_command_ack(MMonCommandAck *ack); + void handle_command_reply(MCommandReply *reply); + +public: + template<typename CompletionToken> + auto start_mon_command(const std::vector<std::string>& cmd, + const ceph::buffer::list& inbl, + CompletionToken&& token) { + ldout(cct,10) << __func__ << " cmd=" << cmd << dendl; + boost::asio::async_completion<CompletionToken, CommandSig> init(token); + { + std::scoped_lock l(monc_lock); + auto h = CommandCompletion::create(service.get_executor(), + std::move(init.completion_handler)); + if (!initialized || stopping) { + ceph::async::post(std::move(h), monc_errc::shutting_down, std::string{}, + bufferlist{}); + } else { + auto r = new MonCommand(*this, ++last_mon_command_tid, std::move(h)); + r->cmd = cmd; + r->inbl = inbl; + mon_commands.emplace(r->tid, r); + _send_command(r); + } + } + return init.result.get(); + } + + template<typename 
CompletionToken> + auto start_mon_command(int mon_rank, const std::vector<std::string>& cmd, + const ceph::buffer::list& inbl, CompletionToken&& token) { + ldout(cct,10) << __func__ << " cmd=" << cmd << dendl; + boost::asio::async_completion<CompletionToken, CommandSig> init(token); + { + std::scoped_lock l(monc_lock); + auto h = CommandCompletion::create(service.get_executor(), + std::move(init.completion_handler)); + if (!initialized || stopping) { + ceph::async::post(std::move(h), monc_errc::shutting_down, std::string{}, + bufferlist{}); + } else { + auto r = new MonCommand(*this, ++last_mon_command_tid, std::move(h)); + r->target_rank = mon_rank; + r->cmd = cmd; + r->inbl = inbl; + mon_commands.emplace(r->tid, r); + _send_command(r); + } + } + return init.result.get(); + } + + template<typename CompletionToken> + auto start_mon_command(const std::string& mon_name, + const std::vector<std::string>& cmd, + const ceph::buffer::list& inbl, + CompletionToken&& token) { + ldout(cct,10) << __func__ << " cmd=" << cmd << dendl; + boost::asio::async_completion<CompletionToken, CommandSig> init(token); + { + std::scoped_lock l(monc_lock); + auto h = CommandCompletion::create(service.get_executor(), + std::move(init.completion_handler)); + if (!initialized || stopping) { + ceph::async::post(std::move(h), monc_errc::shutting_down, std::string{}, + bufferlist{}); + } else { + auto r = new MonCommand(*this, ++last_mon_command_tid, std::move(h)); + // detect/tolerate mon *rank* passed as a string + std::string err; + int rank = strict_strtoll(mon_name.c_str(), 10, &err); + if (err.size() == 0 && rank >= 0) { + ldout(cct,10) << __func__ << " interpreting name '" << mon_name + << "' as rank " << rank << dendl; + r->target_rank = rank; + } else { + r->target_name = mon_name; + } + r->cmd = cmd; + r->inbl = inbl; + mon_commands.emplace(r->tid, r); + _send_command(r); + } + } + return init.result.get(); + } + + class ContextVerter { + std::string* outs; + ceph::bufferlist* outbl; + Context* onfinish; + + public: + ContextVerter(std::string* outs, ceph::bufferlist* outbl, Context* onfinish) + : outs(outs), outbl(outbl), onfinish(onfinish) {} + ~ContextVerter() = default; + ContextVerter(const ContextVerter&) = default; + ContextVerter& operator =(const ContextVerter&) = default; + ContextVerter(ContextVerter&&) = default; + ContextVerter& operator =(ContextVerter&&) = default; + + void operator()(boost::system::error_code e, + std::string s, + ceph::bufferlist bl) { + if (outs) + *outs = std::move(s); + if (outbl) + *outbl = std::move(bl); + if (onfinish) + onfinish->complete(ceph::from_error_code(e)); + } + }; + + void start_mon_command(const vector<string>& cmd, const bufferlist& inbl, + bufferlist *outbl, string *outs, + Context *onfinish) { + start_mon_command(cmd, inbl, ContextVerter(outs, outbl, onfinish)); + } + void start_mon_command(int mon_rank, + const vector<string>& cmd, const bufferlist& inbl, + bufferlist *outbl, string *outs, + Context *onfinish) { + start_mon_command(mon_rank, cmd, inbl, ContextVerter(outs, outbl, onfinish)); + } + void start_mon_command(const string &mon_name, ///< mon name, with mon. 
prefix + const vector<string>& cmd, const bufferlist& inbl, + bufferlist *outbl, string *outs, + Context *onfinish) { + start_mon_command(mon_name, cmd, inbl, ContextVerter(outs, outbl, onfinish)); + } + + + // version requests +public: + /** + * get latest known version(s) of cluster map + * + * @param map string name of map (e.g., 'osdmap') + * @param token context that will be triggered on completion + * @return (via Completion) {} on success, + * boost::system::errc::resource_unavailable_try_again if we need to + * resubmit our request + */ + template<typename CompletionToken> + auto get_version(std::string&& map, CompletionToken&& token) { + boost::asio::async_completion<CompletionToken, VersionSig> init(token); + { + std::scoped_lock l(monc_lock); + auto m = ceph::make_message<MMonGetVersion>(); + m->what = std::move(map); + m->handle = ++version_req_id; + version_requests.emplace(m->handle, + VersionCompletion::create( + service.get_executor(), + std::move(init.completion_handler))); + _send_mon_message(m); + } + return init.result.get(); + } + + /** + * Run a callback within our lock, with a reference + * to the MonMap + */ + template<typename Callback, typename...Args> + auto with_monmap(Callback&& cb, Args&&...args) const -> + decltype(cb(monmap, std::forward<Args>(args)...)) { + std::lock_guard l(monc_lock); + return std::forward<Callback>(cb)(monmap, std::forward<Args>(args)...); + } + + void register_config_callback(md_config_t::config_callback fn); + void register_config_notify_callback(std::function<void(void)> f) { + config_notify_cb = f; + } + md_config_t::config_callback get_config_callback(); + +private: + + std::map<ceph_tid_t, std::unique_ptr<VersionCompletion>> version_requests; + ceph_tid_t version_req_id; + void handle_get_version_reply(MMonGetVersionReply* m); + md_config_t::config_callback config_cb; + std::function<void(void)> config_notify_cb; +}; + +#endif diff --git a/src/mon/MonCommand.h b/src/mon/MonCommand.h new file mode 100644 index 000000000..cb60d3d17 --- /dev/null +++ b/src/mon/MonCommand.h @@ -0,0 +1,175 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2017 John Spray <john.spray@redhat.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + */ + +#pragma once + +#include <string> +#include "include/encoding.h" + +struct MonCommand { + std::string cmdstring; + std::string helpstring; + std::string module; + std::string req_perms; + uint64_t flags; + + // MonCommand flags + static const uint64_t FLAG_NONE = 0; + static const uint64_t FLAG_NOFORWARD = 1 << 0; + static const uint64_t FLAG_OBSOLETE = 1 << 1; + static const uint64_t FLAG_DEPRECATED = 1 << 2; + static const uint64_t FLAG_MGR = 1 << 3; + static const uint64_t FLAG_POLL = 1 << 4; + static const uint64_t FLAG_HIDDEN = 1 << 5; + // asok and tell commands are not forwarded, and they should not be listed + // in --help output. 
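+  // (because has_flag() below checks that *all* bits of the mask are set,
+  // a command declared with FLAG_TELL satisfies both is_noforward() and
+  // is_hidden().)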
+ static const uint64_t FLAG_TELL = (FLAG_NOFORWARD | FLAG_HIDDEN); + + bool has_flag(uint64_t flag) const { return (flags & flag) == flag; } + void set_flag(uint64_t flag) { flags |= flag; } + void unset_flag(uint64_t flag) { flags &= ~flag; } + + void encode(ceph::buffer::list &bl) const { + ENCODE_START(1, 1, bl); + encode_bare(bl); + encode(flags, bl); + ENCODE_FINISH(bl); + } + + void decode(ceph::buffer::list::const_iterator &bl) { + DECODE_START(1, bl); + decode_bare(bl); + decode(flags, bl); + DECODE_FINISH(bl); + } + + /** + * Unversioned encoding for use within encode_array. + */ + void encode_bare(ceph::buffer::list &bl) const { + using ceph::encode; + encode(cmdstring, bl); + encode(helpstring, bl); + encode(module, bl); + encode(req_perms, bl); + std::string availability = "cli,rest"; // Removed field, for backward compat + encode(availability, bl); + } + void decode_bare(ceph::buffer::list::const_iterator &bl) { + using ceph::decode; + decode(cmdstring, bl); + decode(helpstring, bl); + decode(module, bl); + decode(req_perms, bl); + std::string availability; // Removed field, for backward compat + decode(availability, bl); + } + bool is_compat(const MonCommand* o) const { + return cmdstring == o->cmdstring && + module == o->module && req_perms == o->req_perms; + } + + bool is_tell() const { + return has_flag(MonCommand::FLAG_TELL); + } + + bool is_noforward() const { + return has_flag(MonCommand::FLAG_NOFORWARD); + } + + bool is_obsolete() const { + return has_flag(MonCommand::FLAG_OBSOLETE); + } + + bool is_deprecated() const { + return has_flag(MonCommand::FLAG_DEPRECATED); + } + + bool is_mgr() const { + return has_flag(MonCommand::FLAG_MGR); + } + + bool is_hidden() const { + return has_flag(MonCommand::FLAG_HIDDEN); + } + + static void encode_array(const MonCommand *cmds, int size, ceph::buffer::list &bl) { + ENCODE_START(2, 1, bl); + uint16_t s = size; + encode(s, bl); + for (int i = 0; i < size; ++i) { + cmds[i].encode_bare(bl); + } + for (int i = 0; i < size; i++) { + encode(cmds[i].flags, bl); + } + ENCODE_FINISH(bl); + } + static void decode_array(MonCommand **cmds, int *size, + ceph::buffer::list::const_iterator &bl) { + DECODE_START(2, bl); + uint16_t s = 0; + decode(s, bl); + *size = s; + *cmds = new MonCommand[*size]; + for (int i = 0; i < *size; ++i) { + (*cmds)[i].decode_bare(bl); + } + if (struct_v >= 2) { + for (int i = 0; i < *size; i++) + decode((*cmds)[i].flags, bl); + } else { + for (int i = 0; i < *size; i++) + (*cmds)[i].flags = 0; + } + DECODE_FINISH(bl); + } + + // this uses a u16 for the count, so we need a special encoder/decoder. 
+ static void encode_vector(const std::vector<MonCommand>& cmds, + ceph::buffer::list &bl) { + ENCODE_START(2, 1, bl); + uint16_t s = cmds.size(); + encode(s, bl); + for (unsigned i = 0; i < s; ++i) { + cmds[i].encode_bare(bl); + } + for (unsigned i = 0; i < s; i++) { + encode(cmds[i].flags, bl); + } + ENCODE_FINISH(bl); + } + static void decode_vector(std::vector<MonCommand> &cmds, + ceph::buffer::list::const_iterator &bl) { + DECODE_START(2, bl); + uint16_t s = 0; + decode(s, bl); + cmds.resize(s); + for (unsigned i = 0; i < s; ++i) { + cmds[i].decode_bare(bl); + } + if (struct_v >= 2) { + for (unsigned i = 0; i < s; i++) + decode(cmds[i].flags, bl); + } else { + for (unsigned i = 0; i < s; i++) + cmds[i].flags = 0; + } + DECODE_FINISH(bl); + } + + bool requires_perm(char p) const { + return (req_perms.find(p) != std::string::npos); + } +}; +WRITE_CLASS_ENCODER(MonCommand) diff --git a/src/mon/MonCommands.h b/src/mon/MonCommands.h new file mode 100644 index 000000000..f5ca47eb4 --- /dev/null +++ b/src/mon/MonCommands.h @@ -0,0 +1,1407 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2013 Inktank Storage, Inc. + * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com> + * + * Author: Loic Dachary <loic@dachary.org> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +/* no guard; may be included multiple times */ + +/* + * Define commands that are reported by the monitor's + * "get_command_descriptions" command, and parsed by the Python + * frontend 'ceph' (and perhaps by other frontends, such as a RESTful + * server). The format is: + * + * COMMAND(signature, helpstring, modulename, req perms, availability) + * where: + * signature: describes the command and its parameters (more below) + * helpstring: displays in CLI help, API help (nice if it refers to + * parameter names from signature, 40-a few hundred chars) + * modulename: the monitor module or daemon this applies to: + * mds, osd, pg (osd), mon, auth, log, config-key, mgr + * req perms: required permission in that modulename space to execute command + * this also controls what type of REST command is accepted + * + * The commands describe themselves completely enough for the separate + * frontend(s) to be able to accept user input and validate it against + * the command descriptions, and generate a JSON object that contains + * key:value mappings of parameter names to validated parameter values. + * + * 'signature' is a space-separated list of individual command descriptors; + * each descriptor is either a literal string, which can contain no spaces or + * '=' signs (for instance, in "pg stat", both "pg" and "stat" are literal + * strings representing one descriptor each), or a list of key=val[,key=val...] + * which also includes no spaces. + * + * The key=val form describes a non-literal parameter. Each will have at + * least a name= and type=, and each type can have its own type-specific + * parameters. The parser is the arbiter of these types and their + * interpretation. A few more non-type-specific key=val pairs exist: + * + * req=false marks an optional parameter (default for req is 'true') + * n=<n> is a repeat count for how many of this argument must be supplied. + * n=1 is the default. 
+ * n=N is a special case that means "1 or more".
+ *
+ * A perhaps-incomplete list of types:
+ *
+ * CephInt: Optional: range=min[|max]
+ * CephFloat: Optional range
+ * CephString: optional badchars
+ * CephSocketpath: validation involves "is it S_ISSOCK"
+ * CephIPAddr: v4 or v6 addr with optional port, syntax validated
+ * CephEntityAddr: CephIPAddr + optional '/nonce'
+ * CephPoolname: Plain old string
+ * CephObjectname: Another plain old string
+ * CephPgid: n.xxx where n is an int > 0, xxx is a hex number > 0
+ * CephName: daemon name, '*' or '<type>.<id>' (id must be int for type osd)
+ * CephOsdName: osd name, '*' or '<id>' or 'osd.<id>' (id must be int)
+ * CephChoices: strings="foo|bar" means this param can be either
+ * CephFilepath: openable file
+ * CephFragment: cephfs 'fragID': val/bits, val in hex 0xnnn, bits in dec
+ * CephUUID: uuid in text matching Python uuid.UUID()
+ * CephPrefix: special type assigned to literals
+ *
+ * Example:
+ *
+ * COMMAND("auth add "
+ *         "name=entity,type=CephString "
+ *         "name=caps,type=CephString,n=N,req=false",
+ *         "add auth info for <name> from input file, or random key "
+ *         "if no input given, and/or any caps specified in the command")
+ *
+ * defines a command "auth add" that takes a required argument "entity"
+ * of type "CephString", and from 1 to N arguments named "caps" of type
+ * CephString, at least one of which is required. The front end will
+ * validate user input against this description. Let's say the user
+ * enters auth add client.admin 'mon rwx' 'osd *'. The result will be a
+ * JSON object like {"prefix":"auth add", "entity":"client.admin",
+ * "caps":["mon rwx", "osd *"]}.
+ * Note that
+ *   - string literals are accumulated into 'prefix'
+ *   - n=1 descriptors are given normal string or int object values
+ *   - n=N descriptors are given array values
+ *
+ * NOTE: be careful with spaces. Each descriptor must be separated by
+ * one space, no other characters, so if you split lines as above, be
+ * sure to close and reopen the quotes, and be careful to include the '
+ * separating spaces in the quoted string.
+ *
+ * The monitor marshals this JSON into a std::map<string, cmd_vartype>
+ * where cmd_vartype is a boost::variant type-enforcing discriminated
+ * type, so the monitor is expected to know the type of each argument.
+ * See cmdparse.cc/h for more details.
+ *
+ * The flag parameter for the COMMAND_WITH_FLAG macro must be passed
+ * using FLAG(f), where 'f' may be one of the following:
+ *
+ *   NONE       - no flag assigned
+ *   NOFORWARD  - command may not be forwarded
+ *   OBSOLETE   - command is considered obsolete
+ *   DEPRECATED - command is considered deprecated
+ *   MGR        - command goes to ceph-mgr (for luminous+)
+ *   POLL       - command is intended to be called periodically by the
+ *                client (see iostat)
+ *   HIDDEN     - command is hidden (not reported by help, etc.)
+ *   TELL       - tell/asok command. It's an alias of (NOFORWARD | HIDDEN)
+ *
+ * A command should always be first considered DEPRECATED before being
+ * considered OBSOLETE, giving due consideration to users and conforming
+ * to any guidelines regarding deprecating commands.
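+ *
+ * For instance (this matches the "version" entry defined below), the
+ * following declares "version" as a tell/asok-style command that is
+ * neither forwarded between monitors nor listed in help output:
+ *
+ *   COMMAND_WITH_FLAG("version", "show mon daemon version", "mon", "r",
+ *                     FLAG(TELL))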
+ */ + +COMMAND("pg map name=pgid,type=CephPgid", "show mapping of pg to osds", \ + "pg", "r") +COMMAND("pg repeer name=pgid,type=CephPgid", "force a PG to repeer", + "osd", "rw") +COMMAND("osd last-stat-seq name=id,type=CephOsdName", \ + "get the last pg stats sequence number reported for this osd", \ + "osd", "r") + +/* + * auth commands AuthMonitor.cc + */ + +COMMAND("auth export name=entity,type=CephString,req=false", \ + "write keyring for requested entity, or master keyring if none given", \ + "auth", "rx") +COMMAND("auth get name=entity,type=CephString", \ + "write keyring file with requested key", "auth", "rx") +COMMAND("auth get-key name=entity,type=CephString", "display requested key", \ + "auth", "rx") +COMMAND("auth print-key name=entity,type=CephString", "display requested key", \ + "auth", "rx") +COMMAND("auth print_key name=entity,type=CephString", "display requested key", \ + "auth", "rx") +COMMAND_WITH_FLAG("auth list", "list authentication state", "auth", "rx", + FLAG(DEPRECATED)) +COMMAND("auth ls", "list authentication state", "auth", "rx") +COMMAND("auth import", "auth import: read keyring file from -i <file>", + "auth", "rwx") +COMMAND("auth add " + "name=entity,type=CephString " + "name=caps,type=CephString,n=N,req=false", + "add auth info for <entity> from input file, or random key if no " + "input is given, and/or any caps specified in the command", + "auth", "rwx") +COMMAND("auth get-or-create-key " + "name=entity,type=CephString " + "name=caps,type=CephString,n=N,req=false", + "get, or add, key for <name> from system/caps pairs specified in the command. If key already exists, any given caps must match the existing caps for that key.", + "auth", "rwx") +COMMAND("auth get-or-create " + "name=entity,type=CephString " + "name=caps,type=CephString,n=N,req=false", + "add auth info for <entity> from input file, or random key if no input given, and/or any caps specified in the command", + "auth", "rwx") +COMMAND("fs authorize " + "name=filesystem,type=CephString " + "name=entity,type=CephString " + "name=caps,type=CephString,n=N", + "add auth for <entity> to access file system <filesystem> based on following directory and permissions pairs", + "auth", "rwx") +COMMAND("auth caps " + "name=entity,type=CephString " + "name=caps,type=CephString,n=N", + "update caps for <name> from caps specified in the command", + "auth", "rwx") +COMMAND_WITH_FLAG("auth del " + "name=entity,type=CephString", + "delete all caps for <name>", + "auth", "rwx", + FLAG(DEPRECATED)) +COMMAND("auth rm " + "name=entity,type=CephString", + "remove all caps for <name>", + "auth", "rwx") + +/* + * Monitor commands (Monitor.cc) + */ +COMMAND_WITH_FLAG("compact", "cause compaction of monitor's leveldb/rocksdb storage", + "mon", "rw", + FLAG(TELL)) +COMMAND_WITH_FLAG("scrub", "scrub the monitor stores", + "mon", "rw", + FLAG(OBSOLETE)) +COMMAND("fsid", "show cluster FSID/UUID", "mon", "r") +COMMAND("log name=logtext,type=CephString,n=N", + "log supplied text to the monitor log", "mon", "rw") +COMMAND("log last " + "name=num,type=CephInt,range=1,req=false " + "name=level,type=CephChoices,strings=debug|info|sec|warn|error,req=false " + "name=channel,type=CephChoices,strings=*|cluster|audit|cephadm,req=false", + "print last few lines of the cluster log", + "mon", "r") + +COMMAND("status", "show cluster status", "mon", "r") +COMMAND("health name=detail,type=CephChoices,strings=detail,req=false", + "show cluster health", "mon", "r") +COMMAND("health mute "\ + "name=code,type=CephString " + 
"name=ttl,type=CephString,req=false " + "name=sticky,type=CephBool,req=false", + "mute health alert", "mon", "w") +COMMAND("health unmute "\ + "name=code,type=CephString,req=false", + "unmute existing health alert mute(s)", "mon", "w") +COMMAND("time-sync-status", "show time sync status", "mon", "r") +COMMAND("df name=detail,type=CephChoices,strings=detail,req=false", + "show cluster free space stats", "mon", "r") +COMMAND("report name=tags,type=CephString,n=N,req=false", + "report full status of cluster, optional title tag strings", + "mon", "r") +COMMAND("features", "report of connected features", + "mon", "r") +COMMAND("quorum_status", "report status of monitor quorum", + "mon", "r") +COMMAND("mon ok-to-stop " + "name=ids,type=CephString,n=N", + "check whether mon(s) can be safely stopped without reducing immediate " + "availability", + "mon", "r") +COMMAND("mon ok-to-add-offline", + "check whether adding a mon and not starting it would break quorum", + "mon", "r") +COMMAND("mon ok-to-rm " + "name=id,type=CephString", + "check whether removing the specified mon would break quorum", + "mon", "r") + +COMMAND("tell " + "name=target,type=CephName " + "name=args,type=CephString,n=N", + "send a command to a specific daemon", "mon", "rw") +COMMAND_WITH_FLAG("version", "show mon daemon version", "mon", "r", + FLAG(TELL)) + +COMMAND("node ls " + "name=type,type=CephChoices,strings=all|osd|mon|mds|mgr,req=false", + "list all nodes in cluster [type]", "mon", "r") +/* + * Monitor-specific commands under module 'mon' + */ +COMMAND_WITH_FLAG("mon scrub", + "scrub the monitor stores", + "mon", "rw", + FLAG(NONE)) +COMMAND("mon metadata name=id,type=CephString,req=false", + "fetch metadata for mon <id>", + "mon", "r") +COMMAND("mon count-metadata name=property,type=CephString", + "count mons by metadata field property", + "mon", "r") +COMMAND("mon versions", + "check running versions of monitors", + "mon", "r") +COMMAND("versions", + "check running versions of ceph daemons", + "mon", "r") + + + +/* + * MDS commands (MDSMonitor.cc) + */ + +COMMAND_WITH_FLAG("mds stat", "show MDS status", "mds", "r", FLAG(HIDDEN)) +COMMAND_WITH_FLAG("mds dump " + "name=epoch,type=CephInt,req=false,range=0", + "dump legacy MDS cluster info, optionally from epoch", + "mds", "r", FLAG(OBSOLETE)) +COMMAND("fs dump " + "name=epoch,type=CephInt,req=false,range=0", + "dump all CephFS status, optionally from epoch", "mds", "r") +COMMAND_WITH_FLAG("mds getmap " + "name=epoch,type=CephInt,req=false,range=0", + "get MDS map, optionally from epoch", "mds", "r", FLAG(OBSOLETE)) +COMMAND("mds metadata name=who,type=CephString,req=false", + "fetch metadata for mds <role>", + "mds", "r") +COMMAND("mds count-metadata name=property,type=CephString", + "count MDSs by metadata field property", + "mds", "r") +COMMAND("mds versions", + "check running versions of MDSs", + "mds", "r") +COMMAND_WITH_FLAG("mds tell " + "name=who,type=CephString " + "name=args,type=CephString,n=N", + "send command to particular mds", "mds", "rw", FLAG(OBSOLETE)) +COMMAND_WITH_FLAG("mds stop name=role,type=CephString", "stop mds", + "mds", "rw", FLAG(OBSOLETE)) +COMMAND_WITH_FLAG("mds deactivate name=role,type=CephString", + "clean up specified MDS rank (use with `set max_mds` to shrink cluster)", + "mds", "rw", FLAG(OBSOLETE)) +COMMAND("mds ok-to-stop name=ids,type=CephString,n=N", + "check whether stopping the specified MDS would reduce immediate availability", + "mds", "r") +COMMAND_WITH_FLAG("mds set_max_mds " + "name=maxmds,type=CephInt,range=0", + "set max 
MDS index", "mds", "rw", FLAG(OBSOLETE)) +COMMAND_WITH_FLAG("mds set " + "name=var,type=CephChoices,strings=max_mds|max_file_size|inline_data|" + "allow_new_snaps|allow_multimds|allow_multimds_snaps|allow_dirfrags " + "name=val,type=CephString " + "name=yes_i_really_mean_it,type=CephBool,req=false", + "set mds parameter <var> to <val>", "mds", "rw", FLAG(OBSOLETE)) +COMMAND_WITH_FLAG("mds freeze name=role_or_gid,type=CephString" + " name=val,type=CephString", + "freeze MDS yes/no", "mds", "rw", FLAG(HIDDEN)) +// arbitrary limit 0-20 below; worth standing on head to make it +// relate to actual state definitions? +// #include "include/ceph_fs.h" +COMMAND_WITH_FLAG("mds set_state " + "name=gid,type=CephInt,range=0 " + "name=state,type=CephInt,range=0|20", + "set mds state of <gid> to <numeric-state>", "mds", "rw", FLAG(HIDDEN)) +COMMAND("mds fail name=role_or_gid,type=CephString", + "Mark MDS failed: trigger a failover if a standby is available", + "mds", "rw") +COMMAND("mds repaired name=role,type=CephString", + "mark a damaged MDS rank as no longer damaged", "mds", "rw") +COMMAND("mds rm " + "name=gid,type=CephInt,range=0", + "remove nonactive mds", "mds", "rw") +COMMAND_WITH_FLAG("mds rmfailed name=role,type=CephString " + "name=yes_i_really_mean_it,type=CephBool,req=false", + "remove failed rank", "mds", "rw", FLAG(HIDDEN)) +COMMAND_WITH_FLAG("mds cluster_down", "take MDS cluster down", "mds", "rw", FLAG(OBSOLETE)) +COMMAND_WITH_FLAG("mds cluster_up", "bring MDS cluster up", "mds", "rw", FLAG(OBSOLETE)) +COMMAND_WITH_FLAG("mds compat show", "show mds compatibility settings", + "mds", "r", FLAG(DEPRECATED)) +COMMAND("fs compat show " + "name=fs_name,type=CephString ", + "show fs compatibility settings", + "mds", "r") +COMMAND_WITH_FLAG("mds compat rm_compat " + "name=feature,type=CephInt,range=0", + "remove compatible feature", "mds", "rw", FLAG(DEPRECATED)) +COMMAND_WITH_FLAG("mds compat rm_incompat " + "name=feature,type=CephInt,range=0", + "remove incompatible feature", "mds", "rw", FLAG(DEPRECATED)) +COMMAND_WITH_FLAG("mds add_data_pool " + "name=pool,type=CephString", + "add data pool <pool>", "mds", "rw", FLAG(OBSOLETE)) +COMMAND_WITH_FLAG("mds rm_data_pool " + "name=pool,type=CephString", + "remove data pool <pool>", "mds", "rw", FLAG(OBSOLETE)) +COMMAND_WITH_FLAG("mds remove_data_pool " + "name=pool,type=CephString", + "remove data pool <pool>", "mds", "rw", FLAG(OBSOLETE)) +COMMAND_WITH_FLAG("mds newfs " + "name=metadata,type=CephInt,range=0 " + "name=data,type=CephInt,range=0 " + "name=yes_i_really_mean_it,type=CephBool,req=false", + "make new filesystem using pools <metadata> and <data>", + "mds", "rw", FLAG(OBSOLETE)) +COMMAND("fs new " + "name=fs_name,type=CephString,goodchars=[A-Za-z0-9-_.] 
" + "name=metadata,type=CephString " + "name=data,type=CephString " + "name=force,type=CephBool,req=false " + "name=allow_dangerous_metadata_overlay,type=CephBool,req=false " + "name=fscid,type=CephInt,range=0,req=false " + "name=recover,type=CephBool,req=false", + "make new filesystem using named pools <metadata> and <data>", + "fs", "rw") +COMMAND("fs fail " + "name=fs_name,type=CephString ", + "bring the file system down and all of its ranks", + "fs", "rw") +COMMAND("fs rm " + "name=fs_name,type=CephString " + "name=yes_i_really_mean_it,type=CephBool,req=false", + "disable the named filesystem", + "fs", "rw") +COMMAND("fs reset " + "name=fs_name,type=CephString " + "name=yes_i_really_mean_it,type=CephBool,req=false", + "disaster recovery only: reset to a single-MDS map", + "fs", "rw") +COMMAND("fs ls ", + "list filesystems", + "fs", "r") +COMMAND("fs get name=fs_name,type=CephString", + "get info about one filesystem", + "fs", "r") +COMMAND("fs set " + "name=fs_name,type=CephString " + "name=var,type=CephChoices,strings=max_mds|max_file_size" + "|allow_new_snaps|inline_data|cluster_down|allow_dirfrags|balancer" + "|standby_count_wanted|session_timeout|session_autoclose" + "|allow_standby_replay|down|joinable|min_compat_client " + "name=val,type=CephString " + "name=yes_i_really_mean_it,type=CephBool,req=false " + "name=yes_i_really_really_mean_it,type=CephBool,req=false", + "set fs parameter <var> to <val>", "mds", "rw") +COMMAND("fs flag set name=flag_name,type=CephChoices,strings=enable_multiple " + "name=val,type=CephString " + "name=yes_i_really_mean_it,type=CephBool,req=false", + "Set a global CephFS flag", + "fs", "rw") + +COMMAND("fs feature ls", + "list available cephfs features to be set/unset", + "mds", "r") + +COMMAND("fs compat " + "name=fs_name,type=CephString " + "name=subop,type=CephChoices,strings=rm_compat|rm_incompat|add_compat|add_incompat " + "name=feature,type=CephInt " + "name=feature_str,type=CephString,req=false ", + "manipulate compat settings", "fs", "rw") + +COMMAND("fs required_client_features " + "name=fs_name,type=CephString " + "name=subop,type=CephChoices,strings=add|rm " + "name=val,type=CephString ", + "add/remove required features of clients", "mds", "rw") + +COMMAND("fs add_data_pool name=fs_name,type=CephString " + "name=pool,type=CephString", + "add data pool <pool>", "mds", "rw") +COMMAND("fs rm_data_pool name=fs_name,type=CephString " + "name=pool,type=CephString", + "remove data pool <pool>", "mds", "rw") +COMMAND_WITH_FLAG("fs set_default name=fs_name,type=CephString", + "set the default to the named filesystem", + "fs", "rw", + FLAG(DEPRECATED)) +COMMAND("fs set-default name=fs_name,type=CephString", + "set the default to the named filesystem", + "fs", "rw") +COMMAND("fs mirror enable " + "name=fs_name,type=CephString ", + "enable mirroring for a ceph filesystem", "mds", "rw") +COMMAND("fs mirror disable " + "name=fs_name,type=CephString ", + "disable mirroring for a ceph filesystem", "mds", "rw") +COMMAND("fs mirror peer_add " + "name=fs_name,type=CephString " + "name=uuid,type=CephString " + "name=remote_cluster_spec,type=CephString " + "name=remote_fs_name,type=CephString", + "add a mirror peer for a ceph filesystem", "mds", "rw") +COMMAND("fs mirror peer_remove " + "name=fs_name,type=CephString " + "name=uuid,type=CephString ", + "remove a mirror peer for a ceph filesystem", "mds", "rw") + +/* + * Monmap commands + */ +COMMAND("mon dump " + "name=epoch,type=CephInt,range=0,req=false", + "dump formatted monmap (optionally from epoch)", + 
"mon", "r") +COMMAND("mon stat", "summarize monitor status", "mon", "r") +COMMAND("mon getmap " + "name=epoch,type=CephInt,range=0,req=false", + "get monmap", "mon", "r") +COMMAND("mon add " + "name=name,type=CephString " + "name=addr,type=CephIPAddr " + "name=location,type=CephString,n=N,goodchars=[A-Za-z0-9-_.=],req=false", + "add new monitor named <name> at <addr>, possibly with CRUSH location <location>", "mon", "rw") +COMMAND("mon rm " + "name=name,type=CephString", + "remove monitor named <name>", "mon", "rw") +COMMAND_WITH_FLAG("mon remove " + "name=name,type=CephString", + "remove monitor named <name>", "mon", "rw", + FLAG(DEPRECATED)) +COMMAND("mon feature ls " + "name=with_value,type=CephChoices,strings=--with-value,req=false", + "list available mon map features to be set/unset", + "mon", "r") +COMMAND("mon feature set " + "name=feature_name,type=CephString " + "name=yes_i_really_mean_it,type=CephBool,req=false", + "set provided feature on mon map", + "mon", "rw") +COMMAND("mon set-rank " + "name=name,type=CephString " + "name=rank,type=CephInt", + "set the rank for the specified mon", + "mon", "rw") +COMMAND("mon set-addrs " + "name=name,type=CephString " + "name=addrs,type=CephString", + "set the addrs (IPs and ports) a specific monitor binds to", + "mon", "rw") +COMMAND("mon set-weight " + "name=name,type=CephString " + "name=weight,type=CephInt,range=0|65535", + "set the weight for the specified mon", + "mon", "rw") +COMMAND("mon enable-msgr2", + "enable the msgr2 protocol on port 3300", + "mon", "rw") +COMMAND("mon set election_strategy " \ + "name=strategy,type=CephString", \ + "set the election strategy to use; choices classic, disallow, connectivity", \ + "mon", "rw") +COMMAND("mon add disallowed_leader " \ + "name=name,type=CephString", \ + "prevent the named mon from being a leader", \ + "mon", "rw") +COMMAND("mon rm disallowed_leader " \ + "name=name,type=CephString", \ + "allow the named mon to be a leader again", \ + "mon", "rw") +COMMAND("mon set_location " \ + "name=name,type=CephString " + "name=args,type=CephString,n=N,goodchars=[A-Za-z0-9-_.=]", + "specify location <args> for the monitor <name>, using CRUSH bucket names", \ + "mon", "rw") +COMMAND("mon enable_stretch_mode " \ + "name=tiebreaker_mon,type=CephString, " + "name=new_crush_rule,type=CephString, " + "name=dividing_bucket,type=CephString, ", + "enable stretch mode, changing the peering rules and " + "failure handling on all pools with <tiebreaker_mon> " + "as the tiebreaker and setting <dividing_bucket> locations " + "as the units for stretching across", + "mon", "rw") +COMMAND("mon set_new_tiebreaker " \ + "name=name,type=CephString " + "name=yes_i_really_mean_it,type=CephBool,req=false", + "switch the stretch tiebreaker to be the named mon", \ + "mon", "rw") + +/* + * OSD commands + */ +COMMAND("osd stat", "print summary of OSD map", "osd", "r") +COMMAND("osd dump " + "name=epoch,type=CephInt,range=0,req=false", + "print summary of OSD map", "osd", "r") +COMMAND("osd info " + "name=id,type=CephOsdName,req=false", + "print osd's {id} information (instead of all osds from map)", + "osd", "r") +COMMAND("osd tree " + "name=epoch,type=CephInt,range=0,req=false " + "name=states,type=CephChoices,strings=up|down|in|out|destroyed,n=N,req=false", + "print OSD tree", "osd", "r") +COMMAND("osd tree-from " + "name=epoch,type=CephInt,range=0,req=false " + "name=bucket,type=CephString " + "name=states,type=CephChoices,strings=up|down|in|out|destroyed,n=N,req=false", + "print OSD tree in bucket", "osd", "r") 
+COMMAND("osd ls " + "name=epoch,type=CephInt,range=0,req=false", + "show all OSD ids", "osd", "r") +COMMAND("osd getmap " + "name=epoch,type=CephInt,range=0,req=false", + "get OSD map", "osd", "r") +COMMAND("osd getcrushmap " + "name=epoch,type=CephInt,range=0,req=false", + "get CRUSH map", "osd", "r") +COMMAND("osd getmaxosd", "show largest OSD id", "osd", "r") +COMMAND("osd ls-tree " + "name=epoch,type=CephInt,range=0,req=false " + "name=name,type=CephString,req=true", + "show OSD ids under bucket <name> in the CRUSH map", + "osd", "r") +COMMAND("osd find " + "name=id,type=CephOsdName", + "find osd <id> in the CRUSH map and show its location", + "osd", "r") +COMMAND("osd metadata " + "name=id,type=CephOsdName,req=false", + "fetch metadata for osd {id} (default all)", + "osd", "r") +COMMAND("osd count-metadata name=property,type=CephString", + "count OSDs by metadata field property", + "osd", "r") +COMMAND("osd versions", + "check running versions of OSDs", + "osd", "r") +COMMAND("osd numa-status", + "show NUMA status of OSDs", + "osd", "r") +COMMAND("osd map " + "name=pool,type=CephPoolname " + "name=object,type=CephObjectname " + "name=nspace,type=CephString,req=false", + "find pg for <object> in <pool> with [namespace]", "osd", "r") +COMMAND_WITH_FLAG("osd lspools", + "list pools", "osd", "r", FLAG(DEPRECATED)) +COMMAND_WITH_FLAG("osd crush rule list", "list crush rules", "osd", "r", + FLAG(DEPRECATED)) +COMMAND("osd crush rule ls", "list crush rules", "osd", "r") +COMMAND("osd crush rule ls-by-class " + "name=class,type=CephString,goodchars=[A-Za-z0-9-_.]", + "list all crush rules that reference the same <class>", + "osd", "r") +COMMAND("osd crush rule dump " + "name=name,type=CephString,goodchars=[A-Za-z0-9-_.],req=false", + "dump crush rule <name> (default all)", + "osd", "r") +COMMAND("osd crush dump", + "dump crush map", + "osd", "r") +COMMAND("osd setcrushmap name=prior_version,type=CephInt,req=false", + "set crush map from input file", + "osd", "rw") +COMMAND("osd crush set name=prior_version,type=CephInt,req=false", + "set crush map from input file", + "osd", "rw") +COMMAND("osd crush add-bucket " + "name=name,type=CephString,goodchars=[A-Za-z0-9-_.] " + "name=type,type=CephString " + "name=args,type=CephString,n=N,goodchars=[A-Za-z0-9-_.=],req=false", + "add no-parent (probably root) crush bucket <name> of type <type> " + "to location <args>", + "osd", "rw") +COMMAND("osd crush rename-bucket " + "name=srcname,type=CephString,goodchars=[A-Za-z0-9-_.] 
" + "name=dstname,type=CephString,goodchars=[A-Za-z0-9-_.]", + "rename bucket <srcname> to <dstname>", + "osd", "rw") +COMMAND("osd crush set " + "name=id,type=CephOsdName " + "name=weight,type=CephFloat,range=0.0 " + "name=args,type=CephString,n=N,goodchars=[A-Za-z0-9-_.=]", + "update crushmap position and weight for <name> to <weight> with location <args>", + "osd", "rw") +COMMAND("osd crush add " + "name=id,type=CephOsdName " + "name=weight,type=CephFloat,range=0.0 " + "name=args,type=CephString,n=N,goodchars=[A-Za-z0-9-_.=]", + "add or update crushmap position and weight for <name> with <weight> and location <args>", + "osd", "rw") +COMMAND("osd crush set-all-straw-buckets-to-straw2", + "convert all CRUSH current straw buckets to use the straw2 algorithm", + "osd", "rw") +COMMAND("osd crush class create " + "name=class,type=CephString,goodchars=[A-Za-z0-9-_]", + "create crush device class <class>", + "osd", "rw") +COMMAND("osd crush class rm " + "name=class,type=CephString,goodchars=[A-Za-z0-9-_]", + "remove crush device class <class>", + "osd", "rw") +COMMAND("osd crush set-device-class " + "name=class,type=CephString " + "name=ids,type=CephString,n=N", + "set the <class> of the osd(s) <id> [<id>...]," + "or use <all|any> to set all.", + "osd", "rw") +COMMAND("osd crush rm-device-class " + "name=ids,type=CephString,n=N", + "remove class of the osd(s) <id> [<id>...]," + "or use <all|any> to remove all.", + "osd", "rw") +COMMAND("osd crush class rename " + "name=srcname,type=CephString,goodchars=[A-Za-z0-9-_] " + "name=dstname,type=CephString,goodchars=[A-Za-z0-9-_]", + "rename crush device class <srcname> to <dstname>", + "osd", "rw") +COMMAND("osd crush create-or-move " + "name=id,type=CephOsdName " + "name=weight,type=CephFloat,range=0.0 " + "name=args,type=CephString,n=N,goodchars=[A-Za-z0-9-_.=]", + "create entry or move existing entry for <name> <weight> at/to location <args>", + "osd", "rw") +COMMAND("osd crush move " + "name=name,type=CephString,goodchars=[A-Za-z0-9-_.] " + "name=args,type=CephString,n=N,goodchars=[A-Za-z0-9-_.=]", + "move existing entry for <name> to location <args>", + "osd", "rw") +COMMAND("osd crush swap-bucket " + "name=source,type=CephString,goodchars=[A-Za-z0-9-_.] " + "name=dest,type=CephString,goodchars=[A-Za-z0-9-_.] " + "name=yes_i_really_mean_it,type=CephBool,req=false", + "swap existing bucket contents from (orphan) bucket <source> and <target>", + "osd", "rw") +COMMAND("osd crush link " + "name=name,type=CephString " + "name=args,type=CephString,n=N,goodchars=[A-Za-z0-9-_.=]", + "link existing entry for <name> under location <args>", + "osd", "rw") +COMMAND("osd crush rm " + "name=name,type=CephString,goodchars=[A-Za-z0-9-_.] " + "name=ancestor,type=CephString,req=false,goodchars=[A-Za-z0-9-_.]", + "remove <name> from crush map (everywhere, or just at <ancestor>)",\ + "osd", "rw") +COMMAND_WITH_FLAG("osd crush remove " + "name=name,type=CephString,goodchars=[A-Za-z0-9-_.] " + "name=ancestor,type=CephString,req=false,goodchars=[A-Za-z0-9-_.]", + "remove <name> from crush map (everywhere, or just at <ancestor>)", + "osd", "rw", + FLAG(DEPRECATED)) +COMMAND("osd crush unlink " + "name=name,type=CephString,goodchars=[A-Za-z0-9-_.] 
" + "name=ancestor,type=CephString,req=false,goodchars=[A-Za-z0-9-_.]", + "unlink <name> from crush map (everywhere, or just at <ancestor>)", + "osd", "rw") +COMMAND("osd crush reweight-all", + "recalculate the weights for the tree to ensure they sum correctly", + "osd", "rw") +COMMAND("osd crush reweight " + "name=name,type=CephString,goodchars=[A-Za-z0-9-_.] " + "name=weight,type=CephFloat,range=0.0", + "change <name>'s weight to <weight> in crush map", + "osd", "rw") +COMMAND("osd crush reweight-subtree " + "name=name,type=CephString,goodchars=[A-Za-z0-9-_.] " + "name=weight,type=CephFloat,range=0.0", + "change all leaf items beneath <name> to <weight> in crush map", + "osd", "rw") +COMMAND("osd crush tunables " + "name=profile,type=CephChoices,strings=legacy|argonaut|bobtail|firefly|hammer|jewel|optimal|default", + "set crush tunables values to <profile>", "osd", "rw") +COMMAND("osd crush set-tunable " + "name=tunable,type=CephChoices,strings=straw_calc_version " + "name=value,type=CephInt", + "set crush tunable <tunable> to <value>", + "osd", "rw") +COMMAND("osd crush get-tunable " + "name=tunable,type=CephChoices,strings=straw_calc_version", + "get crush tunable <tunable>", + "osd", "r") +COMMAND("osd crush show-tunables", + "show current crush tunables", "osd", "r") +COMMAND("osd crush rule create-simple " + "name=name,type=CephString,goodchars=[A-Za-z0-9-_.] " + "name=root,type=CephString,goodchars=[A-Za-z0-9-_.] " + "name=type,type=CephString,goodchars=[A-Za-z0-9-_.] " + "name=mode,type=CephChoices,strings=firstn|indep,req=false", + "create crush rule <name> to start from <root>, replicate across buckets of type <type>, using a choose mode of <firstn|indep> (default firstn; indep best for erasure pools)", + "osd", "rw") +COMMAND("osd crush rule create-replicated " + "name=name,type=CephString,goodchars=[A-Za-z0-9-_.] " + "name=root,type=CephString,goodchars=[A-Za-z0-9-_.] " + "name=type,type=CephString,goodchars=[A-Za-z0-9-_.] " + "name=class,type=CephString,goodchars=[A-Za-z0-9-_.],req=false", + "create crush rule <name> for replicated pool to start from <root>, replicate across buckets of type <type>, use devices of type <class> (ssd or hdd)", + "osd", "rw") +COMMAND("osd crush rule create-erasure " + "name=name,type=CephString,goodchars=[A-Za-z0-9-_.] " + "name=profile,type=CephString,req=false,goodchars=[A-Za-z0-9-_.=]", + "create crush rule <name> for erasure coded pool created with <profile> (default default)", + "osd", "rw") +COMMAND("osd crush rule rm " + "name=name,type=CephString,goodchars=[A-Za-z0-9-_.] ", + "remove crush rule <name>", "osd", "rw") +COMMAND("osd crush rule rename " + "name=srcname,type=CephString,goodchars=[A-Za-z0-9-_.] 
" + "name=dstname,type=CephString,goodchars=[A-Za-z0-9-_.]", + "rename crush rule <srcname> to <dstname>", + "osd", "rw") +COMMAND("osd crush tree " + "name=shadow,type=CephChoices,strings=--show-shadow,req=false", + "dump crush buckets and items in a tree view", + "osd", "r") +COMMAND("osd crush ls name=node,type=CephString,goodchars=[A-Za-z0-9-_.]", + "list items beneath a node in the CRUSH tree", + "osd", "r") +COMMAND("osd crush class ls", + "list all crush device classes", + "osd", "r") +COMMAND("osd crush class ls-osd " + "name=class,type=CephString,goodchars=[A-Za-z0-9-_]", + "list all osds belonging to the specific <class>", + "osd", "r") +COMMAND("osd crush get-device-class " + "name=ids,type=CephString,n=N", + "get classes of specified osd(s) <id> [<id>...]", + "osd", "r") +COMMAND("osd crush weight-set ls", + "list crush weight sets", + "osd", "r") +COMMAND("osd crush weight-set dump", + "dump crush weight sets", + "osd", "r") +COMMAND("osd crush weight-set create-compat", + "create a default backward-compatible weight-set", + "osd", "rw") +COMMAND("osd crush weight-set create " + "name=pool,type=CephPoolname "\ + "name=mode,type=CephChoices,strings=flat|positional", + "create a weight-set for a given pool", + "osd", "rw") +COMMAND("osd crush weight-set rm name=pool,type=CephPoolname", + "remove the weight-set for a given pool", + "osd", "rw") +COMMAND("osd crush weight-set rm-compat", + "remove the backward-compatible weight-set", + "osd", "rw") +COMMAND("osd crush weight-set reweight " + "name=pool,type=CephPoolname " + "name=item,type=CephString " + "name=weight,type=CephFloat,range=0.0,n=N", + "set weight for an item (bucket or osd) in a pool's weight-set", + "osd", "rw") +COMMAND("osd crush weight-set reweight-compat " + "name=item,type=CephString " + "name=weight,type=CephFloat,range=0.0,n=N", + "set weight for an item (bucket or osd) in the backward-compatible weight-set", + "osd", "rw") +COMMAND("osd setmaxosd " + "name=newmax,type=CephInt,range=0", + "set new maximum osd value", "osd", "rw") +COMMAND("osd set-full-ratio " + "name=ratio,type=CephFloat,range=0.0|1.0", + "set usage ratio at which OSDs are marked full", + "osd", "rw") +COMMAND("osd set-backfillfull-ratio " + "name=ratio,type=CephFloat,range=0.0|1.0", + "set usage ratio at which OSDs are marked too full to backfill", + "osd", "rw") +COMMAND("osd set-nearfull-ratio " + "name=ratio,type=CephFloat,range=0.0|1.0", + "set usage ratio at which OSDs are marked near-full", + "osd", "rw") +COMMAND("osd get-require-min-compat-client", + "get the minimum client version we will maintain compatibility with", + "osd", "r") +COMMAND("osd set-require-min-compat-client " + "name=version,type=CephString " + "name=yes_i_really_mean_it,type=CephBool,req=false", + "set the minimum client version we will maintain compatibility with", + "osd", "rw") +COMMAND("osd pause", "pause osd", "osd", "rw") +COMMAND("osd unpause", "unpause osd", "osd", "rw") +COMMAND("osd erasure-code-profile set " + "name=name,type=CephString,goodchars=[A-Za-z0-9-_.] " + "name=profile,type=CephString,n=N,req=false " + "name=force,type=CephBool,req=false", + "create erasure code profile <name> with [<key[=value]> ...] pairs. 
Add a --force at the end to override an existing profile (VERY DANGEROUS)", + "osd", "rw") +COMMAND("osd erasure-code-profile get " + "name=name,type=CephString,goodchars=[A-Za-z0-9-_.]", + "get erasure code profile <name>", + "osd", "r") +COMMAND("osd erasure-code-profile rm " + "name=name,type=CephString,goodchars=[A-Za-z0-9-_.]", + "remove erasure code profile <name>", + "osd", "rw") +COMMAND("osd erasure-code-profile ls", + "list all erasure code profiles", + "osd", "r") +COMMAND("osd set " + "name=key,type=CephChoices,strings=full|pause|noup|nodown|" + "noout|noin|nobackfill|norebalance|norecover|noscrub|nodeep-scrub|" + "notieragent|nosnaptrim|pglog_hardlimit " + "name=yes_i_really_mean_it,type=CephBool,req=false", + "set <key>", "osd", "rw") +COMMAND("osd unset " + "name=key,type=CephChoices,strings=full|pause|noup|nodown|"\ + "noout|noin|nobackfill|norebalance|norecover|noscrub|nodeep-scrub|" + "notieragent|nosnaptrim", + "unset <key>", "osd", "rw") +COMMAND("osd require-osd-release "\ + "name=release,type=CephChoices,strings=luminous|mimic|nautilus|octopus|pacific " + "name=yes_i_really_mean_it,type=CephBool,req=false", + "set the minimum allowed OSD release to participate in the cluster", + "osd", "rw") +COMMAND("osd down " + "name=ids,type=CephString,n=N " + "name=definitely_dead,type=CephBool,req=false", + "set osd(s) <id> [<id>...] down, " + "or use <any|all> to set all osds down", + "osd", "rw") +COMMAND("osd stop " + "type=CephString,name=ids,n=N", + "stop the corresponding osd daemons and mark them as down", + "osd", "rw") +COMMAND("osd out " + "name=ids,type=CephString,n=N", + "set osd(s) <id> [<id>...] out, " + "or use <any|all> to set all osds out", + "osd", "rw") +COMMAND("osd in " + "name=ids,type=CephString,n=N", + "set osd(s) <id> [<id>...] in, " + "can use <any|all> to automatically set all previously out osds in", + "osd", "rw") +COMMAND_WITH_FLAG("osd rm " + "name=ids,type=CephString,n=N", + "remove osd(s) <id> [<id>...], " + "or use <any|all> to remove all osds", + "osd", "rw", + FLAG(DEPRECATED)) +COMMAND_WITH_FLAG("osd add-noup " + "name=ids,type=CephString,n=N", + "mark osd(s) <id> [<id>...] as noup, " + "or use <all|any> to mark all osds as noup", + "osd", "rw", + FLAG(DEPRECATED)) +COMMAND_WITH_FLAG("osd add-nodown " + "name=ids,type=CephString,n=N", + "mark osd(s) <id> [<id>...] as nodown, " + "or use <all|any> to mark all osds as nodown", + "osd", "rw", + FLAG(DEPRECATED)) +COMMAND_WITH_FLAG("osd add-noin " + "name=ids,type=CephString,n=N", + "mark osd(s) <id> [<id>...] as noin, " + "or use <all|any> to mark all osds as noin", + "osd", "rw", + FLAG(DEPRECATED)) +COMMAND_WITH_FLAG("osd add-noout " + "name=ids,type=CephString,n=N", + "mark osd(s) <id> [<id>...] as noout, " + "or use <all|any> to mark all osds as noout", + "osd", "rw", + FLAG(DEPRECATED)) +COMMAND_WITH_FLAG("osd rm-noup " + "name=ids,type=CephString,n=N", + "allow osd(s) <id> [<id>...] to be marked up " + "(if they are currently marked as noup), " + "can use <all|any> to automatically filter out all noup osds", + "osd", "rw", + FLAG(DEPRECATED)) +COMMAND_WITH_FLAG("osd rm-nodown " + "name=ids,type=CephString,n=N", + "allow osd(s) <id> [<id>...] to be marked down " + "(if they are currently marked as nodown), " + "can use <all|any> to automatically filter out all nodown osds", + "osd", "rw", + FLAG(DEPRECATED)) +COMMAND_WITH_FLAG("osd rm-noin " + "name=ids,type=CephString,n=N", + "allow osd(s) <id> [<id>...] 
to be marked in " + "(if they are currently marked as noin), " + "can use <all|any> to automatically filter out all noin osds", + "osd", "rw", + FLAG(DEPRECATED)) +COMMAND_WITH_FLAG("osd rm-noout " + "name=ids,type=CephString,n=N", + "allow osd(s) <id> [<id>...] to be marked out " + "(if they are currently marked as noout), " + "can use <all|any> to automatically filter out all noout osds", + "osd", "rw", + FLAG(DEPRECATED)) +COMMAND("osd set-group " + "name=flags,type=CephString " + "name=who,type=CephString,n=N", + "set <flags> for batch osds or crush nodes, " + "<flags> must be a comma-separated subset of {noup,nodown,noin,noout}", + "osd", "rw") +COMMAND("osd unset-group " + "name=flags,type=CephString " + "name=who,type=CephString,n=N", + "unset <flags> for batch osds or crush nodes, " + "<flags> must be a comma-separated subset of {noup,nodown,noin,noout}", + "osd", "rw") +COMMAND("osd reweight " + "name=id,type=CephOsdName " + "type=CephFloat,name=weight,range=0.0|1.0", + "reweight osd to 0.0 < <weight> < 1.0", "osd", "rw") +COMMAND("osd reweightn " + "name=weights,type=CephString", + "reweight osds with {<id>: <weight>,...}", + "osd", "rw") +COMMAND("osd force-create-pg " + "name=pgid,type=CephPgid "\ + "name=yes_i_really_mean_it,type=CephBool,req=false", + "force creation of pg <pgid>", + "osd", "rw") +COMMAND("osd pg-temp " + "name=pgid,type=CephPgid " + "name=id,type=CephOsdName,n=N,req=false", + "set pg_temp mapping pgid:[<id> [<id>...]] (developers only)", + "osd", "rw") +COMMAND("osd pg-upmap " + "name=pgid,type=CephPgid " + "name=id,type=CephOsdName,n=N", + "set pg_upmap mapping <pgid>:[<id> [<id>...]] (developers only)", + "osd", "rw") +COMMAND("osd rm-pg-upmap " + "name=pgid,type=CephPgid", + "clear pg_upmap mapping for <pgid> (developers only)", + "osd", "rw") +COMMAND("osd pg-upmap-items " + "name=pgid,type=CephPgid " + "name=id,type=CephOsdName,n=N", + "set pg_upmap_items mapping <pgid>:{<id> to <id>, [...]} (developers only)", + "osd", "rw") +COMMAND("osd rm-pg-upmap-items " + "name=pgid,type=CephPgid", + "clear pg_upmap_items mapping for <pgid> (developers only)", + "osd", "rw") +COMMAND("osd primary-temp " + "name=pgid,type=CephPgid " + "name=id,type=CephOsdName", + "set primary_temp mapping pgid:<id>|-1 (developers only)", + "osd", "rw") +COMMAND("osd primary-affinity " + "name=id,type=CephOsdName " + "type=CephFloat,name=weight,range=0.0|1.0", + "adjust osd primary-affinity from 0.0 <= <weight> <= 1.0", + "osd", "rw") +COMMAND_WITH_FLAG("osd destroy-actual " + "name=id,type=CephOsdName " + "name=yes_i_really_mean_it,type=CephBool,req=false", + "mark osd as being destroyed. Keeps the ID intact (allowing reuse), " + "but removes cephx keys, config-key data and lockbox keys, "\ + "rendering data permanently unreadable.", + "osd", "rw", FLAG(HIDDEN)) +COMMAND("osd purge-new " + "name=id,type=CephOsdName " + "name=yes_i_really_mean_it,type=CephBool,req=false", + "purge all traces of an OSD that was partially created but never " + "started", + "osd", "rw") +COMMAND_WITH_FLAG("osd purge-actual " + "name=id,type=CephOsdName " + "name=yes_i_really_mean_it,type=CephBool,req=false", + "purge all osd data from the monitors. Combines `osd destroy`, " + "`osd rm`, and `osd crush rm`.", + "osd", "rw", FLAG(HIDDEN)) +COMMAND("osd lost " + "name=id,type=CephOsdName " + "name=yes_i_really_mean_it,type=CephBool,req=false", + "mark osd as permanently lost. 
THIS DESTROYS DATA IF NO MORE REPLICAS EXIST, BE CAREFUL", + "osd", "rw") +COMMAND_WITH_FLAG("osd create " + "name=uuid,type=CephUUID,req=false " + "name=id,type=CephOsdName,req=false", + "create new osd (with optional UUID and ID)", "osd", "rw", + FLAG(DEPRECATED)) +COMMAND("osd new " + "name=uuid,type=CephUUID,req=true " + "name=id,type=CephOsdName,req=false", + "Create a new OSD. If supplied, the `id` to be replaced needs to " + "exist and have been previously destroyed. " + "Reads secrets from JSON file via `-i <file>` (see man page).", + "osd", "rw") +COMMAND("osd blocklist " + "name=range,type=CephString,goodchars=[range],req=false " + "name=blocklistop,type=CephChoices,strings=add|rm " + "name=addr,type=CephEntityAddr " + "name=expire,type=CephFloat,range=0.0,req=false", + "add (optionally until <expire> seconds from now) or remove <addr> from blocklist", + "osd", "rw") +COMMAND("osd blocklist ls", "show blocklisted clients", "osd", "r") +COMMAND("osd blocklist clear", "clear all blocklisted clients", "osd", "rw") + +COMMAND_WITH_FLAG("osd blacklist " + "name=blacklistop,type=CephChoices,strings=add|rm " + "name=addr,type=CephEntityAddr " + "name=expire,type=CephFloat,range=0.0,req=false", + "add (optionally until <expire> seconds from now) or remove <addr> from blacklist", + "osd", "rw", + FLAG(DEPRECATED)) +COMMAND_WITH_FLAG("osd blacklist ls", "show blacklisted clients", "osd", "r", + FLAG(DEPRECATED)) +COMMAND_WITH_FLAG("osd blacklist clear", "clear all blacklisted clients", "osd", "rw", + FLAG(DEPRECATED)) + +COMMAND("osd pool mksnap " + "name=pool,type=CephPoolname " + "name=snap,type=CephString", + "make snapshot <snap> in <pool>", "osd", "rw") +COMMAND("osd pool rmsnap " + "name=pool,type=CephPoolname " + "name=snap,type=CephString", + "remove snapshot <snap> from <pool>", "osd", "rw") +COMMAND("osd pool ls " + "name=detail,type=CephChoices,strings=detail,req=false", + "list pools", "osd", "r") +COMMAND("osd pool create " + "name=pool,type=CephPoolname " + "name=pg_num,type=CephInt,range=0,req=false " + "name=pgp_num,type=CephInt,range=0,req=false " + "name=pool_type,type=CephChoices,strings=replicated|erasure,req=false " + "name=erasure_code_profile,type=CephString,req=false,goodchars=[A-Za-z0-9-_.] 
" + "name=rule,type=CephString,req=false " + "name=expected_num_objects,type=CephInt,range=0,req=false " + "name=size,type=CephInt,range=0,req=false " + "name=pg_num_min,type=CephInt,range=0,req=false " + "name=pg_num_max,type=CephInt,range=0,req=false " + "name=autoscale_mode,type=CephChoices,strings=on|off|warn,req=false " + "name=bulk,type=CephBool,req=false " + "name=target_size_bytes,type=CephInt,range=0,req=false " + "name=target_size_ratio,type=CephFloat,range=0.0,req=false",\ + "create pool", "osd", "rw") +COMMAND_WITH_FLAG("osd pool delete " + "name=pool,type=CephPoolname " + "name=pool2,type=CephPoolname,req=false " + "name=yes_i_really_really_mean_it,type=CephBool,req=false " + "name=yes_i_really_really_mean_it_not_faking,type=CephBool,req=false ", + "delete pool", + "osd", "rw", + FLAG(DEPRECATED)) +COMMAND("osd pool rm " + "name=pool,type=CephPoolname " + "name=pool2,type=CephPoolname,req=false " + "name=yes_i_really_really_mean_it,type=CephBool,req=false " + "name=yes_i_really_really_mean_it_not_faking,type=CephBool,req=false ", + "remove pool", + "osd", "rw") +COMMAND("osd pool rename " + "name=srcpool,type=CephPoolname " + "name=destpool,type=CephPoolname", + "rename <srcpool> to <destpool>", "osd", "rw") +COMMAND("osd pool get " + "name=pool,type=CephPoolname " + "name=var,type=CephChoices,strings=size|min_size|pg_num|pgp_num|crush_rule|hashpspool|nodelete|nopgchange|nosizechange|write_fadvise_dontneed|noscrub|nodeep-scrub|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|use_gmt_hitset|target_max_objects|target_max_bytes|cache_target_dirty_ratio|cache_target_dirty_high_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|erasure_code_profile|min_read_recency_for_promote|all|min_write_recency_for_promote|fast_read|hit_set_grade_decay_rate|hit_set_search_last_n|scrub_min_interval|scrub_max_interval|deep_scrub_interval|recovery_priority|recovery_op_priority|scrub_priority|compression_mode|compression_algorithm|compression_required_ratio|compression_max_blob_size|compression_min_blob_size|csum_type|csum_min_block|csum_max_block|allow_ec_overwrites|fingerprint_algorithm|pg_autoscale_mode|pg_autoscale_bias|pg_num_min|pg_num_max|target_size_bytes|target_size_ratio|dedup_tier|dedup_chunk_algorithm|dedup_cdc_chunk_size|bulk", + "get pool parameter <var>", "osd", "r") +COMMAND("osd pool set " + "name=pool,type=CephPoolname " + "name=var,type=CephChoices,strings=size|min_size|pg_num|pgp_num|pgp_num_actual|crush_rule|hashpspool|nodelete|nopgchange|nosizechange|write_fadvise_dontneed|noscrub|nodeep-scrub|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|use_gmt_hitset|target_max_bytes|target_max_objects|cache_target_dirty_ratio|cache_target_dirty_high_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|min_read_recency_for_promote|min_write_recency_for_promote|fast_read|hit_set_grade_decay_rate|hit_set_search_last_n|scrub_min_interval|scrub_max_interval|deep_scrub_interval|recovery_priority|recovery_op_priority|scrub_priority|compression_mode|compression_algorithm|compression_required_ratio|compression_max_blob_size|compression_min_blob_size|csum_type|csum_min_block|csum_max_block|allow_ec_overwrites|fingerprint_algorithm|pg_autoscale_mode|pg_autoscale_bias|pg_num_min|pg_num_max|target_size_bytes|target_size_ratio|dedup_tier|dedup_chunk_algorithm|dedup_cdc_chunk_size|bulk " + "name=val,type=CephString " + "name=yes_i_really_mean_it,type=CephBool,req=false", + "set pool parameter <var> to <val>", "osd", "rw") +// 'val' is a CephString because 
it can include a unit. Perhaps
+// there should be a Python type for validation/conversion of strings
+// with units.
+COMMAND("osd pool set-quota "
+ "name=pool,type=CephPoolname "
+ "name=field,type=CephChoices,strings=max_objects|max_bytes "
+ "name=val,type=CephString",
+ "set object or byte limit on pool", "osd", "rw")
+COMMAND("osd pool get-quota "
+ "name=pool,type=CephPoolname ",
+ "obtain object or byte limits for pool",
+ "osd", "r")
+COMMAND("osd pool application enable "
+ "name=pool,type=CephPoolname "
+ "name=app,type=CephString,goodchars=[A-Za-z0-9-_.] "
+ "name=yes_i_really_mean_it,type=CephBool,req=false",
+ "enable use of an application <app> [cephfs,rbd,rgw] on pool <poolname>",
+ "osd", "rw")
+COMMAND("osd pool application disable "
+ "name=pool,type=CephPoolname "
+ "name=app,type=CephString "
+ "name=yes_i_really_mean_it,type=CephBool,req=false",
+ "disable use of an application <app> on pool <poolname>",
+ "osd", "rw")
+COMMAND("osd pool application set "
+ "name=pool,type=CephPoolname "
+ "name=app,type=CephString "
+ "name=key,type=CephString,goodchars=[A-Za-z0-9-_.] "
+ "name=value,type=CephString,goodchars=[A-Za-z0-9-_.=]",
+ "set application <app> metadata key <key> to <value> on pool <poolname>",
+ "osd", "rw")
+COMMAND("osd pool application rm "
+ "name=pool,type=CephPoolname "
+ "name=app,type=CephString "
+ "name=key,type=CephString",
+ "remove application <app> metadata key <key> on pool <poolname>",
+ "osd", "rw")
+COMMAND("osd pool application get "
+ "name=pool,type=CephPoolname,req=false "
+ "name=app,type=CephString,req=false "
+ "name=key,type=CephString,req=false",
+ "get value of key <key> of application <app> on pool <poolname>",
+ "osd", "r")
+COMMAND("osd utilization",
+ "get basic pg distribution stats",
+ "osd", "r")
+COMMAND("osd force_healthy_stretch_mode " \
+ "name=yes_i_really_mean_it,type=CephBool,req=false",
+ "force a healthy stretch mode, requiring the full number of CRUSH buckets "
+ "to peer and letting all non-tiebreaker monitors be elected leader",
+ "osd", "rw")
+COMMAND("osd force_recovery_stretch_mode " \
+ "name=yes_i_really_mean_it,type=CephBool,req=false",
+ "try to force a recovery stretch mode, increasing the "
+ "pool size to its non-failure value if currently degraded and "
+ "all monitor buckets are up",
+ "osd", "rw")
+
+
+// tiering
+COMMAND("osd tier add "
+ "name=pool,type=CephPoolname "
+ "name=tierpool,type=CephPoolname "
+ "name=force_nonempty,type=CephChoices,strings=--force-nonempty,req=false",
+ "add the tier <tierpool> (the second one) to base pool <pool> (the first one)",
+ "osd", "rw")
+COMMAND("osd tier rm "
+ "name=pool,type=CephPoolname "
+ "name=tierpool,type=CephPoolname",
+ "remove the tier <tierpool> (the second one) from base pool <pool> (the first one)",
+ "osd", "rw")
+COMMAND_WITH_FLAG("osd tier remove "
+ "name=pool,type=CephPoolname "
+ "name=tierpool,type=CephPoolname",
+ "remove the tier <tierpool> (the second one) from base pool <pool> (the first one)",
+ "osd", "rw",
+ FLAG(DEPRECATED))
+COMMAND("osd tier cache-mode "
+ "name=pool,type=CephPoolname "
+ "name=mode,type=CephChoices,strings=writeback|readproxy|readonly|none "
+ "name=yes_i_really_mean_it,type=CephBool,req=false",
+ "specify the caching mode for cache tier <pool>", "osd", "rw")
+COMMAND("osd tier set-overlay "
+ "name=pool,type=CephPoolname "
+ "name=overlaypool,type=CephPoolname",
+ "set the overlay pool for base pool <pool> to be <overlaypool>", "osd", "rw")
+COMMAND("osd tier rm-overlay "
+ "name=pool,type=CephPoolname
", + "remove the overlay pool for base pool <pool>", "osd", "rw") +COMMAND_WITH_FLAG("osd tier remove-overlay " + "name=pool,type=CephPoolname ", + "remove the overlay pool for base pool <pool>", "osd", "rw", + FLAG(DEPRECATED)) + +COMMAND("osd tier add-cache " + "name=pool,type=CephPoolname " + "name=tierpool,type=CephPoolname " + "name=size,type=CephInt,range=0", + "add a cache <tierpool> (the second one) of size <size> to existing pool <pool> (the first one)", + "osd", "rw") + +/* + * mon/KVMonitor.cc + */ + +COMMAND("config-key get " + "name=key,type=CephString", + "get <key>", "config-key", "r") +COMMAND("config-key set " + "name=key,type=CephString " + "name=val,type=CephString,req=false", + "set <key> to value <val>", "config-key", "rw") +COMMAND_WITH_FLAG("config-key put " + "name=key,type=CephString " + "name=val,type=CephString,req=false", + "put <key>, value <val>", "config-key", "rw", + FLAG(DEPRECATED)) +COMMAND_WITH_FLAG("config-key del " + "name=key,type=CephString", + "delete <key>", "config-key", "rw", + FLAG(DEPRECATED)) +COMMAND("config-key rm " + "name=key,type=CephString", + "rm <key>", "config-key", "rw") +COMMAND("config-key exists " + "name=key,type=CephString", + "check for <key>'s existence", "config-key", "r") +COMMAND_WITH_FLAG("config-key list ", "list keys", "config-key", "r", + FLAG(DEPRECATED)) +COMMAND("config-key ls ", "list keys", "config-key", "r") +COMMAND("config-key dump " + "name=key,type=CephString,req=false", "dump keys and values (with optional prefix)", "config-key", "r") + + +/* + * mon/MgrMonitor.cc + */ +COMMAND("mgr stat", + "dump basic info about the mgr cluster state", + "mgr", "r") +COMMAND("mgr dump " + "name=epoch,type=CephInt,range=0,req=false", + "dump the latest MgrMap", + "mgr", "r") +COMMAND("mgr fail name=who,type=CephString,req=false", + "treat the named manager daemon as failed", "mgr", "rw") +COMMAND("mgr module ls", + "list active mgr modules", "mgr", "r") +COMMAND("mgr services", + "list service endpoints provided by mgr modules", + "mgr", "r") +COMMAND("mgr module enable " + "name=module,type=CephString " + "name=force,type=CephChoices,strings=--force,req=false", + "enable mgr module", "mgr", "rw") +COMMAND("mgr module disable " + "name=module,type=CephString", + "disable mgr module", "mgr", "rw") +COMMAND("mgr metadata name=who,type=CephString,req=false", + "dump metadata for all daemons or a specific daemon", + "mgr", "r") +COMMAND("mgr count-metadata name=property,type=CephString", + "count ceph-mgr daemons by metadata field property", + "mgr", "r") +COMMAND("mgr versions", + "check running versions of ceph-mgr daemons", + "mgr", "r") + +// ConfigMonitor +COMMAND("config set" + " name=who,type=CephString" + " name=name,type=CephString" + " name=value,type=CephString" + " name=force,type=CephBool,req=false", + "Set a configuration option for one or more entities", + "config", "rw") +COMMAND("config rm" + " name=who,type=CephString" + " name=name,type=CephString", + "Clear a configuration option for one or more entities", + "config", "rw") +COMMAND("config get " + "name=who,type=CephString " + "name=key,type=CephString,req=False", + "Show configuration option(s) for an entity", + "config", "r") +COMMAND("config dump", + "Show all configuration option(s)", + "mon", "r") +COMMAND("config help " + "name=key,type=CephString", + "Describe a configuration option", + "config", "r") +COMMAND("config ls", + "List available configuration options", + "config", "r") +COMMAND("config assimilate-conf", + "Assimilate options from a conf, 
and return a new, minimal conf file", + "config", "rw") +COMMAND("config log name=num,type=CephInt,req=False", + "Show recent history of config changes", + "config", "r") +COMMAND("config reset " + "name=num,type=CephInt,range=0", + "Revert configuration to a historical version specified by <num>", + "config", "rw") +COMMAND("config generate-minimal-conf", + "Generate a minimal ceph.conf file", + "config", "r") + + + + +// these are tell commands that were implemented as CLI commands in +// the broken pre-octopus way that we want to allow to work when a +// monitor has upgraded to octopus+ but the monmap min_mon_release is +// still < octopus. we exclude things that weren't well supported +// before and that aren't implemented by the octopus mon anymore. +// +// the command set below matches the kludge in Monitor::handle_command +// that shunts these off to the asok machinery. + +COMMAND_WITH_FLAG("injectargs " + "name=injected_args,type=CephString,n=N", + "inject config arguments into monitor", "mon", "rw", + FLAG(TELL)) +COMMAND_WITH_FLAG("smart name=devid,type=CephString,req=false", + "Query health metrics for underlying device", + "mon", "rw", + FLAG(TELL)) +COMMAND_WITH_FLAG("mon_status", + "report status of monitors", + "mon", "r", + FLAG(TELL)) +COMMAND_WITH_FLAG("heap " + "name=heapcmd,type=CephChoices,strings=dump|start_profiler|stop_profiler|release|stats " + "name=value,type=CephString,req=false", + "show heap usage info (available only if compiled with tcmalloc)", + "mon", "rw", + FLAG(TELL)) +COMMAND_WITH_FLAG("connection scores dump", + "show the scores used in connectivity-based elections", + "mon", "rwx", + FLAG(TELL)) +COMMAND_WITH_FLAG("connection scores reset", + "reset the scores used in connectivity-based elections", + "mon", "rwx", + FLAG(TELL)) +COMMAND_WITH_FLAG("sync_force " + "name=validate,type=CephChoices,strings=--yes-i-really-mean-it,req=false", + "force sync of and clear monitor store", + "mon", "rw", + FLAG(TELL)) +COMMAND_WITH_FLAG("add_bootstrap_peer_hint " + "name=addr,type=CephIPAddr", + "add peer address as potential bootstrap " + "peer for cluster bringup", + "mon", "rw", + FLAG(TELL)) +COMMAND_WITH_FLAG("add_bootstrap_peer_hintv " + "name=addrv,type=CephString", + "add peer address vector as potential bootstrap " + "peer for cluster bringup", + "mon", "rw", + FLAG(TELL)) +COMMAND_WITH_FLAG("quorum enter ", + "force monitor back into quorum", + "mon", "rw", + FLAG(TELL)) +COMMAND_WITH_FLAG("quorum exit", + "force monitor out of the quorum", + "mon", "rw", + FLAG(TELL)) +COMMAND_WITH_FLAG("ops", + "show the ops currently in flight", + "mon", "r", + FLAG(TELL)) +COMMAND_WITH_FLAG("sessions", + "list existing sessions", + "mon", "r", + FLAG(TELL)) +COMMAND_WITH_FLAG("dump_historic_ops", + "dump_historic_ops", + "mon", "r", + FLAG(TELL)) diff --git a/src/mon/MonMap.cc b/src/mon/MonMap.cc new file mode 100644 index 000000000..2d14578a6 --- /dev/null +++ b/src/mon/MonMap.cc @@ -0,0 +1,972 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "MonMap.h" + +#include <algorithm> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> + +#ifdef WITH_SEASTAR +#include <seastar/core/fstream.hh> +#include <seastar/core/reactor.hh> +#include <seastar/net/dns.hh> +#include "crimson/common/config_proxy.h" +#endif + +#include "common/Formatter.h" + +#include "include/ceph_features.h" +#include "include/addr_parsing.h" +#include "common/ceph_argparse.h" +#include "common/dns_resolve.h" +#include 
"common/errno.h" +#include "common/dout.h" +#include "common/Clock.h" +#include "mon/health_check.h" + +using std::list; +using std::map; +using std::ostream; +using std::set; +using std::string; +using std::vector; + +using ceph::DNSResolver; +using ceph::Formatter; + +void mon_info_t::encode(ceph::buffer::list& bl, uint64_t features) const +{ + uint8_t v = 5; + uint8_t min_v = 1; + if (!crush_loc.empty()) { + // we added crush_loc in version 5, but need to let old clients decode it + // so just leave the min_v at version 1. Monitors are protected + // from misunderstandings about location because setting it is blocked + // on FEATURE_PINGING + min_v = 1; + } + if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) { + v = 2; + } + ENCODE_START(v, min_v, bl); + encode(name, bl); + if (v < 3) { + ceph_assert(min_v == 1); + auto a = public_addrs.legacy_addr(); + if (a != entity_addr_t()) { + encode(a, bl, features); + } else { + // note: we don't have a legacy addr here, so lie so that it looks + // like one, just so that old clients get a valid-looking map. + // they won't be able to talk to the v2 mons, but that's better + // than nothing. + encode(public_addrs.as_legacy_addr(), bl, features); + } + } else { + encode(public_addrs, bl, features); + } + encode(priority, bl); + encode(weight, bl); + encode(crush_loc, bl); + ENCODE_FINISH(bl); +} + +void mon_info_t::decode(ceph::buffer::list::const_iterator& p) +{ + DECODE_START(5, p); + decode(name, p); + decode(public_addrs, p); + if (struct_v >= 2) { + decode(priority, p); + } + if (struct_v >= 4) { + decode(weight, p); + } + if (struct_v >= 5) { + decode(crush_loc, p); + } + DECODE_FINISH(p); +} + +void mon_info_t::print(ostream& out) const +{ + out << "mon." << name + << " addrs " << public_addrs + << " priority " << priority + << " weight " << weight + << " crush location " << crush_loc; +} + +namespace { + struct rank_cmp { + bool operator()(const mon_info_t &a, const mon_info_t &b) const { + if (a.public_addrs.legacy_or_front_addr() == b.public_addrs.legacy_or_front_addr()) + return a.name < b.name; + return a.public_addrs.legacy_or_front_addr() < b.public_addrs.legacy_or_front_addr(); + } + }; +} + +void MonMap::calc_legacy_ranks() +{ + ranks.resize(mon_info.size()); + + // Used to order entries according to public_addr, because that's + // how the ranks are expected to be ordered by. We may expand this + // later on, according to some other criteria, by specifying a + // different comparator. + // + // Please note that we use a 'set' here instead of resorting to + // std::sort() because we need more info than that's available in + // the vector. The vector will thus be ordered by, e.g., public_addr + // while only containing the names of each individual monitor. + // The only way of achieving this with std::sort() would be to first + // insert every mon_info_t entry into a vector 'foo', std::sort() 'foo' + // with custom comparison functions, and then copy each invidual entry + // to a new vector. Unless there's a simpler way, we don't think the + // added complexity makes up for the additional memory usage of a 'set'. 
+ set<mon_info_t, rank_cmp> tmp; + + for (auto p = mon_info.begin(); p != mon_info.end(); ++p) { + mon_info_t &m = p->second; + tmp.insert(m); + } + + // map the set to the actual ranks etc + unsigned i = 0; + for (auto p = tmp.begin(); p != tmp.end(); ++p, ++i) { + ranks[i] = p->name; + } +} + +void MonMap::encode(ceph::buffer::list& blist, uint64_t con_features) const +{ + if ((con_features & CEPH_FEATURE_MONNAMES) == 0) { + using ceph::encode; + __u16 v = 1; + encode(v, blist); + ceph::encode_raw(fsid, blist); + encode(epoch, blist); + vector<entity_inst_t> mon_inst(ranks.size()); + for (unsigned n = 0; n < ranks.size(); n++) { + mon_inst[n].name = entity_name_t::MON(n); + mon_inst[n].addr = get_addrs(n).legacy_addr(); + } + encode(mon_inst, blist, con_features); + encode(last_changed, blist); + encode(created, blist); + return; + } + + map<string,entity_addr_t> legacy_mon_addr; + if (!HAVE_FEATURE(con_features, MONENC) || + !HAVE_FEATURE(con_features, SERVER_NAUTILUS)) { + for (auto& [name, info] : mon_info) { + legacy_mon_addr[name] = info.public_addrs.legacy_addr(); + } + } + + if (!HAVE_FEATURE(con_features, MONENC)) { + /* we keep the mon_addr map when encoding to ensure compatibility + * with clients and other monitors that do not yet support the 'mons' + * map. This map keeps its original behavior, containing a mapping of + * monitor id (i.e., 'foo' in 'mon.foo') to the monitor's public + * address -- which is obtained from the public address of each entry + * in the 'mons' map. + */ + using ceph::encode; + __u16 v = 2; + encode(v, blist); + ceph::encode_raw(fsid, blist); + encode(epoch, blist); + encode(legacy_mon_addr, blist, con_features); + encode(last_changed, blist); + encode(created, blist); + return; + } + + if (!HAVE_FEATURE(con_features, SERVER_NAUTILUS)) { + ENCODE_START(5, 3, blist); + ceph::encode_raw(fsid, blist); + encode(epoch, blist); + encode(legacy_mon_addr, blist, con_features); + encode(last_changed, blist); + encode(created, blist); + encode(persistent_features, blist); + encode(optional_features, blist); + encode(mon_info, blist, con_features); + ENCODE_FINISH(blist); + return; + } + + ENCODE_START(9, 6, blist); + ceph::encode_raw(fsid, blist); + encode(epoch, blist); + encode(last_changed, blist); + encode(created, blist); + encode(persistent_features, blist); + encode(optional_features, blist); + encode(mon_info, blist, con_features); + encode(ranks, blist); + encode(min_mon_release, blist); + encode(removed_ranks, blist); + uint8_t t = strategy; + encode(t, blist); + encode(disallowed_leaders, blist); + encode(stretch_mode_enabled, blist); + encode(tiebreaker_mon, blist); + encode(stretch_marked_down_mons, blist); + ENCODE_FINISH(blist); +} + +void MonMap::decode(ceph::buffer::list::const_iterator& p) +{ + map<string,entity_addr_t> mon_addr; + DECODE_START_LEGACY_COMPAT_LEN_16(9, 3, 3, p); + ceph::decode_raw(fsid, p); + decode(epoch, p); + if (struct_v == 1) { + vector<entity_inst_t> mon_inst; + decode(mon_inst, p); + for (unsigned i = 0; i < mon_inst.size(); i++) { + char n[2]; + n[0] = '0' + i; + n[1] = 0; + string name = n; + mon_addr[name] = mon_inst[i].addr; + } + } else if (struct_v < 6) { + decode(mon_addr, p); + } + decode(last_changed, p); + decode(created, p); + if (struct_v >= 4) { + decode(persistent_features, p); + decode(optional_features, p); + } + if (struct_v < 5) { + // generate mon_info from legacy mon_addr + for (auto& [name, addr] : mon_addr) { + mon_info_t &m = mon_info[name]; + m.name = name; + m.public_addrs = 
entity_addrvec_t(addr); + } + } else { + decode(mon_info, p); + } + if (struct_v < 6) { + calc_legacy_ranks(); + } else { + decode(ranks, p); + } + if (struct_v >= 7) { + decode(min_mon_release, p); + } else { + min_mon_release = infer_ceph_release_from_mon_features(persistent_features); + } + if (struct_v >= 8) { + decode(removed_ranks, p); + uint8_t t; + decode(t, p); + strategy = static_cast<election_strategy>(t); + decode(disallowed_leaders, p); + } + if (struct_v >= 9) { + decode(stretch_mode_enabled, p); + decode(tiebreaker_mon, p); + decode(stretch_marked_down_mons, p); + } else { + stretch_mode_enabled = false; + tiebreaker_mon = ""; + stretch_marked_down_mons.clear(); + } + calc_addr_mons(); + DECODE_FINISH(p); +} + +void MonMap::generate_test_instances(list<MonMap*>& o) +{ + o.push_back(new MonMap); + o.push_back(new MonMap); + o.back()->epoch = 1; + o.back()->last_changed = utime_t(123, 456); + o.back()->created = utime_t(789, 101112); + o.back()->add("one", entity_addrvec_t()); + + MonMap *m = new MonMap; + { + m->epoch = 1; + m->last_changed = utime_t(123, 456); + + entity_addrvec_t empty_addr_one = entity_addrvec_t(entity_addr_t()); + empty_addr_one.v[0].set_nonce(1); + m->add("empty_addr_one", empty_addr_one); + entity_addrvec_t empty_addr_two = entity_addrvec_t(entity_addr_t()); + empty_addr_two.v[0].set_nonce(2); + m->add("empty_addr_two", empty_addr_two); + + const char *local_pub_addr_s = "127.0.1.2"; + + const char *end_p = local_pub_addr_s + strlen(local_pub_addr_s); + entity_addrvec_t local_pub_addr; + local_pub_addr.parse(local_pub_addr_s, &end_p); + + m->add(mon_info_t("filled_pub_addr", entity_addrvec_t(local_pub_addr), 1, 1)); + + m->add("empty_addr_zero", entity_addrvec_t()); + } + o.push_back(m); +} + +// read from/write to a file +int MonMap::write(const char *fn) +{ + // encode + ceph::buffer::list bl; + encode(bl, CEPH_FEATURES_ALL); + + return bl.write_file(fn); +} + +int MonMap::read(const char *fn) +{ + // read + ceph::buffer::list bl; + std::string error; + int r = bl.read_file(fn, &error); + if (r < 0) + return r; + decode(bl); + return 0; +} + +void MonMap::print_summary(ostream& out) const +{ + out << "e" << epoch << ": " + << mon_info.size() << " mons at {"; + // the map that we used to print, as it was, no longer + // maps strings to the monitor's public address, but to + // mon_info_t instead. As such, print the map in a way + // that keeps the expected format. 
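+ // (illustrative output, assuming three mons a/b/c with made-up addrs:
+ // "e5: 3 mons at {a=[v2:10.0.0.1:3300/0,v1:10.0.0.1:6789/0],...}
+ // removed_ranks: {}")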
+ bool has_printed = false;
+ for (auto p = mon_info.begin(); p != mon_info.end(); ++p) {
+ if (has_printed)
+ out << ",";
+ out << p->first << "=" << p->second.public_addrs;
+ has_printed = true;
+ }
+ out << "}" << " removed_ranks: {" << removed_ranks << "}";
+}
+
+void MonMap::print(ostream& out) const
+{
+ out << "epoch " << epoch << "\n";
+ out << "fsid " << fsid << "\n";
+ out << "last_changed " << last_changed << "\n";
+ out << "created " << created << "\n";
+ out << "min_mon_release " << to_integer<unsigned>(min_mon_release)
+ << " (" << min_mon_release << ")\n";
+ out << "election_strategy: " << strategy << "\n";
+ if (stretch_mode_enabled) {
+ out << "stretch_mode_enabled " << stretch_mode_enabled << "\n";
+ out << "tiebreaker_mon " << tiebreaker_mon << "\n";
+ }
+ if (stretch_mode_enabled ||
+ !disallowed_leaders.empty()) {
+ out << "disallowed_leaders " << disallowed_leaders << "\n";
+ }
+ unsigned i = 0;
+ for (auto p = ranks.begin(); p != ranks.end(); ++p) {
+ const auto &mi = mon_info.find(*p);
+ ceph_assert(mi != mon_info.end());
+ out << i++ << ": " << mi->second.public_addrs << " mon." << *p;
+ if (!mi->second.crush_loc.empty()) {
+ out << "; crush_location " << mi->second.crush_loc;
+ }
+ out << "\n";
+ }
+}
+
+void MonMap::dump(Formatter *f) const
+{
+ f->dump_unsigned("epoch", epoch);
+ f->dump_stream("fsid") << fsid;
+ last_changed.gmtime(f->dump_stream("modified"));
+ created.gmtime(f->dump_stream("created"));
+ f->dump_unsigned("min_mon_release", to_integer<unsigned>(min_mon_release));
+ f->dump_string("min_mon_release_name", to_string(min_mon_release));
+ f->dump_int("election_strategy", strategy);
+ f->dump_stream("disallowed_leaders") << disallowed_leaders;
+ f->dump_bool("stretch_mode", stretch_mode_enabled);
+ f->dump_string("tiebreaker_mon", tiebreaker_mon);
+ f->dump_stream("removed_ranks") << removed_ranks;
+ f->open_object_section("features");
+ persistent_features.dump(f, "persistent");
+ optional_features.dump(f, "optional");
+ f->close_section();
+ f->open_array_section("mons");
+ int i = 0;
+ for (auto p = ranks.begin(); p != ranks.end(); ++p, ++i) {
+ f->open_object_section("mon");
+ f->dump_int("rank", i);
+ f->dump_string("name", *p);
+ f->dump_object("public_addrs", get_addrs(*p));
+ // compat: make these look like pre-nautilus entity_addr_t
+ f->dump_stream("addr") << get_addrs(*p).get_legacy_str();
+ f->dump_stream("public_addr") << get_addrs(*p).get_legacy_str();
+ f->dump_unsigned("priority", get_priority(*p));
+ f->dump_unsigned("weight", get_weight(*p));
+ const auto &mi = mon_info.find(*p);
+ // we don't need to assert validity here; the get_* calls above already did
+ f->dump_stream("crush_location") << mi->second.crush_loc;
+ f->close_section();
+ }
+ f->close_section();
+}
+
+void MonMap::dump_summary(Formatter *f) const
+{
+ f->dump_unsigned("epoch", epoch);
+ f->dump_string("min_mon_release_name", to_string(min_mon_release));
+ f->dump_unsigned("num_mons", ranks.size());
+}
+
+// an ambiguous mon addr may be legacy or may be msgr2--we aren't sure.
+// when that happens we need to try them both (unless we can
+// reasonably infer from the port number which it is).
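+//
+// for example (illustrative addresses): "10.0.0.1:6789" is assumed to be
+// v1 (legacy), "10.0.0.1:3300" is assumed to be v2 (msgr2), and a bare
+// "10.0.0.1" with no port is expanded to both a v2:10.0.0.1:3300 and a
+// v1:10.0.0.1:6789 entry.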
+void MonMap::_add_ambiguous_addr(const string& name, + entity_addr_t addr, + int priority, + int weight, + bool for_mkfs) +{ + if (addr.get_type() != entity_addr_t::TYPE_ANY) { + // a v1: or v2: prefix was specified + if (addr.get_port() == 0) { + // use default port + if (addr.get_type() == entity_addr_t::TYPE_LEGACY) { + addr.set_port(CEPH_MON_PORT_LEGACY); + } else if (addr.get_type() == entity_addr_t::TYPE_MSGR2) { + addr.set_port(CEPH_MON_PORT_IANA); + } else { + // wth + return; + } + if (!contains(addr)) { + add(name, entity_addrvec_t(addr), priority, weight); + } + } else { + if (!contains(addr)) { + add(name, entity_addrvec_t(addr), priority, weight); + } + } + } else { + // no v1: or v2: prefix specified + if (addr.get_port() == CEPH_MON_PORT_LEGACY) { + // legacy port implies legacy addr + addr.set_type(entity_addr_t::TYPE_LEGACY); + if (!contains(addr)) { + if (!for_mkfs) { + add(name + "-legacy", entity_addrvec_t(addr), priority, weight); + } else { + add(name, entity_addrvec_t(addr), priority, weight); + } + } + } else if (addr.get_port() == CEPH_MON_PORT_IANA) { + // iana port implies msgr2 addr + addr.set_type(entity_addr_t::TYPE_MSGR2); + if (!contains(addr)) { + add(name, entity_addrvec_t(addr), priority, weight); + } + } else if (addr.get_port() == 0) { + // no port; include both msgr2 and legacy ports + if (!for_mkfs) { + addr.set_type(entity_addr_t::TYPE_MSGR2); + addr.set_port(CEPH_MON_PORT_IANA); + if (!contains(addr)) { + add(name, entity_addrvec_t(addr), priority, weight); + } + addr.set_type(entity_addr_t::TYPE_LEGACY); + addr.set_port(CEPH_MON_PORT_LEGACY); + if (!contains(addr)) { + add(name + "-legacy", entity_addrvec_t(addr), priority, weight); + } + } else { + entity_addrvec_t av; + addr.set_type(entity_addr_t::TYPE_MSGR2); + addr.set_port(CEPH_MON_PORT_IANA); + av.v.push_back(addr); + addr.set_type(entity_addr_t::TYPE_LEGACY); + addr.set_port(CEPH_MON_PORT_LEGACY); + av.v.push_back(addr); + if (!contains(av)) { + add(name, av, priority, weight); + } + } + } else { + addr.set_type(entity_addr_t::TYPE_MSGR2); + if (!contains(addr)) { + add(name, entity_addrvec_t(addr), priority, weight); + } + if (!for_mkfs) { + // try legacy on same port too + addr.set_type(entity_addr_t::TYPE_LEGACY); + if (!contains(addr)) { + add(name + "-legacy", entity_addrvec_t(addr), priority, weight); + } + } + } + } +} + +void MonMap::init_with_addrs(const std::vector<entity_addrvec_t>& addrs, + bool for_mkfs, + std::string_view prefix) +{ + char id = 'a'; + for (auto& addr : addrs) { + string name{prefix}; + name += id++; + if (addr.v.size() == 1) { + _add_ambiguous_addr(name, addr.front(), 0, 0, for_mkfs); + } else { + // they specified an addrvec, so let's assume they also specified + // the addr *type* and *port*. (we could possibly improve this?) 
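+ // (e.g., an explicit "[v2:10.0.0.1:3300,v1:10.0.0.1:6789]" -- a made-up
+ // example -- is added as-is, without any port or type inference.)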
+ add(name, addr, 0); + } + } +} + +int MonMap::init_with_ips(const std::string& ips, + bool for_mkfs, + std::string_view prefix) +{ + vector<entity_addrvec_t> addrs; + if (!parse_ip_port_vec( + ips.c_str(), addrs, + entity_addr_t::TYPE_ANY)) { + return -EINVAL; + } + if (addrs.empty()) + return -ENOENT; + init_with_addrs(addrs, for_mkfs, prefix); + return 0; +} + +int MonMap::init_with_hosts(const std::string& hostlist, + bool for_mkfs, + std::string_view prefix) +{ + // maybe they passed us a DNS-resolvable name + char *hosts = resolve_addrs(hostlist.c_str()); + if (!hosts) + return -EINVAL; + + vector<entity_addrvec_t> addrs; + bool success = parse_ip_port_vec( + hosts, addrs, + entity_addr_t::TYPE_ANY); + free(hosts); + if (!success) + return -EINVAL; + if (addrs.empty()) + return -ENOENT; + init_with_addrs(addrs, for_mkfs, prefix); + calc_legacy_ranks(); + return 0; +} + +void MonMap::set_initial_members(CephContext *cct, + list<std::string>& initial_members, + string my_name, + const entity_addrvec_t& my_addrs, + set<entity_addrvec_t> *removed) +{ + // remove non-initial members + unsigned i = 0; + while (i < size()) { + string n = get_name(i); + if (std::find(initial_members.begin(), initial_members.end(), n) + != initial_members.end()) { + lgeneric_dout(cct, 1) << " keeping " << n << " " << get_addrs(i) << dendl; + i++; + continue; + } + + lgeneric_dout(cct, 1) << " removing " << get_name(i) << " " << get_addrs(i) + << dendl; + if (removed) { + removed->insert(get_addrs(i)); + } + remove(n); + ceph_assert(!contains(n)); + } + + // add missing initial members + for (auto& p : initial_members) { + if (!contains(p)) { + if (p == my_name) { + lgeneric_dout(cct, 1) << " adding self " << p << " " << my_addrs + << dendl; + add(p, my_addrs); + } else { + entity_addr_t a; + a.set_type(entity_addr_t::TYPE_LEGACY); + a.set_family(AF_INET); + for (int n=1; ; n++) { + a.set_nonce(n); + if (!contains(a)) + break; + } + lgeneric_dout(cct, 1) << " adding " << p << " " << a << dendl; + add(p, entity_addrvec_t(a)); + } + ceph_assert(contains(p)); + } + } + calc_legacy_ranks(); +} + +int MonMap::init_with_config_file(const ConfigProxy& conf, + std::ostream& errout) +{ + std::vector<std::string> sections; + int ret = conf.get_all_sections(sections); + if (ret) { + errout << "Unable to find any monitors in the configuration " + << "file, because there was an error listing the sections. error " + << ret << std::endl; + return -ENOENT; + } + std::vector<std::string> mon_names; + for (const auto& section : sections) { + if (section.substr(0, 4) == "mon." && section.size() > 4) { + mon_names.push_back(section.substr(4)); + } + } + + // Find an address for each monitor in the config file. + for (const auto& mon_name : mon_names) { + std::vector<std::string> sections; + std::string m_name("mon"); + m_name += "."; + m_name += mon_name; + sections.push_back(m_name); + sections.push_back("mon"); + sections.push_back("global"); + std::string val; + int res = conf.get_val_from_conf_file(sections, "mon addr", val, true); + if (res) { + errout << "failed to get an address for mon." << mon_name + << ": error " << res << std::endl; + continue; + } + // the 'mon addr' field is a legacy field, so assume anything + // there on a weird port is a v1 address, and do not handle + // addrvecs. + entity_addr_t addr; + if (!addr.parse(val.c_str(), nullptr, entity_addr_t::TYPE_LEGACY)) { + errout << "unable to parse address for mon." 
<< mon_name
+ << ": addr='" << val << "'" << std::endl;
+ continue;
+ }
+ if (addr.get_port() == 0) {
+ addr.set_port(CEPH_MON_PORT_LEGACY);
+ }
+ uint16_t priority = 0;
+ if (!conf.get_val_from_conf_file(sections, "mon priority", val, false)) {
+ try {
+ priority = std::stoul(val);
+ } catch (std::logic_error&) {
+ errout << "unable to parse priority for mon." << mon_name
+ << ": priority='" << val << "'" << std::endl;
+ continue;
+ }
+ }
+ uint16_t weight = 0;
+ if (!conf.get_val_from_conf_file(sections, "mon weight", val, false)) {
+ try {
+ weight = std::stoul(val);
+ } catch (std::logic_error&) {
+ errout << "unable to parse weight for mon." << mon_name
+ << ": weight='" << val << "'"
+ << std::endl;
+ continue;
+ }
+ }
+
+ // make sure this mon isn't already in the map
+ if (contains(addr))
+ remove(get_name(addr));
+ if (contains(mon_name))
+ remove(mon_name);
+ _add_ambiguous_addr(mon_name, addr, priority, weight, false);
+ }
+ return 0;
+}
+
+void MonMap::check_health(health_check_map_t *checks) const
+{
+ if (stretch_mode_enabled) {
+ list<string> detail;
+ for (auto& p : mon_info) {
+ if (p.second.crush_loc.empty()) {
+ ostringstream ss;
+ ss << "mon " << p.first << " has no location set while in stretch mode";
+ detail.push_back(ss.str());
+ }
+ }
+ if (!detail.empty()) {
+ ostringstream ss;
+ ss << detail.size() << " monitor(s) have no location set while in stretch mode"
+ << "; this may cause issues with failover, OSD connections, netsplit handling, etc";
+ auto& d = checks->add("MON_LOCATION_NOT_SET", HEALTH_WARN,
+ ss.str(), detail.size());
+ d.detail.swap(detail);
+ }
+ }
+}
+
+#ifdef WITH_SEASTAR
+
+using namespace seastar;
+
+seastar::future<> MonMap::read_monmap(const std::string& monmap)
+{
+ return open_file_dma(monmap, open_flags::ro).then([this] (file f) {
+ return f.size().then([this, f = std::move(f)](size_t s) {
+ return do_with(make_file_input_stream(f), [this, s](input_stream<char>& in) {
+ return in.read_exactly(s).then([this](temporary_buffer<char> buf) {
+ ceph::buffer::list bl;
+ bl.push_back(ceph::buffer::ptr_node::create(
+ ceph::buffer::create(std::move(buf))));
+ decode(bl);
+ });
+ });
+ });
+ });
+}
+
+seastar::future<> MonMap::init_with_dns_srv(bool for_mkfs, const std::string& name)
+{
+ string domain;
+ string service = name;
+ // check if a domain is also provided and extract it from srv_name
+ size_t idx = name.find("_");
+ if (idx != name.npos) {
+ domain = name.substr(idx + 1);
+ service = name.substr(0, idx);
+ }
+ return seastar::net::dns::get_srv_records(
+ seastar::net::dns_resolver::srv_proto::tcp,
+ service, domain).then([this](seastar::net::dns_resolver::srv_records records) {
+ return parallel_for_each(records, [this](auto record) {
+ return seastar::net::dns::resolve_name(record.target).then(
+ [record,this](seastar::net::inet_address a) {
+ // the resolved address does not contain ceph-specific info like the
+ // nonce or msgr proto (legacy, msgr2), so set entity_addr_t manually
+ entity_addr_t addr;
+ addr.set_type(entity_addr_t::TYPE_ANY);
+ addr.set_family(int(a.in_family()));
+ addr.set_port(record.port);
+ switch (a.in_family()) {
+ case seastar::net::inet_address::family::INET:
+ addr.in4_addr().sin_addr = a;
+ break;
+ case seastar::net::inet_address::family::INET6:
+ addr.in6_addr().sin6_addr = a;
+ break;
+ }
+ _add_ambiguous_addr(record.target,
+ addr,
+ record.priority,
+ record.weight,
+ false);
+ });
+ });
+ }).handle_exception_type([](const std::system_error& e) {
+ // ignore DNS failures
+ return seastar::make_ready_future<>();
seastar::make_ready_future<>(); + }); +} + +seastar::future<> MonMap::build_monmap(const crimson::common::ConfigProxy& conf, + bool for_mkfs) +{ + // -m foo? + if (const auto mon_host = conf.get_val<std::string>("mon_host"); + !mon_host.empty()) { + if (auto ret = init_with_ips(mon_host, for_mkfs, "noname-"); ret == 0) { + return make_ready_future<>(); + } + // TODO: resolve_addrs() is a blocking call + if (auto ret = init_with_hosts(mon_host, for_mkfs, "noname-"); ret == 0) { + return make_ready_future<>(); + } else { + throw std::runtime_error(cpp_strerror(ret)); + } + } + + // What monitors are in the config file? + ostringstream errout; + if (auto ret = init_with_config_file(conf, errout); ret < 0) { + throw std::runtime_error(errout.str()); + } + if (size() > 0) { + return make_ready_future<>(); + } + // no info found from conf options; let's try using DNS SRV records + const string srv_name = conf.get_val<std::string>("mon_dns_srv_name"); + return init_with_dns_srv(for_mkfs, srv_name).then([this] { + if (size() == 0) { + throw std::runtime_error("no monitors specified to connect to."); + } + }); +} + +seastar::future<> MonMap::build_initial(const crimson::common::ConfigProxy& conf, bool for_mkfs) +{ + // file? + if (const auto monmap = conf.get_val<std::string>("monmap"); + !monmap.empty()) { + return read_monmap(monmap); + } else { + // fsid from conf? + if (const auto new_fsid = conf.get_val<uuid_d>("fsid"); + !new_fsid.is_zero()) { + fsid = new_fsid; + } + return build_monmap(conf, for_mkfs).then([this] { + created = ceph_clock_now(); + last_changed = created; + calc_legacy_ranks(); + }); + } +} + +#else // WITH_SEASTAR + +int MonMap::init_with_monmap(const std::string& monmap, std::ostream& errout) +{ + int r; + try { + r = read(monmap.c_str()); + } catch (ceph::buffer::error&) { + r = -EINVAL; + } + if (r >= 0) + return 0; + errout << "unable to read/decode monmap from " << monmap + << ": " << cpp_strerror(-r) << std::endl; + return r; +} + +int MonMap::init_with_dns_srv(CephContext* cct, + std::string srv_name, + bool for_mkfs, + std::ostream& errout) +{ + string domain; + // check if a domain is also provided and extract it from srv_name + size_t idx = srv_name.find("_"); + if (idx != string::npos) { + domain = srv_name.substr(idx + 1); + srv_name = srv_name.substr(0, idx); + } + + map<string, DNSResolver::Record> records; + if (DNSResolver::get_instance()->resolve_srv_hosts(cct, srv_name, + DNSResolver::SRV_Protocol::TCP, domain, &records) != 0) { + + errout << "unable to get monitor info from DNS SRV with service name: " + << srv_name << std::endl; + return -1; + } else { + for (auto& record : records) { + record.second.addr.set_type(entity_addr_t::TYPE_ANY); + _add_ambiguous_addr(record.first, + record.second.addr, + record.second.priority, + record.second.weight, + false); + } + return 0; + } +} + +int MonMap::build_initial(CephContext *cct, bool for_mkfs, ostream& errout) +{ + const auto& conf = cct->_conf; + + // mon_host_override? + auto mon_host_override = conf.get_val<std::string>("mon_host_override"); + if (!mon_host_override.empty()) { + lgeneric_dout(cct, 1) << "Using mon_host_override " << mon_host_override << dendl; + auto ret = init_with_ips(mon_host_override, for_mkfs, "noname-"); + if (ret == -EINVAL) { + ret = init_with_hosts(mon_host_override, for_mkfs, "noname-"); + } + if (ret < 0) { + errout << "unable to parse addrs in '" << mon_host_override << "'" + << std::endl; + } + return ret; + } + + // cct?
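+ // (the sources below are tried in order: mon_host_override, addrs cached + // on the CephContext, an explicit monmap file, mon_host, [mon.*] config + // sections, and finally DNS SRV records; as a hedged illustration, a + // hypothetical mon_host = "[v2:10.0.0.1:3300,v1:10.0.0.1:6789],10.0.0.2" + // would be parsed by init_with_ips() below into addrvecs that get + // "noname-" placeholder mon names)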
+ auto addrs = cct->get_mon_addrs(); + if (addrs != nullptr && (addrs->size() > 0)) { + init_with_addrs(*addrs, for_mkfs, "noname-"); + return 0; + } + + // file? + if (const auto monmap = conf.get_val<std::string>("monmap"); + !monmap.empty()) { + return init_with_monmap(monmap, errout); + } + + // fsid from conf? + if (const auto new_fsid = conf.get_val<uuid_d>("fsid"); + !new_fsid.is_zero()) { + fsid = new_fsid; + } + // -m foo? + if (const auto mon_host = conf.get_val<std::string>("mon_host"); + !mon_host.empty()) { + auto ret = init_with_ips(mon_host, for_mkfs, "noname-"); + if (ret == -EINVAL) { + ret = init_with_hosts(mon_host, for_mkfs, "noname-"); + } + if (ret < 0) { + errout << "unable to parse addrs in '" << mon_host << "'" + << std::endl; + return ret; + } + } + if (size() == 0) { + // What monitors are in the config file? + if (auto ret = init_with_config_file(conf, errout); ret < 0) { + return ret; + } + } + if (size() == 0) { + // no info found from conf options; let's try using DNS SRV records + string srv_name = conf.get_val<std::string>("mon_dns_srv_name"); + if (auto ret = init_with_dns_srv(cct, srv_name, for_mkfs, errout); ret < 0) { + return -ENOENT; + } + } + if (size() == 0) { + errout << "no monitors specified to connect to." << std::endl; + return -ENOENT; + } + strategy = static_cast<election_strategy>(conf.get_val<uint64_t>("mon_election_default_strategy")); + created = ceph_clock_now(); + last_changed = created; + calc_legacy_ranks(); + return 0; +} +#endif // WITH_SEASTAR diff --git a/src/mon/MonMap.h b/src/mon/MonMap.h new file mode 100644 index 000000000..02304edfd --- /dev/null +++ b/src/mon/MonMap.h @@ -0,0 +1,546 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_MONMAP_H +#define CEPH_MONMAP_H + +#ifdef WITH_SEASTAR +#include <seastar/core/future.hh> +#endif + +#include "common/config_fwd.h" +#include "common/ceph_releases.h" + +#include "include/err.h" +#include "include/types.h" + +#include "mon/mon_types.h" +#include "msg/Message.h" + +class health_check_map_t; + +#ifdef WITH_SEASTAR +namespace crimson::common { + class ConfigProxy; +} +#endif + +namespace ceph { + class Formatter; +} + +struct mon_info_t { + /** + * monitor name + * + * i.e., 'foo' in 'mon.foo' + */ + std::string name; + /** + * monitor's public address(es) + * + * public facing address(es), used to communicate with all clients + * and with other monitors.
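+ * + * note that a single monitor's addrvec may carry several entries, + * e.g. a msgr2 (v2:) address alongside a legacy (v1:) one.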
+ */ + entity_addrvec_t public_addrs; + /** + * the priority of the mon; the lower the value, the more preferred + */ + uint16_t priority{0}; + uint16_t weight{0}; + + /** + * The location of the monitor, in CRUSH hierarchy terms + */ + std::map<std::string,std::string> crush_loc; + + // <REMOVE ME> + mon_info_t(const std::string& n, const entity_addr_t& p_addr, uint16_t p) + : name(n), public_addrs(p_addr), priority(p) + {} + // </REMOVE ME> + + mon_info_t(const std::string& n, const entity_addrvec_t& p_addrs, + uint16_t p, uint16_t w) + : name(n), public_addrs(p_addrs), priority(p), weight(w) + {} + mon_info_t(const std::string &n, const entity_addrvec_t& p_addrs) + : name(n), public_addrs(p_addrs) + { } + + mon_info_t() { } + + + void encode(ceph::buffer::list& bl, uint64_t features) const; + void decode(ceph::buffer::list::const_iterator& p); + void print(std::ostream& out) const; +}; +WRITE_CLASS_ENCODER_FEATURES(mon_info_t) + +inline std::ostream& operator<<(std::ostream& out, const mon_info_t& mon) { + mon.print(out); + return out; +} + +class MonMap { + public: + epoch_t epoch; // what epoch/version of the monmap + uuid_d fsid; + utime_t last_changed; + utime_t created; + + std::map<std::string, mon_info_t> mon_info; + std::map<entity_addr_t, std::string> addr_mons; + + std::vector<std::string> ranks; + /* ranks which were removed when this map took effect. + There should only be one at a time, but leave support + for arbitrary numbers just to be safe. */ + std::set<int> removed_ranks; + + /** + * Persistent Features are all those features that once set on a + * monmap cannot, and should not, be removed. These will define the + * non-negotiable features that a given monitor must support to + * properly operate in a given quorum. + * + * Should be reserved for features that we really want to make sure + * are sticky, and are important enough to tolerate not being able + * to downgrade a monitor. + */ + mon_feature_t persistent_features; + /** + * Optional Features are all those features that can be enabled or + * disabled following given criteria -- e.g., user-mandated via the + * CLI -- and act much like indicators of what the cluster currently + * supports. + * + * They are by no means "optional" in the sense that monitors can + * ignore them; it is just that they are not persistent. + */ + mon_feature_t optional_features; + + /** + * Returns the set of features required by this monmap. + * + * The features required by this monmap are the union of all the + * currently set persistent features and the currently set optional + * features. + * + * @returns the set of features required by this monmap + */ + mon_feature_t get_required_features() const { + return (persistent_features | optional_features); + } + + // upgrade gate + ceph_release_t min_mon_release{ceph_release_t::unknown}; + + void _add_ambiguous_addr(const std::string& name, + entity_addr_t addr, + int priority, + int weight, + bool for_mkfs); + + enum election_strategy { + // Keep in sync with ElectionLogic.h!
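+ // (the strategy can be switched at runtime, e.g. with + // "ceph mon set election_strategy connectivity")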
+ CLASSIC = 1, // the original rank-based one + DISALLOW = 2, // disallow a set from being leader + CONNECTIVITY = 3 // includes DISALLOW, extends to prefer stronger connections + }; + election_strategy strategy = CLASSIC; + std::set<std::string> disallowed_leaders; // can't be leader under CONNECTIVITY/DISALLOW + bool stretch_mode_enabled = false; + string tiebreaker_mon; + set<string> stretch_marked_down_mons; // can't be leader until fully recovered + +public: + void calc_legacy_ranks(); + void calc_addr_mons() { + // populate addr_mons + addr_mons.clear(); + for (auto& p : mon_info) { + for (auto& a : p.second.public_addrs.v) { + addr_mons[a] = p.first; + } + } + } + + MonMap() + : epoch(0) { + } + + uuid_d& get_fsid() { return fsid; } + + unsigned size() const { + return mon_info.size(); + } + + unsigned min_quorum_size(unsigned total_mons=0) const { + if (total_mons == 0) { + total_mons = size(); + } + return total_mons / 2 + 1; + } + + epoch_t get_epoch() const { return epoch; } + void set_epoch(epoch_t e) { epoch = e; } + + /** + * Obtain list of public facing addresses + * + * @param ls list to populate with the monitors' addresses + */ + void list_addrs(std::list<entity_addr_t>& ls) const { + for (auto& i : mon_info) { + for (auto& j : i.second.public_addrs.v) { + ls.push_back(j); + } + } + } + + /** + * Add new monitor to the monmap + * + * @param m monitor info of the new monitor + */ + void add(const mon_info_t& m) { + ceph_assert(mon_info.count(m.name) == 0); + for (auto& a : m.public_addrs.v) { + ceph_assert(addr_mons.count(a) == 0); + } + mon_info[m.name] = m; + if (get_required_features().contains_all( + ceph::features::mon::FEATURE_NAUTILUS)) { + ranks.push_back(m.name); + ceph_assert(ranks.size() == mon_info.size()); + } else { + calc_legacy_ranks(); + } + calc_addr_mons(); + } + + /** + * Add new monitor to the monmap + * + * @param name Monitor name (i.e., 'foo' in 'mon.foo') + * @param addrv Monitor's public address vector + */ + void add(const std::string &name, const entity_addrvec_t &addrv, + uint16_t priority=0, uint16_t weight=0) { + add(mon_info_t(name, addrv, priority, weight)); + } + + /** + * Remove monitor from the monmap + * + * @param name Monitor name (i.e., 'foo' in 'mon.foo') + */ + void remove(const std::string &name) { + // this must match what we do in ConnectionTracker::notify_rank_removed + ceph_assert(mon_info.count(name)); + int rank = get_rank(name); + mon_info.erase(name); + disallowed_leaders.erase(name); + ceph_assert(mon_info.count(name) == 0); + if (rank >= 0 ) { + removed_ranks.insert(rank); + } + if (get_required_features().contains_all( + ceph::features::mon::FEATURE_NAUTILUS)) { + ranks.erase(std::find(ranks.begin(), ranks.end(), name)); + ceph_assert(ranks.size() == mon_info.size()); + } else { + calc_legacy_ranks(); + } + calc_addr_mons(); + } + + /** + * Rename monitor from @p oldname to @p newname + * + * @param oldname monitor's current name (i.e., 'foo' in 'mon.foo') + * @param newname monitor's new name (i.e., 'bar' in 'mon.bar') + */ + void rename(std::string oldname, std::string newname) { + ceph_assert(contains(oldname)); + ceph_assert(!contains(newname)); + mon_info[newname] = mon_info[oldname]; + mon_info.erase(oldname); + mon_info[newname].name = newname; + if (get_required_features().contains_all( + ceph::features::mon::FEATURE_NAUTILUS)) { + *std::find(ranks.begin(), ranks.end(), oldname) = newname; + ceph_assert(ranks.size() == mon_info.size()); + } else { + calc_legacy_ranks(); + } + calc_addr_mons(); + } + + int set_rank(const
std::string& name, int rank) { + int oldrank = get_rank(name); + if (oldrank < 0) { + return -ENOENT; + } + if (rank < 0 || rank >= (int)ranks.size()) { + return -EINVAL; + } + if (oldrank != rank) { + ranks.erase(ranks.begin() + oldrank); + ranks.insert(ranks.begin() + rank, name); + } + return 0; + } + + bool contains(const std::string& name) const { + return mon_info.count(name); + } + + /** + * Check if monmap contains a monitor with address @p a + * + * @note checks for all addresses a monitor may have, public or otherwise. + * + * @param a monitor address + * @returns true if monmap contains a monitor with address @p a; + * false otherwise. + */ + bool contains(const entity_addr_t &a, std::string *name=nullptr) const { + for (auto& i : mon_info) { + for (auto& j : i.second.public_addrs.v) { + if (j == a) { + if (name) { + *name = i.first; + } + return true; + } + } + } + return false; + } + bool contains(const entity_addrvec_t &av, std::string *name=nullptr) const { + for (auto& i : mon_info) { + for (auto& j : i.second.public_addrs.v) { + for (auto& k : av.v) { + if (j == k) { + if (name) { + *name = i.first; + } + return true; + } + } + } + } + return false; + } + + std::string get_name(unsigned n) const { + ceph_assert(n < ranks.size()); + return ranks[n]; + } + std::string get_name(const entity_addr_t& a) const { + std::map<entity_addr_t, std::string>::const_iterator p = addr_mons.find(a); + if (p == addr_mons.end()) + return std::string(); + else + return p->second; + } + std::string get_name(const entity_addrvec_t& av) const { + for (auto& i : av.v) { + std::map<entity_addr_t, std::string>::const_iterator p = addr_mons.find(i); + if (p != addr_mons.end()) + return p->second; + } + return std::string(); + } + + int get_rank(const std::string& n) const { + if (auto found = std::find(ranks.begin(), ranks.end(), n); + found != ranks.end()) { + return std::distance(ranks.begin(), found); + } else { + return -1; + } + } + int get_rank(const entity_addr_t& a) const { + std::string n = get_name(a); + if (!n.empty()) { + return get_rank(n); + } + return -1; + } + int get_rank(const entity_addrvec_t& av) const { + std::string n = get_name(av); + if (!n.empty()) { + return get_rank(n); + } + return -1; + } + bool get_addr_name(const entity_addr_t& a, std::string& name) { + if (addr_mons.count(a) == 0) + return false; + name = addr_mons[a]; + return true; + } + + const entity_addrvec_t& get_addrs(const std::string& n) const { + ceph_assert(mon_info.count(n)); + std::map<std::string,mon_info_t>::const_iterator p = mon_info.find(n); + return p->second.public_addrs; + } + const entity_addrvec_t& get_addrs(unsigned m) const { + ceph_assert(m < ranks.size()); + return get_addrs(ranks[m]); + } + void set_addrvec(const std::string& n, const entity_addrvec_t& a) { + ceph_assert(mon_info.count(n)); + mon_info[n].public_addrs = a; + calc_addr_mons(); + } + uint16_t get_priority(const std::string& n) const { + auto it = mon_info.find(n); + ceph_assert(it != mon_info.end()); + return it->second.priority; + } + uint16_t get_weight(const std::string& n) const { + auto it = mon_info.find(n); + ceph_assert(it != mon_info.end()); + return it->second.weight; + } + void set_weight(const std::string& n, uint16_t v) { + auto it = mon_info.find(n); + ceph_assert(it != mon_info.end()); + it->second.weight = v; + } + + void encode(ceph::buffer::list& blist, uint64_t con_features) const; + void decode(ceph::buffer::list& blist) { + auto p = std::cbegin(blist); + decode(p); + } + void
decode(ceph::buffer::list::const_iterator& p); + + void generate_fsid() { + fsid.generate_random(); + } + + // read from/write to a file + int write(const char *fn); + int read(const char *fn); + + /** + * build an initial bootstrap monmap from conf + * + * Build an initial bootstrap monmap from the config. This will + * try, in this order: + * + * 1 monmap -- an explicitly provided monmap + * 2 mon_host -- list of monitors + * 3 config [mon.*] sections, and 'mon addr' fields in those sections + * + * @param cct context (and associated config) + * @param errout std::ostream to send error messages to + */ +#ifdef WITH_SEASTAR + seastar::future<> build_initial(const crimson::common::ConfigProxy& conf, bool for_mkfs); +#else + int build_initial(CephContext *cct, bool for_mkfs, std::ostream& errout); +#endif + /** + * filter monmap given a set of initial members. + * + * Remove mons that aren't in the initial_members list. Add missing + * mons and give them dummy IPs (blank IPv4, with a non-zero + * nonce). If the name matches my_name, then my_addrs will be used in + * place of a dummy addr. + * + * @param initial_members list of initial member names + * @param my_name name of self, can be blank + * @param my_addrs my addrs + * @param removed optional pointer to set to insert removed mon addrs to + */ + void set_initial_members(CephContext *cct, + std::list<std::string>& initial_members, + std::string my_name, + const entity_addrvec_t& my_addrs, + std::set<entity_addrvec_t> *removed); + + void print(std::ostream& out) const; + void print_summary(std::ostream& out) const; + void dump(ceph::Formatter *f) const; + void dump_summary(ceph::Formatter *f) const; + + void check_health(health_check_map_t *checks) const; + + static void generate_test_instances(std::list<MonMap*>& o); +protected: + /** + * build a monmap from a list of entity_addrvec_t's + * + * Give mons dummy names. + * + * @param addrs list of entity_addrvec_t's + * @param prefix prefix to prepend to generated mon names + */ + void init_with_addrs(const std::vector<entity_addrvec_t>& addrs, + bool for_mkfs, + std::string_view prefix); + /** + * build a monmap from a list of ips + * + * Give mons dummy names. + * + * @param ips list of IPs, space- or comma-separated + * @param prefix prefix to prepend to generated mon names + * @return 0 for success, -errno on error + */ + int init_with_ips(const std::string& ips, + bool for_mkfs, + std::string_view prefix); + /** + * build a monmap from a list of hostnames + * + * Give mons dummy names.
+ * + * @param hostlist list of hostnames, space- or comma-separated + * @param prefix prefix to prepend to generated mon names + * @return 0 for success, -errno on error + */ + int init_with_hosts(const std::string& hostlist, + bool for_mkfs, + std::string_view prefix); + int init_with_config_file(const ConfigProxy& conf, std::ostream& errout); +#if WITH_SEASTAR + seastar::future<> read_monmap(const std::string& monmap); + /// try to build monmap with different settings, like + /// mon_host, mon* sections, and mon_dns_srv_name + seastar::future<> build_monmap(const crimson::common::ConfigProxy& conf, bool for_mkfs); + /// initialize monmap by resolving given service name + seastar::future<> init_with_dns_srv(bool for_mkfs, const std::string& name); +#else + /// read from encoded monmap file + int init_with_monmap(const std::string& monmap, std::ostream& errout); + int init_with_dns_srv(CephContext* cct, std::string srv_name, bool for_mkfs, + std::ostream& errout); +#endif +}; +WRITE_CLASS_ENCODER_FEATURES(MonMap) + +inline std::ostream& operator<<(std::ostream &out, const MonMap &m) { + m.print_summary(out); + return out; +} + +#endif diff --git a/src/mon/MonOpRequest.h b/src/mon/MonOpRequest.h new file mode 100644 index 000000000..73275e81e --- /dev/null +++ b/src/mon/MonOpRequest.h @@ -0,0 +1,238 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2015 Red Hat <contact@redhat.com> + * Copyright (C) 2015 SUSE LINUX GmbH + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + */ + +#ifndef MON_OPREQUEST_H_ +#define MON_OPREQUEST_H_ +#include <iosfwd> +#include <stdint.h> + +#include "common/TrackedOp.h" +#include "mon/Session.h" +#include "msg/Message.h" + +struct MonOpRequest : public TrackedOp { + friend class OpTracker; + + void mark_dispatch() { + mark_event("monitor_dispatch"); + } + void mark_wait_for_quorum() { + mark_event("wait_for_quorum"); + } + void mark_zap() { + mark_event("monitor_zap"); + } + void mark_forwarded() { + mark_event("forwarded"); + forwarded_to_leader = true; + } + + void mark_svc_event(const std::string &service, const std::string &event) { + std::string s = service; + s.append(":").append(event); + mark_event(s); + } + + void mark_logmon_event(const std::string &event) { + mark_svc_event("logm", event); + } + void mark_osdmon_event(const std::string &event) { + mark_svc_event("osdmap", event); + } + void mark_pgmon_event(const std::string &event) { + mark_svc_event("pgmap", event); + } + void mark_mdsmon_event(const std::string &event) { + mark_svc_event("mdsmap", event); + } + void mark_authmon_event(const std::string &event) { + mark_svc_event("auth", event); + } + void mark_paxos_event(const std::string &event) { + mark_svc_event("paxos", event); + } + + + enum op_type_t { + OP_TYPE_NONE = 0, ///< no type defined (default) + OP_TYPE_SERVICE, ///< belongs to a Paxos Service or similar + OP_TYPE_MONITOR, ///< belongs to the Monitor class + OP_TYPE_ELECTION, ///< belongs to the Elector class + OP_TYPE_PAXOS, ///< refers to Paxos messages + OP_TYPE_COMMAND, ///< is a command + }; + + MonOpRequest(const MonOpRequest &other) = delete; + MonOpRequest & operator = (const MonOpRequest &other) = delete; + +private: + Message *request; + utime_t dequeued_time; + RefCountedPtr session; +
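// connection the request arrived on, if any; the session ref above + // is pulled from it in the constructor below +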
ConnectionRef con; + bool forwarded_to_leader; + op_type_t op_type; + + MonOpRequest(Message *req, OpTracker *tracker) : + TrackedOp(tracker, + req->get_recv_stamp().is_zero() ? + ceph_clock_now() : req->get_recv_stamp()), + request(req), + con(NULL), + forwarded_to_leader(false), + op_type(OP_TYPE_NONE) + { + if (req) { + con = req->get_connection(); + if (con) { + session = con->get_priv(); + } + } + } + + void _dump(ceph::Formatter *f) const override { + { + f->open_array_section("events"); + std::lock_guard l(lock); + for (auto i = events.begin(); i != events.end(); ++i) { + f->open_object_section("event"); + f->dump_string("event", i->str); + f->dump_stream("time") << i->stamp; + + auto i_next = i + 1; + + if (i_next < events.end()) { + f->dump_float("duration", i_next->stamp - i->stamp); + } else { + f->dump_float("duration", events.rbegin()->stamp - get_initiated()); + } + + f->close_section(); + } + f->close_section(); + f->open_object_section("info"); + f->dump_int("seq", seq); + f->dump_bool("src_is_mon", is_src_mon()); + f->dump_stream("source") << request->get_source_inst(); + f->dump_bool("forwarded_to_leader", forwarded_to_leader); + f->close_section(); + } + } + +protected: + void _dump_op_descriptor_unlocked(std::ostream& stream) const override { + get_req()->print(stream); + } + +public: + ~MonOpRequest() override { + request->put(); + } + + MonSession *get_session() const { + return static_cast<MonSession*>(session.get()); + } + + template<class T> + T *get_req() const { return static_cast<T*>(request); } + + Message *get_req() const { return get_req<Message>(); } + + int get_req_type() const { + if (!request) + return 0; + return request->get_type(); + } + + ConnectionRef get_connection() { return con; } + + void set_session(MonSession *s) { + session.reset(s); + } + + bool is_src_mon() const { + return (con && con->get_peer_type() & CEPH_ENTITY_TYPE_MON); + } + + typedef boost::intrusive_ptr<MonOpRequest> Ref; + + void set_op_type(op_type_t t) { + op_type = t; + } + void set_type_service() { + set_op_type(OP_TYPE_SERVICE); + } + void set_type_monitor() { + set_op_type(OP_TYPE_MONITOR); + } + void set_type_paxos() { + set_op_type(OP_TYPE_PAXOS); + } + void set_type_election_or_ping() { + set_op_type(OP_TYPE_ELECTION); + } + void set_type_command() { + set_op_type(OP_TYPE_COMMAND); + } + + op_type_t get_op_type() { + return op_type; + } + + bool is_type_service() { + return (get_op_type() == OP_TYPE_SERVICE); + } + bool is_type_monitor() { + return (get_op_type() == OP_TYPE_MONITOR); + } + bool is_type_paxos() { + return (get_op_type() == OP_TYPE_PAXOS); + } + bool is_type_election_or_ping() { + return (get_op_type() == OP_TYPE_ELECTION); + } + bool is_type_command() { + return (get_op_type() == OP_TYPE_COMMAND); + } +}; + +typedef MonOpRequest::Ref MonOpRequestRef; + +struct C_MonOp : public Context +{ + MonOpRequestRef op; + + explicit C_MonOp(MonOpRequestRef o) : + op(o) { } + + void finish(int r) override { + if (op && r == -ECANCELED) { + op->mark_event("callback canceled"); + } else if (op && r == -EAGAIN) { + op->mark_event("callback retry"); + } else if (op && r == 0) { + op->mark_event("callback finished"); + } + _finish(r); + } + + void mark_op_event(const std::string &event) { + if (op) + op->mark_event(event); + } + + virtual void _finish(int r) = 0; +}; + +#endif /* MON_OPREQUEST_H_ */ diff --git a/src/mon/MonSub.cc b/src/mon/MonSub.cc new file mode 100644 index 000000000..a2c60ba91 --- /dev/null +++ b/src/mon/MonSub.cc @@ -0,0 +1,114 @@ +// -*- mode:C++; 
tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "MonSub.h" + +bool MonSub::have_new() const { + return !sub_new.empty(); +} + +bool MonSub::need_renew() const +{ + return ceph::coarse_mono_clock::now() > renew_after; +} + +void MonSub::renewed() +{ + if (clock::is_zero(renew_sent)) { + renew_sent = clock::now(); + } + // update sub_sent with sub_new + sub_new.insert(sub_sent.begin(), sub_sent.end()); + std::swap(sub_new, sub_sent); + sub_new.clear(); +} + +void MonSub::acked(uint32_t interval) +{ + if (!clock::is_zero(renew_sent)) { + // NOTE: this is only needed for legacy (infernalis or older) + // mons; see MonClient::tick(). + renew_after = renew_sent; + renew_after += ceph::make_timespan(interval / 2.0); + renew_sent = clock::zero(); + } +} + +bool MonSub::reload() +{ + for (auto& [what, sub] : sub_sent) { + if (sub_new.count(what) == 0) { + sub_new[what] = sub; + } + } + return have_new(); +} + +void MonSub::got(const std::string& what, version_t have) +{ + if (auto i = sub_new.find(what); i != sub_new.end()) { + auto& sub = i->second; + if (sub.start <= have) { + if (sub.flags & CEPH_SUBSCRIBE_ONETIME) { + sub_new.erase(i); + } else { + sub.start = have + 1; + } + } + } else if (auto i = sub_sent.find(what); i != sub_sent.end()) { + auto& sub = i->second; + if (sub.start <= have) { + if (sub.flags & CEPH_SUBSCRIBE_ONETIME) { + sub_sent.erase(i); + } else { + sub.start = have + 1; + } + } + } +} + +bool MonSub::want(const std::string& what, version_t start, unsigned flags) +{ + if (auto sub = sub_new.find(what); + sub != sub_new.end() && + sub->second.start == start && + sub->second.flags == flags) { + return false; + } else if (auto sub = sub_sent.find(what); + sub != sub_sent.end() && + sub->second.start == start && + sub->second.flags == flags) { + return false; + } else { + sub_new[what].start = start; + sub_new[what].flags = flags; + return true; + } +} + +bool MonSub::inc_want(const std::string& what, version_t start, unsigned flags) +{ + if (auto sub = sub_new.find(what); sub != sub_new.end()) { + if (sub->second.start >= start) { + return false; + } else { + sub->second.start = start; + sub->second.flags = flags; + return true; + } + } else if (auto sub = sub_sent.find(what); + sub == sub_sent.end() || sub->second.start < start) { + auto& item = sub_new[what]; + item.start = start; + item.flags = flags; + return true; + } else { + return false; + } +} + +void MonSub::unwant(const std::string& what) +{ + sub_sent.erase(what); + sub_new.erase(what); +} diff --git a/src/mon/MonSub.h b/src/mon/MonSub.h new file mode 100644 index 000000000..8ff5a8f18 --- /dev/null +++ b/src/mon/MonSub.h @@ -0,0 +1,46 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <map> +#include <string> + +#include "common/ceph_time.h" +#include "include/types.h" + +// mon subscriptions +class MonSub +{ +public: + // @returns true if there are any "new" subscriptions + bool have_new() const; + auto get_subs() const { + return sub_new; + } + bool need_renew() const; + // change the status of "new" subscriptions to "sent" + void renewed(); + // the peer acked the subscription request + void acked(uint32_t interval); + void got(const std::string& what, version_t version); + // revert the status of subscriptions from "sent" to "new" + // @returns true if there are any pending "new" subscriptions + bool reload(); + // add a new subscription + bool want(const std::string& what, version_t
start, unsigned flags); + // increment the requested subscription start point. If you do increase + // the value, apply the passed-in flags as well; otherwise do nothing. + bool inc_want(const std::string& what, version_t start, unsigned flags); + // cancel a subscription + void unwant(const std::string& what); +private: + // my subs, and current versions + std::map<std::string,ceph_mon_subscribe_item> sub_sent; + // unsent new subs + std::map<std::string,ceph_mon_subscribe_item> sub_new; + using time_point = ceph::coarse_mono_time; + using clock = typename time_point::clock; + time_point renew_sent; + time_point renew_after; +}; diff --git a/src/mon/Monitor.cc b/src/mon/Monitor.cc new file mode 100644 index 000000000..ce7ec37d9 --- /dev/null +++ b/src/mon/Monitor.cc @@ -0,0 +1,6887 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#include <iterator> +#include <sstream> +#include <tuple> +#include <stdlib.h> +#include <signal.h> +#include <limits.h> +#include <cstring> +#include <boost/scope_exit.hpp> +#include <boost/algorithm/string/predicate.hpp> + +#include "json_spirit/json_spirit_reader.h" +#include "json_spirit/json_spirit_writer.h" + +#include "Monitor.h" +#include "common/version.h" +#include "common/blkdev.h" +#include "common/cmdparse.h" +#include "common/signal.h" + +#include "osd/OSDMap.h" + +#include "MonitorDBStore.h" + +#include "messages/PaxosServiceMessage.h" +#include "messages/MMonMap.h" +#include "messages/MMonGetMap.h" +#include "messages/MMonGetVersion.h" +#include "messages/MMonGetVersionReply.h" +#include "messages/MGenericMessage.h" +#include "messages/MMonCommand.h" +#include "messages/MMonCommandAck.h" +#include "messages/MMonSync.h" +#include "messages/MMonScrub.h" +#include "messages/MMonProbe.h" +#include "messages/MMonJoin.h" +#include "messages/MMonPaxos.h" +#include "messages/MRoute.h" +#include "messages/MForward.h" + +#include "messages/MMonSubscribe.h" +#include "messages/MMonSubscribeAck.h" + +#include "messages/MCommand.h" +#include "messages/MCommandReply.h" + +#include "messages/MAuthReply.h" + +#include "messages/MTimeCheck2.h" +#include "messages/MPing.h" + +#include "common/strtol.h" +#include "common/ceph_argparse.h" +#include "common/Timer.h" +#include "common/Clock.h" +#include "common/errno.h" +#include "common/perf_counters.h" +#include "common/admin_socket.h" +#include "global/signal_handler.h" +#include "common/Formatter.h" +#include "include/stringify.h" +#include "include/color.h" +#include "include/ceph_fs.h" +#include "include/str_list.h" + +#include "OSDMonitor.h" +#include "MDSMonitor.h" +#include "MonmapMonitor.h" +#include "LogMonitor.h" +#include "AuthMonitor.h" +#include "MgrMonitor.h" +#include "MgrStatMonitor.h" +#include "ConfigMonitor.h" +#include "KVMonitor.h" +#include "mon/HealthMonitor.h" +#include "common/config.h" +#include "common/cmdparse.h" +#include "include/ceph_assert.h" +#include "include/compat.h" +#include "perfglue/heap_profiler.h" + +#include "auth/none/AuthNoneClientHandler.h" + +#define dout_subsys ceph_subsys_mon +#undef dout_prefix +#define dout_prefix _prefix(_dout, this) +using namespace TOPNSPC::common; + +using 
std::cout; +using std::dec; +using std::hex; +using std::list; +using std::map; +using std::make_pair; +using std::ostream; +using std::ostringstream; +using std::pair; +using std::set; +using std::setfill; +using std::string; +using std::stringstream; +using std::to_string; +using std::vector; +using std::unique_ptr; + +using ceph::bufferlist; +using ceph::decode; +using ceph::encode; +using ceph::ErasureCodeInterfaceRef; +using ceph::ErasureCodeProfile; +using ceph::Formatter; +using ceph::JSONFormatter; +using ceph::make_message; +using ceph::mono_clock; +using ceph::mono_time; +using ceph::timespan_str; + + +static ostream& _prefix(std::ostream *_dout, const Monitor *mon) { + return *_dout << "mon." << mon->name << "@" << mon->rank + << "(" << mon->get_state_name() << ") e" << mon->monmap->get_epoch() << " "; +} + +const string Monitor::MONITOR_NAME = "monitor"; +const string Monitor::MONITOR_STORE_PREFIX = "monitor_store"; + + +#undef FLAG +#undef COMMAND +#undef COMMAND_WITH_FLAG +#define FLAG(f) (MonCommand::FLAG_##f) +#define COMMAND(parsesig, helptext, modulename, req_perms) \ + {parsesig, helptext, modulename, req_perms, FLAG(NONE)}, +#define COMMAND_WITH_FLAG(parsesig, helptext, modulename, req_perms, flags) \ + {parsesig, helptext, modulename, req_perms, flags}, +MonCommand mon_commands[] = { +#include <mon/MonCommands.h> +}; +#undef COMMAND +#undef COMMAND_WITH_FLAG + +Monitor::Monitor(CephContext* cct_, string nm, MonitorDBStore *s, + Messenger *m, Messenger *mgr_m, MonMap *map) : + Dispatcher(cct_), + AuthServer(cct_), + name(nm), + rank(-1), + messenger(m), + con_self(m ? m->get_loopback_connection() : NULL), + timer(cct_, lock), + finisher(cct_, "mon_finisher", "fin"), + cpu_tp(cct, "Monitor::cpu_tp", "cpu_tp", g_conf()->mon_cpu_threads), + has_ever_joined(false), + logger(NULL), cluster_logger(NULL), cluster_logger_registered(false), + monmap(map), + log_client(cct_, messenger, monmap, LogClient::FLAG_MON), + key_server(cct, &keyring), + auth_cluster_required(cct, + cct->_conf->auth_supported.empty() ? + cct->_conf->auth_cluster_required : cct->_conf->auth_supported), + auth_service_required(cct, + cct->_conf->auth_supported.empty() ? + cct->_conf->auth_service_required : cct->_conf->auth_supported), + mgr_messenger(mgr_m), + mgr_client(cct_, mgr_m, monmap), + gss_ktfile_client(cct->_conf.get_val<std::string>("gss_ktab_client_file")), + store(s), + + elector(this, map->strategy), + required_features(0), + leader(0), + quorum_con_features(0), + // scrub + scrub_version(0), + scrub_event(NULL), + scrub_timeout_event(NULL), + + // sync state + sync_provider_count(0), + sync_cookie(0), + sync_full(false), + sync_start_version(0), + sync_timeout_event(NULL), + sync_last_committed_floor(0), + + timecheck_round(0), + timecheck_acks(0), + timecheck_rounds_since_clean(0), + timecheck_event(NULL), + + admin_hook(NULL), + routed_request_tid(0), + op_tracker(cct, g_conf().get_val<bool>("mon_enable_op_tracker"), 1) +{ + clog = log_client.create_channel(CLOG_CHANNEL_CLUSTER); + audit_clog = log_client.create_channel(CLOG_CHANNEL_AUDIT); + + update_log_clients(); + + if (!gss_ktfile_client.empty()) { + // Assert we can export environment variable + /* + The default client keytab is used, if it is present and readable, + to automatically obtain initial credentials for GSSAPI client + applications. The principal name of the first entry in the client + keytab is used by default when obtaining initial credentials. + 1. The KRB5_CLIENT_KTNAME environment variable. + 2. 
The default_client_keytab_name profile variable in [libdefaults]. + 3. The hardcoded default, DEFCKTNAME. + */ + const int32_t set_result(setenv("KRB5_CLIENT_KTNAME", + gss_ktfile_client.c_str(), 1)); + ceph_assert(set_result == 0); + } + + op_tracker.set_complaint_and_threshold( + g_conf().get_val<std::chrono::seconds>("mon_op_complaint_time").count(), + g_conf().get_val<int64_t>("mon_op_log_threshold")); + op_tracker.set_history_size_and_duration( + g_conf().get_val<uint64_t>("mon_op_history_size"), + g_conf().get_val<std::chrono::seconds>("mon_op_history_duration").count()); + op_tracker.set_history_slow_op_size_and_threshold( + g_conf().get_val<uint64_t>("mon_op_history_slow_op_size"), + g_conf().get_val<std::chrono::seconds>("mon_op_history_slow_op_threshold").count()); + + paxos = std::make_unique<Paxos>(*this, "paxos"); + + paxos_service[PAXOS_MDSMAP].reset(new MDSMonitor(*this, *paxos, "mdsmap")); + paxos_service[PAXOS_MONMAP].reset(new MonmapMonitor(*this, *paxos, "monmap")); + paxos_service[PAXOS_OSDMAP].reset(new OSDMonitor(cct, *this, *paxos, "osdmap")); + paxos_service[PAXOS_LOG].reset(new LogMonitor(*this, *paxos, "logm")); + paxos_service[PAXOS_AUTH].reset(new AuthMonitor(*this, *paxos, "auth")); + paxos_service[PAXOS_MGR].reset(new MgrMonitor(*this, *paxos, "mgr")); + paxos_service[PAXOS_MGRSTAT].reset(new MgrStatMonitor(*this, *paxos, "mgrstat")); + paxos_service[PAXOS_HEALTH].reset(new HealthMonitor(*this, *paxos, "health")); + paxos_service[PAXOS_CONFIG].reset(new ConfigMonitor(*this, *paxos, "config")); + paxos_service[PAXOS_KV].reset(new KVMonitor(*this, *paxos, "kv")); + + bool r = mon_caps.parse("allow *", NULL); + ceph_assert(r); + + exited_quorum = ceph_clock_now(); + + // prepare local commands + local_mon_commands.resize(std::size(mon_commands)); + for (unsigned i = 0; i < std::size(mon_commands); ++i) { + local_mon_commands[i] = mon_commands[i]; + } + MonCommand::encode_vector(local_mon_commands, local_mon_commands_bl); + + prenautilus_local_mon_commands = local_mon_commands; + for (auto& i : prenautilus_local_mon_commands) { + std::string n = cmddesc_get_prenautilus_compat(i.cmdstring); + if (n != i.cmdstring) { + dout(20) << " pre-nautilus cmd " << i.cmdstring << " -> " << n << dendl; + i.cmdstring = n; + } + } + MonCommand::encode_vector(prenautilus_local_mon_commands, prenautilus_local_mon_commands_bl); + + // assume our commands until we have an election. this only means + // we won't reply with EINVAL before the election; any command that + // actually matters will wait until we have quorum etc and then + // retry (and revalidate). 
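+ // (once an election completes, the winner's command set is shared with + // the quorum and replaces this local copy)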
+ leader_mon_commands = local_mon_commands; +} + +Monitor::~Monitor() +{ + op_tracker.on_shutdown(); + + delete logger; + ceph_assert(session_map.sessions.empty()); +} + + +class AdminHook : public AdminSocketHook { + Monitor *mon; +public: + explicit AdminHook(Monitor *m) : mon(m) {} + int call(std::string_view command, const cmdmap_t& cmdmap, + Formatter *f, + std::ostream& errss, + bufferlist& out) override { + stringstream outss; + int r = mon->do_admin_command(command, cmdmap, f, errss, outss); + out.append(outss); + return r; + } +}; + +int Monitor::do_admin_command( + std::string_view command, + const cmdmap_t& cmdmap, + Formatter *f, + std::ostream& err, + std::ostream& out) +{ + std::lock_guard l(lock); + + int r = 0; + string args; + for (auto p = cmdmap.begin(); + p != cmdmap.end(); ++p) { + if (p->first == "prefix") + continue; + if (!args.empty()) + args += ", "; + args += cmd_vartype_stringify(p->second); + } + args = "[" + args + "]"; + + bool read_only = (command == "mon_status" || + command == "mon metadata" || + command == "quorum_status" || + command == "ops" || + command == "sessions"); + + (read_only ? audit_clog->debug() : audit_clog->info()) + << "from='admin socket' entity='admin socket' " + << "cmd='" << command << "' args=" << args << ": dispatch"; + + if (command == "mon_status") { + get_mon_status(f); + } else if (command == "quorum_status") { + _quorum_status(f, out); + } else if (command == "sync_force") { + string validate; + if ((!cmd_getval(cmdmap, "validate", validate)) || + (validate != "--yes-i-really-mean-it")) { + err << "are you SURE? this will mean the monitor store will be erased " + "the next time the monitor is restarted. pass " + "'--yes-i-really-mean-it' if you really do."; + r = -EPERM; + goto abort; + } + sync_force(f); + } else if (command.compare(0, 23, "add_bootstrap_peer_hint") == 0 || + command.compare(0, 24, "add_bootstrap_peer_hintv") == 0) { + if (!_add_bootstrap_peer_hint(command, cmdmap, out)) + goto abort; + } else if (command == "quorum enter") { + elector.start_participating(); + start_election(); + out << "started responding to quorum, initiated new election"; + } else if (command == "quorum exit") { + start_election(); + elector.stop_participating(); + out << "stopped responding to quorum, initiated new election"; + } else if (command == "ops") { + (void)op_tracker.dump_ops_in_flight(f); + } else if (command == "sessions") { + f->open_array_section("sessions"); + for (auto p : session_map.sessions) { + f->dump_object("session", *p); + } + f->close_section(); + } else if (command == "dump_historic_ops") { + if (!op_tracker.dump_historic_ops(f)) { + err << "op_tracker tracking is not enabled now, so no ops are tracked currently, even those that get stuck. \ + please enable \"mon_enable_op_tracker\", and the tracker will start to track new ops received afterwards."; + } + } else if (command == "dump_historic_ops_by_duration" ) { + if (!op_tracker.dump_historic_ops(f, true)) { + err << "op_tracker tracking is not enabled now, so no ops are tracked currently, even those that get stuck. \ + please enable \"mon_enable_op_tracker\", and the tracker will start to track new ops received afterwards."; + } + } else if (command == "dump_historic_slow_ops") { + if (!op_tracker.dump_historic_slow_ops(f, {})) { + err << "op_tracker tracking is not enabled now, so no ops are tracked currently, even those that get stuck.
\ + please enable \"mon_enable_op_tracker\", and the tracker will start to track new ops received afterwards."; + } + } else if (command == "quorum") { + string quorumcmd; + cmd_getval(cmdmap, "quorumcmd", quorumcmd); + if (quorumcmd == "exit") { + start_election(); + elector.stop_participating(); + out << "stopped responding to quorum, initiated new election" << std::endl; + } else if (quorumcmd == "enter") { + elector.start_participating(); + start_election(); + out << "started responding to quorum, initiated new election" << std::endl; + } else { + err << "needs a valid 'quorum' command" << std::endl; + } + } else if (command == "connection scores dump") { + if (!get_quorum_mon_features().contains_all( + ceph::features::mon::FEATURE_PINGING)) { + err << "Not all monitors support changing election strategies; \ + please upgrade them first!"; + } + elector.dump_connection_scores(f); + } else if (command == "connection scores reset") { + if (!get_quorum_mon_features().contains_all( + ceph::features::mon::FEATURE_PINGING)) { + err << "Not all monitors support changing election strategies; \ + please upgrade them first!"; + } + elector.notify_clear_peer_state(); + } else if (command == "smart") { + string want_devid; + cmd_getval(cmdmap, "devid", want_devid); + + string devname = store->get_devname(); + if (devname.empty()) { + err << "could not determine device name for " << store->get_path(); + r = -ENOENT; + goto abort; + } + set<string> devnames; + get_raw_devices(devname, &devnames); + json_spirit::mObject json_map; + uint64_t smart_timeout = cct->_conf.get_val<uint64_t>( + "mon_smart_report_timeout"); + for (auto& devname : devnames) { + string err; + string devid = get_device_id(devname, &err); + if (devid.empty()) { + derr << "get_device_id failed on " << devname << ": " << err << dendl; + continue; + } + if (want_devid.size() && want_devid != devid) { + continue; + } + json_spirit::mValue smart_json; + if (block_device_get_metrics(devname, smart_timeout, + &smart_json)) { + dout(10) << "block_device_get_metrics failed for /dev/" << devname + << dendl; + continue; + } + json_map[devid] = smart_json; + } + json_spirit::write(json_map, out, json_spirit::pretty_print); + } else if (command == "heap") { + if (!ceph_using_tcmalloc()) { + err << "could not issue heap profiler command -- not using tcmalloc!"; + r = -EOPNOTSUPP; + goto abort; + } + string cmd; + if (!cmd_getval(cmdmap, "heapcmd", cmd)) { + err << "unable to get value for command \"" << cmd << "\""; + r = -EINVAL; + goto abort; + } + std::vector<std::string> cmd_vec; + get_str_vec(cmd, cmd_vec); + string val; + if (cmd_getval(cmdmap, "value", val)) { + cmd_vec.push_back(val); + } + ceph_heap_profiler_handle_command(cmd_vec, out); + } else if (command == "compact") { + dout(1) << "triggering manual compaction" << dendl; + auto start = ceph::coarse_mono_clock::now(); + store->compact_async(); + auto end = ceph::coarse_mono_clock::now(); + auto duration = ceph::to_seconds<double>(end - start); + dout(1) << "finished manual compaction in " + << duration << " seconds" << dendl; + out << "compacted " << g_conf().get_val<std::string>("mon_keyvaluedb") + << " in " << duration << " seconds"; + } else { + ceph_abort_msg("bad AdminSocket command binding"); + } + (read_only ?
audit_clog->debug() : audit_clog->info()) + << "from='admin socket' " + << "entity='admin socket' " + << "cmd=" << command << " " + << "args=" << args << ": aborted"; + return r; +} + +void Monitor::handle_signal(int signum) +{ + ceph_assert(signum == SIGINT || signum == SIGTERM); + derr << "*** Got Signal " << sig_str(signum) << " ***" << dendl; + shutdown(); +} + +CompatSet Monitor::get_initial_supported_features() +{ + CompatSet::FeatureSet ceph_mon_feature_compat; + CompatSet::FeatureSet ceph_mon_feature_ro_compat; + CompatSet::FeatureSet ceph_mon_feature_incompat; + ceph_mon_feature_incompat.insert(CEPH_MON_FEATURE_INCOMPAT_BASE); + ceph_mon_feature_incompat.insert(CEPH_MON_FEATURE_INCOMPAT_SINGLE_PAXOS); + return CompatSet(ceph_mon_feature_compat, ceph_mon_feature_ro_compat, + ceph_mon_feature_incompat); +} + +CompatSet Monitor::get_supported_features() +{ + CompatSet compat = get_initial_supported_features(); + compat.incompat.insert(CEPH_MON_FEATURE_INCOMPAT_OSD_ERASURE_CODES); + compat.incompat.insert(CEPH_MON_FEATURE_INCOMPAT_OSDMAP_ENC); + compat.incompat.insert(CEPH_MON_FEATURE_INCOMPAT_ERASURE_CODE_PLUGINS_V2); + compat.incompat.insert(CEPH_MON_FEATURE_INCOMPAT_ERASURE_CODE_PLUGINS_V3); + compat.incompat.insert(CEPH_MON_FEATURE_INCOMPAT_KRAKEN); + compat.incompat.insert(CEPH_MON_FEATURE_INCOMPAT_LUMINOUS); + compat.incompat.insert(CEPH_MON_FEATURE_INCOMPAT_MIMIC); + compat.incompat.insert(CEPH_MON_FEATURE_INCOMPAT_NAUTILUS); + compat.incompat.insert(CEPH_MON_FEATURE_INCOMPAT_OCTOPUS); + compat.incompat.insert(CEPH_MON_FEATURE_INCOMPAT_PACIFIC); + return compat; +} + +CompatSet Monitor::get_legacy_features() +{ + CompatSet::FeatureSet ceph_mon_feature_compat; + CompatSet::FeatureSet ceph_mon_feature_ro_compat; + CompatSet::FeatureSet ceph_mon_feature_incompat; + ceph_mon_feature_incompat.insert(CEPH_MON_FEATURE_INCOMPAT_BASE); + return CompatSet(ceph_mon_feature_compat, ceph_mon_feature_ro_compat, + ceph_mon_feature_incompat); +} + +int Monitor::check_features(MonitorDBStore *store) +{ + CompatSet required = get_supported_features(); + CompatSet ondisk; + + read_features_off_disk(store, &ondisk); + + if (!required.writeable(ondisk)) { + CompatSet diff = required.unsupported(ondisk); + generic_derr << "ERROR: on disk data includes unsupported features: " << diff << dendl; + return -EPERM; + } + + return 0; +} + +void Monitor::read_features_off_disk(MonitorDBStore *store, CompatSet *features) +{ + bufferlist featuresbl; + store->get(MONITOR_NAME, COMPAT_SET_LOC, featuresbl); + if (featuresbl.length() == 0) { + generic_dout(0) << "WARNING: mon fs missing feature list.\n" + << "Assuming it is old-style and introducing one." << dendl; + //we only want the baseline ~v.18 features assumed to be on disk. + //If new features are introduced this code needs to disappear or + //be made smarter. 
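+ //(the legacy set we assume here is persisted right below, so later + //boots read an explicit feature list instead of guessing again)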
+ *features = get_legacy_features(); + + features->encode(featuresbl); + auto t(std::make_shared<MonitorDBStore::Transaction>()); + t->put(MONITOR_NAME, COMPAT_SET_LOC, featuresbl); + store->apply_transaction(t); + } else { + auto it = featuresbl.cbegin(); + features->decode(it); + } +} + +void Monitor::read_features() +{ + read_features_off_disk(store, &features); + dout(10) << "features " << features << dendl; + + calc_quorum_requirements(); + dout(10) << "required_features " << required_features << dendl; +} + +void Monitor::write_features(MonitorDBStore::TransactionRef t) +{ + bufferlist bl; + features.encode(bl); + t->put(MONITOR_NAME, COMPAT_SET_LOC, bl); +} + +const char** Monitor::get_tracked_conf_keys() const +{ + static const char* KEYS[] = { + "crushtool", // helpful for testing + "mon_election_timeout", + "mon_lease", + "mon_lease_renew_interval_factor", + "mon_lease_ack_timeout_factor", + "mon_accept_timeout_factor", + // clog & admin clog + "clog_to_monitors", + "clog_to_syslog", + "clog_to_syslog_facility", + "clog_to_syslog_level", + "clog_to_graylog", + "clog_to_graylog_host", + "clog_to_graylog_port", + "host", + "fsid", + // periodic health to clog + "mon_health_to_clog", + "mon_health_to_clog_interval", + "mon_health_to_clog_tick_interval", + // scrub interval + "mon_scrub_interval", + "mon_allow_pool_delete", + // osdmap pruning - observed, not handled. + "mon_osdmap_full_prune_enabled", + "mon_osdmap_full_prune_min", + "mon_osdmap_full_prune_interval", + "mon_osdmap_full_prune_txsize", + // debug options - observed, not handled + "mon_debug_extra_checks", + "mon_debug_block_osdmap_trim", + NULL + }; + return KEYS; +} + +void Monitor::handle_conf_change(const ConfigProxy& conf, + const std::set<std::string> &changed) +{ + sanitize_options(); + + dout(10) << __func__ << " " << changed << dendl; + + if (changed.count("clog_to_monitors") || + changed.count("clog_to_syslog") || + changed.count("clog_to_syslog_level") || + changed.count("clog_to_syslog_facility") || + changed.count("clog_to_graylog") || + changed.count("clog_to_graylog_host") || + changed.count("clog_to_graylog_port") || + changed.count("host") || + changed.count("fsid")) { + update_log_clients(); + } + + if (changed.count("mon_health_to_clog") || + changed.count("mon_health_to_clog_interval") || + changed.count("mon_health_to_clog_tick_interval")) { + finisher.queue(new C_MonContext{this, [this, changed](int) { + std::lock_guard l{lock}; + health_to_clog_update_conf(changed); + }}); + } + + if (changed.count("mon_scrub_interval")) { + auto scrub_interval = + conf.get_val<std::chrono::seconds>("mon_scrub_interval"); + finisher.queue(new C_MonContext{this, [this, scrub_interval](int) { + std::lock_guard l{lock}; + scrub_update_interval(scrub_interval); + }}); + } +} + +void Monitor::update_log_clients() +{ + map<string,string> log_to_monitors; + map<string,string> log_to_syslog; + map<string,string> log_channel; + map<string,string> log_prio; + map<string,string> log_to_graylog; + map<string,string> log_to_graylog_host; + map<string,string> log_to_graylog_port; + uuid_d fsid; + string host; + + if (parse_log_client_options(g_ceph_context, log_to_monitors, log_to_syslog, + log_channel, log_prio, log_to_graylog, + log_to_graylog_host, log_to_graylog_port, + fsid, host)) + return; + + clog->update_config(log_to_monitors, log_to_syslog, + log_channel, log_prio, log_to_graylog, + log_to_graylog_host, log_to_graylog_port, + fsid, host); + + audit_clog->update_config(log_to_monitors, log_to_syslog, + log_channel, 
log_prio, log_to_graylog, + log_to_graylog_host, log_to_graylog_port, + fsid, host); +} + +int Monitor::sanitize_options() +{ + int r = 0; + + // the lease renew interval must be shorter than mon_lease; otherwise + // we risk leases expiring before they are renewed. + if (g_conf()->mon_lease_renew_interval_factor >= 1.0) { + clog->error() << "mon_lease_renew_interval_factor (" + << g_conf()->mon_lease_renew_interval_factor + << ") must be less than 1.0"; + r = -EINVAL; + } + + // mon_lease_ack_timeout must be greater than mon_lease to make sure we've + // got time to renew the lease and get an ack for it. Having both options + // with the same value, for a given small value, could mean timing out if + // the monitors happened to be overloaded -- or even under normal load for + // a small enough value. + if (g_conf()->mon_lease_ack_timeout_factor <= 1.0) { + clog->error() << "mon_lease_ack_timeout_factor (" + << g_conf()->mon_lease_ack_timeout_factor + << ") must be greater than 1.0"; + r = -EINVAL; + } + + return r; +} + +int Monitor::preinit() +{ + std::unique_lock l(lock); + + dout(1) << "preinit fsid " << monmap->fsid << dendl; + + int r = sanitize_options(); + if (r < 0) { + derr << "option sanitization failed!" << dendl; + return r; + } + + ceph_assert(!logger); + { + PerfCountersBuilder pcb(g_ceph_context, "mon", l_mon_first, l_mon_last); + pcb.add_u64(l_mon_num_sessions, "num_sessions", "Open sessions", "sess", + PerfCountersBuilder::PRIO_USEFUL); + pcb.add_u64_counter(l_mon_session_add, "session_add", "Created sessions", + "sadd", PerfCountersBuilder::PRIO_INTERESTING); + pcb.add_u64_counter(l_mon_session_rm, "session_rm", "Removed sessions", + "srm", PerfCountersBuilder::PRIO_INTERESTING); + pcb.add_u64_counter(l_mon_session_trim, "session_trim", "Trimmed sessions", + "strm", PerfCountersBuilder::PRIO_USEFUL); + pcb.add_u64_counter(l_mon_num_elections, "num_elections", "Elections participated in", + "ecnt", PerfCountersBuilder::PRIO_USEFUL); + pcb.add_u64_counter(l_mon_election_call, "election_call", "Elections started", + "estt", PerfCountersBuilder::PRIO_INTERESTING); + pcb.add_u64_counter(l_mon_election_win, "election_win", "Elections won", + "ewon", PerfCountersBuilder::PRIO_INTERESTING); + pcb.add_u64_counter(l_mon_election_lose, "election_lose", "Elections lost", + "elst", PerfCountersBuilder::PRIO_INTERESTING); + logger = pcb.create_perf_counters(); + cct->get_perfcounters_collection()->add(logger); + } + + ceph_assert(!cluster_logger); + { + PerfCountersBuilder pcb(g_ceph_context, "cluster", l_cluster_first, l_cluster_last); + pcb.add_u64(l_cluster_num_mon, "num_mon", "Monitors"); + pcb.add_u64(l_cluster_num_mon_quorum, "num_mon_quorum", "Monitors in quorum"); + pcb.add_u64(l_cluster_num_osd, "num_osd", "OSDs"); + pcb.add_u64(l_cluster_num_osd_up, "num_osd_up", "OSDs that are up"); + pcb.add_u64(l_cluster_num_osd_in, "num_osd_in", "OSDs in state \"in\" (they are in the cluster)"); + pcb.add_u64(l_cluster_osd_epoch, "osd_epoch", "Current epoch of OSD map"); + pcb.add_u64(l_cluster_osd_bytes, "osd_bytes", "Total capacity of cluster", NULL, 0, unit_t(UNIT_BYTES)); + pcb.add_u64(l_cluster_osd_bytes_used, "osd_bytes_used", "Used space", NULL, 0, unit_t(UNIT_BYTES)); + pcb.add_u64(l_cluster_osd_bytes_avail, "osd_bytes_avail", "Available space", NULL, 0, unit_t(UNIT_BYTES)); + pcb.add_u64(l_cluster_num_pool, "num_pool", "Pools"); + pcb.add_u64(l_cluster_num_pg, "num_pg", "Placement groups"); + pcb.add_u64(l_cluster_num_pg_active_clean, "num_pg_active_clean", "Placement groups in
active+clean state"); + pcb.add_u64(l_cluster_num_pg_active, "num_pg_active", "Placement groups in active state"); + pcb.add_u64(l_cluster_num_pg_peering, "num_pg_peering", "Placement groups in peering state"); + pcb.add_u64(l_cluster_num_object, "num_object", "Objects"); + pcb.add_u64(l_cluster_num_object_degraded, "num_object_degraded", "Degraded (missing replicas) objects"); + pcb.add_u64(l_cluster_num_object_misplaced, "num_object_misplaced", "Misplaced (wrong location in the cluster) objects"); + pcb.add_u64(l_cluster_num_object_unfound, "num_object_unfound", "Unfound objects"); + pcb.add_u64(l_cluster_num_bytes, "num_bytes", "Size of all objects", NULL, 0, unit_t(UNIT_BYTES)); + cluster_logger = pcb.create_perf_counters(); + } + + paxos->init_logger(); + + // verify cluster_uuid + { + int r = check_fsid(); + if (r == -ENOENT) + r = write_fsid(); + if (r < 0) { + return r; + } + } + + // open compatset + read_features(); + + // have we ever joined a quorum? + has_ever_joined = (store->get(MONITOR_NAME, "joined") != 0); + dout(10) << "has_ever_joined = " << (int)has_ever_joined << dendl; + + if (!has_ever_joined) { + // impose initial quorum restrictions? + list<string> initial_members; + get_str_list(g_conf()->mon_initial_members, initial_members); + + if (!initial_members.empty()) { + dout(1) << " initial_members " << initial_members << ", filtering seed monmap" << dendl; + + monmap->set_initial_members( + g_ceph_context, initial_members, name, messenger->get_myaddrs(), + &extra_probe_peers); + + dout(10) << " monmap is " << *monmap << dendl; + dout(10) << " extra probe peers " << extra_probe_peers << dendl; + } + } else if (!monmap->contains(name)) { + derr << "not in monmap and have been in a quorum before; " + << "must have been removed" << dendl; + if (g_conf()->mon_force_quorum_join) { + dout(0) << "we should have died but " + << "'mon_force_quorum_join' is set -- allowing boot" << dendl; + } else { + derr << "commit suicide!" << dendl; + return -ENOENT; + } + } + + { + // We have a potentially inconsistent store state in hands. Get rid of it + // and start fresh. + bool clear_store = false; + if (store->exists("mon_sync", "in_sync")) { + dout(1) << __func__ << " clean up potentially inconsistent store state" + << dendl; + clear_store = true; + } + + if (store->get("mon_sync", "force_sync") > 0) { + dout(1) << __func__ << " force sync by clearing store state" << dendl; + clear_store = true; + } + + if (clear_store) { + set<string> sync_prefixes = get_sync_targets_names(); + store->clear(sync_prefixes); + } + } + + sync_last_committed_floor = store->get("mon_sync", "last_committed_floor"); + dout(10) << "sync_last_committed_floor " << sync_last_committed_floor << dendl; + + init_paxos(); + + if (is_keyring_required()) { + // we need to bootstrap authentication keys so we can form an + // initial quorum. + if (authmon()->get_last_committed() == 0) { + dout(10) << "loading initial keyring to bootstrap authentication for mkfs" << dendl; + bufferlist bl; + int err = store->get("mkfs", "keyring", bl); + if (err == 0 && bl.length() > 0) { + // Attempt to decode and extract keyring only if it is found. + KeyRing keyring; + auto p = bl.cbegin(); + decode(keyring, p); + extract_save_mon_key(keyring); + } + } + + string keyring_loc = g_conf()->mon_data + "/keyring"; + + r = keyring.load(cct, keyring_loc); + if (r < 0) { + EntityName mon_name; + mon_name.set_type(CEPH_ENTITY_TYPE_MON); + EntityAuth mon_key; + if (key_server.get_auth(mon_name, mon_key)) { + dout(1) << "copying mon. 
key from old db to external keyring" << dendl;
+        keyring.add(mon_name, mon_key);
+        bufferlist bl;
+        keyring.encode_plaintext(bl);
+        write_default_keyring(bl);
+      } else {
+        derr << "unable to load initial keyring " << g_conf()->keyring << dendl;
+        return r;
+      }
+    }
+  }
+
+  admin_hook = new AdminHook(this);
+  AdminSocket* admin_socket = cct->get_admin_socket();
+
+  // unlock while registering to avoid mon_lock -> admin socket lock dependency.
+  l.unlock();
+  // register tell/asock commands
+  for (const auto& command : local_mon_commands) {
+    if (!command.is_tell()) {
+      continue;
+    }
+    const auto prefix = cmddesc_get_prefix(command.cmdstring);
+    if (prefix == "injectargs" ||
+        prefix == "version" ||
+        prefix == "tell") {
+      // not registered by me
+      continue;
+    }
+    r = admin_socket->register_command(command.cmdstring, admin_hook,
+                                       command.helpstring);
+    ceph_assert(r == 0);
+  }
+  l.lock();
+
+  // add ourselves as a conf observer
+  g_conf().add_observer(this);
+
+  messenger->set_auth_client(this);
+  messenger->set_auth_server(this);
+  mgr_messenger->set_auth_client(this);
+
+  auth_registry.refresh_config();
+
+  return 0;
+}
+
+int Monitor::init()
+{
+  dout(2) << "init" << dendl;
+  std::lock_guard l(lock);
+
+  finisher.start();
+
+  // start ticker
+  timer.init();
+  new_tick();
+
+  cpu_tp.start();
+
+  // i'm ready!
+  messenger->add_dispatcher_tail(this);
+
+  // kickstart pet mgrclient
+  mgr_client.init();
+  mgr_messenger->add_dispatcher_tail(&mgr_client);
+  mgr_messenger->add_dispatcher_tail(this);  // for auth ms_* calls
+  mgrmon()->prime_mgr_client();
+
+  state = STATE_PROBING;
+
+  bootstrap();
+
+  if (!elector.peer_tracker_is_clean()){
+    dout(10) << "peer_tracker looks inconsistent"
+             << " (previous bad logic), clearing ..." << dendl;
+    elector.notify_clear_peer_state();
+  }
+
+  // add features of myself into feature_map
+  session_map.feature_map.add_mon(con_self->get_features());
+  return 0;
+}
+
+void Monitor::init_paxos()
+{
+  dout(10) << __func__ << dendl;
+  paxos->init();
+
+  // init services
+  for (auto& svc : paxos_service) {
+    svc->init();
+  }
+
+  refresh_from_paxos(NULL);
+}
+
+void Monitor::refresh_from_paxos(bool *need_bootstrap)
+{
+  dout(10) << __func__ << dendl;
+
+  bufferlist bl;
+  int r = store->get(MONITOR_NAME, "cluster_fingerprint", bl);
+  if (r >= 0) {
+    try {
+      auto p = bl.cbegin();
+      decode(fingerprint, p);
+    }
+    catch (ceph::buffer::error& e) {
+      dout(10) << __func__ << " failed to decode cluster_fingerprint" << dendl;
+    }
+  } else {
+    dout(10) << __func__ << " no cluster_fingerprint" << dendl;
+  }
+
+  for (auto& svc : paxos_service) {
+    svc->refresh(need_bootstrap);
+  }
+  for (auto& svc : paxos_service) {
+    svc->post_refresh();
+  }
+  load_metadata();
+}
+
+void Monitor::register_cluster_logger()
+{
+  if (!cluster_logger_registered) {
+    dout(10) << "register_cluster_logger" << dendl;
+    cluster_logger_registered = true;
+    cct->get_perfcounters_collection()->add(cluster_logger);
+  } else {
+    dout(10) << "register_cluster_logger - already registered" << dendl;
+  }
+}
+
+void Monitor::unregister_cluster_logger()
+{
+  if (cluster_logger_registered) {
+    dout(10) << "unregister_cluster_logger" << dendl;
+    cluster_logger_registered = false;
+    cct->get_perfcounters_collection()->remove(cluster_logger);
+  } else {
+    dout(10) << "unregister_cluster_logger - not registered" << dendl;
+  }
+}
+
+void Monitor::update_logger()
+{
+  cluster_logger->set(l_cluster_num_mon, monmap->size());
+  cluster_logger->set(l_cluster_num_mon_quorum, quorum.size());
+}
+
+void Monitor::shutdown()
+{
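+  // Editor's note (added commentary, not part of the original change): the
+  // teardown order below is deliberate -- flush any in-flight paxos write
+  // first, drop `lock` around blocking calls (conf-observer removal, the
+  // finisher drain, messenger shutdown) to avoid lock-order inversions, and
+  // only remove the perf counters after the messengers have stopped
+  // dispatching, since _ms_dispatch() may still touch the logger until then.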
dout(1) << "shutdown" << dendl; + + lock.lock(); + + wait_for_paxos_write(); + + { + std::lock_guard l(auth_lock); + authmon()->_set_mon_num_rank(0, 0); + } + + state = STATE_SHUTDOWN; + + lock.unlock(); + g_conf().remove_observer(this); + lock.lock(); + + if (admin_hook) { + cct->get_admin_socket()->unregister_commands(admin_hook); + delete admin_hook; + admin_hook = NULL; + } + + elector.shutdown(); + + mgr_client.shutdown(); + + lock.unlock(); + finisher.wait_for_empty(); + finisher.stop(); + lock.lock(); + + // clean up + paxos->shutdown(); + for (auto& svc : paxos_service) { + svc->shutdown(); + } + + finish_contexts(g_ceph_context, waitfor_quorum, -ECANCELED); + finish_contexts(g_ceph_context, maybe_wait_for_quorum, -ECANCELED); + + timer.shutdown(); + + cpu_tp.stop(); + + remove_all_sessions(); + + log_client.shutdown(); + + // unlock before msgr shutdown... + lock.unlock(); + + // shutdown messenger before removing logger from perfcounter collection, + // otherwise _ms_dispatch() will try to update deleted logger + messenger->shutdown(); + mgr_messenger->shutdown(); + + if (logger) { + cct->get_perfcounters_collection()->remove(logger); + } + if (cluster_logger) { + if (cluster_logger_registered) + cct->get_perfcounters_collection()->remove(cluster_logger); + delete cluster_logger; + cluster_logger = NULL; + } +} + +void Monitor::wait_for_paxos_write() +{ + if (paxos->is_writing() || paxos->is_writing_previous()) { + dout(10) << __func__ << " flushing pending write" << dendl; + lock.unlock(); + store->flush(); + lock.lock(); + dout(10) << __func__ << " flushed pending write" << dendl; + } +} + +void Monitor::respawn() +{ + // --- WARNING TO FUTURE COPY/PASTERS --- + // You must also add a call like + // + // ceph_pthread_setname(pthread_self(), "ceph-mon"); + // + // to main() so that /proc/$pid/stat field 2 contains "(ceph-mon)" + // instead of "(exe)", so that killall (and log rotation) will work. + + dout(0) << __func__ << dendl; + + char *new_argv[orig_argc+1]; + dout(1) << " e: '" << orig_argv[0] << "'" << dendl; + for (int i=0; i<orig_argc; i++) { + new_argv[i] = (char *)orig_argv[i]; + dout(1) << " " << i << ": '" << orig_argv[i] << "'" << dendl; + } + new_argv[orig_argc] = NULL; + + /* Determine the path to our executable, test if Linux /proc/self/exe exists. + * This allows us to exec the same executable even if it has since been + * unlinked. + */ + char exe_path[PATH_MAX] = ""; +#ifdef PROCPREFIX + if (readlink(PROCPREFIX "/proc/self/exe", exe_path, PATH_MAX-1) != -1) { + dout(1) << "respawning with exe " << exe_path << dendl; + strcpy(exe_path, PROCPREFIX "/proc/self/exe"); + } else { +#else + { +#endif + /* Print CWD for the user's interest */ + char buf[PATH_MAX]; + char *cwd = getcwd(buf, sizeof(buf)); + ceph_assert(cwd); + dout(1) << " cwd " << cwd << dendl; + + /* Fall back to a best-effort: just running in our CWD */ + strncpy(exe_path, orig_argv[0], PATH_MAX-1); + } + + dout(1) << " exe_path " << exe_path << dendl; + + unblock_all_signals(NULL); + execv(exe_path, new_argv); + + dout(0) << "respawn execv " << orig_argv[0] + << " failed with " << cpp_strerror(errno) << dendl; + + // We have to assert out here, because suicide() returns, and callers + // to respawn expect it never to return. 
+ ceph_abort(); +} + +void Monitor::bootstrap() +{ + dout(10) << "bootstrap" << dendl; + wait_for_paxos_write(); + + sync_reset_requester(); + unregister_cluster_logger(); + cancel_probe_timeout(); + + if (monmap->get_epoch() == 0) { + dout(10) << "reverting to legacy ranks for seed monmap (epoch 0)" << dendl; + monmap->calc_legacy_ranks(); + } + dout(10) << "monmap " << *monmap << dendl; + { + auto from_release = monmap->min_mon_release; + ostringstream err; + if (!can_upgrade_from(from_release, "min_mon_release", err)) { + derr << "current monmap has " << err.str() << " stopping." << dendl; + exit(0); + } + } + // note my rank + int newrank = monmap->get_rank(messenger->get_myaddrs()); + if (newrank < 0 && rank >= 0) { + // was i ever part of the quorum? + if (has_ever_joined) { + dout(0) << " removed from monmap, suicide." << dendl; + exit(0); + } + elector.notify_clear_peer_state(); + } + if (newrank >= 0 && + monmap->get_addrs(newrank) != messenger->get_myaddrs()) { + dout(0) << " monmap addrs for rank " << newrank << " changed, i am " + << messenger->get_myaddrs() + << ", monmap is " << monmap->get_addrs(newrank) << ", respawning" + << dendl; + + if (monmap->get_epoch()) { + // store this map in temp mon_sync location so that we use it on + // our next startup + derr << " stashing newest monmap " << monmap->get_epoch() + << " for next startup" << dendl; + bufferlist bl; + monmap->encode(bl, -1); + auto t(std::make_shared<MonitorDBStore::Transaction>()); + t->put("mon_sync", "temp_newer_monmap", bl); + store->apply_transaction(t); + } + + respawn(); + } + if (newrank != rank) { + dout(0) << " my rank is now " << newrank << " (was " << rank << ")" << dendl; + messenger->set_myname(entity_name_t::MON(newrank)); + rank = newrank; + elector.notify_rank_changed(rank); + + // reset all connections, or else our peers will think we are someone else. + messenger->mark_down_all(); + } + + // reset + state = STATE_PROBING; + + _reset(); + + // sync store + if (g_conf()->mon_compact_on_bootstrap) { + dout(10) << "bootstrap -- triggering compaction" << dendl; + store->compact(); + dout(10) << "bootstrap -- finished compaction" << dendl; + } + + // stretch mode bits + set_elector_disallowed_leaders(false); + + // singleton monitor? 
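+  // Editor's note (added commentary): a single-monitor map short-circuits
+  // the probe/election machinery below -- with rank 0 in a size-1 monmap
+  // there is nobody to probe, so we can immediately declare a standalone
+  // victory instead of arming the probe timeout.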
+ if (monmap->size() == 1 && rank == 0) { + win_standalone_election(); + return; + } + + reset_probe_timeout(); + + // i'm outside the quorum + if (monmap->contains(name)) + outside_quorum.insert(name); + + // probe monitors + dout(10) << "probing other monitors" << dendl; + for (unsigned i = 0; i < monmap->size(); i++) { + if ((int)i != rank) + send_mon_message( + new MMonProbe(monmap->fsid, MMonProbe::OP_PROBE, name, has_ever_joined, + ceph_release()), + i); + } + for (auto& av : extra_probe_peers) { + if (av != messenger->get_myaddrs()) { + messenger->send_to_mon( + new MMonProbe(monmap->fsid, MMonProbe::OP_PROBE, name, has_ever_joined, + ceph_release()), + av); + } + } +} + +bool Monitor::_add_bootstrap_peer_hint(std::string_view cmd, + const cmdmap_t& cmdmap, + ostream& ss) +{ + if (is_leader() || is_peon()) { + ss << "mon already active; ignoring bootstrap hint"; + return true; + } + + entity_addrvec_t addrs; + string addrstr; + if (cmd_getval(cmdmap, "addr", addrstr)) { + dout(10) << "_add_bootstrap_peer_hint '" << cmd << "' addr '" + << addrstr << "'" << dendl; + + entity_addr_t addr; + const char *end = 0; + if (!addr.parse(addrstr.c_str(), &end, entity_addr_t::TYPE_ANY)) { + ss << "failed to parse addrs '" << addrstr + << "'; syntax is 'add_bootstrap_peer_hint ip[:port]'"; + return false; + } + + addrs.v.push_back(addr); + if (addr.get_port() == 0) { + addrs.v[0].set_type(entity_addr_t::TYPE_MSGR2); + addrs.v[0].set_port(CEPH_MON_PORT_IANA); + addrs.v.push_back(addr); + addrs.v[1].set_type(entity_addr_t::TYPE_LEGACY); + addrs.v[1].set_port(CEPH_MON_PORT_LEGACY); + } else if (addr.get_type() == entity_addr_t::TYPE_ANY) { + if (addr.get_port() == CEPH_MON_PORT_LEGACY) { + addrs.v[0].set_type(entity_addr_t::TYPE_LEGACY); + } else { + addrs.v[0].set_type(entity_addr_t::TYPE_MSGR2); + } + } + } else if (cmd_getval(cmdmap, "addrv", addrstr)) { + dout(10) << "_add_bootstrap_peer_hintv '" << cmd << "' addrv '" + << addrstr << "'" << dendl; + const char *end = 0; + if (!addrs.parse(addrstr.c_str(), &end)) { + ss << "failed to parse addrs '" << addrstr + << "'; syntax is 'add_bootstrap_peer_hintv v2:ip:port[,v1:ip:port]'"; + return false; + } + } else { + ss << "no addr or addrv provided"; + return false; + } + + extra_probe_peers.insert(addrs); + ss << "adding peer " << addrs << " to list: " << extra_probe_peers; + return true; +} + +// called by bootstrap(), or on leader|peon -> electing +void Monitor::_reset() +{ + dout(10) << __func__ << dendl; + + // disable authentication + { + std::lock_guard l(auth_lock); + authmon()->_set_mon_num_rank(0, 0); + } + + cancel_probe_timeout(); + timecheck_finish(); + health_events_cleanup(); + health_check_log_times.clear(); + scrub_event_cancel(); + + leader_since = utime_t(); + quorum_since = {}; + if (!quorum.empty()) { + exited_quorum = ceph_clock_now(); + } + quorum.clear(); + outside_quorum.clear(); + quorum_feature_map.clear(); + + scrub_reset(); + + paxos->restart(); + + for (auto& svc : paxos_service) { + svc->restart(); + } +} + + +// ----------------------------------------------------------- +// sync + +set<string> Monitor::get_sync_targets_names() +{ + set<string> targets; + targets.insert(paxos->get_name()); + for (auto& svc : paxos_service) { + svc->get_store_prefixes(targets); + } + return targets; +} + + +void Monitor::sync_timeout() +{ + dout(10) << __func__ << dendl; + ceph_assert(state == STATE_SYNCHRONIZING); + bootstrap(); +} + +void Monitor::sync_obtain_latest_monmap(bufferlist &bl) +{ + dout(1) << __func__ << dendl; + + MonMap 
latest_monmap; + + // Grab latest monmap from MonmapMonitor + bufferlist monmon_bl; + int err = monmon()->get_monmap(monmon_bl); + if (err < 0) { + if (err != -ENOENT) { + derr << __func__ + << " something wrong happened while reading the store: " + << cpp_strerror(err) << dendl; + ceph_abort_msg("error reading the store"); + } + } else { + latest_monmap.decode(monmon_bl); + } + + // Grab last backed up monmap (if any) and compare epochs + if (store->exists("mon_sync", "latest_monmap")) { + bufferlist backup_bl; + int err = store->get("mon_sync", "latest_monmap", backup_bl); + if (err < 0) { + derr << __func__ + << " something wrong happened while reading the store: " + << cpp_strerror(err) << dendl; + ceph_abort_msg("error reading the store"); + } + ceph_assert(backup_bl.length() > 0); + + MonMap backup_monmap; + backup_monmap.decode(backup_bl); + + if (backup_monmap.epoch > latest_monmap.epoch) + latest_monmap = backup_monmap; + } + + // Check if our current monmap's epoch is greater than the one we've + // got so far. + if (monmap->epoch > latest_monmap.epoch) + latest_monmap = *monmap; + + dout(1) << __func__ << " obtained monmap e" << latest_monmap.epoch << dendl; + + latest_monmap.encode(bl, CEPH_FEATURES_ALL); +} + +void Monitor::sync_reset_requester() +{ + dout(10) << __func__ << dendl; + + if (sync_timeout_event) { + timer.cancel_event(sync_timeout_event); + sync_timeout_event = NULL; + } + + sync_provider = entity_addrvec_t(); + sync_cookie = 0; + sync_full = false; + sync_start_version = 0; +} + +void Monitor::sync_reset_provider() +{ + dout(10) << __func__ << dendl; + sync_providers.clear(); +} + +void Monitor::sync_start(entity_addrvec_t &addrs, bool full) +{ + dout(10) << __func__ << " " << addrs << (full ? " full" : " recent") << dendl; + + ceph_assert(state == STATE_PROBING || + state == STATE_SYNCHRONIZING); + state = STATE_SYNCHRONIZING; + + // make sure are not a provider for anyone! + sync_reset_provider(); + + sync_full = full; + + if (sync_full) { + // stash key state, and mark that we are syncing + auto t(std::make_shared<MonitorDBStore::Transaction>()); + sync_stash_critical_state(t); + t->put("mon_sync", "in_sync", 1); + + sync_last_committed_floor = std::max(sync_last_committed_floor, paxos->get_version()); + dout(10) << __func__ << " marking sync in progress, storing sync_last_committed_floor " + << sync_last_committed_floor << dendl; + t->put("mon_sync", "last_committed_floor", sync_last_committed_floor); + + store->apply_transaction(t); + + ceph_assert(g_conf()->mon_sync_requester_kill_at != 1); + + // clear the underlying store + set<string> targets = get_sync_targets_names(); + dout(10) << __func__ << " clearing prefixes " << targets << dendl; + store->clear(targets); + + // make sure paxos knows it has been reset. this prevents a + // bootstrap and then different probe reply order from possibly + // deciding a partial or no sync is needed. + paxos->init(); + + ceph_assert(g_conf()->mon_sync_requester_kill_at != 2); + } + + // assume 'other' as the leader. We will update the leader once we receive + // a reply to the sync start. + sync_provider = addrs; + + sync_reset_timeout(); + + MMonSync *m = new MMonSync(sync_full ? 
MMonSync::OP_GET_COOKIE_FULL : MMonSync::OP_GET_COOKIE_RECENT); + if (!sync_full) + m->last_committed = paxos->get_version(); + messenger->send_to_mon(m, sync_provider); +} + +void Monitor::sync_stash_critical_state(MonitorDBStore::TransactionRef t) +{ + dout(10) << __func__ << dendl; + bufferlist backup_monmap; + sync_obtain_latest_monmap(backup_monmap); + ceph_assert(backup_monmap.length() > 0); + t->put("mon_sync", "latest_monmap", backup_monmap); +} + +void Monitor::sync_reset_timeout() +{ + dout(10) << __func__ << dendl; + if (sync_timeout_event) + timer.cancel_event(sync_timeout_event); + sync_timeout_event = timer.add_event_after( + g_conf()->mon_sync_timeout, + new C_MonContext{this, [this](int) { + sync_timeout(); + }}); +} + +void Monitor::sync_finish(version_t last_committed) +{ + dout(10) << __func__ << " lc " << last_committed << " from " << sync_provider << dendl; + + ceph_assert(g_conf()->mon_sync_requester_kill_at != 7); + + if (sync_full) { + // finalize the paxos commits + auto tx(std::make_shared<MonitorDBStore::Transaction>()); + paxos->read_and_prepare_transactions(tx, sync_start_version, + last_committed); + tx->put(paxos->get_name(), "last_committed", last_committed); + + dout(30) << __func__ << " final tx dump:\n"; + JSONFormatter f(true); + tx->dump(&f); + f.flush(*_dout); + *_dout << dendl; + + store->apply_transaction(tx); + } + + ceph_assert(g_conf()->mon_sync_requester_kill_at != 8); + + auto t(std::make_shared<MonitorDBStore::Transaction>()); + t->erase("mon_sync", "in_sync"); + t->erase("mon_sync", "force_sync"); + t->erase("mon_sync", "last_committed_floor"); + store->apply_transaction(t); + + ceph_assert(g_conf()->mon_sync_requester_kill_at != 9); + + init_paxos(); + + ceph_assert(g_conf()->mon_sync_requester_kill_at != 10); + + bootstrap(); +} + +void Monitor::handle_sync(MonOpRequestRef op) +{ + auto m = op->get_req<MMonSync>(); + dout(10) << __func__ << " " << *m << dendl; + switch (m->op) { + + // provider --------- + + case MMonSync::OP_GET_COOKIE_FULL: + case MMonSync::OP_GET_COOKIE_RECENT: + handle_sync_get_cookie(op); + break; + case MMonSync::OP_GET_CHUNK: + handle_sync_get_chunk(op); + break; + + // client ----------- + + case MMonSync::OP_COOKIE: + handle_sync_cookie(op); + break; + + case MMonSync::OP_CHUNK: + case MMonSync::OP_LAST_CHUNK: + handle_sync_chunk(op); + break; + case MMonSync::OP_NO_COOKIE: + handle_sync_no_cookie(op); + break; + + default: + dout(0) << __func__ << " unknown op " << m->op << dendl; + ceph_abort_msg("unknown op"); + } +} + +// leader + +void Monitor::_sync_reply_no_cookie(MonOpRequestRef op) +{ + auto m = op->get_req<MMonSync>(); + MMonSync *reply = new MMonSync(MMonSync::OP_NO_COOKIE, m->cookie); + m->get_connection()->send_message(reply); +} + +void Monitor::handle_sync_get_cookie(MonOpRequestRef op) +{ + auto m = op->get_req<MMonSync>(); + if (is_synchronizing()) { + _sync_reply_no_cookie(op); + return; + } + + ceph_assert(g_conf()->mon_sync_provider_kill_at != 1); + + // make sure they can understand us. + if ((required_features ^ m->get_connection()->get_features()) & + required_features) { + dout(5) << " ignoring peer mon." << m->get_source().num() + << " has features " << std::hex + << m->get_connection()->get_features() + << " but we require " << required_features << std::dec << dendl; + return; + } + + // make up a unique cookie. include election epoch (which persists + // across restarts for the whole cluster) and a counter for this + // process instance. 
there is no need to be unique *across* + // monitors, though. + uint64_t cookie = ((unsigned long long)elector.get_epoch() << 24) + ++sync_provider_count; + ceph_assert(sync_providers.count(cookie) == 0); + + dout(10) << __func__ << " cookie " << cookie << " for " << m->get_source_inst() << dendl; + + SyncProvider& sp = sync_providers[cookie]; + sp.cookie = cookie; + sp.addrs = m->get_source_addrs(); + sp.reset_timeout(g_ceph_context, g_conf()->mon_sync_timeout * 2); + + set<string> sync_targets; + if (m->op == MMonSync::OP_GET_COOKIE_FULL) { + // full scan + sync_targets = get_sync_targets_names(); + sp.last_committed = paxos->get_version(); + sp.synchronizer = store->get_synchronizer(sp.last_key, sync_targets); + sp.full = true; + dout(10) << __func__ << " will sync prefixes " << sync_targets << dendl; + } else { + // just catch up paxos + sp.last_committed = m->last_committed; + } + dout(10) << __func__ << " will sync from version " << sp.last_committed << dendl; + + MMonSync *reply = new MMonSync(MMonSync::OP_COOKIE, sp.cookie); + reply->last_committed = sp.last_committed; + m->get_connection()->send_message(reply); +} + +void Monitor::handle_sync_get_chunk(MonOpRequestRef op) +{ + auto m = op->get_req<MMonSync>(); + dout(10) << __func__ << " " << *m << dendl; + + if (sync_providers.count(m->cookie) == 0) { + dout(10) << __func__ << " no cookie " << m->cookie << dendl; + _sync_reply_no_cookie(op); + return; + } + + ceph_assert(g_conf()->mon_sync_provider_kill_at != 2); + + SyncProvider& sp = sync_providers[m->cookie]; + sp.reset_timeout(g_ceph_context, g_conf()->mon_sync_timeout * 2); + + if (sp.last_committed < paxos->get_first_committed() && + paxos->get_first_committed() > 1) { + dout(10) << __func__ << " sync requester fell behind paxos, their lc " << sp.last_committed + << " < our fc " << paxos->get_first_committed() << dendl; + sync_providers.erase(m->cookie); + _sync_reply_no_cookie(op); + return; + } + + MMonSync *reply = new MMonSync(MMonSync::OP_CHUNK, sp.cookie); + auto tx(std::make_shared<MonitorDBStore::Transaction>()); + + int bytes_left = g_conf()->mon_sync_max_payload_size; + int keys_left = g_conf()->mon_sync_max_payload_keys; + while (sp.last_committed < paxos->get_version() && + bytes_left > 0 && + keys_left > 0) { + bufferlist bl; + sp.last_committed++; + + int err = store->get(paxos->get_name(), sp.last_committed, bl); + ceph_assert(err == 0); + + tx->put(paxos->get_name(), sp.last_committed, bl); + bytes_left -= bl.length(); + --keys_left; + dout(20) << __func__ << " including paxos state " << sp.last_committed + << dendl; + } + reply->last_committed = sp.last_committed; + + if (sp.full && bytes_left > 0 && keys_left > 0) { + sp.synchronizer->get_chunk_tx(tx, bytes_left, keys_left); + sp.last_key = sp.synchronizer->get_last_key(); + reply->last_key = sp.last_key; + } + + if ((sp.full && sp.synchronizer->has_next_chunk()) || + sp.last_committed < paxos->get_version()) { + dout(10) << __func__ << " chunk, through version " << sp.last_committed + << " key " << sp.last_key << dendl; + } else { + dout(10) << __func__ << " last chunk, through version " << sp.last_committed + << " key " << sp.last_key << dendl; + reply->op = MMonSync::OP_LAST_CHUNK; + + ceph_assert(g_conf()->mon_sync_provider_kill_at != 3); + + // clean up our local state + sync_providers.erase(sp.cookie); + } + + encode(*tx, reply->chunk_bl); + + m->get_connection()->send_message(reply); +} + +// requester + +void Monitor::handle_sync_cookie(MonOpRequestRef op) +{ + auto m = op->get_req<MMonSync>(); + 
dout(10) << __func__ << " " << *m << dendl; + if (sync_cookie) { + dout(10) << __func__ << " already have a cookie, ignoring" << dendl; + return; + } + if (m->get_source_addrs() != sync_provider) { + dout(10) << __func__ << " source does not match, discarding" << dendl; + return; + } + sync_cookie = m->cookie; + sync_start_version = m->last_committed; + + sync_reset_timeout(); + sync_get_next_chunk(); + + ceph_assert(g_conf()->mon_sync_requester_kill_at != 3); +} + +void Monitor::sync_get_next_chunk() +{ + dout(20) << __func__ << " cookie " << sync_cookie << " provider " << sync_provider << dendl; + if (g_conf()->mon_inject_sync_get_chunk_delay > 0) { + dout(20) << __func__ << " injecting delay of " << g_conf()->mon_inject_sync_get_chunk_delay << dendl; + usleep((long long)(g_conf()->mon_inject_sync_get_chunk_delay * 1000000.0)); + } + MMonSync *r = new MMonSync(MMonSync::OP_GET_CHUNK, sync_cookie); + messenger->send_to_mon(r, sync_provider); + + ceph_assert(g_conf()->mon_sync_requester_kill_at != 4); +} + +void Monitor::handle_sync_chunk(MonOpRequestRef op) +{ + auto m = op->get_req<MMonSync>(); + dout(10) << __func__ << " " << *m << dendl; + + if (m->cookie != sync_cookie) { + dout(10) << __func__ << " cookie does not match, discarding" << dendl; + return; + } + if (m->get_source_addrs() != sync_provider) { + dout(10) << __func__ << " source does not match, discarding" << dendl; + return; + } + + ceph_assert(state == STATE_SYNCHRONIZING); + ceph_assert(g_conf()->mon_sync_requester_kill_at != 5); + + auto tx(std::make_shared<MonitorDBStore::Transaction>()); + tx->append_from_encoded(m->chunk_bl); + + dout(30) << __func__ << " tx dump:\n"; + JSONFormatter f(true); + tx->dump(&f); + f.flush(*_dout); + *_dout << dendl; + + store->apply_transaction(tx); + + ceph_assert(g_conf()->mon_sync_requester_kill_at != 6); + + if (!sync_full) { + dout(10) << __func__ << " applying recent paxos transactions as we go" << dendl; + auto tx(std::make_shared<MonitorDBStore::Transaction>()); + paxos->read_and_prepare_transactions(tx, paxos->get_version() + 1, + m->last_committed); + tx->put(paxos->get_name(), "last_committed", m->last_committed); + + dout(30) << __func__ << " tx dump:\n"; + JSONFormatter f(true); + tx->dump(&f); + f.flush(*_dout); + *_dout << dendl; + + store->apply_transaction(tx); + paxos->init(); // to refresh what we just wrote + } + + if (m->op == MMonSync::OP_CHUNK) { + sync_reset_timeout(); + sync_get_next_chunk(); + } else if (m->op == MMonSync::OP_LAST_CHUNK) { + sync_finish(m->last_committed); + } +} + +void Monitor::handle_sync_no_cookie(MonOpRequestRef op) +{ + dout(10) << __func__ << dendl; + bootstrap(); +} + +void Monitor::sync_trim_providers() +{ + dout(20) << __func__ << dendl; + + utime_t now = ceph_clock_now(); + map<uint64_t,SyncProvider>::iterator p = sync_providers.begin(); + while (p != sync_providers.end()) { + if (now > p->second.timeout) { + dout(10) << __func__ << " expiring cookie " << p->second.cookie + << " for " << p->second.addrs << dendl; + sync_providers.erase(p++); + } else { + ++p; + } + } +} + +// --------------------------------------------------- +// probe + +void Monitor::cancel_probe_timeout() +{ + if (probe_timeout_event) { + dout(10) << "cancel_probe_timeout " << probe_timeout_event << dendl; + timer.cancel_event(probe_timeout_event); + probe_timeout_event = NULL; + } else { + dout(10) << "cancel_probe_timeout (none scheduled)" << dendl; + } +} + +void Monitor::reset_probe_timeout() +{ + cancel_probe_timeout(); + probe_timeout_event = new 
C_MonContext{this, [this](int r) {
+      probe_timeout(r);
+    }};
+  double t = g_conf()->mon_probe_timeout;
+  if (timer.add_event_after(t, probe_timeout_event)) {
+    dout(10) << "reset_probe_timeout " << probe_timeout_event
+             << " after " << t << " seconds" << dendl;
+  } else {
+    probe_timeout_event = nullptr;
+  }
+}
+
+void Monitor::probe_timeout(int r)
+{
+  dout(4) << "probe_timeout " << probe_timeout_event << dendl;
+  ceph_assert(is_probing() || is_synchronizing());
+  ceph_assert(probe_timeout_event);
+  probe_timeout_event = NULL;
+  bootstrap();
+}
+
+void Monitor::handle_probe(MonOpRequestRef op)
+{
+  auto m = op->get_req<MMonProbe>();
+  dout(10) << "handle_probe " << *m << dendl;
+
+  if (m->fsid != monmap->fsid) {
+    dout(0) << "handle_probe ignoring fsid " << m->fsid << " != " << monmap->fsid << dendl;
+    return;
+  }
+
+  switch (m->op) {
+  case MMonProbe::OP_PROBE:
+    handle_probe_probe(op);
+    break;
+
+  case MMonProbe::OP_REPLY:
+    handle_probe_reply(op);
+    break;
+
+  case MMonProbe::OP_MISSING_FEATURES:
+    derr << __func__ << " require release " << (int)m->mon_release << " > "
+         << (int)ceph_release()
+         << ", or missing features (have " << CEPH_FEATURES_ALL
+         << ", required " << m->required_features
+         << ", missing " << (m->required_features & ~CEPH_FEATURES_ALL) << ")"
+         << dendl;
+    break;
+  }
+}
+
+void Monitor::handle_probe_probe(MonOpRequestRef op)
+{
+  auto m = op->get_req<MMonProbe>();
+
+  dout(10) << "handle_probe_probe " << m->get_source_inst() << *m
+           << " features " << m->get_connection()->get_features() << dendl;
+  uint64_t missing = required_features & ~m->get_connection()->get_features();
+  if ((m->mon_release != ceph_release_t::unknown &&
+       m->mon_release < monmap->min_mon_release) ||
+      missing) {
+    dout(1) << " peer " << m->get_source_addr()
+            << " release " << m->mon_release
+            << " < min_mon_release " << monmap->min_mon_release
+            << ", or missing features " << missing << dendl;
+    MMonProbe *r = new MMonProbe(monmap->fsid, MMonProbe::OP_MISSING_FEATURES,
+                                 name, has_ever_joined, monmap->min_mon_release);
+    r->required_features = required_features;
+    m->get_connection()->send_message(r);
+    goto out;
+  }
+
+  if (!is_probing() && !is_synchronizing()) {
+    // If the probing mon is way ahead of us, we need to re-bootstrap.
+    // Normally we capture this case when we initially bootstrap, but
+    // it is possible we pass those checks (we overlap with
+    // quorum-to-be) but fail to join a quorum before it moves past
+    // us.  We need to be kicked back to bootstrap so we can
+    // synchronize, not keep calling elections.
+    if (paxos->get_version() + 1 < m->paxos_first_version) {
+      dout(1) << " peer " << m->get_source_addr() << " has first_committed "
+              << "ahead of us, re-bootstrapping" << dendl;
+      bootstrap();
+      goto out;
+
+    }
+  }
+
+  MMonProbe *r;
+  r = new MMonProbe(monmap->fsid, MMonProbe::OP_REPLY, name, has_ever_joined,
+                    ceph_release());
+  r->name = name;
+  r->quorum = quorum;
+  r->leader = leader;
+  monmap->encode(r->monmap_bl, m->get_connection()->get_features());
+  r->paxos_first_version = paxos->get_first_committed();
+  r->paxos_last_version = paxos->get_version();
+  m->get_connection()->send_message(r);
+
+  // did we discover a peer here?
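+  // Editor's note (added commentary): probes double as peer discovery --
+  // a prober we have never heard of is remembered as an extra probe hint
+  // for our own bootstrap, while a known monmap member gets a connectivity
+  // ping so the elector's peer tracking stays current.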
+ if (!monmap->contains(m->get_source_addr())) { + dout(1) << " adding peer " << m->get_source_addrs() + << " to list of hints" << dendl; + extra_probe_peers.insert(m->get_source_addrs()); + } else { + elector.begin_peer_ping(monmap->get_rank(m->get_source_addr())); + } + + out: + return; +} + +void Monitor::handle_probe_reply(MonOpRequestRef op) +{ + auto m = op->get_req<MMonProbe>(); + dout(10) << "handle_probe_reply " << m->get_source_inst() + << " " << *m << dendl; + dout(10) << " monmap is " << *monmap << dendl; + + // discover name and addrs during probing or electing states. + if (!is_probing() && !is_electing()) { + return; + } + + // newer map, or they've joined a quorum and we haven't? + bufferlist mybl; + monmap->encode(mybl, m->get_connection()->get_features()); + // make sure it's actually different; the checks below err toward + // taking the other guy's map, which could cause us to loop. + if (!mybl.contents_equal(m->monmap_bl)) { + MonMap *newmap = new MonMap; + newmap->decode(m->monmap_bl); + if (m->has_ever_joined && (newmap->get_epoch() > monmap->get_epoch() || + !has_ever_joined)) { + dout(10) << " got newer/committed monmap epoch " << newmap->get_epoch() + << ", mine was " << monmap->get_epoch() << dendl; + int epoch_diff = newmap->get_epoch() - monmap->get_epoch(); + delete newmap; + monmap->decode(m->monmap_bl); + dout(20) << "has_ever_joined: " << has_ever_joined << dendl; + if (epoch_diff == 1 && has_ever_joined) { + notify_new_monmap(false); + } else { + notify_new_monmap(false, false); + elector.notify_clear_peer_state(); + } + bootstrap(); + return; + } + delete newmap; + } + + // rename peer? + string peer_name = monmap->get_name(m->get_source_addr()); + if (monmap->get_epoch() == 0 && peer_name.compare(0, 7, "noname-") == 0) { + dout(10) << " renaming peer " << m->get_source_addr() << " " + << peer_name << " -> " << m->name << " in my monmap" + << dendl; + monmap->rename(peer_name, m->name); + + if (is_electing()) { + bootstrap(); + return; + } + } else if (peer_name.size()) { + dout(10) << " peer name is " << peer_name << dendl; + } else { + dout(10) << " peer " << m->get_source_addr() << " not in map" << dendl; + } + + // new initial peer? + if (monmap->get_epoch() == 0 && + monmap->contains(m->name) && + monmap->get_addrs(m->name).front().is_blank_ip()) { + dout(1) << " learned initial mon " << m->name + << " addrs " << m->get_source_addrs() << dendl; + monmap->set_addrvec(m->name, m->get_source_addrs()); + + bootstrap(); + return; + } + + // end discover phase + if (!is_probing()) { + return; + } + + ceph_assert(paxos != NULL); + + if (is_synchronizing()) { + dout(10) << " currently syncing" << dendl; + return; + } + + entity_addrvec_t other = m->get_source_addrs(); + + if (m->paxos_last_version < sync_last_committed_floor) { + dout(10) << " peer paxos versions [" << m->paxos_first_version + << "," << m->paxos_last_version << "] < my sync_last_committed_floor " + << sync_last_committed_floor << ", ignoring" + << dendl; + } else { + if (paxos->get_version() < m->paxos_first_version && + m->paxos_first_version > 1) { // no need to sync if we're 0 and they start at 1. 
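+      /* Editor's sketch (added commentary) of the two-tier rule around this
+       * branch, in terms of the names already in scope:
+       *
+       *   paxos->get_version() < m->paxos_first_version  // they trimmed past us
+       *     -> sync_start(other, true);                  // full store sync
+       *   paxos->get_version() + paxos_max_join_drift < m->paxos_last_version
+       *     -> sync_start(other, false);                 // catch up recent paxos
+       */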
+ dout(10) << " peer paxos first versions [" << m->paxos_first_version + << "," << m->paxos_last_version << "]" + << " vs my version " << paxos->get_version() + << " (too far ahead)" + << dendl; + cancel_probe_timeout(); + sync_start(other, true); + return; + } + if (paxos->get_version() + g_conf()->paxos_max_join_drift < m->paxos_last_version) { + dout(10) << " peer paxos last version " << m->paxos_last_version + << " vs my version " << paxos->get_version() + << " (too far ahead)" + << dendl; + cancel_probe_timeout(); + sync_start(other, false); + return; + } + } + + // did the existing cluster complete upgrade to luminous? + if (osdmon()->osdmap.get_epoch()) { + if (osdmon()->osdmap.require_osd_release < ceph_release_t::luminous) { + derr << __func__ << " existing cluster has not completed upgrade to" + << " luminous; 'ceph osd require_osd_release luminous' before" + << " upgrading" << dendl; + exit(0); + } + if (!osdmon()->osdmap.test_flag(CEPH_OSDMAP_PURGED_SNAPDIRS) || + !osdmon()->osdmap.test_flag(CEPH_OSDMAP_RECOVERY_DELETES)) { + derr << __func__ << " existing cluster has not completed a full luminous" + << " scrub to purge legacy snapdir objects; please scrub before" + << " upgrading beyond luminous." << dendl; + exit(0); + } + } + + // is there an existing quorum? + if (m->quorum.size()) { + dout(10) << " existing quorum " << m->quorum << dendl; + + dout(10) << " peer paxos version " << m->paxos_last_version + << " vs my version " << paxos->get_version() + << " (ok)" + << dendl; + bool in_map = false; + const auto my_info = monmap->mon_info.find(name); + const map<string,string> *map_crush_loc{nullptr}; + if (my_info != monmap->mon_info.end()) { + in_map = true; + map_crush_loc = &my_info->second.crush_loc; + } + if (in_map && + !monmap->get_addrs(name).front().is_blank_ip() && + (!need_set_crush_loc || (*map_crush_loc == crush_loc))) { + // i'm part of the cluster; just initiate a new election + start_election(); + } else { + dout(10) << " ready to join, but i'm not in the monmap/" + "my addr is blank/location is wrong, trying to join" << dendl; + send_mon_message(new MMonJoin(monmap->fsid, name, + messenger->get_myaddrs(), crush_loc, + need_set_crush_loc), + m->leader); + } + } else { + if (monmap->contains(m->name)) { + dout(10) << " mon." << m->name << " is outside the quorum" << dendl; + outside_quorum.insert(m->name); + } else { + dout(10) << " mostly ignoring mon." << m->name << ", not part of monmap" << dendl; + return; + } + + unsigned need = monmap->min_quorum_size(); + dout(10) << " outside_quorum now " << outside_quorum << ", need " << need << dendl; + if (outside_quorum.size() >= need) { + if (outside_quorum.count(name)) { + dout(10) << " that's enough to form a new quorum, calling election" << dendl; + start_election(); + } else { + dout(10) << " that's enough to form a new quorum, but it does not include me; waiting" << dendl; + } + } else { + dout(10) << " that's not yet enough for a new quorum, waiting" << dendl; + } + } +} + +void Monitor::join_election() +{ + dout(10) << __func__ << dendl; + wait_for_paxos_write(); + _reset(); + state = STATE_ELECTING; + + logger->inc(l_mon_num_elections); +} + +void Monitor::start_election() +{ + dout(10) << "start_election" << dendl; + wait_for_paxos_write(); + _reset(); + state = STATE_ELECTING; + + logger->inc(l_mon_num_elections); + logger->inc(l_mon_election_call); + + clog->info() << "mon." 
<< name << " calling monitor election";
+  elector.call_election();
+}
+
+void Monitor::win_standalone_election()
+{
+  dout(1) << "win_standalone_election" << dendl;
+
+  // bump election epoch, in case the previous epoch included other
+  // monitors; we need to be able to make the distinction.
+  elector.declare_standalone_victory();
+
+  rank = monmap->get_rank(name);
+  ceph_assert(rank == 0);
+  set<int> q;
+  q.insert(rank);
+
+  map<int,Metadata> metadata;
+  collect_metadata(&metadata[0]);
+
+  win_election(elector.get_epoch(), q,
+               CEPH_FEATURES_ALL,
+               ceph::features::mon::get_supported(),
+               ceph_release(),
+               metadata);
+}
+
+const utime_t& Monitor::get_leader_since() const
+{
+  ceph_assert(state == STATE_LEADER);
+  return leader_since;
+}
+
+epoch_t Monitor::get_epoch()
+{
+  return elector.get_epoch();
+}
+
+void Monitor::_finish_svc_election()
+{
+  ceph_assert(state == STATE_LEADER || state == STATE_PEON);
+
+  for (auto& svc : paxos_service) {
+    // we already called election_finished() on monmon(); avoid calling twice
+    if (state == STATE_LEADER && svc.get() == monmon())
+      continue;
+    svc->election_finished();
+  }
+}
+
+void Monitor::win_election(epoch_t epoch, const set<int>& active, uint64_t features,
+                           const mon_feature_t& mon_features,
+                           ceph_release_t min_mon_release,
+                           const map<int,Metadata>& metadata)
+{
+  dout(10) << __func__ << " epoch " << epoch << " quorum " << active
+           << " features " << features
+           << " mon_features " << mon_features
+           << " min_mon_release " << min_mon_release
+           << dendl;
+  ceph_assert(is_electing());
+  state = STATE_LEADER;
+  leader_since = ceph_clock_now();
+  quorum_since = mono_clock::now();
+  leader = rank;
+  quorum = active;
+  quorum_con_features = features;
+  quorum_mon_features = mon_features;
+  quorum_min_mon_release = min_mon_release;
+  pending_metadata = metadata;
+  outside_quorum.clear();
+
+  clog->info() << "mon." << name << " is new leader, mons " << get_quorum_names()
+               << " in quorum (ranks " << quorum << ")";
+
+  set_leader_commands(get_local_commands(mon_features));
+
+  paxos->leader_init();
+  // NOTE: tell monmap monitor first.  This is important for the
+  // bootstrap case to ensure that the very first paxos proposal
+  // codifies the monmap.  Otherwise any manner of chaos can ensue
+  // when monitors call elections or participate in a paxos round
+  // without agreeing on who the participants are.
+  monmon()->election_finished();
+  _finish_svc_election();
+
+  logger->inc(l_mon_election_win);
+
+  // inject new metadata in first transaction.
+  {
+    // include previous metadata for missing mons (that aren't part of
+    // the current quorum).
+    map<int,Metadata> m = metadata;
+    for (unsigned rank = 0; rank < monmap->size(); ++rank) {
+      if (m.count(rank) == 0 &&
+          mon_metadata.count(rank)) {
+        m[rank] = mon_metadata[rank];
+      }
+    }
+
+    // FIXME: This is a bit sloppy because we aren't guaranteed to submit
+    // a new transaction immediately after the election finishes.  We should
+    // do that anyway for other reasons, though.
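+    // Editor's note (added commentary): the loop above carried forward
+    // mon_metadata for ranks absent from this quorum, so the persisted
+    // "last_metadata" stays complete and queries such as `ceph mon metadata`
+    // can still describe monitors that are down or partitioned away.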
+ MonitorDBStore::TransactionRef t = paxos->get_pending_transaction(); + bufferlist bl; + encode(m, bl); + t->put(MONITOR_STORE_PREFIX, "last_metadata", bl); + } + + finish_election(); + if (monmap->size() > 1 && + monmap->get_epoch() > 0) { + timecheck_start(); + health_tick_start(); + + // Freshen the health status before doing health_to_clog in case + // our just-completed election changed the health + healthmon()->wait_for_active_ctx(new LambdaContext([this](int r){ + dout(20) << "healthmon now active" << dendl; + healthmon()->tick(); + if (healthmon()->is_proposing()) { + dout(20) << __func__ << " healthmon proposing, waiting" << dendl; + healthmon()->wait_for_finished_proposal(nullptr, new C_MonContext{this, + [this](int r){ + ceph_assert(ceph_mutex_is_locked_by_me(lock)); + do_health_to_clog_interval(); + }}); + + } else { + do_health_to_clog_interval(); + } + })); + + scrub_event_start(); + } +} + +void Monitor::lose_election(epoch_t epoch, set<int> &q, int l, + uint64_t features, + const mon_feature_t& mon_features, + ceph_release_t min_mon_release) +{ + state = STATE_PEON; + leader_since = utime_t(); + quorum_since = mono_clock::now(); + leader = l; + quorum = q; + outside_quorum.clear(); + quorum_con_features = features; + quorum_mon_features = mon_features; + quorum_min_mon_release = min_mon_release; + dout(10) << "lose_election, epoch " << epoch << " leader is mon" << leader + << " quorum is " << quorum << " features are " << quorum_con_features + << " mon_features are " << quorum_mon_features + << " min_mon_release " << min_mon_release + << dendl; + + paxos->peon_init(); + _finish_svc_election(); + + logger->inc(l_mon_election_lose); + + finish_election(); +} + +namespace { +std::string collect_compression_algorithms() +{ + ostringstream os; + bool printed = false; + for (auto [name, key] : Compressor::compression_algorithms) { + if (printed) { + os << ", "; + } else { + printed = true; + } + std::ignore = key; + os << name; + } + return os.str(); +} +} + +void Monitor::collect_metadata(Metadata *m) +{ + collect_sys_info(m, g_ceph_context); + (*m)["addrs"] = stringify(messenger->get_myaddrs()); + (*m)["compression_algorithms"] = collect_compression_algorithms(); + + // infer storage device + string devname = store->get_devname(); + set<string> devnames; + get_raw_devices(devname, &devnames); + map<string,string> errs; + get_device_metadata(devnames, m, &errs); + for (auto& i : errs) { + dout(1) << __func__ << " " << i.first << ": " << i.second << dendl; + } +} + +void Monitor::finish_election() +{ + apply_quorum_to_compatset_features(); + apply_monmap_to_compatset_features(); + timecheck_finish(); + exited_quorum = utime_t(); + finish_contexts(g_ceph_context, waitfor_quorum); + finish_contexts(g_ceph_context, maybe_wait_for_quorum); + resend_routed_requests(); + update_logger(); + register_cluster_logger(); + + // enable authentication + { + std::lock_guard l(auth_lock); + authmon()->_set_mon_num_rank(monmap->size(), rank); + } + + // am i named and located properly? 
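+  // Editor's note (added commentary): if the monmap knows us under a
+  // different name (or, with stretch mode, a different crush location),
+  // we ask the leader to fix it via MMonJoin and return early; the
+  // stretch-mode election work below runs only once identity and
+  // location agree.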
+ string cur_name = monmap->get_name(messenger->get_myaddrs()); + const auto my_infop = monmap->mon_info.find(cur_name); + const map<string,string>& map_crush_loc = my_infop->second.crush_loc; + + if (cur_name != name || + (need_set_crush_loc && map_crush_loc != crush_loc)) { + dout(10) << " renaming/moving myself from " << cur_name << "/" + << map_crush_loc <<" -> " << name << "/" << crush_loc << dendl; + send_mon_message(new MMonJoin(monmap->fsid, name, messenger->get_myaddrs(), + crush_loc, need_set_crush_loc), + leader); + return; + } + do_stretch_mode_election_work(); +} + +void Monitor::_apply_compatset_features(CompatSet &new_features) +{ + if (new_features.compare(features) != 0) { + CompatSet diff = features.unsupported(new_features); + dout(1) << __func__ << " enabling new quorum features: " << diff << dendl; + features = new_features; + + auto t = std::make_shared<MonitorDBStore::Transaction>(); + write_features(t); + store->apply_transaction(t); + + calc_quorum_requirements(); + } +} + +void Monitor::apply_quorum_to_compatset_features() +{ + CompatSet new_features(features); + new_features.incompat.insert(CEPH_MON_FEATURE_INCOMPAT_OSD_ERASURE_CODES); + if (quorum_con_features & CEPH_FEATURE_OSDMAP_ENC) { + new_features.incompat.insert(CEPH_MON_FEATURE_INCOMPAT_OSDMAP_ENC); + } + new_features.incompat.insert(CEPH_MON_FEATURE_INCOMPAT_ERASURE_CODE_PLUGINS_V2); + new_features.incompat.insert(CEPH_MON_FEATURE_INCOMPAT_ERASURE_CODE_PLUGINS_V3); + dout(5) << __func__ << dendl; + _apply_compatset_features(new_features); +} + +void Monitor::apply_monmap_to_compatset_features() +{ + CompatSet new_features(features); + mon_feature_t monmap_features = monmap->get_required_features(); + + /* persistent monmap features may go into the compatset. + * optional monmap features may not - why? + * because optional monmap features may be set/unset by the admin, + * and possibly by other means that haven't yet been thought out, + * so we can't make the monitor enforce them on start - because they + * may go away. + * this, of course, does not invalidate setting a compatset feature + * for an optional feature - as long as you make sure to clean it up + * once you unset it. + */ + if (monmap_features.contains_all(ceph::features::mon::FEATURE_KRAKEN)) { + ceph_assert(ceph::features::mon::get_persistent().contains_all( + ceph::features::mon::FEATURE_KRAKEN)); + // this feature should only ever be set if the quorum supports it. + ceph_assert(HAVE_FEATURE(quorum_con_features, SERVER_KRAKEN)); + new_features.incompat.insert(CEPH_MON_FEATURE_INCOMPAT_KRAKEN); + } + if (monmap_features.contains_all(ceph::features::mon::FEATURE_LUMINOUS)) { + ceph_assert(ceph::features::mon::get_persistent().contains_all( + ceph::features::mon::FEATURE_LUMINOUS)); + // this feature should only ever be set if the quorum supports it. + ceph_assert(HAVE_FEATURE(quorum_con_features, SERVER_LUMINOUS)); + new_features.incompat.insert(CEPH_MON_FEATURE_INCOMPAT_LUMINOUS); + } + if (monmap_features.contains_all(ceph::features::mon::FEATURE_MIMIC)) { + ceph_assert(ceph::features::mon::get_persistent().contains_all( + ceph::features::mon::FEATURE_MIMIC)); + // this feature should only ever be set if the quorum supports it. 
+ ceph_assert(HAVE_FEATURE(quorum_con_features, SERVER_MIMIC)); + new_features.incompat.insert(CEPH_MON_FEATURE_INCOMPAT_MIMIC); + } + if (monmap_features.contains_all(ceph::features::mon::FEATURE_NAUTILUS)) { + ceph_assert(ceph::features::mon::get_persistent().contains_all( + ceph::features::mon::FEATURE_NAUTILUS)); + // this feature should only ever be set if the quorum supports it. + ceph_assert(HAVE_FEATURE(quorum_con_features, SERVER_NAUTILUS)); + new_features.incompat.insert(CEPH_MON_FEATURE_INCOMPAT_NAUTILUS); + } + if (monmap_features.contains_all(ceph::features::mon::FEATURE_OCTOPUS)) { + ceph_assert(ceph::features::mon::get_persistent().contains_all( + ceph::features::mon::FEATURE_OCTOPUS)); + // this feature should only ever be set if the quorum supports it. + ceph_assert(HAVE_FEATURE(quorum_con_features, SERVER_OCTOPUS)); + new_features.incompat.insert(CEPH_MON_FEATURE_INCOMPAT_OCTOPUS); + } + if (monmap_features.contains_all(ceph::features::mon::FEATURE_PACIFIC)) { + ceph_assert(ceph::features::mon::get_persistent().contains_all( + ceph::features::mon::FEATURE_PACIFIC)); + // this feature should only ever be set if the quorum supports it. + ceph_assert(HAVE_FEATURE(quorum_con_features, SERVER_PACIFIC)); + new_features.incompat.insert(CEPH_MON_FEATURE_INCOMPAT_PACIFIC); + } + + dout(5) << __func__ << dendl; + _apply_compatset_features(new_features); +} + +void Monitor::calc_quorum_requirements() +{ + required_features = 0; + + // compatset + if (features.incompat.contains(CEPH_MON_FEATURE_INCOMPAT_OSDMAP_ENC)) { + required_features |= CEPH_FEATURE_OSDMAP_ENC; + } + if (features.incompat.contains(CEPH_MON_FEATURE_INCOMPAT_KRAKEN)) { + required_features |= CEPH_FEATUREMASK_SERVER_KRAKEN; + } + if (features.incompat.contains(CEPH_MON_FEATURE_INCOMPAT_LUMINOUS)) { + required_features |= CEPH_FEATUREMASK_SERVER_LUMINOUS; + } + if (features.incompat.contains(CEPH_MON_FEATURE_INCOMPAT_MIMIC)) { + required_features |= CEPH_FEATUREMASK_SERVER_MIMIC; + } + if (features.incompat.contains(CEPH_MON_FEATURE_INCOMPAT_NAUTILUS)) { + required_features |= CEPH_FEATUREMASK_SERVER_NAUTILUS | + CEPH_FEATUREMASK_CEPHX_V2; + } + if (features.incompat.contains(CEPH_MON_FEATURE_INCOMPAT_OCTOPUS)) { + required_features |= CEPH_FEATUREMASK_SERVER_OCTOPUS; + } + if (features.incompat.contains(CEPH_MON_FEATURE_INCOMPAT_PACIFIC)) { + required_features |= CEPH_FEATUREMASK_SERVER_PACIFIC; + } + + // monmap + if (monmap->get_required_features().contains_all( + ceph::features::mon::FEATURE_KRAKEN)) { + required_features |= CEPH_FEATUREMASK_SERVER_KRAKEN; + } + if (monmap->get_required_features().contains_all( + ceph::features::mon::FEATURE_LUMINOUS)) { + required_features |= CEPH_FEATUREMASK_SERVER_LUMINOUS; + } + if (monmap->get_required_features().contains_all( + ceph::features::mon::FEATURE_MIMIC)) { + required_features |= CEPH_FEATUREMASK_SERVER_MIMIC; + } + if (monmap->get_required_features().contains_all( + ceph::features::mon::FEATURE_NAUTILUS)) { + required_features |= CEPH_FEATUREMASK_SERVER_NAUTILUS | + CEPH_FEATUREMASK_CEPHX_V2; + } + dout(10) << __func__ << " required_features " << required_features << dendl; +} + +void Monitor::get_combined_feature_map(FeatureMap *fm) +{ + *fm += session_map.feature_map; + for (auto id : quorum) { + if (id != rank) { + *fm += quorum_feature_map[id]; + } + } +} + +void Monitor::sync_force(Formatter *f) +{ + auto tx(std::make_shared<MonitorDBStore::Transaction>()); + sync_stash_critical_state(tx); + tx->put("mon_sync", "force_sync", 1); + 
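+  // Editor's note (added commentary): like "in_sync", the "force_sync" key
+  // is consumed during the next preinit(), where a positive value makes the
+  // monitor clear all sync-target prefixes and rebuild its store from a peer.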
store->apply_transaction(tx);
+
+  f->open_object_section("sync_force");
+  f->dump_int("ret", 0);
+  f->dump_stream("msg") << "forcing store sync the next time the monitor starts";
+  f->close_section(); // sync_force
+}
+
+void Monitor::_quorum_status(Formatter *f, ostream& ss)
+{
+  bool free_formatter = false;
+
+  if (!f) {
+    // lousy/lazy hack: default to json if no formatter has been defined
+    f = new JSONFormatter();
+    free_formatter = true;
+  }
+  f->open_object_section("quorum_status");
+  f->dump_int("election_epoch", get_epoch());
+
+  f->open_array_section("quorum");
+  for (set<int>::iterator p = quorum.begin(); p != quorum.end(); ++p)
+    f->dump_int("mon", *p);
+  f->close_section(); // quorum
+
+  list<string> quorum_names = get_quorum_names();
+  f->open_array_section("quorum_names");
+  for (list<string>::iterator p = quorum_names.begin(); p != quorum_names.end(); ++p)
+    f->dump_string("mon", *p);
+  f->close_section(); // quorum_names
+
+  f->dump_string("quorum_leader_name", quorum.empty() ? string() : monmap->get_name(leader));
+
+  if (!quorum.empty()) {
+    f->dump_int(
+      "quorum_age",
+      quorum_age());
+  }
+
+  f->open_object_section("features");
+  f->dump_stream("quorum_con") << quorum_con_features;
+  quorum_mon_features.dump(f, "quorum_mon");
+  f->close_section();
+
+  f->open_object_section("monmap");
+  monmap->dump(f);
+  f->close_section(); // monmap
+
+  f->close_section(); // quorum_status
+  f->flush(ss);
+  if (free_formatter)
+    delete f;
+}
+
+void Monitor::get_mon_status(Formatter *f)
+{
+  f->open_object_section("mon_status");
+  f->dump_string("name", name);
+  f->dump_int("rank", rank);
+  f->dump_string("state", get_state_name());
+  f->dump_int("election_epoch", get_epoch());
+
+  f->open_array_section("quorum");
+  for (set<int>::iterator p = quorum.begin(); p != quorum.end(); ++p) {
+    f->dump_int("mon", *p);
+  }
+  f->close_section(); // quorum
+
+  if (!quorum.empty()) {
+    f->dump_int(
+      "quorum_age",
+      quorum_age());
+  }
+
+  f->open_object_section("features");
+  f->dump_stream("required_con") << required_features;
+  mon_feature_t req_mon_features = get_required_mon_features();
+  req_mon_features.dump(f, "required_mon");
+  f->dump_stream("quorum_con") << quorum_con_features;
+  quorum_mon_features.dump(f, "quorum_mon");
+  f->close_section(); // features
+
+  f->open_array_section("outside_quorum");
+  for (set<string>::iterator p = outside_quorum.begin(); p != outside_quorum.end(); ++p)
+    f->dump_string("mon", *p);
+  f->close_section(); // outside_quorum
+
+  f->open_array_section("extra_probe_peers");
+  for (set<entity_addrvec_t>::iterator p = extra_probe_peers.begin();
+       p != extra_probe_peers.end();
+       ++p) {
+    f->dump_object("peer", *p);
+  }
+  f->close_section(); // extra_probe_peers
+
+  f->open_array_section("sync_provider");
+  for (map<uint64_t,SyncProvider>::const_iterator p = sync_providers.begin();
+       p != sync_providers.end();
+       ++p) {
+    f->dump_unsigned("cookie", p->second.cookie);
+    f->dump_object("addrs", p->second.addrs);
+    f->dump_stream("timeout") << p->second.timeout;
+    f->dump_unsigned("last_committed", p->second.last_committed);
+    f->dump_stream("last_key") << p->second.last_key;
+  }
+  f->close_section();
+
+  if (is_synchronizing()) {
+    f->open_object_section("sync");
+    f->dump_stream("sync_provider") << sync_provider;
+    f->dump_unsigned("sync_cookie", sync_cookie);
+    f->dump_unsigned("sync_start_version", sync_start_version);
+    f->close_section();
+  }
+
+  if (g_conf()->mon_sync_provider_kill_at > 0)
+    f->dump_int("provider_kill_at",
g_conf()->mon_sync_provider_kill_at); + if (g_conf()->mon_sync_requester_kill_at > 0) + f->dump_int("requester_kill_at", g_conf()->mon_sync_requester_kill_at); + + f->open_object_section("monmap"); + monmap->dump(f); + f->close_section(); + + f->dump_object("feature_map", session_map.feature_map); + f->dump_bool("stretch_mode", stretch_mode_engaged); + f->close_section(); // mon_status +} + + +// health status to clog + +void Monitor::health_tick_start() +{ + if (!cct->_conf->mon_health_to_clog || + cct->_conf->mon_health_to_clog_tick_interval <= 0) + return; + + dout(15) << __func__ << dendl; + + health_tick_stop(); + health_tick_event = timer.add_event_after( + cct->_conf->mon_health_to_clog_tick_interval, + new C_MonContext{this, [this](int r) { + if (r < 0) + return; + health_tick_start(); + }}); +} + +void Monitor::health_tick_stop() +{ + dout(15) << __func__ << dendl; + + if (health_tick_event) { + timer.cancel_event(health_tick_event); + health_tick_event = NULL; + } +} + +ceph::real_clock::time_point Monitor::health_interval_calc_next_update() +{ + auto now = ceph::real_clock::now(); + + auto secs = std::chrono::duration_cast<std::chrono::seconds>(now.time_since_epoch()); + int remainder = secs.count() % cct->_conf->mon_health_to_clog_interval; + int adjustment = cct->_conf->mon_health_to_clog_interval - remainder; + auto next = secs + std::chrono::seconds(adjustment); + + dout(20) << __func__ + << " now: " << now << "," + << " next: " << next << "," + << " interval: " << cct->_conf->mon_health_to_clog_interval + << dendl; + + return ceph::real_clock::time_point{next}; +} + +void Monitor::health_interval_start() +{ + dout(15) << __func__ << dendl; + + if (!cct->_conf->mon_health_to_clog || + cct->_conf->mon_health_to_clog_interval <= 0) { + return; + } + + health_interval_stop(); + auto next = health_interval_calc_next_update(); + health_interval_event = new C_MonContext{this, [this](int r) { + if (r < 0) + return; + do_health_to_clog_interval(); + }}; + if (!timer.add_event_at(next, health_interval_event)) { + health_interval_event = nullptr; + } +} + +void Monitor::health_interval_stop() +{ + dout(15) << __func__ << dendl; + if (health_interval_event) { + timer.cancel_event(health_interval_event); + } + health_interval_event = NULL; +} + +void Monitor::health_events_cleanup() +{ + health_tick_stop(); + health_interval_stop(); + health_status_cache.reset(); +} + +void Monitor::health_to_clog_update_conf(const std::set<std::string> &changed) +{ + dout(20) << __func__ << dendl; + + if (changed.count("mon_health_to_clog")) { + if (!cct->_conf->mon_health_to_clog) { + health_events_cleanup(); + return; + } else { + if (!health_tick_event) { + health_tick_start(); + } + if (!health_interval_event) { + health_interval_start(); + } + } + } + + if (changed.count("mon_health_to_clog_interval")) { + if (cct->_conf->mon_health_to_clog_interval <= 0) { + health_interval_stop(); + } else { + health_interval_start(); + } + } + + if (changed.count("mon_health_to_clog_tick_interval")) { + if (cct->_conf->mon_health_to_clog_tick_interval <= 0) { + health_tick_stop(); + } else { + health_tick_start(); + } + } +} + +void Monitor::do_health_to_clog_interval() +{ + // outputting to clog may have been disabled in the conf + // since we were scheduled. + if (!cct->_conf->mon_health_to_clog || + cct->_conf->mon_health_to_clog_interval <= 0) + return; + + dout(10) << __func__ << dendl; + + // do we have a cached value for next_clog_update? if not, + // do we know when the last update was? 
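+  // Editor's note (added commentary): neither question needs answering here;
+  // the interval path simply forces an update (bypassing the
+  // health_status_cache comparison inside do_health_to_clog) and then
+  // re-arms the next interval event.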
+ + do_health_to_clog(true); + health_interval_start(); +} + +void Monitor::do_health_to_clog(bool force) +{ + // outputting to clog may have been disabled in the conf + // since we were scheduled. + if (!cct->_conf->mon_health_to_clog || + cct->_conf->mon_health_to_clog_interval <= 0) + return; + + dout(10) << __func__ << (force ? " (force)" : "") << dendl; + + string summary; + health_status_t level = healthmon()->get_health_status(false, nullptr, &summary); + if (!force && + summary == health_status_cache.summary && + level == health_status_cache.overall) + return; + + if (g_conf()->mon_health_detail_to_clog && + summary != health_status_cache.summary && + level != HEALTH_OK) { + string details; + level = healthmon()->get_health_status(true, nullptr, &details); + clog->health(level) << "Health detail: " << details; + } else { + clog->health(level) << "overall " << summary; + } + health_status_cache.summary = summary; + health_status_cache.overall = level; +} + +void Monitor::log_health( + const health_check_map_t& updated, + const health_check_map_t& previous, + MonitorDBStore::TransactionRef t) +{ + if (!g_conf()->mon_health_to_clog) { + return; + } + + const utime_t now = ceph_clock_now(); + + // FIXME: log atomically as part of @t instead of using clog. + dout(10) << __func__ << " updated " << updated.checks.size() + << " previous " << previous.checks.size() + << dendl; + const auto min_log_period = g_conf().get_val<int64_t>( + "mon_health_log_update_period"); + for (auto& p : updated.checks) { + auto q = previous.checks.find(p.first); + bool logged = false; + if (q == previous.checks.end()) { + // new + ostringstream ss; + ss << "Health check failed: " << p.second.summary << " (" + << p.first << ")"; + clog->health(p.second.severity) << ss.str(); + + logged = true; + } else { + if (p.second.summary != q->second.summary || + p.second.severity != q->second.severity) { + + auto status_iter = health_check_log_times.find(p.first); + if (status_iter != health_check_log_times.end()) { + if (p.second.severity == q->second.severity && + now - status_iter->second.updated_at < min_log_period) { + // We already logged this recently and the severity is unchanged, + // so skip emitting an update of the summary string. + // We'll get an update out of tick() later if the check + // is still failing. + continue; + } + } + + // summary or severity changed (ignore detail changes at this level) + ostringstream ss; + ss << "Health check update: " << p.second.summary << " (" << p.first << ")"; + clog->health(p.second.severity) << ss.str(); + + logged = true; + } + } + // Record the time at which we last logged, so that we can check this + // when considering whether/when to print update messages. 
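+    // Editor's note (added commentary): together with the min_log_period
+    // check above, this timestamp rate-limits repeated summary updates --
+    // an update with unchanged severity inside mon_health_log_update_period
+    // is skipped, while any severity change is always logged immediately.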
+ if (logged) { + auto iter = health_check_log_times.find(p.first); + if (iter == health_check_log_times.end()) { + health_check_log_times.emplace(p.first, HealthCheckLogStatus( + p.second.severity, p.second.summary, now)); + } else { + iter->second = HealthCheckLogStatus( + p.second.severity, p.second.summary, now); + } + } + } + for (auto& p : previous.checks) { + if (!updated.checks.count(p.first)) { + // cleared + ostringstream ss; + if (p.first == "DEGRADED_OBJECTS") { + clog->info() << "All degraded objects recovered"; + } else if (p.first == "OSD_FLAGS") { + clog->info() << "OSD flags cleared"; + } else { + clog->info() << "Health check cleared: " << p.first << " (was: " + << p.second.summary << ")"; + } + + if (health_check_log_times.count(p.first)) { + health_check_log_times.erase(p.first); + } + } + } + + if (previous.checks.size() && updated.checks.size() == 0) { + // We might be going into a fully healthy state, check + // other subsystems + bool any_checks = false; + for (auto& svc : paxos_service) { + if (&(svc->get_health_checks()) == &(previous)) { + // Ignore the ones we're clearing right now + continue; + } + + if (svc->get_health_checks().checks.size() > 0) { + any_checks = true; + break; + } + } + if (!any_checks) { + clog->info() << "Cluster is now healthy"; + } + } +} + +void Monitor::update_pending_metadata() +{ + Metadata metadata; + collect_metadata(&metadata); + size_t version_size = mon_metadata[rank]["ceph_version_short"].size(); + const std::string current_version = mon_metadata[rank]["ceph_version_short"]; + const std::string pending_version = metadata["ceph_version_short"]; + + if (current_version.compare(0, version_size, pending_version) < 0) { + mgr_client.update_daemon_metadata("mon", name, metadata); + } +} + +void Monitor::get_cluster_status(stringstream &ss, Formatter *f, + MonSession *session) +{ + if (f) + f->open_object_section("status"); + + const auto&& fs_names = session->get_allowed_fs_names(); + + if (f) { + f->dump_stream("fsid") << monmap->get_fsid(); + healthmon()->get_health_status(false, f, nullptr); + f->dump_unsigned("election_epoch", get_epoch()); + { + f->open_array_section("quorum"); + for (set<int>::iterator p = quorum.begin(); p != quorum.end(); ++p) + f->dump_int("rank", *p); + f->close_section(); + f->open_array_section("quorum_names"); + for (set<int>::iterator p = quorum.begin(); p != quorum.end(); ++p) + f->dump_string("id", monmap->get_name(*p)); + f->close_section(); + f->dump_int( + "quorum_age", + quorum_age()); + } + f->open_object_section("monmap"); + monmap->dump_summary(f); + f->close_section(); + f->open_object_section("osdmap"); + osdmon()->osdmap.print_summary(f, cout, string(12, ' ')); + f->close_section(); + f->open_object_section("pgmap"); + mgrstatmon()->print_summary(f, NULL); + f->close_section(); + f->open_object_section("fsmap"); + + FSMap fsmap_copy = mdsmon()->get_fsmap(); + if (!fs_names.empty()) { + fsmap_copy.filter(fs_names); + } + const FSMap *fsmapp = &fsmap_copy; + + fsmapp->print_summary(f, NULL); + f->close_section(); + f->open_object_section("mgrmap"); + mgrmon()->get_map().print_summary(f, nullptr); + f->close_section(); + + f->dump_object("servicemap", mgrstatmon()->get_service_map()); + + f->open_object_section("progress_events"); + for (auto& i : mgrstatmon()->get_progress_events()) { + f->dump_object(i.first.c_str(), i.second); + } + f->close_section(); + + f->close_section(); + } else { + ss << " cluster:\n"; + ss << " id: " << monmap->get_fsid() << "\n"; + + string health; + 
healthmon()->get_health_status(false, nullptr, &health, + "\n ", "\n "); + ss << " health: " << health << "\n"; + + ss << "\n \n services:\n"; + { + size_t maxlen = 3; + auto& service_map = mgrstatmon()->get_service_map(); + for (auto& p : service_map.services) { + maxlen = std::max(maxlen, p.first.size()); + } + string spacing(maxlen - 3, ' '); + const auto quorum_names = get_quorum_names(); + const auto mon_count = monmap->mon_info.size(); + auto mnow = ceph::mono_clock::now(); + ss << " mon: " << spacing << mon_count << " daemons, quorum " + << quorum_names << " (age " << timespan_str(mnow - quorum_since) << ")"; + if (quorum_names.size() != mon_count) { + std::list<std::string> out_of_q; + for (size_t i = 0; i < monmap->ranks.size(); ++i) { + if (quorum.count(i) == 0) { + out_of_q.push_back(monmap->ranks[i]); + } + } + ss << ", out of quorum: " << joinify(out_of_q.begin(), + out_of_q.end(), std::string(", ")); + } + ss << "\n"; + if (mgrmon()->in_use()) { + ss << " mgr: " << spacing; + mgrmon()->get_map().print_summary(nullptr, &ss); + ss << "\n"; + } + + FSMap fsmap_copy = mdsmon()->get_fsmap(); + if (!fs_names.empty()) { + fsmap_copy.filter(fs_names); + } + const FSMap *fsmapp = &fsmap_copy; + + if (fsmapp->filesystem_count() > 0 and mdsmon()->should_print_status()){ + ss << " mds: " << spacing; + fsmapp->print_daemon_summary(ss); + ss << "\n"; + } + + ss << " osd: " << spacing; + osdmon()->osdmap.print_summary(NULL, ss, string(maxlen + 6, ' ')); + ss << "\n"; + for (auto& p : service_map.services) { + const std::string &service = p.first; + // filter out normal ceph entity types + if (ServiceMap::is_normal_ceph_entity(service)) { + continue; + } + ss << " " << p.first << ": " << string(maxlen - p.first.size(), ' ') + << p.second.get_summary() << "\n"; + } + } + + if (auto& service_map = mgrstatmon()->get_service_map(); + std::any_of(service_map.services.begin(), + service_map.services.end(), + [](auto& service) { + return service.second.has_running_tasks(); + })) { + ss << "\n \n task status:\n"; + for (auto& [name, service] : service_map.services) { + ss << service.get_task_summary(name); + } + } + + ss << "\n \n data:\n"; + mdsmon()->print_fs_summary(ss); + mgrstatmon()->print_summary(NULL, &ss); + + auto& pem = mgrstatmon()->get_progress_events(); + if (!pem.empty()) { + ss << "\n \n progress:\n"; + for (auto& i : pem) { + if (i.second.add_to_ceph_s){ + ss << " " << i.second.message << "\n"; + } + } + } + ss << "\n "; + } +} + +void Monitor::_generate_command_map(cmdmap_t& cmdmap, + map<string,string> ¶m_str_map) +{ + for (auto p = cmdmap.begin(); p != cmdmap.end(); ++p) { + if (p->first == "prefix") + continue; + if (p->first == "caps") { + vector<string> cv; + if (cmd_getval(cmdmap, "caps", cv) && + cv.size() % 2 == 0) { + for (unsigned i = 0; i < cv.size(); i += 2) { + string k = string("caps_") + cv[i]; + param_str_map[k] = cv[i + 1]; + } + continue; + } + } + param_str_map[p->first] = cmd_vartype_stringify(p->second); + } +} + +const MonCommand *Monitor::_get_moncommand( + const string &cmd_prefix, + const vector<MonCommand>& cmds) +{ + for (auto& c : cmds) { + if (c.cmdstring.compare(0, cmd_prefix.size(), cmd_prefix) == 0) { + return &c; + } + } + return nullptr; +} + +bool Monitor::_allowed_command(MonSession *s, const string &module, + const string &prefix, const cmdmap_t& cmdmap, + const map<string,string>& param_str_map, + const MonCommand *this_cmd) { + + bool cmd_r = this_cmd->requires_perm('r'); + bool cmd_w = this_cmd->requires_perm('w'); + bool cmd_x = 
this_cmd->requires_perm('x'); + + bool capable = s->caps.is_capable( + g_ceph_context, + s->entity_name, + module, prefix, param_str_map, + cmd_r, cmd_w, cmd_x, + s->get_peer_socket_addr()); + + dout(10) << __func__ << " " << (capable ? "" : "not ") << "capable" << dendl; + return capable; +} + +void Monitor::format_command_descriptions(const std::vector<MonCommand> &commands, + Formatter *f, + uint64_t features, + bufferlist *rdata) +{ + int cmdnum = 0; + f->open_object_section("command_descriptions"); + for (const auto &cmd : commands) { + unsigned flags = cmd.flags; + ostringstream secname; + secname << "cmd" << setfill('0') << std::setw(3) << cmdnum; + dump_cmddesc_to_json(f, features, secname.str(), + cmd.cmdstring, cmd.helpstring, cmd.module, + cmd.req_perms, flags); + cmdnum++; + } + f->close_section(); // command_descriptions + + f->flush(*rdata); +} + +bool Monitor::is_keyring_required() +{ + return auth_cluster_required.is_supported_auth(CEPH_AUTH_CEPHX) || + auth_service_required.is_supported_auth(CEPH_AUTH_CEPHX) || + auth_cluster_required.is_supported_auth(CEPH_AUTH_GSS) || + auth_service_required.is_supported_auth(CEPH_AUTH_GSS); +} + +struct C_MgrProxyCommand : public Context { + Monitor *mon; + MonOpRequestRef op; + uint64_t size; + bufferlist outbl; + string outs; + C_MgrProxyCommand(Monitor *mon, MonOpRequestRef op, uint64_t s) + : mon(mon), op(op), size(s) { } + void finish(int r) { + std::lock_guard l(mon->lock); + mon->mgr_proxy_bytes -= size; + mon->reply_command(op, r, outs, outbl, 0); + } +}; + +void Monitor::handle_tell_command(MonOpRequestRef op) +{ + ceph_assert(op->is_type_command()); + MCommand *m = static_cast<MCommand*>(op->get_req()); + if (m->fsid != monmap->fsid) { + dout(0) << "handle_command on fsid " << m->fsid << " != " << monmap->fsid << dendl; + return reply_tell_command(op, -EACCES, "wrong fsid"); + } + MonSession *session = op->get_session(); + if (!session) { + dout(5) << __func__ << " dropping stray message " << *m << dendl; + return; + } + cmdmap_t cmdmap; + if (stringstream ss; !cmdmap_from_json(m->cmd, &cmdmap, ss)) { + return reply_tell_command(op, -EINVAL, ss.str()); + } + map<string,string> param_str_map; + _generate_command_map(cmdmap, param_str_map); + string prefix; + if (!cmd_getval(cmdmap, "prefix", prefix)) { + return reply_tell_command(op, -EINVAL, "no prefix"); + } + if (auto cmd = _get_moncommand(prefix, + get_local_commands(quorum_mon_features)); + cmd) { + if (cmd->is_obsolete() || + (cct->_conf->mon_debug_deprecated_as_obsolete && + cmd->is_deprecated())) { + return reply_tell_command(op, -ENOTSUP, + "command is obsolete; " + "please check usage and/or man page"); + } + } + // see if command is allowed + if (!session->caps.is_capable( + g_ceph_context, + session->entity_name, + "mon", prefix, param_str_map, + true, true, true, + session->get_peer_socket_addr())) { + return reply_tell_command(op, -EACCES, "insufficient caps"); + } + // pass it to asok + cct->get_admin_socket()->queue_tell_command(m); +} + +void Monitor::handle_command(MonOpRequestRef op) +{ + ceph_assert(op->is_type_command()); + auto m = op->get_req<MMonCommand>(); + if (m->fsid != monmap->fsid) { + dout(0) << "handle_command on fsid " << m->fsid << " != " << monmap->fsid + << dendl; + reply_command(op, -EPERM, "wrong fsid", 0); + return; + } + + MonSession *session = op->get_session(); + if (!session) { + dout(5) << __func__ << " dropping stray message " << *m << dendl; + return; + } + + if (m->cmd.empty()) { + reply_command(op, -EINVAL, "no command specified", 
0); + return; + } + + string prefix; + vector<string> fullcmd; + cmdmap_t cmdmap; + stringstream ss, ds; + bufferlist rdata; + string rs; + int r = -EINVAL; + rs = "unrecognized command"; + + if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) { + // ss has reason for failure + r = -EINVAL; + rs = ss.str(); + if (!m->get_source().is_mon()) // don't reply to mon->mon commands + reply_command(op, r, rs, 0); + return; + } + + // check return value. If no prefix parameter provided, + // return value will be false, then return error info. + if (!cmd_getval(cmdmap, "prefix", prefix)) { + reply_command(op, -EINVAL, "command prefix not found", 0); + return; + } + + // check prefix is empty + if (prefix.empty()) { + reply_command(op, -EINVAL, "command prefix must not be empty", 0); + return; + } + + if (prefix == "get_command_descriptions") { + bufferlist rdata; + Formatter *f = Formatter::create("json"); + + std::vector<MonCommand> commands = static_cast<MgrMonitor*>( + paxos_service[PAXOS_MGR].get())->get_command_descs(); + + for (auto& c : leader_mon_commands) { + commands.push_back(c); + } + + auto features = m->get_connection()->get_features(); + format_command_descriptions(commands, f, features, &rdata); + delete f; + reply_command(op, 0, "", rdata, 0); + return; + } + + dout(0) << "handle_command " << *m << dendl; + + string format; + cmd_getval(cmdmap, "format", format, string("plain")); + boost::scoped_ptr<Formatter> f(Formatter::create(format)); + + get_str_vec(prefix, fullcmd); + + // make sure fullcmd is not empty. + // invalid prefix will cause empty vector fullcmd. + // such as, prefix=";,,;" + if (fullcmd.empty()) { + reply_command(op, -EINVAL, "command requires a prefix to be valid", 0); + return; + } + + std::string_view module = fullcmd[0]; + + // validate command is in leader map + + const MonCommand *leader_cmd; + const auto& mgr_cmds = mgrmon()->get_command_descs(); + const MonCommand *mgr_cmd = nullptr; + if (!mgr_cmds.empty()) { + mgr_cmd = _get_moncommand(prefix, mgr_cmds); + } + leader_cmd = _get_moncommand(prefix, leader_mon_commands); + if (!leader_cmd) { + leader_cmd = mgr_cmd; + if (!leader_cmd) { + reply_command(op, -EINVAL, "command not known", 0); + return; + } + } + // validate command is in our map & matches, or forward if it is allowed + const MonCommand *mon_cmd = _get_moncommand( + prefix, + get_local_commands(quorum_mon_features)); + if (!mon_cmd) { + mon_cmd = mgr_cmd; + } + if (!is_leader()) { + if (!mon_cmd) { + if (leader_cmd->is_noforward()) { + reply_command(op, -EINVAL, + "command not locally supported and not allowed to forward", + 0); + return; + } + dout(10) << "Command not locally supported, forwarding request " + << m << dendl; + forward_request_leader(op); + return; + } else if (!mon_cmd->is_compat(leader_cmd)) { + if (mon_cmd->is_noforward()) { + reply_command(op, -EINVAL, + "command not compatible with leader and not allowed to forward", + 0); + return; + } + dout(10) << "Command not compatible with leader, forwarding request " + << m << dendl; + forward_request_leader(op); + return; + } + } + + if (mon_cmd->is_obsolete() || + (cct->_conf->mon_debug_deprecated_as_obsolete + && mon_cmd->is_deprecated())) { + reply_command(op, -ENOTSUP, + "command is obsolete; please check usage and/or man page", + 0); + return; + } + + if (session->proxy_con && mon_cmd->is_noforward()) { + dout(10) << "Got forward for noforward command " << m << dendl; + reply_command(op, -EINVAL, "forward for noforward command", rdata, 0); + return; + } + + /* what we perceive as being 
the service the command falls under */
+  string service(mon_cmd->module);
+
+  dout(25) << __func__ << " prefix='" << prefix
+           << "' module='" << module
+           << "' service='" << service << "'" << dendl;
+
+  bool cmd_is_rw =
+    (mon_cmd->requires_perm('w') || mon_cmd->requires_perm('x'));
+
+  // validate user's permissions for requested command
+  map<string,string> param_str_map;
+
+  // Catch bad_cmd_get exception if _generate_command_map() throws it
+  try {
+    _generate_command_map(cmdmap, param_str_map);
+  }
+  catch(bad_cmd_get& e) {
+    reply_command(op, -EINVAL, e.what(), 0);
+    return;
+  }
+
+  if (!_allowed_command(session, service, prefix, cmdmap,
+                        param_str_map, mon_cmd)) {
+    dout(1) << __func__ << " access denied" << dendl;
+    if (prefix != "config set" && prefix != "config-key set")
+      (cmd_is_rw ? audit_clog->info() : audit_clog->debug())
+        << "from='" << session->name << " " << session->addrs << "' "
+        << "entity='" << session->entity_name << "' "
+        << "cmd=" << m->cmd << ": access denied";
+    reply_command(op, -EACCES, "access denied", 0);
+    return;
+  }
+
+  if (prefix != "config set" && prefix != "config-key set")
+    (cmd_is_rw ? audit_clog->info() : audit_clog->debug())
+      << "from='" << session->name << " " << session->addrs << "' "
+      << "entity='" << session->entity_name << "' "
+      << "cmd=" << m->cmd << ": dispatch";
+
+  // compat kludge for legacy clients trying to tell commands that are
+  // new. see bottom of MonCommands.h. we need to handle both (1)
+  // pre-octopus clients and (2) octopus clients with a mix of pre-octopus
+  // and octopus mons.
+  if ((!HAVE_FEATURE(m->get_connection()->get_features(), SERVER_OCTOPUS) ||
+       monmap->min_mon_release < ceph_release_t::octopus) &&
+      (prefix == "injectargs" ||
+       prefix == "smart" ||
+       prefix == "mon_status" ||
+       prefix == "heap")) {
+    if (m->get_connection()->get_messenger() == 0) {
+      // Prior to octopus, monitors might forward these messages
+      // around. that was broken at baseline, and if we try to process
+      // this message now, it will assert out when we try to send a
+      // message in reply from the asok/tell worker (see
+      // AnonConnection). Just reply with an error.
+ dout(5) << __func__ << " failing forwarded command from a (presumably) " + << "pre-octopus peer" << dendl; + reply_command( + op, -EBUSY, + "failing forwarded tell command in mixed-version mon cluster", 0); + return; + } + dout(5) << __func__ << " passing command to tell/asok" << dendl; + cct->get_admin_socket()->queue_tell_command(m); + return; + } + + if (mon_cmd->is_mgr()) { + const auto& hdr = m->get_header(); + uint64_t size = hdr.front_len + hdr.middle_len + hdr.data_len; + uint64_t max = g_conf().get_val<Option::size_t>("mon_client_bytes") + * g_conf().get_val<double>("mon_mgr_proxy_client_bytes_ratio"); + if (mgr_proxy_bytes + size > max) { + dout(10) << __func__ << " current mgr proxy bytes " << mgr_proxy_bytes + << " + " << size << " > max " << max << dendl; + reply_command(op, -EAGAIN, "hit limit on proxied mgr commands", rdata, 0); + return; + } + mgr_proxy_bytes += size; + dout(10) << __func__ << " proxying mgr command (+" << size + << " -> " << mgr_proxy_bytes << ")" << dendl; + C_MgrProxyCommand *fin = new C_MgrProxyCommand(this, op, size); + mgr_client.start_command(m->cmd, + m->get_data(), + &fin->outbl, + &fin->outs, + new C_OnFinisher(fin, &finisher)); + return; + } + + if ((module == "mds" || module == "fs") && + prefix != "fs authorize") { + mdsmon()->dispatch(op); + return; + } + if ((module == "osd" || + prefix == "pg map" || + prefix == "pg repeer") && + prefix != "osd last-stat-seq") { + osdmon()->dispatch(op); + return; + } + if (module == "config") { + configmon()->dispatch(op); + return; + } + + if (module == "mon" && + /* Let the Monitor class handle the following commands: + * 'mon scrub' + */ + prefix != "mon scrub" && + prefix != "mon metadata" && + prefix != "mon versions" && + prefix != "mon count-metadata" && + prefix != "mon ok-to-stop" && + prefix != "mon ok-to-add-offline" && + prefix != "mon ok-to-rm") { + monmon()->dispatch(op); + return; + } + if (module == "health" && prefix != "health") { + healthmon()->dispatch(op); + return; + } + if (module == "auth" || prefix == "fs authorize") { + authmon()->dispatch(op); + return; + } + if (module == "log") { + logmon()->dispatch(op); + return; + } + + if (module == "config-key") { + kvmon()->dispatch(op); + return; + } + + if (module == "mgr") { + mgrmon()->dispatch(op); + return; + } + + if (prefix == "fsid") { + if (f) { + f->open_object_section("fsid"); + f->dump_stream("fsid") << monmap->fsid; + f->close_section(); + f->flush(rdata); + } else { + ds << monmap->fsid; + rdata.append(ds); + } + reply_command(op, 0, "", rdata, 0); + return; + } + + if (prefix == "mon scrub") { + wait_for_paxos_write(); + if (is_leader()) { + int r = scrub_start(); + reply_command(op, r, "", rdata, 0); + } else if (is_peon()) { + forward_request_leader(op); + } else { + reply_command(op, -EAGAIN, "no quorum", rdata, 0); + } + return; + } + + if (prefix == "time-sync-status") { + if (!f) + f.reset(Formatter::create("json-pretty")); + f->open_object_section("time_sync"); + if (!timecheck_skews.empty()) { + f->open_object_section("time_skew_status"); + for (auto& i : timecheck_skews) { + double skew = i.second; + double latency = timecheck_latencies[i.first]; + string name = monmap->get_name(i.first); + ostringstream tcss; + health_status_t tcstatus = timecheck_status(tcss, skew, latency); + f->open_object_section(name.c_str()); + f->dump_float("skew", skew); + f->dump_float("latency", latency); + f->dump_stream("health") << tcstatus; + if (tcstatus != HEALTH_OK) { + f->dump_stream("details") << tcss.str(); + } + 
f->close_section(); + } + f->close_section(); + } + f->open_object_section("timechecks"); + f->dump_unsigned("epoch", get_epoch()); + f->dump_int("round", timecheck_round); + f->dump_stream("round_status") << ((timecheck_round%2) ? + "on-going" : "finished"); + f->close_section(); + f->close_section(); + f->flush(rdata); + r = 0; + rs = ""; + } else if (prefix == "status" || + prefix == "health" || + prefix == "df") { + string detail; + cmd_getval(cmdmap, "detail", detail); + + if (prefix == "status") { + // get_cluster_status handles f == NULL + get_cluster_status(ds, f.get(), session); + + if (f) { + f->flush(ds); + ds << '\n'; + } + rdata.append(ds); + } else if (prefix == "health") { + string plain; + healthmon()->get_health_status(detail == "detail", f.get(), f ? nullptr : &plain); + if (f) { + f->flush(rdata); + } else { + rdata.append(plain); + } + } else if (prefix == "df") { + bool verbose = (detail == "detail"); + if (f) + f->open_object_section("stats"); + + mgrstatmon()->dump_cluster_stats(&ds, f.get(), verbose); + if (!f) { + ds << "\n \n"; + } + mgrstatmon()->dump_pool_stats(osdmon()->osdmap, &ds, f.get(), verbose); + + if (f) { + f->close_section(); + f->flush(ds); + ds << '\n'; + } + } else { + ceph_abort_msg("We should never get here!"); + return; + } + rdata.append(ds); + rs = ""; + r = 0; + } else if (prefix == "report") { + + // this must be formatted, in its current form + if (!f) + f.reset(Formatter::create("json-pretty")); + f->open_object_section("report"); + f->dump_stream("cluster_fingerprint") << fingerprint; + f->dump_string("version", ceph_version_to_str()); + f->dump_string("commit", git_version_to_str()); + f->dump_stream("timestamp") << ceph_clock_now(); + + vector<string> tagsvec; + cmd_getval(cmdmap, "tags", tagsvec); + string tagstr = str_join(tagsvec, " "); + if (!tagstr.empty()) + tagstr = tagstr.substr(0, tagstr.find_last_of(' ')); + f->dump_string("tag", tagstr); + + healthmon()->get_health_status(true, f.get(), nullptr); + + monmon()->dump_info(f.get()); + osdmon()->dump_info(f.get()); + mdsmon()->dump_info(f.get()); + authmon()->dump_info(f.get()); + mgrstatmon()->dump_info(f.get()); + + paxos->dump_info(f.get()); + + f->close_section(); + f->flush(rdata); + + ostringstream ss2; + ss2 << "report " << rdata.crc32c(CEPH_MON_PORT_LEGACY); + rs = ss2.str(); + r = 0; + } else if (prefix == "osd last-stat-seq") { + int64_t osd = 0; + cmd_getval(cmdmap, "id", osd); + uint64_t seq = mgrstatmon()->get_last_osd_stat_seq(osd); + if (f) { + f->dump_unsigned("seq", seq); + f->flush(ds); + } else { + ds << seq; + rdata.append(ds); + } + rs = ""; + r = 0; + } else if (prefix == "node ls") { + string node_type("all"); + cmd_getval(cmdmap, "type", node_type); + if (!f) + f.reset(Formatter::create("json-pretty")); + if (node_type == "all") { + f->open_object_section("nodes"); + print_nodes(f.get(), ds); + osdmon()->print_nodes(f.get()); + mdsmon()->print_nodes(f.get()); + mgrmon()->print_nodes(f.get()); + f->close_section(); + } else if (node_type == "mon") { + print_nodes(f.get(), ds); + } else if (node_type == "osd") { + osdmon()->print_nodes(f.get()); + } else if (node_type == "mds") { + mdsmon()->print_nodes(f.get()); + } else if (node_type == "mgr") { + mgrmon()->print_nodes(f.get()); + } + f->flush(ds); + rdata.append(ds); + rs = ""; + r = 0; + } else if (prefix == "features") { + if (!is_leader() && !is_peon()) { + dout(10) << " waiting for quorum" << dendl; + waitfor_quorum.push_back(new C_RetryMessage(this, op)); + return; + } + if (!is_leader()) { + 
forward_request_leader(op); + return; + } + if (!f) + f.reset(Formatter::create("json-pretty")); + FeatureMap fm; + get_combined_feature_map(&fm); + f->dump_object("features", fm); + f->flush(rdata); + rs = ""; + r = 0; + } else if (prefix == "mon metadata") { + if (!f) + f.reset(Formatter::create("json-pretty")); + + string name; + bool all = !cmd_getval(cmdmap, "id", name); + if (!all) { + // Dump a single mon's metadata + int mon = monmap->get_rank(name); + if (mon < 0) { + rs = "requested mon not found"; + r = -ENOENT; + goto out; + } + f->open_object_section("mon_metadata"); + r = get_mon_metadata(mon, f.get(), ds); + f->close_section(); + } else { + // Dump all mons' metadata + r = 0; + f->open_array_section("mon_metadata"); + for (unsigned int rank = 0; rank < monmap->size(); ++rank) { + std::ostringstream get_err; + f->open_object_section("mon"); + f->dump_string("name", monmap->get_name(rank)); + r = get_mon_metadata(rank, f.get(), get_err); + f->close_section(); + if (r == -ENOENT || r == -EINVAL) { + dout(1) << get_err.str() << dendl; + // Drop error, list what metadata we do have + r = 0; + } else if (r != 0) { + derr << "Unexpected error from get_mon_metadata: " + << cpp_strerror(r) << dendl; + ds << get_err.str(); + break; + } + } + f->close_section(); + } + + f->flush(ds); + rdata.append(ds); + rs = ""; + } else if (prefix == "mon versions") { + if (!f) + f.reset(Formatter::create("json-pretty")); + count_metadata("ceph_version", f.get()); + f->flush(ds); + rdata.append(ds); + rs = ""; + r = 0; + } else if (prefix == "mon count-metadata") { + if (!f) + f.reset(Formatter::create("json-pretty")); + string field; + cmd_getval(cmdmap, "property", field); + count_metadata(field, f.get()); + f->flush(ds); + rdata.append(ds); + rs = ""; + r = 0; + } else if (prefix == "quorum_status") { + // make sure our map is readable and up to date + if (!is_leader() && !is_peon()) { + dout(10) << " waiting for quorum" << dendl; + waitfor_quorum.push_back(new C_RetryMessage(this, op)); + return; + } + _quorum_status(f.get(), ds); + rdata.append(ds); + rs = ""; + r = 0; + } else if (prefix == "mon ok-to-stop") { + vector<string> ids; + if (!cmd_getval(cmdmap, "ids", ids)) { + r = -EINVAL; + goto out; + } + set<string> wouldbe; + for (auto rank : quorum) { + wouldbe.insert(monmap->get_name(rank)); + } + for (auto& n : ids) { + if (monmap->contains(n)) { + wouldbe.erase(n); + } + } + if (wouldbe.size() < monmap->min_quorum_size()) { + r = -EBUSY; + rs = "not enough monitors would be available (" + stringify(wouldbe) + + ") after stopping mons " + stringify(ids); + goto out; + } + r = 0; + rs = "quorum should be preserved (" + stringify(wouldbe) + + ") after stopping " + stringify(ids); + } else if (prefix == "mon ok-to-add-offline") { + if (quorum.size() < monmap->min_quorum_size(monmap->size() + 1)) { + rs = "adding a monitor may break quorum (until that monitor starts)"; + r = -EBUSY; + goto out; + } + rs = "adding another mon that is not yet online will not break quorum"; + r = 0; + } else if (prefix == "mon ok-to-rm") { + string id; + if (!cmd_getval(cmdmap, "id", id)) { + r = -EINVAL; + rs = "must specify a monitor id"; + goto out; + } + if (!monmap->contains(id)) { + r = 0; + rs = "mon." + id + " does not exist"; + goto out; + } + int rank = monmap->get_rank(id); + if (quorum.count(rank) && + quorum.size() - 1 < monmap->min_quorum_size(monmap->size() - 1)) { + r = -EBUSY; + rs = "removing mon." + id + " would break quorum"; + goto out; + } + r = 0; + rs = "safe to remove mon." 
+ id; + } else if (prefix == "version") { + if (f) { + f->open_object_section("version"); + f->dump_string("version", pretty_version_to_str()); + f->close_section(); + f->flush(ds); + } else { + ds << pretty_version_to_str(); + } + rdata.append(ds); + rs = ""; + r = 0; + } else if (prefix == "versions") { + if (!f) + f.reset(Formatter::create("json-pretty")); + map<string,int> overall; + f->open_object_section("version"); + map<string,int> mon, mgr, osd, mds; + + count_metadata("ceph_version", &mon); + f->open_object_section("mon"); + for (auto& p : mon) { + f->dump_int(p.first.c_str(), p.second); + overall[p.first] += p.second; + } + f->close_section(); + + mgrmon()->count_metadata("ceph_version", &mgr); + f->open_object_section("mgr"); + for (auto& p : mgr) { + f->dump_int(p.first.c_str(), p.second); + overall[p.first] += p.second; + } + f->close_section(); + + osdmon()->count_metadata("ceph_version", &osd); + f->open_object_section("osd"); + for (auto& p : osd) { + f->dump_int(p.first.c_str(), p.second); + overall[p.first] += p.second; + } + f->close_section(); + + mdsmon()->count_metadata("ceph_version", &mds); + f->open_object_section("mds"); + for (auto& p : mds) { + f->dump_int(p.first.c_str(), p.second); + overall[p.first] += p.second; + } + f->close_section(); + + for (auto& p : mgrstatmon()->get_service_map().services) { + auto &service = p.first; + if (ServiceMap::is_normal_ceph_entity(service)) { + continue; + } + f->open_object_section(service.c_str()); + map<string,int> m; + p.second.count_metadata("ceph_version", &m); + for (auto& q : m) { + f->dump_int(q.first.c_str(), q.second); + overall[q.first] += q.second; + } + f->close_section(); + } + + f->open_object_section("overall"); + for (auto& p : overall) { + f->dump_int(p.first.c_str(), p.second); + } + f->close_section(); + f->close_section(); + f->flush(rdata); + rs = ""; + r = 0; + } + + out: + if (!m->get_source().is_mon()) // don't reply to mon->mon commands + reply_command(op, r, rs, rdata, 0); +} + +void Monitor::reply_command(MonOpRequestRef op, int rc, const string &rs, version_t version) +{ + bufferlist rdata; + reply_command(op, rc, rs, rdata, version); +} + +void Monitor::reply_command(MonOpRequestRef op, int rc, const string &rs, + bufferlist& rdata, version_t version) +{ + auto m = op->get_req<MMonCommand>(); + ceph_assert(m->get_type() == MSG_MON_COMMAND); + MMonCommandAck *reply = new MMonCommandAck(m->cmd, rc, rs, version); + reply->set_tid(m->get_tid()); + reply->set_data(rdata); + send_reply(op, reply); +} + +void Monitor::reply_tell_command( + MonOpRequestRef op, int rc, const string &rs) +{ + MCommand *m = static_cast<MCommand*>(op->get_req()); + ceph_assert(m->get_type() == MSG_COMMAND); + MCommandReply *reply = new MCommandReply(rc, rs); + reply->set_tid(m->get_tid()); + m->get_connection()->send_message(reply); +} + + +// ------------------------ +// request/reply routing +// +// a client/mds/osd will connect to a random monitor. we need to forward any +// messages requiring state updates to the leader, and then route any replies +// back via the correct monitor and back to them. (the monitor will not +// initiate any connections.) 
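+//
+// the round trip, roughly (a sketch of the code below, not additional
+// protocol; message type names are the ones used here):
+//
+//   client ---> peon:   original PaxosServiceMessage
+//   peon ---> leader:   MForward{tid, re-encoded message, entity_name, caps},
+//                       remembered locally in routed_requests[tid]
+//   leader ---> peon:   MRoute{tid, reply}
+//   peon ---> client:   reply, sent on the connection saved in RoutedRequest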
+
+void Monitor::forward_request_leader(MonOpRequestRef op)
+{
+  op->mark_event(__func__);
+
+  int mon = get_leader();
+  MonSession *session = op->get_session();
+  PaxosServiceMessage *req = op->get_req<PaxosServiceMessage>();
+
+  if (req->get_source().is_mon() && req->get_source_addrs() != messenger->get_myaddrs()) {
+    dout(10) << "forward_request won't forward (non-local) mon request " << *req << dendl;
+  } else if (session->proxy_con) {
+    dout(10) << "forward_request won't double fwd request " << *req << dendl;
+  } else if (!session->closed) {
+    RoutedRequest *rr = new RoutedRequest;
+    rr->tid = ++routed_request_tid;
+    rr->con = req->get_connection();
+    rr->con_features = rr->con->get_features();
+    encode_message(req, CEPH_FEATURES_ALL, rr->request_bl); // for my use only; use all features
+    rr->session = static_cast<MonSession *>(session->get());
+    rr->op = op;
+    routed_requests[rr->tid] = rr;
+    session->routed_request_tids.insert(rr->tid);
+
+    dout(10) << "forward_request " << rr->tid << " request " << *req
+             << " features " << rr->con_features << dendl;
+
+    MForward *forward = new MForward(rr->tid,
+                                     req,
+                                     rr->con_features,
+                                     rr->session->caps);
+    forward->set_priority(req->get_priority());
+    if (session->auth_handler) {
+      forward->entity_name = session->entity_name;
+    } else if (req->get_source().is_mon()) {
+      forward->entity_name.set_type(CEPH_ENTITY_TYPE_MON);
+    }
+    send_mon_message(forward, mon);
+    op->mark_forwarded();
+    ceph_assert(op->get_req()->get_type() != 0);
+  } else {
+    dout(10) << "forward_request no session for request " << *req << dendl;
+  }
+}
+
+// fake connection attached to forwarded messages
+struct AnonConnection : public Connection {
+  entity_addr_t socket_addr;
+
+  int send_message(Message *m) override {
+    ceph_assert(!"send_message on anonymous connection");
+  }
+  void send_keepalive() override {
+    ceph_assert(!"send_keepalive on anonymous connection");
+  }
+  void mark_down() override {
+    // silently ignore
+  }
+  void mark_disposable() override {
+    // silently ignore
+  }
+  bool is_connected() override { return false; }
+  entity_addr_t get_peer_socket_addr() const override {
+    return socket_addr;
+  }
+
+private:
+  FRIEND_MAKE_REF(AnonConnection);
+  explicit AnonConnection(CephContext *cct, const entity_addr_t& sa)
+    : Connection(cct, nullptr),
+      socket_addr(sa) {}
+};
+
+//extract the original message and put it into the regular dispatch function
+void Monitor::handle_forward(MonOpRequestRef op)
+{
+  auto m = op->get_req<MForward>();
+  dout(10) << "received forwarded message from "
+           << ceph_entity_type_name(m->client_type)
+           << " " << m->client_addrs
+           << " via " << m->get_source_inst() << dendl;
+  MonSession *session = op->get_session();
+  ceph_assert(session);
+
+  if (!session->is_capable("mon", MON_CAP_X)) {
+    dout(0) << "forward from entity with insufficient caps! 
" + << session->caps << dendl; + } else { + // see PaxosService::dispatch(); we rely on this being anon + // (c->msgr == NULL) + PaxosServiceMessage *req = m->claim_message(); + ceph_assert(req != NULL); + + auto c = ceph::make_ref<AnonConnection>(cct, m->client_socket_addr); + MonSession *s = new MonSession(static_cast<Connection*>(c.get())); + s->_ident(req->get_source(), + req->get_source_addrs()); + c->set_priv(RefCountedPtr{s, false}); + c->set_peer_addrs(m->client_addrs); + c->set_peer_type(m->client_type); + c->set_features(m->con_features); + + s->authenticated = true; + s->caps = m->client_caps; + dout(10) << " caps are " << s->caps << dendl; + s->entity_name = m->entity_name; + dout(10) << " entity name '" << s->entity_name << "' type " + << s->entity_name.get_type() << dendl; + s->proxy_con = m->get_connection(); + s->proxy_tid = m->tid; + + req->set_connection(c); + + // not super accurate, but better than nothing. + req->set_recv_stamp(m->get_recv_stamp()); + + /* + * note which election epoch this is; we will drop the message if + * there is a future election since our peers will resend routed + * requests in that case. + */ + req->rx_election_epoch = get_epoch(); + + dout(10) << " mesg " << req << " from " << m->get_source_addr() << dendl; + _ms_dispatch(req); + + // break the session <-> con ref loop by removing the con->session + // reference, which is no longer needed once the MonOpRequest is + // set up. + c->set_priv(NULL); + } +} + +void Monitor::send_reply(MonOpRequestRef op, Message *reply) +{ + op->mark_event(__func__); + + MonSession *session = op->get_session(); + ceph_assert(session); + Message *req = op->get_req(); + ConnectionRef con = op->get_connection(); + + reply->set_cct(g_ceph_context); + dout(2) << __func__ << " " << op << " " << reply << " " << *reply << dendl; + + if (!con) { + dout(2) << "send_reply no connection, dropping reply " << *reply + << " to " << req << " " << *req << dendl; + reply->put(); + op->mark_event("reply: no connection"); + return; + } + + if (!session->con && !session->proxy_con) { + dout(2) << "send_reply no connection, dropping reply " << *reply + << " to " << req << " " << *req << dendl; + reply->put(); + op->mark_event("reply: no connection"); + return; + } + + if (session->proxy_con) { + dout(15) << "send_reply routing reply to " << con->get_peer_addr() + << " via " << session->proxy_con->get_peer_addr() + << " for request " << *req << dendl; + session->proxy_con->send_message(new MRoute(session->proxy_tid, reply)); + op->mark_event("reply: send routed request"); + } else { + session->con->send_message(reply); + op->mark_event("reply: send"); + } +} + +void Monitor::no_reply(MonOpRequestRef op) +{ + MonSession *session = op->get_session(); + Message *req = op->get_req(); + + if (session->proxy_con) { + dout(10) << "no_reply to " << req->get_source_inst() + << " via " << session->proxy_con->get_peer_addr() + << " for request " << *req << dendl; + session->proxy_con->send_message(new MRoute(session->proxy_tid, NULL)); + op->mark_event("no_reply: send routed request"); + } else { + dout(10) << "no_reply to " << req->get_source_inst() + << " " << *req << dendl; + op->mark_event("no_reply"); + } +} + +void Monitor::handle_route(MonOpRequestRef op) +{ + auto m = op->get_req<MRoute>(); + MonSession *session = op->get_session(); + //check privileges + if (!session->is_capable("mon", MON_CAP_X)) { + dout(0) << "MRoute received from entity without appropriate perms! 
" + << dendl; + return; + } + if (m->msg) + dout(10) << "handle_route tid " << m->session_mon_tid << " " << *m->msg + << dendl; + else + dout(10) << "handle_route tid " << m->session_mon_tid << " null" << dendl; + + // look it up + if (!m->session_mon_tid) { + dout(10) << " not a routed request, ignoring" << dendl; + return; + } + auto found = routed_requests.find(m->session_mon_tid); + if (found == routed_requests.end()) { + dout(10) << " don't have routed request tid " << m->session_mon_tid << dendl; + return; + } + std::unique_ptr<RoutedRequest> rr{found->second}; + // reset payload, in case encoding is dependent on target features + if (m->msg) { + m->msg->clear_payload(); + rr->con->send_message(m->msg); + m->msg = NULL; + } + if (m->send_osdmap_first) { + dout(10) << " sending osdmaps from " << m->send_osdmap_first << dendl; + osdmon()->send_incremental(m->send_osdmap_first, rr->session, + true, MonOpRequestRef()); + } + ceph_assert(rr->tid == m->session_mon_tid && rr->session->routed_request_tids.count(m->session_mon_tid)); + routed_requests.erase(found); + rr->session->routed_request_tids.erase(m->session_mon_tid); +} + +void Monitor::resend_routed_requests() +{ + dout(10) << "resend_routed_requests" << dendl; + int mon = get_leader(); + list<Context*> retry; + for (map<uint64_t, RoutedRequest*>::iterator p = routed_requests.begin(); + p != routed_requests.end(); + ++p) { + RoutedRequest *rr = p->second; + + if (mon == rank) { + dout(10) << " requeue for self tid " << rr->tid << dendl; + rr->op->mark_event("retry routed request"); + retry.push_back(new C_RetryMessage(this, rr->op)); + if (rr->session) { + ceph_assert(rr->session->routed_request_tids.count(p->first)); + rr->session->routed_request_tids.erase(p->first); + } + delete rr; + } else { + auto q = rr->request_bl.cbegin(); + PaxosServiceMessage *req = + (PaxosServiceMessage *)decode_message(cct, 0, q); + rr->op->mark_event("resend forwarded message to leader"); + dout(10) << " resend to mon." << mon << " tid " << rr->tid << " " << *req + << dendl; + MForward *forward = new MForward(rr->tid, + req, + rr->con_features, + rr->session->caps); + req->put(); // forward takes its own ref; drop ours. 
+ forward->client_type = rr->con->get_peer_type(); + forward->client_addrs = rr->con->get_peer_addrs(); + forward->client_socket_addr = rr->con->get_peer_socket_addr(); + forward->set_priority(req->get_priority()); + send_mon_message(forward, mon); + } + } + if (mon == rank) { + routed_requests.clear(); + finish_contexts(g_ceph_context, retry); + } +} + +void Monitor::remove_session(MonSession *s) +{ + dout(10) << "remove_session " << s << " " << s->name << " " << s->addrs + << " features 0x" << std::hex << s->con_features << std::dec << dendl; + ceph_assert(s->con); + ceph_assert(!s->closed); + for (set<uint64_t>::iterator p = s->routed_request_tids.begin(); + p != s->routed_request_tids.end(); + ++p) { + ceph_assert(routed_requests.count(*p)); + RoutedRequest *rr = routed_requests[*p]; + dout(10) << " dropping routed request " << rr->tid << dendl; + delete rr; + routed_requests.erase(*p); + } + s->routed_request_tids.clear(); + s->con->set_priv(nullptr); + session_map.remove_session(s); + logger->set(l_mon_num_sessions, session_map.get_size()); + logger->inc(l_mon_session_rm); +} + +void Monitor::remove_all_sessions() +{ + std::lock_guard l(session_map_lock); + while (!session_map.sessions.empty()) { + MonSession *s = session_map.sessions.front(); + remove_session(s); + logger->inc(l_mon_session_rm); + } + if (logger) + logger->set(l_mon_num_sessions, session_map.get_size()); +} + +void Monitor::send_mon_message(Message *m, int rank) +{ + messenger->send_to_mon(m, monmap->get_addrs(rank)); +} + +void Monitor::waitlist_or_zap_client(MonOpRequestRef op) +{ + /** + * Wait list the new session until we're in the quorum, assuming it's + * sufficiently new. + * tick() will periodically send them back through so we can send + * the client elsewhere if we don't think we're getting back in. + * + * But we allow a few sorts of messages: + * 1) Monitors can talk to us at any time, of course. + * 2) auth messages. It's unlikely to go through much faster, but + * it's possible we've just lost our quorum status and we want to take... + * 3) command messages. We want to accept these under all possible + * circumstances. + */ + Message *m = op->get_req(); + MonSession *s = op->get_session(); + ConnectionRef con = op->get_connection(); + utime_t too_old = ceph_clock_now(); + too_old -= g_ceph_context->_conf->mon_lease; + if (m->get_recv_stamp() > too_old && + con->is_connected()) { + dout(5) << "waitlisting message " << *m << dendl; + maybe_wait_for_quorum.push_back(new C_RetryMessage(this, op)); + op->mark_wait_for_quorum(); + } else { + dout(5) << "discarding message " << *m << " and sending client elsewhere" << dendl; + con->mark_down(); + // proxied sessions aren't registered and don't have a con; don't remove + // those. 
+ if (!s->proxy_con) { + std::lock_guard l(session_map_lock); + remove_session(s); + } + op->mark_zap(); + } +} + +void Monitor::_ms_dispatch(Message *m) +{ + if (is_shutdown()) { + m->put(); + return; + } + + MonOpRequestRef op = op_tracker.create_request<MonOpRequest>(m); + bool src_is_mon = op->is_src_mon(); + op->mark_event("mon:_ms_dispatch"); + MonSession *s = op->get_session(); + if (s && s->closed) { + return; + } + + if (src_is_mon && s) { + ConnectionRef con = m->get_connection(); + if (con->get_messenger() && con->get_features() != s->con_features) { + // only update features if this is a non-anonymous connection + dout(10) << __func__ << " feature change for " << m->get_source_inst() + << " (was " << s->con_features + << ", now " << con->get_features() << ")" << dendl; + // connection features changed - recreate session. + if (s->con && s->con != con) { + dout(10) << __func__ << " connection for " << m->get_source_inst() + << " changed from session; mark down and replace" << dendl; + s->con->mark_down(); + } + if (s->item.is_on_list()) { + // forwarded messages' sessions are not in the sessions map and + // exist only while the op is being handled. + std::lock_guard l(session_map_lock); + remove_session(s); + } + s = nullptr; + } + } + + if (!s) { + // if the sender is not a monitor, make sure their first message for a + // session is an MAuth. If it is not, assume it's a stray message, + // and considering that we are creating a new session it is safe to + // assume that the sender hasn't authenticated yet, so we have no way + // of assessing whether we should handle it or not. + if (!src_is_mon && (m->get_type() != CEPH_MSG_AUTH && + m->get_type() != CEPH_MSG_MON_GET_MAP && + m->get_type() != CEPH_MSG_PING)) { + dout(1) << __func__ << " dropping stray message " << *m + << " from " << m->get_source_inst() << dendl; + return; + } + + ConnectionRef con = m->get_connection(); + { + std::lock_guard l(session_map_lock); + s = session_map.new_session(m->get_source(), + m->get_source_addrs(), + con.get()); + } + ceph_assert(s); + con->set_priv(RefCountedPtr{s, false}); + dout(10) << __func__ << " new session " << s << " " << *s + << " features 0x" << std::hex + << s->con_features << std::dec << dendl; + op->set_session(s); + + logger->set(l_mon_num_sessions, session_map.get_size()); + logger->inc(l_mon_session_add); + + if (src_is_mon) { + // give it monitor caps; the peer type has been authenticated + dout(5) << __func__ << " setting monitor caps on this connection" << dendl; + if (!s->caps.is_allow_all()) // but no need to repeatedly copy + s->caps = mon_caps; + s->authenticated = true; + } + } else { + dout(20) << __func__ << " existing session " << s << " for " << s->name + << dendl; + } + + ceph_assert(s); + + s->session_timeout = ceph_clock_now(); + s->session_timeout += g_conf()->mon_session_timeout; + + if (s->auth_handler) { + s->entity_name = s->auth_handler->get_entity_name(); + s->global_id = s->auth_handler->get_global_id(); + s->global_id_status = s->auth_handler->get_global_id_status(); + } + dout(20) << " entity_name " << s->entity_name + << " global_id " << s->global_id + << " (" << s->global_id_status + << ") caps " << s->caps.get_str() << dendl; + + if (!session_stretch_allowed(s, op)) { + return; + } + if ((is_synchronizing() || + (!s->authenticated && !exited_quorum.is_zero())) && + !src_is_mon && + m->get_type() != CEPH_MSG_PING) { + waitlist_or_zap_client(op); + } else { + dispatch_op(op); + } + return; +} + +void Monitor::dispatch_op(MonOpRequestRef op) +{ + 
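+  // Rough dispatch order, summarizing the code below: first messages that
+  // need no caps (auth, ping, tell commands); then, once the session is
+  // authenticated, monitor-level messages (monmap, config, subscribe);
+  // then per-service messages (OSD, MDS, mgr, log, ...); and finally
+  // messages that only another monitor is allowed to send (probe, sync,
+  // paxos, election).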
op->mark_event("mon:dispatch_op"); + MonSession *s = op->get_session(); + ceph_assert(s); + if (s->closed) { + dout(10) << " session closed, dropping " << op->get_req() << dendl; + return; + } + + /* we will consider the default type as being 'monitor' until proven wrong */ + op->set_type_monitor(); + /* deal with all messages that do not necessarily need caps */ + switch (op->get_req()->get_type()) { + // auth + case MSG_MON_GLOBAL_ID: + case CEPH_MSG_AUTH: + op->set_type_service(); + /* no need to check caps here */ + paxos_service[PAXOS_AUTH]->dispatch(op); + return; + + case CEPH_MSG_PING: + handle_ping(op); + return; + case MSG_COMMAND: + op->set_type_command(); + handle_tell_command(op); + return; + } + + if (!op->get_session()->authenticated) { + dout(5) << __func__ << " " << op->get_req()->get_source_inst() + << " is not authenticated, dropping " << *(op->get_req()) + << dendl; + return; + } + + // global_id_status == NONE: all sessions for auth_none and krb, + // mon <-> mon sessions (including proxied sessions) for cephx + ceph_assert(s->global_id_status == global_id_status_t::NONE || + s->global_id_status == global_id_status_t::NEW_OK || + s->global_id_status == global_id_status_t::NEW_NOT_EXPOSED || + s->global_id_status == global_id_status_t::RECLAIM_OK || + s->global_id_status == global_id_status_t::RECLAIM_INSECURE); + + // let mon_getmap through for "ping" (which doesn't reconnect) + // and "tell" (which reconnects but doesn't attempt to preserve + // its global_id and stays in NEW_NOT_EXPOSED, retrying until + // ->send_attempts reaches 0) + if (cct->_conf->auth_expose_insecure_global_id_reclaim && + s->global_id_status == global_id_status_t::NEW_NOT_EXPOSED && + op->get_req()->get_type() != CEPH_MSG_MON_GET_MAP) { + dout(5) << __func__ << " " << op->get_req()->get_source_inst() + << " may omit old_ticket on reconnects, discarding " + << *op->get_req() << " and forcing reconnect" << dendl; + ceph_assert(s->con && !s->proxy_con); + s->con->mark_down(); + { + std::lock_guard l(session_map_lock); + remove_session(s); + } + op->mark_zap(); + return; + } + + switch (op->get_req()->get_type()) { + case CEPH_MSG_MON_GET_MAP: + handle_mon_get_map(op); + return; + + case MSG_GET_CONFIG: + configmon()->handle_get_config(op); + return; + + case CEPH_MSG_MON_SUBSCRIBE: + /* FIXME: check what's being subscribed, filter accordingly */ + handle_subscribe(op); + return; + } + + /* well, maybe the op belongs to a service... 
*/ + op->set_type_service(); + /* deal with all messages which caps should be checked somewhere else */ + switch (op->get_req()->get_type()) { + + // OSDs + case CEPH_MSG_MON_GET_OSDMAP: + case CEPH_MSG_POOLOP: + case MSG_OSD_BEACON: + case MSG_OSD_MARK_ME_DOWN: + case MSG_OSD_MARK_ME_DEAD: + case MSG_OSD_FULL: + case MSG_OSD_FAILURE: + case MSG_OSD_BOOT: + case MSG_OSD_ALIVE: + case MSG_OSD_PGTEMP: + case MSG_OSD_PG_CREATED: + case MSG_REMOVE_SNAPS: + case MSG_MON_GET_PURGED_SNAPS: + case MSG_OSD_PG_READY_TO_MERGE: + paxos_service[PAXOS_OSDMAP]->dispatch(op); + return; + + // MDSs + case MSG_MDS_BEACON: + case MSG_MDS_OFFLOAD_TARGETS: + paxos_service[PAXOS_MDSMAP]->dispatch(op); + return; + + // Mgrs + case MSG_MGR_BEACON: + paxos_service[PAXOS_MGR]->dispatch(op); + return; + + // MgrStat + case MSG_MON_MGR_REPORT: + case CEPH_MSG_STATFS: + case MSG_GETPOOLSTATS: + paxos_service[PAXOS_MGRSTAT]->dispatch(op); + return; + + // log + case MSG_LOG: + paxos_service[PAXOS_LOG]->dispatch(op); + return; + + // handle_command() does its own caps checking + case MSG_MON_COMMAND: + op->set_type_command(); + handle_command(op); + return; + } + + /* nop, looks like it's not a service message; revert back to monitor */ + op->set_type_monitor(); + + /* messages we, the Monitor class, need to deal with + * but may be sent by clients. */ + + if (!op->get_session()->is_capable("mon", MON_CAP_R)) { + dout(5) << __func__ << " " << op->get_req()->get_source_inst() + << " not enough caps for " << *(op->get_req()) << " -- dropping" + << dendl; + return; + } + + switch (op->get_req()->get_type()) { + // misc + case CEPH_MSG_MON_GET_VERSION: + handle_get_version(op); + return; + } + + if (!op->is_src_mon()) { + dout(1) << __func__ << " unexpected monitor message from" + << " non-monitor entity " << op->get_req()->get_source_inst() + << " " << *(op->get_req()) << " -- dropping" << dendl; + return; + } + + /* messages that should only be sent by another monitor */ + switch (op->get_req()->get_type()) { + + case MSG_ROUTE: + handle_route(op); + return; + + case MSG_MON_PROBE: + handle_probe(op); + return; + + // Sync (i.e., the new slurp, but on steroids) + case MSG_MON_SYNC: + handle_sync(op); + return; + case MSG_MON_SCRUB: + handle_scrub(op); + return; + + /* log acks are sent from a monitor we sent the MLog to, and are + never sent by clients to us. */ + case MSG_LOGACK: + log_client.handle_log_ack((MLogAck*)op->get_req()); + return; + + // monmap + case MSG_MON_JOIN: + op->set_type_service(); + paxos_service[PAXOS_MONMAP]->dispatch(op); + return; + + // paxos + case MSG_MON_PAXOS: + { + op->set_type_paxos(); + auto pm = op->get_req<MMonPaxos>(); + if (!op->get_session()->is_capable("mon", MON_CAP_X)) { + //can't send these! + return; + } + + if (state == STATE_SYNCHRONIZING) { + // we are synchronizing. These messages would do us no + // good, thus just drop them and ignore them. + dout(10) << __func__ << " ignore paxos msg from " + << pm->get_source_inst() << dendl; + return; + } + + // sanitize + if (pm->epoch > get_epoch()) { + bootstrap(); + return; + } + if (pm->epoch != get_epoch()) { + return; + } + + paxos->dispatch(op); + } + return; + + // elector messages + case MSG_MON_ELECTION: + op->set_type_election_or_ping(); + //check privileges here for simplicity + if (!op->get_session()->is_capable("mon", MON_CAP_X)) { + dout(0) << "MMonElection received from entity without enough caps!" 
+            << op->get_session()->caps << dendl;
+      return;
+    }
+    if (!is_probing() && !is_synchronizing()) {
+      elector.dispatch(op);
+    }
+    return;
+
+  case MSG_MON_PING:
+    op->set_type_election_or_ping();
+    elector.dispatch(op);
+    return;
+
+  case MSG_FORWARD:
+    handle_forward(op);
+    return;
+
+  case MSG_TIMECHECK:
+    dout(5) << __func__ << " ignoring " << op << dendl;
+    return;
+  case MSG_TIMECHECK2:
+    handle_timecheck(op);
+    return;
+
+  case MSG_MON_HEALTH:
+    dout(5) << __func__ << " dropping deprecated message: "
+            << *op->get_req() << dendl;
+    break;
+  case MSG_MON_HEALTH_CHECKS:
+    op->set_type_service();
+    paxos_service[PAXOS_HEALTH]->dispatch(op);
+    return;
+  }
+  dout(1) << "dropping unexpected " << *(op->get_req()) << dendl;
+  return;
+}
+
+void Monitor::handle_ping(MonOpRequestRef op)
+{
+  auto m = op->get_req<MPing>();
+  dout(10) << __func__ << " " << *m << dendl;
+  MPing *reply = new MPing;
+  bufferlist payload;
+  boost::scoped_ptr<Formatter> f(new JSONFormatter(true));
+  f->open_object_section("pong");
+
+  healthmon()->get_health_status(false, f.get(), nullptr);
+  get_mon_status(f.get());
+
+  f->close_section();
+  stringstream ss;
+  f->flush(ss);
+  encode(ss.str(), payload);
+  reply->set_payload(payload);
+  dout(10) << __func__ << " reply payload len " << reply->get_payload().length() << dendl;
+  m->get_connection()->send_message(reply);
+}
+
+void Monitor::timecheck_start()
+{
+  dout(10) << __func__ << dendl;
+  timecheck_cleanup();
+  if (get_quorum_mon_features().contains_all(
+        ceph::features::mon::FEATURE_NAUTILUS)) {
+    timecheck_start_round();
+  }
+}
+
+void Monitor::timecheck_finish()
+{
+  dout(10) << __func__ << dendl;
+  timecheck_cleanup();
+}
+
+void Monitor::timecheck_start_round()
+{
+  dout(10) << __func__ << " curr " << timecheck_round << dendl;
+  ceph_assert(is_leader());
+
+  if (monmap->size() == 1) {
+    ceph_abort_msg("We are alone; this shouldn't have been scheduled!");
+    return;
+  }
+
+  if (timecheck_round % 2) {
+    dout(10) << __func__ << " there's a timecheck going on" << dendl;
+    utime_t curr_time = ceph_clock_now();
+    double max = g_conf()->mon_timecheck_interval*3;
+    if (curr_time - timecheck_round_start < max) {
+      dout(10) << __func__ << " keep current round going" << dendl;
+      goto out;
+    } else {
+      dout(10) << __func__
+               << " finish current timecheck and start new" << dendl;
+      timecheck_cancel_round();
+    }
+  }
+
+  ceph_assert(timecheck_round % 2 == 0);
+  timecheck_acks = 0;
+  timecheck_round ++;
+  timecheck_round_start = ceph_clock_now();
+  dout(10) << __func__ << " new " << timecheck_round << dendl;
+
+  timecheck();
+out:
+  dout(10) << __func__ << " setting up next event" << dendl;
+  timecheck_reset_event();
+}
+
+void Monitor::timecheck_finish_round(bool success)
+{
+  dout(10) << __func__ << " curr " << timecheck_round << dendl;
+  ceph_assert(timecheck_round % 2);
+  timecheck_round ++;
+  timecheck_round_start = utime_t();
+
+  if (success) {
+    ceph_assert(timecheck_waiting.empty());
+    ceph_assert(timecheck_acks == quorum.size());
+    timecheck_report();
+    timecheck_check_skews();
+    return;
+  }
+
+  dout(10) << __func__ << " " << timecheck_waiting.size()
+           << " peers still waiting:";
+  for (auto& p : timecheck_waiting) {
+    *_dout << " mon." 
<< p.first; + } + *_dout << dendl; + timecheck_waiting.clear(); + + dout(10) << __func__ << " finished to " << timecheck_round << dendl; +} + +void Monitor::timecheck_cancel_round() +{ + timecheck_finish_round(false); +} + +void Monitor::timecheck_cleanup() +{ + timecheck_round = 0; + timecheck_acks = 0; + timecheck_round_start = utime_t(); + + if (timecheck_event) { + timer.cancel_event(timecheck_event); + timecheck_event = NULL; + } + timecheck_waiting.clear(); + timecheck_skews.clear(); + timecheck_latencies.clear(); + + timecheck_rounds_since_clean = 0; +} + +void Monitor::timecheck_reset_event() +{ + if (timecheck_event) { + timer.cancel_event(timecheck_event); + timecheck_event = NULL; + } + + double delay = + cct->_conf->mon_timecheck_skew_interval * timecheck_rounds_since_clean; + + if (delay <= 0 || delay > cct->_conf->mon_timecheck_interval) { + delay = cct->_conf->mon_timecheck_interval; + } + + dout(10) << __func__ << " delay " << delay + << " rounds_since_clean " << timecheck_rounds_since_clean + << dendl; + + timecheck_event = timer.add_event_after( + delay, + new C_MonContext{this, [this](int) { + timecheck_start_round(); + }}); +} + +void Monitor::timecheck_check_skews() +{ + dout(10) << __func__ << dendl; + ceph_assert(is_leader()); + ceph_assert((timecheck_round % 2) == 0); + if (monmap->size() == 1) { + ceph_abort_msg("We are alone; we shouldn't have gotten here!"); + return; + } + ceph_assert(timecheck_latencies.size() == timecheck_skews.size()); + + bool found_skew = false; + for (auto& p : timecheck_skews) { + double abs_skew; + if (timecheck_has_skew(p.second, &abs_skew)) { + dout(10) << __func__ + << " " << p.first << " skew " << abs_skew << dendl; + found_skew = true; + } + } + + if (found_skew) { + ++timecheck_rounds_since_clean; + timecheck_reset_event(); + } else if (timecheck_rounds_since_clean > 0) { + dout(1) << __func__ + << " no clock skews found after " << timecheck_rounds_since_clean + << " rounds" << dendl; + // make sure the skews are really gone and not just a transient success + // this will run just once if not in the presence of skews again. + timecheck_rounds_since_clean = 1; + timecheck_reset_event(); + timecheck_rounds_since_clean = 0; + } + +} + +void Monitor::timecheck_report() +{ + dout(10) << __func__ << dendl; + ceph_assert(is_leader()); + ceph_assert((timecheck_round % 2) == 0); + if (monmap->size() == 1) { + ceph_abort_msg("We are alone; we shouldn't have gotten here!"); + return; + } + + ceph_assert(timecheck_latencies.size() == timecheck_skews.size()); + bool do_output = true; // only output report once + for (set<int>::iterator q = quorum.begin(); q != quorum.end(); ++q) { + if (monmap->get_name(*q) == name) + continue; + + MTimeCheck2 *m = new MTimeCheck2(MTimeCheck2::OP_REPORT); + m->epoch = get_epoch(); + m->round = timecheck_round; + + for (auto& it : timecheck_skews) { + double skew = it.second; + double latency = timecheck_latencies[it.first]; + + m->skews[it.first] = skew; + m->latencies[it.first] = latency; + + if (do_output) { + dout(25) << __func__ << " mon." << it.first + << " latency " << latency + << " skew " << skew << dendl; + } + } + do_output = false; + dout(10) << __func__ << " send report to mon." 
+void Monitor::timecheck_reset_event()
+{
+  if (timecheck_event) {
+    timer.cancel_event(timecheck_event);
+    timecheck_event = NULL;
+  }
+
+  double delay =
+    cct->_conf->mon_timecheck_skew_interval * timecheck_rounds_since_clean;
+
+  if (delay <= 0 || delay > cct->_conf->mon_timecheck_interval) {
+    delay = cct->_conf->mon_timecheck_interval;
+  }
+
+  dout(10) << __func__ << " delay " << delay
+           << " rounds_since_clean " << timecheck_rounds_since_clean
+           << dendl;
+
+  timecheck_event = timer.add_event_after(
+    delay,
+    new C_MonContext{this, [this](int) {
+      timecheck_start_round();
+    }});
+}
+
+void Monitor::timecheck_check_skews()
+{
+  dout(10) << __func__ << dendl;
+  ceph_assert(is_leader());
+  ceph_assert((timecheck_round % 2) == 0);
+  if (monmap->size() == 1) {
+    ceph_abort_msg("We are alone; we shouldn't have gotten here!");
+    return;
+  }
+  ceph_assert(timecheck_latencies.size() == timecheck_skews.size());
+
+  bool found_skew = false;
+  for (auto& p : timecheck_skews) {
+    double abs_skew;
+    if (timecheck_has_skew(p.second, &abs_skew)) {
+      dout(10) << __func__
+               << " " << p.first << " skew " << abs_skew << dendl;
+      found_skew = true;
+    }
+  }
+
+  if (found_skew) {
+    ++timecheck_rounds_since_clean;
+    timecheck_reset_event();
+  } else if (timecheck_rounds_since_clean > 0) {
+    dout(1) << __func__
+            << " no clock skews found after " << timecheck_rounds_since_clean
+            << " rounds" << dendl;
+    // make sure the skews are really gone and not just a transient success;
+    // this will run just once more unless skews show up again.
+    timecheck_rounds_since_clean = 1;
+    timecheck_reset_event();
+    timecheck_rounds_since_clean = 0;
+  }
+}
+
+void Monitor::timecheck_report()
+{
+  dout(10) << __func__ << dendl;
+  ceph_assert(is_leader());
+  ceph_assert((timecheck_round % 2) == 0);
+  if (monmap->size() == 1) {
+    ceph_abort_msg("We are alone; we shouldn't have gotten here!");
+    return;
+  }
+
+  ceph_assert(timecheck_latencies.size() == timecheck_skews.size());
+  bool do_output = true; // only output report once
+  for (set<int>::iterator q = quorum.begin(); q != quorum.end(); ++q) {
+    if (monmap->get_name(*q) == name)
+      continue;
+
+    MTimeCheck2 *m = new MTimeCheck2(MTimeCheck2::OP_REPORT);
+    m->epoch = get_epoch();
+    m->round = timecheck_round;
+
+    for (auto& it : timecheck_skews) {
+      double skew = it.second;
+      double latency = timecheck_latencies[it.first];
+
+      m->skews[it.first] = skew;
+      m->latencies[it.first] = latency;
+
+      if (do_output) {
+        dout(25) << __func__ << " mon." << it.first
+                 << " latency " << latency
+                 << " skew " << skew << dendl;
+      }
+    }
+    do_output = false;
+    dout(10) << __func__ << " send report to mon." << *q << dendl;
+    send_mon_message(m, *q);
+  }
+}
+
+void Monitor::timecheck()
+{
+  dout(10) << __func__ << dendl;
+  ceph_assert(is_leader());
+  if (monmap->size() == 1) {
+    ceph_abort_msg("We are alone; we shouldn't have gotten here!");
+    return;
+  }
+  ceph_assert(timecheck_round % 2 != 0);
+
+  timecheck_acks = 1; // we ack ourselves
+
+  dout(10) << __func__ << " start timecheck epoch " << get_epoch()
+           << " round " << timecheck_round << dendl;
+
+  // we are at the eye of the storm; the point of reference
+  timecheck_skews[rank] = 0.0;
+  timecheck_latencies[rank] = 0.0;
+
+  for (set<int>::iterator it = quorum.begin(); it != quorum.end(); ++it) {
+    if (monmap->get_name(*it) == name)
+      continue;
+
+    utime_t curr_time = ceph_clock_now();
+    timecheck_waiting[*it] = curr_time;
+    MTimeCheck2 *m = new MTimeCheck2(MTimeCheck2::OP_PING);
+    m->epoch = get_epoch();
+    m->round = timecheck_round;
+    dout(10) << __func__ << " send " << *m << " to mon." << *it << dendl;
+    send_mon_message(m, *it);
+  }
+}
+
+health_status_t Monitor::timecheck_status(ostringstream &ss,
+                                          const double skew_bound,
+                                          const double latency)
+{
+  health_status_t status = HEALTH_OK;
+  ceph_assert(latency >= 0);
+
+  double abs_skew;
+  if (timecheck_has_skew(skew_bound, &abs_skew)) {
+    status = HEALTH_WARN;
+    ss << "clock skew " << abs_skew << "s"
+       << " > max " << g_conf()->mon_clock_drift_allowed << "s";
+  }
+
+  return status;
+}
+
+void Monitor::handle_timecheck_leader(MonOpRequestRef op)
+{
+  auto m = op->get_req<MTimeCheck2>();
+  dout(10) << __func__ << " " << *m << dendl;
+  /* handles PONG's */
+  ceph_assert(m->op == MTimeCheck2::OP_PONG);
+
+  int other = m->get_source().num();
+  if (m->epoch < get_epoch()) {
+    dout(1) << __func__ << " got old timecheck epoch " << m->epoch
+            << " from " << other
+            << " curr " << get_epoch()
+            << " -- severely lagged? discard" << dendl;
+    return;
+  }
+  ceph_assert(m->epoch == get_epoch());
+
+  if (m->round < timecheck_round) {
+    dout(1) << __func__ << " got old round " << m->round
+            << " from " << other
+            << " curr " << timecheck_round << " -- discard" << dendl;
+    return;
+  }
+
+  utime_t curr_time = ceph_clock_now();
+
+  ceph_assert(timecheck_waiting.count(other) > 0);
+  utime_t timecheck_sent = timecheck_waiting[other];
+  timecheck_waiting.erase(other);
+  if (curr_time < timecheck_sent) {
+    // our clock was readjusted -- drop everything until it all makes sense.
+    dout(1) << __func__ << " our clock was readjusted --"
+            << " bump round and drop current check"
+            << dendl;
+    timecheck_cancel_round();
+    return;
+  }
+
+  /* update peer latencies */
+  double latency = (double)(curr_time - timecheck_sent);
+
+  if (timecheck_latencies.count(other) == 0)
+    timecheck_latencies[other] = latency;
+  else {
+    double avg_latency = ((timecheck_latencies[other]*0.8)+(latency*0.2));
+    timecheck_latencies[other] = avg_latency;
+  }
+
+  /*
+   * update skews
+   *
+   * some nasty thing goes on if we were to do 'a - b' between two utime_t,
+   * and 'a' happens to be lower than 'b'; so we use double instead.
+   *
+   * latency is always expected to be >= 0.
+   *
+   * delta, the difference between their timestamp and ours, may either be
+   * lower or higher than 0; will hardly ever be 0.
+   *
+   * The absolute skew is the absolute delta minus the latency, which is
+   * taken as a whole instead of an RTT given that there is some queueing
+   * and dispatch times involved and it's hard to assess how long exactly
+   * it took for the message to travel to the other side and be handled. So
+   * we call it a bounded skew, the worst case scenario.
+   *
+   * Now, to math!
+   *
+   * Given that the latency is always positive, we can establish that the
+   * bounded skew will be:
+   *
+   *  1. positive if the absolute delta is higher than the latency and
+   *     delta is positive
+   *  2. negative if the absolute delta is higher than the latency and
+   *     delta is negative.
+   *  3. zero if the absolute delta is lower than the latency.
+   *
+   * On 3. we make a judgement call and treat the skew as non-existent.
+   * This is because, if the absolute delta is lower than the latency,
+   * then the apparently existing skew is nothing more than a side-effect
+   * of the high latency at work.
+   *
+   * This may not be entirely true though, as a severely skewed clock
+   * may be masked by an even higher latency, but with high latencies
+   * we probably have worse issues to deal with than just skewed clocks.
+   */
+  ceph_assert(latency >= 0);
+
+  double delta = ((double) m->timestamp) - ((double) curr_time);
+  double abs_delta = (delta > 0 ? delta : -delta);
+  double skew_bound = abs_delta - latency;
+  if (skew_bound < 0)
+    skew_bound = 0;
+  else if (delta < 0)
+    skew_bound = -skew_bound;
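+
+  // Worked example (illustrative values): if our clock reads 100.000s when
+  // the pong arrives, the peon stamped m->timestamp = 100.350s, and the
+  // measured latency is 0.100s, then delta = +0.350, abs_delta = 0.350 and
+  // skew_bound = 0.350 - 0.100 = +0.250 -- the peon is treated as ~0.25s
+  // ahead once transit time is discounted. Had abs_delta been below the
+  // 0.100s latency, skew_bound would clamp to 0 (case 3 above).
+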
+  ostringstream ss;
+  health_status_t status = timecheck_status(ss, skew_bound, latency);
+  if (status != HEALTH_OK) {
+    clog->health(status) << other << " " << ss.str();
+  }
+
+  dout(10) << __func__ << " from " << other << " ts " << m->timestamp
+           << " delta " << delta << " skew_bound " << skew_bound
+           << " latency " << latency << dendl;
+
+  timecheck_skews[other] = skew_bound;
+
+  timecheck_acks++;
+  if (timecheck_acks == quorum.size()) {
+    dout(10) << __func__ << " got pongs from everybody ("
+             << timecheck_acks << " total)" << dendl;
+    ceph_assert(timecheck_skews.size() == timecheck_acks);
+    ceph_assert(timecheck_waiting.empty());
+    // everyone has acked, so bump the round to finish it.
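+    // (timecheck_finish_round() returns timecheck_round to an even, idle
+    //  value, sends OP_REPORT summaries to the peons, and re-checks the
+    //  collected skews.)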
+    timecheck_finish_round();
+  }
+}
+
+void Monitor::handle_timecheck_peon(MonOpRequestRef op)
+{
+  auto m = op->get_req<MTimeCheck2>();
+  dout(10) << __func__ << " " << *m << dendl;
+
+  ceph_assert(is_peon());
+  ceph_assert(m->op == MTimeCheck2::OP_PING || m->op == MTimeCheck2::OP_REPORT);
+
+  if (m->epoch != get_epoch()) {
+    dout(1) << __func__ << " got wrong epoch "
+            << "(ours " << get_epoch()
+            << " theirs: " << m->epoch << ") -- discarding" << dendl;
+    return;
+  }
+
+  if (m->round < timecheck_round) {
+    dout(1) << __func__ << " got old round " << m->round
+            << " current " << timecheck_round
+            << " (epoch " << get_epoch() << ") -- discarding" << dendl;
+    return;
+  }
+
+  timecheck_round = m->round;
+
+  if (m->op == MTimeCheck2::OP_REPORT) {
+    ceph_assert((timecheck_round % 2) == 0);
+    timecheck_latencies.swap(m->latencies);
+    timecheck_skews.swap(m->skews);
+    return;
+  }
+
+  ceph_assert((timecheck_round % 2) != 0);
+  MTimeCheck2 *reply = new MTimeCheck2(MTimeCheck2::OP_PONG);
+  utime_t curr_time = ceph_clock_now();
+  reply->timestamp = curr_time;
+  reply->epoch = m->epoch;
+  reply->round = m->round;
+  dout(10) << __func__ << " send " << *reply
+           << " to " << m->get_source_inst() << dendl;
+  m->get_connection()->send_message(reply);
+}
+
+void Monitor::handle_timecheck(MonOpRequestRef op)
+{
+  auto m = op->get_req<MTimeCheck2>();
+  dout(10) << __func__ << " " << *m << dendl;
+
+  if (is_leader()) {
+    if (m->op != MTimeCheck2::OP_PONG) {
+      dout(1) << __func__ << " drop unexpected msg (not pong)" << dendl;
+    } else {
+      handle_timecheck_leader(op);
+    }
+  } else if (is_peon()) {
+    if (m->op != MTimeCheck2::OP_PING && m->op != MTimeCheck2::OP_REPORT) {
+      dout(1) << __func__ << " drop unexpected msg (not ping or report)" << dendl;
+    } else {
+      handle_timecheck_peon(op);
+    }
+  } else {
+    dout(1) << __func__ << " drop unexpected msg" << dendl;
+  }
+}
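+
+// Illustrative client-side counterpart (hypothetical snippet, not part of
+// this file): a client builds the subscription map that handle_subscribe()
+// below walks, e.g.
+//
+//   MMonSubscribe *m = new MMonSubscribe;
+//   m->sub_want("monmap", 0, 0);                        // persistent, from the start
+//   m->sub_want("osdmap", 42, CEPH_SUBSCRIBE_ONETIME);  // one-shot, from epoch 42
+//   con->send_message(m);
+//
+// Each entry is cap-checked and routed to the owning service's check_sub().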
+void Monitor::handle_subscribe(MonOpRequestRef op)
+{
+  auto m = op->get_req<MMonSubscribe>();
+  dout(10) << "handle_subscribe " << *m << dendl;
+
+  bool reply = false;
+
+  MonSession *s = op->get_session();
+  ceph_assert(s);
+
+  if (m->hostname.size()) {
+    s->remote_host = m->hostname;
+  }
+
+  for (map<string,ceph_mon_subscribe_item>::iterator p = m->what.begin();
+       p != m->what.end();
+       ++p) {
+    if (p->first == "monmap" || p->first == "config") {
+      // these require no caps
+    } else if (!s->is_capable("mon", MON_CAP_R)) {
+      dout(5) << __func__ << " " << op->get_req()->get_source_inst()
+              << " not enough caps for " << *(op->get_req()) << " -- dropping"
+              << dendl;
+      continue;
+    }
+
+    // if there are any non-onetime subscriptions, we need to reply to start
+    // the resubscribe timer
+    if ((p->second.flags & CEPH_SUBSCRIBE_ONETIME) == 0)
+      reply = true;
+
+    // remove conflicting subscribes
+    if (logmon()->sub_name_to_id(p->first) >= 0) {
+      for (map<string, Subscription*>::iterator it = s->sub_map.begin();
+           it != s->sub_map.end(); ) {
+        if (it->first != p->first && logmon()->sub_name_to_id(it->first) >= 0) {
+          std::lock_guard l(session_map_lock);
+          session_map.remove_sub((it++)->second);
+        } else {
+          ++it;
+        }
+      }
+    }
+
+    {
+      std::lock_guard l(session_map_lock);
+      session_map.add_update_sub(s, p->first, p->second.start,
+                                 p->second.flags & CEPH_SUBSCRIBE_ONETIME,
+                                 m->get_connection()->has_feature(CEPH_FEATURE_INCSUBOSDMAP));
+    }
+
+    if (p->first.compare(0, 6, "mdsmap") == 0 || p->first.compare(0, 5, "fsmap") == 0) {
+      dout(10) << __func__ << ": MDS sub '" << p->first << "'" << dendl;
+      if (s->is_capable("mds", MON_CAP_R)) {
+        Subscription *sub = s->sub_map[p->first];
+        ceph_assert(sub != nullptr);
+        mdsmon()->check_sub(sub);
+      }
+    } else if (p->first == "osdmap") {
+      if (s->is_capable("osd", MON_CAP_R)) {
+        if (s->osd_epoch > p->second.start) {
+          // client needs earlier osdmaps on purpose, so reset the sent epoch
+          s->osd_epoch = 0;
+        }
+        osdmon()->check_osdmap_sub(s->sub_map["osdmap"]);
+      }
+    } else if (p->first == "osd_pg_creates") {
+      if (s->is_capable("osd", MON_CAP_W)) {
+        osdmon()->check_pg_creates_sub(s->sub_map["osd_pg_creates"]);
+      }
+    } else if (p->first == "monmap") {
+      monmon()->check_sub(s->sub_map[p->first]);
+    } else if (logmon()->sub_name_to_id(p->first) >= 0) {
+      logmon()->check_sub(s->sub_map[p->first]);
+    } else if (p->first == "mgrmap" || p->first == "mgrdigest") {
+      mgrmon()->check_sub(s->sub_map[p->first]);
+    } else if (p->first == "servicemap") {
+      mgrstatmon()->check_sub(s->sub_map[p->first]);
+    } else if (p->first == "config") {
+      configmon()->check_sub(s);
+    } else if (p->first.find("kv:") == 0) {
+      kvmon()->check_sub(s->sub_map[p->first]);
+    }
+  }
+
+  if (reply) {
+    // we only need to reply if the client is old enough to think it
+    // has to send renewals.
+    ConnectionRef con = m->get_connection();
+    if (!con->has_feature(CEPH_FEATURE_MON_STATEFUL_SUB))
+      m->get_connection()->send_message(new MMonSubscribeAck(
+        monmap->get_fsid(), (int)g_conf()->mon_subscribe_interval));
+  }
+}
+
+void Monitor::handle_get_version(MonOpRequestRef op)
+{
+  auto m = op->get_req<MMonGetVersion>();
+  dout(10) << "handle_get_version " << *m << dendl;
+  PaxosService *svc = NULL;
+
+  MonSession *s = op->get_session();
+  ceph_assert(s);
+
+  if (!is_leader() && !is_peon()) {
+    dout(10) << " waiting for quorum" << dendl;
+    waitfor_quorum.push_back(new C_RetryMessage(this, op));
+    goto out;
+  }
+
+  if (m->what == "mdsmap" || m->what == "fsmap") {
+    svc = mdsmon();
+  } else if (m->what == "osdmap") {
+    svc = osdmon();
+  } else if (m->what == "monmap") {
+    svc = monmon();
+  } else {
+    derr << "invalid map type " << m->what << dendl;
+  }
+
+  if (svc) {
+    if (!svc->is_readable()) {
+      svc->wait_for_readable(op, new C_RetryMessage(this, op));
+      goto out;
+    }
+
+    MMonGetVersionReply *reply = new MMonGetVersionReply();
+    reply->handle = m->handle;
+    reply->version = svc->get_last_committed();
+    reply->oldest_version = svc->get_first_committed();
+    reply->set_tid(m->get_tid());
+
+    m->get_connection()->send_message(reply);
+  }
+ out:
+  return;
+}
+
+bool Monitor::ms_handle_reset(Connection *con)
+{
+  dout(10) << "ms_handle_reset " << con << " " << con->get_peer_addr() << dendl;
+
+  // ignore lossless monitor sessions
+  if (con->get_peer_type() == CEPH_ENTITY_TYPE_MON)
+    return false;
+
+  auto priv = con->get_priv();
+  auto s = static_cast<MonSession*>(priv.get());
+  if (!s)
+    return false;
+
+  // break any con <-> session ref cycle
+  s->con->set_priv(nullptr);
+
+  if (is_shutdown())
+    return false;
+
+  std::lock_guard l(lock);
+
+  dout(10) << "reset/close on session " << s->name << " " << s->addrs << dendl;
+  if (!s->closed && s->item.is_on_list()) {
+    std::lock_guard l(session_map_lock);
+    remove_session(s);
+  }
+  return true;
+}
+
+bool Monitor::ms_handle_refused(Connection *con)
+{
+  // just log for now...
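+  // (unlike a RESET, a refused connection never got far enough to own a
+  //  session, so there is nothing to tear down here.)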
+ dout(10) << "ms_handle_refused " << con << " " << con->get_peer_addr() << dendl; + return false; +} + +// ----- + +void Monitor::send_latest_monmap(Connection *con) +{ + bufferlist bl; + monmap->encode(bl, con->get_features()); + con->send_message(new MMonMap(bl)); +} + +void Monitor::handle_mon_get_map(MonOpRequestRef op) +{ + auto m = op->get_req<MMonGetMap>(); + dout(10) << "handle_mon_get_map" << dendl; + send_latest_monmap(m->get_connection().get()); +} + +int Monitor::load_metadata() +{ + bufferlist bl; + int r = store->get(MONITOR_STORE_PREFIX, "last_metadata", bl); + if (r) + return r; + auto it = bl.cbegin(); + decode(mon_metadata, it); + + pending_metadata = mon_metadata; + return 0; +} + +int Monitor::get_mon_metadata(int mon, Formatter *f, ostream& err) +{ + ceph_assert(f); + if (!mon_metadata.count(mon)) { + err << "mon." << mon << " not found"; + return -EINVAL; + } + const Metadata& m = mon_metadata[mon]; + for (Metadata::const_iterator p = m.begin(); p != m.end(); ++p) { + f->dump_string(p->first.c_str(), p->second); + } + return 0; +} + +void Monitor::count_metadata(const string& field, map<string,int> *out) +{ + for (auto& p : mon_metadata) { + auto q = p.second.find(field); + if (q == p.second.end()) { + (*out)["unknown"]++; + } else { + (*out)[q->second]++; + } + } +} + +void Monitor::count_metadata(const string& field, Formatter *f) +{ + map<string,int> by_val; + count_metadata(field, &by_val); + f->open_object_section(field.c_str()); + for (auto& p : by_val) { + f->dump_int(p.first.c_str(), p.second); + } + f->close_section(); +} + +void Monitor::get_all_versions(std::map<string, list<string> > &versions) +{ + // mon + get_versions(versions); + // osd + osdmon()->get_versions(versions); + // mgr + mgrmon()->get_versions(versions); + // mds + mdsmon()->get_versions(versions); + dout(20) << __func__ << " all versions=" << versions << dendl; +} + +void Monitor::get_versions(std::map<string, list<string> > &versions) +{ + for (auto& [rank, metadata] : mon_metadata) { + auto q = metadata.find("ceph_version_short"); + if (q == metadata.end()) { + // not likely + continue; + } + versions[q->second].push_back(string("mon.") + monmap->get_name(rank)); + } +} + +int Monitor::print_nodes(Formatter *f, ostream& err) +{ + map<string, list<string> > mons; // hostname => mon + for (map<int, Metadata>::iterator it = mon_metadata.begin(); + it != mon_metadata.end(); ++it) { + const Metadata& m = it->second; + Metadata::const_iterator hostname = m.find("hostname"); + if (hostname == m.end()) { + // not likely though + continue; + } + mons[hostname->second].push_back(monmap->get_name(it->first)); + } + + dump_services(f, mons, "mon"); + return 0; +} + +// ---------------------------------------------- +// scrub + +int Monitor::scrub_start() +{ + dout(10) << __func__ << dendl; + ceph_assert(is_leader()); + + if (!scrub_result.empty()) { + clog->info() << "scrub already in progress"; + return -EBUSY; + } + + scrub_event_cancel(); + scrub_result.clear(); + scrub_state.reset(new ScrubState); + + scrub(); + return 0; +} + +int Monitor::scrub() +{ + ceph_assert(is_leader()); + ceph_assert(scrub_state); + + scrub_cancel_timeout(); + wait_for_paxos_write(); + scrub_version = paxos->get_version(); + + + // scrub all keys if we're the only monitor in the quorum + int32_t num_keys = + (quorum.size() == 1 ? 
-1 : cct->_conf->mon_scrub_max_keys); + + for (set<int>::iterator p = quorum.begin(); + p != quorum.end(); + ++p) { + if (*p == rank) + continue; + MMonScrub *r = new MMonScrub(MMonScrub::OP_SCRUB, scrub_version, + num_keys); + r->key = scrub_state->last_key; + send_mon_message(r, *p); + } + + // scrub my keys + bool r = _scrub(&scrub_result[rank], + &scrub_state->last_key, + &num_keys); + + scrub_state->finished = !r; + + // only after we got our scrub results do we really care whether the + // other monitors are late on their results. Also, this way we avoid + // triggering the timeout if we end up getting stuck in _scrub() for + // longer than the duration of the timeout. + scrub_reset_timeout(); + + if (quorum.size() == 1) { + ceph_assert(scrub_state->finished == true); + scrub_finish(); + } + return 0; +} + +void Monitor::handle_scrub(MonOpRequestRef op) +{ + auto m = op->get_req<MMonScrub>(); + dout(10) << __func__ << " " << *m << dendl; + switch (m->op) { + case MMonScrub::OP_SCRUB: + { + if (!is_peon()) + break; + + wait_for_paxos_write(); + + if (m->version != paxos->get_version()) + break; + + MMonScrub *reply = new MMonScrub(MMonScrub::OP_RESULT, + m->version, + m->num_keys); + + reply->key = m->key; + _scrub(&reply->result, &reply->key, &reply->num_keys); + m->get_connection()->send_message(reply); + } + break; + + case MMonScrub::OP_RESULT: + { + if (!is_leader()) + break; + if (m->version != scrub_version) + break; + // reset the timeout each time we get a result + scrub_reset_timeout(); + + int from = m->get_source().num(); + ceph_assert(scrub_result.count(from) == 0); + scrub_result[from] = m->result; + + if (scrub_result.size() == quorum.size()) { + scrub_check_results(); + scrub_result.clear(); + if (scrub_state->finished) + scrub_finish(); + else + scrub(); + } + } + break; + } +} + +bool Monitor::_scrub(ScrubResult *r, + pair<string,string> *start, + int *num_keys) +{ + ceph_assert(r != NULL); + ceph_assert(start != NULL); + ceph_assert(num_keys != NULL); + + set<string> prefixes = get_sync_targets_names(); + prefixes.erase("paxos"); // exclude paxos, as this one may have extra states for proposals, etc. 
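+
+  // (get_sync_targets_names() returns the "paxos" prefix plus one prefix
+  //  per paxos service -- e.g. "monmap", "osdmap", "auth", "logm", "mgr" --
+  //  so after the erase we still cover every service's keys while skipping
+  //  transient proposal state.)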
+ + dout(10) << __func__ << " start (" << *start << ")" + << " num_keys " << *num_keys << dendl; + + MonitorDBStore::Synchronizer it = store->get_synchronizer(*start, prefixes); + + int scrubbed_keys = 0; + pair<string,string> last_key; + + while (it->has_next_chunk()) { + + if (*num_keys > 0 && scrubbed_keys == *num_keys) + break; + + pair<string,string> k = it->get_next_key(); + if (prefixes.count(k.first) == 0) + continue; + + if (cct->_conf->mon_scrub_inject_missing_keys > 0.0 && + (rand() % 10000 < cct->_conf->mon_scrub_inject_missing_keys*10000.0)) { + dout(10) << __func__ << " inject missing key, skipping (" << k << ")" + << dendl; + continue; + } + + bufferlist bl; + int err = store->get(k.first, k.second, bl); + ceph_assert(err == 0); + + uint32_t key_crc = bl.crc32c(0); + dout(30) << __func__ << " " << k << " bl " << bl.length() << " bytes" + << " crc " << key_crc << dendl; + r->prefix_keys[k.first]++; + if (r->prefix_crc.count(k.first) == 0) { + r->prefix_crc[k.first] = 0; + } + r->prefix_crc[k.first] = bl.crc32c(r->prefix_crc[k.first]); + + if (cct->_conf->mon_scrub_inject_crc_mismatch > 0.0 && + (rand() % 10000 < cct->_conf->mon_scrub_inject_crc_mismatch*10000.0)) { + dout(10) << __func__ << " inject failure at (" << k << ")" << dendl; + r->prefix_crc[k.first] += 1; + } + + ++scrubbed_keys; + last_key = k; + } + + dout(20) << __func__ << " last_key (" << last_key << ")" + << " scrubbed_keys " << scrubbed_keys + << " has_next " << it->has_next_chunk() << dendl; + + *start = last_key; + *num_keys = scrubbed_keys; + + return it->has_next_chunk(); +} + +void Monitor::scrub_check_results() +{ + dout(10) << __func__ << dendl; + + // compare + int errors = 0; + ScrubResult& mine = scrub_result[rank]; + for (map<int,ScrubResult>::iterator p = scrub_result.begin(); + p != scrub_result.end(); + ++p) { + if (p->first == rank) + continue; + if (p->second != mine) { + ++errors; + clog->error() << "scrub mismatch"; + clog->error() << " mon." << rank << " " << mine; + clog->error() << " mon." << p->first << " " << p->second; + } + } + if (!errors) + clog->debug() << "scrub ok on " << quorum << ": " << mine; +} + +inline void Monitor::scrub_timeout() +{ + dout(1) << __func__ << " restarting scrub" << dendl; + scrub_reset(); + scrub_start(); +} + +void Monitor::scrub_finish() +{ + dout(10) << __func__ << dendl; + scrub_reset(); + scrub_event_start(); +} + +void Monitor::scrub_reset() +{ + dout(10) << __func__ << dendl; + scrub_cancel_timeout(); + scrub_version = 0; + scrub_result.clear(); + scrub_state.reset(); +} + +inline void Monitor::scrub_update_interval(ceph::timespan interval) +{ + // we don't care about changes if we are not the leader. + // changes will be visible if we become the leader. + if (!is_leader()) + return; + + dout(1) << __func__ << " new interval = " << interval << dendl; + + // if scrub already in progress, all changes will already be visible during + // the next round. Nothing to do. 
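+  // (scrub_state is non-null only while a scrub round is in flight; in
+  //  that case the new interval simply takes effect when scrub_finish()
+  //  re-arms the event via scrub_event_start().)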
+ if (scrub_state != NULL) + return; + + scrub_event_cancel(); + scrub_event_start(); +} + +void Monitor::scrub_event_start() +{ + dout(10) << __func__ << dendl; + + if (scrub_event) + scrub_event_cancel(); + + auto scrub_interval = + cct->_conf.get_val<std::chrono::seconds>("mon_scrub_interval"); + if (scrub_interval == std::chrono::seconds::zero()) { + dout(1) << __func__ << " scrub event is disabled" + << " (mon_scrub_interval = " << scrub_interval + << ")" << dendl; + return; + } + + scrub_event = timer.add_event_after( + scrub_interval, + new C_MonContext{this, [this](int) { + scrub_start(); + }}); +} + +void Monitor::scrub_event_cancel() +{ + dout(10) << __func__ << dendl; + if (scrub_event) { + timer.cancel_event(scrub_event); + scrub_event = NULL; + } +} + +inline void Monitor::scrub_cancel_timeout() +{ + if (scrub_timeout_event) { + timer.cancel_event(scrub_timeout_event); + scrub_timeout_event = NULL; + } +} + +void Monitor::scrub_reset_timeout() +{ + dout(15) << __func__ << " reset timeout event" << dendl; + scrub_cancel_timeout(); + scrub_timeout_event = timer.add_event_after( + g_conf()->mon_scrub_timeout, + new C_MonContext{this, [this](int) { + scrub_timeout(); + }}); +} + +/************ TICK ***************/ +void Monitor::new_tick() +{ + timer.add_event_after(g_conf()->mon_tick_interval, new C_MonContext{this, [this](int) { + tick(); + }}); +} + +void Monitor::tick() +{ + // ok go. + dout(11) << "tick" << dendl; + const utime_t now = ceph_clock_now(); + + // Check if we need to emit any delayed health check updated messages + if (is_leader()) { + const auto min_period = g_conf().get_val<int64_t>( + "mon_health_log_update_period"); + for (auto& svc : paxos_service) { + auto health = svc->get_health_checks(); + + for (const auto &i : health.checks) { + const std::string &code = i.first; + const std::string &summary = i.second.summary; + const health_status_t severity = i.second.severity; + + auto status_iter = health_check_log_times.find(code); + if (status_iter == health_check_log_times.end()) { + continue; + } + + auto &log_status = status_iter->second; + bool const changed = log_status.last_message != summary + || log_status.severity != severity; + + if (changed && now - log_status.updated_at > min_period) { + log_status.last_message = summary; + log_status.updated_at = now; + log_status.severity = severity; + + ostringstream ss; + ss << "Health check update: " << summary << " (" << code << ")"; + clog->health(severity) << ss.str(); + } + } + } + } + + + for (auto& svc : paxos_service) { + svc->tick(); + svc->maybe_trim(); + } + + // trim sessions + { + std::lock_guard l(session_map_lock); + auto p = session_map.sessions.begin(); + + bool out_for_too_long = (!exited_quorum.is_zero() && + now > (exited_quorum + 2*g_conf()->mon_lease)); + + while (!p.end()) { + MonSession *s = *p; + ++p; + + // don't trim monitors + if (s->name.is_mon()) + continue; + + if (s->session_timeout < now && s->con) { + // check keepalive, too + s->session_timeout = s->con->get_last_keepalive(); + s->session_timeout += g_conf()->mon_session_timeout; + } + if (s->session_timeout < now) { + dout(10) << " trimming session " << s->con << " " << s->name + << " " << s->addrs + << " (timeout " << s->session_timeout + << " < now " << now << ")" << dendl; + } else if (out_for_too_long) { + // boot the client Session because we've taken too long getting back in + dout(10) << " trimming session " << s->con << " " << s->name + << " because we've been out of quorum too long" << dendl; + } else { + continue; + 
} + + s->con->mark_down(); + remove_session(s); + logger->inc(l_mon_session_trim); + } + } + sync_trim_providers(); + + if (!maybe_wait_for_quorum.empty()) { + finish_contexts(g_ceph_context, maybe_wait_for_quorum); + } + + if (is_leader() && paxos->is_active() && fingerprint.is_zero()) { + // this is only necessary on upgraded clusters. + MonitorDBStore::TransactionRef t = paxos->get_pending_transaction(); + prepare_new_fingerprint(t); + paxos->trigger_propose(); + } + + mgr_client.update_daemon_health(get_health_metrics()); + new_tick(); +} + +vector<DaemonHealthMetric> Monitor::get_health_metrics() +{ + vector<DaemonHealthMetric> metrics; + + utime_t oldest_secs; + const utime_t now = ceph_clock_now(); + auto too_old = now; + too_old -= g_conf().get_val<std::chrono::seconds>("mon_op_complaint_time").count(); + int slow = 0; + TrackedOpRef oldest_op; + auto count_slow_ops = [&](TrackedOp& op) { + if (op.get_initiated() < too_old) { + slow++; + if (!oldest_op || op.get_initiated() < oldest_op->get_initiated()) { + oldest_op = &op; + } + return true; + } else { + return false; + } + }; + if (op_tracker.visit_ops_in_flight(&oldest_secs, count_slow_ops)) { + if (slow) { + derr << __func__ << " reporting " << slow << " slow ops, oldest is " + << oldest_op->get_desc() << dendl; + } + metrics.emplace_back(daemon_metric::SLOW_OPS, slow, oldest_secs); + } else { + metrics.emplace_back(daemon_metric::SLOW_OPS, 0, 0); + } + return metrics; +} + +void Monitor::prepare_new_fingerprint(MonitorDBStore::TransactionRef t) +{ + uuid_d nf; + nf.generate_random(); + dout(10) << __func__ << " proposing cluster_fingerprint " << nf << dendl; + + bufferlist bl; + encode(nf, bl); + t->put(MONITOR_NAME, "cluster_fingerprint", bl); +} + +int Monitor::check_fsid() +{ + bufferlist ebl; + int r = store->get(MONITOR_NAME, "cluster_uuid", ebl); + if (r == -ENOENT) + return r; + ceph_assert(r == 0); + + string es(ebl.c_str(), ebl.length()); + + // only keep the first line + size_t pos = es.find_first_of('\n'); + if (pos != string::npos) + es.resize(pos); + + dout(10) << "check_fsid cluster_uuid contains '" << es << "'" << dendl; + uuid_d ondisk; + if (!ondisk.parse(es.c_str())) { + derr << "error: unable to parse uuid" << dendl; + return -EINVAL; + } + + if (monmap->get_fsid() != ondisk) { + derr << "error: cluster_uuid file exists with value " << ondisk + << ", != our uuid " << monmap->get_fsid() << dendl; + return -EEXIST; + } + + return 0; +} + +int Monitor::write_fsid() +{ + auto t(std::make_shared<MonitorDBStore::Transaction>()); + write_fsid(t); + int r = store->apply_transaction(t); + return r; +} + +int Monitor::write_fsid(MonitorDBStore::TransactionRef t) +{ + ostringstream ss; + ss << monmap->get_fsid() << "\n"; + string us = ss.str(); + + bufferlist b; + b.append(us); + + t->put(MONITOR_NAME, "cluster_uuid", b); + return 0; +} + +/* + * this is the closest thing to a traditional 'mkfs' for ceph. + * initialize the monitor state machines to their initial values. + */ +int Monitor::mkfs(bufferlist& osdmapbl) +{ + auto t(std::make_shared<MonitorDBStore::Transaction>()); + + // verify cluster fsid + int r = check_fsid(); + if (r < 0 && r != -ENOENT) + return r; + + bufferlist magicbl; + magicbl.append(CEPH_MON_ONDISK_MAGIC); + magicbl.append("\n"); + t->put(MONITOR_NAME, "magic", magicbl); + + + features = get_initial_supported_features(); + write_features(t); + + // save monmap, osdmap, keyring. 
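+
+  // (everything under the "mkfs" prefix is staging: on first start the
+  //  monitor seeds its monmap/osdmap/auth state from these keys -- see,
+  //  e.g., AuthMonitor::get_initial_keyring() for the keyring side.)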
+ bufferlist monmapbl; + monmap->encode(monmapbl, CEPH_FEATURES_ALL); + monmap->set_epoch(0); // must be 0 to avoid confusing first MonmapMonitor::update_from_paxos() + t->put("mkfs", "monmap", monmapbl); + + if (osdmapbl.length()) { + // make sure it's a valid osdmap + try { + OSDMap om; + om.decode(osdmapbl); + } + catch (ceph::buffer::error& e) { + derr << "error decoding provided osdmap: " << e.what() << dendl; + return -EINVAL; + } + t->put("mkfs", "osdmap", osdmapbl); + } + + if (is_keyring_required()) { + KeyRing keyring; + string keyring_filename; + + r = ceph_resolve_file_search(g_conf()->keyring, keyring_filename); + if (r) { + if (g_conf()->key != "") { + string keyring_plaintext = "[mon.]\n\tkey = " + g_conf()->key + + "\n\tcaps mon = \"allow *\"\n"; + bufferlist bl; + bl.append(keyring_plaintext); + try { + auto i = bl.cbegin(); + keyring.decode_plaintext(i); + } + catch (const ceph::buffer::error& e) { + derr << "error decoding keyring " << keyring_plaintext + << ": " << e.what() << dendl; + return -EINVAL; + } + } else { + derr << "unable to find a keyring on " << g_conf()->keyring + << ": " << cpp_strerror(r) << dendl; + return r; + } + } else { + r = keyring.load(g_ceph_context, keyring_filename); + if (r < 0) { + derr << "unable to load initial keyring " << g_conf()->keyring << dendl; + return r; + } + } + + // put mon. key in external keyring; seed with everything else. + extract_save_mon_key(keyring); + + bufferlist keyringbl; + keyring.encode_plaintext(keyringbl); + t->put("mkfs", "keyring", keyringbl); + } + write_fsid(t); + store->apply_transaction(t); + + return 0; +} + +int Monitor::write_default_keyring(bufferlist& bl) +{ + ostringstream os; + os << g_conf()->mon_data << "/keyring"; + + int err = 0; + int fd = ::open(os.str().c_str(), O_WRONLY|O_CREAT|O_CLOEXEC, 0600); + if (fd < 0) { + err = -errno; + dout(0) << __func__ << " failed to open " << os.str() + << ": " << cpp_strerror(err) << dendl; + return err; + } + + err = bl.write_fd(fd); + if (!err) + ::fsync(fd); + VOID_TEMP_FAILURE_RETRY(::close(fd)); + + return err; +} + +void Monitor::extract_save_mon_key(KeyRing& keyring) +{ + EntityName mon_name; + mon_name.set_type(CEPH_ENTITY_TYPE_MON); + EntityAuth mon_key; + if (keyring.get_auth(mon_name, mon_key)) { + dout(10) << "extract_save_mon_key moving mon. key to separate keyring" << dendl; + KeyRing pkey; + pkey.add(mon_name, mon_key); + bufferlist bl; + pkey.encode_plaintext(bl); + write_default_keyring(bl); + keyring.remove(mon_name); + } +} + +// AuthClient methods -- for mon <-> mon communication +int Monitor::get_auth_request( + Connection *con, + AuthConnectionMeta *auth_meta, + uint32_t *method, + vector<uint32_t> *preferred_modes, + bufferlist *out) +{ + std::scoped_lock l(auth_lock); + if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON && + con->get_peer_type() != CEPH_ENTITY_TYPE_MGR) { + return -EACCES; + } + AuthAuthorizer *auth; + if (!get_authorizer(con->get_peer_type(), &auth)) { + return -EACCES; + } + auth_meta->authorizer.reset(auth); + auth_registry.get_supported_modes(con->get_peer_type(), + auth->protocol, + preferred_modes); + *method = auth->protocol; + *out = auth->bl; + return 0; +} + +int Monitor::handle_auth_reply_more( + Connection *con, + AuthConnectionMeta *auth_meta, + const bufferlist& bl, + bufferlist *reply) +{ + std::scoped_lock l(auth_lock); + if (!auth_meta->authorizer) { + derr << __func__ << " no authorizer?" 
<< dendl;
+    return -EACCES;
+  }
+  auth_meta->authorizer->add_challenge(cct, bl);
+  *reply = auth_meta->authorizer->bl;
+  return 0;
+}
+
+int Monitor::handle_auth_done(
+  Connection *con,
+  AuthConnectionMeta *auth_meta,
+  uint64_t global_id,
+  uint32_t con_mode,
+  const bufferlist& bl,
+  CryptoKey *session_key,
+  std::string *connection_secret)
+{
+  std::scoped_lock l(auth_lock);
+  // verify authorizer reply
+  auto p = bl.begin();
+  if (!auth_meta->authorizer->verify_reply(p, connection_secret)) {
+    dout(0) << __func__ << " failed verifying authorizer reply" << dendl;
+    return -EACCES;
+  }
+  auth_meta->session_key = auth_meta->authorizer->session_key;
+  return 0;
+}
+
+int Monitor::handle_auth_bad_method(
+  Connection *con,
+  AuthConnectionMeta *auth_meta,
+  uint32_t old_auth_method,
+  int result,
+  const std::vector<uint32_t>& allowed_methods,
+  const std::vector<uint32_t>& allowed_modes)
+{
+  derr << __func__ << " hmm, they didn't like " << old_auth_method
+       << " result " << cpp_strerror(result) << dendl;
+  return -EACCES;
+}
+
+bool Monitor::get_authorizer(int service_id, AuthAuthorizer **authorizer)
+{
+  dout(10) << "get_authorizer for " << ceph_entity_type_name(service_id)
+           << dendl;
+
+  if (is_shutdown())
+    return false;
+
+  // we only connect to other monitors and mgr; everyone else connects to us.
+  if (service_id != CEPH_ENTITY_TYPE_MON &&
+      service_id != CEPH_ENTITY_TYPE_MGR)
+    return false;
+
+  if (!auth_cluster_required.is_supported_auth(CEPH_AUTH_CEPHX)) {
+    // auth_none
+    dout(20) << __func__ << " building auth_none authorizer" << dendl;
+    AuthNoneClientHandler handler{g_ceph_context};
+    handler.set_global_id(0);
+    *authorizer = handler.build_authorizer(service_id);
+    return true;
+  }
+
+  CephXServiceTicketInfo auth_ticket_info;
+  CephXSessionAuthInfo info;
+  int ret;
+
+  EntityName name;
+  name.set_type(CEPH_ENTITY_TYPE_MON);
+  auth_ticket_info.ticket.name = name;
+  auth_ticket_info.ticket.global_id = 0;
+
+  if (service_id == CEPH_ENTITY_TYPE_MON) {
+    // mon to mon authentication uses the private monitor shared key and not the
+    // rotating key
+    CryptoKey secret;
+    if (!keyring.get_secret(name, secret) &&
+        !key_server.get_secret(name, secret)) {
+      dout(0) << " couldn't get secret for mon service from keyring or keyserver"
+              << dendl;
+      stringstream ss, ds;
+      int err = key_server.list_secrets(ds);
+      if (err < 0)
+        ss << "no installed auth entries!";
+      else
+        ss << "installed auth entries:";
+      dout(0) << ss.str() << "\n" << ds.str() << dendl;
+      return false;
+    }
+
+    ret = key_server.build_session_auth_info(
+      service_id, auth_ticket_info.ticket, secret, (uint64_t)-1, info);
+    if (ret < 0) {
+      dout(0) << __func__ << " failed to build mon session_auth_info "
+              << cpp_strerror(ret) << dendl;
+      return false;
+    }
+  } else if (service_id == CEPH_ENTITY_TYPE_MGR) {
+    // mgr
+    ret = key_server.build_session_auth_info(
+      service_id, auth_ticket_info.ticket, info);
+    if (ret < 0) {
+      derr << __func__ << " failed to build mgr service session_auth_info "
+           << cpp_strerror(ret) << dendl;
+      return false;
+    }
+  } else {
+    ceph_abort();  // see check at top of fn
+  }
+
+  CephXTicketBlob blob;
+  if (!cephx_build_service_ticket_blob(cct, info, blob)) {
+    dout(0) << "get_authorizer failed to build service ticket" << dendl;
+    return false;
+  }
+  bufferlist ticket_data;
+  encode(blob, ticket_data);
+
+  auto iter = ticket_data.cbegin();
+  CephXTicketHandler handler(g_ceph_context, service_id);
+  decode(handler.ticket, iter);
+
+  handler.session_key = info.session_key;
+
+  *authorizer =
handler.build_authorizer(0); + + return true; +} + +int Monitor::handle_auth_request( + Connection *con, + AuthConnectionMeta *auth_meta, + bool more, + uint32_t auth_method, + const bufferlist &payload, + bufferlist *reply) +{ + std::scoped_lock l(auth_lock); + + // NOTE: be careful, the Connection hasn't fully negotiated yet, so + // e.g., peer_features, peer_addrs, and others are still unknown. + + dout(10) << __func__ << " con " << con << (more ? " (more)":" (start)") + << " method " << auth_method + << " payload " << payload.length() + << dendl; + if (!payload.length()) { + if (!con->is_msgr2() && + con->get_peer_type() != CEPH_ENTITY_TYPE_MON) { + // for v1 connections, we tolerate no authorizer (from + // non-monitors), because authentication happens via MAuth + // messages. + return 1; + } + return -EACCES; + } + if (!more) { + auth_meta->auth_mode = payload[0]; + } + + if (auth_meta->auth_mode >= AUTH_MODE_AUTHORIZER && + auth_meta->auth_mode <= AUTH_MODE_AUTHORIZER_MAX) { + AuthAuthorizeHandler *ah = get_auth_authorize_handler(con->get_peer_type(), + auth_method); + if (!ah) { + lderr(cct) << __func__ << " no AuthAuthorizeHandler found for auth method " + << auth_method << dendl; + return -EOPNOTSUPP; + } + bool was_challenge = (bool)auth_meta->authorizer_challenge; + bool isvalid = ah->verify_authorizer( + cct, + keyring, + payload, + auth_meta->get_connection_secret_length(), + reply, + &con->peer_name, + &con->peer_global_id, + &con->peer_caps_info, + &auth_meta->session_key, + &auth_meta->connection_secret, + &auth_meta->authorizer_challenge); + if (isvalid) { + ms_handle_authentication(con); + return 1; + } + if (!more && !was_challenge && auth_meta->authorizer_challenge) { + return 0; + } + dout(10) << __func__ << " bad authorizer on " << con << dendl; + return -EACCES; + } else if (auth_meta->auth_mode < AUTH_MODE_MON || + auth_meta->auth_mode > AUTH_MODE_MON_MAX) { + derr << __func__ << " unrecognized auth mode " << auth_meta->auth_mode + << dendl; + return -EACCES; + } + + // wait until we've formed an initial quorum on mkfs so that we have + // the initial keys (e.g., client.admin). + if (authmon()->get_last_committed() == 0) { + dout(10) << __func__ << " haven't formed initial quorum, EBUSY" << dendl; + return -EBUSY; + } + + RefCountedPtr priv; + MonSession *s; + int32_t r = 0; + auto p = payload.begin(); + if (!more) { + if (con->get_priv()) { + return -EACCES; // wtf + } + + // handler? + unique_ptr<AuthServiceHandler> auth_handler{get_auth_service_handler( + auth_method, g_ceph_context, &key_server)}; + if (!auth_handler) { + dout(1) << __func__ << " auth_method " << auth_method << " not supported" + << dendl; + return -EOPNOTSUPP; + } + + uint8_t mode; + EntityName entity_name; + + try { + decode(mode, p); + if (mode < AUTH_MODE_MON || + mode > AUTH_MODE_MON_MAX) { + dout(1) << __func__ << " invalid mode " << (int)mode << dendl; + return -EACCES; + } + assert(mode >= AUTH_MODE_MON && mode <= AUTH_MODE_MON_MAX); + decode(entity_name, p); + decode(con->peer_global_id, p); + } catch (ceph::buffer::error& e) { + dout(1) << __func__ << " failed to decode, " << e.what() << dendl; + return -EACCES; + } + + // supported method? 
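+    // (auth_cluster_required / auth_service_required mirror the ceph.conf
+    //  options of the same names, e.g. "auth_cluster_required = cephx";
+    //  cluster daemons are vetted against the former, everything else
+    //  against the latter.)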
+    if (entity_name.get_type() == CEPH_ENTITY_TYPE_MON ||
+        entity_name.get_type() == CEPH_ENTITY_TYPE_OSD ||
+        entity_name.get_type() == CEPH_ENTITY_TYPE_MDS ||
+        entity_name.get_type() == CEPH_ENTITY_TYPE_MGR) {
+      if (!auth_cluster_required.is_supported_auth(auth_method)) {
+        dout(10) << __func__ << " entity " << entity_name << " method "
+                 << auth_method << " not among supported "
+                 << auth_cluster_required.get_supported_set() << dendl;
+        return -EOPNOTSUPP;
+      }
+    } else {
+      if (!auth_service_required.is_supported_auth(auth_method)) {
+        dout(10) << __func__ << " entity " << entity_name << " method "
+                 << auth_method << " not among supported "
+                 << auth_service_required.get_supported_set() << dendl;
+        return -EOPNOTSUPP;
+      }
+    }
+
+    // for msgr1 we would do some weirdness here to ensure signatures
+    // are supported by the client if we require it. for msgr2 that
+    // is not necessary.
+
+    bool is_new_global_id = false;
+    if (!con->peer_global_id) {
+      con->peer_global_id = authmon()->_assign_global_id();
+      if (!con->peer_global_id) {
+        dout(1) << __func__ << " failed to assign global_id" << dendl;
+        return -EBUSY;
+      }
+      is_new_global_id = true;
+    }
+
+    // set up partial session
+    s = new MonSession(con);
+    s->auth_handler = auth_handler.release();
+    con->set_priv(RefCountedPtr{s, false});
+
+    r = s->auth_handler->start_session(
+      entity_name,
+      con->peer_global_id,
+      is_new_global_id,
+      reply,
+      &con->peer_caps_info);
+  } else {
+    priv = con->get_priv();
+    if (!priv) {
+      // this can happen if the async ms_handle_reset event races with
+      // the unlocked call into handle_auth_request
+      return -EACCES;
+    }
+    s = static_cast<MonSession*>(priv.get());
+    r = s->auth_handler->handle_request(
+      p,
+      auth_meta->get_connection_secret_length(),
+      reply,
+      &con->peer_caps_info,
+      &auth_meta->session_key,
+      &auth_meta->connection_secret);
+  }
+  if (r > 0 &&
+      !s->authenticated) {
+    ms_handle_authentication(con);
+  }
+
+  dout(30) << " r " << r << " reply:\n";
+  reply->hexdump(*_dout);
+  *_dout << dendl;
+  return r;
+}
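+
+// Return-code convention used by handle_auth_request() above (as consumed
+// by the msgr2 handshake):
+//   r > 0  -> authentication complete; the session is established
+//   r == 0 -> more exchanges needed; *reply carries the server's challenge
+//   r < 0  -> hard failure (e.g. -EACCES, -EOPNOTSUPP, -EBUSY)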
+void Monitor::ms_handle_accept(Connection *con)
+{
+  auto priv = con->get_priv();
+  MonSession *s = static_cast<MonSession*>(priv.get());
+  if (!s) {
+    // legacy protocol v1?
+    dout(10) << __func__ << " con " << con << " no session" << dendl;
+    return;
+  }
+
+  if (s->item.is_on_list()) {
+    dout(10) << __func__ << " con " << con << " session " << s
+             << " already on list" << dendl;
+  } else {
+    std::lock_guard l(session_map_lock);
+    if (state == STATE_SHUTDOWN) {
+      dout(10) << __func__ << " ignoring new con " << con << " (shutdown)" << dendl;
+      con->mark_down();
+      return;
+    }
+    dout(10) << __func__ << " con " << con << " session " << s
+             << " registering session for "
+             << con->get_peer_addrs() << dendl;
+    s->_ident(entity_name_t(con->get_peer_type(), con->get_peer_id()),
+              con->get_peer_addrs());
+    session_map.add_session(s);
+  }
+}
+
+int Monitor::ms_handle_authentication(Connection *con)
+{
+  if (con->get_peer_type() == CEPH_ENTITY_TYPE_MON) {
+    // mon <-> mon connections need no Session, and setting one up
+    // creates an awkward ref cycle between Session and Connection.
+    return 1;
+  }
+
+  auto priv = con->get_priv();
+  MonSession *s = static_cast<MonSession*>(priv.get());
+  if (!s) {
+    // must be msgr2, otherwise dispatch would have set up the session.
+    s = session_map.new_session(
+      entity_name_t(con->get_peer_type(), -1),  // we don't know yet
+      con->get_peer_addrs(),
+      con);
+    assert(s);
+    dout(10) << __func__ << " adding session " << s << " to con " << con
+             << dendl;
+    con->set_priv(s);
+    logger->set(l_mon_num_sessions, session_map.get_size());
+    logger->inc(l_mon_session_add);
+  }
+  dout(10) << __func__ << " session " << s << " con " << con
+           << " addr " << s->con->get_peer_addr()
+           << " " << *s << dendl;
+
+  AuthCapsInfo &caps_info = con->get_peer_caps_info();
+  int ret = 0;
+  if (caps_info.allow_all) {
+    s->caps.set_allow_all();
+    s->authenticated = true;
+    ret = 1;
+  } else if (caps_info.caps.length()) {
+    bufferlist::const_iterator p = caps_info.caps.cbegin();
+    string str;
+    try {
+      decode(str, p);
+    } catch (const ceph::buffer::error &err) {
+      derr << __func__ << " corrupt cap data for " << con->get_peer_entity_name()
+           << " in auth db" << dendl;
+      str.clear();
+      ret = -EACCES;
+    }
+    if (ret >= 0) {
+      if (s->caps.parse(str, NULL)) {
+        s->authenticated = true;
+        ret = 1;
+      } else {
+        derr << __func__ << " unparseable caps '" << str << "' for "
+             << con->get_peer_entity_name() << dendl;
+        ret = -EACCES;
+      }
+    }
+  }
+
+  return ret;
+}
+
+void Monitor::set_mon_crush_location(const string& loc)
+{
+  if (loc.empty()) {
+    return;
+  }
+  vector<string> loc_vec;
+  loc_vec.push_back(loc);
+  CrushWrapper::parse_loc_map(loc_vec, &crush_loc);
+  need_set_crush_loc = true;
+}
+
+void Monitor::notify_new_monmap(bool can_change_external_state, bool remove_rank_elector)
+{
+  if (need_set_crush_loc) {
+    auto my_info_i = monmap->mon_info.find(name);
+    if (my_info_i != monmap->mon_info.end() &&
+        my_info_i->second.crush_loc == crush_loc) {
+      need_set_crush_loc = false;
+    }
+  }
+  elector.notify_strategy_maybe_changed(monmap->strategy);
+  if (remove_rank_elector) {
+    dout(10) << __func__ << " we have " << monmap->ranks.size() << " ranks" << dendl;
+    dout(10) << __func__ << " we have " << monmap->removed_ranks.size() << " removed ranks" << dendl;
+    for (auto i = monmap->removed_ranks.rbegin();
+         i != monmap->removed_ranks.rend(); ++i) {
+      int remove_rank = *i;
+      dout(10) << __func__ << " removing rank " << remove_rank << dendl;
+      if (rank == remove_rank) {
+        dout(5) << "We are removing our own rank; we were probably"
+                << " removed from the monmap before we shut down ... dropping." << dendl;
+        continue;
+      }
+      int new_rank = monmap->get_rank(messenger->get_myaddrs());
+      if (new_rank == -1) {
+        dout(5) << "We no longer exist in the monmap! ... dropping."
<< dendl; + continue; + } + elector.notify_rank_removed(remove_rank, new_rank); + } + } + + if (monmap->stretch_mode_enabled) { + try_engage_stretch_mode(); + } + + if (is_stretch_mode()) { + if (!monmap->stretch_marked_down_mons.empty()) { + set_degraded_stretch_mode(); + } + } + set_elector_disallowed_leaders(can_change_external_state); +} + +void Monitor::set_elector_disallowed_leaders(bool allow_election) +{ + set<int> dl; + for (auto name : monmap->disallowed_leaders) { + dl.insert(monmap->get_rank(name)); + } + if (is_stretch_mode()) { + for (auto name : monmap->stretch_marked_down_mons) { + dl.insert(monmap->get_rank(name)); + } + dl.insert(monmap->get_rank(monmap->tiebreaker_mon)); + } + + bool disallowed_changed = elector.set_disallowed_leaders(dl); + if (disallowed_changed && allow_election) { + elector.call_election(); + } +} + +struct CMonEnableStretchMode : public Context { + Monitor *m; + CMonEnableStretchMode(Monitor *mon) : m(mon) {} + void finish(int r) { + m->try_engage_stretch_mode(); + } +}; +void Monitor::try_engage_stretch_mode() +{ + dout(20) << __func__ << dendl; + if (stretch_mode_engaged) return; + if (!osdmon()->is_readable()) { + osdmon()->wait_for_readable_ctx(new CMonEnableStretchMode(this)); + } + if (osdmon()->osdmap.stretch_mode_enabled && + monmap->stretch_mode_enabled) { + dout(10) << "Engaging stretch mode!" << dendl; + stretch_mode_engaged = true; + int32_t stretch_divider_id = osdmon()->osdmap.stretch_mode_bucket; + stretch_bucket_divider = osdmon()->osdmap. + crush->get_type_name(stretch_divider_id); + disconnect_disallowed_stretch_sessions(); + } +} + +void Monitor::do_stretch_mode_election_work() +{ + dout(20) << __func__ << dendl; + if (!is_stretch_mode() || + !is_leader()) return; + dout(20) << "checking for degraded stretch mode" << dendl; + map<string, set<string>> old_dead_buckets; + old_dead_buckets.swap(dead_mon_buckets); + up_mon_buckets.clear(); + // identify if we've lost a CRUSH bucket, request OSDMonitor check for death + map<string,set<string>> down_mon_buckets; + for (unsigned i = 0; i < monmap->size(); ++i) { + const auto &mi = monmap->mon_info[monmap->get_name(i)]; + auto ci = mi.crush_loc.find(stretch_bucket_divider); + ceph_assert(ci != mi.crush_loc.end()); + if (quorum.count(i)) { + up_mon_buckets.insert(ci->second); + } else { + down_mon_buckets[ci->second].insert(mi.name); + } + } + dout(20) << "prior dead_mon_buckets: " << old_dead_buckets + << "; down_mon_buckets: " << down_mon_buckets + << "; up_mon_buckets: " << up_mon_buckets << dendl; + for (auto di : down_mon_buckets) { + if (!up_mon_buckets.count(di.first)) { + dead_mon_buckets[di.first] = di.second; + } + } + dout(20) << "new dead_mon_buckets " << dead_mon_buckets << dendl; + + if (dead_mon_buckets != old_dead_buckets && + dead_mon_buckets.size() >= old_dead_buckets.size()) { + maybe_go_degraded_stretch_mode(); + } +} + +struct CMonGoDegraded : public Context { + Monitor *m; + CMonGoDegraded(Monitor *mon) : m(mon) {} + void finish(int r) { + m->maybe_go_degraded_stretch_mode(); + } +}; + +struct CMonGoRecovery : public Context { + Monitor *m; + CMonGoRecovery(Monitor *mon) : m(mon) {} + void finish(int r) { + m->go_recovery_stretch_mode(); + } +}; +void Monitor::go_recovery_stretch_mode() +{ + dout(20) << __func__ << dendl; + if (!is_leader()) return; + if (!is_degraded_stretch_mode()) return; + if (is_recovering_stretch_mode()) return; + + if (dead_mon_buckets.size()) { + ceph_assert( 0 == "how did we try and do stretch recovery while we have dead monitor buckets?"); + 
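    // (ceph_assert(0 == "...") is the tree's abort-with-message idiom: the
+    //  comparison is always false, so hitting this line crashes the mon
+    //  with the quoted text; reaching it means the recovery state machine
+    //  disagrees with dead_mon_buckets.)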
+    // we can't recover if we are missing monitors in a zone!
+    return;
+  }
+
+  if (!osdmon()->is_readable()) {
+    osdmon()->wait_for_readable_ctx(new CMonGoRecovery(this));
+    return;
+  }
+
+  if (!osdmon()->is_writeable()) {
+    osdmon()->wait_for_writeable_ctx(new CMonGoRecovery(this));
+    return;
+  }
+  osdmon()->trigger_recovery_stretch_mode();
+}
+
+void Monitor::set_recovery_stretch_mode()
+{
+  degraded_stretch_mode = true;
+  recovering_stretch_mode = true;
+  osdmon()->set_recovery_stretch_mode();
+}
+
+void Monitor::maybe_go_degraded_stretch_mode()
+{
+  dout(20) << __func__ << dendl;
+  if (is_degraded_stretch_mode()) return;
+  if (!is_leader()) return;
+  if (dead_mon_buckets.empty()) return;
+  if (!osdmon()->is_readable()) {
+    osdmon()->wait_for_readable_ctx(new CMonGoDegraded(this));
+    return;
+  }
+  ceph_assert(monmap->contains(monmap->tiebreaker_mon));
+  // filter out the tiebreaker zone and check if remaining sites are down by OSDs too
+  const auto &mi = monmap->mon_info[monmap->tiebreaker_mon];
+  auto ci = mi.crush_loc.find(stretch_bucket_divider);
+  map<string, set<string>> filtered_dead_buckets = dead_mon_buckets;
+  filtered_dead_buckets.erase(ci->second);
+
+  set<int> matched_down_buckets;
+  set<string> matched_down_mons;
+  bool dead = osdmon()->check_for_dead_crush_zones(filtered_dead_buckets,
+                                                   &matched_down_buckets,
+                                                   &matched_down_mons);
+  if (dead) {
+    if (!osdmon()->is_writeable()) {
+      osdmon()->wait_for_writeable_ctx(new CMonGoDegraded(this));
+      return;
+    }
+    if (!monmon()->is_writeable()) {
+      monmon()->wait_for_writeable_ctx(new CMonGoDegraded(this));
+      return;
+    }
+    trigger_degraded_stretch_mode(matched_down_mons, matched_down_buckets);
+  }
+}
+
+void Monitor::trigger_degraded_stretch_mode(const set<string>& dead_mons,
+                                            const set<int>& dead_buckets)
+{
+  dout(20) << __func__ << dendl;
+  ceph_assert(osdmon()->is_writeable());
+  ceph_assert(monmon()->is_writeable());
+
+  // figure out which OSD zone(s) remains alive by removing
+  // tiebreaker mon from up_mon_buckets
+  set<string> live_zones = up_mon_buckets;
+  ceph_assert(monmap->contains(monmap->tiebreaker_mon));
+  const auto &mi = monmap->mon_info[monmap->tiebreaker_mon];
+  auto ci = mi.crush_loc.find(stretch_bucket_divider);
+  live_zones.erase(ci->second);
+  ceph_assert(live_zones.size() == 1); // only support 2 zones right now
+
+  osdmon()->trigger_degraded_stretch_mode(dead_buckets, live_zones);
+  monmon()->trigger_degraded_stretch_mode(dead_mons);
+  set_degraded_stretch_mode();
+}
+
+void Monitor::set_degraded_stretch_mode()
+{
+  degraded_stretch_mode = true;
+  recovering_stretch_mode = false;
+  osdmon()->set_degraded_stretch_mode();
+}
+
+struct CMonGoHealthy : public Context {
+  Monitor *m;
+  CMonGoHealthy(Monitor *mon) : m(mon) {}
+  void finish(int r) {
+    m->trigger_healthy_stretch_mode();
+  }
+};
+
+void Monitor::trigger_healthy_stretch_mode()
+{
+  dout(20) << __func__ << dendl;
+  if (!is_degraded_stretch_mode()) return;
+  if (!is_leader()) return;
+  if (!osdmon()->is_writeable()) {
+    osdmon()->wait_for_writeable_ctx(new CMonGoHealthy(this));
+    return;
+  }
+  if (!monmon()->is_writeable()) {
+    monmon()->wait_for_writeable_ctx(new CMonGoHealthy(this));
+    return;
+  }
+
+  ceph_assert(osdmon()->osdmap.recovering_stretch_mode);
+  osdmon()->trigger_healthy_stretch_mode();
+  monmon()->trigger_healthy_stretch_mode();
+}
+
+void Monitor::set_healthy_stretch_mode()
+{
+  degraded_stretch_mode = false;
+  recovering_stretch_mode = false;
+  osdmon()->set_healthy_stretch_mode();
+}
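+
+// Illustrative topology for the stretch-mode handlers above and below
+// (assumed example, not from this file): two data sites plus a tiebreaker,
+// split on a CRUSH bucket type such as "datacenter":
+//
+//   mon.a, mon.b -> datacenter=site1    mon.c, mon.d -> datacenter=site2
+//   mon.e        -> tiebreaker (its bucket is filtered out of the
+//                   dead-bucket checks)
+//
+// Losing site2's mons and OSDs drives trigger_degraded_stretch_mode();
+// their return drives go_recovery_stretch_mode(), and a clean recovery
+// ends in trigger_healthy_stretch_mode().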
+bool Monitor::session_stretch_allowed(MonSession *s, MonOpRequestRef& op)
+{
+  if (!is_stretch_mode()) return true;
+  if (s->proxy_con) return true;
+  if (s->validated_stretch_connection) return true;
+  if (!s->con) return true;
+  if (s->con->peer_is_osd()) {
+    dout(20) << __func__ << " checking OSD session " << s << dendl;
+    // okay, check the crush location
+    int barrier_id;
+    int retval = osdmon()->osdmap.crush->get_validated_type_id(stretch_bucket_divider,
+                                                               &barrier_id);
+    ceph_assert(retval >= 0);
+    int osd_bucket_id = osdmon()->osdmap.crush->get_parent_of_type(s->con->peer_id,
+                                                                   barrier_id);
+    const auto &mi = monmap->mon_info.find(name);
+    ceph_assert(mi != monmap->mon_info.end());
+    auto ci = mi->second.crush_loc.find(stretch_bucket_divider);
+    ceph_assert(ci != mi->second.crush_loc.end());
+    int mon_bucket_id = osdmon()->osdmap.crush->get_item_id(ci->second);
+
+    if (osd_bucket_id != mon_bucket_id) {
+      dout(5) << "discarding session " << *s
+              << " and sending OSD to matched zone" << dendl;
+      s->con->mark_down();
+      std::lock_guard l(session_map_lock);
+      remove_session(s);
+      if (op) {
+        op->mark_zap();
+      }
+      return false;
+    }
+  }
+
+  s->validated_stretch_connection = true;
+  return true;
+}
+
+void Monitor::disconnect_disallowed_stretch_sessions()
+{
+  dout(20) << __func__ << dendl;
+  MonOpRequestRef blank;
+  auto i = session_map.sessions.begin();
+  while (i != session_map.sessions.end()) {
+    auto j = i;
+    ++i;
+    session_stretch_allowed(*j, blank);
+  }
+}
diff --git a/src/mon/Monitor.h b/src/mon/Monitor.h
new file mode 100644
index 000000000..1093649bb
--- /dev/null
+++ b/src/mon/Monitor.h
@@ -0,0 +1,1148 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+/*
+ * This is the top level monitor. It runs on each machine in the Monitor
+ * Cluster. The election of a leader for the paxos algorithm only happens
+ * once per machine via the elector. A single Paxos instance (state) is
+ * shared by the system's components, each of which runs as a PaxosService
+ * layered on top of it: the Object Store Device (OSD) Monitor, the
+ * Metadata Server (MDS) Monitor, the Monitor-map Monitor, and so on.
+ */ + +#ifndef CEPH_MONITOR_H +#define CEPH_MONITOR_H + +#include <errno.h> +#include <cmath> +#include <string> +#include <array> + +#include "include/types.h" +#include "include/health.h" +#include "msg/Messenger.h" + +#include "common/Timer.h" + +#include "health_check.h" +#include "MonMap.h" +#include "Elector.h" +#include "Paxos.h" +#include "Session.h" +#include "MonCommand.h" + + +#include "common/config_obs.h" +#include "common/LogClient.h" +#include "auth/AuthClient.h" +#include "auth/AuthServer.h" +#include "auth/cephx/CephxKeyServer.h" +#include "auth/AuthMethodList.h" +#include "auth/KeyRing.h" +#include "include/common_fwd.h" +#include "messages/MMonCommand.h" +#include "mon/MonitorDBStore.h" +#include "mgr/MgrClient.h" + +#include "mon/MonOpRequest.h" +#include "common/WorkQueue.h" + +using namespace TOPNSPC::common; + +#define CEPH_MON_PROTOCOL 13 /* cluster internal */ + + +enum { + l_cluster_first = 555000, + l_cluster_num_mon, + l_cluster_num_mon_quorum, + l_cluster_num_osd, + l_cluster_num_osd_up, + l_cluster_num_osd_in, + l_cluster_osd_epoch, + l_cluster_osd_bytes, + l_cluster_osd_bytes_used, + l_cluster_osd_bytes_avail, + l_cluster_num_pool, + l_cluster_num_pg, + l_cluster_num_pg_active_clean, + l_cluster_num_pg_active, + l_cluster_num_pg_peering, + l_cluster_num_object, + l_cluster_num_object_degraded, + l_cluster_num_object_misplaced, + l_cluster_num_object_unfound, + l_cluster_num_bytes, + l_cluster_last, +}; + +enum { + l_mon_first = 456000, + l_mon_num_sessions, + l_mon_session_add, + l_mon_session_rm, + l_mon_session_trim, + l_mon_num_elections, + l_mon_election_call, + l_mon_election_win, + l_mon_election_lose, + l_mon_last, +}; + +class PaxosService; + +class AdminSocketHook; + +#define COMPAT_SET_LOC "feature_set" + +class Monitor : public Dispatcher, + public AuthClient, + public AuthServer, + public md_config_obs_t { +public: + int orig_argc = 0; + const char **orig_argv = nullptr; + + // me + std::string name; + int rank; + Messenger *messenger; + ConnectionRef con_self; + ceph::mutex lock = ceph::make_mutex("Monitor::lock"); + SafeTimer timer; + Finisher finisher; + ThreadPool cpu_tp; ///< threadpool for CPU intensive work + + ceph::mutex auth_lock = ceph::make_mutex("Monitor::auth_lock"); + + /// true if we have ever joined a quorum. if false, we are either a + /// new cluster, a newly joining monitor, or a just-upgraded + /// monitor. 
+ bool has_ever_joined; + + PerfCounters *logger, *cluster_logger; + bool cluster_logger_registered; + + void register_cluster_logger(); + void unregister_cluster_logger(); + + MonMap *monmap; + uuid_d fingerprint; + + std::set<entity_addrvec_t> extra_probe_peers; + + LogClient log_client; + LogChannelRef clog; + LogChannelRef audit_clog; + KeyRing keyring; + KeyServer key_server; + + AuthMethodList auth_cluster_required; + AuthMethodList auth_service_required; + + CompatSet features; + + std::vector<MonCommand> leader_mon_commands; // quorum leader's commands + std::vector<MonCommand> local_mon_commands; // commands i support + ceph::buffer::list local_mon_commands_bl; // encoded version of above + + std::vector<MonCommand> prenautilus_local_mon_commands; + ceph::buffer::list prenautilus_local_mon_commands_bl; + + Messenger *mgr_messenger; + MgrClient mgr_client; + uint64_t mgr_proxy_bytes = 0; // in-flight proxied mgr command message bytes + std::string gss_ktfile_client{}; + +private: + void new_tick(); + + // -- local storage -- +public: + MonitorDBStore *store; + static const std::string MONITOR_NAME; + static const std::string MONITOR_STORE_PREFIX; + + // -- monitor state -- +private: + enum { + STATE_INIT = 1, + STATE_PROBING, + STATE_SYNCHRONIZING, + STATE_ELECTING, + STATE_LEADER, + STATE_PEON, + STATE_SHUTDOWN + }; + int state = STATE_INIT; + +public: + static const char *get_state_name(int s) { + switch (s) { + case STATE_PROBING: return "probing"; + case STATE_SYNCHRONIZING: return "synchronizing"; + case STATE_ELECTING: return "electing"; + case STATE_LEADER: return "leader"; + case STATE_PEON: return "peon"; + case STATE_SHUTDOWN: return "shutdown"; + default: return "???"; + } + } + const char *get_state_name() const { + return get_state_name(state); + } + + bool is_init() const { return state == STATE_INIT; } + bool is_shutdown() const { return state == STATE_SHUTDOWN; } + bool is_probing() const { return state == STATE_PROBING; } + bool is_synchronizing() const { return state == STATE_SYNCHRONIZING; } + bool is_electing() const { return state == STATE_ELECTING; } + bool is_leader() const { return state == STATE_LEADER; } + bool is_peon() const { return state == STATE_PEON; } + + const utime_t &get_leader_since() const; + + void prepare_new_fingerprint(MonitorDBStore::TransactionRef t); + + std::vector<DaemonHealthMetric> get_health_metrics(); + + int quorum_age() const { + auto age = std::chrono::duration_cast<std::chrono::seconds>( + ceph::mono_clock::now() - quorum_since); + return age.count(); + } + + bool is_mon_down() const { + int max = monmap->size(); + int actual = get_quorum().size(); + auto now = ceph::real_clock::now(); + return actual < max && now > monmap->created.to_real_time(); + } + + // -- elector -- +private: + std::unique_ptr<Paxos> paxos; + Elector elector; + friend class Elector; + + /// features we require of peers (based on on-disk compatset) + uint64_t required_features; + + int leader; // current leader (to best of knowledge) + std::set<int> quorum; // current active set of monitors (if !starting) + ceph::mono_clock::time_point quorum_since; // when quorum formed + utime_t leader_since; // when this monitor became the leader, if it is the leader + utime_t exited_quorum; // time detected as not in quorum; 0 if in + + // map of counts of connected clients, by type and features, for + // each quorum mon + std::map<int,FeatureMap> quorum_feature_map; + + /** + * Intersection of quorum member's connection feature bits. 
+   */
+  uint64_t quorum_con_features;
+  /**
+   * Intersection of quorum members' mon-specific feature bits
+   */
+  mon_feature_t quorum_mon_features;
+
+  ceph_release_t quorum_min_mon_release{ceph_release_t::unknown};
+
+  std::set<std::string> outside_quorum;
+
+  bool stretch_mode_engaged{false};
+  bool degraded_stretch_mode{false};
+  bool recovering_stretch_mode{false};
+  string stretch_bucket_divider;
+  map<string, set<string>> dead_mon_buckets; // bucket->mon ranks, locations with no live mons
+  set<string> up_mon_buckets; // locations with a live mon
+  void do_stretch_mode_election_work();
+
+  bool session_stretch_allowed(MonSession *s, MonOpRequestRef& op);
+  void disconnect_disallowed_stretch_sessions();
+  void set_elector_disallowed_leaders(bool allow_election);
+
+  map <string,string> crush_loc;
+  bool need_set_crush_loc{false};
+public:
+  bool is_stretch_mode() { return stretch_mode_engaged; }
+  bool is_degraded_stretch_mode() { return degraded_stretch_mode; }
+  bool is_recovering_stretch_mode() { return recovering_stretch_mode; }
+
+  /**
+   * This set of functions maintains the in-memory stretch state
+   * and sets up transitions of the map states by calling in to
+   * MonmapMonitor and OSDMonitor.
+   *
+   * The [maybe_]go_* functions are called on the leader to
+   * decide if transitions should happen; the trigger_* functions
+   * set up the map transitions; and the set_* functions actually
+   * change the memory state -- but these are only called
+   * via OSDMonitor::update_from_paxos, to guarantee consistent
+   * updates across the entire cluster.
+   */
+  void try_engage_stretch_mode();
+  void maybe_go_degraded_stretch_mode();
+  void trigger_degraded_stretch_mode(const set<string>& dead_mons,
+                                     const set<int>& dead_buckets);
+  void set_degraded_stretch_mode();
+  void go_recovery_stretch_mode();
+  void set_recovery_stretch_mode();
+  void trigger_healthy_stretch_mode();
+  void set_healthy_stretch_mode();
+  void enable_stretch_mode();
+  void set_mon_crush_location(const string& loc);
+
+
+private:
+
+  /**
+   * @defgroup Monitor_h_scrub
+   * @{
+   */
+  version_t scrub_version;                ///< paxos version we are scrubbing
+  std::map<int,ScrubResult> scrub_result; ///< results so far
+
+  /**
+   * trigger a cross-mon scrub
+   *
+   * Verify all mons are storing identical content
+   */
+  int scrub_start();
+  int scrub();
+  void handle_scrub(MonOpRequestRef op);
+  bool _scrub(ScrubResult *r,
+              std::pair<std::string,std::string> *start,
+              int *num_keys);
+  void scrub_check_results();
+  void scrub_timeout();
+  void scrub_finish();
+  void scrub_reset();
+  void scrub_update_interval(ceph::timespan interval);
+
+  Context *scrub_event;          ///< periodic event to trigger scrub (leader)
+  Context *scrub_timeout_event;  ///< scrub round timeout (leader)
+  void scrub_event_start();
+  void scrub_event_cancel();
+  void scrub_reset_timeout();
+  void scrub_cancel_timeout();
+
+  struct ScrubState {
+    std::pair<std::string,std::string> last_key; ///< last scrubbed key
+    bool finished;
+
+    ScrubState() : finished(false) { }
+    virtual ~ScrubState() { }
+  };
+  std::shared_ptr<ScrubState> scrub_state; ///< keeps track of current scrub
+
+  /**
+   * @defgroup Monitor_h_sync Synchronization
+   * @{
+   */
+  /**
+   * @} // provider state
+   */
+  struct SyncProvider {
+    entity_addrvec_t addrs;
+    uint64_t cookie;          ///< unique cookie for this sync attempt
+    utime_t timeout;          ///< when we give up and expire this attempt
+    version_t last_committed; ///< last paxos version on peer
+    std::pair<std::string,std::string> last_key; ///< last key sent to (or on) peer
+    bool full;                ///< full scan?
+    MonitorDBStore::Synchronizer synchronizer;  ///< iterator
+
+    SyncProvider() : cookie(0), last_committed(0), full(false) {}
+
+    void reset_timeout(CephContext *cct, int grace) {
+      timeout = ceph_clock_now();
+      timeout += grace;
+    }
+  };
+
+  std::map<std::uint64_t, SyncProvider> sync_providers;  ///< cookie -> SyncProvider for those syncing from us
+  uint64_t sync_provider_count;  ///< counter for issued cookies to keep them unique
+
+  /**
+   * @} // requester state
+   */
+  entity_addrvec_t sync_provider;  ///< who we are syncing from
+  uint64_t sync_cookie;            ///< 0 if we are starting, non-zero otherwise
+  bool sync_full;                  ///< true if we are a full sync, false for recent catch-up
+  version_t sync_start_version;    ///< last_committed at sync start
+  Context *sync_timeout_event;     ///< timeout event
+
+  /**
+   * floor for sync source
+   *
+   * When we sync we forget about our old last_committed value which
+   * can be dangerous.  For example, if we have a cluster of:
+   *
+   *   mon.a: lc 100
+   *   mon.b: lc 80
+   *   mon.c: lc 100 (us)
+   *
+   * If something forces us to sync (say, corruption, manual
+   * intervention, or a bug), we forget last_committed, and might abort.
+   * If mon.a happens to be down when we come back, we will see:
+   *
+   *   mon.b: lc 80
+   *   mon.c: lc 0 (us)
+   *
+   * and sync from mon.b, at which point a+b will both have lc 80 and
+   * come online with a majority holding out of date commits.
+   *
+   * Avoid this by preserving our old last_committed value prior to
+   * sync and never going backwards.
+   */
+  version_t sync_last_committed_floor;
+
+  /**
+   * Obtain the synchronization target prefixes in set form.
+   *
+   * We consider a target prefix all those that are relevant when
+   * synchronizing two stores.  That is, all those that hold paxos service
+   * versions, as well as paxos versions, or any control keys such as the
+   * first or last committed version.
+   *
+   * Given the current design, this function should return the name of each
+   * available paxos service, plus the paxos name.
+   *
+   * @returns a set of strings referring to the prefixes being synchronized
+   */
+  std::set<std::string> get_sync_targets_names();
+
+  /**
+   * Reset the monitor's sync-related data structures for syncing *from* a peer
+   */
+  void sync_reset_requester();
+
+  /**
+   * Reset sync state related to allowing others to sync from us
+   */
+  void sync_reset_provider();
+
+  /**
+   * Called when a sync attempt times out (requester-side)
+   */
+  void sync_timeout();
+
+  /**
+   * Get the latest monmap for backup purposes during sync
+   */
+  void sync_obtain_latest_monmap(ceph::buffer::list &bl);
+
+  /**
+   * Start sync process
+   *
+   * Start pulling committed state from another monitor.
+   *
+   * @param addrs where to pull committed state from
+   * @param full whether to do a full sync or just catch up on recent paxos
+   */
+  void sync_start(entity_addrvec_t &addrs, bool full);
+
+public:
+  /**
+   * force a sync on next mon restart
+   */
+  void sync_force(ceph::Formatter *f);
+
+private:
+  /**
+   * store critical state for safekeeping during sync
+   *
+   * We store a few things on the side that we don't want to get
+   * clobbered by sync.  This includes the latest monmap and a lower
+   * bound on last_committed.
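+   *
+   * A rough sketch of the idea, using the Transaction API from
+   * MonitorDBStore.h (prefix and key names here are illustrative only,
+   * not necessarily the ones the implementation uses):
+   *
+   *   tx->put("mon_sync", "latest_monmap", monmap_bl);
+   *   tx->put("mon_sync", "last_committed_floor", sync_last_committed_floor);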
+   */
+  void sync_stash_critical_state(MonitorDBStore::TransactionRef tx);
+
+  /**
+   * reset the sync timeout
+   *
+   * This is used on the client to restart if things aren't progressing
+   */
+  void sync_reset_timeout();
+
+  /**
+   * trim stale sync provider state
+   *
+   * If someone is syncing from us and hasn't talked to us recently, expire their state.
+   */
+  void sync_trim_providers();
+
+  /**
+   * Complete a sync
+   *
+   * Finish up a sync after we've gotten all of the chunks.
+   *
+   * @param last_committed final last_committed value from provider
+   */
+  void sync_finish(version_t last_committed);
+
+  /**
+   * request the next chunk from the provider
+   */
+  void sync_get_next_chunk();
+
+  /**
+   * handle sync message
+   *
+   * @param op Sync message with operation type MMonSync::OP_START_CHUNKS
+   */
+  void handle_sync(MonOpRequestRef op);
+
+  void _sync_reply_no_cookie(MonOpRequestRef op);
+
+  void handle_sync_get_cookie(MonOpRequestRef op);
+  void handle_sync_get_chunk(MonOpRequestRef op);
+  void handle_sync_finish(MonOpRequestRef op);
+
+  void handle_sync_cookie(MonOpRequestRef op);
+  void handle_sync_forward(MonOpRequestRef op);
+  void handle_sync_chunk(MonOpRequestRef op);
+  void handle_sync_no_cookie(MonOpRequestRef op);
+
+  /**
+   * @} // Synchronization
+   */
+
+  std::list<Context*> waitfor_quorum;
+  std::list<Context*> maybe_wait_for_quorum;
+
+  /**
+   * @defgroup Monitor_h_TimeCheck Monitor Clock Drift Early Warning System
+   * @{
+   *
+   * We use time checks to keep track of any clock drifting going on in the
+   * cluster. This is accomplished by periodically pinging each monitor in
+   * the quorum and registering its response time on a map, assessing how
+   * much its clock has drifted. We also take this opportunity to assess
+   * the latency on response.
+   *
+   * This mechanism works as follows:
+   *
+   *  - Leader sends out a 'PING' message to each other monitor in the quorum.
+   *    The message is timestamped with the leader's current time. The leader's
+   *    current time is recorded in a map, associated with each peon's
+   *    instance.
+   *  - The peon replies to the leader with a timestamped 'PONG' message.
+   *  - The leader calculates a delta between the peon's timestamp and its
+   *    current time and stashes it.
+   *  - The leader also calculates the time it took to receive the 'PONG'
+   *    since the 'PING' was sent, and stashes an approximate latency estimate.
+   *  - Once all the quorum members have pong'ed, the leader will share the
+   *    clock skew and latency maps with all the monitors in the quorum.
+   */
+  std::map<int, utime_t> timecheck_waiting;
+  std::map<int, double> timecheck_skews;
+  std::map<int, double> timecheck_latencies;
+  // odd value means we are mid-round; even value means the round has
+  // finished.
+  version_t timecheck_round;
+  unsigned int timecheck_acks;
+  utime_t timecheck_round_start;
+  friend class HealthMonitor;
+  /* When we hit a skew we will start a new round based off of
+   * 'mon_timecheck_skew_interval'. Each new round will be backed off
+   * until we hit 'mon_timecheck_interval' -- which is the typical
+   * interval when not in the presence of a skew.
+   *
+   * This variable tracks the number of rounds with skews since last clean
+   * so that we can report to the user and properly adjust the backoff.
+   */
+  uint64_t timecheck_rounds_since_clean;
+  /**
+   * Time Check event.
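+   *
+   * (For reference, the per-peon round math described above amounts to,
+   * in sketch form with illustrative names:
+   *
+   *   skew    = peon_ts - leader_now       // clock delta, may be negative
+   *   latency = leader_now - ping_sent_ts  // round-trip estimate
+   *
+   * where ping_sent_ts is the value stashed in timecheck_waiting.)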
+ */ + Context *timecheck_event; + + void timecheck_start(); + void timecheck_finish(); + void timecheck_start_round(); + void timecheck_finish_round(bool success = true); + void timecheck_cancel_round(); + void timecheck_cleanup(); + void timecheck_reset_event(); + void timecheck_check_skews(); + void timecheck_report(); + void timecheck(); + health_status_t timecheck_status(std::ostringstream &ss, + const double skew_bound, + const double latency); + void handle_timecheck_leader(MonOpRequestRef op); + void handle_timecheck_peon(MonOpRequestRef op); + void handle_timecheck(MonOpRequestRef op); + + /** + * Returns 'true' if this is considered to be a skew; 'false' otherwise. + */ + bool timecheck_has_skew(const double skew_bound, double *abs) const { + double abs_skew = std::fabs(skew_bound); + if (abs) + *abs = abs_skew; + return (abs_skew > g_conf()->mon_clock_drift_allowed); + } + + /** + * @} + */ + /** + * Handle ping messages from others. + */ + void handle_ping(MonOpRequestRef op); + + Context *probe_timeout_event = nullptr; // for probing + + void reset_probe_timeout(); + void cancel_probe_timeout(); + void probe_timeout(int r); + + void _apply_compatset_features(CompatSet &new_features); + +public: + epoch_t get_epoch(); + int get_leader() const { return leader; } + std::string get_leader_name() { + return quorum.empty() ? std::string() : monmap->get_name(leader); + } + const std::set<int>& get_quorum() const { return quorum; } + std::list<std::string> get_quorum_names() { + std::list<std::string> q; + for (auto p = quorum.begin(); p != quorum.end(); ++p) + q.push_back(monmap->get_name(*p)); + return q; + } + uint64_t get_quorum_con_features() const { + return quorum_con_features; + } + mon_feature_t get_quorum_mon_features() const { + return quorum_mon_features; + } + uint64_t get_required_features() const { + return required_features; + } + mon_feature_t get_required_mon_features() const { + return monmap->get_required_features(); + } + void apply_quorum_to_compatset_features(); + void apply_monmap_to_compatset_features(); + void calc_quorum_requirements(); + + void get_combined_feature_map(FeatureMap *fm); + +private: + void _reset(); ///< called from bootstrap, start_, or join_election + void wait_for_paxos_write(); + void _finish_svc_election(); ///< called by {win,lose}_election + void respawn(); +public: + void bootstrap(); + void join_election(); + void start_election(); + void win_standalone_election(); + // end election (called by Elector) + void win_election(epoch_t epoch, const std::set<int>& q, + uint64_t features, + const mon_feature_t& mon_features, + ceph_release_t min_mon_release, + const std::map<int,Metadata>& metadata); + void lose_election(epoch_t epoch, std::set<int>& q, int l, + uint64_t features, + const mon_feature_t& mon_features, + ceph_release_t min_mon_release); + // end election (called by Elector) + void finish_election(); + + void update_logger(); + + /** + * Vector holding the Services serviced by this Monitor. 
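+   *
+   * Indexed by the PAXOS_* constants.  Callers should go through the
+   * typed accessors below rather than the array itself, e.g. (sketch)
+   * osdmon()->is_writeable().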
+   */
+  std::array<std::unique_ptr<PaxosService>, PAXOS_NUM> paxos_service;
+
+  class MDSMonitor *mdsmon() {
+    return (class MDSMonitor *)paxos_service[PAXOS_MDSMAP].get();
+  }
+
+  class MonmapMonitor *monmon() {
+    return (class MonmapMonitor *)paxos_service[PAXOS_MONMAP].get();
+  }
+
+  class OSDMonitor *osdmon() {
+    return (class OSDMonitor *)paxos_service[PAXOS_OSDMAP].get();
+  }
+
+  class AuthMonitor *authmon() {
+    return (class AuthMonitor *)paxos_service[PAXOS_AUTH].get();
+  }
+
+  class LogMonitor *logmon() {
+    return (class LogMonitor*) paxos_service[PAXOS_LOG].get();
+  }
+
+  class MgrMonitor *mgrmon() {
+    return (class MgrMonitor*) paxos_service[PAXOS_MGR].get();
+  }
+
+  class MgrStatMonitor *mgrstatmon() {
+    return (class MgrStatMonitor*) paxos_service[PAXOS_MGRSTAT].get();
+  }
+
+  class HealthMonitor *healthmon() {
+    return (class HealthMonitor*) paxos_service[PAXOS_HEALTH].get();
+  }
+
+  class ConfigMonitor *configmon() {
+    return (class ConfigMonitor*) paxos_service[PAXOS_CONFIG].get();
+  }
+
+  class KVMonitor *kvmon() {
+    return (class KVMonitor*) paxos_service[PAXOS_KV].get();
+  }
+
+  friend class Paxos;
+  friend class OSDMonitor;
+  friend class MDSMonitor;
+  friend class MonmapMonitor;
+  friend class LogMonitor;
+  friend class KVMonitor;
+
+  // -- sessions --
+  MonSessionMap session_map;
+  ceph::mutex session_map_lock = ceph::make_mutex("Monitor::session_map_lock");
+  AdminSocketHook *admin_hook;
+
+  template<typename Func, typename...Args>
+  void with_session_map(Func&& func) {
+    std::lock_guard l(session_map_lock);
+    std::forward<Func>(func)(session_map);
+  }
+  void send_latest_monmap(Connection *con);
+
+  // messages
+  void handle_get_version(MonOpRequestRef op);
+  void handle_subscribe(MonOpRequestRef op);
+  void handle_mon_get_map(MonOpRequestRef op);
+
+  static void _generate_command_map(cmdmap_t& cmdmap,
+                                    std::map<std::string,std::string> &param_str_map);
+  static const MonCommand *_get_moncommand(
+    const std::string &cmd_prefix,
+    const std::vector<MonCommand>& cmds);
+  bool _allowed_command(MonSession *s, const std::string& module,
+                        const std::string& prefix,
+                        const cmdmap_t& cmdmap,
+                        const std::map<std::string,std::string>& param_str_map,
+                        const MonCommand *this_cmd);
+  void get_mon_status(ceph::Formatter *f);
+  void _quorum_status(ceph::Formatter *f, std::ostream& ss);
+  bool _add_bootstrap_peer_hint(std::string_view cmd, const cmdmap_t& cmdmap,
+                                std::ostream& ss);
+  void handle_tell_command(MonOpRequestRef op);
+  void handle_command(MonOpRequestRef op);
+  void handle_route(MonOpRequestRef op);
+
+  int get_mon_metadata(int mon, ceph::Formatter *f, std::ostream& err);
+  int print_nodes(ceph::Formatter *f, std::ostream& err);
+
+  // track metadata reported by win_election()
+  std::map<int, Metadata> mon_metadata;
+  std::map<int, Metadata> pending_metadata;
+
+  /**
+   * Cached health status, used to track the last state we reported.
+   */
+  struct health_cache_t {
+    health_status_t overall;
+    std::string summary;
+
+    void reset() {
+      // health_status_t doesn't really have a NONE value and we're not
+      // okay with setting something else (say, HEALTH_ERR). so just
+      // leave it be.
+ summary.clear(); + } + } health_status_cache; + + Context *health_tick_event = nullptr; + Context *health_interval_event = nullptr; + + void health_tick_start(); + void health_tick_stop(); + ceph::real_clock::time_point health_interval_calc_next_update(); + void health_interval_start(); + void health_interval_stop(); + void health_events_cleanup(); + + void health_to_clog_update_conf(const std::set<std::string> &changed); + + void do_health_to_clog_interval(); + void do_health_to_clog(bool force = false); + + void log_health( + const health_check_map_t& updated, + const health_check_map_t& previous, + MonitorDBStore::TransactionRef t); + + void update_pending_metadata(); + +protected: + + class HealthCheckLogStatus { + public: + health_status_t severity; + std::string last_message; + utime_t updated_at = 0; + HealthCheckLogStatus(health_status_t severity_, + const std::string &last_message_, + utime_t updated_at_) + : severity(severity_), + last_message(last_message_), + updated_at(updated_at_) + {} + }; + std::map<std::string, HealthCheckLogStatus> health_check_log_times; + +public: + + void get_cluster_status(std::stringstream &ss, ceph::Formatter *f, + MonSession *session); + + void reply_command(MonOpRequestRef op, int rc, const std::string &rs, version_t version); + void reply_command(MonOpRequestRef op, int rc, const std::string &rs, ceph::buffer::list& rdata, version_t version); + + void reply_tell_command(MonOpRequestRef op, int rc, const std::string &rs); + + + + void handle_probe(MonOpRequestRef op); + /** + * Handle a Probe Operation, replying with our name, quorum and known versions. + * + * We use the MMonProbe message class for anything and everything related with + * Monitor probing. One of the operations relates directly with the probing + * itself, in which we receive a probe request and to which we reply with + * our name, our quorum and the known versions for each Paxos service. Thus the + * redundant function name. This reply will obviously be sent to the one + * probing/requesting these infos. + * + * @todo Add @pre and @post + * + * @param m A Probe message, with an operation of type Probe. + */ + void handle_probe_probe(MonOpRequestRef op); + void handle_probe_reply(MonOpRequestRef op); + + // request routing + struct RoutedRequest { + uint64_t tid; + ceph::buffer::list request_bl; + MonSession *session; + ConnectionRef con; + uint64_t con_features; + MonOpRequestRef op; + + RoutedRequest() : tid(0), session(NULL), con_features(0) {} + ~RoutedRequest() { + if (session) + session->put(); + } + }; + uint64_t routed_request_tid; + std::map<uint64_t, RoutedRequest*> routed_requests; + + void forward_request_leader(MonOpRequestRef op); + void handle_forward(MonOpRequestRef op); + void send_reply(MonOpRequestRef op, Message *reply); + void no_reply(MonOpRequestRef op); + void resend_routed_requests(); + void remove_session(MonSession *s); + void remove_all_sessions(); + void waitlist_or_zap_client(MonOpRequestRef op); + + void send_mon_message(Message *m, int rank); + /** can_change_external_state if we can do things like + * call elections as a result of the new map. 
+ */ + void notify_new_monmap(bool can_change_external_state=false, bool remove_rank_elector=true); + +public: + struct C_Command : public C_MonOp { + Monitor &mon; + int rc; + std::string rs; + ceph::buffer::list rdata; + version_t version; + C_Command(Monitor &_mm, MonOpRequestRef _op, int r, std::string s, version_t v) : + C_MonOp(_op), mon(_mm), rc(r), rs(s), version(v){} + C_Command(Monitor &_mm, MonOpRequestRef _op, int r, std::string s, ceph::buffer::list rd, version_t v) : + C_MonOp(_op), mon(_mm), rc(r), rs(s), rdata(rd), version(v){} + + void _finish(int r) override { + auto m = op->get_req<MMonCommand>(); + if (r >= 0) { + std::ostringstream ss; + if (!op->get_req()->get_connection()) { + ss << "connection dropped for command "; + } else { + MonSession *s = op->get_session(); + + // if client drops we may not have a session to draw information from. + if (s) { + ss << "from='" << s->name << " " << s->addrs << "' " + << "entity='" << s->entity_name << "' "; + } else { + ss << "session dropped for command "; + } + } + cmdmap_t cmdmap; + std::ostringstream ds; + string prefix; + cmdmap_from_json(m->cmd, &cmdmap, ds); + cmd_getval(cmdmap, "prefix", prefix); + if (prefix != "config set" && prefix != "config-key set") + ss << "cmd='" << m->cmd << "': finished"; + + mon.audit_clog->info() << ss.str(); + mon.reply_command(op, rc, rs, rdata, version); + } + else if (r == -ECANCELED) + return; + else if (r == -EAGAIN) + mon.dispatch_op(op); + else + ceph_abort_msg("bad C_Command return value"); + } + }; + + private: + class C_RetryMessage : public C_MonOp { + Monitor *mon; + public: + C_RetryMessage(Monitor *m, MonOpRequestRef op) : + C_MonOp(op), mon(m) { } + + void _finish(int r) override { + if (r == -EAGAIN || r >= 0) + mon->dispatch_op(op); + else if (r == -ECANCELED) + return; + else + ceph_abort_msg("bad C_RetryMessage return value"); + } + }; + + //ms_dispatch handles a lot of logic and we want to reuse it + //on forwarded messages, so we create a non-locking version for this class + void _ms_dispatch(Message *m); + bool ms_dispatch(Message *m) override { + std::lock_guard l{lock}; + _ms_dispatch(m); + return true; + } + void dispatch_op(MonOpRequestRef op); + //mon_caps is used for un-connected messages from monitors + MonCap mon_caps; + bool get_authorizer(int dest_type, AuthAuthorizer **authorizer); +public: // for AuthMonitor msgr1: + int ms_handle_authentication(Connection *con) override; +private: + void ms_handle_accept(Connection *con) override; + bool ms_handle_reset(Connection *con) override; + void ms_handle_remote_reset(Connection *con) override {} + bool ms_handle_refused(Connection *con) override; + + // AuthClient + int get_auth_request( + Connection *con, + AuthConnectionMeta *auth_meta, + uint32_t *method, + std::vector<uint32_t> *preferred_modes, + ceph::buffer::list *out) override; + int handle_auth_reply_more( + Connection *con, + AuthConnectionMeta *auth_meta, + const ceph::buffer::list& bl, + ceph::buffer::list *reply) override; + int handle_auth_done( + Connection *con, + AuthConnectionMeta *auth_meta, + uint64_t global_id, + uint32_t con_mode, + const ceph::buffer::list& bl, + CryptoKey *session_key, + std::string *connection_secret) override; + int handle_auth_bad_method( + Connection *con, + AuthConnectionMeta *auth_meta, + uint32_t old_auth_method, + int result, + const std::vector<uint32_t>& allowed_methods, + const std::vector<uint32_t>& allowed_modes) override; + // /AuthClient + // AuthServer + int handle_auth_request( + Connection *con, + 
AuthConnectionMeta *auth_meta, + bool more, + uint32_t auth_method, + const ceph::buffer::list& bl, + ceph::buffer::list *reply) override; + // /AuthServer + + int write_default_keyring(ceph::buffer::list& bl); + void extract_save_mon_key(KeyRing& keyring); + + void collect_metadata(Metadata *m); + int load_metadata(); + void count_metadata(const std::string& field, ceph::Formatter *f); + void count_metadata(const std::string& field, std::map<std::string,int> *out); + // get_all_versions() gathers version information from daemons for health check + void get_all_versions(std::map<string, std::list<std::string>> &versions); + void get_versions(std::map<string, std::list<std::string>> &versions); + + // features + static CompatSet get_initial_supported_features(); + static CompatSet get_supported_features(); + static CompatSet get_legacy_features(); + /// read the ondisk features into the CompatSet pointed to by read_features + static void read_features_off_disk(MonitorDBStore *store, CompatSet *read_features); + void read_features(); + void write_features(MonitorDBStore::TransactionRef t); + + OpTracker op_tracker; + + public: + Monitor(CephContext *cct_, std::string nm, MonitorDBStore *s, + Messenger *m, Messenger *mgr_m, MonMap *map); + ~Monitor() override; + + static int check_features(MonitorDBStore *store); + + // config observer + const char** get_tracked_conf_keys() const override; + void handle_conf_change(const ConfigProxy& conf, + const std::set<std::string> &changed) override; + + void update_log_clients(); + int sanitize_options(); + int preinit(); + int init(); + void init_paxos(); + void refresh_from_paxos(bool *need_bootstrap); + void shutdown(); + void tick(); + + void handle_signal(int sig); + + int mkfs(ceph::buffer::list& osdmapbl); + + /** + * check cluster_fsid file + * + * @return EEXIST if file exists and doesn't match, 0 on match, or negative error code + */ + int check_fsid(); + + /** + * write cluster_fsid file + * + * @return 0 on success, or negative error code + */ + int write_fsid(); + int write_fsid(MonitorDBStore::TransactionRef t); + + int do_admin_command(std::string_view command, const cmdmap_t& cmdmap, + ceph::Formatter *f, + std::ostream& err, + std::ostream& out); + +private: + // don't allow copying + Monitor(const Monitor& rhs); + Monitor& operator=(const Monitor &rhs); + +public: + static void format_command_descriptions(const std::vector<MonCommand> &commands, + ceph::Formatter *f, + uint64_t features, + ceph::buffer::list *rdata); + + const std::vector<MonCommand> &get_local_commands(mon_feature_t f) { + if (f.contains_all(ceph::features::mon::FEATURE_NAUTILUS)) { + return local_mon_commands; + } else { + return prenautilus_local_mon_commands; + } + } + const ceph::buffer::list& get_local_commands_bl(mon_feature_t f) { + if (f.contains_all(ceph::features::mon::FEATURE_NAUTILUS)) { + return local_mon_commands_bl; + } else { + return prenautilus_local_mon_commands_bl; + } + } + void set_leader_commands(const std::vector<MonCommand>& cmds) { + leader_mon_commands = cmds; + } + + bool is_keyring_required(); +}; + +#define CEPH_MON_FEATURE_INCOMPAT_BASE CompatSet::Feature (1, "initial feature set (~v.18)") +#define CEPH_MON_FEATURE_INCOMPAT_GV CompatSet::Feature (2, "global version sequencing (v0.52)") +#define CEPH_MON_FEATURE_INCOMPAT_SINGLE_PAXOS CompatSet::Feature (3, "single paxos with k/v store (v0.\?)") +#define CEPH_MON_FEATURE_INCOMPAT_OSD_ERASURE_CODES CompatSet::Feature(4, "support erasure code pools") +#define 
CEPH_MON_FEATURE_INCOMPAT_OSDMAP_ENC CompatSet::Feature(5, "new-style osdmap encoding") +#define CEPH_MON_FEATURE_INCOMPAT_ERASURE_CODE_PLUGINS_V2 CompatSet::Feature(6, "support isa/lrc erasure code") +#define CEPH_MON_FEATURE_INCOMPAT_ERASURE_CODE_PLUGINS_V3 CompatSet::Feature(7, "support shec erasure code") +#define CEPH_MON_FEATURE_INCOMPAT_KRAKEN CompatSet::Feature(8, "support monmap features") +#define CEPH_MON_FEATURE_INCOMPAT_LUMINOUS CompatSet::Feature(9, "luminous ondisk layout") +#define CEPH_MON_FEATURE_INCOMPAT_MIMIC CompatSet::Feature(10, "mimic ondisk layout") +#define CEPH_MON_FEATURE_INCOMPAT_NAUTILUS CompatSet::Feature(11, "nautilus ondisk layout") +#define CEPH_MON_FEATURE_INCOMPAT_OCTOPUS CompatSet::Feature(12, "octopus ondisk layout") +#define CEPH_MON_FEATURE_INCOMPAT_PACIFIC CompatSet::Feature(13, "pacific ondisk layout") +// make sure you add your feature to Monitor::get_supported_features + + +/* Callers use: + * + * new C_MonContext{...} + * + * instead of + * + * new C_MonContext(...) + * + * because of gcc bug [1]. + * + * [1] https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85883 + */ +template<typename T> +class C_MonContext : public LambdaContext<T> { +public: + C_MonContext(const Monitor* m, T&& f) : + LambdaContext<T>(std::forward<T>(f)), + mon(m) + {} + void finish(int r) override { + if (mon->is_shutdown()) + return; + LambdaContext<T>::finish(r); + } +private: + const Monitor* mon; +}; + +#endif diff --git a/src/mon/MonitorDBStore.h b/src/mon/MonitorDBStore.h new file mode 100644 index 000000000..c33d35e48 --- /dev/null +++ b/src/mon/MonitorDBStore.h @@ -0,0 +1,814 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* +* Ceph - scalable distributed file system +* +* Copyright (C) 2012 Inktank, Inc. +* +* This is free software; you can redistribute it and/or +* modify it under the terms of the GNU Lesser General Public +* License version 2.1, as published by the Free Software +* Foundation. See file COPYING. 
+*/ +#ifndef CEPH_MONITOR_DB_STORE_H +#define CEPH_MONITOR_DB_STORE_H + +#include "include/types.h" +#include "include/buffer.h" +#include <set> +#include <map> +#include <string> +#include <boost/scoped_ptr.hpp> +#include <sstream> +#include <fstream> +#include "kv/KeyValueDB.h" + +#include "include/ceph_assert.h" +#include "common/Formatter.h" +#include "common/Finisher.h" +#include "common/errno.h" +#include "common/debug.h" +#include "common/safe_io.h" +#include "common/blkdev.h" +#include "common/PriorityCache.h" + +#define dout_context g_ceph_context + +class MonitorDBStore +{ + std::string path; + boost::scoped_ptr<KeyValueDB> db; + bool do_dump; + int dump_fd_binary; + std::ofstream dump_fd_json; + ceph::JSONFormatter dump_fmt; + + + Finisher io_work; + + bool is_open; + + public: + + std::string get_devname() { + char devname[4096] = {0}, partition[4096]; + get_device_by_path(path.c_str(), partition, devname, + sizeof(devname)); + return devname; + } + + std::string get_path() { + return path; + } + + std::shared_ptr<PriorityCache::PriCache> get_priority_cache() const { + return db->get_priority_cache(); + } + + struct Op { + uint8_t type; + std::string prefix; + std::string key, endkey; + ceph::buffer::list bl; + + Op() + : type(0) { } + Op(int t, const std::string& p, const std::string& k) + : type(t), prefix(p), key(k) { } + Op(int t, const std::string& p, const std::string& k, const ceph::buffer::list& b) + : type(t), prefix(p), key(k), bl(b) { } + Op(int t, const std::string& p, const std::string& start, const std::string& end) + : type(t), prefix(p), key(start), endkey(end) { } + + void encode(ceph::buffer::list& encode_bl) const { + ENCODE_START(2, 1, encode_bl); + encode(type, encode_bl); + encode(prefix, encode_bl); + encode(key, encode_bl); + encode(bl, encode_bl); + encode(endkey, encode_bl); + ENCODE_FINISH(encode_bl); + } + + void decode(ceph::buffer::list::const_iterator& decode_bl) { + DECODE_START(2, decode_bl); + decode(type, decode_bl); + decode(prefix, decode_bl); + decode(key, decode_bl); + decode(bl, decode_bl); + if (struct_v >= 2) + decode(endkey, decode_bl); + DECODE_FINISH(decode_bl); + } + + void dump(ceph::Formatter *f) const { + f->dump_int("type", type); + f->dump_string("prefix", prefix); + f->dump_string("key", key); + if (endkey.length()) { + f->dump_string("endkey", endkey); + } + } + + int approx_size() const { + return 6 + 1 + + 4 + prefix.size() + + 4 + key.size() + + 4 + endkey.size() + + 4 + bl.length(); + } + + static void generate_test_instances(std::list<Op*>& ls) { + ls.push_back(new Op); + // we get coverage here from the Transaction instances + } + }; + + struct Transaction; + typedef std::shared_ptr<Transaction> TransactionRef; + struct Transaction { + std::list<Op> ops; + uint64_t bytes, keys; + + Transaction() : bytes(6 + 4 + 8*2), keys(0) {} + + enum { + OP_PUT = 1, + OP_ERASE = 2, + OP_COMPACT = 3, + OP_ERASE_RANGE = 4, + }; + + void put(const std::string& prefix, const std::string& key, const ceph::buffer::list& bl) { + ops.push_back(Op(OP_PUT, prefix, key, bl)); + ++keys; + bytes += ops.back().approx_size(); + } + + void put(const std::string& prefix, version_t ver, const ceph::buffer::list& bl) { + std::ostringstream os; + os << ver; + put(prefix, os.str(), bl); + } + + void put(const std::string& prefix, const std::string& key, version_t ver) { + using ceph::encode; + ceph::buffer::list bl; + encode(ver, bl); + put(prefix, key, bl); + } + + void erase(const std::string& prefix, const std::string& key) { + 
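+      // Bookkeeping here mirrors put(): every queued op bumps the key count
+      // and the running byte estimate.  Usage sketch (names illustrative):
+      //   auto t = std::make_shared<MonitorDBStore::Transaction>();
+      //   t->put("paxos", 42, value_bl);  // add version 42 under "paxos"
+      //   t->erase("paxos", 12);          // trim an old version
+      //   store->apply_transaction(t);
+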
ops.push_back(Op(OP_ERASE, prefix, key)); + ++keys; + bytes += ops.back().approx_size(); + } + + void erase(const std::string& prefix, version_t ver) { + std::ostringstream os; + os << ver; + erase(prefix, os.str()); + } + + void erase_range(const std::string& prefix, const std::string& begin, + const std::string& end) { + ops.push_back(Op(OP_ERASE_RANGE, prefix, begin, end)); + ++keys; + bytes += ops.back().approx_size(); + } + + void compact_prefix(const std::string& prefix) { + ops.push_back(Op(OP_COMPACT, prefix, {})); + } + + void compact_range(const std::string& prefix, const std::string& start, + const std::string& end) { + ops.push_back(Op(OP_COMPACT, prefix, start, end)); + } + + void encode(ceph::buffer::list& bl) const { + ENCODE_START(2, 1, bl); + encode(ops, bl); + encode(bytes, bl); + encode(keys, bl); + ENCODE_FINISH(bl); + } + + void decode(ceph::buffer::list::const_iterator& bl) { + DECODE_START(2, bl); + decode(ops, bl); + if (struct_v >= 2) { + decode(bytes, bl); + decode(keys, bl); + } + DECODE_FINISH(bl); + } + + static void generate_test_instances(std::list<Transaction*>& ls) { + ls.push_back(new Transaction); + ls.push_back(new Transaction); + ceph::buffer::list bl; + bl.append("value"); + ls.back()->put("prefix", "key", bl); + ls.back()->erase("prefix2", "key2"); + ls.back()->erase_range("prefix3", "key3", "key4"); + ls.back()->compact_prefix("prefix3"); + ls.back()->compact_range("prefix4", "from", "to"); + } + + void append(TransactionRef other) { + ops.splice(ops.end(), other->ops); + keys += other->keys; + bytes += other->bytes; + } + + void append_from_encoded(ceph::buffer::list& bl) { + auto other(std::make_shared<Transaction>()); + auto it = bl.cbegin(); + other->decode(it); + append(other); + } + + bool empty() { + return (size() == 0); + } + + size_t size() const { + return ops.size(); + } + uint64_t get_keys() const { + return keys; + } + uint64_t get_bytes() const { + return bytes; + } + + void dump(ceph::Formatter *f, bool dump_val=false) const { + f->open_object_section("transaction"); + f->open_array_section("ops"); + int op_num = 0; + for (auto it = ops.begin(); it != ops.end(); ++it) { + const Op& op = *it; + f->open_object_section("op"); + f->dump_int("op_num", op_num++); + switch (op.type) { + case OP_PUT: + { + f->dump_string("type", "PUT"); + f->dump_string("prefix", op.prefix); + f->dump_string("key", op.key); + f->dump_unsigned("length", op.bl.length()); + if (dump_val) { + std::ostringstream os; + op.bl.hexdump(os); + f->dump_string("bl", os.str()); + } + } + break; + case OP_ERASE: + { + f->dump_string("type", "ERASE"); + f->dump_string("prefix", op.prefix); + f->dump_string("key", op.key); + } + break; + case OP_ERASE_RANGE: + { + f->dump_string("type", "ERASE_RANGE"); + f->dump_string("prefix", op.prefix); + f->dump_string("start", op.key); + f->dump_string("end", op.endkey); + } + break; + case OP_COMPACT: + { + f->dump_string("type", "COMPACT"); + f->dump_string("prefix", op.prefix); + f->dump_string("start", op.key); + f->dump_string("end", op.endkey); + } + break; + default: + { + f->dump_string("type", "unknown"); + f->dump_unsigned("op_code", op.type); + break; + } + } + f->close_section(); + } + f->close_section(); + f->dump_unsigned("num_keys", keys); + f->dump_unsigned("num_bytes", bytes); + f->close_section(); + } + }; + + int apply_transaction(MonitorDBStore::TransactionRef t) { + KeyValueDB::Transaction dbt = db->get_transaction(); + + if (do_dump) { + if (!g_conf()->mon_debug_dump_json) { + ceph::buffer::list bl; + 
t->encode(bl);
+        bl.write_fd(dump_fd_binary);
+      } else {
+        t->dump(&dump_fmt, true);
+        dump_fmt.flush(dump_fd_json);
+        dump_fd_json.flush();
+      }
+    }
+
+    std::list<std::pair<std::string, std::pair<std::string,std::string>>> compact;
+    for (auto it = t->ops.begin(); it != t->ops.end(); ++it) {
+      const Op& op = *it;
+      switch (op.type) {
+      case Transaction::OP_PUT:
+        dbt->set(op.prefix, op.key, op.bl);
+        break;
+      case Transaction::OP_ERASE:
+        dbt->rmkey(op.prefix, op.key);
+        break;
+      case Transaction::OP_ERASE_RANGE:
+        dbt->rm_range_keys(op.prefix, op.key, op.endkey);
+        break;
+      case Transaction::OP_COMPACT:
+        compact.push_back(make_pair(op.prefix, make_pair(op.key, op.endkey)));
+        break;
+      default:
+        derr << __func__ << " unknown op type " << op.type << dendl;
+        ceph_abort();
+        break;
+      }
+    }
+    int r = db->submit_transaction_sync(dbt);
+    if (r >= 0) {
+      while (!compact.empty()) {
+        if (compact.front().second.first == std::string() &&
+            compact.front().second.second == std::string())
+          db->compact_prefix_async(compact.front().first);
+        else
+          db->compact_range_async(compact.front().first, compact.front().second.first, compact.front().second.second);
+        compact.pop_front();
+      }
+    } else {
+      ceph_abort_msg("failed to write to db");
+    }
+    return r;
+  }
+
+  struct C_DoTransaction : public Context {
+    MonitorDBStore *store;
+    MonitorDBStore::TransactionRef t;
+    Context *oncommit;
+    C_DoTransaction(MonitorDBStore *s, MonitorDBStore::TransactionRef t,
+                    Context *f)
+      : store(s), t(t), oncommit(f)
+    {}
+    void finish(int r) override {
+      /* The store serializes writes.  Each transaction is handled
+       * sequentially by the io_work Finisher, so while one transaction is
+       * being applied to permanent storage, no other transaction is
+       * handled.
+       *
+       * We may inject a random delay here; sleeping before the transaction
+       * is applied is safe, as it does not break that model.
+       */
+      double delay_prob = g_conf()->mon_inject_transaction_delay_probability;
+      if (delay_prob && (rand() % 10000 < delay_prob * 10000.0)) {
+        utime_t delay;
+        double delay_max = g_conf()->mon_inject_transaction_delay_max;
+        delay.set_from_double(delay_max * (double)(rand() % 10000) / 10000.0);
+        lsubdout(g_ceph_context, mon, 1)
+          << "apply_transaction will be delayed for " << delay
+          << " seconds" << dendl;
+        delay.sleep();
+      }
+      int ret = store->apply_transaction(t);
+      oncommit->complete(ret);
+    }
+  };
+
+  /**
+   * queue transaction
+   *
+   * Queue a transaction to commit asynchronously.  Trigger a context
+   * on completion (without any locks held).
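+   *
+   * Illustrative use (the completion context is hypothetical):
+   *
+   *   auto t(std::make_shared<MonitorDBStore::Transaction>());
+   *   t->put("paxos", "last_committed", 42);
+   *   store->queue_transaction(t, on_commit);  // on_commit: a Context*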
+   */
+  void queue_transaction(MonitorDBStore::TransactionRef t,
+                         Context *oncommit) {
+    io_work.queue(new C_DoTransaction(this, t, oncommit));
+  }
+
+  /**
+   * block and flush all io activity
+   */
+  void flush() {
+    io_work.wait_for_empty();
+  }
+
+  class StoreIteratorImpl {
+  protected:
+    bool done;
+    std::pair<std::string,std::string> last_key;
+    ceph::buffer::list crc_bl;
+
+    StoreIteratorImpl() : done(false) { }
+    virtual ~StoreIteratorImpl() { }
+
+    virtual bool _is_valid() = 0;
+
+  public:
+    __u32 crc() {
+      if (g_conf()->mon_sync_debug)
+        return crc_bl.crc32c(0);
+      return 0;
+    }
+    std::pair<std::string,std::string> get_last_key() {
+      return last_key;
+    }
+    virtual bool has_next_chunk() {
+      return !done && _is_valid();
+    }
+    virtual void get_chunk_tx(TransactionRef tx, uint64_t max_bytes,
+                              uint64_t max_keys) = 0;
+    virtual std::pair<std::string,std::string> get_next_key() = 0;
+  };
+  typedef std::shared_ptr<StoreIteratorImpl> Synchronizer;
+
+  class WholeStoreIteratorImpl : public StoreIteratorImpl {
+    KeyValueDB::WholeSpaceIterator iter;
+    std::set<std::string> sync_prefixes;
+
+  public:
+    WholeStoreIteratorImpl(KeyValueDB::WholeSpaceIterator iter,
+                           std::set<std::string> &prefixes)
+      : StoreIteratorImpl(),
+        iter(iter),
+        sync_prefixes(prefixes)
+    { }
+
+    ~WholeStoreIteratorImpl() override { }
+
+    /**
+     * Obtain a chunk of the store
+     *
+     * Keys are appended to @p tx until the chunk reaches @p max_bytes or
+     * @p max_keys; the last key visited is remembered (see get_last_key())
+     * so the next chunk can resume after it.
+     *
+     * @param tx Transaction to which the chunk's keys are appended
+     * @param max_bytes Upper bound on the chunk's accumulated size
+     * @param max_keys Upper bound on the number of keys in the chunk
+     */
+    void get_chunk_tx(TransactionRef tx, uint64_t max_bytes,
+                      uint64_t max_keys) override {
+      using ceph::encode;
+      ceph_assert(done == false);
+      ceph_assert(iter->valid() == true);
+
+      while (iter->valid()) {
+        std::string prefix(iter->raw_key().first);
+        std::string key(iter->raw_key().second);
+        if (sync_prefixes.count(prefix)) {
+          ceph::buffer::list value = iter->value();
+          if (tx->empty() ||
+              (tx->get_bytes() + value.length() + key.size() +
+               prefix.size() < max_bytes &&
+               tx->get_keys() < max_keys)) {
+            // NOTE: putting every key in a separate transaction is
+            // questionable as far as efficiency goes
+            auto tmp(std::make_shared<Transaction>());
+            tmp->put(prefix, key, value);
+            tx->append(tmp);
+            if (g_conf()->mon_sync_debug) {
+              encode(prefix, crc_bl);
+              encode(key, crc_bl);
+              encode(value, crc_bl);
+            }
+          } else {
+            last_key.first = prefix;
+            last_key.second = key;
+            return;
+          }
+        }
+        iter->next();
+      }
+      ceph_assert(iter->valid() == false);
+      done = true;
+    }
+
+    std::pair<std::string,std::string> get_next_key() override {
+      ceph_assert(iter->valid());
+
+      for (; iter->valid(); iter->next()) {
+        std::pair<std::string,std::string> r = iter->raw_key();
+        if (sync_prefixes.count(r.first) > 0) {
+          iter->next();
+          return r;
+        }
+      }
+      return std::pair<std::string,std::string>();
+    }
+
+    bool _is_valid() override {
+      return iter->valid();
+    }
+  };
+
+  Synchronizer get_synchronizer(std::pair<std::string,std::string> &key,
+                                std::set<std::string> &prefixes) {
+    KeyValueDB::WholeSpaceIterator iter;
+    iter = db->get_wholespace_iterator();
+
+    if (!key.first.empty() && !key.second.empty())
+      iter->upper_bound(key.first, key.second);
+    else
+      iter->seek_to_first();
+
+    return std::shared_ptr<StoreIteratorImpl>(
+      new WholeStoreIteratorImpl(iter, prefixes)
+    );
+  }
+
+  KeyValueDB::Iterator get_iterator(const std::string &prefix) {
+    ceph_assert(!prefix.empty());
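+    // Usage sketch: scan every key under one prefix (handler hypothetical).
+    //   for (auto it = store->get_iterator("auth"); it->valid(); it->next())
+    //     handle(it->key(), it->value());
+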
KeyValueDB::Iterator iter = db->get_iterator(prefix); + iter->seek_to_first(); + return iter; + } + + KeyValueDB::WholeSpaceIterator get_iterator() { + KeyValueDB::WholeSpaceIterator iter; + iter = db->get_wholespace_iterator(); + iter->seek_to_first(); + return iter; + } + + int get(const std::string& prefix, const std::string& key, ceph::buffer::list& bl) { + ceph_assert(bl.length() == 0); + return db->get(prefix, key, &bl); + } + + int get(const std::string& prefix, const version_t ver, ceph::buffer::list& bl) { + std::ostringstream os; + os << ver; + return get(prefix, os.str(), bl); + } + + version_t get(const std::string& prefix, const std::string& key) { + using ceph::decode; + ceph::buffer::list bl; + int err = get(prefix, key, bl); + if (err < 0) { + if (err == -ENOENT) // if key doesn't exist, assume its value is 0 + return 0; + // we're not expecting any other negative return value, and we can't + // just return a negative value if we're returning a version_t + generic_dout(0) << "MonitorDBStore::get() error obtaining" + << " (" << prefix << ":" << key << "): " + << cpp_strerror(err) << dendl; + ceph_abort_msg("error obtaining key"); + } + + ceph_assert(bl.length()); + version_t ver; + auto p = bl.cbegin(); + decode(ver, p); + return ver; + } + + bool exists(const std::string& prefix, const std::string& key) { + KeyValueDB::Iterator it = db->get_iterator(prefix); + int err = it->lower_bound(key); + if (err < 0) + return false; + + return (it->valid() && it->key() == key); + } + + bool exists(const std::string& prefix, version_t ver) { + std::ostringstream os; + os << ver; + return exists(prefix, os.str()); + } + + std::string combine_strings(const std::string& prefix, const std::string& value) { + std::string out = prefix; + out.push_back('_'); + out.append(value); + return out; + } + + std::string combine_strings(const std::string& prefix, const version_t ver) { + std::ostringstream os; + os << ver; + return combine_strings(prefix, os.str()); + } + + void clear(std::set<std::string>& prefixes) { + KeyValueDB::Transaction dbt = db->get_transaction(); + + for (auto iter = prefixes.begin(); iter != prefixes.end(); ++iter) { + dbt->rmkeys_by_prefix((*iter)); + } + int r = db->submit_transaction_sync(dbt); + ceph_assert(r >= 0); + } + + void _open(const std::string& kv_type) { + int pos = 0; + for (auto rit = path.rbegin(); rit != path.rend(); ++rit, ++pos) { + if (*rit != '/') + break; + } + std::ostringstream os; + os << path.substr(0, path.size() - pos) << "/store.db"; + std::string full_path = os.str(); + + KeyValueDB *db_ptr = KeyValueDB::create(g_ceph_context, + kv_type, + full_path); + if (!db_ptr) { + derr << __func__ << " error initializing " + << kv_type << " db back storage in " + << full_path << dendl; + ceph_abort_msg("MonitorDBStore: error initializing keyvaluedb back storage"); + } + db.reset(db_ptr); + + if (g_conf()->mon_debug_dump_transactions) { + if (!g_conf()->mon_debug_dump_json) { + dump_fd_binary = ::open( + g_conf()->mon_debug_dump_location.c_str(), + O_CREAT|O_APPEND|O_WRONLY|O_CLOEXEC, 0644); + if (dump_fd_binary < 0) { + dump_fd_binary = -errno; + derr << "Could not open log file, got " + << cpp_strerror(dump_fd_binary) << dendl; + } + } else { + dump_fmt.reset(); + dump_fmt.open_array_section("dump"); + dump_fd_json.open(g_conf()->mon_debug_dump_location.c_str()); + } + do_dump = true; + } + if (kv_type == "rocksdb") + db->init(g_conf()->mon_rocksdb_options); + else + db->init(); + + + } + + int open(std::ostream &out) { + std::string kv_type; + int r = 
read_meta("kv_backend", &kv_type);
+    if (r < 0 || kv_type.empty()) {
+      // assume old monitors that did not mark the type were leveldb.
+      kv_type = "leveldb";
+      r = write_meta("kv_backend", kv_type);
+      if (r < 0)
+        return r;
+    }
+    _open(kv_type);
+    r = db->open(out);
+    if (r < 0)
+      return r;
+
+    // Monitors are few in number, so the resource cost of exposing
+    // very detailed stats is low: ramp up the priority of all the
+    // KV store's perf counters.  Do this after open, because backend may
+    // not have constructed PerfCounters earlier.
+    if (db->get_perf_counters()) {
+      db->get_perf_counters()->set_prio_adjust(
+        PerfCountersBuilder::PRIO_USEFUL - PerfCountersBuilder::PRIO_DEBUGONLY);
+    }
+
+    io_work.start();
+    is_open = true;
+    return 0;
+  }
+
+  int create_and_open(std::ostream &out) {
+    // record the type before open
+    std::string kv_type;
+    int r = read_meta("kv_backend", &kv_type);
+    if (r < 0) {
+      kv_type = g_conf()->mon_keyvaluedb;
+      r = write_meta("kv_backend", kv_type);
+      if (r < 0)
+        return r;
+    }
+    _open(kv_type);
+    r = db->create_and_open(out);
+    if (r < 0)
+      return r;
+    io_work.start();
+    is_open = true;
+    return 0;
+  }
+
+  void close() {
+    // there should be no work queued!
+    io_work.stop();
+    is_open = false;
+    db.reset(NULL);
+  }
+
+  void compact() {
+    db->compact();
+  }
+
+  void compact_async() {
+    db->compact_async();
+  }
+
+  void compact_prefix(const std::string& prefix) {
+    db->compact_prefix(prefix);
+  }
+
+  uint64_t get_estimated_size(std::map<std::string, uint64_t> &extras) {
+    return db->get_estimated_size(extras);
+  }
+
+  /**
+   * write_meta - write a simple configuration key out-of-band
+   *
+   * Write a simple key/value pair for basic store configuration
+   * (e.g., a uuid or magic number) to an unopened/unmounted store.
+   * The default implementation writes this to a plaintext file in the
+   * path.
+   *
+   * A newline is appended.
+   *
+   * @param key key name (e.g., "fsid")
+   * @param value value (e.g., a uuid rendered as a string)
+   * @returns 0 for success, or an error code
+   */
+  int write_meta(const std::string& key,
+                 const std::string& value) const {
+    std::string v = value;
+    v += "\n";
+    int r = safe_write_file(path.c_str(), key.c_str(),
+                            v.c_str(), v.length(),
+                            0600);
+    if (r < 0)
+      return r;
+    return 0;
+  }
+
+  /**
+   * read_meta - read a simple configuration key out-of-band
+   *
+   * Read a simple key/value pair from an unopened/unmounted store.
+   *
+   * Trailing whitespace is stripped off.
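+   *
+   * Illustrative use (sketch; `store` is some MonitorDBStore instance):
+   *
+   *   std::string fsid;
+   *   if (store.read_meta("fsid", &fsid) == 0)
+   *     std::cout << "cluster fsid: " << fsid << std::endl;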
+ * + * @param key key name + * @param value pointer to value string + * @returns 0 for success, or an error code + */ + int read_meta(const std::string& key, + std::string *value) const { + char buf[4096]; + int r = safe_read_file(path.c_str(), key.c_str(), + buf, sizeof(buf)); + if (r <= 0) + return r; + // drop trailing newlines + while (r && isspace(buf[r-1])) { + --r; + } + *value = std::string(buf, r); + return 0; + } + + explicit MonitorDBStore(const std::string& path) + : path(path), + db(0), + do_dump(false), + dump_fd_binary(-1), + dump_fmt(true), + io_work(g_ceph_context, "monstore", "fn_monstore"), + is_open(false) { + } + ~MonitorDBStore() { + ceph_assert(!is_open); + if (do_dump) { + if (!g_conf()->mon_debug_dump_json) { + ::close(dump_fd_binary); + } else { + dump_fmt.close_section(); + dump_fmt.flush(dump_fd_json); + dump_fd_json.flush(); + dump_fd_json.close(); + } + } + } + +}; + +WRITE_CLASS_ENCODER(MonitorDBStore::Op) +WRITE_CLASS_ENCODER(MonitorDBStore::Transaction) + +#endif /* CEPH_MONITOR_DB_STORE_H */ diff --git a/src/mon/MonmapMonitor.cc b/src/mon/MonmapMonitor.cc new file mode 100644 index 000000000..91d9021c2 --- /dev/null +++ b/src/mon/MonmapMonitor.cc @@ -0,0 +1,1470 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2009 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "MonmapMonitor.h" +#include "Monitor.h" +#include "OSDMonitor.h" +#include "messages/MMonCommand.h" +#include "messages/MMonJoin.h" + +#include "common/ceph_argparse.h" +#include "common/errno.h" +#include <sstream> +#include "common/config.h" +#include "common/cmdparse.h" + +#include "include/ceph_assert.h" +#include "include/stringify.h" + +#define dout_subsys ceph_subsys_mon +#undef dout_prefix +#define dout_prefix _prefix(_dout, mon) +using namespace TOPNSPC::common; + +using std::cout; +using std::dec; +using std::hex; +using std::list; +using std::map; +using std::make_pair; +using std::ostream; +using std::ostringstream; +using std::pair; +using std::set; +using std::setfill; +using std::string; +using std::stringstream; +using std::to_string; +using std::vector; +using std::unique_ptr; + +using ceph::bufferlist; +using ceph::decode; +using ceph::encode; +using ceph::Formatter; +using ceph::JSONFormatter; +using ceph::make_message; +using ceph::mono_clock; +using ceph::mono_time; +using ceph::timespan_str; +static ostream& _prefix(std::ostream *_dout, Monitor &mon) { + return *_dout << "mon." 
<< mon.name << "@" << mon.rank + << "(" << mon.get_state_name() + << ").monmap v" << mon.monmap->epoch << " "; +} + +void MonmapMonitor::create_initial() +{ + dout(10) << __func__ << " using current monmap" << dendl; + pending_map = *mon.monmap; + pending_map.epoch = 1; + + if (g_conf()->mon_debug_no_initial_persistent_features) { + derr << __func__ << " mon_debug_no_initial_persistent_features=true" + << dendl; + } else { + // initialize with default persistent features for new clusters + pending_map.persistent_features = ceph::features::mon::get_persistent(); + pending_map.min_mon_release = ceph_release(); + } +} + +void MonmapMonitor::update_from_paxos(bool *need_bootstrap) +{ + version_t version = get_last_committed(); + if (version <= mon.monmap->get_epoch()) + return; + + dout(10) << __func__ << " version " << version + << ", my v " << mon.monmap->epoch << dendl; + + if (need_bootstrap && version != mon.monmap->get_epoch()) { + dout(10) << " signaling that we need a bootstrap" << dendl; + *need_bootstrap = true; + } + + // read and decode + monmap_bl.clear(); + int ret = get_version(version, monmap_bl); + ceph_assert(ret == 0); + ceph_assert(monmap_bl.length()); + + dout(10) << __func__ << " got " << version << dendl; + mon.monmap->decode(monmap_bl); + + if (mon.store->exists("mkfs", "monmap")) { + auto t(std::make_shared<MonitorDBStore::Transaction>()); + t->erase("mkfs", "monmap"); + mon.store->apply_transaction(t); + } + + check_subs(); + + // make sure we've recorded min_mon_release + string val; + if (mon.store->read_meta("min_mon_release", &val) < 0 || + val.size() == 0 || + atoi(val.c_str()) != (int)ceph_release()) { + dout(10) << __func__ << " updating min_mon_release meta" << dendl; + mon.store->write_meta("min_mon_release", + stringify(ceph_release())); + } + + mon.notify_new_monmap(true); +} + +void MonmapMonitor::create_pending() +{ + pending_map = *mon.monmap; + pending_map.epoch++; + pending_map.last_changed = ceph_clock_now(); + pending_map.removed_ranks.clear(); +} + +void MonmapMonitor::encode_pending(MonitorDBStore::TransactionRef t) +{ + dout(10) << __func__ << " epoch " << pending_map.epoch << dendl; + + ceph_assert(mon.monmap->epoch + 1 == pending_map.epoch || + pending_map.epoch == 1); // special case mkfs! + bufferlist bl; + pending_map.encode(bl, mon.get_quorum_con_features()); + + put_version(t, pending_map.epoch, bl); + put_last_committed(t, pending_map.epoch); + + // generate a cluster fingerprint, too? + if (pending_map.epoch == 1) { + mon.prepare_new_fingerprint(t); + } + + //health + health_check_map_t next; + pending_map.check_health(&next); + encode_health(next, t); +} + +class C_ApplyFeatures : public Context { + MonmapMonitor *svc; + mon_feature_t features; + ceph_release_t min_mon_release; +public: + C_ApplyFeatures(MonmapMonitor *s, const mon_feature_t& f, ceph_release_t mmr) : + svc(s), features(f), min_mon_release(mmr) { } + void finish(int r) override { + if (r >= 0) { + svc->apply_mon_features(features, min_mon_release); + } else if (r == -EAGAIN || r == -ECANCELED) { + // discard features if we're no longer on the quorum that + // established them in the first place. 
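+      // (nothing is lost: the next time a quorum forms, on_active() calls
+      // apply_mon_features() again with that quorum's features.)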
+ return; + } else { + ceph_abort_msg("bad C_ApplyFeatures return value"); + } + } +}; + +void MonmapMonitor::apply_mon_features(const mon_feature_t& features, + ceph_release_t min_mon_release) +{ + if (!is_writeable()) { + dout(5) << __func__ << " wait for service to be writeable" << dendl; + wait_for_writeable_ctx(new C_ApplyFeatures(this, features, min_mon_release)); + return; + } + + // do nothing here unless we have a full quorum + if (mon.get_quorum().size() < mon.monmap->size()) { + return; + } + + ceph_assert(is_writeable()); + ceph_assert(features.contains_all(pending_map.persistent_features)); + // we should never hit this because `features` should be the result + // of the quorum's supported features. But if it happens, die. + ceph_assert(ceph::features::mon::get_supported().contains_all(features)); + + mon_feature_t new_features = + (pending_map.persistent_features ^ + (features & ceph::features::mon::get_persistent())); + + if (new_features.empty() && + pending_map.min_mon_release == min_mon_release) { + dout(10) << __func__ << " min_mon_release (" << (int)min_mon_release + << ") and features (" << features << ") match" << dendl; + return; + } + + if (!new_features.empty()) { + dout(1) << __func__ << " applying new features " + << new_features << ", had " << pending_map.persistent_features + << ", will have " + << (new_features | pending_map.persistent_features) + << dendl; + pending_map.persistent_features |= new_features; + } + if (min_mon_release > pending_map.min_mon_release) { + dout(1) << __func__ << " increasing min_mon_release to " + << to_integer<int>(min_mon_release) << " (" << min_mon_release + << ")" << dendl; + pending_map.min_mon_release = min_mon_release; + } + + propose_pending(); +} + +void MonmapMonitor::on_active() +{ + if (get_last_committed() >= 1 && !mon.has_ever_joined) { + // make note of the fact that i was, once, part of the quorum. + dout(10) << "noting that i was, once, part of an active quorum." << dendl; + + /* This is some form of nasty in-breeding we have between the MonmapMonitor + and the Monitor itself. We should find a way to get rid of it given our + new architecture. Until then, stick with it since we are a + single-threaded process and, truth be told, no one else relies on this + thing besides us. 
+ */ + auto t(std::make_shared<MonitorDBStore::Transaction>()); + t->put(Monitor::MONITOR_NAME, "joined", 1); + mon.store->apply_transaction(t); + mon.has_ever_joined = true; + } + + if (mon.is_leader()) { + mon.clog->debug() << "monmap " << *mon.monmap; + } + + apply_mon_features(mon.get_quorum_mon_features(), + mon.quorum_min_mon_release); + + mon.update_pending_metadata(); +} + +bool MonmapMonitor::preprocess_query(MonOpRequestRef op) +{ + auto m = op->get_req<PaxosServiceMessage>(); + switch (m->get_type()) { + // READs + case MSG_MON_COMMAND: + try { + return preprocess_command(op); + } + catch (const bad_cmd_get& e) { + bufferlist bl; + mon.reply_command(op, -EINVAL, e.what(), bl, get_last_committed()); + return true; + } + case MSG_MON_JOIN: + return preprocess_join(op); + default: + ceph_abort(); + return true; + } +} + +void MonmapMonitor::dump_info(Formatter *f) +{ + f->dump_unsigned("monmap_first_committed", get_first_committed()); + f->dump_unsigned("monmap_last_committed", get_last_committed()); + f->open_object_section("monmap"); + mon.monmap->dump(f); + f->close_section(); + f->open_array_section("quorum"); + for (set<int>::iterator q = mon.get_quorum().begin(); q != mon.get_quorum().end(); ++q) + f->dump_int("mon", *q); + f->close_section(); +} + +bool MonmapMonitor::preprocess_command(MonOpRequestRef op) +{ + auto m = op->get_req<MMonCommand>(); + int r = -1; + bufferlist rdata; + stringstream ss; + + cmdmap_t cmdmap; + if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) { + string rs = ss.str(); + mon.reply_command(op, -EINVAL, rs, rdata, get_last_committed()); + return true; + } + + string prefix; + cmd_getval(cmdmap, "prefix", prefix); + + MonSession *session = op->get_session(); + if (!session) { + mon.reply_command(op, -EACCES, "access denied", get_last_committed()); + return true; + } + + string format; + cmd_getval(cmdmap, "format", format, string("plain")); + boost::scoped_ptr<Formatter> f(Formatter::create(format)); + + if (prefix == "mon stat") { + if (f) { + f->open_object_section("monmap"); + mon.monmap->dump_summary(f.get()); + f->dump_string("leader", mon.get_leader_name()); + f->open_array_section("quorum"); + for (auto rank: mon.get_quorum()) { + std::string name = mon.monmap->get_name(rank); + f->open_object_section("mon"); + f->dump_int("rank", rank); + f->dump_string("name", name); + f->close_section(); // mon + } + f->close_section(); // quorum + f->close_section(); // monmap + f->flush(ss); + } else { + mon.monmap->print_summary(ss); + ss << ", election epoch " << mon.get_epoch() << ", leader " + << mon.get_leader() << " " << mon.get_leader_name() + << ", quorum " << mon.get_quorum() + << " " << mon.get_quorum_names(); + } + + rdata.append(ss); + ss.str(""); + r = 0; + + } else if (prefix == "mon getmap" || + prefix == "mon dump") { + + epoch_t epoch; + int64_t epochnum; + cmd_getval(cmdmap, "epoch", epochnum, (int64_t)0); + epoch = epochnum; + + MonMap *p = mon.monmap; + if (epoch) { + bufferlist bl; + r = get_version(epoch, bl); + if (r == -ENOENT) { + ss << "there is no map for epoch " << epoch; + goto reply; + } + ceph_assert(r == 0); + ceph_assert(bl.length() > 0); + p = new MonMap; + p->decode(bl); + } + + ceph_assert(p); + + if (prefix == "mon getmap") { + p->encode(rdata, m->get_connection()->get_features()); + r = 0; + ss << "got monmap epoch " << p->get_epoch(); + } else if (prefix == "mon dump") { + stringstream ds; + if (f) { + f->open_object_section("monmap"); + p->dump(f.get()); + f->open_array_section("quorum"); + for (set<int>::iterator q = 
mon.get_quorum().begin(); + q != mon.get_quorum().end(); ++q) { + f->dump_int("mon", *q); + } + f->close_section(); + f->close_section(); + f->flush(ds); + r = 0; + } else { + p->print(ds); + r = 0; + } + rdata.append(ds); + ss << "dumped monmap epoch " << p->get_epoch(); + } + if (p != mon.monmap) { + delete p; + p = nullptr; + } + + } else if (prefix == "mon feature ls") { + + bool list_with_value = false; + string with_value; + if (cmd_getval(cmdmap, "with_value", with_value) && + with_value == "--with-value") { + list_with_value = true; + } + + MonMap *p = mon.monmap; + + // list features + mon_feature_t supported = ceph::features::mon::get_supported(); + mon_feature_t persistent = ceph::features::mon::get_persistent(); + mon_feature_t required = p->get_required_features(); + + stringstream ds; + auto print_feature = [&](mon_feature_t& m_features, const char* m_str) { + if (f) { + if (list_with_value) + m_features.dump_with_value(f.get(), m_str); + else + m_features.dump(f.get(), m_str); + } else { + if (list_with_value) + m_features.print_with_value(ds); + else + m_features.print(ds); + } + }; + + if (f) { + f->open_object_section("features"); + + f->open_object_section("all"); + print_feature(supported, "supported"); + print_feature(persistent, "persistent"); + f->close_section(); // all + + f->open_object_section("monmap"); + print_feature(p->persistent_features, "persistent"); + print_feature(p->optional_features, "optional"); + print_feature(required, "required"); + f->close_section(); // monmap + + f->close_section(); // features + f->flush(ds); + + } else { + ds << "all features" << std::endl + << "\tsupported: "; + print_feature(supported, nullptr); + ds << std::endl + << "\tpersistent: "; + print_feature(persistent, nullptr); + ds << std::endl + << std::endl; + + ds << "on current monmap (epoch " + << p->get_epoch() << ")" << std::endl + << "\tpersistent: "; + print_feature(p->persistent_features, nullptr); + ds << std::endl + // omit optional features in plain-text + // makes it easier to read, and they're, currently, empty. 
+ << "\trequired: "; + print_feature(required, nullptr); + ds << std::endl; + } + rdata.append(ds); + r = 0; + } + +reply: + if (r != -1) { + string rs; + getline(ss, rs); + + mon.reply_command(op, r, rs, rdata, get_last_committed()); + return true; + } else + return false; +} + + +bool MonmapMonitor::prepare_update(MonOpRequestRef op) +{ + auto m = op->get_req<PaxosServiceMessage>(); + dout(7) << __func__ << " " << *m << " from " << m->get_orig_source_inst() << dendl; + + switch (m->get_type()) { + case MSG_MON_COMMAND: + try { + return prepare_command(op); + } catch (const bad_cmd_get& e) { + bufferlist bl; + mon.reply_command(op, -EINVAL, e.what(), bl, get_last_committed()); + return true; + } + case MSG_MON_JOIN: + return prepare_join(op); + default: + ceph_abort(); + } + + return false; +} + +bool MonmapMonitor::prepare_command(MonOpRequestRef op) +{ + auto m = op->get_req<MMonCommand>(); + stringstream ss; + string rs; + int err = -EINVAL; + + cmdmap_t cmdmap; + if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) { + string rs = ss.str(); + mon.reply_command(op, -EINVAL, rs, get_last_committed()); + return true; + } + + string prefix; + cmd_getval(cmdmap, "prefix", prefix); + + MonSession *session = op->get_session(); + if (!session) { + mon.reply_command(op, -EACCES, "access denied", get_last_committed()); + return true; + } + + /* We should follow the following rules: + * + * - 'monmap' is the current, consistent version of the monmap + * - 'pending_map' is the uncommitted version of the monmap + * + * All checks for the current state must be made against 'monmap'. + * All changes are made against 'pending_map'. + * + * If there are concurrent operations modifying 'pending_map', please + * follow the following rules. + * + * - if pending_map has already been changed, the second operation must + * wait for the proposal to finish and be run again; This is the easiest + * path to guarantee correctness but may impact performance (i.e., it + * will take longer for the user to get a reply). + * + * - if the result of the second operation can be guaranteed to be + * idempotent, the operation may reply to the user once the proposal + * finishes; still needs to wait for the proposal to finish. + * + * - An operation _NEVER_ returns to the user based on pending state. + * + * If an operation does not modify current stable monmap, it may be + * serialized before current pending map, regardless of any change that + * has been made to the pending map -- remember, pending is uncommitted + * state, thus we are not bound by it. + */ + + ceph_assert(mon.monmap); + MonMap &monmap = *mon.monmap; + + + /* Please note: + * + * Adding or removing monitors may lead to loss of quorum. + * + * Because quorum may be lost, it's important to reply something + * to the user, lest she end up waiting forever for a reply. And + * no reply will ever be sent until quorum is formed again. + * + * On the other hand, this means we're leaking uncommitted state + * to the user. As such, please be mindful of the reply message. + * + * e.g., 'adding monitor mon.foo' is okay ('adding' is an on-going + * operation and conveys its not-yet-permanent nature); whereas + * 'added monitor mon.foo' presumes the action has successfully + * completed and state has been committed, which may not be true. 
+ */
+
+
+ bool propose = false;
+ if (prefix == "mon add") {
+ string name;
+ cmd_getval(cmdmap, "name", name);
+ string addrstr;
+ cmd_getval(cmdmap, "addr", addrstr);
+ entity_addr_t addr;
+ bufferlist rdata;
+
+ if (!addr.parse(addrstr.c_str())) {
+ err = -EINVAL;
+ ss << "addr " << addrstr << " does not parse";
+ goto reply;
+ }
+
+ vector<string> locationvec;
+ map<string, string> loc;
+ cmd_getval(cmdmap, "location", locationvec);
+ CrushWrapper::parse_loc_map(locationvec, &loc);
+ if (locationvec.size() &&
+ !mon.get_quorum_mon_features().contains_all(
+ ceph::features::mon::FEATURE_PINGING)) {
+ err = -ENOTSUP;
+ ss << "Not all monitors support adding monitors with a location; please upgrade first!";
+ goto reply;
+ }
+ if (locationvec.size() && !loc.size()) {
+ ss << "We could not parse your input location to anything real; " << locationvec
+ << " turned into an empty map!";
+ err = -EINVAL;
+ goto reply;
+ }
+
+ dout(10) << "mon add setting location for " << name << " to " << loc << dendl;
+
+ // TODO: validate location in crush map
+ if (monmap.stretch_mode_enabled && !loc.size()) {
+ ss << "We are in stretch mode and new monitors must have a location, but "
+ << "could not parse your input location to anything real; " << locationvec
+ << " turned into an empty map!";
+ err = -EINVAL;
+ goto reply;
+ }
+ // TODO: validate location against any existing stretch config
+
+ entity_addrvec_t addrs;
+ if (monmap.persistent_features.contains_all(
+ ceph::features::mon::FEATURE_NAUTILUS)) {
+ if (addr.get_port() == CEPH_MON_PORT_IANA) {
+ addr.set_type(entity_addr_t::TYPE_MSGR2);
+ }
+ if (addr.get_port() == CEPH_MON_PORT_LEGACY) {
+ // if they specified the *old* default they probably don't care
+ addr.set_port(0);
+ }
+ if (addr.get_port()) {
+ addrs.v.push_back(addr);
+ } else {
+ addr.set_type(entity_addr_t::TYPE_MSGR2);
+ addr.set_port(CEPH_MON_PORT_IANA);
+ addrs.v.push_back(addr);
+ addr.set_type(entity_addr_t::TYPE_LEGACY);
+ addr.set_port(CEPH_MON_PORT_LEGACY);
+ addrs.v.push_back(addr);
+ }
+ } else {
+ if (addr.get_port() == 0) {
+ addr.set_port(CEPH_MON_PORT_LEGACY);
+ }
+ addr.set_type(entity_addr_t::TYPE_LEGACY);
+ addrs.v.push_back(addr);
+ }
+ dout(20) << __func__ << " addr " << addr << " -> addrs " << addrs << dendl;
+
+ /**
+ * If we have a monitor with the same name and different addr, then EEXIST
+ * If we have a monitor with the same addr and different name, then EEXIST
+ * If we have a monitor with the same addr and same name, then wait for
+ * the proposal to finish and return success.
+ * If we don't have the monitor, add it.
+ */
+
+ err = 0;
+ if (!ss.str().empty())
+ ss << "; ";
+
+ do {
+ if (monmap.contains(name)) {
+ if (monmap.get_addrs(name) == addrs) {
+ // stable map contains monitor with the same name at the same address.
+ // serialize before current pending map.
+ err = 0; // for clarity; this has already been set above.
+ ss << "mon." << name << " at " << addrs << " already exists";
+ goto reply;
+ } else {
+ ss << "mon." << name
+ << " already exists at address " << monmap.get_addrs(name);
+ }
+ } else if (monmap.contains(addrs)) {
+ // we established on the previous branch that name is different
+ ss << "mon."
<< monmap.get_name(addrs)
+ << " already exists at address " << addr;
+ } else {
+ // go ahead and add
+ break;
+ }
+ err = -EEXIST;
+ goto reply;
+ } while (false);
+
+ /* Given there's no delay between proposals on the MonmapMonitor (see
+ * MonmapMonitor::should_propose()), there is no point in checking for
+ * a mismatch between name and addr on pending_map.
+ *
+ * Once we've established that the monitor does not exist in the committed
+ * state, we can simply go ahead and add the monitor.
+ */
+
+ pending_map.add(name, addrs);
+ pending_map.mon_info[name].crush_loc = loc;
+ pending_map.last_changed = ceph_clock_now();
+ ss << "adding mon." << name << " at " << addrs;
+ propose = true;
+ dout(0) << __func__ << " proposing new mon." << name << dendl;
+
+ } else if (prefix == "mon remove" ||
+ prefix == "mon rm") {
+ string name;
+ cmd_getval(cmdmap, "name", name);
+ if (!monmap.contains(name)) {
+ err = 0;
+ ss << "mon." << name << " does not exist or has already been removed";
+ goto reply;
+ }
+
+ if (monmap.size() == 1) {
+ err = -EINVAL;
+ ss << "error: refusing removal of last monitor " << name;
+ goto reply;
+ }
+
+ if (pending_map.stretch_mode_enabled &&
+ name == pending_map.tiebreaker_mon) {
+ err = -EINVAL;
+ ss << "you cannot remove stretch mode's tiebreaker monitor";
+ goto reply;
+ }
+ /* At the time of writing, there is no risk of races when multiple clients
+ * attempt to use the same name. The reason is simple but may not be
+ * obvious.
+ *
+ * In a nutshell, we do not collate proposals on the MonmapMonitor. As
+ * soon as we return 'true' below, PaxosService::dispatch() will check if
+ * the service should propose, and - if so - the service will be marked as
+ * 'proposing' and a proposal will be triggered. The PaxosService class
+ * guarantees that once a service is marked 'proposing' no further writes
+ * will be handled.
+ *
+ * The decision on whether the service should propose or not is, in this
+ * case, made by MonmapMonitor::should_propose(), which always sets the
+ * proposal delay to 0.0 seconds. This is key for PaxosService to
+ * trigger the proposal immediately.
+ *
+ * From the above, there's no point in performing further checks on the
+ * pending_map, as we don't ever have multiple proposals in-flight in
+ * this service. As we've established the committed state contains the
+ * monitor, we can simply go ahead and remove it.
+ *
+ * Please note that the code hinges on all of the above being true. It
+ * has been true since time immemorial and we don't see a good reason
+ * to make it sturdier at this time - mainly because we don't think it's
+ * going to change any time soon, save for any bug that may be unwittingly
+ * introduced.
+ */
+
+ entity_addrvec_t addrs = pending_map.get_addrs(name);
+ pending_map.remove(name);
+ pending_map.disallowed_leaders.erase(name);
+ pending_map.last_changed = ceph_clock_now();
+ propose = true;
+ err = 0;
+
+ } else if (prefix == "mon feature set") {
+
+ /* PLEASE NOTE:
+ *
+ * We currently only support setting/unsetting persistent features.
+ * This is by design, given at the moment we still don't have optional
+ * features, and, as such, there is no point introducing an interface
+ * to manipulate them. This allows us to provide a cleaner, more
+ * intuitive interface to the user, modifying solely persistent
+ * features.
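+ *
+ * (For illustration: `ceph mon feature ls` lists the supported and
+ * persistent feature bits, and `ceph mon feature set <feature_name>
+ * --yes-i-really-mean-it` is the only mutating form currently wired up;
+ * both handlers live in this file.)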
+ * + * In the future we should consider adding another interface to handle + * optional features/flags; e.g., 'mon feature flag set/unset', or + * 'mon flag set/unset'. + */ + string feature_name; + if (!cmd_getval(cmdmap, "feature_name", feature_name)) { + ss << "missing required feature name"; + err = -EINVAL; + goto reply; + } + + mon_feature_t feature; + feature = ceph::features::mon::get_feature_by_name(feature_name); + if (feature == ceph::features::mon::FEATURE_NONE) { + ss << "unknown feature '" << feature_name << "'"; + err = -ENOENT; + goto reply; + } + + bool sure = false; + cmd_getval(cmdmap, "yes_i_really_mean_it", sure); + if (!sure) { + ss << "please specify '--yes-i-really-mean-it' if you " + << "really, **really** want to set feature '" + << feature << "' in the monmap."; + err = -EPERM; + goto reply; + } + + if (!mon.get_quorum_mon_features().contains_all(feature)) { + ss << "current quorum does not support feature '" << feature + << "'; supported features: " + << mon.get_quorum_mon_features(); + err = -EINVAL; + goto reply; + } + + ss << "setting feature '" << feature << "'"; + + err = 0; + if (monmap.persistent_features.contains_all(feature)) { + dout(10) << __func__ << " feature '" << feature + << "' already set on monmap; no-op." << dendl; + goto reply; + } + + pending_map.persistent_features.set_feature(feature); + pending_map.last_changed = ceph_clock_now(); + propose = true; + + dout(1) << __func__ << " " << ss.str() << "; new features will be: " + << "persistent = " << pending_map.persistent_features + // output optional nevertheless, for auditing purposes. + << ", optional = " << pending_map.optional_features << dendl; + + } else if (prefix == "mon set-rank") { + string name; + int64_t rank; + if (!cmd_getval(cmdmap, "name", name) || + !cmd_getval(cmdmap, "rank", rank)) { + err = -EINVAL; + goto reply; + } + int oldrank = pending_map.get_rank(name); + if (oldrank < 0) { + ss << "mon." << name << " does not exist in monmap"; + err = -ENOENT; + goto reply; + } + err = 0; + pending_map.set_rank(name, rank); + pending_map.last_changed = ceph_clock_now(); + propose = true; + } else if (prefix == "mon set-addrs") { + string name; + string addrs; + if (!cmd_getval(cmdmap, "name", name) || + !cmd_getval(cmdmap, "addrs", addrs)) { + err = -EINVAL; + goto reply; + } + if (!pending_map.contains(name)) { + ss << "mon." << name << " does not exist"; + err = -ENOENT; + goto reply; + } + entity_addrvec_t av; + if (!av.parse(addrs.c_str(), nullptr)) { + ss << "failed to parse addrs '" << addrs << "'"; + err = -EINVAL; + goto reply; + } + for (auto& a : av.v) { + a.set_nonce(0); + if (!a.get_port()) { + ss << "monitor must bind to a non-zero port, not " << a; + err = -EINVAL; + goto reply; + } + } + err = 0; + pending_map.set_addrvec(name, av); + pending_map.last_changed = ceph_clock_now(); + propose = true; + } else if (prefix == "mon set-weight") { + string name; + int64_t weight; + if (!cmd_getval(cmdmap, "name", name) || + !cmd_getval(cmdmap, "weight", weight)) { + err = -EINVAL; + goto reply; + } + if (!pending_map.contains(name)) { + ss << "mon." 
<< name << " does not exist"; + err = -ENOENT; + goto reply; + } + err = 0; + pending_map.set_weight(name, weight); + pending_map.last_changed = ceph_clock_now(); + propose = true; + } else if (prefix == "mon enable-msgr2") { + if (!monmap.get_required_features().contains_all( + ceph::features::mon::FEATURE_NAUTILUS)) { + err = -EACCES; + ss << "all monitors must be running nautilus to enable v2"; + goto reply; + } + for (auto& i : pending_map.mon_info) { + if (i.second.public_addrs.v.size() == 1 && + i.second.public_addrs.front().is_legacy() && + i.second.public_addrs.front().get_port() == CEPH_MON_PORT_LEGACY) { + entity_addrvec_t av; + entity_addr_t a = i.second.public_addrs.front(); + a.set_type(entity_addr_t::TYPE_MSGR2); + a.set_port(CEPH_MON_PORT_IANA); + av.v.push_back(a); + av.v.push_back(i.second.public_addrs.front()); + dout(10) << " setting mon." << i.first + << " addrs " << i.second.public_addrs + << " -> " << av << dendl; + pending_map.set_addrvec(i.first, av); + propose = true; + pending_map.last_changed = ceph_clock_now(); + } + } + err = 0; + } else if (prefix == "mon set election_strategy") { + if (!mon.get_quorum_mon_features().contains_all( + ceph::features::mon::FEATURE_PINGING)) { + err = -ENOTSUP; + ss << "Not all monitors support changing election strategies; please upgrade first!"; + goto reply; + } + string strat; + MonMap::election_strategy strategy; + if (!cmd_getval(cmdmap, "strategy", strat)) { + err = -EINVAL; + goto reply; + } + if (strat == "classic") { + strategy = MonMap::CLASSIC; + } else if (strat == "disallow") { + strategy = MonMap::DISALLOW; + } else if (strat == "connectivity") { + strategy = MonMap::CONNECTIVITY; + } else { + err = -EINVAL; + goto reply; + } + err = 0; + pending_map.strategy = strategy; + pending_map.last_changed = ceph_clock_now(); + propose = true; + } else if (prefix == "mon add disallowed_leader") { + if (!mon.get_quorum_mon_features().contains_all( + ceph::features::mon::FEATURE_PINGING)) { + err = -ENOTSUP; + ss << "Not all monitors support changing election strategies; please upgrade first!"; + goto reply; + } + string name; + if (!cmd_getval(cmdmap, "name", name)) { + err = -EINVAL; + goto reply; + } + if (pending_map.strategy != MonMap::DISALLOW && + pending_map.strategy != MonMap::CONNECTIVITY) { + ss << "You cannot disallow monitors in your current election mode"; + err = -EINVAL; + goto reply; + } + if (!pending_map.contains(name)) { + ss << "mon." << name << " does not exist"; + err = -ENOENT; + goto reply; + } + if (pending_map.disallowed_leaders.count(name)) { + ss << "mon." << name << " is already disallowed"; + err = 0; + goto reply; + } + if (pending_map.disallowed_leaders.size() == pending_map.size() - 1) { + ss << "mon." 
<< name << " is the only remaining allowed leader!"; + err = -EINVAL; + goto reply; + } + pending_map.disallowed_leaders.insert(name); + pending_map.last_changed = ceph_clock_now(); + err = 0; + propose = true; + } else if (prefix == "mon rm disallowed_leader") { + if (!mon.get_quorum_mon_features().contains_all( + ceph::features::mon::FEATURE_PINGING)) { + err = -ENOTSUP; + ss << "Not all monitors support changing election strategies; please upgrade first!"; + goto reply; + } + string name; + if (!cmd_getval(cmdmap, "name", name)) { + err = -EINVAL; + goto reply; + } + if (pending_map.strategy != MonMap::DISALLOW && + pending_map.strategy != MonMap::CONNECTIVITY) { + ss << "You cannot disallow monitors in your current election mode"; + err = -EINVAL; + goto reply; + } + if (!pending_map.contains(name)) { + ss << "mon." << name << " does not exist"; + err = -ENOENT; + goto reply; + } + if (!pending_map.disallowed_leaders.count(name)) { + ss << "mon." << name << " is already allowed"; + err = 0; + goto reply; + } + pending_map.disallowed_leaders.erase(name); + pending_map.last_changed = ceph_clock_now(); + err = 0; + propose = true; + } else if (prefix == "mon set_location") { + if (!mon.get_quorum_mon_features().contains_all( + ceph::features::mon::FEATURE_PINGING)) { + err = -ENOTSUP; + ss << "Not all monitors support monitor locations; please upgrade first!"; + goto reply; + } + string name; + if (!cmd_getval(cmdmap, "name", name)) { + err = -EINVAL; + goto reply; + } + if (!pending_map.contains(name)) { + ss << "mon." << name << " does not exist"; + err = -ENOENT; + goto reply; + } + + vector<string> argvec; + map<string, string> loc; + cmd_getval(cmdmap, "args", argvec); + CrushWrapper::parse_loc_map(argvec, &loc); + + dout(10) << "mon set_location for " << name << " to " << loc << dendl; + + // TODO: validate location in crush map + if (!loc.size()) { + ss << "We could not parse your input location to anything real; " << argvec + << " turned into an empty map!"; + err = -EINVAL; + goto reply; + } + // TODO: validate location against any existing stretch config + pending_map.mon_info[name].crush_loc = loc; + pending_map.last_changed = ceph_clock_now(); + err = 0; + propose = true; + } else if (prefix == "mon set_new_tiebreaker") { + if (!pending_map.stretch_mode_enabled) { + err = -EINVAL; + ss << "Stretch mode is not enabled, so there is no tiebreaker"; + goto reply; + } + string name; + if (!cmd_getval(cmdmap, "name", name)) { + err = -EINVAL; + goto reply; + } + bool sure = false; + cmd_getval(cmdmap, "yes_i_really_mean_it", sure); + + const auto &existing_tiebreaker_info_i = pending_map.mon_info.find(pending_map.tiebreaker_mon); + const auto &new_tiebreaker_info_i = pending_map.mon_info.find(name); + if (new_tiebreaker_info_i == pending_map.mon_info.end()) { + ss << "mon." << name << " does not exist"; + err = -ENOENT; + goto reply; + } + const auto& new_info = new_tiebreaker_info_i->second; + if (new_info.crush_loc.empty()) { + ss << "mon." 
<< name << " does not have a location specified";
+ err = -EINVAL;
+ goto reply;
+ }
+
+ if (!mon.osdmon()->is_readable()) {
+ dout(10) << __func__
+ << ": waiting for osdmon readable to inspect crush barrier"
+ << dendl;
+ mon.osdmon()->wait_for_readable(op, new Monitor::C_RetryMessage(&mon, op));
+ return false;
+ }
+ int32_t stretch_divider_id = mon.osdmon()->osdmap.stretch_mode_bucket;
+ string stretch_bucket_divider = mon.osdmon()->osdmap.crush->
+ get_type_name(stretch_divider_id);
+
+ const auto& new_loc_i = new_info.crush_loc.find(stretch_bucket_divider);
+ if (new_loc_i == new_info.crush_loc.end()) {
+ ss << "mon." << name << " has a specified location, but not a "
+ << stretch_bucket_divider << ", which is the stretch divider";
+ err = -EINVAL;
+ goto reply;
+ }
+ const string& new_loc = new_loc_i->second;
+ set<string> matching_mons;
+ for (const auto& mii : pending_map.mon_info) {
+ const auto& other_loc_i = mii.second.crush_loc.find(stretch_bucket_divider);
+ if (mii.first == name) {
+ continue;
+ }
+ if (other_loc_i == mii.second.crush_loc.end()) { // no dividing-bucket entry; nothing to compare
+ continue;
+ }
+ const string& other_loc = other_loc_i->second;
+ if (other_loc == new_loc &&
+ mii.first != existing_tiebreaker_info_i->first) {
+ matching_mons.insert(mii.first);
+ }
+ }
+ if (!matching_mons.empty() && !sure) {
+ ss << "mon." << name << " has location " << new_loc_i->second
+ << ", which matches mons " << matching_mons << " on the "
+ << stretch_bucket_divider << " dividing bucket for stretch mode. "
+ "Pass --yes-i-really-mean-it if you're sure you want to do this. "
+ "(You really don't.)";
+ err = -EINVAL;
+ goto reply;
+ }
+ pending_map.tiebreaker_mon = name;
+ pending_map.disallowed_leaders.insert(name);
+ pending_map.last_changed = ceph_clock_now();
+ err = 0;
+ propose = true;
+ } else if (prefix == "mon enable_stretch_mode") {
+ if (!mon.osdmon()->is_writeable()) {
+ dout(10) << __func__
+ << ": waiting for osdmon writeable for stretch mode" << dendl;
+ mon.osdmon()->wait_for_writeable(op, new Monitor::C_RetryMessage(&mon, op));
+ return false;
+ }
+ {
+ if (monmap.stretch_mode_enabled) {
+ ss << "stretch mode is already engaged";
+ err = -EINVAL;
+ goto reply;
+ }
+ if (pending_map.stretch_mode_enabled) {
+ ss << "stretch mode currently committing";
+ err = 0;
+ goto reply;
+ }
+ string tiebreaker_mon;
+ if (!cmd_getval(cmdmap, "tiebreaker_mon", tiebreaker_mon)) {
+ ss << "must specify a tiebreaker monitor";
+ err = -EINVAL;
+ goto reply;
+ }
+ string new_crush_rule;
+ if (!cmd_getval(cmdmap, "new_crush_rule", new_crush_rule)) {
+ ss << "must specify a new crush rule that spreads out copies over multiple sites";
+ err = -EINVAL;
+ goto reply;
+ }
+ string dividing_bucket;
+ if (!cmd_getval(cmdmap, "dividing_bucket", dividing_bucket)) {
+ ss << "must specify a dividing bucket";
+ err = -EINVAL;
+ goto reply;
+ }
+ // okay, initial arguments make sense, check pools and cluster state
+ err = mon.osdmon()->check_cluster_features(CEPH_FEATUREMASK_STRETCH_MODE, ss);
+ if (err)
+ goto reply;
+ struct Plugger {
+ Paxos &p;
+ Plugger(Paxos &p) : p(p) { p.plug(); }
+ ~Plugger() { p.unplug(); }
+ } plugger(paxos);
+
+ set<pg_pool_t*> pools;
+ bool okay = false;
+ int errcode = 0;
+
+ mon.osdmon()->try_enable_stretch_mode_pools(ss, &okay, &errcode,
+ &pools, new_crush_rule);
+ if (!okay) {
+ err = errcode;
+ goto reply;
+ }
+ try_enable_stretch_mode(ss, &okay, &errcode, false,
+ tiebreaker_mon, dividing_bucket);
+ if (!okay) {
+ err = errcode;
+ goto reply;
+ }
+ mon.osdmon()->try_enable_stretch_mode(ss, &okay,
&errcode, false,
+ dividing_bucket, 2, pools, new_crush_rule);
+ if (!okay) {
+ err = errcode;
+ goto reply;
+ }
+ // everything looks good, actually commit the changes!
+ try_enable_stretch_mode(ss, &okay, &errcode, true,
+ tiebreaker_mon, dividing_bucket);
+ mon.osdmon()->try_enable_stretch_mode(ss, &okay, &errcode, true,
+ dividing_bucket,
+ 2, // right now we only support 2 sites
+ pools, new_crush_rule);
+ ceph_assert(okay == true);
+ }
+ request_proposal(mon.osdmon());
+ err = 0;
+ propose = true;
+ } else {
+ ss << "unknown command " << prefix;
+ err = -EINVAL;
+ }
+
+reply:
+ getline(ss, rs);
+ mon.reply_command(op, err, rs, get_last_committed());
+ // we are returning to the user; do not propose.
+ return propose;
+}
+
+void MonmapMonitor::try_enable_stretch_mode(stringstream& ss, bool *okay,
+ int *errcode, bool commit,
+ const string& tiebreaker_mon,
+ const string& dividing_bucket)
+{
+ dout(20) << __func__ << dendl;
+ *okay = false;
+ if (pending_map.strategy != MonMap::CONNECTIVITY) {
+ ss << "Monitors must use the connectivity strategy to enable stretch mode";
+ *errcode = -EINVAL;
+ ceph_assert(!commit);
+ return;
+ }
+ if (!pending_map.contains(tiebreaker_mon)) {
+ ss << "mon " << tiebreaker_mon << " does not seem to exist";
+ *errcode = -ENOENT;
+ ceph_assert(!commit);
+ return;
+ }
+ map<string,string> buckets;
+ for (const auto& mii : mon.monmap->mon_info) {
+ const auto& mi = mii.second;
+ const auto& bi = mi.crush_loc.find(dividing_bucket);
+ if (bi == mi.crush_loc.end()) {
+ ss << "Could not find location entry for " << dividing_bucket
+ << " on monitor " << mi.name;
+ *errcode = -EINVAL;
+ ceph_assert(!commit);
+ return;
+ }
+ buckets[mii.first] = bi->second;
+ }
+ string bucket1, bucket2, tiebreaker_bucket;
+ for (auto& i : buckets) {
+ if (i.first == tiebreaker_mon) {
+ tiebreaker_bucket = i.second;
+ continue;
+ }
+ if (bucket1.empty()) {
+ bucket1 = i.second;
+ }
+ if (bucket1 != i.second &&
+ bucket2.empty()) {
+ bucket2 = i.second;
+ }
+ if (bucket1 != i.second &&
+ bucket2 != i.second) {
+ ss << "There are too many monitor buckets for stretch mode, found "
+ << bucket1 << "," << bucket2 << "," << i.second;
+ *errcode = -EINVAL;
+ ceph_assert(!commit);
+ return;
+ }
+ }
+ if (bucket1.empty() || bucket2.empty()) {
+ ss << "There are not enough monitor buckets for stretch mode;"
+ << " must have at least 2 plus the tiebreaker but only found "
+ << (bucket1.empty() ?
bucket2 : bucket1);
+ *errcode = -EINVAL;
+ ceph_assert(!commit);
+ return;
+ }
+ if (tiebreaker_bucket == bucket1 ||
+ tiebreaker_bucket == bucket2) {
+ ss << "The named tiebreaker monitor " << tiebreaker_mon
+ << " is in the same CRUSH bucket " << tiebreaker_bucket
+ << " as other monitors";
+ *errcode = -EINVAL;
+ ceph_assert(!commit);
+ return;
+ }
+ if (commit) {
+ pending_map.disallowed_leaders.insert(tiebreaker_mon);
+ pending_map.tiebreaker_mon = tiebreaker_mon;
+ pending_map.stretch_mode_enabled = true;
+ }
+ *okay = true;
+}
+
+void MonmapMonitor::trigger_degraded_stretch_mode(const set<string>& dead_mons)
+{
+ dout(20) << __func__ << dendl;
+ pending_map.stretch_marked_down_mons.insert(dead_mons.begin(), dead_mons.end());
+ propose_pending();
+}
+
+void MonmapMonitor::trigger_healthy_stretch_mode()
+{
+ dout(20) << __func__ << dendl;
+ pending_map.stretch_marked_down_mons.clear();
+ propose_pending();
+}
+
+bool MonmapMonitor::preprocess_join(MonOpRequestRef op)
+{
+ auto join = op->get_req<MMonJoin>();
+ dout(10) << __func__ << " " << join->name << " at " << join->addrs << dendl;
+
+ MonSession *session = op->get_session();
+ if (!session ||
+ !session->is_capable("mon", MON_CAP_W | MON_CAP_X)) {
+ dout(10) << " insufficient caps" << dendl;
+ return true;
+ }
+
+ const auto name_info_i = pending_map.mon_info.find(join->name);
+ if (name_info_i != pending_map.mon_info.end() &&
+ !name_info_i->second.public_addrs.front().is_blank_ip() &&
+ (!join->force_loc || join->crush_loc == name_info_i->second.crush_loc)) {
+ dout(10) << " already have " << join->name << dendl;
+ return true;
+ }
+ string addr_name;
+ if (pending_map.contains(join->addrs)) {
+ addr_name = pending_map.get_name(join->addrs);
+ }
+ if (!addr_name.empty() &&
+ addr_name == join->name &&
+ (!join->force_loc || join->crush_loc.empty() ||
+ pending_map.mon_info[addr_name].crush_loc == join->crush_loc)) {
+ dout(10) << " already have " << join->addrs << dendl;
+ return true;
+ }
+ if (pending_map.stretch_mode_enabled &&
+ join->crush_loc.empty() &&
+ (addr_name.empty() ||
+ pending_map.mon_info[addr_name].crush_loc.empty())) {
+ dout(10) << "stretch mode engaged but no source of crush_loc" << dendl;
+ mon.clog->info() << join->name << " attempted to join from " << join->addrs
+ << " but lacks a crush_location for stretch mode";
+ return true;
+ }
+ return false;
+}
+
+bool MonmapMonitor::prepare_join(MonOpRequestRef op)
+{
+ auto join = op->get_req<MMonJoin>();
+ dout(0) << "adding/updating " << join->name
+ << " at " << join->addrs << " to monitor cluster" << dendl;
+ map<string,string> existing_loc;
+ if (pending_map.contains(join->addrs)) {
+ string name = pending_map.get_name(join->addrs);
+ existing_loc = pending_map.mon_info[name].crush_loc;
+ pending_map.remove(name);
+ }
+ if (pending_map.contains(join->name))
+ pending_map.remove(join->name);
+ pending_map.add(join->name, join->addrs);
+ pending_map.mon_info[join->name].crush_loc =
+ ((join->force_loc || existing_loc.empty()) ?
+ join->crush_loc : existing_loc); + pending_map.last_changed = ceph_clock_now(); + return true; +} + +bool MonmapMonitor::should_propose(double& delay) +{ + delay = 0.0; + return true; +} + +int MonmapMonitor::get_monmap(bufferlist &bl) +{ + version_t latest_ver = get_last_committed(); + dout(10) << __func__ << " ver " << latest_ver << dendl; + + if (!mon.store->exists(get_service_name(), stringify(latest_ver))) + return -ENOENT; + + int err = get_version(latest_ver, bl); + if (err < 0) { + dout(1) << __func__ << " error obtaining monmap: " + << cpp_strerror(err) << dendl; + return err; + } + return 0; +} + +void MonmapMonitor::check_subs() +{ + const string type = "monmap"; + mon.with_session_map([this, &type](const MonSessionMap& session_map) { + auto subs = session_map.subs.find(type); + if (subs == session_map.subs.end()) + return; + for (auto sub : *subs->second) { + check_sub(sub); + } + }); +} + +void MonmapMonitor::check_sub(Subscription *sub) +{ + const auto epoch = mon.monmap->get_epoch(); + dout(10) << __func__ + << " monmap next " << sub->next + << " have " << epoch << dendl; + if (sub->next <= epoch) { + mon.send_latest_monmap(sub->session->con.get()); + if (sub->onetime) { + mon.with_session_map([sub](MonSessionMap& session_map) { + session_map.remove_sub(sub); + }); + } else { + sub->next = epoch + 1; + } + } +} + +void MonmapMonitor::tick() +{ + if (!is_active() || + !mon.is_leader()) { + return; + } + + if (mon.monmap->created.is_zero()) { + dout(10) << __func__ << " detected empty created stamp" << dendl; + utime_t ctime; + for (version_t v = 1; v <= get_last_committed(); v++) { + bufferlist bl; + int r = get_version(v, bl); + if (r < 0) { + continue; + } + MonMap m; + auto p = bl.cbegin(); + decode(m, p); + if (!m.last_changed.is_zero()) { + dout(10) << __func__ << " first monmap with last_changed is " + << v << " with " << m.last_changed << dendl; + ctime = m.last_changed; + break; + } + } + if (ctime.is_zero()) { + ctime = ceph_clock_now(); + } + dout(10) << __func__ << " updating created stamp to " << ctime << dendl; + pending_map.created = ctime; + propose_pending(); + } +} diff --git a/src/mon/MonmapMonitor.h b/src/mon/MonmapMonitor.h new file mode 100644 index 000000000..cf22ae9f8 --- /dev/null +++ b/src/mon/MonmapMonitor.h @@ -0,0 +1,111 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2009 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +/* + * The Monmap Monitor is used to track the monitors in the cluster. 
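+ *
+ * A monmap change implies a change in quorum membership, so, unlike most
+ * Paxos services, pending changes are proposed immediately (see
+ * should_propose() below) rather than being batched behind a delay.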
+ */ + +#ifndef CEPH_MONMAPMONITOR_H +#define CEPH_MONMAPMONITOR_H + +#include <map> +#include <set> + +#include "include/types.h" +#include "msg/Messenger.h" + +#include "PaxosService.h" +#include "MonMap.h" +#include "MonitorDBStore.h" + +class MonmapMonitor : public PaxosService { + public: + MonmapMonitor(Monitor &mn, Paxos &p, const std::string& service_name) + : PaxosService(mn, p, service_name) + { + } + MonMap pending_map; //the pending map awaiting passage + + void create_initial() override; + + void update_from_paxos(bool *need_bootstrap) override; + + void create_pending() override; + + void encode_pending(MonitorDBStore::TransactionRef t) override; + // we always encode the full map; we have no use for full versions + void encode_full(MonitorDBStore::TransactionRef t) override { } + + void on_active() override; + void apply_mon_features(const mon_feature_t& features, + ceph_release_t min_mon_release); + + void dump_info(ceph::Formatter *f); + + bool preprocess_query(MonOpRequestRef op) override; + bool prepare_update(MonOpRequestRef op) override; + + bool preprocess_join(MonOpRequestRef op); + bool prepare_join(MonOpRequestRef op); + + bool preprocess_command(MonOpRequestRef op); + bool prepare_command(MonOpRequestRef op); + + int get_monmap(ceph::buffer::list &bl); + + /* + * Since monitors are pretty + * important, this implementation will just write 0.0. + */ + bool should_propose(double& delay) override; + + void check_sub(Subscription *sub); + + void tick() override; + +private: + void check_subs(); + ceph::buffer::list monmap_bl; + /** + * Check validity of inputs and monitor state to + * engage stretch mode. Designed to be used with + * OSDMonitor::try_enable_stretch_mode() where we call both twice, + * first with commit=false to validate. + * @param ss: a stringstream to write errors into + * @param okay: Filled to true if okay, false if validation fails + * @param errcode: filled with -errno if there's a problem + * @param commit: true if we should commit the change, false if just testing + * @param tiebreaker_mon: the name of the monitor to declare tiebreaker + * @param dividing_bucket: the bucket type (eg 'dc') that divides the cluster + */ + void try_enable_stretch_mode(stringstream& ss, bool *okay, + int *errcode, bool commit, + const string& tiebreaker_mon, + const string& dividing_bucket); + +public: + /** + * Set us to degraded stretch mode. Put the dead_mons in + * the MonMap. + */ + void trigger_degraded_stretch_mode(const set<string>& dead_mons); + /** + * Set us to healthy stretch mode: clear out the + * down list to allow any non-tiebreaker mon to be the leader again. + */ + void trigger_healthy_stretch_mode(); +}; + + +#endif diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc new file mode 100644 index 000000000..3191ed5bf --- /dev/null +++ b/src/mon/OSDMonitor.cc @@ -0,0 +1,14832 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com> + * Copyright (C) 2014 Red Hat <contact@redhat.com> + * + * Author: Loic Dachary <loic@dachary.org> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ *
+ */
+
+#include <algorithm>
+#include <boost/algorithm/string.hpp>
+#include <experimental/iterator>
+#include <locale>
+#include <sstream>
+
+#include "mon/OSDMonitor.h"
+#include "mon/Monitor.h"
+#include "mon/MDSMonitor.h"
+#include "mon/MgrStatMonitor.h"
+#include "mon/AuthMonitor.h"
+#include "mon/KVMonitor.h"
+
+#include "mon/MonitorDBStore.h"
+#include "mon/Session.h"
+
+#include "crush/CrushWrapper.h"
+#include "crush/CrushTester.h"
+#include "crush/CrushTreeDumper.h"
+
+#include "messages/MOSDBeacon.h"
+#include "messages/MOSDFailure.h"
+#include "messages/MOSDMarkMeDown.h"
+#include "messages/MOSDMarkMeDead.h"
+#include "messages/MOSDFull.h"
+#include "messages/MOSDMap.h"
+#include "messages/MMonGetOSDMap.h"
+#include "messages/MOSDBoot.h"
+#include "messages/MOSDAlive.h"
+#include "messages/MPoolOp.h"
+#include "messages/MPoolOpReply.h"
+#include "messages/MOSDPGCreate.h"
+#include "messages/MOSDPGCreate2.h"
+#include "messages/MOSDPGCreated.h"
+#include "messages/MOSDPGTemp.h"
+#include "messages/MOSDPGReadyToMerge.h"
+#include "messages/MMonCommand.h"
+#include "messages/MRemoveSnaps.h"
+#include "messages/MOSDScrub.h"
+#include "messages/MRoute.h"
+#include "messages/MMonGetPurgedSnaps.h"
+#include "messages/MMonGetPurgedSnapsReply.h"
+
+#include "common/TextTable.h"
+#include "common/Timer.h"
+#include "common/ceph_argparse.h"
+#include "common/perf_counters.h"
+#include "common/PriorityCache.h"
+#include "common/strtol.h"
+#include "common/numa.h"
+
+#include "common/config.h"
+#include "common/errno.h"
+
+#include "erasure-code/ErasureCodePlugin.h"
+#include "compressor/Compressor.h"
+#include "common/Checksummer.h"
+
+#include "include/compat.h"
+#include "include/ceph_assert.h"
+#include "include/stringify.h"
+#include "include/util.h"
+#include "common/cmdparse.h"
+#include "include/str_list.h"
+#include "include/str_map.h"
+#include "include/scope_guard.h"
+#include "perfglue/heap_profiler.h"
+
+#include "auth/cephx/CephxKeyServer.h"
+#include "osd/OSDCap.h"
+
+#include "json_spirit/json_spirit_reader.h"
+
+#include <boost/algorithm/string/predicate.hpp>
+
+using std::dec;
+using std::hex;
+using std::list;
+using std::map;
+using std::make_pair;
+using std::ostringstream;
+using std::pair;
+using std::set;
+using std::string;
+using std::stringstream;
+using std::to_string;
+using std::vector;
+
+using ceph::bufferlist;
+using ceph::decode;
+using ceph::encode;
+using ceph::ErasureCodeInterfaceRef;
+using ceph::ErasureCodePluginRegistry;
+using ceph::ErasureCodeProfile;
+using ceph::Formatter;
+using ceph::JSONFormatter;
+using ceph::make_message;
+
+#define dout_subsys ceph_subsys_mon
+static const string OSD_PG_CREATING_PREFIX("osd_pg_creating");
+static const string OSD_METADATA_PREFIX("osd_metadata");
+static const string OSD_SNAP_PREFIX("osd_snap");
+
+/*
+
+ OSD snapshot metadata
+ ---------------------
+
+ -- starting with mimic, removed in octopus --
+
+ "removed_epoch_%llu_%08lx" % (pool, epoch)
+ -> interval_set<snapid_t>
+
+ "removed_snap_%llu_%016llx" % (pool, last_snap)
+ -> { first_snap, end_snap, epoch } (last_snap = end_snap - 1)
+
+
+ -- starting with mimic --
+
+ "purged_snap_%llu_%016llx" % (pool, last_snap)
+ -> { first_snap, end_snap, epoch } (last_snap = end_snap - 1)
+
+ - note that the {removed,purged}_snap keys put the last snap in the key so
+ that we can use forward iteration only to search for a snap in an
+ interval. e.g., to test if snap N is removed/purged, we'll find a key
+ >= N that either does or doesn't contain the given snap.
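+
+ (worked example, for illustration: if pool 1 has purged snap ranges
+ [1,5) and [8,11), the keys encode last_snaps 4 and 10. to test snap 9,
+ seek the first key >= 9, i.e. the last_snap=10 key; 9 falls inside its
+ [8,11) interval, so it was purged. snap 6 finds the same key but falls
+ outside the interval, so it was not purged.)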
+ + + -- starting with octopus -- + + "purged_epoch_%08lx" % epoch + -> map<int64_t,interval_set<snapid_t>> + + */ +using namespace TOPNSPC::common; +namespace { + +struct OSDMemCache : public PriorityCache::PriCache { + OSDMonitor *osdmon; + int64_t cache_bytes[PriorityCache::Priority::LAST+1] = {0}; + int64_t committed_bytes = 0; + double cache_ratio = 0; + + OSDMemCache(OSDMonitor *m) : osdmon(m) {}; + + virtual uint64_t _get_used_bytes() const = 0; + + virtual int64_t request_cache_bytes( + PriorityCache::Priority pri, uint64_t total_cache) const { + int64_t assigned = get_cache_bytes(pri); + + switch (pri) { + // All cache items are currently set to have PRI1 priority + case PriorityCache::Priority::PRI1: + { + int64_t request = _get_used_bytes(); + return (request > assigned) ? request - assigned : 0; + } + default: + break; + } + return -EOPNOTSUPP; + } + + virtual int64_t get_cache_bytes(PriorityCache::Priority pri) const { + return cache_bytes[pri]; + } + + virtual int64_t get_cache_bytes() const { + int64_t total = 0; + + for (int i = 0; i < PriorityCache::Priority::LAST + 1; i++) { + PriorityCache::Priority pri = static_cast<PriorityCache::Priority>(i); + total += get_cache_bytes(pri); + } + return total; + } + + virtual void set_cache_bytes(PriorityCache::Priority pri, int64_t bytes) { + cache_bytes[pri] = bytes; + } + virtual void add_cache_bytes(PriorityCache::Priority pri, int64_t bytes) { + cache_bytes[pri] += bytes; + } + virtual int64_t commit_cache_size(uint64_t total_cache) { + committed_bytes = PriorityCache::get_chunk( + get_cache_bytes(), total_cache); + return committed_bytes; + } + virtual int64_t get_committed_size() const { + return committed_bytes; + } + virtual double get_cache_ratio() const { + return cache_ratio; + } + virtual void set_cache_ratio(double ratio) { + cache_ratio = ratio; + } + virtual string get_cache_name() const = 0; +}; + +struct IncCache : public OSDMemCache { + IncCache(OSDMonitor *m) : OSDMemCache(m) {}; + + virtual uint64_t _get_used_bytes() const { + return osdmon->inc_osd_cache.get_bytes(); + } + + virtual string get_cache_name() const { + return "OSDMap Inc Cache"; + } + + uint64_t _get_num_osdmaps() const { + return osdmon->inc_osd_cache.get_size(); + } +}; + +struct FullCache : public OSDMemCache { + FullCache(OSDMonitor *m) : OSDMemCache(m) {}; + + virtual uint64_t _get_used_bytes() const { + return osdmon->full_osd_cache.get_bytes(); + } + + virtual string get_cache_name() const { + return "OSDMap Full Cache"; + } + + uint64_t _get_num_osdmaps() const { + return osdmon->full_osd_cache.get_size(); + } +}; + +std::shared_ptr<IncCache> inc_cache; +std::shared_ptr<FullCache> full_cache; + +const uint32_t MAX_POOL_APPLICATIONS = 4; +const uint32_t MAX_POOL_APPLICATION_KEYS = 64; +const uint32_t MAX_POOL_APPLICATION_LENGTH = 128; + +bool is_osd_writable(const OSDCapGrant& grant, const std::string* pool_name) { + // Note: this doesn't include support for the application tag match + if ((grant.spec.allow & OSD_CAP_W) != 0) { + auto& match = grant.match; + if (match.is_match_all()) { + return true; + } else if (pool_name != nullptr && + !match.pool_namespace.pool_name.empty() && + match.pool_namespace.pool_name == *pool_name) { + return true; + } + } + return false; +} + +bool is_unmanaged_snap_op_permitted(CephContext* cct, + const KeyServer& key_server, + const EntityName& entity_name, + const MonCap& mon_caps, + const entity_addr_t& peer_socket_addr, + const std::string* pool_name) +{ + typedef std::map<std::string, std::string> 
CommandArgs; + + if (mon_caps.is_capable( + cct, entity_name, "osd", + "osd pool op unmanaged-snap", + (pool_name == nullptr ? + CommandArgs{} /* pool DNE, require unrestricted cap */ : + CommandArgs{{"poolname", *pool_name}}), + false, true, false, + peer_socket_addr)) { + return true; + } + + AuthCapsInfo caps_info; + if (!key_server.get_service_caps(entity_name, CEPH_ENTITY_TYPE_OSD, + caps_info)) { + dout(10) << "unable to locate OSD cap data for " << entity_name + << " in auth db" << dendl; + return false; + } + + string caps_str; + if (caps_info.caps.length() > 0) { + auto p = caps_info.caps.cbegin(); + try { + decode(caps_str, p); + } catch (const ceph::buffer::error &err) { + derr << "corrupt OSD cap data for " << entity_name << " in auth db" + << dendl; + return false; + } + } + + OSDCap osd_cap; + if (!osd_cap.parse(caps_str, nullptr)) { + dout(10) << "unable to parse OSD cap data for " << entity_name + << " in auth db" << dendl; + return false; + } + + // if the entity has write permissions in one or all pools, permit + // usage of unmanaged-snapshots + if (osd_cap.allow_all()) { + return true; + } + + for (auto& grant : osd_cap.grants) { + if (grant.profile.is_valid()) { + for (auto& profile_grant : grant.profile_grants) { + if (is_osd_writable(profile_grant, pool_name)) { + return true; + } + } + } else if (is_osd_writable(grant, pool_name)) { + return true; + } + } + + return false; +} + +} // anonymous namespace + +void LastEpochClean::Lec::report(unsigned pg_num, ps_t ps, + epoch_t last_epoch_clean) +{ + if (ps >= pg_num) { + // removed PG + return; + } + epoch_by_pg.resize(pg_num, 0); + const auto old_lec = epoch_by_pg[ps]; + if (old_lec >= last_epoch_clean) { + // stale lec + return; + } + epoch_by_pg[ps] = last_epoch_clean; + if (last_epoch_clean < floor) { + floor = last_epoch_clean; + } else if (last_epoch_clean > floor) { + if (old_lec == floor) { + // probably should increase floor? 
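+ // the PG that was holding the floor down just advanced past it, so
+ // rescan all PGs for the new minimum last_epoch_clean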
+ auto new_floor = std::min_element(std::begin(epoch_by_pg), + std::end(epoch_by_pg)); + floor = *new_floor; + } + } + if (ps != next_missing) { + return; + } + for (; next_missing < epoch_by_pg.size(); next_missing++) { + if (epoch_by_pg[next_missing] == 0) { + break; + } + } +} + +void LastEpochClean::remove_pool(uint64_t pool) +{ + report_by_pool.erase(pool); +} + +void LastEpochClean::report(unsigned pg_num, const pg_t& pg, + epoch_t last_epoch_clean) +{ + auto& lec = report_by_pool[pg.pool()]; + return lec.report(pg_num, pg.ps(), last_epoch_clean); +} + +epoch_t LastEpochClean::get_lower_bound(const OSDMap& latest) const +{ + auto floor = latest.get_epoch(); + for (auto& pool : latest.get_pools()) { + auto reported = report_by_pool.find(pool.first); + if (reported == report_by_pool.end()) { + return 0; + } + if (reported->second.next_missing < pool.second.get_pg_num()) { + return 0; + } + if (reported->second.floor < floor) { + floor = reported->second.floor; + } + } + return floor; +} + +void LastEpochClean::dump(Formatter *f) const +{ + f->open_array_section("per_pool"); + + for (auto& [pool, lec] : report_by_pool) { + f->open_object_section("pool"); + f->dump_unsigned("poolid", pool); + f->dump_unsigned("floor", lec.floor); + f->close_section(); + } + + f->close_section(); +} + +class C_UpdateCreatingPGs : public Context { +public: + OSDMonitor *osdmon; + utime_t start; + epoch_t epoch; + C_UpdateCreatingPGs(OSDMonitor *osdmon, epoch_t e) : + osdmon(osdmon), start(ceph_clock_now()), epoch(e) {} + void finish(int r) override { + if (r >= 0) { + utime_t end = ceph_clock_now(); + dout(10) << "osdmap epoch " << epoch << " mapping took " + << (end - start) << " seconds" << dendl; + osdmon->update_creating_pgs(); + osdmon->check_pg_creates_subs(); + } + } +}; + +#undef dout_prefix +#define dout_prefix _prefix(_dout, mon, osdmap) +static ostream& _prefix(std::ostream *_dout, Monitor &mon, const OSDMap& osdmap) { + return *_dout << "mon." << mon.name << "@" << mon.rank + << "(" << mon.get_state_name() + << ").osd e" << osdmap.get_epoch() << " "; +} + +OSDMonitor::OSDMonitor( + CephContext *cct, + Monitor &mn, + Paxos &p, + const string& service_name) + : PaxosService(mn, p, service_name), + cct(cct), + inc_osd_cache(g_conf()->mon_osd_cache_size), + full_osd_cache(g_conf()->mon_osd_cache_size), + has_osdmap_manifest(false), + mapper(mn.cct, &mn.cpu_tp) +{ + inc_cache = std::make_shared<IncCache>(this); + full_cache = std::make_shared<FullCache>(this); + cct->_conf.add_observer(this); + int r = _set_cache_sizes(); + if (r < 0) { + derr << __func__ << " using default osd cache size - mon_osd_cache_size (" + << g_conf()->mon_osd_cache_size + << ") without priority cache management" + << dendl; + } +} + +const char **OSDMonitor::get_tracked_conf_keys() const +{ + static const char* KEYS[] = { + "mon_memory_target", + "mon_memory_autotune", + "rocksdb_cache_size", + NULL + }; + return KEYS; +} + +void OSDMonitor::handle_conf_change(const ConfigProxy& conf, + const std::set<std::string> &changed) +{ + dout(10) << __func__ << " " << changed << dendl; + + if (changed.count("mon_memory_autotune")) { + _set_cache_autotuning(); + } + if (changed.count("mon_memory_target") || + changed.count("rocksdb_cache_size")) { + int r = _update_mon_cache_settings(); + if (r < 0) { + derr << __func__ << " mon_memory_target:" + << g_conf()->mon_memory_target + << " rocksdb_cache_size:" + << g_conf()->rocksdb_cache_size + << ". Unable to update cache size." 
+ << dendl; + } + } +} + +void OSDMonitor::_set_cache_autotuning() +{ + if (!g_conf()->mon_memory_autotune && pcm != nullptr) { + // Disable cache autotuning + std::lock_guard l(balancer_lock); + pcm = nullptr; + } + + if (g_conf()->mon_memory_autotune && pcm == nullptr) { + int r = register_cache_with_pcm(); + if (r < 0) { + dout(10) << __func__ + << " Error while registering osdmon caches with pcm." + << " Cache auto tuning not enabled." + << dendl; + mon_memory_autotune = false; + } else { + mon_memory_autotune = true; + } + } +} + +int OSDMonitor::_update_mon_cache_settings() +{ + if (g_conf()->mon_memory_target <= 0 || + g_conf()->mon_memory_target < mon_memory_min || + g_conf()->rocksdb_cache_size <= 0) { + return -EINVAL; + } + + if (pcm == nullptr && rocksdb_binned_kv_cache == nullptr) { + derr << __func__ << " not using pcm and rocksdb" << dendl; + return -EINVAL; + } + + uint64_t old_mon_memory_target = mon_memory_target; + uint64_t old_rocksdb_cache_size = rocksdb_cache_size; + + // Set the new pcm memory cache sizes + mon_memory_target = g_conf()->mon_memory_target; + rocksdb_cache_size = g_conf()->rocksdb_cache_size; + + uint64_t base = mon_memory_base; + double fragmentation = mon_memory_fragmentation; + uint64_t target = mon_memory_target; + uint64_t min = mon_memory_min; + uint64_t max = min; + + uint64_t ltarget = (1.0 - fragmentation) * target; + if (ltarget > base + min) { + max = ltarget - base; + } + + int r = _set_cache_ratios(); + if (r < 0) { + derr << __func__ << " Cache ratios for pcm could not be set." + << " Review the kv (rocksdb) and mon_memory_target sizes." + << dendl; + mon_memory_target = old_mon_memory_target; + rocksdb_cache_size = old_rocksdb_cache_size; + return -EINVAL; + } + + if (mon_memory_autotune && pcm != nullptr) { + std::lock_guard l(balancer_lock); + // set pcm cache levels + pcm->set_target_memory(target); + pcm->set_min_memory(min); + pcm->set_max_memory(max); + // tune memory based on new values + pcm->tune_memory(); + pcm->balance(); + _set_new_cache_sizes(); + dout(1) << __func__ << " Updated mon cache setting." + << " target: " << target + << " min: " << min + << " max: " << max + << dendl; + } + return 0; +} + +int OSDMonitor::_set_cache_sizes() +{ + if (g_conf()->mon_memory_autotune) { + // set the new osdmon cache targets to be managed by pcm + mon_osd_cache_size = g_conf()->mon_osd_cache_size; + rocksdb_cache_size = g_conf()->rocksdb_cache_size; + mon_memory_base = cct->_conf.get_val<Option::size_t>("osd_memory_base"); + mon_memory_fragmentation = cct->_conf.get_val<double>("osd_memory_expected_fragmentation"); + mon_memory_target = g_conf()->mon_memory_target; + mon_memory_min = g_conf()->mon_osd_cache_size_min; + if (mon_memory_target <= 0 || mon_memory_min <= 0) { + derr << __func__ << " mon_memory_target:" << mon_memory_target + << " mon_memory_min:" << mon_memory_min + << ". Invalid size option(s) provided." 
+ << dendl; + return -EINVAL; + } + // Set the initial inc and full LRU cache sizes + inc_osd_cache.set_bytes(mon_memory_min); + full_osd_cache.set_bytes(mon_memory_min); + mon_memory_autotune = g_conf()->mon_memory_autotune; + } + return 0; +} + +bool OSDMonitor::_have_pending_crush() +{ + return pending_inc.crush.length() > 0; +} + +CrushWrapper &OSDMonitor::_get_stable_crush() +{ + return *osdmap.crush; +} + +void OSDMonitor::_get_pending_crush(CrushWrapper& newcrush) +{ + bufferlist bl; + if (pending_inc.crush.length()) + bl = pending_inc.crush; + else + osdmap.crush->encode(bl, CEPH_FEATURES_SUPPORTED_DEFAULT); + + auto p = bl.cbegin(); + newcrush.decode(p); +} + +void OSDMonitor::create_initial() +{ + dout(10) << "create_initial for " << mon.monmap->fsid << dendl; + + OSDMap newmap; + + bufferlist bl; + mon.store->get("mkfs", "osdmap", bl); + + if (bl.length()) { + newmap.decode(bl); + newmap.set_fsid(mon.monmap->fsid); + } else { + newmap.build_simple(cct, 0, mon.monmap->fsid, 0); + } + newmap.set_epoch(1); + newmap.created = newmap.modified = ceph_clock_now(); + + // new clusters should sort bitwise by default. + newmap.set_flag(CEPH_OSDMAP_SORTBITWISE); + + newmap.flags |= + CEPH_OSDMAP_RECOVERY_DELETES | + CEPH_OSDMAP_PURGED_SNAPDIRS | + CEPH_OSDMAP_PGLOG_HARDLIMIT; + newmap.full_ratio = g_conf()->mon_osd_full_ratio; + if (newmap.full_ratio > 1.0) newmap.full_ratio /= 100; + newmap.backfillfull_ratio = g_conf()->mon_osd_backfillfull_ratio; + if (newmap.backfillfull_ratio > 1.0) newmap.backfillfull_ratio /= 100; + newmap.nearfull_ratio = g_conf()->mon_osd_nearfull_ratio; + if (newmap.nearfull_ratio > 1.0) newmap.nearfull_ratio /= 100; + + // new cluster should require latest by default + if (g_conf().get_val<bool>("mon_debug_no_require_pacific")) { + if (g_conf().get_val<bool>("mon_debug_no_require_octopus")) { + derr << __func__ << " mon_debug_no_require_pacific and octopus=true" << dendl; + newmap.require_osd_release = ceph_release_t::nautilus; + } else { + derr << __func__ << " mon_debug_no_require_pacific=true" << dendl; + newmap.require_osd_release = ceph_release_t::octopus; + } + } else { + newmap.require_osd_release = ceph_release_t::pacific; + } + + if (newmap.require_osd_release >= ceph_release_t::octopus) { + ceph_release_t r = ceph_release_from_name( + g_conf()->mon_osd_initial_require_min_compat_client); + if (!r) { + ceph_abort_msg("mon_osd_initial_require_min_compat_client is not valid"); + } + newmap.require_min_compat_client = r; + } + + // encode into pending incremental + uint64_t features = newmap.get_encoding_features(); + newmap.encode(pending_inc.fullmap, + features | CEPH_FEATURE_RESERVED); + pending_inc.full_crc = newmap.get_crc(); + dout(20) << " full crc " << pending_inc.full_crc << dendl; +} + +void OSDMonitor::get_store_prefixes(std::set<string>& s) const +{ + s.insert(service_name); + s.insert(OSD_PG_CREATING_PREFIX); + s.insert(OSD_METADATA_PREFIX); + s.insert(OSD_SNAP_PREFIX); +} + +void OSDMonitor::update_from_paxos(bool *need_bootstrap) +{ + // we really don't care if the version has been updated, because we may + // have trimmed without having increased the last committed; yet, we may + // need to update the in-memory manifest. 
+ load_osdmap_manifest(); + + version_t version = get_last_committed(); + if (version == osdmap.epoch) + return; + ceph_assert(version > osdmap.epoch); + + dout(15) << "update_from_paxos paxos e " << version + << ", my e " << osdmap.epoch << dendl; + + int prev_num_up_osd = osdmap.num_up_osd; + + if (mapping_job) { + if (!mapping_job->is_done()) { + dout(1) << __func__ << " mapping job " + << mapping_job.get() << " did not complete, " + << mapping_job->shards << " left, canceling" << dendl; + mapping_job->abort(); + } + mapping_job.reset(); + } + + load_health(); + + /* + * We will possibly have a stashed latest that *we* wrote, and we will + * always be sure to have the oldest full map in the first..last range + * due to encode_trim_extra(), which includes the oldest full map in the trim + * transaction. + * + * encode_trim_extra() does not however write the full map's + * version to 'full_latest'. This is only done when we are building the + * full maps from the incremental versions. But don't panic! We make sure + * that the following conditions find whichever full map version is newer. + */ + version_t latest_full = get_version_latest_full(); + if (latest_full == 0 && get_first_committed() > 1) + latest_full = get_first_committed(); + + if (get_first_committed() > 1 && + latest_full < get_first_committed()) { + // the monitor could be just sync'ed with its peer, and the latest_full key + // is not encoded in the paxos commits in encode_pending(), so we need to + // make sure we get it pointing to a proper version. + version_t lc = get_last_committed(); + version_t fc = get_first_committed(); + + dout(10) << __func__ << " looking for valid full map in interval" + << " [" << fc << ", " << lc << "]" << dendl; + + latest_full = 0; + for (version_t v = lc; v >= fc; v--) { + string full_key = "full_" + stringify(v); + if (mon.store->exists(get_service_name(), full_key)) { + dout(10) << __func__ << " found latest full map v " << v << dendl; + latest_full = v; + break; + } + } + + ceph_assert(latest_full > 0); + auto t(std::make_shared<MonitorDBStore::Transaction>()); + put_version_latest_full(t, latest_full); + mon.store->apply_transaction(t); + dout(10) << __func__ << " updated the on-disk full map version to " + << latest_full << dendl; + } + + if ((latest_full > 0) && (latest_full > osdmap.epoch)) { + bufferlist latest_bl; + get_version_full(latest_full, latest_bl); + ceph_assert(latest_bl.length() != 0); + dout(7) << __func__ << " loading latest full map e" << latest_full << dendl; + osdmap = OSDMap(); + osdmap.decode(latest_bl); + } + + bufferlist bl; + if (!mon.store->get(OSD_PG_CREATING_PREFIX, "creating", bl)) { + auto p = bl.cbegin(); + std::lock_guard<std::mutex> l(creating_pgs_lock); + creating_pgs.decode(p); + dout(7) << __func__ << " loading creating_pgs last_scan_epoch " + << creating_pgs.last_scan_epoch + << " with " << creating_pgs.pgs.size() << " pgs" << dendl; + } else { + dout(1) << __func__ << " missing creating pgs; upgrade from post-kraken?" + << dendl; + } + + // walk through incrementals + MonitorDBStore::TransactionRef t; + size_t tx_size = 0; + while (version > osdmap.epoch) { + bufferlist inc_bl; + int err = get_version(osdmap.epoch+1, inc_bl); + ceph_assert(err == 0); + ceph_assert(inc_bl.length()); + // set priority cache manager levels if the osdmap is + // being populated for the first time. 
+ if (mon_memory_autotune && pcm == nullptr) { + int r = register_cache_with_pcm(); + if (r < 0) { + dout(10) << __func__ + << " Error while registering osdmon caches with pcm." + << " Proceeding without cache auto tuning." + << dendl; + } + } + + dout(7) << "update_from_paxos applying incremental " << osdmap.epoch+1 + << dendl; + OSDMap::Incremental inc(inc_bl); + err = osdmap.apply_incremental(inc); + ceph_assert(err == 0); + + if (!t) + t.reset(new MonitorDBStore::Transaction); + + // Write out the full map for all past epochs. Encode the full + // map with the same features as the incremental. If we don't + // know, use the quorum features. If we don't know those either, + // encode with all features. + uint64_t f = inc.encode_features; + if (!f) + f = mon.get_quorum_con_features(); + if (!f) + f = -1; + bufferlist full_bl; + osdmap.encode(full_bl, f | CEPH_FEATURE_RESERVED); + tx_size += full_bl.length(); + + bufferlist orig_full_bl; + get_version_full(osdmap.epoch, orig_full_bl); + if (orig_full_bl.length()) { + // the primary provided the full map + ceph_assert(inc.have_crc); + if (inc.full_crc != osdmap.crc) { + // This will happen if the mons were running mixed versions in + // the past or some other circumstance made the full encoded + // maps divergent. Reloading here will bring us back into + // sync with the primary for this and all future maps. OSDs + // will also be brought back into sync when they discover the + // crc mismatch and request a full map from a mon. + derr << __func__ << " full map CRC mismatch, resetting to canonical" + << dendl; + + dout(20) << __func__ << " my (bad) full osdmap:\n"; + JSONFormatter jf(true); + jf.dump_object("osdmap", osdmap); + jf.flush(*_dout); + *_dout << "\nhexdump:\n"; + full_bl.hexdump(*_dout); + *_dout << dendl; + + osdmap = OSDMap(); + osdmap.decode(orig_full_bl); + + dout(20) << __func__ << " canonical full osdmap:\n"; + JSONFormatter jf(true); + jf.dump_object("osdmap", osdmap); + jf.flush(*_dout); + *_dout << "\nhexdump:\n"; + orig_full_bl.hexdump(*_dout); + *_dout << dendl; + } + } else { + ceph_assert(!inc.have_crc); + put_version_full(t, osdmap.epoch, full_bl); + } + put_version_latest_full(t, osdmap.epoch); + + // share + dout(1) << osdmap << dendl; + + if (osdmap.epoch == 1) { + t->erase("mkfs", "osdmap"); + } + + if (tx_size > g_conf()->mon_sync_max_payload_size*2) { + mon.store->apply_transaction(t); + t = MonitorDBStore::TransactionRef(); + tx_size = 0; + } + for (const auto [osd, state] : inc.new_state) { + if (state & CEPH_OSD_UP) { + // could be marked up *or* down, but we're too lazy to check which + last_osd_report.erase(osd); + } + if (state & CEPH_OSD_OUT) { + // could be marked in *or* out, but we can safely drop it + osd_epochs.erase(osd); + } + } + for (const auto [osd, weight] : inc.new_weight) { + if (weight == CEPH_OSD_OUT) { + // manually marked out, so drop it + osd_epochs.erase(osd); + } + } + } + + if (t) { + mon.store->apply_transaction(t); + } + + bool marked_osd_down = false; + for (int o = 0; o < osdmap.get_max_osd(); o++) { + if (osdmap.is_out(o)) + continue; + auto found = down_pending_out.find(o); + if (osdmap.is_down(o)) { + // populate down -> out map + if (found == down_pending_out.end()) { + dout(10) << " adding osd." << o << " to down_pending_out map" << dendl; + down_pending_out[o] = ceph_clock_now(); + marked_osd_down = true; + } + } else { + if (found != down_pending_out.end()) { + dout(10) << " removing osd." 
<< o << " from down_pending_out map" << dendl; + down_pending_out.erase(found); + } + } + } + // XXX: need to trim MonSession connected with a osd whose id > max_osd? + + check_osdmap_subs(); + check_pg_creates_subs(); + + share_map_with_random_osd(); + update_logger(); + process_failures(); + + // make sure our feature bits reflect the latest map + update_msgr_features(); + + if (!mon.is_leader()) { + // will be called by on_active() on the leader, avoid doing so twice + start_mapping(); + } + if (osdmap.stretch_mode_enabled) { + dout(20) << "Stretch mode enabled in this map" << dendl; + mon.try_engage_stretch_mode(); + if (osdmap.degraded_stretch_mode) { + dout(20) << "Degraded stretch mode set in this map" << dendl; + if (!osdmap.recovering_stretch_mode) { + mon.set_degraded_stretch_mode(); + if (prev_num_up_osd < osdmap.num_up_osd && + (osdmap.num_up_osd / (double)osdmap.num_osd) > + cct->_conf.get_val<double>("mon_stretch_cluster_recovery_ratio")) { + // TODO: This works for 2-site clusters when the OSD maps are appropriately + // trimmed and everything is "normal" but not if you have a lot of out OSDs + // you're ignoring or in some really degenerate failure cases + dout(10) << "Enabling recovery stretch mode in this map" << dendl; + mon.go_recovery_stretch_mode(); + } + } else { + mon.set_recovery_stretch_mode(); + } + } else { + mon.set_healthy_stretch_mode(); + } + if (marked_osd_down && + (!osdmap.degraded_stretch_mode || osdmap.recovering_stretch_mode)) { + dout(20) << "Checking degraded stretch mode due to osd changes" << dendl; + mon.maybe_go_degraded_stretch_mode(); + } + } +} + +int OSDMonitor::register_cache_with_pcm() +{ + if (mon_memory_target <= 0 || mon_memory_min <= 0) { + derr << __func__ << " Invalid memory size specified for mon caches." + << " Caches will not be auto-tuned." + << dendl; + return -EINVAL; + } + uint64_t base = mon_memory_base; + double fragmentation = mon_memory_fragmentation; + // For calculating total target memory, consider rocksdb cache size. + uint64_t target = mon_memory_target; + uint64_t min = mon_memory_min; + uint64_t max = min; + + // Apply the same logic as in bluestore to set the max amount + // of memory to use for cache. Assume base memory for OSDMaps + // and then add in some overhead for fragmentation. + uint64_t ltarget = (1.0 - fragmentation) * target; + if (ltarget > base + min) { + max = ltarget - base; + } + + rocksdb_binned_kv_cache = mon.store->get_priority_cache(); + if (!rocksdb_binned_kv_cache) { + derr << __func__ << " not using rocksdb" << dendl; + return -EINVAL; + } + + int r = _set_cache_ratios(); + if (r < 0) { + derr << __func__ << " Cache ratios for pcm could not be set." + << " Review the kv (rocksdb) and mon_memory_target sizes." + << dendl; + return -EINVAL; + } + + pcm = std::make_shared<PriorityCache::Manager>( + cct, min, max, target, true); + pcm->insert("kv", rocksdb_binned_kv_cache, true); + pcm->insert("inc", inc_cache, true); + pcm->insert("full", full_cache, true); + dout(1) << __func__ << " pcm target: " << target + << " pcm max: " << max + << " pcm min: " << min + << " inc_osd_cache size: " << inc_osd_cache.get_size() + << dendl; + return 0; +} + +int OSDMonitor::_set_cache_ratios() +{ + double old_cache_kv_ratio = cache_kv_ratio; + + // Set the cache ratios for kv(rocksdb), inc and full caches + cache_kv_ratio = (double)rocksdb_cache_size / (double)mon_memory_target; + if (cache_kv_ratio >= 1.0) { + derr << __func__ << " Cache kv ratio (" << cache_kv_ratio + << ") must be in range [0,<1.0]." 
+ << dendl; + cache_kv_ratio = old_cache_kv_ratio; + return -EINVAL; + } + rocksdb_binned_kv_cache->set_cache_ratio(cache_kv_ratio); + cache_inc_ratio = cache_full_ratio = (1.0 - cache_kv_ratio) / 2; + inc_cache->set_cache_ratio(cache_inc_ratio); + full_cache->set_cache_ratio(cache_full_ratio); + + dout(1) << __func__ << " kv ratio " << cache_kv_ratio + << " inc ratio " << cache_inc_ratio + << " full ratio " << cache_full_ratio + << dendl; + return 0; +} + +void OSDMonitor::start_mapping() +{ + // initiate mapping job + if (mapping_job) { + dout(10) << __func__ << " canceling previous mapping_job " << mapping_job.get() + << dendl; + mapping_job->abort(); + } + if (!osdmap.get_pools().empty()) { + auto fin = new C_UpdateCreatingPGs(this, osdmap.get_epoch()); + mapping_job = mapping.start_update(osdmap, mapper, + g_conf()->mon_osd_mapping_pgs_per_chunk); + dout(10) << __func__ << " started mapping job " << mapping_job.get() + << " at " << fin->start << dendl; + mapping_job->set_finish_event(fin); + } else { + dout(10) << __func__ << " no pools, no mapping job" << dendl; + mapping_job = nullptr; + } +} + +void OSDMonitor::update_msgr_features() +{ + const int types[] = { + entity_name_t::TYPE_OSD, + entity_name_t::TYPE_CLIENT, + entity_name_t::TYPE_MDS, + entity_name_t::TYPE_MON + }; + for (int type : types) { + uint64_t mask; + uint64_t features = osdmap.get_features(type, &mask); + if ((mon.messenger->get_policy(type).features_required & mask) != features) { + dout(0) << "crush map has features " << features << ", adjusting msgr requires" << dendl; + ceph::net::Policy p = mon.messenger->get_policy(type); + p.features_required = (p.features_required & ~mask) | features; + mon.messenger->set_policy(type, p); + } + } +} + +void OSDMonitor::on_active() +{ + update_logger(); + + if (mon.is_leader()) { + mon.clog->debug() << "osdmap " << osdmap; + if (!priority_convert) { + // Only do this once at start-up + convert_pool_priorities(); + priority_convert = true; + } + } else { + list<MonOpRequestRef> ls; + take_all_failures(ls); + while (!ls.empty()) { + MonOpRequestRef op = ls.front(); + op->mark_osdmon_event(__func__); + dispatch(op); + ls.pop_front(); + } + } + start_mapping(); +} + +void OSDMonitor::on_restart() +{ + last_osd_report.clear(); +} + +void OSDMonitor::on_shutdown() +{ + dout(10) << __func__ << dendl; + if (mapping_job) { + dout(10) << __func__ << " canceling previous mapping_job " << mapping_job.get() + << dendl; + mapping_job->abort(); + } + + // discard failure info, waiters + list<MonOpRequestRef> ls; + take_all_failures(ls); + ls.clear(); +} + +void OSDMonitor::update_logger() +{ + dout(10) << "update_logger" << dendl; + + mon.cluster_logger->set(l_cluster_num_osd, osdmap.get_num_osds()); + mon.cluster_logger->set(l_cluster_num_osd_up, osdmap.get_num_up_osds()); + mon.cluster_logger->set(l_cluster_num_osd_in, osdmap.get_num_in_osds()); + mon.cluster_logger->set(l_cluster_osd_epoch, osdmap.get_epoch()); +} + +void OSDMonitor::create_pending() +{ + pending_inc = OSDMap::Incremental(osdmap.epoch+1); + pending_inc.fsid = mon.monmap->fsid; + pending_metadata.clear(); + pending_metadata_rm.clear(); + pending_pseudo_purged_snaps.clear(); + + dout(10) << "create_pending e " << pending_inc.epoch << dendl; + + // safety checks (this shouldn't really happen) + { + if (osdmap.backfillfull_ratio <= 0) { + pending_inc.new_backfillfull_ratio = g_conf()->mon_osd_backfillfull_ratio; + if (pending_inc.new_backfillfull_ratio > 1.0) + pending_inc.new_backfillfull_ratio /= 100; + dout(1) << 
__func__ << " setting backfillfull_ratio = " + << pending_inc.new_backfillfull_ratio << dendl; + } + if (osdmap.full_ratio <= 0) { + pending_inc.new_full_ratio = g_conf()->mon_osd_full_ratio; + if (pending_inc.new_full_ratio > 1.0) + pending_inc.new_full_ratio /= 100; + dout(1) << __func__ << " setting full_ratio = " + << pending_inc.new_full_ratio << dendl; + } + if (osdmap.nearfull_ratio <= 0) { + pending_inc.new_nearfull_ratio = g_conf()->mon_osd_nearfull_ratio; + if (pending_inc.new_nearfull_ratio > 1.0) + pending_inc.new_nearfull_ratio /= 100; + dout(1) << __func__ << " setting nearfull_ratio = " + << pending_inc.new_nearfull_ratio << dendl; + } + } + + // Rewrite CRUSH rule IDs if they are using legacy "ruleset" + // structure. + if (osdmap.crush->has_legacy_rule_ids()) { + CrushWrapper newcrush; + _get_pending_crush(newcrush); + + // First, for all pools, work out which rule they really used + // by resolving ruleset to rule. + for (const auto &i : osdmap.get_pools()) { + const auto pool_id = i.first; + const auto &pool = i.second; + int new_rule_id = newcrush.find_rule(pool.crush_rule, + pool.type, pool.size); + + dout(1) << __func__ << " rewriting pool " + << osdmap.get_pool_name(pool_id) << " crush ruleset " + << pool.crush_rule << " -> rule id " << new_rule_id << dendl; + if (pending_inc.new_pools.count(pool_id) == 0) { + pending_inc.new_pools[pool_id] = pool; + } + pending_inc.new_pools[pool_id].crush_rule = new_rule_id; + } + + // Now, go ahead and renumber all the rules so that their + // rule_id field corresponds to their position in the array + auto old_to_new = newcrush.renumber_rules(); + dout(1) << __func__ << " Rewrote " << old_to_new << " crush IDs:" << dendl; + for (const auto &i : old_to_new) { + dout(1) << __func__ << " " << i.first << " -> " << i.second << dendl; + } + pending_inc.crush.clear(); + newcrush.encode(pending_inc.crush, mon.get_quorum_con_features()); + } +} + +creating_pgs_t +OSDMonitor::update_pending_pgs(const OSDMap::Incremental& inc, + const OSDMap& nextmap) +{ + dout(10) << __func__ << dendl; + creating_pgs_t pending_creatings; + { + std::lock_guard<std::mutex> l(creating_pgs_lock); + pending_creatings = creating_pgs; + } + // check for new or old pools + if (pending_creatings.last_scan_epoch < inc.epoch) { + unsigned queued = 0; + queued += scan_for_creating_pgs(osdmap.get_pools(), + inc.old_pools, + inc.modified, + &pending_creatings); + queued += scan_for_creating_pgs(inc.new_pools, + inc.old_pools, + inc.modified, + &pending_creatings); + dout(10) << __func__ << " " << queued << " pools queued" << dendl; + for (auto deleted_pool : inc.old_pools) { + auto removed = pending_creatings.remove_pool(deleted_pool); + dout(10) << __func__ << " " << removed + << " pg removed because containing pool deleted: " + << deleted_pool << dendl; + last_epoch_clean.remove_pool(deleted_pool); + } + // pgmon updates its creating_pgs in check_osd_map() which is called by + // on_active() and check_osd_map() could be delayed if lease expires, so its + // creating_pgs could be stale in comparison with the one of osdmon. let's + // trim them here. otherwise, they will be added back after being erased. 
+ unsigned removed = 0; + for (auto& pg : pending_created_pgs) { + dout(20) << __func__ << " noting created pg " << pg << dendl; + pending_creatings.created_pools.insert(pg.pool()); + removed += pending_creatings.pgs.erase(pg); + } + pending_created_pgs.clear(); + dout(10) << __func__ << " " << removed + << " pgs removed because they're created" << dendl; + pending_creatings.last_scan_epoch = osdmap.get_epoch(); + } + + // filter out any pgs that shouldn't exist. + { + auto i = pending_creatings.pgs.begin(); + while (i != pending_creatings.pgs.end()) { + if (!nextmap.pg_exists(i->first)) { + dout(10) << __func__ << " removing pg " << i->first + << " which should not exist" << dendl; + i = pending_creatings.pgs.erase(i); + } else { + ++i; + } + } + } + + // process queue + unsigned max = std::max<int64_t>(1, g_conf()->mon_osd_max_creating_pgs); + const auto total = pending_creatings.pgs.size(); + while (pending_creatings.pgs.size() < max && + !pending_creatings.queue.empty()) { + auto p = pending_creatings.queue.begin(); + int64_t poolid = p->first; + dout(10) << __func__ << " pool " << poolid + << " created " << p->second.created + << " modified " << p->second.modified + << " [" << p->second.start << "-" << p->second.end << ")" + << dendl; + int64_t n = std::min<int64_t>(max - pending_creatings.pgs.size(), + p->second.end - p->second.start); + ps_t first = p->second.start; + ps_t end = first + n; + for (ps_t ps = first; ps < end; ++ps) { + const pg_t pgid{ps, static_cast<uint64_t>(poolid)}; + // NOTE: use the *current* epoch as the PG creation epoch so that the + // OSD does not have to generate a long set of PastIntervals. + pending_creatings.pgs.emplace( + pgid, + creating_pgs_t::pg_create_info(inc.epoch, + p->second.modified)); + dout(10) << __func__ << " adding " << pgid << dendl; + } + p->second.start = end; + if (p->second.done()) { + dout(10) << __func__ << " done with queue for " << poolid << dendl; + pending_creatings.queue.erase(p); + } else { + dout(10) << __func__ << " pool " << poolid + << " now [" << p->second.start << "-" << p->second.end << ")" + << dendl; + } + } + dout(10) << __func__ << " queue remaining: " << pending_creatings.queue.size() + << " pools" << dendl; + + if (mon.monmap->min_mon_release >= ceph_release_t::octopus) { + // walk creating pgs' history and past_intervals forward + for (auto& i : pending_creatings.pgs) { + // this mirrors PG::start_peering_interval() + pg_t pgid = i.first; + + // this is a bit imprecise, but sufficient? 
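+      // (For a replicated pool with size = 3 and min_size = 2, say, the
+      // predicate below counts any interval with at least 2 surviving
+      // shards as recoverable; illustrative values only.)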
+ struct min_size_predicate_t : public IsPGRecoverablePredicate { + const pg_pool_t *pi; + bool operator()(const set<pg_shard_t> &have) const { + return have.size() >= pi->min_size; + } + explicit min_size_predicate_t(const pg_pool_t *i) : pi(i) {} + } min_size_predicate(nextmap.get_pg_pool(pgid.pool())); + + vector<int> up, acting; + int up_primary, acting_primary; + nextmap.pg_to_up_acting_osds( + pgid, &up, &up_primary, &acting, &acting_primary); + if (i.second.history.epoch_created == 0) { + // new pg entry, set it up + i.second.up = up; + i.second.acting = acting; + i.second.up_primary = up_primary; + i.second.acting_primary = acting_primary; + i.second.history = pg_history_t(i.second.create_epoch, + i.second.create_stamp); + dout(10) << __func__ << " pg " << pgid << " just added, " + << " up " << i.second.up + << " p " << i.second.up_primary + << " acting " << i.second.acting + << " p " << i.second.acting_primary + << " history " << i.second.history + << " past_intervals " << i.second.past_intervals + << dendl; + } else { + std::stringstream debug; + if (PastIntervals::check_new_interval( + i.second.acting_primary, acting_primary, + i.second.acting, acting, + i.second.up_primary, up_primary, + i.second.up, up, + i.second.history.same_interval_since, + i.second.history.last_epoch_clean, + &nextmap, + &osdmap, + pgid, + min_size_predicate, + &i.second.past_intervals, + &debug)) { + epoch_t e = inc.epoch; + i.second.history.same_interval_since = e; + if (i.second.up != up) { + i.second.history.same_up_since = e; + } + if (i.second.acting_primary != acting_primary) { + i.second.history.same_primary_since = e; + } + if (pgid.is_split( + osdmap.get_pg_num(pgid.pool()), + nextmap.get_pg_num(pgid.pool()), + nullptr)) { + i.second.history.last_epoch_split = e; + } + dout(10) << __func__ << " pg " << pgid << " new interval," + << " up " << i.second.up << " -> " << up + << " p " << i.second.up_primary << " -> " << up_primary + << " acting " << i.second.acting << " -> " << acting + << " p " << i.second.acting_primary << " -> " + << acting_primary + << " history " << i.second.history + << " past_intervals " << i.second.past_intervals + << dendl; + dout(20) << " debug: " << debug.str() << dendl; + i.second.up = up; + i.second.acting = acting; + i.second.up_primary = up_primary; + i.second.acting_primary = acting_primary; + } + } + } + } + dout(10) << __func__ + << " " << (pending_creatings.pgs.size() - total) + << "/" << pending_creatings.pgs.size() + << " pgs added from queued pools" << dendl; + return pending_creatings; +} + +void OSDMonitor::maybe_prime_pg_temp() +{ + bool all = false; + if (pending_inc.crush.length()) { + dout(10) << __func__ << " new crush map, all" << dendl; + all = true; + } + + if (!pending_inc.new_up_client.empty()) { + dout(10) << __func__ << " new up osds, all" << dendl; + all = true; + } + + // check for interesting OSDs + set<int> osds; + for (auto p = pending_inc.new_state.begin(); + !all && p != pending_inc.new_state.end(); + ++p) { + if ((p->second & CEPH_OSD_UP) && + osdmap.is_up(p->first)) { + osds.insert(p->first); + } + } + for (auto p = pending_inc.new_weight.begin(); + !all && p != pending_inc.new_weight.end(); + ++p) { + if (osdmap.exists(p->first) && p->second < osdmap.get_weight(p->first)) { + // weight reduction + osds.insert(p->first); + } else { + dout(10) << __func__ << " osd." 
<< p->first << " weight increase, all" + << dendl; + all = true; + } + } + + if (!all && osds.empty()) + return; + + if (!all) { + unsigned estimate = + mapping.get_osd_acting_pgs(*osds.begin()).size() * osds.size(); + if (estimate > mapping.get_num_pgs() * + g_conf()->mon_osd_prime_pg_temp_max_estimate) { + dout(10) << __func__ << " estimate " << estimate << " pgs on " + << osds.size() << " osds >= " + << g_conf()->mon_osd_prime_pg_temp_max_estimate << " of total " + << mapping.get_num_pgs() << " pgs, all" + << dendl; + all = true; + } else { + dout(10) << __func__ << " estimate " << estimate << " pgs on " + << osds.size() << " osds" << dendl; + } + } + + OSDMap next; + next.deepish_copy_from(osdmap); + next.apply_incremental(pending_inc); + + if (next.get_pools().empty()) { + dout(10) << __func__ << " no pools, no pg_temp priming" << dendl; + } else if (all) { + PrimeTempJob job(next, this); + mapper.queue(&job, g_conf()->mon_osd_mapping_pgs_per_chunk, {}); + if (job.wait_for(g_conf()->mon_osd_prime_pg_temp_max_time)) { + dout(10) << __func__ << " done in " << job.get_duration() << dendl; + } else { + dout(10) << __func__ << " did not finish in " + << g_conf()->mon_osd_prime_pg_temp_max_time + << ", stopping" << dendl; + job.abort(); + } + } else { + dout(10) << __func__ << " " << osds.size() << " interesting osds" << dendl; + utime_t stop = ceph_clock_now(); + stop += g_conf()->mon_osd_prime_pg_temp_max_time; + const int chunk = 1000; + int n = chunk; + std::unordered_set<pg_t> did_pgs; + for (auto osd : osds) { + auto& pgs = mapping.get_osd_acting_pgs(osd); + dout(20) << __func__ << " osd." << osd << " " << pgs << dendl; + for (auto pgid : pgs) { + if (!did_pgs.insert(pgid).second) { + continue; + } + prime_pg_temp(next, pgid); + if (--n <= 0) { + n = chunk; + if (ceph_clock_now() > stop) { + dout(10) << __func__ << " consumed more than " + << g_conf()->mon_osd_prime_pg_temp_max_time + << " seconds, stopping" + << dendl; + return; + } + } + } + } + } +} + +void OSDMonitor::prime_pg_temp( + const OSDMap& next, + pg_t pgid) +{ + // TODO: remove this creating_pgs direct access? + if (creating_pgs.pgs.count(pgid)) { + return; + } + if (!osdmap.pg_exists(pgid)) { + return; + } + + vector<int> up, acting; + mapping.get(pgid, &up, nullptr, &acting, nullptr); + + vector<int> next_up, next_acting; + int next_up_primary, next_acting_primary; + next.pg_to_up_acting_osds(pgid, &next_up, &next_up_primary, + &next_acting, &next_acting_primary); + if (acting == next_acting && + !(up != acting && next_up == next_acting)) + return; // no change since last epoch + + if (acting.empty()) + return; // if previously empty now we can be no worse off + const pg_pool_t *pool = next.get_pg_pool(pgid.pool()); + if (pool && acting.size() < pool->min_size) + return; // can be no worse off than before + + if (next_up == next_acting) { + acting.clear(); + dout(20) << __func__ << " next_up == next_acting now, clear pg_temp" + << dendl; + } + + dout(20) << __func__ << " " << pgid << " " << up << "/" << acting + << " -> " << next_up << "/" << next_acting + << ", priming " << acting + << dendl; + { + std::lock_guard l(prime_pg_temp_lock); + // do not touch a mapping if a change is pending + pending_inc.new_pg_temp.emplace( + pgid, + mempool::osdmap::vector<int>(acting.begin(), acting.end())); + } +} + +/** + * @note receiving a transaction in this function gives a fair amount of + * freedom to the service implementation if it does need it. It shouldn't. 
+ */
+void OSDMonitor::encode_pending(MonitorDBStore::TransactionRef t)
+{
+  dout(10) << "encode_pending e " << pending_inc.epoch
+           << dendl;
+
+  if (do_prune(t)) {
+    dout(1) << __func__ << " osdmap full prune encoded e"
+            << pending_inc.epoch << dendl;
+  }
+
+  // finalize pending_inc
+  pending_inc.modified = ceph_clock_now();
+
+  int r = pending_inc.propagate_base_properties_to_tiers(cct, osdmap);
+  ceph_assert(r == 0);
+
+  if (mapping_job) {
+    if (!mapping_job->is_done()) {
+      dout(1) << __func__ << " skipping prime_pg_temp; mapping job "
+              << mapping_job.get() << " did not complete, "
+              << mapping_job->shards << " left" << dendl;
+      mapping_job->abort();
+    } else if (mapping.get_epoch() < osdmap.get_epoch()) {
+      dout(1) << __func__ << " skipping prime_pg_temp; mapping job "
+              << mapping_job.get() << " is prior epoch "
+              << mapping.get_epoch() << dendl;
+    } else {
+      if (g_conf()->mon_osd_prime_pg_temp) {
+        maybe_prime_pg_temp();
+      }
+    }
+  } else if (g_conf()->mon_osd_prime_pg_temp) {
+    dout(1) << __func__ << " skipping prime_pg_temp; mapping job did not start"
+            << dendl;
+  }
+  mapping_job.reset();
+
+  // ensure we don't have blank new_state updates. these are interpreted as
+  // CEPH_OSD_UP (and almost certainly not what we want!).
+  auto p = pending_inc.new_state.begin();
+  while (p != pending_inc.new_state.end()) {
+    if (p->second == 0) {
+      dout(10) << "new_state for osd." << p->first << " is 0, removing" << dendl;
+      p = pending_inc.new_state.erase(p);
+    } else {
+      if (p->second & CEPH_OSD_UP) {
+        pending_inc.new_last_up_change = pending_inc.modified;
+      }
+      ++p;
+    }
+  }
+  if (!pending_inc.new_up_client.empty()) {
+    pending_inc.new_last_up_change = pending_inc.modified;
+  }
+  for (auto& i : pending_inc.new_weight) {
+    if (i.first >= osdmap.max_osd) {
+      if (i.second) {
+        // new osd is already marked in
+        pending_inc.new_last_in_change = pending_inc.modified;
+        break;
+      }
+    } else if (!!i.second != !!osdmap.osd_weight[i.first]) {
+      // existing osd marked in or out
+      pending_inc.new_last_in_change = pending_inc.modified;
+      break;
+    }
+  }
+
+  {
+    OSDMap tmp;
+    tmp.deepish_copy_from(osdmap);
+    tmp.apply_incremental(pending_inc);
+
+    // clean pg_temp mappings
+    OSDMap::clean_temps(cct, osdmap, tmp, &pending_inc);
+
+    // clean inappropriate pg_upmap/pg_upmap_items (if any)
+    {
+      // check every upmapped pg for now
+      // until we could reliably identify certain cases to ignore,
+      // which is obviously the hard part TBD..
+      vector<pg_t> pgs_to_check;
+      tmp.get_upmap_pgs(&pgs_to_check);
+      if (pgs_to_check.size() <
+          static_cast<uint64_t>(g_conf()->mon_clean_pg_upmaps_per_chunk * 2)) {
+        // not enough pgs, do it inline
+        tmp.clean_pg_upmaps(cct, &pending_inc);
+      } else {
+        CleanUpmapJob job(cct, tmp, pending_inc);
+        mapper.queue(&job, g_conf()->mon_clean_pg_upmaps_per_chunk, pgs_to_check);
+        job.wait();
+      }
+    }
+
+    // update creating pgs first so that we can remove the created pgid and
+    // process the pool flag removal below in the same osdmap epoch.
+ auto pending_creatings = update_pending_pgs(pending_inc, tmp); + bufferlist creatings_bl; + uint64_t features = CEPH_FEATURES_ALL; + if (mon.monmap->min_mon_release < ceph_release_t::octopus) { + dout(20) << __func__ << " encoding pending pgs without octopus features" + << dendl; + features &= ~CEPH_FEATURE_SERVER_OCTOPUS; + } + encode(pending_creatings, creatings_bl, features); + t->put(OSD_PG_CREATING_PREFIX, "creating", creatings_bl); + + // remove any old (or incompat) POOL_CREATING flags + for (auto& i : tmp.get_pools()) { + if (tmp.require_osd_release < ceph_release_t::nautilus) { + // pre-nautilus OSDMaps shouldn't get this flag. + if (pending_inc.new_pools.count(i.first)) { + pending_inc.new_pools[i.first].flags &= ~pg_pool_t::FLAG_CREATING; + } + } + if (i.second.has_flag(pg_pool_t::FLAG_CREATING) && + !pending_creatings.still_creating_pool(i.first)) { + dout(10) << __func__ << " done creating pool " << i.first + << ", clearing CREATING flag" << dendl; + if (pending_inc.new_pools.count(i.first) == 0) { + pending_inc.new_pools[i.first] = i.second; + } + pending_inc.new_pools[i.first].flags &= ~pg_pool_t::FLAG_CREATING; + } + } + + // collect which pools are currently affected by + // the near/backfill/full osd(s), + // and set per-pool near/backfill/full flag instead + set<int64_t> full_pool_ids; + set<int64_t> backfillfull_pool_ids; + set<int64_t> nearfull_pool_ids; + tmp.get_full_pools(cct, + &full_pool_ids, + &backfillfull_pool_ids, + &nearfull_pool_ids); + if (full_pool_ids.empty() || + backfillfull_pool_ids.empty() || + nearfull_pool_ids.empty()) { + // normal case - no nearfull, backfillfull or full osds + // try cancel any improper nearfull/backfillfull/full pool + // flags first + for (auto &pool: tmp.get_pools()) { + auto p = pool.first; + if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL) && + nearfull_pool_ids.empty()) { + dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p] + << "'s nearfull flag" << dendl; + if (pending_inc.new_pools.count(p) == 0) { + // load original pool info first! 
+          pending_inc.new_pools[p] = pool.second;
+        }
+        pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
+      }
+      if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL) &&
+          backfillfull_pool_ids.empty()) {
+        dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
+                 << "'s backfillfull flag" << dendl;
+        if (pending_inc.new_pools.count(p) == 0) {
+          pending_inc.new_pools[p] = pool.second;
+        }
+        pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
+      }
+      if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL) &&
+          full_pool_ids.empty()) {
+        if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
+          // set by EQUOTA, skipping
+          continue;
+        }
+        dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
+                 << "'s full flag" << dendl;
+        if (pending_inc.new_pools.count(p) == 0) {
+          pending_inc.new_pools[p] = pool.second;
+        }
+        pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_FULL;
+      }
+    }
+  }
+  if (!full_pool_ids.empty()) {
+    dout(10) << __func__ << " marking pool(s) " << full_pool_ids
+             << " as full" << dendl;
+    for (auto &p: full_pool_ids) {
+      if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL)) {
+        continue;
+      }
+      if (pending_inc.new_pools.count(p) == 0) {
+        pending_inc.new_pools[p] = tmp.pools[p];
+      }
+      pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_FULL;
+      pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
+      pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
+    }
+    // cancel FLAG_FULL for pools which are no longer full too
+    for (auto &pool: tmp.get_pools()) {
+      auto p = pool.first;
+      if (full_pool_ids.count(p)) {
+        // skip pools we have just marked as full above
+        continue;
+      }
+      if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL) ||
+          tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
+        // don't touch if currently is not full
+        // or is running out of quota (and hence considered as full)
+        continue;
+      }
+      dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
+               << "'s full flag" << dendl;
+      if (pending_inc.new_pools.count(p) == 0) {
+        pending_inc.new_pools[p] = pool.second;
+      }
+      pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_FULL;
+    }
+  }
+  if (!backfillfull_pool_ids.empty()) {
+    for (auto &p: backfillfull_pool_ids) {
+      if (full_pool_ids.count(p)) {
+        // skip pools we have already considered as full above
+        continue;
+      }
+      if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
+        // make sure FLAG_FULL is truly set, so we are safe not
+        // to set an extra (redundant) FLAG_BACKFILLFULL flag
+        ceph_assert(tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL));
+        continue;
+      }
+      if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL)) {
+        // don't bother if pool is already marked as backfillfull
+        continue;
+      }
+      dout(10) << __func__ << " marking pool '" << tmp.pool_name[p]
+               << "' as backfillfull" << dendl;
+      if (pending_inc.new_pools.count(p) == 0) {
+        pending_inc.new_pools[p] = tmp.pools[p];
+      }
+      pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_BACKFILLFULL;
+      pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
+    }
+    // cancel FLAG_BACKFILLFULL for pools
+    // which are no longer backfillfull too
+    for (auto &pool: tmp.get_pools()) {
+      auto p = pool.first;
+      if (full_pool_ids.count(p) || backfillfull_pool_ids.count(p)) {
+        // skip pools we have just marked as backfillfull/full above
+        continue;
+      }
+      if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL)) {
+        // and don't touch if currently is not backfillfull
+        continue;
+      }
+      dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
+               << "'s backfillfull flag" << dendl;
+      if (pending_inc.new_pools.count(p) == 0) {
+        pending_inc.new_pools[p] = pool.second;
+      }
+      pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
+    }
+  }
+  if (!nearfull_pool_ids.empty()) {
+    for (auto &p: nearfull_pool_ids) {
+      if (full_pool_ids.count(p) || backfillfull_pool_ids.count(p)) {
+        continue;
+      }
+      if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
+        // make sure FLAG_FULL is truly set, so we are safe not
+        // to set an extra (redundant) FLAG_NEARFULL flag
+        ceph_assert(tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL));
+        continue;
+      }
+      if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL)) {
+        // don't bother if pool is already marked as nearfull
+        continue;
+      }
+      dout(10) << __func__ << " marking pool '" << tmp.pool_name[p]
+               << "' as nearfull" << dendl;
+      if (pending_inc.new_pools.count(p) == 0) {
+        pending_inc.new_pools[p] = tmp.pools[p];
+      }
+      pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_NEARFULL;
+    }
+    // cancel FLAG_NEARFULL for pools
+    // which are no longer nearfull too
+    for (auto &pool: tmp.get_pools()) {
+      auto p = pool.first;
+      if (full_pool_ids.count(p) ||
+          backfillfull_pool_ids.count(p) ||
+          nearfull_pool_ids.count(p)) {
+        // skip pools we have just marked as
+        // nearfull/backfillfull/full above
+        continue;
+      }
+      if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL)) {
+        // and don't touch if currently is not nearfull
+        continue;
+      }
+      dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
+               << "'s nearfull flag" << dendl;
+      if (pending_inc.new_pools.count(p) == 0) {
+        pending_inc.new_pools[p] = pool.second;
+      }
+      pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
+    }
+  }
+
+  // min_compat_client?
+  if (!tmp.require_min_compat_client) {
+    auto mv = tmp.get_min_compat_client();
+    dout(1) << __func__ << " setting require_min_compat_client to currently "
+            << "required " << mv << dendl;
+    mon.clog->info() << "setting require_min_compat_client to currently "
+                     << "required " << mv;
+    pending_inc.new_require_min_compat_client = mv;
+  }
+
+  if (osdmap.require_osd_release < ceph_release_t::nautilus &&
+      tmp.require_osd_release >= ceph_release_t::nautilus) {
+    dout(10) << __func__ << " first nautilus+ epoch" << dendl;
+    // add creating flags?
+ for (auto& i : tmp.get_pools()) { + if (pending_creatings.still_creating_pool(i.first)) { + dout(10) << __func__ << " adding CREATING flag to pool " << i.first + << dendl; + if (pending_inc.new_pools.count(i.first) == 0) { + pending_inc.new_pools[i.first] = i.second; + } + pending_inc.new_pools[i.first].flags |= pg_pool_t::FLAG_CREATING; + } + } + // adjust blocklist items to all be TYPE_ANY + for (auto& i : tmp.blocklist) { + auto a = i.first; + a.set_type(entity_addr_t::TYPE_ANY); + pending_inc.new_blocklist[a] = i.second; + pending_inc.old_blocklist.push_back(i.first); + } + } + + if (osdmap.require_osd_release < ceph_release_t::octopus && + tmp.require_osd_release >= ceph_release_t::octopus) { + dout(10) << __func__ << " first octopus+ epoch" << dendl; + + // adjust obsoleted cache modes + for (auto& [poolid, pi] : tmp.pools) { + if (pi.cache_mode == pg_pool_t::CACHEMODE_FORWARD) { + if (pending_inc.new_pools.count(poolid) == 0) { + pending_inc.new_pools[poolid] = pi; + } + dout(10) << __func__ << " switching pool " << poolid + << " cachemode from forward -> proxy" << dendl; + pending_inc.new_pools[poolid].cache_mode = pg_pool_t::CACHEMODE_PROXY; + } + if (pi.cache_mode == pg_pool_t::CACHEMODE_READFORWARD) { + if (pending_inc.new_pools.count(poolid) == 0) { + pending_inc.new_pools[poolid] = pi; + } + dout(10) << __func__ << " switching pool " << poolid + << " cachemode from readforward -> readproxy" << dendl; + pending_inc.new_pools[poolid].cache_mode = + pg_pool_t::CACHEMODE_READPROXY; + } + } + + // clear removed_snaps for every pool + for (auto& [poolid, pi] : tmp.pools) { + if (pi.removed_snaps.empty()) { + continue; + } + if (pending_inc.new_pools.count(poolid) == 0) { + pending_inc.new_pools[poolid] = pi; + } + dout(10) << __func__ << " clearing pool " << poolid << " removed_snaps" + << dendl; + pending_inc.new_pools[poolid].removed_snaps.clear(); + } + + // create a combined purged snap epoch key for all purged snaps + // prior to this epoch, and store it in the current epoch (i.e., + // the last pre-octopus epoch, just prior to the one we're + // encoding now). + auto it = mon.store->get_iterator(OSD_SNAP_PREFIX); + it->lower_bound("purged_snap_"); + map<int64_t,snap_interval_set_t> combined; + while (it->valid()) { + if (it->key().find("purged_snap_") != 0) { + break; + } + string k = it->key(); + long long unsigned pool; + int n = sscanf(k.c_str(), "purged_snap_%llu_", &pool); + if (n != 1) { + derr << __func__ << " invalid purged_snaps key '" << k << "'" << dendl; + } else { + bufferlist v = it->value(); + auto p = v.cbegin(); + snapid_t begin, end; + ceph::decode(begin, p); + ceph::decode(end, p); + combined[pool].insert(begin, end - begin); + } + it->next(); + } + if (!combined.empty()) { + string k = make_purged_snap_epoch_key(pending_inc.epoch - 1); + bufferlist v; + ceph::encode(combined, v); + t->put(OSD_SNAP_PREFIX, k, v); + dout(10) << __func__ << " recording pre-octopus purged_snaps in epoch " + << (pending_inc.epoch - 1) << ", " << v.length() << " bytes" + << dendl; + } else { + dout(10) << __func__ << " there were no pre-octopus purged snaps" + << dendl; + } + + // clean out the old removed_snap_ and removed_epoch keys + // ('`' is ASCII '_' + 1) + t->erase_range(OSD_SNAP_PREFIX, "removed_snap_", "removed_snap`"); + t->erase_range(OSD_SNAP_PREFIX, "removed_epoch_", "removed_epoch`"); + } + } + + // tell me about it + for (auto i = pending_inc.new_state.begin(); + i != pending_inc.new_state.end(); + ++i) { + int s = i->second ? 
i->second : CEPH_OSD_UP; + if (s & CEPH_OSD_UP) { + dout(2) << " osd." << i->first << " DOWN" << dendl; + // Reset laggy parameters if failure interval exceeds a threshold. + const osd_xinfo_t& xi = osdmap.get_xinfo(i->first); + if ((xi.laggy_probability || xi.laggy_interval) && xi.down_stamp.sec()) { + int last_failure_interval = pending_inc.modified.sec() - xi.down_stamp.sec(); + if (grace_interval_threshold_exceeded(last_failure_interval)) { + set_default_laggy_params(i->first); + } + } + } + if (s & CEPH_OSD_EXISTS) + dout(2) << " osd." << i->first << " DNE" << dendl; + } + for (auto i = pending_inc.new_up_client.begin(); + i != pending_inc.new_up_client.end(); + ++i) { + //FIXME: insert cluster addresses too + dout(2) << " osd." << i->first << " UP " << i->second << dendl; + } + for (map<int32_t,uint32_t>::iterator i = pending_inc.new_weight.begin(); + i != pending_inc.new_weight.end(); + ++i) { + if (i->second == CEPH_OSD_OUT) { + dout(2) << " osd." << i->first << " OUT" << dendl; + } else if (i->second == CEPH_OSD_IN) { + dout(2) << " osd." << i->first << " IN" << dendl; + } else { + dout(2) << " osd." << i->first << " WEIGHT " << hex << i->second << dec << dendl; + } + } + + // features for osdmap and its incremental + uint64_t features; + + // encode full map and determine its crc + OSDMap tmp; + { + tmp.deepish_copy_from(osdmap); + tmp.apply_incremental(pending_inc); + + // determine appropriate features + features = tmp.get_encoding_features(); + dout(10) << __func__ << " encoding full map with " + << tmp.require_osd_release + << " features " << features << dendl; + + // the features should be a subset of the mon quorum's features! + ceph_assert((features & ~mon.get_quorum_con_features()) == 0); + + bufferlist fullbl; + encode(tmp, fullbl, features | CEPH_FEATURE_RESERVED); + pending_inc.full_crc = tmp.get_crc(); + + // include full map in the txn. note that old monitors will + // overwrite this. new ones will now skip the local full map + // encode and reload from this. + put_version_full(t, pending_inc.epoch, fullbl); + } + + // encode + ceph_assert(get_last_committed() + 1 == pending_inc.epoch); + bufferlist bl; + encode(pending_inc, bl, features | CEPH_FEATURE_RESERVED); + + dout(20) << " full_crc " << tmp.get_crc() + << " inc_crc " << pending_inc.inc_crc << dendl; + + /* put everything in the transaction */ + put_version(t, pending_inc.epoch, bl); + put_last_committed(t, pending_inc.epoch); + + // metadata, too! 
+ for (map<int,bufferlist>::iterator p = pending_metadata.begin(); + p != pending_metadata.end(); + ++p) + t->put(OSD_METADATA_PREFIX, stringify(p->first), p->second); + for (set<int>::iterator p = pending_metadata_rm.begin(); + p != pending_metadata_rm.end(); + ++p) + t->erase(OSD_METADATA_PREFIX, stringify(*p)); + pending_metadata.clear(); + pending_metadata_rm.clear(); + + // purged_snaps + if (tmp.require_osd_release >= ceph_release_t::octopus && + !pending_inc.new_purged_snaps.empty()) { + // all snaps purged this epoch (across all pools) + string k = make_purged_snap_epoch_key(pending_inc.epoch); + bufferlist v; + encode(pending_inc.new_purged_snaps, v); + t->put(OSD_SNAP_PREFIX, k, v); + } + for (auto& i : pending_inc.new_purged_snaps) { + for (auto q = i.second.begin(); + q != i.second.end(); + ++q) { + insert_purged_snap_update(i.first, q.get_start(), q.get_end(), + pending_inc.epoch, + t); + } + } + for (auto& [pool, snaps] : pending_pseudo_purged_snaps) { + for (auto snap : snaps) { + insert_purged_snap_update(pool, snap, snap + 1, + pending_inc.epoch, + t); + } + } + + // health + health_check_map_t next; + tmp.check_health(cct, &next); + encode_health(next, t); +} + +int OSDMonitor::load_metadata(int osd, map<string, string>& m, ostream *err) +{ + bufferlist bl; + int r = mon.store->get(OSD_METADATA_PREFIX, stringify(osd), bl); + if (r < 0) + return r; + try { + auto p = bl.cbegin(); + decode(m, p); + } + catch (ceph::buffer::error& e) { + if (err) + *err << "osd." << osd << " metadata is corrupt"; + return -EIO; + } + return 0; +} + +void OSDMonitor::count_metadata(const string& field, map<string,int> *out) +{ + for (int osd = 0; osd < osdmap.get_max_osd(); ++osd) { + if (osdmap.is_up(osd)) { + map<string,string> meta; + load_metadata(osd, meta, nullptr); + auto p = meta.find(field); + if (p == meta.end()) { + (*out)["unknown"]++; + } else { + (*out)[p->second]++; + } + } + } +} + +void OSDMonitor::count_metadata(const string& field, Formatter *f) +{ + map<string,int> by_val; + count_metadata(field, &by_val); + f->open_object_section(field.c_str()); + for (auto& p : by_val) { + f->dump_int(p.first.c_str(), p.second); + } + f->close_section(); +} + +void OSDMonitor::get_versions(std::map<string, list<string>> &versions) +{ + for (int osd = 0; osd < osdmap.get_max_osd(); ++osd) { + if (osdmap.is_up(osd)) { + map<string,string> meta; + load_metadata(osd, meta, nullptr); + auto p = meta.find("ceph_version_short"); + if (p == meta.end()) continue; + versions[p->second].push_back(string("osd.") + stringify(osd)); + } + } +} + +int OSDMonitor::get_osd_objectstore_type(int osd, string *type) +{ + map<string, string> metadata; + int r = load_metadata(osd, metadata, nullptr); + if (r < 0) + return r; + + auto it = metadata.find("osd_objectstore"); + if (it == metadata.end()) + return -ENOENT; + *type = it->second; + return 0; +} + +bool OSDMonitor::is_pool_currently_all_bluestore(int64_t pool_id, + const pg_pool_t &pool, + ostream *err) +{ + // just check a few pgs for efficiency - this can't give a guarantee anyway, + // since filestore osds could always join the pool later + set<int> checked_osds; + for (unsigned ps = 0; ps < std::min(8u, pool.get_pg_num()); ++ps) { + vector<int> up, acting; + pg_t pgid(ps, pool_id); + osdmap.pg_to_up_acting_osds(pgid, up, acting); + for (int osd : up) { + if (checked_osds.find(osd) != checked_osds.end()) + continue; + string objectstore_type; + int r = get_osd_objectstore_type(osd, &objectstore_type); + // allow with missing metadata, e.g. 
due to an osd never booting yet
+      if (r < 0 || objectstore_type == "bluestore") {
+        checked_osds.insert(osd);
+        continue;
+      }
+      *err << "osd." << osd << " uses " << objectstore_type;
+      return false;
+    }
+  }
+  return true;
+}
+
+int OSDMonitor::dump_osd_metadata(int osd, Formatter *f, ostream *err)
+{
+  map<string,string> m;
+  if (int r = load_metadata(osd, m, err))
+    return r;
+  for (map<string,string>::iterator p = m.begin(); p != m.end(); ++p)
+    f->dump_string(p->first.c_str(), p->second);
+  return 0;
+}
+
+void OSDMonitor::print_nodes(Formatter *f)
+{
+  // group OSDs by their hosts
+  map<string, list<int> > osds; // hostname => osd
+  for (int osd = 0; osd < osdmap.get_max_osd(); osd++) {
+    map<string, string> m;
+    if (load_metadata(osd, m, NULL)) {
+      continue;
+    }
+    map<string, string>::iterator hostname = m.find("hostname");
+    if (hostname == m.end()) {
+      // not likely though
+      continue;
+    }
+    osds[hostname->second].push_back(osd);
+  }
+
+  dump_services(f, osds, "osd");
+}
+
+void OSDMonitor::share_map_with_random_osd()
+{
+  if (osdmap.get_num_up_osds() == 0) {
+    dout(10) << __func__ << " no up osds, don't share with anyone" << dendl;
+    return;
+  }
+
+  MonSession *s = mon.session_map.get_random_osd_session(&osdmap);
+  if (!s) {
+    dout(10) << __func__ << " no up osd on our session map" << dendl;
+    return;
+  }
+
+  dout(10) << "committed, telling random " << s->name
+           << " all about it" << dendl;
+
+  // get feature of the peer
+  // use quorum_con_features, if it's an anonymous connection.
+  uint64_t features = s->con_features ? s->con_features :
+    mon.get_quorum_con_features();
+  // whatev, they'll request more if they need it
+  MOSDMap *m = build_incremental(osdmap.get_epoch() - 1, osdmap.get_epoch(), features);
+  s->con->send_message(m);
+  // NOTE: do *not* record that this osd has maps up to this epoch (as we
+  // do elsewhere), as it may still need to request older values.
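+  // (A single (e-1, e] incremental is enough of a nudge here: an OSD that
+  // is further behind will see the gap and request the older maps itself.)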
+} + +version_t OSDMonitor::get_trim_to() const +{ + if (mon.get_quorum().empty()) { + dout(10) << __func__ << " quorum not formed, trim_to = 0" << dendl; + return 0; + } + + { + std::lock_guard<std::mutex> l(creating_pgs_lock); + if (!creating_pgs.pgs.empty()) { + dout(10) << __func__ << " pgs creating, trim_to = 0" << dendl; + return 0; + } + } + + if (g_conf().get_val<bool>("mon_debug_block_osdmap_trim")) { + dout(0) << __func__ + << " blocking osdmap trim" + << " ('mon_debug_block_osdmap_trim' set to 'true')" + << " trim_to = 0" << dendl; + return 0; + } + + { + epoch_t floor = get_min_last_epoch_clean(); + dout(10) << " min_last_epoch_clean " << floor << dendl; + if (g_conf()->mon_osd_force_trim_to > 0 && + g_conf()->mon_osd_force_trim_to < (int)get_last_committed()) { + floor = g_conf()->mon_osd_force_trim_to; + dout(10) << __func__ + << " explicit mon_osd_force_trim_to = " << floor << dendl; + } + unsigned min = g_conf()->mon_min_osdmap_epochs; + if (floor + min > get_last_committed()) { + if (min < get_last_committed()) + floor = get_last_committed() - min; + else + floor = 0; + } + if (floor > get_first_committed()) { + dout(10) << __func__ << " trim_to = " << floor << dendl; + return floor; + } + } + dout(10) << __func__ << " trim_to = 0" << dendl; + return 0; +} + +epoch_t OSDMonitor::get_min_last_epoch_clean() const +{ + auto floor = last_epoch_clean.get_lower_bound(osdmap); + // also scan osd epochs + // don't trim past the oldest reported osd epoch + for (auto [osd, epoch] : osd_epochs) { + if (epoch < floor) { + floor = epoch; + } + } + return floor; +} + +void OSDMonitor::encode_trim_extra(MonitorDBStore::TransactionRef tx, + version_t first) +{ + dout(10) << __func__ << " including full map for e " << first << dendl; + bufferlist bl; + get_version_full(first, bl); + put_version_full(tx, first, bl); + + if (has_osdmap_manifest && + first > osdmap_manifest.get_first_pinned()) { + _prune_update_trimmed(tx, first); + } +} + + +/* full osdmap prune + * + * for more information, please refer to doc/dev/mon-osdmap-prune.rst + */ + +void OSDMonitor::load_osdmap_manifest() +{ + bool store_has_manifest = + mon.store->exists(get_service_name(), "osdmap_manifest"); + + if (!store_has_manifest) { + if (!has_osdmap_manifest) { + return; + } + + dout(20) << __func__ + << " dropping osdmap manifest from memory." << dendl; + osdmap_manifest = osdmap_manifest_t(); + has_osdmap_manifest = false; + return; + } + + dout(20) << __func__ + << " osdmap manifest detected in store; reload." << dendl; + + bufferlist manifest_bl; + int r = get_value("osdmap_manifest", manifest_bl); + if (r < 0) { + derr << __func__ << " unable to read osdmap version manifest" << dendl; + ceph_abort_msg("error reading manifest"); + } + osdmap_manifest.decode(manifest_bl); + has_osdmap_manifest = true; + + dout(10) << __func__ << " store osdmap manifest pinned (" + << osdmap_manifest.get_first_pinned() + << " .. " + << osdmap_manifest.get_last_pinned() + << ")" + << dendl; +} + +bool OSDMonitor::should_prune() const +{ + version_t first = get_first_committed(); + version_t last = get_last_committed(); + version_t min_osdmap_epochs = + g_conf().get_val<int64_t>("mon_min_osdmap_epochs"); + version_t prune_min = + g_conf().get_val<uint64_t>("mon_osdmap_full_prune_min"); + version_t prune_interval = + g_conf().get_val<uint64_t>("mon_osdmap_full_prune_interval"); + version_t last_pinned = osdmap_manifest.get_last_pinned(); + version_t last_to_pin = last - min_osdmap_epochs; + + // Make it or break it constraints. 
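+  // To make the quantities concrete (illustrative numbers only): with
+  // first = 1, last = 20000 and mon_min_osdmap_epochs = 500, we get
+  // last_to_pin = 19500; pruning then also needs
+  // (last_to_pin - first) >= mon_osdmap_full_prune_min and at least one
+  // whole prune_interval between last_pinned and last_to_pin.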
+  //
+  // If any of these conditions fails, we will not prune, regardless of
+  // whether we have an on-disk manifest with an on-going pruning state.
+  //
+  if ((last - first) <= min_osdmap_epochs) {
+    // between the first and last committed epochs, we don't have
+    // enough epochs to trim, much less to prune.
+    dout(10) << __func__
+             << " currently holding only " << (last - first)
+             << " epochs (min osdmap epochs: " << min_osdmap_epochs
+             << "); do not prune."
+             << dendl;
+    return false;
+
+  } else if ((last_to_pin - first) < prune_min) {
+    // between the first committed epoch and the last epoch we would prune,
+    // we simply don't have enough versions over the minimum to prune maps.
+    dout(10) << __func__
+             << " could only prune " << (last_to_pin - first)
+             << " epochs (" << first << ".." << last_to_pin << "), which"
+                " is less than the required minimum (" << prune_min << ")"
+             << dendl;
+    return false;
+
+  } else if (has_osdmap_manifest && last_pinned >= last_to_pin) {
+    dout(10) << __func__
+             << " we have pruned as far as we can; do not prune."
+             << dendl;
+    return false;
+
+  } else if (last_pinned + prune_interval > last_to_pin) {
+    dout(10) << __func__
+             << " not enough epochs to form an interval (last pinned: "
+             << last_pinned << ", last to pin: "
+             << last_to_pin << ", interval: " << prune_interval << ")"
+             << dendl;
+    return false;
+  }
+
+  dout(15) << __func__
+           << " should prune (" << last_pinned << ".." << last_to_pin << ")"
+           << " lc (" << first << ".." << last << ")"
+           << dendl;
+  return true;
+}
+
+void OSDMonitor::_prune_update_trimmed(
+    MonitorDBStore::TransactionRef tx,
+    version_t first)
+{
+  dout(10) << __func__
+           << " first " << first
+           << " last_pinned " << osdmap_manifest.get_last_pinned()
+           << dendl;
+
+  osdmap_manifest_t manifest = osdmap_manifest;
+
+  if (!manifest.is_pinned(first)) {
+    manifest.pin(first);
+  }
+
+  set<version_t>::iterator p_end = manifest.pinned.find(first);
+  set<version_t>::iterator p = manifest.pinned.begin();
+  manifest.pinned.erase(p, p_end);
+  ceph_assert(manifest.get_first_pinned() == first);
+
+  if (manifest.get_last_pinned() == first+1 ||
+      manifest.pinned.size() == 1) {
+    // we reached the end of the line, as pinned maps go; clean up our
+    // manifest, and let `should_prune()` decide whether we should prune
+    // again.
+    tx->erase(get_service_name(), "osdmap_manifest");
+    return;
+  }
+
+  bufferlist bl;
+  manifest.encode(bl);
+  tx->put(get_service_name(), "osdmap_manifest", bl);
+}
+
+void OSDMonitor::prune_init(osdmap_manifest_t& manifest)
+{
+  dout(1) << __func__ << dendl;
+
+  version_t pin_first;
+
+  // verify constraints on stable in-memory state
+  if (!has_osdmap_manifest) {
+    // we must have never pruned, OR if we pruned the state must no longer
+    // be relevant (i.e., the state must have been removed alongside the
+    // trim that *must* have removed past the last pinned map in a
+    // previous prune).
+    ceph_assert(osdmap_manifest.pinned.empty());
+    ceph_assert(!mon.store->exists(get_service_name(), "osdmap_manifest"));
+    pin_first = get_first_committed();
+
+  } else {
+    // we must have pruned in the past AND its state is still relevant
+    // (i.e., even if we trimmed, we still hold pinned maps in the manifest,
+    // and thus we still hold a manifest in the store).
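+    // (Hypothetical example: an earlier prune pinned 1, 11, 21, ... and a
+    // later trim advanced first_committed to a pinned version; the manifest
+    // then survives with get_first_pinned() == get_first_committed(), and
+    // we resume pinning from the old last_pinned instead of starting over.)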
+    ceph_assert(!osdmap_manifest.pinned.empty());
+    ceph_assert(osdmap_manifest.get_first_pinned() == get_first_committed());
+    ceph_assert(osdmap_manifest.get_last_pinned() < get_last_committed());
+
+    dout(10) << __func__
+             << " first_pinned " << osdmap_manifest.get_first_pinned()
+             << " last_pinned " << osdmap_manifest.get_last_pinned()
+             << dendl;
+
+    pin_first = osdmap_manifest.get_last_pinned();
+  }
+
+  manifest.pin(pin_first);
+}
+
+bool OSDMonitor::_prune_sanitize_options() const
+{
+  uint64_t prune_interval =
+    g_conf().get_val<uint64_t>("mon_osdmap_full_prune_interval");
+  uint64_t prune_min =
+    g_conf().get_val<uint64_t>("mon_osdmap_full_prune_min");
+  uint64_t txsize =
+    g_conf().get_val<uint64_t>("mon_osdmap_full_prune_txsize");
+
+  bool r = true;
+
+  if (prune_interval == 0) {
+    derr << __func__
+         << " prune is enabled BUT prune interval is zero; abort."
+         << dendl;
+    r = false;
+  } else if (prune_interval == 1) {
+    derr << __func__
+         << " prune interval is equal to one, which essentially means"
+            " no pruning; abort."
+         << dendl;
+    r = false;
+  }
+  if (prune_min == 0) {
+    derr << __func__
+         << " prune is enabled BUT prune min is zero; abort."
+         << dendl;
+    r = false;
+  }
+  if (prune_interval > prune_min) {
+    derr << __func__
+         << " impossible to ascertain proper prune interval because"
+         << " it is greater than the minimum prune epochs"
+         << " (min: " << prune_min << ", interval: " << prune_interval << ")"
+         << dendl;
+    r = false;
+  }
+
+  if (txsize < prune_interval - 1) {
+    derr << __func__
+         << " 'mon_osdmap_full_prune_txsize' (" << txsize
+         << ") < 'mon_osdmap_full_prune_interval-1' (" << prune_interval - 1
+         << "); abort." << dendl;
+    r = false;
+  }
+  return r;
+}
+
+bool OSDMonitor::is_prune_enabled() const {
+  return g_conf().get_val<bool>("mon_osdmap_full_prune_enabled");
+}
+
+bool OSDMonitor::is_prune_supported() const {
+  return mon.get_required_mon_features().contains_any(
+      ceph::features::mon::FEATURE_OSDMAP_PRUNE);
+}
+
+/** do_prune
+ *
+ * @returns true if it has side-effects; false otherwise.
+ */
+bool OSDMonitor::do_prune(MonitorDBStore::TransactionRef tx)
+{
+  bool enabled = is_prune_enabled();
+
+  dout(1) << __func__ << " osdmap full prune "
+          << ( enabled ? "enabled" : "disabled")
+          << dendl;
+
+  if (!enabled || !_prune_sanitize_options() || !should_prune()) {
+    return false;
+  }
+
+  // we are beyond the minimum prune versions; we need to remove maps,
+  // because otherwise the store will grow unbounded and we may end up
+  // having issues with available disk space or store hangs.
+
+  // we will not pin all versions. We will leave a buffer number of versions.
+  // this allows the monitor to trim maps without caring too much about
+  // pinned maps, and then allows us to use another ceph-mon without these
+  // capabilities, without having to repair the store.
+
+  osdmap_manifest_t manifest = osdmap_manifest;
+
+  version_t first = get_first_committed();
+  version_t last = get_last_committed();
+
+  version_t last_to_pin = last - g_conf()->mon_min_osdmap_epochs;
+  version_t last_pinned = manifest.get_last_pinned();
+  uint64_t prune_interval =
+    g_conf().get_val<uint64_t>("mon_osdmap_full_prune_interval");
+  uint64_t txsize =
+    g_conf().get_val<uint64_t>("mon_osdmap_full_prune_txsize");
+
+  prune_init(manifest);
+
+  // we need to get rid of some osdmaps
+
+  dout(5) << __func__
+          << " lc (" << first << " .. " << last << ")"
+          << " last_pinned " << last_pinned
+          << " interval " << prune_interval
+          << " last_to_pin " << last_to_pin
+          << dendl;
+
+  // We will be erasing maps as we go.
+  //
+  // We will erase all maps between `last_pinned` and the `next_to_pin`.
+  //
+  // If `next_to_pin` happens to be greater than `last_to_pin`, then
+  // we stop pruning. We could prune the maps between `next_to_pin` and
+  // `last_to_pin`, but by not doing it we end up with neater pruned
+  // intervals, aligned with `prune_interval`. Besides, this should not be a
+  // problem as long as `prune_interval` is set to a sane value, instead of
+  // hundreds or thousands of maps.
+
+  auto map_exists = [this](version_t v) {
+    string k = mon.store->combine_strings("full", v);
+    return mon.store->exists(get_service_name(), k);
+  };
+
+  // 'interval' represents the number of maps from the last pinned
+  // i.e., if we pinned version 1 and have an interval of 10, we're pinning
+  // version 11 next; all intermediate versions will be removed.
+  //
+  // 'txsize' represents the maximum number of versions we'll be removing in
+  // this iteration. If 'txsize' is large enough to perform multiple passes
+  // pinning and removing maps, we will do so; if not, we'll do at least one
+  // pass. We are quite relaxed about honouring 'txsize', but we'll always
+  // ensure that we never go *over* the maximum.
+
+  // e.g., if we pin 1 and 11, we're removing versions [2..10]; i.e., 9 maps.
+  uint64_t removal_interval = prune_interval - 1;
+
+  if (txsize < removal_interval) {
+    dout(5) << __func__
+            << " setting txsize to removal interval size ("
+            << removal_interval << " versions)"
+            << dendl;
+    txsize = removal_interval;
+  }
+  ceph_assert(removal_interval > 0);
+
+  uint64_t num_pruned = 0;
+  while (num_pruned + removal_interval <= txsize) {
+    last_pinned = manifest.get_last_pinned();
+
+    if (last_pinned + prune_interval > last_to_pin) {
+      break;
+    }
+    ceph_assert(last_pinned < last_to_pin);
+
+    version_t next_pinned = last_pinned + prune_interval;
+    ceph_assert(next_pinned <= last_to_pin);
+    manifest.pin(next_pinned);
+
+    dout(20) << __func__
+             << " last_pinned " << last_pinned
+             << " next_pinned " << next_pinned
+             << " num_pruned " << num_pruned
+             << " removal interval (" << (last_pinned+1)
+             << ".."
<< (next_pinned-1) << ")" + << " txsize " << txsize << dendl; + + ceph_assert(map_exists(last_pinned)); + ceph_assert(map_exists(next_pinned)); + + for (version_t v = last_pinned+1; v < next_pinned; ++v) { + ceph_assert(!manifest.is_pinned(v)); + + dout(20) << __func__ << " pruning full osdmap e" << v << dendl; + string full_key = mon.store->combine_strings("full", v); + tx->erase(get_service_name(), full_key); + ++num_pruned; + } + } + + ceph_assert(num_pruned > 0); + + bufferlist bl; + manifest.encode(bl); + tx->put(get_service_name(), "osdmap_manifest", bl); + + return true; +} + + +// ------------- + +bool OSDMonitor::preprocess_query(MonOpRequestRef op) +{ + op->mark_osdmon_event(__func__); + Message *m = op->get_req(); + dout(10) << "preprocess_query " << *m << " from " << m->get_orig_source_inst() << dendl; + + switch (m->get_type()) { + // READs + case MSG_MON_COMMAND: + try { + return preprocess_command(op); + } catch (const bad_cmd_get& e) { + bufferlist bl; + mon.reply_command(op, -EINVAL, e.what(), bl, get_last_committed()); + return true; + } + case CEPH_MSG_MON_GET_OSDMAP: + return preprocess_get_osdmap(op); + + // damp updates + case MSG_OSD_MARK_ME_DOWN: + return preprocess_mark_me_down(op); + case MSG_OSD_MARK_ME_DEAD: + return preprocess_mark_me_dead(op); + case MSG_OSD_FULL: + return preprocess_full(op); + case MSG_OSD_FAILURE: + return preprocess_failure(op); + case MSG_OSD_BOOT: + return preprocess_boot(op); + case MSG_OSD_ALIVE: + return preprocess_alive(op); + case MSG_OSD_PG_CREATED: + return preprocess_pg_created(op); + case MSG_OSD_PG_READY_TO_MERGE: + return preprocess_pg_ready_to_merge(op); + case MSG_OSD_PGTEMP: + return preprocess_pgtemp(op); + case MSG_OSD_BEACON: + return preprocess_beacon(op); + + case CEPH_MSG_POOLOP: + return preprocess_pool_op(op); + + case MSG_REMOVE_SNAPS: + return preprocess_remove_snaps(op); + + case MSG_MON_GET_PURGED_SNAPS: + return preprocess_get_purged_snaps(op); + + default: + ceph_abort(); + return true; + } +} + +bool OSDMonitor::prepare_update(MonOpRequestRef op) +{ + op->mark_osdmon_event(__func__); + Message *m = op->get_req(); + dout(7) << "prepare_update " << *m << " from " << m->get_orig_source_inst() << dendl; + + switch (m->get_type()) { + // damp updates + case MSG_OSD_MARK_ME_DOWN: + return prepare_mark_me_down(op); + case MSG_OSD_MARK_ME_DEAD: + return prepare_mark_me_dead(op); + case MSG_OSD_FULL: + return prepare_full(op); + case MSG_OSD_FAILURE: + return prepare_failure(op); + case MSG_OSD_BOOT: + return prepare_boot(op); + case MSG_OSD_ALIVE: + return prepare_alive(op); + case MSG_OSD_PG_CREATED: + return prepare_pg_created(op); + case MSG_OSD_PGTEMP: + return prepare_pgtemp(op); + case MSG_OSD_PG_READY_TO_MERGE: + return prepare_pg_ready_to_merge(op); + case MSG_OSD_BEACON: + return prepare_beacon(op); + + case MSG_MON_COMMAND: + try { + return prepare_command(op); + } catch (const bad_cmd_get& e) { + bufferlist bl; + mon.reply_command(op, -EINVAL, e.what(), bl, get_last_committed()); + return true; + } + + case CEPH_MSG_POOLOP: + return prepare_pool_op(op); + + case MSG_REMOVE_SNAPS: + return prepare_remove_snaps(op); + + + default: + ceph_abort(); + } + + return false; +} + +bool OSDMonitor::should_propose(double& delay) +{ + dout(10) << "should_propose" << dendl; + + // if full map, propose immediately! any subsequent changes will be clobbered. + if (pending_inc.fullmap.length()) + return true; + + // adjust osd weights? 
+ if (!osd_weight.empty() && + osd_weight.size() == (unsigned)osdmap.get_max_osd()) { + dout(0) << " adjusting osd weights based on " << osd_weight << dendl; + osdmap.adjust_osd_weights(osd_weight, pending_inc); + delay = 0.0; + osd_weight.clear(); + return true; + } + + return PaxosService::should_propose(delay); +} + + + +// --------------------------- +// READs + +bool OSDMonitor::preprocess_get_osdmap(MonOpRequestRef op) +{ + op->mark_osdmon_event(__func__); + auto m = op->get_req<MMonGetOSDMap>(); + + uint64_t features = mon.get_quorum_con_features(); + if (op->get_session() && op->get_session()->con_features) + features = op->get_session()->con_features; + + dout(10) << __func__ << " " << *m << dendl; + MOSDMap *reply = new MOSDMap(mon.monmap->fsid, features); + epoch_t first = get_first_committed(); + epoch_t last = osdmap.get_epoch(); + int max = g_conf()->osd_map_message_max; + ssize_t max_bytes = g_conf()->osd_map_message_max_bytes; + for (epoch_t e = std::max(first, m->get_full_first()); + e <= std::min(last, m->get_full_last()) && max > 0 && max_bytes > 0; + ++e, --max) { + bufferlist& bl = reply->maps[e]; + int r = get_version_full(e, features, bl); + ceph_assert(r >= 0); + max_bytes -= bl.length(); + } + for (epoch_t e = std::max(first, m->get_inc_first()); + e <= std::min(last, m->get_inc_last()) && max > 0 && max_bytes > 0; + ++e, --max) { + bufferlist& bl = reply->incremental_maps[e]; + int r = get_version(e, features, bl); + ceph_assert(r >= 0); + max_bytes -= bl.length(); + } + reply->oldest_map = first; + reply->newest_map = last; + mon.send_reply(op, reply); + return true; +} + + +// --------------------------- +// UPDATEs + +// failure -- + +bool OSDMonitor::check_source(MonOpRequestRef op, uuid_d fsid) { + // check permissions + MonSession *session = op->get_session(); + if (!session) + return true; + if (!session->is_capable("osd", MON_CAP_X)) { + dout(0) << "got MOSDFailure from entity with insufficient caps " + << session->caps << dendl; + return true; + } + if (fsid != mon.monmap->fsid) { + dout(0) << "check_source: on fsid " << fsid + << " != " << mon.monmap->fsid << dendl; + return true; + } + return false; +} + + +bool OSDMonitor::preprocess_failure(MonOpRequestRef op) +{ + op->mark_osdmon_event(__func__); + auto m = op->get_req<MOSDFailure>(); + // who is target_osd + int badboy = m->get_target_osd(); + + // check permissions + if (check_source(op, m->fsid)) + goto didit; + + // first, verify the reporting host is valid + if (m->get_orig_source().is_osd()) { + int from = m->get_orig_source().num(); + if (!osdmap.exists(from) || + !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs()) || + (osdmap.is_down(from) && m->if_osd_failed())) { + dout(5) << "preprocess_failure from dead osd." << from + << ", ignoring" << dendl; + send_incremental(op, m->get_epoch()+1); + goto didit; + } + } + + + // weird? + if (osdmap.is_down(badboy)) { + dout(5) << "preprocess_failure dne(/dup?): osd." << m->get_target_osd() + << " " << m->get_target_addrs() + << ", from " << m->get_orig_source() << dendl; + if (m->get_epoch() < osdmap.get_epoch()) + send_incremental(op, m->get_epoch()+1); + goto didit; + } + if (osdmap.get_addrs(badboy) != m->get_target_addrs()) { + dout(5) << "preprocess_failure wrong osd: report osd." 
<< m->get_target_osd() + << " " << m->get_target_addrs() + << " != map's " << osdmap.get_addrs(badboy) + << ", from " << m->get_orig_source() << dendl; + if (m->get_epoch() < osdmap.get_epoch()) + send_incremental(op, m->get_epoch()+1); + goto didit; + } + + // already reported? + if (osdmap.is_down(badboy) || + osdmap.get_up_from(badboy) > m->get_epoch()) { + dout(5) << "preprocess_failure dup/old: osd." << m->get_target_osd() + << " " << m->get_target_addrs() + << ", from " << m->get_orig_source() << dendl; + if (m->get_epoch() < osdmap.get_epoch()) + send_incremental(op, m->get_epoch()+1); + goto didit; + } + + if (!can_mark_down(badboy)) { + dout(5) << "preprocess_failure ignoring report of osd." + << m->get_target_osd() << " " << m->get_target_addrs() + << " from " << m->get_orig_source() << dendl; + goto didit; + } + + dout(10) << "preprocess_failure new: osd." << m->get_target_osd() + << " " << m->get_target_addrs() + << ", from " << m->get_orig_source() << dendl; + return false; + + didit: + mon.no_reply(op); + return true; +} + +class C_AckMarkedDown : public C_MonOp { + OSDMonitor *osdmon; +public: + C_AckMarkedDown( + OSDMonitor *osdmon, + MonOpRequestRef op) + : C_MonOp(op), osdmon(osdmon) {} + + void _finish(int r) override { + if (r == 0) { + auto m = op->get_req<MOSDMarkMeDown>(); + osdmon->mon.send_reply( + op, + new MOSDMarkMeDown( + m->fsid, + m->target_osd, + m->target_addrs, + m->get_epoch(), + false)); // ACK itself does not request an ack + } else if (r == -EAGAIN) { + osdmon->dispatch(op); + } else { + ceph_abort_msgf("C_AckMarkedDown: unknown result %d", r); + } + } + ~C_AckMarkedDown() override { + } +}; + +bool OSDMonitor::preprocess_mark_me_down(MonOpRequestRef op) +{ + op->mark_osdmon_event(__func__); + auto m = op->get_req<MOSDMarkMeDown>(); + int from = m->target_osd; + + // check permissions + if (check_source(op, m->fsid)) + goto reply; + + // first, verify the reporting host is valid + if (!m->get_orig_source().is_osd()) + goto reply; + + if (!osdmap.exists(from) || + osdmap.is_down(from) || + osdmap.get_addrs(from) != m->target_addrs) { + dout(5) << "preprocess_mark_me_down from dead osd." + << from << ", ignoring" << dendl; + send_incremental(op, m->get_epoch()+1); + goto reply; + } + + // no down might be set + if (!can_mark_down(from)) + goto reply; + + dout(10) << "MOSDMarkMeDown for: " << m->get_orig_source() + << " " << m->target_addrs << dendl; + return false; + + reply: + if (m->request_ack) { + Context *c(new C_AckMarkedDown(this, op)); + c->complete(0); + } + return true; +} + +bool OSDMonitor::prepare_mark_me_down(MonOpRequestRef op) +{ + op->mark_osdmon_event(__func__); + auto m = op->get_req<MOSDMarkMeDown>(); + int target_osd = m->target_osd; + + ceph_assert(osdmap.is_up(target_osd)); + ceph_assert(osdmap.get_addrs(target_osd) == m->target_addrs); + + mon.clog->info() << "osd." << target_osd << " marked itself " << ((m->down_and_dead) ? 
"down and dead" : "down"); + pending_inc.new_state[target_osd] = CEPH_OSD_UP; + if (m->down_and_dead) { + if (!pending_inc.new_xinfo.count(target_osd)) { + pending_inc.new_xinfo[target_osd] = osdmap.osd_xinfo[target_osd]; + } + pending_inc.new_xinfo[target_osd].dead_epoch = m->get_epoch(); + } + if (m->request_ack) + wait_for_finished_proposal(op, new C_AckMarkedDown(this, op)); + return true; +} + +bool OSDMonitor::preprocess_mark_me_dead(MonOpRequestRef op) +{ + op->mark_osdmon_event(__func__); + auto m = op->get_req<MOSDMarkMeDead>(); + int from = m->target_osd; + + // check permissions + if (check_source(op, m->fsid)) { + mon.no_reply(op); + return true; + } + + // first, verify the reporting host is valid + if (!m->get_orig_source().is_osd()) { + mon.no_reply(op); + return true; + } + + if (!osdmap.exists(from) || + !osdmap.is_down(from)) { + dout(5) << __func__ << " from nonexistent or up osd." << from + << ", ignoring" << dendl; + send_incremental(op, m->get_epoch()+1); + mon.no_reply(op); + return true; + } + + return false; +} + +bool OSDMonitor::prepare_mark_me_dead(MonOpRequestRef op) +{ + op->mark_osdmon_event(__func__); + auto m = op->get_req<MOSDMarkMeDead>(); + int target_osd = m->target_osd; + + ceph_assert(osdmap.is_down(target_osd)); + + mon.clog->info() << "osd." << target_osd << " marked itself dead as of e" + << m->get_epoch(); + if (!pending_inc.new_xinfo.count(target_osd)) { + pending_inc.new_xinfo[target_osd] = osdmap.osd_xinfo[target_osd]; + } + pending_inc.new_xinfo[target_osd].dead_epoch = m->get_epoch(); + wait_for_finished_proposal( + op, + new LambdaContext( + [op, this] (int r) { + if (r >= 0) { + mon.no_reply(op); // ignore on success + } + } + )); + return true; +} + +bool OSDMonitor::can_mark_down(int i) +{ + if (osdmap.is_nodown(i)) { + dout(5) << __func__ << " osd." << i << " is marked as nodown, " + << "will not mark it down" << dendl; + return false; + } + + int num_osds = osdmap.get_num_osds(); + if (num_osds == 0) { + dout(5) << __func__ << " no osds" << dendl; + return false; + } + int up = osdmap.get_num_up_osds() - pending_inc.get_net_marked_down(&osdmap); + float up_ratio = (float)up / (float)num_osds; + if (up_ratio < g_conf()->mon_osd_min_up_ratio) { + dout(2) << __func__ << " current up_ratio " << up_ratio << " < min " + << g_conf()->mon_osd_min_up_ratio + << ", will not mark osd." << i << " down" << dendl; + return false; + } + return true; +} + +bool OSDMonitor::can_mark_up(int i) +{ + if (osdmap.is_noup(i)) { + dout(5) << __func__ << " osd." << i << " is marked as noup, " + << "will not mark it up" << dendl; + return false; + } + + return true; +} + +/** + * @note the parameter @p i apparently only exists here so we can output the + * osd's id on messages. + */ +bool OSDMonitor::can_mark_out(int i) +{ + if (osdmap.is_noout(i)) { + dout(5) << __func__ << " osd." << i << " is marked as noout, " + << "will not mark it out" << dendl; + return false; + } + + int num_osds = osdmap.get_num_osds(); + if (num_osds == 0) { + dout(5) << __func__ << " no osds" << dendl; + return false; + } + int in = osdmap.get_num_in_osds() - pending_inc.get_net_marked_out(&osdmap); + float in_ratio = (float)in / (float)num_osds; + if (in_ratio < g_conf()->mon_osd_min_in_ratio) { + if (i >= 0) + dout(5) << __func__ << " current in_ratio " << in_ratio << " < min " + << g_conf()->mon_osd_min_in_ratio + << ", will not mark osd." 
<< i << " out" << dendl; + else + dout(5) << __func__ << " current in_ratio " << in_ratio << " < min " + << g_conf()->mon_osd_min_in_ratio + << ", will not mark osds out" << dendl; + return false; + } + + return true; +} + +bool OSDMonitor::can_mark_in(int i) +{ + if (osdmap.is_noin(i)) { + dout(5) << __func__ << " osd." << i << " is marked as noin, " + << "will not mark it in" << dendl; + return false; + } + + return true; +} + +bool OSDMonitor::check_failures(utime_t now) +{ + bool found_failure = false; + auto p = failure_info.begin(); + while (p != failure_info.end()) { + auto& [target_osd, fi] = *p; + if (can_mark_down(target_osd) && + check_failure(now, target_osd, fi)) { + found_failure = true; + ++p; + } else if (is_failure_stale(now, fi)) { + dout(10) << " dropping stale failure_info for osd." << target_osd + << " from " << fi.reporters.size() << " reporters" + << dendl; + p = failure_info.erase(p); + } else { + ++p; + } + } + return found_failure; +} + +utime_t OSDMonitor::get_grace_time(utime_t now, + int target_osd, + failure_info_t& fi) const +{ + utime_t orig_grace(g_conf()->osd_heartbeat_grace, 0); + if (!g_conf()->mon_osd_adjust_heartbeat_grace) { + return orig_grace; + } + utime_t grace = orig_grace; + double halflife = (double)g_conf()->mon_osd_laggy_halflife; + double decay_k = ::log(.5) / halflife; + + // scale grace period based on historical probability of 'lagginess' + // (false positive failures due to slowness). + const osd_xinfo_t& xi = osdmap.get_xinfo(target_osd); + const utime_t failed_for = now - fi.get_failed_since(); + double decay = exp((double)failed_for * decay_k); + dout(20) << " halflife " << halflife << " decay_k " << decay_k + << " failed_for " << failed_for << " decay " << decay << dendl; + double my_grace = decay * (double)xi.laggy_interval * xi.laggy_probability; + grace += my_grace; + + // consider the peers reporting a failure a proxy for a potential + // 'subcluster' over the overall cluster that is similarly + // laggy. this is clearly not true in all cases, but will sometimes + // help us localize the grace correction to a subset of the system + // (say, a rack with a bad switch) that is unhappy. + double peer_grace = 0; + for (auto& [reporter, report] : fi.reporters) { + if (osdmap.exists(reporter)) { + const osd_xinfo_t& xi = osdmap.get_xinfo(reporter); + utime_t elapsed = now - xi.down_stamp; + double decay = exp((double)elapsed * decay_k); + peer_grace += decay * (double)xi.laggy_interval * xi.laggy_probability; + } + } + peer_grace /= (double)fi.reporters.size(); + grace += peer_grace; + dout(10) << " osd." << target_osd << " has " + << fi.reporters.size() << " reporters, " + << grace << " grace (" << orig_grace << " + " << my_grace + << " + " << peer_grace << "), max_failed_since " << fi.get_failed_since() + << dendl; + + return grace; +} + +bool OSDMonitor::check_failure(utime_t now, int target_osd, failure_info_t& fi) +{ + // already pending failure? + if (pending_inc.new_state.count(target_osd) && + pending_inc.new_state[target_osd] & CEPH_OSD_UP) { + dout(10) << " already pending failure" << dendl; + return true; + } + + set<string> reporters_by_subtree; + auto reporter_subtree_level = g_conf().get_val<string>("mon_osd_reporter_subtree_level"); + ceph_assert(fi.reporters.size()); + for (auto p = fi.reporters.begin(); p != fi.reporters.end();) { + // get the parent bucket whose type matches with "reporter_subtree_level". + // fall back to OSD if the level doesn't exist. 
+ if (osdmap.exists(p->first)) {
+ auto reporter_loc = osdmap.crush->get_full_location(p->first);
+ if (auto iter = reporter_loc.find(reporter_subtree_level);
+ iter == reporter_loc.end()) {
+ reporters_by_subtree.insert("osd." + to_string(p->first));
+ } else {
+ reporters_by_subtree.insert(iter->second);
+ }
+ ++p;
+ } else {
+ fi.cancel_report(p->first);
+ p = fi.reporters.erase(p);
+ }
+ }
+ if (reporters_by_subtree.size() < g_conf().get_val<uint64_t>("mon_osd_min_down_reporters")) {
+ return false;
+ }
+ const utime_t failed_for = now - fi.get_failed_since();
+ const utime_t grace = get_grace_time(now, target_osd, fi);
+ if (failed_for >= grace) {
+ dout(1) << " we have enough reporters to mark osd." << target_osd
+ << " down" << dendl;
+ pending_inc.new_state[target_osd] = CEPH_OSD_UP;
+
+ mon.clog->info() << "osd." << target_osd << " failed ("
+ << osdmap.crush->get_full_location_ordered_string(
+ target_osd)
+ << ") ("
+ << (int)reporters_by_subtree.size()
+ << " reporters from different "
+ << reporter_subtree_level << " after "
+ << failed_for << " >= grace " << grace << ")";
+ return true;
+ }
+ return false;
+}
+
+bool OSDMonitor::is_failure_stale(utime_t now, failure_info_t& fi) const
+{
+ // if it takes too long to either cancel the report or to mark the osd
+ // down, some reporters must have failed to cancel their reports. let's
+ // just forget these reports.
+ const utime_t failed_for = now - fi.get_failed_since();
+ auto heartbeat_grace = cct->_conf.get_val<int64_t>("osd_heartbeat_grace");
+ auto heartbeat_stale = cct->_conf.get_val<int64_t>("osd_heartbeat_stale");
+ return failed_for >= (heartbeat_grace + heartbeat_stale);
+}
+
+void OSDMonitor::force_failure(int target_osd, int by)
+{
+ // already pending failure?
+ if (pending_inc.new_state.count(target_osd) &&
+ pending_inc.new_state[target_osd] & CEPH_OSD_UP) {
+ dout(10) << " already pending failure" << dendl;
+ return;
+ }
+
+ dout(1) << " we're forcing failure of osd." << target_osd << dendl;
+ pending_inc.new_state[target_osd] = CEPH_OSD_UP;
+ if (!pending_inc.new_xinfo.count(target_osd)) {
+ pending_inc.new_xinfo[target_osd] = osdmap.osd_xinfo[target_osd];
+ }
+ pending_inc.new_xinfo[target_osd].dead_epoch = pending_inc.epoch;
+
+ mon.clog->info() << "osd." << target_osd << " failed ("
+ << osdmap.crush->get_full_location_ordered_string(target_osd)
+ << ") (connection refused reported by osd." << by << ")";
+ return;
+}
+
+bool OSDMonitor::prepare_failure(MonOpRequestRef op)
+{
+ op->mark_osdmon_event(__func__);
+ auto m = op->get_req<MOSDFailure>();
+ dout(1) << "prepare_failure osd." << m->get_target_osd()
+ << " " << m->get_target_addrs()
+ << " from " << m->get_orig_source()
+ << " is reporting failure: " << m->if_osd_failed() << dendl;
+
+ int target_osd = m->get_target_osd();
+ int reporter = m->get_orig_source().num();
+ ceph_assert(osdmap.is_up(target_osd));
+ ceph_assert(osdmap.get_addrs(target_osd) == m->get_target_addrs());
+
+ mon.no_reply(op);
+
+ if (m->if_osd_failed()) {
+ // calculate failure time
+ utime_t now = ceph_clock_now();
+ utime_t failed_since =
+ m->get_recv_stamp() - utime_t(m->failed_for, 0);
+
+ // add a report
+ if (m->is_immediate()) {
+ mon.clog->debug() << "osd." << m->get_target_osd()
+ << " reported immediately failed by "
+ << m->get_orig_source();
+ force_failure(target_osd, reporter);
+ return true;
+ }
+ mon.clog->debug() << "osd."
<< m->get_target_osd() << " reported failed by " + << m->get_orig_source(); + + failure_info_t& fi = failure_info[target_osd]; + fi.add_report(reporter, failed_since, op); + return check_failure(now, target_osd, fi); + } else { + // remove the report + mon.clog->debug() << "osd." << m->get_target_osd() + << " failure report canceled by " + << m->get_orig_source(); + if (failure_info.count(target_osd)) { + failure_info_t& fi = failure_info[target_osd]; + fi.cancel_report(reporter); + if (fi.reporters.empty()) { + dout(10) << " removing last failure_info for osd." << target_osd + << dendl; + failure_info.erase(target_osd); + } else { + dout(10) << " failure_info for osd." << target_osd << " now " + << fi.reporters.size() << " reporters" << dendl; + } + } else { + dout(10) << " no failure_info for osd." << target_osd << dendl; + } + } + + return false; +} + +void OSDMonitor::process_failures() +{ + map<int,failure_info_t>::iterator p = failure_info.begin(); + while (p != failure_info.end()) { + if (osdmap.is_up(p->first)) { + ++p; + } else { + dout(10) << "process_failures osd." << p->first << dendl; + list<MonOpRequestRef> ls; + p->second.take_report_messages(ls); + failure_info.erase(p++); + + while (!ls.empty()) { + MonOpRequestRef o = ls.front(); + if (o) { + o->mark_event(__func__); + MOSDFailure *m = o->get_req<MOSDFailure>(); + send_latest(o, m->get_epoch()); + mon.no_reply(o); + } + ls.pop_front(); + } + } + } +} + +void OSDMonitor::take_all_failures(list<MonOpRequestRef>& ls) +{ + dout(10) << __func__ << " on " << failure_info.size() << " osds" << dendl; + + for (map<int,failure_info_t>::iterator p = failure_info.begin(); + p != failure_info.end(); + ++p) { + p->second.take_report_messages(ls); + } + failure_info.clear(); +} + +int OSDMonitor::get_grace_interval_threshold() +{ + int halflife = g_conf()->mon_osd_laggy_halflife; + // Scale the halflife period (default: 1_hr) by + // a factor (48) to calculate the threshold. 
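+ // Worked example, assuming the default mon_osd_laggy_halflife of one
+ // hour: threshold = 3600 s * 48 = 172800 s, i.e. 48 hours. Callers treat
+ // an OSD whose last failure interval exceeds this as stale history and
+ // reset its laggy parameters (see set_default_laggy_params below).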
+ int grace_threshold_factor = 48; + return halflife * grace_threshold_factor; +} + +bool OSDMonitor::grace_interval_threshold_exceeded(int last_failed_interval) +{ + int grace_interval_threshold_secs = get_grace_interval_threshold(); + if (last_failed_interval > grace_interval_threshold_secs) { + dout(1) << " last_failed_interval " << last_failed_interval + << " > grace_interval_threshold_secs " << grace_interval_threshold_secs + << dendl; + return true; + } + return false; +} + +void OSDMonitor::set_default_laggy_params(int target_osd) +{ + if (pending_inc.new_xinfo.count(target_osd) == 0) { + pending_inc.new_xinfo[target_osd] = osdmap.osd_xinfo[target_osd]; + } + osd_xinfo_t& xi = pending_inc.new_xinfo[target_osd]; + xi.down_stamp = pending_inc.modified; + xi.laggy_probability = 0.0; + xi.laggy_interval = 0; + dout(20) << __func__ << " reset laggy, now xi " << xi << dendl; +} + + +// boot -- + +bool OSDMonitor::preprocess_boot(MonOpRequestRef op) +{ + op->mark_osdmon_event(__func__); + auto m = op->get_req<MOSDBoot>(); + int from = m->get_orig_source_inst().name.num(); + + // check permissions, ignore if failed (no response expected) + MonSession *session = op->get_session(); + if (!session) + goto ignore; + if (!session->is_capable("osd", MON_CAP_X)) { + dout(0) << "got preprocess_boot message from entity with insufficient caps" + << session->caps << dendl; + goto ignore; + } + + if (m->sb.cluster_fsid != mon.monmap->fsid) { + dout(0) << "preprocess_boot on fsid " << m->sb.cluster_fsid + << " != " << mon.monmap->fsid << dendl; + goto ignore; + } + + if (m->get_orig_source_inst().addr.is_blank_ip()) { + dout(0) << "preprocess_boot got blank addr for " << m->get_orig_source_inst() << dendl; + goto ignore; + } + + ceph_assert(m->get_orig_source_inst().name.is_osd()); + + // force all osds to have gone through luminous prior to upgrade to nautilus + { + vector<string> missing; + if (!HAVE_FEATURE(m->osd_features, SERVER_LUMINOUS)) { + missing.push_back("CEPH_FEATURE_SERVER_LUMINOUS"); + } + if (!HAVE_FEATURE(m->osd_features, SERVER_JEWEL)) { + missing.push_back("CEPH_FEATURE_SERVER_JEWEL"); + } + if (!HAVE_FEATURE(m->osd_features, SERVER_KRAKEN)) { + missing.push_back("CEPH_FEATURE_SERVER_KRAKEN"); + } + if (!HAVE_FEATURE(m->osd_features, OSD_RECOVERY_DELETES)) { + missing.push_back("CEPH_FEATURE_OSD_RECOVERY_DELETES"); + } + + if (!missing.empty()) { + using std::experimental::make_ostream_joiner; + + stringstream ss; + copy(begin(missing), end(missing), make_ostream_joiner(ss, ";")); + + mon.clog->info() << "disallowing boot of OSD " + << m->get_orig_source_inst() + << " because the osd lacks " << ss.str(); + goto ignore; + } + } + + // make sure osd versions do not span more than 3 releases + if (HAVE_FEATURE(m->osd_features, SERVER_OCTOPUS) && + osdmap.require_osd_release < ceph_release_t::mimic) { + mon.clog->info() << "disallowing boot of octopus+ OSD " + << m->get_orig_source_inst() + << " because require_osd_release < mimic"; + goto ignore; + } + if (HAVE_FEATURE(m->osd_features, SERVER_PACIFIC) && + osdmap.require_osd_release < ceph_release_t::nautilus) { + mon.clog->info() << "disallowing boot of pacific+ OSD " + << m->get_orig_source_inst() + << " because require_osd_release < nautilus"; + goto ignore; + } + + // The release check here is required because for OSD_PGLOG_HARDLIMIT, + // we are reusing a jewel feature bit that was retired in luminous. 
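+ // Put differently: a jewel-era OSD advertises the recycled bit without
+ // actually implementing pglog_hardlimit, so the feature test below is
+ // only trustworthy once require_osd_release >= luminous guarantees that
+ // no jewel OSDs can still be present.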
+ if (osdmap.require_osd_release >= ceph_release_t::luminous && + osdmap.test_flag(CEPH_OSDMAP_PGLOG_HARDLIMIT) && + !(m->osd_features & CEPH_FEATURE_OSD_PGLOG_HARDLIMIT)) { + mon.clog->info() << "disallowing boot of OSD " + << m->get_orig_source_inst() + << " because 'pglog_hardlimit' osdmap flag is set and OSD lacks the OSD_PGLOG_HARDLIMIT feature"; + goto ignore; + } + + if (osdmap.stretch_mode_enabled && + !(m->osd_features & CEPH_FEATUREMASK_STRETCH_MODE)) { + mon.clog->info() << "disallowing boot of OSD " + << m->get_orig_source_inst() + << " because stretch mode is on and OSD lacks support"; + goto ignore; + } + + // already booted? + if (osdmap.is_up(from) && + osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs()) && + osdmap.get_cluster_addrs(from).legacy_equals(m->cluster_addrs)) { + // yup. + dout(7) << "preprocess_boot dup from " << m->get_orig_source() + << " " << m->get_orig_source_addrs() + << " =~ " << osdmap.get_addrs(from) << dendl; + _booted(op, false); + return true; + } + + if (osdmap.exists(from) && + !osdmap.get_uuid(from).is_zero() && + osdmap.get_uuid(from) != m->sb.osd_fsid) { + dout(7) << __func__ << " from " << m->get_orig_source_inst() + << " clashes with existing osd: different fsid" + << " (ours: " << osdmap.get_uuid(from) + << " ; theirs: " << m->sb.osd_fsid << ")" << dendl; + goto ignore; + } + + if (osdmap.exists(from) && + osdmap.get_info(from).up_from > m->version && + osdmap.get_most_recent_addrs(from).legacy_equals( + m->get_orig_source_addrs())) { + dout(7) << "prepare_boot msg from before last up_from, ignoring" << dendl; + send_latest(op, m->sb.current_epoch+1); + return true; + } + + // noup? + if (!can_mark_up(from)) { + dout(7) << "preprocess_boot ignoring boot from " << m->get_orig_source_inst() << dendl; + send_latest(op, m->sb.current_epoch+1); + return true; + } + + dout(10) << "preprocess_boot from " << m->get_orig_source_inst() << dendl; + return false; + + ignore: + return true; +} + +bool OSDMonitor::prepare_boot(MonOpRequestRef op) +{ + op->mark_osdmon_event(__func__); + auto m = op->get_req<MOSDBoot>(); + dout(7) << __func__ << " from " << m->get_source() + << " sb " << m->sb + << " client_addrs" << m->get_connection()->get_peer_addrs() + << " cluster_addrs " << m->cluster_addrs + << " hb_back_addrs " << m->hb_back_addrs + << " hb_front_addrs " << m->hb_front_addrs + << dendl; + + ceph_assert(m->get_orig_source().is_osd()); + int from = m->get_orig_source().num(); + + // does this osd exist? + if (from >= osdmap.get_max_osd()) { + dout(1) << "boot from osd." << from << " >= max_osd " + << osdmap.get_max_osd() << dendl; + return false; + } + + int oldstate = osdmap.exists(from) ? osdmap.get_state(from) : CEPH_OSD_NEW; + if (pending_inc.new_state.count(from)) + oldstate ^= pending_inc.new_state[from]; + + // already up? mark down first? + if (osdmap.is_up(from)) { + dout(7) << __func__ << " was up, first marking down osd." << from << " " + << osdmap.get_addrs(from) << dendl; + // preprocess should have caught these; if not, assert. 
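+ // Context for the asserts below: preprocess_boot already returned early
+ // on exact duplicates, so if the OSD is still "up" here then its public
+ // or cluster addrs must differ (a genuine restart) and its fsid must
+ // match. Also note that Incremental::new_state holds XOR masks: recording
+ // CEPH_OSD_UP against an up OSD flips it to down when the map is applied.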
+ ceph_assert(!osdmap.get_addrs(from).legacy_equals( + m->get_orig_source_addrs()) || + !osdmap.get_cluster_addrs(from).legacy_equals(m->cluster_addrs)); + ceph_assert(osdmap.get_uuid(from) == m->sb.osd_fsid); + + if (pending_inc.new_state.count(from) == 0 || + (pending_inc.new_state[from] & CEPH_OSD_UP) == 0) { + // mark previous guy down + pending_inc.new_state[from] = CEPH_OSD_UP; + } + wait_for_finished_proposal(op, new C_RetryMessage(this, op)); + } else if (pending_inc.new_up_client.count(from)) { + // already prepared, just wait + dout(7) << __func__ << " already prepared, waiting on " + << m->get_orig_source_addr() << dendl; + wait_for_finished_proposal(op, new C_RetryMessage(this, op)); + } else { + // mark new guy up. + pending_inc.new_up_client[from] = m->get_orig_source_addrs(); + pending_inc.new_up_cluster[from] = m->cluster_addrs; + pending_inc.new_hb_back_up[from] = m->hb_back_addrs; + pending_inc.new_hb_front_up[from] = m->hb_front_addrs; + + down_pending_out.erase(from); // if any + + if (m->sb.weight) + osd_weight[from] = m->sb.weight; + + // set uuid? + dout(10) << " setting osd." << from << " uuid to " << m->sb.osd_fsid + << dendl; + if (!osdmap.exists(from) || osdmap.get_uuid(from) != m->sb.osd_fsid) { + // preprocess should have caught this; if not, assert. + ceph_assert(!osdmap.exists(from) || osdmap.get_uuid(from).is_zero()); + pending_inc.new_uuid[from] = m->sb.osd_fsid; + } + + // fresh osd? + if (m->sb.newest_map == 0 && osdmap.exists(from)) { + const osd_info_t& i = osdmap.get_info(from); + if (i.up_from > i.lost_at) { + dout(10) << " fresh osd; marking lost_at too" << dendl; + pending_inc.new_lost[from] = osdmap.get_epoch(); + } + } + + // metadata + bufferlist osd_metadata; + encode(m->metadata, osd_metadata); + pending_metadata[from] = osd_metadata; + pending_metadata_rm.erase(from); + + // adjust last clean unmount epoch? + const osd_info_t& info = osdmap.get_info(from); + dout(10) << " old osd_info: " << info << dendl; + if (m->sb.mounted > info.last_clean_begin || + (m->sb.mounted == info.last_clean_begin && + m->sb.clean_thru > info.last_clean_end)) { + epoch_t begin = m->sb.mounted; + epoch_t end = m->sb.clean_thru; + + dout(10) << __func__ << " osd." 
<< from << " last_clean_interval " + << "[" << info.last_clean_begin << "," << info.last_clean_end + << ") -> [" << begin << "-" << end << ")" + << dendl; + pending_inc.new_last_clean_interval[from] = + pair<epoch_t,epoch_t>(begin, end); + } + + if (pending_inc.new_xinfo.count(from) == 0) + pending_inc.new_xinfo[from] = osdmap.osd_xinfo[from]; + osd_xinfo_t& xi = pending_inc.new_xinfo[from]; + if (m->boot_epoch == 0) { + xi.laggy_probability *= (1.0 - g_conf()->mon_osd_laggy_weight); + xi.laggy_interval *= (1.0 - g_conf()->mon_osd_laggy_weight); + dout(10) << " not laggy, new xi " << xi << dendl; + } else { + if (xi.down_stamp.sec()) { + int interval = ceph_clock_now().sec() - + xi.down_stamp.sec(); + if (g_conf()->mon_osd_laggy_max_interval && + (interval > g_conf()->mon_osd_laggy_max_interval)) { + interval = g_conf()->mon_osd_laggy_max_interval; + } + xi.laggy_interval = + interval * g_conf()->mon_osd_laggy_weight + + xi.laggy_interval * (1.0 - g_conf()->mon_osd_laggy_weight); + } + xi.laggy_probability = + g_conf()->mon_osd_laggy_weight + + xi.laggy_probability * (1.0 - g_conf()->mon_osd_laggy_weight); + dout(10) << " laggy, now xi " << xi << dendl; + } + + // set features shared by the osd + if (m->osd_features) + xi.features = m->osd_features; + else + xi.features = m->get_connection()->get_features(); + + // mark in? + if ((g_conf()->mon_osd_auto_mark_auto_out_in && + (oldstate & CEPH_OSD_AUTOOUT)) || + (g_conf()->mon_osd_auto_mark_new_in && (oldstate & CEPH_OSD_NEW)) || + (g_conf()->mon_osd_auto_mark_in)) { + if (can_mark_in(from)) { + if (xi.old_weight > 0) { + pending_inc.new_weight[from] = xi.old_weight; + xi.old_weight = 0; + } else { + pending_inc.new_weight[from] = CEPH_OSD_IN; + } + } else { + dout(7) << __func__ << " NOIN set, will not mark in " + << m->get_orig_source_addr() << dendl; + } + } + + // wait + wait_for_finished_proposal(op, new C_Booted(this, op)); + } + return true; +} + +void OSDMonitor::_booted(MonOpRequestRef op, bool logit) +{ + op->mark_osdmon_event(__func__); + auto m = op->get_req<MOSDBoot>(); + dout(7) << "_booted " << m->get_orig_source_inst() + << " w " << m->sb.weight << " from " << m->sb.current_epoch << dendl; + + if (logit) { + mon.clog->info() << m->get_source() << " " << m->get_orig_source_addrs() + << " boot"; + } + + send_latest(op, m->sb.current_epoch+1); +} + + +// ------------- +// full + +bool OSDMonitor::preprocess_full(MonOpRequestRef op) +{ + op->mark_osdmon_event(__func__); + auto m = op->get_req<MOSDFull>(); + int from = m->get_orig_source().num(); + set<string> state; + unsigned mask = CEPH_OSD_NEARFULL | CEPH_OSD_BACKFILLFULL | CEPH_OSD_FULL; + + // check permissions, ignore if failed + MonSession *session = op->get_session(); + if (!session) + goto ignore; + if (!session->is_capable("osd", MON_CAP_X)) { + dout(0) << "MOSDFull from entity with insufficient privileges:" + << session->caps << dendl; + goto ignore; + } + + // ignore a full message from the osd instance that already went down + if (!osdmap.exists(from)) { + dout(7) << __func__ << " ignoring full message from nonexistent " + << m->get_orig_source_inst() << dendl; + goto ignore; + } + if ((!osdmap.is_up(from) && + osdmap.get_most_recent_addrs(from).legacy_equals( + m->get_orig_source_addrs())) || + (osdmap.is_up(from) && + !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs()))) { + dout(7) << __func__ << " ignoring full message from down " + << m->get_orig_source_inst() << dendl; + goto ignore; + } + + OSDMap::calc_state_set(osdmap.get_state(from), state); + 
+ if ((osdmap.get_state(from) & mask) == m->state) { + dout(7) << __func__ << " state already " << state << " for osd." << from + << " " << m->get_orig_source_inst() << dendl; + _reply_map(op, m->version); + goto ignore; + } + + dout(10) << __func__ << " want state " << state << " for osd." << from + << " " << m->get_orig_source_inst() << dendl; + return false; + + ignore: + return true; +} + +bool OSDMonitor::prepare_full(MonOpRequestRef op) +{ + op->mark_osdmon_event(__func__); + auto m = op->get_req<MOSDFull>(); + const int from = m->get_orig_source().num(); + + const unsigned mask = CEPH_OSD_NEARFULL | CEPH_OSD_BACKFILLFULL | CEPH_OSD_FULL; + const unsigned want_state = m->state & mask; // safety first + + unsigned cur_state = osdmap.get_state(from); + auto p = pending_inc.new_state.find(from); + if (p != pending_inc.new_state.end()) { + cur_state ^= p->second; + } + cur_state &= mask; + + set<string> want_state_set, cur_state_set; + OSDMap::calc_state_set(want_state, want_state_set); + OSDMap::calc_state_set(cur_state, cur_state_set); + + if (cur_state != want_state) { + if (p != pending_inc.new_state.end()) { + p->second &= ~mask; + } else { + pending_inc.new_state[from] = 0; + } + pending_inc.new_state[from] |= (osdmap.get_state(from) & mask) ^ want_state; + dout(7) << __func__ << " osd." << from << " " << cur_state_set + << " -> " << want_state_set << dendl; + } else { + dout(7) << __func__ << " osd." << from << " " << cur_state_set + << " = wanted " << want_state_set << ", just waiting" << dendl; + } + + wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version)); + return true; +} + +// ------------- +// alive + +bool OSDMonitor::preprocess_alive(MonOpRequestRef op) +{ + op->mark_osdmon_event(__func__); + auto m = op->get_req<MOSDAlive>(); + int from = m->get_orig_source().num(); + + // check permissions, ignore if failed + MonSession *session = op->get_session(); + if (!session) + goto ignore; + if (!session->is_capable("osd", MON_CAP_X)) { + dout(0) << "attempt to send MOSDAlive from entity with insufficient privileges:" + << session->caps << dendl; + goto ignore; + } + + if (!osdmap.is_up(from) || + !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs())) { + dout(7) << "preprocess_alive ignoring alive message from down " + << m->get_orig_source() << " " << m->get_orig_source_addrs() + << dendl; + goto ignore; + } + + if (osdmap.get_up_thru(from) >= m->want) { + // yup. 
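+ // up_thru already covers what the OSD wants, so there is nothing to
+ // propose; replying with the latest map is all the OSD needs to make
+ // progress.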
+ dout(7) << "preprocess_alive want up_thru " << m->want << " dup from " << m->get_orig_source_inst() << dendl; + _reply_map(op, m->version); + return true; + } + + dout(10) << "preprocess_alive want up_thru " << m->want + << " from " << m->get_orig_source_inst() << dendl; + return false; + + ignore: + return true; +} + +bool OSDMonitor::prepare_alive(MonOpRequestRef op) +{ + op->mark_osdmon_event(__func__); + auto m = op->get_req<MOSDAlive>(); + int from = m->get_orig_source().num(); + + if (0) { // we probably don't care much about these + mon.clog->debug() << m->get_orig_source_inst() << " alive"; + } + + dout(7) << "prepare_alive want up_thru " << m->want << " have " << m->version + << " from " << m->get_orig_source_inst() << dendl; + + update_up_thru(from, m->version); // set to the latest map the OSD has + wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version)); + return true; +} + +void OSDMonitor::_reply_map(MonOpRequestRef op, epoch_t e) +{ + op->mark_osdmon_event(__func__); + dout(7) << "_reply_map " << e + << " from " << op->get_req()->get_orig_source_inst() + << dendl; + send_latest(op, e); +} + +// pg_created +bool OSDMonitor::preprocess_pg_created(MonOpRequestRef op) +{ + op->mark_osdmon_event(__func__); + auto m = op->get_req<MOSDPGCreated>(); + dout(10) << __func__ << " " << *m << dendl; + auto session = op->get_session(); + mon.no_reply(op); + if (!session) { + dout(10) << __func__ << ": no monitor session!" << dendl; + return true; + } + if (!session->is_capable("osd", MON_CAP_X)) { + derr << __func__ << " received from entity " + << "with insufficient privileges " << session->caps << dendl; + return true; + } + // always forward the "created!" to the leader + return false; +} + +bool OSDMonitor::prepare_pg_created(MonOpRequestRef op) +{ + op->mark_osdmon_event(__func__); + auto m = op->get_req<MOSDPGCreated>(); + dout(10) << __func__ << " " << *m << dendl; + auto src = m->get_orig_source(); + auto from = src.num(); + if (!src.is_osd() || + !mon.osdmon()->osdmap.is_up(from) || + !mon.osdmon()->osdmap.get_addrs(from).legacy_equals( + m->get_orig_source_addrs())) { + dout(1) << __func__ << " ignoring stats from non-active osd." << dendl; + return false; + } + pending_created_pgs.push_back(m->pgid); + return true; +} + +bool OSDMonitor::preprocess_pg_ready_to_merge(MonOpRequestRef op) +{ + op->mark_osdmon_event(__func__); + auto m = op->get_req<MOSDPGReadyToMerge>(); + dout(10) << __func__ << " " << *m << dendl; + const pg_pool_t *pi; + auto session = op->get_session(); + if (!session) { + dout(10) << __func__ << ": no monitor session!" 
<< dendl; + goto ignore; + } + if (!session->is_capable("osd", MON_CAP_X)) { + derr << __func__ << " received from entity " + << "with insufficient privileges " << session->caps << dendl; + goto ignore; + } + pi = osdmap.get_pg_pool(m->pgid.pool()); + if (!pi) { + derr << __func__ << " pool for " << m->pgid << " dne" << dendl; + goto ignore; + } + if (pi->get_pg_num() <= m->pgid.ps()) { + dout(20) << " pg_num " << pi->get_pg_num() << " already < " << m->pgid << dendl; + goto ignore; + } + if (pi->get_pg_num() != m->pgid.ps() + 1) { + derr << " OSD trying to merge wrong pgid " << m->pgid << dendl; + goto ignore; + } + if (pi->get_pg_num_pending() > m->pgid.ps()) { + dout(20) << " pg_num_pending " << pi->get_pg_num_pending() << " > " << m->pgid << dendl; + goto ignore; + } + return false; + + ignore: + mon.no_reply(op); + return true; +} + +bool OSDMonitor::prepare_pg_ready_to_merge(MonOpRequestRef op) +{ + op->mark_osdmon_event(__func__); + auto m = op->get_req<MOSDPGReadyToMerge>(); + dout(10) << __func__ << " " << *m << dendl; + pg_pool_t p; + if (pending_inc.new_pools.count(m->pgid.pool())) + p = pending_inc.new_pools[m->pgid.pool()]; + else + p = *osdmap.get_pg_pool(m->pgid.pool()); + if (p.get_pg_num() != m->pgid.ps() + 1 || + p.get_pg_num_pending() > m->pgid.ps()) { + dout(10) << __func__ + << " race with concurrent pg_num[_pending] update, will retry" + << dendl; + wait_for_finished_proposal(op, new C_RetryMessage(this, op)); + return true; + } + + if (m->ready) { + p.dec_pg_num(m->pgid, + pending_inc.epoch, + m->source_version, + m->target_version, + m->last_epoch_started, + m->last_epoch_clean); + p.last_change = pending_inc.epoch; + } else { + // back off the merge attempt! + p.set_pg_num_pending(p.get_pg_num()); + } + + // force pre-nautilus clients to resend their ops, since they + // don't understand pg_num_pending changes form a new interval + p.last_force_op_resend_prenautilus = pending_inc.epoch; + + pending_inc.new_pools[m->pgid.pool()] = p; + + auto prob = g_conf().get_val<double>("mon_inject_pg_merge_bounce_probability"); + if (m->ready && + prob > 0 && + prob > (double)(rand() % 1000)/1000.0) { + derr << __func__ << " injecting pg merge pg_num bounce" << dendl; + auto n = new MMonCommand(mon.monmap->get_fsid()); + n->set_connection(m->get_connection()); + n->cmd = { "{\"prefix\":\"osd pool set\", \"pool\": \"" + + osdmap.get_pool_name(m->pgid.pool()) + + "\", \"var\": \"pg_num_actual\", \"val\": \"" + + stringify(m->pgid.ps() + 1) + "\"}" }; + MonOpRequestRef nop = mon.op_tracker.create_request<MonOpRequest>(n); + nop->set_type_service(); + wait_for_finished_proposal(op, new C_RetryMessage(this, nop)); + } else { + wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version)); + } + return true; +} + + +// ------------- +// pg_temp changes + +bool OSDMonitor::preprocess_pgtemp(MonOpRequestRef op) +{ + auto m = op->get_req<MOSDPGTemp>(); + dout(10) << "preprocess_pgtemp " << *m << dendl; + mempool::osdmap::vector<int> empty; + int from = m->get_orig_source().num(); + size_t ignore_cnt = 0; + + // check caps + MonSession *session = op->get_session(); + if (!session) + goto ignore; + if (!session->is_capable("osd", MON_CAP_X)) { + dout(0) << "attempt to send MOSDPGTemp from entity with insufficient caps " + << session->caps << dendl; + goto ignore; + } + + if (!osdmap.is_up(from) || + !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs())) { + dout(7) << "ignoring pgtemp message from down " + << m->get_orig_source() << " " << m->get_orig_source_addrs() + 
<< dendl; + goto ignore; + } + + if (m->forced) { + return false; + } + + for (auto p = m->pg_temp.begin(); p != m->pg_temp.end(); ++p) { + dout(20) << " " << p->first + << (osdmap.pg_temp->count(p->first) ? osdmap.pg_temp->get(p->first) : empty) + << " -> " << p->second << dendl; + + // does the pool exist? + if (!osdmap.have_pg_pool(p->first.pool())) { + /* + * 1. If the osdmap does not have the pool, it means the pool has been + * removed in-between the osd sending this message and us handling it. + * 2. If osdmap doesn't have the pool, it is safe to assume the pool does + * not exist in the pending either, as the osds would not send a + * message about a pool they know nothing about (yet). + * 3. However, if the pool does exist in the pending, then it must be a + * new pool, and not relevant to this message (see 1). + */ + dout(10) << __func__ << " ignore " << p->first << " -> " << p->second + << ": pool has been removed" << dendl; + ignore_cnt++; + continue; + } + + int acting_primary = -1; + osdmap.pg_to_up_acting_osds( + p->first, nullptr, nullptr, nullptr, &acting_primary); + if (acting_primary != from) { + /* If the source isn't the primary based on the current osdmap, we know + * that the interval changed and that we can discard this message. + * Indeed, we must do so to avoid 16127 since we can't otherwise determine + * which of two pg temp mappings on the same pg is more recent. + */ + dout(10) << __func__ << " ignore " << p->first << " -> " << p->second + << ": primary has changed" << dendl; + ignore_cnt++; + continue; + } + + // removal? + if (p->second.empty() && (osdmap.pg_temp->count(p->first) || + osdmap.primary_temp->count(p->first))) + return false; + // change? + // NOTE: we assume that this will clear pg_primary, so consider + // an existing pg_primary field to imply a change + if (p->second.size() && + (osdmap.pg_temp->count(p->first) == 0 || + osdmap.pg_temp->get(p->first) != p->second || + osdmap.primary_temp->count(p->first))) + return false; + } + + // should we ignore all the pgs? 
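+ // At this point no entry required an actual change: each one was either
+ // ignorable (deleted pool, changed primary) or already reflected in the
+ // map. If *every* entry was ignorable we drop the message; otherwise we
+ // reply with the map so the sender learns this was a no-op.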
+ if (ignore_cnt == m->pg_temp.size()) + goto ignore; + + dout(7) << "preprocess_pgtemp e" << m->map_epoch << " no changes from " << m->get_orig_source_inst() << dendl; + _reply_map(op, m->map_epoch); + return true; + + ignore: + mon.no_reply(op); + return true; +} + +void OSDMonitor::update_up_thru(int from, epoch_t up_thru) +{ + epoch_t old_up_thru = osdmap.get_up_thru(from); + auto ut = pending_inc.new_up_thru.find(from); + if (ut != pending_inc.new_up_thru.end()) { + old_up_thru = ut->second; + } + if (up_thru > old_up_thru) { + // set up_thru too, so the osd doesn't have to ask again + pending_inc.new_up_thru[from] = up_thru; + } +} + +bool OSDMonitor::prepare_pgtemp(MonOpRequestRef op) +{ + op->mark_osdmon_event(__func__); + auto m = op->get_req<MOSDPGTemp>(); + int from = m->get_orig_source().num(); + dout(7) << "prepare_pgtemp e" << m->map_epoch << " from " << m->get_orig_source_inst() << dendl; + for (map<pg_t,vector<int32_t> >::iterator p = m->pg_temp.begin(); p != m->pg_temp.end(); ++p) { + uint64_t pool = p->first.pool(); + if (pending_inc.old_pools.count(pool)) { + dout(10) << __func__ << " ignore " << p->first << " -> " << p->second + << ": pool pending removal" << dendl; + continue; + } + if (!osdmap.have_pg_pool(pool)) { + dout(10) << __func__ << " ignore " << p->first << " -> " << p->second + << ": pool has been removed" << dendl; + continue; + } + pending_inc.new_pg_temp[p->first] = + mempool::osdmap::vector<int>(p->second.begin(), p->second.end()); + + // unconditionally clear pg_primary (until this message can encode + // a change for that, too.. at which point we need to also fix + // preprocess_pg_temp) + if (osdmap.primary_temp->count(p->first) || + pending_inc.new_primary_temp.count(p->first)) + pending_inc.new_primary_temp[p->first] = -1; + } + + // set up_thru too, so the osd doesn't have to ask again + update_up_thru(from, m->map_epoch); + + wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->map_epoch)); + return true; +} + + +// --- + +bool OSDMonitor::preprocess_remove_snaps(MonOpRequestRef op) +{ + op->mark_osdmon_event(__func__); + auto m = op->get_req<MRemoveSnaps>(); + dout(7) << "preprocess_remove_snaps " << *m << dendl; + + // check privilege, ignore if failed + MonSession *session = op->get_session(); + mon.no_reply(op); + if (!session) + goto ignore; + if (!session->caps.is_capable( + cct, + session->entity_name, + "osd", "osd pool rmsnap", {}, true, true, false, + session->get_peer_socket_addr())) { + dout(0) << "got preprocess_remove_snaps from entity with insufficient caps " + << session->caps << dendl; + goto ignore; + } + + for (map<int, vector<snapid_t> >::iterator q = m->snaps.begin(); + q != m->snaps.end(); + ++q) { + if (!osdmap.have_pg_pool(q->first)) { + dout(10) << " ignoring removed_snaps " << q->second + << " on non-existent pool " << q->first << dendl; + continue; + } + const pg_pool_t *pi = osdmap.get_pg_pool(q->first); + for (vector<snapid_t>::iterator p = q->second.begin(); + p != q->second.end(); + ++p) { + if (*p > pi->get_snap_seq() || + !_is_removed_snap(q->first, *p)) { + return false; + } + } + } + + if (HAVE_FEATURE(m->get_connection()->get_features(), SERVER_OCTOPUS)) { + auto reply = make_message<MRemoveSnaps>(); + reply->snaps = m->snaps; + mon.send_reply(op, reply.detach()); + } + + ignore: + return true; +} + +bool OSDMonitor::prepare_remove_snaps(MonOpRequestRef op) +{ + op->mark_osdmon_event(__func__); + auto m = op->get_req<MRemoveSnaps>(); + dout(7) << "prepare_remove_snaps " << *m << dendl; + + for (auto& 
[pool, snaps] : m->snaps) { + if (!osdmap.have_pg_pool(pool)) { + dout(10) << " ignoring removed_snaps " << snaps + << " on non-existent pool " << pool << dendl; + continue; + } + + pg_pool_t& pi = osdmap.pools[pool]; + for (auto s : snaps) { + if (!_is_removed_snap(pool, s) && + (!pending_inc.new_pools.count(pool) || + !pending_inc.new_pools[pool].removed_snaps.contains(s)) && + (!pending_inc.new_removed_snaps.count(pool) || + !pending_inc.new_removed_snaps[pool].contains(s))) { + pg_pool_t *newpi = pending_inc.get_new_pool(pool, &pi); + if (osdmap.require_osd_release < ceph_release_t::octopus) { + newpi->removed_snaps.insert(s); + dout(10) << " pool " << pool << " removed_snaps added " << s + << " (now " << newpi->removed_snaps << ")" << dendl; + } + newpi->flags |= pg_pool_t::FLAG_SELFMANAGED_SNAPS; + if (s > newpi->get_snap_seq()) { + dout(10) << " pool " << pool << " snap_seq " + << newpi->get_snap_seq() << " -> " << s << dendl; + newpi->set_snap_seq(s); + } + newpi->set_snap_epoch(pending_inc.epoch); + dout(10) << " added pool " << pool << " snap " << s + << " to removed_snaps queue" << dendl; + pending_inc.new_removed_snaps[pool].insert(s); + } + } + } + + if (HAVE_FEATURE(m->get_connection()->get_features(), SERVER_OCTOPUS)) { + auto reply = make_message<MRemoveSnaps>(); + reply->snaps = m->snaps; + wait_for_finished_proposal(op, new C_ReplyOp(this, op, reply)); + } + + return true; +} + +bool OSDMonitor::preprocess_get_purged_snaps(MonOpRequestRef op) +{ + op->mark_osdmon_event(__func__); + auto m = op->get_req<MMonGetPurgedSnaps>(); + dout(7) << __func__ << " " << *m << dendl; + + map<epoch_t,mempool::osdmap::map<int64_t,snap_interval_set_t>> r; + + string k = make_purged_snap_epoch_key(m->start); + auto it = mon.store->get_iterator(OSD_SNAP_PREFIX); + it->upper_bound(k); + unsigned long epoch = m->last; + while (it->valid()) { + if (it->key().find("purged_epoch_") != 0) { + break; + } + string k = it->key(); + int n = sscanf(k.c_str(), "purged_epoch_%lx", &epoch); + if (n != 1) { + derr << __func__ << " unable to parse key '" << it->key() << "'" << dendl; + } else if (epoch > m->last) { + break; + } else { + bufferlist bl = it->value(); + auto p = bl.cbegin(); + auto &v = r[epoch]; + try { + ceph::decode(v, p); + } catch (ceph::buffer::error& e) { + derr << __func__ << " unable to parse value for key '" << it->key() + << "': \n"; + bl.hexdump(*_dout); + *_dout << dendl; + } + n += 4 + v.size() * 16; + } + if (n > 1048576) { + // impose a semi-arbitrary limit to message size + break; + } + it->next(); + } + + auto reply = make_message<MMonGetPurgedSnapsReply>(m->start, epoch); + reply->purged_snaps.swap(r); + mon.send_reply(op, reply.detach()); + + return true; +} + +// osd beacon +bool OSDMonitor::preprocess_beacon(MonOpRequestRef op) +{ + op->mark_osdmon_event(__func__); + // check caps + auto session = op->get_session(); + mon.no_reply(op); + if (!session) { + dout(10) << __func__ << " no monitor session!" << dendl; + return true; + } + if (!session->is_capable("osd", MON_CAP_X)) { + derr << __func__ << " received from entity " + << "with insufficient privileges " << session->caps << dendl; + return true; + } + // Always forward the beacon to the leader, even if they are the same as + // the old one. The leader will mark as down osds that haven't sent + // beacon for a few minutes. 
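+ // Returning false from a preprocess_* handler tells the PaxosService
+ // dispatch path that the message was not fully handled read-only, so the
+ // op is routed on to the leader's prepare_beacon for a possible update.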
+ return false; +} + +bool OSDMonitor::prepare_beacon(MonOpRequestRef op) +{ + op->mark_osdmon_event(__func__); + const auto beacon = op->get_req<MOSDBeacon>(); + const auto src = beacon->get_orig_source(); + dout(10) << __func__ << " " << *beacon + << " from " << src << dendl; + int from = src.num(); + + if (!src.is_osd() || + !osdmap.is_up(from) || + !osdmap.get_addrs(from).legacy_equals(beacon->get_orig_source_addrs())) { + if (src.is_osd() && !osdmap.is_up(from)) { + // share some new maps with this guy in case it may not be + // aware of its own deadness... + send_latest(op, beacon->version+1); + } + dout(1) << " ignoring beacon from non-active osd." << from << dendl; + return false; + } + + last_osd_report[from].first = ceph_clock_now(); + last_osd_report[from].second = beacon->osd_beacon_report_interval; + osd_epochs[from] = beacon->version; + + for (const auto& pg : beacon->pgs) { + if (auto* pool = osdmap.get_pg_pool(pg.pool()); pool != nullptr) { + unsigned pg_num = pool->get_pg_num(); + last_epoch_clean.report(pg_num, pg, beacon->min_last_epoch_clean); + } + } + + if (osdmap.osd_xinfo[from].last_purged_snaps_scrub < + beacon->last_purged_snaps_scrub) { + if (pending_inc.new_xinfo.count(from) == 0) { + pending_inc.new_xinfo[from] = osdmap.osd_xinfo[from]; + } + pending_inc.new_xinfo[from].last_purged_snaps_scrub = + beacon->last_purged_snaps_scrub; + return true; + } else { + return false; + } +} + +// --------------- +// map helpers + +void OSDMonitor::send_latest(MonOpRequestRef op, epoch_t start) +{ + op->mark_osdmon_event(__func__); + dout(5) << "send_latest to " << op->get_req()->get_orig_source_inst() + << " start " << start << dendl; + if (start == 0) + send_full(op); + else + send_incremental(op, start); +} + + +MOSDMap *OSDMonitor::build_latest_full(uint64_t features) +{ + MOSDMap *r = new MOSDMap(mon.monmap->fsid, features); + get_version_full(osdmap.get_epoch(), features, r->maps[osdmap.get_epoch()]); + r->oldest_map = get_first_committed(); + r->newest_map = osdmap.get_epoch(); + return r; +} + +MOSDMap *OSDMonitor::build_incremental(epoch_t from, epoch_t to, uint64_t features) +{ + dout(10) << "build_incremental [" << from << ".." << to << "] with features " + << std::hex << features << std::dec << dendl; + MOSDMap *m = new MOSDMap(mon.monmap->fsid, features); + m->oldest_map = get_first_committed(); + m->newest_map = osdmap.get_epoch(); + + for (epoch_t e = to; e >= from && e > 0; e--) { + bufferlist bl; + int err = get_version(e, features, bl); + if (err == 0) { + ceph_assert(bl.length()); + // if (get_version(e, bl) > 0) { + dout(20) << "build_incremental inc " << e << " " + << bl.length() << " bytes" << dendl; + m->incremental_maps[e] = bl; + } else { + ceph_assert(err == -ENOENT); + ceph_assert(!bl.length()); + get_version_full(e, features, bl); + if (bl.length() > 0) { + //else if (get_version("full", e, bl) > 0) { + dout(20) << "build_incremental full " << e << " " + << bl.length() << " bytes" << dendl; + m->maps[e] = bl; + } else { + ceph_abort(); // we should have all maps. 
+ } + } + } + return m; +} + +void OSDMonitor::send_full(MonOpRequestRef op) +{ + op->mark_osdmon_event(__func__); + dout(5) << "send_full to " << op->get_req()->get_orig_source_inst() << dendl; + mon.send_reply(op, build_latest_full(op->get_session()->con_features)); +} + +void OSDMonitor::send_incremental(MonOpRequestRef op, epoch_t first) +{ + op->mark_osdmon_event(__func__); + + MonSession *s = op->get_session(); + ceph_assert(s); + + if (s->proxy_con) { + // oh, we can tell the other mon to do it + dout(10) << __func__ << " asking proxying mon to send_incremental from " + << first << dendl; + MRoute *r = new MRoute(s->proxy_tid, NULL); + r->send_osdmap_first = first; + s->proxy_con->send_message(r); + op->mark_event("reply: send routed send_osdmap_first reply"); + } else { + // do it ourselves + send_incremental(first, s, false, op); + } +} + +void OSDMonitor::send_incremental(epoch_t first, + MonSession *session, + bool onetime, + MonOpRequestRef req) +{ + dout(5) << "send_incremental [" << first << ".." << osdmap.get_epoch() << "]" + << " to " << session->name << dendl; + + // get feature of the peer + // use quorum_con_features, if it's an anonymous connection. + uint64_t features = session->con_features ? session->con_features : + mon.get_quorum_con_features(); + + if (first <= session->osd_epoch) { + dout(10) << __func__ << " " << session->name << " should already have epoch " + << session->osd_epoch << dendl; + first = session->osd_epoch + 1; + } + + if (first < get_first_committed()) { + MOSDMap *m = new MOSDMap(osdmap.get_fsid(), features); + m->oldest_map = get_first_committed(); + m->newest_map = osdmap.get_epoch(); + + first = get_first_committed(); + bufferlist bl; + int err = get_version_full(first, features, bl); + ceph_assert(err == 0); + ceph_assert(bl.length()); + dout(20) << "send_incremental starting with base full " + << first << " " << bl.length() << " bytes" << dendl; + m->maps[first] = bl; + + if (req) { + mon.send_reply(req, m); + session->osd_epoch = first; + return; + } else { + session->con->send_message(m); + session->osd_epoch = first; + } + first++; + } + + while (first <= osdmap.get_epoch()) { + epoch_t last = std::min<epoch_t>(first + g_conf()->osd_map_message_max - 1, + osdmap.get_epoch()); + MOSDMap *m = build_incremental(first, last, features); + + if (req) { + // send some maps. it may not be all of them, but it will get them + // started. + mon.send_reply(req, m); + } else { + session->con->send_message(m); + first = last + 1; + } + session->osd_epoch = last; + if (onetime || req) + break; + } +} + +int OSDMonitor::get_version(version_t ver, bufferlist& bl) +{ + return get_version(ver, mon.get_quorum_con_features(), bl); +} + +void OSDMonitor::reencode_incremental_map(bufferlist& bl, uint64_t features) +{ + OSDMap::Incremental inc; + auto q = bl.cbegin(); + inc.decode(q); + // always encode with subset of osdmap's canonical features + uint64_t f = features & inc.encode_features; + dout(20) << __func__ << " " << inc.epoch << " with features " << f + << dendl; + bl.clear(); + if (inc.fullmap.length()) { + // embedded full map? 
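+ // The embedded blobs carry their own encodings, so re-encoding only the
+ // outer incremental is not enough; each inner payload is decoded and
+ // re-encoded with the reduced feature set `f` as well, following the
+ // usual decode/clear/encode pattern (a sketch, assuming an encodable
+ // `obj` stored in bufferlist `blob`):
+ //
+ // auto it = blob.cbegin();
+ // obj.decode(it); // decode with whatever features it was built
+ // blob.clear();
+ // obj.encode(blob, f); // re-encode for the peer's feature set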
+ OSDMap m; + m.decode(inc.fullmap); + inc.fullmap.clear(); + m.encode(inc.fullmap, f | CEPH_FEATURE_RESERVED); + } + if (inc.crush.length()) { + // embedded crush map + CrushWrapper c; + auto p = inc.crush.cbegin(); + c.decode(p); + inc.crush.clear(); + c.encode(inc.crush, f); + } + inc.encode(bl, f | CEPH_FEATURE_RESERVED); +} + +void OSDMonitor::reencode_full_map(bufferlist& bl, uint64_t features) +{ + OSDMap m; + auto q = bl.cbegin(); + m.decode(q); + // always encode with subset of osdmap's canonical features + uint64_t f = features & m.get_encoding_features(); + dout(20) << __func__ << " " << m.get_epoch() << " with features " << f + << dendl; + bl.clear(); + m.encode(bl, f | CEPH_FEATURE_RESERVED); +} + +int OSDMonitor::get_version(version_t ver, uint64_t features, bufferlist& bl) +{ + uint64_t significant_features = OSDMap::get_significant_features(features); + if (inc_osd_cache.lookup({ver, significant_features}, &bl)) { + return 0; + } + int ret = PaxosService::get_version(ver, bl); + if (ret < 0) { + return ret; + } + // NOTE: this check is imprecise; the OSDMap encoding features may + // be a subset of the latest mon quorum features, but worst case we + // reencode once and then cache the (identical) result under both + // feature masks. + if (significant_features != + OSDMap::get_significant_features(mon.get_quorum_con_features())) { + reencode_incremental_map(bl, features); + } + inc_osd_cache.add_bytes({ver, significant_features}, bl); + return 0; +} + +int OSDMonitor::get_inc(version_t ver, OSDMap::Incremental& inc) +{ + bufferlist inc_bl; + int err = get_version(ver, inc_bl); + ceph_assert(err == 0); + ceph_assert(inc_bl.length()); + + auto p = inc_bl.cbegin(); + inc.decode(p); + dout(10) << __func__ << " " + << " epoch " << inc.epoch + << " inc_crc " << inc.inc_crc + << " full_crc " << inc.full_crc + << " encode_features " << inc.encode_features << dendl; + return 0; +} + +int OSDMonitor::get_full_from_pinned_map(version_t ver, bufferlist& bl) +{ + dout(10) << __func__ << " ver " << ver << dendl; + + version_t closest_pinned = osdmap_manifest.get_lower_closest_pinned(ver); + if (closest_pinned == 0) { + return -ENOENT; + } + if (closest_pinned > ver) { + dout(0) << __func__ << " pinned: " << osdmap_manifest.pinned << dendl; + } + ceph_assert(closest_pinned <= ver); + + dout(10) << __func__ << " closest pinned ver " << closest_pinned << dendl; + + // get osdmap incremental maps and apply on top of this one. + bufferlist osdm_bl; + bool has_cached_osdmap = false; + for (version_t v = ver-1; v >= closest_pinned; --v) { + if (full_osd_cache.lookup({v, mon.get_quorum_con_features()}, + &osdm_bl)) { + dout(10) << __func__ << " found map in cache ver " << v << dendl; + closest_pinned = v; + has_cached_osdmap = true; + break; + } + } + + if (!has_cached_osdmap) { + int err = PaxosService::get_version_full(closest_pinned, osdm_bl); + if (err != 0) { + derr << __func__ << " closest pinned map ver " << closest_pinned + << " not available! error: " << cpp_strerror(err) << dendl; + } + ceph_assert(err == 0); + } + + ceph_assert(osdm_bl.length()); + + OSDMap osdm; + osdm.decode(osdm_bl); + + dout(10) << __func__ << " loaded osdmap epoch " << closest_pinned + << " e" << osdm.epoch + << " crc " << osdm.get_crc() + << " -- applying incremental maps." 
<< dendl; + + uint64_t encode_features = 0; + for (version_t v = closest_pinned + 1; v <= ver; ++v) { + dout(20) << __func__ << " applying inc epoch " << v << dendl; + + OSDMap::Incremental inc; + int err = get_inc(v, inc); + ceph_assert(err == 0); + + encode_features = inc.encode_features; + + err = osdm.apply_incremental(inc); + ceph_assert(err == 0); + + // this block performs paranoid checks on map retrieval + if (g_conf().get_val<bool>("mon_debug_extra_checks") && + inc.full_crc != 0) { + + uint64_t f = encode_features; + if (!f) { + f = (mon.quorum_con_features ? mon.quorum_con_features : -1); + } + + // encode osdmap to force calculating crcs + bufferlist tbl; + osdm.encode(tbl, f | CEPH_FEATURE_RESERVED); + // decode osdmap to compare crcs with what's expected by incremental + OSDMap tosdm; + tosdm.decode(tbl); + + if (tosdm.get_crc() != inc.full_crc) { + derr << __func__ + << " osdmap crc mismatch! (osdmap crc " << tosdm.get_crc() + << ", expected " << inc.full_crc << ")" << dendl; + ceph_abort_msg("osdmap crc mismatch"); + } + } + + // note: we cannot add the recently computed map to the cache, as is, + // because we have not encoded the map into a bl. + } + + if (!encode_features) { + dout(10) << __func__ + << " last incremental map didn't have features;" + << " defaulting to quorum's or all" << dendl; + encode_features = + (mon.quorum_con_features ? mon.quorum_con_features : -1); + } + osdm.encode(bl, encode_features | CEPH_FEATURE_RESERVED); + + return 0; +} + +int OSDMonitor::get_version_full(version_t ver, bufferlist& bl) +{ + return get_version_full(ver, mon.get_quorum_con_features(), bl); +} + +int OSDMonitor::get_version_full(version_t ver, uint64_t features, + bufferlist& bl) +{ + uint64_t significant_features = OSDMap::get_significant_features(features); + if (full_osd_cache.lookup({ver, significant_features}, &bl)) { + return 0; + } + int ret = PaxosService::get_version_full(ver, bl); + if (ret == -ENOENT) { + // build map? + ret = get_full_from_pinned_map(ver, bl); + } + if (ret < 0) { + return ret; + } + // NOTE: this check is imprecise; the OSDMap encoding features may + // be a subset of the latest mon quorum features, but worst case we + // reencode once and then cache the (identical) result under both + // feature masks. 
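+ // the cache key pairs the version with the significant feature bits,
+ // so copies of the same epoch encoded for different feature subsets
+ // can coexist without evicting one another.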
+ if (significant_features != + OSDMap::get_significant_features(mon.get_quorum_con_features())) { + reencode_full_map(bl, features); + } + full_osd_cache.add_bytes({ver, significant_features}, bl); + return 0; +} + +epoch_t OSDMonitor::blocklist(const entity_addrvec_t& av, utime_t until) +{ + dout(10) << "blocklist " << av << " until " << until << dendl; + for (auto a : av.v) { + if (osdmap.require_osd_release >= ceph_release_t::nautilus) { + a.set_type(entity_addr_t::TYPE_ANY); + } else { + a.set_type(entity_addr_t::TYPE_LEGACY); + } + pending_inc.new_blocklist[a] = until; + } + return pending_inc.epoch; +} + +epoch_t OSDMonitor::blocklist(entity_addr_t a, utime_t until) +{ + if (osdmap.require_osd_release >= ceph_release_t::nautilus) { + a.set_type(entity_addr_t::TYPE_ANY); + } else { + a.set_type(entity_addr_t::TYPE_LEGACY); + } + dout(10) << "blocklist " << a << " until " << until << dendl; + pending_inc.new_blocklist[a] = until; + return pending_inc.epoch; +} + + +void OSDMonitor::check_osdmap_subs() +{ + dout(10) << __func__ << dendl; + if (!osdmap.get_epoch()) { + return; + } + auto osdmap_subs = mon.session_map.subs.find("osdmap"); + if (osdmap_subs == mon.session_map.subs.end()) { + return; + } + auto p = osdmap_subs->second->begin(); + while (!p.end()) { + auto sub = *p; + ++p; + check_osdmap_sub(sub); + } +} + +void OSDMonitor::check_osdmap_sub(Subscription *sub) +{ + dout(10) << __func__ << " " << sub << " next " << sub->next + << (sub->onetime ? " (onetime)":" (ongoing)") << dendl; + if (sub->next <= osdmap.get_epoch()) { + if (sub->next >= 1) + send_incremental(sub->next, sub->session, sub->incremental_onetime); + else + sub->session->con->send_message(build_latest_full(sub->session->con_features)); + if (sub->onetime) + mon.session_map.remove_sub(sub); + else + sub->next = osdmap.get_epoch() + 1; + } +} + +void OSDMonitor::check_pg_creates_subs() +{ + if (!osdmap.get_num_up_osds()) { + return; + } + ceph_assert(osdmap.get_up_osd_features() & CEPH_FEATURE_MON_STATEFUL_SUB); + mon.with_session_map([this](const MonSessionMap& session_map) { + auto pg_creates_subs = session_map.subs.find("osd_pg_creates"); + if (pg_creates_subs == session_map.subs.end()) { + return; + } + for (auto sub : *pg_creates_subs->second) { + check_pg_creates_sub(sub); + } + }); +} + +void OSDMonitor::check_pg_creates_sub(Subscription *sub) +{ + dout(20) << __func__ << " .. " << sub->session->name << dendl; + ceph_assert(sub->type == "osd_pg_creates"); + // only send these if the OSD is up. we will check_subs() when they do + // come up so they will get the creates then. 
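+ // send_pg_creates() returns the epoch the subscription is current
+ // through; storing it in sub->next keeps us from resending the same
+ // creates on the next pass.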
+ if (sub->session->name.is_osd() && + mon.osdmon()->osdmap.is_up(sub->session->name.num())) { + sub->next = send_pg_creates(sub->session->name.num(), + sub->session->con.get(), + sub->next); + } +} + +void OSDMonitor::do_application_enable(int64_t pool_id, + const std::string &app_name, + const std::string &app_key, + const std::string &app_value, + bool force) +{ + ceph_assert(paxos.is_plugged() && is_writeable()); + + dout(20) << __func__ << ": pool_id=" << pool_id << ", app_name=" << app_name + << dendl; + + ceph_assert(osdmap.require_osd_release >= ceph_release_t::luminous); + + auto pp = osdmap.get_pg_pool(pool_id); + ceph_assert(pp != nullptr); + + pg_pool_t p = *pp; + if (pending_inc.new_pools.count(pool_id)) { + p = pending_inc.new_pools[pool_id]; + } + + if (app_key.empty()) { + p.application_metadata.insert({app_name, {}}); + } else { + if (force) { + p.application_metadata[app_name][app_key] = app_value; + } else { + p.application_metadata.insert({app_name, {{app_key, app_value}}}); + } + } + p.last_change = pending_inc.epoch; + pending_inc.new_pools[pool_id] = p; +} + +void OSDMonitor::do_set_pool_opt(int64_t pool_id, + pool_opts_t::key_t opt, + pool_opts_t::value_t val) +{ + dout(10) << __func__ << " pool: " << pool_id << " option: " << opt + << " val: " << val << dendl; + auto p = pending_inc.new_pools.try_emplace( + pool_id, *osdmap.get_pg_pool(pool_id)); + p.first->second.opts.set(opt, val); +} + +unsigned OSDMonitor::scan_for_creating_pgs( + const mempool::osdmap::map<int64_t,pg_pool_t>& pools, + const mempool::osdmap::set<int64_t>& removed_pools, + utime_t modified, + creating_pgs_t* creating_pgs) const +{ + unsigned queued = 0; + for (auto& p : pools) { + int64_t poolid = p.first; + if (creating_pgs->created_pools.count(poolid)) { + dout(10) << __func__ << " already created " << poolid << dendl; + continue; + } + const pg_pool_t& pool = p.second; + int ruleno = osdmap.crush->find_rule(pool.get_crush_rule(), + pool.get_type(), pool.get_size()); + if (ruleno < 0 || !osdmap.crush->rule_exists(ruleno)) + continue; + + const auto last_scan_epoch = creating_pgs->last_scan_epoch; + const auto created = pool.get_last_change(); + if (last_scan_epoch && created <= last_scan_epoch) { + dout(10) << __func__ << " no change in pool " << poolid + << " " << pool << dendl; + continue; + } + if (removed_pools.count(poolid)) { + dout(10) << __func__ << " pool is being removed: " << poolid + << " " << pool << dendl; + continue; + } + dout(10) << __func__ << " queueing pool create for " << poolid + << " " << pool << dendl; + creating_pgs->create_pool(poolid, pool.get_pg_num(), + created, modified); + queued++; + } + return queued; +} + +void OSDMonitor::update_creating_pgs() +{ + dout(10) << __func__ << " " << creating_pgs.pgs.size() << " pgs creating, " + << creating_pgs.queue.size() << " pools in queue" << dendl; + decltype(creating_pgs_by_osd_epoch) new_pgs_by_osd_epoch; + std::lock_guard<std::mutex> l(creating_pgs_lock); + for (const auto& pg : creating_pgs.pgs) { + int acting_primary = -1; + auto pgid = pg.first; + if (!osdmap.pg_exists(pgid)) { + dout(20) << __func__ << " ignoring " << pgid << " which should not exist" + << dendl; + continue; + } + auto mapped = pg.second.create_epoch; + dout(20) << __func__ << " looking up " << pgid << "@" << mapped << dendl; + spg_t spgid(pgid); + mapping.get_primary_and_shard(pgid, &acting_primary, &spgid); + // check the previous creating_pgs, look for the target to whom the pg was + // previously mapped + for (const auto& pgs_by_epoch : 
creating_pgs_by_osd_epoch) { + const auto last_acting_primary = pgs_by_epoch.first; + for (auto& pgs: pgs_by_epoch.second) { + if (pgs.second.count(spgid)) { + if (last_acting_primary == acting_primary) { + mapped = pgs.first; + } else { + dout(20) << __func__ << " " << pgid << " " + << " acting_primary:" << last_acting_primary + << " -> " << acting_primary << dendl; + // note epoch if the target of the create message changed. + mapped = mapping.get_epoch(); + } + break; + } else { + // newly creating + mapped = mapping.get_epoch(); + } + } + } + dout(10) << __func__ << " will instruct osd." << acting_primary + << " to create " << pgid << "@" << mapped << dendl; + new_pgs_by_osd_epoch[acting_primary][mapped].insert(spgid); + } + creating_pgs_by_osd_epoch = std::move(new_pgs_by_osd_epoch); + creating_pgs_epoch = mapping.get_epoch(); +} + +epoch_t OSDMonitor::send_pg_creates(int osd, Connection *con, epoch_t next) const +{ + dout(30) << __func__ << " osd." << osd << " next=" << next + << " " << creating_pgs_by_osd_epoch << dendl; + std::lock_guard<std::mutex> l(creating_pgs_lock); + if (creating_pgs_epoch <= creating_pgs.last_scan_epoch) { + dout(20) << __func__ + << " not using stale creating_pgs@" << creating_pgs_epoch << dendl; + // the subscribers will be updated when the mapping is completed anyway + return next; + } + auto creating_pgs_by_epoch = creating_pgs_by_osd_epoch.find(osd); + if (creating_pgs_by_epoch == creating_pgs_by_osd_epoch.end()) + return next; + ceph_assert(!creating_pgs_by_epoch->second.empty()); + + MOSDPGCreate *oldm = nullptr; // for pre-mimic OSD compat + MOSDPGCreate2 *m = nullptr; + + bool old = osdmap.require_osd_release < ceph_release_t::nautilus; + + epoch_t last = 0; + for (auto epoch_pgs = creating_pgs_by_epoch->second.lower_bound(next); + epoch_pgs != creating_pgs_by_epoch->second.end(); ++epoch_pgs) { + auto epoch = epoch_pgs->first; + auto& pgs = epoch_pgs->second; + dout(20) << __func__ << " osd." << osd << " from " << next + << " : epoch " << epoch << " " << pgs.size() << " pgs" << dendl; + last = epoch; + for (auto& pg : pgs) { + // Need the create time from the monitor using its clock to set + // last_scrub_stamp upon pg creation. + auto create = creating_pgs.pgs.find(pg.pgid); + ceph_assert(create != creating_pgs.pgs.end()); + if (old) { + if (!oldm) { + oldm = new MOSDPGCreate(creating_pgs_epoch); + } + oldm->mkpg.emplace(pg.pgid, + pg_create_t{create->second.create_epoch, pg.pgid, 0}); + oldm->ctimes.emplace(pg.pgid, create->second.create_stamp); + } else { + if (!m) { + m = new MOSDPGCreate2(creating_pgs_epoch); + } + m->pgs.emplace(pg, make_pair(create->second.create_epoch, + create->second.create_stamp)); + if (create->second.history.epoch_created) { + dout(20) << __func__ << " " << pg << " " << create->second.history + << " " << create->second.past_intervals << dendl; + m->pg_extra.emplace(pg, make_pair(create->second.history, + create->second.past_intervals)); + } + } + dout(20) << __func__ << " will create " << pg + << " at " << create->second.create_epoch << dendl; + } + } + if (m) { + con->send_message(m); + } else if (oldm) { + con->send_message(oldm); + } else { + dout(20) << __func__ << " osd." << osd << " from " << next + << " has nothing to send" << dendl; + return next; + } + + // sub is current through last + 1 + return last + 1; +} + +// TICK + + +void OSDMonitor::tick() +{ + if (!is_active()) return; + + dout(10) << osdmap << dendl; + + // always update osdmap manifest, regardless of being the leader. 
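+ // peons answer osdmap subscriptions too, and get_full_from_pinned_map()
+ // needs a current manifest to rebuild trimmed full maps from the
+ // pinned ones.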
+ load_osdmap_manifest();
+
+ // always tune priority cache manager memory on leader and peons
+ if (ceph_using_tcmalloc() && mon_memory_autotune) {
+ std::lock_guard l(balancer_lock);
+ if (pcm != nullptr) {
+ pcm->tune_memory();
+ pcm->balance();
+ _set_new_cache_sizes();
+ dout(10) << "tick balancer "
+ << " inc cache_bytes: " << inc_cache->get_cache_bytes()
+ << " inc comtd_bytes: " << inc_cache->get_committed_size()
+ << " inc used_bytes: " << inc_cache->_get_used_bytes()
+ << " inc num_osdmaps: " << inc_cache->_get_num_osdmaps()
+ << dendl;
+ dout(10) << "tick balancer "
+ << " full cache_bytes: " << full_cache->get_cache_bytes()
+ << " full comtd_bytes: " << full_cache->get_committed_size()
+ << " full used_bytes: " << full_cache->_get_used_bytes()
+ << " full num_osdmaps: " << full_cache->_get_num_osdmaps()
+ << dendl;
+ }
+ }
+
+ if (!mon.is_leader()) return;
+
+ bool do_propose = false;
+ utime_t now = ceph_clock_now();
+
+ if (handle_osd_timeouts(now, last_osd_report)) {
+ do_propose = true;
+ }
+
+ // mark osds down?
+ if (check_failures(now)) {
+ do_propose = true;
+ }
+
+ // Force a proposal if we need to prune; pruning is performed in
+ // ``encode_pending()``, hence we need to regularly trigger a proposal
+ // even if there's nothing going on.
+ if (is_prune_enabled() && should_prune()) {
+ do_propose = true;
+ }
+
+ // mark down osds out?
+
+ /* can_mark_out() checks if we can mark osds as being out. The -1 has no
+ * influence at all. The decision is made based on the ratio of "in" osds,
+ * and the function returns false if this ratio is lower than the minimum
+ * ratio set by g_conf()->mon_osd_min_in_ratio. So it's not really up to us.
+ */
+ if (can_mark_out(-1)) {
+ string down_out_subtree_limit = g_conf().get_val<string>(
+ "mon_osd_down_out_subtree_limit");
+ set<int> down_cache; // quick cache of down subtrees
+
+ map<int,utime_t>::iterator i = down_pending_out.begin();
+ while (i != down_pending_out.end()) {
+ int o = i->first;
+ utime_t down = now;
+ down -= i->second;
+ ++i;
+
+ if (osdmap.is_down(o) &&
+ osdmap.is_in(o) &&
+ can_mark_out(o)) {
+ utime_t orig_grace(g_conf()->mon_osd_down_out_interval, 0);
+ utime_t grace = orig_grace;
+ double my_grace = 0.0;
+
+ if (g_conf()->mon_osd_adjust_down_out_interval) {
+ // scale grace period the same way we do the heartbeat grace.
+ const osd_xinfo_t& xi = osdmap.get_xinfo(o);
+ double halflife = (double)g_conf()->mon_osd_laggy_halflife;
+ double decay_k = ::log(.5) / halflife;
+ double decay = exp((double)down * decay_k);
+ dout(20) << "osd." << o << " laggy halflife " << halflife << " decay_k " << decay_k
+ << " down for " << down << " decay " << decay << dendl;
+ my_grace = decay * (double)xi.laggy_interval * xi.laggy_probability;
+ grace += my_grace;
+ }
+
+ // is this an entire large subtree down?
+ if (down_out_subtree_limit.length()) {
+ int type = osdmap.crush->get_type_id(down_out_subtree_limit);
+ if (type > 0) {
+ if (osdmap.containing_subtree_is_down(cct, o, type, &down_cache)) {
+ dout(10) << "tick entire containing " << down_out_subtree_limit
+ << " subtree for osd." << o
+ << " is down; resetting timer" << dendl;
+ // reset timer, too.
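+ // while the whole containing failure domain is down, keep deferring
+ // the auto-out decision instead of marking its osds out one by one;
+ // a subtree-wide outage is more likely a switch or power event than
+ // a batch of individual disk failures.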
+ down_pending_out[o] = now; + continue; + } + } + } + + bool down_out = !osdmap.is_destroyed(o) && + g_conf()->mon_osd_down_out_interval > 0 && down.sec() >= grace; + bool destroyed_out = osdmap.is_destroyed(o) && + g_conf()->mon_osd_destroyed_out_interval > 0 && + // this is not precise enough as we did not make a note when this osd + // was marked as destroyed, but let's not bother with that + // complexity for now. + down.sec() >= g_conf()->mon_osd_destroyed_out_interval; + if (down_out || destroyed_out) { + dout(10) << "tick marking osd." << o << " OUT after " << down + << " sec (target " << grace << " = " << orig_grace << " + " << my_grace << ")" << dendl; + pending_inc.new_weight[o] = CEPH_OSD_OUT; + + // set the AUTOOUT bit. + if (pending_inc.new_state.count(o) == 0) + pending_inc.new_state[o] = 0; + pending_inc.new_state[o] |= CEPH_OSD_AUTOOUT; + + // remember previous weight + if (pending_inc.new_xinfo.count(o) == 0) + pending_inc.new_xinfo[o] = osdmap.osd_xinfo[o]; + pending_inc.new_xinfo[o].old_weight = osdmap.osd_weight[o]; + + do_propose = true; + + mon.clog->info() << "Marking osd." << o << " out (has been down for " + << int(down.sec()) << " seconds)"; + } else + continue; + } + + down_pending_out.erase(o); + } + } else { + dout(10) << "tick NOOUT flag set, not checking down osds" << dendl; + } + + // expire blocklisted items? + for (ceph::unordered_map<entity_addr_t,utime_t>::iterator p = osdmap.blocklist.begin(); + p != osdmap.blocklist.end(); + ++p) { + if (p->second < now) { + dout(10) << "expiring blocklist item " << p->first << " expired " << p->second << " < now " << now << dendl; + pending_inc.old_blocklist.push_back(p->first); + do_propose = true; + } + } + for (auto p = osdmap.range_blocklist.begin(); + p != osdmap.range_blocklist.end(); + ++p) { + if (p->second < now) { + dout(10) << "expiring range_blocklist item " << p->first + << " expired " << p->second << " < now " << now << dendl; + pending_inc.old_range_blocklist.push_back(p->first); + do_propose = true; + } + } + + if (try_prune_purged_snaps()) { + do_propose = true; + } + + if (update_pools_status()) + do_propose = true; + + if (do_propose || + !pending_inc.new_pg_temp.empty()) // also propose if we adjusted pg_temp + propose_pending(); +} + +void OSDMonitor::_set_new_cache_sizes() +{ + uint64_t cache_size = 0; + int64_t inc_alloc = 0; + int64_t full_alloc = 0; + int64_t kv_alloc = 0; + + if (pcm != nullptr && rocksdb_binned_kv_cache != nullptr) { + cache_size = pcm->get_tuned_mem(); + inc_alloc = inc_cache->get_committed_size(); + full_alloc = full_cache->get_committed_size(); + kv_alloc = rocksdb_binned_kv_cache->get_committed_size(); + } + + inc_osd_cache.set_bytes(inc_alloc); + full_osd_cache.set_bytes(full_alloc); + + dout(1) << __func__ << " cache_size:" << cache_size + << " inc_alloc: " << inc_alloc + << " full_alloc: " << full_alloc + << " kv_alloc: " << kv_alloc + << dendl; +} + +bool OSDMonitor::handle_osd_timeouts(const utime_t &now, + std::map<int, std::pair<utime_t, int>> &last_osd_report) +{ + utime_t timeo(g_conf()->mon_osd_report_timeout, 0); + if (now - mon.get_leader_since() < timeo) { + // We haven't been the leader for long enough to consider OSD timeouts + return false; + } + + int max_osd = osdmap.get_max_osd(); + bool new_down = false; + + for (int i=0; i < max_osd; ++i) { + dout(30) << __func__ << ": checking up on osd " << i << dendl; + if (!osdmap.exists(i)) { + last_osd_report.erase(i); // if any + continue; + } + if (!osdmap.is_up(i)) + continue; + const std::map<int, 
std::pair<utime_t, int>>::const_iterator t = last_osd_report.find(i);
+ if (t == last_osd_report.end()) {
+ // it wasn't in the map; start the timer.
+ last_osd_report[i].first = now;
+ last_osd_report[i].second = 0;
+ } else if (can_mark_down(i)) {
+ utime_t diff = now - t->second.first;
+ // we use the max(mon_osd_report_timeout, 2*osd_beacon_report_interval) as timeout
+ // to allow for the osd to miss a beacon.
+ int mon_osd_report_timeout = g_conf()->mon_osd_report_timeout;
+ utime_t max_timeout(std::max(mon_osd_report_timeout, 2 * t->second.second), 0);
+ if (diff > max_timeout) {
+ mon.clog->info() << "osd." << i << " marked down after no beacon for "
+ << diff << " seconds";
+ derr << "no beacon from osd." << i << " since " << t->second.first
+ << ", " << diff << " seconds ago. marking down" << dendl;
+ // Incremental::new_state is an XOR mask: flipping CEPH_OSD_UP on
+ // an osd that is currently up is what marks it down here.
+ pending_inc.new_state[i] = CEPH_OSD_UP;
+ new_down = true;
+ }
+ }
+ }
+ return new_down;
+}
+
+static void dump_cpu_list(Formatter *f, const char *name,
+ const string& strlist)
+{
+ cpu_set_t cpu_set;
+ size_t cpu_set_size;
+ if (parse_cpu_set_list(strlist.c_str(), &cpu_set_size, &cpu_set) < 0) {
+ return;
+ }
+ set<int> cpus = cpu_set_to_set(cpu_set_size, &cpu_set);
+ f->open_array_section(name);
+ for (auto cpu : cpus) {
+ f->dump_int("cpu", cpu);
+ }
+ f->close_section();
+}
+
+void OSDMonitor::dump_info(Formatter *f)
+{
+ f->open_object_section("osdmap");
+ osdmap.dump(f);
+ f->close_section();
+
+ f->open_array_section("osd_metadata");
+ for (int i=0; i<osdmap.get_max_osd(); ++i) {
+ if (osdmap.exists(i)) {
+ f->open_object_section("osd");
+ f->dump_unsigned("id", i);
+ dump_osd_metadata(i, f, NULL);
+ f->close_section();
+ }
+ }
+ f->close_section();
+
+ f->open_object_section("osdmap_clean_epochs");
+ f->dump_unsigned("min_last_epoch_clean", get_min_last_epoch_clean());
+
+ f->open_object_section("last_epoch_clean");
+ last_epoch_clean.dump(f);
+ f->close_section();
+
+ f->open_array_section("osd_epochs");
+ for (auto& osd_epoch : osd_epochs) {
+ f->open_object_section("osd");
+ f->dump_unsigned("id", osd_epoch.first);
+ f->dump_unsigned("epoch", osd_epoch.second);
+ f->close_section();
+ }
+ f->close_section(); // osd_epochs
+
+ f->close_section(); // osd_clean_epochs
+
+ f->dump_unsigned("osdmap_first_committed", get_first_committed());
+ f->dump_unsigned("osdmap_last_committed", get_last_committed());
+
+ f->open_object_section("crushmap");
+ osdmap.crush->dump(f);
+ f->close_section();
+
+ if (has_osdmap_manifest) {
+ f->open_object_section("osdmap_manifest");
+ osdmap_manifest.dump(f);
+ f->close_section();
+ }
+}
+
+namespace {
+ enum osd_pool_get_choices {
+ SIZE, MIN_SIZE,
+ PG_NUM, PGP_NUM, CRUSH_RULE, HASHPSPOOL, EC_OVERWRITES,
+ NODELETE, NOPGCHANGE, NOSIZECHANGE,
+ WRITE_FADVISE_DONTNEED, NOSCRUB, NODEEP_SCRUB,
+ HIT_SET_TYPE, HIT_SET_PERIOD, HIT_SET_COUNT, HIT_SET_FPP,
+ USE_GMT_HITSET, TARGET_MAX_OBJECTS, TARGET_MAX_BYTES,
+ CACHE_TARGET_DIRTY_RATIO, CACHE_TARGET_DIRTY_HIGH_RATIO,
+ CACHE_TARGET_FULL_RATIO,
+ CACHE_MIN_FLUSH_AGE, CACHE_MIN_EVICT_AGE,
+ ERASURE_CODE_PROFILE, MIN_READ_RECENCY_FOR_PROMOTE,
+ MIN_WRITE_RECENCY_FOR_PROMOTE, FAST_READ,
+ HIT_SET_GRADE_DECAY_RATE, HIT_SET_SEARCH_LAST_N,
+ SCRUB_MIN_INTERVAL, SCRUB_MAX_INTERVAL, DEEP_SCRUB_INTERVAL,
+ RECOVERY_PRIORITY, RECOVERY_OP_PRIORITY, SCRUB_PRIORITY,
+ COMPRESSION_MODE, COMPRESSION_ALGORITHM, COMPRESSION_REQUIRED_RATIO,
+ COMPRESSION_MAX_BLOB_SIZE, COMPRESSION_MIN_BLOB_SIZE,
+ CSUM_TYPE, CSUM_MAX_BLOCK, CSUM_MIN_BLOCK, FINGERPRINT_ALGORITHM,
+ PG_AUTOSCALE_MODE, PG_NUM_MIN, TARGET_SIZE_BYTES, 
TARGET_SIZE_RATIO, + PG_AUTOSCALE_BIAS, DEDUP_TIER, DEDUP_CHUNK_ALGORITHM, + DEDUP_CDC_CHUNK_SIZE, PG_NUM_MAX, BULK }; + + std::set<osd_pool_get_choices> + subtract_second_from_first(const std::set<osd_pool_get_choices>& first, + const std::set<osd_pool_get_choices>& second) + { + std::set<osd_pool_get_choices> result; + std::set_difference(first.begin(), first.end(), + second.begin(), second.end(), + std::inserter(result, result.end())); + return result; + } +} + + +bool OSDMonitor::preprocess_command(MonOpRequestRef op) +{ + op->mark_osdmon_event(__func__); + auto m = op->get_req<MMonCommand>(); + int r = 0; + bufferlist rdata; + stringstream ss, ds; + + cmdmap_t cmdmap; + if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) { + string rs = ss.str(); + mon.reply_command(op, -EINVAL, rs, get_last_committed()); + return true; + } + + MonSession *session = op->get_session(); + if (!session) { + derr << __func__ << " no session" << dendl; + mon.reply_command(op, -EACCES, "access denied", get_last_committed()); + return true; + } + + string prefix; + cmd_getval(cmdmap, "prefix", prefix); + + string format; + cmd_getval(cmdmap, "format", format, string("plain")); + boost::scoped_ptr<Formatter> f(Formatter::create(format)); + + if (prefix == "osd stat") { + if (f) { + f->open_object_section("osdmap"); + osdmap.print_summary(f.get(), ds, "", true); + f->close_section(); + f->flush(rdata); + } else { + osdmap.print_summary(nullptr, ds, "", true); + rdata.append(ds); + } + } + else if (prefix == "osd dump" || + prefix == "osd tree" || + prefix == "osd tree-from" || + prefix == "osd ls" || + prefix == "osd getmap" || + prefix == "osd getcrushmap" || + prefix == "osd ls-tree" || + prefix == "osd info") { + + epoch_t epoch = 0; + int64_t epochnum; + cmd_getval(cmdmap, "epoch", epochnum, (int64_t)osdmap.get_epoch()); + epoch = epochnum; + + bufferlist osdmap_bl; + int err = get_version_full(epoch, osdmap_bl); + if (err == -ENOENT) { + r = -ENOENT; + ss << "there is no map for epoch " << epoch; + goto reply; + } + ceph_assert(err == 0); + ceph_assert(osdmap_bl.length()); + + OSDMap *p; + if (epoch == osdmap.get_epoch()) { + p = &osdmap; + } else { + p = new OSDMap; + p->decode(osdmap_bl); + } + + auto sg = make_scope_guard([&] { + if (p != &osdmap) { + delete p; + } + }); + + if (prefix == "osd dump") { + stringstream ds; + if (f) { + f->open_object_section("osdmap"); + p->dump(f.get()); + f->close_section(); + f->flush(ds); + } else { + p->print(ds); + } + rdata.append(ds); + if (!f) + ds << " "; + } else if (prefix == "osd ls") { + if (f) { + f->open_array_section("osds"); + for (int i = 0; i < osdmap.get_max_osd(); i++) { + if (osdmap.exists(i)) { + f->dump_int("osd", i); + } + } + f->close_section(); + f->flush(ds); + } else { + bool first = true; + for (int i = 0; i < osdmap.get_max_osd(); i++) { + if (osdmap.exists(i)) { + if (!first) + ds << "\n"; + first = false; + ds << i; + } + } + } + rdata.append(ds); + } else if (prefix == "osd info") { + int64_t osd_id; + bool do_single_osd = true; + if (!cmd_getval(cmdmap, "id", osd_id)) { + do_single_osd = false; + } + + if (do_single_osd && !osdmap.exists(osd_id)) { + ss << "osd." 
<< osd_id << " does not exist"; + r = -EINVAL; + goto reply; + } + + if (f) { + if (do_single_osd) { + osdmap.dump_osd(osd_id, f.get()); + } else { + osdmap.dump_osds(f.get()); + } + f->flush(ds); + } else { + if (do_single_osd) { + osdmap.print_osd(osd_id, ds); + } else { + osdmap.print_osds(ds); + } + } + rdata.append(ds); + } else if (prefix == "osd tree" || prefix == "osd tree-from") { + string bucket; + if (prefix == "osd tree-from") { + cmd_getval(cmdmap, "bucket", bucket); + if (!osdmap.crush->name_exists(bucket)) { + ss << "bucket '" << bucket << "' does not exist"; + r = -ENOENT; + goto reply; + } + int id = osdmap.crush->get_item_id(bucket); + if (id >= 0) { + ss << "\"" << bucket << "\" is not a bucket"; + r = -EINVAL; + goto reply; + } + } + + vector<string> states; + cmd_getval(cmdmap, "states", states); + unsigned filter = 0; + for (auto& s : states) { + if (s == "up") { + filter |= OSDMap::DUMP_UP; + } else if (s == "down") { + filter |= OSDMap::DUMP_DOWN; + } else if (s == "in") { + filter |= OSDMap::DUMP_IN; + } else if (s == "out") { + filter |= OSDMap::DUMP_OUT; + } else if (s == "destroyed") { + filter |= OSDMap::DUMP_DESTROYED; + } else { + ss << "unrecognized state '" << s << "'"; + r = -EINVAL; + goto reply; + } + } + if ((filter & (OSDMap::DUMP_IN|OSDMap::DUMP_OUT)) == + (OSDMap::DUMP_IN|OSDMap::DUMP_OUT)) { + ss << "cannot specify both 'in' and 'out'"; + r = -EINVAL; + goto reply; + } + if (((filter & (OSDMap::DUMP_UP|OSDMap::DUMP_DOWN)) == + (OSDMap::DUMP_UP|OSDMap::DUMP_DOWN)) || + ((filter & (OSDMap::DUMP_UP|OSDMap::DUMP_DESTROYED)) == + (OSDMap::DUMP_UP|OSDMap::DUMP_DESTROYED)) || + ((filter & (OSDMap::DUMP_DOWN|OSDMap::DUMP_DESTROYED)) == + (OSDMap::DUMP_DOWN|OSDMap::DUMP_DESTROYED))) { + ss << "can specify only one of 'up', 'down' and 'destroyed'"; + r = -EINVAL; + goto reply; + } + if (f) { + f->open_object_section("tree"); + p->print_tree(f.get(), NULL, filter, bucket); + f->close_section(); + f->flush(ds); + } else { + p->print_tree(NULL, &ds, filter, bucket); + } + rdata.append(ds); + } else if (prefix == "osd getmap") { + rdata.append(osdmap_bl); + ss << "got osdmap epoch " << p->get_epoch(); + } else if (prefix == "osd getcrushmap") { + p->crush->encode(rdata, mon.get_quorum_con_features()); + ss << p->get_crush_version(); + } else if (prefix == "osd ls-tree") { + string bucket_name; + cmd_getval(cmdmap, "name", bucket_name); + set<int> osds; + r = p->get_osds_by_bucket_name(bucket_name, &osds); + if (r == -ENOENT) { + ss << "\"" << bucket_name << "\" does not exist"; + goto reply; + } else if (r < 0) { + ss << "can not parse bucket name:\"" << bucket_name << "\""; + goto reply; + } + + if (f) { + f->open_array_section("osds"); + for (auto &i : osds) { + if (osdmap.exists(i)) { + f->dump_int("osd", i); + } + } + f->close_section(); + f->flush(ds); + } else { + bool first = true; + for (auto &i : osds) { + if (osdmap.exists(i)) { + if (!first) + ds << "\n"; + first = false; + ds << i; + } + } + } + + rdata.append(ds); + } + } else if (prefix == "osd getmaxosd") { + if (f) { + f->open_object_section("getmaxosd"); + f->dump_unsigned("epoch", osdmap.get_epoch()); + f->dump_int("max_osd", osdmap.get_max_osd()); + f->close_section(); + f->flush(rdata); + } else { + ds << "max_osd = " << osdmap.get_max_osd() << " in epoch " << osdmap.get_epoch(); + rdata.append(ds); + } + } else if (prefix == "osd utilization") { + string out; + osdmap.summarize_mapping_stats(NULL, NULL, &out, f.get()); + if (f) + f->flush(rdata); + else + rdata.append(out); + r = 0; + goto 
reply; + } else if (prefix == "osd find") { + int64_t osd; + if (!cmd_getval(cmdmap, "id", osd)) { + ss << "unable to parse osd id value '" + << cmd_vartype_stringify(cmdmap["id"]) << "'"; + r = -EINVAL; + goto reply; + } + if (!osdmap.exists(osd)) { + ss << "osd." << osd << " does not exist"; + r = -ENOENT; + goto reply; + } + string format; + cmd_getval(cmdmap, "format", format); + boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty")); + f->open_object_section("osd_location"); + f->dump_int("osd", osd); + f->dump_object("addrs", osdmap.get_addrs(osd)); + f->dump_stream("osd_fsid") << osdmap.get_uuid(osd); + + // try to identify host, pod/container name, etc. + map<string,string> m; + load_metadata(osd, m, nullptr); + if (auto p = m.find("hostname"); p != m.end()) { + f->dump_string("host", p->second); + } + for (auto& k : { + "pod_name", "pod_namespace", // set by rook + "container_name" // set by cephadm, ceph-ansible + }) { + if (auto p = m.find(k); p != m.end()) { + f->dump_string(k, p->second); + } + } + + // crush is helpful too + f->open_object_section("crush_location"); + map<string,string> loc = osdmap.crush->get_full_location(osd); + for (map<string,string>::iterator p = loc.begin(); p != loc.end(); ++p) + f->dump_string(p->first.c_str(), p->second); + f->close_section(); + f->close_section(); + f->flush(rdata); + } else if (prefix == "osd metadata") { + int64_t osd = -1; + if (cmd_vartype_stringify(cmdmap["id"]).size() && + !cmd_getval(cmdmap, "id", osd)) { + ss << "unable to parse osd id value '" + << cmd_vartype_stringify(cmdmap["id"]) << "'"; + r = -EINVAL; + goto reply; + } + if (osd >= 0 && !osdmap.exists(osd)) { + ss << "osd." << osd << " does not exist"; + r = -ENOENT; + goto reply; + } + string format; + cmd_getval(cmdmap, "format", format); + boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty")); + if (osd >= 0) { + f->open_object_section("osd_metadata"); + f->dump_unsigned("id", osd); + r = dump_osd_metadata(osd, f.get(), &ss); + if (r < 0) + goto reply; + f->close_section(); + } else { + r = 0; + f->open_array_section("osd_metadata"); + for (int i=0; i<osdmap.get_max_osd(); ++i) { + if (osdmap.exists(i)) { + f->open_object_section("osd"); + f->dump_unsigned("id", i); + r = dump_osd_metadata(i, f.get(), NULL); + if (r == -EINVAL || r == -ENOENT) { + // Drop error, continue to get other daemons' metadata + dout(4) << "No metadata for osd." 
<< i << dendl; + r = 0; + } else if (r < 0) { + // Unexpected error + goto reply; + } + f->close_section(); + } + } + f->close_section(); + } + f->flush(rdata); + } else if (prefix == "osd versions") { + if (!f) + f.reset(Formatter::create("json-pretty")); + count_metadata("ceph_version", f.get()); + f->flush(rdata); + r = 0; + } else if (prefix == "osd count-metadata") { + if (!f) + f.reset(Formatter::create("json-pretty")); + string field; + cmd_getval(cmdmap, "property", field); + count_metadata(field, f.get()); + f->flush(rdata); + r = 0; + } else if (prefix == "osd numa-status") { + TextTable tbl; + if (f) { + f->open_array_section("osds"); + } else { + tbl.define_column("OSD", TextTable::LEFT, TextTable::RIGHT); + tbl.define_column("HOST", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("NETWORK", TextTable::RIGHT, TextTable::RIGHT); + tbl.define_column("STORAGE", TextTable::RIGHT, TextTable::RIGHT); + tbl.define_column("AFFINITY", TextTable::RIGHT, TextTable::RIGHT); + tbl.define_column("CPUS", TextTable::LEFT, TextTable::LEFT); + } + for (int i=0; i<osdmap.get_max_osd(); ++i) { + if (osdmap.exists(i)) { + map<string,string> m; + ostringstream err; + if (load_metadata(i, m, &err) < 0) { + continue; + } + string host; + auto p = m.find("hostname"); + if (p != m.end()) { + host = p->second; + } + if (f) { + f->open_object_section("osd"); + f->dump_int("osd", i); + f->dump_string("host", host); + for (auto n : { "network_numa_node", "objectstore_numa_node", + "numa_node" }) { + p = m.find(n); + if (p != m.end()) { + f->dump_int(n, atoi(p->second.c_str())); + } + } + for (auto n : { "network_numa_nodes", "objectstore_numa_nodes" }) { + p = m.find(n); + if (p != m.end()) { + list<string> ls = get_str_list(p->second, ","); + f->open_array_section(n); + for (auto node : ls) { + f->dump_int("node", atoi(node.c_str())); + } + f->close_section(); + } + } + for (auto n : { "numa_node_cpus" }) { + p = m.find(n); + if (p != m.end()) { + dump_cpu_list(f.get(), n, p->second); + } + } + f->close_section(); + } else { + tbl << i; + tbl << host; + p = m.find("network_numa_nodes"); + if (p != m.end()) { + tbl << p->second; + } else { + tbl << "-"; + } + p = m.find("objectstore_numa_nodes"); + if (p != m.end()) { + tbl << p->second; + } else { + tbl << "-"; + } + p = m.find("numa_node"); + auto q = m.find("numa_node_cpus"); + if (p != m.end() && q != m.end()) { + tbl << p->second; + tbl << q->second; + } else { + tbl << "-"; + tbl << "-"; + } + tbl << TextTable::endrow; + } + } + } + if (f) { + f->close_section(); + f->flush(rdata); + } else { + rdata.append(stringify(tbl)); + } + } else if (prefix == "osd map") { + string poolstr, objstr, namespacestr; + cmd_getval(cmdmap, "pool", poolstr); + cmd_getval(cmdmap, "object", objstr); + cmd_getval(cmdmap, "nspace", namespacestr); + + int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str()); + if (pool < 0) { + ss << "pool " << poolstr << " does not exist"; + r = -ENOENT; + goto reply; + } + object_locator_t oloc(pool, namespacestr); + object_t oid(objstr); + pg_t pgid = osdmap.object_locator_to_pg(oid, oloc); + pg_t mpgid = osdmap.raw_pg_to_pg(pgid); + vector<int> up, acting; + int up_p, acting_p; + osdmap.pg_to_up_acting_osds(mpgid, &up, &up_p, &acting, &acting_p); + + string fullobjname; + if (!namespacestr.empty()) + fullobjname = namespacestr + string("/") + oid.name; + else + fullobjname = oid.name; + if (f) { + f->open_object_section("osd_map"); + f->dump_unsigned("epoch", osdmap.get_epoch()); + f->dump_string("pool", poolstr); + 
f->dump_int("pool_id", pool); + f->dump_stream("objname") << fullobjname; + f->dump_stream("raw_pgid") << pgid; + f->dump_stream("pgid") << mpgid; + f->open_array_section("up"); + for (vector<int>::iterator p = up.begin(); p != up.end(); ++p) + f->dump_int("osd", *p); + f->close_section(); + f->dump_int("up_primary", up_p); + f->open_array_section("acting"); + for (vector<int>::iterator p = acting.begin(); p != acting.end(); ++p) + f->dump_int("osd", *p); + f->close_section(); + f->dump_int("acting_primary", acting_p); + f->close_section(); // osd_map + f->flush(rdata); + } else { + ds << "osdmap e" << osdmap.get_epoch() + << " pool '" << poolstr << "' (" << pool << ")" + << " object '" << fullobjname << "' ->" + << " pg " << pgid << " (" << mpgid << ")" + << " -> up (" << pg_vector_string(up) << ", p" << up_p << ") acting (" + << pg_vector_string(acting) << ", p" << acting_p << ")"; + rdata.append(ds); + } + + } else if (prefix == "pg map") { + pg_t pgid; + string pgidstr; + cmd_getval(cmdmap, "pgid", pgidstr); + if (!pgid.parse(pgidstr.c_str())) { + ss << "invalid pgid '" << pgidstr << "'"; + r = -EINVAL; + goto reply; + } + vector<int> up, acting; + if (!osdmap.have_pg_pool(pgid.pool())) { + ss << "pg '" << pgidstr << "' does not exist"; + r = -ENOENT; + goto reply; + } + pg_t mpgid = osdmap.raw_pg_to_pg(pgid); + osdmap.pg_to_up_acting_osds(pgid, up, acting); + if (f) { + f->open_object_section("pg_map"); + f->dump_unsigned("epoch", osdmap.get_epoch()); + f->dump_stream("raw_pgid") << pgid; + f->dump_stream("pgid") << mpgid; + f->open_array_section("up"); + for (auto osd : up) { + f->dump_int("up_osd", osd); + } + f->close_section(); + f->open_array_section("acting"); + for (auto osd : acting) { + f->dump_int("acting_osd", osd); + } + f->close_section(); + f->close_section(); + f->flush(rdata); + } else { + ds << "osdmap e" << osdmap.get_epoch() + << " pg " << pgid << " (" << mpgid << ")" + << " -> up " << up << " acting " << acting; + rdata.append(ds); + } + goto reply; + + } else if (prefix == "osd lspools") { + if (f) + f->open_array_section("pools"); + for (map<int64_t, pg_pool_t>::iterator p = osdmap.pools.begin(); + p != osdmap.pools.end(); + ++p) { + if (f) { + f->open_object_section("pool"); + f->dump_int("poolnum", p->first); + f->dump_string("poolname", osdmap.pool_name[p->first]); + f->close_section(); + } else { + ds << p->first << ' ' << osdmap.pool_name[p->first]; + if (next(p) != osdmap.pools.end()) { + ds << '\n'; + } + } + } + if (f) { + f->close_section(); + f->flush(ds); + } + rdata.append(ds); + } else if (prefix == "osd blocklist ls" || + prefix == "osd blacklist ls") { + if (f) + f->open_array_section("blocklist"); + + for (ceph::unordered_map<entity_addr_t,utime_t>::iterator p = osdmap.blocklist.begin(); + p != osdmap.blocklist.end(); + ++p) { + if (f) { + f->open_object_section("entry"); + f->dump_string("addr", p->first.get_legacy_str()); + f->dump_stream("until") << p->second; + f->close_section(); + } else { + stringstream ss; + string s; + ss << p->first << " " << p->second; + getline(ss, s); + s += "\n"; + rdata.append(s); + } + } + if (f) { + f->close_section(); + f->flush(rdata); + } + if (f) + f->open_array_section("range_blocklist"); + + for (auto p = osdmap.range_blocklist.begin(); + p != osdmap.range_blocklist.end(); + ++p) { + if (f) { + f->open_object_section("entry"); + f->dump_string("range", p->first.get_legacy_str()); + f->dump_stream("until") << p->second; + f->close_section(); + } else { + stringstream ss; + string s; + ss << p->first << " " << 
p->second; + getline(ss, s); + s += "\n"; + rdata.append(s); + } + } + if (f) { + f->close_section(); + f->flush(rdata); + } + ss << "listed " << osdmap.blocklist.size() + osdmap.range_blocklist.size() << " entries"; + + } else if (prefix == "osd pool ls") { + string detail; + cmd_getval(cmdmap, "detail", detail); + if (!f && detail == "detail") { + ostringstream ss; + osdmap.print_pools(ss); + rdata.append(ss.str()); + } else { + if (f) + f->open_array_section("pools"); + for (map<int64_t,pg_pool_t>::const_iterator it = osdmap.get_pools().begin(); + it != osdmap.get_pools().end(); + ++it) { + if (f) { + if (detail == "detail") { + f->open_object_section("pool"); + f->dump_int("pool_id", it->first); + f->dump_string("pool_name", osdmap.get_pool_name(it->first)); + it->second.dump(f.get()); + f->close_section(); + } else { + f->dump_string("pool_name", osdmap.get_pool_name(it->first)); + } + } else { + rdata.append(osdmap.get_pool_name(it->first) + "\n"); + } + } + if (f) { + f->close_section(); + f->flush(rdata); + } + } + + } else if (prefix == "osd crush get-tunable") { + string tunable; + cmd_getval(cmdmap, "tunable", tunable); + ostringstream rss; + if (f) + f->open_object_section("tunable"); + if (tunable == "straw_calc_version") { + if (f) + f->dump_int(tunable.c_str(), osdmap.crush->get_straw_calc_version()); + else + rss << osdmap.crush->get_straw_calc_version() << "\n"; + } else { + r = -EINVAL; + goto reply; + } + if (f) { + f->close_section(); + f->flush(rdata); + } else { + rdata.append(rss.str()); + } + r = 0; + + } else if (prefix == "osd pool get") { + string poolstr; + cmd_getval(cmdmap, "pool", poolstr); + int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str()); + if (pool < 0) { + ss << "unrecognized pool '" << poolstr << "'"; + r = -ENOENT; + goto reply; + } + + const pg_pool_t *p = osdmap.get_pg_pool(pool); + string var; + cmd_getval(cmdmap, "var", var); + + typedef std::map<std::string, osd_pool_get_choices> choices_map_t; + const choices_map_t ALL_CHOICES = { + {"size", SIZE}, + {"min_size", MIN_SIZE}, + {"pg_num", PG_NUM}, {"pgp_num", PGP_NUM}, + {"crush_rule", CRUSH_RULE}, {"hashpspool", HASHPSPOOL}, + {"allow_ec_overwrites", EC_OVERWRITES}, {"nodelete", NODELETE}, + {"nopgchange", NOPGCHANGE}, {"nosizechange", NOSIZECHANGE}, + {"noscrub", NOSCRUB}, {"nodeep-scrub", NODEEP_SCRUB}, + {"write_fadvise_dontneed", WRITE_FADVISE_DONTNEED}, + {"hit_set_type", HIT_SET_TYPE}, {"hit_set_period", HIT_SET_PERIOD}, + {"hit_set_count", HIT_SET_COUNT}, {"hit_set_fpp", HIT_SET_FPP}, + {"use_gmt_hitset", USE_GMT_HITSET}, + {"target_max_objects", TARGET_MAX_OBJECTS}, + {"target_max_bytes", TARGET_MAX_BYTES}, + {"cache_target_dirty_ratio", CACHE_TARGET_DIRTY_RATIO}, + {"cache_target_dirty_high_ratio", CACHE_TARGET_DIRTY_HIGH_RATIO}, + {"cache_target_full_ratio", CACHE_TARGET_FULL_RATIO}, + {"cache_min_flush_age", CACHE_MIN_FLUSH_AGE}, + {"cache_min_evict_age", CACHE_MIN_EVICT_AGE}, + {"erasure_code_profile", ERASURE_CODE_PROFILE}, + {"min_read_recency_for_promote", MIN_READ_RECENCY_FOR_PROMOTE}, + {"min_write_recency_for_promote", MIN_WRITE_RECENCY_FOR_PROMOTE}, + {"fast_read", FAST_READ}, + {"hit_set_grade_decay_rate", HIT_SET_GRADE_DECAY_RATE}, + {"hit_set_search_last_n", HIT_SET_SEARCH_LAST_N}, + {"scrub_min_interval", SCRUB_MIN_INTERVAL}, + {"scrub_max_interval", SCRUB_MAX_INTERVAL}, + {"deep_scrub_interval", DEEP_SCRUB_INTERVAL}, + {"recovery_priority", RECOVERY_PRIORITY}, + {"recovery_op_priority", RECOVERY_OP_PRIORITY}, + {"scrub_priority", SCRUB_PRIORITY}, + 
{"compression_mode", COMPRESSION_MODE}, + {"compression_algorithm", COMPRESSION_ALGORITHM}, + {"compression_required_ratio", COMPRESSION_REQUIRED_RATIO}, + {"compression_max_blob_size", COMPRESSION_MAX_BLOB_SIZE}, + {"compression_min_blob_size", COMPRESSION_MIN_BLOB_SIZE}, + {"csum_type", CSUM_TYPE}, + {"csum_max_block", CSUM_MAX_BLOCK}, + {"csum_min_block", CSUM_MIN_BLOCK}, + {"fingerprint_algorithm", FINGERPRINT_ALGORITHM}, + {"pg_autoscale_mode", PG_AUTOSCALE_MODE}, + {"pg_num_min", PG_NUM_MIN}, + {"pg_num_max", PG_NUM_MAX}, + {"target_size_bytes", TARGET_SIZE_BYTES}, + {"target_size_ratio", TARGET_SIZE_RATIO}, + {"pg_autoscale_bias", PG_AUTOSCALE_BIAS}, + {"dedup_tier", DEDUP_TIER}, + {"dedup_chunk_algorithm", DEDUP_CHUNK_ALGORITHM}, + {"dedup_cdc_chunk_size", DEDUP_CDC_CHUNK_SIZE}, + {"bulk", BULK} + }; + + typedef std::set<osd_pool_get_choices> choices_set_t; + + const choices_set_t ONLY_TIER_CHOICES = { + HIT_SET_TYPE, HIT_SET_PERIOD, HIT_SET_COUNT, HIT_SET_FPP, + TARGET_MAX_OBJECTS, TARGET_MAX_BYTES, CACHE_TARGET_FULL_RATIO, + CACHE_TARGET_DIRTY_RATIO, CACHE_TARGET_DIRTY_HIGH_RATIO, + CACHE_MIN_FLUSH_AGE, CACHE_MIN_EVICT_AGE, + MIN_READ_RECENCY_FOR_PROMOTE, + MIN_WRITE_RECENCY_FOR_PROMOTE, + HIT_SET_GRADE_DECAY_RATE, HIT_SET_SEARCH_LAST_N + }; + const choices_set_t ONLY_ERASURE_CHOICES = { + EC_OVERWRITES, ERASURE_CODE_PROFILE + }; + + choices_set_t selected_choices; + if (var == "all") { + for(choices_map_t::const_iterator it = ALL_CHOICES.begin(); + it != ALL_CHOICES.end(); ++it) { + selected_choices.insert(it->second); + } + + if(!p->is_tier()) { + selected_choices = subtract_second_from_first(selected_choices, + ONLY_TIER_CHOICES); + } + + if(!p->is_erasure()) { + selected_choices = subtract_second_from_first(selected_choices, + ONLY_ERASURE_CHOICES); + } + } else /* var != "all" */ { + choices_map_t::const_iterator found = ALL_CHOICES.find(var); + if (found == ALL_CHOICES.end()) { + ss << "pool '" << poolstr + << "': invalid variable: '" << var << "'"; + r = -EINVAL; + goto reply; + } + + osd_pool_get_choices selected = found->second; + + if (!p->is_tier() && + ONLY_TIER_CHOICES.find(selected) != ONLY_TIER_CHOICES.end()) { + ss << "pool '" << poolstr + << "' is not a tier pool: variable not applicable"; + r = -EACCES; + goto reply; + } + + if (!p->is_erasure() && + ONLY_ERASURE_CHOICES.find(selected) + != ONLY_ERASURE_CHOICES.end()) { + ss << "pool '" << poolstr + << "' is not a erasure pool: variable not applicable"; + r = -EACCES; + goto reply; + } + + if (pool_opts_t::is_opt_name(var) && + !p->opts.is_set(pool_opts_t::get_opt_desc(var).key)) { + ss << "option '" << var << "' is not set on pool '" << poolstr << "'"; + r = -ENOENT; + goto reply; + } + + selected_choices.insert(selected); + } + + if (f) { + f->open_object_section("pool"); + f->dump_string("pool", poolstr); + f->dump_int("pool_id", pool); + for(choices_set_t::const_iterator it = selected_choices.begin(); + it != selected_choices.end(); ++it) { + choices_map_t::const_iterator i; + for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) { + if (i->second == *it) { + break; + } + } + ceph_assert(i != ALL_CHOICES.end()); + switch(*it) { + case PG_NUM: + f->dump_int("pg_num", p->get_pg_num()); + break; + case PGP_NUM: + f->dump_int("pgp_num", p->get_pgp_num()); + break; + case SIZE: + f->dump_int("size", p->get_size()); + break; + case MIN_SIZE: + f->dump_int("min_size", p->get_min_size()); + break; + case CRUSH_RULE: + if (osdmap.crush->rule_exists(p->get_crush_rule())) { + f->dump_string("crush_rule", 
osdmap.crush->get_rule_name( + p->get_crush_rule())); + } else { + f->dump_string("crush_rule", stringify(p->get_crush_rule())); + } + break; + case EC_OVERWRITES: + f->dump_bool("allow_ec_overwrites", + p->has_flag(pg_pool_t::FLAG_EC_OVERWRITES)); + break; + case PG_AUTOSCALE_MODE: + f->dump_string("pg_autoscale_mode", + pg_pool_t::get_pg_autoscale_mode_name( + p->pg_autoscale_mode)); + break; + case HASHPSPOOL: + case NODELETE: + case BULK: + case NOPGCHANGE: + case NOSIZECHANGE: + case WRITE_FADVISE_DONTNEED: + case NOSCRUB: + case NODEEP_SCRUB: + f->dump_bool(i->first.c_str(), + p->has_flag(pg_pool_t::get_flag_by_name(i->first))); + break; + case HIT_SET_PERIOD: + f->dump_int("hit_set_period", p->hit_set_period); + break; + case HIT_SET_COUNT: + f->dump_int("hit_set_count", p->hit_set_count); + break; + case HIT_SET_TYPE: + f->dump_string("hit_set_type", + HitSet::get_type_name(p->hit_set_params.get_type())); + break; + case HIT_SET_FPP: + { + if (p->hit_set_params.get_type() == HitSet::TYPE_BLOOM) { + BloomHitSet::Params *bloomp = + static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get()); + f->dump_float("hit_set_fpp", bloomp->get_fpp()); + } else if(var != "all") { + f->close_section(); + ss << "hit set is not of type Bloom; " << + "invalid to get a false positive rate!"; + r = -EINVAL; + goto reply; + } + } + break; + case USE_GMT_HITSET: + f->dump_bool("use_gmt_hitset", p->use_gmt_hitset); + break; + case TARGET_MAX_OBJECTS: + f->dump_unsigned("target_max_objects", p->target_max_objects); + break; + case TARGET_MAX_BYTES: + f->dump_unsigned("target_max_bytes", p->target_max_bytes); + break; + case CACHE_TARGET_DIRTY_RATIO: + f->dump_unsigned("cache_target_dirty_ratio_micro", + p->cache_target_dirty_ratio_micro); + f->dump_float("cache_target_dirty_ratio", + ((float)p->cache_target_dirty_ratio_micro/1000000)); + break; + case CACHE_TARGET_DIRTY_HIGH_RATIO: + f->dump_unsigned("cache_target_dirty_high_ratio_micro", + p->cache_target_dirty_high_ratio_micro); + f->dump_float("cache_target_dirty_high_ratio", + ((float)p->cache_target_dirty_high_ratio_micro/1000000)); + break; + case CACHE_TARGET_FULL_RATIO: + f->dump_unsigned("cache_target_full_ratio_micro", + p->cache_target_full_ratio_micro); + f->dump_float("cache_target_full_ratio", + ((float)p->cache_target_full_ratio_micro/1000000)); + break; + case CACHE_MIN_FLUSH_AGE: + f->dump_unsigned("cache_min_flush_age", p->cache_min_flush_age); + break; + case CACHE_MIN_EVICT_AGE: + f->dump_unsigned("cache_min_evict_age", p->cache_min_evict_age); + break; + case ERASURE_CODE_PROFILE: + f->dump_string("erasure_code_profile", p->erasure_code_profile); + break; + case MIN_READ_RECENCY_FOR_PROMOTE: + f->dump_int("min_read_recency_for_promote", + p->min_read_recency_for_promote); + break; + case MIN_WRITE_RECENCY_FOR_PROMOTE: + f->dump_int("min_write_recency_for_promote", + p->min_write_recency_for_promote); + break; + case FAST_READ: + f->dump_int("fast_read", p->fast_read); + break; + case HIT_SET_GRADE_DECAY_RATE: + f->dump_int("hit_set_grade_decay_rate", + p->hit_set_grade_decay_rate); + break; + case HIT_SET_SEARCH_LAST_N: + f->dump_int("hit_set_search_last_n", + p->hit_set_search_last_n); + break; + case SCRUB_MIN_INTERVAL: + case SCRUB_MAX_INTERVAL: + case DEEP_SCRUB_INTERVAL: + case RECOVERY_PRIORITY: + case RECOVERY_OP_PRIORITY: + case SCRUB_PRIORITY: + case COMPRESSION_MODE: + case COMPRESSION_ALGORITHM: + case COMPRESSION_REQUIRED_RATIO: + case COMPRESSION_MAX_BLOB_SIZE: + case COMPRESSION_MIN_BLOB_SIZE: + case CSUM_TYPE: + 
case CSUM_MAX_BLOCK: + case CSUM_MIN_BLOCK: + case FINGERPRINT_ALGORITHM: + case PG_NUM_MIN: + case PG_NUM_MAX: + case TARGET_SIZE_BYTES: + case TARGET_SIZE_RATIO: + case PG_AUTOSCALE_BIAS: + case DEDUP_TIER: + case DEDUP_CHUNK_ALGORITHM: + case DEDUP_CDC_CHUNK_SIZE: + pool_opts_t::key_t key = pool_opts_t::get_opt_desc(i->first).key; + if (p->opts.is_set(key)) { + if(*it == CSUM_TYPE) { + int64_t val; + p->opts.get(pool_opts_t::CSUM_TYPE, &val); + f->dump_string(i->first.c_str(), Checksummer::get_csum_type_string(val)); + } else { + p->opts.dump(i->first, f.get()); + } + } + break; + } + } + f->close_section(); + f->flush(rdata); + } else /* !f */ { + for(choices_set_t::const_iterator it = selected_choices.begin(); + it != selected_choices.end(); ++it) { + choices_map_t::const_iterator i; + switch(*it) { + case PG_NUM: + ss << "pg_num: " << p->get_pg_num() << "\n"; + break; + case PGP_NUM: + ss << "pgp_num: " << p->get_pgp_num() << "\n"; + break; + case SIZE: + ss << "size: " << p->get_size() << "\n"; + break; + case MIN_SIZE: + ss << "min_size: " << p->get_min_size() << "\n"; + break; + case CRUSH_RULE: + if (osdmap.crush->rule_exists(p->get_crush_rule())) { + ss << "crush_rule: " << osdmap.crush->get_rule_name( + p->get_crush_rule()) << "\n"; + } else { + ss << "crush_rule: " << p->get_crush_rule() << "\n"; + } + break; + case PG_AUTOSCALE_MODE: + ss << "pg_autoscale_mode: " << pg_pool_t::get_pg_autoscale_mode_name( + p->pg_autoscale_mode) <<"\n"; + break; + case HIT_SET_PERIOD: + ss << "hit_set_period: " << p->hit_set_period << "\n"; + break; + case HIT_SET_COUNT: + ss << "hit_set_count: " << p->hit_set_count << "\n"; + break; + case HIT_SET_TYPE: + ss << "hit_set_type: " << + HitSet::get_type_name(p->hit_set_params.get_type()) << "\n"; + break; + case HIT_SET_FPP: + { + if (p->hit_set_params.get_type() == HitSet::TYPE_BLOOM) { + BloomHitSet::Params *bloomp = + static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get()); + ss << "hit_set_fpp: " << bloomp->get_fpp() << "\n"; + } else if(var != "all") { + ss << "hit set is not of type Bloom; " << + "invalid to get a false positive rate!"; + r = -EINVAL; + goto reply; + } + } + break; + case USE_GMT_HITSET: + ss << "use_gmt_hitset: " << p->use_gmt_hitset << "\n"; + break; + case TARGET_MAX_OBJECTS: + ss << "target_max_objects: " << p->target_max_objects << "\n"; + break; + case TARGET_MAX_BYTES: + ss << "target_max_bytes: " << p->target_max_bytes << "\n"; + break; + case CACHE_TARGET_DIRTY_RATIO: + ss << "cache_target_dirty_ratio: " + << ((float)p->cache_target_dirty_ratio_micro/1000000) << "\n"; + break; + case CACHE_TARGET_DIRTY_HIGH_RATIO: + ss << "cache_target_dirty_high_ratio: " + << ((float)p->cache_target_dirty_high_ratio_micro/1000000) << "\n"; + break; + case CACHE_TARGET_FULL_RATIO: + ss << "cache_target_full_ratio: " + << ((float)p->cache_target_full_ratio_micro/1000000) << "\n"; + break; + case CACHE_MIN_FLUSH_AGE: + ss << "cache_min_flush_age: " << p->cache_min_flush_age << "\n"; + break; + case CACHE_MIN_EVICT_AGE: + ss << "cache_min_evict_age: " << p->cache_min_evict_age << "\n"; + break; + case ERASURE_CODE_PROFILE: + ss << "erasure_code_profile: " << p->erasure_code_profile << "\n"; + break; + case MIN_READ_RECENCY_FOR_PROMOTE: + ss << "min_read_recency_for_promote: " << + p->min_read_recency_for_promote << "\n"; + break; + case HIT_SET_GRADE_DECAY_RATE: + ss << "hit_set_grade_decay_rate: " << + p->hit_set_grade_decay_rate << "\n"; + break; + case HIT_SET_SEARCH_LAST_N: + ss << "hit_set_search_last_n: " << + 
p->hit_set_search_last_n << "\n"; + break; + case EC_OVERWRITES: + ss << "allow_ec_overwrites: " << + (p->has_flag(pg_pool_t::FLAG_EC_OVERWRITES) ? "true" : "false") << + "\n"; + break; + case HASHPSPOOL: + case NODELETE: + case BULK: + case NOPGCHANGE: + case NOSIZECHANGE: + case WRITE_FADVISE_DONTNEED: + case NOSCRUB: + case NODEEP_SCRUB: + for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) { + if (i->second == *it) + break; + } + ceph_assert(i != ALL_CHOICES.end()); + ss << i->first << ": " << + (p->has_flag(pg_pool_t::get_flag_by_name(i->first)) ? + "true" : "false") << "\n"; + break; + case MIN_WRITE_RECENCY_FOR_PROMOTE: + ss << "min_write_recency_for_promote: " << + p->min_write_recency_for_promote << "\n"; + break; + case FAST_READ: + ss << "fast_read: " << p->fast_read << "\n"; + break; + case SCRUB_MIN_INTERVAL: + case SCRUB_MAX_INTERVAL: + case DEEP_SCRUB_INTERVAL: + case RECOVERY_PRIORITY: + case RECOVERY_OP_PRIORITY: + case SCRUB_PRIORITY: + case COMPRESSION_MODE: + case COMPRESSION_ALGORITHM: + case COMPRESSION_REQUIRED_RATIO: + case COMPRESSION_MAX_BLOB_SIZE: + case COMPRESSION_MIN_BLOB_SIZE: + case CSUM_TYPE: + case CSUM_MAX_BLOCK: + case CSUM_MIN_BLOCK: + case FINGERPRINT_ALGORITHM: + case PG_NUM_MIN: + case PG_NUM_MAX: + case TARGET_SIZE_BYTES: + case TARGET_SIZE_RATIO: + case PG_AUTOSCALE_BIAS: + case DEDUP_TIER: + case DEDUP_CHUNK_ALGORITHM: + case DEDUP_CDC_CHUNK_SIZE: + for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) { + if (i->second == *it) + break; + } + ceph_assert(i != ALL_CHOICES.end()); + { + pool_opts_t::key_t key = pool_opts_t::get_opt_desc(i->first).key; + if (p->opts.is_set(key)) { + if(key == pool_opts_t::CSUM_TYPE) { + int64_t val; + p->opts.get(key, &val); + ss << i->first << ": " << Checksummer::get_csum_type_string(val) << "\n"; + } else { + ss << i->first << ": " << p->opts.get(key) << "\n"; + } + } + } + break; + } + rdata.append(ss.str()); + ss.str(""); + } + } + r = 0; + } else if (prefix == "osd pool get-quota") { + string pool_name; + cmd_getval(cmdmap, "pool", pool_name); + + int64_t poolid = osdmap.lookup_pg_pool_name(pool_name); + if (poolid < 0) { + ceph_assert(poolid == -ENOENT); + ss << "unrecognized pool '" << pool_name << "'"; + r = -ENOENT; + goto reply; + } + const pg_pool_t *p = osdmap.get_pg_pool(poolid); + const pool_stat_t* pstat = mon.mgrstatmon()->get_pool_stat(poolid); + if (!pstat) { + ss << "no stats for pool '" << pool_name << "'"; + r = -ENOENT; + goto reply; + } + const object_stat_sum_t& sum = pstat->stats.sum; + if (f) { + f->open_object_section("pool_quotas"); + f->dump_string("pool_name", pool_name); + f->dump_unsigned("pool_id", poolid); + f->dump_unsigned("quota_max_objects", p->quota_max_objects); + f->dump_int("current_num_objects", sum.num_objects); + f->dump_unsigned("quota_max_bytes", p->quota_max_bytes); + f->dump_int("current_num_bytes", sum.num_bytes); + f->close_section(); + f->flush(rdata); + } else { + stringstream rs; + rs << "quotas for pool '" << pool_name << "':\n" + << " max objects: "; + if (p->quota_max_objects == 0) + rs << "N/A"; + else { + rs << si_u_t(p->quota_max_objects) << " objects"; + rs << " (current num objects: " << sum.num_objects << " objects)"; + } + rs << "\n" + << " max bytes : "; + if (p->quota_max_bytes == 0) + rs << "N/A"; + else { + rs << byte_u_t(p->quota_max_bytes); + rs << " (current num bytes: " << sum.num_bytes << " bytes)"; + } + rdata.append(rs.str()); + } + rdata.append("\n"); + r = 0; + } else if (prefix == "osd crush rule list" || + prefix == "osd crush 
rule ls") { + if (f) { + f->open_array_section("rules"); + osdmap.crush->list_rules(f.get()); + f->close_section(); + f->flush(rdata); + } else { + ostringstream ss; + osdmap.crush->list_rules(&ss); + rdata.append(ss.str()); + } + } else if (prefix == "osd crush rule ls-by-class") { + string class_name; + cmd_getval(cmdmap, "class", class_name); + if (class_name.empty()) { + ss << "no class specified"; + r = -EINVAL; + goto reply; + } + set<int> rules; + r = osdmap.crush->get_rules_by_class(class_name, &rules); + if (r < 0) { + ss << "failed to get rules by class '" << class_name << "'"; + goto reply; + } + if (f) { + f->open_array_section("rules"); + for (auto &rule: rules) { + f->dump_string("name", osdmap.crush->get_rule_name(rule)); + } + f->close_section(); + f->flush(rdata); + } else { + ostringstream rs; + for (auto &rule: rules) { + rs << osdmap.crush->get_rule_name(rule) << "\n"; + } + rdata.append(rs.str()); + } + } else if (prefix == "osd crush rule dump") { + string name; + cmd_getval(cmdmap, "name", name); + string format; + cmd_getval(cmdmap, "format", format); + boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty")); + if (name == "") { + f->open_array_section("rules"); + osdmap.crush->dump_rules(f.get()); + f->close_section(); + } else { + int ruleno = osdmap.crush->get_rule_id(name); + if (ruleno < 0) { + ss << "unknown crush rule '" << name << "'"; + r = ruleno; + goto reply; + } + osdmap.crush->dump_rule(ruleno, f.get()); + } + ostringstream rs; + f->flush(rs); + rs << "\n"; + rdata.append(rs.str()); + } else if (prefix == "osd crush dump") { + string format; + cmd_getval(cmdmap, "format", format); + boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty")); + f->open_object_section("crush_map"); + osdmap.crush->dump(f.get()); + f->close_section(); + ostringstream rs; + f->flush(rs); + rs << "\n"; + rdata.append(rs.str()); + } else if (prefix == "osd crush show-tunables") { + string format; + cmd_getval(cmdmap, "format", format); + boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty")); + f->open_object_section("crush_map_tunables"); + osdmap.crush->dump_tunables(f.get()); + f->close_section(); + ostringstream rs; + f->flush(rs); + rs << "\n"; + rdata.append(rs.str()); + } else if (prefix == "osd crush tree") { + string shadow; + cmd_getval(cmdmap, "shadow", shadow); + bool show_shadow = shadow == "--show-shadow"; + boost::scoped_ptr<Formatter> f(Formatter::create(format)); + if (f) { + f->open_object_section("crush_tree"); + osdmap.crush->dump_tree(nullptr, + f.get(), + osdmap.get_pool_names(), + show_shadow); + f->close_section(); + f->flush(rdata); + } else { + ostringstream ss; + osdmap.crush->dump_tree(&ss, + nullptr, + osdmap.get_pool_names(), + show_shadow); + rdata.append(ss.str()); + } + } else if (prefix == "osd crush ls") { + string name; + if (!cmd_getval(cmdmap, "node", name)) { + ss << "no node specified"; + r = -EINVAL; + goto reply; + } + if (!osdmap.crush->name_exists(name)) { + ss << "node '" << name << "' does not exist"; + r = -ENOENT; + goto reply; + } + int id = osdmap.crush->get_item_id(name); + list<int> result; + if (id >= 0) { + result.push_back(id); + } else { + int num = osdmap.crush->get_bucket_size(id); + for (int i = 0; i < num; ++i) { + result.push_back(osdmap.crush->get_bucket_item(id, i)); + } + } + if (f) { + f->open_array_section("items"); + for (auto i : result) { + f->dump_string("item", osdmap.crush->get_item_name(i)); + } + 
f->close_section(); + f->flush(rdata); + } else { + ostringstream ss; + for (auto i : result) { + ss << osdmap.crush->get_item_name(i) << "\n"; + } + rdata.append(ss.str()); + } + r = 0; + } else if (prefix == "osd crush class ls") { + boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty")); + f->open_array_section("crush_classes"); + for (auto i : osdmap.crush->class_name) + f->dump_string("class", i.second); + f->close_section(); + f->flush(rdata); + } else if (prefix == "osd crush class ls-osd") { + string name; + cmd_getval(cmdmap, "class", name); + set<int> osds; + osdmap.crush->get_devices_by_class(name, &osds); + if (f) { + f->open_array_section("osds"); + for (auto &osd: osds) + f->dump_int("osd", osd); + f->close_section(); + f->flush(rdata); + } else { + bool first = true; + for (auto &osd : osds) { + if (!first) + ds << "\n"; + first = false; + ds << osd; + } + rdata.append(ds); + } + } else if (prefix == "osd crush get-device-class") { + vector<string> idvec; + cmd_getval(cmdmap, "ids", idvec); + map<int, string> class_by_osd; + for (auto& id : idvec) { + ostringstream ts; + long osd = parse_osd_id(id.c_str(), &ts); + if (osd < 0) { + ss << "unable to parse osd id:'" << id << "'"; + r = -EINVAL; + goto reply; + } + auto device_class = osdmap.crush->get_item_class(osd); + if (device_class) + class_by_osd[osd] = device_class; + else + class_by_osd[osd] = ""; // no class + } + if (f) { + f->open_array_section("osd_device_classes"); + for (auto& i : class_by_osd) { + f->open_object_section("osd_device_class"); + f->dump_int("osd", i.first); + f->dump_string("device_class", i.second); + f->close_section(); + } + f->close_section(); + f->flush(rdata); + } else { + if (class_by_osd.size() == 1) { + // for single input, make a clean output + ds << class_by_osd.begin()->second; + } else { + // note that we do not group osds by class here + for (auto it = class_by_osd.begin(); + it != class_by_osd.end(); + it++) { + ds << "osd." 
<< it->first << ' ' << it->second; + if (next(it) != class_by_osd.end()) + ds << '\n'; + } + } + rdata.append(ds); + } + } else if (prefix == "osd erasure-code-profile ls") { + const auto &profiles = osdmap.get_erasure_code_profiles(); + if (f) + f->open_array_section("erasure-code-profiles"); + for (auto i = profiles.begin(); i != profiles.end(); ++i) { + if (f) + f->dump_string("profile", i->first.c_str()); + else + rdata.append(i->first + "\n"); + } + if (f) { + f->close_section(); + ostringstream rs; + f->flush(rs); + rs << "\n"; + rdata.append(rs.str()); + } + } else if (prefix == "osd crush weight-set ls") { + boost::scoped_ptr<Formatter> f(Formatter::create(format)); + if (f) { + f->open_array_section("weight_sets"); + if (osdmap.crush->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS)) { + f->dump_string("pool", "(compat)"); + } + for (auto& i : osdmap.crush->choose_args) { + if (i.first >= 0) { + f->dump_string("pool", osdmap.get_pool_name(i.first)); + } + } + f->close_section(); + f->flush(rdata); + } else { + ostringstream rs; + if (osdmap.crush->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS)) { + rs << "(compat)\n"; + } + for (auto& i : osdmap.crush->choose_args) { + if (i.first >= 0) { + rs << osdmap.get_pool_name(i.first) << "\n"; + } + } + rdata.append(rs.str()); + } + } else if (prefix == "osd crush weight-set dump") { + boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", + "json-pretty")); + osdmap.crush->dump_choose_args(f.get()); + f->flush(rdata); + } else if (prefix == "osd erasure-code-profile get") { + string name; + cmd_getval(cmdmap, "name", name); + if (!osdmap.has_erasure_code_profile(name)) { + ss << "unknown erasure code profile '" << name << "'"; + r = -ENOENT; + goto reply; + } + const map<string,string> &profile = osdmap.get_erasure_code_profile(name); + if (f) + f->open_object_section("profile"); + for (map<string,string>::const_iterator i = profile.begin(); + i != profile.end(); + ++i) { + if (f) + f->dump_string(i->first.c_str(), i->second.c_str()); + else + rdata.append(i->first + "=" + i->second + "\n"); + } + if (f) { + f->close_section(); + ostringstream rs; + f->flush(rs); + rs << "\n"; + rdata.append(rs.str()); + } + } else if (prefix == "osd pool application get") { + boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", + "json-pretty")); + string pool_name; + cmd_getval(cmdmap, "pool", pool_name); + string app; + cmd_getval(cmdmap, "app", app); + string key; + cmd_getval(cmdmap, "key", key); + + if (pool_name.empty()) { + // all + f->open_object_section("pools"); + for (const auto &pool : osdmap.pools) { + std::string name("<unknown>"); + const auto &pni = osdmap.pool_name.find(pool.first); + if (pni != osdmap.pool_name.end()) + name = pni->second; + f->open_object_section(name.c_str()); + for (auto &app_pair : pool.second.application_metadata) { + f->open_object_section(app_pair.first.c_str()); + for (auto &kv_pair : app_pair.second) { + f->dump_string(kv_pair.first.c_str(), kv_pair.second); + } + f->close_section(); + } + f->close_section(); // name + } + f->close_section(); // pools + f->flush(rdata); + } else { + int64_t pool = osdmap.lookup_pg_pool_name(pool_name.c_str()); + if (pool < 0) { + ss << "unrecognized pool '" << pool_name << "'"; + r = -ENOENT; + goto reply; + } + auto p = osdmap.get_pg_pool(pool); + // filter by pool + if (app.empty()) { + f->open_object_section(pool_name.c_str()); + for (auto &app_pair : p->application_metadata) { + f->open_object_section(app_pair.first.c_str()); + 
for (auto &kv_pair : app_pair.second) { + f->dump_string(kv_pair.first.c_str(), kv_pair.second); + } + f->close_section(); // application + } + f->close_section(); // pool_name + f->flush(rdata); + goto reply; + } + + auto app_it = p->application_metadata.find(app); + if (app_it == p->application_metadata.end()) { + ss << "pool '" << pool_name << "' has no application '" << app << "'"; + r = -ENOENT; + goto reply; + } + // filter by pool + app + if (key.empty()) { + f->open_object_section(app_it->first.c_str()); + for (auto &kv_pair : app_it->second) { + f->dump_string(kv_pair.first.c_str(), kv_pair.second); + } + f->close_section(); // application + f->flush(rdata); + goto reply; + } + // filter by pool + app + key + auto key_it = app_it->second.find(key); + if (key_it == app_it->second.end()) { + ss << "application '" << app << "' on pool '" << pool_name + << "' does not have key '" << key << "'"; + r = -ENOENT; + goto reply; + } + ss << key_it->second << "\n"; + rdata.append(ss.str()); + ss.str(""); + } + } else if (prefix == "osd get-require-min-compat-client") { + ss << osdmap.require_min_compat_client << std::endl; + rdata.append(ss.str()); + ss.str(""); + goto reply; + } else if (prefix == "osd pool application enable" || + prefix == "osd pool application disable" || + prefix == "osd pool application set" || + prefix == "osd pool application rm") { + bool changed = false; + r = preprocess_command_pool_application(prefix, cmdmap, ss, &changed); + if (r != 0) { + // Error, reply. + goto reply; + } else if (changed) { + // Valid mutation, proceed to prepare phase + return false; + } else { + // Idempotent case, reply + goto reply; + } + } else { + // try prepare update + return false; + } + + reply: + string rs; + getline(ss, rs); + mon.reply_command(op, r, rs, rdata, get_last_committed()); + return true; +} + +void OSDMonitor::set_pool_flags(int64_t pool_id, uint64_t flags) +{ + pg_pool_t *pool = pending_inc.get_new_pool(pool_id, + osdmap.get_pg_pool(pool_id)); + ceph_assert(pool); + pool->set_flag(flags); +} + +void OSDMonitor::clear_pool_flags(int64_t pool_id, uint64_t flags) +{ + pg_pool_t *pool = pending_inc.get_new_pool(pool_id, + osdmap.get_pg_pool(pool_id)); + ceph_assert(pool); + pool->unset_flag(flags); +} + +string OSDMonitor::make_purged_snap_epoch_key(epoch_t epoch) +{ + char k[80]; + snprintf(k, sizeof(k), "purged_epoch_%08lx", (unsigned long)epoch); + return k; +} + +string OSDMonitor::make_purged_snap_key(int64_t pool, snapid_t snap) +{ + char k[80]; + snprintf(k, sizeof(k), "purged_snap_%llu_%016llx", + (unsigned long long)pool, (unsigned long long)snap); + return k; +} + +string OSDMonitor::make_purged_snap_key_value( + int64_t pool, snapid_t snap, snapid_t num, + epoch_t epoch, bufferlist *v) +{ + // encode the *last* epoch in the key so that we can use forward + // iteration only to search for an epoch in an interval. 
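+  //
+  // [editor's note -- illustrative sketch, not part of the original change]
+  // e.g. recording purged snaps [5,8) for pool 3 at epoch 40 encodes
+  // begin=5, end=8, epoch=40 into the value and keys the record under the
+  // *last* snap of the range, make_purged_snap_key(3, 7), i.e.
+  // "purged_snap_3_0000000000000007". A later lookup_purged_snap(3, 6)
+  // does a lower_bound on "purged_snap_3_0000000000000006", lands on that
+  // record, and finds 6 inside the decoded [5,8) interval.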
+ encode(snap, *v); + encode(snap + num, *v); + encode(epoch, *v); + return make_purged_snap_key(pool, snap + num - 1); +} + + +int OSDMonitor::lookup_purged_snap( + int64_t pool, snapid_t snap, + snapid_t *begin, snapid_t *end) +{ + string k = make_purged_snap_key(pool, snap); + auto it = mon.store->get_iterator(OSD_SNAP_PREFIX); + it->lower_bound(k); + if (!it->valid()) { + dout(20) << __func__ + << " pool " << pool << " snap " << snap + << " - key '" << k << "' not found" << dendl; + return -ENOENT; + } + if (it->key().find("purged_snap_") != 0) { + dout(20) << __func__ + << " pool " << pool << " snap " << snap + << " - key '" << k << "' got '" << it->key() + << "', wrong prefix" << dendl; + return -ENOENT; + } + string gotk = it->key(); + const char *format = "purged_snap_%llu_"; + long long int keypool; + int n = sscanf(gotk.c_str(), format, &keypool); + if (n != 1) { + derr << __func__ << " invalid k '" << gotk << "'" << dendl; + return -ENOENT; + } + if (pool != keypool) { + dout(20) << __func__ + << " pool " << pool << " snap " << snap + << " - key '" << k << "' got '" << gotk + << "', wrong pool " << keypool + << dendl; + return -ENOENT; + } + bufferlist v = it->value(); + auto p = v.cbegin(); + decode(*begin, p); + decode(*end, p); + if (snap < *begin || snap >= *end) { + dout(20) << __func__ + << " pool " << pool << " snap " << snap + << " - found [" << *begin << "," << *end << "), no overlap" + << dendl; + return -ENOENT; + } + return 0; +} + +void OSDMonitor::insert_purged_snap_update( + int64_t pool, + snapid_t start, snapid_t end, + epoch_t epoch, + MonitorDBStore::TransactionRef t) +{ + snapid_t before_begin, before_end; + snapid_t after_begin, after_end; + int b = lookup_purged_snap(pool, start - 1, + &before_begin, &before_end); + int a = lookup_purged_snap(pool, end, + &after_begin, &after_end); + if (!b && !a) { + dout(10) << __func__ + << " [" << start << "," << end << ") - joins [" + << before_begin << "," << before_end << ") and [" + << after_begin << "," << after_end << ")" << dendl; + // erase only the begin record; we'll overwrite the end one. 
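+    // [editor's note -- worked example, not part of the original change]
+    // with existing records [2,5) and [8,10), inserting [5,8) matches both
+    // neighbours: the [2,5) record (keyed at snap 4) is erased below, and
+    // the merged [2,10) record is written under snap 9 -- the same key the
+    // old [8,10) record used, so it is overwritten in place.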
+ t->erase(OSD_SNAP_PREFIX, make_purged_snap_key(pool, before_end - 1)); + bufferlist v; + string k = make_purged_snap_key_value(pool, + before_begin, after_end - before_begin, + pending_inc.epoch, &v); + t->put(OSD_SNAP_PREFIX, k, v); + } else if (!b) { + dout(10) << __func__ + << " [" << start << "," << end << ") - join with earlier [" + << before_begin << "," << before_end << ")" << dendl; + t->erase(OSD_SNAP_PREFIX, make_purged_snap_key(pool, before_end - 1)); + bufferlist v; + string k = make_purged_snap_key_value(pool, + before_begin, end - before_begin, + pending_inc.epoch, &v); + t->put(OSD_SNAP_PREFIX, k, v); + } else if (!a) { + dout(10) << __func__ + << " [" << start << "," << end << ") - join with later [" + << after_begin << "," << after_end << ")" << dendl; + // overwrite after record + bufferlist v; + string k = make_purged_snap_key_value(pool, + start, after_end - start, + pending_inc.epoch, &v); + t->put(OSD_SNAP_PREFIX, k, v); + } else { + dout(10) << __func__ + << " [" << start << "," << end << ") - new" + << dendl; + bufferlist v; + string k = make_purged_snap_key_value(pool, + start, end - start, + pending_inc.epoch, &v); + t->put(OSD_SNAP_PREFIX, k, v); + } +} + +bool OSDMonitor::try_prune_purged_snaps() +{ + if (!mon.mgrstatmon()->is_readable()) { + return false; + } + if (!pending_inc.new_purged_snaps.empty()) { + return false; // we already pruned for this epoch + } + + unsigned max_prune = cct->_conf.get_val<uint64_t>( + "mon_max_snap_prune_per_epoch"); + if (!max_prune) { + max_prune = 100000; + } + dout(10) << __func__ << " max_prune " << max_prune << dendl; + + unsigned actually_pruned = 0; + auto& purged_snaps = mon.mgrstatmon()->get_digest().purged_snaps; + for (auto& p : osdmap.get_pools()) { + auto q = purged_snaps.find(p.first); + if (q == purged_snaps.end()) { + continue; + } + auto& purged = q->second; + if (purged.empty()) { + dout(20) << __func__ << " " << p.first << " nothing purged" << dendl; + continue; + } + dout(20) << __func__ << " pool " << p.first << " purged " << purged << dendl; + snap_interval_set_t to_prune; + unsigned maybe_pruned = actually_pruned; + for (auto i = purged.begin(); i != purged.end(); ++i) { + snapid_t begin = i.get_start(); + auto end = i.get_start() + i.get_len(); + snapid_t pbegin = 0, pend = 0; + int r = lookup_purged_snap(p.first, begin, &pbegin, &pend); + if (r == 0) { + // already purged. + // be a bit aggressive about backing off here, because the mon may + // do a lot of work going through this set, and if we know the + // purged set from the OSDs is at least *partly* stale we may as + // well wait for it to be fresh. + dout(20) << __func__ << " we've already purged " << pbegin + << "~" << (pend - pbegin) << dendl; + break; // next pool + } + if (pbegin && pbegin > begin && pbegin < end) { + // the tail of [begin,end) is purged; shorten the range + end = pbegin; + } + to_prune.insert(begin, end - begin); + maybe_pruned += end - begin; + if (maybe_pruned >= max_prune) { + break; + } + } + if (!to_prune.empty()) { + // PGs may still be reporting things as purged that we have already + // pruned from removed_snaps_queue. 
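+      // [editor's note] a hedged example of the intersection below: if the
+      // mgr digest reports [10,14) as purged but this pool's
+      // removed_snaps_queue only still holds [12,16), 'actual' becomes
+      // [12,14) and only those two snaps land in new_purged_snaps for this
+      // epoch; the already-pruned [10,12) is dropped.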
+ snap_interval_set_t actual; + auto r = osdmap.removed_snaps_queue.find(p.first); + if (r != osdmap.removed_snaps_queue.end()) { + actual.intersection_of(to_prune, r->second); + } + actually_pruned += actual.size(); + dout(10) << __func__ << " pool " << p.first << " reports pruned " << to_prune + << ", actual pruned " << actual << dendl; + if (!actual.empty()) { + pending_inc.new_purged_snaps[p.first].swap(actual); + } + } + if (actually_pruned >= max_prune) { + break; + } + } + dout(10) << __func__ << " actually pruned " << actually_pruned << dendl; + return !!actually_pruned; +} + +bool OSDMonitor::update_pools_status() +{ + if (!mon.mgrstatmon()->is_readable()) + return false; + + bool ret = false; + + auto& pools = osdmap.get_pools(); + for (auto it = pools.begin(); it != pools.end(); ++it) { + const pool_stat_t *pstat = mon.mgrstatmon()->get_pool_stat(it->first); + if (!pstat) + continue; + const object_stat_sum_t& sum = pstat->stats.sum; + const pg_pool_t &pool = it->second; + const string& pool_name = osdmap.get_pool_name(it->first); + + bool pool_is_full = + (pool.quota_max_bytes > 0 && (uint64_t)sum.num_bytes >= pool.quota_max_bytes) || + (pool.quota_max_objects > 0 && (uint64_t)sum.num_objects >= pool.quota_max_objects); + + if (pool.has_flag(pg_pool_t::FLAG_FULL_QUOTA)) { + if (pool_is_full) + continue; + + mon.clog->info() << "pool '" << pool_name + << "' no longer out of quota; removing NO_QUOTA flag"; + // below we cancel FLAG_FULL too, we'll set it again in + // OSDMonitor::encode_pending if it still fails the osd-full checking. + clear_pool_flags(it->first, + pg_pool_t::FLAG_FULL_QUOTA | pg_pool_t::FLAG_FULL); + ret = true; + } else { + if (!pool_is_full) + continue; + + if (pool.quota_max_bytes > 0 && + (uint64_t)sum.num_bytes >= pool.quota_max_bytes) { + mon.clog->warn() << "pool '" << pool_name << "' is full" + << " (reached quota's max_bytes: " + << byte_u_t(pool.quota_max_bytes) << ")"; + } + if (pool.quota_max_objects > 0 && + (uint64_t)sum.num_objects >= pool.quota_max_objects) { + mon.clog->warn() << "pool '" << pool_name << "' is full" + << " (reached quota's max_objects: " + << pool.quota_max_objects << ")"; + } + // set both FLAG_FULL_QUOTA and FLAG_FULL + // note that below we try to cancel FLAG_BACKFILLFULL/NEARFULL too + // since FLAG_FULL should always take precedence + set_pool_flags(it->first, + pg_pool_t::FLAG_FULL_QUOTA | pg_pool_t::FLAG_FULL); + clear_pool_flags(it->first, + pg_pool_t::FLAG_NEARFULL | + pg_pool_t::FLAG_BACKFILLFULL); + ret = true; + } + } + return ret; +} + +int OSDMonitor::prepare_new_pool(MonOpRequestRef op) +{ + op->mark_osdmon_event(__func__); + auto m = op->get_req<MPoolOp>(); + dout(10) << "prepare_new_pool from " << m->get_connection() << dendl; + MonSession *session = op->get_session(); + if (!session) + return -EPERM; + string erasure_code_profile; + stringstream ss; + string rule_name; + bool bulk = false; + int ret = 0; + ret = prepare_new_pool(m->name, m->crush_rule, rule_name, + 0, 0, 0, 0, 0, 0, 0.0, + erasure_code_profile, + pg_pool_t::TYPE_REPLICATED, 0, FAST_READ_OFF, {}, bulk, + &ss); + + if (ret < 0) { + dout(10) << __func__ << " got " << ret << " " << ss.str() << dendl; + } + return ret; +} + +int OSDMonitor::crush_rename_bucket(const string& srcname, + const string& dstname, + ostream *ss) +{ + int ret; + // + // Avoid creating a pending crush if it does not already exists and + // the rename would fail. 
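+  // [editor's note] in other words, can_rename_bucket() is a read-only dry
+  // run against the committed CRUSH map, so a rename that is doomed to fail
+  // never forces a copy of the map into pending_inc; only a viable rename
+  // pays for the _get_pending_crush() + re-encode below.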
+ // + if (!_have_pending_crush()) { + ret = _get_stable_crush().can_rename_bucket(srcname, + dstname, + ss); + if (ret) + return ret; + } + + CrushWrapper newcrush; + _get_pending_crush(newcrush); + + ret = newcrush.rename_bucket(srcname, + dstname, + ss); + if (ret) + return ret; + + pending_inc.crush.clear(); + newcrush.encode(pending_inc.crush, mon.get_quorum_con_features()); + *ss << "renamed bucket " << srcname << " into " << dstname; + return 0; +} + +void OSDMonitor::check_legacy_ec_plugin(const string& plugin, const string& profile) const +{ + string replacement = ""; + + if (plugin == "jerasure_generic" || + plugin == "jerasure_sse3" || + plugin == "jerasure_sse4" || + plugin == "jerasure_neon") { + replacement = "jerasure"; + } else if (plugin == "shec_generic" || + plugin == "shec_sse3" || + plugin == "shec_sse4" || + plugin == "shec_neon") { + replacement = "shec"; + } + + if (replacement != "") { + dout(0) << "WARNING: erasure coding profile " << profile << " uses plugin " + << plugin << " that has been deprecated. Please use " + << replacement << " instead." << dendl; + } +} + +int OSDMonitor::normalize_profile(const string& profilename, + ErasureCodeProfile &profile, + bool force, + ostream *ss) +{ + ErasureCodeInterfaceRef erasure_code; + ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance(); + ErasureCodeProfile::const_iterator plugin = profile.find("plugin"); + check_legacy_ec_plugin(plugin->second, profilename); + int err = instance.factory(plugin->second, + g_conf().get_val<std::string>("erasure_code_dir"), + profile, &erasure_code, ss); + if (err) { + return err; + } + + err = erasure_code->init(profile, ss); + if (err) { + return err; + } + + auto it = profile.find("stripe_unit"); + if (it != profile.end()) { + string err_str; + uint32_t stripe_unit = strict_iecstrtoll(it->second.c_str(), &err_str); + if (!err_str.empty()) { + *ss << "could not parse stripe_unit '" << it->second + << "': " << err_str << std::endl; + return -EINVAL; + } + uint32_t data_chunks = erasure_code->get_data_chunk_count(); + uint32_t chunk_size = erasure_code->get_chunk_size(stripe_unit * data_chunks); + if (chunk_size != stripe_unit) { + *ss << "stripe_unit " << stripe_unit << " does not match ec profile " + << "alignment. Would be padded to " << chunk_size + << std::endl; + return -EINVAL; + } + if ((stripe_unit % 4096) != 0 && !force) { + *ss << "stripe_unit should be a multiple of 4096 bytes for best performance." 
+ << "use --force to override this check" << std::endl; + return -EINVAL; + } + } + return 0; +} + +int OSDMonitor::crush_rule_create_erasure(const string &name, + const string &profile, + int *rule, + ostream *ss) +{ + int ruleid = osdmap.crush->get_rule_id(name); + if (ruleid != -ENOENT) { + *rule = osdmap.crush->get_rule_mask_ruleset(ruleid); + return -EEXIST; + } + + CrushWrapper newcrush; + _get_pending_crush(newcrush); + + ruleid = newcrush.get_rule_id(name); + if (ruleid != -ENOENT) { + *rule = newcrush.get_rule_mask_ruleset(ruleid); + return -EALREADY; + } else { + ErasureCodeInterfaceRef erasure_code; + int err = get_erasure_code(profile, &erasure_code, ss); + if (err) { + *ss << "failed to load plugin using profile " << profile << std::endl; + return err; + } + + err = erasure_code->create_rule(name, newcrush, ss); + erasure_code.reset(); + if (err < 0) + return err; + *rule = err; + pending_inc.crush.clear(); + newcrush.encode(pending_inc.crush, mon.get_quorum_con_features()); + return 0; + } +} + +int OSDMonitor::get_erasure_code(const string &erasure_code_profile, + ErasureCodeInterfaceRef *erasure_code, + ostream *ss) const +{ + if (pending_inc.has_erasure_code_profile(erasure_code_profile)) + return -EAGAIN; + ErasureCodeProfile profile = + osdmap.get_erasure_code_profile(erasure_code_profile); + ErasureCodeProfile::const_iterator plugin = + profile.find("plugin"); + if (plugin == profile.end()) { + *ss << "cannot determine the erasure code plugin" + << " because there is no 'plugin' entry in the erasure_code_profile " + << profile << std::endl; + return -EINVAL; + } + check_legacy_ec_plugin(plugin->second, erasure_code_profile); + auto& instance = ErasureCodePluginRegistry::instance(); + return instance.factory(plugin->second, + g_conf().get_val<std::string>("erasure_code_dir"), + profile, erasure_code, ss); +} + +int OSDMonitor::check_cluster_features(uint64_t features, + stringstream &ss) +{ + stringstream unsupported_ss; + int unsupported_count = 0; + if ((mon.get_quorum_con_features() & features) != features) { + unsupported_ss << "the monitor cluster"; + ++unsupported_count; + } + + set<int32_t> up_osds; + osdmap.get_up_osds(up_osds); + for (set<int32_t>::iterator it = up_osds.begin(); + it != up_osds.end(); ++it) { + const osd_xinfo_t &xi = osdmap.get_xinfo(*it); + if ((xi.features & features) != features) { + if (unsupported_count > 0) + unsupported_ss << ", "; + unsupported_ss << "osd." << *it; + unsupported_count ++; + } + } + + if (unsupported_count > 0) { + ss << "features " << features << " unsupported by: " + << unsupported_ss.str(); + return -ENOTSUP; + } + + // check pending osd state, too! + for (map<int32_t,osd_xinfo_t>::const_iterator p = + pending_inc.new_xinfo.begin(); + p != pending_inc.new_xinfo.end(); ++p) { + const osd_xinfo_t &xi = p->second; + if ((xi.features & features) != features) { + dout(10) << __func__ << " pending osd." 
<< p->first + << " features are insufficient; retry" << dendl; + return -EAGAIN; + } + } + + return 0; +} + +bool OSDMonitor::validate_crush_against_features(const CrushWrapper *newcrush, + stringstream& ss) +{ + OSDMap::Incremental new_pending = pending_inc; + encode(*newcrush, new_pending.crush, mon.get_quorum_con_features()); + OSDMap newmap; + newmap.deepish_copy_from(osdmap); + newmap.apply_incremental(new_pending); + + // client compat + if (newmap.require_min_compat_client != ceph_release_t::unknown) { + auto mv = newmap.get_min_compat_client(); + if (mv > newmap.require_min_compat_client) { + ss << "new crush map requires client version " << mv + << " but require_min_compat_client is " + << newmap.require_min_compat_client; + return false; + } + } + + // osd compat + uint64_t features = + newmap.get_features(CEPH_ENTITY_TYPE_MON, NULL) | + newmap.get_features(CEPH_ENTITY_TYPE_OSD, NULL); + stringstream features_ss; + int r = check_cluster_features(features, features_ss); + if (r) { + ss << "Could not change CRUSH: " << features_ss.str(); + return false; + } + + return true; +} + +bool OSDMonitor::erasure_code_profile_in_use( + const mempool::osdmap::map<int64_t, pg_pool_t> &pools, + const string &profile, + ostream *ss) +{ + bool found = false; + for (map<int64_t, pg_pool_t>::const_iterator p = pools.begin(); + p != pools.end(); + ++p) { + if (p->second.erasure_code_profile == profile && p->second.is_erasure()) { + *ss << osdmap.pool_name[p->first] << " "; + found = true; + } + } + if (found) { + *ss << "pool(s) are using the erasure code profile '" << profile << "'"; + } + return found; +} + +int OSDMonitor::parse_erasure_code_profile(const vector<string> &erasure_code_profile, + map<string,string> *erasure_code_profile_map, + ostream *ss) +{ + int r = g_conf().with_val<string>("osd_pool_default_erasure_code_profile", + get_json_str_map, + *ss, + erasure_code_profile_map, + true); + if (r) + return r; + ceph_assert((*erasure_code_profile_map).count("plugin")); + string default_plugin = (*erasure_code_profile_map)["plugin"]; + map<string,string> user_map; + for (vector<string>::const_iterator i = erasure_code_profile.begin(); + i != erasure_code_profile.end(); + ++i) { + size_t equal = i->find('='); + if (equal == string::npos) { + user_map[*i] = string(); + (*erasure_code_profile_map)[*i] = string(); + } else { + const string key = i->substr(0, equal); + equal++; + const string value = i->substr(equal); + if (key.find("ruleset-") == 0) { + *ss << "property '" << key << "' is no longer supported; try " + << "'crush-" << key.substr(8) << "' instead"; + return -EINVAL; + } + user_map[key] = value; + (*erasure_code_profile_map)[key] = value; + } + } + + if (user_map.count("plugin") && user_map["plugin"] != default_plugin) + (*erasure_code_profile_map) = user_map; + + return 0; +} + +int OSDMonitor::prepare_pool_size(const unsigned pool_type, + const string &erasure_code_profile, + uint8_t repl_size, + unsigned *size, unsigned *min_size, + ostream *ss) +{ + int err = 0; + bool set_min_size = false; + switch (pool_type) { + case pg_pool_t::TYPE_REPLICATED: + if (osdmap.stretch_mode_enabled) { + if (repl_size == 0) + repl_size = g_conf().get_val<uint64_t>("mon_stretch_pool_size"); + if (repl_size != g_conf().get_val<uint64_t>("mon_stretch_pool_size")) { + *ss << "prepare_pool_size: we are in stretch mode but size " + << repl_size << " does not match!"; + return -EINVAL; + } + *min_size = g_conf().get_val<uint64_t>("mon_stretch_pool_min_size"); + set_min_size = true; + } + if (repl_size 
== 0) { + repl_size = g_conf().get_val<uint64_t>("osd_pool_default_size"); + } + *size = repl_size; + if (!set_min_size) + *min_size = g_conf().get_osd_pool_default_min_size(repl_size); + break; + case pg_pool_t::TYPE_ERASURE: + { + if (osdmap.stretch_mode_enabled) { + *ss << "prepare_pool_size: we are in stretch mode; cannot create EC pools!"; + return -EINVAL; + } + ErasureCodeInterfaceRef erasure_code; + err = get_erasure_code(erasure_code_profile, &erasure_code, ss); + if (err == 0) { + *size = erasure_code->get_chunk_count(); + *min_size = + erasure_code->get_data_chunk_count() + + std::min<int>(1, erasure_code->get_coding_chunk_count() - 1); + assert(*min_size <= *size); + assert(*min_size >= erasure_code->get_data_chunk_count()); + } + } + break; + default: + *ss << "prepare_pool_size: " << pool_type << " is not a known pool type"; + err = -EINVAL; + break; + } + return err; +} + +int OSDMonitor::prepare_pool_stripe_width(const unsigned pool_type, + const string &erasure_code_profile, + uint32_t *stripe_width, + ostream *ss) +{ + int err = 0; + switch (pool_type) { + case pg_pool_t::TYPE_REPLICATED: + // ignored + break; + case pg_pool_t::TYPE_ERASURE: + { + ErasureCodeProfile profile = + osdmap.get_erasure_code_profile(erasure_code_profile); + ErasureCodeInterfaceRef erasure_code; + err = get_erasure_code(erasure_code_profile, &erasure_code, ss); + if (err) + break; + uint32_t data_chunks = erasure_code->get_data_chunk_count(); + uint32_t stripe_unit = g_conf().get_val<Option::size_t>("osd_pool_erasure_code_stripe_unit"); + auto it = profile.find("stripe_unit"); + if (it != profile.end()) { + string err_str; + stripe_unit = strict_iecstrtoll(it->second.c_str(), &err_str); + ceph_assert(err_str.empty()); + } + *stripe_width = data_chunks * + erasure_code->get_chunk_size(stripe_unit * data_chunks); + } + break; + default: + *ss << "prepare_pool_stripe_width: " + << pool_type << " is not a known pool type"; + err = -EINVAL; + break; + } + return err; +} + +int OSDMonitor::get_replicated_stretch_crush_rule() +{ + /* we don't write down the stretch rule anywhere, so + * we have to guess it. How? Look at all the pools + * and count up how many times a given rule is used + * on stretch pools and then return the one with + * the most users! + */ + map<int,int> rule_counts; + for (const auto& pooli : osdmap.pools) { + const pg_pool_t& p = pooli.second; + if (p.is_replicated() && p.is_stretch_pool()) { + if (!rule_counts.count(p.crush_rule)) { + rule_counts[p.crush_rule] = 1; + } else { + ++rule_counts[p.crush_rule]; + } + } + } + + if (rule_counts.empty()) { + return -ENOENT; + } + + int most_used_count = 0; + int most_used_rule = -1; + for (auto i : rule_counts) { + if (i.second > most_used_count) { + most_used_rule = i.first; + most_used_count = i.second; + } + } + ceph_assert(most_used_count > 0); + ceph_assert(most_used_rule >= 0); + return most_used_rule; +} + +int OSDMonitor::prepare_pool_crush_rule(const unsigned pool_type, + const string &erasure_code_profile, + const string &rule_name, + int *crush_rule, + ostream *ss) +{ + + if (*crush_rule < 0) { + switch (pool_type) { + case pg_pool_t::TYPE_REPLICATED: + { + if (rule_name == "") { + if (osdmap.stretch_mode_enabled) { + *crush_rule = get_replicated_stretch_crush_rule(); + } else { + // Use default rule + *crush_rule = osdmap.crush->get_osd_pool_default_crush_replicated_ruleset(cct); + } + if (*crush_rule < 0) { + // Errors may happen e.g. 
if no valid rule is available
+	  *ss << "No suitable CRUSH rule exists, check "
+	      << "'osd pool default crush *' config options";
+	  return -ENOENT;
+	}
+      } else {
+	return get_crush_rule(rule_name, crush_rule, ss);
+      }
+    }
+    break;
+  case pg_pool_t::TYPE_ERASURE:
+    {
+      int err = crush_rule_create_erasure(rule_name,
+					  erasure_code_profile,
+					  crush_rule, ss);
+      switch (err) {
+      case -EALREADY:
+	dout(20) << "prepare_pool_crush_rule: rule "
+		 << rule_name << " try again" << dendl;
+	// fall through
+      case 0:
+	// need to wait for the crush rule to be proposed before proceeding
+	err = -EAGAIN;
+	break;
+      case -EEXIST:
+	err = 0;
+	break;
+      }
+      return err;
+    }
+    break;
+  default:
+    *ss << "prepare_pool_crush_rule: " << pool_type
+	<< " is not a known pool type";
+    return -EINVAL;
+    }
+  } else {
+    if (!osdmap.crush->ruleset_exists(*crush_rule)) {
+      *ss << "CRUSH rule " << *crush_rule << " not found";
+      return -ENOENT;
+    }
+  }
+
+  return 0;
+}
+
+int OSDMonitor::get_crush_rule(const string &rule_name,
+			       int *crush_rule,
+			       ostream *ss)
+{
+  int ret;
+  ret = osdmap.crush->get_rule_id(rule_name);
+  if (ret != -ENOENT) {
+    // found it, use it
+    *crush_rule = ret;
+  } else {
+    CrushWrapper newcrush;
+    _get_pending_crush(newcrush);
+
+    ret = newcrush.get_rule_id(rule_name);
+    if (ret != -ENOENT) {
+      // found it, wait for it to be proposed
+      dout(20) << __func__ << ": rule " << rule_name
+	       << " try again" << dendl;
+      return -EAGAIN;
+    } else {
+      // Cannot find it, return an error
+      *ss << "specified rule " << rule_name << " doesn't exist";
+      return ret;
+    }
+  }
+  return 0;
+}
+
+int OSDMonitor::check_pg_num(int64_t pool, int pg_num, int size, ostream *ss)
+{
+  auto max_pgs_per_osd = g_conf().get_val<uint64_t>("mon_max_pg_per_osd");
+  auto num_osds = std::max(osdmap.get_num_in_osds(), 3u); // assume min cluster size 3
+  auto max_pgs = max_pgs_per_osd * num_osds;
+  uint64_t projected = 0;
+  if (pool < 0) {
+    projected += pg_num * size;
+  }
+  for (const auto& i : osdmap.get_pools()) {
+    if (i.first == pool) {
+      projected += pg_num * size;
+    } else {
+      projected += i.second.get_pg_num_target() * i.second.get_size();
+    }
+  }
+  if (projected > max_pgs) {
+    if (pool >= 0) {
+      *ss << "pool id " << pool;
+    }
+    *ss << " pg_num " << pg_num << " size " << size
+	<< " would mean " << projected
+	<< " total pgs, which exceeds max " << max_pgs
+	<< " (mon_max_pg_per_osd " << max_pgs_per_osd
+	<< " * num_in_osds " << num_osds << ")";
+    return -ERANGE;
+  }
+  return 0;
+}
+
+/**
+ * @param name The name of the new pool
+ * @param crush_rule The crush rule to use. If <0, will use the system default
+ * @param crush_rule_name The crush rule to use, if crush_rule <0
+ * @param pg_num The pg_num to use. If set to 0, will use the system default
+ * @param pgp_num The pgp_num to use. If set to 0, will use the system default
+ * @param pg_num_min min pg_num
+ * @param pg_num_max max pg_num
+ * @param repl_size Replication factor, or 0 for default
+ * @param target_size_bytes expected pool size, for the pg autoscaler (or 0)
+ * @param target_size_ratio expected pool usage ratio, for the pg autoscaler (or 0)
+ * @param erasure_code_profile The profile name in OSDMap to be used for erasure code
+ * @param pool_type TYPE_ERASURE, or TYPE_REP
+ * @param expected_num_objects expected number of objects on the pool
+ * @param fast_read fast read type
+ * @param pg_autoscale_mode pg autoscale mode (on, off, or warn), or "" for the default
+ * @param bulk whether to set the bulk flag on the new pool
+ * @param ss human readable error message, if any.
+ *
+ * @return 0 on success, negative errno on failure.
+ */ +int OSDMonitor::prepare_new_pool(string& name, + int crush_rule, + const string &crush_rule_name, + unsigned pg_num, unsigned pgp_num, + unsigned pg_num_min, + unsigned pg_num_max, + const uint64_t repl_size, + const uint64_t target_size_bytes, + const float target_size_ratio, + const string &erasure_code_profile, + const unsigned pool_type, + const uint64_t expected_num_objects, + FastReadType fast_read, + const string& pg_autoscale_mode, + bool bulk, + ostream *ss) +{ + if (name.length() == 0) + return -EINVAL; + if (pg_num == 0) + pg_num = g_conf().get_val<uint64_t>("osd_pool_default_pg_num"); + if (pgp_num == 0) + pgp_num = g_conf().get_val<uint64_t>("osd_pool_default_pgp_num"); + if (!pgp_num) + pgp_num = pg_num; + if (pg_num > g_conf().get_val<uint64_t>("mon_max_pool_pg_num")) { + *ss << "'pg_num' must be greater than 0 and less than or equal to " + << g_conf().get_val<uint64_t>("mon_max_pool_pg_num") + << " (you may adjust 'mon max pool pg num' for higher values)"; + return -ERANGE; + } + if (pgp_num > pg_num) { + *ss << "'pgp_num' must be greater than 0 and lower or equal than 'pg_num'" + << ", which in this case is " << pg_num; + return -ERANGE; + } + if (pool_type == pg_pool_t::TYPE_REPLICATED && fast_read == FAST_READ_ON) { + *ss << "'fast_read' can only apply to erasure coding pool"; + return -EINVAL; + } + int r; + r = prepare_pool_crush_rule(pool_type, erasure_code_profile, + crush_rule_name, &crush_rule, ss); + if (r) { + dout(10) << "prepare_pool_crush_rule returns " << r << dendl; + return r; + } + if (g_conf()->mon_osd_crush_smoke_test) { + CrushWrapper newcrush; + _get_pending_crush(newcrush); + ostringstream err; + CrushTester tester(newcrush, err); + tester.set_min_x(0); + tester.set_max_x(50); + tester.set_rule(crush_rule); + auto start = ceph::coarse_mono_clock::now(); + r = tester.test_with_fork(g_conf()->mon_lease); + auto duration = ceph::coarse_mono_clock::now() - start; + if (r < 0) { + dout(10) << "tester.test_with_fork returns " << r + << ": " << err.str() << dendl; + *ss << "crush test failed with " << r << ": " << err.str(); + return r; + } + dout(10) << __func__ << " crush smoke test duration: " + << duration << dendl; + } + unsigned size, min_size; + r = prepare_pool_size(pool_type, erasure_code_profile, repl_size, + &size, &min_size, ss); + if (r) { + dout(10) << "prepare_pool_size returns " << r << dendl; + return r; + } + r = check_pg_num(-1, pg_num, size, ss); + if (r) { + dout(10) << "check_pg_num returns " << r << dendl; + return r; + } + + if (!osdmap.crush->check_crush_rule(crush_rule, pool_type, size, *ss)) { + return -EINVAL; + } + + uint32_t stripe_width = 0; + r = prepare_pool_stripe_width(pool_type, erasure_code_profile, &stripe_width, ss); + if (r) { + dout(10) << "prepare_pool_stripe_width returns " << r << dendl; + return r; + } + + bool fread = false; + if (pool_type == pg_pool_t::TYPE_ERASURE) { + switch (fast_read) { + case FAST_READ_OFF: + fread = false; + break; + case FAST_READ_ON: + fread = true; + break; + case FAST_READ_DEFAULT: + fread = g_conf()->osd_pool_default_ec_fast_read; + break; + default: + *ss << "invalid fast_read setting: " << fast_read; + return -EINVAL; + } + } + + for (map<int64_t,string>::iterator p = pending_inc.new_pool_names.begin(); + p != pending_inc.new_pool_names.end(); + ++p) { + if (p->second == name) + return 0; + } + + if (-1 == pending_inc.new_pool_max) + pending_inc.new_pool_max = osdmap.pool_max; + int64_t pool = ++pending_inc.new_pool_max; + pg_pool_t empty; + pg_pool_t *pi = 
pending_inc.get_new_pool(pool, &empty); + pi->create_time = ceph_clock_now(); + pi->type = pool_type; + pi->fast_read = fread; + pi->flags = g_conf()->osd_pool_default_flags; + if (bulk) { + pi->set_flag(pg_pool_t::FLAG_BULK); + } else if (g_conf()->osd_pool_default_flag_bulk) { + pi->set_flag(pg_pool_t::FLAG_BULK); + } + if (g_conf()->osd_pool_default_flag_hashpspool) + pi->set_flag(pg_pool_t::FLAG_HASHPSPOOL); + if (g_conf()->osd_pool_default_flag_nodelete) + pi->set_flag(pg_pool_t::FLAG_NODELETE); + if (g_conf()->osd_pool_default_flag_nopgchange) + pi->set_flag(pg_pool_t::FLAG_NOPGCHANGE); + if (g_conf()->osd_pool_default_flag_nosizechange) + pi->set_flag(pg_pool_t::FLAG_NOSIZECHANGE); + pi->set_flag(pg_pool_t::FLAG_CREATING); + if (g_conf()->osd_pool_use_gmt_hitset) + pi->use_gmt_hitset = true; + else + pi->use_gmt_hitset = false; + + pi->size = size; + pi->min_size = min_size; + pi->crush_rule = crush_rule; + pi->expected_num_objects = expected_num_objects; + pi->object_hash = CEPH_STR_HASH_RJENKINS; + if (osdmap.stretch_mode_enabled) { + pi->peering_crush_bucket_count = osdmap.stretch_bucket_count; + pi->peering_crush_bucket_target = osdmap.stretch_bucket_count; + pi->peering_crush_bucket_barrier = osdmap.stretch_mode_bucket; + pi->peering_crush_mandatory_member = CRUSH_ITEM_NONE; + if (osdmap.degraded_stretch_mode) { + pi->peering_crush_bucket_count = osdmap.degraded_stretch_mode; + pi->peering_crush_bucket_target = osdmap.degraded_stretch_mode; + // pi->peering_crush_bucket_mandatory_member = CRUSH_ITEM_NONE; + // TODO: drat, we don't record this ^ anywhere, though given that it + // necessarily won't exist elsewhere it likely doesn't matter + pi->min_size = pi->min_size / 2; + pi->size = pi->size / 2; // only support 2 zones now + } + } + + if (auto m = pg_pool_t::get_pg_autoscale_mode_by_name( + g_conf().get_val<string>("osd_pool_default_pg_autoscale_mode")); + m != pg_pool_t::pg_autoscale_mode_t::UNKNOWN) { + pi->pg_autoscale_mode = m; + } else { + pi->pg_autoscale_mode = pg_pool_t::pg_autoscale_mode_t::OFF; + } + auto max = g_conf().get_val<int64_t>("mon_osd_max_initial_pgs"); + pi->set_pg_num( + max > 0 ? std::min<uint64_t>(pg_num, std::max<int64_t>(1, max)) + : pg_num); + pi->set_pg_num_pending(pi->get_pg_num()); + pi->set_pg_num_target(pg_num); + pi->set_pgp_num(pi->get_pg_num()); + pi->set_pgp_num_target(pgp_num); + if (osdmap.require_osd_release >= ceph_release_t::nautilus && + pg_num_min) { + pi->opts.set(pool_opts_t::PG_NUM_MIN, static_cast<int64_t>(pg_num_min)); + } + if (osdmap.require_osd_release >= ceph_release_t::pacific && + pg_num_max) { + pi->opts.set(pool_opts_t::PG_NUM_MAX, static_cast<int64_t>(pg_num_max)); + } + if (auto m = pg_pool_t::get_pg_autoscale_mode_by_name( + pg_autoscale_mode); m != pg_pool_t::pg_autoscale_mode_t::UNKNOWN) { + pi->pg_autoscale_mode = m; + } + + pi->last_change = pending_inc.epoch; + pi->auid = 0; + + if (pool_type == pg_pool_t::TYPE_ERASURE) { + pi->erasure_code_profile = erasure_code_profile; + } else { + pi->erasure_code_profile = ""; + } + pi->stripe_width = stripe_width; + + if (osdmap.require_osd_release >= ceph_release_t::nautilus && + target_size_bytes) { + // only store for nautilus+ because TARGET_SIZE_BYTES may be + // larger than int32_t max. + pi->opts.set(pool_opts_t::TARGET_SIZE_BYTES, static_cast<int64_t>(target_size_bytes)); + } + if (target_size_ratio > 0.0 && + osdmap.require_osd_release >= ceph_release_t::nautilus) { + // only store for nautilus+, just to be consistent and tidy. 
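+    // [editor's note] both TARGET_SIZE_BYTES and TARGET_SIZE_RATIO are
+    // advisory pool options read by the mgr pg_autoscaler; they bias future
+    // pg_num recommendations but do not change the pg counts chosen above.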
+ pi->opts.set(pool_opts_t::TARGET_SIZE_RATIO, target_size_ratio); + } + + pi->cache_target_dirty_ratio_micro = + g_conf()->osd_pool_default_cache_target_dirty_ratio * 1000000; + pi->cache_target_dirty_high_ratio_micro = + g_conf()->osd_pool_default_cache_target_dirty_high_ratio * 1000000; + pi->cache_target_full_ratio_micro = + g_conf()->osd_pool_default_cache_target_full_ratio * 1000000; + pi->cache_min_flush_age = g_conf()->osd_pool_default_cache_min_flush_age; + pi->cache_min_evict_age = g_conf()->osd_pool_default_cache_min_evict_age; + + pending_inc.new_pool_names[pool] = name; + return 0; +} + +bool OSDMonitor::prepare_set_flag(MonOpRequestRef op, int flag) +{ + op->mark_osdmon_event(__func__); + ostringstream ss; + if (pending_inc.new_flags < 0) + pending_inc.new_flags = osdmap.get_flags(); + pending_inc.new_flags |= flag; + ss << OSDMap::get_flag_string(flag) << " is set"; + wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(), + get_last_committed() + 1)); + return true; +} + +bool OSDMonitor::prepare_unset_flag(MonOpRequestRef op, int flag) +{ + op->mark_osdmon_event(__func__); + ostringstream ss; + if (pending_inc.new_flags < 0) + pending_inc.new_flags = osdmap.get_flags(); + pending_inc.new_flags &= ~flag; + ss << OSDMap::get_flag_string(flag) << " is unset"; + wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(), + get_last_committed() + 1)); + return true; +} + +int OSDMonitor::prepare_command_pool_set(const cmdmap_t& cmdmap, + stringstream& ss) +{ + string poolstr; + cmd_getval(cmdmap, "pool", poolstr); + int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str()); + if (pool < 0) { + ss << "unrecognized pool '" << poolstr << "'"; + return -ENOENT; + } + string var; + cmd_getval(cmdmap, "var", var); + + pg_pool_t p = *osdmap.get_pg_pool(pool); + if (pending_inc.new_pools.count(pool)) + p = pending_inc.new_pools[pool]; + + // accept val as a json string in the normal case (current + // generation monitor). parse out int or float values from the + // string as needed. however, if it is not a string, try to pull + // out an int, in case an older monitor with an older json schema is + // forwarding a request. + string val; + string interr, floaterr; + int64_t n = 0; + double f = 0; + int64_t uf = 0; // micro-f + cmd_getval(cmdmap, "val", val); + + auto si_options = { + "target_max_objects" + }; + auto iec_options = { + "target_max_bytes", + "target_size_bytes", + "compression_max_blob_size", + "compression_min_blob_size", + "csum_max_block", + "csum_min_block", + }; + if (count(begin(si_options), end(si_options), var)) { + n = strict_si_cast<int64_t>(val.c_str(), &interr); + } else if (count(begin(iec_options), end(iec_options), var)) { + n = strict_iec_cast<int64_t>(val.c_str(), &interr); + } else { + // parse string as both int and float; different fields use different types. 
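+  // [editor's note] e.g. val="0.4" leaves interr non-empty (not a valid
+  // integer) but parses as f=0.4 and uf=400000; ratio fields such as
+  // cache_target_dirty_ratio consume the micro-scaled uf, while integer
+  // fields consume n and treat a non-empty interr as a parse error.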
+ n = strict_strtoll(val.c_str(), 10, &interr); + f = strict_strtod(val.c_str(), &floaterr); + uf = llrintl(f * (double)1000000.0); + } + + if (!p.is_tier() && + (var == "hit_set_type" || var == "hit_set_period" || + var == "hit_set_count" || var == "hit_set_fpp" || + var == "target_max_objects" || var == "target_max_bytes" || + var == "cache_target_full_ratio" || var == "cache_target_dirty_ratio" || + var == "cache_target_dirty_high_ratio" || var == "use_gmt_hitset" || + var == "cache_min_flush_age" || var == "cache_min_evict_age" || + var == "hit_set_grade_decay_rate" || var == "hit_set_search_last_n" || + var == "min_read_recency_for_promote" || var == "min_write_recency_for_promote")) { + return -EACCES; + } + + if (var == "size") { + if (p.has_flag(pg_pool_t::FLAG_NOSIZECHANGE)) { + ss << "pool size change is disabled; you must unset nosizechange flag for the pool first"; + return -EPERM; + } + if (p.type == pg_pool_t::TYPE_ERASURE) { + ss << "can not change the size of an erasure-coded pool"; + return -ENOTSUP; + } + if (interr.length()) { + ss << "error parsing integer value '" << val << "': " << interr; + return -EINVAL; + } + if (n <= 0 || n > 10) { + ss << "pool size must be between 1 and 10"; + return -EINVAL; + } + if (n == 1) { + if (!g_conf().get_val<bool>("mon_allow_pool_size_one")) { + ss << "configuring pool size as 1 is disabled by default."; + return -EPERM; + } + bool sure = false; + cmd_getval(cmdmap, "yes_i_really_mean_it", sure); + if (!sure) { ss << "WARNING: setting pool size 1 could lead to data loss " + "without recovery. If you are *ABSOLUTELY CERTAIN* that is what you want, " + "pass the flag --yes-i-really-mean-it."; + return -EPERM; + } + } + if (!osdmap.crush->check_crush_rule(p.get_crush_rule(), p.type, n, ss)) { + return -EINVAL; + } + int r = check_pg_num(pool, p.get_pg_num(), n, &ss); + if (r < 0) { + return r; + } + p.size = n; + p.min_size = g_conf().get_osd_pool_default_min_size(p.size); + } else if (var == "min_size") { + if (p.has_flag(pg_pool_t::FLAG_NOSIZECHANGE)) { + ss << "pool min size change is disabled; you must unset nosizechange flag for the pool first"; + return -EPERM; + } + if (interr.length()) { + ss << "error parsing integer value '" << val << "': " << interr; + return -EINVAL; + } + + if (p.type != pg_pool_t::TYPE_ERASURE) { + if (n < 1 || n > p.size) { + ss << "pool min_size must be between 1 and size, which is set to " << (int)p.size; + return -EINVAL; + } + } else { + ErasureCodeInterfaceRef erasure_code; + int k; + stringstream tmp; + int err = get_erasure_code(p.erasure_code_profile, &erasure_code, &tmp); + if (err == 0) { + k = erasure_code->get_data_chunk_count(); + } else { + ss << __func__ << " get_erasure_code failed: " << tmp.str(); + return err; + } + + if (n < k || n > p.size) { + ss << "pool min_size must be between " << k << " and size, which is set to " << (int)p.size; + return -EINVAL; + } + } + p.min_size = n; + } else if (var == "pg_num_actual") { + if (interr.length()) { + ss << "error parsing integer value '" << val << "': " << interr; + return -EINVAL; + } + if (n == (int)p.get_pg_num()) { + return 0; + } + if (static_cast<uint64_t>(n) > g_conf().get_val<uint64_t>("mon_max_pool_pg_num")) { + ss << "'pg_num' must be greater than 0 and less than or equal to " + << g_conf().get_val<uint64_t>("mon_max_pool_pg_num") + << " (you may adjust 'mon max pool pg num' for higher values)"; + return -ERANGE; + } + if (p.has_flag(pg_pool_t::FLAG_CREATING)) { + ss << "cannot adjust pg_num while initial PGs are being created"; + 
return -EBUSY;
+    }
+    if (n > (int)p.get_pg_num()) {
+      if (p.get_pg_num() != p.get_pg_num_pending()) {
+	// force pre-nautilus clients to resend their ops, since they
+	// don't understand pg_num_pending changes form a new interval
+	p.last_force_op_resend_prenautilus = pending_inc.epoch;
+      }
+      p.set_pg_num(n);
+    } else {
+      if (osdmap.require_osd_release < ceph_release_t::nautilus) {
+	ss << "nautilus OSDs are required to adjust pg_num_pending";
+	return -EPERM;
+      }
+      if (n < (int)p.get_pgp_num()) {
+	ss << "specified pg_num " << n << " < pgp_num " << p.get_pgp_num();
+	return -EINVAL;
+      }
+      if (n < (int)p.get_pg_num() - 1) {
+	ss << "specified pg_num " << n << " < pg_num (" << p.get_pg_num()
+	   << ") - 1; only single pg decrease is currently supported";
+	return -EINVAL;
+      }
+      p.set_pg_num_pending(n);
+      // force pre-nautilus clients to resend their ops, since they
+      // don't understand pg_num_pending changes form a new interval
+      p.last_force_op_resend_prenautilus = pending_inc.epoch;
+    }
+    // force pre-luminous clients to resend their ops, since they
+    // don't understand that split PGs now form a new interval.
+    p.last_force_op_resend_preluminous = pending_inc.epoch;
+  } else if (var == "pg_num") {
+    if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
+      ss << "pool pg_num change is disabled; you must unset nopgchange flag for the pool first";
+      return -EPERM;
+    }
+    if (interr.length()) {
+      ss << "error parsing integer value '" << val << "': " << interr;
+      return -EINVAL;
+    }
+    if (n == (int)p.get_pg_num_target()) {
+      return 0;
+    }
+    if (n <= 0 || static_cast<uint64_t>(n) >
+	g_conf().get_val<uint64_t>("mon_max_pool_pg_num")) {
+      ss << "'pg_num' must be greater than 0 and less than or equal to "
+	 << g_conf().get_val<uint64_t>("mon_max_pool_pg_num")
+	 << " (you may adjust 'mon max pool pg num' for higher values)";
+      return -ERANGE;
+    }
+    if (n > (int)p.get_pg_num_target()) {
+      int r = check_pg_num(pool, n, p.get_size(), &ss);
+      if (r) {
+	return r;
+      }
+      bool force = false;
+      cmd_getval(cmdmap, "yes_i_really_mean_it", force);
+      if (p.cache_mode != pg_pool_t::CACHEMODE_NONE && !force) {
+	ss << "splits in cache pools must be followed by scrubs and leave sufficient free space to avoid overfilling. Use --yes-i-really-mean-it to force.";
+	return -EPERM;
+      }
+    } else {
+      if (osdmap.require_osd_release < ceph_release_t::nautilus) {
+	ss << "nautilus OSDs are required to decrease pg_num";
+	return -EPERM;
+      }
+    }
+    int64_t pg_min = 0, pg_max = 0;
+    p.opts.get(pool_opts_t::PG_NUM_MIN, &pg_min);
+    p.opts.get(pool_opts_t::PG_NUM_MAX, &pg_max);
+    if (pg_min && n < pg_min) {
+      ss << "specified pg_num " << n
+	 << " < pg_num_min " << pg_min;
+      return -EINVAL;
+    }
+    if (pg_max && n > pg_max) {
+      ss << "specified pg_num " << n
+	 << " > pg_num_max " << pg_max;
+      return -EINVAL;
+    }
+    if (osdmap.require_osd_release < ceph_release_t::nautilus) {
+      // pre-nautilus osdmap format; increase pg_num directly
+      assert(n > (int)p.get_pg_num());
+      // force pre-nautilus clients to resend their ops, since they
+      // don't understand pg_num_target changes form a new interval
+      p.last_force_op_resend_prenautilus = pending_inc.epoch;
+      // force pre-luminous clients to resend their ops, since they
+      // don't understand that split PGs now form a new interval.
+      p.last_force_op_resend_preluminous = pending_inc.epoch;
+      p.set_pg_num(n);
+    } else {
+      // set targets; mgr will adjust pg_num_actual and pgp_num later.
+      // make pgp_num track pg_num if it already matches.
if it is set + // differently, leave it different and let the user control it + // manually. + if (p.get_pg_num_target() == p.get_pgp_num_target()) { + p.set_pgp_num_target(n); + } + p.set_pg_num_target(n); + } + } else if (var == "pgp_num_actual") { + if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) { + ss << "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first"; + return -EPERM; + } + if (interr.length()) { + ss << "error parsing integer value '" << val << "': " << interr; + return -EINVAL; + } + if (n <= 0) { + ss << "specified pgp_num must > 0, but you set to " << n; + return -EINVAL; + } + if (n > (int)p.get_pg_num()) { + ss << "specified pgp_num " << n << " > pg_num " << p.get_pg_num(); + return -EINVAL; + } + if (n > (int)p.get_pg_num_pending()) { + ss << "specified pgp_num " << n + << " > pg_num_pending " << p.get_pg_num_pending(); + return -EINVAL; + } + p.set_pgp_num(n); + } else if (var == "pgp_num") { + if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) { + ss << "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first"; + return -EPERM; + } + if (interr.length()) { + ss << "error parsing integer value '" << val << "': " << interr; + return -EINVAL; + } + if (n <= 0) { + ss << "specified pgp_num must > 0, but you set to " << n; + return -EINVAL; + } + if (n > (int)p.get_pg_num_target()) { + ss << "specified pgp_num " << n << " > pg_num " << p.get_pg_num_target(); + return -EINVAL; + } + if (osdmap.require_osd_release < ceph_release_t::nautilus) { + // pre-nautilus osdmap format; increase pgp_num directly + p.set_pgp_num(n); + } else { + p.set_pgp_num_target(n); + } + } else if (var == "pg_autoscale_mode") { + auto m = pg_pool_t::get_pg_autoscale_mode_by_name(val); + if (m == pg_pool_t::pg_autoscale_mode_t::UNKNOWN) { + ss << "specified invalid mode " << val; + return -EINVAL; + } + if (osdmap.require_osd_release < ceph_release_t::nautilus) { + ss << "must set require_osd_release to nautilus or later before setting pg_autoscale_mode"; + return -EINVAL; + } + p.pg_autoscale_mode = m; + } else if (var == "crush_rule") { + int id = osdmap.crush->get_rule_id(val); + if (id == -ENOENT) { + ss << "crush rule " << val << " does not exist"; + return -ENOENT; + } + if (id < 0) { + ss << cpp_strerror(id); + return -ENOENT; + } + if (!osdmap.crush->check_crush_rule(id, p.get_type(), p.get_size(), ss)) { + return -EINVAL; + } + p.crush_rule = id; + } else if (var == "nodelete" || var == "nopgchange" || + var == "nosizechange" || var == "write_fadvise_dontneed" || + var == "noscrub" || var == "nodeep-scrub" || var == "bulk") { + uint64_t flag = pg_pool_t::get_flag_by_name(var); + // make sure we only compare against 'n' if we didn't receive a string + if (val == "true" || (interr.empty() && n == 1)) { + p.set_flag(flag); + } else if (val == "false" || (interr.empty() && n == 0)) { + p.unset_flag(flag); + } else { + ss << "expecting value 'true', 'false', '0', or '1'"; + return -EINVAL; + } + } else if (var == "hashpspool") { + uint64_t flag = pg_pool_t::get_flag_by_name(var); + bool force = false; + cmd_getval(cmdmap, "yes_i_really_mean_it", force); + + if (!force) { + ss << "are you SURE? 
this will remap all placement groups in this pool," + " this triggers large data movement," + " pass --yes-i-really-mean-it if you really do."; + return -EPERM; + } + // make sure we only compare against 'n' if we didn't receive a string + if (val == "true" || (interr.empty() && n == 1)) { + p.set_flag(flag); + } else if (val == "false" || (interr.empty() && n == 0)) { + p.unset_flag(flag); + } else { + ss << "expecting value 'true', 'false', '0', or '1'"; + return -EINVAL; + } + } else if (var == "hit_set_type") { + if (val == "none") + p.hit_set_params = HitSet::Params(); + else { + int err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss); + if (err) + return err; + if (val == "bloom") { + BloomHitSet::Params *bsp = new BloomHitSet::Params; + bsp->set_fpp(g_conf().get_val<double>("osd_pool_default_hit_set_bloom_fpp")); + p.hit_set_params = HitSet::Params(bsp); + } else if (val == "explicit_hash") + p.hit_set_params = HitSet::Params(new ExplicitHashHitSet::Params); + else if (val == "explicit_object") + p.hit_set_params = HitSet::Params(new ExplicitObjectHitSet::Params); + else { + ss << "unrecognized hit_set type '" << val << "'"; + return -EINVAL; + } + } + } else if (var == "hit_set_period") { + if (interr.length()) { + ss << "error parsing integer value '" << val << "': " << interr; + return -EINVAL; + } else if (n < 0) { + ss << "hit_set_period should be non-negative"; + return -EINVAL; + } + p.hit_set_period = n; + } else if (var == "hit_set_count") { + if (interr.length()) { + ss << "error parsing integer value '" << val << "': " << interr; + return -EINVAL; + } else if (n < 0) { + ss << "hit_set_count should be non-negative"; + return -EINVAL; + } + p.hit_set_count = n; + } else if (var == "hit_set_fpp") { + if (floaterr.length()) { + ss << "error parsing floating point value '" << val << "': " << floaterr; + return -EINVAL; + } else if (f < 0 || f > 1.0) { + ss << "hit_set_fpp should be in the range 0..1"; + return -EINVAL; + } + if (p.hit_set_params.get_type() != HitSet::TYPE_BLOOM) { + ss << "hit set is not of type Bloom; invalid to set a false positive rate!"; + return -EINVAL; + } + BloomHitSet::Params *bloomp = static_cast<BloomHitSet::Params*>(p.hit_set_params.impl.get()); + bloomp->set_fpp(f); + } else if (var == "use_gmt_hitset") { + if (val == "true" || (interr.empty() && n == 1)) { + p.use_gmt_hitset = true; + } else { + ss << "expecting value 'true' or '1'"; + return -EINVAL; + } + } else if (var == "allow_ec_overwrites") { + if (!p.is_erasure()) { + ss << "ec overwrites can only be enabled for an erasure coded pool"; + return -EINVAL; + } + stringstream err; + if (!g_conf()->mon_debug_no_require_bluestore_for_ec_overwrites && + !is_pool_currently_all_bluestore(pool, p, &err)) { + ss << "pool must only be stored on bluestore for scrubbing to work: " << err.str(); + return -EINVAL; + } + if (val == "true" || (interr.empty() && n == 1)) { + p.flags |= pg_pool_t::FLAG_EC_OVERWRITES; + } else if (val == "false" || (interr.empty() && n == 0)) { + ss << "ec overwrites cannot be disabled once enabled"; + return -EINVAL; + } else { + ss << "expecting value 'true', 'false', '0', or '1'"; + return -EINVAL; + } + } else if (var == "target_max_objects") { + if (interr.length()) { + ss << "error parsing int '" << val << "': " << interr; + return -EINVAL; + } + p.target_max_objects = n; + } else if (var == "target_max_bytes") { + if (interr.length()) { + ss << "error parsing int '" << val << "': " << interr; + return -EINVAL; + } + p.target_max_bytes = n; + } else if (var == 
"cache_target_dirty_ratio") { + if (floaterr.length()) { + ss << "error parsing float '" << val << "': " << floaterr; + return -EINVAL; + } + if (f < 0 || f > 1.0) { + ss << "value must be in the range 0..1"; + return -ERANGE; + } + p.cache_target_dirty_ratio_micro = uf; + } else if (var == "cache_target_dirty_high_ratio") { + if (floaterr.length()) { + ss << "error parsing float '" << val << "': " << floaterr; + return -EINVAL; + } + if (f < 0 || f > 1.0) { + ss << "value must be in the range 0..1"; + return -ERANGE; + } + p.cache_target_dirty_high_ratio_micro = uf; + } else if (var == "cache_target_full_ratio") { + if (floaterr.length()) { + ss << "error parsing float '" << val << "': " << floaterr; + return -EINVAL; + } + if (f < 0 || f > 1.0) { + ss << "value must be in the range 0..1"; + return -ERANGE; + } + p.cache_target_full_ratio_micro = uf; + } else if (var == "cache_min_flush_age") { + if (interr.length()) { + ss << "error parsing int '" << val << "': " << interr; + return -EINVAL; + } + p.cache_min_flush_age = n; + } else if (var == "cache_min_evict_age") { + if (interr.length()) { + ss << "error parsing int '" << val << "': " << interr; + return -EINVAL; + } + p.cache_min_evict_age = n; + } else if (var == "min_read_recency_for_promote") { + if (interr.length()) { + ss << "error parsing integer value '" << val << "': " << interr; + return -EINVAL; + } + p.min_read_recency_for_promote = n; + } else if (var == "hit_set_grade_decay_rate") { + if (interr.length()) { + ss << "error parsing integer value '" << val << "': " << interr; + return -EINVAL; + } + if (n > 100 || n < 0) { + ss << "value out of range,valid range is 0 - 100"; + return -EINVAL; + } + p.hit_set_grade_decay_rate = n; + } else if (var == "hit_set_search_last_n") { + if (interr.length()) { + ss << "error parsing integer value '" << val << "': " << interr; + return -EINVAL; + } + if (n > p.hit_set_count || n < 0) { + ss << "value out of range,valid range is 0 - hit_set_count"; + return -EINVAL; + } + p.hit_set_search_last_n = n; + } else if (var == "min_write_recency_for_promote") { + if (interr.length()) { + ss << "error parsing integer value '" << val << "': " << interr; + return -EINVAL; + } + p.min_write_recency_for_promote = n; + } else if (var == "fast_read") { + if (p.is_replicated()) { + ss << "fast read is not supported in replication pool"; + return -EINVAL; + } + if (val == "true" || (interr.empty() && n == 1)) { + p.fast_read = true; + } else if (val == "false" || (interr.empty() && n == 0)) { + p.fast_read = false; + } else { + ss << "expecting value 'true', 'false', '0', or '1'"; + return -EINVAL; + } + } else if (pool_opts_t::is_opt_name(var)) { + bool unset = val == "unset"; + if (var == "compression_mode") { + if (!unset) { + auto cmode = Compressor::get_comp_mode_type(val); + if (!cmode) { + ss << "unrecognized compression mode '" << val << "'"; + return -EINVAL; + } + } + } else if (var == "compression_algorithm") { + if (!unset) { + auto alg = Compressor::get_comp_alg_type(val); + if (!alg) { + ss << "unrecognized compression_algorithm '" << val << "'"; + return -EINVAL; + } + } + } else if (var == "compression_required_ratio") { + if (floaterr.length()) { + ss << "error parsing float value '" << val << "': " << floaterr; + return -EINVAL; + } + if (f < 0 || f > 1) { + ss << "compression_required_ratio is out of range (0-1): '" << val << "'"; + return -EINVAL; + } + } else if (var == "csum_type") { + auto t = unset ? 
0 : Checksummer::get_csum_string_type(val); + if (t < 0 ) { + ss << "unrecognized csum_type '" << val << "'"; + return -EINVAL; + } + //preserve csum_type numeric value + n = t; + interr.clear(); + } else if (var == "compression_max_blob_size" || + var == "compression_min_blob_size" || + var == "csum_max_block" || + var == "csum_min_block") { + if (interr.length()) { + ss << "error parsing int value '" << val << "': " << interr; + return -EINVAL; + } + } else if (var == "fingerprint_algorithm") { + if (!unset) { + auto alg = pg_pool_t::get_fingerprint_from_str(val); + if (!alg) { + ss << "unrecognized fingerprint_algorithm '" << val << "'"; + return -EINVAL; + } + } + } else if (var == "target_size_bytes") { + if (interr.length()) { + ss << "error parsing unit value '" << val << "': " << interr; + return -EINVAL; + } + if (osdmap.require_osd_release < ceph_release_t::nautilus) { + ss << "must set require_osd_release to nautilus or " + << "later before setting target_size_bytes"; + return -EINVAL; + } + } else if (var == "target_size_ratio") { + if (f < 0.0) { + ss << "target_size_ratio cannot be negative"; + return -EINVAL; + } + } else if (var == "pg_num_min") { + if (interr.length()) { + ss << "error parsing int value '" << val << "': " << interr; + return -EINVAL; + } + if (n > (int)p.get_pg_num_target()) { + ss << "specified pg_num_min " << n + << " > pg_num " << p.get_pg_num_target(); + return -EINVAL; + } + } else if (var == "pg_num_max") { + if (interr.length()) { + ss << "error parsing int value '" << val << "': " << interr; + return -EINVAL; + } + if (n && n < (int)p.get_pg_num_target()) { + ss << "specified pg_num_max " << n + << " < pg_num " << p.get_pg_num_target(); + return -EINVAL; + } + } else if (var == "recovery_priority") { + if (interr.length()) { + ss << "error parsing int value '" << val << "': " << interr; + return -EINVAL; + } + if (!g_conf()->debug_allow_any_pool_priority) { + if (n > OSD_POOL_PRIORITY_MAX || n < OSD_POOL_PRIORITY_MIN) { + ss << "pool recovery_priority must be between " << OSD_POOL_PRIORITY_MIN + << " and " << OSD_POOL_PRIORITY_MAX; + return -EINVAL; + } + } + } else if (var == "pg_autoscale_bias") { + if (f < 0.0 || f > 1000.0) { + ss << "pg_autoscale_bias must be between 0 and 1000"; + return -EINVAL; + } + } else if (var == "dedup_tier") { + if (interr.empty()) { + ss << "expecting value 'pool name'"; + return -EINVAL; + } + // Current base tier in dedup does not support ec pool + if (p.is_erasure()) { + ss << "pool '" << poolstr + << "' is an ec pool, which cannot be a base tier"; + return -ENOTSUP; + } + int64_t lowtierpool_id = osdmap.lookup_pg_pool_name(val); + if (lowtierpool_id < 0) { + ss << "unrecognized pool '" << val << "'"; + return -ENOENT; + } + const pg_pool_t *tp = osdmap.get_pg_pool(lowtierpool_id); + ceph_assert(tp); + n = lowtierpool_id; + // The original input is string (pool name), but we convert it to int64_t. 
+ // So, clear interr + interr.clear(); + } else if (var == "dedup_chunk_algorithm") { + if (!unset) { + auto alg = pg_pool_t::get_dedup_chunk_algorithm_from_str(val); + if (!alg) { + ss << "unrecognized dedup_chunk_algorithm '" << val << "'"; + return -EINVAL; + } + } + } else if (var == "dedup_cdc_chunk_size") { + if (interr.length()) { + ss << "error parsing int value '" << val << "': " << interr; + return -EINVAL; + } + } + + pool_opts_t::opt_desc_t desc = pool_opts_t::get_opt_desc(var); + switch (desc.type) { + case pool_opts_t::STR: + if (unset) { + p.opts.unset(desc.key); + } else { + p.opts.set(desc.key, static_cast<std::string>(val)); + } + break; + case pool_opts_t::INT: + if (interr.length()) { + ss << "error parsing integer value '" << val << "': " << interr; + return -EINVAL; + } + if (n == 0) { + p.opts.unset(desc.key); + } else { + p.opts.set(desc.key, static_cast<int64_t>(n)); + } + break; + case pool_opts_t::DOUBLE: + if (floaterr.length()) { + ss << "error parsing floating point value '" << val << "': " << floaterr; + return -EINVAL; + } + if (f == 0) { + p.opts.unset(desc.key); + } else { + p.opts.set(desc.key, static_cast<double>(f)); + } + break; + default: + ceph_assert(!"unknown type"); + } + } else { + ss << "unrecognized variable '" << var << "'"; + return -EINVAL; + } + if (val != "unset") { + ss << "set pool " << pool << " " << var << " to " << val; + } else { + ss << "unset pool " << pool << " " << var; + } + p.last_change = pending_inc.epoch; + pending_inc.new_pools[pool] = p; + return 0; +} + +int OSDMonitor::prepare_command_pool_application(const string &prefix, + const cmdmap_t& cmdmap, + stringstream& ss) +{ + return _command_pool_application(prefix, cmdmap, ss, nullptr, true); +} + +int OSDMonitor::preprocess_command_pool_application(const string &prefix, + const cmdmap_t& cmdmap, + stringstream& ss, + bool *modified) +{ + return _command_pool_application(prefix, cmdmap, ss, modified, false); +} + + +/** + * Common logic for preprocess and prepare phases of pool application + * tag commands. In preprocess mode we're only detecting invalid + * commands, and determining whether it was a modification or a no-op. + * In prepare mode we're actually updating the pending state. + */ +int OSDMonitor::_command_pool_application(const string &prefix, + const cmdmap_t& cmdmap, + stringstream& ss, + bool *modified, + bool preparing) +{ + string pool_name; + cmd_getval(cmdmap, "pool", pool_name); + int64_t pool = osdmap.lookup_pg_pool_name(pool_name.c_str()); + if (pool < 0) { + ss << "unrecognized pool '" << pool_name << "'"; + return -ENOENT; + } + + pg_pool_t p = *osdmap.get_pg_pool(pool); + if (preparing) { + if (pending_inc.new_pools.count(pool)) { + p = pending_inc.new_pools[pool]; + } + } + + string app; + cmd_getval(cmdmap, "app", app); + bool app_exists = (p.application_metadata.count(app) > 0); + + string key; + cmd_getval(cmdmap, "key", key); + if (key == "all") { + ss << "key cannot be 'all'"; + return -EINVAL; + } + + string value; + cmd_getval(cmdmap, "value", value); + if (value == "all") { + ss << "value cannot be 'all'"; + return -EINVAL; + } + + if (boost::algorithm::ends_with(prefix, "enable")) { + if (app.empty()) { + ss << "application name must be provided"; + return -EINVAL; + } + + if (p.is_tier()) { + ss << "application must be enabled on base tier"; + return -EINVAL; + } + + bool force = false; + cmd_getval(cmdmap, "yes_i_really_mean_it", force); + + if (!app_exists && !p.application_metadata.empty() && !force) { + ss << "Are you SURE? 
Pool '" << pool_name << "' already has an enabled " + << "application; pass --yes-i-really-mean-it to proceed anyway"; + return -EPERM; + } + + if (!app_exists && p.application_metadata.size() >= MAX_POOL_APPLICATIONS) { + ss << "too many enabled applications on pool '" << pool_name << "'; " + << "max " << MAX_POOL_APPLICATIONS; + return -EINVAL; + } + + if (app.length() > MAX_POOL_APPLICATION_LENGTH) { + ss << "application name '" << app << "' too long; max length " + << MAX_POOL_APPLICATION_LENGTH; + return -EINVAL; + } + + if (!app_exists) { + p.application_metadata[app] = {}; + } + ss << "enabled application '" << app << "' on pool '" << pool_name << "'"; + + } else if (boost::algorithm::ends_with(prefix, "disable")) { + bool force = false; + cmd_getval(cmdmap, "yes_i_really_mean_it", force); + + if (!force) { + ss << "Are you SURE? Disabling an application within a pool might result " + << "in loss of application functionality; pass " + << "--yes-i-really-mean-it to proceed anyway"; + return -EPERM; + } + + if (!app_exists) { + ss << "application '" << app << "' is not enabled on pool '" << pool_name + << "'"; + return 0; // idempotent + } + + p.application_metadata.erase(app); + ss << "disable application '" << app << "' on pool '" << pool_name << "'"; + + } else if (boost::algorithm::ends_with(prefix, "set")) { + if (p.is_tier()) { + ss << "application metadata must be set on base tier"; + return -EINVAL; + } + + if (!app_exists) { + ss << "application '" << app << "' is not enabled on pool '" << pool_name + << "'"; + return -ENOENT; + } + + string key; + cmd_getval(cmdmap, "key", key); + + if (key.empty()) { + ss << "key must be provided"; + return -EINVAL; + } + + auto &app_keys = p.application_metadata[app]; + if (app_keys.count(key) == 0 && + app_keys.size() >= MAX_POOL_APPLICATION_KEYS) { + ss << "too many keys set for application '" << app << "' on pool '" + << pool_name << "'; max " << MAX_POOL_APPLICATION_KEYS; + return -EINVAL; + } + + if (key.length() > MAX_POOL_APPLICATION_LENGTH) { + ss << "key '" << app << "' too long; max length " + << MAX_POOL_APPLICATION_LENGTH; + return -EINVAL; + } + + string value; + cmd_getval(cmdmap, "value", value); + if (value.length() > MAX_POOL_APPLICATION_LENGTH) { + ss << "value '" << value << "' too long; max length " + << MAX_POOL_APPLICATION_LENGTH; + return -EINVAL; + } + + p.application_metadata[app][key] = value; + ss << "set application '" << app << "' key '" << key << "' to '" + << value << "' on pool '" << pool_name << "'"; + } else if (boost::algorithm::ends_with(prefix, "rm")) { + if (!app_exists) { + ss << "application '" << app << "' is not enabled on pool '" << pool_name + << "'"; + return -ENOENT; + } + + string key; + cmd_getval(cmdmap, "key", key); + auto it = p.application_metadata[app].find(key); + if (it == p.application_metadata[app].end()) { + ss << "application '" << app << "' on pool '" << pool_name + << "' does not have key '" << key << "'"; + return 0; // idempotent + } + + p.application_metadata[app].erase(it); + ss << "removed application '" << app << "' key '" << key << "' on pool '" + << pool_name << "'"; + } else { + ceph_abort(); + } + + if (preparing) { + p.last_change = pending_inc.epoch; + pending_inc.new_pools[pool] = p; + } + + // Because we fell through this far, we didn't hit no-op cases, + // so pool was definitely modified + if (modified != nullptr) { + *modified = true; + } + + return 0; +} + +int OSDMonitor::_prepare_command_osd_crush_remove( + CrushWrapper &newcrush, + int32_t id, + int32_t 
ancestor, + bool has_ancestor, + bool unlink_only) +{ + int err = 0; + + if (has_ancestor) { + err = newcrush.remove_item_under(cct, id, ancestor, + unlink_only); + } else { + err = newcrush.remove_item(cct, id, unlink_only); + } + return err; +} + +void OSDMonitor::do_osd_crush_remove(CrushWrapper& newcrush) +{ + pending_inc.crush.clear(); + newcrush.encode(pending_inc.crush, mon.get_quorum_con_features()); +} + +int OSDMonitor::prepare_command_osd_crush_remove( + CrushWrapper &newcrush, + int32_t id, + int32_t ancestor, + bool has_ancestor, + bool unlink_only) +{ + int err = _prepare_command_osd_crush_remove( + newcrush, id, ancestor, + has_ancestor, unlink_only); + + if (err < 0) + return err; + + ceph_assert(err == 0); + do_osd_crush_remove(newcrush); + + return 0; +} + +int OSDMonitor::prepare_command_osd_remove(int32_t id) +{ + if (osdmap.is_up(id)) { + return -EBUSY; + } + + pending_inc.new_state[id] = osdmap.get_state(id); + pending_inc.new_uuid[id] = uuid_d(); + pending_metadata_rm.insert(id); + pending_metadata.erase(id); + + return 0; +} + +int32_t OSDMonitor::_allocate_osd_id(int32_t* existing_id) +{ + ceph_assert(existing_id); + *existing_id = -1; + + for (int32_t i = 0; i < osdmap.get_max_osd(); ++i) { + if (!osdmap.exists(i) && + pending_inc.new_up_client.count(i) == 0 && + (pending_inc.new_state.count(i) == 0 || + (pending_inc.new_state[i] & CEPH_OSD_EXISTS) == 0)) { + *existing_id = i; + return -1; + } + } + + if (pending_inc.new_max_osd < 0) { + return osdmap.get_max_osd(); + } + return pending_inc.new_max_osd; +} + +void OSDMonitor::do_osd_create( + const int32_t id, + const uuid_d& uuid, + const string& device_class, + int32_t* new_id) +{ + dout(10) << __func__ << " uuid " << uuid << dendl; + ceph_assert(new_id); + + // We presume validation has been performed prior to calling this + // function. We assert with prejudice. 
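+  // Editor's note -- an illustrative sketch, not part of the original
+  // change: the id-resolution logic below reduces to three cases. With
+  // hypothetical inputs:
+  //
+  //   do_osd_create(-1, uuid_of_existing_osd3, "", &nid);  // nid == 3
+  //   do_osd_create( 7, unknown_uuid,          "", &nid);  // nid == 7
+  //   do_osd_create(-1, unknown_uuid,          "", &nid);  // nid == first
+  //       unused id below max_osd, else max_osd (new_max_osd is raised)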
+ + int32_t allocated_id = -1; // declare here so we can jump + int32_t existing_id = -1; + if (!uuid.is_zero()) { + existing_id = osdmap.identify_osd(uuid); + if (existing_id >= 0) { + ceph_assert(id < 0 || id == existing_id); + *new_id = existing_id; + goto out; + } else if (id >= 0) { + // uuid does not exist, and id has been provided, so just create + // the new osd.id + *new_id = id; + goto out; + } + } + + // allocate a new id + allocated_id = _allocate_osd_id(&existing_id); + dout(10) << __func__ << " allocated id " << allocated_id + << " existing id " << existing_id << dendl; + if (existing_id >= 0) { + ceph_assert(existing_id < osdmap.get_max_osd()); + ceph_assert(allocated_id < 0); + *new_id = existing_id; + } else if (allocated_id >= 0) { + ceph_assert(existing_id < 0); + // raise max_osd + if (pending_inc.new_max_osd < 0) { + pending_inc.new_max_osd = osdmap.get_max_osd() + 1; + } else { + ++pending_inc.new_max_osd; + } + *new_id = pending_inc.new_max_osd - 1; + ceph_assert(*new_id == allocated_id); + } else { + ceph_abort_msg("unexpected condition"); + } + +out: + if (device_class.size()) { + CrushWrapper newcrush; + _get_pending_crush(newcrush); + if (newcrush.get_max_devices() < *new_id + 1) { + newcrush.set_max_devices(*new_id + 1); + } + string name = string("osd.") + stringify(*new_id); + if (!newcrush.item_exists(*new_id)) { + newcrush.set_item_name(*new_id, name); + } + ostringstream ss; + int r = newcrush.update_device_class(*new_id, device_class, name, &ss); + if (r < 0) { + derr << __func__ << " failed to set " << name << " device_class " + << device_class << ": " << cpp_strerror(r) << " - " << ss.str() + << dendl; + // non-fatal... this might be a replay and we want to be idempotent. + } else { + dout(20) << __func__ << " set " << name << " device_class " << device_class + << dendl; + pending_inc.crush.clear(); + newcrush.encode(pending_inc.crush, mon.get_quorum_con_features()); + } + } else { + dout(20) << __func__ << " no device_class" << dendl; + } + + dout(10) << __func__ << " using id " << *new_id << dendl; + if (osdmap.get_max_osd() <= *new_id && pending_inc.new_max_osd <= *new_id) { + pending_inc.new_max_osd = *new_id + 1; + } + + pending_inc.new_weight[*new_id] = CEPH_OSD_IN; + // do not set EXISTS; OSDMap::set_weight, called by apply_incremental, will + // set it for us. (ugh.) + pending_inc.new_state[*new_id] |= CEPH_OSD_NEW; + if (!uuid.is_zero()) + pending_inc.new_uuid[*new_id] = uuid; +} + +int OSDMonitor::validate_osd_create( + const int32_t id, + const uuid_d& uuid, + const bool check_osd_exists, + int32_t* existing_id, + stringstream& ss) +{ + + dout(10) << __func__ << " id " << id << " uuid " << uuid + << " check_osd_exists " << check_osd_exists << dendl; + + ceph_assert(existing_id); + + if (id < 0 && uuid.is_zero()) { + // we have nothing to validate + *existing_id = -1; + return 0; + } else if (uuid.is_zero()) { + // we have an id but we will ignore it - because that's what + // `osd create` does. + return 0; + } + + /* + * This function will be used to validate whether we are able to + * create a new osd when the `uuid` is specified. + * + * It will be used by both `osd create` and `osd new`, as the checks + * are basically the same when it pertains to osd id and uuid validation. + * However, `osd create` presumes an `uuid` is optional, for legacy + * reasons, while `osd new` requires the `uuid` to be provided. 
This + * means that `osd create` will not be idempotent if a `uuid` is not + * provided, but we will always guarantee the idempotency of `osd new`. + */ + + ceph_assert(!uuid.is_zero()); + if (pending_inc.identify_osd(uuid) >= 0) { + // osd is about to exist + return -EAGAIN; + } + + int32_t i = osdmap.identify_osd(uuid); + if (i >= 0) { + // osd already exists + if (id >= 0 && i != id) { + ss << "uuid " << uuid << " already in use for different id " << i; + return -EEXIST; + } + // return a positive errno to distinguish between a blocking error + // and an error we consider to not be a problem (i.e., this would be + // an idempotent operation). + *existing_id = i; + return EEXIST; + } + // i < 0 + if (id >= 0) { + if (pending_inc.new_state.count(id)) { + // osd is about to exist + return -EAGAIN; + } + // we may not care if an osd exists if we are recreating a previously + // destroyed osd. + if (check_osd_exists && osdmap.exists(id)) { + ss << "id " << id << " already in use and does not match uuid " + << uuid; + return -EINVAL; + } + } + return 0; +} + +int OSDMonitor::prepare_command_osd_create( + const int32_t id, + const uuid_d& uuid, + int32_t* existing_id, + stringstream& ss) +{ + dout(10) << __func__ << " id " << id << " uuid " << uuid << dendl; + ceph_assert(existing_id); + if (osdmap.is_destroyed(id)) { + ss << "ceph osd create has been deprecated. Please use ceph osd new " + "instead."; + return -EINVAL; + } + + if (uuid.is_zero()) { + dout(10) << __func__ << " no uuid; assuming legacy `osd create`" << dendl; + } + + return validate_osd_create(id, uuid, true, existing_id, ss); +} + +int OSDMonitor::prepare_command_osd_new( + MonOpRequestRef op, + const cmdmap_t& cmdmap, + const map<string,string>& params, + stringstream &ss, + Formatter *f) +{ + uuid_d uuid; + string uuidstr; + int64_t id = -1; + + ceph_assert(paxos.is_plugged()); + + dout(10) << __func__ << " " << op << dendl; + + /* validate command. abort now if something's wrong. */ + + /* `osd new` will expect a `uuid` to be supplied; `id` is optional. + * + * If `id` is not specified, we will identify any existing osd based + * on `uuid`. Operation will be idempotent iff secrets match. + * + * If `id` is specified, we will identify any existing osd based on + * `uuid` and match against `id`. If they match, operation will be + * idempotent iff secrets match. + * + * `-i secrets.json` will be optional. If supplied, will be used + * to check for idempotency when `id` and `uuid` match. + * + * If `id` is not specified, and `uuid` does not exist, an id will + * be found or allocated for the osd. + * + * If `id` is specified, and the osd has been previously marked + * as destroyed, then the `id` will be reused. + */ + if (!cmd_getval(cmdmap, "uuid", uuidstr)) { + ss << "requires the OSD's UUID to be specified."; + return -EINVAL; + } else if (!uuid.parse(uuidstr.c_str())) { + ss << "invalid UUID value '" << uuidstr << "'."; + return -EINVAL; + } + + if (cmd_getval(cmdmap, "id", id) && + (id < 0)) { + ss << "invalid OSD id; must be greater than or equal to zero."; + return -EINVAL; + } + + // are we running an `osd create`-like command, or recreating + // a previously destroyed osd? + + bool is_recreate_destroyed = (id >= 0 && osdmap.is_destroyed(id)); + + // we will care about `id` to assess whether osd is `destroyed`, or + // to create a new osd. + // we will need an `id` by the time we reach auth. 
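+  // Editor's aside, a sketch of the return convention that
+  // validate_osd_create() (defined above) uses and the call just below
+  // relies on -- note the sign of the return matters:
+  //
+  //   0        // id/uuid unused: proceed to create a brand new osd
+  //   EEXIST   // positive: same uuid+id already exist -> idempotent path
+  //   -EEXIST  // uuid already bound to a *different* id -> hard error
+  //   -EAGAIN  // an uncommitted pending update already creates it
+  //   -EINVAL  // id exists (and is not destroyed) but does not match uuid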
+ + int32_t existing_id = -1; + int err = validate_osd_create(id, uuid, !is_recreate_destroyed, + &existing_id, ss); + + bool may_be_idempotent = false; + if (err == EEXIST) { + // this is idempotent from the osdmon's point-of-view + may_be_idempotent = true; + ceph_assert(existing_id >= 0); + id = existing_id; + } else if (err < 0) { + return err; + } + + if (!may_be_idempotent) { + // idempotency is out the window. We are either creating a new + // osd or recreating a destroyed osd. + // + // We now need to figure out if we have an `id` (and if it's valid), + // or find an `id` if we don't have one. + + // NOTE: we need to consider the case where the `id` is specified for + // `osd create`, and we must honor it. So this means checking if + // the `id` is destroyed, and if so assume the destroy; otherwise, + // check if it `exists` - in which case we complain about not being + // `destroyed`. In the end, if nothing fails, we must allow the + // creation, so that we are compatible with `create`. + if (id >= 0 && osdmap.exists(id) && !osdmap.is_destroyed(id)) { + dout(10) << __func__ << " osd." << id << " isn't destroyed" << dendl; + ss << "OSD " << id << " has not yet been destroyed"; + return -EINVAL; + } else if (id < 0) { + // find an `id` + id = _allocate_osd_id(&existing_id); + if (id < 0) { + ceph_assert(existing_id >= 0); + id = existing_id; + } + dout(10) << __func__ << " found id " << id << " to use" << dendl; + } else if (id >= 0 && osdmap.is_destroyed(id)) { + dout(10) << __func__ << " recreating osd." << id << dendl; + } else { + dout(10) << __func__ << " creating new osd." << id << dendl; + } + } else { + ceph_assert(id >= 0); + ceph_assert(osdmap.exists(id)); + } + + // we are now able to either create a brand new osd or reuse an existing + // osd that has been previously destroyed. + + dout(10) << __func__ << " id " << id << " uuid " << uuid << dendl; + + if (may_be_idempotent && params.empty()) { + // nothing to do, really. + dout(10) << __func__ << " idempotent and no params -- no op." 
<< dendl; + ceph_assert(id >= 0); + if (f) { + f->open_object_section("created_osd"); + f->dump_int("osdid", id); + f->close_section(); + } else { + ss << id; + } + return EEXIST; + } + + string device_class; + auto p = params.find("crush_device_class"); + if (p != params.end()) { + device_class = p->second; + dout(20) << __func__ << " device_class will be " << device_class << dendl; + } + string cephx_secret, lockbox_secret, dmcrypt_key; + bool has_lockbox = false; + bool has_secrets = params.count("cephx_secret") + || params.count("cephx_lockbox_secret") + || params.count("dmcrypt_key"); + + KVMonitor *svc = nullptr; + AuthMonitor::auth_entity_t cephx_entity, lockbox_entity; + + if (has_secrets) { + if (params.count("cephx_secret") == 0) { + ss << "requires a cephx secret."; + return -EINVAL; + } + cephx_secret = params.at("cephx_secret"); + + bool has_lockbox_secret = (params.count("cephx_lockbox_secret") > 0); + bool has_dmcrypt_key = (params.count("dmcrypt_key") > 0); + + dout(10) << __func__ << " has lockbox " << has_lockbox_secret + << " dmcrypt " << has_dmcrypt_key << dendl; + + if (has_lockbox_secret && has_dmcrypt_key) { + has_lockbox = true; + lockbox_secret = params.at("cephx_lockbox_secret"); + dmcrypt_key = params.at("dmcrypt_key"); + } else if (!has_lockbox_secret != !has_dmcrypt_key) { + ss << "requires both a cephx lockbox secret and a dm-crypt key."; + return -EINVAL; + } + + dout(10) << __func__ << " validate secrets using osd id " << id << dendl; + + err = mon.authmon()->validate_osd_new(id, uuid, + cephx_secret, + lockbox_secret, + cephx_entity, + lockbox_entity, + ss); + if (err < 0) { + return err; + } else if (may_be_idempotent && err != EEXIST) { + // for this to be idempotent, `id` should already be >= 0; no need + // to use validate_id. + ceph_assert(id >= 0); + ss << "osd." << id << " exists but secrets do not match"; + return -EEXIST; + } + + if (has_lockbox) { + svc = mon.kvmon(); + err = svc->validate_osd_new(uuid, dmcrypt_key, ss); + if (err < 0) { + return err; + } else if (may_be_idempotent && err != EEXIST) { + ceph_assert(id >= 0); + ss << "osd." << id << " exists but dm-crypt key does not match."; + return -EEXIST; + } + } + } + ceph_assert(!has_secrets || !cephx_secret.empty()); + ceph_assert(!has_lockbox || !lockbox_secret.empty()); + + if (may_be_idempotent) { + // we have nothing to do for either the osdmon or the authmon, + // and we have no lockbox - so the config key service will not be + // touched. This is therefore an idempotent operation, and we can + // just return right away. + dout(10) << __func__ << " idempotent -- no op." << dendl; + ceph_assert(id >= 0); + if (f) { + f->open_object_section("created_osd"); + f->dump_int("osdid", id); + f->close_section(); + } else { + ss << id; + } + return EEXIST; + } + ceph_assert(!may_be_idempotent); + + // perform updates. 
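+  // Editor's illustration (file contents and key values are hypothetical;
+  // the key names match the params checked above). The params map is
+  // normally fed from something like
+  //
+  //   ceph osd new <uuid> <id> -i secrets.json
+  //
+  // with secrets.json along the lines of:
+  //
+  //   { "cephx_secret": "AQBx...",
+  //     "cephx_lockbox_secret": "AQBy...",
+  //     "dmcrypt_key": "<base64 key>",
+  //     "crush_device_class": "ssd" }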
+ if (has_secrets) { + ceph_assert(!cephx_secret.empty()); + ceph_assert((lockbox_secret.empty() && dmcrypt_key.empty()) || + (!lockbox_secret.empty() && !dmcrypt_key.empty())); + + err = mon.authmon()->do_osd_new(cephx_entity, + lockbox_entity, + has_lockbox); + ceph_assert(0 == err); + + if (has_lockbox) { + ceph_assert(nullptr != svc); + svc->do_osd_new(uuid, dmcrypt_key); + } + } + + if (is_recreate_destroyed) { + ceph_assert(id >= 0); + ceph_assert(osdmap.is_destroyed(id)); + pending_inc.new_state[id] |= CEPH_OSD_DESTROYED; + if ((osdmap.get_state(id) & CEPH_OSD_NEW) == 0) { + pending_inc.new_state[id] |= CEPH_OSD_NEW; + } + if (osdmap.get_state(id) & CEPH_OSD_UP) { + // due to http://tracker.ceph.com/issues/20751 some clusters may + // have UP set for non-existent OSDs; make sure it is cleared + // for a newly created osd. + pending_inc.new_state[id] |= CEPH_OSD_UP; + } + pending_inc.new_uuid[id] = uuid; + } else { + ceph_assert(id >= 0); + int32_t new_id = -1; + do_osd_create(id, uuid, device_class, &new_id); + ceph_assert(new_id >= 0); + ceph_assert(id == new_id); + } + + if (f) { + f->open_object_section("created_osd"); + f->dump_int("osdid", id); + f->close_section(); + } else { + ss << id; + } + + return 0; +} + +bool OSDMonitor::prepare_command(MonOpRequestRef op) +{ + op->mark_osdmon_event(__func__); + auto m = op->get_req<MMonCommand>(); + stringstream ss; + cmdmap_t cmdmap; + if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) { + string rs = ss.str(); + mon.reply_command(op, -EINVAL, rs, get_last_committed()); + return true; + } + + MonSession *session = op->get_session(); + if (!session) { + derr << __func__ << " no session" << dendl; + mon.reply_command(op, -EACCES, "access denied", get_last_committed()); + return true; + } + + return prepare_command_impl(op, cmdmap); +} + +static int parse_reweights(CephContext *cct, + const cmdmap_t& cmdmap, + const OSDMap& osdmap, + map<int32_t, uint32_t>* weights) +{ + string weights_str; + if (!cmd_getval(cmdmap, "weights", weights_str)) { + return -EINVAL; + } + std::replace(begin(weights_str), end(weights_str), '\'', '"'); + json_spirit::mValue json_value; + if (!json_spirit::read(weights_str, json_value)) { + return -EINVAL; + } + if (json_value.type() != json_spirit::obj_type) { + return -EINVAL; + } + const auto obj = json_value.get_obj(); + try { + for (auto& osd_weight : obj) { + auto osd_id = std::stoi(osd_weight.first); + if (!osdmap.exists(osd_id)) { + return -ENOENT; + } + if (osd_weight.second.type() != json_spirit::str_type) { + return -EINVAL; + } + auto weight = std::stoul(osd_weight.second.get_str()); + weights->insert({osd_id, weight}); + } + } catch (const std::logic_error& e) { + return -EINVAL; + } + return 0; +} + +int OSDMonitor::prepare_command_osd_destroy( + int32_t id, + stringstream& ss) +{ + ceph_assert(paxos.is_plugged()); + + // we check if the osd exists for the benefit of `osd purge`, which may + // have previously removed the osd. If the osd does not exist, return + // -ENOENT to convey this, and let the caller deal with it. + // + // we presume that all auth secrets and config keys were removed prior + // to this command being called. if they exist by now, we also assume + // they must have been created by some other command and do not pertain + // to this non-existent osd. + if (!osdmap.exists(id)) { + dout(10) << __func__ << " osd." << id << " does not exist." << dendl; + return -ENOENT; + } + + uuid_d uuid = osdmap.get_uuid(id); + dout(10) << __func__ << " destroying osd." 
<< id + << " uuid " << uuid << dendl; + + // if it has been destroyed, we assume our work here is done. + if (osdmap.is_destroyed(id)) { + ss << "destroyed osd." << id; + return 0; + } + + EntityName cephx_entity, lockbox_entity; + bool idempotent_auth = false, idempotent_cks = false; + + int err = mon.authmon()->validate_osd_destroy(id, uuid, + cephx_entity, + lockbox_entity, + ss); + if (err < 0) { + if (err == -ENOENT) { + idempotent_auth = true; + } else { + return err; + } + } + + auto svc = mon.kvmon(); + err = svc->validate_osd_destroy(id, uuid); + if (err < 0) { + ceph_assert(err == -ENOENT); + err = 0; + idempotent_cks = true; + } + + if (!idempotent_auth) { + err = mon.authmon()->do_osd_destroy(cephx_entity, lockbox_entity); + ceph_assert(0 == err); + } + + if (!idempotent_cks) { + svc->do_osd_destroy(id, uuid); + } + + pending_inc.new_state[id] = CEPH_OSD_DESTROYED; + pending_inc.new_uuid[id] = uuid_d(); + + // we can only propose_pending() once per service, otherwise we'll be + // defying PaxosService and all laws of nature. Therefore, as we may + // be used during 'osd purge', let's keep the caller responsible for + // proposing. + ceph_assert(err == 0); + return 0; +} + +int OSDMonitor::prepare_command_osd_purge( + int32_t id, + stringstream& ss) +{ + ceph_assert(paxos.is_plugged()); + dout(10) << __func__ << " purging osd." << id << dendl; + + ceph_assert(!osdmap.is_up(id)); + + /* + * This may look a bit weird, but this is what's going to happen: + * + * 1. we make sure that removing from crush works + * 2. we call `prepare_command_osd_destroy()`. If it returns an + * error, then we abort the whole operation, as no updates + * have been made. However, we this function will have + * side-effects, thus we need to make sure that all operations + * performed henceforth will *always* succeed. + * 3. we call `prepare_command_osd_remove()`. Although this + * function can return an error, it currently only checks if the + * osd is up - and we have made sure that it is not so, so there + * is no conflict, and it is effectively an update. + * 4. finally, we call `do_osd_crush_remove()`, which will perform + * the crush update we delayed from before. + */ + + CrushWrapper newcrush; + _get_pending_crush(newcrush); + + bool may_be_idempotent = false; + + int err = _prepare_command_osd_crush_remove(newcrush, id, 0, false, false); + if (err == -ENOENT) { + err = 0; + may_be_idempotent = true; + } else if (err < 0) { + ss << "error removing osd." << id << " from crush"; + return err; + } + + // no point destroying the osd again if it has already been marked destroyed + if (!osdmap.is_destroyed(id)) { + err = prepare_command_osd_destroy(id, ss); + if (err < 0) { + if (err == -ENOENT) { + err = 0; + } else { + return err; + } + } else { + may_be_idempotent = false; + } + } + ceph_assert(0 == err); + + if (may_be_idempotent && !osdmap.exists(id)) { + dout(10) << __func__ << " osd." << id << " does not exist and " + << "we are idempotent." << dendl; + return -ENOENT; + } + + err = prepare_command_osd_remove(id); + // we should not be busy, as we should have made sure this id is not up. 
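+  // Editor's sketch of the caller-side contract (the caller code here is
+  // hypothetical): auth and kv updates above take effect through their own
+  // services, while the osdmap/crush changes only land when the caller
+  // proposes once, e.g.
+  //
+  //   int r = prepare_command_osd_purge(id, ss);
+  //   if (r == -ENOENT) { /* already purged: reply idempotent success */ }
+  //   else if (r == 0)  { /* propose the pending map exactly once */ }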
+ ceph_assert(0 == err); + + do_osd_crush_remove(newcrush); + return 0; +} + +bool OSDMonitor::prepare_command_impl(MonOpRequestRef op, + const cmdmap_t& cmdmap) +{ + op->mark_osdmon_event(__func__); + auto m = op->get_req<MMonCommand>(); + bool ret = false; + stringstream ss; + string rs; + bufferlist rdata; + int err = 0; + + string format; + cmd_getval(cmdmap, "format", format, string("plain")); + boost::scoped_ptr<Formatter> f(Formatter::create(format)); + + string prefix; + cmd_getval(cmdmap, "prefix", prefix); + + int64_t osdid; + string osd_name; + bool osdid_present = false; + if (prefix != "osd pg-temp" && + prefix != "osd pg-upmap" && + prefix != "osd pg-upmap-items") { // avoid commands with non-int id arg + osdid_present = cmd_getval(cmdmap, "id", osdid); + } + if (osdid_present) { + ostringstream oss; + oss << "osd." << osdid; + osd_name = oss.str(); + } + + // Even if there's a pending state with changes that could affect + // a command, considering that said state isn't yet committed, we + // just don't care about those changes if the command currently being + // handled acts as a no-op against the current committed state. + // In a nutshell, we assume this command happens *before*. + // + // Let me make this clearer: + // + // - If we have only one client, and that client issues some + // operation that would conflict with this operation but is + // still on the pending state, then we would be sure that said + // operation wouldn't have returned yet, so the client wouldn't + // issue this operation (unless the client didn't wait for the + // operation to finish, and that would be the client's own fault). + // + // - If we have more than one client, each client will observe + // whatever is the state at the moment of the commit. So, if we + // have two clients, one issuing an unlink and another issuing a + // link, and if the link happens while the unlink is still on the + // pending state, from the link's point-of-view this is a no-op. + // If different clients are issuing conflicting operations and + // they care about that, then the clients should make sure they + // enforce some kind of concurrency mechanism -- from our + // perspective that's what Douglas Adams would call an SEP. + // + // This should be used as a general guideline for most commands handled + // in this function. Adapt as you see fit, but please bear in mind that + // this is the expected behavior. + + + if (prefix == "osd setcrushmap" || + (prefix == "osd crush set" && !osdid_present)) { + if (pending_inc.crush.length()) { + dout(10) << __func__ << " waiting for pending crush update " << dendl; + wait_for_finished_proposal(op, new C_RetryMessage(this, op)); + return true; + } + dout(10) << "prepare_command setting new crush map" << dendl; + bufferlist data(m->get_data()); + CrushWrapper crush; + try { + auto bl = data.cbegin(); + crush.decode(bl); + } + catch (const std::exception &e) { + err = -EINVAL; + ss << "Failed to parse crushmap: " << e.what(); + goto reply; + } + + int64_t prior_version = 0; + if (cmd_getval(cmdmap, "prior_version", prior_version)) { + if (prior_version == osdmap.get_crush_version() - 1) { + // see if we are a resend of the last update. this is imperfect + // (multiple racing updaters may not both get reliable success) + // but we expect crush updaters (via this interface) to be rare-ish. 
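+	// Editor's illustration (the CLI form is a sketch) of the
+	// compare-and-swap style usage prior_version enables:
+	//
+	//   1. fetch the crush map and note its current version N
+	//   2. edit the map offline
+	//   3. ceph osd setcrushmap -i newmap N   # N passed as prior_version
+	//
+	// A pure resend of an identical map against N == current-1 succeeds
+	// here; a genuinely stale prior_version is rejected with -EPERM below.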
+ bufferlist current, proposed; + osdmap.crush->encode(current, mon.get_quorum_con_features()); + crush.encode(proposed, mon.get_quorum_con_features()); + if (current.contents_equal(proposed)) { + dout(10) << __func__ + << " proposed matches current and version equals previous" + << dendl; + err = 0; + ss << osdmap.get_crush_version(); + goto reply; + } + } + if (prior_version != osdmap.get_crush_version()) { + err = -EPERM; + ss << "prior_version " << prior_version << " != crush version " + << osdmap.get_crush_version(); + goto reply; + } + } + + if (crush.has_legacy_rule_ids()) { + err = -EINVAL; + ss << "crush maps with ruleset != ruleid are no longer allowed"; + goto reply; + } + if (!validate_crush_against_features(&crush, ss)) { + err = -EINVAL; + goto reply; + } + + err = osdmap.validate_crush_rules(&crush, &ss); + if (err < 0) { + goto reply; + } + + if (g_conf()->mon_osd_crush_smoke_test) { + // sanity check: test some inputs to make sure this map isn't + // totally broken + dout(10) << " testing map" << dendl; + stringstream ess; + CrushTester tester(crush, ess); + tester.set_min_x(0); + tester.set_max_x(50); + auto start = ceph::coarse_mono_clock::now(); + int r = tester.test_with_fork(g_conf()->mon_lease); + auto duration = ceph::coarse_mono_clock::now() - start; + if (r < 0) { + dout(10) << " tester.test_with_fork returns " << r + << ": " << ess.str() << dendl; + ss << "crush smoke test failed with " << r << ": " << ess.str(); + err = r; + goto reply; + } + dout(10) << __func__ << " crush smoke test duration: " + << duration << ", result: " << ess.str() << dendl; + } + + pending_inc.crush = data; + ss << osdmap.get_crush_version() + 1; + goto update; + + } else if (prefix == "osd crush set-all-straw-buckets-to-straw2") { + CrushWrapper newcrush; + _get_pending_crush(newcrush); + for (int b = 0; b < newcrush.get_max_buckets(); ++b) { + int bid = -1 - b; + if (newcrush.bucket_exists(bid) && + newcrush.get_bucket_alg(bid) == CRUSH_BUCKET_STRAW) { + dout(20) << " bucket " << bid << " is straw, can convert" << dendl; + newcrush.bucket_set_alg(bid, CRUSH_BUCKET_STRAW2); + } + } + if (!validate_crush_against_features(&newcrush, ss)) { + err = -EINVAL; + goto reply; + } + pending_inc.crush.clear(); + newcrush.encode(pending_inc.crush, mon.get_quorum_con_features()); + wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs, + get_last_committed() + 1)); + return true; + } else if (prefix == "osd crush set-device-class") { + string device_class; + if (!cmd_getval(cmdmap, "class", device_class)) { + err = -EINVAL; // no value! + goto reply; + } + + bool stop = false; + vector<string> idvec; + cmd_getval(cmdmap, "ids", idvec); + CrushWrapper newcrush; + _get_pending_crush(newcrush); + set<int> updated; + for (unsigned j = 0; j < idvec.size() && !stop; j++) { + set<int> osds; + // wildcard? + if (j == 0 && + (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) { + osdmap.get_all_osds(osds); + stop = true; + } else { + // try traditional single osd way + long osd = parse_osd_id(idvec[j].c_str(), &ss); + if (osd < 0) { + // ss has reason for failure + ss << ", unable to parse osd id:\"" << idvec[j] << "\". "; + err = -EINVAL; + continue; + } + osds.insert(osd); + } + + for (auto &osd : osds) { + if (!osdmap.exists(osd)) { + ss << "osd." << osd << " does not exist. "; + continue; + } + + ostringstream oss; + oss << "osd." 
<< osd; + string name = oss.str(); + + if (newcrush.get_max_devices() < osd + 1) { + newcrush.set_max_devices(osd + 1); + } + string action; + if (newcrush.item_exists(osd)) { + action = "updating"; + } else { + action = "creating"; + newcrush.set_item_name(osd, name); + } + + dout(5) << action << " crush item id " << osd << " name '" << name + << "' device_class '" << device_class << "'" + << dendl; + err = newcrush.update_device_class(osd, device_class, name, &ss); + if (err < 0) { + goto reply; + } + if (err == 0 && !_have_pending_crush()) { + if (!stop) { + // for single osd only, wildcard makes too much noise + ss << "set-device-class item id " << osd << " name '" << name + << "' device_class '" << device_class << "': no change. "; + } + } else { + updated.insert(osd); + } + } + } + + pending_inc.crush.clear(); + newcrush.encode(pending_inc.crush, mon.get_quorum_con_features()); + ss << "set osd(s) " << updated << " to class '" << device_class << "'"; + getline(ss, rs); + wait_for_finished_proposal( + op, + new Monitor::C_Command(mon,op, 0, rs, get_last_committed() + 1)); + return true; + } else if (prefix == "osd crush rm-device-class") { + bool stop = false; + vector<string> idvec; + cmd_getval(cmdmap, "ids", idvec); + CrushWrapper newcrush; + _get_pending_crush(newcrush); + set<int> updated; + + for (unsigned j = 0; j < idvec.size() && !stop; j++) { + set<int> osds; + + // wildcard? + if (j == 0 && + (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) { + osdmap.get_all_osds(osds); + stop = true; + } else { + // try traditional single osd way + long osd = parse_osd_id(idvec[j].c_str(), &ss); + if (osd < 0) { + // ss has reason for failure + ss << ", unable to parse osd id:\"" << idvec[j] << "\". "; + err = -EINVAL; + goto reply; + } + osds.insert(osd); + } + + for (auto &osd : osds) { + if (!osdmap.exists(osd)) { + ss << "osd." << osd << " does not exist. "; + continue; + } + + auto class_name = newcrush.get_item_class(osd); + if (!class_name) { + ss << "osd." << osd << " belongs to no class, "; + continue; + } + // note that we do not verify if class_is_in_use here + // in case the device is misclassified and the user wants + // to forcibly reset it... + + err = newcrush.remove_device_class(cct, osd, &ss); + if (err < 0) { + // ss has reason for failure + goto reply; + } + updated.insert(osd); + } + } + + pending_inc.crush.clear(); + newcrush.encode(pending_inc.crush, mon.get_quorum_con_features()); + ss << "done removing class of osd(s): " << updated; + getline(ss, rs); + wait_for_finished_proposal( + op, + new Monitor::C_Command(mon,op, 0, rs, get_last_committed() + 1)); + return true; + } else if (prefix == "osd crush class create") { + string device_class; + if (!cmd_getval(cmdmap, "class", device_class)) { + err = -EINVAL; // no value! 
+ goto reply; + } + if (osdmap.require_osd_release < ceph_release_t::luminous) { + ss << "you must complete the upgrade and 'ceph osd require-osd-release " + << "luminous' before using crush device classes"; + err = -EPERM; + goto reply; + } + if (!_have_pending_crush() && + _get_stable_crush().class_exists(device_class)) { + ss << "class '" << device_class << "' already exists"; + goto reply; + } + CrushWrapper newcrush; + _get_pending_crush(newcrush); + if (newcrush.class_exists(device_class)) { + ss << "class '" << device_class << "' already exists"; + goto update; + } + int class_id = newcrush.get_or_create_class_id(device_class); + pending_inc.crush.clear(); + newcrush.encode(pending_inc.crush, mon.get_quorum_con_features()); + ss << "created class " << device_class << " with id " << class_id + << " in crush map"; + goto update; + } else if (prefix == "osd crush class rm") { + string device_class; + if (!cmd_getval(cmdmap, "class", device_class)) { + err = -EINVAL; // no value! + goto reply; + } + if (osdmap.require_osd_release < ceph_release_t::luminous) { + ss << "you must complete the upgrade and 'ceph osd require-osd-release " + << "luminous' before using crush device classes"; + err = -EPERM; + goto reply; + } + + if (!osdmap.crush->class_exists(device_class)) { + err = 0; + goto reply; + } + + CrushWrapper newcrush; + _get_pending_crush(newcrush); + if (!newcrush.class_exists(device_class)) { + err = 0; // make command idempotent + goto wait; + } + int class_id = newcrush.get_class_id(device_class); + stringstream ts; + if (newcrush.class_is_in_use(class_id, &ts)) { + err = -EBUSY; + ss << "class '" << device_class << "' " << ts.str(); + goto reply; + } + + // check if class is used by any erasure-code-profiles + mempool::osdmap::map<string,map<string,string>> old_ec_profiles = + osdmap.get_erasure_code_profiles(); + auto ec_profiles = pending_inc.get_erasure_code_profiles(); +#ifdef HAVE_STDLIB_MAP_SPLICING + ec_profiles.merge(old_ec_profiles); +#else + ec_profiles.insert(make_move_iterator(begin(old_ec_profiles)), + make_move_iterator(end(old_ec_profiles))); +#endif + list<string> referenced_by; + for (auto &i: ec_profiles) { + for (auto &j: i.second) { + if ("crush-device-class" == j.first && device_class == j.second) { + referenced_by.push_back(i.first); + } + } + } + if (!referenced_by.empty()) { + err = -EBUSY; + ss << "class '" << device_class + << "' is still referenced by erasure-code-profile(s): " << referenced_by; + goto reply; + } + + set<int> osds; + newcrush.get_devices_by_class(device_class, &osds); + for (auto& p: osds) { + err = newcrush.remove_device_class(g_ceph_context, p, &ss); + if (err < 0) { + // ss has reason for failure + goto reply; + } + } + + if (osds.empty()) { + // empty class, remove directly + err = newcrush.remove_class_name(device_class); + if (err < 0) { + ss << "class '" << device_class << "' cannot be removed: '" + << cpp_strerror(err) << "'"; + goto reply; + } + } + + pending_inc.crush.clear(); + newcrush.encode(pending_inc.crush, mon.get_quorum_con_features()); + ss << "removed class " << device_class << " with id " << class_id + << " from crush map"; + goto update; + } else if (prefix == "osd crush class rename") { + string srcname, dstname; + if (!cmd_getval(cmdmap, "srcname", srcname)) { + err = -EINVAL; + goto reply; + } + if (!cmd_getval(cmdmap, "dstname", dstname)) { + err = -EINVAL; + goto reply; + } + + CrushWrapper newcrush; + _get_pending_crush(newcrush); + if (!newcrush.class_exists(srcname) && newcrush.class_exists(dstname)) { + 
// suppose this is a replay and return success + // so command is idempotent + ss << "already renamed to '" << dstname << "'"; + err = 0; + goto reply; + } + + err = newcrush.rename_class(srcname, dstname); + if (err < 0) { + ss << "failed to rename '" << srcname << "' to '" << dstname << "' : " + << cpp_strerror(err); + goto reply; + } + + pending_inc.crush.clear(); + newcrush.encode(pending_inc.crush, mon.get_quorum_con_features()); + ss << "renamed class '" << srcname << "' to '" << dstname << "'"; + goto update; + } else if (prefix == "osd crush add-bucket") { + // osd crush add-bucket <name> <type> + string name, typestr; + vector<string> argvec; + cmd_getval(cmdmap, "name", name); + cmd_getval(cmdmap, "type", typestr); + cmd_getval(cmdmap, "args", argvec); + map<string,string> loc; + if (!argvec.empty()) { + CrushWrapper::parse_loc_map(argvec, &loc); + dout(0) << "will create and move bucket '" << name + << "' to location " << loc << dendl; + } + + if (!_have_pending_crush() && + _get_stable_crush().name_exists(name)) { + ss << "bucket '" << name << "' already exists"; + goto reply; + } + + CrushWrapper newcrush; + _get_pending_crush(newcrush); + + if (newcrush.name_exists(name)) { + ss << "bucket '" << name << "' already exists"; + goto update; + } + int type = newcrush.get_type_id(typestr); + if (type < 0) { + ss << "type '" << typestr << "' does not exist"; + err = -EINVAL; + goto reply; + } + if (type == 0) { + ss << "type '" << typestr << "' is for devices, not buckets"; + err = -EINVAL; + goto reply; + } + int bucketno; + err = newcrush.add_bucket(0, 0, + CRUSH_HASH_DEFAULT, type, 0, NULL, + NULL, &bucketno); + if (err < 0) { + ss << "add_bucket error: '" << cpp_strerror(err) << "'"; + goto reply; + } + err = newcrush.set_item_name(bucketno, name); + if (err < 0) { + ss << "error setting bucket name to '" << name << "'"; + goto reply; + } + + if (!loc.empty()) { + if (!newcrush.check_item_loc(cct, bucketno, loc, + (int *)NULL)) { + err = newcrush.move_bucket(cct, bucketno, loc); + if (err < 0) { + ss << "error moving bucket '" << name << "' to location " << loc; + goto reply; + } + } else { + ss << "no need to move item id " << bucketno << " name '" << name + << "' to location " << loc << " in crush map"; + } + } + + pending_inc.crush.clear(); + newcrush.encode(pending_inc.crush, mon.get_quorum_con_features()); + if (loc.empty()) { + ss << "added bucket " << name << " type " << typestr + << " to crush map"; + } else { + ss << "added bucket " << name << " type " << typestr + << " to location " << loc; + } + goto update; + } else if (prefix == "osd crush rename-bucket") { + string srcname, dstname; + cmd_getval(cmdmap, "srcname", srcname); + cmd_getval(cmdmap, "dstname", dstname); + + err = crush_rename_bucket(srcname, dstname, &ss); + if (err == -EALREADY) // equivalent to success for idempotency + err = 0; + if (err) + goto reply; + else + goto update; + } else if (prefix == "osd crush weight-set create" || + prefix == "osd crush weight-set create-compat") { + CrushWrapper newcrush; + _get_pending_crush(newcrush); + int64_t pool; + int positions; + if (newcrush.has_non_straw2_buckets()) { + ss << "crush map contains one or more bucket(s) that are not straw2"; + err = -EPERM; + goto reply; + } + if (prefix == "osd crush weight-set create") { + if (osdmap.require_min_compat_client != ceph_release_t::unknown && + osdmap.require_min_compat_client < ceph_release_t::luminous) { + ss << "require_min_compat_client " + << osdmap.require_min_compat_client + << " < luminous, which is required 
for per-pool weight-sets. " + << "Try 'ceph osd set-require-min-compat-client luminous' " + << "before using the new interface"; + err = -EPERM; + goto reply; + } + string poolname, mode; + cmd_getval(cmdmap, "pool", poolname); + pool = osdmap.lookup_pg_pool_name(poolname.c_str()); + if (pool < 0) { + ss << "pool '" << poolname << "' not found"; + err = -ENOENT; + goto reply; + } + cmd_getval(cmdmap, "mode", mode); + if (mode != "flat" && mode != "positional") { + ss << "unrecognized weight-set mode '" << mode << "'"; + err = -EINVAL; + goto reply; + } + positions = mode == "flat" ? 1 : osdmap.get_pg_pool(pool)->get_size(); + } else { + pool = CrushWrapper::DEFAULT_CHOOSE_ARGS; + positions = 1; + } + if (!newcrush.create_choose_args(pool, positions)) { + if (pool == CrushWrapper::DEFAULT_CHOOSE_ARGS) { + ss << "compat weight-set already created"; + } else { + ss << "weight-set for pool '" << osdmap.get_pool_name(pool) + << "' already created"; + } + goto reply; + } + pending_inc.crush.clear(); + newcrush.encode(pending_inc.crush, mon.get_quorum_con_features()); + goto update; + + } else if (prefix == "osd crush weight-set rm" || + prefix == "osd crush weight-set rm-compat") { + CrushWrapper newcrush; + _get_pending_crush(newcrush); + int64_t pool; + if (prefix == "osd crush weight-set rm") { + string poolname; + cmd_getval(cmdmap, "pool", poolname); + pool = osdmap.lookup_pg_pool_name(poolname.c_str()); + if (pool < 0) { + ss << "pool '" << poolname << "' not found"; + err = -ENOENT; + goto reply; + } + } else { + pool = CrushWrapper::DEFAULT_CHOOSE_ARGS; + } + newcrush.rm_choose_args(pool); + pending_inc.crush.clear(); + newcrush.encode(pending_inc.crush, mon.get_quorum_con_features()); + goto update; + + } else if (prefix == "osd crush weight-set reweight" || + prefix == "osd crush weight-set reweight-compat") { + string poolname, item; + vector<double> weight; + cmd_getval(cmdmap, "pool", poolname); + cmd_getval(cmdmap, "item", item); + cmd_getval(cmdmap, "weight", weight); + CrushWrapper newcrush; + _get_pending_crush(newcrush); + int64_t pool; + if (prefix == "osd crush weight-set reweight") { + pool = osdmap.lookup_pg_pool_name(poolname.c_str()); + if (pool < 0) { + ss << "pool '" << poolname << "' not found"; + err = -ENOENT; + goto reply; + } + if (!newcrush.have_choose_args(pool)) { + ss << "no weight-set for pool '" << poolname << "'"; + err = -ENOENT; + goto reply; + } + auto arg_map = newcrush.choose_args_get(pool); + int positions = newcrush.get_choose_args_positions(arg_map); + if (weight.size() != (size_t)positions) { + ss << "must specify exactly " << positions << " weight values"; + err = -EINVAL; + goto reply; + } + } else { + pool = CrushWrapper::DEFAULT_CHOOSE_ARGS; + if (!newcrush.have_choose_args(pool)) { + ss << "no backward-compatible weight-set"; + err = -ENOENT; + goto reply; + } + } + if (!newcrush.name_exists(item)) { + ss << "item '" << item << "' does not exist"; + err = -ENOENT; + goto reply; + } + err = newcrush.choose_args_adjust_item_weightf( + cct, + newcrush.choose_args_get(pool), + newcrush.get_item_id(item), + weight, + &ss); + if (err < 0) { + goto reply; + } + err = 0; + pending_inc.crush.clear(); + newcrush.encode(pending_inc.crush, mon.get_quorum_con_features()); + goto update; + } else if (osdid_present && + (prefix == "osd crush set" || prefix == "osd crush add")) { + // <OsdName> is 'osd.<id>' or '<id>', passed as int64_t id + // osd crush set <OsdName> <weight> <loc1> [<loc2> ...] + // osd crush add <OsdName> <weight> <loc1> [<loc2> ...] 
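+    // Editor's example (names are hypothetical): the trailing args are
+    // key=value CRUSH location pairs handled by parse_loc_map(), so
+    //
+    //   ceph osd crush set osd.0 1.0 root=default host=node1
+    //
+    // arrives here as weight = 1.0 and loc = {root: default, host: node1}.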
+ + if (!osdmap.exists(osdid)) { + err = -ENOENT; + ss << osd_name + << " does not exist. Create it before updating the crush map"; + goto reply; + } + + double weight; + if (!cmd_getval(cmdmap, "weight", weight)) { + ss << "unable to parse weight value '" + << cmd_vartype_stringify(cmdmap.at("weight")) << "'"; + err = -EINVAL; + goto reply; + } + + string args; + vector<string> argvec; + cmd_getval(cmdmap, "args", argvec); + map<string,string> loc; + CrushWrapper::parse_loc_map(argvec, &loc); + + if (prefix == "osd crush set" + && !_get_stable_crush().item_exists(osdid)) { + err = -ENOENT; + ss << "unable to set item id " << osdid << " name '" << osd_name + << "' weight " << weight << " at location " << loc + << ": does not exist"; + goto reply; + } + + dout(5) << "adding/updating crush item id " << osdid << " name '" + << osd_name << "' weight " << weight << " at location " + << loc << dendl; + CrushWrapper newcrush; + _get_pending_crush(newcrush); + + string action; + if (prefix == "osd crush set" || + newcrush.check_item_loc(cct, osdid, loc, (int *)NULL)) { + action = "set"; + err = newcrush.update_item(cct, osdid, weight, osd_name, loc); + } else { + action = "add"; + err = newcrush.insert_item(cct, osdid, weight, osd_name, loc); + if (err == 0) + err = 1; + } + + if (err < 0) + goto reply; + + if (err == 0 && !_have_pending_crush()) { + ss << action << " item id " << osdid << " name '" << osd_name + << "' weight " << weight << " at location " << loc << ": no change"; + goto reply; + } + + pending_inc.crush.clear(); + newcrush.encode(pending_inc.crush, mon.get_quorum_con_features()); + ss << action << " item id " << osdid << " name '" << osd_name << "' weight " + << weight << " at location " << loc << " to crush map"; + getline(ss, rs); + wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs, + get_last_committed() + 1)); + return true; + + } else if (prefix == "osd crush create-or-move") { + do { + // osd crush create-or-move <OsdName> <initial_weight> <loc1> [<loc2> ...] + if (!osdmap.exists(osdid)) { + err = -ENOENT; + ss << osd_name + << " does not exist. create it before updating the crush map"; + goto reply; + } + + double weight; + if (!cmd_getval(cmdmap, "weight", weight)) { + ss << "unable to parse weight value '" + << cmd_vartype_stringify(cmdmap.at("weight")) << "'"; + err = -EINVAL; + goto reply; + } + + string args; + vector<string> argvec; + cmd_getval(cmdmap, "args", argvec); + map<string,string> loc; + CrushWrapper::parse_loc_map(argvec, &loc); + + dout(0) << "create-or-move crush item name '" << osd_name + << "' initial_weight " << weight << " at location " << loc + << dendl; + + CrushWrapper newcrush; + _get_pending_crush(newcrush); + + err = newcrush.create_or_move_item(cct, osdid, weight, osd_name, loc, + g_conf()->osd_crush_update_weight_set); + if (err == 0) { + ss << "create-or-move updated item name '" << osd_name + << "' weight " << weight + << " at location " << loc << " to crush map"; + break; + } + if (err > 0) { + pending_inc.crush.clear(); + newcrush.encode(pending_inc.crush, mon.get_quorum_con_features()); + ss << "create-or-move updating item name '" << osd_name + << "' weight " << weight + << " at location " << loc << " to crush map"; + getline(ss, rs); + wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs, + get_last_committed() + 1)); + return true; + } + } while (false); + + } else if (prefix == "osd crush move") { + do { + // osd crush move <name> <loc1> [<loc2> ...] 
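+      // Editor's note (sketch): CRUSH item ids are signed -- devices
+      // (osds) are >= 0 while buckets are negative -- so the dispatch
+      // below on the sign of get_item_id() picks the right primitive:
+      //
+      //   "osd.3" -> id 3          -> create_or_move_item()
+      //   "rack1" -> id -7 (say)   -> move_bucket()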
+ string name; + vector<string> argvec; + cmd_getval(cmdmap, "name", name); + cmd_getval(cmdmap, "args", argvec); + map<string,string> loc; + CrushWrapper::parse_loc_map(argvec, &loc); + + dout(0) << "moving crush item name '" << name << "' to location " << loc << dendl; + CrushWrapper newcrush; + _get_pending_crush(newcrush); + + if (!newcrush.name_exists(name)) { + err = -ENOENT; + ss << "item " << name << " does not exist"; + break; + } + int id = newcrush.get_item_id(name); + + if (!newcrush.check_item_loc(cct, id, loc, (int *)NULL)) { + if (id >= 0) { + err = newcrush.create_or_move_item( + cct, id, 0, name, loc, + g_conf()->osd_crush_update_weight_set); + } else { + err = newcrush.move_bucket(cct, id, loc); + } + if (err >= 0) { + ss << "moved item id " << id << " name '" << name << "' to location " << loc << " in crush map"; + pending_inc.crush.clear(); + newcrush.encode(pending_inc.crush, mon.get_quorum_con_features()); + getline(ss, rs); + wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs, + get_last_committed() + 1)); + return true; + } + } else { + ss << "no need to move item id " << id << " name '" << name << "' to location " << loc << " in crush map"; + err = 0; + } + } while (false); + } else if (prefix == "osd crush swap-bucket") { + string source, dest; + cmd_getval(cmdmap, "source", source); + cmd_getval(cmdmap, "dest", dest); + + bool force = false; + cmd_getval(cmdmap, "yes_i_really_mean_it", force); + + CrushWrapper newcrush; + _get_pending_crush(newcrush); + if (!newcrush.name_exists(source)) { + ss << "source item " << source << " does not exist"; + err = -ENOENT; + goto reply; + } + if (!newcrush.name_exists(dest)) { + ss << "dest item " << dest << " does not exist"; + err = -ENOENT; + goto reply; + } + int sid = newcrush.get_item_id(source); + int did = newcrush.get_item_id(dest); + int sparent; + if (newcrush.get_immediate_parent_id(sid, &sparent) == 0 && !force) { + ss << "source item " << source << " is not an orphan bucket; pass --yes-i-really-mean-it to proceed anyway"; + err = -EPERM; + goto reply; + } + if (newcrush.get_bucket_alg(sid) != newcrush.get_bucket_alg(did) && + !force) { + ss << "source bucket alg " << crush_alg_name(newcrush.get_bucket_alg(sid)) << " != " + << "dest bucket alg " << crush_alg_name(newcrush.get_bucket_alg(did)) + << "; pass --yes-i-really-mean-it to proceed anyway"; + err = -EPERM; + goto reply; + } + int r = newcrush.swap_bucket(cct, sid, did); + if (r < 0) { + ss << "failed to swap bucket contents: " << cpp_strerror(r); + err = r; + goto reply; + } + ss << "swapped bucket of " << source << " to " << dest; + pending_inc.crush.clear(); + newcrush.encode(pending_inc.crush, mon.get_quorum_con_features()); + wait_for_finished_proposal(op, + new Monitor::C_Command(mon, op, err, ss.str(), + get_last_committed() + 1)); + return true; + } else if (prefix == "osd crush link") { + // osd crush link <name> <loc1> [<loc2> ...] + string name; + cmd_getval(cmdmap, "name", name); + vector<string> argvec; + cmd_getval(cmdmap, "args", argvec); + map<string,string> loc; + CrushWrapper::parse_loc_map(argvec, &loc); + + // Need an explicit check for name_exists because get_item_id returns + // 0 on unfound. 
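+  // For example, get_item_id() maps both "osd.0" and an unknown name to
+  // id 0, so only name_exists() can tell those two cases apart.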
+ int id = osdmap.crush->get_item_id(name); + if (!osdmap.crush->name_exists(name)) { + err = -ENOENT; + ss << "item " << name << " does not exist"; + goto reply; + } else { + dout(5) << "resolved crush name '" << name << "' to id " << id << dendl; + } + if (osdmap.crush->check_item_loc(cct, id, loc, (int*) NULL)) { + ss << "no need to move item id " << id << " name '" << name + << "' to location " << loc << " in crush map"; + err = 0; + goto reply; + } + + dout(5) << "linking crush item name '" << name << "' at location " << loc << dendl; + CrushWrapper newcrush; + _get_pending_crush(newcrush); + + if (!newcrush.name_exists(name)) { + err = -ENOENT; + ss << "item " << name << " does not exist"; + goto reply; + } else { + int id = newcrush.get_item_id(name); + if (!newcrush.check_item_loc(cct, id, loc, (int *)NULL)) { + err = newcrush.link_bucket(cct, id, loc); + if (err >= 0) { + ss << "linked item id " << id << " name '" << name + << "' to location " << loc << " in crush map"; + pending_inc.crush.clear(); + newcrush.encode(pending_inc.crush, mon.get_quorum_con_features()); + } else { + ss << "cannot link item id " << id << " name '" << name + << "' to location " << loc; + goto reply; + } + } else { + ss << "no need to move item id " << id << " name '" << name + << "' to location " << loc << " in crush map"; + err = 0; + } + } + wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, ss.str(), + get_last_committed() + 1)); + return true; + } else if (prefix == "osd crush rm" || + prefix == "osd crush remove" || + prefix == "osd crush unlink") { + do { + // osd crush rm <id> [ancestor] + CrushWrapper newcrush; + _get_pending_crush(newcrush); + + string name; + cmd_getval(cmdmap, "name", name); + + if (!osdmap.crush->name_exists(name)) { + err = 0; + ss << "device '" << name << "' does not appear in the crush map"; + break; + } + if (!newcrush.name_exists(name)) { + err = 0; + ss << "device '" << name << "' does not appear in the crush map"; + getline(ss, rs); + wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs, + get_last_committed() + 1)); + return true; + } + int id = newcrush.get_item_id(name); + int ancestor = 0; + + bool unlink_only = prefix == "osd crush unlink"; + string ancestor_str; + if (cmd_getval(cmdmap, "ancestor", ancestor_str)) { + if (!newcrush.name_exists(ancestor_str)) { + err = -ENOENT; + ss << "ancestor item '" << ancestor_str + << "' does not appear in the crush map"; + break; + } + ancestor = newcrush.get_item_id(ancestor_str); + } + + err = prepare_command_osd_crush_remove( + newcrush, + id, ancestor, + (ancestor < 0), unlink_only); + + if (err == -ENOENT) { + ss << "item " << id << " does not appear in that position"; + err = 0; + break; + } + if (err == 0) { + if (!unlink_only) + pending_inc.new_crush_node_flags[id] = 0; + ss << "removed item id " << id << " name '" << name << "' from crush map"; + getline(ss, rs); + wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs, + get_last_committed() + 1)); + return true; + } + } while (false); + + } else if (prefix == "osd crush reweight-all") { + CrushWrapper newcrush; + _get_pending_crush(newcrush); + + newcrush.reweight(cct); + pending_inc.crush.clear(); + newcrush.encode(pending_inc.crush, mon.get_quorum_con_features()); + ss << "reweighted crush hierarchy"; + getline(ss, rs); + wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs, + get_last_committed() + 1)); + return true; + } else if (prefix == "osd crush reweight") { + // osd crush reweight <name> 
<weight> + CrushWrapper newcrush; + _get_pending_crush(newcrush); + + string name; + cmd_getval(cmdmap, "name", name); + if (!newcrush.name_exists(name)) { + err = -ENOENT; + ss << "device '" << name << "' does not appear in the crush map"; + goto reply; + } + + int id = newcrush.get_item_id(name); + if (id < 0) { + ss << "device '" << name << "' is not a leaf in the crush map"; + err = -EINVAL; + goto reply; + } + double w; + if (!cmd_getval(cmdmap, "weight", w)) { + ss << "unable to parse weight value '" + << cmd_vartype_stringify(cmdmap.at("weight")) << "'"; + err = -EINVAL; + goto reply; + } + + err = newcrush.adjust_item_weightf(cct, id, w, + g_conf()->osd_crush_update_weight_set); + if (err < 0) + goto reply; + pending_inc.crush.clear(); + newcrush.encode(pending_inc.crush, mon.get_quorum_con_features()); + ss << "reweighted item id " << id << " name '" << name << "' to " << w + << " in crush map"; + getline(ss, rs); + wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs, + get_last_committed() + 1)); + return true; + } else if (prefix == "osd crush reweight-subtree") { + // osd crush reweight <name> <weight> + CrushWrapper newcrush; + _get_pending_crush(newcrush); + + string name; + cmd_getval(cmdmap, "name", name); + if (!newcrush.name_exists(name)) { + err = -ENOENT; + ss << "device '" << name << "' does not appear in the crush map"; + goto reply; + } + + int id = newcrush.get_item_id(name); + if (id >= 0) { + ss << "device '" << name << "' is not a subtree in the crush map"; + err = -EINVAL; + goto reply; + } + double w; + if (!cmd_getval(cmdmap, "weight", w)) { + ss << "unable to parse weight value '" + << cmd_vartype_stringify(cmdmap.at("weight")) << "'"; + err = -EINVAL; + goto reply; + } + + err = newcrush.adjust_subtree_weightf(cct, id, w, + g_conf()->osd_crush_update_weight_set); + if (err < 0) + goto reply; + pending_inc.crush.clear(); + newcrush.encode(pending_inc.crush, mon.get_quorum_con_features()); + ss << "reweighted subtree id " << id << " name '" << name << "' to " << w + << " in crush map"; + getline(ss, rs); + wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs, + get_last_committed() + 1)); + return true; + } else if (prefix == "osd crush tunables") { + CrushWrapper newcrush; + _get_pending_crush(newcrush); + + err = 0; + string profile; + cmd_getval(cmdmap, "profile", profile); + if (profile == "legacy" || profile == "argonaut") { + newcrush.set_tunables_legacy(); + } else if (profile == "bobtail") { + newcrush.set_tunables_bobtail(); + } else if (profile == "firefly") { + newcrush.set_tunables_firefly(); + } else if (profile == "hammer") { + newcrush.set_tunables_hammer(); + } else if (profile == "jewel") { + newcrush.set_tunables_jewel(); + } else if (profile == "optimal") { + newcrush.set_tunables_optimal(); + } else if (profile == "default") { + newcrush.set_tunables_default(); + } else { + ss << "unrecognized profile '" << profile << "'"; + err = -EINVAL; + goto reply; + } + + if (!validate_crush_against_features(&newcrush, ss)) { + err = -EINVAL; + goto reply; + } + + pending_inc.crush.clear(); + newcrush.encode(pending_inc.crush, mon.get_quorum_con_features()); + ss << "adjusted tunables profile to " << profile; + getline(ss, rs); + wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs, + get_last_committed() + 1)); + return true; + } else if (prefix == "osd crush set-tunable") { + CrushWrapper newcrush; + _get_pending_crush(newcrush); + + err = 0; + string tunable; + cmd_getval(cmdmap, "tunable", tunable); 
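+    // Only "straw_calc_version" is accepted below; illustrative usage:
+    //
+    //   ceph osd crush set-tunable straw_calc_version 1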
+
+    int64_t value = -1;
+    if (!cmd_getval(cmdmap, "value", value)) {
+      err = -EINVAL;
+      ss << "failed to parse integer value "
+         << cmd_vartype_stringify(cmdmap.at("value"));
+      goto reply;
+    }
+
+    if (tunable == "straw_calc_version") {
+      if (value != 0 && value != 1) {
+        ss << "value must be 0 or 1; got " << value;
+        err = -EINVAL;
+        goto reply;
+      }
+      newcrush.set_straw_calc_version(value);
+    } else {
+      ss << "unrecognized tunable '" << tunable << "'";
+      err = -EINVAL;
+      goto reply;
+    }
+
+    if (!validate_crush_against_features(&newcrush, ss)) {
+      err = -EINVAL;
+      goto reply;
+    }
+
+    pending_inc.crush.clear();
+    newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
+    ss << "adjusted tunable " << tunable << " to " << value;
+    getline(ss, rs);
+    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
+					      get_last_committed() + 1));
+    return true;
+
+  } else if (prefix == "osd crush rule create-simple") {
+    string name, root, type, mode;
+    cmd_getval(cmdmap, "name", name);
+    cmd_getval(cmdmap, "root", root);
+    cmd_getval(cmdmap, "type", type);
+    cmd_getval(cmdmap, "mode", mode);
+    if (mode == "")
+      mode = "firstn";
+
+    if (osdmap.crush->rule_exists(name)) {
+      // The name is uniquely associated with a ruleid and the rule it contains.
+      // From the user's point of view, the name is the more meaningful identifier.
+      ss << "rule " << name << " already exists";
+      err = 0;
+      goto reply;
+    }
+
+    CrushWrapper newcrush;
+    _get_pending_crush(newcrush);
+
+    if (newcrush.rule_exists(name)) {
+      // The name is uniquely associated with a ruleid and the rule it contains.
+      // From the user's point of view, the name is the more meaningful identifier.
+      ss << "rule " << name << " already exists";
+      err = 0;
+    } else {
+      int ruleno = newcrush.add_simple_rule(name, root, type, "", mode,
+                                            pg_pool_t::TYPE_REPLICATED, &ss);
+      if (ruleno < 0) {
+        err = ruleno;
+        goto reply;
+      }
+
+      pending_inc.crush.clear();
+      newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
+    }
+    getline(ss, rs);
+    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
+					      get_last_committed() + 1));
+    return true;
+
+  } else if (prefix == "osd crush rule create-replicated") {
+    string name, root, type, device_class;
+    cmd_getval(cmdmap, "name", name);
+    cmd_getval(cmdmap, "root", root);
+    cmd_getval(cmdmap, "type", type);
+    cmd_getval(cmdmap, "class", device_class);
+
+    if (osdmap.crush->rule_exists(name)) {
+      // The name is uniquely associated with a ruleid and the rule it contains.
+      // From the user's point of view, the name is the more meaningful identifier.
+      ss << "rule " << name << " already exists";
+      err = 0;
+      goto reply;
+    }
+
+    CrushWrapper newcrush;
+    _get_pending_crush(newcrush);
+
+    if (newcrush.rule_exists(name)) {
+      // The name is uniquely associated with a ruleid and the rule it contains.
+      // From the user's point of view, the name is the more meaningful identifier.
+ ss << "rule " << name << " already exists"; + err = 0; + } else { + int ruleno = newcrush.add_simple_rule( + name, root, type, device_class, + "firstn", pg_pool_t::TYPE_REPLICATED, &ss); + if (ruleno < 0) { + err = ruleno; + goto reply; + } + + pending_inc.crush.clear(); + newcrush.encode(pending_inc.crush, mon.get_quorum_con_features()); + } + getline(ss, rs); + wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs, + get_last_committed() + 1)); + return true; + + } else if (prefix == "osd erasure-code-profile rm") { + string name; + cmd_getval(cmdmap, "name", name); + + if (erasure_code_profile_in_use(pending_inc.new_pools, name, &ss)) + goto wait; + + if (erasure_code_profile_in_use(osdmap.pools, name, &ss)) { + err = -EBUSY; + goto reply; + } + + if (osdmap.has_erasure_code_profile(name) || + pending_inc.new_erasure_code_profiles.count(name)) { + if (osdmap.has_erasure_code_profile(name)) { + pending_inc.old_erasure_code_profiles.push_back(name); + } else { + dout(20) << "erasure code profile rm " << name << ": creation canceled" << dendl; + pending_inc.new_erasure_code_profiles.erase(name); + } + + getline(ss, rs); + wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs, + get_last_committed() + 1)); + return true; + } else { + ss << "erasure-code-profile " << name << " does not exist"; + err = 0; + goto reply; + } + + } else if (prefix == "osd erasure-code-profile set") { + string name; + cmd_getval(cmdmap, "name", name); + vector<string> profile; + cmd_getval(cmdmap, "profile", profile); + + bool force = false; + cmd_getval(cmdmap, "force", force); + + map<string,string> profile_map; + err = parse_erasure_code_profile(profile, &profile_map, &ss); + if (err) + goto reply; + if (auto found = profile_map.find("crush-failure-domain"); + found != profile_map.end()) { + const auto& failure_domain = found->second; + int failure_domain_type = osdmap.crush->get_type_id(failure_domain); + if (failure_domain_type < 0) { + ss << "erasure-code-profile " << profile_map + << " contains an invalid failure-domain " << std::quoted(failure_domain); + err = -EINVAL; + goto reply; + } + } + + if (profile_map.find("plugin") == profile_map.end()) { + ss << "erasure-code-profile " << profile_map + << " must contain a plugin entry" << std::endl; + err = -EINVAL; + goto reply; + } + string plugin = profile_map["plugin"]; + + if (pending_inc.has_erasure_code_profile(name)) { + dout(20) << "erasure code profile " << name << " try again" << dendl; + goto wait; + } else { + err = normalize_profile(name, profile_map, force, &ss); + if (err) + goto reply; + + if (osdmap.has_erasure_code_profile(name)) { + ErasureCodeProfile existing_profile_map = + osdmap.get_erasure_code_profile(name); + err = normalize_profile(name, existing_profile_map, force, &ss); + if (err) + goto reply; + + if (existing_profile_map == profile_map) { + err = 0; + goto reply; + } + if (!force) { + err = -EPERM; + ss << "will not override erasure code profile " << name + << " because the existing profile " + << existing_profile_map + << " is different from the proposed profile " + << profile_map; + goto reply; + } + } + + dout(20) << "erasure code profile set " << name << "=" + << profile_map << dendl; + pending_inc.set_erasure_code_profile(name, profile_map); + } + + getline(ss, rs); + wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs, + get_last_committed() + 1)); + return true; + + } else if (prefix == "osd crush rule create-erasure") { + err = check_cluster_features(CEPH_FEATURE_CRUSH_V2, 
ss); + if (err == -EAGAIN) + goto wait; + if (err) + goto reply; + string name, poolstr; + cmd_getval(cmdmap, "name", name); + string profile; + cmd_getval(cmdmap, "profile", profile); + if (profile == "") + profile = "default"; + if (profile == "default") { + if (!osdmap.has_erasure_code_profile(profile)) { + if (pending_inc.has_erasure_code_profile(profile)) { + dout(20) << "erasure code profile " << profile << " already pending" << dendl; + goto wait; + } + + map<string,string> profile_map; + err = osdmap.get_erasure_code_profile_default(cct, + profile_map, + &ss); + if (err) + goto reply; + err = normalize_profile(name, profile_map, true, &ss); + if (err) + goto reply; + dout(20) << "erasure code profile set " << profile << "=" + << profile_map << dendl; + pending_inc.set_erasure_code_profile(profile, profile_map); + goto wait; + } + } + + int rule; + err = crush_rule_create_erasure(name, profile, &rule, &ss); + if (err < 0) { + switch(err) { + case -EEXIST: // return immediately + ss << "rule " << name << " already exists"; + err = 0; + goto reply; + break; + case -EALREADY: // wait for pending to be proposed + ss << "rule " << name << " already exists"; + err = 0; + break; + default: // non recoverable error + goto reply; + break; + } + } else { + ss << "created rule " << name << " at " << rule; + } + + getline(ss, rs); + wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs, + get_last_committed() + 1)); + return true; + + } else if (prefix == "osd crush rule rm") { + string name; + cmd_getval(cmdmap, "name", name); + + if (!osdmap.crush->rule_exists(name)) { + ss << "rule " << name << " does not exist"; + err = 0; + goto reply; + } + + CrushWrapper newcrush; + _get_pending_crush(newcrush); + + if (!newcrush.rule_exists(name)) { + ss << "rule " << name << " does not exist"; + err = 0; + } else { + int ruleno = newcrush.get_rule_id(name); + ceph_assert(ruleno >= 0); + + // make sure it is not in use. + // FIXME: this is ok in some situations, but let's not bother with that + // complexity now. 
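+      // For example, any pool whose crush_rule still resolves to this
+      // ruleset keeps the removal blocked with EBUSY until the pool is
+      // deleted or switched to another rule.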
+ int ruleset = newcrush.get_rule_mask_ruleset(ruleno); + if (osdmap.crush_rule_in_use(ruleset)) { + ss << "crush ruleset " << name << " " << ruleset << " is in use"; + err = -EBUSY; + goto reply; + } + + err = newcrush.remove_rule(ruleno); + if (err < 0) { + goto reply; + } + + pending_inc.crush.clear(); + newcrush.encode(pending_inc.crush, mon.get_quorum_con_features()); + } + getline(ss, rs); + wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs, + get_last_committed() + 1)); + return true; + + } else if (prefix == "osd crush rule rename") { + string srcname; + string dstname; + cmd_getval(cmdmap, "srcname", srcname); + cmd_getval(cmdmap, "dstname", dstname); + if (srcname.empty() || dstname.empty()) { + ss << "must specify both source rule name and destination rule name"; + err = -EINVAL; + goto reply; + } + if (srcname == dstname) { + ss << "destination rule name is equal to source rule name"; + err = 0; + goto reply; + } + + CrushWrapper newcrush; + _get_pending_crush(newcrush); + if (!newcrush.rule_exists(srcname) && newcrush.rule_exists(dstname)) { + // srcname does not exist and dstname already exists + // suppose this is a replay and return success + // (so this command is idempotent) + ss << "already renamed to '" << dstname << "'"; + err = 0; + goto reply; + } + + err = newcrush.rename_rule(srcname, dstname, &ss); + if (err < 0) { + // ss has reason for failure + goto reply; + } + pending_inc.crush.clear(); + newcrush.encode(pending_inc.crush, mon.get_quorum_con_features()); + getline(ss, rs); + wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs, + get_last_committed() + 1)); + return true; + + } else if (prefix == "osd setmaxosd") { + int64_t newmax; + if (!cmd_getval(cmdmap, "newmax", newmax)) { + ss << "unable to parse 'newmax' value '" + << cmd_vartype_stringify(cmdmap.at("newmax")) << "'"; + err = -EINVAL; + goto reply; + } + + if (newmax > g_conf()->mon_max_osd) { + err = -ERANGE; + ss << "cannot set max_osd to " << newmax << " which is > conf.mon_max_osd (" + << g_conf()->mon_max_osd << ")"; + goto reply; + } + + // Don't allow shrinking OSD number as this will cause data loss + // and may cause kernel crashes. + // Note: setmaxosd sets the maximum OSD number and not the number of OSDs + if (newmax < osdmap.get_max_osd()) { + // Check if the OSDs exist between current max and new value. + // If there are any OSDs exist, then don't allow shrinking number + // of OSDs. + for (int i = newmax; i < osdmap.get_max_osd(); i++) { + if (osdmap.exists(i)) { + err = -EBUSY; + ss << "cannot shrink max_osd to " << newmax + << " because osd." 
<< i << " (and possibly others) still in use"; + goto reply; + } + } + } + + pending_inc.new_max_osd = newmax; + ss << "set new max_osd = " << pending_inc.new_max_osd; + getline(ss, rs); + wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs, + get_last_committed() + 1)); + return true; + + } else if (prefix == "osd set-full-ratio" || + prefix == "osd set-backfillfull-ratio" || + prefix == "osd set-nearfull-ratio") { + double n; + if (!cmd_getval(cmdmap, "ratio", n)) { + ss << "unable to parse 'ratio' value '" + << cmd_vartype_stringify(cmdmap.at("ratio")) << "'"; + err = -EINVAL; + goto reply; + } + if (prefix == "osd set-full-ratio") + pending_inc.new_full_ratio = n; + else if (prefix == "osd set-backfillfull-ratio") + pending_inc.new_backfillfull_ratio = n; + else if (prefix == "osd set-nearfull-ratio") + pending_inc.new_nearfull_ratio = n; + ss << prefix << " " << n; + getline(ss, rs); + wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs, + get_last_committed() + 1)); + return true; + } else if (prefix == "osd set-require-min-compat-client") { + string v; + cmd_getval(cmdmap, "version", v); + ceph_release_t vno = ceph_release_from_name(v); + if (!vno) { + ss << "version " << v << " is not recognized"; + err = -EINVAL; + goto reply; + } + OSDMap newmap; + newmap.deepish_copy_from(osdmap); + newmap.apply_incremental(pending_inc); + newmap.require_min_compat_client = vno; + auto mvno = newmap.get_min_compat_client(); + if (vno < mvno) { + ss << "osdmap current utilizes features that require " << mvno + << "; cannot set require_min_compat_client below that to " << vno; + err = -EPERM; + goto reply; + } + bool sure = false; + cmd_getval(cmdmap, "yes_i_really_mean_it", sure); + if (!sure) { + FeatureMap m; + mon.get_combined_feature_map(&m); + uint64_t features = ceph_release_features(to_integer<int>(vno)); + bool first = true; + bool ok = true; + for (int type : { + CEPH_ENTITY_TYPE_CLIENT, + CEPH_ENTITY_TYPE_MDS, + CEPH_ENTITY_TYPE_MGR }) { + auto p = m.m.find(type); + if (p == m.m.end()) { + continue; + } + for (auto& q : p->second) { + uint64_t missing = ~q.first & features; + if (missing) { + if (first) { + ss << "cannot set require_min_compat_client to " << v << ": "; + } else { + ss << "; "; + } + first = false; + ss << q.second << " connected " << ceph_entity_type_name(type) + << "(s) look like " << ceph_release_name( + ceph_release_from_features(q.first)) + << " (missing 0x" << std::hex << missing << std::dec << ")"; + ok = false; + } + } + } + if (!ok) { + ss << "; add --yes-i-really-mean-it to do it anyway"; + err = -EPERM; + goto reply; + } + } + ss << "set require_min_compat_client to " << vno; + pending_inc.new_require_min_compat_client = vno; + getline(ss, rs); + wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs, + get_last_committed() + 1)); + return true; + } else if (prefix == "osd pause") { + return prepare_set_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR); + + } else if (prefix == "osd unpause") { + return prepare_unset_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR); + + } else if (prefix == "osd set") { + bool sure = false; + cmd_getval(cmdmap, "yes_i_really_mean_it", sure); + + string key; + cmd_getval(cmdmap, "key", key); + if (key == "pause") + return prepare_set_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR); + else if (key == "noup") + return prepare_set_flag(op, CEPH_OSDMAP_NOUP); + else if (key == "nodown") + return prepare_set_flag(op, CEPH_OSDMAP_NODOWN); + else if (key == "noout") + return 
prepare_set_flag(op, CEPH_OSDMAP_NOOUT); + else if (key == "noin") + return prepare_set_flag(op, CEPH_OSDMAP_NOIN); + else if (key == "nobackfill") + return prepare_set_flag(op, CEPH_OSDMAP_NOBACKFILL); + else if (key == "norebalance") + return prepare_set_flag(op, CEPH_OSDMAP_NOREBALANCE); + else if (key == "norecover") + return prepare_set_flag(op, CEPH_OSDMAP_NORECOVER); + else if (key == "noscrub") + return prepare_set_flag(op, CEPH_OSDMAP_NOSCRUB); + else if (key == "nodeep-scrub") + return prepare_set_flag(op, CEPH_OSDMAP_NODEEP_SCRUB); + else if (key == "notieragent") + return prepare_set_flag(op, CEPH_OSDMAP_NOTIERAGENT); + else if (key == "nosnaptrim") + return prepare_set_flag(op, CEPH_OSDMAP_NOSNAPTRIM); + else if (key == "pglog_hardlimit") { + if (!osdmap.get_num_up_osds() && !sure) { + ss << "Not advisable to continue since no OSDs are up. Pass " + << "--yes-i-really-mean-it if you really wish to continue."; + err = -EPERM; + goto reply; + } + // The release check here is required because for OSD_PGLOG_HARDLIMIT, + // we are reusing a jewel feature bit that was retired in luminous. + if (osdmap.require_osd_release >= ceph_release_t::luminous && + (HAVE_FEATURE(osdmap.get_up_osd_features(), OSD_PGLOG_HARDLIMIT) + || sure)) { + return prepare_set_flag(op, CEPH_OSDMAP_PGLOG_HARDLIMIT); + } else { + ss << "not all up OSDs have OSD_PGLOG_HARDLIMIT feature"; + err = -EPERM; + goto reply; + } + } else { + ss << "unrecognized flag '" << key << "'"; + err = -EINVAL; + } + + } else if (prefix == "osd unset") { + string key; + cmd_getval(cmdmap, "key", key); + if (key == "pause") + return prepare_unset_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR); + else if (key == "noup") + return prepare_unset_flag(op, CEPH_OSDMAP_NOUP); + else if (key == "nodown") + return prepare_unset_flag(op, CEPH_OSDMAP_NODOWN); + else if (key == "noout") + return prepare_unset_flag(op, CEPH_OSDMAP_NOOUT); + else if (key == "noin") + return prepare_unset_flag(op, CEPH_OSDMAP_NOIN); + else if (key == "nobackfill") + return prepare_unset_flag(op, CEPH_OSDMAP_NOBACKFILL); + else if (key == "norebalance") + return prepare_unset_flag(op, CEPH_OSDMAP_NOREBALANCE); + else if (key == "norecover") + return prepare_unset_flag(op, CEPH_OSDMAP_NORECOVER); + else if (key == "noscrub") + return prepare_unset_flag(op, CEPH_OSDMAP_NOSCRUB); + else if (key == "nodeep-scrub") + return prepare_unset_flag(op, CEPH_OSDMAP_NODEEP_SCRUB); + else if (key == "notieragent") + return prepare_unset_flag(op, CEPH_OSDMAP_NOTIERAGENT); + else if (key == "nosnaptrim") + return prepare_unset_flag(op, CEPH_OSDMAP_NOSNAPTRIM); + else { + ss << "unrecognized flag '" << key << "'"; + err = -EINVAL; + } + + } else if (prefix == "osd require-osd-release") { + string release; + cmd_getval(cmdmap, "release", release); + bool sure = false; + cmd_getval(cmdmap, "yes_i_really_mean_it", sure); + ceph_release_t rel = ceph_release_from_name(release.c_str()); + if (!rel) { + ss << "unrecognized release " << release; + err = -EINVAL; + goto reply; + } + if (rel == osdmap.require_osd_release) { + // idempotent + err = 0; + goto reply; + } + ceph_assert(osdmap.require_osd_release >= ceph_release_t::luminous); + if (!osdmap.get_num_up_osds() && !sure) { + ss << "Not advisable to continue since no OSDs are up. 
Pass " + << "--yes-i-really-mean-it if you really wish to continue."; + err = -EPERM; + goto reply; + } + if (rel == ceph_release_t::mimic) { + if (!mon.monmap->get_required_features().contains_all( + ceph::features::mon::FEATURE_MIMIC)) { + ss << "not all mons are mimic"; + err = -EPERM; + goto reply; + } + if ((!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_MIMIC)) + && !sure) { + ss << "not all up OSDs have CEPH_FEATURE_SERVER_MIMIC feature"; + err = -EPERM; + goto reply; + } + } else if (rel == ceph_release_t::nautilus) { + if (!mon.monmap->get_required_features().contains_all( + ceph::features::mon::FEATURE_NAUTILUS)) { + ss << "not all mons are nautilus"; + err = -EPERM; + goto reply; + } + if ((!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_NAUTILUS)) + && !sure) { + ss << "not all up OSDs have CEPH_FEATURE_SERVER_NAUTILUS feature"; + err = -EPERM; + goto reply; + } + } else if (rel == ceph_release_t::octopus) { + if (!mon.monmap->get_required_features().contains_all( + ceph::features::mon::FEATURE_OCTOPUS)) { + ss << "not all mons are octopus"; + err = -EPERM; + goto reply; + } + if ((!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_OCTOPUS)) + && !sure) { + ss << "not all up OSDs have CEPH_FEATURE_SERVER_OCTOPUS feature"; + err = -EPERM; + goto reply; + } + } else if (rel == ceph_release_t::pacific) { + if (!mon.monmap->get_required_features().contains_all( + ceph::features::mon::FEATURE_PACIFIC)) { + ss << "not all mons are pacific"; + err = -EPERM; + goto reply; + } + if ((!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_PACIFIC)) + && !sure) { + ss << "not all up OSDs have CEPH_FEATURE_SERVER_PACIFIC feature"; + err = -EPERM; + goto reply; + } + } else { + ss << "not supported for this release yet"; + err = -EPERM; + goto reply; + } + if (rel < osdmap.require_osd_release) { + ss << "require_osd_release cannot be lowered once it has been set"; + err = -EPERM; + goto reply; + } + pending_inc.new_require_osd_release = rel; + goto update; + } else if (prefix == "osd down" || + prefix == "osd out" || + prefix == "osd in" || + prefix == "osd rm" || + prefix == "osd stop") { + + bool any = false; + bool stop = false; + bool verbose = true; + bool definitely_dead = false; + + vector<string> idvec; + cmd_getval(cmdmap, "ids", idvec); + cmd_getval(cmdmap, "definitely_dead", definitely_dead); + derr << "definitely_dead " << (int)definitely_dead << dendl; + for (unsigned j = 0; j < idvec.size() && !stop; j++) { + set<int> osds; + + // wildcard? + if (j == 0 && + (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) { + if (prefix == "osd in") { + // touch out osds only + osdmap.get_out_existing_osds(osds); + } else { + osdmap.get_all_osds(osds); + } + stop = true; + verbose = false; // so the output is less noisy. + } else { + long osd = parse_osd_id(idvec[j].c_str(), &ss); + if (osd < 0) { + ss << "invalid osd id" << osd; + err = -EINVAL; + continue; + } else if (!osdmap.exists(osd)) { + ss << "osd." << osd << " does not exist. "; + continue; + } + + osds.insert(osd); + } + + for (auto &osd : osds) { + if (prefix == "osd down") { + if (osdmap.is_down(osd)) { + if (verbose) + ss << "osd." << osd << " is already down. "; + } else { + pending_inc.pending_osd_state_set(osd, CEPH_OSD_UP); + ss << "marked down osd." << osd << ". 
"; + any = true; + } + if (definitely_dead) { + if (!pending_inc.new_xinfo.count(osd)) { + pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd]; + } + if (pending_inc.new_xinfo[osd].dead_epoch < pending_inc.epoch) { + any = true; + } + pending_inc.new_xinfo[osd].dead_epoch = pending_inc.epoch; + } + } else if (prefix == "osd out") { + if (osdmap.is_out(osd)) { + if (verbose) + ss << "osd." << osd << " is already out. "; + } else { + pending_inc.new_weight[osd] = CEPH_OSD_OUT; + if (osdmap.osd_weight[osd]) { + if (pending_inc.new_xinfo.count(osd) == 0) { + pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd]; + } + pending_inc.new_xinfo[osd].old_weight = osdmap.osd_weight[osd]; + } + ss << "marked out osd." << osd << ". "; + std::ostringstream msg; + msg << "Client " << op->get_session()->entity_name + << " marked osd." << osd << " out"; + if (osdmap.is_up(osd)) { + msg << ", while it was still marked up"; + } else { + auto period = ceph_clock_now() - down_pending_out[osd]; + msg << ", after it was down for " << int(period.sec()) + << " seconds"; + } + + mon.clog->info() << msg.str(); + any = true; + } + } else if (prefix == "osd in") { + if (osdmap.is_in(osd)) { + if (verbose) + ss << "osd." << osd << " is already in. "; + } else { + if (osdmap.osd_xinfo[osd].old_weight > 0) { + pending_inc.new_weight[osd] = osdmap.osd_xinfo[osd].old_weight; + if (pending_inc.new_xinfo.count(osd) == 0) { + pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd]; + } + pending_inc.new_xinfo[osd].old_weight = 0; + } else { + pending_inc.new_weight[osd] = CEPH_OSD_IN; + } + ss << "marked in osd." << osd << ". "; + any = true; + } + } else if (prefix == "osd rm") { + err = prepare_command_osd_remove(osd); + + if (err == -EBUSY) { + if (any) + ss << ", "; + ss << "osd." << osd << " is still up; must be down before removal. "; + } else { + ceph_assert(err == 0); + if (any) { + ss << ", osd." << osd; + } else { + ss << "removed osd." << osd; + } + any = true; + } + } else if (prefix == "osd stop") { + if (osdmap.is_stop(osd)) { + if (verbose) + ss << "osd." << osd << " is already stopped. "; + } else if (osdmap.is_down(osd)) { + pending_inc.pending_osd_state_set(osd, CEPH_OSD_STOP); + ss << "stop down osd." << osd << ". "; + any = true; + } else { + pending_inc.pending_osd_state_set(osd, CEPH_OSD_UP | CEPH_OSD_STOP); + ss << "stop osd." << osd << ". 
"; + any = true; + } + } + } + } + if (any) { + getline(ss, rs); + wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, rs, + get_last_committed() + 1)); + return true; + } + } else if (prefix == "osd set-group" || + prefix == "osd unset-group" || + prefix == "osd add-noup" || + prefix == "osd add-nodown" || + prefix == "osd add-noin" || + prefix == "osd add-noout" || + prefix == "osd rm-noup" || + prefix == "osd rm-nodown" || + prefix == "osd rm-noin" || + prefix == "osd rm-noout") { + bool do_set = prefix == "osd set-group" || + prefix.find("add") != string::npos; + string flag_str; + unsigned flags = 0; + vector<string> who; + if (prefix == "osd set-group" || prefix == "osd unset-group") { + cmd_getval(cmdmap, "flags", flag_str); + cmd_getval(cmdmap, "who", who); + vector<string> raw_flags; + boost::split(raw_flags, flag_str, boost::is_any_of(",")); + for (auto& f : raw_flags) { + if (f == "noup") + flags |= CEPH_OSD_NOUP; + else if (f == "nodown") + flags |= CEPH_OSD_NODOWN; + else if (f == "noin") + flags |= CEPH_OSD_NOIN; + else if (f == "noout") + flags |= CEPH_OSD_NOOUT; + else { + ss << "unrecognized flag '" << f << "', must be one of " + << "{noup,nodown,noin,noout}"; + err = -EINVAL; + goto reply; + } + } + } else { + cmd_getval(cmdmap, "ids", who); + if (prefix.find("noup") != string::npos) + flags = CEPH_OSD_NOUP; + else if (prefix.find("nodown") != string::npos) + flags = CEPH_OSD_NODOWN; + else if (prefix.find("noin") != string::npos) + flags = CEPH_OSD_NOIN; + else if (prefix.find("noout") != string::npos) + flags = CEPH_OSD_NOOUT; + else + ceph_assert(0 == "Unreachable!"); + } + if (flags == 0) { + ss << "must specify flag(s) {noup,nodwon,noin,noout} to set/unset"; + err = -EINVAL; + goto reply; + } + if (who.empty()) { + ss << "must specify at least one or more targets to set/unset"; + err = -EINVAL; + goto reply; + } + set<int> osds; + set<int> crush_nodes; + set<int> device_classes; + for (auto& w : who) { + if (w == "any" || w == "all" || w == "*") { + osdmap.get_all_osds(osds); + break; + } + std::stringstream ts; + if (auto osd = parse_osd_id(w.c_str(), &ts); osd >= 0) { + osds.insert(osd); + } else if (osdmap.crush->name_exists(w)) { + crush_nodes.insert(osdmap.crush->get_item_id(w)); + } else if (osdmap.crush->class_exists(w)) { + device_classes.insert(osdmap.crush->get_class_id(w)); + } else { + ss << "unable to parse osd id or crush node or device class: " + << "\"" << w << "\". "; + } + } + if (osds.empty() && crush_nodes.empty() && device_classes.empty()) { + // ss has reason for failure + err = -EINVAL; + goto reply; + } + bool any = false; + for (auto osd : osds) { + if (!osdmap.exists(osd)) { + ss << "osd." << osd << " does not exist. "; + continue; + } + if (do_set) { + if (flags & CEPH_OSD_NOUP) { + any |= osdmap.is_noup_by_osd(osd) ? + pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOUP) : + pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOUP); + } + if (flags & CEPH_OSD_NODOWN) { + any |= osdmap.is_nodown_by_osd(osd) ? + pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NODOWN) : + pending_inc.pending_osd_state_set(osd, CEPH_OSD_NODOWN); + } + if (flags & CEPH_OSD_NOIN) { + any |= osdmap.is_noin_by_osd(osd) ? + pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOIN) : + pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOIN); + } + if (flags & CEPH_OSD_NOOUT) { + any |= osdmap.is_noout_by_osd(osd) ? 
+ pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOOUT) : + pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOOUT); + } + } else { + if (flags & CEPH_OSD_NOUP) { + any |= osdmap.is_noup_by_osd(osd) ? + pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOUP) : + pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOUP); + } + if (flags & CEPH_OSD_NODOWN) { + any |= osdmap.is_nodown_by_osd(osd) ? + pending_inc.pending_osd_state_set(osd, CEPH_OSD_NODOWN) : + pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NODOWN); + } + if (flags & CEPH_OSD_NOIN) { + any |= osdmap.is_noin_by_osd(osd) ? + pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOIN) : + pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOIN); + } + if (flags & CEPH_OSD_NOOUT) { + any |= osdmap.is_noout_by_osd(osd) ? + pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOOUT) : + pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOOUT); + } + } + } + for (auto& id : crush_nodes) { + auto old_flags = osdmap.get_crush_node_flags(id); + auto& pending_flags = pending_inc.new_crush_node_flags[id]; + pending_flags |= old_flags; // adopt existing flags first! + if (do_set) { + pending_flags |= flags; + } else { + pending_flags &= ~flags; + } + any = true; + } + for (auto& id : device_classes) { + auto old_flags = osdmap.get_device_class_flags(id); + auto& pending_flags = pending_inc.new_device_class_flags[id]; + pending_flags |= old_flags; + if (do_set) { + pending_flags |= flags; + } else { + pending_flags &= ~flags; + } + any = true; + } + if (any) { + getline(ss, rs); + wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, rs, + get_last_committed() + 1)); + return true; + } + } else if (prefix == "osd pg-temp") { + string pgidstr; + if (!cmd_getval(cmdmap, "pgid", pgidstr)) { + ss << "unable to parse 'pgid' value '" + << cmd_vartype_stringify(cmdmap.at("pgid")) << "'"; + err = -EINVAL; + goto reply; + } + pg_t pgid; + if (!pgid.parse(pgidstr.c_str())) { + ss << "invalid pgid '" << pgidstr << "'"; + err = -EINVAL; + goto reply; + } + if (!osdmap.pg_exists(pgid)) { + ss << "pg " << pgid << " does not exist"; + err = -ENOENT; + goto reply; + } + if (pending_inc.new_pg_temp.count(pgid)) { + dout(10) << __func__ << " waiting for pending update on " << pgid << dendl; + wait_for_finished_proposal(op, new C_RetryMessage(this, op)); + return true; + } + + vector<int64_t> id_vec; + vector<int32_t> new_pg_temp; + cmd_getval(cmdmap, "id", id_vec); + if (id_vec.empty()) { + pending_inc.new_pg_temp[pgid] = mempool::osdmap::vector<int>(); + ss << "done cleaning up pg_temp of " << pgid; + goto update; + } + for (auto osd : id_vec) { + if (!osdmap.exists(osd)) { + ss << "osd." 
<< osd << " does not exist"; + err = -ENOENT; + goto reply; + } + new_pg_temp.push_back(osd); + } + + int pool_min_size = osdmap.get_pg_pool_min_size(pgid); + if ((int)new_pg_temp.size() < pool_min_size) { + ss << "num of osds (" << new_pg_temp.size() <<") < pool min size (" + << pool_min_size << ")"; + err = -EINVAL; + goto reply; + } + + int pool_size = osdmap.get_pg_pool_size(pgid); + if ((int)new_pg_temp.size() > pool_size) { + ss << "num of osds (" << new_pg_temp.size() <<") > pool size (" + << pool_size << ")"; + err = -EINVAL; + goto reply; + } + + pending_inc.new_pg_temp[pgid] = mempool::osdmap::vector<int>( + new_pg_temp.begin(), new_pg_temp.end()); + ss << "set " << pgid << " pg_temp mapping to " << new_pg_temp; + goto update; + } else if (prefix == "osd primary-temp") { + string pgidstr; + if (!cmd_getval(cmdmap, "pgid", pgidstr)) { + ss << "unable to parse 'pgid' value '" + << cmd_vartype_stringify(cmdmap.at("pgid")) << "'"; + err = -EINVAL; + goto reply; + } + pg_t pgid; + if (!pgid.parse(pgidstr.c_str())) { + ss << "invalid pgid '" << pgidstr << "'"; + err = -EINVAL; + goto reply; + } + if (!osdmap.pg_exists(pgid)) { + ss << "pg " << pgid << " does not exist"; + err = -ENOENT; + goto reply; + } + + int64_t osd; + if (!cmd_getval(cmdmap, "id", osd)) { + ss << "unable to parse 'id' value '" + << cmd_vartype_stringify(cmdmap.at("id")) << "'"; + err = -EINVAL; + goto reply; + } + if (osd != -1 && !osdmap.exists(osd)) { + ss << "osd." << osd << " does not exist"; + err = -ENOENT; + goto reply; + } + + if (osdmap.require_min_compat_client != ceph_release_t::unknown && + osdmap.require_min_compat_client < ceph_release_t::firefly) { + ss << "require_min_compat_client " + << osdmap.require_min_compat_client + << " < firefly, which is required for primary-temp"; + err = -EPERM; + goto reply; + } + + pending_inc.new_primary_temp[pgid] = osd; + ss << "set " << pgid << " primary_temp mapping to " << osd; + goto update; + } else if (prefix == "pg repeer") { + pg_t pgid; + string pgidstr; + cmd_getval(cmdmap, "pgid", pgidstr); + if (!pgid.parse(pgidstr.c_str())) { + ss << "invalid pgid '" << pgidstr << "'"; + err = -EINVAL; + goto reply; + } + if (!osdmap.pg_exists(pgid)) { + ss << "pg '" << pgidstr << "' does not exist"; + err = -ENOENT; + goto reply; + } + vector<int> acting; + int primary; + osdmap.pg_to_acting_osds(pgid, &acting, &primary); + if (primary < 0) { + err = -EAGAIN; + ss << "pg currently has no primary"; + goto reply; + } + if (acting.size() > 1) { + // map to just primary; it will map back to what it wants + pending_inc.new_pg_temp[pgid] = { primary }; + } else { + // hmm, pick another arbitrary osd to induce a change. Note + // that this won't work if there is only one suitable OSD in the cluster. + int i; + bool done = false; + for (i = 0; i < osdmap.get_max_osd(); ++i) { + if (i == primary || !osdmap.is_up(i) || !osdmap.exists(i)) { + continue; + } + pending_inc.new_pg_temp[pgid] = { primary, i }; + done = true; + break; + } + if (!done) { + err = -EAGAIN; + ss << "not enough up OSDs in the cluster to force repeer"; + goto reply; + } + } + goto update; + } else if (prefix == "osd pg-upmap" || + prefix == "osd rm-pg-upmap" || + prefix == "osd pg-upmap-items" || + prefix == "osd rm-pg-upmap-items") { + if (osdmap.require_min_compat_client < ceph_release_t::luminous) { + ss << "min_compat_client " + << osdmap.require_min_compat_client + << " < luminous, which is required for pg-upmap. 
" + << "Try 'ceph osd set-require-min-compat-client luminous' " + << "before using the new interface"; + err = -EPERM; + goto reply; + } + err = check_cluster_features(CEPH_FEATUREMASK_OSDMAP_PG_UPMAP, ss); + if (err == -EAGAIN) + goto wait; + if (err < 0) + goto reply; + string pgidstr; + if (!cmd_getval(cmdmap, "pgid", pgidstr)) { + ss << "unable to parse 'pgid' value '" + << cmd_vartype_stringify(cmdmap.at("pgid")) << "'"; + err = -EINVAL; + goto reply; + } + pg_t pgid; + if (!pgid.parse(pgidstr.c_str())) { + ss << "invalid pgid '" << pgidstr << "'"; + err = -EINVAL; + goto reply; + } + if (!osdmap.pg_exists(pgid)) { + ss << "pg " << pgid << " does not exist"; + err = -ENOENT; + goto reply; + } + if (pending_inc.old_pools.count(pgid.pool())) { + ss << "pool of " << pgid << " is pending removal"; + err = -ENOENT; + getline(ss, rs); + wait_for_finished_proposal(op, + new Monitor::C_Command(mon, op, err, rs, get_last_committed() + 1)); + return true; + } + + enum { + OP_PG_UPMAP, + OP_RM_PG_UPMAP, + OP_PG_UPMAP_ITEMS, + OP_RM_PG_UPMAP_ITEMS, + } option; + + if (prefix == "osd pg-upmap") { + option = OP_PG_UPMAP; + } else if (prefix == "osd rm-pg-upmap") { + option = OP_RM_PG_UPMAP; + } else if (prefix == "osd pg-upmap-items") { + option = OP_PG_UPMAP_ITEMS; + } else { + option = OP_RM_PG_UPMAP_ITEMS; + } + + // check pending upmap changes + switch (option) { + case OP_PG_UPMAP: // fall through + case OP_RM_PG_UPMAP: + if (pending_inc.new_pg_upmap.count(pgid) || + pending_inc.old_pg_upmap.count(pgid)) { + dout(10) << __func__ << " waiting for pending update on " + << pgid << dendl; + wait_for_finished_proposal(op, new C_RetryMessage(this, op)); + return true; + } + break; + + case OP_PG_UPMAP_ITEMS: // fall through + case OP_RM_PG_UPMAP_ITEMS: + if (pending_inc.new_pg_upmap_items.count(pgid) || + pending_inc.old_pg_upmap_items.count(pgid)) { + dout(10) << __func__ << " waiting for pending update on " + << pgid << dendl; + wait_for_finished_proposal(op, new C_RetryMessage(this, op)); + return true; + } + break; + + default: + ceph_abort_msg("invalid option"); + } + + switch (option) { + case OP_PG_UPMAP: + { + vector<int64_t> id_vec; + if (!cmd_getval(cmdmap, "id", id_vec)) { + ss << "unable to parse 'id' value(s) '" + << cmd_vartype_stringify(cmdmap.at("id")) << "'"; + err = -EINVAL; + goto reply; + } + + int pool_min_size = osdmap.get_pg_pool_min_size(pgid); + if ((int)id_vec.size() < pool_min_size) { + ss << "num of osds (" << id_vec.size() <<") < pool min size (" + << pool_min_size << ")"; + err = -EINVAL; + goto reply; + } + + int pool_size = osdmap.get_pg_pool_size(pgid); + if ((int)id_vec.size() > pool_size) { + ss << "num of osds (" << id_vec.size() <<") > pool size (" + << pool_size << ")"; + err = -EINVAL; + goto reply; + } + + vector<int32_t> new_pg_upmap; + for (auto osd : id_vec) { + if (osd != CRUSH_ITEM_NONE && !osdmap.exists(osd)) { + ss << "osd." << osd << " does not exist"; + err = -ENOENT; + goto reply; + } + auto it = std::find(new_pg_upmap.begin(), new_pg_upmap.end(), osd); + if (it != new_pg_upmap.end()) { + ss << "osd." 
<< osd << " already exists, "; + continue; + } + new_pg_upmap.push_back(osd); + } + + if (new_pg_upmap.empty()) { + ss << "no valid upmap items(pairs) is specified"; + err = -EINVAL; + goto reply; + } + + pending_inc.new_pg_upmap[pgid] = mempool::osdmap::vector<int32_t>( + new_pg_upmap.begin(), new_pg_upmap.end()); + ss << "set " << pgid << " pg_upmap mapping to " << new_pg_upmap; + } + break; + + case OP_RM_PG_UPMAP: + { + pending_inc.old_pg_upmap.insert(pgid); + ss << "clear " << pgid << " pg_upmap mapping"; + } + break; + + case OP_PG_UPMAP_ITEMS: + { + vector<int64_t> id_vec; + if (!cmd_getval(cmdmap, "id", id_vec)) { + ss << "unable to parse 'id' value(s) '" + << cmd_vartype_stringify(cmdmap.at("id")) << "'"; + err = -EINVAL; + goto reply; + } + + if (id_vec.size() % 2) { + ss << "you must specify pairs of osd ids to be remapped"; + err = -EINVAL; + goto reply; + } + + int pool_size = osdmap.get_pg_pool_size(pgid); + if ((int)(id_vec.size() / 2) > pool_size) { + ss << "num of osd pairs (" << id_vec.size() / 2 <<") > pool size (" + << pool_size << ")"; + err = -EINVAL; + goto reply; + } + + vector<pair<int32_t,int32_t>> new_pg_upmap_items; + ostringstream items; + items << "["; + for (auto p = id_vec.begin(); p != id_vec.end(); ++p) { + int from = *p++; + int to = *p; + if (from == to) { + ss << "from osd." << from << " == to osd." << to << ", "; + continue; + } + if (!osdmap.exists(from)) { + ss << "osd." << from << " does not exist"; + err = -ENOENT; + goto reply; + } + if (to != CRUSH_ITEM_NONE && !osdmap.exists(to)) { + ss << "osd." << to << " does not exist"; + err = -ENOENT; + goto reply; + } + pair<int32_t,int32_t> entry = make_pair(from, to); + auto it = std::find(new_pg_upmap_items.begin(), + new_pg_upmap_items.end(), entry); + if (it != new_pg_upmap_items.end()) { + ss << "osd." << from << " -> osd." 
<< to << " already exists, "; + continue; + } + new_pg_upmap_items.push_back(entry); + items << from << "->" << to << ","; + } + string out(items.str()); + out.resize(out.size() - 1); // drop last ',' + out += "]"; + + if (new_pg_upmap_items.empty()) { + ss << "no valid upmap items(pairs) is specified"; + err = -EINVAL; + goto reply; + } + + pending_inc.new_pg_upmap_items[pgid] = + mempool::osdmap::vector<pair<int32_t,int32_t>>( + new_pg_upmap_items.begin(), new_pg_upmap_items.end()); + ss << "set " << pgid << " pg_upmap_items mapping to " << out; + } + break; + + case OP_RM_PG_UPMAP_ITEMS: + { + pending_inc.old_pg_upmap_items.insert(pgid); + ss << "clear " << pgid << " pg_upmap_items mapping"; + } + break; + + default: + ceph_abort_msg("invalid option"); + } + + goto update; + } else if (prefix == "osd primary-affinity") { + int64_t id; + if (!cmd_getval(cmdmap, "id", id)) { + ss << "invalid osd id value '" + << cmd_vartype_stringify(cmdmap.at("id")) << "'"; + err = -EINVAL; + goto reply; + } + double w; + if (!cmd_getval(cmdmap, "weight", w)) { + ss << "unable to parse 'weight' value '" + << cmd_vartype_stringify(cmdmap.at("weight")) << "'"; + err = -EINVAL; + goto reply; + } + long ww = (int)((double)CEPH_OSD_MAX_PRIMARY_AFFINITY*w); + if (ww < 0L) { + ss << "weight must be >= 0"; + err = -EINVAL; + goto reply; + } + if (osdmap.require_min_compat_client != ceph_release_t::unknown && + osdmap.require_min_compat_client < ceph_release_t::firefly) { + ss << "require_min_compat_client " + << osdmap.require_min_compat_client + << " < firefly, which is required for primary-affinity"; + err = -EPERM; + goto reply; + } + if (osdmap.exists(id)) { + pending_inc.new_primary_affinity[id] = ww; + ss << "set osd." << id << " primary-affinity to " << w << " (" << std::ios::hex << ww << std::ios::dec << ")"; + getline(ss, rs); + wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs, + get_last_committed() + 1)); + return true; + } else { + ss << "osd." << id << " does not exist"; + err = -ENOENT; + goto reply; + } + } else if (prefix == "osd reweight") { + int64_t id; + if (!cmd_getval(cmdmap, "id", id)) { + ss << "unable to parse osd id value '" + << cmd_vartype_stringify(cmdmap.at("id")) << "'"; + err = -EINVAL; + goto reply; + } + double w; + if (!cmd_getval(cmdmap, "weight", w)) { + ss << "unable to parse weight value '" + << cmd_vartype_stringify(cmdmap.at("weight")) << "'"; + err = -EINVAL; + goto reply; + } + long ww = (int)((double)CEPH_OSD_IN*w); + if (ww < 0L) { + ss << "weight must be >= 0"; + err = -EINVAL; + goto reply; + } + if (osdmap.exists(id)) { + pending_inc.new_weight[id] = ww; + ss << "reweighted osd." << id << " to " << w << " (" << std::hex << ww << std::dec << ")"; + getline(ss, rs); + wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs, + get_last_committed() + 1)); + return true; + } else { + ss << "osd." 
<< id << " does not exist"; + err = -ENOENT; + goto reply; + } + } else if (prefix == "osd reweightn") { + map<int32_t, uint32_t> weights; + err = parse_reweights(cct, cmdmap, osdmap, &weights); + if (err) { + ss << "unable to parse 'weights' value '" + << cmd_vartype_stringify(cmdmap.at("weights")) << "'"; + goto reply; + } + pending_inc.new_weight.insert(weights.begin(), weights.end()); + wait_for_finished_proposal( + op, + new Monitor::C_Command(mon, op, 0, rs, rdata, get_last_committed() + 1)); + return true; + } else if (prefix == "osd lost") { + int64_t id; + if (!cmd_getval(cmdmap, "id", id)) { + ss << "unable to parse osd id value '" + << cmd_vartype_stringify(cmdmap.at("id")) << "'"; + err = -EINVAL; + goto reply; + } + bool sure = false; + cmd_getval(cmdmap, "yes_i_really_mean_it", sure); + if (!sure) { + ss << "are you SURE? this might mean real, permanent data loss. pass " + "--yes-i-really-mean-it if you really do."; + err = -EPERM; + goto reply; + } else if (!osdmap.exists(id)) { + ss << "osd." << id << " does not exist"; + err = -ENOENT; + goto reply; + } else if (!osdmap.is_down(id)) { + ss << "osd." << id << " is not down"; + err = -EBUSY; + goto reply; + } else { + epoch_t e = osdmap.get_info(id).down_at; + pending_inc.new_lost[id] = e; + ss << "marked osd lost in epoch " << e; + getline(ss, rs); + wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs, + get_last_committed() + 1)); + return true; + } + + } else if (prefix == "osd destroy-actual" || + prefix == "osd purge-actual" || + prefix == "osd purge-new") { + /* Destroying an OSD means that we don't expect to further make use of + * the OSDs data (which may even become unreadable after this operation), + * and that we are okay with scrubbing all its cephx keys and config-key + * data (which may include lockbox keys, thus rendering the osd's data + * unreadable). + * + * The OSD will not be removed. Instead, we will mark it as destroyed, + * such that a subsequent call to `create` will not reuse the osd id. + * This will play into being able to recreate the OSD, at the same + * crush location, with minimal data movement. + */ + + // make sure authmon is writeable. + if (!mon.authmon()->is_writeable()) { + dout(10) << __func__ << " waiting for auth mon to be writeable for " + << "osd destroy" << dendl; + mon.authmon()->wait_for_writeable(op, new C_RetryMessage(this, op)); + return false; + } + + int64_t id; + if (!cmd_getval(cmdmap, "id", id)) { + auto p = cmdmap.find("id"); + if (p == cmdmap.end()) { + ss << "no osd id specified"; + } else { + ss << "unable to parse osd id value '" + << cmd_vartype_stringify(cmdmap.at("id")) << ""; + } + err = -EINVAL; + goto reply; + } + + bool is_destroy = (prefix == "osd destroy-actual"); + if (!is_destroy) { + ceph_assert("osd purge-actual" == prefix || + "osd purge-new" == prefix); + } + + bool sure = false; + cmd_getval(cmdmap, "yes_i_really_mean_it", sure); + if (!sure) { + ss << "Are you SURE? Did you verify with 'ceph osd safe-to-destroy'? " + << "This will mean real, permanent data loss, as well " + << "as deletion of cephx and lockbox keys. " + << "Pass --yes-i-really-mean-it if you really do."; + err = -EPERM; + goto reply; + } else if (!osdmap.exists(id)) { + ss << "osd." << id << " does not exist"; + err = 0; // idempotent + goto reply; + } else if (osdmap.is_up(id)) { + ss << "osd." << id << " is not `down`."; + err = -EBUSY; + goto reply; + } else if (is_destroy && osdmap.is_destroyed(id)) { + ss << "destroyed osd." 
<< id; + err = 0; + goto reply; + } + + if (prefix == "osd purge-new" && + (osdmap.get_state(id) & CEPH_OSD_NEW) == 0) { + ss << "osd." << id << " is not new"; + err = -EPERM; + goto reply; + } + + bool goto_reply = false; + + paxos.plug(); + if (is_destroy) { + err = prepare_command_osd_destroy(id, ss); + // we checked above that it should exist. + ceph_assert(err != -ENOENT); + } else { + err = prepare_command_osd_purge(id, ss); + if (err == -ENOENT) { + err = 0; + ss << "osd." << id << " does not exist."; + goto_reply = true; + } + } + paxos.unplug(); + + if (err < 0 || goto_reply) { + goto reply; + } + + if (is_destroy) { + ss << "destroyed osd." << id; + } else { + ss << "purged osd." << id; + } + + getline(ss, rs); + wait_for_finished_proposal(op, + new Monitor::C_Command(mon, op, 0, rs, get_last_committed() + 1)); + force_immediate_propose(); + return true; + + } else if (prefix == "osd new") { + + // make sure authmon is writeable. + if (!mon.authmon()->is_writeable()) { + dout(10) << __func__ << " waiting for auth mon to be writeable for " + << "osd new" << dendl; + mon.authmon()->wait_for_writeable(op, new C_RetryMessage(this, op)); + return false; + } + + // make sure kvmon is writeable. + if (!mon.kvmon()->is_writeable()) { + dout(10) << __func__ << " waiting for kv mon to be writeable for " + << "osd new" << dendl; + mon.kvmon()->wait_for_writeable(op, new C_RetryMessage(this, op)); + return false; + } + + map<string,string> param_map; + + bufferlist bl = m->get_data(); + string param_json = bl.to_str(); + dout(20) << __func__ << " osd new json = " << param_json << dendl; + + err = get_json_str_map(param_json, ss, ¶m_map); + if (err < 0) + goto reply; + + dout(20) << __func__ << " osd new params " << param_map << dendl; + + paxos.plug(); + err = prepare_command_osd_new(op, cmdmap, param_map, ss, f.get()); + paxos.unplug(); + + if (err < 0) { + goto reply; + } + + if (f) { + f->flush(rdata); + } else { + rdata.append(ss); + } + + if (err == EEXIST) { + // idempotent operation + err = 0; + goto reply; + } + + wait_for_finished_proposal(op, + new Monitor::C_Command(mon, op, 0, rs, rdata, + get_last_committed() + 1)); + force_immediate_propose(); + return true; + + } else if (prefix == "osd create") { + + // optional id provided? + int64_t id = -1, cmd_id = -1; + if (cmd_getval(cmdmap, "id", cmd_id)) { + if (cmd_id < 0) { + ss << "invalid osd id value '" << cmd_id << "'"; + err = -EINVAL; + goto reply; + } + dout(10) << " osd create got id " << cmd_id << dendl; + } + + uuid_d uuid; + string uuidstr; + if (cmd_getval(cmdmap, "uuid", uuidstr)) { + if (!uuid.parse(uuidstr.c_str())) { + ss << "invalid uuid value '" << uuidstr << "'"; + err = -EINVAL; + goto reply; + } + // we only care about the id if we also have the uuid, to + // ensure the operation's idempotency. + id = cmd_id; + } + + int32_t new_id = -1; + err = prepare_command_osd_create(id, uuid, &new_id, ss); + if (err < 0) { + if (err == -EAGAIN) { + wait_for_finished_proposal(op, new C_RetryMessage(this, op)); + return true; + } + // a check has failed; reply to the user. + goto reply; + + } else if (err == EEXIST) { + // this is an idempotent operation; we can go ahead and reply. 
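+    // For example, re-running "ceph osd create <uuid>" after a timeout
+    // returns the id already bound to that uuid instead of allocating a
+    // fresh one, so client retries are safe.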
+ if (f) { + f->open_object_section("created_osd"); + f->dump_int("osdid", new_id); + f->close_section(); + f->flush(rdata); + } else { + ss << new_id; + rdata.append(ss); + } + err = 0; + goto reply; + } + + string empty_device_class; + do_osd_create(id, uuid, empty_device_class, &new_id); + + if (f) { + f->open_object_section("created_osd"); + f->dump_int("osdid", new_id); + f->close_section(); + f->flush(rdata); + } else { + ss << new_id; + rdata.append(ss); + } + wait_for_finished_proposal(op, + new Monitor::C_Command(mon, op, 0, rs, rdata, + get_last_committed() + 1)); + return true; + + } else if (prefix == "osd blocklist clear" || + prefix == "osd blacklist clear") { + pending_inc.new_blocklist.clear(); + std::list<std::pair<entity_addr_t,utime_t > > blocklist; + std::list<std::pair<entity_addr_t,utime_t > > range_b; + osdmap.get_blocklist(&blocklist, &range_b); + for (const auto &entry : blocklist) { + pending_inc.old_blocklist.push_back(entry.first); + } + for (const auto &entry : range_b) { + pending_inc.old_range_blocklist.push_back(entry.first); + } + ss << " removed all blocklist entries"; + getline(ss, rs); + wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs, + get_last_committed() + 1)); + return true; + } else if (prefix == "osd blocklist" || + prefix == "osd blacklist") { + string addrstr, rangestr; + bool range = false; + cmd_getval(cmdmap, "addr", addrstr); + if (cmd_getval(cmdmap, "range", rangestr)) { + if (rangestr == "range") { + range = true; + } else { + ss << "Did you mean to specify \"osd blocklist range\"?"; + err = -EINVAL; + goto reply; + } + } + entity_addr_t addr; + if (!addr.parse(addrstr.c_str(), 0)) { + ss << "unable to parse address " << addrstr; + err = -EINVAL; + goto reply; + } + else { + if (range) { + if (!addr.maybe_cidr()) { + ss << "You specified a range command, but " << addr + << " does not parse as a CIDR range"; + err = -EINVAL; + goto reply; + } + addr.type = entity_addr_t::TYPE_CIDR; + err = check_cluster_features(CEPH_FEATUREMASK_RANGE_BLOCKLIST, ss); + if (err) { + goto reply; + } + if ((addr.is_ipv4() && addr.get_nonce() > 32) || + (addr.is_ipv6() && addr.get_nonce() > 128)) { + ss << "Too many bits in range for that protocol!"; + err = -EINVAL; + goto reply; + } + } else { + if (osdmap.require_osd_release >= ceph_release_t::nautilus) { + // always blocklist type ANY + addr.set_type(entity_addr_t::TYPE_ANY); + } else { + addr.set_type(entity_addr_t::TYPE_LEGACY); + } + } + + string blocklistop; + if (!cmd_getval(cmdmap, "blocklistop", blocklistop)) { + cmd_getval(cmdmap, "blacklistop", blocklistop); + } + if (blocklistop == "add") { + utime_t expires = ceph_clock_now(); + double d; + // default one hour + cmd_getval(cmdmap, "expire", d, + g_conf()->mon_osd_blocklist_default_expire); + expires += d; + + auto add_to_pending_blocklists = [](auto& nb, auto& ob, + const auto& addr, + const auto& expires) { + nb[addr] = expires; + // cancel any pending un-blocklisting request too + auto it = std::find(ob.begin(), + ob.end(), addr); + if (it != ob.end()) { + ob.erase(it); + } + }; + if (range) { + add_to_pending_blocklists(pending_inc.new_range_blocklist, + pending_inc.old_range_blocklist, + addr, expires); + + } else { + add_to_pending_blocklists(pending_inc.new_blocklist, + pending_inc.old_blocklist, + addr, expires); + } + + ss << "blocklisting " << addr << " until " << expires << " (" << d << " sec)"; + getline(ss, rs); + wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs, + get_last_committed() + 1)); + 
return true; + } else if (blocklistop == "rm") { + auto rm_from_pending_blocklists = [](const auto& addr, + auto& blocklist, + auto& ob, auto& pb) { + if (blocklist.count(addr)) { + ob.push_back(addr); + return true; + } else if (pb.count(addr)) { + pb.erase(addr); + return true; + } + return false; + }; + if ((!range && rm_from_pending_blocklists(addr, osdmap.blocklist, + pending_inc.old_blocklist, + pending_inc.new_blocklist)) || + (range && rm_from_pending_blocklists(addr, osdmap.range_blocklist, + pending_inc.old_range_blocklist, + pending_inc.new_range_blocklist))) { + ss << "un-blocklisting " << addr; + getline(ss, rs); + wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs, + get_last_committed() + 1)); + return true; + } + ss << addr << " isn't blocklisted"; + err = 0; + goto reply; + } + } + } else if (prefix == "osd pool mksnap") { + string poolstr; + cmd_getval(cmdmap, "pool", poolstr); + int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str()); + if (pool < 0) { + ss << "unrecognized pool '" << poolstr << "'"; + err = -ENOENT; + goto reply; + } + string snapname; + cmd_getval(cmdmap, "snap", snapname); + const pg_pool_t *p = osdmap.get_pg_pool(pool); + if (p->is_unmanaged_snaps_mode()) { + ss << "pool " << poolstr << " is in unmanaged snaps mode"; + err = -EINVAL; + goto reply; + } else if (p->snap_exists(snapname.c_str())) { + ss << "pool " << poolstr << " snap " << snapname << " already exists"; + err = 0; + goto reply; + } else if (p->is_tier()) { + ss << "pool " << poolstr << " is a cache tier"; + err = -EINVAL; + goto reply; + } + pg_pool_t *pp = 0; + if (pending_inc.new_pools.count(pool)) + pp = &pending_inc.new_pools[pool]; + if (!pp) { + pp = &pending_inc.new_pools[pool]; + *pp = *p; + } + if (pp->snap_exists(snapname.c_str())) { + ss << "pool " << poolstr << " snap " << snapname << " already exists"; + } else { + pp->add_snap(snapname.c_str(), ceph_clock_now()); + pp->set_snap_epoch(pending_inc.epoch); + ss << "created pool " << poolstr << " snap " << snapname; + } + getline(ss, rs); + wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs, + get_last_committed() + 1)); + return true; + } else if (prefix == "osd pool rmsnap") { + string poolstr; + cmd_getval(cmdmap, "pool", poolstr); + int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str()); + if (pool < 0) { + ss << "unrecognized pool '" << poolstr << "'"; + err = -ENOENT; + goto reply; + } + string snapname; + cmd_getval(cmdmap, "snap", snapname); + const pg_pool_t *p = osdmap.get_pg_pool(pool); + if (p->is_unmanaged_snaps_mode()) { + ss << "pool " << poolstr << " is in unmanaged snaps mode"; + err = -EINVAL; + goto reply; + } else if (!p->snap_exists(snapname.c_str())) { + ss << "pool " << poolstr << " snap " << snapname << " does not exist"; + err = 0; + goto reply; + } + pg_pool_t *pp = 0; + if (pending_inc.new_pools.count(pool)) + pp = &pending_inc.new_pools[pool]; + if (!pp) { + pp = &pending_inc.new_pools[pool]; + *pp = *p; + } + snapid_t sn = pp->snap_exists(snapname.c_str()); + if (sn) { + pp->remove_snap(sn); + pp->set_snap_epoch(pending_inc.epoch); + ss << "removed pool " << poolstr << " snap " << snapname; + } else { + ss << "already removed pool " << poolstr << " snap " << snapname; + } + getline(ss, rs); + wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs, + get_last_committed() + 1)); + return true; + } else if (prefix == "osd pool create") { + int64_t pg_num, pgp_num, pg_num_min, pg_num_max; + cmd_getval(cmdmap, "pg_num", pg_num, int64_t(0)); + 
cmd_getval(cmdmap, "pg_num_min", pg_num_min, int64_t(0)); + cmd_getval(cmdmap, "pg_num_max", pg_num_max, int64_t(0)); + cmd_getval(cmdmap, "pgp_num", pgp_num, int64_t(pg_num)); + string pool_type_str; + cmd_getval(cmdmap, "pool_type", pool_type_str); + if (pool_type_str.empty()) + pool_type_str = g_conf().get_val<string>("osd_pool_default_type"); + + string poolstr; + cmd_getval(cmdmap, "pool", poolstr); + int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr); + if (pool_id >= 0) { + const pg_pool_t *p = osdmap.get_pg_pool(pool_id); + if (pool_type_str != p->get_type_name()) { + ss << "pool '" << poolstr << "' cannot change to type " << pool_type_str; + err = -EINVAL; + } else { + ss << "pool '" << poolstr << "' already exists"; + err = 0; + } + goto reply; + } + + int pool_type; + if (pool_type_str == "replicated") { + pool_type = pg_pool_t::TYPE_REPLICATED; + } else if (pool_type_str == "erasure") { + pool_type = pg_pool_t::TYPE_ERASURE; + } else { + ss << "unknown pool type '" << pool_type_str << "'"; + err = -EINVAL; + goto reply; + } + + bool implicit_rule_creation = false; + int64_t expected_num_objects = 0; + string rule_name; + cmd_getval(cmdmap, "rule", rule_name); + string erasure_code_profile; + cmd_getval(cmdmap, "erasure_code_profile", erasure_code_profile); + + if (pool_type == pg_pool_t::TYPE_ERASURE) { + if (erasure_code_profile == "") + erasure_code_profile = "default"; + //handle the erasure code profile + if (erasure_code_profile == "default") { + if (!osdmap.has_erasure_code_profile(erasure_code_profile)) { + if (pending_inc.has_erasure_code_profile(erasure_code_profile)) { + dout(20) << "erasure code profile " << erasure_code_profile << " already pending" << dendl; + goto wait; + } + + map<string,string> profile_map; + err = osdmap.get_erasure_code_profile_default(cct, + profile_map, + &ss); + if (err) + goto reply; + dout(20) << "erasure code profile " << erasure_code_profile << " set" << dendl; + pending_inc.set_erasure_code_profile(erasure_code_profile, profile_map); + goto wait; + } + } + if (rule_name == "") { + implicit_rule_creation = true; + if (erasure_code_profile == "default") { + rule_name = "erasure-code"; + } else { + dout(1) << "implicitly use rule named after the pool: " + << poolstr << dendl; + rule_name = poolstr; + } + } + cmd_getval(cmdmap, "expected_num_objects", + expected_num_objects, int64_t(0)); + } else { + //NOTE:for replicated pool,cmd_map will put rule_name to erasure_code_profile field + // and put expected_num_objects to rule field + if (erasure_code_profile != "") { // cmd is from CLI + if (rule_name != "") { + string interr; + expected_num_objects = strict_strtoll(rule_name.c_str(), 10, &interr); + if (interr.length()) { + ss << "error parsing integer value '" << rule_name << "': " << interr; + err = -EINVAL; + goto reply; + } + } + rule_name = erasure_code_profile; + } else { // cmd is well-formed + cmd_getval(cmdmap, "expected_num_objects", + expected_num_objects, int64_t(0)); + } + } + + if (!implicit_rule_creation && rule_name != "") { + int rule; + err = get_crush_rule(rule_name, &rule, &ss); + if (err == -EAGAIN) { + wait_for_finished_proposal(op, new C_RetryMessage(this, op)); + return true; + } + if (err) + goto reply; + } + + if (expected_num_objects < 0) { + ss << "'expected_num_objects' must be non-negative"; + err = -EINVAL; + goto reply; + } + + set<int32_t> osds; + osdmap.get_all_osds(osds); + bool has_filestore_osd = std::any_of(osds.begin(), osds.end(), [this](int osd) { + string type; + if 
(!get_osd_objectstore_type(osd, &type)) { + return type == "filestore"; + } else { + return false; + } + }); + + if (has_filestore_osd && + expected_num_objects > 0 && + cct->_conf->filestore_merge_threshold > 0) { + ss << "'expected_num_objects' requires 'filestore_merge_threshold < 0'"; + err = -EINVAL; + goto reply; + } + + if (has_filestore_osd && + expected_num_objects == 0 && + cct->_conf->filestore_merge_threshold < 0) { + int osds = osdmap.get_num_osds(); + bool sure = false; + cmd_getval(cmdmap, "yes_i_really_mean_it", sure); + if (!sure && osds && (pg_num >= 1024 || pg_num / osds >= 100)) { + ss << "For better initial performance on pools expected to store a " + << "large number of objects, consider supplying the " + << "expected_num_objects parameter when creating the pool." + << " Pass --yes-i-really-mean-it to ignore it"; + err = -EPERM; + goto reply; + } + } + + int64_t fast_read_param; + cmd_getval(cmdmap, "fast_read", fast_read_param, int64_t(-1)); + FastReadType fast_read = FAST_READ_DEFAULT; + if (fast_read_param == 0) + fast_read = FAST_READ_OFF; + else if (fast_read_param > 0) + fast_read = FAST_READ_ON; + + int64_t repl_size = 0; + cmd_getval(cmdmap, "size", repl_size); + int64_t target_size_bytes = 0; + double target_size_ratio = 0.0; + cmd_getval(cmdmap, "target_size_bytes", target_size_bytes); + cmd_getval(cmdmap, "target_size_ratio", target_size_ratio); + + string pg_autoscale_mode; + cmd_getval(cmdmap, "autoscale_mode", pg_autoscale_mode); + + bool bulk = 0; + cmd_getval(cmdmap, "bulk", bulk); + err = prepare_new_pool(poolstr, + -1, // default crush rule + rule_name, + pg_num, pgp_num, pg_num_min, pg_num_max, + repl_size, target_size_bytes, target_size_ratio, + erasure_code_profile, pool_type, + (uint64_t)expected_num_objects, + fast_read, + pg_autoscale_mode, + bulk, + &ss); + if (err < 0) { + switch(err) { + case -EEXIST: + ss << "pool '" << poolstr << "' already exists"; + break; + case -EAGAIN: + wait_for_finished_proposal(op, new C_RetryMessage(this, op)); + return true; + case -ERANGE: + goto reply; + default: + goto reply; + break; + } + } else { + ss << "pool '" << poolstr << "' created"; + } + getline(ss, rs); + wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs, + get_last_committed() + 1)); + return true; + + } else if (prefix == "osd pool delete" || + prefix == "osd pool rm") { + // osd pool delete/rm <poolname> <poolname again> --yes-i-really-really-mean-it + string poolstr, poolstr2, sure; + cmd_getval(cmdmap, "pool", poolstr); + cmd_getval(cmdmap, "pool2", poolstr2); + int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str()); + if (pool < 0) { + ss << "pool '" << poolstr << "' does not exist"; + err = 0; + goto reply; + } + + bool force_no_fake = false; + cmd_getval(cmdmap, "yes_i_really_really_mean_it", force_no_fake); + bool force = false; + cmd_getval(cmdmap, "yes_i_really_really_mean_it_not_faking", force); + if (poolstr2 != poolstr || + (!force && !force_no_fake)) { + ss << "WARNING: this will *PERMANENTLY DESTROY* all data stored in pool " << poolstr + << ". 
If you are *ABSOLUTELY CERTAIN* that is what you want, pass the pool name *twice*, " + << "followed by --yes-i-really-really-mean-it."; + err = -EPERM; + goto reply; + } + err = _prepare_remove_pool(pool, &ss, force_no_fake); + if (err == -EAGAIN) { + wait_for_finished_proposal(op, new C_RetryMessage(this, op)); + return true; + } + if (err < 0) + goto reply; + goto update; + } else if (prefix == "osd pool rename") { + string srcpoolstr, destpoolstr; + cmd_getval(cmdmap, "srcpool", srcpoolstr); + cmd_getval(cmdmap, "destpool", destpoolstr); + int64_t pool_src = osdmap.lookup_pg_pool_name(srcpoolstr.c_str()); + int64_t pool_dst = osdmap.lookup_pg_pool_name(destpoolstr.c_str()); + + if (pool_src < 0) { + if (pool_dst >= 0) { + // src pool doesn't exist, dst pool does exist: to ensure idempotency + // of operations, assume this rename succeeded, as it is not changing + // the current state. Make sure we output something understandable + // for whoever is issuing the command, if they are paying attention, + // in case it was not intentional; or to avoid a "wtf?" and a bug + // report in case it was intentional, while expecting a failure. + ss << "pool '" << srcpoolstr << "' does not exist; pool '" + << destpoolstr << "' does -- assuming successful rename"; + err = 0; + } else { + ss << "unrecognized pool '" << srcpoolstr << "'"; + err = -ENOENT; + } + goto reply; + } else if (pool_dst >= 0) { + // source pool exists and so does the destination pool + ss << "pool '" << destpoolstr << "' already exists"; + err = -EEXIST; + goto reply; + } + + int ret = _prepare_rename_pool(pool_src, destpoolstr); + if (ret == 0) { + ss << "pool '" << srcpoolstr << "' renamed to '" << destpoolstr << "'"; + } else { + ss << "failed to rename pool '" << srcpoolstr << "' to '" << destpoolstr << "': " + << cpp_strerror(ret); + } + getline(ss, rs); + wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, ret, rs, + get_last_committed() + 1)); + return true; + + } else if (prefix == "osd pool set") { + err = prepare_command_pool_set(cmdmap, ss); + if (err == -EAGAIN) + goto wait; + if (err < 0) + goto reply; + + getline(ss, rs); + wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs, + get_last_committed() + 1)); + return true; + } else if (prefix == "osd tier add") { + err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss); + if (err == -EAGAIN) + goto wait; + if (err) + goto reply; + string poolstr; + cmd_getval(cmdmap, "pool", poolstr); + int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr); + if (pool_id < 0) { + ss << "unrecognized pool '" << poolstr << "'"; + err = -ENOENT; + goto reply; + } + string tierpoolstr; + cmd_getval(cmdmap, "tierpool", tierpoolstr); + int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr); + if (tierpool_id < 0) { + ss << "unrecognized pool '" << tierpoolstr << "'"; + err = -ENOENT; + goto reply; + } + const pg_pool_t *p = osdmap.get_pg_pool(pool_id); + ceph_assert(p); + const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id); + ceph_assert(tp); + + if (!_check_become_tier(tierpool_id, tp, pool_id, p, &err, &ss)) { + goto reply; + } + + // make sure new tier is empty + string force_nonempty; + cmd_getval(cmdmap, "force_nonempty", force_nonempty); + const pool_stat_t *pstats = mon.mgrstatmon()->get_pool_stat(tierpool_id); + if (pstats && pstats->stats.sum.num_objects != 0 && + force_nonempty != "--force-nonempty") { + ss << "tier pool '" << tierpoolstr << "' is not empty; --force-nonempty to force"; + err = -ENOTEMPTY; + goto reply; + } + if 
(tp->is_erasure()) { + ss << "tier pool '" << tierpoolstr + << "' is an ec pool, which cannot be a tier"; + err = -ENOTSUP; + goto reply; + } + if ((!tp->removed_snaps.empty() || !tp->snaps.empty()) && + ((force_nonempty != "--force-nonempty") || + (!g_conf()->mon_debug_unsafe_allow_tier_with_nonempty_snaps))) { + ss << "tier pool '" << tierpoolstr << "' has snapshot state; it cannot be added as a tier without breaking the pool"; + err = -ENOTEMPTY; + goto reply; + } + // go + pg_pool_t *np = pending_inc.get_new_pool(pool_id, p); + pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp); + if (np->tiers.count(tierpool_id) || ntp->is_tier()) { + wait_for_finished_proposal(op, new C_RetryMessage(this, op)); + return true; + } + np->tiers.insert(tierpool_id); + np->set_snap_epoch(pending_inc.epoch); // tier will update to our snap info + ntp->tier_of = pool_id; + ss << "pool '" << tierpoolstr << "' is now (or already was) a tier of '" << poolstr << "'"; + wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(), + get_last_committed() + 1)); + return true; + } else if (prefix == "osd tier remove" || + prefix == "osd tier rm") { + string poolstr; + cmd_getval(cmdmap, "pool", poolstr); + int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr); + if (pool_id < 0) { + ss << "unrecognized pool '" << poolstr << "'"; + err = -ENOENT; + goto reply; + } + string tierpoolstr; + cmd_getval(cmdmap, "tierpool", tierpoolstr); + int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr); + if (tierpool_id < 0) { + ss << "unrecognized pool '" << tierpoolstr << "'"; + err = -ENOENT; + goto reply; + } + const pg_pool_t *p = osdmap.get_pg_pool(pool_id); + ceph_assert(p); + const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id); + ceph_assert(tp); + + if (!_check_remove_tier(pool_id, p, tp, &err, &ss)) { + goto reply; + } + + if (p->tiers.count(tierpool_id) == 0) { + ss << "pool '" << tierpoolstr << "' is now (or already was) not a tier of '" << poolstr << "'"; + err = 0; + goto reply; + } + if (tp->tier_of != pool_id) { + ss << "tier pool '" << tierpoolstr << "' is a tier of '" + << osdmap.get_pool_name(tp->tier_of) << "': " + // be scary about it; this is an inconsistency and bells must go off + << "THIS SHOULD NOT HAVE HAPPENED AT ALL"; + err = -EINVAL; + goto reply; + } + if (p->read_tier == tierpool_id) { + ss << "tier pool '" << tierpoolstr << "' is the overlay for '" << poolstr << "'; please remove-overlay first"; + err = -EBUSY; + goto reply; + } + // go + pg_pool_t *np = pending_inc.get_new_pool(pool_id, p); + pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp); + if (np->tiers.count(tierpool_id) == 0 || + ntp->tier_of != pool_id || + np->read_tier == tierpool_id) { + wait_for_finished_proposal(op, new C_RetryMessage(this, op)); + return true; + } + np->tiers.erase(tierpool_id); + ntp->clear_tier(); + ss << "pool '" << tierpoolstr << "' is now (or already was) not a tier of '" << poolstr << "'"; + wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(), + get_last_committed() + 1)); + return true; + } else if (prefix == "osd tier set-overlay") { + err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss); + if (err == -EAGAIN) + goto wait; + if (err) + goto reply; + string poolstr; + cmd_getval(cmdmap, "pool", poolstr); + int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr); + if (pool_id < 0) { + ss << "unrecognized pool '" << poolstr << "'"; + err = -ENOENT; + goto reply; + } + string overlaypoolstr; + cmd_getval(cmdmap, "overlaypool", 
overlaypoolstr); + int64_t overlaypool_id = osdmap.lookup_pg_pool_name(overlaypoolstr); + if (overlaypool_id < 0) { + ss << "unrecognized pool '" << overlaypoolstr << "'"; + err = -ENOENT; + goto reply; + } + const pg_pool_t *p = osdmap.get_pg_pool(pool_id); + ceph_assert(p); + const pg_pool_t *overlay_p = osdmap.get_pg_pool(overlaypool_id); + ceph_assert(overlay_p); + if (p->tiers.count(overlaypool_id) == 0) { + ss << "tier pool '" << overlaypoolstr << "' is not a tier of '" << poolstr << "'"; + err = -EINVAL; + goto reply; + } + if (p->read_tier == overlaypool_id) { + err = 0; + ss << "overlay for '" << poolstr << "' is now (or already was) '" << overlaypoolstr << "'"; + goto reply; + } + if (p->has_read_tier()) { + ss << "pool '" << poolstr << "' has overlay '" + << osdmap.get_pool_name(p->read_tier) + << "'; please remove-overlay first"; + err = -EINVAL; + goto reply; + } + + // go + pg_pool_t *np = pending_inc.get_new_pool(pool_id, p); + np->read_tier = overlaypool_id; + np->write_tier = overlaypool_id; + np->set_last_force_op_resend(pending_inc.epoch); + pg_pool_t *noverlay_p = pending_inc.get_new_pool(overlaypool_id, overlay_p); + noverlay_p->set_last_force_op_resend(pending_inc.epoch); + ss << "overlay for '" << poolstr << "' is now (or already was) '" << overlaypoolstr << "'"; + if (overlay_p->cache_mode == pg_pool_t::CACHEMODE_NONE) + ss <<" (WARNING: overlay pool cache_mode is still NONE)"; + wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(), + get_last_committed() + 1)); + return true; + } else if (prefix == "osd tier remove-overlay" || + prefix == "osd tier rm-overlay") { + string poolstr; + cmd_getval(cmdmap, "pool", poolstr); + int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr); + if (pool_id < 0) { + ss << "unrecognized pool '" << poolstr << "'"; + err = -ENOENT; + goto reply; + } + const pg_pool_t *p = osdmap.get_pg_pool(pool_id); + ceph_assert(p); + if (!p->has_read_tier()) { + err = 0; + ss << "there is now (or already was) no overlay for '" << poolstr << "'"; + goto reply; + } + + if (!_check_remove_tier(pool_id, p, NULL, &err, &ss)) { + goto reply; + } + + // go + pg_pool_t *np = pending_inc.get_new_pool(pool_id, p); + if (np->has_read_tier()) { + const pg_pool_t *op = osdmap.get_pg_pool(np->read_tier); + pg_pool_t *nop = pending_inc.get_new_pool(np->read_tier,op); + nop->set_last_force_op_resend(pending_inc.epoch); + } + if (np->has_write_tier()) { + const pg_pool_t *op = osdmap.get_pg_pool(np->write_tier); + pg_pool_t *nop = pending_inc.get_new_pool(np->write_tier, op); + nop->set_last_force_op_resend(pending_inc.epoch); + } + np->clear_read_tier(); + np->clear_write_tier(); + np->set_last_force_op_resend(pending_inc.epoch); + ss << "there is now (or already was) no overlay for '" << poolstr << "'"; + wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(), + get_last_committed() + 1)); + return true; + } else if (prefix == "osd tier cache-mode") { + err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss); + if (err == -EAGAIN) + goto wait; + if (err) + goto reply; + string poolstr; + cmd_getval(cmdmap, "pool", poolstr); + int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr); + if (pool_id < 0) { + ss << "unrecognized pool '" << poolstr << "'"; + err = -ENOENT; + goto reply; + } + const pg_pool_t *p = osdmap.get_pg_pool(pool_id); + ceph_assert(p); + if (!p->is_tier()) { + ss << "pool '" << poolstr << "' is not a tier"; + err = -EINVAL; + goto reply; + } + string modestr; + cmd_getval(cmdmap, "mode", modestr); + 
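// get_cache_mode_from_str() maps the textual mode to a cache_mode_t value + // and yields a negative value for an unknown name, which the check below + // turns into EINVAL. +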
pg_pool_t::cache_mode_t mode = pg_pool_t::get_cache_mode_from_str(modestr); + if (int(mode) < 0) { + ss << "'" << modestr << "' is not a valid cache mode"; + err = -EINVAL; + goto reply; + } + + bool sure = false; + cmd_getval(cmdmap, "yes_i_really_mean_it", sure); + + if (mode == pg_pool_t::CACHEMODE_FORWARD || + mode == pg_pool_t::CACHEMODE_READFORWARD) { + ss << "'" << modestr << "' is no longer a supported cache mode"; + err = -EPERM; + goto reply; + } + if ((mode != pg_pool_t::CACHEMODE_WRITEBACK && + mode != pg_pool_t::CACHEMODE_NONE && + mode != pg_pool_t::CACHEMODE_PROXY && + mode != pg_pool_t::CACHEMODE_READPROXY) && + !sure) { + ss << "'" << modestr << "' is not a well-supported cache mode and may " + << "corrupt your data. pass --yes-i-really-mean-it to force."; + err = -EPERM; + goto reply; + } + + // pool already has this cache-mode set and there are no pending changes + if (p->cache_mode == mode && + (pending_inc.new_pools.count(pool_id) == 0 || + pending_inc.new_pools[pool_id].cache_mode == p->cache_mode)) { + ss << "set cache-mode for pool '" << poolstr << "'" + << " to " << pg_pool_t::get_cache_mode_name(mode); + err = 0; + goto reply; + } + + /* Mode description: + * + * none: No cache-mode defined + * forward: Forward all reads and writes to base pool [removed] + * writeback: Cache writes, promote reads from base pool + * readonly: Forward writes to base pool + * readforward: Writes are in writeback mode, Reads are in forward mode [removed] + * proxy: Proxy all reads and writes to base pool + * readproxy: Writes are in writeback mode, Reads are in proxy mode + * + * Hence, these are the allowed transitions: + * + * none -> any + * forward -> proxy || readforward || readproxy || writeback || any IF num_objects_dirty == 0 + * proxy -> readproxy || writeback || any IF num_objects_dirty == 0 + * readforward -> forward || proxy || readproxy || writeback || any IF num_objects_dirty == 0 + * readproxy -> proxy || writeback || any IF num_objects_dirty == 0 + * writeback -> readproxy || proxy + * readonly -> any + */ + + // We check if the transition is valid against the current pool mode, as + // it is the only committed state thus far. We will blatantly squash + // whatever mode is on the pending state.
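+ // For example, a 'writeback' pool may only move to 'proxy' or 'readproxy' + // (so any dirty objects can still be flushed), e.g. + // ceph osd tier cache-mode <cachepool> readproxy + // whereas the other modes may additionally move anywhere once the pool + // reports num_objects_dirty == 0 in the mgr pool stats checked below.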
+ + if (p->cache_mode == pg_pool_t::CACHEMODE_WRITEBACK && + (mode != pg_pool_t::CACHEMODE_PROXY && + mode != pg_pool_t::CACHEMODE_READPROXY)) { + ss << "unable to set cache-mode '" << pg_pool_t::get_cache_mode_name(mode) + << "' on a '" << pg_pool_t::get_cache_mode_name(p->cache_mode) + << "' pool; only '" + << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_PROXY) + << "' or '" + << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_READPROXY) + << "' allowed."; + err = -EINVAL; + goto reply; + } + if ((p->cache_mode == pg_pool_t::CACHEMODE_READFORWARD && + (mode != pg_pool_t::CACHEMODE_WRITEBACK && + mode != pg_pool_t::CACHEMODE_PROXY && + mode != pg_pool_t::CACHEMODE_READPROXY)) || + + (p->cache_mode == pg_pool_t::CACHEMODE_READPROXY && + (mode != pg_pool_t::CACHEMODE_WRITEBACK && + mode != pg_pool_t::CACHEMODE_PROXY)) || + + (p->cache_mode == pg_pool_t::CACHEMODE_PROXY && + (mode != pg_pool_t::CACHEMODE_WRITEBACK && + mode != pg_pool_t::CACHEMODE_READPROXY)) || + + (p->cache_mode == pg_pool_t::CACHEMODE_FORWARD && + (mode != pg_pool_t::CACHEMODE_WRITEBACK && + mode != pg_pool_t::CACHEMODE_PROXY && + mode != pg_pool_t::CACHEMODE_READPROXY))) { + + const pool_stat_t* pstats = + mon.mgrstatmon()->get_pool_stat(pool_id); + + if (pstats && pstats->stats.sum.num_objects_dirty > 0) { + ss << "unable to set cache-mode '" + << pg_pool_t::get_cache_mode_name(mode) << "' on pool '" << poolstr + << "': dirty objects found"; + err = -EBUSY; + goto reply; + } + } + // go + pg_pool_t *np = pending_inc.get_new_pool(pool_id, p); + np->cache_mode = mode; + // set this both when moving to and from cache_mode NONE. this is to + // capture legacy pools that were set up before this flag existed. + np->flags |= pg_pool_t::FLAG_INCOMPLETE_CLONES; + ss << "set cache-mode for pool '" << poolstr + << "' to " << pg_pool_t::get_cache_mode_name(mode); + if (mode == pg_pool_t::CACHEMODE_NONE) { + const pg_pool_t *base_pool = osdmap.get_pg_pool(np->tier_of); + ceph_assert(base_pool); + if (base_pool->read_tier == pool_id || + base_pool->write_tier == pool_id) + ss <<" (WARNING: pool is still configured as read or write tier)"; + } + wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(), + get_last_committed() + 1)); + return true; + } else if (prefix == "osd tier add-cache") { + err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss); + if (err == -EAGAIN) + goto wait; + if (err) + goto reply; + string poolstr; + cmd_getval(cmdmap, "pool", poolstr); + int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr); + if (pool_id < 0) { + ss << "unrecognized pool '" << poolstr << "'"; + err = -ENOENT; + goto reply; + } + string tierpoolstr; + cmd_getval(cmdmap, "tierpool", tierpoolstr); + int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr); + if (tierpool_id < 0) { + ss << "unrecognized pool '" << tierpoolstr << "'"; + err = -ENOENT; + goto reply; + } + const pg_pool_t *p = osdmap.get_pg_pool(pool_id); + ceph_assert(p); + const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id); + ceph_assert(tp); + + if (!_check_become_tier(tierpool_id, tp, pool_id, p, &err, &ss)) { + goto reply; + } + + int64_t size = 0; + if (!cmd_getval(cmdmap, "size", size)) { + ss << "unable to parse 'size' value '" + << cmd_vartype_stringify(cmdmap.at("size")) << "'"; + err = -EINVAL; + goto reply; + } + // make sure new tier is empty + const pool_stat_t *pstats = + mon.mgrstatmon()->get_pool_stat(tierpool_id); + if (pstats && pstats->stats.sum.num_objects != 0) { + ss << "tier pool '" << tierpoolstr << "' is not empty"; + err = -ENOTEMPTY; + goto reply; + } + auto& modestr =
g_conf().get_val<string>("osd_tier_default_cache_mode"); + pg_pool_t::cache_mode_t mode = pg_pool_t::get_cache_mode_from_str(modestr); + if (int(mode) < 0) { + ss << "osd tier cache default mode '" << modestr << "' is not a valid cache mode"; + err = -EINVAL; + goto reply; + } + HitSet::Params hsp; + auto& cache_hit_set_type = + g_conf().get_val<string>("osd_tier_default_cache_hit_set_type"); + if (cache_hit_set_type == "bloom") { + BloomHitSet::Params *bsp = new BloomHitSet::Params; + bsp->set_fpp(g_conf().get_val<double>("osd_pool_default_hit_set_bloom_fpp")); + hsp = HitSet::Params(bsp); + } else if (cache_hit_set_type == "explicit_hash") { + hsp = HitSet::Params(new ExplicitHashHitSet::Params); + } else if (cache_hit_set_type == "explicit_object") { + hsp = HitSet::Params(new ExplicitObjectHitSet::Params); + } else { + ss << "osd tier cache default hit set type '" + << cache_hit_set_type << "' is not a known type"; + err = -EINVAL; + goto reply; + } + // go + pg_pool_t *np = pending_inc.get_new_pool(pool_id, p); + pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp); + if (np->tiers.count(tierpool_id) || ntp->is_tier()) { + wait_for_finished_proposal(op, new C_RetryMessage(this, op)); + return true; + } + np->tiers.insert(tierpool_id); + np->read_tier = np->write_tier = tierpool_id; + np->set_snap_epoch(pending_inc.epoch); // tier will update to our snap info + np->set_last_force_op_resend(pending_inc.epoch); + ntp->set_last_force_op_resend(pending_inc.epoch); + ntp->tier_of = pool_id; + ntp->cache_mode = mode; + ntp->hit_set_count = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_count"); + ntp->hit_set_period = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_period"); + ntp->min_read_recency_for_promote = g_conf().get_val<uint64_t>("osd_tier_default_cache_min_read_recency_for_promote"); + ntp->min_write_recency_for_promote = g_conf().get_val<uint64_t>("osd_tier_default_cache_min_write_recency_for_promote"); + ntp->hit_set_grade_decay_rate = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_grade_decay_rate"); + ntp->hit_set_search_last_n = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_search_last_n"); + ntp->hit_set_params = hsp; + ntp->target_max_bytes = size; + ss << "pool '" << tierpoolstr << "' is now (or already was) a cache tier of '" << poolstr << "'"; + wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(), + get_last_committed() + 1)); + return true; + } else if (prefix == "osd pool set-quota") { + string poolstr; + cmd_getval(cmdmap, "pool", poolstr); + int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr); + if (pool_id < 0) { + ss << "unrecognized pool '" << poolstr << "'"; + err = -ENOENT; + goto reply; + } + + string field; + cmd_getval(cmdmap, "field", field); + if (field != "max_objects" && field != "max_bytes") { + ss << "unrecognized field '" << field << "'; should be 'max_bytes' or 'max_objects'"; + err = -EINVAL; + goto reply; + } + + // val could contain unit designations, so we treat as a string + string val; + cmd_getval(cmdmap, "val", val); + string tss; + int64_t value; + if (field == "max_objects") { + value = strict_sistrtoll(val.c_str(), &tss); + } else if (field == "max_bytes") { + value = strict_iecstrtoll(val.c_str(), &tss); + } else { + ceph_abort_msg("unrecognized option"); + } + if (!tss.empty()) { + ss << "error parsing value '" << val << "': " << tss; + err = -EINVAL; + goto reply; + } + + pg_pool_t *pi = pending_inc.get_new_pool(pool_id, osdmap.get_pg_pool(pool_id)); + if 
(field == "max_objects") { + pi->quota_max_objects = value; + } else if (field == "max_bytes") { + pi->quota_max_bytes = value; + } else { + ceph_abort_msg("unrecognized option"); + } + ss << "set-quota " << field << " = " << value << " for pool " << poolstr; + rs = ss.str(); + wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs, + get_last_committed() + 1)); + return true; + } else if (prefix == "osd pool application enable" || + prefix == "osd pool application disable" || + prefix == "osd pool application set" || + prefix == "osd pool application rm") { + err = prepare_command_pool_application(prefix, cmdmap, ss); + if (err == -EAGAIN) { + goto wait; + } else if (err < 0) { + goto reply; + } else { + goto update; + } + } else if (prefix == "osd force-create-pg") { + pg_t pgid; + string pgidstr; + cmd_getval(cmdmap, "pgid", pgidstr); + if (!pgid.parse(pgidstr.c_str())) { + ss << "invalid pgid '" << pgidstr << "'"; + err = -EINVAL; + goto reply; + } + if (!osdmap.pg_exists(pgid)) { + ss << "pg " << pgid << " should not exist"; + err = -ENOENT; + goto reply; + } + bool sure = false; + cmd_getval(cmdmap, "yes_i_really_mean_it", sure); + if (!sure) { + ss << "This command will recreate a lost (as in data lost) PG with data in it, such " + << "that the cluster will give up ever trying to recover the lost data. Do this " + << "only if you are certain that all copies of the PG are in fact lost and you are " + << "willing to accept that the data is permanently destroyed. Pass " + << "--yes-i-really-mean-it to proceed."; + err = -EPERM; + goto reply; + } + bool creating_now; + { + std::lock_guard<std::mutex> l(creating_pgs_lock); + auto emplaced = creating_pgs.pgs.emplace( + pgid, + creating_pgs_t::pg_create_info(osdmap.get_epoch(), + ceph_clock_now())); + creating_now = emplaced.second; + } + if (creating_now) { + ss << "pg " << pgidstr << " now creating, ok"; + // set the pool's CREATING flag so that (1) the osd won't ignore our + // create message and (2) we won't propose any future pg_num changes + // until after the PG has been instantiated. + if (pending_inc.new_pools.count(pgid.pool()) == 0) { + pending_inc.new_pools[pgid.pool()] = *osdmap.get_pg_pool(pgid.pool()); + } + pending_inc.new_pools[pgid.pool()].flags |= pg_pool_t::FLAG_CREATING; + err = 0; + goto update; + } else { + ss << "pg " << pgid << " already creating"; + err = 0; + goto reply; + } + } else if (prefix == "osd force_healthy_stretch_mode") { + bool sure = false; + cmd_getval(cmdmap, "yes_i_really_mean_it", sure); + if (!sure) { + ss << "This command will require peering across multiple CRUSH buckets " + "(probably two data centers or availability zones?) and may result in PGs " + "going inactive until backfilling is complete. Pass --yes-i-really-mean-it to proceed."; + err = -EPERM; + goto reply; + } + try_end_recovery_stretch_mode(true); + ss << "Triggering healthy stretch mode"; + err = 0; + goto reply; + } else if (prefix == "osd force_recovery_stretch_mode") { + bool sure = false; + cmd_getval(cmdmap, "yes_i_really_mean_it", sure); + if (!sure) { + ss << "This command will increase pool sizes to try and spread them " + "across multiple CRUSH buckets (probably two data centers or " + "availability zones?) 
and should have happened automatically. " + "Pass --yes-i-really-mean-it to proceed."; + err = -EPERM; + goto reply; + } + mon.go_recovery_stretch_mode(); + ss << "Triggering recovery stretch mode"; + err = 0; + goto reply; + } else { + err = -EINVAL; + } + + reply: + getline(ss, rs); + if (err < 0 && rs.length() == 0) + rs = cpp_strerror(err); + mon.reply_command(op, err, rs, rdata, get_last_committed()); + return ret; + + update: + getline(ss, rs); + wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs, + get_last_committed() + 1)); + return true; + + wait: + wait_for_finished_proposal(op, new C_RetryMessage(this, op)); + return true; +} + +bool OSDMonitor::enforce_pool_op_caps(MonOpRequestRef op) +{ + op->mark_osdmon_event(__func__); + + auto m = op->get_req<MPoolOp>(); + MonSession *session = op->get_session(); + if (!session) { + _pool_op_reply(op, -EPERM, osdmap.get_epoch()); + return true; + } + + switch (m->op) { + case POOL_OP_CREATE_UNMANAGED_SNAP: + case POOL_OP_DELETE_UNMANAGED_SNAP: + { + const std::string* pool_name = nullptr; + const pg_pool_t *pg_pool = osdmap.get_pg_pool(m->pool); + if (pg_pool != nullptr) { + pool_name = &osdmap.get_pool_name(m->pool); + } + + if (!is_unmanaged_snap_op_permitted(cct, mon.key_server, + session->entity_name, session->caps, + session->get_peer_socket_addr(), + pool_name)) { + dout(0) << "got unmanaged-snap pool op from entity with insufficient " + << "privileges. message: " << *m << std::endl + << "caps: " << session->caps << dendl; + _pool_op_reply(op, -EPERM, osdmap.get_epoch()); + return true; + } + } + break; + default: + if (!session->is_capable("osd", MON_CAP_W)) { + dout(0) << "got pool op from entity with insufficient privileges. " + << "message: " << *m << std::endl + << "caps: " << session->caps << dendl; + _pool_op_reply(op, -EPERM, osdmap.get_epoch()); + return true; + } + break; + } + + return false; +} + +bool OSDMonitor::preprocess_pool_op(MonOpRequestRef op) +{ + op->mark_osdmon_event(__func__); + auto m = op->get_req<MPoolOp>(); + + if (enforce_pool_op_caps(op)) { + return true; + } + + if (m->fsid != mon.monmap->fsid) { + dout(0) << __func__ << " drop message on fsid " << m->fsid + << " != " << mon.monmap->fsid << " for " << *m << dendl; + _pool_op_reply(op, -EINVAL, osdmap.get_epoch()); + return true; + } + + if (m->op == POOL_OP_CREATE) + return preprocess_pool_op_create(op); + + const pg_pool_t *p = osdmap.get_pg_pool(m->pool); + if (p == nullptr) { + dout(10) << "attempt to operate on non-existent pool id " << m->pool << dendl; + if (m->op == POOL_OP_DELETE) { + _pool_op_reply(op, 0, osdmap.get_epoch()); + } else { + _pool_op_reply(op, -ENOENT, osdmap.get_epoch()); + } + return true; + } + + // check if the snap and snapname exist + bool snap_exists = false; + if (p->snap_exists(m->name.c_str())) + snap_exists = true; + + switch (m->op) { + case POOL_OP_CREATE_SNAP: + if (p->is_unmanaged_snaps_mode() || p->is_tier()) { + _pool_op_reply(op, -EINVAL, osdmap.get_epoch()); + return true; + } + if (snap_exists) { + _pool_op_reply(op, 0, osdmap.get_epoch()); + return true; + } + return false; + case POOL_OP_CREATE_UNMANAGED_SNAP: + if (p->is_pool_snaps_mode()) { + _pool_op_reply(op, -EINVAL, osdmap.get_epoch()); + return true; + } + return false; + case POOL_OP_DELETE_SNAP: + if (p->is_unmanaged_snaps_mode()) { + _pool_op_reply(op, -EINVAL, osdmap.get_epoch()); + return true; + } + if (!snap_exists) { + _pool_op_reply(op, 0, osdmap.get_epoch()); + return true; + } + return false; + case
POOL_OP_DELETE_UNMANAGED_SNAP: + if (p->is_pool_snaps_mode()) { + _pool_op_reply(op, -EINVAL, osdmap.get_epoch()); + return true; + } + if (_is_removed_snap(m->pool, m->snapid)) { + _pool_op_reply(op, 0, osdmap.get_epoch()); + return true; + } + return false; + case POOL_OP_DELETE: + if (osdmap.lookup_pg_pool_name(m->name.c_str()) >= 0) { + _pool_op_reply(op, 0, osdmap.get_epoch()); + return true; + } + return false; + case POOL_OP_AUID_CHANGE: + return false; + default: + ceph_abort(); + break; + } + + return false; +} + +bool OSDMonitor::_is_removed_snap(int64_t pool, snapid_t snap) +{ + if (!osdmap.have_pg_pool(pool)) { + dout(10) << __func__ << " pool " << pool << " snap " << snap + << " - pool dne" << dendl; + return true; + } + if (osdmap.in_removed_snaps_queue(pool, snap)) { + dout(10) << __func__ << " pool " << pool << " snap " << snap + << " - in osdmap removed_snaps_queue" << dendl; + return true; + } + snapid_t begin, end; + int r = lookup_purged_snap(pool, snap, &begin, &end); + if (r == 0) { + dout(10) << __func__ << " pool " << pool << " snap " << snap + << " - purged, [" << begin << "," << end << ")" << dendl; + return true; + } + return false; +} + +bool OSDMonitor::_is_pending_removed_snap(int64_t pool, snapid_t snap) +{ + if (pending_inc.old_pools.count(pool)) { + dout(10) << __func__ << " pool " << pool << " snap " << snap + << " - pool pending deletion" << dendl; + return true; + } + if (pending_inc.in_new_removed_snaps(pool, snap)) { + dout(10) << __func__ << " pool " << pool << " snap " << snap + << " - in pending new_removed_snaps" << dendl; + return true; + } + return false; +} + +bool OSDMonitor::preprocess_pool_op_create(MonOpRequestRef op) +{ + op->mark_osdmon_event(__func__); + auto m = op->get_req<MPoolOp>(); + int64_t pool = osdmap.lookup_pg_pool_name(m->name.c_str()); + if (pool >= 0) { + _pool_op_reply(op, 0, osdmap.get_epoch()); + return true; + } + + return false; +} + +bool OSDMonitor::prepare_pool_op(MonOpRequestRef op) +{ + op->mark_osdmon_event(__func__); + auto m = op->get_req<MPoolOp>(); + dout(10) << "prepare_pool_op " << *m << dendl; + if (m->op == POOL_OP_CREATE) { + return prepare_pool_op_create(op); + } else if (m->op == POOL_OP_DELETE) { + return prepare_pool_op_delete(op); + } + + int ret = 0; + bool changed = false; + + if (!osdmap.have_pg_pool(m->pool)) { + _pool_op_reply(op, -ENOENT, osdmap.get_epoch()); + return false; + } + + const pg_pool_t *pool = osdmap.get_pg_pool(m->pool); + + switch (m->op) { + case POOL_OP_CREATE_SNAP: + if (pool->is_tier()) { + ret = -EINVAL; + _pool_op_reply(op, ret, osdmap.get_epoch()); + return false; + } // else, fall through + case POOL_OP_DELETE_SNAP: + if (!pool->is_unmanaged_snaps_mode()) { + bool snap_exists = pool->snap_exists(m->name.c_str()); + if ((m->op == POOL_OP_CREATE_SNAP && snap_exists) + || (m->op == POOL_OP_DELETE_SNAP && !snap_exists)) { + ret = 0; + } else { + break; + } + } else { + ret = -EINVAL; + } + _pool_op_reply(op, ret, osdmap.get_epoch()); + return false; + + case POOL_OP_DELETE_UNMANAGED_SNAP: + // we won't allow removal of an unmanaged snapshot from a pool + // not in unmanaged snaps mode. + if (!pool->is_unmanaged_snaps_mode()) { + _pool_op_reply(op, -ENOTSUP, osdmap.get_epoch()); + return false; + } + /* fall-thru */ + case POOL_OP_CREATE_UNMANAGED_SNAP: + // but we will allow creating an unmanaged snapshot on any pool + // as long as it is not in 'pool' snaps mode. 
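+ // (self-managed snapshots, e.g. RBD's, take this path; pool snapshots and + // unmanaged snapshots remain mutually exclusive on any given pool)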
+ if (pool->is_pool_snaps_mode()) { + _pool_op_reply(op, -EINVAL, osdmap.get_epoch()); + return false; + } + } + + // projected pool info + pg_pool_t pp; + if (pending_inc.new_pools.count(m->pool)) + pp = pending_inc.new_pools[m->pool]; + else + pp = *osdmap.get_pg_pool(m->pool); + + bufferlist reply_data; + + // pool snaps vs unmanaged snaps are mutually exclusive + switch (m->op) { + case POOL_OP_CREATE_SNAP: + case POOL_OP_DELETE_SNAP: + if (pp.is_unmanaged_snaps_mode()) { + ret = -EINVAL; + goto out; + } + break; + + case POOL_OP_CREATE_UNMANAGED_SNAP: + case POOL_OP_DELETE_UNMANAGED_SNAP: + if (pp.is_pool_snaps_mode()) { + ret = -EINVAL; + goto out; + } + } + + switch (m->op) { + case POOL_OP_CREATE_SNAP: + if (!pp.snap_exists(m->name.c_str())) { + pp.add_snap(m->name.c_str(), ceph_clock_now()); + dout(10) << "create snap in pool " << m->pool << " " << m->name + << " seq " << pp.get_snap_epoch() << dendl; + changed = true; + } + break; + + case POOL_OP_DELETE_SNAP: + { + snapid_t s = pp.snap_exists(m->name.c_str()); + if (s) { + pp.remove_snap(s); + pending_inc.new_removed_snaps[m->pool].insert(s); + changed = true; + } + } + break; + + case POOL_OP_CREATE_UNMANAGED_SNAP: + { + uint64_t snapid = pp.add_unmanaged_snap( + osdmap.require_osd_release < ceph_release_t::octopus); + encode(snapid, reply_data); + changed = true; + } + break; + + case POOL_OP_DELETE_UNMANAGED_SNAP: + if (!_is_removed_snap(m->pool, m->snapid) && + !_is_pending_removed_snap(m->pool, m->snapid)) { + if (m->snapid > pp.get_snap_seq()) { + _pool_op_reply(op, -ENOENT, osdmap.get_epoch()); + return false; + } + pp.remove_unmanaged_snap( + m->snapid, + osdmap.require_osd_release < ceph_release_t::octopus); + pending_inc.new_removed_snaps[m->pool].insert(m->snapid); + // also record the new seq as purged: this avoids a discontinuity + // after all of the snaps have been purged, since the seq assigned + // during removal lives in the same namespace as the actual snaps. 
+ pending_pseudo_purged_snaps[m->pool].insert(pp.get_snap_seq()); + changed = true; + } + break; + + case POOL_OP_AUID_CHANGE: + _pool_op_reply(op, -EOPNOTSUPP, osdmap.get_epoch()); + return false; + + default: + ceph_abort(); + break; + } + + if (changed) { + pp.set_snap_epoch(pending_inc.epoch); + pending_inc.new_pools[m->pool] = pp; + } + + out: + wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, ret, pending_inc.epoch, &reply_data)); + return true; +} + +bool OSDMonitor::prepare_pool_op_create(MonOpRequestRef op) +{ + op->mark_osdmon_event(__func__); + int err = prepare_new_pool(op); + wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, err, pending_inc.epoch)); + return true; +} + +int OSDMonitor::_check_remove_pool(int64_t pool_id, const pg_pool_t& pool, + ostream *ss) +{ + const string& poolstr = osdmap.get_pool_name(pool_id); + + // If the Pool is in use by CephFS, refuse to delete it + FSMap const &pending_fsmap = mon.mdsmon()->get_pending_fsmap(); + if (pending_fsmap.pool_in_use(pool_id)) { + *ss << "pool '" << poolstr << "' is in use by CephFS"; + return -EBUSY; + } + + if (pool.tier_of >= 0) { + *ss << "pool '" << poolstr << "' is a tier of '" + << osdmap.get_pool_name(pool.tier_of) << "'"; + return -EBUSY; + } + if (!pool.tiers.empty()) { + *ss << "pool '" << poolstr << "' has tiers"; + for(auto tier : pool.tiers) { + *ss << " " << osdmap.get_pool_name(tier); + } + return -EBUSY; + } + + if (!g_conf()->mon_allow_pool_delete) { + *ss << "pool deletion is disabled; you must first set the mon_allow_pool_delete config option to true before you can destroy a pool"; + return -EPERM; + } + + if (pool.has_flag(pg_pool_t::FLAG_NODELETE)) { + *ss << "pool deletion is disabled; you must unset nodelete flag for the pool first"; + return -EPERM; + } + + *ss << "pool '" << poolstr << "' removed"; + return 0; +} + +/** + * Check if it is safe to add a tier to a base pool + * + * @return + * True if the operation should proceed, false if we should abort here + * (abort doesn't necessarily mean error, could be idempotency) + */ +bool OSDMonitor::_check_become_tier( + const int64_t tier_pool_id, const pg_pool_t *tier_pool, + const int64_t base_pool_id, const pg_pool_t *base_pool, + int *err, + ostream *ss) const +{ + const std::string &tier_pool_name = osdmap.get_pool_name(tier_pool_id); + const std::string &base_pool_name = osdmap.get_pool_name(base_pool_id); + + const FSMap &pending_fsmap = mon.mdsmon()->get_pending_fsmap(); + if (pending_fsmap.pool_in_use(tier_pool_id)) { + *ss << "pool '" << tier_pool_name << "' is in use by CephFS"; + *err = -EBUSY; + return false; + } + + if (base_pool->tiers.count(tier_pool_id)) { + ceph_assert(tier_pool->tier_of == base_pool_id); + *err = 0; + *ss << "pool '" << tier_pool_name << "' is now (or already was) a tier of '" + << base_pool_name << "'"; + return false; + } + + if (base_pool->is_tier()) { + *ss << "pool '" << base_pool_name << "' is already a tier of '" + << osdmap.get_pool_name(base_pool->tier_of) << "', " + << "multiple tiers are not yet supported."; + *err = -EINVAL; + return false; + } + + if (tier_pool->has_tiers()) { + *ss << "pool '" << tier_pool_name << "' has following tier(s) already:"; + for (set<uint64_t>::iterator it = tier_pool->tiers.begin(); + it != tier_pool->tiers.end(); ++it) + *ss << "'" << osdmap.get_pool_name(*it) << "',"; + *ss << " multiple tiers are not yet supported."; + *err = -EINVAL; + return false; + } + + if (tier_pool->is_tier()) { + *ss << "tier pool '" << tier_pool_name << "' is 
already a tier of '" + << osdmap.get_pool_name(tier_pool->tier_of) << "'"; + *err = -EINVAL; + return false; + } + + *err = 0; + return true; +} + + +/** + * Check if it is safe to remove a tier from this base pool + * + * @return + * True if the operation should proceed, false if we should abort here + * (abort doesn't necessarily mean error, could be idempotency) + */ +bool OSDMonitor::_check_remove_tier( + const int64_t base_pool_id, const pg_pool_t *base_pool, + const pg_pool_t *tier_pool, + int *err, ostream *ss) const +{ + const std::string &base_pool_name = osdmap.get_pool_name(base_pool_id); + + // Apply CephFS-specific checks + const FSMap &pending_fsmap = mon.mdsmon()->get_pending_fsmap(); + if (pending_fsmap.pool_in_use(base_pool_id)) { + if (base_pool->is_erasure() && !base_pool->allows_ecoverwrites()) { + // If the underlying pool is erasure coded and does not allow EC + // overwrites, we can't permit the removal of the replicated tier that + // CephFS relies on to access it + *ss << "pool '" << base_pool_name << + "' does not allow EC overwrites and is in use by CephFS" + " via its tier"; + *err = -EBUSY; + return false; + } + + if (tier_pool && tier_pool->cache_mode == pg_pool_t::CACHEMODE_WRITEBACK) { + *ss << "pool '" << base_pool_name << "' is in use by CephFS, and this " + "tier is still in use as a writeback cache. Change the cache " + "mode and flush the cache before removing it"; + *err = -EBUSY; + return false; + } + } + + *err = 0; + return true; +} + +int OSDMonitor::_prepare_remove_pool( + int64_t pool, ostream *ss, bool no_fake) +{ + dout(10) << __func__ << " " << pool << dendl; + const pg_pool_t *p = osdmap.get_pg_pool(pool); + int r = _check_remove_pool(pool, *p, ss); + if (r < 0) + return r; + + auto new_pool = pending_inc.new_pools.find(pool); + if (new_pool != pending_inc.new_pools.end()) { + // if there is a problem with the pending info, wait and retry + // this op. + const auto& p = new_pool->second; + int r = _check_remove_pool(pool, p, ss); + if (r < 0) + return -EAGAIN; + } + + if (pending_inc.old_pools.count(pool)) { + dout(10) << __func__ << " " << pool << " already pending removal" + << dendl; + return 0; + } + + if (g_conf()->mon_fake_pool_delete && !no_fake) { + string old_name = osdmap.get_pool_name(pool); + string new_name = old_name + "." 
+ stringify(pool) + ".DELETED"; + dout(1) << __func__ << " faking pool deletion: renaming " << pool << " " + << old_name << " -> " << new_name << dendl; + pending_inc.new_pool_names[pool] = new_name; + return 0; + } + + // remove + pending_inc.old_pools.insert(pool); + + // remove any pg_temp mappings for this pool + for (auto p = osdmap.pg_temp->begin(); + p != osdmap.pg_temp->end(); + ++p) { + if (p->first.pool() == pool) { + dout(10) << __func__ << " " << pool << " removing obsolete pg_temp " + << p->first << dendl; + pending_inc.new_pg_temp[p->first].clear(); + } + } + // remove any primary_temp mappings for this pool + for (auto p = osdmap.primary_temp->begin(); + p != osdmap.primary_temp->end(); + ++p) { + if (p->first.pool() == pool) { + dout(10) << __func__ << " " << pool + << " removing obsolete primary_temp " << p->first << dendl; + pending_inc.new_primary_temp[p->first] = -1; + } + } + // remove any pg_upmap mappings for this pool + for (auto& p : osdmap.pg_upmap) { + if (p.first.pool() == pool) { + dout(10) << __func__ << " " << pool + << " removing obsolete pg_upmap " + << p.first << dendl; + pending_inc.old_pg_upmap.insert(p.first); + } + } + // remove any pending pg_upmap mappings for this pool + { + auto it = pending_inc.new_pg_upmap.begin(); + while (it != pending_inc.new_pg_upmap.end()) { + if (it->first.pool() == pool) { + dout(10) << __func__ << " " << pool + << " removing pending pg_upmap " + << it->first << dendl; + it = pending_inc.new_pg_upmap.erase(it); + } else { + it++; + } + } + } + // remove any pg_upmap_items mappings for this pool + for (auto& p : osdmap.pg_upmap_items) { + if (p.first.pool() == pool) { + dout(10) << __func__ << " " << pool + << " removing obsolete pg_upmap_items " << p.first + << dendl; + pending_inc.old_pg_upmap_items.insert(p.first); + } + } + // remove any pending pg_upmap_items mappings for this pool + { + auto it = pending_inc.new_pg_upmap_items.begin(); + while (it != pending_inc.new_pg_upmap_items.end()) { + if (it->first.pool() == pool) { + dout(10) << __func__ << " " << pool + << " removing pending pg_upmap_items " + << it->first << dendl; + it = pending_inc.new_pg_upmap_items.erase(it); + } else { + it++; + } + } + } + + // remove any choose_args for this pool + CrushWrapper newcrush; + _get_pending_crush(newcrush); + if (newcrush.have_choose_args(pool)) { + dout(10) << __func__ << " removing choose_args for pool " << pool << dendl; + newcrush.rm_choose_args(pool); + pending_inc.crush.clear(); + newcrush.encode(pending_inc.crush, mon.get_quorum_con_features()); + } + return 0; +} + +int OSDMonitor::_prepare_rename_pool(int64_t pool, string newname) +{ + dout(10) << "_prepare_rename_pool " << pool << dendl; + if (pending_inc.old_pools.count(pool)) { + dout(10) << "_prepare_rename_pool " << pool << " pending removal" << dendl; + return -ENOENT; + } + for (map<int64_t,string>::iterator p = pending_inc.new_pool_names.begin(); + p != pending_inc.new_pool_names.end(); + ++p) { + if (p->second == newname && p->first != pool) { + return -EEXIST; + } + } + + pending_inc.new_pool_names[pool] = newname; + return 0; +} + +bool OSDMonitor::prepare_pool_op_delete(MonOpRequestRef op) +{ + op->mark_osdmon_event(__func__); + auto m = op->get_req<MPoolOp>(); + ostringstream ss; + int ret = _prepare_remove_pool(m->pool, &ss, false); + if (ret == -EAGAIN) { + wait_for_finished_proposal(op, new C_RetryMessage(this, op)); + return true; + } + if (ret < 0) + dout(10) << __func__ << " got " << ret << " " << ss.str() << dendl; + wait_for_finished_proposal(op, new
OSDMonitor::C_PoolOp(this, op, ret, + pending_inc.epoch)); + return true; +} + +void OSDMonitor::_pool_op_reply(MonOpRequestRef op, + int ret, epoch_t epoch, bufferlist *blp) +{ + op->mark_osdmon_event(__func__); + auto m = op->get_req<MPoolOp>(); + dout(20) << "_pool_op_reply " << ret << dendl; + MPoolOpReply *reply = new MPoolOpReply(m->fsid, m->get_tid(), + ret, epoch, get_last_committed(), blp); + mon.send_reply(op, reply); +} + +void OSDMonitor::convert_pool_priorities(void) +{ + pool_opts_t::key_t key = pool_opts_t::get_opt_desc("recovery_priority").key; + int64_t max_prio = 0; + int64_t min_prio = 0; + for (const auto &i : osdmap.get_pools()) { + const auto &pool = i.second; + + if (pool.opts.is_set(key)) { + int64_t prio = 0; + pool.opts.get(key, &prio); + if (prio > max_prio) + max_prio = prio; + if (prio < min_prio) + min_prio = prio; + } + } + if (max_prio <= OSD_POOL_PRIORITY_MAX && min_prio >= OSD_POOL_PRIORITY_MIN) { + dout(20) << __func__ << " nothing to fix" << dendl; + return; + } + // Current pool priorities exceed the new maximum + for (const auto &i : osdmap.get_pools()) { + const auto pool_id = i.first; + pg_pool_t pool = i.second; + + int64_t prio = 0; + pool.opts.get(key, &prio); + int64_t n; + + if (prio > 0 && max_prio > OSD_POOL_PRIORITY_MAX) { // Likely scenario + // Scaled priority range 0 to OSD_POOL_PRIORITY_MAX + n = (float)prio / max_prio * OSD_POOL_PRIORITY_MAX; + } else if (prio < 0 && min_prio < OSD_POOL_PRIORITY_MIN) { + // Scaled priority range OSD_POOL_PRIORITY_MIN to 0 + n = (float)prio / min_prio * OSD_POOL_PRIORITY_MIN; + } else { + continue; + } + if (n == 0) { + pool.opts.unset(key); + } else { + pool.opts.set(key, static_cast<int64_t>(n)); + } + dout(10) << __func__ << " pool " << pool_id + << " recovery_priority adjusted " + << prio << " to " << n << dendl; + pool.last_change = pending_inc.epoch; + pending_inc.new_pools[pool_id] = pool; + } +} + +void OSDMonitor::try_enable_stretch_mode_pools(stringstream& ss, bool *okay, + int *errcode, + set<pg_pool_t*>* pools, + const string& new_crush_rule) +{ + dout(20) << __func__ << dendl; + *okay = false; + int new_crush_rule_result = osdmap.crush->get_rule_id(new_crush_rule); + if (new_crush_rule_result < 0) { + ss << "unrecognized crush rule " << new_crush_rule; + *errcode = new_crush_rule_result; + return; + } + __u8 new_rule = static_cast<__u8>(new_crush_rule_result); + for (const auto& pooli : osdmap.pools) { + int64_t poolid = pooli.first; + const pg_pool_t *p = &pooli.second; + if (!p->is_replicated()) { + ss << "stretched pools must be replicated; '" << osdmap.pool_name[poolid] << "' is erasure-coded"; + *errcode = -EINVAL; + return; + } + uint8_t default_size = g_conf().get_val<uint64_t>("osd_pool_default_size"); + if ((p->get_size() != default_size || + (p->get_min_size() != g_conf().get_osd_pool_default_min_size(default_size))) && + (p->get_crush_rule() != new_rule)) { + ss << "we currently require stretch mode pools start out with the" + " default size/min_size, which '" << osdmap.pool_name[poolid] << "' does not"; + *errcode = -EINVAL; + return; + } + pg_pool_t *pp = pending_inc.get_new_pool(poolid, p); + // TODO: The part where we unconditionally copy the pools into pending_inc is bad + // the attempt may fail and then we have these pool updates...but they won't do anything + // if there is a failure, so if it's hard to change the interface, no need to bother + pools->insert(pp); + } + *okay = true; + return; +} + +void OSDMonitor::try_enable_stretch_mode(stringstream& ss, bool *okay,
+					 int *errcode, bool commit,
+					 const string& dividing_bucket,
+					 uint32_t bucket_count,
+					 const set<pg_pool_t*>& pools,
+					 const string& new_crush_rule)
+{
+  dout(20) << __func__ << dendl;
+  *okay = false;
+  CrushWrapper crush;
+  _get_pending_crush(crush);
+  int dividing_id;
+  int retval = crush.get_validated_type_id(dividing_bucket, &dividing_id);
+  if (retval == -1) {
+    ss << dividing_bucket << " is not a valid crush bucket type";
+    *errcode = -ENOENT;
+    ceph_assert(!commit || retval != -1);
+    return;
+  }
+  vector<int> subtrees;
+  crush.get_subtree_of_type(dividing_id, &subtrees);
+  if (subtrees.size() != 2) {
+    ss << "there are " << subtrees.size() << " " << dividing_bucket
+       << "'s in the cluster but stretch mode currently only works with 2!";
+    *errcode = -EINVAL;
+    ceph_assert(!commit || subtrees.size() == 2);
+    return;
+  }
+
+  int new_crush_rule_result = crush.get_rule_id(new_crush_rule);
+  if (new_crush_rule_result < 0) {
+    ss << "unrecognized crush rule " << new_crush_rule;
+    *errcode = new_crush_rule_result;
+    ceph_assert(!commit || (new_crush_rule_result > 0));
+    return;
+  }
+  __u8 new_rule = static_cast<__u8>(new_crush_rule_result);
+
+  int weight1 = crush.get_item_weight(subtrees[0]);
+  int weight2 = crush.get_item_weight(subtrees[1]);
+  if (weight1 != weight2) {
+    // TODO: I'm really not sure this is a good idea?
+    ss << "the 2 " << dividing_bucket
+       << " instances in the cluster have differing weights "
+       << weight1 << " and " << weight2
+       << " but stretch mode currently requires they be the same!";
+    *errcode = -EINVAL;
+    ceph_assert(!commit || (weight1 == weight2));
+    return;
+  }
+  if (bucket_count != 2) {
+    ss << "currently we only support 2-site stretch clusters!";
+    *errcode = -EINVAL;
+    ceph_assert(!commit || bucket_count == 2);
+    return;
+  }
+  // TODO: check CRUSH rules for pools so that we are appropriately divided
+  if (commit) {
+    for (auto pool : pools) {
+      pool->crush_rule = new_rule;
+      pool->peering_crush_bucket_count = bucket_count;
+      pool->peering_crush_bucket_target = bucket_count;
+      pool->peering_crush_bucket_barrier = dividing_id;
+      pool->peering_crush_mandatory_member = CRUSH_ITEM_NONE;
+      pool->size = g_conf().get_val<uint64_t>("mon_stretch_pool_size");
+      pool->min_size = g_conf().get_val<uint64_t>("mon_stretch_pool_min_size");
+    }
+    pending_inc.change_stretch_mode = true;
+    pending_inc.stretch_mode_enabled = true;
+    pending_inc.new_stretch_bucket_count = bucket_count;
+    pending_inc.new_degraded_stretch_mode = 0;
+    pending_inc.new_stretch_mode_bucket = dividing_id;
+  }
+  *okay = true;
+  return;
+}
+
+bool OSDMonitor::check_for_dead_crush_zones(const map<string,set<string>>& dead_buckets,
+					    set<int> *really_down_buckets,
+					    set<string> *really_down_mons)
+{
+  dout(20) << __func__ << " with dead mon zones " << dead_buckets << dendl;
+  ceph_assert(is_readable());
+  if (dead_buckets.empty()) return false;
+  set<int> down_cache;
+  bool really_down = false;
+  for (auto dbi : dead_buckets) {
+    const string& bucket_name = dbi.first;
+    ceph_assert(osdmap.crush->name_exists(bucket_name));
+    int bucket_id = osdmap.crush->get_item_id(bucket_name);
+    dout(20) << "Checking " << bucket_name << " id " << bucket_id
+	     << " to see if OSDs are also down" << dendl;
+    bool subtree_down = osdmap.subtree_is_down(bucket_id, &down_cache);
+    if (subtree_down) {
+      dout(20) << "subtree is down!"
<< dendl; + really_down = true; + really_down_buckets->insert(bucket_id); + really_down_mons->insert(dbi.second.begin(), dbi.second.end()); + } + } + dout(10) << "We determined CRUSH buckets " << *really_down_buckets + << " and mons " << *really_down_mons << " are really down" << dendl; + return really_down; +} + +void OSDMonitor::trigger_degraded_stretch_mode(const set<int>& dead_buckets, + const set<string>& live_zones) +{ + dout(20) << __func__ << dendl; + stretch_recovery_triggered.set_from_double(0); // reset this; we can't go clean now! + // update the general OSDMap changes + pending_inc.change_stretch_mode = true; + pending_inc.stretch_mode_enabled = osdmap.stretch_mode_enabled; + pending_inc.new_stretch_bucket_count = osdmap.stretch_bucket_count; + int new_site_count = osdmap.stretch_bucket_count - dead_buckets.size(); + ceph_assert(new_site_count == 1); // stretch count 2! + pending_inc.new_degraded_stretch_mode = new_site_count; + pending_inc.new_recovering_stretch_mode = 0; + pending_inc.new_stretch_mode_bucket = osdmap.stretch_mode_bucket; + + // and then apply them to all the pg_pool_ts + ceph_assert(live_zones.size() == 1); // only support 2 zones now + const string& remaining_site_name = *(live_zones.begin()); + ceph_assert(osdmap.crush->name_exists(remaining_site_name)); + int remaining_site = osdmap.crush->get_item_id(remaining_site_name); + for (auto pgi : osdmap.pools) { + if (pgi.second.peering_crush_bucket_count) { + pg_pool_t& newp = *pending_inc.get_new_pool(pgi.first, &pgi.second); + newp.peering_crush_bucket_count = new_site_count; + newp.peering_crush_mandatory_member = remaining_site; + newp.min_size = pgi.second.min_size / 2; // only support 2 zones now + newp.set_last_force_op_resend(pending_inc.epoch); + } + } + propose_pending(); +} + +void OSDMonitor::trigger_recovery_stretch_mode() +{ + dout(20) << __func__ << dendl; + stretch_recovery_triggered.set_from_double(0); // reset this so we don't go full-active prematurely + pending_inc.change_stretch_mode = true; + pending_inc.stretch_mode_enabled = osdmap.stretch_mode_enabled; + pending_inc.new_stretch_bucket_count = osdmap.stretch_bucket_count; + pending_inc.new_degraded_stretch_mode = osdmap.degraded_stretch_mode; + pending_inc.new_recovering_stretch_mode = 1; + pending_inc.new_stretch_mode_bucket = osdmap.stretch_mode_bucket; + + for (auto pgi : osdmap.pools) { + if (pgi.second.peering_crush_bucket_count) { + pg_pool_t& newp = *pending_inc.get_new_pool(pgi.first, &pgi.second); + newp.set_last_force_op_resend(pending_inc.epoch); + } + } + propose_pending(); +} + +void OSDMonitor::set_degraded_stretch_mode() +{ + stretch_recovery_triggered.set_from_double(0); +} + +void OSDMonitor::set_recovery_stretch_mode() +{ + if (stretch_recovery_triggered.is_zero()) { + stretch_recovery_triggered = ceph_clock_now(); + } +} + +void OSDMonitor::set_healthy_stretch_mode() +{ + stretch_recovery_triggered.set_from_double(0); +} + +void OSDMonitor::notify_new_pg_digest() +{ + dout(20) << __func__ << dendl; + if (!stretch_recovery_triggered.is_zero()) { + try_end_recovery_stretch_mode(false); + } +} + +struct CMonExitRecovery : public Context { + OSDMonitor *m; + bool force; + CMonExitRecovery(OSDMonitor *mon, bool f) : m(mon), force(f) {} + void finish(int r) { + m->try_end_recovery_stretch_mode(force); + } +}; + +void OSDMonitor::try_end_recovery_stretch_mode(bool force) +{ + dout(20) << __func__ << dendl; + if (!mon.is_leader()) return; + if (!mon.is_degraded_stretch_mode()) return; + if (!mon.is_recovering_stretch_mode()) 
+    return;
+  if (!is_readable()) {
+    wait_for_readable_ctx(new CMonExitRecovery(this, force));
+    return;
+  }
+
+  if (osdmap.recovering_stretch_mode &&
+      ((!stretch_recovery_triggered.is_zero() &&
+	ceph_clock_now() - g_conf().get_val<double>("mon_stretch_recovery_min_wait") >
+	stretch_recovery_triggered) ||
+       force)) {
+    if (!mon.mgrstatmon()->is_readable()) {
+      mon.mgrstatmon()->wait_for_readable_ctx(new CMonExitRecovery(this, force));
+      return;
+    }
+    const PGMapDigest& pgd = mon.mgrstatmon()->get_digest();
+    double misplaced, degraded, inactive, unknown;
+    pgd.get_recovery_stats(&misplaced, &degraded, &inactive, &unknown);
+    if (force || (degraded == 0.0 && inactive == 0.0 && unknown == 0.0)) {
+      // we can exit degraded stretch mode!
+      mon.trigger_healthy_stretch_mode();
+    }
+  }
+}
+
+void OSDMonitor::trigger_healthy_stretch_mode()
+{
+  ceph_assert(is_writeable());
+  stretch_recovery_triggered.set_from_double(0);
+  pending_inc.change_stretch_mode = true;
+  pending_inc.stretch_mode_enabled = osdmap.stretch_mode_enabled;
+  pending_inc.new_stretch_bucket_count = osdmap.stretch_bucket_count;
+  pending_inc.new_degraded_stretch_mode = 0; // turn off degraded mode...
+  pending_inc.new_recovering_stretch_mode = 0; //...and recovering mode!
+  pending_inc.new_stretch_mode_bucket = osdmap.stretch_mode_bucket;
+  for (auto pgi : osdmap.pools) {
+    if (pgi.second.peering_crush_bucket_count) {
+      pg_pool_t& newp = *pending_inc.get_new_pool(pgi.first, &pgi.second);
+      newp.peering_crush_bucket_count = osdmap.stretch_bucket_count;
+      newp.peering_crush_mandatory_member = CRUSH_ITEM_NONE;
+      newp.min_size = g_conf().get_val<uint64_t>("mon_stretch_pool_min_size");
+      newp.set_last_force_op_resend(pending_inc.epoch);
+    }
+  }
+  propose_pending();
+}
diff --git a/src/mon/OSDMonitor.h b/src/mon/OSDMonitor.h
new file mode 100644
index 000000000..e7701a639
--- /dev/null
+++ b/src/mon/OSDMonitor.h
@@ -0,0 +1,874 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
+ *
+ * Author: Loic Dachary <loic@dachary.org>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ * + */ + +/* Object Store Device (OSD) Monitor + */ + +#ifndef CEPH_OSDMONITOR_H +#define CEPH_OSDMONITOR_H + +#include <map> +#include <set> +#include <utility> + +#include "include/types.h" +#include "include/encoding.h" +#include "common/simple_cache.hpp" +#include "common/PriorityCache.h" +#include "msg/Messenger.h" + +#include "osd/OSDMap.h" +#include "osd/OSDMapMapping.h" + +#include "CreatingPGs.h" +#include "PaxosService.h" + +#include "erasure-code/ErasureCodeInterface.h" +#include "mon/MonOpRequest.h" +#include <boost/functional/hash.hpp> + +class Monitor; +class PGMap; +struct MonSession; +class MOSDMap; + + +/// information about a particular peer's failure reports for one osd +struct failure_reporter_t { + utime_t failed_since; ///< when they think it failed + MonOpRequestRef op; ///< failure op request + + failure_reporter_t() {} + failure_reporter_t(utime_t s, MonOpRequestRef op) + : failed_since(s), op(op) {} + ~failure_reporter_t() { } +}; + +/// information about all failure reports for one osd +struct failure_info_t { + std::map<int, failure_reporter_t> reporters; ///< reporter -> failed_since etc + utime_t max_failed_since; ///< most recent failed_since + + failure_info_t() {} + + utime_t get_failed_since() { + if (max_failed_since == utime_t() && !reporters.empty()) { + // the old max must have canceled; recalculate. + for (auto p = reporters.begin(); p != reporters.end(); ++p) + if (p->second.failed_since > max_failed_since) + max_failed_since = p->second.failed_since; + } + return max_failed_since; + } + + // set the message for the latest report. + void add_report(int who, utime_t failed_since, MonOpRequestRef op) { + [[maybe_unused]] auto [it, new_reporter] = + reporters.insert_or_assign(who, failure_reporter_t{failed_since, op}); + if (new_reporter) { + if (max_failed_since != utime_t() && max_failed_since < failed_since) { + max_failed_since = failed_since; + } + } + } + + void take_report_messages(std::list<MonOpRequestRef>& ls) { + for (auto p = reporters.begin(); p != reporters.end(); ++p) { + if (p->second.op) { + ls.push_back(p->second.op); + p->second.op.reset(); + } + } + } + + void cancel_report(int who) { + reporters.erase(who); + max_failed_since = utime_t(); + } +}; + + +class LastEpochClean { + struct Lec { + std::vector<epoch_t> epoch_by_pg; + ps_t next_missing = 0; + epoch_t floor = std::numeric_limits<epoch_t>::max(); + void report(unsigned pg_num, ps_t pg, epoch_t last_epoch_clean); + }; + std::map<uint64_t, Lec> report_by_pool; +public: + void report(unsigned pg_num, const pg_t& pg, epoch_t last_epoch_clean); + void remove_pool(uint64_t pool); + epoch_t get_lower_bound(const OSDMap& latest) const; + + void dump(Formatter *f) const; +}; + + +struct osdmap_manifest_t { + // all the maps we have pinned -- i.e., won't be removed unless + // they are inside a trim interval. 
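+  // Example of the lookup semantics (hypothetical epochs): with
+  // pinned = {10, 20, 30},
+  //   get_lower_closest_pinned(25) returns 20 (nearest pin at or below v),
+  //   get_lower_closest_pinned(10) returns 10, and
+  //   get_lower_closest_pinned(5) returns 0 (nothing pinned that low).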
+  std::set<version_t> pinned;
+
+  osdmap_manifest_t() {}
+
+  version_t get_last_pinned() const
+  {
+    auto it = pinned.crbegin();
+    if (it == pinned.crend()) {
+      return 0;
+    }
+    return *it;
+  }
+
+  version_t get_first_pinned() const
+  {
+    auto it = pinned.cbegin();
+    if (it == pinned.cend()) {
+      return 0;
+    }
+    return *it;
+  }
+
+  bool is_pinned(version_t v) const
+  {
+    return pinned.find(v) != pinned.end();
+  }
+
+  void pin(version_t v)
+  {
+    pinned.insert(v);
+  }
+
+  version_t get_lower_closest_pinned(version_t v) const {
+    auto p = pinned.lower_bound(v);
+    if (p == pinned.cend()) {
+      return 0;
+    } else if (*p > v) {
+      if (p == pinned.cbegin()) {
+        return 0;
+      }
+      --p;
+    }
+    return *p;
+  }
+
+  void encode(ceph::buffer::list& bl) const
+  {
+    ENCODE_START(1, 1, bl);
+    encode(pinned, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(ceph::buffer::list::const_iterator& bl)
+  {
+    DECODE_START(1, bl);
+    decode(pinned, bl);
+    DECODE_FINISH(bl);
+  }
+
+  void decode(ceph::buffer::list& bl) {
+    auto p = bl.cbegin();
+    decode(p);
+  }
+
+  void dump(ceph::Formatter *f) {
+    f->dump_unsigned("first_pinned", get_first_pinned());
+    f->dump_unsigned("last_pinned", get_last_pinned());
+    f->open_array_section("pinned_maps");
+    for (auto& i : pinned) {
+      f->dump_unsigned("epoch", i);
+    }
+    f->close_section();
+  }
+};
+WRITE_CLASS_ENCODER(osdmap_manifest_t);
+
+class OSDMonitor : public PaxosService,
+                   public md_config_obs_t {
+  CephContext *cct;
+
+public:
+  OSDMap osdmap;
+
+  // config observer
+  const char** get_tracked_conf_keys() const override;
+  void handle_conf_change(const ConfigProxy& conf,
+                          const std::set<std::string> &changed) override;
+  // [leader]
+  OSDMap::Incremental pending_inc;
+  std::map<int, ceph::buffer::list> pending_metadata;
+  std::set<int> pending_metadata_rm;
+  std::map<int, failure_info_t> failure_info;
+  std::map<int,utime_t> down_pending_out;  // osd down -> out
+  bool priority_convert = false;
+  std::map<int64_t,std::set<snapid_t>> pending_pseudo_purged_snaps;
+  std::shared_ptr<PriorityCache::PriCache> rocksdb_binned_kv_cache = nullptr;
+  std::shared_ptr<PriorityCache::Manager> pcm = nullptr;
+  ceph::mutex balancer_lock = ceph::make_mutex("OSDMonitor::balancer_lock");
+
+  std::map<int,double> osd_weight;
+
+  using osdmap_key_t = std::pair<version_t, uint64_t>;
+  using osdmap_cache_t = SimpleLRU<osdmap_key_t,
+                                   ceph::buffer::list,
+                                   std::less<osdmap_key_t>,
+                                   boost::hash<osdmap_key_t>>;
+  osdmap_cache_t inc_osd_cache;
+  osdmap_cache_t full_osd_cache;
+
+  bool has_osdmap_manifest;
+  osdmap_manifest_t osdmap_manifest;
+
+  bool check_failures(utime_t now);
+  bool check_failure(utime_t now, int target_osd, failure_info_t& fi);
+  utime_t get_grace_time(utime_t now, int target_osd, failure_info_t& fi) const;
+  bool is_failure_stale(utime_t now, failure_info_t& fi) const;
+  void force_failure(int target_osd, int by);
+
+  bool _have_pending_crush();
+  CrushWrapper &_get_stable_crush();
+  void _get_pending_crush(CrushWrapper& newcrush);
+
+  enum FastReadType {
+    FAST_READ_OFF,
+    FAST_READ_ON,
+    FAST_READ_DEFAULT
+  };
+
+  struct CleanUpmapJob : public ParallelPGMapper::Job {
+    CephContext *cct;
+    const OSDMap& osdmap;
+    OSDMap::Incremental& pending_inc;
+    // lock to protect pending_inc from changing
+    // while checking is in progress
+    ceph::mutex pending_inc_lock =
+      ceph::make_mutex("CleanUpmapJob::pending_inc_lock");
+
+    CleanUpmapJob(CephContext *cct, const OSDMap& om, OSDMap::Incremental& pi)
+      : ParallelPGMapper::Job(&om),
+        cct(cct),
+        osdmap(om),
+        pending_inc(pi) {}
+
+    void process(const
std::vector<pg_t>& to_check) override { + std::vector<pg_t> to_cancel; + std::map<pg_t, mempool::osdmap::vector<std::pair<int,int>>> to_remap; + osdmap.check_pg_upmaps(cct, to_check, &to_cancel, &to_remap); + // don't bother taking lock if nothing changes + if (!to_cancel.empty() || !to_remap.empty()) { + std::lock_guard l(pending_inc_lock); + osdmap.clean_pg_upmaps(cct, &pending_inc, to_cancel, to_remap); + } + } + + void process(int64_t poolid, unsigned ps_begin, unsigned ps_end) override {} + void complete() override {} + }; // public as this will need to be accessible from TestTestOSDMap.cc + + // svc +public: + void create_initial() override; + void get_store_prefixes(std::set<std::string>& s) const override; + +private: + void update_from_paxos(bool *need_bootstrap) override; + void create_pending() override; // prepare a new pending + void encode_pending(MonitorDBStore::TransactionRef t) override; + void on_active() override; + void on_restart() override; + void on_shutdown() override; + + /* osdmap full map prune */ + void load_osdmap_manifest(); + bool should_prune() const; + void _prune_update_trimmed( + MonitorDBStore::TransactionRef tx, + version_t first); + void prune_init(osdmap_manifest_t& manifest); + bool _prune_sanitize_options() const; + bool is_prune_enabled() const; + bool is_prune_supported() const; + bool do_prune(MonitorDBStore::TransactionRef tx); + + // Priority cache control + uint32_t mon_osd_cache_size = 0; ///< Number of cached OSDMaps + uint64_t rocksdb_cache_size = 0; ///< Cache for kv Db + double cache_kv_ratio = 0; ///< Cache ratio dedicated to kv + double cache_inc_ratio = 0; ///< Cache ratio dedicated to inc + double cache_full_ratio = 0; ///< Cache ratio dedicated to full + uint64_t mon_memory_base = 0; ///< Mon base memory for cache autotuning + double mon_memory_fragmentation = 0; ///< Expected memory fragmentation + uint64_t mon_memory_target = 0; ///< Mon target memory for cache autotuning + uint64_t mon_memory_min = 0; ///< Min memory to cache osdmaps + bool mon_memory_autotune = false; ///< Cache auto tune setting + int register_cache_with_pcm(); + int _set_cache_sizes(); + int _set_cache_ratios(); + void _set_new_cache_sizes(); + void _set_cache_autotuning(); + int _update_mon_cache_settings(); + + friend struct OSDMemCache; + friend struct IncCache; + friend struct FullCache; + + /** + * we haven't delegated full version stashing to paxosservice for some time + * now, making this function useless in current context. + */ + void encode_full(MonitorDBStore::TransactionRef t) override { } + /** + * do not let paxosservice periodically stash full osdmaps, or we will break our + * locally-managed full maps. (update_from_paxos loads the latest and writes them + * out going forward from there, but if we just synced that may mean we skip some.) + */ + bool should_stash_full() override { + return false; + } + + /** + * hook into trim to include the oldest full map in the trim transaction + * + * This ensures that anyone post-sync will have enough to rebuild their + * full osdmaps. + */ + void encode_trim_extra(MonitorDBStore::TransactionRef tx, version_t first) override; + + void update_msgr_features(); + /** + * check if the cluster supports the features required by the + * given crush map. Outputs the daemons which don't support it + * to the stringstream. 
+   *
+   * @returns true if the map is passable, false otherwise
+   */
+  bool validate_crush_against_features(const CrushWrapper *newcrush,
+                                       std::stringstream &ss);
+  void check_osdmap_subs();
+  void share_map_with_random_osd();
+
+  ceph::mutex prime_pg_temp_lock =
+    ceph::make_mutex("OSDMonitor::prime_pg_temp_lock");
+  struct PrimeTempJob : public ParallelPGMapper::Job {
+    OSDMonitor *osdmon;
+    PrimeTempJob(const OSDMap& om, OSDMonitor *m)
+      : ParallelPGMapper::Job(&om), osdmon(m) {}
+    void process(int64_t pool, unsigned ps_begin, unsigned ps_end) override {
+      for (unsigned ps = ps_begin; ps < ps_end; ++ps) {
+        pg_t pgid(ps, pool);
+        osdmon->prime_pg_temp(*osdmap, pgid);
+      }
+    }
+    void process(const std::vector<pg_t>& pgs) override {}
+    void complete() override {}
+  };
+  void maybe_prime_pg_temp();
+  void prime_pg_temp(const OSDMap& next, pg_t pgid);
+
+  ParallelPGMapper mapper;        ///< for background pg work
+  OSDMapMapping mapping;          ///< pg <-> osd mappings
+  std::unique_ptr<ParallelPGMapper::Job> mapping_job;  ///< background mapping job
+  void start_mapping();
+
+  void update_logger();
+
+  void handle_query(PaxosServiceMessage *m);
+  bool preprocess_query(MonOpRequestRef op) override;  // true if processed.
+  bool prepare_update(MonOpRequestRef op) override;
+  bool should_propose(double &delay) override;
+
+  version_t get_trim_to() const override;
+
+  bool can_mark_down(int o);
+  bool can_mark_up(int o);
+  bool can_mark_out(int o);
+  bool can_mark_in(int o);
+
+  // ...
+  MOSDMap *build_latest_full(uint64_t features);
+  MOSDMap *build_incremental(epoch_t first, epoch_t last, uint64_t features);
+  void send_full(MonOpRequestRef op);
+  void send_incremental(MonOpRequestRef op, epoch_t first);
+public:
+  /**
+   * Make sure the existing (up) OSDs support the given features
+   * @return 0 on success, or an error code if any OSDs are missing features.
+   * @param ss Filled in with an explanation of failure, if any
+   */
+  int check_cluster_features(uint64_t features, std::stringstream &ss);
+  // @param req an optional op request, if the osdmaps are replies to it, so
+  // @c Monitor::send_reply() can mark_event with it.
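+  // (sketch, values hypothetical) a caller that has just handled an op
+  // requiring maps newer than the session has seen might do:
+  //   send_incremental(osd_epoch + 1, session, false /* onetime */, op);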
+ void send_incremental(epoch_t first, MonSession *session, bool onetime, + MonOpRequestRef req = MonOpRequestRef()); + +private: + void print_utilization(std::ostream &out, ceph::Formatter *f, bool tree) const; + + bool check_source(MonOpRequestRef op, uuid_d fsid); + + bool preprocess_get_osdmap(MonOpRequestRef op); + + bool preprocess_mark_me_down(MonOpRequestRef op); + + friend class C_AckMarkedDown; + bool preprocess_failure(MonOpRequestRef op); + bool prepare_failure(MonOpRequestRef op); + bool prepare_mark_me_down(MonOpRequestRef op); + void process_failures(); + void take_all_failures(std::list<MonOpRequestRef>& ls); + + bool preprocess_mark_me_dead(MonOpRequestRef op); + bool prepare_mark_me_dead(MonOpRequestRef op); + + bool preprocess_full(MonOpRequestRef op); + bool prepare_full(MonOpRequestRef op); + + bool preprocess_boot(MonOpRequestRef op); + bool prepare_boot(MonOpRequestRef op); + void _booted(MonOpRequestRef op, bool logit); + + void update_up_thru(int from, epoch_t up_thru); + bool preprocess_alive(MonOpRequestRef op); + bool prepare_alive(MonOpRequestRef op); + void _reply_map(MonOpRequestRef op, epoch_t e); + + bool preprocess_pgtemp(MonOpRequestRef op); + bool prepare_pgtemp(MonOpRequestRef op); + + bool preprocess_pg_created(MonOpRequestRef op); + bool prepare_pg_created(MonOpRequestRef op); + + bool preprocess_pg_ready_to_merge(MonOpRequestRef op); + bool prepare_pg_ready_to_merge(MonOpRequestRef op); + + int _check_remove_pool(int64_t pool_id, const pg_pool_t &pool, std::ostream *ss); + bool _check_become_tier( + int64_t tier_pool_id, const pg_pool_t *tier_pool, + int64_t base_pool_id, const pg_pool_t *base_pool, + int *err, std::ostream *ss) const; + bool _check_remove_tier( + int64_t base_pool_id, const pg_pool_t *base_pool, const pg_pool_t *tier_pool, + int *err, std::ostream *ss) const; + + int _prepare_remove_pool(int64_t pool, std::ostream *ss, bool no_fake); + int _prepare_rename_pool(int64_t pool, std::string newname); + + bool enforce_pool_op_caps(MonOpRequestRef op); + bool preprocess_pool_op (MonOpRequestRef op); + bool preprocess_pool_op_create (MonOpRequestRef op); + bool prepare_pool_op (MonOpRequestRef op); + bool prepare_pool_op_create (MonOpRequestRef op); + bool prepare_pool_op_delete(MonOpRequestRef op); + int crush_rename_bucket(const std::string& srcname, + const std::string& dstname, + std::ostream *ss); + void check_legacy_ec_plugin(const std::string& plugin, + const std::string& profile) const; + int normalize_profile(const std::string& profilename, + ceph::ErasureCodeProfile &profile, + bool force, + std::ostream *ss); + int crush_rule_create_erasure(const std::string &name, + const std::string &profile, + int *rule, + std::ostream *ss); + int get_crush_rule(const std::string &rule_name, + int *crush_rule, + std::ostream *ss); + int get_erasure_code(const std::string &erasure_code_profile, + ceph::ErasureCodeInterfaceRef *erasure_code, + std::ostream *ss) const; + int prepare_pool_crush_rule(const unsigned pool_type, + const std::string &erasure_code_profile, + const std::string &rule_name, + int *crush_rule, + std::ostream *ss); + bool erasure_code_profile_in_use( + const mempool::osdmap::map<int64_t, pg_pool_t> &pools, + const std::string &profile, + std::ostream *ss); + int parse_erasure_code_profile(const std::vector<std::string> &erasure_code_profile, + std::map<std::string,std::string> *erasure_code_profile_map, + std::ostream *ss); + int prepare_pool_size(const unsigned pool_type, + const std::string &erasure_code_profile, + uint8_t 
repl_size, + unsigned *size, unsigned *min_size, + std::ostream *ss); + int prepare_pool_stripe_width(const unsigned pool_type, + const std::string &erasure_code_profile, + unsigned *stripe_width, + std::ostream *ss); + int check_pg_num(int64_t pool, int pg_num, int size, std::ostream* ss); + int prepare_new_pool(std::string& name, + int crush_rule, + const std::string &crush_rule_name, + unsigned pg_num, unsigned pgp_num, + unsigned pg_num_min, + unsigned pg_num_max, + uint64_t repl_size, + const uint64_t target_size_bytes, + const float target_size_ratio, + const std::string &erasure_code_profile, + const unsigned pool_type, + const uint64_t expected_num_objects, + FastReadType fast_read, + const std::string& pg_autoscale_mode, + bool bulk, + std::ostream *ss); + int prepare_new_pool(MonOpRequestRef op); + + void set_pool_flags(int64_t pool_id, uint64_t flags); + void clear_pool_flags(int64_t pool_id, uint64_t flags); + bool update_pools_status(); + + bool _is_removed_snap(int64_t pool_id, snapid_t snapid); + bool _is_pending_removed_snap(int64_t pool_id, snapid_t snapid); + + std::string make_purged_snap_epoch_key(epoch_t epoch); + std::string make_purged_snap_key(int64_t pool, snapid_t snap); + std::string make_purged_snap_key_value(int64_t pool, snapid_t snap, snapid_t num, + epoch_t epoch, ceph::buffer::list *v); + + bool try_prune_purged_snaps(); + int lookup_purged_snap(int64_t pool, snapid_t snap, + snapid_t *begin, snapid_t *end); + + void insert_purged_snap_update( + int64_t pool, + snapid_t start, snapid_t end, + epoch_t epoch, + MonitorDBStore::TransactionRef t); + + bool prepare_set_flag(MonOpRequestRef op, int flag); + bool prepare_unset_flag(MonOpRequestRef op, int flag); + + void _pool_op_reply(MonOpRequestRef op, + int ret, epoch_t epoch, ceph::buffer::list *blp=NULL); + + struct C_Booted : public C_MonOp { + OSDMonitor *cmon; + bool logit; + C_Booted(OSDMonitor *cm, MonOpRequestRef op_, bool l=true) : + C_MonOp(op_), cmon(cm), logit(l) {} + void _finish(int r) override { + if (r >= 0) + cmon->_booted(op, logit); + else if (r == -ECANCELED) + return; + else if (r == -EAGAIN) + cmon->dispatch(op); + else + ceph_abort_msg("bad C_Booted return value"); + } + }; + + struct C_ReplyMap : public C_MonOp { + OSDMonitor *osdmon; + epoch_t e; + C_ReplyMap(OSDMonitor *o, MonOpRequestRef op_, epoch_t ee) + : C_MonOp(op_), osdmon(o), e(ee) {} + void _finish(int r) override { + if (r >= 0) + osdmon->_reply_map(op, e); + else if (r == -ECANCELED) + return; + else if (r == -EAGAIN) + osdmon->dispatch(op); + else + ceph_abort_msg("bad C_ReplyMap return value"); + } + }; + struct C_PoolOp : public C_MonOp { + OSDMonitor *osdmon; + int replyCode; + int epoch; + ceph::buffer::list reply_data; + C_PoolOp(OSDMonitor * osd, MonOpRequestRef op_, int rc, int e, ceph::buffer::list *rd=NULL) : + C_MonOp(op_), osdmon(osd), replyCode(rc), epoch(e) { + if (rd) + reply_data = *rd; + } + void _finish(int r) override { + if (r >= 0) + osdmon->_pool_op_reply(op, replyCode, epoch, &reply_data); + else if (r == -ECANCELED) + return; + else if (r == -EAGAIN) + osdmon->dispatch(op); + else + ceph_abort_msg("bad C_PoolOp return value"); + } + }; + + bool preprocess_remove_snaps(MonOpRequestRef op); + bool prepare_remove_snaps(MonOpRequestRef op); + + bool preprocess_get_purged_snaps(MonOpRequestRef op); + + int load_metadata(int osd, std::map<std::string, std::string>& m, + std::ostream *err); + void count_metadata(const std::string& field, ceph::Formatter *f); + + void 
reencode_incremental_map(ceph::buffer::list& bl, uint64_t features); + void reencode_full_map(ceph::buffer::list& bl, uint64_t features); +public: + void count_metadata(const std::string& field, std::map<std::string,int> *out); + void get_versions(std::map<std::string, std::list<std::string>> &versions); +protected: + int get_osd_objectstore_type(int osd, std::string *type); + bool is_pool_currently_all_bluestore(int64_t pool_id, const pg_pool_t &pool, + std::ostream *err); + + // when we last received PG stats from each osd and the osd's osd_beacon_report_interval + std::map<int, std::pair<utime_t, int>> last_osd_report; + // TODO: use last_osd_report to store the osd report epochs, once we don't + // need to upgrade from pre-luminous releases. + std::map<int,epoch_t> osd_epochs; + LastEpochClean last_epoch_clean; + bool preprocess_beacon(MonOpRequestRef op); + bool prepare_beacon(MonOpRequestRef op); + epoch_t get_min_last_epoch_clean() const; + + friend class C_UpdateCreatingPGs; + std::map<int, std::map<epoch_t, std::set<spg_t>>> creating_pgs_by_osd_epoch; + std::vector<pg_t> pending_created_pgs; + // the epoch when the pg mapping was calculated + epoch_t creating_pgs_epoch = 0; + creating_pgs_t creating_pgs; + mutable std::mutex creating_pgs_lock; + + creating_pgs_t update_pending_pgs(const OSDMap::Incremental& inc, + const OSDMap& nextmap); + unsigned scan_for_creating_pgs( + const mempool::osdmap::map<int64_t,pg_pool_t>& pools, + const mempool::osdmap::set<int64_t>& removed_pools, + utime_t modified, + creating_pgs_t* creating_pgs) const; + std::pair<int32_t, pg_t> get_parent_pg(pg_t pgid) const; + void update_creating_pgs(); + void check_pg_creates_subs(); + epoch_t send_pg_creates(int osd, Connection *con, epoch_t next) const; + + int32_t _allocate_osd_id(int32_t* existing_id); + + int get_grace_interval_threshold(); + bool grace_interval_threshold_exceeded(int last_failed); + void set_default_laggy_params(int target_osd); + +public: + OSDMonitor(CephContext *cct, Monitor &mn, Paxos &p, const std::string& service_name); + + void tick() override; // check state, take actions + + bool preprocess_command(MonOpRequestRef op); + bool prepare_command(MonOpRequestRef op); + bool prepare_command_impl(MonOpRequestRef op, const cmdmap_t& cmdmap); + + int validate_osd_create( + const int32_t id, + const uuid_d& uuid, + const bool check_osd_exists, + int32_t* existing_id, + std::stringstream& ss); + int prepare_command_osd_create( + const int32_t id, + const uuid_d& uuid, + int32_t* existing_id, + std::stringstream& ss); + void do_osd_create(const int32_t id, const uuid_d& uuid, + const std::string& device_class, + int32_t* new_id); + int prepare_command_osd_purge(int32_t id, std::stringstream& ss); + int prepare_command_osd_destroy(int32_t id, std::stringstream& ss); + int _prepare_command_osd_crush_remove( + CrushWrapper &newcrush, + int32_t id, + int32_t ancestor, + bool has_ancestor, + bool unlink_only); + void do_osd_crush_remove(CrushWrapper& newcrush); + int prepare_command_osd_crush_remove( + CrushWrapper &newcrush, + int32_t id, + int32_t ancestor, + bool has_ancestor, + bool unlink_only); + int prepare_command_osd_remove(int32_t id); + int prepare_command_osd_new( + MonOpRequestRef op, + const cmdmap_t& cmdmap, + const std::map<std::string,std::string>& secrets, + std::stringstream &ss, + ceph::Formatter *f); + + int prepare_command_pool_set(const cmdmap_t& cmdmap, + std::stringstream& ss); + + int prepare_command_pool_application(const std::string &prefix, + const cmdmap_t& cmdmap, 
+ std::stringstream& ss); + int preprocess_command_pool_application(const std::string &prefix, + const cmdmap_t& cmdmap, + std::stringstream& ss, + bool *modified); + int _command_pool_application(const std::string &prefix, + const cmdmap_t& cmdmap, + std::stringstream& ss, + bool *modified, + bool preparing); + + bool handle_osd_timeouts(const utime_t &now, + std::map<int, std::pair<utime_t, int>> &last_osd_report); + + void send_latest(MonOpRequestRef op, epoch_t start=0); + void send_latest_now_nodelete(MonOpRequestRef op, epoch_t start=0) { + op->mark_osdmon_event(__func__); + send_incremental(op, start); + } + + int get_version(version_t ver, ceph::buffer::list& bl) override; + int get_version(version_t ver, uint64_t feature, ceph::buffer::list& bl); + + int get_version_full(version_t ver, uint64_t feature, ceph::buffer::list& bl); + int get_version_full(version_t ver, ceph::buffer::list& bl) override; + int get_inc(version_t ver, OSDMap::Incremental& inc); + int get_full_from_pinned_map(version_t ver, ceph::buffer::list& bl); + + epoch_t blocklist(const entity_addrvec_t& av, utime_t until); + epoch_t blocklist(entity_addr_t a, utime_t until); + + void dump_info(ceph::Formatter *f); + int dump_osd_metadata(int osd, ceph::Formatter *f, std::ostream *err); + void print_nodes(ceph::Formatter *f); + + void check_osdmap_sub(Subscription *sub); + void check_pg_creates_sub(Subscription *sub); + + void do_application_enable(int64_t pool_id, const std::string &app_name, + const std::string &app_key="", + const std::string &app_value="", + bool force=false); + void do_set_pool_opt(int64_t pool_id, pool_opts_t::key_t opt, + pool_opts_t::value_t); + + void add_flag(int flag) { + if (!(osdmap.flags & flag)) { + if (pending_inc.new_flags < 0) + pending_inc.new_flags = osdmap.flags; + pending_inc.new_flags |= flag; + } + } + + void remove_flag(int flag) { + if(osdmap.flags & flag) { + if (pending_inc.new_flags < 0) + pending_inc.new_flags = osdmap.flags; + pending_inc.new_flags &= ~flag; + } + } + void convert_pool_priorities(void); + /** + * Find the pools which are requested to be put into stretch mode, + * validate that they are allowed to be in stretch mode (eg, are replicated) + * and place copies of them in the pools set. + * This does not make any changes to the pools or state; it's just + * a safety-check-and-collect function. + */ + void try_enable_stretch_mode_pools(stringstream& ss, bool *okay, + int *errcode, + set<pg_pool_t*>* pools, const string& new_crush_rule); + /** + * Check validity of inputs and OSD/CRUSH state to + * engage stretch mode. Designed to be used with + * MonmapMonitor::try_enable_stretch_mode() where we call both twice, + * first with commit=false to validate. + * @param ss: a stringstream to write errors into + * @param okay: Filled to true if okay, false if validation fails + * @param errcode: filled with -errno if there's a problem + * @param commit: true if we should commit the change, false if just testing + * @param dividing_bucket: the bucket type (eg 'dc') that divides the cluster + * @param bucket_count: The number of buckets required in peering. + * Currently must be 2. + * @param pools: The pg_pool_ts which are being set to stretch mode (obtained + * from try_enable_stretch_mode_pools()). + * @param new_crush_rule: The crush rule to set the pools to. 
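+   *
+   * A sketch of the intended two-phase call sequence (names hypothetical):
+   *   bool okay; int errcode; std::set<pg_pool_t*> pools; stringstream ss;
+   *   try_enable_stretch_mode_pools(ss, &okay, &errcode, &pools, rule);
+   *   if (okay) try_enable_stretch_mode(ss, &okay, &errcode, false, "dc", 2, pools, rule);
+   *   if (okay) try_enable_stretch_mode(ss, &okay, &errcode, true, "dc", 2, pools, rule);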
+ */ + void try_enable_stretch_mode(stringstream& ss, bool *okay, + int *errcode, bool commit, + const string& dividing_bucket, + uint32_t bucket_count, + const set<pg_pool_t*>& pools, + const string& new_crush_rule); + /** + * Check the input dead_buckets mapping (buckets->dead monitors) to see + * if the OSDs are also down. If so, fill in really_down_buckets and + * really_down_mons and return true; else return false. + */ + bool check_for_dead_crush_zones(const map<string,set<string>>& dead_buckets, + set<int> *really_down_buckets, + set<string> *really_down_mons); + /** + * Set degraded mode in the OSDMap, adding the given dead buckets to the dead set + * and using the live_zones (should presently be size 1) + */ + void trigger_degraded_stretch_mode(const set<int>& dead_buckets, + const set<string>& live_zones); + /** + * This is just to maintain stretch_recovery_triggered; below + */ + void set_degraded_stretch_mode(); + /** + * Set recovery stretch mode in the OSDMap, resetting pool size back to normal + */ + void trigger_recovery_stretch_mode(); + /** + * This is just to maintain stretch_recovery_triggered; below + */ + void set_recovery_stretch_mode(); + /** + * This is just to maintain stretch_recovery_triggered; below + */ + void set_healthy_stretch_mode(); + /** + * Tells the OSD there's a new pg digest, in case it's interested. + * (It's interested when in recovering stretch mode.) + */ + void notify_new_pg_digest(); + /** + * Check if we can exit recovery stretch mode and go back to normal. + * @param force If true, we will force the exit through once it is legal, + * without regard to the reported PG status. + */ + void try_end_recovery_stretch_mode(bool force); + /** + * Sets the osdmap and pg_pool_t values back to healthy stretch mode status. + */ + void trigger_healthy_stretch_mode(); + /** + * Obtain the crush rule being used for stretch pools. + * Note that right now this is heuristic and simply selects the + * most-used rule on replicated stretch pools. 
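+   * (e.g., hypothetically: if three stretch pools use rule 1 and one uses
+   * rule 4, rule 1 is returned.)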
+ * @return the crush rule ID, or a negative errno + */ + int get_replicated_stretch_crush_rule(); +private: + utime_t stretch_recovery_triggered; // what time we committed a switch to recovery mode +}; + +#endif diff --git a/src/mon/PGMap.cc b/src/mon/PGMap.cc new file mode 100644 index 000000000..220317603 --- /dev/null +++ b/src/mon/PGMap.cc @@ -0,0 +1,4171 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <boost/algorithm/string.hpp> + +#include "PGMap.h" + +#define dout_subsys ceph_subsys_mon +#include "common/debug.h" +#include "common/Clock.h" +#include "common/Formatter.h" +#include "global/global_context.h" +#include "include/ceph_features.h" +#include "include/stringify.h" + +#include "osd/osd_types.h" +#include "osd/OSDMap.h" +#include <boost/range/adaptor/reversed.hpp> + +#define dout_context g_ceph_context + +using std::list; +using std::make_pair; +using std::map; +using std::pair; +using std::ostream; +using std::ostringstream; +using std::set; +using std::string; +using std::stringstream; +using std::vector; + +using ceph::bufferlist; +using ceph::fixed_u_to_string; + +using TOPNSPC::common::cmd_getval; + +MEMPOOL_DEFINE_OBJECT_FACTORY(PGMapDigest, pgmap_digest, pgmap); +MEMPOOL_DEFINE_OBJECT_FACTORY(PGMap, pgmap, pgmap); +MEMPOOL_DEFINE_OBJECT_FACTORY(PGMap::Incremental, pgmap_inc, pgmap); + + +// --------------------- +// PGMapDigest + +void PGMapDigest::encode(bufferlist& bl, uint64_t features) const +{ + // NOTE: see PGMap::encode_digest + uint8_t v = 4; + if (!HAVE_FEATURE(features, SERVER_MIMIC)) { + v = 1; + } else if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) { + v = 3; + } + ENCODE_START(v, 1, bl); + encode(num_pg, bl); + encode(num_pg_active, bl); + encode(num_pg_unknown, bl); + encode(num_osd, bl); + encode(pg_pool_sum, bl, features); + encode(pg_sum, bl, features); + encode(osd_sum, bl, features); + if (v >= 2) { + encode(num_pg_by_state, bl); + } else { + uint32_t n = num_pg_by_state.size(); + encode(n, bl); + for (auto p : num_pg_by_state) { + encode((int32_t)p.first, bl); + encode(p.second, bl); + } + } + encode(num_pg_by_osd, bl); + encode(num_pg_by_pool, bl); + encode(osd_last_seq, bl); + encode(per_pool_sum_delta, bl, features); + encode(per_pool_sum_deltas_stamps, bl); + encode(pg_sum_delta, bl, features); + encode(stamp_delta, bl); + encode(avail_space_by_rule, bl); + if (struct_v >= 3) { + encode(purged_snaps, bl); + } + if (struct_v >= 4) { + encode(osd_sum_by_class, bl, features); + } + ENCODE_FINISH(bl); +} + +void PGMapDigest::decode(bufferlist::const_iterator& p) +{ + DECODE_START(4, p); + decode(num_pg, p); + decode(num_pg_active, p); + decode(num_pg_unknown, p); + decode(num_osd, p); + decode(pg_pool_sum, p); + decode(pg_sum, p); + decode(osd_sum, p); + if (struct_v >= 2) { + decode(num_pg_by_state, p); + } else { + map<int32_t, int32_t> nps; + decode(nps, p); + num_pg_by_state.clear(); + for (auto i : nps) { + num_pg_by_state[i.first] = i.second; + } + } + decode(num_pg_by_osd, p); + decode(num_pg_by_pool, p); + decode(osd_last_seq, p); + decode(per_pool_sum_delta, p); + decode(per_pool_sum_deltas_stamps, p); + decode(pg_sum_delta, p); + decode(stamp_delta, p); + decode(avail_space_by_rule, p); + if (struct_v >= 3) { + decode(purged_snaps, p); + } + if (struct_v >= 4) { + decode(osd_sum_by_class, p); + } + DECODE_FINISH(p); +} + +void PGMapDigest::dump(ceph::Formatter *f) const +{ + f->dump_unsigned("num_pg", num_pg); + f->dump_unsigned("num_pg_active", num_pg_active); + 
f->dump_unsigned("num_pg_unknown", num_pg_unknown); + f->dump_unsigned("num_osd", num_osd); + f->dump_object("pool_sum", pg_sum); + f->dump_object("osd_sum", osd_sum); + + f->open_object_section("osd_sum_by_class"); + for (auto& i : osd_sum_by_class) { + f->dump_object(i.first.c_str(), i.second); + } + f->close_section(); + + f->open_array_section("pool_stats"); + for (auto& p : pg_pool_sum) { + f->open_object_section("pool_stat"); + f->dump_int("poolid", p.first); + auto q = num_pg_by_pool.find(p.first); + if (q != num_pg_by_pool.end()) + f->dump_unsigned("num_pg", q->second); + p.second.dump(f); + f->close_section(); + } + f->close_section(); + f->open_array_section("osd_stats"); + int i = 0; + // TODO: this isn't really correct since we can dump non-existent OSDs + // I dunno what osd_last_seq is set to in that case... + for (auto& p : osd_last_seq) { + f->open_object_section("osd_stat"); + f->dump_int("osd", i); + f->dump_unsigned("seq", p); + f->close_section(); + ++i; + } + f->close_section(); + f->open_array_section("num_pg_by_state"); + for (auto& p : num_pg_by_state) { + f->open_object_section("count"); + f->dump_string("state", pg_state_string(p.first)); + f->dump_unsigned("num", p.second); + f->close_section(); + } + f->close_section(); + f->open_array_section("num_pg_by_osd"); + for (auto& p : num_pg_by_osd) { + f->open_object_section("count"); + f->dump_unsigned("osd", p.first); + f->dump_unsigned("num_primary_pg", p.second.primary); + f->dump_unsigned("num_acting_pg", p.second.acting); + f->dump_unsigned("num_up_not_acting_pg", p.second.up_not_acting); + f->close_section(); + } + f->close_section(); + f->open_array_section("purged_snaps"); + for (auto& j : purged_snaps) { + f->open_object_section("pool"); + f->dump_int("pool", j.first); + f->open_object_section("purged_snaps"); + for (auto i = j.second.begin(); i != j.second.end(); ++i) { + f->open_object_section("interval"); + f->dump_stream("start") << i.get_start(); + f->dump_stream("length") << i.get_len(); + f->close_section(); + } + f->close_section(); + f->close_section(); + } + f->close_section(); +} + +void PGMapDigest::generate_test_instances(list<PGMapDigest*>& ls) +{ + ls.push_back(new PGMapDigest); +} + +inline std::string percentify(const float& a) { + std::stringstream ss; + if (a < 0.01) + ss << "0"; + else + ss << std::fixed << std::setprecision(2) << a; + return ss.str(); +} + +void PGMapDigest::print_summary(ceph::Formatter *f, ostream *out) const +{ + if (f) + f->open_array_section("pgs_by_state"); + + // list is descending numeric order (by count) + std::multimap<int,uint64_t> state_by_count; // count -> state + for (auto p = num_pg_by_state.begin(); + p != num_pg_by_state.end(); + ++p) { + state_by_count.insert(make_pair(p->second, p->first)); + } + if (f) { + for (auto p = state_by_count.rbegin(); + p != state_by_count.rend(); + ++p) + { + f->open_object_section("pgs_by_state_element"); + f->dump_string("state_name", pg_state_string(p->second)); + f->dump_unsigned("count", p->first); + f->close_section(); + } + } + if (f) + f->close_section(); + + if (f) { + f->dump_unsigned("num_pgs", num_pg); + f->dump_unsigned("num_pools", pg_pool_sum.size()); + f->dump_unsigned("num_objects", pg_sum.stats.sum.num_objects); + f->dump_unsigned("data_bytes", pg_sum.stats.sum.num_bytes); + f->dump_unsigned("bytes_used", osd_sum.statfs.get_used_raw()); + f->dump_unsigned("bytes_avail", osd_sum.statfs.available); + f->dump_unsigned("bytes_total", osd_sum.statfs.total); + } else { + *out << " pools: " << pg_pool_sum.size() 
<< " pools, " + << num_pg << " pgs\n"; + *out << " objects: " << si_u_t(pg_sum.stats.sum.num_objects) << " objects, " + << byte_u_t(pg_sum.stats.sum.num_bytes) << "\n"; + *out << " usage: " + << byte_u_t(osd_sum.statfs.get_used_raw()) << " used, " + << byte_u_t(osd_sum.statfs.available) << " / " + << byte_u_t(osd_sum.statfs.total) << " avail\n"; + *out << " pgs: "; + } + + bool pad = false; + + if (num_pg_unknown > 0) { + float p = (float)num_pg_unknown / (float)num_pg; + if (f) { + f->dump_float("unknown_pgs_ratio", p); + } else { + char b[20]; + snprintf(b, sizeof(b), "%.3lf", p * 100.0); + *out << b << "% pgs unknown\n"; + pad = true; + } + } + + int num_pg_inactive = num_pg - num_pg_active - num_pg_unknown; + if (num_pg_inactive > 0) { + float p = (float)num_pg_inactive / (float)num_pg; + if (f) { + f->dump_float("inactive_pgs_ratio", p); + } else { + if (pad) { + *out << " "; + } + char b[20]; + snprintf(b, sizeof(b), "%.3f", p * 100.0); + *out << b << "% pgs not active\n"; + pad = true; + } + } + + list<string> sl; + overall_recovery_summary(f, &sl); + if (!f && !sl.empty()) { + for (auto p = sl.begin(); p != sl.end(); ++p) { + if (pad) { + *out << " "; + } + *out << *p << "\n"; + pad = true; + } + } + sl.clear(); + + if (!f) { + unsigned max_width = 1; + for (auto p = state_by_count.rbegin(); p != state_by_count.rend(); ++p) + { + std::stringstream ss; + ss << p->first; + max_width = std::max<size_t>(ss.str().size(), max_width); + } + + for (auto p = state_by_count.rbegin(); p != state_by_count.rend(); ++p) + { + if (pad) { + *out << " "; + } + pad = true; + out->setf(std::ios::left); + *out << std::setw(max_width) << p->first + << " " << pg_state_string(p->second) << "\n"; + out->unsetf(std::ios::left); + } + } + + ostringstream ss_rec_io; + overall_recovery_rate_summary(f, &ss_rec_io); + ostringstream ss_client_io; + overall_client_io_rate_summary(f, &ss_client_io); + ostringstream ss_cache_io; + overall_cache_io_rate_summary(f, &ss_cache_io); + + if (!f && (ss_client_io.str().length() || ss_rec_io.str().length() + || ss_cache_io.str().length())) { + *out << "\n \n"; + *out << " io:\n"; + } + + if (!f && ss_client_io.str().length()) + *out << " client: " << ss_client_io.str() << "\n"; + if (!f && ss_rec_io.str().length()) + *out << " recovery: " << ss_rec_io.str() << "\n"; + if (!f && ss_cache_io.str().length()) + *out << " cache: " << ss_cache_io.str() << "\n"; +} + +void PGMapDigest::print_oneline_summary(ceph::Formatter *f, ostream *out) const +{ + std::stringstream ss; + + if (f) + f->open_array_section("num_pg_by_state"); + for (auto p = num_pg_by_state.begin(); + p != num_pg_by_state.end(); + ++p) { + if (f) { + f->open_object_section("state"); + f->dump_string("name", pg_state_string(p->first)); + f->dump_unsigned("num", p->second); + f->close_section(); + } + if (p != num_pg_by_state.begin()) + ss << ", "; + ss << p->second << " " << pg_state_string(p->first); + } + if (f) + f->close_section(); + + string states = ss.str(); + if (out) + *out << num_pg << " pgs: " + << states << "; " + << byte_u_t(pg_sum.stats.sum.num_bytes) << " data, " + << byte_u_t(osd_sum.statfs.get_used()) << " used, " + << byte_u_t(osd_sum.statfs.available) << " / " + << byte_u_t(osd_sum.statfs.total) << " avail"; + if (f) { + f->dump_unsigned("num_pgs", num_pg); + f->dump_unsigned("num_bytes", pg_sum.stats.sum.num_bytes); + f->dump_int("total_bytes", osd_sum.statfs.total); + f->dump_int("total_avail_bytes", osd_sum.statfs.available); + f->dump_int("total_used_bytes", osd_sum.statfs.get_used()); + 
f->dump_int("total_used_raw_bytes", osd_sum.statfs.get_used_raw()); + } + + // make non-negative; we can get negative values if osds send + // uncommitted stats and then "go backward" or if they are just + // buggy/wrong. + pool_stat_t pos_delta = pg_sum_delta; + pos_delta.floor(0); + if (pos_delta.stats.sum.num_rd || + pos_delta.stats.sum.num_wr) { + if (out) + *out << "; "; + if (pos_delta.stats.sum.num_rd) { + int64_t rd = (pos_delta.stats.sum.num_rd_kb << 10) / (double)stamp_delta; + if (out) + *out << byte_u_t(rd) << "/s rd, "; + if (f) + f->dump_unsigned("read_bytes_sec", rd); + } + if (pos_delta.stats.sum.num_wr) { + int64_t wr = (pos_delta.stats.sum.num_wr_kb << 10) / (double)stamp_delta; + if (out) + *out << byte_u_t(wr) << "/s wr, "; + if (f) + f->dump_unsigned("write_bytes_sec", wr); + } + int64_t iops = (pos_delta.stats.sum.num_rd + pos_delta.stats.sum.num_wr) / (double)stamp_delta; + if (out) + *out << si_u_t(iops) << " op/s"; + if (f) + f->dump_unsigned("io_sec", iops); + } + + list<string> sl; + overall_recovery_summary(f, &sl); + if (out) + for (auto p = sl.begin(); p != sl.end(); ++p) + *out << "; " << *p; + std::stringstream ssr; + overall_recovery_rate_summary(f, &ssr); + if (out && ssr.str().length()) + *out << "; " << ssr.str() << " recovering"; +} + +void PGMapDigest::get_recovery_stats( + double *misplaced_ratio, + double *degraded_ratio, + double *inactive_pgs_ratio, + double *unknown_pgs_ratio) const +{ + if (pg_sum.stats.sum.num_objects_degraded && + pg_sum.stats.sum.num_object_copies > 0) { + *degraded_ratio = (double)pg_sum.stats.sum.num_objects_degraded / + (double)pg_sum.stats.sum.num_object_copies; + } else { + *degraded_ratio = 0; + } + if (pg_sum.stats.sum.num_objects_misplaced && + pg_sum.stats.sum.num_object_copies > 0) { + *misplaced_ratio = (double)pg_sum.stats.sum.num_objects_misplaced / + (double)pg_sum.stats.sum.num_object_copies; + } else { + *misplaced_ratio = 0; + } + if (num_pg > 0) { + int num_pg_inactive = num_pg - num_pg_active - num_pg_unknown; + *inactive_pgs_ratio = (double)num_pg_inactive / (double)num_pg; + *unknown_pgs_ratio = (double)num_pg_unknown / (double)num_pg; + } else { + *inactive_pgs_ratio = 0; + *unknown_pgs_ratio = 0; + } +} + +void PGMapDigest::recovery_summary(ceph::Formatter *f, list<string> *psl, + const pool_stat_t& pool_sum) const +{ + if (pool_sum.stats.sum.num_objects_degraded && pool_sum.stats.sum.num_object_copies > 0) { + double pc = (double)pool_sum.stats.sum.num_objects_degraded / + (double)pool_sum.stats.sum.num_object_copies * (double)100.0; + char b[20]; + snprintf(b, sizeof(b), "%.3lf", pc); + if (f) { + f->dump_unsigned("degraded_objects", pool_sum.stats.sum.num_objects_degraded); + f->dump_unsigned("degraded_total", pool_sum.stats.sum.num_object_copies); + f->dump_float("degraded_ratio", pc / 100.0); + } else { + ostringstream ss; + ss << pool_sum.stats.sum.num_objects_degraded + << "/" << pool_sum.stats.sum.num_object_copies << " objects degraded (" << b << "%)"; + psl->push_back(ss.str()); + } + } + if (pool_sum.stats.sum.num_objects_misplaced && pool_sum.stats.sum.num_object_copies > 0) { + double pc = (double)pool_sum.stats.sum.num_objects_misplaced / + (double)pool_sum.stats.sum.num_object_copies * (double)100.0; + char b[20]; + snprintf(b, sizeof(b), "%.3lf", pc); + if (f) { + f->dump_unsigned("misplaced_objects", pool_sum.stats.sum.num_objects_misplaced); + f->dump_unsigned("misplaced_total", pool_sum.stats.sum.num_object_copies); + f->dump_float("misplaced_ratio", pc / 100.0); + } else { + 
ostringstream ss; + ss << pool_sum.stats.sum.num_objects_misplaced + << "/" << pool_sum.stats.sum.num_object_copies << " objects misplaced (" << b << "%)"; + psl->push_back(ss.str()); + } + } + if (pool_sum.stats.sum.num_objects_unfound && pool_sum.stats.sum.num_objects) { + double pc = (double)pool_sum.stats.sum.num_objects_unfound / + (double)pool_sum.stats.sum.num_objects * (double)100.0; + char b[20]; + snprintf(b, sizeof(b), "%.3lf", pc); + if (f) { + f->dump_unsigned("unfound_objects", pool_sum.stats.sum.num_objects_unfound); + f->dump_unsigned("unfound_total", pool_sum.stats.sum.num_objects); + f->dump_float("unfound_ratio", pc / 100.0); + } else { + ostringstream ss; + ss << pool_sum.stats.sum.num_objects_unfound + << "/" << pool_sum.stats.sum.num_objects << " objects unfound (" << b << "%)"; + psl->push_back(ss.str()); + } + } +} + +void PGMapDigest::recovery_rate_summary(ceph::Formatter *f, ostream *out, + const pool_stat_t& delta_sum, + utime_t delta_stamp) const +{ + // make non-negative; we can get negative values if osds send + // uncommitted stats and then "go backward" or if they are just + // buggy/wrong. + pool_stat_t pos_delta = delta_sum; + pos_delta.floor(0); + if (pos_delta.stats.sum.num_objects_recovered || + pos_delta.stats.sum.num_bytes_recovered || + pos_delta.stats.sum.num_keys_recovered) { + int64_t objps = pos_delta.stats.sum.num_objects_recovered / (double)delta_stamp; + int64_t bps = pos_delta.stats.sum.num_bytes_recovered / (double)delta_stamp; + int64_t kps = pos_delta.stats.sum.num_keys_recovered / (double)delta_stamp; + if (f) { + f->dump_int("recovering_objects_per_sec", objps); + f->dump_int("recovering_bytes_per_sec", bps); + f->dump_int("recovering_keys_per_sec", kps); + f->dump_int("num_objects_recovered", pos_delta.stats.sum.num_objects_recovered); + f->dump_int("num_bytes_recovered", pos_delta.stats.sum.num_bytes_recovered); + f->dump_int("num_keys_recovered", pos_delta.stats.sum.num_keys_recovered); + } else { + *out << byte_u_t(bps) << "/s"; + if (pos_delta.stats.sum.num_keys_recovered) + *out << ", " << si_u_t(kps) << " keys/s"; + *out << ", " << si_u_t(objps) << " objects/s"; + } + } +} + +void PGMapDigest::overall_recovery_rate_summary(ceph::Formatter *f, ostream *out) const +{ + recovery_rate_summary(f, out, pg_sum_delta, stamp_delta); +} + +void PGMapDigest::overall_recovery_summary(ceph::Formatter *f, list<string> *psl) const +{ + recovery_summary(f, psl, pg_sum); +} + +void PGMapDigest::pool_recovery_rate_summary(ceph::Formatter *f, ostream *out, + uint64_t poolid) const +{ + auto p = per_pool_sum_delta.find(poolid); + if (p == per_pool_sum_delta.end()) + return; + + auto ts = per_pool_sum_deltas_stamps.find(p->first); + ceph_assert(ts != per_pool_sum_deltas_stamps.end()); + recovery_rate_summary(f, out, p->second.first, ts->second); +} + +void PGMapDigest::pool_recovery_summary(ceph::Formatter *f, list<string> *psl, + uint64_t poolid) const +{ + auto p = pg_pool_sum.find(poolid); + if (p == pg_pool_sum.end()) + return; + + recovery_summary(f, psl, p->second); +} + +void PGMapDigest::client_io_rate_summary(ceph::Formatter *f, ostream *out, + const pool_stat_t& delta_sum, + utime_t delta_stamp) const +{ + pool_stat_t pos_delta = delta_sum; + pos_delta.floor(0); + if (pos_delta.stats.sum.num_rd || + pos_delta.stats.sum.num_wr) { + if (pos_delta.stats.sum.num_rd) { + int64_t rd = (pos_delta.stats.sum.num_rd_kb << 10) / (double)delta_stamp; + if (f) { + f->dump_int("read_bytes_sec", rd); + } else { + *out << byte_u_t(rd) << "/s rd, "; + } + } 
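+  // the write path below mirrors the read path above: per-interval KiB
+  // counters are converted to bytes (<< 10) and divided by the interval
+  // length (delta_stamp, in seconds) to yield a rate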
+ if (pos_delta.stats.sum.num_wr) { + int64_t wr = (pos_delta.stats.sum.num_wr_kb << 10) / (double)delta_stamp; + if (f) { + f->dump_int("write_bytes_sec", wr); + } else { + *out << byte_u_t(wr) << "/s wr, "; + } + } + int64_t iops_rd = pos_delta.stats.sum.num_rd / (double)delta_stamp; + int64_t iops_wr = pos_delta.stats.sum.num_wr / (double)delta_stamp; + if (f) { + f->dump_int("read_op_per_sec", iops_rd); + f->dump_int("write_op_per_sec", iops_wr); + } else { + *out << si_u_t(iops_rd) << " op/s rd, " << si_u_t(iops_wr) << " op/s wr"; + } + } +} + +void PGMapDigest::overall_client_io_rate_summary(ceph::Formatter *f, ostream *out) const +{ + client_io_rate_summary(f, out, pg_sum_delta, stamp_delta); +} + +void PGMapDigest::pool_client_io_rate_summary(ceph::Formatter *f, ostream *out, + uint64_t poolid) const +{ + auto p = per_pool_sum_delta.find(poolid); + if (p == per_pool_sum_delta.end()) + return; + + auto ts = per_pool_sum_deltas_stamps.find(p->first); + ceph_assert(ts != per_pool_sum_deltas_stamps.end()); + client_io_rate_summary(f, out, p->second.first, ts->second); +} + +void PGMapDigest::cache_io_rate_summary(ceph::Formatter *f, ostream *out, + const pool_stat_t& delta_sum, + utime_t delta_stamp) const +{ + pool_stat_t pos_delta = delta_sum; + pos_delta.floor(0); + bool have_output = false; + + if (pos_delta.stats.sum.num_flush) { + int64_t flush = (pos_delta.stats.sum.num_flush_kb << 10) / (double)delta_stamp; + if (f) { + f->dump_int("flush_bytes_sec", flush); + } else { + *out << byte_u_t(flush) << "/s flush"; + have_output = true; + } + } + if (pos_delta.stats.sum.num_evict) { + int64_t evict = (pos_delta.stats.sum.num_evict_kb << 10) / (double)delta_stamp; + if (f) { + f->dump_int("evict_bytes_sec", evict); + } else { + if (have_output) + *out << ", "; + *out << byte_u_t(evict) << "/s evict"; + have_output = true; + } + } + if (pos_delta.stats.sum.num_promote) { + int64_t promote = pos_delta.stats.sum.num_promote / (double)delta_stamp; + if (f) { + f->dump_int("promote_op_per_sec", promote); + } else { + if (have_output) + *out << ", "; + *out << si_u_t(promote) << " op/s promote"; + have_output = true; + } + } + if (pos_delta.stats.sum.num_flush_mode_low) { + if (f) { + f->dump_int("num_flush_mode_low", pos_delta.stats.sum.num_flush_mode_low); + } else { + if (have_output) + *out << ", "; + *out << si_u_t(pos_delta.stats.sum.num_flush_mode_low) << " PGs flushing"; + have_output = true; + } + } + if (pos_delta.stats.sum.num_flush_mode_high) { + if (f) { + f->dump_int("num_flush_mode_high", pos_delta.stats.sum.num_flush_mode_high); + } else { + if (have_output) + *out << ", "; + *out << si_u_t(pos_delta.stats.sum.num_flush_mode_high) << " PGs flushing (high)"; + have_output = true; + } + } + if (pos_delta.stats.sum.num_evict_mode_some) { + if (f) { + f->dump_int("num_evict_mode_some", pos_delta.stats.sum.num_evict_mode_some); + } else { + if (have_output) + *out << ", "; + *out << si_u_t(pos_delta.stats.sum.num_evict_mode_some) << " PGs evicting"; + have_output = true; + } + } + if (pos_delta.stats.sum.num_evict_mode_full) { + if (f) { + f->dump_int("num_evict_mode_full", pos_delta.stats.sum.num_evict_mode_full); + } else { + if (have_output) + *out << ", "; + *out << si_u_t(pos_delta.stats.sum.num_evict_mode_full) << " PGs evicting (full)"; + } + } +} + +void PGMapDigest::overall_cache_io_rate_summary(ceph::Formatter *f, ostream *out) const +{ + cache_io_rate_summary(f, out, pg_sum_delta, stamp_delta); +} + +void PGMapDigest::pool_cache_io_rate_summary(ceph::Formatter *f, 
ostream *out, + uint64_t poolid) const +{ + auto p = per_pool_sum_delta.find(poolid); + if (p == per_pool_sum_delta.end()) + return; + + auto ts = per_pool_sum_deltas_stamps.find(p->first); + ceph_assert(ts != per_pool_sum_deltas_stamps.end()); + cache_io_rate_summary(f, out, p->second.first, ts->second); +} + +ceph_statfs PGMapDigest::get_statfs(OSDMap &osdmap, + boost::optional<int64_t> data_pool) const +{ + ceph_statfs statfs; + bool filter = false; + object_stat_sum_t sum; + + if (data_pool) { + auto i = pg_pool_sum.find(*data_pool); + if (i != pg_pool_sum.end()) { + sum = i->second.stats.sum; + filter = true; + } + } + + if (filter) { + statfs.kb_used = (sum.num_bytes >> 10); + statfs.kb_avail = get_pool_free_space(osdmap, *data_pool) >> 10; + statfs.num_objects = sum.num_objects; + statfs.kb = statfs.kb_used + statfs.kb_avail; + } else { + // these are in KB. + statfs.kb = osd_sum.statfs.kb(); + statfs.kb_used = osd_sum.statfs.kb_used_raw(); + statfs.kb_avail = osd_sum.statfs.kb_avail(); + statfs.num_objects = pg_sum.stats.sum.num_objects; + } + + return statfs; +} + +void PGMapDigest::dump_pool_stats_full( + const OSDMap &osd_map, + stringstream *ss, + ceph::Formatter *f, + bool verbose) const +{ + TextTable tbl; + + if (f) { + f->open_array_section("pools"); + } else { + tbl.define_column("POOL", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("ID", TextTable::RIGHT, TextTable::RIGHT); + tbl.define_column("PGS", TextTable::RIGHT, TextTable::RIGHT); + tbl.define_column("STORED", TextTable::RIGHT, TextTable::RIGHT); + if (verbose) { + tbl.define_column("(DATA)", TextTable::RIGHT, TextTable::RIGHT); + tbl.define_column("(OMAP)", TextTable::RIGHT, TextTable::RIGHT); + } + tbl.define_column("OBJECTS", TextTable::RIGHT, TextTable::RIGHT); + tbl.define_column("USED", TextTable::RIGHT, TextTable::RIGHT); + if (verbose) { + tbl.define_column("(DATA)", TextTable::RIGHT, TextTable::RIGHT); + tbl.define_column("(OMAP)", TextTable::RIGHT, TextTable::RIGHT); + } + tbl.define_column("%USED", TextTable::RIGHT, TextTable::RIGHT); + tbl.define_column("MAX AVAIL", TextTable::RIGHT, TextTable::RIGHT); + + if (verbose) { + tbl.define_column("QUOTA OBJECTS", TextTable::RIGHT, TextTable::RIGHT); + tbl.define_column("QUOTA BYTES", TextTable::RIGHT, TextTable::RIGHT); + tbl.define_column("DIRTY", TextTable::RIGHT, TextTable::RIGHT); + tbl.define_column("USED COMPR", TextTable::RIGHT, TextTable::RIGHT); + tbl.define_column("UNDER COMPR", TextTable::RIGHT, TextTable::RIGHT); + } + } + + map<int,uint64_t> avail_by_rule; + for (auto p = osd_map.get_pools().begin(); + p != osd_map.get_pools().end(); ++p) { + int64_t pool_id = p->first; + if ((pool_id < 0) || (pg_pool_sum.count(pool_id) == 0)) + continue; + + const string& pool_name = osd_map.get_pool_name(pool_id); + auto pool_pg_num = osd_map.get_pg_num(pool_id); + const pool_stat_t &stat = pg_pool_sum.at(pool_id); + + const pg_pool_t *pool = osd_map.get_pg_pool(pool_id); + int ruleno = osd_map.crush->find_rule(pool->get_crush_rule(), + pool->get_type(), + pool->get_size()); + int64_t avail; + if (avail_by_rule.count(ruleno) == 0) { + // FIXME: we don't guarantee avail_space_by_rule is up-to-date before this function is invoked + avail = get_rule_avail(ruleno); + if (avail < 0) + avail = 0; + avail_by_rule[ruleno] = avail; + } else { + avail = avail_by_rule[ruleno]; + } + if (f) { + f->open_object_section("pool"); + f->dump_string("name", pool_name); + f->dump_int("id", pool_id); + f->open_object_section("stats"); + } else { + tbl << pool_name + << 
pool_id + << pool_pg_num; + } + float raw_used_rate = osd_map.pool_raw_used_rate(pool_id); + bool per_pool = use_per_pool_stats(); + bool per_pool_omap = use_per_pool_omap_stats(); + dump_object_stat_sum(tbl, f, stat, avail, raw_used_rate, verbose, per_pool, + per_pool_omap, pool); + if (f) { + f->close_section(); // stats + f->close_section(); // pool + } else { + tbl << TextTable::endrow; + } + } + if (f) + f->close_section(); + else { + ceph_assert(ss != nullptr); + *ss << "--- POOLS ---\n"; + *ss << tbl; + } +} + +void PGMapDigest::dump_cluster_stats(stringstream *ss, + ceph::Formatter *f, + bool verbose) const +{ + if (f) { + f->open_object_section("stats"); + f->dump_int("total_bytes", osd_sum.statfs.total); + f->dump_int("total_avail_bytes", osd_sum.statfs.available); + f->dump_int("total_used_bytes", osd_sum.statfs.get_used()); + f->dump_int("total_used_raw_bytes", osd_sum.statfs.get_used_raw()); + f->dump_float("total_used_raw_ratio", osd_sum.statfs.get_used_raw_ratio()); + f->dump_unsigned("num_osds", osd_sum.num_osds); + f->dump_unsigned("num_per_pool_osds", osd_sum.num_per_pool_osds); + f->dump_unsigned("num_per_pool_omap_osds", osd_sum.num_per_pool_omap_osds); + f->close_section(); + f->open_object_section("stats_by_class"); + for (auto& i : osd_sum_by_class) { + f->open_object_section(i.first.c_str()); + f->dump_int("total_bytes", i.second.statfs.total); + f->dump_int("total_avail_bytes", i.second.statfs.available); + f->dump_int("total_used_bytes", i.second.statfs.get_used()); + f->dump_int("total_used_raw_bytes", i.second.statfs.get_used_raw()); + f->dump_float("total_used_raw_ratio", + i.second.statfs.get_used_raw_ratio()); + f->close_section(); + } + f->close_section(); + } else { + ceph_assert(ss != nullptr); + TextTable tbl; + tbl.define_column("CLASS", TextTable::LEFT, TextTable::LEFT); + tbl.define_column("SIZE", TextTable::RIGHT, TextTable::RIGHT); + tbl.define_column("AVAIL", TextTable::RIGHT, TextTable::RIGHT); + tbl.define_column("USED", TextTable::RIGHT, TextTable::RIGHT); + tbl.define_column("RAW USED", TextTable::RIGHT, TextTable::RIGHT); + tbl.define_column("%RAW USED", TextTable::RIGHT, TextTable::RIGHT); + + + for (auto& i : osd_sum_by_class) { + tbl << i.first; + tbl << stringify(byte_u_t(i.second.statfs.total)) + << stringify(byte_u_t(i.second.statfs.available)) + << stringify(byte_u_t(i.second.statfs.get_used())) + << stringify(byte_u_t(i.second.statfs.get_used_raw())) + << percentify(i.second.statfs.get_used_raw_ratio()*100.0) + << TextTable::endrow; + } + tbl << "TOTAL"; + tbl << stringify(byte_u_t(osd_sum.statfs.total)) + << stringify(byte_u_t(osd_sum.statfs.available)) + << stringify(byte_u_t(osd_sum.statfs.get_used())) + << stringify(byte_u_t(osd_sum.statfs.get_used_raw())) + << percentify(osd_sum.statfs.get_used_raw_ratio()*100.0) + << TextTable::endrow; + + *ss << "--- RAW STORAGE ---\n"; + *ss << tbl; + } +} + +void PGMapDigest::dump_object_stat_sum( + TextTable &tbl, ceph::Formatter *f, + const pool_stat_t &pool_stat, uint64_t avail, + float raw_used_rate, bool verbose, bool per_pool, bool per_pool_omap, + const pg_pool_t *pool) +{ + const object_stat_sum_t &sum = pool_stat.stats.sum; + const store_statfs_t statfs = pool_stat.store_stats; + + if (sum.num_object_copies > 0) { + raw_used_rate *= (float)(sum.num_object_copies - sum.num_objects_degraded) / sum.num_object_copies; + } + + uint64_t used_data_bytes = pool_stat.get_allocated_data_bytes(per_pool); + uint64_t used_omap_bytes = pool_stat.get_allocated_omap_bytes(per_pool_omap); + uint64_t 
used_bytes = used_data_bytes + used_omap_bytes;
+
+  float used = 0.0;
+  // note: avail passed in is raw available space; derive the raw used ratio here.
+  if (avail) {
+    used = used_bytes;
+    used /= used + avail;
+  } else if (used_bytes) {
+    used = 1.0;
+  }
+  auto avail_res = raw_used_rate ? avail / raw_used_rate : 0;
+  // an approximation for actually stored user data
+  auto stored_data_normalized = pool_stat.get_user_data_bytes(
+    raw_used_rate, per_pool);
+  auto stored_omap_normalized = pool_stat.get_user_omap_bytes(
+    raw_used_rate, per_pool_omap);
+  auto stored_normalized = stored_data_normalized + stored_omap_normalized;
+  // same, amplified by replication or EC
+  auto stored_raw = stored_normalized * raw_used_rate;
+  if (f) {
+    f->dump_int("stored", stored_normalized);
+    if (verbose) {
+      f->dump_int("stored_data", stored_data_normalized);
+      f->dump_int("stored_omap", stored_omap_normalized);
+    }
+    f->dump_int("objects", sum.num_objects);
+    f->dump_int("kb_used", shift_round_up(used_bytes, 10));
+    f->dump_int("bytes_used", used_bytes);
+    if (verbose) {
+      f->dump_int("data_bytes_used", used_data_bytes);
+      f->dump_int("omap_bytes_used", used_omap_bytes);
+    }
+    f->dump_float("percent_used", used);
+    f->dump_unsigned("max_avail", avail_res);
+    if (verbose) {
+      f->dump_int("quota_objects", pool->quota_max_objects);
+      f->dump_int("quota_bytes", pool->quota_max_bytes);
+      if (pool->is_tier()) {
+        f->dump_int("dirty", sum.num_objects_dirty);
+      } else {
+        f->dump_int("dirty", 0);
+      }
+      f->dump_int("rd", sum.num_rd);
+      f->dump_int("rd_bytes", sum.num_rd_kb * 1024ull);
+      f->dump_int("wr", sum.num_wr);
+      f->dump_int("wr_bytes", sum.num_wr_kb * 1024ull);
+      f->dump_int("compress_bytes_used", statfs.data_compressed_allocated);
+      f->dump_int("compress_under_bytes", statfs.data_compressed_original);
+      // Stored by user amplified by replication
+      f->dump_int("stored_raw", stored_raw);
+      f->dump_unsigned("avail_raw", avail);
+    }
+  } else {
+    tbl << stringify(byte_u_t(stored_normalized));
+    if (verbose) {
+      tbl << stringify(byte_u_t(stored_data_normalized));
+      tbl << stringify(byte_u_t(stored_omap_normalized));
+    }
+    tbl << stringify(si_u_t(sum.num_objects));
+    tbl << stringify(byte_u_t(used_bytes));
+    if (verbose) {
+      tbl << stringify(byte_u_t(used_data_bytes));
+      tbl << stringify(byte_u_t(used_omap_bytes));
+    }
+    tbl << percentify(used*100);
+    tbl << stringify(byte_u_t(avail_res));
+    if (verbose) {
+      if (pool->quota_max_objects == 0)
+        tbl << "N/A";
+      else
+        tbl << stringify(si_u_t(pool->quota_max_objects));
+      if (pool->quota_max_bytes == 0)
+        tbl << "N/A";
+      else
+        tbl << stringify(byte_u_t(pool->quota_max_bytes));
+      if (pool->is_tier()) {
+        tbl << stringify(si_u_t(sum.num_objects_dirty));
+      } else {
+        tbl << "N/A";
+      }
+      tbl << stringify(byte_u_t(statfs.data_compressed_allocated));
+      tbl << stringify(byte_u_t(statfs.data_compressed_original));
+    }
+  }
+}
+
+int64_t PGMapDigest::get_pool_free_space(const OSDMap &osd_map,
+                                         int64_t poolid) const
+{
+  const pg_pool_t *pool = osd_map.get_pg_pool(poolid);
+  int ruleno = osd_map.crush->find_rule(pool->get_crush_rule(),
+                                        pool->get_type(),
+                                        pool->get_size());
+  int64_t avail;
+  avail = get_rule_avail(ruleno);
+  if (avail < 0)
+    avail = 0;
+
+  return avail / osd_map.pool_raw_used_rate(poolid);
+}
+
+int64_t PGMap::get_rule_avail(const OSDMap& osdmap, int ruleno) const
+{
+  map<int,float> wm;
+  int r = osdmap.crush->get_rule_weight_osd_map(ruleno, &wm);
+  if (r < 0) {
+    return r;
+  }
+  if (wm.empty()) {
+    return 0;
+  }
+
+  float fratio = osdmap.get_full_ratio();
+
+  int64_t min =
-1; + for (auto p = wm.begin(); p != wm.end(); ++p) { + auto osd_info = osd_stat.find(p->first); + if (osd_info != osd_stat.end()) { + if (osd_info->second.statfs.total == 0 || p->second == 0) { + // osd must be out, hence its stats have been zeroed + // (unless we somehow managed to have a disk with size 0...) + // + // (p->second == 0), if osd weight is 0, no need to + // calculate proj below. + continue; + } + double unusable = (double)osd_info->second.statfs.kb() * + (1.0 - fratio); + double avail = std::max(0.0, (double)osd_info->second.statfs.kb_avail() - unusable); + avail *= 1024.0; + int64_t proj = (int64_t)(avail / (double)p->second); + if (min < 0 || proj < min) { + min = proj; + } + } else { + if (osdmap.is_up(p->first)) { + // This is a level 4 rather than an error, because we might have + // only just started, and not received the first stats message yet. + dout(4) << "OSD " << p->first << " is up, but has no stats" << dendl; + } + } + } + return min; +} + +void PGMap::get_rules_avail(const OSDMap& osdmap, + std::map<int,int64_t> *avail_map) const +{ + avail_map->clear(); + for (auto p : osdmap.get_pools()) { + int64_t pool_id = p.first; + if ((pool_id < 0) || (pg_pool_sum.count(pool_id) == 0)) + continue; + const pg_pool_t *pool = osdmap.get_pg_pool(pool_id); + int ruleno = osdmap.crush->find_rule(pool->get_crush_rule(), + pool->get_type(), + pool->get_size()); + if (avail_map->count(ruleno) == 0) + (*avail_map)[ruleno] = get_rule_avail(osdmap, ruleno); + } +} + +// --------------------- +// PGMap + +void PGMap::Incremental::dump(ceph::Formatter *f) const +{ + f->dump_unsigned("version", version); + f->dump_stream("stamp") << stamp; + f->dump_unsigned("osdmap_epoch", osdmap_epoch); + f->dump_unsigned("pg_scan_epoch", pg_scan); + + f->open_array_section("pg_stat_updates"); + for (auto p = pg_stat_updates.begin(); p != pg_stat_updates.end(); ++p) { + f->open_object_section("pg_stat"); + f->dump_stream("pgid") << p->first; + p->second.dump(f); + f->close_section(); + } + f->close_section(); + + f->open_array_section("osd_stat_updates"); + for (auto p = osd_stat_updates.begin(); p != osd_stat_updates.end(); ++p) { + f->open_object_section("osd_stat"); + f->dump_int("osd", p->first); + p->second.dump(f); + f->close_section(); + } + f->close_section(); + f->open_array_section("pool_statfs_updates"); + for (auto p = pool_statfs_updates.begin(); p != pool_statfs_updates.end(); ++p) { + f->open_object_section("pool_statfs"); + f->dump_stream("poolid/osd") << p->first; + p->second.dump(f); + f->close_section(); + } + f->close_section(); + + f->open_array_section("osd_stat_removals"); + for (auto p = osd_stat_rm.begin(); p != osd_stat_rm.end(); ++p) + f->dump_int("osd", *p); + f->close_section(); + + f->open_array_section("pg_removals"); + for (auto p = pg_remove.begin(); p != pg_remove.end(); ++p) + f->dump_stream("pgid") << *p; + f->close_section(); +} + +void PGMap::Incremental::generate_test_instances(list<PGMap::Incremental*>& o) +{ + o.push_back(new Incremental); + o.push_back(new Incremental); + o.back()->version = 1; + o.back()->stamp = utime_t(123,345); + o.push_back(new Incremental); + o.back()->version = 2; + o.back()->pg_stat_updates[pg_t(1,2)] = pg_stat_t(); + o.back()->osd_stat_updates[5] = osd_stat_t(); + o.push_back(new Incremental); + o.back()->version = 3; + o.back()->osdmap_epoch = 1; + o.back()->pg_scan = 2; + o.back()->pg_stat_updates[pg_t(4,5)] = pg_stat_t(); + o.back()->osd_stat_updates[6] = osd_stat_t(); + o.back()->pg_remove.insert(pg_t(1,2)); + 
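+  // (This last instance also removes an osd_stat entry and updates a
+  // pool_statfs entry below, exercising the removal and update paths
+  // in apply_incremental().)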
o.back()->osd_stat_rm.insert(5); + o.back()->pool_statfs_updates[std::make_pair(1234,4)] = store_statfs_t(); +} + +// -- + +void PGMap::apply_incremental(CephContext *cct, const Incremental& inc) +{ + ceph_assert(inc.version == version+1); + version++; + + pool_stat_t pg_sum_old = pg_sum; + mempool::pgmap::unordered_map<int32_t, pool_stat_t> pg_pool_sum_old; + pg_pool_sum_old = pg_pool_sum; + + for (auto p = inc.pg_stat_updates.begin(); + p != inc.pg_stat_updates.end(); + ++p) { + const pg_t &update_pg(p->first); + auto update_pool = update_pg.pool(); + const pg_stat_t &update_stat(p->second); + + auto pg_stat_iter = pg_stat.find(update_pg); + pool_stat_t &pool_sum_ref = pg_pool_sum[update_pool]; + if (pg_stat_iter == pg_stat.end()) { + pg_stat.insert(make_pair(update_pg, update_stat)); + } else { + stat_pg_sub(update_pg, pg_stat_iter->second); + pool_sum_ref.sub(pg_stat_iter->second); + pg_stat_iter->second = update_stat; + } + stat_pg_add(update_pg, update_stat); + pool_sum_ref.add(update_stat); + } + + for (auto p = inc.pool_statfs_updates.begin(); + p != inc.pool_statfs_updates.end(); + ++p) { + auto update_pool = p->first.first; + auto update_osd = p->first.second; + auto& statfs_inc = p->second; + + auto pool_statfs_iter = + pool_statfs.find(std::make_pair(update_pool, update_osd)); + if (pg_pool_sum.count(update_pool)) { + pool_stat_t &pool_sum_ref = pg_pool_sum[update_pool]; + if (pool_statfs_iter == pool_statfs.end()) { + pool_statfs.emplace(std::make_pair(update_pool, update_osd), statfs_inc); + } else { + pool_sum_ref.sub(pool_statfs_iter->second); + pool_statfs_iter->second = statfs_inc; + } + pool_sum_ref.add(statfs_inc); + } + } + + for (auto p = inc.get_osd_stat_updates().begin(); + p != inc.get_osd_stat_updates().end(); + ++p) { + int osd = p->first; + const osd_stat_t &new_stats(p->second); + + auto t = osd_stat.find(osd); + if (t == osd_stat.end()) { + osd_stat.insert(make_pair(osd, new_stats)); + } else { + stat_osd_sub(t->first, t->second); + t->second = new_stats; + } + stat_osd_add(osd, new_stats); + } + set<int64_t> deleted_pools; + for (auto p = inc.pg_remove.begin(); + p != inc.pg_remove.end(); + ++p) { + const pg_t &removed_pg(*p); + auto s = pg_stat.find(removed_pg); + bool pool_erased = false; + if (s != pg_stat.end()) { + pool_erased = stat_pg_sub(removed_pg, s->second); + + // decrease pool stats if pg was removed + auto pool_stats_it = pg_pool_sum.find(removed_pg.pool()); + if (pool_stats_it != pg_pool_sum.end()) { + pool_stats_it->second.sub(s->second); + } + + pg_stat.erase(s); + if (pool_erased) { + deleted_pools.insert(removed_pg.pool()); + } + } + } + + for (auto p = inc.get_osd_stat_rm().begin(); + p != inc.get_osd_stat_rm().end(); + ++p) { + auto t = osd_stat.find(*p); + if (t != osd_stat.end()) { + stat_osd_sub(t->first, t->second); + osd_stat.erase(t); + } + for (auto i = pool_statfs.begin(); i != pool_statfs.end(); ++i) { + if (i->first.second == *p) { + pg_pool_sum[i->first.first].sub(i->second); + pool_statfs.erase(i); + } + } + } + + // skip calculating delta while sum was not synchronized + if (!stamp.is_zero() && !pg_sum_old.stats.sum.is_zero()) { + utime_t delta_t; + delta_t = inc.stamp; + delta_t -= stamp; + // calculate a delta, and average over the last 2 deltas. + pool_stat_t d = pg_sum; + d.stats.sub(pg_sum_old.stats); + pg_sum_deltas.push_back(make_pair(d, delta_t)); + stamp_delta += delta_t; + pg_sum_delta.stats.add(d.stats); + auto smooth_intervals = + cct ? 
cct->_conf.get_val<uint64_t>("mon_stat_smooth_intervals") : 1; + while (pg_sum_deltas.size() > smooth_intervals) { + pg_sum_delta.stats.sub(pg_sum_deltas.front().first.stats); + stamp_delta -= pg_sum_deltas.front().second; + pg_sum_deltas.pop_front(); + } + } + stamp = inc.stamp; + + update_pool_deltas(cct, inc.stamp, pg_pool_sum_old); + + for (auto p : deleted_pools) { + if (cct) + dout(20) << " deleted pool " << p << dendl; + deleted_pool(p); + } + + if (inc.osdmap_epoch) + last_osdmap_epoch = inc.osdmap_epoch; + if (inc.pg_scan) + last_pg_scan = inc.pg_scan; +} + +void PGMap::calc_stats() +{ + num_pg = 0; + num_pg_active = 0; + num_pg_unknown = 0; + num_osd = 0; + pg_pool_sum.clear(); + num_pg_by_pool.clear(); + pg_by_osd.clear(); + pg_sum = pool_stat_t(); + osd_sum = osd_stat_t(); + osd_sum_by_class.clear(); + num_pg_by_state.clear(); + num_pg_by_pool_state.clear(); + num_pg_by_osd.clear(); + + for (auto p = pg_stat.begin(); + p != pg_stat.end(); + ++p) { + auto pg = p->first; + stat_pg_add(pg, p->second); + pg_pool_sum[pg.pool()].add(p->second); + } + for (auto p = pool_statfs.begin(); + p != pool_statfs.end(); + ++p) { + auto pool = p->first.first; + pg_pool_sum[pool].add(p->second); + } + for (auto p = osd_stat.begin(); + p != osd_stat.end(); + ++p) + stat_osd_add(p->first, p->second); +} + +void PGMap::stat_pg_add(const pg_t &pgid, const pg_stat_t &s, + bool sameosds) +{ + auto pool = pgid.pool(); + pg_sum.add(s); + + num_pg++; + num_pg_by_state[s.state]++; + num_pg_by_pool_state[pgid.pool()][s.state]++; + num_pg_by_pool[pool]++; + + if ((s.state & PG_STATE_CREATING) && + s.parent_split_bits == 0) { + creating_pgs.insert(pgid); + if (s.acting_primary >= 0) { + creating_pgs_by_osd_epoch[s.acting_primary][s.mapping_epoch].insert(pgid); + } + } + + if (s.state & PG_STATE_ACTIVE) { + ++num_pg_active; + } + if (s.state == 0) { + ++num_pg_unknown; + } + + if (sameosds) + return; + + for (auto p = s.blocked_by.begin(); + p != s.blocked_by.end(); + ++p) { + ++blocked_by_sum[*p]; + } + + for (auto p = s.acting.begin(); p != s.acting.end(); ++p) { + pg_by_osd[*p].insert(pgid); + num_pg_by_osd[*p].acting++; + } + for (auto p = s.up.begin(); p != s.up.end(); ++p) { + auto& t = pg_by_osd[*p]; + if (t.find(pgid) == t.end()) { + t.insert(pgid); + num_pg_by_osd[*p].up_not_acting++; + } + } + + if (s.up_primary >= 0) { + num_pg_by_osd[s.up_primary].primary++; + } +} + +bool PGMap::stat_pg_sub(const pg_t &pgid, const pg_stat_t &s, + bool sameosds) +{ + bool pool_erased = false; + pg_sum.sub(s); + + num_pg--; + int end = --num_pg_by_state[s.state]; + ceph_assert(end >= 0); + if (end == 0) + num_pg_by_state.erase(s.state); + if (--num_pg_by_pool_state[pgid.pool()][s.state] == 0) { + num_pg_by_pool_state[pgid.pool()].erase(s.state); + } + end = --num_pg_by_pool[pgid.pool()]; + if (end == 0) { + pool_erased = true; + } + + if ((s.state & PG_STATE_CREATING) && + s.parent_split_bits == 0) { + creating_pgs.erase(pgid); + if (s.acting_primary >= 0) { + map<epoch_t,set<pg_t> >& r = creating_pgs_by_osd_epoch[s.acting_primary]; + r[s.mapping_epoch].erase(pgid); + if (r[s.mapping_epoch].empty()) + r.erase(s.mapping_epoch); + if (r.empty()) + creating_pgs_by_osd_epoch.erase(s.acting_primary); + } + } + + if (s.state & PG_STATE_ACTIVE) { + --num_pg_active; + } + if (s.state == 0) { + --num_pg_unknown; + } + + if (sameosds) + return pool_erased; + + for (auto p = s.blocked_by.begin(); + p != s.blocked_by.end(); + ++p) { + auto q = blocked_by_sum.find(*p); + ceph_assert(q != blocked_by_sum.end()); + --q->second; + 
if (q->second == 0) + blocked_by_sum.erase(q); + } + + set<int32_t> actingset; + for (auto p = s.acting.begin(); p != s.acting.end(); ++p) { + actingset.insert(*p); + auto& oset = pg_by_osd[*p]; + oset.erase(pgid); + if (oset.empty()) + pg_by_osd.erase(*p); + auto it = num_pg_by_osd.find(*p); + if (it != num_pg_by_osd.end() && it->second.acting > 0) + it->second.acting--; + } + for (auto p = s.up.begin(); p != s.up.end(); ++p) { + auto& oset = pg_by_osd[*p]; + oset.erase(pgid); + if (oset.empty()) + pg_by_osd.erase(*p); + if (actingset.count(*p)) + continue; + auto it = num_pg_by_osd.find(*p); + if (it != num_pg_by_osd.end() && it->second.up_not_acting > 0) + it->second.up_not_acting--; + } + + if (s.up_primary >= 0) { + auto it = num_pg_by_osd.find(s.up_primary); + if (it != num_pg_by_osd.end() && it->second.primary > 0) + it->second.primary--; + } + return pool_erased; +} + +void PGMap::calc_purged_snaps() +{ + purged_snaps.clear(); + set<int64_t> unknown; + for (auto& i : pg_stat) { + if (i.second.state == 0) { + unknown.insert(i.first.pool()); + purged_snaps.erase(i.first.pool()); + continue; + } else if (unknown.count(i.first.pool())) { + continue; + } + auto j = purged_snaps.find(i.first.pool()); + if (j == purged_snaps.end()) { + // base case + purged_snaps[i.first.pool()] = i.second.purged_snaps; + } else { + j->second.intersection_of(i.second.purged_snaps); + } + } +} + +void PGMap::calc_osd_sum_by_class(const OSDMap& osdmap) +{ + osd_sum_by_class.clear(); + for (auto& i : osd_stat) { + const char *class_name = osdmap.crush->get_item_class(i.first); + if (class_name) { + osd_sum_by_class[class_name].add(i.second); + } + } +} + +void PGMap::stat_osd_add(int osd, const osd_stat_t &s) +{ + num_osd++; + osd_sum.add(s); + if (osd >= (int)osd_last_seq.size()) { + osd_last_seq.resize(osd + 1); + } + osd_last_seq[osd] = s.seq; +} + +void PGMap::stat_osd_sub(int osd, const osd_stat_t &s) +{ + num_osd--; + osd_sum.sub(s); + ceph_assert(osd < (int)osd_last_seq.size()); + osd_last_seq[osd] = 0; +} + +void PGMap::encode_digest(const OSDMap& osdmap, + bufferlist& bl, uint64_t features) +{ + get_rules_avail(osdmap, &avail_space_by_rule); + calc_osd_sum_by_class(osdmap); + calc_purged_snaps(); + PGMapDigest::encode(bl, features); +} + +void PGMap::encode(bufferlist &bl, uint64_t features) const +{ + ENCODE_START(8, 8, bl); + encode(version, bl); + encode(pg_stat, bl); + encode(osd_stat, bl, features); + encode(last_osdmap_epoch, bl); + encode(last_pg_scan, bl); + encode(stamp, bl); + encode(pool_statfs, bl, features); + ENCODE_FINISH(bl); +} + +void PGMap::decode(bufferlist::const_iterator &bl) +{ + DECODE_START(8, bl); + decode(version, bl); + decode(pg_stat, bl); + decode(osd_stat, bl); + decode(last_osdmap_epoch, bl); + decode(last_pg_scan, bl); + decode(stamp, bl); + decode(pool_statfs, bl); + DECODE_FINISH(bl); + + calc_stats(); +} + +void PGMap::dump(ceph::Formatter *f, bool with_net) const +{ + dump_basic(f); + dump_pg_stats(f, false); + dump_pool_stats(f); + dump_osd_stats(f, with_net); +} + +void PGMap::dump_basic(ceph::Formatter *f) const +{ + f->dump_unsigned("version", version); + f->dump_stream("stamp") << stamp; + f->dump_unsigned("last_osdmap_epoch", last_osdmap_epoch); + f->dump_unsigned("last_pg_scan", last_pg_scan); + + f->open_object_section("pg_stats_sum"); + pg_sum.dump(f); + f->close_section(); + + f->open_object_section("osd_stats_sum"); + osd_sum.dump(f); + f->close_section(); + + dump_delta(f); +} + +void PGMap::dump_delta(ceph::Formatter *f) const +{ + 
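+  // pg_sum_delta is the smoothed cluster-wide stat delta maintained by
+  // apply_incremental(), and stamp_delta is the wall-clock span it covers;
+  // dividing one by the other yields the per-second rates shown elsewhere.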
f->open_object_section("pg_stats_delta"); + pg_sum_delta.dump(f); + f->dump_stream("stamp_delta") << stamp_delta; + f->close_section(); +} + +void PGMap::dump_pg_stats(ceph::Formatter *f, bool brief) const +{ + f->open_array_section("pg_stats"); + for (auto i = pg_stat.begin(); + i != pg_stat.end(); + ++i) { + f->open_object_section("pg_stat"); + f->dump_stream("pgid") << i->first; + if (brief) + i->second.dump_brief(f); + else + i->second.dump(f); + f->close_section(); + } + f->close_section(); +} + +void PGMap::dump_pg_progress(ceph::Formatter *f) const +{ + f->open_object_section("pgs"); + for (auto& i : pg_stat) { + std::string n = stringify(i.first); + f->open_object_section(n.c_str()); + f->dump_int("num_bytes_recovered", i.second.stats.sum.num_bytes_recovered); + f->dump_int("num_bytes", i.second.stats.sum.num_bytes); + f->dump_unsigned("reported_epoch", i.second.reported_epoch); + f->dump_string("state", pg_state_string(i.second.state)); + f->close_section(); + } + f->close_section(); +} + +void PGMap::dump_pool_stats(ceph::Formatter *f) const +{ + f->open_array_section("pool_stats"); + for (auto p = pg_pool_sum.begin(); + p != pg_pool_sum.end(); + ++p) { + f->open_object_section("pool_stat"); + f->dump_int("poolid", p->first); + auto q = num_pg_by_pool.find(p->first); + if (q != num_pg_by_pool.end()) + f->dump_unsigned("num_pg", q->second); + p->second.dump(f); + f->close_section(); + } + f->close_section(); +} + +void PGMap::dump_osd_stats(ceph::Formatter *f, bool with_net) const +{ + f->open_array_section("osd_stats"); + for (auto q = osd_stat.begin(); + q != osd_stat.end(); + ++q) { + f->open_object_section("osd_stat"); + f->dump_int("osd", q->first); + q->second.dump(f, with_net); + f->close_section(); + } + f->close_section(); + + f->open_array_section("pool_statfs"); + for (auto& p : pool_statfs) { + f->open_object_section("item"); + f->dump_int("poolid", p.first.first); + f->dump_int("osd", p.first.second); + p.second.dump(f); + f->close_section(); + } + f->close_section(); +} + +void PGMap::dump_osd_ping_times(ceph::Formatter *f) const +{ + f->open_array_section("osd_ping_times"); + for (auto& [osd, stat] : osd_stat) { + f->open_object_section("osd_ping_time"); + f->dump_int("osd", osd); + stat.dump_ping_time(f); + f->close_section(); + } + f->close_section(); +} + +void PGMap::dump_pg_stats_plain( + ostream& ss, + const mempool::pgmap::unordered_map<pg_t, pg_stat_t>& pg_stats, + bool brief) const +{ + TextTable tab; + + if (brief){ + tab.define_column("PG_STAT", TextTable::LEFT, TextTable::LEFT); + tab.define_column("STATE", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("UP", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("UP_PRIMARY", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("ACTING", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("ACTING_PRIMARY", TextTable::LEFT, TextTable::RIGHT); + } + else { + tab.define_column("PG_STAT", TextTable::LEFT, TextTable::LEFT); + tab.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("MISSING_ON_PRIMARY", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("DEGRADED", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("MISPLACED", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("UNFOUND", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("BYTES", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("OMAP_BYTES*", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("OMAP_KEYS*", TextTable::LEFT, TextTable::RIGHT); + 
tab.define_column("LOG", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("DISK_LOG", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("STATE", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("STATE_STAMP", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("VERSION", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("REPORTED", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("UP", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("UP_PRIMARY", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("ACTING", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("ACTING_PRIMARY", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("LAST_SCRUB", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("SCRUB_STAMP", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("LAST_DEEP_SCRUB", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("DEEP_SCRUB_STAMP", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("SNAPTRIMQ_LEN", TextTable::LEFT, TextTable::RIGHT); + } + + for (auto i = pg_stats.begin(); + i != pg_stats.end(); ++i) { + const pg_stat_t &st(i->second); + if (brief) { + tab << i->first + << pg_state_string(st.state) + << st.up + << st.up_primary + << st.acting + << st.acting_primary + << TextTable::endrow; + } else { + ostringstream reported; + reported << st.reported_epoch << ":" << st.reported_seq; + + tab << i->first + << st.stats.sum.num_objects + << st.stats.sum.num_objects_missing_on_primary + << st.stats.sum.num_objects_degraded + << st.stats.sum.num_objects_misplaced + << st.stats.sum.num_objects_unfound + << st.stats.sum.num_bytes + << st.stats.sum.num_omap_bytes + << st.stats.sum.num_omap_keys + << st.log_size + << st.ondisk_log_size + << pg_state_string(st.state) + << st.last_change + << st.version + << reported.str() + << pg_vector_string(st.up) + << st.up_primary + << pg_vector_string(st.acting) + << st.acting_primary + << st.last_scrub + << st.last_scrub_stamp + << st.last_deep_scrub + << st.last_deep_scrub_stamp + << st.snaptrimq_len + << TextTable::endrow; + } + } + + ss << tab; +} + +void PGMap::dump(ostream& ss) const +{ + dump_basic(ss); + dump_pg_stats(ss, false); + dump_pool_stats(ss, false); + dump_pg_sum_stats(ss, false); + dump_osd_stats(ss); +} + +void PGMap::dump_basic(ostream& ss) const +{ + ss << "version " << version << std::endl; + ss << "stamp " << stamp << std::endl; + ss << "last_osdmap_epoch " << last_osdmap_epoch << std::endl; + ss << "last_pg_scan " << last_pg_scan << std::endl; +} + +void PGMap::dump_pg_stats(ostream& ss, bool brief) const +{ + dump_pg_stats_plain(ss, pg_stat, brief); +} + +void PGMap::dump_pool_stats(ostream& ss, bool header) const +{ + TextTable tab; + + if (header) { + tab.define_column("POOLID", TextTable::LEFT, TextTable::LEFT); + tab.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("MISSING_ON_PRIMARY", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("DEGRADED", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("MISPLACED", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("UNFOUND", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("BYTES", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("OMAP_BYTES*", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("OMAP_KEYS*", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("LOG", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("DISK_LOG", TextTable::LEFT, TextTable::RIGHT); + } else { + tab.define_column("", TextTable::LEFT, 
TextTable::LEFT); + tab.define_column("", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("", TextTable::LEFT, TextTable::RIGHT); + } + + for (auto p = pg_pool_sum.begin(); + p != pg_pool_sum.end(); + ++p) { + tab << p->first + << p->second.stats.sum.num_objects + << p->second.stats.sum.num_objects_missing_on_primary + << p->second.stats.sum.num_objects_degraded + << p->second.stats.sum.num_objects_misplaced + << p->second.stats.sum.num_objects_unfound + << p->second.stats.sum.num_bytes + << p->second.stats.sum.num_omap_bytes + << p->second.stats.sum.num_omap_keys + << p->second.log_size + << p->second.ondisk_log_size + << TextTable::endrow; + } + + ss << tab; +} + +void PGMap::dump_pg_sum_stats(ostream& ss, bool header) const +{ + TextTable tab; + + if (header) { + tab.define_column("PG_STAT", TextTable::LEFT, TextTable::LEFT); + tab.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("MISSING_ON_PRIMARY", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("DEGRADED", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("MISPLACED", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("UNFOUND", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("BYTES", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("OMAP_BYTES*", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("OMAP_KEYS*", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("LOG", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("DISK_LOG", TextTable::LEFT, TextTable::RIGHT); + } else { + tab.define_column("", TextTable::LEFT, TextTable::LEFT); + tab.define_column("", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("", TextTable::LEFT, TextTable::RIGHT); + }; + + tab << "sum" + << pg_sum.stats.sum.num_objects + << pg_sum.stats.sum.num_objects_missing_on_primary + << pg_sum.stats.sum.num_objects_degraded + << pg_sum.stats.sum.num_objects_misplaced + << pg_sum.stats.sum.num_objects_unfound + << pg_sum.stats.sum.num_bytes + << pg_sum.stats.sum.num_omap_bytes + << pg_sum.stats.sum.num_omap_keys + << pg_sum.log_size + << pg_sum.ondisk_log_size + << TextTable::endrow; + + ss << tab; +} + +void PGMap::dump_osd_stats(ostream& ss) const +{ + TextTable tab; + + tab.define_column("OSD_STAT", TextTable::LEFT, TextTable::LEFT); + tab.define_column("USED", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("AVAIL", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("USED_RAW", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("TOTAL", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("HB_PEERS", 
TextTable::LEFT, TextTable::RIGHT); + tab.define_column("PG_SUM", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("PRIMARY_PG_SUM", TextTable::LEFT, TextTable::RIGHT); + + for (auto p = osd_stat.begin(); + p != osd_stat.end(); + ++p) { + tab << p->first + << byte_u_t(p->second.statfs.get_used()) + << byte_u_t(p->second.statfs.available) + << byte_u_t(p->second.statfs.get_used_raw()) + << byte_u_t(p->second.statfs.total) + << p->second.hb_peers + << get_num_pg_by_osd(p->first) + << get_num_primary_pg_by_osd(p->first) + << TextTable::endrow; + } + + tab << "sum" + << byte_u_t(osd_sum.statfs.get_used()) + << byte_u_t(osd_sum.statfs.available) + << byte_u_t(osd_sum.statfs.get_used_raw()) + << byte_u_t(osd_sum.statfs.total) + << TextTable::endrow; + + ss << tab; +} + +void PGMap::dump_osd_sum_stats(ostream& ss) const +{ + TextTable tab; + + tab.define_column("OSD_STAT", TextTable::LEFT, TextTable::LEFT); + tab.define_column("USED", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("AVAIL", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("USED_RAW", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("TOTAL", TextTable::LEFT, TextTable::RIGHT); + + tab << "sum" + << byte_u_t(osd_sum.statfs.get_used()) + << byte_u_t(osd_sum.statfs.available) + << byte_u_t(osd_sum.statfs.get_used_raw()) + << byte_u_t(osd_sum.statfs.total) + << TextTable::endrow; + + ss << tab; +} + +void PGMap::get_stuck_stats( + int types, const utime_t cutoff, + mempool::pgmap::unordered_map<pg_t, pg_stat_t>& stuck_pgs) const +{ + ceph_assert(types != 0); + for (auto i = pg_stat.begin(); + i != pg_stat.end(); + ++i) { + utime_t val = cutoff; // don't care about >= cutoff so that is infinity + + if ((types & STUCK_INACTIVE) && !(i->second.state & PG_STATE_ACTIVE)) { + if (i->second.last_active < val) + val = i->second.last_active; + } + + if ((types & STUCK_UNCLEAN) && !(i->second.state & PG_STATE_CLEAN)) { + if (i->second.last_clean < val) + val = i->second.last_clean; + } + + if ((types & STUCK_DEGRADED) && (i->second.state & PG_STATE_DEGRADED)) { + if (i->second.last_undegraded < val) + val = i->second.last_undegraded; + } + + if ((types & STUCK_UNDERSIZED) && (i->second.state & PG_STATE_UNDERSIZED)) { + if (i->second.last_fullsized < val) + val = i->second.last_fullsized; + } + + if ((types & STUCK_STALE) && (i->second.state & PG_STATE_STALE)) { + if (i->second.last_unstale < val) + val = i->second.last_unstale; + } + + // val is now the earliest any of the requested stuck states began + if (val < cutoff) { + stuck_pgs[i->first] = i->second; + } + } +} + +bool PGMap::get_stuck_counts(const utime_t cutoff, map<string, int>& note) const +{ + int inactive = 0; + int unclean = 0; + int degraded = 0; + int undersized = 0; + int stale = 0; + + for (auto i = pg_stat.begin(); + i != pg_stat.end(); + ++i) { + if (! (i->second.state & PG_STATE_ACTIVE)) { + if (i->second.last_active < cutoff) + ++inactive; + } + if (! 
(i->second.state & PG_STATE_CLEAN)) { + if (i->second.last_clean < cutoff) + ++unclean; + } + if (i->second.state & PG_STATE_DEGRADED) { + if (i->second.last_undegraded < cutoff) + ++degraded; + } + if (i->second.state & PG_STATE_UNDERSIZED) { + if (i->second.last_fullsized < cutoff) + ++undersized; + } + if (i->second.state & PG_STATE_STALE) { + if (i->second.last_unstale < cutoff) + ++stale; + } + } + + if (inactive) + note["stuck inactive"] = inactive; + + if (unclean) + note["stuck unclean"] = unclean; + + if (undersized) + note["stuck undersized"] = undersized; + + if (degraded) + note["stuck degraded"] = degraded; + + if (stale) + note["stuck stale"] = stale; + + return inactive || unclean || undersized || degraded || stale; +} + +void PGMap::dump_stuck(ceph::Formatter *f, int types, utime_t cutoff) const +{ + mempool::pgmap::unordered_map<pg_t, pg_stat_t> stuck_pg_stats; + get_stuck_stats(types, cutoff, stuck_pg_stats); + f->open_array_section("stuck_pg_stats"); + for (auto i = stuck_pg_stats.begin(); + i != stuck_pg_stats.end(); + ++i) { + f->open_object_section("pg_stat"); + f->dump_stream("pgid") << i->first; + i->second.dump(f); + f->close_section(); + } + f->close_section(); +} + +void PGMap::dump_stuck_plain(ostream& ss, int types, utime_t cutoff) const +{ + mempool::pgmap::unordered_map<pg_t, pg_stat_t> stuck_pg_stats; + get_stuck_stats(types, cutoff, stuck_pg_stats); + if (!stuck_pg_stats.empty()) + dump_pg_stats_plain(ss, stuck_pg_stats, true); +} + +int PGMap::dump_stuck_pg_stats( + stringstream &ds, + ceph::Formatter *f, + int threshold, + vector<string>& args) const +{ + int stuck_types = 0; + + for (auto i = args.begin(); i != args.end(); ++i) { + if (*i == "inactive") + stuck_types |= PGMap::STUCK_INACTIVE; + else if (*i == "unclean") + stuck_types |= PGMap::STUCK_UNCLEAN; + else if (*i == "undersized") + stuck_types |= PGMap::STUCK_UNDERSIZED; + else if (*i == "degraded") + stuck_types |= PGMap::STUCK_DEGRADED; + else if (*i == "stale") + stuck_types |= PGMap::STUCK_STALE; + else { + ds << "Unknown type: " << *i << std::endl; + return -EINVAL; + } + } + + utime_t now(ceph_clock_now()); + utime_t cutoff = now - utime_t(threshold, 0); + + if (!f) { + dump_stuck_plain(ds, stuck_types, cutoff); + } else { + dump_stuck(f, stuck_types, cutoff); + f->flush(ds); + } + + return 0; +} + +void PGMap::dump_osd_perf_stats(ceph::Formatter *f) const +{ + f->open_array_section("osd_perf_infos"); + for (auto i = osd_stat.begin(); + i != osd_stat.end(); + ++i) { + f->open_object_section("osd"); + f->dump_int("id", i->first); + { + f->open_object_section("perf_stats"); + i->second.os_perf_stat.dump(f); + f->close_section(); + } + f->close_section(); + } + f->close_section(); +} +void PGMap::print_osd_perf_stats(std::ostream *ss) const +{ + TextTable tab; + tab.define_column("osd", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("commit_latency(ms)", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("apply_latency(ms)", TextTable::LEFT, TextTable::RIGHT); + for (auto i = osd_stat.begin(); + i != osd_stat.end(); + ++i) { + tab << i->first; + tab << i->second.os_perf_stat.os_commit_latency_ns / 1000000ull; + tab << i->second.os_perf_stat.os_apply_latency_ns / 1000000ull; + tab << TextTable::endrow; + } + (*ss) << tab; +} + +void PGMap::dump_osd_blocked_by_stats(ceph::Formatter *f) const +{ + f->open_array_section("osd_blocked_by_infos"); + for (auto i = blocked_by_sum.begin(); + i != blocked_by_sum.end(); + ++i) { + f->open_object_section("osd"); + f->dump_int("id", 
i->first);
+    f->dump_int("num_blocked", i->second);
+    f->close_section();
+  }
+  f->close_section();
+}
+void PGMap::print_osd_blocked_by_stats(std::ostream *ss) const
+{
+  TextTable tab;
+  tab.define_column("osd", TextTable::LEFT, TextTable::RIGHT);
+  tab.define_column("num_blocked", TextTable::LEFT, TextTable::RIGHT);
+  for (auto i = blocked_by_sum.begin();
+       i != blocked_by_sum.end();
+       ++i) {
+    tab << i->first;
+    tab << i->second;
+    tab << TextTable::endrow;
+  }
+  (*ss) << tab;
+}
+
+
+/**
+ * update aggregated delta
+ *
+ * @param cct ceph context
+ * @param ts Timestamp for the stats being delta'ed
+ * @param old_pool_sum Previous stats sum
+ * @param last_ts Last timestamp seen for this pool; updated to @p ts
+ * @param current_pool_sum Current stats sum
+ * @param result_pool_delta Resulting pool delta
+ * @param result_ts_delta Resulting timestamp delta
+ * @param delta_avg_list List of the last N computed deltas, used to average
+ */
+void PGMap::update_delta(
+  CephContext *cct,
+  const utime_t ts,
+  const pool_stat_t& old_pool_sum,
+  utime_t *last_ts,
+  const pool_stat_t& current_pool_sum,
+  pool_stat_t *result_pool_delta,
+  utime_t *result_ts_delta,
+  mempool::pgmap::list<pair<pool_stat_t,utime_t> > *delta_avg_list)
+{
+  /* @p ts is the timestamp we want to associate with the data
+   * in @p old_pool_sum, and on which we will base ourselves to
+   * calculate the delta, stored in 'delta_t'.
+   */
+  utime_t delta_t;
+  delta_t = ts;         // start with the provided timestamp
+  delta_t -= *last_ts;  // subtract the last timestamp we saw
+  *last_ts = ts;        // @p ts becomes the last timestamp we saw
+
+  // cap delta_t so we ramp back up quickly after a long gap between updates
+  delta_t = std::min(delta_t,
+                     utime_t(2 * (cct ? cct->_conf->mon_delta_reset_interval : 10), 0));
+
+  // calculate a delta, and average over the last 6 deltas by default.
+  /* start by taking a copy of @p current_pool_sum, and subtracting the
+   * stats in @p old_pool_sum. This generates a stats delta. Stash this
+   * stats delta in @p delta_avg_list, along with the timestamp delta
+   * for these results.
+   */
+  pool_stat_t d = current_pool_sum;
+  d.stats.sub(old_pool_sum.stats);
+
+  /* Aggregate the current delta, and drop the oldest recorded delta (if any)
+   * to average it out.
+   * Skip the delta entirely if the previous sum was never synchronized
+   * (i.e. is all zeros).
+   */
+  if (!old_pool_sum.stats.sum.is_zero()) {
+    delta_avg_list->push_back(make_pair(d,delta_t));
+    *result_ts_delta += delta_t;
+    result_pool_delta->stats.add(d.stats);
+  }
+  size_t s = cct ?
cct->_conf.get_val<uint64_t>("mon_stat_smooth_intervals") : 1; + while (delta_avg_list->size() > s) { + result_pool_delta->stats.sub(delta_avg_list->front().first.stats); + *result_ts_delta -= delta_avg_list->front().second; + delta_avg_list->pop_front(); + } +} + +/** + * Update a given pool's deltas + * + * @param cct Ceph Context + * @param ts Timestamp for the stats being delta'ed + * @param pool Pool's id + * @param old_pool_sum Previous stats sum + */ +void PGMap::update_one_pool_delta( + CephContext *cct, + const utime_t ts, + const int64_t pool, + const pool_stat_t& old_pool_sum) +{ + if (per_pool_sum_deltas.count(pool) == 0) { + ceph_assert(per_pool_sum_deltas_stamps.count(pool) == 0); + ceph_assert(per_pool_sum_delta.count(pool) == 0); + } + + auto& sum_delta = per_pool_sum_delta[pool]; + + update_delta(cct, ts, old_pool_sum, &sum_delta.second, pg_pool_sum[pool], + &sum_delta.first, &per_pool_sum_deltas_stamps[pool], + &per_pool_sum_deltas[pool]); +} + +/** + * Update pools' deltas + * + * @param cct CephContext + * @param ts Timestamp for the stats being delta'ed + * @param pg_pool_sum_old Map of pool stats for delta calcs. + */ +void PGMap::update_pool_deltas( + CephContext *cct, const utime_t ts, + const mempool::pgmap::unordered_map<int32_t,pool_stat_t>& pg_pool_sum_old) +{ + for (auto it = pg_pool_sum_old.begin(); + it != pg_pool_sum_old.end(); ++it) { + update_one_pool_delta(cct, ts, it->first, it->second); + } +} + +void PGMap::clear_delta() +{ + pg_sum_delta = pool_stat_t(); + pg_sum_deltas.clear(); + stamp_delta = utime_t(); +} + +void PGMap::generate_test_instances(list<PGMap*>& o) +{ + o.push_back(new PGMap); + list<Incremental*> inc; + Incremental::generate_test_instances(inc); + delete inc.front(); + inc.pop_front(); + while (!inc.empty()) { + PGMap *pmp = new PGMap(); + *pmp = *o.back(); + o.push_back(pmp); + o.back()->apply_incremental(NULL, *inc.front()); + delete inc.front(); + inc.pop_front(); + } +} + +void PGMap::get_filtered_pg_stats(uint64_t state, int64_t poolid, int64_t osdid, + bool primary, set<pg_t>& pgs) const +{ + for (auto i = pg_stat.begin(); + i != pg_stat.end(); + ++i) { + if ((poolid >= 0) && (poolid != i->first.pool())) + continue; + if ((osdid >= 0) && !(i->second.is_acting_osd(osdid,primary))) + continue; + if (state == (uint64_t)-1 || // "all" + (i->second.state & state) || // matches a state bit + (state == 0 && i->second.state == 0)) { // matches "unknown" (== 0) + pgs.insert(i->first); + } + } +} + +void PGMap::dump_filtered_pg_stats(ceph::Formatter *f, set<pg_t>& pgs) const +{ + f->open_array_section("pg_stats"); + for (auto i = pgs.begin(); i != pgs.end(); ++i) { + const pg_stat_t& st = pg_stat.at(*i); + f->open_object_section("pg_stat"); + f->dump_stream("pgid") << *i; + st.dump(f); + f->close_section(); + } + f->close_section(); +} + +void PGMap::dump_filtered_pg_stats(ostream& ss, set<pg_t>& pgs) const +{ + TextTable tab; + utime_t now = ceph_clock_now(); + + tab.define_column("PG", TextTable::LEFT, TextTable::LEFT); + tab.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("DEGRADED", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("MISPLACED", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("UNFOUND", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("BYTES", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("OMAP_BYTES*", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("OMAP_KEYS*", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("LOG", TextTable::LEFT, 
TextTable::RIGHT); + tab.define_column("STATE", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("SINCE", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("VERSION", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("REPORTED", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("UP", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("ACTING", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("SCRUB_STAMP", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("DEEP_SCRUB_STAMP", TextTable::LEFT, TextTable::RIGHT); + + for (auto i = pgs.begin(); i != pgs.end(); ++i) { + const pg_stat_t& st = pg_stat.at(*i); + + ostringstream reported; + reported << st.reported_epoch << ":" << st.reported_seq; + + ostringstream upstr, actingstr; + upstr << pg_vector_string(st.up) << 'p' << st.up_primary; + actingstr << pg_vector_string(st.acting) << 'p' << st.acting_primary; + tab << *i + << st.stats.sum.num_objects + << st.stats.sum.num_objects_degraded + << st.stats.sum.num_objects_misplaced + << st.stats.sum.num_objects_unfound + << st.stats.sum.num_bytes + << st.stats.sum.num_omap_bytes + << st.stats.sum.num_omap_keys + << st.log_size + << pg_state_string(st.state) + << utimespan_str(now - st.last_change) + << st.version + << reported.str() + << upstr.str() + << actingstr.str() + << st.last_scrub_stamp + << st.last_deep_scrub_stamp + << TextTable::endrow; + } + + ss << tab; +} + +void PGMap::dump_pool_stats_and_io_rate(int64_t poolid, const OSDMap &osd_map, + ceph::Formatter *f, + stringstream *rs) const { + string pool_name = osd_map.get_pool_name(poolid); + if (f) { + f->open_object_section("pool"); + f->dump_string("pool_name", pool_name.c_str()); + f->dump_int("pool_id", poolid); + f->open_object_section("recovery"); + } + list<string> sl; + stringstream tss; + pool_recovery_summary(f, &sl, poolid); + if (!f && !sl.empty()) { + for (auto &p : sl) + tss << " " << p << "\n"; + } + if (f) { + f->close_section(); // object section recovery + f->open_object_section("recovery_rate"); + } + ostringstream rss; + pool_recovery_rate_summary(f, &rss, poolid); + if (!f && !rss.str().empty()) + tss << " recovery io " << rss.str() << "\n"; + if (f) { + f->close_section(); // object section recovery_rate + f->open_object_section("client_io_rate"); + } + rss.clear(); + rss.str(""); + pool_client_io_rate_summary(f, &rss, poolid); + if (!f && !rss.str().empty()) + tss << " client io " << rss.str() << "\n"; + // dump cache tier IO rate for cache pool + const pg_pool_t *pool = osd_map.get_pg_pool(poolid); + if (pool->is_tier()) { + if (f) { + f->close_section(); // object section client_io_rate + f->open_object_section("cache_io_rate"); + } + rss.clear(); + rss.str(""); + pool_cache_io_rate_summary(f, &rss, poolid); + if (!f && !rss.str().empty()) + tss << " cache tier io " << rss.str() << "\n"; + } + if (f) { + f->close_section(); // object section cache_io_rate + f->close_section(); // object section pool + } else { + *rs << "pool " << pool_name << " id " << poolid << "\n"; + if (!tss.str().empty()) + *rs << tss.str() << "\n"; + else + *rs << " nothing is going on\n\n"; + } +} + +// Get crush parentage for an osd (skip root) +set<std::string> PGMap::osd_parentage(const OSDMap& osdmap, int id) const +{ + set<std::string> reporters_by_subtree; + auto reporter_subtree_level = g_conf().get_val<string>("mon_osd_reporter_subtree_level"); + + auto loc = osdmap.crush->get_full_location(id); + for (auto& [parent_bucket_type, parent_id] : loc) { + // Should we show the root? 
Might not be too informative like "default" + if (parent_bucket_type != "root" && + parent_bucket_type != reporter_subtree_level) { + reporters_by_subtree.insert(parent_id); + } + } + return reporters_by_subtree; +} + +void PGMap::get_health_checks( + CephContext *cct, + const OSDMap& osdmap, + health_check_map_t *checks) const +{ + utime_t now = ceph_clock_now(); + const auto max = cct->_conf.get_val<uint64_t>("mon_health_max_detail"); + const auto& pools = osdmap.get_pools(); + + typedef enum pg_consequence_t { + UNAVAILABLE = 1, // Client IO to the pool may block + DEGRADED = 2, // Fewer than the requested number of replicas are present + BACKFILL_FULL = 3, // Backfill is blocked for space considerations + // This may or may not be a deadlock condition. + DAMAGED = 4, // The data may be missing or inconsistent on disk and + // requires repair + RECOVERY_FULL = 5 // Recovery is blocked because OSDs are full + } pg_consequence_t; + + // For a given PG state, how should it be reported at the pool level? + class PgStateResponse { + public: + pg_consequence_t consequence; + typedef std::function< utime_t(const pg_stat_t&) > stuck_cb; + stuck_cb stuck_since; + bool invert; + + PgStateResponse(const pg_consequence_t& c, stuck_cb&& s) + : consequence(c), stuck_since(std::move(s)), invert(false) + { + } + + PgStateResponse(const pg_consequence_t& c, stuck_cb&& s, bool i) + : consequence(c), stuck_since(std::move(s)), invert(i) + { + } + }; + + // Record the PG state counts that contributed to a reported pool state + class PgCauses { + public: + // Map of PG_STATE_* to number of pgs in that state. + std::map<unsigned, unsigned> states; + + // List of all PG IDs that had a state contributing + // to this health condition. + std::set<pg_t> pgs; + + std::map<pg_t, std::string> pg_messages; + }; + + // Map of PG state to how to respond to it + std::map<unsigned, PgStateResponse> state_to_response = { + // Immediate reports + { PG_STATE_INCONSISTENT, {DAMAGED, {}} }, + { PG_STATE_INCOMPLETE, {UNAVAILABLE, {}} }, + { PG_STATE_SNAPTRIM_ERROR, {DAMAGED, {}} }, + { PG_STATE_RECOVERY_UNFOUND, {DAMAGED, {}} }, + { PG_STATE_BACKFILL_UNFOUND, {DAMAGED, {}} }, + { PG_STATE_BACKFILL_TOOFULL, {BACKFILL_FULL, {}} }, + { PG_STATE_RECOVERY_TOOFULL, {RECOVERY_FULL, {}} }, + { PG_STATE_DEGRADED, {DEGRADED, {}} }, + { PG_STATE_DOWN, {UNAVAILABLE, {}} }, + // Delayed (wait until stuck) reports + { PG_STATE_PEERING, {UNAVAILABLE, [](const pg_stat_t &p){return p.last_peered;} } }, + { PG_STATE_UNDERSIZED, {DEGRADED, [](const pg_stat_t &p){return p.last_fullsized;} } }, + { PG_STATE_STALE, {UNAVAILABLE, [](const pg_stat_t &p){return p.last_unstale;} } }, + // Delayed and inverted reports + { PG_STATE_ACTIVE, {UNAVAILABLE, [](const pg_stat_t &p){return p.last_active;}, true} } + }; + + // Specialized state printer that takes account of inversion of + // ACTIVE, CLEAN checks. + auto state_name = [](const uint64_t &state) { + // Special cases for the states that are inverted checks + if (state == PG_STATE_CLEAN) { + return std::string("unclean"); + } else if (state == PG_STATE_ACTIVE) { + return std::string("inactive"); + } else { + return pg_state_string(state); + } + }; + + // Map of what is wrong to information about why, implicitly also stores + // the list of what is wrong. 
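+  // Illustrative example (not from the source): after the scan below,
+  // detected[DEGRADED].states[PG_STATE_UNDERSIZED] == 3 would mean three
+  // undersized PGs contributed to the PG_DEGRADED health check.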
+  std::map<pg_consequence_t, PgCauses> detected;
+
+  // Optimisation: trim down the number of checks to apply based on
+  // the summary counters
+  std::map<unsigned, PgStateResponse> possible_responses;
+  for (const auto &i : num_pg_by_state) {
+    for (const auto &j : state_to_response) {
+      if (!j.second.invert) {
+        // Check for normal tests by seeing if any pgs have the flag
+        if (i.first & j.first) {
+          possible_responses.insert(j);
+        }
+      }
+    }
+  }
+
+  for (const auto &j : state_to_response) {
+    if (j.second.invert) {
+      // Check for inverted tests by seeing if not-all pgs have the flag
+      const auto &found = num_pg_by_state.find(j.first);
+      if (found == num_pg_by_state.end() || found->second != num_pg) {
+        possible_responses.insert(j);
+      }
+    }
+  }
+
+  utime_t cutoff = now - utime_t(cct->_conf.get_val<int64_t>("mon_pg_stuck_threshold"), 0);
+  // Loop over all PGs, if there are any possibly-unhealthy states in there
+  if (!possible_responses.empty()) {
+    for (const auto& i : pg_stat) {
+      const auto &pg_id = i.first;
+      const auto &pg_info = i.second;
+
+      for (const auto &j : state_to_response) {
+        const auto &pg_response_state = j.first;
+        const auto &pg_response = j.second;
+
+        // Apply the state test
+        if (!(bool(pg_info.state & pg_response_state) != pg_response.invert)) {
+          continue;
+        }
+
+        // Apply stuckness test if needed
+        if (pg_response.stuck_since) {
+          // Delayed response, check for stuckness
+          utime_t last_whatever = pg_response.stuck_since(pg_info);
+          if (last_whatever.is_zero() &&
+              pg_info.last_change >= cutoff) {
+            // still moving, ignore
+            continue;
+          } else if (last_whatever >= cutoff) {
+            // Not stuck enough, ignore.
+            continue;
+          }
+          // Otherwise: stuck long enough, fall through and record it.
+        }
+
+        auto &causes = detected[pg_response.consequence];
+        causes.states[pg_response_state]++;
+        causes.pgs.insert(pg_id);
+
+        // Don't bother composing a detail string if we have already recorded
+        // too many
+        if (causes.pg_messages.size() > max) {
+          continue;
+        }
+
+        std::ostringstream ss;
+        if (pg_response.stuck_since) {
+          utime_t since = pg_response.stuck_since(pg_info);
+          ss << "pg " << pg_id << " is stuck " << state_name(pg_response_state);
+          if (since == utime_t()) {
+            ss << " since forever";
+          } else {
+            utime_t dur = now - since;
+            ss << " for " << utimespan_str(dur);
+          }
+          ss << ", current state " << pg_state_string(pg_info.state)
+             << ", last acting " << pg_info.acting;
+        } else {
+          ss << "pg " << pg_id << " is "
+             << pg_state_string(pg_info.state);
+          ss << ", acting " << pg_info.acting;
+          if (pg_info.stats.sum.num_objects_unfound) {
+            ss << ", " << pg_info.stats.sum.num_objects_unfound
+               << " unfound";
+          }
+        }
+
+        if (pg_info.state & PG_STATE_INCOMPLETE) {
+          const pg_pool_t *pi = osdmap.get_pg_pool(pg_id.pool());
+          if (pi && pi->min_size > 1) {
+            ss << " (reducing pool "
+               << osdmap.get_pool_name(pg_id.pool())
+               << " min_size from " << (int)pi->min_size
+               << " may help; search ceph.com/docs for 'incomplete')";
+          }
+        }
+
+        causes.pg_messages[pg_id] = ss.str();
+      }
+    }
+  } else {
+    dout(10) << __func__ << " skipping loop over PGs: counters look OK" << dendl;
+  }
+
+  for (const auto &i : detected) {
+    std::string health_code;
+    health_status_t sev;
+    std::string summary;
+    switch (i.first) {
+    case UNAVAILABLE:
+      health_code = "PG_AVAILABILITY";
+      sev = HEALTH_WARN;
+      summary = "Reduced data availability: ";
+      break;
+    case DEGRADED:
+      health_code = "PG_DEGRADED";
+      summary = "Degraded data redundancy: ";
+      sev = HEALTH_WARN;
+      break;
+    case BACKFILL_FULL:
+      health_code = "PG_BACKFILL_FULL";
+      summary = "Low space hindering
backfill (add storage if this doesn't resolve itself): "; + sev = HEALTH_WARN; + break; + case DAMAGED: + health_code = "PG_DAMAGED"; + summary = "Possible data damage: "; + sev = HEALTH_ERR; + break; + case RECOVERY_FULL: + health_code = "PG_RECOVERY_FULL"; + summary = "Full OSDs blocking recovery: "; + sev = HEALTH_ERR; + break; + default: + ceph_abort(); + } + + if (i.first == DEGRADED) { + if (pg_sum.stats.sum.num_objects_degraded && + pg_sum.stats.sum.num_object_copies > 0) { + double pc = (double)pg_sum.stats.sum.num_objects_degraded / + (double)pg_sum.stats.sum.num_object_copies * (double)100.0; + char b[20]; + snprintf(b, sizeof(b), "%.3lf", pc); + ostringstream ss; + ss << pg_sum.stats.sum.num_objects_degraded + << "/" << pg_sum.stats.sum.num_object_copies << " objects degraded (" + << b << "%)"; + + // Throw in a comma for the benefit of the following PG counts + summary += ss.str() + ", "; + } + } + + // Compose summary message saying how many PGs in what states led + // to this health check failing + std::vector<std::string> pg_msgs; + int64_t count = 0; + for (const auto &j : i.second.states) { + std::ostringstream msg; + msg << j.second << (j.second > 1 ? " pgs " : " pg ") << state_name(j.first); + pg_msgs.push_back(msg.str()); + count += j.second; + } + summary += joinify(pg_msgs.begin(), pg_msgs.end(), std::string(", ")); + + health_check_t *check = &checks->add( + health_code, + sev, + summary, + count); + + // Compose list of PGs contributing to this health check failing + for (const auto &j : i.second.pg_messages) { + check->detail.push_back(j.second); + } + } + + // OSD_SCRUB_ERRORS + if (pg_sum.stats.sum.num_scrub_errors) { + ostringstream ss; + ss << pg_sum.stats.sum.num_scrub_errors << " scrub errors"; + checks->add("OSD_SCRUB_ERRORS", HEALTH_ERR, ss.str(), + pg_sum.stats.sum.num_scrub_errors); + } + + // LARGE_OMAP_OBJECTS + if (pg_sum.stats.sum.num_large_omap_objects) { + list<string> detail; + for (auto &pool : pools) { + const string& pool_name = osdmap.get_pool_name(pool.first); + auto it2 = pg_pool_sum.find(pool.first); + if (it2 == pg_pool_sum.end()) { + continue; + } + const pool_stat_t *pstat = &it2->second; + if (pstat == nullptr) { + continue; + } + const object_stat_sum_t& sum = pstat->stats.sum; + if (sum.num_large_omap_objects) { + stringstream ss; + ss << sum.num_large_omap_objects << " large objects found in pool " + << "'" << pool_name << "'"; + detail.push_back(ss.str()); + } + } + if (!detail.empty()) { + ostringstream ss; + ss << pg_sum.stats.sum.num_large_omap_objects << " large omap objects"; + auto& d = checks->add("LARGE_OMAP_OBJECTS", HEALTH_WARN, ss.str(), + pg_sum.stats.sum.num_large_omap_objects); + stringstream tip; + tip << "Search the cluster log for 'Large omap object found' for more " + << "details."; + detail.push_back(tip.str()); + d.detail.swap(detail); + } + } + + // CACHE_POOL_NEAR_FULL + { + list<string> detail; + unsigned num_pools = 0; + for (auto& p : pools) { + if ((!p.second.target_max_objects && !p.second.target_max_bytes) || + !pg_pool_sum.count(p.first)) { + continue; + } + bool nearfull = false; + const string& name = osdmap.get_pool_name(p.first); + const pool_stat_t& st = get_pg_pool_sum_stat(p.first); + uint64_t ratio = p.second.cache_target_full_ratio_micro + + ((1000000 - p.second.cache_target_full_ratio_micro) * + cct->_conf->mon_cache_target_full_warn_ratio); + if (p.second.target_max_objects && + (uint64_t)(st.stats.sum.num_objects - + st.stats.sum.num_objects_hit_set_archive) > + p.second.target_max_objects * 
(ratio / 1000000.0)) { + ostringstream ss; + ss << "cache pool '" << name << "' with " + << si_u_t(st.stats.sum.num_objects) + << " objects at/near target max " + << si_u_t(p.second.target_max_objects) << " objects"; + detail.push_back(ss.str()); + nearfull = true; + } + if (p.second.target_max_bytes && + (uint64_t)(st.stats.sum.num_bytes - + st.stats.sum.num_bytes_hit_set_archive) > + p.second.target_max_bytes * (ratio / 1000000.0)) { + ostringstream ss; + ss << "cache pool '" << name + << "' with " << byte_u_t(st.stats.sum.num_bytes) + << " at/near target max " + << byte_u_t(p.second.target_max_bytes); + detail.push_back(ss.str()); + nearfull = true; + } + if (nearfull) { + ++num_pools; + } + } + if (!detail.empty()) { + ostringstream ss; + ss << num_pools << " cache pools at or near target size"; + auto& d = checks->add("CACHE_POOL_NEAR_FULL", HEALTH_WARN, ss.str(), + num_pools); + d.detail.swap(detail); + } + } + + // TOO_FEW_PGS + unsigned num_in = osdmap.get_num_in_osds(); + auto sum_pg_up = std::max(static_cast<size_t>(pg_sum.up), pg_stat.size()); + const auto min_pg_per_osd = + cct->_conf.get_val<uint64_t>("mon_pg_warn_min_per_osd"); + if (num_in && min_pg_per_osd > 0 && osdmap.get_pools().size() > 0) { + auto per = sum_pg_up / num_in; + if (per < min_pg_per_osd && per) { + ostringstream ss; + ss << "too few PGs per OSD (" << per + << " < min " << min_pg_per_osd << ")"; + checks->add("TOO_FEW_PGS", HEALTH_WARN, ss.str(), + min_pg_per_osd - per); + } + } + + // TOO_MANY_PGS + auto max_pg_per_osd = cct->_conf.get_val<uint64_t>("mon_max_pg_per_osd"); + if (num_in && max_pg_per_osd > 0) { + auto per = sum_pg_up / num_in; + if (per > max_pg_per_osd) { + ostringstream ss; + ss << "too many PGs per OSD (" << per + << " > max " << max_pg_per_osd << ")"; + checks->add("TOO_MANY_PGS", HEALTH_WARN, ss.str(), + per - max_pg_per_osd); + } + } + + // TOO_FEW_OSDS + auto warn_too_few_osds = cct->_conf.get_val<bool>("mon_warn_on_too_few_osds"); + auto osd_pool_default_size = cct->_conf.get_val<uint64_t>("osd_pool_default_size"); + if (warn_too_few_osds && osdmap.get_num_osds() < osd_pool_default_size) { + ostringstream ss; + ss << "OSD count " << osdmap.get_num_osds() + << " < osd_pool_default_size " << osd_pool_default_size; + checks->add("TOO_FEW_OSDS", HEALTH_WARN, ss.str(), + osd_pool_default_size - osdmap.get_num_osds()); + } + + // SLOW_PING_TIME + // Convert milliseconds to microseconds + auto warn_slow_ping_time = cct->_conf.get_val<double>("mon_warn_on_slow_ping_time") * 1000; + auto grace = cct->_conf.get_val<int64_t>("osd_heartbeat_grace"); + if (warn_slow_ping_time == 0) { + double ratio = cct->_conf.get_val<double>("mon_warn_on_slow_ping_ratio"); + warn_slow_ping_time = grace; + warn_slow_ping_time *= 1000000 * ratio; // Seconds of grace to microseconds at ratio + } + if (warn_slow_ping_time > 0) { + + struct mon_ping_item_t { + uint32_t pingtime; + int from; + int to; + bool improving; + + bool operator<(const mon_ping_item_t& rhs) const { + if (pingtime < rhs.pingtime) + return true; + if (pingtime > rhs.pingtime) + return false; + if (from < rhs.from) + return true; + if (from > rhs.from) + return false; + return to < rhs.to; + } + }; + + list<string> detail_back; + list<string> detail_front; + list<string> detail; + set<mon_ping_item_t> back_sorted, front_sorted; + for (auto i : osd_stat) { + for (auto j : i.second.hb_pingtime) { + + // Maybe source info is old + if (now.sec() - j.second.last_update > grace * 60) + continue; + + mon_ping_item_t back; + back.pingtime = 
std::max(j.second.back_pingtime[0], j.second.back_pingtime[1]);
+        back.pingtime = std::max(back.pingtime, j.second.back_pingtime[2]);
+        back.from = i.first;
+        back.to = j.first;
+        if (back.pingtime > warn_slow_ping_time) {
+          // "improving" when the shorter-interval averages sit below the
+          // longer-interval ones
+          back.improving = (j.second.back_pingtime[0] < j.second.back_pingtime[1]
+                            && j.second.back_pingtime[1] < j.second.back_pingtime[2]);
+          back_sorted.emplace(back);
+        }
+
+        mon_ping_item_t front;
+        front.pingtime = std::max(j.second.front_pingtime[0], j.second.front_pingtime[1]);
+        front.pingtime = std::max(front.pingtime, j.second.front_pingtime[2]);
+        front.from = i.first;
+        front.to = j.first;
+        if (front.pingtime > warn_slow_ping_time) {
+          front.improving = (j.second.front_pingtime[0] < j.second.front_pingtime[1]
+                             && j.second.front_pingtime[1] < j.second.front_pingtime[2]);
+          front_sorted.emplace(front);
+        }
+      }
+      if (i.second.num_shards_repaired >
+          cct->_conf.get_val<uint64_t>("mon_osd_warn_num_repaired")) {
+        ostringstream ss;
+        ss << "osd." << i.first << " had " << i.second.num_shards_repaired
+           << " reads repaired";
+        detail.push_back(ss.str());
+      }
+    }
+    if (!detail.empty()) {
+      ostringstream ss;
+      ss << "Too many repaired reads on " << detail.size() << " OSDs";
+      auto& d = checks->add("OSD_TOO_MANY_REPAIRS", HEALTH_WARN, ss.str(),
+                            detail.size());
+      d.detail.swap(detail);
+    }
+    int max_detail = 10;
+    for (auto &sback : boost::adaptors::reverse(back_sorted)) {
+      ostringstream ss;
+      if (max_detail == 0) {
+        ss << "Truncated long network list. Use ceph daemon mgr.# dump_osd_network for more information";
+        detail_back.push_back(ss.str());
+        break;
+      }
+      max_detail--;
+      ss << "Slow OSD heartbeats on back from osd." << sback.from
+         << " [" << osd_parentage(osdmap, sback.from) << "]"
+         << (osdmap.is_down(sback.from) ? " (down)" : "")
+         << " to osd." << sback.to
+         << " [" << osd_parentage(osdmap, sback.to) << "]"
+         << (osdmap.is_down(sback.to) ? " (down)" : "")
+         << " " << fixed_u_to_string(sback.pingtime, 3) << " msec"
+         << (sback.improving ? " possibly improving" : "");
+      detail_back.push_back(ss.str());
+    }
+    max_detail = 10;
+    for (auto &sfront : boost::adaptors::reverse(front_sorted)) {
+      ostringstream ss;
+      if (max_detail == 0) {
+        ss << "Truncated long network list. Use ceph daemon mgr.# dump_osd_network for more information";
+        detail_front.push_back(ss.str());
+        break;
+      }
+      max_detail--;
+      // Get crush parentage for each osd
+      ss << "Slow OSD heartbeats on front from osd." << sfront.from
+         << " [" << osd_parentage(osdmap, sfront.from) << "]"
+         << (osdmap.is_down(sfront.from) ? " (down)" : "")
+         << " to osd." << sfront.to
+         << " [" << osd_parentage(osdmap, sfront.to) << "]"
+         << (osdmap.is_down(sfront.to) ? " (down)" : "")
+         << " " << fixed_u_to_string(sfront.pingtime, 3) << " msec"
+         << (sfront.improving ?
" possibly improving" : ""); + detail_front.push_back(ss.str()); + } + if (detail_back.size() != 0) { + ostringstream ss; + ss << "Slow OSD heartbeats on back (longest " + << fixed_u_to_string(back_sorted.rbegin()->pingtime, 3) << "ms)"; + auto& d = checks->add("OSD_SLOW_PING_TIME_BACK", HEALTH_WARN, ss.str(), + back_sorted.size()); + d.detail.swap(detail_back); + } + if (detail_front.size() != 0) { + ostringstream ss; + ss << "Slow OSD heartbeats on front (longest " + << fixed_u_to_string(front_sorted.rbegin()->pingtime, 3) << "ms)"; + auto& d = checks->add("OSD_SLOW_PING_TIME_FRONT", HEALTH_WARN, ss.str(), + front_sorted.size()); + d.detail.swap(detail_front); + } + } + + // SMALLER_PGP_NUM + // MANY_OBJECTS_PER_PG + if (!pg_stat.empty()) { + list<string> pgp_detail, many_detail; + const auto mon_pg_warn_min_objects = + cct->_conf.get_val<int64_t>("mon_pg_warn_min_objects"); + const auto mon_pg_warn_min_pool_objects = + cct->_conf.get_val<int64_t>("mon_pg_warn_min_pool_objects"); + const auto mon_pg_warn_max_object_skew = + cct->_conf.get_val<double>("mon_pg_warn_max_object_skew"); + for (auto p = pg_pool_sum.begin(); + p != pg_pool_sum.end(); + ++p) { + const pg_pool_t *pi = osdmap.get_pg_pool(p->first); + if (!pi) + continue; // in case osdmap changes haven't propagated to PGMap yet + const string& name = osdmap.get_pool_name(p->first); + // NOTE: we use pg_num_target and pgp_num_target for the purposes of + // the warnings. If the cluster is failing to converge on the target + // values that is a separate issue! + if (pi->get_pg_num_target() > pi->get_pgp_num_target() && + !(name.find(".DELETED") != string::npos && + cct->_conf->mon_fake_pool_delete)) { + ostringstream ss; + ss << "pool " << name << " pg_num " + << pi->get_pg_num_target() + << " > pgp_num " << pi->get_pgp_num_target(); + pgp_detail.push_back(ss.str()); + } + int average_objects_per_pg = pg_sum.stats.sum.num_objects / pg_stat.size(); + if (average_objects_per_pg > 0 && + pg_sum.stats.sum.num_objects >= mon_pg_warn_min_objects && + p->second.stats.sum.num_objects >= mon_pg_warn_min_pool_objects) { + int objects_per_pg = p->second.stats.sum.num_objects / + pi->get_pg_num_target(); + float ratio = (float)objects_per_pg / (float)average_objects_per_pg; + if (mon_pg_warn_max_object_skew > 0 && + ratio > mon_pg_warn_max_object_skew) { + ostringstream ss; + if (pi->pg_autoscale_mode != pg_pool_t::pg_autoscale_mode_t::ON) { + ss << "pool " << name << " objects per pg (" + << objects_per_pg << ") is more than " << ratio + << " times cluster average (" + << average_objects_per_pg << ")"; + many_detail.push_back(ss.str()); + } + } + } + } + if (!pgp_detail.empty()) { + ostringstream ss; + ss << pgp_detail.size() << " pools have pg_num > pgp_num"; + auto& d = checks->add("SMALLER_PGP_NUM", HEALTH_WARN, ss.str(), + pgp_detail.size()); + d.detail.swap(pgp_detail); + } + if (!many_detail.empty()) { + ostringstream ss; + ss << many_detail.size() << " pools have many more objects per pg than" + << " average"; + auto& d = checks->add("MANY_OBJECTS_PER_PG", HEALTH_WARN, ss.str(), + many_detail.size()); + d.detail.swap(many_detail); + } + } + + // POOL_FULL + // POOL_NEAR_FULL + { + float warn_threshold = (float)g_conf().get_val<int64_t>("mon_pool_quota_warn_threshold")/100; + float crit_threshold = (float)g_conf().get_val<int64_t>("mon_pool_quota_crit_threshold")/100; + list<string> full_detail, nearfull_detail; + unsigned full_pools = 0, nearfull_pools = 0; + for (auto it : pools) { + auto it2 = pg_pool_sum.find(it.first); + if (it2 == 
pg_pool_sum.end()) { + continue; + } + const pool_stat_t *pstat = &it2->second; + const object_stat_sum_t& sum = pstat->stats.sum; + const string& pool_name = osdmap.get_pool_name(it.first); + const pg_pool_t &pool = it.second; + bool full = false, nearfull = false; + if (pool.quota_max_objects > 0) { + stringstream ss; + if ((uint64_t)sum.num_objects >= pool.quota_max_objects) { + } else if (crit_threshold > 0 && + sum.num_objects >= pool.quota_max_objects*crit_threshold) { + ss << "pool '" << pool_name + << "' has " << sum.num_objects << " objects" + << " (max " << pool.quota_max_objects << ")"; + full_detail.push_back(ss.str()); + full = true; + } else if (warn_threshold > 0 && + sum.num_objects >= pool.quota_max_objects*warn_threshold) { + ss << "pool '" << pool_name + << "' has " << sum.num_objects << " objects" + << " (max " << pool.quota_max_objects << ")"; + nearfull_detail.push_back(ss.str()); + nearfull = true; + } + } + if (pool.quota_max_bytes > 0) { + stringstream ss; + if ((uint64_t)sum.num_bytes >= pool.quota_max_bytes) { + } else if (crit_threshold > 0 && + sum.num_bytes >= pool.quota_max_bytes*crit_threshold) { + ss << "pool '" << pool_name + << "' has " << byte_u_t(sum.num_bytes) + << " (max " << byte_u_t(pool.quota_max_bytes) << ")"; + full_detail.push_back(ss.str()); + full = true; + } else if (warn_threshold > 0 && + sum.num_bytes >= pool.quota_max_bytes*warn_threshold) { + ss << "pool '" << pool_name + << "' has " << byte_u_t(sum.num_bytes) + << " (max " << byte_u_t(pool.quota_max_bytes) << ")"; + nearfull_detail.push_back(ss.str()); + nearfull = true; + } + } + if (full) { + ++full_pools; + } + if (nearfull) { + ++nearfull_pools; + } + } + if (full_pools) { + ostringstream ss; + ss << full_pools << " pools full"; + auto& d = checks->add("POOL_FULL", HEALTH_ERR, ss.str(), full_pools); + d.detail.swap(full_detail); + } + if (nearfull_pools) { + ostringstream ss; + ss << nearfull_pools << " pools nearfull"; + auto& d = checks->add("POOL_NEAR_FULL", HEALTH_WARN, ss.str(), nearfull_pools); + d.detail.swap(nearfull_detail); + } + } + + // OBJECT_MISPLACED + if (pg_sum.stats.sum.num_objects_misplaced && + pg_sum.stats.sum.num_object_copies > 0 && + cct->_conf->mon_warn_on_misplaced) { + double pc = (double)pg_sum.stats.sum.num_objects_misplaced / + (double)pg_sum.stats.sum.num_object_copies * (double)100.0; + char b[20]; + snprintf(b, sizeof(b), "%.3lf", pc); + ostringstream ss; + ss << pg_sum.stats.sum.num_objects_misplaced + << "/" << pg_sum.stats.sum.num_object_copies << " objects misplaced (" + << b << "%)"; + checks->add("OBJECT_MISPLACED", HEALTH_WARN, ss.str(), + pg_sum.stats.sum.num_objects_misplaced); + } + + // OBJECT_UNFOUND + if (pg_sum.stats.sum.num_objects_unfound && + pg_sum.stats.sum.num_objects) { + double pc = (double)pg_sum.stats.sum.num_objects_unfound / + (double)pg_sum.stats.sum.num_objects * (double)100.0; + char b[20]; + snprintf(b, sizeof(b), "%.3lf", pc); + ostringstream ss; + ss << pg_sum.stats.sum.num_objects_unfound + << "/" << pg_sum.stats.sum.num_objects << " objects unfound (" << b << "%)"; + auto& d = checks->add("OBJECT_UNFOUND", HEALTH_WARN, ss.str(), + pg_sum.stats.sum.num_objects_unfound); + + for (auto& p : pg_stat) { + if (p.second.stats.sum.num_objects_unfound) { + ostringstream ss; + ss << "pg " << p.first + << " has " << p.second.stats.sum.num_objects_unfound + << " unfound objects"; + d.detail.push_back(ss.str()); + if (d.detail.size() > max) { + d.detail.push_back("(additional pgs left out for brevity)"); + break; + } + } + } + } 
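+
+  // Note for the op-age checks below: op_queue_age_hist buckets op ages in
+  // milliseconds by powers of two, so bucket i has an upper bound of
+  // (1 << i) ms -- hence the (1 << i) / 1000.0 conversions to seconds.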
+ + // REQUEST_SLOW + // REQUEST_STUCK + // SLOW_OPS unifies them in mimic. + if (osdmap.require_osd_release < ceph_release_t::mimic && + cct->_conf->mon_osd_warn_op_age > 0 && + !osd_sum.op_queue_age_hist.h.empty() && + osd_sum.op_queue_age_hist.upper_bound() / 1000.0 > + cct->_conf->mon_osd_warn_op_age) { + list<string> warn_detail, error_detail; + unsigned warn = 0, error = 0; + float err_age = + cct->_conf->mon_osd_warn_op_age * cct->_conf->mon_osd_err_op_age_ratio; + const pow2_hist_t& h = osd_sum.op_queue_age_hist; + for (unsigned i = h.h.size() - 1; i > 0; --i) { + float ub = (float)(1 << i) / 1000.0; + if (ub < cct->_conf->mon_osd_warn_op_age) + break; + if (h.h[i]) { + ostringstream ss; + ss << h.h[i] << " ops are blocked > " << ub << " sec"; + if (ub > err_age) { + error += h.h[i]; + error_detail.push_back(ss.str()); + } else { + warn += h.h[i]; + warn_detail.push_back(ss.str()); + } + } + } + + map<float,set<int>> warn_osd_by_max; // max -> osds + map<float,set<int>> error_osd_by_max; // max -> osds + if (!warn_detail.empty() || !error_detail.empty()) { + for (auto& p : osd_stat) { + const pow2_hist_t& h = p.second.op_queue_age_hist; + for (unsigned i = h.h.size() - 1; i > 0; --i) { + float ub = (float)(1 << i) / 1000.0; + if (ub < cct->_conf->mon_osd_warn_op_age) + break; + if (h.h[i]) { + if (ub > err_age) { + error_osd_by_max[ub].insert(p.first); + } else { + warn_osd_by_max[ub].insert(p.first); + } + break; + } + } + } + } + + if (!warn_detail.empty()) { + ostringstream ss; + ss << warn << " slow requests are blocked > " + << cct->_conf->mon_osd_warn_op_age << " sec"; + auto& d = checks->add("REQUEST_SLOW", HEALTH_WARN, ss.str(), warn); + d.detail.swap(warn_detail); + int left = max; + for (auto& p : warn_osd_by_max) { + ostringstream ss; + if (p.second.size() > 1) { + ss << "osds " << p.second + << " have blocked requests > " << p.first << " sec"; + } else { + ss << "osd." << *p.second.begin() + << " has blocked requests > " << p.first << " sec"; + } + d.detail.push_back(ss.str()); + if (--left == 0) { + break; + } + } + } + if (!error_detail.empty()) { + ostringstream ss; + ss << error << " stuck requests are blocked > " + << err_age << " sec"; + auto& d = checks->add("REQUEST_STUCK", HEALTH_ERR, ss.str(), error); + d.detail.swap(error_detail); + int left = max; + for (auto& p : error_osd_by_max) { + ostringstream ss; + if (p.second.size() > 1) { + ss << "osds " << p.second + << " have stuck requests > " << p.first << " sec"; + } else { + ss << "osd." 
<< *p.second.begin() + << " has stuck requests > " << p.first << " sec"; + } + d.detail.push_back(ss.str()); + if (--left == 0) { + break; + } + } + } + } + + // OBJECT_STORE_WARN + if (osd_sum.os_alerts.size()) { + map<string, pair<size_t, list<string>>> os_alerts_sum; + + for (auto& a : osd_sum.os_alerts) { + int left = max; + string s0 = " osd."; + s0 += stringify(a.first); + for (auto& aa : a.second) { + string s(s0); + s += " "; + s += aa.second; + auto it = os_alerts_sum.find(aa.first); + if (it == os_alerts_sum.end()) { + list<string> d; + d.emplace_back(s); + os_alerts_sum.emplace(aa.first, std::make_pair(1, d)); + } else { + auto& p = it->second; + ++p.first; + p.second.emplace_back(s); + } + if (--left == 0) { + break; + } + } + } + + for (auto& asum : os_alerts_sum) { + string summary = stringify(asum.second.first) + " OSD(s)"; + if (asum.first == "BLUEFS_SPILLOVER") { + summary += " experiencing BlueFS spillover"; + } else if (asum.first == "BLUESTORE_NO_COMPRESSION") { + summary += " have broken BlueStore compression"; + } else if (asum.first == "BLUESTORE_LEGACY_STATFS") { + summary += " reporting legacy (not per-pool) BlueStore stats"; + } else if (asum.first == "BLUESTORE_DISK_SIZE_MISMATCH") { + summary += " have dangerous mismatch between BlueStore block device and free list sizes"; + } else if (asum.first == "BLUESTORE_NO_PER_PG_OMAP") { + summary += " reporting legacy (not per-pg) BlueStore omap"; + } else if (asum.first == "BLUESTORE_NO_PER_POOL_OMAP") { + summary += " reporting legacy (not per-pool) BlueStore omap usage stats"; + } else if (asum.first == "BLUESTORE_SPURIOUS_READ_ERRORS") { + summary += " have spurious read errors"; + } + + auto& d = checks->add(asum.first, HEALTH_WARN, summary, asum.second.first); + for (auto& s : asum.second.second) { + d.detail.push_back(s); + } + } + } + // PG_NOT_SCRUBBED + // PG_NOT_DEEP_SCRUBBED + if (cct->_conf->mon_warn_pg_not_scrubbed_ratio || + cct->_conf->mon_warn_pg_not_deep_scrubbed_ratio) { + list<string> detail, deep_detail; + int detail_max = max, deep_detail_max = max; + int detail_more = 0, deep_detail_more = 0; + int detail_total = 0, deep_detail_total = 0; + for (auto& p : pg_stat) { + int64_t pnum = p.first.pool(); + auto pool = osdmap.get_pg_pool(pnum); + if (!pool) + continue; + if (cct->_conf->mon_warn_pg_not_scrubbed_ratio) { + double scrub_max_interval = 0; + pool->opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &scrub_max_interval); + if (scrub_max_interval <= 0) { + scrub_max_interval = cct->_conf->osd_scrub_max_interval; + } + const double age = (cct->_conf->mon_warn_pg_not_scrubbed_ratio * scrub_max_interval) + + scrub_max_interval; + utime_t cutoff = now; + cutoff -= age; + if (p.second.last_scrub_stamp < cutoff) { + if (detail_max > 0) { + ostringstream ss; + ss << "pg " << p.first << " not scrubbed since " + << p.second.last_scrub_stamp; + detail.push_back(ss.str()); + --detail_max; + } else { + ++detail_more; + } + ++detail_total; + } + } + if (cct->_conf->mon_warn_pg_not_deep_scrubbed_ratio) { + double deep_scrub_interval = 0; + pool->opts.get(pool_opts_t::DEEP_SCRUB_INTERVAL, &deep_scrub_interval); + if (deep_scrub_interval <= 0) { + deep_scrub_interval = cct->_conf->osd_deep_scrub_interval; + } + double deep_age = (cct->_conf->mon_warn_pg_not_deep_scrubbed_ratio * deep_scrub_interval) + + deep_scrub_interval; + utime_t deep_cutoff = now; + deep_cutoff -= deep_age; + if (p.second.last_deep_scrub_stamp < deep_cutoff) { + if (deep_detail_max > 0) { + ostringstream ss; + ss << "pg " << p.first << " not 
deep-scrubbed since " + << p.second.last_deep_scrub_stamp; + deep_detail.push_back(ss.str()); + --deep_detail_max; + } else { + ++deep_detail_more; + } + ++deep_detail_total; + } + } + } + if (detail_total) { + ostringstream ss; + ss << detail_total << " pgs not scrubbed in time"; + auto& d = checks->add("PG_NOT_SCRUBBED", HEALTH_WARN, ss.str(), detail_total); + + if (!detail.empty()) { + d.detail.swap(detail); + + if (detail_more) { + ostringstream ss; + ss << detail_more << " more pgs... "; + d.detail.push_back(ss.str()); + } + } + } + if (deep_detail_total) { + ostringstream ss; + ss << deep_detail_total << " pgs not deep-scrubbed in time"; + auto& d = checks->add("PG_NOT_DEEP_SCRUBBED", HEALTH_WARN, ss.str(), + deep_detail_total); + + if (!deep_detail.empty()) { + d.detail.swap(deep_detail); + + if (deep_detail_more) { + ostringstream ss; + ss << deep_detail_more << " more pgs... "; + d.detail.push_back(ss.str()); + } + } + } + } + + // POOL_APP + if (g_conf().get_val<bool>("mon_warn_on_pool_no_app")) { + list<string> detail; + for (auto &it : pools) { + const pg_pool_t &pool = it.second; + const string& pool_name = osdmap.get_pool_name(it.first); + auto it2 = pg_pool_sum.find(it.first); + if (it2 == pg_pool_sum.end()) { + continue; + } + const pool_stat_t *pstat = &it2->second; + if (pstat == nullptr) { + continue; + } + const object_stat_sum_t& sum = pstat->stats.sum; + // application metadata is not encoded until luminous is minimum + // required release + if (sum.num_objects > 0 && pool.application_metadata.empty() && + !pool.is_tier()) { + stringstream ss; + ss << "application not enabled on pool '" << pool_name << "'"; + detail.push_back(ss.str()); + } + } + if (!detail.empty()) { + ostringstream ss; + ss << detail.size() << " pool(s) do not have an application enabled"; + auto& d = checks->add("POOL_APP_NOT_ENABLED", HEALTH_WARN, ss.str(), + detail.size()); + stringstream tip; + tip << "use 'ceph osd pool application enable <pool-name> " + << "<app-name>', where <app-name> is 'cephfs', 'rbd', 'rgw', " + << "or freeform for custom applications."; + detail.push_back(tip.str()); + d.detail.swap(detail); + } + } + + // PG_SLOW_SNAP_TRIMMING + if (!pg_stat.empty() && cct->_conf->mon_osd_snap_trim_queue_warn_on > 0) { + uint32_t snapthreshold = cct->_conf->mon_osd_snap_trim_queue_warn_on; + uint64_t snaptrimq_exceeded = 0; + uint32_t longest_queue = 0; + const pg_t* longest_q_pg = nullptr; + list<string> detail; + + for (auto& i: pg_stat) { + uint32_t current_len = i.second.snaptrimq_len; + if (current_len >= snapthreshold) { + snaptrimq_exceeded++; + if (longest_queue <= current_len) { + longest_q_pg = &i.first; + longest_queue = current_len; + } + if (detail.size() < max - 1) { + stringstream ss; + ss << "snap trim queue for pg " << i.first << " at " << current_len; + detail.push_back(ss.str()); + continue; + } + if (detail.size() < max) { + detail.push_back("...more pgs affected"); + continue; + } + } + } + + if (snaptrimq_exceeded) { + { + ostringstream ss; + ss << "longest queue on pg " << *longest_q_pg << " at " << longest_queue; + detail.push_back(ss.str()); + } + + stringstream ss; + ss << "snap trim queue for " << snaptrimq_exceeded << " pg(s) >= " << snapthreshold << " (mon_osd_snap_trim_queue_warn_on)"; + auto& d = checks->add("PG_SLOW_SNAP_TRIMMING", HEALTH_WARN, ss.str(), + snaptrimq_exceeded); + detail.push_back("try decreasing \"osd snap trim sleep\" and/or increasing \"osd pg max concurrent snap trims\"."); + d.detail.swap(detail); + } + } +} + +void 
PGMap::print_summary(ceph::Formatter *f, ostream *out) const +{ + if (f) { + f->open_array_section("pgs_by_pool_state"); + for (auto& i: num_pg_by_pool_state) { + f->open_object_section("per_pool_pgs_by_state"); + f->dump_int("pool_id", i.first); + f->open_array_section("pg_state_counts"); + for (auto& j : i.second) { + f->open_object_section("pg_state_count"); + f->dump_string("state_name", pg_state_string(j.first)); + f->dump_int("count", j.second); + f->close_section(); + } + f->close_section(); + f->close_section(); + } + f->close_section(); + } + PGMapDigest::print_summary(f, out); +} + +int process_pg_map_command( + const string& orig_prefix, + const cmdmap_t& orig_cmdmap, + const PGMap& pg_map, + const OSDMap& osdmap, + ceph::Formatter *f, + stringstream *ss, + bufferlist *odata) +{ + string prefix = orig_prefix; + auto cmdmap = orig_cmdmap; + + string omap_stats_note = + "\n* NOTE: Omap statistics are gathered during deep scrub and " + "may be inaccurate soon afterwards depending on utilization. See " + "http://docs.ceph.com/en/latest/dev/placement-group/#omap-statistics " + "for further details.\n"; + bool omap_stats_note_required = false; + + // perhaps these would be better in the parsing, but it's weird + bool primary = false; + if (prefix == "pg dump_json") { + vector<string> v; + v.push_back(string("all")); + cmd_putval(g_ceph_context, cmdmap, "dumpcontents", v); + prefix = "pg dump"; + } else if (prefix == "pg dump_pools_json") { + vector<string> v; + v.push_back(string("pools")); + cmd_putval(g_ceph_context, cmdmap, "dumpcontents", v); + prefix = "pg dump"; + } else if (prefix == "pg ls-by-primary") { + primary = true; + prefix = "pg ls"; + } else if (prefix == "pg ls-by-osd") { + prefix = "pg ls"; + } else if (prefix == "pg ls-by-pool") { + prefix = "pg ls"; + string poolstr; + cmd_getval(cmdmap, "poolstr", poolstr); + int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str()); + if (pool < 0) { + *ss << "pool " << poolstr << " does not exist"; + return -ENOENT; + } + cmd_putval(g_ceph_context, cmdmap, "pool", pool); + } + + stringstream ds; + if (prefix == "pg stat") { + if (f) { + f->open_object_section("pg_summary"); + pg_map.print_oneline_summary(f, NULL); + f->close_section(); + f->flush(ds); + } else { + ds << pg_map; + } + odata->append(ds); + return 0; + } + + if (prefix == "pg getmap") { + pg_map.encode(*odata); + *ss << "got pgmap version " << pg_map.version; + return 0; + } + + if (prefix == "pg dump") { + string val; + vector<string> dumpcontents; + set<string> what; + if (cmd_getval(cmdmap, "dumpcontents", dumpcontents)) { + copy(dumpcontents.begin(), dumpcontents.end(), + inserter(what, what.end())); + } + if (what.empty()) + what.insert("all"); + if (f) { + if (what.count("all")) { + f->open_object_section("pg_map"); + pg_map.dump(f); + f->close_section(); + } else if (what.count("summary") || what.count("sum")) { + f->open_object_section("pg_map"); + pg_map.dump_basic(f); + f->close_section(); + } else { + if (what.count("pools")) { + pg_map.dump_pool_stats(f); + } + if (what.count("osds")) { + pg_map.dump_osd_stats(f); + } + if (what.count("pgs")) { + pg_map.dump_pg_stats(f, false); + } + if (what.count("pgs_brief")) { + pg_map.dump_pg_stats(f, true); + } + if (what.count("delta")) { + f->open_object_section("delta"); + pg_map.dump_delta(f); + f->close_section(); + } + } + f->flush(*odata); + } else { + if (what.count("all")) { + pg_map.dump(ds); + omap_stats_note_required = true; + } else if (what.count("summary") || what.count("sum")) { + 
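+        // plain-text summary: basic map info plus the pg and osd sum tables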
pg_map.dump_basic(ds); + pg_map.dump_pg_sum_stats(ds, true); + pg_map.dump_osd_sum_stats(ds); + omap_stats_note_required = true; + } else { + if (what.count("pgs_brief")) { + pg_map.dump_pg_stats(ds, true); + } + bool header = true; + if (what.count("pgs")) { + pg_map.dump_pg_stats(ds, false); + header = false; + omap_stats_note_required = true; + } + if (what.count("pools")) { + pg_map.dump_pool_stats(ds, header); + omap_stats_note_required = true; + } + if (what.count("osds")) { + pg_map.dump_osd_stats(ds); + } + } + odata->append(ds); + if (omap_stats_note_required) { + odata->append(omap_stats_note); + } + } + *ss << "dumped " << what; + return 0; + } + + if (prefix == "pg ls") { + int64_t osd = -1; + int64_t pool = -1; + vector<string>states; + set<pg_t> pgs; + cmd_getval(cmdmap, "pool", pool); + cmd_getval(cmdmap, "osd", osd); + cmd_getval(cmdmap, "states", states); + if (pool >= 0 && !osdmap.have_pg_pool(pool)) { + *ss << "pool " << pool << " does not exist"; + return -ENOENT; + } + if (osd >= 0 && !osdmap.is_up(osd)) { + *ss << "osd " << osd << " is not up"; + return -EAGAIN; + } + if (states.empty()) + states.push_back("all"); + + uint64_t state = 0; + + while (!states.empty()) { + string state_str = states.back(); + + if (state_str == "all") { + state = -1; + break; + } else { + auto filter = pg_string_state(state_str); + if (!filter) { + *ss << "'" << state_str << "' is not a valid pg state," + << " available choices: " << pg_state_string(0xFFFFFFFF); + return -EINVAL; + } + state |= *filter; + } + + states.pop_back(); + } + + pg_map.get_filtered_pg_stats(state, pool, osd, primary, pgs); + + if (f && !pgs.empty()) { + pg_map.dump_filtered_pg_stats(f, pgs); + f->flush(*odata); + } else if (!pgs.empty()) { + pg_map.dump_filtered_pg_stats(ds, pgs); + odata->append(ds); + odata->append(omap_stats_note); + } + return 0; + } + + if (prefix == "pg dump_stuck") { + vector<string> stuckop_vec; + cmd_getval(cmdmap, "stuckops", stuckop_vec); + if (stuckop_vec.empty()) + stuckop_vec.push_back("unclean"); + int64_t threshold; + cmd_getval(cmdmap, "threshold", threshold, + g_conf().get_val<int64_t>("mon_pg_stuck_threshold")); + + if (pg_map.dump_stuck_pg_stats(ds, f, (int)threshold, stuckop_vec) < 0) { + *ss << "failed"; + } else { + *ss << "ok"; + } + odata->append(ds); + return 0; + } + + if (prefix == "pg debug") { + string debugop; + cmd_getval(cmdmap, "debugop", debugop, + string("unfound_objects_exist")); + if (debugop == "unfound_objects_exist") { + bool unfound_objects_exist = false; + for (const auto& p : pg_map.pg_stat) { + if (p.second.stats.sum.num_objects_unfound > 0) { + unfound_objects_exist = true; + break; + } + } + if (unfound_objects_exist) + ds << "TRUE"; + else + ds << "FALSE"; + odata->append(ds); + return 0; + } + if (debugop == "degraded_pgs_exist") { + bool degraded_pgs_exist = false; + for (const auto& p : pg_map.pg_stat) { + if (p.second.stats.sum.num_objects_degraded > 0) { + degraded_pgs_exist = true; + break; + } + } + if (degraded_pgs_exist) + ds << "TRUE"; + else + ds << "FALSE"; + odata->append(ds); + return 0; + } + } + + if (prefix == "osd perf") { + if (f) { + f->open_object_section("osdstats"); + pg_map.dump_osd_perf_stats(f); + f->close_section(); + f->flush(ds); + } else { + pg_map.print_osd_perf_stats(&ds); + } + odata->append(ds); + return 0; + } + + if (prefix == "osd blocked-by") { + if (f) { + f->open_object_section("osd_blocked_by"); + pg_map.dump_osd_blocked_by_stats(f); + f->close_section(); + f->flush(ds); + } else { + 
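+      // no formatter requested: emit the plain-text table instead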
pg_map.print_osd_blocked_by_stats(&ds); + } + odata->append(ds); + return 0; + } + + return -EOPNOTSUPP; +} + +void PGMapUpdater::check_osd_map( + CephContext *cct, + const OSDMap& osdmap, + const PGMap& pgmap, + PGMap::Incremental *pending_inc) +{ + for (auto& p : pgmap.osd_stat) { + if (!osdmap.exists(p.first)) { + // remove osd_stat + pending_inc->rm_stat(p.first); + } else if (osdmap.is_out(p.first)) { + // zero osd_stat + if (p.second.statfs.total != 0) { + pending_inc->stat_osd_out(p.first); + } + } else if (!osdmap.is_up(p.first)) { + // zero the op_queue_age_hist + if (!p.second.op_queue_age_hist.empty()) { + pending_inc->stat_osd_down_up(p.first, pgmap); + } + } + } + + // deleted pgs (pools)? + for (auto& p : pgmap.pg_pool_sum) { + if (!osdmap.have_pg_pool(p.first)) { + ldout(cct, 10) << __func__ << " pool " << p.first << " gone, removing pgs" + << dendl; + for (auto& q : pgmap.pg_stat) { + if (q.first.pool() == p.first) { + pending_inc->pg_remove.insert(q.first); + } + } + auto q = pending_inc->pg_stat_updates.begin(); + while (q != pending_inc->pg_stat_updates.end()) { + if (q->first.pool() == p.first) { + q = pending_inc->pg_stat_updates.erase(q); + } else { + ++q; + } + } + } + } + + // new (split or new pool) or merged pgs? + map<int64_t,unsigned> new_pg_num; + for (auto& p : osdmap.get_pools()) { + int64_t poolid = p.first; + const pg_pool_t& pi = p.second; + auto q = pgmap.num_pg_by_pool.find(poolid); + unsigned my_pg_num = 0; + if (q != pgmap.num_pg_by_pool.end()) + my_pg_num = q->second; + unsigned pg_num = pi.get_pg_num(); + new_pg_num[poolid] = pg_num; + if (my_pg_num < pg_num) { + ldout(cct,10) << __func__ << " pool " << poolid << " pg_num " << pg_num + << " > my pg_num " << my_pg_num << dendl; + for (unsigned ps = my_pg_num; ps < pg_num; ++ps) { + pg_t pgid(ps, poolid); + if (pending_inc->pg_stat_updates.count(pgid) == 0) { + ldout(cct,20) << __func__ << " adding " << pgid << dendl; + pg_stat_t &stats = pending_inc->pg_stat_updates[pgid]; + stats.last_fresh = osdmap.get_modified(); + stats.last_active = osdmap.get_modified(); + stats.last_change = osdmap.get_modified(); + stats.last_peered = osdmap.get_modified(); + stats.last_clean = osdmap.get_modified(); + stats.last_unstale = osdmap.get_modified(); + stats.last_undegraded = osdmap.get_modified(); + stats.last_fullsized = osdmap.get_modified(); + stats.last_scrub_stamp = osdmap.get_modified(); + stats.last_deep_scrub_stamp = osdmap.get_modified(); + stats.last_clean_scrub_stamp = osdmap.get_modified(); + } + } + } else if (my_pg_num > pg_num) { + ldout(cct,10) << __func__ << " pool " << poolid << " pg_num " << pg_num + << " < my pg_num " << my_pg_num << dendl; + for (unsigned i = pg_num; i < my_pg_num; ++i) { + pg_t pgid(i, poolid); + ldout(cct,20) << __func__ << " removing merged " << pgid << dendl; + if (pgmap.pg_stat.count(pgid)) { + pending_inc->pg_remove.insert(pgid); + } + pending_inc->pg_stat_updates.erase(pgid); + } + } + } + auto i = pending_inc->pg_stat_updates.begin(); + while (i != pending_inc->pg_stat_updates.end()) { + auto j = new_pg_num.find(i->first.pool()); + if (j == new_pg_num.end() || + i->first.ps() >= j->second) { + ldout(cct,20) << __func__ << " removing pending update to old " + << i->first << dendl; + i = pending_inc->pg_stat_updates.erase(i); + } else { + ++i; + } + } +} + +static void _try_mark_pg_stale( + const OSDMap& osdmap, + pg_t pgid, + const pg_stat_t& cur, + PGMap::Incremental *pending_inc) +{ + if ((cur.state & PG_STATE_STALE) == 0 && + cur.acting_primary != -1 && + 
osdmap.is_down(cur.acting_primary)) { + pg_stat_t *newstat; + auto q = pending_inc->pg_stat_updates.find(pgid); + if (q != pending_inc->pg_stat_updates.end()) { + if ((q->second.acting_primary == cur.acting_primary) || + ((q->second.state & PG_STATE_STALE) == 0 && + q->second.acting_primary != -1 && + osdmap.is_down(q->second.acting_primary))) { + newstat = &q->second; + } else { + // pending update is no longer down or already stale + return; + } + } else { + newstat = &pending_inc->pg_stat_updates[pgid]; + *newstat = cur; + } + dout(10) << __func__ << " marking pg " << pgid + << " stale (acting_primary " << newstat->acting_primary + << ")" << dendl; + newstat->state |= PG_STATE_STALE; + newstat->last_unstale = ceph_clock_now(); + } +} + +void PGMapUpdater::check_down_pgs( + const OSDMap &osdmap, + const PGMap &pg_map, + bool check_all, + const set<int>& need_check_down_pg_osds, + PGMap::Incremental *pending_inc) +{ + // if a large number of osds changed state, just iterate over the whole + // pg map. + if (need_check_down_pg_osds.size() > (unsigned)osdmap.get_num_osds() * + g_conf().get_val<double>("mon_pg_check_down_all_threshold")) { + check_all = true; + } + + if (check_all) { + for (const auto& p : pg_map.pg_stat) { + _try_mark_pg_stale(osdmap, p.first, p.second, pending_inc); + } + } else { + for (auto osd : need_check_down_pg_osds) { + if (osdmap.is_down(osd)) { + auto p = pg_map.pg_by_osd.find(osd); + if (p == pg_map.pg_by_osd.end()) { + continue; + } + for (auto pgid : p->second) { + const pg_stat_t &stat = pg_map.pg_stat.at(pgid); + ceph_assert(stat.acting_primary == osd); + _try_mark_pg_stale(osdmap, pgid, stat, pending_inc); + } + } + } + } +} + +int reweight::by_utilization( + const OSDMap &osdmap, + const PGMap &pgm, + int oload, + double max_changef, + int max_osds, + bool by_pg, const set<int64_t> *pools, + bool no_increasing, + mempool::osdmap::map<int32_t, uint32_t>* new_weights, + std::stringstream *ss, + std::string *out_str, + ceph::Formatter *f) +{ + if (oload <= 100) { + *ss << "You must give a percentage higher than 100. " + "The reweighting threshold will be calculated as <average-utilization> " + "times <input-percentage>. 
For example, an argument of 200 would "
+        "reweight OSDs which are twice as utilized as the average OSD.\n";
+    return -EINVAL;
+  }
+
+  vector<int> pgs_by_osd(osdmap.get_max_osd());
+
+  // Avoid putting a small number (or 0) in the denominator when calculating
+  // average_util
+  double average_util;
+  if (by_pg) {
+    // by pg mapping
+    double weight_sum = 0.0;      // sum up the crush weights
+    unsigned num_pg_copies = 0;
+    int num_osds = 0;
+    for (const auto& pg : pgm.pg_stat) {
+      if (pools && pools->count(pg.first.pool()) == 0)
+        continue;
+      for (const auto acting : pg.second.acting) {
+        if (!osdmap.exists(acting)) {
+          continue;
+        }
+        // make sure the vector covers index 'acting' (resize to acting + 1;
+        // resizing to acting would leave the element below out of bounds)
+        if (acting >= (int)pgs_by_osd.size())
+          pgs_by_osd.resize(acting + 1);
+        if (pgs_by_osd[acting] == 0) {
+          if (osdmap.crush->get_item_weightf(acting) <= 0) {
+            // skip if we currently cannot identify the item
+            continue;
+          }
+          weight_sum += osdmap.crush->get_item_weightf(acting);
+          ++num_osds;
+        }
+        ++pgs_by_osd[acting];
+        ++num_pg_copies;
+      }
+    }
+
+    if (!num_osds || (num_pg_copies / num_osds <
+                      g_conf()->mon_reweight_min_pgs_per_osd)) {
+      *ss << "Refusing to reweight: we only have " << num_pg_copies
+          << " PGs across " << num_osds << " osds!\n";
+      return -EDOM;
+    }
+
+    average_util = (double)num_pg_copies / weight_sum;
+  } else {
+    // by osd utilization
+    int num_osd = std::max<size_t>(1, pgm.osd_stat.size());
+    if ((uint64_t)pgm.osd_sum.statfs.total / num_osd
+        < g_conf()->mon_reweight_min_bytes_per_osd) {
+      *ss << "Refusing to reweight: we only have " << pgm.osd_sum.statfs.kb()
+          << " kb across all osds!\n";
+      return -EDOM;
+    }
+    if ((uint64_t)pgm.osd_sum.statfs.get_used_raw() / num_osd
+        < g_conf()->mon_reweight_min_bytes_per_osd) {
+      *ss << "Refusing to reweight: we only have "
+          << pgm.osd_sum.statfs.kb_used_raw()
+          << " kb used across all osds!\n";
+      return -EDOM;
+    }
+
+    average_util = (double)pgm.osd_sum.statfs.get_used_raw() /
+      (double)pgm.osd_sum.statfs.total;
+  }
+
+  // adjust down only if we are above the threshold
+  const double overload_util = average_util * (double)oload / 100.0;
+
+  // but aggressively adjust weights up whenever possible.
+  const double underload_util = average_util;
+
+  const unsigned max_change = (unsigned)(max_changef * (double)0x10000);
+
+  ostringstream oss;
+  if (f) {
+    f->open_object_section("reweight_by_utilization");
+    f->dump_int("overload_min", oload);
+    f->dump_float("max_change", max_changef);
+    f->dump_int("max_change_osds", max_osds);
+    f->dump_float("average_utilization", average_util);
+    f->dump_float("overload_utilization", overload_util);
+  } else {
+    oss << "oload " << oload << "\n";
+    oss << "max_change " << max_changef << "\n";
+    oss << "max_change_osds " << max_osds << "\n";
+    oss.precision(4);
+    oss << "average_utilization " << std::fixed << average_util << "\n";
+    oss << "overload_utilization " << overload_util << "\n";
+  }
+  int num_changed = 0;
+
+  // precompute util for each OSD
+  std::vector<std::pair<int, float> > util_by_osd;
+  for (const auto& p : pgm.osd_stat) {
+    std::pair<int, float> osd_util;
+    osd_util.first = p.first;
+    if (by_pg) {
+      if (p.first >= (int)pgs_by_osd.size() ||
+          pgs_by_osd[p.first] == 0) {
+        // skip if this OSD does not contain any pg
+        // belonging to the specified pool(s).
+        continue;
+      }
+
+      if (osdmap.crush->get_item_weightf(p.first) <= 0) {
+        // skip if we are unable to locate the item.
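+        // (get_item_weightf() is non-positive when the item is missing from
+        // the crush map or carries a zero crush weight)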
+ continue; + } + + osd_util.second = + pgs_by_osd[p.first] / osdmap.crush->get_item_weightf(p.first); + } else { + osd_util.second = + (double)p.second.statfs.get_used_raw() / (double)p.second.statfs.total; + } + util_by_osd.push_back(osd_util); + } + + // sort by absolute deviation from the mean utilization, + // in descending order. + std::sort(util_by_osd.begin(), util_by_osd.end(), + [average_util](std::pair<int, float> l, std::pair<int, float> r) { + return abs(l.second - average_util) > abs(r.second - average_util); + } + ); + + if (f) + f->open_array_section("reweights"); + + for (const auto& p : util_by_osd) { + unsigned weight = osdmap.get_weight(p.first); + if (weight == 0) { + // skip if OSD is currently out + continue; + } + float util = p.second; + + if (util >= overload_util) { + // Assign a lower weight to overloaded OSDs. The current weight + // is a factor to take into account the original weights, + // to represent e.g. differing storage capacities + unsigned new_weight = (unsigned)((average_util / util) * (float)weight); + if (weight > max_change) + new_weight = std::max(new_weight, weight - max_change); + new_weights->insert({p.first, new_weight}); + if (f) { + f->open_object_section("osd"); + f->dump_int("osd", p.first); + f->dump_float("weight", (float)weight / (float)0x10000); + f->dump_float("new_weight", (float)new_weight / (float)0x10000); + f->close_section(); + } else { + oss << "osd." << p.first << " weight " + << (float)weight / (float)0x10000 << " -> " + << (float)new_weight / (float)0x10000 << "\n"; + } + if (++num_changed >= max_osds) + break; + } + if (!no_increasing && util <= underload_util) { + // assign a higher weight.. if we can. + unsigned new_weight = (unsigned)((average_util / util) * (float)weight); + new_weight = std::min(new_weight, weight + max_change); + if (new_weight > 0x10000) + new_weight = 0x10000; + if (new_weight > weight) { + new_weights->insert({p.first, new_weight}); + oss << "osd." << p.first << " weight " + << (float)weight / (float)0x10000 << " -> " + << (float)new_weight / (float)0x10000 << "\n"; + if (++num_changed >= max_osds) + break; + } + } + } + if (f) { + f->close_section(); + } + + OSDMap newmap; + newmap.deepish_copy_from(osdmap); + OSDMap::Incremental newinc; + newinc.fsid = newmap.get_fsid(); + newinc.epoch = newmap.get_epoch() + 1; + newinc.new_weight = *new_weights; + newmap.apply_incremental(newinc); + + osdmap.summarize_mapping_stats(&newmap, pools, out_str, f); + + if (f) { + f->close_section(); + } else { + *out_str += "\n"; + *out_str += oss.str(); + } + return num_changed; +} diff --git a/src/mon/PGMap.h b/src/mon/PGMap.h new file mode 100644 index 000000000..9bdabb046 --- /dev/null +++ b/src/mon/PGMap.h @@ -0,0 +1,558 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +/* + * Placement Group Map. Placement Groups are logical sets of objects + * that are replicated by the same set of devices. 
pgid=(r,hash(o)&m) + * where & is a bit-wise AND and m=2^k-1 + */ + +#ifndef CEPH_PGMAP_H +#define CEPH_PGMAP_H + +#include "include/health.h" +#include "common/debug.h" +#include "common/TextTable.h" +#include "osd/osd_types.h" +#include "include/mempool.h" +#include "mon/health_check.h" +#include <sstream> + +namespace ceph { class Formatter; } + +class PGMapDigest { +public: + MEMPOOL_CLASS_HELPERS(); + virtual ~PGMapDigest() {} + + mempool::pgmap::vector<uint64_t> osd_last_seq; + + mutable std::map<int, int64_t> avail_space_by_rule; + + // aggregate state, populated by PGMap child + int64_t num_pg = 0, num_osd = 0; + int64_t num_pg_active = 0; + int64_t num_pg_unknown = 0; + mempool::pgmap::unordered_map<int32_t,pool_stat_t> pg_pool_sum; + mempool::pgmap::map<int64_t,int64_t> num_pg_by_pool; + pool_stat_t pg_sum; + osd_stat_t osd_sum; + mempool::pgmap::map<std::string,osd_stat_t> osd_sum_by_class; + mempool::pgmap::unordered_map<uint64_t,int32_t> num_pg_by_state; + struct pg_count { + int32_t acting = 0; + int32_t up_not_acting = 0; + int32_t primary = 0; + void encode(ceph::buffer::list& bl) const { + using ceph::encode; + encode(acting, bl); + encode(up_not_acting, bl); + encode(primary, bl); + } + void decode(ceph::buffer::list::const_iterator& p) { + using ceph::decode; + decode(acting, p); + decode(up_not_acting, p); + decode(primary, p); + } + }; + mempool::pgmap::unordered_map<int32_t,pg_count> num_pg_by_osd; + + mempool::pgmap::map<int64_t,interval_set<snapid_t>> purged_snaps; + + bool use_per_pool_stats() const { + return osd_sum.num_osds == osd_sum.num_per_pool_osds; + } + bool use_per_pool_omap_stats() const { + return osd_sum.num_osds == osd_sum.num_per_pool_omap_osds; + } + + // recent deltas, and summation + /** + * keep track of last deltas for each pool, calculated using + * @p pg_pool_sum as baseline. + */ + mempool::pgmap::unordered_map<int64_t, mempool::pgmap::list<std::pair<pool_stat_t, utime_t> > > per_pool_sum_deltas; + /** + * keep track of per-pool timestamp deltas, according to last update on + * each pool. + */ + mempool::pgmap::unordered_map<int64_t, utime_t> per_pool_sum_deltas_stamps; + /** + * keep track of sum deltas, per-pool, taking into account any previous + * deltas existing in @p per_pool_sum_deltas. The utime_t as second member + * of the pair is the timestamp referring to the last update (i.e., the first + * member of the pair) for a given pool. 
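+ * In effect this caches the most recent delta and its timestamp for each
+ * pool, so rate summaries need not rescan @p per_pool_sum_deltas.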
+ */ + mempool::pgmap::unordered_map<int64_t, std::pair<pool_stat_t,utime_t> > per_pool_sum_delta; + + pool_stat_t pg_sum_delta; + utime_t stamp_delta; + + void get_recovery_stats( + double *misplaced_ratio, + double *degraded_ratio, + double *inactive_ratio, + double *unknown_pgs_ratio) const; + + void print_summary(ceph::Formatter *f, std::ostream *out) const; + void print_oneline_summary(ceph::Formatter *f, std::ostream *out) const; + + void recovery_summary(ceph::Formatter *f, std::list<std::string> *psl, + const pool_stat_t& pool_sum) const; + void overall_recovery_summary(ceph::Formatter *f, std::list<std::string> *psl) const; + void pool_recovery_summary(ceph::Formatter *f, std::list<std::string> *psl, + uint64_t poolid) const; + void recovery_rate_summary(ceph::Formatter *f, std::ostream *out, + const pool_stat_t& delta_sum, + utime_t delta_stamp) const; + void overall_recovery_rate_summary(ceph::Formatter *f, std::ostream *out) const; + void pool_recovery_rate_summary(ceph::Formatter *f, std::ostream *out, + uint64_t poolid) const; + /** + * Obtain a formatted/plain output for client I/O, source from stats for a + * given @p delta_sum pool over a given @p delta_stamp period of time. + */ + void client_io_rate_summary(ceph::Formatter *f, std::ostream *out, + const pool_stat_t& delta_sum, + utime_t delta_stamp) const; + /** + * Obtain a formatted/plain output for the overall client I/O, which is + * calculated resorting to @p pg_sum_delta and @p stamp_delta. + */ + void overall_client_io_rate_summary(ceph::Formatter *f, std::ostream *out) const; + /** + * Obtain a formatted/plain output for client I/O over a given pool + * with id @p pool_id. We will then obtain pool-specific data + * from @p per_pool_sum_delta. + */ + void pool_client_io_rate_summary(ceph::Formatter *f, std::ostream *out, + uint64_t poolid) const; + /** + * Obtain a formatted/plain output for cache tier IO, source from stats for a + * given @p delta_sum pool over a given @p delta_stamp period of time. + */ + void cache_io_rate_summary(ceph::Formatter *f, std::ostream *out, + const pool_stat_t& delta_sum, + utime_t delta_stamp) const; + /** + * Obtain a formatted/plain output for the overall cache tier IO, which is + * calculated resorting to @p pg_sum_delta and @p stamp_delta. + */ + void overall_cache_io_rate_summary(ceph::Formatter *f, std::ostream *out) const; + /** + * Obtain a formatted/plain output for cache tier IO over a given pool + * with id @p pool_id. We will then obtain pool-specific data + * from @p per_pool_sum_delta. + */ + void pool_cache_io_rate_summary(ceph::Formatter *f, std::ostream *out, + uint64_t poolid) const; + + /** + * Return the number of additional bytes that can be stored in this + * pool before the first OSD fills up, accounting for PG overhead. 
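+ * (Computed from the per-crush-rule availability cached in
+ * @p avail_space_by_rule; see get_rule_avail().)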
+ */ + int64_t get_pool_free_space(const OSDMap &osd_map, int64_t poolid) const; + + + /** + * Dump pool usage and io ops/bytes, used by "ceph df" command + */ + virtual void dump_pool_stats_full(const OSDMap &osd_map, std::stringstream *ss, + ceph::Formatter *f, bool verbose) const; + void dump_cluster_stats(std::stringstream *ss, ceph::Formatter *f, bool verbose) const; + static void dump_object_stat_sum(TextTable &tbl, ceph::Formatter *f, + const pool_stat_t &pool_stat, + uint64_t avail, + float raw_used_rate, + bool verbose, + bool per_pool, + bool per_pool_omap, + const pg_pool_t *pool); + + size_t get_num_pg_by_osd(int osd) const { + auto p = num_pg_by_osd.find(osd); + if (p == num_pg_by_osd.end()) + return 0; + else + return p->second.acting; + } + int get_num_primary_pg_by_osd(int osd) const { + auto p = num_pg_by_osd.find(osd); + if (p == num_pg_by_osd.end()) + return 0; + else + return p->second.primary; + } + + ceph_statfs get_statfs(OSDMap &osdmap, + boost::optional<int64_t> data_pool) const; + + int64_t get_rule_avail(int ruleno) const { + auto i = avail_space_by_rule.find(ruleno); + if (i != avail_space_by_rule.end()) + return avail_space_by_rule[ruleno]; + else + return 0; + } + + // kill me post-mimic or -nautilus + bool definitely_converted_snapsets() const { + // false negative is okay; false positive is not! + return + num_pg && + num_pg_unknown == 0 && + pg_sum.stats.sum.num_legacy_snapsets == 0; + } + + uint64_t get_last_osd_stat_seq(int osd) { + if (osd < (int)osd_last_seq.size()) + return osd_last_seq[osd]; + return 0; + } + + void encode(ceph::buffer::list& bl, uint64_t features) const; + void decode(ceph::buffer::list::const_iterator& p); + void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list<PGMapDigest*>& ls); +}; +WRITE_CLASS_ENCODER(PGMapDigest::pg_count); +WRITE_CLASS_ENCODER_FEATURES(PGMapDigest); + +class PGMap : public PGMapDigest { +public: + MEMPOOL_CLASS_HELPERS(); + + // the map + version_t version; + epoch_t last_osdmap_epoch; // last osdmap epoch i applied to the pgmap + epoch_t last_pg_scan; // osdmap epoch + mempool::pgmap::unordered_map<int32_t,osd_stat_t> osd_stat; + mempool::pgmap::unordered_map<pg_t,pg_stat_t> pg_stat; + + typedef mempool::pgmap::map< + std::pair<int64_t, int>, // <pool, osd> + store_statfs_t> + per_osd_pool_statfs_t; + + per_osd_pool_statfs_t pool_statfs; + + class Incremental { + public: + MEMPOOL_CLASS_HELPERS(); + version_t version; + mempool::pgmap::map<pg_t,pg_stat_t> pg_stat_updates; + epoch_t osdmap_epoch; + epoch_t pg_scan; // osdmap epoch + mempool::pgmap::set<pg_t> pg_remove; + utime_t stamp; + per_osd_pool_statfs_t pool_statfs_updates; + + private: + mempool::pgmap::map<int32_t,osd_stat_t> osd_stat_updates; + mempool::pgmap::set<int32_t> osd_stat_rm; + public: + + const mempool::pgmap::map<int32_t, osd_stat_t> &get_osd_stat_updates() const { + return osd_stat_updates; + } + const mempool::pgmap::set<int32_t> &get_osd_stat_rm() const { + return osd_stat_rm; + } + template<typename OsdStat> + void update_stat(int32_t osd, OsdStat&& stat) { + osd_stat_updates[osd] = std::forward<OsdStat>(stat); + } + void stat_osd_out(int32_t osd) { + osd_stat_updates[osd] = osd_stat_t(); + } + void stat_osd_down_up(int32_t osd, const PGMap& pg_map) { + // 0 the op_queue_age_hist for this osd + auto p = osd_stat_updates.find(osd); + if (p != osd_stat_updates.end()) { + p->second.op_queue_age_hist.clear(); + return; + } + auto q = pg_map.osd_stat.find(osd); + if (q != pg_map.osd_stat.end()) { + osd_stat_t& 
t = osd_stat_updates[osd] = q->second; + t.op_queue_age_hist.clear(); + } + } + void rm_stat(int32_t osd) { + osd_stat_rm.insert(osd); + osd_stat_updates.erase(osd); + } + void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list<Incremental*>& o); + + Incremental() : version(0), osdmap_epoch(0), pg_scan(0) {} + }; + + + // aggregate stats (soft state), generated by calc_stats() + mempool::pgmap::unordered_map<int,std::set<pg_t> > pg_by_osd; + mempool::pgmap::unordered_map<int,int> blocked_by_sum; + mempool::pgmap::list<std::pair<pool_stat_t, utime_t> > pg_sum_deltas; + mempool::pgmap::unordered_map<int64_t,mempool::pgmap::unordered_map<uint64_t,int32_t>> num_pg_by_pool_state; + + utime_t stamp; + + void update_pool_deltas( + CephContext *cct, + const utime_t ts, + const mempool::pgmap::unordered_map<int32_t, pool_stat_t>& pg_pool_sum_old); + void clear_delta(); + + void deleted_pool(int64_t pool) { + for (auto i = pool_statfs.begin(); i != pool_statfs.end();) { + if (i->first.first == pool) { + i = pool_statfs.erase(i); + } else { + ++i; + } + } + + pg_pool_sum.erase(pool); + num_pg_by_pool_state.erase(pool); + num_pg_by_pool.erase(pool); + per_pool_sum_deltas.erase(pool); + per_pool_sum_deltas_stamps.erase(pool); + per_pool_sum_delta.erase(pool); + } + + private: + void update_delta( + CephContext *cct, + const utime_t ts, + const pool_stat_t& old_pool_sum, + utime_t *last_ts, + const pool_stat_t& current_pool_sum, + pool_stat_t *result_pool_delta, + utime_t *result_ts_delta, + mempool::pgmap::list<std::pair<pool_stat_t,utime_t> > *delta_avg_list); + + void update_one_pool_delta(CephContext *cct, + const utime_t ts, + const int64_t pool, + const pool_stat_t& old_pool_sum); + + public: + + mempool::pgmap::set<pg_t> creating_pgs; + mempool::pgmap::map<int,std::map<epoch_t,std::set<pg_t> > > creating_pgs_by_osd_epoch; + + // Bits that use to be enum StuckPG + static const int STUCK_INACTIVE = (1<<0); + static const int STUCK_UNCLEAN = (1<<1); + static const int STUCK_UNDERSIZED = (1<<2); + static const int STUCK_DEGRADED = (1<<3); + static const int STUCK_STALE = (1<<4); + + PGMap() + : version(0), + last_osdmap_epoch(0), last_pg_scan(0) + {} + + version_t get_version() const { + return version; + } + void set_version(version_t v) { + version = v; + } + epoch_t get_last_osdmap_epoch() const { + return last_osdmap_epoch; + } + void set_last_osdmap_epoch(epoch_t e) { + last_osdmap_epoch = e; + } + epoch_t get_last_pg_scan() const { + return last_pg_scan; + } + void set_last_pg_scan(epoch_t e) { + last_pg_scan = e; + } + utime_t get_stamp() const { + return stamp; + } + void set_stamp(utime_t s) { + stamp = s; + } + + pool_stat_t get_pg_pool_sum_stat(int64_t pool) const { + auto p = pg_pool_sum.find(pool); + if (p != pg_pool_sum.end()) + return p->second; + return pool_stat_t(); + } + + osd_stat_t get_osd_sum(const std::set<int>& osds) const { + if (osds.empty()) // all + return osd_sum; + osd_stat_t sum; + for (auto i : osds) { + auto os = get_osd_stat(i); + if (os) + sum.add(*os); + } + return sum; + } + + const osd_stat_t *get_osd_stat(int osd) const { + auto i = osd_stat.find(osd); + if (i == osd_stat.end()) { + return nullptr; + } + return &i->second; + } + + + void apply_incremental(CephContext *cct, const Incremental& inc); + void calc_stats(); + void stat_pg_add(const pg_t &pgid, const pg_stat_t &s, + bool sameosds=false); + bool stat_pg_sub(const pg_t &pgid, const pg_stat_t &s, + bool sameosds=false); + void calc_purged_snaps(); + void 
calc_osd_sum_by_class(const OSDMap& osdmap); + void stat_osd_add(int osd, const osd_stat_t &s); + void stat_osd_sub(int osd, const osd_stat_t &s); + + void encode(ceph::buffer::list &bl, uint64_t features=-1) const; + void decode(ceph::buffer::list::const_iterator &bl); + + /// encode subset of our data to a PGMapDigest + void encode_digest(const OSDMap& osdmap, + ceph::buffer::list& bl, uint64_t features); + + int64_t get_rule_avail(const OSDMap& osdmap, int ruleno) const; + void get_rules_avail(const OSDMap& osdmap, + std::map<int,int64_t> *avail_map) const; + void dump(ceph::Formatter *f, bool with_net = true) const; + void dump_basic(ceph::Formatter *f) const; + void dump_pg_stats(ceph::Formatter *f, bool brief) const; + void dump_pg_progress(ceph::Formatter *f) const; + void dump_pool_stats(ceph::Formatter *f) const; + void dump_osd_stats(ceph::Formatter *f, bool with_net = true) const; + void dump_osd_ping_times(ceph::Formatter *f) const; + void dump_delta(ceph::Formatter *f) const; + void dump_filtered_pg_stats(ceph::Formatter *f, std::set<pg_t>& pgs) const; + void dump_pool_stats_full(const OSDMap &osd_map, std::stringstream *ss, + ceph::Formatter *f, bool verbose) const override { + get_rules_avail(osd_map, &avail_space_by_rule); + PGMapDigest::dump_pool_stats_full(osd_map, ss, f, verbose); + } + + /* + * Dump client io rate, recovery io rate, cache io rate and recovery information. + * this function is used by "ceph osd pool stats" command + */ + void dump_pool_stats_and_io_rate(int64_t poolid, const OSDMap &osd_map, ceph::Formatter *f, + std::stringstream *ss) const; + + void dump_pg_stats_plain( + std::ostream& ss, + const mempool::pgmap::unordered_map<pg_t, pg_stat_t>& pg_stats, + bool brief) const; + void get_stuck_stats( + int types, const utime_t cutoff, + mempool::pgmap::unordered_map<pg_t, pg_stat_t>& stuck_pgs) const; + bool get_stuck_counts(const utime_t cutoff, std::map<std::string, int>& note) const; + void dump_stuck(ceph::Formatter *f, int types, utime_t cutoff) const; + void dump_stuck_plain(std::ostream& ss, int types, utime_t cutoff) const; + int dump_stuck_pg_stats(std::stringstream &ds, + ceph::Formatter *f, + int threshold, + std::vector<std::string>& args) const; + void dump(std::ostream& ss) const; + void dump_basic(std::ostream& ss) const; + void dump_pg_stats(std::ostream& ss, bool brief) const; + void dump_pg_sum_stats(std::ostream& ss, bool header) const; + void dump_pool_stats(std::ostream& ss, bool header) const; + void dump_osd_stats(std::ostream& ss) const; + void dump_osd_sum_stats(std::ostream& ss) const; + void dump_filtered_pg_stats(std::ostream& ss, std::set<pg_t>& pgs) const; + + void dump_osd_perf_stats(ceph::Formatter *f) const; + void print_osd_perf_stats(std::ostream *ss) const; + + void dump_osd_blocked_by_stats(ceph::Formatter *f) const; + void print_osd_blocked_by_stats(std::ostream *ss) const; + + void get_filtered_pg_stats(uint64_t state, int64_t poolid, int64_t osdid, + bool primary, std::set<pg_t>& pgs) const; + + std::set<std::string> osd_parentage(const OSDMap& osdmap, int id) const; + void get_health_checks( + CephContext *cct, + const OSDMap& osdmap, + health_check_map_t *checks) const; + void print_summary(ceph::Formatter *f, std::ostream *out) const; + + static void generate_test_instances(std::list<PGMap*>& o); +}; +WRITE_CLASS_ENCODER_FEATURES(PGMap) + +inline std::ostream& operator<<(std::ostream& out, const PGMapDigest& m) { + m.print_oneline_summary(NULL, &out); + return out; +} + +int process_pg_map_command( + const 
std::string& prefix, + const cmdmap_t& cmdmap, + const PGMap& pg_map, + const OSDMap& osdmap, + ceph::Formatter *f, + std::stringstream *ss, + ceph::buffer::list *odata); + +class PGMapUpdater +{ +public: + static void check_osd_map( + CephContext *cct, + const OSDMap &osdmap, + const PGMap& pg_map, + PGMap::Incremental *pending_inc); + + // mark pg's state stale if its acting primary osd is down + static void check_down_pgs( + const OSDMap &osd_map, + const PGMap &pg_map, + bool check_all, + const std::set<int>& need_check_down_pg_osds, + PGMap::Incremental *pending_inc); +}; + +namespace reweight { +/* Assign a lower weight to overloaded OSDs. + * + * The osds that will get a lower weight are those with a utilization + * percentage 'oload' percent greater than the average utilization. + */ + int by_utilization(const OSDMap &osd_map, + const PGMap &pg_map, + int oload, + double max_changef, + int max_osds, + bool by_pg, const std::set<int64_t> *pools, + bool no_increasing, + mempool::osdmap::map<int32_t, uint32_t>* new_weights, + std::stringstream *ss, + std::string *out_str, + ceph::Formatter *f); +} + +#endif diff --git a/src/mon/Paxos.cc b/src/mon/Paxos.cc new file mode 100644 index 000000000..21f244239 --- /dev/null +++ b/src/mon/Paxos.cc @@ -0,0 +1,1591 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include <sstream> +#include "Paxos.h" +#include "Monitor.h" +#include "messages/MMonPaxos.h" + +#include "mon/mon_types.h" +#include "common/config.h" +#include "include/ceph_assert.h" +#include "include/stringify.h" +#include "common/Timer.h" +#include "messages/PaxosServiceMessage.h" + +using std::string; +using std::unique_lock; + +using ceph::bufferlist; +using ceph::Formatter; +using ceph::JSONFormatter; +using ceph::to_timespan; + +#define dout_subsys ceph_subsys_paxos +#undef dout_prefix +#define dout_prefix _prefix(_dout, mon, mon.name, mon.rank, paxos_name, state, first_committed, last_committed) +static std::ostream& _prefix(std::ostream *_dout, Monitor &mon, const string& name, + int rank, const string& paxos_name, int state, + version_t first_committed, version_t last_committed) +{ + return *_dout << "mon." << name << "@" << rank + << "(" << mon.get_state_name() << ")" + << ".paxos(" << paxos_name << " " << Paxos::get_statename(state) + << " c " << first_committed << ".." 
<< last_committed + << ") "; +} + +class Paxos::C_Trimmed : public Context { + Paxos *paxos; +public: + explicit C_Trimmed(Paxos *p) : paxos(p) { } + void finish(int r) override { + paxos->trimming = false; + } +}; + +MonitorDBStore *Paxos::get_store() +{ + return mon.store; +} + +void Paxos::read_and_prepare_transactions(MonitorDBStore::TransactionRef tx, + version_t first, version_t last) +{ + dout(10) << __func__ << " first " << first << " last " << last << dendl; + for (version_t v = first; v <= last; ++v) { + dout(30) << __func__ << " apply version " << v << dendl; + bufferlist bl; + int err = get_store()->get(get_name(), v, bl); + ceph_assert(err == 0); + ceph_assert(bl.length()); + decode_append_transaction(tx, bl); + } + dout(15) << __func__ << " total versions " << (last-first) << dendl; +} + +void Paxos::init() +{ + // load paxos variables from stable storage + last_pn = get_store()->get(get_name(), "last_pn"); + accepted_pn = get_store()->get(get_name(), "accepted_pn"); + last_committed = get_store()->get(get_name(), "last_committed"); + first_committed = get_store()->get(get_name(), "first_committed"); + + dout(10) << __func__ << " last_pn: " << last_pn << " accepted_pn: " + << accepted_pn << " last_committed: " << last_committed + << " first_committed: " << first_committed << dendl; + + dout(10) << "init" << dendl; + ceph_assert(is_consistent()); +} + +void Paxos::init_logger() +{ + PerfCountersBuilder pcb(g_ceph_context, "paxos", l_paxos_first, l_paxos_last); + + // Because monitors are so few in number, the resource cost of capturing + // almost all their perf counters at USEFUL is trivial. + pcb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL); + + pcb.add_u64_counter(l_paxos_start_leader, "start_leader", "Starts in leader role"); + pcb.add_u64_counter(l_paxos_start_peon, "start_peon", "Starts in peon role"); + pcb.add_u64_counter(l_paxos_restart, "restart", "Restarts"); + pcb.add_u64_counter(l_paxos_refresh, "refresh", "Refreshes"); + pcb.add_time_avg(l_paxos_refresh_latency, "refresh_latency", "Refresh latency"); + pcb.add_u64_counter(l_paxos_begin, "begin", "Started and handled begins"); + pcb.add_u64_avg(l_paxos_begin_keys, "begin_keys", "Keys in transaction on begin"); + pcb.add_u64_avg(l_paxos_begin_bytes, "begin_bytes", "Data in transaction on begin", NULL, 0, unit_t(UNIT_BYTES)); + pcb.add_time_avg(l_paxos_begin_latency, "begin_latency", "Latency of begin operation"); + pcb.add_u64_counter(l_paxos_commit, "commit", + "Commits", "cmt"); + pcb.add_u64_avg(l_paxos_commit_keys, "commit_keys", "Keys in transaction on commit"); + pcb.add_u64_avg(l_paxos_commit_bytes, "commit_bytes", "Data in transaction on commit", NULL, 0, unit_t(UNIT_BYTES)); + pcb.add_time_avg(l_paxos_commit_latency, "commit_latency", + "Commit latency", "clat"); + pcb.add_u64_counter(l_paxos_collect, "collect", "Peon collects"); + pcb.add_u64_avg(l_paxos_collect_keys, "collect_keys", "Keys in transaction on peon collect"); + pcb.add_u64_avg(l_paxos_collect_bytes, "collect_bytes", "Data in transaction on peon collect", NULL, 0, unit_t(UNIT_BYTES)); + pcb.add_time_avg(l_paxos_collect_latency, "collect_latency", "Peon collect latency"); + pcb.add_u64_counter(l_paxos_collect_uncommitted, "collect_uncommitted", "Uncommitted values in started and handled collects"); + pcb.add_u64_counter(l_paxos_collect_timeout, "collect_timeout", "Collect timeouts"); + pcb.add_u64_counter(l_paxos_accept_timeout, "accept_timeout", "Accept timeouts"); + pcb.add_u64_counter(l_paxos_lease_ack_timeout, "lease_ack_timeout", 
"Lease acknowledgement timeouts"); + pcb.add_u64_counter(l_paxos_lease_timeout, "lease_timeout", "Lease timeouts"); + pcb.add_u64_counter(l_paxos_store_state, "store_state", "Store a shared state on disk"); + pcb.add_u64_avg(l_paxos_store_state_keys, "store_state_keys", "Keys in transaction in stored state"); + pcb.add_u64_avg(l_paxos_store_state_bytes, "store_state_bytes", "Data in transaction in stored state", NULL, 0, unit_t(UNIT_BYTES)); + pcb.add_time_avg(l_paxos_store_state_latency, "store_state_latency", "Storing state latency"); + pcb.add_u64_counter(l_paxos_share_state, "share_state", "Sharings of state"); + pcb.add_u64_avg(l_paxos_share_state_keys, "share_state_keys", "Keys in shared state"); + pcb.add_u64_avg(l_paxos_share_state_bytes, "share_state_bytes", "Data in shared state", NULL, 0, unit_t(UNIT_BYTES)); + pcb.add_u64_counter(l_paxos_new_pn, "new_pn", "New proposal number queries"); + pcb.add_time_avg(l_paxos_new_pn_latency, "new_pn_latency", "New proposal number getting latency"); + logger = pcb.create_perf_counters(); + g_ceph_context->get_perfcounters_collection()->add(logger); +} + +void Paxos::dump_info(Formatter *f) +{ + f->open_object_section("paxos"); + f->dump_unsigned("first_committed", first_committed); + f->dump_unsigned("last_committed", last_committed); + f->dump_unsigned("last_pn", last_pn); + f->dump_unsigned("accepted_pn", accepted_pn); + f->close_section(); +} + +// --------------------------------- + +// PHASE 1 + +// leader +void Paxos::collect(version_t oldpn) +{ + // we're recoverying, it seems! + state = STATE_RECOVERING; + ceph_assert(mon.is_leader()); + + // reset the number of lasts received + uncommitted_v = 0; + uncommitted_pn = 0; + uncommitted_value.clear(); + peer_first_committed.clear(); + peer_last_committed.clear(); + + // look for uncommitted value + if (get_store()->exists(get_name(), last_committed+1)) { + version_t v = get_store()->get(get_name(), "pending_v"); + version_t pn = get_store()->get(get_name(), "pending_pn"); + if (v && pn && v == last_committed + 1) { + uncommitted_pn = pn; + } else { + dout(10) << "WARNING: no pending_pn on disk, using previous accepted_pn " << accepted_pn + << " and crossing our fingers" << dendl; + uncommitted_pn = accepted_pn; + } + uncommitted_v = last_committed+1; + + get_store()->get(get_name(), last_committed+1, uncommitted_value); + ceph_assert(uncommitted_value.length()); + dout(10) << "learned uncommitted " << (last_committed+1) + << " pn " << uncommitted_pn + << " (" << uncommitted_value.length() << " bytes) from myself" + << dendl; + + logger->inc(l_paxos_collect_uncommitted); + } + + // pick new pn + accepted_pn = get_new_proposal_number(std::max(accepted_pn, oldpn)); + accepted_pn_from = last_committed; + num_last = 1; + dout(10) << "collect with pn " << accepted_pn << dendl; + + // send collect + for (auto p = mon.get_quorum().begin(); + p != mon.get_quorum().end(); + ++p) { + if (*p == mon.rank) continue; + + MMonPaxos *collect = new MMonPaxos(mon.get_epoch(), MMonPaxos::OP_COLLECT, + ceph_clock_now()); + collect->last_committed = last_committed; + collect->first_committed = first_committed; + collect->pn = accepted_pn; + mon.send_mon_message(collect, *p); + } + + // set timeout event + collect_timeout_event = mon.timer.add_event_after( + g_conf()->mon_accept_timeout_factor * + g_conf()->mon_lease, + new C_MonContext{&mon, [this](int r) { + if (r == -ECANCELED) + return; + collect_timeout(); + }}); +} + + +// peon +void Paxos::handle_collect(MonOpRequestRef op) +{ + + 
op->mark_paxos_event("handle_collect"); + + auto collect = op->get_req<MMonPaxos>(); + dout(10) << "handle_collect " << *collect << dendl; + + ceph_assert(mon.is_peon()); // mon epoch filter should catch strays + + // we're recovering, it seems! + state = STATE_RECOVERING; + + // update the peon recovery timeout + reset_lease_timeout(); + + if (collect->first_committed > last_committed+1) { + dout(2) << __func__ + << " leader's lowest version is too high for our last committed" + << " (theirs: " << collect->first_committed + << "; ours: " << last_committed << ") -- bootstrap!" << dendl; + op->mark_paxos_event("need to bootstrap"); + mon.bootstrap(); + return; + } + + // reply + MMonPaxos *last = new MMonPaxos(mon.get_epoch(), MMonPaxos::OP_LAST, + ceph_clock_now()); + last->last_committed = last_committed; + last->first_committed = first_committed; + + version_t previous_pn = accepted_pn; + + // can we accept this pn? + if (collect->pn > accepted_pn) { + // ok, accept it + accepted_pn = collect->pn; + accepted_pn_from = collect->pn_from; + dout(10) << "accepting pn " << accepted_pn << " from " + << accepted_pn_from << dendl; + + auto t(std::make_shared<MonitorDBStore::Transaction>()); + t->put(get_name(), "accepted_pn", accepted_pn); + + dout(30) << __func__ << " transaction dump:\n"; + JSONFormatter f(true); + t->dump(&f); + f.flush(*_dout); + *_dout << dendl; + + logger->inc(l_paxos_collect); + logger->inc(l_paxos_collect_keys, t->get_keys()); + logger->inc(l_paxos_collect_bytes, t->get_bytes()); + + auto start = ceph::coarse_mono_clock::now(); + get_store()->apply_transaction(t); + auto end = ceph::coarse_mono_clock::now(); + + logger->tinc(l_paxos_collect_latency, to_timespan(end - start)); + } else { + // don't accept! + dout(10) << "NOT accepting pn " << collect->pn << " from " << collect->pn_from + << ", we already accepted " << accepted_pn + << " from " << accepted_pn_from << dendl; + } + last->pn = accepted_pn; + last->pn_from = accepted_pn_from; + + // share whatever committed values we have + if (collect->last_committed < last_committed) + share_state(last, collect->first_committed, collect->last_committed); + + // do we have an accepted but uncommitted value? + // (it'll be at last_committed+1) + bufferlist bl; + if (collect->last_committed <= last_committed && + get_store()->exists(get_name(), last_committed+1)) { + get_store()->get(get_name(), last_committed+1, bl); + ceph_assert(bl.length() > 0); + dout(10) << " sharing our accepted but uncommitted value for " + << last_committed+1 << " (" << bl.length() << " bytes)" << dendl; + last->values[last_committed+1] = bl; + + version_t v = get_store()->get(get_name(), "pending_v"); + version_t pn = get_store()->get(get_name(), "pending_pn"); + if (v && pn && v == last_committed + 1) { + last->uncommitted_pn = pn; + } else { + // previously we didn't record which pn a value was accepted + // under! use the pn value we just had... :( + dout(10) << "WARNING: no pending_pn on disk, using previous accepted_pn " << previous_pn + << " and crossing our fingers" << dendl; + last->uncommitted_pn = previous_pn; + } + + logger->inc(l_paxos_collect_uncommitted); + } + + // send reply + collect->get_connection()->send_message(last); +} + +/** + * @note This is Okay. We share our versions between peer_last_committed and + * our last_committed (inclusive), and add their bufferlists to the + * message. It will be the peer's job to apply them to its store, as + * these bufferlists will contain raw transactions. 
+ * This function is called by both the Peon and the Leader. The Peon will + * share the state with the Leader during handle_collect(), sharing any + * values the leader may be missing (i.e., the leader's last_committed is + * lower than the peon's last_committed). The Leader will share the state + * with the Peon during handle_last(), if the peon's last_committed is + * lower than the leader's last_committed. + */ +void Paxos::share_state(MMonPaxos *m, version_t peer_first_committed, + version_t peer_last_committed) +{ + ceph_assert(peer_last_committed < last_committed); + + dout(10) << "share_state peer has fc " << peer_first_committed + << " lc " << peer_last_committed << dendl; + version_t v = peer_last_committed + 1; + + // include incrementals + uint64_t bytes = 0; + for ( ; v <= last_committed; v++) { + if (get_store()->exists(get_name(), v)) { + get_store()->get(get_name(), v, m->values[v]); + ceph_assert(m->values[v].length()); + dout(10) << " sharing " << v << " (" + << m->values[v].length() << " bytes)" << dendl; + bytes += m->values[v].length() + 16; // paxos_ + 10 digits = 16 + } + } + logger->inc(l_paxos_share_state); + logger->inc(l_paxos_share_state_keys, m->values.size()); + logger->inc(l_paxos_share_state_bytes, bytes); + + m->last_committed = last_committed; +} + +/** + * Store on disk a state that was shared with us + * + * Basically, we received a set of versions. Or just one. It doesn't matter. + * What matters is that we have to stash it in the store. So, we will simply + * write every single bufferlist into its own version on our side (i.e., + * onto paxos-related keys), and then we will decode those same bufferlists + * we just wrote and apply the transactions they hold. We will also update + * our first and last committed values to point to the new values, if need + * be. All of this is done tightly wrapped in a transaction to ensure we + * enjoy the atomicity guarantees given by our awesome k/v store. + */ +bool Paxos::store_state(MMonPaxos *m) +{ + auto t(std::make_shared<MonitorDBStore::Transaction>()); + auto start = m->values.begin(); + bool changed = false; + + // build map of values to store + // we want to write the range [last_committed, m->last_committed] only. + if (start != m->values.end() && + start->first > last_committed + 1) { + // ignore everything if values start in the future. + dout(10) << "store_state ignoring all values, they start at " << start->first + << " > last_committed+1" << dendl; + return false; + } + + // push forward the start position on the message's values iterator, up until + // we run out of positions or we find a position matching 'last_committed'. + while (start != m->values.end() && start->first <= last_committed) { + ++start; + } + + // make sure we get the right interval of values to apply by pushing forward + // the 'end' iterator until it matches the message's 'last_committed'. + auto end = start; + while (end != m->values.end() && end->first <= m->last_committed) { + last_committed = end->first; + ++end; + } + + if (start == end) { + dout(10) << "store_state nothing to commit" << dendl; + } else { + dout(10) << "store_state [" << start->first << ".." + << last_committed << "]" << dendl; + t->put(get_name(), "last_committed", last_committed); + + // we should apply the state here -- decode every single bufferlist in the + // map and append the transactions to 't'. 
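+    // e.g. if we are at last_committed=8 and the message carries versions + // 7..12 (m->last_committed == 12), 'start' is advanced to 9, 'end' moves + // past 12, and the loop below stores and applies versions 9..12.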
+ for (auto it = start; it != end; ++it) { + // write the bufferlist as the version's value + t->put(get_name(), it->first, it->second); + // decode the bufferlist and append it to the transaction we will shortly + // apply. + decode_append_transaction(t, it->second); + } + + // discard obsolete uncommitted value? + if (uncommitted_v && uncommitted_v <= last_committed) { + dout(10) << " forgetting obsolete uncommitted value " << uncommitted_v + << " pn " << uncommitted_pn << dendl; + uncommitted_v = 0; + uncommitted_pn = 0; + uncommitted_value.clear(); + } + } + if (!t->empty()) { + dout(30) << __func__ << " transaction dump:\n"; + JSONFormatter f(true); + t->dump(&f); + f.flush(*_dout); + *_dout << dendl; + + logger->inc(l_paxos_store_state); + logger->inc(l_paxos_store_state_bytes, t->get_bytes()); + logger->inc(l_paxos_store_state_keys, t->get_keys()); + + auto start = ceph::coarse_mono_clock::now(); + get_store()->apply_transaction(t); + auto end = ceph::coarse_mono_clock::now(); + + logger->tinc(l_paxos_store_state_latency, to_timespan(end-start)); + + // refresh first_committed; this txn may have trimmed. + first_committed = get_store()->get(get_name(), "first_committed"); + + _sanity_check_store(); + changed = true; + } + + return changed; +} + +void Paxos::_sanity_check_store() +{ + version_t lc = get_store()->get(get_name(), "last_committed"); + ceph_assert(lc == last_committed); +} + + +// leader +void Paxos::handle_last(MonOpRequestRef op) +{ + op->mark_paxos_event("handle_last"); + auto last = op->get_req<MMonPaxos>(); + bool need_refresh = false; + int from = last->get_source().num(); + + dout(10) << "handle_last " << *last << dendl; + + if (!mon.is_leader()) { + dout(10) << "not leader, dropping" << dendl; + return; + } + + // note peer's first_ and last_committed, in case we learn a new + // commit and need to push it to them. + peer_first_committed[from] = last->first_committed; + peer_last_committed[from] = last->last_committed; + + if (last->first_committed > last_committed + 1) { + dout(5) << __func__ + << " mon." << from + << " lowest version is too high for our last committed" + << " (theirs: " << last->first_committed + << "; ours: " << last_committed << ") -- bootstrap!" << dendl; + op->mark_paxos_event("need to bootstrap"); + mon.bootstrap(); + return; + } + + ceph_assert(g_conf()->paxos_kill_at != 1); + + // store any committed values if any are specified in the message + need_refresh = store_state(last); + + ceph_assert(g_conf()->paxos_kill_at != 2); + + // is everyone contiguous and up to date? + for (auto p = peer_last_committed.begin(); + p != peer_last_committed.end(); + ++p) { + if (p->second + 1 < first_committed && first_committed > 1) { + dout(5) << __func__ + << " peon " << p->first + << " last_committed (" << p->second + << ") is too low for our first_committed (" << first_committed + << ") -- bootstrap!" << dendl; + op->mark_paxos_event("need to bootstrap"); + mon.bootstrap(); + return; + } + if (p->second < last_committed) { + // share committed values + dout(10) << " sending commit to mon." << p->first << dendl; + MMonPaxos *commit = new MMonPaxos(mon.get_epoch(), + MMonPaxos::OP_COMMIT, + ceph_clock_now()); + share_state(commit, peer_first_committed[p->first], p->second); + mon.send_mon_message(commit, p->first); + } + } + + // do they accept our pn? + if (last->pn > accepted_pn) { + // no, try again. + dout(10) << " they had a higher pn than us, picking a new one." 
<< dendl; + + // cancel timeout event + mon.timer.cancel_event(collect_timeout_event); + collect_timeout_event = 0; + + collect(last->pn); + } else if (last->pn == accepted_pn) { + // yes, they accepted our pn. great. + num_last++; + dout(10) << " they accepted our pn, we now have " + << num_last << " peons" << dendl; + + // did this person send back an accepted but uncommitted value? + if (last->uncommitted_pn) { + if (last->uncommitted_pn >= uncommitted_pn && + last->last_committed >= last_committed && + last->last_committed + 1 >= uncommitted_v) { + uncommitted_v = last->last_committed+1; + uncommitted_pn = last->uncommitted_pn; + uncommitted_value = last->values[uncommitted_v]; + dout(10) << "we learned an uncommitted value for " << uncommitted_v + << " pn " << uncommitted_pn + << " " << uncommitted_value.length() << " bytes" + << dendl; + } else { + dout(10) << "ignoring uncommitted value for " << (last->last_committed+1) + << " pn " << last->uncommitted_pn + << " " << last->values[last->last_committed+1].length() << " bytes" + << dendl; + } + } + + // is that everyone? + if (num_last == mon.get_quorum().size()) { + // cancel timeout event + mon.timer.cancel_event(collect_timeout_event); + collect_timeout_event = 0; + peer_first_committed.clear(); + peer_last_committed.clear(); + + // almost... + + // did we learn an old value? + if (uncommitted_v == last_committed+1 && + uncommitted_value.length()) { + dout(10) << "that's everyone. begin on old learned value" << dendl; + state = STATE_UPDATING_PREVIOUS; + begin(uncommitted_value); + } else { + // active! + dout(10) << "that's everyone. active!" << dendl; + extend_lease(); + + need_refresh = false; + if (do_refresh()) { + finish_round(); + } + } + } + } else { + // no, this is an old message, discard + dout(10) << "old pn, ignoring" << dendl; + } + + if (need_refresh) + (void)do_refresh(); +} + +void Paxos::collect_timeout() +{ + dout(1) << "collect timeout, calling fresh election" << dendl; + collect_timeout_event = 0; + logger->inc(l_paxos_collect_timeout); + ceph_assert(mon.is_leader()); + mon.bootstrap(); +} + + +// leader +void Paxos::begin(bufferlist& v) +{ + dout(10) << "begin for " << last_committed+1 << " " + << v.length() << " bytes" + << dendl; + + ceph_assert(mon.is_leader()); + ceph_assert(is_updating() || is_updating_previous()); + + // we must already have a majority for this to work. + ceph_assert(mon.get_quorum().size() == 1 || + num_last > (unsigned)mon.monmap->size()/2); + + // and no value, yet. + ceph_assert(new_value.length() == 0); + + // accept it ourselves + accepted.clear(); + accepted.insert(mon.rank); + new_value = v; + + if (last_committed == 0) { + auto t(std::make_shared<MonitorDBStore::Transaction>()); + // initial base case; set first_committed too + t->put(get_name(), "first_committed", 1); + decode_append_transaction(t, new_value); + + bufferlist tx_bl; + t->encode(tx_bl); + + new_value = tx_bl; + } + + // store the proposed value in the store. IF it is accepted, we will then + // have to decode it into a transaction and apply it. + auto t(std::make_shared<MonitorDBStore::Transaction>()); + t->put(get_name(), last_committed+1, new_value); + + // note which pn this pending value is for. 
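+  // (pending_v/pending_pn let a mon that restarts mid-round recover the + // stashed value and the pn it was accepted under; collect() reads them + // back during recovery.)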
+ t->put(get_name(), "pending_v", last_committed + 1); + t->put(get_name(), "pending_pn", accepted_pn); + + dout(30) << __func__ << " transaction dump:\n"; + JSONFormatter f(true); + t->dump(&f); + f.flush(*_dout); + auto debug_tx(std::make_shared<MonitorDBStore::Transaction>()); + auto new_value_it = new_value.cbegin(); + debug_tx->decode(new_value_it); + debug_tx->dump(&f); + *_dout << "\nbl dump:\n"; + f.flush(*_dout); + *_dout << dendl; + + logger->inc(l_paxos_begin); + logger->inc(l_paxos_begin_keys, t->get_keys()); + logger->inc(l_paxos_begin_bytes, t->get_bytes()); + + auto start = ceph::coarse_mono_clock::now(); + get_store()->apply_transaction(t); + auto end = ceph::coarse_mono_clock::now(); + + logger->tinc(l_paxos_begin_latency, to_timespan(end - start)); + + ceph_assert(g_conf()->paxos_kill_at != 3); + + if (mon.get_quorum().size() == 1) { + // we're alone, take it easy + commit_start(); + return; + } + + // ask others to accept it too! + for (auto p = mon.get_quorum().begin(); + p != mon.get_quorum().end(); + ++p) { + if (*p == mon.rank) continue; + + dout(10) << " sending begin to mon." << *p << dendl; + MMonPaxos *begin = new MMonPaxos(mon.get_epoch(), MMonPaxos::OP_BEGIN, + ceph_clock_now()); + begin->values[last_committed+1] = new_value; + begin->last_committed = last_committed; + begin->pn = accepted_pn; + + mon.send_mon_message(begin, *p); + } + + // set timeout event + accept_timeout_event = mon.timer.add_event_after( + g_conf()->mon_accept_timeout_factor * g_conf()->mon_lease, + new C_MonContext{&mon, [this](int r) { + if (r == -ECANCELED) + return; + accept_timeout(); + }}); +} + +// peon +void Paxos::handle_begin(MonOpRequestRef op) +{ + op->mark_paxos_event("handle_begin"); + auto begin = op->get_req<MMonPaxos>(); + dout(10) << "handle_begin " << *begin << dendl; + + // can we accept this? + if (begin->pn < accepted_pn) { + dout(10) << " we accepted a higher pn " << accepted_pn << ", ignoring" << dendl; + op->mark_paxos_event("have higher pn, ignore"); + return; + } + ceph_assert(begin->pn == accepted_pn); + ceph_assert(begin->last_committed == last_committed); + + ceph_assert(g_conf()->paxos_kill_at != 4); + + logger->inc(l_paxos_begin); + + // set state. + state = STATE_UPDATING; + lease_expire = {}; // cancel lease + + // yes. + version_t v = last_committed+1; + dout(10) << "accepting value for " << v << " pn " << accepted_pn << dendl; + // store the accepted value onto our store. We will have to decode it and + // apply its transaction once we receive permission to commit. + auto t(std::make_shared<MonitorDBStore::Transaction>()); + t->put(get_name(), v, begin->values[v]); + + // note which pn this pending value is for. 
+ t->put(get_name(), "pending_v", v); + t->put(get_name(), "pending_pn", accepted_pn); + + dout(30) << __func__ << " transaction dump:\n"; + JSONFormatter f(true); + t->dump(&f); + f.flush(*_dout); + *_dout << dendl; + + logger->inc(l_paxos_begin_bytes, t->get_bytes()); + + auto start = ceph::coarse_mono_clock::now(); + get_store()->apply_transaction(t); + auto end = ceph::coarse_mono_clock::now(); + + logger->tinc(l_paxos_begin_latency, to_timespan(end - start)); + + ceph_assert(g_conf()->paxos_kill_at != 5); + + // reply + MMonPaxos *accept = new MMonPaxos(mon.get_epoch(), MMonPaxos::OP_ACCEPT, + ceph_clock_now()); + accept->pn = accepted_pn; + accept->last_committed = last_committed; + begin->get_connection()->send_message(accept); +} + +// leader +void Paxos::handle_accept(MonOpRequestRef op) +{ + op->mark_paxos_event("handle_accept"); + auto accept = op->get_req<MMonPaxos>(); + dout(10) << "handle_accept " << *accept << dendl; + int from = accept->get_source().num(); + + if (accept->pn != accepted_pn) { + // we accepted a higher pn, from some other leader + dout(10) << " we accepted a higher pn " << accepted_pn << ", ignoring" << dendl; + op->mark_paxos_event("have higher pn, ignore"); + return; + } + if (last_committed > 0 && + accept->last_committed < last_committed-1) { + dout(10) << " this is from an old round, ignoring" << dendl; + op->mark_paxos_event("old round, ignore"); + return; + } + ceph_assert(accept->last_committed == last_committed || // not committed + accept->last_committed == last_committed-1); // committed + + ceph_assert(is_updating() || is_updating_previous()); + ceph_assert(accepted.count(from) == 0); + accepted.insert(from); + dout(10) << " now " << accepted << " have accepted" << dendl; + + ceph_assert(g_conf()->paxos_kill_at != 6); + + // only commit (and expose committed state) when we get *all* quorum + // members to accept. otherwise, they may still be sharing the now + // stale state. + // FIXME: we can improve this with an additional lease revocation message + // that doesn't block for the persist. + if (accepted == mon.get_quorum()) { + // yay, commit! + dout(10) << " got majority, committing, done with update" << dendl; + op->mark_paxos_event("commit_start"); + commit_start(); + } +} + +void Paxos::accept_timeout() +{ + dout(1) << "accept timeout, calling fresh election" << dendl; + accept_timeout_event = 0; + ceph_assert(mon.is_leader()); + ceph_assert(is_updating() || is_updating_previous() || is_writing() || + is_writing_previous()); + logger->inc(l_paxos_accept_timeout); + mon.bootstrap(); +} + +struct C_Committed : public Context { + Paxos *paxos; + explicit C_Committed(Paxos *p) : paxos(p) {} + void finish(int r) override { + ceph_assert(r >= 0); + std::lock_guard l(paxos->mon.lock); + if (paxos->is_shutdown()) { + paxos->abort_commit(); + return; + } + paxos->commit_finish(); + } +}; + +void Paxos::abort_commit() +{ + ceph_assert(commits_started > 0); + --commits_started; + if (commits_started == 0) + shutdown_cond.notify_all(); +} + +void Paxos::commit_start() +{ + dout(10) << __func__ << " " << (last_committed+1) << dendl; + + ceph_assert(g_conf()->paxos_kill_at != 7); + + auto t(std::make_shared<MonitorDBStore::Transaction>()); + + // commit locally + t->put(get_name(), "last_committed", last_committed + 1); + + // decode the value and apply its transaction to the store. + // this value can now be read from last_committed. 
+ decode_append_transaction(t, new_value); + + dout(30) << __func__ << " transaction dump:\n"; + JSONFormatter f(true); + t->dump(&f); + f.flush(*_dout); + *_dout << dendl; + + logger->inc(l_paxos_commit); + logger->inc(l_paxos_commit_keys, t->get_keys()); + logger->inc(l_paxos_commit_bytes, t->get_bytes()); + commit_start_stamp = ceph_clock_now(); + + get_store()->queue_transaction(t, new C_Committed(this)); + + if (is_updating_previous()) + state = STATE_WRITING_PREVIOUS; + else if (is_updating()) + state = STATE_WRITING; + else + ceph_abort(); + ++commits_started; + + if (mon.get_quorum().size() > 1) { + // cancel timeout event + mon.timer.cancel_event(accept_timeout_event); + accept_timeout_event = 0; + } +} + +void Paxos::commit_finish() +{ + dout(20) << __func__ << " " << (last_committed+1) << dendl; + utime_t end = ceph_clock_now(); + logger->tinc(l_paxos_commit_latency, end - commit_start_stamp); + + ceph_assert(g_conf()->paxos_kill_at != 8); + + // cancel lease - it was for the old value. + // (this would only happen if the message layer lost the 'begin', but + // the leader still got a majority and committed without us.) + lease_expire = {}; // cancel lease + + last_committed++; + last_commit_time = ceph_clock_now(); + + // refresh first_committed; this txn may have trimmed. + first_committed = get_store()->get(get_name(), "first_committed"); + + _sanity_check_store(); + + // tell everyone + for (auto p = mon.get_quorum().begin(); + p != mon.get_quorum().end(); + ++p) { + if (*p == mon.rank) continue; + + dout(10) << " sending commit to mon." << *p << dendl; + MMonPaxos *commit = new MMonPaxos(mon.get_epoch(), MMonPaxos::OP_COMMIT, + ceph_clock_now()); + commit->values[last_committed] = new_value; + commit->pn = accepted_pn; + commit->last_committed = last_committed; + + mon.send_mon_message(commit, *p); + } + + ceph_assert(g_conf()->paxos_kill_at != 9); + + // get ready for a new round. + new_value.clear(); + + // WRITING -> REFRESH + // among other things, this lets do_refresh() -> mon.bootstrap() -> + // wait_for_paxos_write() know that it doesn't need to flush the store + // queue. And it should not, as we are in the async completion thread now! 
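+  // (had Monitor::shutdown() raced with this commit, C_Committed would have + // called abort_commit() instead of us; so at this point the monitor is + // still live and we hold mon.lock.)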
+ ceph_assert(is_writing() || is_writing_previous()); + state = STATE_REFRESH; + ceph_assert(commits_started > 0); + --commits_started; + + if (do_refresh()) { + commit_proposal(); + if (mon.get_quorum().size() > 1) { + extend_lease(); + } + + ceph_assert(g_conf()->paxos_kill_at != 10); + + finish_round(); + } +} + + +void Paxos::handle_commit(MonOpRequestRef op) +{ + op->mark_paxos_event("handle_commit"); + auto commit = op->get_req<MMonPaxos>(); + dout(10) << "handle_commit on " << commit->last_committed << dendl; + + logger->inc(l_paxos_commit); + + if (!mon.is_peon()) { + dout(10) << "not a peon, dropping" << dendl; + ceph_abort(); + return; + } + + op->mark_paxos_event("store_state"); + store_state(commit); + + (void)do_refresh(); +} + +void Paxos::extend_lease() +{ + ceph_assert(mon.is_leader()); + //assert(is_active()); + + lease_expire = ceph::real_clock::now(); + lease_expire += ceph::make_timespan(g_conf()->mon_lease); + acked_lease.clear(); + acked_lease.insert(mon.rank); + + dout(7) << "extend_lease now+" << g_conf()->mon_lease + << " (" << lease_expire << ")" << dendl; + + // bcast + for (auto p = mon.get_quorum().begin(); + p != mon.get_quorum().end(); ++p) { + + if (*p == mon.rank) continue; + MMonPaxos *lease = new MMonPaxos(mon.get_epoch(), MMonPaxos::OP_LEASE, + ceph_clock_now()); + lease->last_committed = last_committed; + lease->lease_timestamp = utime_t{lease_expire}; + lease->first_committed = first_committed; + mon.send_mon_message(lease, *p); + } + + // set timeout event. + // if old timeout is still in place, leave it. + if (!lease_ack_timeout_event) { + lease_ack_timeout_event = mon.timer.add_event_after( + g_conf()->mon_lease_ack_timeout_factor * g_conf()->mon_lease, + new C_MonContext{&mon, [this](int r) { + if (r == -ECANCELED) + return; + lease_ack_timeout(); + }}); + } + + // set renew event + auto at = lease_expire; + at -= ceph::make_timespan(g_conf()->mon_lease); + at += ceph::make_timespan(g_conf()->mon_lease_renew_interval_factor * + g_conf()->mon_lease); + lease_renew_event = mon.timer.add_event_at( + at, new C_MonContext{&mon, [this](int r) { + if (r == -ECANCELED) + return; + lease_renew_timeout(); + }}); +} + +void Paxos::warn_on_future_time(utime_t t, entity_name_t from) +{ + utime_t now = ceph_clock_now(); + if (t > now) { + utime_t diff = t - now; + if (diff > g_conf()->mon_clock_drift_allowed) { + utime_t warn_diff = now - last_clock_drift_warn; + if (warn_diff > + pow(g_conf()->mon_clock_drift_warn_backoff, clock_drift_warned)) { + mon.clog->warn() << "message from " << from << " was stamped " << diff + << "s in the future, clocks not synchronized"; + last_clock_drift_warn = ceph_clock_now(); + ++clock_drift_warned; + } + } + } + +} + +bool Paxos::do_refresh() +{ + bool need_bootstrap = false; + + // make sure we have the latest state loaded up + auto start = ceph::coarse_mono_clock::now(); + mon.refresh_from_paxos(&need_bootstrap); + auto end = ceph::coarse_mono_clock::now(); + + logger->inc(l_paxos_refresh); + logger->tinc(l_paxos_refresh_latency, to_timespan(end - start)); + + if (need_bootstrap) { + dout(10) << " doing requested bootstrap" << dendl; + mon.bootstrap(); + return false; + } + + return true; +} + +void Paxos::commit_proposal() +{ + dout(10) << __func__ << dendl; + ceph_assert(mon.is_leader()); + ceph_assert(is_refresh()); + + finish_contexts(g_ceph_context, committing_finishers); +} + +void Paxos::finish_round() +{ + dout(10) << __func__ << dendl; + ceph_assert(mon.is_leader()); + + // ok, now go active! 
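+  // wake the waiters in order: first those waiting for active, then + // readable, then writeable.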
+ state = STATE_ACTIVE; + + dout(20) << __func__ << " waiting_for_active" << dendl; + finish_contexts(g_ceph_context, waiting_for_active); + dout(20) << __func__ << " waiting_for_readable" << dendl; + finish_contexts(g_ceph_context, waiting_for_readable); + dout(20) << __func__ << " waiting_for_writeable" << dendl; + finish_contexts(g_ceph_context, waiting_for_writeable); + + dout(10) << __func__ << " done w/ waiters, state " << get_statename(state) << dendl; + + if (should_trim()) { + trim(); + } + + if (is_active() && pending_proposal) { + propose_pending(); + } +} + + +// peon +void Paxos::handle_lease(MonOpRequestRef op) +{ + op->mark_paxos_event("handle_lease"); + auto lease = op->get_req<MMonPaxos>(); + // sanity + if (!mon.is_peon() || + last_committed != lease->last_committed) { + dout(10) << "handle_lease i'm not a peon, or they're not the leader," + << " or the last_committed doesn't match, dropping" << dendl; + op->mark_paxos_event("invalid lease, ignore"); + return; + } + + warn_on_future_time(lease->sent_timestamp, lease->get_source()); + + // extend lease + if (auto new_expire = lease->lease_timestamp.to_real_time(); + lease_expire < new_expire) { + lease_expire = new_expire; + + auto now = ceph::real_clock::now(); + if (lease_expire < now) { + auto diff = now - lease_expire; + derr << "lease_expire from " << lease->get_source_inst() << " is " << diff << " seconds in the past; mons are probably laggy (or possibly clocks are too skewed)" << dendl; + } + } + + state = STATE_ACTIVE; + + dout(10) << "handle_lease on " << lease->last_committed + << " now " << lease_expire << dendl; + + // ack + MMonPaxos *ack = new MMonPaxos(mon.get_epoch(), MMonPaxos::OP_LEASE_ACK, + ceph_clock_now()); + ack->last_committed = last_committed; + ack->first_committed = first_committed; + ack->lease_timestamp = ceph_clock_now(); + encode(mon.session_map.feature_map, ack->feature_map); + lease->get_connection()->send_message(ack); + + // (re)set timeout event. + reset_lease_timeout(); + + // kick waiters + finish_contexts(g_ceph_context, waiting_for_active); + if (is_readable()) + finish_contexts(g_ceph_context, waiting_for_readable); +} + +void Paxos::handle_lease_ack(MonOpRequestRef op) +{ + op->mark_paxos_event("handle_lease_ack"); + auto ack = op->get_req<MMonPaxos>(); + int from = ack->get_source().num(); + + if (!lease_ack_timeout_event) { + dout(10) << "handle_lease_ack from " << ack->get_source() + << " -- stray (probably since revoked)" << dendl; + + } else if (acked_lease.count(from) == 0) { + acked_lease.insert(from); + if (ack->feature_map.length()) { + auto p = ack->feature_map.cbegin(); + FeatureMap& t = mon.quorum_feature_map[from]; + decode(t, p); + } + if (acked_lease == mon.get_quorum()) { + // yay! 
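+      // every quorum member has acked this lease round; there is nothing + // left to wait for, so cancel the ack timeout below.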
+ dout(10) << "handle_lease_ack from " << ack->get_source() + << " -- got everyone" << dendl; + mon.timer.cancel_event(lease_ack_timeout_event); + lease_ack_timeout_event = 0; + + + } else { + dout(10) << "handle_lease_ack from " << ack->get_source() + << " -- still need " + << mon.get_quorum().size() - acked_lease.size() + << " more" << dendl; + } + } else { + dout(10) << "handle_lease_ack from " << ack->get_source() + << " dup (lagging!), ignoring" << dendl; + } + + warn_on_future_time(ack->sent_timestamp, ack->get_source()); +} + +void Paxos::lease_ack_timeout() +{ + dout(1) << "lease_ack_timeout -- calling new election" << dendl; + ceph_assert(mon.is_leader()); + ceph_assert(is_active()); + logger->inc(l_paxos_lease_ack_timeout); + lease_ack_timeout_event = 0; + mon.bootstrap(); +} + +void Paxos::reset_lease_timeout() +{ + dout(20) << "reset_lease_timeout - setting timeout event" << dendl; + if (lease_timeout_event) + mon.timer.cancel_event(lease_timeout_event); + lease_timeout_event = mon.timer.add_event_after( + g_conf()->mon_lease_ack_timeout_factor * g_conf()->mon_lease, + new C_MonContext{&mon, [this](int r) { + if (r == -ECANCELED) + return; + lease_timeout(); + }}); +} + +void Paxos::lease_timeout() +{ + dout(1) << "lease_timeout -- calling new election" << dendl; + ceph_assert(mon.is_peon()); + logger->inc(l_paxos_lease_timeout); + lease_timeout_event = 0; + mon.bootstrap(); +} + +void Paxos::lease_renew_timeout() +{ + lease_renew_event = 0; + extend_lease(); +} + + +/* + * trim old states + */ +void Paxos::trim() +{ + ceph_assert(should_trim()); + version_t end = std::min(get_version() - g_conf()->paxos_min, + get_first_committed() + g_conf()->paxos_trim_max); + + if (first_committed >= end) + return; + + dout(10) << "trim to " << end << " (was " << first_committed << ")" << dendl; + + MonitorDBStore::TransactionRef t = get_pending_transaction(); + + for (version_t v = first_committed; v < end; ++v) { + dout(10) << "trim " << v << dendl; + t->erase(get_name(), v); + } + t->put(get_name(), "first_committed", end); + if (g_conf()->mon_compact_on_trim) { + dout(10) << " compacting trimmed range" << dendl; + t->compact_range(get_name(), stringify(first_committed - 1), stringify(end)); + } + + trimming = true; + queue_pending_finisher(new C_Trimmed(this)); +} + +/* + * return a globally unique, monotonically increasing proposal number + */ +version_t Paxos::get_new_proposal_number(version_t gt) +{ + if (last_pn < gt) + last_pn = gt; + + // update. make it unique among all monitors. 
+ last_pn /= 100; + last_pn++; + last_pn *= 100; + last_pn += (version_t)mon.rank; + + // write + auto t(std::make_shared<MonitorDBStore::Transaction>()); + t->put(get_name(), "last_pn", last_pn); + + dout(30) << __func__ << " transaction dump:\n"; + JSONFormatter f(true); + t->dump(&f); + f.flush(*_dout); + *_dout << dendl; + + logger->inc(l_paxos_new_pn); + + auto start = ceph::coarse_mono_clock::now(); + get_store()->apply_transaction(t); + auto end = ceph::coarse_mono_clock::now(); + + logger->tinc(l_paxos_new_pn_latency, to_timespan(end - start)); + + dout(10) << "get_new_proposal_number = " << last_pn << dendl; + return last_pn; +} + + +void Paxos::cancel_events() +{ + if (collect_timeout_event) { + mon.timer.cancel_event(collect_timeout_event); + collect_timeout_event = 0; + } + if (accept_timeout_event) { + mon.timer.cancel_event(accept_timeout_event); + accept_timeout_event = 0; + } + if (lease_renew_event) { + mon.timer.cancel_event(lease_renew_event); + lease_renew_event = 0; + } + if (lease_ack_timeout_event) { + mon.timer.cancel_event(lease_ack_timeout_event); + lease_ack_timeout_event = 0; + } + if (lease_timeout_event) { + mon.timer.cancel_event(lease_timeout_event); + lease_timeout_event = 0; + } +} + +void Paxos::shutdown() +{ + dout(10) << __func__ << " cancel all contexts" << dendl; + + state = STATE_SHUTDOWN; + + // discard pending transaction + pending_proposal.reset(); + + // Let store finish commits in progress + // XXX: I assume I can't use finish_contexts() because the store + // is going to trigger + unique_lock l{mon.lock, std::adopt_lock}; + shutdown_cond.wait(l, [this] { return commits_started <= 0; }); + // Monitor::shutdown() will unlock it + l.release(); + + finish_contexts(g_ceph_context, waiting_for_writeable, -ECANCELED); + finish_contexts(g_ceph_context, waiting_for_readable, -ECANCELED); + finish_contexts(g_ceph_context, waiting_for_active, -ECANCELED); + finish_contexts(g_ceph_context, pending_finishers, -ECANCELED); + finish_contexts(g_ceph_context, committing_finishers, -ECANCELED); + if (logger) + g_ceph_context->get_perfcounters_collection()->remove(logger); +} + +void Paxos::leader_init() +{ + cancel_events(); + new_value.clear(); + + // discard pending transaction + pending_proposal.reset(); + + reset_pending_committing_finishers(); + + logger->inc(l_paxos_start_leader); + + if (mon.get_quorum().size() == 1) { + state = STATE_ACTIVE; + return; + } + + state = STATE_RECOVERING; + lease_expire = {}; + dout(10) << "leader_init -- starting paxos recovery" << dendl; + collect(0); +} + +void Paxos::peon_init() +{ + cancel_events(); + new_value.clear(); + + state = STATE_RECOVERING; + lease_expire = {}; + dout(10) << "peon_init -- i am a peon" << dendl; + + // start a timer, in case the leader never manages to issue a lease + reset_lease_timeout(); + + // discard pending transaction + pending_proposal.reset(); + + // no chance to write now! 
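+  // bounce queued proposal callbacks with -EAGAIN; as a peon we can no + // longer drive them to a commit ourselves.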
+ reset_pending_committing_finishers(); + finish_contexts(g_ceph_context, waiting_for_writeable, -EAGAIN); + + logger->inc(l_paxos_start_peon); +} + +void Paxos::restart() +{ + dout(10) << "restart -- canceling timeouts" << dendl; + cancel_events(); + new_value.clear(); + + if (is_writing() || is_writing_previous()) { + dout(10) << __func__ << " flushing" << dendl; + mon.lock.unlock(); + mon.store->flush(); + mon.lock.lock(); + dout(10) << __func__ << " flushed" << dendl; + } + state = STATE_RECOVERING; + + // discard pending transaction + pending_proposal.reset(); + + reset_pending_committing_finishers(); + finish_contexts(g_ceph_context, waiting_for_active, -EAGAIN); + + logger->inc(l_paxos_restart); +} + +void Paxos::reset_pending_committing_finishers() +{ + committing_finishers.splice(committing_finishers.end(), pending_finishers); + finish_contexts(g_ceph_context, committing_finishers, -EAGAIN); +} + +void Paxos::dispatch(MonOpRequestRef op) +{ + ceph_assert(op->is_type_paxos()); + op->mark_paxos_event("dispatch"); + + if (op->get_req()->get_type() != MSG_MON_PAXOS) { + dout(0) << "Got unexpected message type " << op->get_req()->get_type() + << " in Paxos::dispatch, aborting!" << dendl; + ceph_abort(); + } + + auto *req = op->get_req<MMonPaxos>(); + + // election in progress? + if (!mon.is_leader() && !mon.is_peon()) { + dout(5) << "election in progress, dropping " << *req << dendl; + return; + } + + // check sanity + ceph_assert(mon.is_leader() || + (mon.is_peon() && req->get_source().num() == mon.get_leader())); + + // NOTE: these ops are defined in messages/MMonPaxos.h + switch (req->op) { + // learner + case MMonPaxos::OP_COLLECT: + handle_collect(op); + break; + case MMonPaxos::OP_LAST: + handle_last(op); + break; + case MMonPaxos::OP_BEGIN: + handle_begin(op); + break; + case MMonPaxos::OP_ACCEPT: + handle_accept(op); + break; + case MMonPaxos::OP_COMMIT: + handle_commit(op); + break; + case MMonPaxos::OP_LEASE: + handle_lease(op); + break; + case MMonPaxos::OP_LEASE_ACK: + handle_lease_ack(op); + break; + default: + ceph_abort(); + } +} + + +// ----------------- +// service interface + +// -- READ -- + +bool Paxos::is_readable(version_t v) +{ + bool ret; + if (v > last_committed) + ret = false; + else + ret = + (mon.is_peon() || mon.is_leader()) && + (is_active() || is_updating() || is_writing()) && + last_committed > 0 && is_lease_valid(); // must have a value alone, or have lease + dout(5) << __func__ << " = " << (int)ret + << " - now=" << ceph_clock_now() + << " lease_expire=" << lease_expire + << " has v" << v << " lc " << last_committed + << dendl; + return ret; +} + +bool Paxos::read(version_t v, bufferlist &bl) +{ + if (!get_store()->get(get_name(), v, bl)) + return false; + return true; +} + +version_t Paxos::read_current(bufferlist &bl) +{ + if (read(last_committed, bl)) + return last_committed; + return 0; +} + + +bool Paxos::is_lease_valid() +{ + return ((mon.get_quorum().size() == 1) + || (ceph::real_clock::now() < lease_expire)); +} + +// -- WRITE -- + +bool Paxos::is_writeable() +{ + return + mon.is_leader() && + is_active() && + is_lease_valid(); +} + +void Paxos::propose_pending() +{ + ceph_assert(is_active()); + ceph_assert(pending_proposal); + + cancel_events(); + + bufferlist bl; + pending_proposal->encode(bl); + + dout(10) << __func__ << " " << (last_committed + 1) + << " " << bl.length() << " bytes" << dendl; + dout(30) << __func__ << " transaction dump:\n"; + JSONFormatter f(true); + pending_proposal->dump(&f); + f.flush(*_dout); + *_dout << dendl; + + 
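+  // the encoded transaction now lives in 'bl'; drop the pending proposal + // and hand its finishers over to the committing list before entering + // phase 2 (begin/accept).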
pending_proposal.reset(); + + committing_finishers.swap(pending_finishers); + state = STATE_UPDATING; + begin(bl); +} + +void Paxos::queue_pending_finisher(Context *onfinished) +{ + dout(5) << __func__ << " " << onfinished << dendl; + ceph_assert(onfinished); + pending_finishers.push_back(onfinished); +} + +MonitorDBStore::TransactionRef Paxos::get_pending_transaction() +{ + ceph_assert(mon.is_leader()); + if (!pending_proposal) { + pending_proposal.reset(new MonitorDBStore::Transaction); + ceph_assert(pending_finishers.empty()); + } + return pending_proposal; +} + +bool Paxos::trigger_propose() +{ + if (plugged) { + dout(10) << __func__ << " plugged, not proposing now" << dendl; + return false; + } else if (is_active()) { + dout(10) << __func__ << " active, proposing now" << dendl; + propose_pending(); + return true; + } else { + dout(10) << __func__ << " not active, will propose later" << dendl; + return false; + } +} + +bool Paxos::is_consistent() +{ + return (first_committed <= last_committed); +} + diff --git a/src/mon/Paxos.h b/src/mon/Paxos.h new file mode 100644 index 000000000..c197f26f7 --- /dev/null +++ b/src/mon/Paxos.h @@ -0,0 +1,1384 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +/* +time----> + +cccccccccccccccccca???????????????????????????????????????? +cccccccccccccccccca???????????????????????????????????????? +cccccccccccccccccca???????????????????????????????????????? leader +cccccccccccccccccc????????????????????????????????????????? +ccccc?????????????????????????????????????????????????????? + +last_committed + +pn_from +pn + +a 12v +b 12v +c 14v +d +e 12v +*/ + +/** + * Paxos storage layout and behavior + * + * Currently, we use a key/value store to hold all the Paxos-related data, but + * it can logically be depicted like this: + * + * paxos: + * first_committed -> 1 + * last_committed -> 4 + * 1 -> value_1 + * 2 -> value_2 + * 3 -> value_3 + * 4 -> value_4 + * + * Since we are relying on a k/v store supporting atomic transactions, we can + * guarantee that if 'last_committed' has a value of '4', then we have up to + * version 4 on the store, and no more than that; the same applies to + * 'first_committed', which, holding '1', strictly means that our lowest + * version is 1. + * + * Each version's value (value_1, value_2, ..., value_n) is a blob of data, + * incomprehensible to the Paxos. These values are proposed to the Paxos on + * propose_new_value() and each one is a transaction encoded in a ceph::buffer::list. + * + * The Paxos will write the value to disk, associating it with its version, + * but will take a step further: the value shall be decoded, and the operations + * on that transaction shall be applied during the same transaction that will + * write the value's encoded ceph::buffer::list to disk. This behavior ensures that + * whatever is being proposed will only be available on the store when it is + * applied by Paxos, which will then be aware of such new values, guaranteeing + * the store state is always consistent without requiring shady workarounds. 
+ * + * So, let's say that FooMonitor proposes the following transaction, neatly + * encoded in a ceph::buffer::list of course: + * + * Tx_Foo + * put(foo, last_committed, 3) + * put(foo, 3, foo_value_3) + * erase(foo, 2) + * erase(foo, 1) + * put(foo, first_committed, 3) + * + * And knowing that Tx_Foo was proposed to the Paxos as a ceph::buffer::list + * (call it Tx_foo_bl), once it is ready to commit, and assuming we are now + * committing version 5 of the Paxos, we will do something along the lines of: + * + * Tx proposed_tx; + * proposed_tx.decode(Tx_foo_bl); + * + * Tx our_tx; + * our_tx.put(paxos, last_committed, 5); + * our_tx.put(paxos, 5, Tx_foo_bl); + * our_tx.append(proposed_tx); + * + * store_apply(our_tx); + * + * And the store should look like this after we apply 'our_tx': + * + * paxos: + * first_committed -> 1 + * last_committed -> 5 + * 1 -> value_1 + * 2 -> value_2 + * 3 -> value_3 + * 4 -> value_4 + * 5 -> Tx_foo_bl + * foo: + * first_committed -> 3 + * last_committed -> 3 + * 3 -> foo_value_3 + * + */ + +#ifndef CEPH_MON_PAXOS_H +#define CEPH_MON_PAXOS_H + +#include "include/types.h" +#include "mon_types.h" +#include "include/buffer.h" +#include "msg/msg_types.h" +#include "include/Context.h" +#include "common/perf_counters.h" +#include <errno.h> + +#include "MonitorDBStore.h" +#include "mon/MonOpRequest.h" + +class Monitor; +class MMonPaxos; + +enum { + l_paxos_first = 45800, + l_paxos_start_leader, + l_paxos_start_peon, + l_paxos_restart, + l_paxos_refresh, + l_paxos_refresh_latency, + l_paxos_begin, + l_paxos_begin_keys, + l_paxos_begin_bytes, + l_paxos_begin_latency, + l_paxos_commit, + l_paxos_commit_keys, + l_paxos_commit_bytes, + l_paxos_commit_latency, + l_paxos_collect, + l_paxos_collect_keys, + l_paxos_collect_bytes, + l_paxos_collect_latency, + l_paxos_collect_uncommitted, + l_paxos_collect_timeout, + l_paxos_accept_timeout, + l_paxos_lease_ack_timeout, + l_paxos_lease_timeout, + l_paxos_store_state, + l_paxos_store_state_keys, + l_paxos_store_state_bytes, + l_paxos_store_state_latency, + l_paxos_share_state, + l_paxos_share_state_keys, + l_paxos_share_state_bytes, + l_paxos_new_pn, + l_paxos_new_pn_latency, + l_paxos_last, +}; + + +// i am one state machine. +/** + * This library is based on the Paxos algorithm, but varies in a few key ways: + * 1- Only a single new value is generated at a time, simplifying the recovery logic. + * 2- Nodes track "committed" values, and share them generously (and trustingly). + * 3- A 'leasing' mechanism is built-in, allowing nodes to determine when it is + * safe to "read" their copy of the last committed value. + * + * This provides a simple replication substrate that services can be built on top of. + * See PaxosService.h + */ +class Paxos { + /** + * @defgroup Paxos_h_class Paxos + * @{ + */ + /** + * The Monitor with which this Paxos class is associated. + */ + Monitor &mon; + + /// perf counter for internal instrumentations + PerfCounters *logger; + + void init_logger(); + + // my state machine info + const std::string paxos_name; + + friend class Monitor; + friend class PaxosService; + + std::list<std::string> extra_state_dirs; + + // LEADER+PEON + + // -- generic state -- +public: + /** + * @defgroup Paxos_h_states States in which the leader/peon may be. + * @{ + */ + enum { + /** + * Leader/Peon is in Paxos' Recovery state + */ + STATE_RECOVERING, + /** + * Leader/Peon is idle, and the Peon may or may not have a valid lease. 
 + */ + STATE_ACTIVE, + /** + * Leader/Peon is updating to a new value. + */ + STATE_UPDATING, + /* + * Leader proposing an old value + */ + STATE_UPDATING_PREVIOUS, + /* + * Leader/Peon is writing a new commit. readable, but not + * writeable. + */ + STATE_WRITING, + /* + * Leader/Peon is writing a new commit from a previous round. + */ + STATE_WRITING_PREVIOUS, + // leader: refresh following a commit + STATE_REFRESH, + // Shutdown after WRITING or WRITING_PREVIOUS + STATE_SHUTDOWN + }; + + /** + * Obtain state name from constant value. + * + * @note This function returns "UNKNOWN" if @p s is not + * a valid state value. + * + * @param s State value. + * @return The state's name. + */ + static const std::string get_statename(int s) { + switch (s) { + case STATE_RECOVERING: + return "recovering"; + case STATE_ACTIVE: + return "active"; + case STATE_UPDATING: + return "updating"; + case STATE_UPDATING_PREVIOUS: + return "updating-previous"; + case STATE_WRITING: + return "writing"; + case STATE_WRITING_PREVIOUS: + return "writing-previous"; + case STATE_REFRESH: + return "refresh"; + case STATE_SHUTDOWN: + return "shutdown"; + default: + return "UNKNOWN"; + } + } + +private: + /** + * The state we are in. + */ + int state; + /** + * @} + */ + int commits_started = 0; + + ceph::condition_variable shutdown_cond; + +public: + /** + * Check if we are recovering. + * + * @return 'true' if we are on the Recovering state; 'false' otherwise. + */ + bool is_recovering() const { return (state == STATE_RECOVERING); } + /** + * Check if we are active. + * + * @return 'true' if we are on the Active state; 'false' otherwise. + */ + bool is_active() const { return state == STATE_ACTIVE; } + /** + * Check if we are updating. + * + * @return 'true' if we are on the Updating state; 'false' otherwise. + */ + bool is_updating() const { return state == STATE_UPDATING; } + + /** + * Check if we are updating/proposing a previous value from a + * previous quorum + */ + bool is_updating_previous() const { return state == STATE_UPDATING_PREVIOUS; } + + /// @return 'true' if we are writing an update to disk + bool is_writing() const { return state == STATE_WRITING; } + + /// @return 'true' if we are writing an update-previous to disk + bool is_writing_previous() const { return state == STATE_WRITING_PREVIOUS; } + + /// @return 'true' if we are refreshing an update just committed + bool is_refresh() const { return state == STATE_REFRESH; } + + /// @return 'true' if we are in the process of shutting down + bool is_shutdown() const { return state == STATE_SHUTDOWN; } + +private: + /** + * @defgroup Paxos_h_recovery_vars Common recovery-related member variables + * @note These variables are common to both the Leader and the Peons. + * @{ + */ + /** + * Our first committed version (the lowest version we still keep in the store). + */ + version_t first_committed; + /** + * Last Proposal Number + * + * @todo Expand description + */ + version_t last_pn; + /** + * Last committed value's version. + * + * On both the Leader and the Peons, this is the version of the last value + * that was accepted by a given quorum and thus committed, as far as this + * instance knows. + * + * @note It may not be the last committed value's version throughout the + * system. If we are a Peon, we may not have been part of the quorum + * that accepted the value, and for this very same reason we may still + * be a (couple of) version(s) behind, until we learn about the most + * recent version.
This should only happen if we are not active (i.e., + * part of the quorum), which should not happen if we are up, running + * and able to communicate with others -- thus able to be part of the + * monmap and trigger new elections. + */ + version_t last_committed; + /** + * Last committed value's time. + * + * When the commit finished. + */ + utime_t last_commit_time; + /** + * The last Proposal Number we have accepted. + * + * On the Leader, it will be the Proposal Number picked by the Leader + * itself. On the Peon, however, it will be the proposal sent by the Leader + * and it will only be updated if its value is higher than the one + * already known by the Peon. + */ + version_t accepted_pn; + /** + * The last_committed epoch of the leader at the time we accepted the last pn. + * + * This has NO SEMANTIC MEANING, and is there only for the debug output. + */ + version_t accepted_pn_from; + /** + * Map holding the first committed version by each quorum member. + * + * The versions kept in this map are updated during the collect phase. + * When the Leader starts the collect phase, each Peon will reply with its + * first committed version, which will then be kept in this map. + */ + std::map<int,version_t> peer_first_committed; + /** + * Map holding the last committed version by each quorum member. + * + * The versions kept in this map are updated during the collect phase. + * When the Leader starts the collect phase, each Peon will reply with its + * last committed version, which will then be kept in this map. + */ + std::map<int,version_t> peer_last_committed; + /** + * @} + */ + + // active (phase 2) + /** + * @defgroup Paxos_h_active_vars Common active-related member variables + * @{ + */ + /** + * When our read lease expires. + * + * Instead of performing a full commit each time a read is requested, we + * keep leases. Each lease will have an expiration date, which may or may + * not be extended. + */ + ceph::real_clock::time_point lease_expire; + /** + * List of callbacks waiting for our state to change into STATE_ACTIVE. + */ + std::list<Context*> waiting_for_active; + /** + * List of callbacks waiting for the chance to read a version from us. + * + * Each entry on the list may result from an attempt to read a version that + * wasn't available at the time, or an attempt made during a period in + * which we could not satisfy the read request. The first case happens if + * the requested version is greater than our last committed version. The + * second scenario may happen if we are recovering, or if we don't have a + * valid lease. + * + * The list will be woken up once we change to STATE_ACTIVE with an extended + * lease -- which can be achieved if we have everyone in the quorum on board + * with the latest proposal, or if we don't really care about the remaining + * uncommitted values --, or if we're on a quorum of one. + */ + std::list<Context*> waiting_for_readable; + /** + * @} + */ + + // -- leader -- + // recovery (paxos phase 1) + /** + * @defgroup Paxos_h_leader_recovery Leader-specific Recovery-related vars + * @{ + */ + /** + * Number of replies to the collect phase we've received so far. + * + * This variable is reset to 1 each time we start a collect phase; it is + * incremented each time we receive a reply to the collect message, and + * is used to determine whether or not we have received replies from the + * whole quorum. + */ + unsigned num_last; + /** + * Uncommitted value's version.
 + * + * If we have, or end up knowing about, an uncommitted value, then its + * version will be kept in this variable. + * + * @note If this version equals @p last_committed+1 when we reach the final + * steps of recovery, then the algorithm will assume this is a value + * the Leader does not know about, and the Leader will trustingly + * propose this version's value. + */ + version_t uncommitted_v; + /** + * Uncommitted value's Proposal Number. + * + * We use this variable to assess if the Leader should take into consideration + * an uncommitted value sent by a Peon. Given that the Peon will send back to + * the Leader the last Proposal Number it accepted, the Leader will be able + * to infer if this value is more recent than the one the Leader has, thus + * more relevant. + */ + version_t uncommitted_pn; + /** + * Uncommitted Value. + * + * If the system fails in-between the accept replies from the Peons and the + * instruction to commit from the Leader, then we may end up with accepted + * but yet-uncommitted values. During the Leader's recovery, it will attempt + * to bring the whole system to the latest state, and that means committing + * past accepted but uncommitted values. + * + * This variable will hold an uncommitted value, which may either originate + * on the Leader, or be learnt by the Leader from a Peon during the collect + * phase. + */ + ceph::buffer::list uncommitted_value; + /** + * Used to specify when an on-going collect phase times out. + */ + Context *collect_timeout_event; + /** + * @} + */ + + // active + /** + * @defgroup Paxos_h_leader_active Leader-specific Active-related vars + * @{ + */ + /** + * Set of participants (Leader & Peons) that have acked a lease extension. + * + * Each Peon that acknowledges a lease extension will have its place in this + * set, which will be used to account for all the acks from all the quorum + * members, guaranteeing that we trigger new elections if some don't ack in + * the expected timeframe. + */ + std::set<int> acked_lease; + /** + * Callback responsible for extending the lease periodically. + */ + Context *lease_renew_event; + /** + * Callback to trigger new elections once the time for acks runs out. + */ + Context *lease_ack_timeout_event; + /** + * @} + */ + /** + * @defgroup Paxos_h_peon_active Peon-specific Active-related vars + * @{ + */ + /** + * Callback to trigger new elections when the Peon's lease times out. + * + * If the Peon's lease is extended, this callback will be reset (i.e., + * we cancel the event and reschedule a new one starting from the + * beginning). + */ + Context *lease_timeout_event; + /** + * @} + */ + + // updating (paxos phase 2) + /** + * @defgroup Paxos_h_leader_updating Leader-specific Updating-related vars + * @{ + */ + /** + * New Value being proposed to the Peons. + * + * This ceph::buffer::list holds the value the Leader is proposing to the Peons, and + * that will be committed if the Peons do accept the proposal. + */ + ceph::buffer::list new_value; + /** + * Set of participants (Leader & Peons) that accepted the new proposed value. + * + * This set is used to keep track of those who have accepted the proposed + * value, so the Leader may know when to issue a commit (when a majority of + * participants has accepted the proposal), and when to extend the lease + * (when all the quorum members have accepted the proposal). + */ + std::set<int> accepted; + /** + * Callback to trigger a new election if the proposal is not accepted by the + * full quorum within a given timeframe.
 + * + * If the full quorum does not accept the proposal, then it means that the + * Leader may no longer be recognized as the leader, or that the quorum has + * changed, and the value may not have reached all the participants. Thus, + * the leader must call new elections, and go through a recovery phase in + * order to propagate the new value throughout the system. + * + * This does not mean that we won't commit. We will commit as soon as we + * have a majority of acceptances. But if we do not have full acceptance + * from the quorum, then we cannot extend the lease, as some participants + * may not have the latest committed value. + */ + Context *accept_timeout_event; + + /** + * List of callbacks waiting for it to be possible to write again. + * + * @remarks It is not possible to write if we are not the Leader, or we are + * not in the active state, or if the lease has expired. + */ + std::list<Context*> waiting_for_writeable; + + /** + * Pending proposal transaction + * + * This is the transaction that is under construction and pending + * proposal. We will add operations to it until we decide it is + * time to start a paxos round. + */ + MonitorDBStore::TransactionRef pending_proposal; + + /** + * Finishers for pending transaction + * + * These are waiting for updates in the pending proposal/transaction + * to be committed. + */ + std::list<Context*> pending_finishers; + + /** + * Finishers for committing transaction + * + * When the pending_proposal is submitted, pending_finishers move to + * this list. When it commits, these finishers are notified. + */ + std::list<Context*> committing_finishers; + /** + * This function re-triggers pending_ and committing_finishers + * safely, so as to maintain existing system invariants. In particular + * we maintain ordering by triggering committing before pending, and + * we clear out pending_finishers prior to any triggers so that + * we don't trigger asserts on them being empty. You should + * use it instead of sending -EAGAIN to them with finish_contexts. + */ + void reset_pending_committing_finishers(); + + /** + * @defgroup Paxos_h_sync_warns Synchronization warnings + * @todo Describe these variables + * @{ + */ + utime_t last_clock_drift_warn; + int clock_drift_warned; + /** + * @} + */ + + /** + * Should be true if we have proposed to trim, or are in the middle of + * trimming; false otherwise. + */ + bool trimming; + + /** + * true if we want trigger_propose to *not* propose (yet) + */ + bool plugged = false; + + /** + * @defgroup Paxos_h_callbacks Callback classes. + * @{ + */ + /** + * Callback class responsible for handling a Collect Timeout. + */ + class C_CollectTimeout; + /** + * Callback class responsible for handling an Accept Timeout. + */ + class C_AcceptTimeout; + /** + * Callback class responsible for handling a Lease Ack Timeout. + */ + class C_LeaseAckTimeout; + + /** + * Callback class responsible for handling a Lease Timeout. + */ + class C_LeaseTimeout; + + /** + * Callback class responsible for handling a Lease Renew Timeout. + */ + class C_LeaseRenew; + + class C_Trimmed; + /** + * + */ +public: + class C_Proposal : public Context { + Context *proposer_context; + public: + ceph::buffer::list bl; + // for debug purposes. Will go away. Soon.
 + bool proposed; + utime_t proposal_time; + + C_Proposal(Context *c, ceph::buffer::list& proposal_bl) : + proposer_context(c), + bl(proposal_bl), + proposed(false), + proposal_time(ceph_clock_now()) + { } + + void finish(int r) override { + if (proposer_context) { + proposer_context->complete(r); + proposer_context = NULL; + } + } + }; + /** + * @} + */ +private: + /** + * @defgroup Paxos_h_election_triggered Steps triggered by an election. + * + * @note All these functions play a significant role in the Recovery Phase, + * which is triggered right after an election once someone becomes + * the Leader. + * @{ + */ + /** + * Create a new Proposal Number and propose it to the Peons. + * + * This function starts the Recovery Phase, which can be directly mapped + * onto the original Paxos' Prepare phase. Basically, we'll generate a + * Proposal Number, taking @p oldpn into consideration, and we will send + * it to a quorum, along with our first and last committed versions. By + * sending this information in a message to the quorum, we expect to + * obtain acceptances from a majority, allowing us to commit, or be + * informed of a higher Proposal Number known by one or more of the Peons + * in the quorum. + * + * @pre We are the Leader. + * @post Recovery Phase initiated by sending messages to the quorum. + * + * @param oldpn A proposal number taken as the highest known so far, which + * should be taken into consideration when generating a new + * Proposal Number for the Recovery Phase. + */ + void collect(version_t oldpn); + /** + * Handle the reception of a collect message from the Leader and reply + * accordingly. + * + * Once a Peon receives a collect message from the Leader it will reply + * with its first and last committed versions, as well as information so + * the Leader may know if its Proposal Number was, or was not, accepted by + * the Peon. The Peon will accept the Leader's Proposal Number if it is + * higher than the Peon's currently accepted Proposal Number. The Peon may + * also inform the Leader of accepted but uncommitted values. + * + * @invariant The message is an operation of type OP_COLLECT. + * @pre We are a Peon. + * @post Replied to the Leader, accepting or not accepting its PN. + * + * @param collect The collect message sent by the Leader to the Peon. + */ + void handle_collect(MonOpRequestRef op); + /** + * Handle a response from a Peon to the Leader's collect phase. + * + * The received message will state the Peon's last committed version, as + * well as its last proposal number. This will lead to one of the following + * scenarios: if the replied Proposal Number is equal to the one we proposed, + * then the Peon has accepted our proposal, and if all the Peons do accept + * our Proposal Number, then we are allowed to proceed with the commit; + * however, if a Peon replies with a higher Proposal Number, we assume it + * knows something we don't, and the Leader will have to abort the current + * proposal in order to retry with the Proposal Number specified by the Peon. + * It may also occur that the Peon replied with a lower Proposal Number, in + * which case we assume it is a reply to an older value and we'll simply + * drop it. + * This function will also check if the Peon replied with an accepted but + * yet uncommitted value. In this case, if its version is higher than our + * last committed value by one, we assume that the Peon knows a value from a + * previous proposal that has never been committed, and we should try to + * commit that value by proposing it next.
On the other hand, if that is + * not the case, we'll assume it is an old uncommitted value that we do not + * care about, and we'll consider the system active by extending the leases. + * + * @invariant The message is an operation of type OP_LAST. + * @pre We are the Leader. + * @post We initiate a commit, or we retry with a higher Proposal Number, + * or we drop the message. + * @post We move from STATE_RECOVERING to STATE_ACTIVE. + * + * @param last The message sent by the Peon to the Leader. + */ + void handle_last(MonOpRequestRef op); + /** + * The Recovery Phase timed out, meaning that a significant part of the + * quorum does not believe we are the Leader, and we thus should trigger new + * elections. + * + * @pre We believe ourselves to be the Leader. + * @post Trigger new elections. + */ + void collect_timeout(); + /** + * @} + */ + + /** + * @defgroup Paxos_h_updating_funcs Functions used during the Updating State + * + * These functions may easily be mapped to the original Paxos Algorithm's + * phases. + * + * Taking into account that the algorithm can be divided into 4 phases (Prepare, + * Promise, Accept Request and Accepted), we can easily map Paxos::begin to + * both the Prepare and Accept Request phases; the Paxos::handle_begin to + * the Promise phase; and the Paxos::handle_accept to the Accepted phase. + * @{ + */ + /** + * Start a new proposal with the intent of committing @p value. + * + * If we are alone in the system (i.e., a quorum of one), then we will + * simply commit the value, but if we are not alone, then we need to propose + * the value to the quorum. + * + * @pre We are the Leader + * @pre We are on STATE_ACTIVE + * @post We commit, if we are alone, or we send a message to each quorum + * member + * @post We are on STATE_ACTIVE, if we are alone, or on + * STATE_UPDATING otherwise + * + * @param value The value being proposed to the quorum + */ + void begin(ceph::buffer::list& value); + /** + * Accept or decline (by ignoring) a proposal from the Leader. + * + * We will decline the proposal (by ignoring it) if we have promised to + * accept a higher numbered proposal. If that is not the case, we will + * accept it and accordingly reply to the Leader. + * + * @pre We are a Peon + * @pre We are on STATE_ACTIVE + * @post We are on STATE_UPDATING if we accept the Leader's proposal + * @post We send a reply message to the Leader if we accept its proposal + * + * @invariant The received message is an operation of type OP_BEGIN + * + * @param begin The message sent by the Leader to the Peon during the + * Paxos::begin function + * + */ + void handle_begin(MonOpRequestRef op); + /** + * Handle an Accept message sent by a Peon. + * + * In order to commit, the Leader has to receive accepts from a majority of + * the quorum. If that does happen, then the Leader may proceed with the + * commit. However, the Leader needs the accepts from all the quorum members + * in order to extend the lease and move on to STATE_ACTIVE. + * + * This function handles these two situations, accounting for the amount of + * received accepts.
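+ *
+ * As a rough, illustrative sketch of the two thresholds just described
+ * (the names below are schematic assumptions, not the actual members):
+ *
+ *   accepted.insert(from);
+ *   if (accepted.size() * 2 > quorum_size)   // majority: safe to commit
+ *     commit_start();
+ *   if (accepted.size() == quorum_size)      // full quorum: extend lease
+ *     extend_lease();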
 + * + * @pre We are the Leader + * @pre We are on STATE_UPDATING + * @post We are on STATE_ACTIVE if we received accepts from the full quorum + * @post We extended the lease if we moved on to STATE_ACTIVE + * @post We are on STATE_UPDATING if we didn't receive accepts from the + * full quorum + * @post We have committed if we received accepts from a majority + * + * @invariant The received message is an operation of type OP_ACCEPT + * + * @param accept The message sent by the Peons to the Leader during the + * Paxos::handle_begin function + */ + void handle_accept(MonOpRequestRef op); + /** + * Trigger a fresh election. + * + * During Paxos::begin we set a Callback of type Paxos::C_AcceptTimeout in + * order to limit the amount of time we spend waiting for Accept replies. + * This callback will call Paxos::accept_timeout when it is fired. + * + * This is essential to the algorithm because there is a chance that + * we are no longer the Leader (i.e., others don't believe in us) and we + * are getting ignored, or we dropped out of the quorum and haven't realised + * it. So, our only option is to trigger fresh elections. + * + * @pre We are the Leader + * @pre We are on STATE_UPDATING + * @post Triggered fresh elections + */ + void accept_timeout(); + /** + * @} + */ + + + utime_t commit_start_stamp; + friend struct C_Committed; + + /** + * Commit a value throughout the system. + * + * The Leader will cancel the current lease (as it was for the old value), + * and will store the committed value locally. It will then instruct every + * quorum member to do so as well. + * + * @pre We are the Leader + * @pre We are on STATE_UPDATING + * @pre A majority of quorum members accepted our proposal + * @post Value locally stored + * @post Quorum members instructed to commit the new value. + */ + void commit_start(); + void commit_finish(); ///< finish a commit after txn becomes durable + void abort_commit(); ///< Handle commit finish after shutdown started + /** + * Commit the new value to stable storage as being the latest available + * version. + * + * @pre We are a Peon + * @post The new value is locally stored + * @post Fire up the callbacks waiting on waiting_for_commit + * + * @invariant The received message is an operation of type OP_COMMIT + * + * @param commit The message sent by the Leader to the Peon during + * Paxos::commit + */ + void handle_commit(MonOpRequestRef op); + /** + * Extend the system's lease. + * + * This means that the Leader considers that it should now be safe to read + * from any node in the system, since every quorum member is now in possession + * of the latest version. Therefore, the Leader will send a message stating just + * this to each quorum member, and will impose a limited timeframe during + * which acks will be accepted. If there aren't as many acks as expected + * (i.e., if at least one quorum member does not ack the lease) during this + * timeframe, then we will force fresh elections. + * + * @pre We are the Leader + * @pre We are on STATE_ACTIVE + * @post A message extending the lease is sent to each quorum member + * @post A timeout callback is set to limit the amount of time we will wait + * for lease acks. + * @post A timer is set in order to renew the lease after a certain amount + * of time. + */ + void extend_lease(); + /** + * Update the lease on the Peon's side of things.
 + * + * Once a Peon receives a Lease message, it will update its lease_expire + * variable, reply to the Leader acknowledging the lease update and set a + * timeout callback to be fired upon the lease's expiration. Finally, the + * Peon will fire up all the callbacks waiting for it to become active, + * which it just did, and all those waiting for it to become readable, + * which should be true if the Peon's lease didn't expire in the meantime. + * + * @pre We are a Peon + * @post We update the lease accordingly + * @post A lease timeout callback is set + * @post Move to STATE_ACTIVE + * @post Fire up all the callbacks waiting for STATE_ACTIVE + * @post Fire up all the callbacks waiting for readable if we are readable + * @post Ack the lease to the Leader + * + * @invariant The received message is an operation of type OP_LEASE + * + * @param lease The message sent by the Leader to the Peon during the + * Paxos::extend_lease function + */ + void handle_lease(MonOpRequestRef op); + /** + * Account for all the Lease Acks the Leader receives from the Peons. + * + * Once the Leader receives all the Lease Acks from the Peons, it will be + * able to cancel the Lease Ack timeout callback, thus avoiding calling + * fresh elections. + * + * @pre We are the Leader + * @post Cancel the Lease Ack timeout callback if we receive acks from all + * the quorum members + * + * @invariant The received message is an operation of type OP_LEASE_ACK + * + * @param ack The message sent by a Peon to the Leader during the + * Paxos::handle_lease function + */ + void handle_lease_ack(MonOpRequestRef op); + /** + * Call fresh elections because at least one Peon didn't ack our lease. + * + * @pre We are the Leader + * @pre We are on STATE_ACTIVE + * @post Trigger fresh elections + */ + void lease_ack_timeout(); + /** + * Extend lease since we haven't had new committed values meanwhile. + * + * @pre We are the Leader + * @pre We are on STATE_ACTIVE + * @post Go through with Paxos::extend_lease + */ + void lease_renew_timeout(); + /** + * Call fresh elections because the Peon's lease expired without being + * renewed or receiving a fresh lease. + * + * This means that the Peon is no longer assumed to be in the quorum + * (or there is no Leader to speak of), so just trigger fresh elections + * to circumvent this issue. + * + * @pre We are a Peon + * @post Trigger fresh elections + */ + void lease_timeout(); // on peon, if lease isn't extended + + /// restart the lease timeout timer + void reset_lease_timeout(); + + /** + * Cancel all of Paxos' timeout/renew events. + */ + void cancel_events(); + /** + * Shutdown this Paxos machine + */ + void shutdown(); + + /** + * Generate a new Proposal Number based on @p gt + * + * @todo Check what @p gt actually means and what its usage entails + * @param gt A hint for the generation of the Proposal Number + * @return A globally unique, monotonically increasing Proposal Number + */ + version_t get_new_proposal_number(version_t gt=0); + + /** + * @todo document sync function + */ + void warn_on_future_time(utime_t t, entity_name_t from); + + /** + * Begin proposing the pending_proposal. + */ + void propose_pending(); + + /** + * refresh state from store + * + * Called when we have new state for the mon to consume. If we return false, + * abort (we triggered a bootstrap).
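+ *
+ * A hedged sketch of how the commit path might consume the result (the
+ * calls exist below; the exact sequence is illustrative, not verbatim):
+ *
+ *   if (!do_refresh())
+ *     return;           // bootstrap triggered; abandon this round
+ *   commit_proposal();
+ *   finish_round();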
 + * + * @returns true on success, false if we are now bootstrapping + */ + bool do_refresh(); + + void commit_proposal(); + void finish_round(); + +public: + /** + * @param m A monitor + * @param name A name for the paxos service. It serves as the namespace + * of the underlying persistent storage for this service. + */ + Paxos(Monitor &m, const std::string &name) + : mon(m), + logger(NULL), + paxos_name(name), + state(STATE_RECOVERING), + first_committed(0), + last_pn(0), + last_committed(0), + accepted_pn(0), + accepted_pn_from(0), + num_last(0), + uncommitted_v(0), uncommitted_pn(0), + collect_timeout_event(0), + lease_renew_event(0), + lease_ack_timeout_event(0), + lease_timeout_event(0), + accept_timeout_event(0), + clock_drift_warned(0), + trimming(false) { } + + ~Paxos() { + delete logger; + } + + const std::string get_name() const { + return paxos_name; + } + + void dispatch(MonOpRequestRef op); + + void read_and_prepare_transactions(MonitorDBStore::TransactionRef tx, + version_t from, version_t last); + + void init(); + + /** + * dump state info to a formatter + */ + void dump_info(ceph::Formatter *f); + + /** + * This function runs basic consistency checks. Importantly, if + * it is inconsistent and shouldn't be, it asserts out. + * + * @return True if consistent, false if not. + */ + bool is_consistent(); + + void restart(); + /** + * Initiate the Leader after it wins an election. + * + * Once an election is won, the Leader will be initiated and there are two + * possible outcomes of this method: the Leader directly jumps to the active + * state (STATE_ACTIVE) if it believes it is the only one in the quorum, or + * will start recovering (STATE_RECOVERING) by initiating the collect phase. + * + * @pre Our monitor is the Leader. + * @post We are either on STATE_ACTIVE if we're the only one in the quorum, + * or on STATE_RECOVERING otherwise. + */ + void leader_init(); + /** + * Initiate a Peon after it loses an election. + * + * If we are a Peon, then there must be a Leader and we are not alone in the + * quorum, thus automatically assume we are on STATE_RECOVERING, which means + * we will soon be enrolled into the Leader's collect phase. + * + * @pre There is a Leader, and it's about to start the collect phase. + * @post We are on STATE_RECOVERING and will soon receive collect phase's + * messages. + */ + void peon_init(); + + /** + * Include an incremental state of values, ranging from peer_first_committed + * to the last committed value, on the message m + * + * @param m A message + * @param peer_first_committed Lowest version to take into account + * @param peer_last_committed Highest version to take into account + */ + void share_state(MMonPaxos *m, version_t peer_first_committed, + version_t peer_last_committed); + /** + * Store on disk a state that was shared with us + * + * Basically, we received a set of versions. Or just one. It doesn't matter. + * What matters is that we have to stash it in the store. So, we will simply + * write every single ceph::buffer::list into its own version on our side (i.e., + * onto paxos-related keys), and then we will decode those same ceph::buffer::lists + * we just wrote and apply the transactions they hold. We will also update + * our first and last committed values to point to the new values, if need + * be. All this is done tightly wrapped in a transaction to ensure we + * enjoy the atomicity guarantees given by our awesome k/v store.
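+ *
+ * Schematically, and only as an illustration (the container and variable
+ * names below are assumptions, not the real code), the wrapping looks like:
+ *
+ *   auto t(std::make_shared<MonitorDBStore::Transaction>());
+ *   for (auto& [v, bl] : received_versions) {
+ *     t->put(get_name(), v, bl);            // stash the raw value
+ *     decode_append_transaction(t, bl);     // and apply what it holds
+ *   }
+ *   t->put(get_name(), "last_committed", new_last_committed);
+ *   get_store()->apply_transaction(t);      // one atomic application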
 + * + * @param m A message + * @returns true if we stored something new; false otherwise + */ + bool store_state(MMonPaxos *m); + void _sanity_check_store(); + + /** + * Helper function to decode a ceph::buffer::list into a transaction and append it + * to another transaction. + * + * This function is used during the Leader's commit and during the + * Paxos::store_state in order to apply the ceph::buffer::list's transaction onto + * the store. + * + * @param t The transaction to which we will append the operations + * @param bl A ceph::buffer::list containing an encoded transaction + */ + static void decode_append_transaction(MonitorDBStore::TransactionRef t, + ceph::buffer::list& bl) { + auto vt(std::make_shared<MonitorDBStore::Transaction>()); + auto it = bl.cbegin(); + vt->decode(it); + t->append(vt); + } + + /** + * @todo This appears to be used only by the OSDMonitor, and I would say + * its objective is to allow a third-party to have a "private" + * state dir. -JL + */ + void add_extra_state_dir(std::string s) { + extra_state_dirs.push_back(s); + } + + // -- service interface -- + /** + * Add c to the list of callbacks waiting for us to become active. + * + * @param c A callback + */ + void wait_for_active(MonOpRequestRef op, Context *c) { + if (op) + op->mark_event("paxos:wait_for_active"); + waiting_for_active.push_back(c); + } + void wait_for_active(Context *c) { + MonOpRequestRef o; + wait_for_active(o, c); + } + + /** + * Trim the Paxos state as much as we can. + */ + void trim(); + + /** + * Check if we should trim. + * + * If trimming is disabled, we must take that into consideration and only + * return true if we are positively sure that we should trim soon. + * + * @returns true if we should trim; false otherwise. + */ + bool should_trim() { + int available_versions = get_version() - get_first_committed(); + int maximum_versions = g_conf()->paxos_min + g_conf()->paxos_trim_min; + + if (trimming || (available_versions <= maximum_versions)) + return false; + + return true; + } + + bool is_plugged() const { + return plugged; + } + void plug() { + ceph_assert(plugged == false); + plugged = true; + } + void unplug() { + ceph_assert(plugged == true); + plugged = false; + } + + // read + /** + * @defgroup Paxos_h_read_funcs Read-related functions + * @{ + */ + /** + * Get latest committed version + * + * @return latest committed version + */ + version_t get_version() { return last_committed; } + /** + * Get first committed version + * + * @return the first committed version + */ + version_t get_first_committed() { return first_committed; } + /** + * Check if a given version is readable. + * + * A version may not be readable for a myriad of reasons: + * @li the version @e v is higher than the last committed version + * @li we are neither the Leader nor a Peon (election may be on-going) + * @li we do not have a committed value yet + * @li we do not have a valid lease + * + * @param seen The version we want to check for readability. + * @return 'true' if the version is readable; 'false' otherwise.
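+ *
+ * A typical caller guards reads with this check and parks itself on the
+ * wait list otherwise -- sketched here with an assumed retry callback:
+ *
+ *   if (!is_readable(v)) {
+ *     wait_for_readable(op, new C_RetryMessage(this, op));
+ *     return;
+ *   }
+ *   read(v, bl);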
+ */ + bool is_readable(version_t seen=0); + /** + * Read version @e v and store its value in @e bl + * + * @param[in] v The version we want to read + * @param[out] bl The version's value + * @return 'true' if we successfully read the value; 'false' otherwise + */ + bool read(version_t v, ceph::buffer::list &bl); + /** + * Read the latest committed version + * + * @param[out] bl The version's value + * @return the latest committed version if we successfully read the value; + * or 0 (zero) otherwise. + */ + version_t read_current(ceph::buffer::list &bl); + /** + * Add onreadable to the list of callbacks waiting for us to become readable. + * + * @param onreadable A callback + */ + void wait_for_readable(MonOpRequestRef op, Context *onreadable) { + ceph_assert(!is_readable()); + if (op) + op->mark_event("paxos:wait_for_readable"); + waiting_for_readable.push_back(onreadable); + } + void wait_for_readable(Context *onreadable) { + MonOpRequestRef o; + wait_for_readable(o, onreadable); + } + /** + * @} + */ + + /** + * Check if we have a valid lease. + * + * @returns true if the lease is still valid; false otherwise. + */ + bool is_lease_valid(); + // write + /** + * @defgroup Paxos_h_write_funcs Write-related functions + * @{ + */ + /** + * Check if we are writeable. + * + * We are writeable if we are alone (i.e., a quorum of one), or if we match + * all the following conditions: + * @li We are the Leader + * @li We are on STATE_ACTIVE + * @li We have a valid lease + * + * @return 'true' if we are writeable; 'false' otherwise. + */ + bool is_writeable(); + /** + * Add c to the list of callbacks waiting for us to become writeable. + * + * @param c A callback + */ + void wait_for_writeable(MonOpRequestRef op, Context *c) { + ceph_assert(!is_writeable()); + if (op) + op->mark_event("paxos:wait_for_writeable"); + waiting_for_writeable.push_back(c); + } + void wait_for_writeable(Context *c) { + MonOpRequestRef o; + wait_for_writeable(o, c); + } + + /** + * Get a transaction to submit operations to propose against + * + * Apply operations to this transaction. It will eventually be proposed + * to paxos. + */ + MonitorDBStore::TransactionRef get_pending_transaction(); + + /** + * Queue a completion for the pending proposal + * + * This completion will get triggered when the pending proposal + * transaction commits. + */ + void queue_pending_finisher(Context *onfinished); + + /** + * (try to) trigger a proposal + * + * Tell paxos that it should submit the pending proposal. Note that if it + * is not active (e.g., because it is already in the midst of committing + * something) that will be deferred (e.g., until the current round finishes). + */ + bool trigger_propose(); + /** + * @} + */ + + /** + * @} + */ + protected: + MonitorDBStore *get_store(); +}; + +inline std::ostream& operator<<(std::ostream& out, Paxos::C_Proposal& p) +{ + std::string proposed = (p.proposed ? 
"proposed" : "unproposed"); + out << " " << proposed + << " queued " << (ceph_clock_now() - p.proposal_time) + << " tx dump:\n"; + auto t(std::make_shared<MonitorDBStore::Transaction>()); + auto p_it = p.bl.cbegin(); + t->decode(p_it); + ceph::JSONFormatter f(true); + t->dump(&f); + f.flush(out); + return out; +} + +#endif diff --git a/src/mon/PaxosFSMap.h b/src/mon/PaxosFSMap.h new file mode 100644 index 000000000..e32c44e0b --- /dev/null +++ b/src/mon/PaxosFSMap.h @@ -0,0 +1,54 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_PAXOS_FSMAP_H +#define CEPH_PAXOS_FSMAP_H + +#include "mds/FSMap.h" +#include "mds/MDSMap.h" + +#include "include/ceph_assert.h" + +class PaxosFSMap { +public: + virtual ~PaxosFSMap() {} + + const FSMap &get_pending_fsmap() const { ceph_assert(is_leader()); return pending_fsmap; } + const FSMap &get_fsmap() const { return fsmap; } + + virtual bool is_leader() const = 0; + +protected: + FSMap &get_pending_fsmap_writeable() { ceph_assert(is_leader()); return pending_fsmap; } + + FSMap &create_pending() { + ceph_assert(is_leader()); + pending_fsmap = fsmap; + pending_fsmap.epoch++; + return pending_fsmap; + } + + void decode(ceph::buffer::list &bl) { + fsmap.decode(bl); + pending_fsmap = FSMap(); /* nuke it to catch invalid access */ + } + +private: + /* Keep these PRIVATE to prevent unprotected manipulation. */ + FSMap fsmap; /* the current epoch */ + FSMap pending_fsmap; /* the next epoch */ +}; + + +#endif diff --git a/src/mon/PaxosService.cc b/src/mon/PaxosService.cc new file mode 100644 index 000000000..0a6a9a9ea --- /dev/null +++ b/src/mon/PaxosService.cc @@ -0,0 +1,466 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "PaxosService.h" +#include "common/Clock.h" +#include "common/config.h" +#include "include/stringify.h" +#include "include/ceph_assert.h" +#include "mon/MonOpRequest.h" + +using std::ostream; +using std::string; + +using ceph::bufferlist; + +#define dout_subsys ceph_subsys_paxos +#undef dout_prefix +#define dout_prefix _prefix(_dout, mon, paxos, service_name, get_first_committed(), get_last_committed()) +static ostream& _prefix(std::ostream *_dout, Monitor &mon, Paxos &paxos, string service_name, + version_t fc, version_t lc) { + return *_dout << "mon." << mon.name << "@" << mon.rank + << "(" << mon.get_state_name() + << ").paxosservice(" << service_name << " " << fc << ".." 
<< lc << ") "; +} + +bool PaxosService::dispatch(MonOpRequestRef op) +{ + ceph_assert(op->is_type_service() || op->is_type_command()); + auto m = op->get_req<PaxosServiceMessage>(); + op->mark_event("psvc:dispatch"); + + dout(10) << __func__ << " " << m << " " << *m + << " from " << m->get_orig_source_inst() + << " con " << m->get_connection() << dendl; + + if (mon.is_shutdown()) { + return true; + } + + // make sure this message isn't forwarded from a previous election epoch + if (m->rx_election_epoch && + m->rx_election_epoch < mon.get_epoch()) { + dout(10) << " discarding forwarded message from previous election epoch " + << m->rx_election_epoch << " < " << mon.get_epoch() << dendl; + return true; + } + + // make sure the client is still connected. note that a proxied + // connection will be disconnected with a null message; don't drop + // those. also ignore loopback (e.g., log) messages. + if (m->get_connection() && + !m->get_connection()->is_connected() && + m->get_connection() != mon.con_self && + m->get_connection()->get_messenger() != NULL) { + dout(10) << " discarding message from disconnected client " + << m->get_source_inst() << " " << *m << dendl; + return true; + } + + // make sure our map is readable and up to date + if (!is_readable(m->version)) { + dout(10) << " waiting for paxos -> readable (v" << m->version << ")" << dendl; + wait_for_readable(op, new C_RetryMessage(this, op), m->version); + return true; + } + + // preprocess + if (preprocess_query(op)) + return true; // easy! + + // leader? + if (!mon.is_leader()) { + mon.forward_request_leader(op); + return true; + } + + // writeable? + if (!is_writeable()) { + dout(10) << " waiting for paxos -> writeable" << dendl; + wait_for_writeable(op, new C_RetryMessage(this, op)); + return true; + } + + // update + if (!prepare_update(op)) { + // no changes made. + return true; + } + + if (need_immediate_propose) { + dout(10) << __func__ << " forced immediate propose" << dendl; + need_immediate_propose = false; + propose_pending(); + return true; + } + + double delay = 0.0; + if (!should_propose(delay)) { + dout(10) << " not proposing" << dendl; + return true; + } + + if (delay == 0.0) { + propose_pending(); + return true; + } + + // delay a bit + if (!proposal_timer) { + /** + * Callback class used to propose the pending value once the proposal_timer + * fires up. 
 + */ + auto do_propose = new C_MonContext{&mon, [this](int r) { + proposal_timer = 0; + if (r >= 0) { + propose_pending(); + } else if (r == -ECANCELED || r == -EAGAIN) { + return; + } else { + ceph_abort_msg("bad return value for proposal_timer"); + } + }}; + dout(10) << " setting proposal_timer " << do_propose + << " with delay of " << delay << dendl; + proposal_timer = mon.timer.add_event_after(delay, do_propose); + } else { + dout(10) << " proposal_timer already set" << dendl; + } + return true; +} + +void PaxosService::refresh(bool *need_bootstrap) +{ + // update cached versions + cached_first_committed = mon.store->get(get_service_name(), first_committed_name); + cached_last_committed = mon.store->get(get_service_name(), last_committed_name); + + version_t new_format = get_value("format_version"); + if (new_format != format_version) { + dout(1) << __func__ << " upgraded, format " << format_version << " -> " << new_format << dendl; + on_upgrade(); + } + format_version = new_format; + + dout(10) << __func__ << dendl; + + update_from_paxos(need_bootstrap); +} + +void PaxosService::post_refresh() +{ + dout(10) << __func__ << dendl; + + post_paxos_update(); + + if (mon.is_peon() && !waiting_for_finished_proposal.empty()) { + finish_contexts(g_ceph_context, waiting_for_finished_proposal, -EAGAIN); + } +} + +bool PaxosService::should_propose(double& delay) +{ + // simple default policy: quick startup, then some damping. + if (get_last_committed() <= 1) { + delay = 0.0; + } else { + utime_t now = ceph_clock_now(); + if ((now - paxos.last_commit_time) > g_conf()->paxos_propose_interval) + delay = (double)g_conf()->paxos_min_wait; + else + delay = (double)(g_conf()->paxos_propose_interval + paxos.last_commit_time + - now); + } + return true; +} + + +void PaxosService::propose_pending() +{ + dout(10) << __func__ << dendl; + ceph_assert(have_pending); + ceph_assert(!proposing); + ceph_assert(mon.is_leader()); + ceph_assert(is_active()); + + if (proposal_timer) { + dout(10) << " canceling proposal_timer " << proposal_timer << dendl; + mon.timer.cancel_event(proposal_timer); + proposal_timer = NULL; + } + + /** + * @note What we contribute to the pending Paxos transaction is + * obtained by calling a function that must be implemented by + * the class implementing us. I.e., the function + * encode_pending will be the one responsible for encoding + * whatever is pending on the implementation class into a + * bufferlist, so we can then propose that as a value through + * Paxos. + */ + MonitorDBStore::TransactionRef t = paxos.get_pending_transaction(); + + if (should_stash_full()) + encode_full(t); + + encode_pending(t); + have_pending = false; + + if (format_version > 0) { + t->put(get_service_name(), "format_version", format_version); + } + + // apply to paxos + proposing = true; + /** + * Callback class used to mark us as active once a proposal finishes going + * through Paxos. + * + * We should wake people up *only* *after* we inform the service we + * just went active. And we should wake people up only once we finish + * going active. This is why we first go active, avoiding waking up the + * wrong people at the wrong time, such as waking up a C_RetryMessage + * before waking up a C_Active, thus ending up without a pending value.
 + */ + class C_Committed : public Context { + PaxosService *ps; + public: + explicit C_Committed(PaxosService *p) : ps(p) { } + void finish(int r) override { + ps->proposing = false; + if (r >= 0) + ps->_active(); + else if (r == -ECANCELED || r == -EAGAIN) + return; + else + ceph_abort_msg("bad return value for C_Committed"); + } + }; + paxos.queue_pending_finisher(new C_Committed(this)); + paxos.trigger_propose(); +} + +bool PaxosService::should_stash_full() +{ + version_t latest_full = get_version_latest_full(); + /* @note The first member of the condition is moot and it is here just for + * clarity's sake. The second member would end up returning true + * nonetheless because, in that event, + * latest_full == get_trim_to() == 0. + */ + return (!latest_full || + (latest_full <= get_trim_to()) || + (get_last_committed() - latest_full > (version_t)g_conf()->paxos_stash_full_interval)); +} + +void PaxosService::restart() +{ + dout(10) << __func__ << dendl; + if (proposal_timer) { + dout(10) << " canceling proposal_timer " << proposal_timer << dendl; + mon.timer.cancel_event(proposal_timer); + proposal_timer = 0; + } + + finish_contexts(g_ceph_context, waiting_for_finished_proposal, -EAGAIN); + + if (have_pending) { + discard_pending(); + have_pending = false; + } + proposing = false; + + on_restart(); +} + +void PaxosService::election_finished() +{ + dout(10) << __func__ << dendl; + + finish_contexts(g_ceph_context, waiting_for_finished_proposal, -EAGAIN); + + // make sure we update our state + _active(); +} + +void PaxosService::_active() +{ + if (is_proposing()) { + dout(10) << __func__ << " - proposing" << dendl; + return; + } + if (!is_active()) { + dout(10) << __func__ << " - not active" << dendl; + /** + * Callback used to make sure we call the PaxosService::_active function + * whenever a condition is fulfilled. + * + * This is used in multiple situations, from waiting for the Paxos to commit + * our proposed value, to waiting for the Paxos to become active once an + * election is finished. + */ + class C_Active : public Context { + PaxosService *svc; + public: + explicit C_Active(PaxosService *s) : svc(s) {} + void finish(int r) override { + if (r >= 0) + svc->_active(); + } + }; + wait_for_active_ctx(new C_Active(this)); + return; + } + dout(10) << __func__ << dendl; + + // create pending state? + if (mon.is_leader()) { + dout(7) << __func__ << " creating new pending" << dendl; + if (!have_pending) { + create_pending(); + have_pending = true; + } + + if (get_last_committed() == 0) { + // create initial state + create_initial(); + propose_pending(); + return; + } + } else { + dout(7) << __func__ << " we are not the leader, hence we propose nothing!" << dendl; + } + + // wake up anyone who came in while we were proposing. note that + // anyone waiting for the previous proposal to commit is no longer + // on this list; it is on Paxos's. + finish_contexts(g_ceph_context, waiting_for_finished_proposal, 0); + + if (mon.is_leader()) + upgrade_format(); + + // NOTE: it's possible that this will get called twice if we commit + // an old paxos value. Implementations should be mindful of that.
+ on_active(); +} + + +void PaxosService::shutdown() +{ + cancel_events(); + + if (proposal_timer) { + dout(10) << " canceling proposal_timer " << proposal_timer << dendl; + mon.timer.cancel_event(proposal_timer); + proposal_timer = 0; + } + + finish_contexts(g_ceph_context, waiting_for_finished_proposal, -EAGAIN); + + on_shutdown(); +} + +void PaxosService::maybe_trim() +{ + if (!is_writeable()) + return; + + const version_t first_committed = get_first_committed(); + version_t trim_to = get_trim_to(); + dout(20) << __func__ << " " << first_committed << "~" << trim_to << dendl; + + if (trim_to < first_committed) { + dout(10) << __func__ << " trim_to " << trim_to << " < first_committed " + << first_committed << dendl; + return; + } + + version_t to_remove = trim_to - first_committed; + const version_t trim_min = g_conf().get_val<version_t>("paxos_service_trim_min"); + if (trim_min > 0 && + to_remove < trim_min) { + dout(10) << __func__ << " trim_to " << trim_to << " would only trim " << to_remove + << " < paxos_service_trim_min " << trim_min << dendl; + return; + } + + to_remove = [to_remove, trim_to, this] { + const version_t trim_max = g_conf().get_val<version_t>("paxos_service_trim_max"); + if (trim_max == 0 || to_remove < trim_max) { + return to_remove; + } + if (to_remove < trim_max * 1.5) { + dout(10) << __func__ << " trim to " << trim_to << " would only trim " << to_remove + << " > paxos_service_trim_max, limiting to " << trim_max + << dendl; + return trim_max; + } + const version_t new_trim_max = (trim_max + to_remove) / 2; + const uint64_t trim_max_multiplier = g_conf().get_val<uint64_t>("paxos_service_trim_max_multiplier"); + if (trim_max_multiplier) { + return std::min(new_trim_max, trim_max * trim_max_multiplier); + } else { + return new_trim_max; + } + }(); + trim_to = first_committed + to_remove; + + dout(10) << __func__ << " trimming to " << trim_to << ", " << to_remove << " states" << dendl; + MonitorDBStore::TransactionRef t = paxos.get_pending_transaction(); + trim(t, first_committed, trim_to); + put_first_committed(t, trim_to); + cached_first_committed = trim_to; + + // let the service add any extra stuff + encode_trim_extra(t, trim_to); + + paxos.trigger_propose(); +} + +void PaxosService::trim(MonitorDBStore::TransactionRef t, + version_t from, version_t to) +{ + dout(10) << __func__ << " from " << from << " to " << to << dendl; + ceph_assert(from != to); + + for (version_t v = from; v < to; ++v) { + dout(20) << __func__ << " " << v << dendl; + t->erase(get_service_name(), v); + + string full_key = mon.store->combine_strings("full", v); + if (mon.store->exists(get_service_name(), full_key)) { + dout(20) << __func__ << " " << full_key << dendl; + t->erase(get_service_name(), full_key); + } + } + if (g_conf()->mon_compact_on_trim) { + dout(20) << " compacting prefix " << get_service_name() << dendl; + t->compact_range(get_service_name(), stringify(from - 1), stringify(to)); + t->compact_range(get_service_name(), + mon.store->combine_strings(full_prefix_name, from - 1), + mon.store->combine_strings(full_prefix_name, to)); + } +} + +void PaxosService::load_health() +{ + bufferlist bl; + mon.store->get("health", service_name, bl); + if (bl.length()) { + auto p = bl.cbegin(); + using ceph::decode; + decode(health_checks, p); + } +} diff --git a/src/mon/PaxosService.h b/src/mon/PaxosService.h new file mode 100644 index 000000000..93c5e7c81 --- /dev/null +++ b/src/mon/PaxosService.h @@ -0,0 +1,901 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// 
vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_PAXOSSERVICE_H +#define CEPH_PAXOSSERVICE_H + +#include "include/Context.h" +#include "Paxos.h" +#include "Monitor.h" +#include "MonitorDBStore.h" + +/** + * A Paxos Service is an abstraction that easily allows one to obtain an + * association between a Monitor and a Paxos class, in order to implement any + * service. + */ +class PaxosService { + /** + * @defgroup PaxosService_h_class Paxos Service + * @{ + */ + public: + /** + * The Monitor with which this class is associated + */ + Monitor &mon; + /** + * The Paxos instance with which this class is associated + */ + Paxos &paxos; + /** + * Our name. This will be associated with the class implementing us, and will + * be used mainly for store-related operations. + */ + std::string service_name; + /** + * If we have proposed or queued anything for proposal, this variable will be + * true until the proposal has finished. + */ + bool proposing; + + bool need_immediate_propose = false; + +protected: + /** + * Services implementing us used to depend on the Paxos version, back when + * each service would have a Paxos instance for itself. However, now we only + * have a single Paxos instance, shared by all the services. Each service now + * must keep its own version, if they so wish. This variable should be used + * for that purpose. + */ + version_t service_version; + + private: + /** + * Event callback responsible for proposing our pending value once a timer + * runs out and fires. + */ + Context *proposal_timer; + /** + * If the implementation class has anything pending to be proposed to Paxos, + * then have_pending should be true; otherwise, false. + */ + bool have_pending; + + /** + * health checks for this service + * + * Child must populate this during encode_pending() by calling encode_health(). + */ + health_check_map_t health_checks; +protected: + /** + * format of our state in leveldb, 0 for default + */ + version_t format_version; + +public: + const health_check_map_t& get_health_checks() const { + return health_checks; + } + + /** + * @defgroup PaxosService_h_callbacks Callback classes + * @{ + */ + /** + * Retry dispatching a given service message + * + * This callback class is used when we had to wait for some condition to + * become true while we were dispatching it. + * + * For instance, if the message's version isn't readable, according to Paxos, + * then we must wait for it to become readable. So, we just queue an + * instance of this class onto the Paxos::wait_for_readable function, and + * we will retry the whole dispatch again once the callback is fired.
 + */ + class C_RetryMessage : public C_MonOp { + PaxosService *svc; + public: + C_RetryMessage(PaxosService *s, MonOpRequestRef op_) : + C_MonOp(op_), svc(s) { } + void _finish(int r) override { + if (r == -EAGAIN || r >= 0) + svc->dispatch(op); + else if (r == -ECANCELED) + return; + else + ceph_abort_msg("bad C_RetryMessage return value"); + } + }; + + class C_ReplyOp : public C_MonOp { + Monitor &mon; + MonOpRequestRef op; + MessageRef reply; + public: + C_ReplyOp(PaxosService *s, MonOpRequestRef o, MessageRef r) : + C_MonOp(o), mon(s->mon), op(o), reply(r) { } + void _finish(int r) override { + if (r >= 0) { + mon.send_reply(op, reply.detach()); + } + } + }; + + /** + * @} + */ + + /** + * @param mn A Monitor instance + * @param p A Paxos instance + * @param name Our service's name. + */ + PaxosService(Monitor &mn, Paxos &p, std::string name) + : mon(mn), paxos(p), service_name(name), + proposing(false), + service_version(0), proposal_timer(0), have_pending(false), + format_version(0), + last_committed_name("last_committed"), + first_committed_name("first_committed"), + full_prefix_name("full"), full_latest_name("latest"), + cached_first_committed(0), cached_last_committed(0) + { + } + + virtual ~PaxosService() {} + + /** + * Get the service's name. + * + * @returns The service's name. + */ + const std::string& get_service_name() const { return service_name; } + + /** + * Get the store prefixes we utilize + */ + virtual void get_store_prefixes(std::set<std::string>& s) const { + s.insert(service_name); + } + + // i implement and you ignore + /** + * Informs this instance that it should consider itself restarted. + * + * This means that we will cancel our proposal_timer event, if any exists. + */ + void restart(); + /** + * Informs this instance that an election has finished. + * + * This means that we will invoke a PaxosService::discard_pending while + * setting have_pending to false (basically, ignore our pending state) and + * we will then make sure we obtain a new state. + * + * Our state shall be updated by PaxosService::_active if the Paxos is + * active; otherwise, we will wait for it to become active by adding a + * PaxosService::C_Active callback to it. + */ + void election_finished(); + /** + * Informs this instance that it is supposed to shut down. + * + * Basically, it will instruct Paxos to cancel all events/callbacks and then + * will cancel the proposal_timer event if any exists. + */ + void shutdown(); + +private: + /** + * Update our state by updating it from Paxos, and then creating a new + * pending state if need be. + * + * @remarks We only create a pending state when our Monitor is the Leader. + * + * @pre Paxos is active + * @post have_pending is true if our Monitor is the Leader and Paxos is + * active + */ + void _active(); + +public: + /** + * Propose a new value through Paxos. + * + * This function should be called by the classes implementing + * PaxosService, in order to propose a new value through Paxos. + * + * @pre The implementation class implements the encode_pending function. + * @pre have_pending is true + * @pre Our monitor is the Leader + * @pre Paxos is active + * @post Cancel the proposal timer, if any + * @post have_pending is false + * @post propose pending value through Paxos + * + * @note This function depends on the implementation of encode_pending on + * the class that is implementing PaxosService + */ + void propose_pending(); + + /** + * Let others request us to propose.
+   *
+   * At the moment, this is just a wrapper to propose_pending() with an
+   * extra check for is_writeable(), but it is good practice to dissociate
+   * requests for proposals from direct usage of propose_pending() for
+   * future use -- we might want to perform additional checks or put a
+   * request on hold, for instance.
+   */
+  void request_proposal() {
+    ceph_assert(is_writeable());
+
+    propose_pending();
+  }
+  /**
+   * Request service @p other to perform a proposal.
+   *
+   * We could simply use the function above, requesting @p other directly,
+   * but we might eventually want to do something to the request -- say,
+   * set a flag stating we're waiting on a cross-proposal to be finished.
+   */
+  void request_proposal(PaxosService *other) {
+    ceph_assert(other != NULL);
+    ceph_assert(other->is_writeable());
+
+    other->request_proposal();
+  }
+
+  /**
+   * Dispatch a message by passing it to several different functions that
+   * are either implemented directly by this service, or that should be
+   * implemented by the class implementing this service.
+   *
+   * @param op A message op
+   * @returns 'true' on successful dispatch; 'false' otherwise.
+   */
+  bool dispatch(MonOpRequestRef op);
+
+  void refresh(bool *need_bootstrap);
+  void post_refresh();
+
+  /**
+   * @defgroup PaxosService_h_override_funcs Functions that should be
+   *                                         overridden.
+   *
+   * These functions should be overridden at will by the class implementing
+   * this service.
+   * @{
+   */
+  /**
+   * Create the initial state for your system.
+   *
+   * In some of our services the state is actually set up elsewhere, so this
+   * does nothing.
+   */
+  virtual void create_initial() = 0;
+
+  /**
+   * Query the Paxos system for the latest state and apply it if it's newer
+   * than the current Monitor state.
+   */
+  virtual void update_from_paxos(bool *need_bootstrap) = 0;
+
+  /**
+   * Hook called after all services have refreshed their state from paxos
+   *
+   * This is useful for doing any update work that depends on other
+   * services having up-to-date state.
+   */
+  virtual void post_paxos_update() {}
+
+  /**
+   * Init on startup
+   *
+   * This is called on mon startup, after all of the PaxosService instances'
+   * update_from_paxos() methods have been called
+   */
+  virtual void init() {}
+
+  /**
+   * Create the pending state.
+   *
+   * @invariant This function is only called on a Leader.
+   * @remarks The created state is then modified by incoming messages.
+   * @remarks Called at startup and after every Paxos ratification round.
+   */
+  virtual void create_pending() = 0;
+
+  /**
+   * Encode the pending state into a ceph::buffer::list for ratification
+   * and transmission as the next state.
+   *
+   * @invariant This function is only called on a Leader.
+   *
+   * @param t The transaction to hold all changes.
+   */
+  virtual void encode_pending(MonitorDBStore::TransactionRef t) = 0;
+
+  /**
+   * Discard the pending state
+   *
+   * @invariant This function is only called on a Leader.
+   *
+   * @remarks This function is NOT overridden in any of our code, but it is
+   *          called in PaxosService::election_finished if have_pending is
+   *          true.
+   */
+  virtual void discard_pending() { }
+
+  /**
+   * Look at the query; if the query can be handled without changing state,
+   * do so.
+   *
+   * @param op A query message op
+   * @returns 'true' if the query was handled (e.g., was a read that got
+   *          answered, was a state change that has no effect); 'false'
+   *          otherwise.
+   */
+  virtual bool preprocess_query(MonOpRequestRef op) = 0;
+
+  /**
+   * Apply the message to the pending state.
+   *
+   * @invariant This function is only called on a Leader.
+   *
+   * @param op An update message op
+   * @returns 'true' if the update message was handled (e.g., a command
+   *          that went through); 'false' otherwise.
+   */
+  virtual bool prepare_update(MonOpRequestRef op) = 0;
+  /**
+   * @}
+   */
+
+  /**
+   * Determine if the Paxos system should vote on pending, and if so how
+   * long it should wait to vote.
+   *
+   * @param[out] delay The wait time, used so we can limit update traffic
+   *                   spamming.
+   * @returns 'true' if the Paxos system should propose; 'false' otherwise.
+   */
+  virtual bool should_propose(double &delay);
+
+  /**
+   * Force an immediate propose.
+   *
+   * This is meant to be called from prepare_update(op).
+   */
+  void force_immediate_propose() {
+    need_immediate_propose = true;
+  }
+
+  /**
+   * @defgroup PaxosService_h_courtesy Courtesy functions
+   *
+   * Courtesy functions, in case the class implementing this service has
+   * anything it wants/needs to do at these times.
+   * @{
+   */
+  /**
+   * This is called when the Paxos state goes to active.
+   *
+   * On the peon, this is after each election.
+   * On the leader, this is after each election, *and* after each completed
+   * proposal.
+   *
+   * @note This function may get called twice in certain recovery cases.
+   */
+  virtual void on_active() { }
+
+  /**
+   * This is called when we are shutting down
+   */
+  virtual void on_shutdown() {}
+
+  /**
+   * This is called when activating on the leader.
+   *
+   * It should conditionally upgrade the on-disk format by proposing a
+   * transaction.
+   */
+  virtual void upgrade_format() { }
+
+  /**
+   * This is called when we detect that the store has just been upgraded
+   * underneath us.
+   */
+  virtual void on_upgrade() {}
+
+  /**
+   * Called when the Paxos system enters a Leader election.
+   *
+   * @remarks It's a courtesy method, in case the class implementing this
+   *          service has anything it wants/needs to do at that time.
+   */
+  virtual void on_restart() { }
+  /**
+   * @}
+   */
+
+  /**
+   * Tick.
+   */
+  virtual void tick() {}
+
+  void encode_health(const health_check_map_t& next,
+                     MonitorDBStore::TransactionRef t) {
+    using ceph::encode;
+    ceph::buffer::list bl;
+    encode(next, bl);
+    t->put("health", service_name, bl);
+    mon.log_health(next, health_checks, t);
+  }
+  void load_health();
+
+  /**
+   * @defgroup PaxosService_h_store_keys Set of keys that are usually used
+   *                                     by all the services implementing
+   *                                     this class; being almost the only
+   *                                     keys used, they should be
+   *                                     standardized to avoid mistakes.
+   * @{
+   */
+  const std::string last_committed_name;
+  const std::string first_committed_name;
+  const std::string full_prefix_name;
+  const std::string full_latest_name;
+  /**
+   * @}
+   */
+
+ private:
+  /**
+   * @defgroup PaxosService_h_version_cache Variables holding cached values
+   *                                        for the most used versions (first
+   *                                        and last committed); we only have
+   *                                        to read them when the store is
+   *                                        updated, so in-between updates we
+   *                                        may very well use cached versions
+   *                                        and avoid the overhead.
+   * @{
+   */
+  version_t cached_first_committed;
+  version_t cached_last_committed;
+  /**
+   * @}
+   */
+
+  /**
+   * Callback list to be used whenever we are running a proposal through
+   * Paxos. These callbacks will be awoken whenever said proposal finishes.
+   */
+  std::list<Context*> waiting_for_finished_proposal;
+
+ public:
+
+  /**
+   * Check if we are proposing a value through Paxos
+   *
+   * @returns true if we are proposing; false otherwise.
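+   *
+   * Sketch of the usual caller-side pattern (hypothetical service code,
+   * using only names declared in this header): requeue an op until the
+   * in-flight proposal completes:
+   * @code
+   * if (is_proposing()) {
+   *   wait_for_finished_proposal(op, new C_RetryMessage(this, op));
+   *   return true;
+   * }
+   * @endcode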
+   */
+  bool is_proposing() const {
+    return proposing;
+  }
+
+  /**
+   * Check if we are in the Paxos ACTIVE state.
+   *
+   * @note This function is a wrapper for Paxos::is_active
+   *
+   * @returns true if in state ACTIVE; false otherwise.
+   */
+  bool is_active() const {
+    return
+      !is_proposing() &&
+      (paxos.is_active() || paxos.is_updating() || paxos.is_writing());
+  }
+
+  /**
+   * Check if we are readable.
+   *
+   * This mirrors the paxos check, except that we also verify that
+   *
+   * - the client hasn't seen the future relative to this PaxosService
+   * - this service isn't proposing
+   * - we have committed our initial state (last_committed > 0)
+   *
+   * @param ver The version we want to check is readable
+   * @returns true if it is readable; false otherwise
+   */
+  bool is_readable(version_t ver = 0) const {
+    if (ver > get_last_committed() ||
+        !paxos.is_readable(0) ||
+        get_last_committed() == 0)
+      return false;
+    return true;
+  }
+
+  /**
+   * Check if we are writeable.
+   *
+   * We consider ourselves writeable iff:
+   *
+   * - we are not proposing a new version;
+   * - we are ready to be written to -- i.e., we have a pending value;
+   * - paxos is (active or updating or writing or refresh)
+   *
+   * @returns true if writeable; false otherwise
+   */
+  bool is_writeable() const {
+    return is_active() && have_pending;
+  }
+
+  /**
+   * Wait for a proposal to finish.
+   *
+   * Add a callback to be awoken whenever our current proposal finishes
+   * being proposed through Paxos.
+   *
+   * @param c The callback to be awoken once the proposal is finished.
+   */
+  void wait_for_finished_proposal(MonOpRequestRef op, Context *c) {
+    if (op)
+      op->mark_event(service_name + ":wait_for_finished_proposal");
+    waiting_for_finished_proposal.push_back(c);
+  }
+  void wait_for_finished_proposal_ctx(Context *c) {
+    MonOpRequestRef o;
+    wait_for_finished_proposal(o, c);
+  }
+
+  /**
+   * Wait for us to become active
+   *
+   * @param c The callback to be awoken once we become active.
+   */
+  void wait_for_active(MonOpRequestRef op, Context *c) {
+    if (op)
+      op->mark_event(service_name + ":wait_for_active");
+
+    if (!is_proposing()) {
+      paxos.wait_for_active(op, c);
+      return;
+    }
+    wait_for_finished_proposal(op, c);
+  }
+  void wait_for_active_ctx(Context *c) {
+    MonOpRequestRef o;
+    wait_for_active(o, c);
+  }
+
+  /**
+   * Wait for us to become readable
+   *
+   * @param c The callback to be awoken once we become readable.
+   * @param ver The version we want to wait on.
+   */
+  void wait_for_readable(MonOpRequestRef op, Context *c, version_t ver = 0) {
+    /* This is somewhat of a hack. We only check whether a version is
+     * readable in PaxosService::dispatch(), but, nonetheless, we must make
+     * sure that if that is why we are not readable, then we must wait on
+     * PaxosService and not on Paxos; otherwise, we may assert on
+     * Paxos::wait_for_readable() if it happens to be readable at that
+     * specific point in time.
+     */
+    if (op)
+      op->mark_event(service_name + ":wait_for_readable");
+
+    if (is_proposing() ||
+        ver > get_last_committed() ||
+        get_last_committed() == 0)
+      wait_for_finished_proposal(op, c);
+    else {
+      if (op)
+        op->mark_event(service_name + ":wait_for_readable/paxos");
+
+      paxos.wait_for_readable(op, c);
+    }
+  }
+
+  void wait_for_readable_ctx(Context *c, version_t ver = 0) {
+    MonOpRequestRef o; // will initialize the shared_ptr to NULL
+    wait_for_readable(o, c, ver);
+  }
+
+  /**
+   * Wait for us to become writeable
+   *
+   * @param c The callback to be awoken once we become writeable.
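+   *
+   * Typical prepare-path use (sketch; hypothetical caller):
+   * @code
+   * if (!is_writeable()) {
+   *   wait_for_writeable(op, new C_RetryMessage(this, op));
+   *   return true;
+   * }
+   * @endcode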
+   */
+  void wait_for_writeable(MonOpRequestRef op, Context *c) {
+    if (op)
+      op->mark_event(service_name + ":wait_for_writeable");
+
+    if (is_proposing())
+      wait_for_finished_proposal(op, c);
+    else if (!is_writeable())
+      wait_for_active(op, c);
+    else
+      paxos.wait_for_writeable(op, c);
+  }
+  void wait_for_writeable_ctx(Context *c) {
+    MonOpRequestRef o;
+    wait_for_writeable(o, c);
+  }
+
+
+  /**
+   * @defgroup PaxosService_h_Trim Functions for trimming states
+   * @{
+   */
+  /**
+   * Trim service states if appropriate.
+   *
+   * Called at the same interval as tick()
+   */
+  void maybe_trim();
+
+  /**
+   * Auxiliary function to trim our state from version @p from to version
+   * @p to, not including; i.e., the half-open interval [from, to)
+   *
+   * @param t The transaction to which we will add the trim operations.
+   * @param from the lower limit of the interval to be trimmed
+   * @param to the upper limit of the interval to be trimmed (not included)
+   */
+  void trim(MonitorDBStore::TransactionRef t, version_t from, version_t to);
+
+  /**
+   * Encode service-specific extra bits into the trim transaction.
+   *
+   * @param tx transaction
+   * @param first new first_committed value
+   */
+  virtual void encode_trim_extra(MonitorDBStore::TransactionRef tx,
+                                 version_t first) {}
+
+  /**
+   * Get the version we should trim to.
+   *
+   * Should be overridden by the service if it wants to trim states.
+   *
+   * @returns the version we should trim to; if we return zero, it should be
+   *          assumed that there's no version to trim to.
+   */
+  virtual version_t get_trim_to() const {
+    return 0;
+  }
+
+  /**
+   * @}
+   */
+  /**
+   * @defgroup PaxosService_h_Stash_Full
+   * @{
+   */
+  virtual bool should_stash_full();
+  /**
+   * Encode a full version on @p t
+   *
+   * @note We force every service to implement this function, since we
+   *       strongly desire the encoding of full versions.
+   * @note Services that do not trim their state will be bound to create
+   *       only one full version. Full version stashing is
+   *       determined/controlled by trimming: we stash a version each time
+   *       a trim is bound to erase the latest full version.
+   *
+   * @param t Transaction on which the full version shall be encoded.
+   */
+  virtual void encode_full(MonitorDBStore::TransactionRef t) = 0;
+
+  /**
+   * @}
+   */
+
+  /**
+   * Cancel events.
+   *
+   * @note This function is a wrapper for Paxos::cancel_events
+   */
+  void cancel_events() {
+    paxos.cancel_events();
+  }
+
+  /**
+   * @defgroup PaxosService_h_store_funcs Back storage interface functions
+   * @{
+   */
+  /**
+   * @defgroup PaxosService_h_store_modify Wrapper function interface to
+   *                                       access the back store for
+   *                                       modification purposes
+   * @{
+   */
+  void put_first_committed(MonitorDBStore::TransactionRef t, version_t ver) {
+    t->put(get_service_name(), first_committed_name, ver);
+  }
+  /**
+   * Set the last committed version to @p ver
+   *
+   * @param t A transaction to which we add this put operation
+   * @param ver The last committed version number being put
+   */
+  void put_last_committed(MonitorDBStore::TransactionRef t, version_t ver) {
+    t->put(get_service_name(), last_committed_name, ver);
+
+    /* We only need to do this once, and that is when we are about to make
+     * our first proposal. There are some services that rely on
+     * first_committed being set -- and it should be! -- so we need to
+     * guarantee that it is, especially because the services do not set it
+     * themselves. They do rely on it, but they expect us to deal with it,
+     * and so we shall.
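+     *
+     * For reference, a minimal encode_pending() built on these helpers
+     * (a sketch for a hypothetical service, not actual Ceph code):
+     * @code
+     * void MyService::encode_pending(MonitorDBStore::TransactionRef t) {
+     *   ceph::buffer::list bl;
+     *   encode(pending_state, bl);                    // service-defined state
+     *   put_version(t, get_last_committed() + 1, bl); // value for the new epoch
+     *   put_last_committed(t, get_last_committed() + 1);
+     * }
+     * @endcode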
+ */ + if (!get_first_committed()) + put_first_committed(t, ver); + } + /** + * Put the contents of @p bl into version @p ver + * + * @param t A transaction to which we will add this put operation + * @param ver The version to which we will add the value + * @param bl A ceph::buffer::list containing the version's value + */ + void put_version(MonitorDBStore::TransactionRef t, version_t ver, + ceph::buffer::list& bl) { + t->put(get_service_name(), ver, bl); + } + /** + * Put the contents of @p bl into a full version key for this service, that + * will be created with @p ver in mind. + * + * @param t The transaction to which we will add this put operation + * @param ver A version number + * @param bl A ceph::buffer::list containing the version's value + */ + void put_version_full(MonitorDBStore::TransactionRef t, + version_t ver, ceph::buffer::list& bl) { + std::string key = mon.store->combine_strings(full_prefix_name, ver); + t->put(get_service_name(), key, bl); + } + /** + * Put the version number in @p ver into the key pointing to the latest full + * version of this service. + * + * @param t The transaction to which we will add this put operation + * @param ver A version number + */ + void put_version_latest_full(MonitorDBStore::TransactionRef t, version_t ver) { + std::string key = mon.store->combine_strings(full_prefix_name, full_latest_name); + t->put(get_service_name(), key, ver); + } + /** + * Put the contents of @p bl into the key @p key. + * + * @param t A transaction to which we will add this put operation + * @param key The key to which we will add the value + * @param bl A ceph::buffer::list containing the value + */ + void put_value(MonitorDBStore::TransactionRef t, + const std::string& key, ceph::buffer::list& bl) { + t->put(get_service_name(), key, bl); + } + + /** + * Put integer value @v into the key @p key. + * + * @param t A transaction to which we will add this put operation + * @param key The key to which we will add the value + * @param v An integer + */ + void put_value(MonitorDBStore::TransactionRef t, + const std::string& key, version_t v) { + t->put(get_service_name(), key, v); + } + + /** + * @} + */ + + /** + * @defgroup PaxosService_h_store_get Wrapper function interface to access + * the back store for reading purposes + * @{ + */ + + /** + * @defgroup PaxosService_h_version_cache Obtain cached versions for this + * service. + * @{ + */ + /** + * Get the first committed version + * + * @returns Our first committed version (that is available) + */ + version_t get_first_committed() const{ + return cached_first_committed; + } + /** + * Get the last committed version + * + * @returns Our last committed version + */ + version_t get_last_committed() const{ + return cached_last_committed; + } + + /** + * @} + */ + + /** + * Get the contents of a given version @p ver + * + * @param ver The version being obtained + * @param bl The ceph::buffer::list to be populated + * @return 0 on success; <0 otherwise + */ + virtual int get_version(version_t ver, ceph::buffer::list& bl) { + return mon.store->get(get_service_name(), ver, bl); + } + /** + * Get the contents of a given full version of this service. 
+ * + * @param ver A version number + * @param bl The ceph::buffer::list to be populated + * @returns 0 on success; <0 otherwise + */ + virtual int get_version_full(version_t ver, ceph::buffer::list& bl) { + std::string key = mon.store->combine_strings(full_prefix_name, ver); + return mon.store->get(get_service_name(), key, bl); + } + /** + * Get the latest full version number + * + * @returns A version number + */ + version_t get_version_latest_full() { + std::string key = mon.store->combine_strings(full_prefix_name, full_latest_name); + return mon.store->get(get_service_name(), key); + } + + /** + * Get a value from a given key. + * + * @param[in] key The key + * @param[out] bl The ceph::buffer::list to be populated with the value + */ + int get_value(const std::string& key, ceph::buffer::list& bl) { + return mon.store->get(get_service_name(), key, bl); + } + /** + * Get an integer value from a given key. + * + * @param[in] key The key + */ + version_t get_value(const std::string& key) { + return mon.store->get(get_service_name(), key); + } + + /** + * @} + */ + /** + * @} + */ +}; + +#endif diff --git a/src/mon/Session.h b/src/mon/Session.h new file mode 100644 index 000000000..3009d0239 --- /dev/null +++ b/src/mon/Session.h @@ -0,0 +1,295 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_MON_SESSION_H +#define CEPH_MON_SESSION_H + +#include <string> +#include <string_view> + +#include "include/utime.h" +#include "include/xlist.h" + +#include "global/global_context.h" +#include "msg/msg_types.h" +#include "mon/mon_types.h" + +#include "auth/AuthServiceHandler.h" +#include "osd/OSDMap.h" + +#include "MonCap.h" + +struct MonSession; + +struct Subscription { + MonSession *session; + std::string type; + xlist<Subscription*>::item type_item; + version_t next; + bool onetime; + bool incremental_onetime; // has CEPH_FEATURE_INCSUBOSDMAP + + Subscription(MonSession *s, const std::string& t) : session(s), type(t), type_item(this), + next(0), onetime(false), incremental_onetime(false) {} +}; + +struct MonSession : public RefCountedObject { + ConnectionRef con; + int con_type = 0; + uint64_t con_features = 0; // zero if AnonConnection + entity_name_t name; + entity_addrvec_t addrs; + entity_addr_t socket_addr; + utime_t session_timeout; + bool closed = false; + xlist<MonSession*>::item item; + std::set<uint64_t> routed_request_tids; + MonCap caps; + bool validated_stretch_connection = false; + + bool authenticated = false; ///< true if auth handshake is complete + + std::map<std::string, Subscription*> sub_map; + epoch_t osd_epoch = 0; ///< the osdmap epoch sent to the mon client + + AuthServiceHandler *auth_handler = nullptr; + EntityName entity_name; + uint64_t global_id = 0; + global_id_status_t global_id_status = global_id_status_t::NONE; + + ConnectionRef proxy_con; + uint64_t proxy_tid = 0; + + std::string remote_host; ///< remote host name + std::map<std::string,std::string,std::less<>> last_config; ///< most recently shared config + bool any_config = false; + + MonSession(Connection *c) + : RefCountedObject(g_ceph_context), + con(c), + item(this) { } + + void _ident(const entity_name_t& n, 
const entity_addrvec_t& av) { + con_type = con->get_peer_type(); + name = n; + addrs = av; + socket_addr = con->get_peer_socket_addr(); + if (con->get_messenger()) { + // only fill in features if this is a non-anonymous connection + con_features = con->get_features(); + } + } + + ~MonSession() override { + //generic_dout(0) << "~MonSession " << this << dendl; + // we should have been removed before we get destructed; see MonSessionMap::remove_session() + ceph_assert(!item.is_on_list()); + ceph_assert(sub_map.empty()); + delete auth_handler; + } + + bool is_capable(std::string service, int mask) { + std::map<std::string,std::string> args; + return caps.is_capable( + g_ceph_context, + entity_name, + service, "", args, + mask & MON_CAP_R, mask & MON_CAP_W, mask & MON_CAP_X, + get_peer_socket_addr()); + } + + std::vector<string> get_allowed_fs_names() const { + return caps.allowed_fs_names(); + } + + bool fs_name_capable(string_view fsname, __u8 mask) { + return caps.fs_name_capable(entity_name, fsname, mask); + } + + const entity_addr_t& get_peer_socket_addr() { + return socket_addr; + } + + void dump(ceph::Formatter *f) const { + f->dump_stream("name") << name; + f->dump_stream("entity_name") << entity_name; + f->dump_object("addrs", addrs); + f->dump_object("socket_addr", socket_addr); + f->dump_string("con_type", ceph_entity_type_name(con_type)); + f->dump_unsigned("con_features", con_features); + f->dump_stream("con_features_hex") << std::hex << con_features << std::dec; + f->dump_string("con_features_release", + ceph_release_name(ceph_release_from_features(con_features))); + f->dump_bool("open", !closed); + f->dump_object("caps", caps); + f->dump_bool("authenticated", authenticated); + f->dump_unsigned("global_id", global_id); + f->dump_stream("global_id_status") << global_id_status; + f->dump_unsigned("osd_epoch", osd_epoch); + f->dump_string("remote_host", remote_host); + } +}; + + +struct MonSessionMap { + xlist<MonSession*> sessions; + std::map<std::string, xlist<Subscription*>* > subs; + std::multimap<int, MonSession*> by_osd; + FeatureMap feature_map; // type -> features -> count + + MonSessionMap() {} + ~MonSessionMap() { + while (!subs.empty()) { + ceph_assert(subs.begin()->second->empty()); + delete subs.begin()->second; + subs.erase(subs.begin()); + } + } + + unsigned get_size() const { + return sessions.size(); + } + + void remove_session(MonSession *s) { + ceph_assert(!s->closed); + for (std::map<std::string,Subscription*>::iterator p = s->sub_map.begin(); p != s->sub_map.end(); ++p) { + p->second->type_item.remove_myself(); + delete p->second; + } + s->sub_map.clear(); + s->item.remove_myself(); + if (s->name.is_osd() && + s->name.num() >= 0) { + for (auto p = by_osd.find(s->name.num()); + p->first == s->name.num(); + ++p) + if (p->second == s) { + by_osd.erase(p); + break; + } + } + if (s->con_features) { + feature_map.rm(s->con_type, s->con_features); + } + s->closed = true; + s->put(); + } + + MonSession *new_session(const entity_name_t& n, + const entity_addrvec_t& av, + Connection *c) { + MonSession *s = new MonSession(c); + ceph_assert(s); + s->_ident(n, av); + add_session(s); + return s; + } + + void add_session(MonSession *s) { + s->session_timeout = ceph_clock_now(); + s->session_timeout += g_conf()->mon_session_timeout; + + sessions.push_back(&s->item); + s->get(); + if (s->name.is_osd() && + s->name.num() >= 0) { + by_osd.insert(std::pair<int,MonSession*>(s->name.num(), s)); + } + if (s->con_features) { + feature_map.add(s->con_type, s->con_features); + } + } + + 
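+  /*
+   * Lifecycle sketch (hypothetical caller): sessions enter via
+   * new_session() and must leave via remove_session(), which unhooks
+   * subscriptions, the by_osd index, and the feature map before dropping
+   * our reference:
+   *
+   *   MonSession *s = session_map.new_session(name, addrs, con.get());
+   *   ...
+   *   session_map.remove_session(s);
+   */
+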
MonSession *get_random_osd_session(OSDMap *osdmap) { + // ok, this isn't actually random, but close enough. + if (by_osd.empty()) + return 0; + int n = by_osd.rbegin()->first + 1; + int r = rand() % n; + + auto p = by_osd.lower_bound(r); + if (p == by_osd.end()) + --p; + + if (!osdmap) { + return p->second; + } + + MonSession *s = NULL; + + auto b = p; + auto f = p; + bool backward = true, forward = true; + while (backward || forward) { + if (backward) { + if (osdmap->is_up(b->first) && + osdmap->get_addrs(b->first) == b->second->con->get_peer_addrs()) { + s = b->second; + break; + } + if (b != by_osd.begin()) + --b; + else + backward = false; + } + + forward = (f != by_osd.end()); + if (forward) { + if (osdmap->is_up(f->first)) { + s = f->second; + break; + } + ++f; + } + } + + return s; + } + + void add_update_sub(MonSession *s, const std::string& what, version_t start, bool onetime, bool incremental_onetime) { + Subscription *sub = 0; + if (s->sub_map.count(what)) { + sub = s->sub_map[what]; + } else { + sub = new Subscription(s, what); + s->sub_map[what] = sub; + + if (!subs.count(what)) + subs[what] = new xlist<Subscription*>; + subs[what]->push_back(&sub->type_item); + } + sub->next = start; + sub->onetime = onetime; + sub->incremental_onetime = onetime && incremental_onetime; + } + + void remove_sub(Subscription *sub) { + sub->session->sub_map.erase(sub->type); + sub->type_item.remove_myself(); + delete sub; + } +}; + +inline std::ostream& operator<<(std::ostream& out, const MonSession& s) +{ + out << "MonSession(" << s.name << " " << s.addrs + << " is " << (s.closed ? "closed" : "open") + << " " << s.caps + << ", features 0x" << std::hex << s.con_features << std::dec + << " (" << ceph_release_name(ceph_release_from_features(s.con_features)) + << "))"; + return out; +} + +#endif diff --git a/src/mon/error_code.cc b/src/mon/error_code.cc new file mode 100644 index 000000000..a2cd39299 --- /dev/null +++ b/src/mon/error_code.cc @@ -0,0 +1,85 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2019 Red Hat <contact@redhat.com> + * Author: Adam C. Emerson <aemerson@redhat.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#include <string> + +#include "common/error_code.h" +#include "common/errno.h" +#include "error_code.h" + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wnon-virtual-dtor" +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wnon-virtual-dtor" + +namespace bs = boost::system; + +class mon_error_category : public ceph::converting_category { +public: + mon_error_category(){} + const char* name() const noexcept override; + const char* message(int ev, char*, std::size_t) const noexcept override; + std::string message(int ev) const override; + bs::error_condition default_error_condition(int ev) const noexcept + override; + bool equivalent(int ev, const bs::error_condition& c) const + noexcept override; + using ceph::converting_category::equivalent; + int from_code(int ev) const noexcept override; +}; + +const char* mon_error_category::name() const noexcept { + return "mon"; +} + +const char* mon_error_category::message(int ev, char* buf, + std::size_t len) const noexcept { + if (ev == 0) + return "No error"; + + if (len) { + auto s = cpp_strerror(ev); + auto n = s.copy(buf, len - 1); + *(buf + n) = '\0'; + } + return buf; +} + +std::string mon_error_category::message(int ev) const { + if (ev == 0) + return "No error"; + + return cpp_strerror(ev); +} + +bs::error_condition +mon_error_category::default_error_condition(int ev) const noexcept { + return { ev, bs::generic_category() }; +} + +bool mon_error_category::equivalent(int ev,const bs::error_condition& c) const noexcept { + return default_error_condition(ev) == c; +} + +int mon_error_category::from_code(int ev) const noexcept { + return -ev; +} + +const bs::error_category& mon_category() noexcept { + static const mon_error_category c; + return c; +} +#pragma GCC diagnostic pop +#pragma clang diagnostic pop diff --git a/src/mon/error_code.h b/src/mon/error_code.h new file mode 100644 index 000000000..2a6e88061 --- /dev/null +++ b/src/mon/error_code.h @@ -0,0 +1,49 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2019 Red Hat <contact@redhat.com> + * Author: Adam C. Emerson <aemerson@redhat.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#pragma once + +#include <boost/system/error_code.hpp> + +#include "include/rados.h" + +const boost::system::error_category& mon_category() noexcept; + +// The Monitor, like the OSD, mostly replies with POSIX error codes. 
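+//
+// A minimal caller-side sketch (assuming only what this header and
+// Boost.System provide):
+//
+//   boost::system::error_code ec(EPERM, mon_category());
+//   ec.message();  // strerror-style text via mon_error_category
+//   bool same = (ec == boost::system::errc::operation_not_permitted); // true,
+//   // via mon_error_category::default_error_condition()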
+ +enum class mon_errc { +}; + +namespace boost::system { +template<> +struct is_error_code_enum<::mon_errc> { + static const bool value = true; +}; + +template<> +struct is_error_condition_enum<::mon_errc> { + static const bool value = false; +}; +} + +// explicit conversion: +inline boost::system::error_code make_error_code(mon_errc e) noexcept { + return { static_cast<int>(e), mon_category() }; +} + +// implicit conversion: +inline boost::system::error_condition make_error_condition(mon_errc e) noexcept { + return { static_cast<int>(e), mon_category() }; +} diff --git a/src/mon/health_check.h b/src/mon/health_check.h new file mode 100644 index 000000000..4e74637f9 --- /dev/null +++ b/src/mon/health_check.h @@ -0,0 +1,198 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <string> +#include <map> + +#include "include/health.h" +#include "include/utime.h" +#include "common/Formatter.h" + +struct health_check_t { + health_status_t severity; + std::string summary; + std::list<std::string> detail; + int64_t count = 0; + + DENC(health_check_t, v, p) { + DENC_START(2, 1, p); + denc(v.severity, p); + denc(v.summary, p); + denc(v.detail, p); + if (struct_v >= 2) { + denc(v.count, p); + } + DENC_FINISH(p); + } + + friend bool operator==(const health_check_t& l, + const health_check_t& r) { + return l.severity == r.severity && + l.summary == r.summary && + l.detail == r.detail && + l.count == r.count; + } + friend bool operator!=(const health_check_t& l, + const health_check_t& r) { + return !(l == r); + } + + void dump(ceph::Formatter *f, bool want_detail=true) const { + f->dump_stream("severity") << severity; + + f->open_object_section("summary"); + f->dump_string("message", summary); + f->dump_int("count", count); + f->close_section(); + + if (want_detail) { + f->open_array_section("detail"); + for (auto& p : detail) { + f->open_object_section("detail_item"); + f->dump_string("message", p); + f->close_section(); + } + f->close_section(); + } + } + + static void generate_test_instances(std::list<health_check_t*>& ls) { + ls.push_back(new health_check_t); + ls.push_back(new health_check_t); + ls.back()->severity = HEALTH_ERR; + ls.back()->summary = "summarization"; + ls.back()->detail = {"one", "two", "three"}; + ls.back()->count = 42; + } +}; +WRITE_CLASS_DENC(health_check_t) + + +struct health_mute_t { + std::string code; + utime_t ttl; + bool sticky = false; + std::string summary; + int64_t count; + + DENC(health_mute_t, v, p) { + DENC_START(1, 1, p); + denc(v.code, p); + denc(v.ttl, p); + denc(v.sticky, p); + denc(v.summary, p); + denc(v.count, p); + DENC_FINISH(p); + } + + void dump(ceph::Formatter *f) const { + f->dump_string("code", code); + if (ttl != utime_t()) { + f->dump_stream("ttl") << ttl; + } + f->dump_bool("sticky", sticky); + f->dump_string("summary", summary); + f->dump_int("count", count); + } + + static void generate_test_instances(std::list<health_mute_t*>& ls) { + ls.push_back(new health_mute_t); + ls.push_back(new health_mute_t); + ls.back()->code = "OSD_DOWN"; + ls.back()->ttl = utime_t(1, 2); + ls.back()->sticky = true; + ls.back()->summary = "foo bar"; + ls.back()->count = 2; + } +}; +WRITE_CLASS_DENC(health_mute_t) + +struct health_check_map_t { + std::map<std::string,health_check_t> checks; + + DENC(health_check_map_t, v, p) { + DENC_START(1, 1, p); + denc(v.checks, p); + DENC_FINISH(p); + } + + void dump(ceph::Formatter *f) const { + for (auto& [code, check] : checks) { + 
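+      // one named object per health check, keyed by its code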
f->dump_object(code, check); + } + } + + static void generate_test_instances(std::list<health_check_map_t*>& ls) { + ls.push_back(new health_check_map_t); + ls.push_back(new health_check_map_t); + { + auto& d = ls.back()->add("FOO", HEALTH_WARN, "foo", 2); + d.detail.push_back("a"); + d.detail.push_back("b"); + } + { + auto& d = ls.back()->add("BAR", HEALTH_ERR, "bar!", 3); + d.detail.push_back("c"); + d.detail.push_back("d"); + d.detail.push_back("e"); + } + } + + void clear() { + checks.clear(); + } + bool empty() const { + return checks.empty(); + } + void swap(health_check_map_t& other) { + checks.swap(other.checks); + } + + health_check_t& add(const std::string& code, + health_status_t severity, + const std::string& summary, + int64_t count) { + ceph_assert(checks.count(code) == 0); + health_check_t& r = checks[code]; + r.severity = severity; + r.summary = summary; + r.count = count; + return r; + } + health_check_t& get_or_add(const std::string& code, + health_status_t severity, + const std::string& summary, + int64_t count) { + health_check_t& r = checks[code]; + r.severity = severity; + r.summary = summary; + r.count += count; + return r; + } + + void merge(const health_check_map_t& o) { + for (auto& [code, check] : o.checks) { + auto [it, new_check] = checks.try_emplace(code, check); + if (!new_check) { + // merge details, and hope the summary matches! + it->second.detail.insert( + it->second.detail.end(), + check.detail.begin(), + check.detail.end()); + it->second.count += check.count; + } + } + } + + friend bool operator==(const health_check_map_t& l, + const health_check_map_t& r) { + return l.checks == r.checks; + } + friend bool operator!=(const health_check_map_t& l, + const health_check_map_t& r) { + return !(l == r); + } +}; +WRITE_CLASS_DENC(health_check_map_t) diff --git a/src/mon/mon_types.h b/src/mon/mon_types.h new file mode 100644 index 000000000..ce7184f37 --- /dev/null +++ b/src/mon/mon_types.h @@ -0,0 +1,660 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef CEPH_MON_TYPES_H +#define CEPH_MON_TYPES_H + +#include <map> + +#include "include/Context.h" +#include "include/util.h" +#include "include/utime.h" +#include "common/Formatter.h" +#include "common/bit_str.h" +#include "common/ceph_releases.h" + +// use as paxos_service index +enum { + PAXOS_MDSMAP, + PAXOS_OSDMAP, + PAXOS_LOG, + PAXOS_MONMAP, + PAXOS_AUTH, + PAXOS_MGR, + PAXOS_MGRSTAT, + PAXOS_HEALTH, + PAXOS_CONFIG, + PAXOS_KV, + PAXOS_NUM +}; + +#define CEPH_MON_ONDISK_MAGIC "ceph mon volume v012" + +// map of entity_type -> features -> count +struct FeatureMap { + std::map<uint32_t,std::map<uint64_t,uint64_t>> m; + + void add(uint32_t type, uint64_t features) { + if (type == CEPH_ENTITY_TYPE_MON) { + return; + } + m[type][features]++; + } + + void add_mon(uint64_t features) { + m[CEPH_ENTITY_TYPE_MON][features]++; + } + + void rm(uint32_t type, uint64_t features) { + if (type == CEPH_ENTITY_TYPE_MON) { + return; + } + auto p = m.find(type); + ceph_assert(p != m.end()); + auto q = p->second.find(features); + ceph_assert(q != p->second.end()); + if (--q->second == 0) { + p->second.erase(q); + if (p->second.empty()) { + m.erase(p); + } + } + } + + FeatureMap& operator+=(const FeatureMap& o) { + for (auto& p : o.m) { + auto &v = m[p.first]; + for (auto& q : p.second) { + v[q.first] += q.second; + } + } + return *this; + } + + void encode(ceph::buffer::list& bl) const { + ENCODE_START(1, 1, bl); + encode(m, bl); + ENCODE_FINISH(bl); + } + + void decode(ceph::buffer::list::const_iterator& p) { + DECODE_START(1, p); + decode(m, p); + DECODE_FINISH(p); + } + + void dump(ceph::Formatter *f) const { + for (auto& p : m) { + f->open_array_section(ceph_entity_type_name(p.first)); + for (auto& q : p.second) { + f->open_object_section("group"); + std::stringstream ss; + ss << "0x" << std::hex << q.first << std::dec; + f->dump_string("features", ss.str()); + f->dump_string("release", ceph_release_name( + ceph_release_from_features(q.first))); + f->dump_unsigned("num", q.second); + f->close_section(); + } + f->close_section(); + } + } +}; +WRITE_CLASS_ENCODER(FeatureMap) + +/** + * leveldb store stats + * + * If we ever decide to support multiple backends for the monitor store, + * we should then create an abstract class 'MonitorStoreStats' of sorts + * and inherit it on LevelDBStoreStats. I'm sure you'll figure something + * out. 
+ */ +struct LevelDBStoreStats { + uint64_t bytes_total; + uint64_t bytes_sst; + uint64_t bytes_log; + uint64_t bytes_misc; + utime_t last_update; + + LevelDBStoreStats() : + bytes_total(0), + bytes_sst(0), + bytes_log(0), + bytes_misc(0) + {} + + void dump(ceph::Formatter *f) const { + ceph_assert(f != NULL); + f->dump_int("bytes_total", bytes_total); + f->dump_int("bytes_sst", bytes_sst); + f->dump_int("bytes_log", bytes_log); + f->dump_int("bytes_misc", bytes_misc); + f->dump_stream("last_updated") << last_update; + } + + void encode(ceph::buffer::list &bl) const { + ENCODE_START(1, 1, bl); + encode(bytes_total, bl); + encode(bytes_sst, bl); + encode(bytes_log, bl); + encode(bytes_misc, bl); + encode(last_update, bl); + ENCODE_FINISH(bl); + } + + void decode(ceph::buffer::list::const_iterator &p) { + DECODE_START(1, p); + decode(bytes_total, p); + decode(bytes_sst, p); + decode(bytes_log, p); + decode(bytes_misc, p); + decode(last_update, p); + DECODE_FINISH(p); + } + + static void generate_test_instances(std::list<LevelDBStoreStats*>& ls) { + ls.push_back(new LevelDBStoreStats); + ls.push_back(new LevelDBStoreStats); + ls.back()->bytes_total = 1024*1024; + ls.back()->bytes_sst = 512*1024; + ls.back()->bytes_log = 256*1024; + ls.back()->bytes_misc = 256*1024; + ls.back()->last_update = utime_t(); + } +}; +WRITE_CLASS_ENCODER(LevelDBStoreStats) + +// data stats + +struct DataStats { + ceph_data_stats_t fs_stats; + // data dir + utime_t last_update; + LevelDBStoreStats store_stats; + + void dump(ceph::Formatter *f) const { + ceph_assert(f != NULL); + f->dump_int("kb_total", (fs_stats.byte_total/1024)); + f->dump_int("kb_used", (fs_stats.byte_used/1024)); + f->dump_int("kb_avail", (fs_stats.byte_avail/1024)); + f->dump_int("avail_percent", fs_stats.avail_percent); + f->dump_stream("last_updated") << last_update; + f->open_object_section("store_stats"); + store_stats.dump(f); + f->close_section(); + } + + void encode(ceph::buffer::list &bl) const { + ENCODE_START(3, 1, bl); + encode(fs_stats.byte_total, bl); + encode(fs_stats.byte_used, bl); + encode(fs_stats.byte_avail, bl); + encode(fs_stats.avail_percent, bl); + encode(last_update, bl); + encode(store_stats, bl); + ENCODE_FINISH(bl); + } + void decode(ceph::buffer::list::const_iterator &p) { + DECODE_START(1, p); + // we moved from having fields in kb to fields in byte + if (struct_v > 2) { + decode(fs_stats.byte_total, p); + decode(fs_stats.byte_used, p); + decode(fs_stats.byte_avail, p); + } else { + uint64_t t; + decode(t, p); + fs_stats.byte_total = t*1024; + decode(t, p); + fs_stats.byte_used = t*1024; + decode(t, p); + fs_stats.byte_avail = t*1024; + } + decode(fs_stats.avail_percent, p); + decode(last_update, p); + if (struct_v > 1) + decode(store_stats, p); + + DECODE_FINISH(p); + } +}; +WRITE_CLASS_ENCODER(DataStats) + +struct ScrubResult { + std::map<std::string,uint32_t> prefix_crc; ///< prefix -> crc + std::map<std::string,uint64_t> prefix_keys; ///< prefix -> key count + + bool operator!=(const ScrubResult& other) { + return prefix_crc != other.prefix_crc || prefix_keys != other.prefix_keys; + } + + void encode(ceph::buffer::list& bl) const { + ENCODE_START(1, 1, bl); + encode(prefix_crc, bl); + encode(prefix_keys, bl); + ENCODE_FINISH(bl); + } + void decode(ceph::buffer::list::const_iterator& p) { + DECODE_START(1, p); + decode(prefix_crc, p); + decode(prefix_keys, p); + DECODE_FINISH(p); + } + void dump(ceph::Formatter *f) const { + f->open_object_section("crc"); + for (auto p = prefix_crc.begin(); p != prefix_crc.end(); 
++p) + f->dump_unsigned(p->first.c_str(), p->second); + f->close_section(); + f->open_object_section("keys"); + for (auto p = prefix_keys.begin(); p != prefix_keys.end(); ++p) + f->dump_unsigned(p->first.c_str(), p->second); + f->close_section(); + } + static void generate_test_instances(std::list<ScrubResult*>& ls) { + ls.push_back(new ScrubResult); + ls.push_back(new ScrubResult); + ls.back()->prefix_crc["foo"] = 123; + ls.back()->prefix_keys["bar"] = 456; + } +}; +WRITE_CLASS_ENCODER(ScrubResult) + +inline std::ostream& operator<<(std::ostream& out, const ScrubResult& r) { + return out << "ScrubResult(keys " << r.prefix_keys << " crc " << r.prefix_crc << ")"; +} + +/// for information like os, kernel, hostname, memory info, cpu model. +typedef std::map<std::string, std::string> Metadata; + +namespace ceph { + namespace features { + namespace mon { + /** + * Get a feature's name based on its value. + * + * @param b raw feature value + * + * @remarks + * Consumers should not assume this interface will never change. + * @remarks + * As the number of features increase, so may the internal representation + * of the raw features. When this happens, this interface will change + * accordingly. So should consumers of this interface. + */ + static inline const char *get_feature_name(uint64_t b); + } + } +} + + +inline const char *ceph_mon_feature_name(uint64_t b) +{ + return ceph::features::mon::get_feature_name(b); +}; + +class mon_feature_t { + + static constexpr int HEAD_VERSION = 1; + static constexpr int COMPAT_VERSION = 1; + + // mon-specific features + uint64_t features; + +public: + + explicit constexpr + mon_feature_t(const uint64_t f) : features(f) { } + + mon_feature_t() : + features(0) { } + + constexpr + mon_feature_t(const mon_feature_t &o) : + features(o.features) { } + + mon_feature_t& operator&=(const mon_feature_t other) { + features &= other.features; + return (*this); + } + + /** + * Obtain raw features + * + * @remarks + * Consumers should not assume this interface will never change. + * @remarks + * As the number of features increase, so may the internal representation + * of the raw features. When this happens, this interface will change + * accordingly. So should consumers of this interface. 
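+   *
+   * The set-style helpers below in a nutshell (sketch; the FEATURE_*
+   * constants live in ceph::features::mon further down in this header):
+   * @code
+   * using namespace ceph::features::mon;
+   * mon_feature_t q = FEATURE_KRAKEN | FEATURE_LUMINOUS;
+   * q.contains_all(FEATURE_KRAKEN);         // true
+   * q.diff(FEATURE_KRAKEN);                 // == FEATURE_LUMINOUS
+   * q.intersection(FEATURE_MIMIC).empty();  // true
+   * @endcode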
+ */ + uint64_t get_raw() const { + return features; + } + + constexpr + friend mon_feature_t operator&(const mon_feature_t a, + const mon_feature_t b) { + return mon_feature_t(a.features & b.features); + } + + mon_feature_t& operator|=(const mon_feature_t other) { + features |= other.features; + return (*this); + } + + constexpr + friend mon_feature_t operator|(const mon_feature_t a, + const mon_feature_t b) { + return mon_feature_t(a.features | b.features); + } + + constexpr + friend mon_feature_t operator^(const mon_feature_t a, + const mon_feature_t b) { + return mon_feature_t(a.features ^ b.features); + } + + mon_feature_t& operator^=(const mon_feature_t other) { + features ^= other.features; + return (*this); + } + + bool operator==(const mon_feature_t other) const { + return (features == other.features); + } + + bool operator!=(const mon_feature_t other) const { + return (features != other.features); + } + + bool empty() const { + return features == 0; + } + + /** + * Set difference of our features in respect to @p other + * + * Returns all the elements in our features that are not in @p other + * + * @returns all the features not in @p other + */ + mon_feature_t diff(const mon_feature_t other) const { + return mon_feature_t((features ^ other.features) & features); + } + + /** + * Set intersection of our features and @p other + * + * Returns all the elements common to both our features and the + * features of @p other + * + * @returns the features common to @p other and us + */ + mon_feature_t intersection(const mon_feature_t other) const { + return mon_feature_t((features & other.features)); + } + + /** + * Checks whether we have all the features in @p other + * + * Returns true if we have all the features in @p other + * + * @returns true if we contain all the features in @p other + * @returns false if we do not contain some of the features in @p other + */ + bool contains_all(const mon_feature_t other) const { + mon_feature_t d = intersection(other); + return d == other; + } + + /** + * Checks whether we contain any of the features in @p other. + * + * @returns true if we contain any of the features in @p other + * @returns false if we don't contain any of the features in @p other + */ + bool contains_any(const mon_feature_t other) const { + mon_feature_t d = intersection(other); + return !d.empty(); + } + + void set_feature(const mon_feature_t f) { + features |= f.features; + } + + void unset_feature(const mon_feature_t f) { + features &= ~(f.features); + } + + void print(std::ostream& out) const { + out << "["; + print_bit_str(features, out, ceph::features::mon::get_feature_name); + out << "]"; + } + + void print_with_value(std::ostream& out) const { + out << "["; + print_bit_str(features, out, ceph::features::mon::get_feature_name, true); + out << "]"; + } + + void dump(ceph::Formatter *f, const char *sec_name = NULL) const { + f->open_array_section((sec_name ? sec_name : "features")); + dump_bit_str(features, f, ceph::features::mon::get_feature_name); + f->close_section(); + } + + void dump_with_value(ceph::Formatter *f, const char *sec_name = NULL) const { + f->open_array_section((sec_name ? 
sec_name : "features")); + dump_bit_str(features, f, ceph::features::mon::get_feature_name, true); + f->close_section(); + } + + void encode(ceph::buffer::list& bl) const { + ENCODE_START(HEAD_VERSION, COMPAT_VERSION, bl); + encode(features, bl); + ENCODE_FINISH(bl); + } + void decode(ceph::buffer::list::const_iterator& p) { + DECODE_START(COMPAT_VERSION, p); + decode(features, p); + DECODE_FINISH(p); + } +}; +WRITE_CLASS_ENCODER(mon_feature_t) + +namespace ceph { + namespace features { + namespace mon { + constexpr mon_feature_t FEATURE_KRAKEN( (1ULL << 0)); + constexpr mon_feature_t FEATURE_LUMINOUS( (1ULL << 1)); + constexpr mon_feature_t FEATURE_MIMIC( (1ULL << 2)); + constexpr mon_feature_t FEATURE_OSDMAP_PRUNE (1ULL << 3); + constexpr mon_feature_t FEATURE_NAUTILUS( (1ULL << 4)); + constexpr mon_feature_t FEATURE_OCTOPUS( (1ULL << 5)); + constexpr mon_feature_t FEATURE_PACIFIC( (1ULL << 6)); + // elector pinging and CONNECTIVITY mode: + constexpr mon_feature_t FEATURE_PINGING( (1ULL << 7)); + + constexpr mon_feature_t FEATURE_RESERVED( (1ULL << 63)); + constexpr mon_feature_t FEATURE_NONE( (0ULL)); + + /** + * All the features this monitor supports + * + * If there's a feature above, it should be OR'ed to this list. + */ + constexpr mon_feature_t get_supported() { + return ( + FEATURE_KRAKEN | + FEATURE_LUMINOUS | + FEATURE_MIMIC | + FEATURE_OSDMAP_PRUNE | + FEATURE_NAUTILUS | + FEATURE_OCTOPUS | + FEATURE_PACIFIC | + FEATURE_PINGING | + FEATURE_NONE + ); + } + /** + * All the features that, once set, cannot be removed. + * + * Features should only be added to this list if you want to make + * sure downgrades are not possible after a quorum supporting all + * these features has been formed. + * + * Any feature in this list will be automatically set on the monmap's + * features once all the monitors in the quorum support it. 
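+       *
+       * e.g. (sketch) gating a downgrade on a monmap's feature set, where
+       * monmap_features is a hypothetical mon_feature_t:
+       * @code
+       * if (monmap_features.contains_all(get_persistent())) {
+       *   // every persistent feature is set; monitors lacking any of
+       *   // them can no longer join this quorum
+       * }
+       * @endcode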
+ */ + constexpr mon_feature_t get_persistent() { + return ( + FEATURE_KRAKEN | + FEATURE_LUMINOUS | + FEATURE_MIMIC | + FEATURE_NAUTILUS | + FEATURE_OSDMAP_PRUNE | + FEATURE_OCTOPUS | + FEATURE_PACIFIC | + FEATURE_PINGING | + FEATURE_NONE + ); + } + + constexpr mon_feature_t get_optional() { + return ( + FEATURE_OSDMAP_PRUNE | + FEATURE_NONE + ); + } + + static inline mon_feature_t get_feature_by_name(const std::string &n); + } + } +} + +static inline ceph_release_t infer_ceph_release_from_mon_features(mon_feature_t f) +{ + if (f.contains_all(ceph::features::mon::FEATURE_PACIFIC)) { + return ceph_release_t::pacific; + } + if (f.contains_all(ceph::features::mon::FEATURE_OCTOPUS)) { + return ceph_release_t::octopus; + } + if (f.contains_all(ceph::features::mon::FEATURE_NAUTILUS)) { + return ceph_release_t::nautilus; + } + if (f.contains_all(ceph::features::mon::FEATURE_MIMIC)) { + return ceph_release_t::mimic; + } + if (f.contains_all(ceph::features::mon::FEATURE_LUMINOUS)) { + return ceph_release_t::luminous; + } + if (f.contains_all(ceph::features::mon::FEATURE_KRAKEN)) { + return ceph_release_t::kraken; + } + return ceph_release_t::unknown; +} + +static inline const char *ceph::features::mon::get_feature_name(uint64_t b) { + mon_feature_t f(b); + + if (f == FEATURE_KRAKEN) { + return "kraken"; + } else if (f == FEATURE_LUMINOUS) { + return "luminous"; + } else if (f == FEATURE_MIMIC) { + return "mimic"; + } else if (f == FEATURE_OSDMAP_PRUNE) { + return "osdmap-prune"; + } else if (f == FEATURE_NAUTILUS) { + return "nautilus"; + } else if (f == FEATURE_PINGING) { + return "elector-pinging"; + } else if (f == FEATURE_OCTOPUS) { + return "octopus"; + } else if (f == FEATURE_PACIFIC) { + return "pacific"; + } else if (f == FEATURE_RESERVED) { + return "reserved"; + } + return "unknown"; +} + +inline mon_feature_t ceph::features::mon::get_feature_by_name(const std::string &n) { + + if (n == "kraken") { + return FEATURE_KRAKEN; + } else if (n == "luminous") { + return FEATURE_LUMINOUS; + } else if (n == "mimic") { + return FEATURE_MIMIC; + } else if (n == "osdmap-prune") { + return FEATURE_OSDMAP_PRUNE; + } else if (n == "nautilus") { + return FEATURE_NAUTILUS; + } else if (n == "feature-pinging") { + return FEATURE_PINGING; + } else if (n == "octopus") { + return FEATURE_OCTOPUS; + } else if (n == "pacific") { + return FEATURE_PACIFIC; + } else if (n == "reserved") { + return FEATURE_RESERVED; + } + return FEATURE_NONE; +} + +inline std::ostream& operator<<(std::ostream& out, const mon_feature_t& f) { + out << "mon_feature_t("; + f.print(out); + out << ")"; + return out; +} + + +struct ProgressEvent { + std::string message; ///< event description + float progress; ///< [0..1] + bool add_to_ceph_s; + void encode(ceph::buffer::list& bl) const { + ENCODE_START(2, 1, bl); + encode(message, bl); + encode(progress, bl); + encode(add_to_ceph_s, bl); + ENCODE_FINISH(bl); + } + void decode(ceph::buffer::list::const_iterator& p) { + DECODE_START(2, p); + decode(message, p); + decode(progress, p); + if (struct_v >= 2){ + decode(add_to_ceph_s, p); + } else { + if (!message.empty()) { + add_to_ceph_s = true; + } + } + DECODE_FINISH(p); + } + void dump(ceph::Formatter *f) const { + f->dump_string("message", message); + f->dump_float("progress", progress); + f->dump_bool("add_to_ceph_s", add_to_ceph_s); + } +}; +WRITE_CLASS_ENCODER(ProgressEvent) + +#endif |